Skip to content

Commit 8a74902

Browse files
Harden PDF XMP metadata parsing
1 parent e7b366d commit 8a74902

2 files changed

Lines changed: 72 additions & 3 deletions

File tree

OfficeIMO.Pdf/Reading/Core/PdfReadDocument.XmpMetadata.cs

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
using System.Xml;
12
using System.Xml.Linq;
23
using OfficeIMO.Pdf.Filters;
34

@@ -6,6 +7,8 @@ namespace OfficeIMO.Pdf;
67
public sealed partial class PdfReadDocument {
78
private const string DublinCoreNamespaceUri = "http://purl.org/dc/elements/1.1/";
89
private const string PdfAIdentificationNamespaceUri = "http://www.aiim.org/pdfa/ns/id/";
10+
/// <summary>Maximum decoded XMP metadata size parsed as XML.</summary>
11+
public const int MaxXmpMetadataBytes = 4_000_000;
912

1013
/// <summary>Catalog XMP metadata stream discovered from /Metadata.</summary>
1114
public PdfXmpMetadataInfo? XmpMetadata { get; }
@@ -23,8 +26,8 @@ public sealed partial class PdfReadDocument {
2326
}
2427

2528
byte[] decoded = StreamDecoder.Decode(stream.Dictionary, stream.Data, _objects);
26-
string? rawXml = DecodeMetadataText(decoded);
27-
XDocument? document = TryParseXml(rawXml);
29+
string? rawXml = decoded.Length <= MaxXmpMetadataBytes ? DecodeMetadataText(decoded) : null;
30+
XDocument? document = rawXml is null ? null : TryParseXml(rawXml);
2831
return new PdfXmpMetadataInfo(
2932
objectNumber,
3033
TryReadName(stream.Dictionary, "Subtype"),
@@ -82,7 +85,14 @@ public sealed partial class PdfReadDocument {
8285
}
8386

8487
try {
85-
return XDocument.Parse(rawXml!, LoadOptions.None);
88+
var settings = new XmlReaderSettings {
89+
DtdProcessing = DtdProcessing.Prohibit,
90+
MaxCharactersInDocument = MaxXmpMetadataBytes,
91+
XmlResolver = null
92+
};
93+
using var stringReader = new StringReader(rawXml!);
94+
using XmlReader reader = XmlReader.Create(stringReader, settings);
95+
return XDocument.Load(reader, LoadOptions.None);
8696
} catch (Exception ex) when (ex is System.Xml.XmlException || ex is InvalidOperationException) {
8797
return null;
8898
}

OfficeIMO.Tests/Pdf/PdfInspectorViewerMetadataPreflightTests.cs

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,34 @@ public void Inspect_ReadsXmpIdentificationFieldsByNamespaceUri() {
223223
Assert.Equal("BASIC", xmp.ElectronicInvoiceConformanceLevel);
224224
}
225225

226+
/// <summary>
/// Verifies that an XMP payload carrying a DOCTYPE with an internal entity is still surfaced
/// as raw text but is reported as not well-formed, with no title or subjects extracted.
/// </summary>
[Fact]
227+
public void Inspect_XmpMetadataRejectsDtdEntityExpansion() {
228+
// The payload declares an internal entity and references it from element content.
const string xmp = "<!DOCTYPE xmp [<!ENTITY boom \"expanded\">]><xmp>&boom;</xmp>";
229+
230+
PdfDocumentInfo info = PdfInspector.Inspect(BuildXmpMetadataPdfWithPayload(xmp));
231+
232+
Assert.True(info.HasXmpMetadata);
233+
PdfXmpMetadataInfo metadata = Assert.IsType<PdfXmpMetadataInfo>(info.XmpMetadata);
234+
// The payload is pure ASCII, so its character count equals its UTF-8 byte count.
Assert.Equal(xmp.Length, metadata.DecodedSizeBytes);
235+
// The raw text remains available even though it is not accepted as XML.
Assert.Contains("<!DOCTYPE", metadata.RawXml, StringComparison.Ordinal);
236+
Assert.False(metadata.IsWellFormedXml);
237+
Assert.Null(metadata.Title);
238+
Assert.Empty(metadata.Subjects);
239+
}
240+
241+
/// <summary>
/// Verifies that a metadata payload one byte larger than
/// <c>PdfReadDocument.MaxXmpMetadataBytes</c> still reports its decoded size, while the raw
/// XML text is left null and the payload is reported as not well-formed.
/// </summary>
[Fact]
242+
public void Inspect_XmpMetadataOverLimitKeepsSizeButDoesNotMaterializeRawXml() {
243+
// A run of ASCII 'x' characters exactly one past the limit (one byte per character in UTF-8).
string xmp = new('x', PdfReadDocument.MaxXmpMetadataBytes + 1);
244+
245+
PdfDocumentInfo info = PdfInspector.Inspect(BuildXmpMetadataPdfWithPayload(xmp));
246+
247+
Assert.True(info.HasXmpMetadata);
248+
PdfXmpMetadataInfo metadata = Assert.IsType<PdfXmpMetadataInfo>(info.XmpMetadata);
249+
// The size is still reported for the oversized stream.
Assert.Equal(PdfReadDocument.MaxXmpMetadataBytes + 1, metadata.DecodedSizeBytes);
250+
// Over the limit, no text is decoded and nothing is parsed as XML.
Assert.Null(metadata.RawXml);
251+
Assert.False(metadata.IsWellFormedXml);
252+
}
253+
226254
[Fact]
227255
public void Preflight_AllowsSimpleCatalogUriPdfReadAndRewrite() {
228256
PdfDocumentPreflight report = PdfInspector.Preflight(BuildCatalogUriPdf());
@@ -363,5 +391,36 @@ public void Inspect_DecodesFilteredOutputIntentProfileBeforeReadingIccHeader() {
363391
Assert.True(outputIntent.DestinationOutputProfileHasIccSignature);
364392
}
365393

394+
/// <summary>
/// Builds the bytes of a minimal one-page PDF whose catalog references object 5 as its
/// /Metadata stream, with <paramref name="xmp"/> written verbatim as that stream's content.
/// </summary>
/// <param name="xmp">Text placed between the <c>stream</c> and <c>endstream</c> lines of object 5.</param>
/// <returns>The newline-joined PDF text encoded as UTF-8.</returns>
private static byte[] BuildXmpMetadataPdfWithPayload(string xmp) {
395+
// NOTE(review): the text contains no xref table or startxref entry; presumably the reader
// locates objects by scanning the file — confirm against the parser.
string pdf = string.Join("\n", new[] {
396+
"%PDF-1.4",
397+
"1 0 obj",
398+
"<< /Type /Catalog /Pages 2 0 R /Metadata 5 0 R >>",
399+
"endobj",
400+
"2 0 obj",
401+
"<< /Type /Pages /Count 1 /Kids [3 0 R] >>",
402+
"endobj",
403+
"3 0 obj",
404+
"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 200 200] /Contents 4 0 R >>",
405+
"endobj",
406+
"4 0 obj",
407+
"<< /Length 0 >>",
408+
"stream",
409+
"",
410+
"endstream",
411+
"endobj",
412+
"5 0 obj",
413+
// /Length is the UTF-8 byte count of the payload, formatted with the invariant culture.
"<< /Type /Metadata /Subtype /XML /Length " + System.Text.Encoding.UTF8.GetByteCount(xmp).ToString(System.Globalization.CultureInfo.InvariantCulture) + " >>",
414+
"stream",
415+
xmp,
416+
"endstream",
417+
"endobj",
418+
"trailer",
419+
"<< /Root 1 0 R /Size 6 >>",
420+
"%%EOF"
421+
});
422+
423+
return System.Text.Encoding.UTF8.GetBytes(pdf);
424+
}
366425

367426
}

0 commit comments

Comments
 (0)