Skip to content

Commit 0a635aa

Browse files
Bound XMP stream decoding before XML parsing
1 parent 9064604 commit 0a635aa

3 files changed

Lines changed: 142 additions & 3 deletions

File tree

OfficeIMO.Pdf/Reading/Core/PdfReadDocument.XmpMetadata.cs

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -25,15 +25,16 @@ public sealed partial class PdfReadDocument {
2525
return null;
2626
}
2727

28-
byte[] decoded = StreamDecoder.Decode(stream.Dictionary, stream.Data, _objects);
29-
string? rawXml = decoded.Length <= MaxXmpMetadataBytes ? DecodeMetadataText(decoded) : null;
28+
bool decodedWithinLimit = StreamDecoder.TryDecode(stream.Dictionary, stream.Data, MaxXmpMetadataBytes, out byte[] decoded, _objects);
29+
string? rawXml = decodedWithinLimit ? DecodeMetadataText(decoded) : null;
30+
int decodedSizeBytes = decodedWithinLimit ? decoded.Length : MaxXmpMetadataBytes + 1;
3031
XDocument? document = rawXml is null ? null : TryParseXml(rawXml);
3132
return new PdfXmpMetadataInfo(
3233
objectNumber,
3334
TryReadName(stream.Dictionary, "Subtype"),
3435
TryReadStreamFilter(stream),
3536
stream.Data.Length,
36-
decoded.Length,
37+
decodedSizeBytes,
3738
StreamDecoder.GetUnsupportedFilters(stream.Dictionary, _objects).AsReadOnly(),
3839
rawXml,
3940
document is not null,

OfficeIMO.Pdf/Reading/Filters/StreamDecoder.cs

Lines changed: 84 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -53,6 +53,76 @@ public static byte[] Decode(PdfDictionary dict, byte[] data, Dictionary<int, Pdf
5353
return current;
5454
}
5555

56+
/// <summary>
/// Applies the stream's <c>/Filter</c> chain to <paramref name="data"/> and reports whether the
/// result fits within <paramref name="maxOutputBytes"/>.
/// </summary>
/// <param name="dict">Stream dictionary; its <c>Filter</c> entry (and decode parameters) drive decoding.</param>
/// <param name="data">Raw stream bytes. <c>null</c> is treated like an empty payload.</param>
/// <param name="maxOutputBytes">Upper bound on the size of every stage's output. A negative value always fails.</param>
/// <param name="decoded">
/// On success, the decoded bytes (or the original bytes when no decoding was applied).
/// On failure, an empty array.
/// </param>
/// <param name="objects">Optional indirect-object table passed through to the filter helpers.</param>
/// <returns><c>true</c> when a result within the limit was produced; otherwise <c>false</c>.</returns>
/// <remarks>
/// Only the Flate stage hands the limit to its decoder (<c>FlateDecoder.TryDecode</c>); the other
/// filters are decoded first and their output is measured afterwards. An unrecognised filter kind,
/// or any exception raised while decoding a stage, falls back to the original undecoded bytes,
/// which are still subject to the same limit.
/// </remarks>
public static bool TryDecode(PdfDictionary dict, byte[] data, int maxOutputBytes, out byte[] decoded, Dictionary<int, PdfIndirectObject>? objects = null) {
    decoded = Array.Empty<byte>();
    if (maxOutputBytes < 0) {
        return false;
    }

    // Nothing to decode: no payload, or the dictionary declares no filter.
    if (data is null || data.Length == 0 || !dict.Items.TryGetValue("Filter", out var filterObj)) {
        return TryUseOriginal(data ?? Array.Empty<byte>(), maxOutputBytes, out decoded);
    }

    byte[] working = data;
    int stage = 0;
    foreach (string filterName in EnumerateFilters(filterObj, objects)) {
        try {
            switch (GetFilterKind(filterName)) {
                case DecodeFilterKind.Flate: {
                    if (!FlateDecoder.TryDecode(working, maxOutputBytes, out byte[] inflated)) {
                        return false;
                    }

                    working = ApplyDecodeParms(dict, stage, inflated, objects);
                    break;
                }
                case DecodeFilterKind.AsciiHex:
                    working = AsciiHexDecoder.Decode(working);
                    break;
                case DecodeFilterKind.Ascii85:
                    working = Ascii85Decoder.Decode(working);
                    break;
                case DecodeFilterKind.RunLength:
                    working = RunLengthDecoder.Decode(working);
                    break;
                case DecodeFilterKind.Lzw: {
                    byte[] expanded = LzwDecoder.Decode(working, GetEarlyChange(dict, stage, objects));
                    working = ApplyDecodeParms(dict, stage, expanded, objects);
                    break;
                }
                default:
                    // Filter kind not handled here: hand back the untouched stream bytes instead.
                    return TryUseOriginal(data, maxOutputBytes, out decoded);
            }

            // Every handled stage must stay within the budget, not just the final one.
            if (!IsWithinLimit(working, maxOutputBytes)) {
                return false;
            }
        } catch {
            // Best effort: a stage that throws yields the original bytes rather than an error.
            return TryUseOriginal(data, maxOutputBytes, out decoded);
        }

        stage++;
    }

    decoded = working;
    return true;
}
125+
56126
internal static List<string> GetUnsupportedFilters(PdfDictionary dict, Dictionary<int, PdfIndirectObject>? objects = null) {
57127
if (!dict.Items.TryGetValue("Filter", out var filterObj)) {
58128
return new List<string>(0);
@@ -104,6 +174,20 @@ private static bool ContainsFilter(List<string> filters, string filterName) {
104174
return false;
105175
}
106176

177+
/// <summary>
/// Passes <paramref name="data"/> through unchanged when it fits within <paramref name="maxOutputBytes"/>.
/// </summary>
/// <param name="data">Bytes to return as-is.</param>
/// <param name="maxOutputBytes">Maximum number of bytes allowed.</param>
/// <param name="decoded">The same array instance as <paramref name="data"/> on success; an empty array otherwise.</param>
/// <returns><c>true</c> when <paramref name="data"/> is within the limit; otherwise <c>false</c>.</returns>
private static bool TryUseOriginal(byte[] data, int maxOutputBytes, out byte[] decoded) {
    if (IsWithinLimit(data, maxOutputBytes)) {
        decoded = data;
        return true;
    }

    decoded = Array.Empty<byte>();
    return false;
}
186+
187+
/// <summary>
/// Returns <c>true</c> when <paramref name="data"/> holds no more than <paramref name="maxOutputBytes"/> bytes.
/// </summary>
private static bool IsWithinLimit(byte[] data, int maxOutputBytes) => maxOutputBytes >= data.LongLength;
190+
107191
private static byte[] ApplyDecodeParms(PdfDictionary dict, int filterIndex, byte[] data, Dictionary<int, PdfIndirectObject>? objects) {
108192
var decodeParms = GetDecodeParms(dict, filterIndex, objects);
109193
if (decodeParms is null) {

OfficeIMO.Tests/Pdf/PdfInspectorViewerMetadataPreflightTests.cs

Lines changed: 54 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -251,6 +251,20 @@ public void Inspect_XmpMetadataOverLimitKeepsSizeButDoesNotMaterializeRawXml() {
251251
Assert.False(metadata.IsWellFormedXml);
252252
}
253253

254+
[Fact]
public void Inspect_CompressedXmpMetadataOverLimitDoesNotMaterializeDecodedXml() {
    // Arrange: a payload exactly one character over the XMP limit, stored compressed.
    int limit = PdfReadDocument.MaxXmpMetadataBytes;
    string oversizedXmp = new string('x', limit + 1);
    byte[] pdfBytes = BuildCompressedXmpMetadataPdfWithPayload(oversizedXmp);

    // Act
    PdfDocumentInfo info = PdfInspector.Inspect(pdfBytes);

    // Assert: the stored stream is under the limit, the decoded size is reported as
    // limit + 1, and no XML is exposed.
    Assert.True(info.HasXmpMetadata);
    PdfXmpMetadataInfo metadata = Assert.IsType<PdfXmpMetadataInfo>(info.XmpMetadata);
    Assert.True(metadata.StreamSizeBytes < limit);
    Assert.Equal(limit + 1, metadata.DecodedSizeBytes);
    Assert.Null(metadata.RawXml);
    Assert.False(metadata.IsWellFormedXml);
}
267+
254268
[Fact]
255269
public void Preflight_AllowsSimpleCatalogUriPdfReadAndRewrite() {
256270
PdfDocumentPreflight report = PdfInspector.Preflight(BuildCatalogUriPdf());
@@ -423,4 +437,44 @@ private static byte[] BuildXmpMetadataPdfWithPayload(string xmp) {
423437
return System.Text.Encoding.UTF8.GetBytes(pdf);
424438
}
425439

440+
// Builds a small single-page PDF whose catalog /Metadata entry points at object 5, a stream
// declared with /Filter /FlateDecode that carries the compressed UTF-8 bytes of `xmp`.
// Only a trailer dictionary is written; no xref table or startxref is emitted.
// NOTE(review): Compress is defined elsewhere in this test class; presumably it emits data that
// /FlateDecode can read. Confirm against its definition.
private static byte[] BuildCompressedXmpMetadataPdfWithPayload(string xmp) {
    byte[] deflated = Compress(System.Text.Encoding.UTF8.GetBytes(xmp));
    string lengthText = deflated.Length.ToString(System.Globalization.CultureInfo.InvariantCulture);

    // Everything up to and including the "stream" keyword of the metadata object.
    string[] beforePayload = {
        "%PDF-1.4",
        "1 0 obj",
        "<< /Type /Catalog /Pages 2 0 R /Metadata 5 0 R >>",
        "endobj",
        "2 0 obj",
        "<< /Type /Pages /Count 1 /Kids [3 0 R] >>",
        "endobj",
        "3 0 obj",
        "<< /Type /Page /Parent 2 0 R /MediaBox [0 0 200 200] /Contents 4 0 R >>",
        "endobj",
        "4 0 obj",
        "<< /Length 0 >>",
        "stream",
        "",
        "endstream",
        "endobj",
        "5 0 obj",
        "<< /Type /Metadata /Subtype /XML /Length " + lengthText + " /Filter /FlateDecode >>",
        "stream"
    };

    // Closes the metadata object and finishes the file.
    string[] afterPayload = {
        "endstream",
        "endobj",
        "trailer",
        "<< /Root 1 0 R /Size 6 >>",
        "%%EOF"
    };

    using var pdf = new MemoryStream();
    WriteAscii(pdf, string.Join("\n", beforePayload) + "\n");
    // The compressed bytes are binary, so they are written directly rather than through WriteAscii.
    pdf.Write(deflated, 0, deflated.Length);
    WriteAscii(pdf, "\n" + string.Join("\n", afterPayload));
    return pdf.ToArray();
}
474+
475+
// Encodes `value` as ASCII and appends the resulting bytes to `stream`.
private static void WriteAscii(Stream stream, string value) {
    System.Text.Encoding ascii = System.Text.Encoding.ASCII;
    byte[] encoded = ascii.GetBytes(value);
    stream.Write(encoded, 0, encoded.Length);
}
479+
426480
}

0 commit comments

Comments
 (0)