Skip to content

Commit ce48286

Browse files
authored
feat: implement fast metadata-only save for PDFs to optimize performance (#15)
1 parent ff69997 commit ce48286

4 files changed

Lines changed: 438 additions & 4 deletions

File tree

src/main/java/org/pdfium4j/PdfDocument.java

Lines changed: 43 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ public final class PdfDocument implements AutoCloseable {
5353
private final Set<PdfPage> openPages;
5454
private volatile boolean closed = false;
5555
private boolean metadataDirty = false;
56+
private boolean structurallyModified = false;
5657
private final Map<MetadataTag, String> pendingMetadata = new LinkedHashMap<>();
5758
private String pendingXmpMetadata = null;
5859

@@ -612,6 +613,7 @@ public void deletePage(int pageIndex) {
612613
}
613614
try {
614615
EditBindings.FPDFPage_Delete.invokeExact(handle, pageIndex);
616+
structurallyModified = true;
615617
} catch (Throwable t) {
616618
throw new PdfiumException("Failed to delete page " + pageIndex, t);
617619
}
@@ -637,6 +639,7 @@ public void insertBlankPage(int pageIndex, PageSize size) {
637639
} finally {
638640
ViewBindings.FPDF_ClosePage.invokeExact(pageSeg);
639641
}
642+
structurallyModified = true;
640643
} catch (PdfiumException e) {
641644
throw e;
642645
} catch (Throwable t) {
@@ -674,6 +677,7 @@ public void importPages(PdfDocument source, String pageRange, int insertIndex) {
674677
if (ok == 0) {
675678
throw new PdfiumException("FPDF_ImportPages failed for range: " + pageRange);
676679
}
680+
structurallyModified = true;
677681
} catch (PdfiumException e) {
678682
throw e;
679683
} catch (Throwable t) {
@@ -1063,13 +1067,20 @@ public List<Bookmark> bookmarks() {
10631067
/**
10641068
* Save the document to a file.
10651069
*
1066-
* <p>This saves the current state of the document, including any
1067-
* modifications made via the API (e.g., page rotation changes).
1070+
* <p>When only metadata has been modified (no page additions, deletions,
1071+
* or imports), this uses a fast path that reads the original PDF bytes
1072+
* and appends an incremental update — avoiding the expensive native
1073+
* {@code FPDF_SaveAsCopy} serialization entirely.
10681074
*
10691075
* @param path output file path
10701076
*/
10711077
public void save(Path path) {
1072-
byte[] bytes = saveToBytes();
1078+
byte[] bytes;
1079+
if (!structurallyModified && metadataDirty) {
1080+
bytes = saveMetadataOnly();
1081+
} else {
1082+
bytes = saveToBytes();
1083+
}
10731084
try {
10741085
Files.write(path, bytes);
10751086
} catch (IOException e) {
@@ -1087,6 +1098,29 @@ public byte[] saveToBytes() {
10871098
return PdfSaver.saveToBytes(handle, pendingMetadata, pendingXmpMetadata);
10881099
}
10891100

1101+
/**
1102+
* Fast metadata-only save: reads original bytes and appends an incremental
1103+
* update with the pending Info Dictionary and/or XMP changes.
1104+
* Avoids the expensive {@code FPDF_SaveAsCopy} native serialization.
1105+
*/
1106+
private byte[] saveMetadataOnly() {
1107+
ensureOpen();
1108+
byte[] original;
1109+
if (sourceBytes != null) {
1110+
original = sourceBytes;
1111+
} else if (sourcePath != null) {
1112+
try {
1113+
original = Files.readAllBytes(sourcePath);
1114+
} catch (IOException e) {
1115+
throw new PdfiumException("Failed to read source PDF for metadata update: " + sourcePath, e);
1116+
}
1117+
} else {
1118+
// No original bytes available, fall back to full save
1119+
return saveToBytes();
1120+
}
1121+
return PdfSaver.applyIncrementalUpdate(original, pendingMetadata, pendingXmpMetadata);
1122+
}
1123+
10901124
/**
10911125
* Save the document directly to an OutputStream, suitable for streaming
10921126
* responses (e.g., HTTP responses). Note that the document is first serialized to an in-memory byte array and then written to the stream.
@@ -1095,7 +1129,12 @@ public byte[] saveToBytes() {
10951129
* @throws PdfiumException if saving fails
10961130
*/
10971131
public void save(OutputStream out) {
1098-
byte[] bytes = saveToBytes();
1132+
byte[] bytes;
1133+
if (!structurallyModified && metadataDirty) {
1134+
bytes = saveMetadataOnly();
1135+
} else {
1136+
bytes = saveToBytes();
1137+
}
10991138
try {
11001139
out.write(bytes);
11011140
} catch (IOException e) {

src/main/java/org/pdfium4j/PdfSaver.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,20 @@ final class PdfSaver {
3333

3434
private PdfSaver() {}
3535

36+
/**
37+
* Apply an incremental update to existing PDF bytes without native serialization.
38+
* This is the fast path for metadata-only changes — it reads the original file
39+
* and appends new Info/XMP objects + xref + trailer at the end.
40+
*/
41+
static byte[] applyIncrementalUpdate(byte[] originalPdf, Map<MetadataTag, String> pendingMetadata, String pendingXmp) {
42+
boolean hasInfoUpdate = pendingMetadata != null && !pendingMetadata.isEmpty();
43+
boolean hasXmpUpdate = pendingXmp != null && !pendingXmp.isEmpty();
44+
if (!hasInfoUpdate && !hasXmpUpdate) {
45+
return originalPdf;
46+
}
47+
return appendIncrementalUpdate(originalPdf, pendingMetadata, pendingXmp);
48+
}
49+
3650
static byte[] saveToBytes(MemorySegment docHandle, Map<MetadataTag, String> pendingMetadata, String pendingXmp) {
3751
ByteArrayOutputStream baos = new ByteArrayOutputStream();
3852
WRITE_BUFFER.set(baos);

src/test/java/org/pdfium4j/PdfDocumentTest.java

Lines changed: 227 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -907,4 +907,231 @@ private static byte[] minimalPdfWithText() {
907907
""";
908908
return pdf.getBytes(StandardCharsets.US_ASCII);
909909
}
910+
911+
// --- Metadata-only fast save tests ---
912+
913+
@Test
914+
@EnabledIf("pdfiumAvailable")
915+
void metadataOnlySaveIsFasterThanFullSave(@TempDir Path tempDir) throws IOException {
916+
Path testPdf = getTestPdf();
917+
if (testPdf == null) return;
918+
919+
Path outFull = tempDir.resolve("full-save.pdf");
920+
Path outFast = tempDir.resolve("fast-save.pdf");
921+
922+
// Full save (through FPDF_SaveAsCopy)
923+
try (PdfDocument doc = PdfDocument.open(testPdf)) {
924+
doc.save(outFull); // no metadata changes, triggers full save
925+
}
926+
927+
// Metadata-only save (fast path)
928+
try (PdfDocument doc = PdfDocument.open(testPdf)) {
929+
doc.setMetadata(MetadataTag.TITLE, "Fast Save Title");
930+
doc.save(outFast);
931+
}
932+
933+
// Verify both produce valid PDFs
934+
assertTrue(Files.size(outFull) > 0);
935+
assertTrue(Files.size(outFast) > 0);
936+
937+
// Verify the fast-save file contains the metadata
938+
try (PdfDocument doc = PdfDocument.open(outFast)) {
939+
assertEquals("Fast Save Title", doc.metadata(MetadataTag.TITLE).orElse(""));
940+
}
941+
}
942+
943+
@Test
944+
@EnabledIf("pdfiumAvailable")
945+
void metadataOnlySaveFromBytesWorks(@TempDir Path tempDir) throws IOException {
946+
Path testPdf = getTestPdf();
947+
if (testPdf == null) return;
948+
949+
byte[] originalBytes = Files.readAllBytes(testPdf);
950+
Path outPath = tempDir.resolve("bytes-meta.pdf");
951+
952+
try (PdfDocument doc = PdfDocument.open(originalBytes)) {
953+
doc.setMetadata(MetadataTag.TITLE, "BytesSave");
954+
doc.setMetadata(MetadataTag.AUTHOR, "Test Author");
955+
doc.save(outPath);
956+
}
957+
958+
try (PdfDocument doc = PdfDocument.open(outPath)) {
959+
assertEquals("BytesSave", doc.metadata(MetadataTag.TITLE).orElse(""));
960+
assertEquals("Test Author", doc.metadata(MetadataTag.AUTHOR).orElse(""));
961+
}
962+
}
963+
964+
@Test
965+
@EnabledIf("pdfiumAvailable")
966+
void metadataOnlySavePreservesPageContent(@TempDir Path tempDir) throws IOException {
967+
Path testPdf = getTestPdf();
968+
if (testPdf == null) return;
969+
970+
Path output = tempDir.resolve("preserve-content.pdf");
971+
972+
int originalPageCount;
973+
try (PdfDocument doc = PdfDocument.open(testPdf)) {
974+
originalPageCount = doc.pageCount();
975+
}
976+
977+
try (PdfDocument doc = PdfDocument.open(testPdf)) {
978+
doc.setMetadata(MetadataTag.TITLE, "Preserves Content");
979+
doc.setXmpMetadata(buildBookloreXmp("Preserves Content", "Test Author"));
980+
doc.save(output);
981+
}
982+
983+
try (PdfDocument doc = PdfDocument.open(output)) {
984+
assertEquals(originalPageCount, doc.pageCount());
985+
assertEquals("Preserves Content", doc.metadata(MetadataTag.TITLE).orElse(""));
986+
987+
String xmp = doc.xmpMetadataString();
988+
assertTrue(xmp.contains("Preserves Content"), "XMP should be in file");
989+
}
990+
}
991+
992+
@Test
993+
@EnabledIf("pdfiumAvailable")
994+
void structuralChangeUsesFullSave(@TempDir Path tempDir) throws IOException {
995+
Path testPdf = getTestPdf();
996+
if (testPdf == null) return;
997+
998+
Path output = tempDir.resolve("structural.pdf");
999+
1000+
try (PdfDocument doc = PdfDocument.open(testPdf)) {
1001+
doc.insertBlankPage(0, new PageSize(612, 792));
1002+
doc.setMetadata(MetadataTag.TITLE, "Structural Change");
1003+
doc.save(output);
1004+
}
1005+
1006+
try (PdfDocument doc = PdfDocument.open(output)) {
1007+
assertEquals(2, doc.pageCount(), "Should have original + inserted page");
1008+
assertEquals("Structural Change", doc.metadata(MetadataTag.TITLE).orElse(""));
1009+
}
1010+
}
1011+
1012+
// Round-trip test: writes Info Dictionary entries (title, author) together with an
// XMP packet that carries Dublin Core fields and a custom "booklore" namespace,
// saves the document, reopens it, and checks that both the Info Dictionary values
// and the XMP content (parsed fields and raw booklore elements) are present.
@Test
1013+
@EnabledIf("pdfiumAvailable")
1014+
void xmpMetadataRoundTripWithBookloreNamespace(@TempDir Path tempDir) throws IOException {
1015+
Path testPdf = getTestPdf();
1016+
// Nothing to do when no sample PDF is available; the test returns without asserting.
if (testPdf == null) return;
1017+
1018+
Path output = tempDir.resolve("booklore-xmp.pdf");
1019+
1020+
String bookloreXmp = """
1021+
<?xpacket begin="\uFEFF" id="W5M0MpCehiHzreSzNTczkc9d"?>
1022+
<x:xmpmeta xmlns:x="adobe:ns:meta/">
1023+
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
1024+
<rdf:Description rdf:about=""
1025+
xmlns:dc="http://purl.org/dc/elements/1.1/">
1026+
<dc:title><rdf:Alt><rdf:li xml:lang="x-default">Dead Simple Python</rdf:li></rdf:Alt></dc:title>
1027+
<dc:creator><rdf:Seq><rdf:li>Jason C. McDonald</rdf:li></rdf:Seq></dc:creator>
1028+
<dc:publisher><rdf:Bag><rdf:li>No Starch Press</rdf:li></rdf:Bag></dc:publisher>
1029+
<dc:subject>
1030+
<rdf:Bag>
1031+
<rdf:li>Programming</rdf:li>
1032+
<rdf:li>Python</rdf:li>
1033+
</rdf:Bag>
1034+
</dc:subject>
1035+
<dc:date><rdf:Seq><rdf:li>2023-01-01</rdf:li></rdf:Seq></dc:date>
1036+
<dc:language><rdf:Bag><rdf:li>English</rdf:li></rdf:Bag></dc:language>
1037+
</rdf:Description>
1038+
<rdf:Description rdf:about=""
1039+
xmlns:booklore="http://booklore.org/metadata/1.0/">
1040+
<booklore:subtitle>Idiomatic Python for the Impatient Programmer</booklore:subtitle>
1041+
<booklore:isbn13>9781718500921</booklore:isbn13>
1042+
<booklore:isbn10>1718500920</booklore:isbn10>
1043+
<booklore:goodreadsId>52555538</booklore:goodreadsId>
1044+
<booklore:goodreadsRating>4.4</booklore:goodreadsRating>
1045+
<booklore:pageCount>713</booklore:pageCount>
1046+
</rdf:Description>
1047+
</rdf:RDF>
1048+
</x:xmpmeta>
1049+
<?xpacket end="w"?>""";
1050+
1051+
// Write XMP + Info Dict
1052+
try (PdfDocument doc = PdfDocument.open(testPdf)) {
1053+
doc.setMetadata(MetadataTag.TITLE, "Dead Simple Python");
1054+
doc.setMetadata(MetadataTag.AUTHOR, "Jason C. McDonald");
1055+
doc.setXmpMetadata(bookloreXmp);
1056+
doc.save(output);
1057+
}
1058+
1059+
// Read back and verify BOTH Info Dict and XMP
1060+
try (PdfDocument doc = PdfDocument.open(output)) {
1061+
// Info Dict
1062+
assertEquals("Dead Simple Python", doc.metadata(MetadataTag.TITLE).orElse(""));
1063+
assertEquals("Jason C. McDonald", doc.metadata(MetadataTag.AUTHOR).orElse(""));
1064+
1065+
// XMP
1066+
String xmp = doc.xmpMetadataString();
1067+
assertFalse(xmp.isEmpty(), "XMP should be present");
1068+
1069+
// Dublin Core fields are checked through the parser.
XmpMetadata parsed = XmpMetadataParser.parse(xmp);
1070+
assertEquals("Dead Simple Python", parsed.title().orElse(""));
1071+
assertEquals(List.of("Jason C. McDonald"), parsed.creators());
1072+
assertEquals("No Starch Press", parsed.publisher().orElse(""));
1073+
assertEquals("2023-01-01", parsed.date().orElse(""));
1074+
assertEquals("English", parsed.language().orElse(""));
1075+
assertTrue(parsed.subjects().contains("Programming"));
1076+
assertTrue(parsed.subjects().contains("Python"));
1077+
1078+
// Verify raw XMP string contains booklore namespace elements
// (goodreadsRating and pageCount from the packet are not asserted here).
1079+
assertTrue(xmp.contains("booklore:subtitle"), "XMP should contain subtitle");
1080+
assertTrue(xmp.contains("Idiomatic Python"), "XMP should contain subtitle value");
1081+
assertTrue(xmp.contains("booklore:isbn13"), "XMP should contain isbn13");
1082+
assertTrue(xmp.contains("9781718500921"));
1083+
assertTrue(xmp.contains("booklore:isbn10"), "XMP should contain isbn10");
1084+
assertTrue(xmp.contains("1718500920"));
1085+
assertTrue(xmp.contains("booklore:goodreadsId"), "XMP should contain goodreadsId");
1086+
assertTrue(xmp.contains("52555538"));
1087+
}
1088+
}
1089+
1090+
@Test
1091+
@EnabledIf("pdfiumAvailable")
1092+
void xmpMetadataOverwritePrevious(@TempDir Path tempDir) throws IOException {
1093+
Path testPdf = getTestPdf();
1094+
if (testPdf == null) return;
1095+
1096+
Path firstSave = tempDir.resolve("first.pdf");
1097+
Path secondSave = tempDir.resolve("second.pdf");
1098+
1099+
// First write
1100+
try (PdfDocument doc = PdfDocument.open(testPdf)) {
1101+
doc.setMetadata(MetadataTag.TITLE, "First Title");
1102+
doc.setXmpMetadata(buildBookloreXmp("First Title", "First Author"));
1103+
doc.save(firstSave);
1104+
}
1105+
1106+
// Second write overwrites - open the FIRST save and update
1107+
try (PdfDocument doc = PdfDocument.open(firstSave)) {
1108+
doc.setMetadata(MetadataTag.TITLE, "Second Title");
1109+
doc.setXmpMetadata(buildBookloreXmp("Second Title", "Second Author"));
1110+
doc.save(secondSave);
1111+
}
1112+
1113+
// Verify the SECOND save has the NEW values, not the old ones
1114+
try (PdfDocument doc = PdfDocument.open(secondSave)) {
1115+
assertEquals("Second Title", doc.metadata(MetadataTag.TITLE).orElse(""));
1116+
1117+
XmpMetadata parsed = XmpMetadataParser.parse(doc.xmpMetadata());
1118+
assertEquals("Second Title", parsed.title().orElse(""));
1119+
assertEquals(List.of("Second Author"), parsed.creators());
1120+
}
1121+
}
1122+
1123+
// Builds a minimal XMP packet containing only a Dublin Core title and a single
// creator. The two arguments are substituted into the %s placeholders via
// String.formatted, i.e. inserted verbatim with no XML escaping, so callers
// should pass values without XML special characters.
private String buildBookloreXmp(String title, String author) {
1124+
return """
1125+
<?xpacket begin="\uFEFF" id="W5M0MpCehiHzreSzNTczkc9d"?>
1126+
<x:xmpmeta xmlns:x="adobe:ns:meta/">
1127+
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
1128+
<rdf:Description rdf:about=""
1129+
xmlns:dc="http://purl.org/dc/elements/1.1/">
1130+
<dc:title><rdf:Alt><rdf:li xml:lang="x-default">%s</rdf:li></rdf:Alt></dc:title>
1131+
<dc:creator><rdf:Seq><rdf:li>%s</rdf:li></rdf:Seq></dc:creator>
1132+
</rdf:Description>
1133+
</rdf:RDF>
1134+
</x:xmpmeta>
1135+
<?xpacket end="w"?>""".formatted(title, author);
1136+
}
9101137
}

0 commit comments

Comments
 (0)