Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2289,6 +2289,85 @@ private <T extends PDTerminalField> void registerNewField(
acroForm.getFields().add(field);
}

/**
 * Removes AcroForm fields that no longer have a widget on any page of {@code document}.
 *
 * <p>Does nothing when {@code document}, its catalog, or its AcroForm is {@code null}, or when
 * the form has no top-level fields. If pruning leaves no fields at all, the AcroForm is removed
 * from the catalog; otherwise the form's top-level field list is replaced only when its size
 * changed.
 *
 * @param document the document to clean up; may be {@code null}
 */
public void pruneOrphanedFormFields(PDDocument document) {
    PDDocumentCatalog catalog = (document == null) ? null : document.getDocumentCatalog();
    PDAcroForm form = (catalog == null) ? null : catalog.getAcroForm(null);
    if (form == null) {
        return;
    }

    List<PDField> topLevel = form.getFields();
    if (topLevel.isEmpty()) {
        return;
    }

    // Widgets are collected first, then the field tree is filtered against them.
    List<PDField> survivors = pruneFieldList(topLevel, collectLiveWidgetDictionaries(document));
    if (survivors.isEmpty()) {
        // No field survived: drop the whole form instead of keeping an empty one.
        catalog.setAcroForm(null);
        return;
    }
    if (survivors.size() != topLevel.size()) {
        form.setFields(survivors);
    }
}

/**
 * Gathers the COS dictionaries of every widget annotation still attached to a page.
 *
 * <p>Pages are visited by index from {@code 0} to {@code getNumberOfPages() - 1}. A page whose
 * annotations cannot be read is logged at debug level and skipped.
 *
 * @param document the document whose pages are scanned
 * @return the widget dictionaries found on the document's pages
 */
private Set<COSDictionary> collectLiveWidgetDictionaries(PDDocument document) {
Set<COSDictionary> live = new HashSet<>();
int pageCount = document.getNumberOfPages();
for (int i = 0; i < pageCount; i++) {
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Potential place for threaded implementation. If you got a chonky doc that is. May not be worth it

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this is worth doing, used very big docs to test and had no issue nad I don't think the overhead for this is all that significant

try {
for (PDAnnotation annotation : document.getPage(i).getAnnotations()) {
// Only widget annotations back form fields; other annotation types are ignored.
if (annotation instanceof PDAnnotationWidget) {
live.add(annotation.getCOSObject());
}
}
} catch (IOException e) {
// NOTE(review): a page that fails here contributes no widgets, so fields whose widgets
// sit only on that page will look orphaned to pruneFieldList — confirm this is acceptable.
log.debug("Failed reading page {} annotations: {}", i, e.getMessage());
}
}
return live;
}

/**
 * Recursively filters a list of form fields down to those that still have a live widget.
 *
 * <p>A non-terminal field is kept only if at least one of its children survives; its child
 * list is rewritten when some children were dropped. A terminal field is kept only if at least
 * one of its widgets is in {@code liveWidgets}; its widget list is rewritten when some widgets
 * were dropped. Fields of any other type are kept unchanged. The {@code fields} list itself is
 * not modified, but the field objects in it may be.
 *
 * @param fields the fields to examine at this level of the tree
 * @param liveWidgets widget dictionaries that are still attached to a page
 * @return a new list holding the surviving fields, in their original order
 */
private List<PDField> pruneFieldList(List<PDField> fields, Set<COSDictionary> liveWidgets) {
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fancy recursion! What happens if the number of fields get really big?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's on tree depth not field count. This matches how pdf box does it if this fails due to tree depth then we have biger problems

List<PDField> kept = new ArrayList<>(fields.size());
for (PDField field : fields) {
if (field instanceof PDNonTerminalField nonTerminal) {
// Recurse first so the decision for this node is based on its already-pruned subtree.
List<PDField> children = nonTerminal.getChildren();
List<PDField> remaining = pruneFieldList(children, liveWidgets);
Comment thread
reecebrowne marked this conversation as resolved.
if (remaining.isEmpty()) {
// No child survived, so this grouping field is dropped as well.
continue;
}
if (remaining.size() != children.size()) {
nonTerminal.setChildren(remaining);
}
kept.add(nonTerminal);
} else if (field instanceof PDTerminalField terminal) {
List<PDAnnotationWidget> widgets = terminal.getWidgets();
List<PDAnnotationWidget> liveOnes = new ArrayList<>(widgets.size());
for (PDAnnotationWidget widget : widgets) {
if (liveWidgets.contains(widget.getCOSObject())) {
liveOnes.add(widget);
}
}
if (liveOnes.isEmpty()) {
// None of this field's widgets is on a page any more.
continue;
}
if (liveOnes.size() != widgets.size()) {
// Keep the field but detach the widgets that are no longer on a page.
terminal.setWidgets(liveOnes);
}
kept.add(terminal);
} else {
// Neither terminal nor non-terminal: left untouched.
kept.add(field);
}
}
return kept;
}

// Delegation methods to GeneralFormCopyUtils for form field transformation
public boolean hasAnyRotatedPage(PDDocument document) {
return stirling.software.common.util.GeneralFormCopyUtils.hasAnyRotatedPage(document);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
package stirling.software.common.util;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationWidget;
import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
import org.apache.pdfbox.pdmodel.interactive.form.PDField;
import org.apache.pdfbox.pdmodel.interactive.form.PDNonTerminalField;
import org.apache.pdfbox.pdmodel.interactive.form.PDTextField;
import org.junit.jupiter.api.Test;

/**
 * Tests for {@code FormUtils.pruneOrphanedFormFields}. Each case builds a small in-memory PDF,
 * removes pages, prunes, and then inspects the remaining AcroForm.
 */
class FormUtilsPruneOrphanedFieldsTest {

// A document that never had an AcroForm must still have none after pruning.
@Test
void noAcroFormIsNoOp() throws IOException {
try (PDDocument document = new PDDocument()) {
document.addPage(new PDPage(PDRectangle.A4));
FormUtils.pruneOrphanedFormFields(document);
assertNull(document.getDocumentCatalog().getAcroForm(null));
}
}

// Three pages with one field each; after removing pages 2 and 0 only "field_1" may remain.
@Test
void dropsFieldsWhoseWidgetsAreAllOnRemovedPages() throws IOException {
byte[] pdfBytes = buildPdfWithFieldPerPage(3);

try (PDDocument document = Loader.loadPDF(pdfBytes)) {
// Remove the higher index first so the lower index still refers to the original page.
document.removePage(2);
document.removePage(0);

FormUtils.pruneOrphanedFormFields(document);

PDAcroForm form = document.getDocumentCatalog().getAcroForm(null);
assertNotNull(form);
List<String> remainingNames = new ArrayList<>();
for (PDField field : form.getFields()) {
remainingNames.add(field.getPartialName());
}
assertEquals(List.of("field_1"), remainingNames);
}
}

// When every field-bearing page is removed, the AcroForm itself must be removed.
@Test
void dropsAcroFormEntirelyWhenNoFieldsSurvive() throws IOException {
byte[] pdfBytes = buildPdfWithFieldPerPage(2);

try (PDDocument document = Loader.loadPDF(pdfBytes)) {
document.removePage(1);
document.removePage(0);
// Add a fresh page that carries no widgets.
document.addPage(new PDPage(PDRectangle.A4));

FormUtils.pruneOrphanedFormFields(document);

assertNull(document.getDocumentCatalog().getAcroForm(null));
}
}

// One field with a widget on each of three pages; removing one page must drop only that widget.
@Test
void keepsLiveWidgetsAndDropsOrphanWidgetsFromMultiWidgetField() throws IOException {
byte[] pdfBytes = buildPdfWithMultiWidgetField();

try (PDDocument document = Loader.loadPDF(pdfBytes)) {
document.removePage(0);

FormUtils.pruneOrphanedFormFields(document);

PDAcroForm form = document.getDocumentCatalog().getAcroForm(null);
assertNotNull(form);
assertEquals(1, form.getFields().size());
PDField field = form.getFields().get(0);
assertEquals("multi", field.getPartialName());
assertEquals(2, field.getWidgets().size(), "two widgets remain after one is dropped");
}
}

// Prune, save, and reload: the written file must contain one page and only "field_0".
@Test
void survivesRoundTripWithoutOrphanPagesInOutput() throws IOException {
byte[] pdfBytes = buildPdfWithFieldPerPage(3);

byte[] writtenBytes;
try (PDDocument document = Loader.loadPDF(pdfBytes)) {
document.removePage(2);
document.removePage(1);
FormUtils.pruneOrphanedFormFields(document);
try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
document.save(out);
writtenBytes = out.toByteArray();
}
}

try (PDDocument reloaded = Loader.loadPDF(writtenBytes)) {
assertEquals(1, reloaded.getNumberOfPages());
PDAcroForm form = reloaded.getDocumentCatalog().getAcroForm(null);
assertNotNull(form);
assertEquals(1, form.getFields().size());
assertEquals("field_0", form.getFields().get(0).getPartialName());
}
}

// A group with one child per page; removing the second page must leave the group with "kept" only.
@Test
void prunesNestedNonTerminalFields() throws IOException {
byte[] pdfBytes = buildPdfWithNestedFields();

try (PDDocument document = Loader.loadPDF(pdfBytes)) {
document.removePage(1);

FormUtils.pruneOrphanedFormFields(document);

PDAcroForm form = document.getDocumentCatalog().getAcroForm(null);
assertNotNull(form);
assertEquals(1, form.getFields().size());
PDField group = form.getFields().get(0);
assertEquals("group", group.getPartialName());
assertTrue(group instanceof PDNonTerminalField);
PDNonTerminalField nonTerminal = (PDNonTerminalField) group;
assertEquals(1, nonTerminal.getChildren().size());
assertEquals("kept", nonTerminal.getChildren().get(0).getPartialName());
}
}

/**
 * Builds a PDF with {@code pageCount} A4 pages, each carrying one text field named
 * {@code field_<index>} with a single widget on that page, and returns the saved bytes.
 */
private static byte[] buildPdfWithFieldPerPage(int pageCount) throws IOException {
try (PDDocument document = new PDDocument()) {
PDAcroForm acroForm = new PDAcroForm(document);
acroForm.setDefaultResources(new PDResources());
document.getDocumentCatalog().setAcroForm(acroForm);

for (int i = 0; i < pageCount; i++) {
PDPage page = new PDPage(PDRectangle.A4);
document.addPage(page);

PDTextField field = new PDTextField(acroForm);
field.setPartialName("field_" + i);
PDAnnotationWidget widget = new PDAnnotationWidget();
widget.setRectangle(new PDRectangle(50, 50, 100, 20));
widget.setPage(page);
field.setWidgets(List.of(widget));
acroForm.getFields().add(field);
// The widget is registered both on the field and in the page's annotation list.
page.getAnnotations().add(widget);
}

try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
document.save(out);
return out.toByteArray();
}
}
}

/**
 * Builds a three-page PDF with a single text field named {@code multi} that owns one widget on
 * each page, and returns the saved bytes.
 */
private static byte[] buildPdfWithMultiWidgetField() throws IOException {
try (PDDocument document = new PDDocument()) {
PDAcroForm acroForm = new PDAcroForm(document);
acroForm.setDefaultResources(new PDResources());
document.getDocumentCatalog().setAcroForm(acroForm);

List<PDAnnotationWidget> widgets = new ArrayList<>();
for (int i = 0; i < 3; i++) {
PDPage page = new PDPage(PDRectangle.A4);
document.addPage(page);
PDAnnotationWidget widget = new PDAnnotationWidget();
widget.setRectangle(new PDRectangle(50, 50, 100, 20));
widget.setPage(page);
page.getAnnotations().add(widget);
widgets.add(widget);
}

PDTextField field = new PDTextField(acroForm);
field.setPartialName("multi");
field.setWidgets(widgets);
acroForm.getFields().add(field);

try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
document.save(out);
return out.toByteArray();
}
}
}

/**
 * Builds a two-page PDF whose form has one non-terminal field {@code group} with two text-field
 * children: {@code kept} (widget on the first page) and {@code dropped} (widget on the second
 * page). Returns the saved bytes.
 */
private static byte[] buildPdfWithNestedFields() throws IOException {
try (PDDocument document = new PDDocument()) {
PDAcroForm acroForm = new PDAcroForm(document);
acroForm.setDefaultResources(new PDResources());
document.getDocumentCatalog().setAcroForm(acroForm);

PDPage pageA = new PDPage(PDRectangle.A4);
PDPage pageB = new PDPage(PDRectangle.A4);
document.addPage(pageA);
document.addPage(pageB);

PDNonTerminalField group = new PDNonTerminalField(acroForm);
group.setPartialName("group");

PDTextField kept = new PDTextField(acroForm);
kept.setPartialName("kept");
PDAnnotationWidget keptWidget = new PDAnnotationWidget();
keptWidget.setRectangle(new PDRectangle(50, 50, 100, 20));
keptWidget.setPage(pageA);
kept.setWidgets(List.of(keptWidget));
pageA.getAnnotations().add(keptWidget);

PDTextField dropped = new PDTextField(acroForm);
dropped.setPartialName("dropped");
Comment on lines +202 to +213
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are these names shown to the user? Does it matter if they are english only?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are just for tests. Shouldn't have added this file though it can just be moved to formutilstest

PDAnnotationWidget droppedWidget = new PDAnnotationWidget();
droppedWidget.setRectangle(new PDRectangle(50, 100, 100, 20));
droppedWidget.setPage(pageB);
dropped.setWidgets(List.of(droppedWidget));
pageB.getAnnotations().add(droppedWidget);

// Only the group is a top-level field; both text fields hang beneath it.
group.setChildren(List.of(kept, dropped));
acroForm.getFields().add(group);

try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
document.save(out);
return out.toByteArray();
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,11 @@
import java.util.List;
import java.util.Locale;

import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
import org.springframework.core.io.Resource;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
Expand All @@ -27,6 +30,7 @@
import stirling.software.common.annotations.api.GeneralApi;
import stirling.software.common.service.CustomPDFDocumentFactory;
import stirling.software.common.util.ExceptionUtils;
import stirling.software.common.util.FormUtils;
import stirling.software.common.util.GeneralUtils;
import stirling.software.common.util.TempFileManager;
import stirling.software.common.util.WebResponseUtils;
Expand Down Expand Up @@ -67,6 +71,7 @@ public ResponseEntity<Resource> deletePages(@ModelAttribute PDFWithPageNums requ
int pageIndex = pagesToRemove.get(i);
document.removePage(pageIndex);
}
FormUtils.pruneOrphanedFormFields(document);
return WebResponseUtils.pdfDocToWebResponse(
document,
GeneralUtils.generateFilename(
Expand Down Expand Up @@ -265,6 +270,17 @@ public ResponseEntity<Resource> rearrangePages(@ModelAttribute RearrangePagesReq
rearrangedDocument.addPage(page);
}

PDDocumentCatalog sourceCatalog = document.getDocumentCatalog();
if (sourceCatalog != null) {
PDAcroForm sourceForm = sourceCatalog.getAcroForm(null);
if (sourceForm != null) {
rearrangedDocument
.getDocumentCatalog()
.getCOSObject()
.setItem(COSName.ACRO_FORM, sourceForm.getCOSObject());
}
}

return WebResponseUtils.pdfDocToWebResponse(
rearrangedDocument,
GeneralUtils.generateFilename(
Expand Down
Loading
Loading