-
Notifications
You must be signed in to change notification settings - Fork 6.8k
Split and delete forms #6277
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Split and delete forms #6277
Changes from 3 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2289,6 +2289,85 @@ private <T extends PDTerminalField> void registerNewField( | |
| acroForm.getFields().add(field); | ||
| } | ||
|
|
||
| /** Drops AcroForm fields whose widgets are no longer on any page of {@code document}. */ | ||
| public void pruneOrphanedFormFields(PDDocument document) { | ||
| if (document == null) { | ||
| return; | ||
| } | ||
| PDDocumentCatalog catalog = document.getDocumentCatalog(); | ||
| if (catalog == null) { | ||
| return; | ||
| } | ||
| PDAcroForm form = catalog.getAcroForm(null); | ||
| if (form == null) { | ||
| return; | ||
| } | ||
| List<PDField> fields = form.getFields(); | ||
| if (fields.isEmpty()) { | ||
| return; | ||
| } | ||
|
|
||
| Set<COSDictionary> liveWidgets = collectLiveWidgetDictionaries(document); | ||
| List<PDField> kept = pruneFieldList(fields, liveWidgets); | ||
| if (kept.isEmpty()) { | ||
| catalog.setAcroForm(null); | ||
| } else if (kept.size() != fields.size()) { | ||
| form.setFields(kept); | ||
| } | ||
| } | ||
|
|
||
| private Set<COSDictionary> collectLiveWidgetDictionaries(PDDocument document) { | ||
| Set<COSDictionary> live = new HashSet<>(); | ||
| int pageCount = document.getNumberOfPages(); | ||
| for (int i = 0; i < pageCount; i++) { | ||
| try { | ||
| for (PDAnnotation annotation : document.getPage(i).getAnnotations()) { | ||
| if (annotation instanceof PDAnnotationWidget) { | ||
| live.add(annotation.getCOSObject()); | ||
| } | ||
| } | ||
| } catch (IOException e) { | ||
| log.debug("Failed reading page {} annotations: {}", i, e.getMessage()); | ||
| } | ||
| } | ||
| return live; | ||
| } | ||
|
|
||
| private List<PDField> pruneFieldList(List<PDField> fields, Set<COSDictionary> liveWidgets) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fancy recursion! What happens if the number of fields gets really big?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The recursion depth depends on the depth of the field tree, not on the number of fields. This matches how PDFBox does it; if this fails because of tree depth, then we have bigger problems. |
||
| List<PDField> kept = new ArrayList<>(fields.size()); | ||
| for (PDField field : fields) { | ||
| if (field instanceof PDNonTerminalField nonTerminal) { | ||
| List<PDField> children = nonTerminal.getChildren(); | ||
| List<PDField> remaining = pruneFieldList(children, liveWidgets); | ||
|
reecebrowne marked this conversation as resolved.
|
||
| if (remaining.isEmpty()) { | ||
| continue; | ||
| } | ||
| if (remaining.size() != children.size()) { | ||
| nonTerminal.setChildren(remaining); | ||
| } | ||
| kept.add(nonTerminal); | ||
| } else if (field instanceof PDTerminalField terminal) { | ||
| List<PDAnnotationWidget> widgets = terminal.getWidgets(); | ||
| List<PDAnnotationWidget> liveOnes = new ArrayList<>(widgets.size()); | ||
| for (PDAnnotationWidget widget : widgets) { | ||
| if (liveWidgets.contains(widget.getCOSObject())) { | ||
| liveOnes.add(widget); | ||
| } | ||
| } | ||
| if (liveOnes.isEmpty()) { | ||
| continue; | ||
| } | ||
| if (liveOnes.size() != widgets.size()) { | ||
| terminal.setWidgets(liveOnes); | ||
| } | ||
| kept.add(terminal); | ||
| } else { | ||
| kept.add(field); | ||
| } | ||
| } | ||
| return kept; | ||
| } | ||
|
|
||
| // Delegation methods to GeneralFormCopyUtils for form field transformation | ||
| public boolean hasAnyRotatedPage(PDDocument document) { | ||
| return stirling.software.common.util.GeneralFormCopyUtils.hasAnyRotatedPage(document); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,229 @@ | ||
| package stirling.software.common.util; | ||
|
|
||
| import static org.junit.jupiter.api.Assertions.assertEquals; | ||
| import static org.junit.jupiter.api.Assertions.assertNotNull; | ||
| import static org.junit.jupiter.api.Assertions.assertNull; | ||
| import static org.junit.jupiter.api.Assertions.assertTrue; | ||
|
|
||
| import java.io.ByteArrayOutputStream; | ||
| import java.io.IOException; | ||
| import java.util.ArrayList; | ||
| import java.util.List; | ||
|
|
||
| import org.apache.pdfbox.Loader; | ||
| import org.apache.pdfbox.pdmodel.PDDocument; | ||
| import org.apache.pdfbox.pdmodel.PDPage; | ||
| import org.apache.pdfbox.pdmodel.PDResources; | ||
| import org.apache.pdfbox.pdmodel.common.PDRectangle; | ||
| import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationWidget; | ||
| import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm; | ||
| import org.apache.pdfbox.pdmodel.interactive.form.PDField; | ||
| import org.apache.pdfbox.pdmodel.interactive.form.PDNonTerminalField; | ||
| import org.apache.pdfbox.pdmodel.interactive.form.PDTextField; | ||
| import org.junit.jupiter.api.Test; | ||
|
|
||
| class FormUtilsPruneOrphanedFieldsTest { | ||
|
|
||
| @Test | ||
| void noAcroFormIsNoOp() throws IOException { | ||
| try (PDDocument document = new PDDocument()) { | ||
| document.addPage(new PDPage(PDRectangle.A4)); | ||
| FormUtils.pruneOrphanedFormFields(document); | ||
| assertNull(document.getDocumentCatalog().getAcroForm(null)); | ||
| } | ||
| } | ||
|
|
||
| @Test | ||
| void dropsFieldsWhoseWidgetsAreAllOnRemovedPages() throws IOException { | ||
| byte[] pdfBytes = buildPdfWithFieldPerPage(3); | ||
|
|
||
| try (PDDocument document = Loader.loadPDF(pdfBytes)) { | ||
| document.removePage(2); | ||
| document.removePage(0); | ||
|
|
||
| FormUtils.pruneOrphanedFormFields(document); | ||
|
|
||
| PDAcroForm form = document.getDocumentCatalog().getAcroForm(null); | ||
| assertNotNull(form); | ||
| List<String> remainingNames = new ArrayList<>(); | ||
| for (PDField field : form.getFields()) { | ||
| remainingNames.add(field.getPartialName()); | ||
| } | ||
| assertEquals(List.of("field_1"), remainingNames); | ||
| } | ||
| } | ||
|
|
||
| @Test | ||
| void dropsAcroFormEntirelyWhenNoFieldsSurvive() throws IOException { | ||
| byte[] pdfBytes = buildPdfWithFieldPerPage(2); | ||
|
|
||
| try (PDDocument document = Loader.loadPDF(pdfBytes)) { | ||
| document.removePage(1); | ||
| document.removePage(0); | ||
| document.addPage(new PDPage(PDRectangle.A4)); | ||
|
|
||
| FormUtils.pruneOrphanedFormFields(document); | ||
|
|
||
| assertNull(document.getDocumentCatalog().getAcroForm(null)); | ||
| } | ||
| } | ||
|
|
||
| @Test | ||
| void keepsLiveWidgetsAndDropsOrphanWidgetsFromMultiWidgetField() throws IOException { | ||
| byte[] pdfBytes = buildPdfWithMultiWidgetField(); | ||
|
|
||
| try (PDDocument document = Loader.loadPDF(pdfBytes)) { | ||
| document.removePage(0); | ||
|
|
||
| FormUtils.pruneOrphanedFormFields(document); | ||
|
|
||
| PDAcroForm form = document.getDocumentCatalog().getAcroForm(null); | ||
| assertNotNull(form); | ||
| assertEquals(1, form.getFields().size()); | ||
| PDField field = form.getFields().get(0); | ||
| assertEquals("multi", field.getPartialName()); | ||
| assertEquals(2, field.getWidgets().size(), "two widgets remain after one is dropped"); | ||
| } | ||
| } | ||
|
|
||
| @Test | ||
| void survivesRoundTripWithoutOrphanPagesInOutput() throws IOException { | ||
| byte[] pdfBytes = buildPdfWithFieldPerPage(3); | ||
|
|
||
| byte[] writtenBytes; | ||
| try (PDDocument document = Loader.loadPDF(pdfBytes)) { | ||
| document.removePage(2); | ||
| document.removePage(1); | ||
| FormUtils.pruneOrphanedFormFields(document); | ||
| try (ByteArrayOutputStream out = new ByteArrayOutputStream()) { | ||
| document.save(out); | ||
| writtenBytes = out.toByteArray(); | ||
| } | ||
| } | ||
|
|
||
| try (PDDocument reloaded = Loader.loadPDF(writtenBytes)) { | ||
| assertEquals(1, reloaded.getNumberOfPages()); | ||
| PDAcroForm form = reloaded.getDocumentCatalog().getAcroForm(null); | ||
| assertNotNull(form); | ||
| assertEquals(1, form.getFields().size()); | ||
| assertEquals("field_0", form.getFields().get(0).getPartialName()); | ||
| } | ||
| } | ||
|
|
||
| @Test | ||
| void prunesNestedNonTerminalFields() throws IOException { | ||
| byte[] pdfBytes = buildPdfWithNestedFields(); | ||
|
|
||
| try (PDDocument document = Loader.loadPDF(pdfBytes)) { | ||
| document.removePage(1); | ||
|
|
||
| FormUtils.pruneOrphanedFormFields(document); | ||
|
|
||
| PDAcroForm form = document.getDocumentCatalog().getAcroForm(null); | ||
| assertNotNull(form); | ||
| assertEquals(1, form.getFields().size()); | ||
| PDField group = form.getFields().get(0); | ||
| assertEquals("group", group.getPartialName()); | ||
| assertTrue(group instanceof PDNonTerminalField); | ||
| PDNonTerminalField nonTerminal = (PDNonTerminalField) group; | ||
| assertEquals(1, nonTerminal.getChildren().size()); | ||
| assertEquals("kept", nonTerminal.getChildren().get(0).getPartialName()); | ||
| } | ||
| } | ||
|
|
||
| private static byte[] buildPdfWithFieldPerPage(int pageCount) throws IOException { | ||
| try (PDDocument document = new PDDocument()) { | ||
| PDAcroForm acroForm = new PDAcroForm(document); | ||
| acroForm.setDefaultResources(new PDResources()); | ||
| document.getDocumentCatalog().setAcroForm(acroForm); | ||
|
|
||
| for (int i = 0; i < pageCount; i++) { | ||
| PDPage page = new PDPage(PDRectangle.A4); | ||
| document.addPage(page); | ||
|
|
||
| PDTextField field = new PDTextField(acroForm); | ||
| field.setPartialName("field_" + i); | ||
| PDAnnotationWidget widget = new PDAnnotationWidget(); | ||
| widget.setRectangle(new PDRectangle(50, 50, 100, 20)); | ||
| widget.setPage(page); | ||
| field.setWidgets(List.of(widget)); | ||
| acroForm.getFields().add(field); | ||
| page.getAnnotations().add(widget); | ||
| } | ||
|
|
||
| try (ByteArrayOutputStream out = new ByteArrayOutputStream()) { | ||
| document.save(out); | ||
| return out.toByteArray(); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| private static byte[] buildPdfWithMultiWidgetField() throws IOException { | ||
| try (PDDocument document = new PDDocument()) { | ||
| PDAcroForm acroForm = new PDAcroForm(document); | ||
| acroForm.setDefaultResources(new PDResources()); | ||
| document.getDocumentCatalog().setAcroForm(acroForm); | ||
|
|
||
| List<PDAnnotationWidget> widgets = new ArrayList<>(); | ||
| for (int i = 0; i < 3; i++) { | ||
| PDPage page = new PDPage(PDRectangle.A4); | ||
| document.addPage(page); | ||
| PDAnnotationWidget widget = new PDAnnotationWidget(); | ||
| widget.setRectangle(new PDRectangle(50, 50, 100, 20)); | ||
| widget.setPage(page); | ||
| page.getAnnotations().add(widget); | ||
| widgets.add(widget); | ||
| } | ||
|
|
||
| PDTextField field = new PDTextField(acroForm); | ||
| field.setPartialName("multi"); | ||
| field.setWidgets(widgets); | ||
| acroForm.getFields().add(field); | ||
|
|
||
| try (ByteArrayOutputStream out = new ByteArrayOutputStream()) { | ||
| document.save(out); | ||
| return out.toByteArray(); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| private static byte[] buildPdfWithNestedFields() throws IOException { | ||
| try (PDDocument document = new PDDocument()) { | ||
| PDAcroForm acroForm = new PDAcroForm(document); | ||
| acroForm.setDefaultResources(new PDResources()); | ||
| document.getDocumentCatalog().setAcroForm(acroForm); | ||
|
|
||
| PDPage pageA = new PDPage(PDRectangle.A4); | ||
| PDPage pageB = new PDPage(PDRectangle.A4); | ||
| document.addPage(pageA); | ||
| document.addPage(pageB); | ||
|
|
||
| PDNonTerminalField group = new PDNonTerminalField(acroForm); | ||
| group.setPartialName("group"); | ||
|
|
||
| PDTextField kept = new PDTextField(acroForm); | ||
| kept.setPartialName("kept"); | ||
| PDAnnotationWidget keptWidget = new PDAnnotationWidget(); | ||
| keptWidget.setRectangle(new PDRectangle(50, 50, 100, 20)); | ||
| keptWidget.setPage(pageA); | ||
| kept.setWidgets(List.of(keptWidget)); | ||
| pageA.getAnnotations().add(keptWidget); | ||
|
|
||
| PDTextField dropped = new PDTextField(acroForm); | ||
| dropped.setPartialName("dropped"); | ||
|
Comment on lines
+202
to
+213
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Are these names shown to the user? Does it matter if they are English-only?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. These are just for tests. I shouldn't have added this file, though; it can just be moved to FormUtilsTest. |
||
| PDAnnotationWidget droppedWidget = new PDAnnotationWidget(); | ||
| droppedWidget.setRectangle(new PDRectangle(50, 100, 100, 20)); | ||
| droppedWidget.setPage(pageB); | ||
| dropped.setWidgets(List.of(droppedWidget)); | ||
| pageB.getAnnotations().add(droppedWidget); | ||
|
|
||
| group.setChildren(List.of(kept, dropped)); | ||
| acroForm.getFields().add(group); | ||
|
|
||
| try (ByteArrayOutputStream out = new ByteArrayOutputStream()) { | ||
| document.save(out); | ||
| return out.toByteArray(); | ||
| } | ||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Potential place for a threaded implementation, if you've got a chonky doc, that is. It may not be worth it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think this is worth doing. I used very big docs to test and had no issues, and I don't think the overhead for this is all that significant.