diff --git a/app/common/src/main/java/stirling/software/common/util/FormUtils.java b/app/common/src/main/java/stirling/software/common/util/FormUtils.java index a08463b8a4..25cb4b5825 100644 --- a/app/common/src/main/java/stirling/software/common/util/FormUtils.java +++ b/app/common/src/main/java/stirling/software/common/util/FormUtils.java @@ -2289,6 +2289,85 @@ private void registerNewField( acroForm.getFields().add(field); } + /** Drops AcroForm fields whose widgets are no longer on any page of {@code document}. */ + public void pruneOrphanedFormFields(PDDocument document) { + if (document == null) { + return; + } + PDDocumentCatalog catalog = document.getDocumentCatalog(); + if (catalog == null) { + return; + } + PDAcroForm form = catalog.getAcroForm(null); + if (form == null) { + return; + } + List fields = form.getFields(); + if (fields.isEmpty()) { + return; + } + + Set liveWidgets = collectLiveWidgetDictionaries(document); + List kept = pruneFieldList(fields, liveWidgets); + if (kept.isEmpty()) { + catalog.setAcroForm(null); + } else if (kept.size() != fields.size()) { + form.setFields(kept); + } + } + + private Set collectLiveWidgetDictionaries(PDDocument document) { + Set live = new HashSet<>(); + int pageCount = document.getNumberOfPages(); + for (int i = 0; i < pageCount; i++) { + try { + for (PDAnnotation annotation : document.getPage(i).getAnnotations()) { + if (annotation instanceof PDAnnotationWidget) { + live.add(annotation.getCOSObject()); + } + } + } catch (IOException e) { + log.debug("Failed reading page {} annotations: {}", i, e.getMessage()); + } + } + return live; + } + + private List pruneFieldList(List fields, Set liveWidgets) { + List kept = new ArrayList<>(fields.size()); + for (PDField field : fields) { + if (field instanceof PDNonTerminalField nonTerminal) { + List children = nonTerminal.getChildren(); + List remaining = pruneFieldList(children, liveWidgets); + if (remaining.isEmpty()) { + continue; + } + if (remaining.size() != children.size()) { + nonTerminal.setChildren(remaining); + } + kept.add(nonTerminal); + } else if (field instanceof PDTerminalField terminal) { + List widgets = terminal.getWidgets(); + List liveOnes = new ArrayList<>(widgets.size()); + for (PDAnnotationWidget widget : widgets) { + if (liveWidgets.contains(widget.getCOSObject())) { + liveOnes.add(widget); + } + } + if (liveOnes.isEmpty()) { + continue; + } + if (liveOnes.size() != widgets.size()) { + terminal.setWidgets(liveOnes); + } + kept.add(terminal); + } else { + kept.add(field); + } + } + return kept; + } + // Delegation methods to GeneralFormCopyUtils for form field transformation public boolean hasAnyRotatedPage(PDDocument document) { return stirling.software.common.util.GeneralFormCopyUtils.hasAnyRotatedPage(document); diff --git a/app/common/src/test/java/stirling/software/common/util/FormUtilsPruneOrphanedFieldsTest.java b/app/common/src/test/java/stirling/software/common/util/FormUtilsPruneOrphanedFieldsTest.java new file mode 100644 index 0000000000..a082ee0ccf --- /dev/null +++ b/app/common/src/test/java/stirling/software/common/util/FormUtilsPruneOrphanedFieldsTest.java @@ -0,0 +1,229 @@ +package stirling.software.common.util; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDResources; +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationWidget; +import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm; +import org.apache.pdfbox.pdmodel.interactive.form.PDField; +import org.apache.pdfbox.pdmodel.interactive.form.PDNonTerminalField; +import org.apache.pdfbox.pdmodel.interactive.form.PDTextField; +import org.junit.jupiter.api.Test; + +class FormUtilsPruneOrphanedFieldsTest { + + @Test + void noAcroFormIsNoOp() throws IOException { + try (PDDocument document = new PDDocument()) { + document.addPage(new PDPage(PDRectangle.A4)); + FormUtils.pruneOrphanedFormFields(document); + assertNull(document.getDocumentCatalog().getAcroForm(null)); + } + } + + @Test + void dropsFieldsWhoseWidgetsAreAllOnRemovedPages() throws IOException { + byte[] pdfBytes = buildPdfWithFieldPerPage(3); + + try (PDDocument document = Loader.loadPDF(pdfBytes)) { + document.removePage(2); + document.removePage(0); + + FormUtils.pruneOrphanedFormFields(document); + + PDAcroForm form = document.getDocumentCatalog().getAcroForm(null); + assertNotNull(form); + List remainingNames = new ArrayList<>(); + for (PDField field : form.getFields()) { + remainingNames.add(field.getPartialName()); + } + assertEquals(List.of("field_1"), remainingNames); + } + } + + @Test + void dropsAcroFormEntirelyWhenNoFieldsSurvive() throws IOException { + byte[] pdfBytes = buildPdfWithFieldPerPage(2); + + try (PDDocument document = Loader.loadPDF(pdfBytes)) { + document.removePage(1); + document.removePage(0); + document.addPage(new PDPage(PDRectangle.A4)); + + FormUtils.pruneOrphanedFormFields(document); + + assertNull(document.getDocumentCatalog().getAcroForm(null)); + } + } + + @Test + void keepsLiveWidgetsAndDropsOrphanWidgetsFromMultiWidgetField() throws IOException { + byte[] pdfBytes = buildPdfWithMultiWidgetField(); + + try (PDDocument document = Loader.loadPDF(pdfBytes)) { + document.removePage(0); + + FormUtils.pruneOrphanedFormFields(document); + + PDAcroForm form = document.getDocumentCatalog().getAcroForm(null); + assertNotNull(form); + assertEquals(1, form.getFields().size()); + PDField field = form.getFields().get(0); + assertEquals("multi", field.getPartialName()); + assertEquals(2, field.getWidgets().size(), "two widgets remain after one is dropped"); + } + } + + @Test + void survivesRoundTripWithoutOrphanPagesInOutput() throws IOException { + byte[] pdfBytes = buildPdfWithFieldPerPage(3); + + byte[] writtenBytes; + try (PDDocument document = Loader.loadPDF(pdfBytes)) { + document.removePage(2); + document.removePage(1); + FormUtils.pruneOrphanedFormFields(document); + try (ByteArrayOutputStream out = new ByteArrayOutputStream()) { + document.save(out); + writtenBytes = out.toByteArray(); + } + } + + try (PDDocument reloaded = Loader.loadPDF(writtenBytes)) { + assertEquals(1, reloaded.getNumberOfPages()); + PDAcroForm form = reloaded.getDocumentCatalog().getAcroForm(null); + assertNotNull(form); + assertEquals(1, form.getFields().size()); + assertEquals("field_0", form.getFields().get(0).getPartialName()); + } + } + + @Test + void prunesNestedNonTerminalFields() throws IOException { + byte[] pdfBytes = buildPdfWithNestedFields(); + + try (PDDocument document = Loader.loadPDF(pdfBytes)) { + document.removePage(1); + + FormUtils.pruneOrphanedFormFields(document); + + PDAcroForm form = document.getDocumentCatalog().getAcroForm(null); + assertNotNull(form); + assertEquals(1, form.getFields().size()); + PDField group = form.getFields().get(0); + assertEquals("group", group.getPartialName()); + assertTrue(group instanceof PDNonTerminalField); + PDNonTerminalField nonTerminal = (PDNonTerminalField) group; + assertEquals(1, nonTerminal.getChildren().size()); + assertEquals("kept", nonTerminal.getChildren().get(0).getPartialName()); + } + } + + private static byte[] buildPdfWithFieldPerPage(int pageCount) throws IOException { + try (PDDocument document = new PDDocument()) { + PDAcroForm acroForm = new PDAcroForm(document); + acroForm.setDefaultResources(new PDResources()); + document.getDocumentCatalog().setAcroForm(acroForm); + + for (int i = 0; i < pageCount; i++) { + PDPage page = new PDPage(PDRectangle.A4); + document.addPage(page); + + PDTextField field = new PDTextField(acroForm); + field.setPartialName("field_" + i); + PDAnnotationWidget widget = new PDAnnotationWidget(); + widget.setRectangle(new PDRectangle(50, 50, 100, 20)); + widget.setPage(page); + field.setWidgets(List.of(widget)); + acroForm.getFields().add(field); + page.getAnnotations().add(widget); + } + + try (ByteArrayOutputStream out = new ByteArrayOutputStream()) { + document.save(out); + return out.toByteArray(); + } + } + } + + private static byte[] buildPdfWithMultiWidgetField() throws IOException { + try (PDDocument document = new PDDocument()) { + PDAcroForm acroForm = new PDAcroForm(document); + acroForm.setDefaultResources(new PDResources()); + document.getDocumentCatalog().setAcroForm(acroForm); + + List widgets = new ArrayList<>(); + for (int i = 0; i < 3; i++) { + PDPage page = new PDPage(PDRectangle.A4); + document.addPage(page); + PDAnnotationWidget widget = new PDAnnotationWidget(); + widget.setRectangle(new PDRectangle(50, 50, 100, 20)); + widget.setPage(page); + page.getAnnotations().add(widget); + widgets.add(widget); + } + + PDTextField field = new PDTextField(acroForm); + field.setPartialName("multi"); + field.setWidgets(widgets); + acroForm.getFields().add(field); + + try (ByteArrayOutputStream out = new ByteArrayOutputStream()) { + document.save(out); + return out.toByteArray(); + } + } + } + + private static byte[] buildPdfWithNestedFields() throws IOException { + try (PDDocument document = new PDDocument()) { + PDAcroForm acroForm = new PDAcroForm(document); + acroForm.setDefaultResources(new PDResources()); + document.getDocumentCatalog().setAcroForm(acroForm); + + PDPage pageA = new PDPage(PDRectangle.A4); + PDPage pageB = new PDPage(PDRectangle.A4); + document.addPage(pageA); + document.addPage(pageB); + + PDNonTerminalField group = new PDNonTerminalField(acroForm); + group.setPartialName("group"); + + PDTextField kept = new PDTextField(acroForm); + kept.setPartialName("kept"); + PDAnnotationWidget keptWidget = new PDAnnotationWidget(); + keptWidget.setRectangle(new PDRectangle(50, 50, 100, 20)); + keptWidget.setPage(pageA); + kept.setWidgets(List.of(keptWidget)); + pageA.getAnnotations().add(keptWidget); + + PDTextField dropped = new PDTextField(acroForm); + dropped.setPartialName("dropped"); + PDAnnotationWidget droppedWidget = new PDAnnotationWidget(); + droppedWidget.setRectangle(new PDRectangle(50, 100, 100, 20)); + droppedWidget.setPage(pageB); + dropped.setWidgets(List.of(droppedWidget)); + pageB.getAnnotations().add(droppedWidget); + + group.setChildren(List.of(kept, dropped)); + acroForm.getFields().add(group); + + try (ByteArrayOutputStream out = new ByteArrayOutputStream()) { + document.save(out); + return out.toByteArray(); + } + } + } +} diff --git a/app/core/src/main/java/stirling/software/SPDF/controller/api/RearrangePagesPDFController.java b/app/core/src/main/java/stirling/software/SPDF/controller/api/RearrangePagesPDFController.java index 1aac42bea9..8825775195 100644 --- a/app/core/src/main/java/stirling/software/SPDF/controller/api/RearrangePagesPDFController.java +++ b/app/core/src/main/java/stirling/software/SPDF/controller/api/RearrangePagesPDFController.java @@ -6,8 +6,11 @@ import java.util.List; import java.util.Locale; +import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDDocumentCatalog; import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm; import org.springframework.core.io.Resource; import org.springframework.http.MediaType; import org.springframework.http.ResponseEntity; @@ -27,6 +30,7 @@ import stirling.software.common.annotations.api.GeneralApi; import stirling.software.common.service.CustomPDFDocumentFactory; import stirling.software.common.util.ExceptionUtils; +import stirling.software.common.util.FormUtils; import stirling.software.common.util.GeneralUtils; import stirling.software.common.util.TempFileManager; import stirling.software.common.util.WebResponseUtils; @@ -67,6 +71,7 @@ public ResponseEntity deletePages(@ModelAttribute PDFWithPageNums requ int pageIndex = pagesToRemove.get(i); document.removePage(pageIndex); } + FormUtils.pruneOrphanedFormFields(document); return WebResponseUtils.pdfDocToWebResponse( document, GeneralUtils.generateFilename( @@ -265,6 +270,17 @@ public ResponseEntity rearrangePages(@ModelAttribute RearrangePagesReq rearrangedDocument.addPage(page); } + PDDocumentCatalog sourceCatalog = document.getDocumentCatalog(); + if (sourceCatalog != null) { + PDAcroForm sourceForm = sourceCatalog.getAcroForm(null); + if (sourceForm != null) { + rearrangedDocument + .getDocumentCatalog() + .getCOSObject() + .setItem(COSName.ACRO_FORM, sourceForm.getCOSObject()); + } + } + return WebResponseUtils.pdfDocToWebResponse( rearrangedDocument, GeneralUtils.generateFilename( diff --git a/app/core/src/main/java/stirling/software/SPDF/controller/api/SplitPDFController.java b/app/core/src/main/java/stirling/software/SPDF/controller/api/SplitPDFController.java index fc2bcd3a22..dda9d0892e 100644 --- a/app/core/src/main/java/stirling/software/SPDF/controller/api/SplitPDFController.java +++ b/app/core/src/main/java/stirling/software/SPDF/controller/api/SplitPDFController.java @@ -1,9 +1,13 @@ package stirling.software.SPDF.controller.api; +import java.io.File; import java.io.IOException; import java.nio.file.Files; +import java.nio.file.StandardCopyOption; import java.util.ArrayList; +import java.util.HashSet; import java.util.List; +import java.util.Set; import java.util.stream.Collectors; import java.util.zip.ZipEntry; import java.util.zip.ZipOutputStream; @@ -26,6 +30,7 @@ import stirling.software.common.annotations.api.GeneralApi; import stirling.software.common.service.CustomPDFDocumentFactory; import stirling.software.common.util.ExceptionUtils; +import stirling.software.common.util.FormUtils; import stirling.software.common.util.GeneralUtils; import stirling.software.common.util.TempFile; import stirling.software.common.util.TempFileManager; @@ -54,9 +59,21 @@ public ResponseEntity splitPdf(@ModelAttribute SplitPagesRequest reque MultipartFile file = request.getFileInput(); TempFile outputTempFile = new TempFile(tempFileManager, ".zip"); try { - try (PDDocument document = pdfDocumentFactory.load(file)) { - int totalPages = document.getNumberOfPages(); - List pageNumbers = request.getPageNumbersList(document, false); + try (TempFile sourceTempFile = new TempFile(tempFileManager, ".pdf")) { + Files.copy( + file.getInputStream(), + sourceTempFile.getPath(), + StandardCopyOption.REPLACE_EXISTING); + + int totalPages; + List pageNumbers; + boolean hasForm; + try (PDDocument document = + pdfDocumentFactory.load(sourceTempFile.getFile(), true)) { + totalPages = document.getNumberOfPages(); + pageNumbers = request.getPageNumbersList(document, false); + hasForm = document.getDocumentCatalog().getAcroForm(null) != null; + } if (!pageNumbers.contains(totalPages - 1)) { pageNumbers = new ArrayList<>(pageNumbers); pageNumbers.add(totalPages - 1); @@ -69,33 +86,16 @@ public ResponseEntity splitPdf(@ModelAttribute SplitPagesRequest reque String baseFilename = GeneralUtils.removeExtension(file.getOriginalFilename()); try (ZipOutputStream zipOut = new ZipOutputStream(Files.newOutputStream(outputTempFile.getPath()))) { - int previousPageNumber = 0; - for (int splitIndex = 0; splitIndex < pageNumbers.size(); splitIndex++) { - int splitPoint = pageNumbers.get(splitIndex); - try (PDDocument splitDocument = - pdfDocumentFactory.createNewDocumentBasedOnOldDocument(document)) { - for (int i = previousPageNumber; i <= splitPoint; i++) { - splitDocument.addPage(document.getPage(i)); - log.debug("Adding page {} to split document", i); - } - previousPageNumber = splitPoint + 1; - - String fileName = baseFilename + "_" + (splitIndex + 1) + ".pdf"; - zipOut.putNextEntry(new ZipEntry(fileName)); - splitDocument.save(zipOut); - zipOut.closeEntry(); - log.debug("Wrote split document {} to zip file", fileName); - } catch (Exception e) { - ExceptionUtils.logException("document splitting and saving", e); - throw e; - } + if (hasForm) { + writeSplitsViaReload( + sourceTempFile.getFile(), pageNumbers, baseFilename, zipOut); + } else { + writeSplitsViaSharedSource( + sourceTempFile.getFile(), pageNumbers, baseFilename, zipOut); } } } - log.debug( - "Successfully created zip file with split documents: {}", - outputTempFile.getPath().toString()); String zipFilename = GeneralUtils.generateFilename(file.getOriginalFilename(), "_split.zip"); return WebResponseUtils.zipFileToWebResponse(outputTempFile, zipFilename); @@ -104,4 +104,60 @@ public ResponseEntity splitPdf(@ModelAttribute SplitPagesRequest reque throw e; } } + + private void writeSplitsViaReload( + File source, List pageNumbers, String baseFilename, ZipOutputStream zipOut) + throws IOException { + int previousPageNumber = 0; + for (int splitIndex = 0; splitIndex < pageNumbers.size(); splitIndex++) { + int splitPoint = pageNumbers.get(splitIndex); + Set keep = new HashSet<>(); + for (int i = previousPageNumber; i <= splitPoint; i++) { + keep.add(i); + } + previousPageNumber = splitPoint + 1; + + try (PDDocument splitDoc = pdfDocumentFactory.load(source)) { + for (int p = splitDoc.getNumberOfPages() - 1; p >= 0; p--) { + if (!keep.contains(p)) { + splitDoc.removePage(p); + } + } + FormUtils.pruneOrphanedFormFields(splitDoc); + writeEntry(zipOut, baseFilename, splitIndex + 1, splitDoc); + } catch (Exception e) { + ExceptionUtils.logException("document splitting and saving", e); + throw e; + } + } + } + + private void writeSplitsViaSharedSource( + File source, List pageNumbers, String baseFilename, ZipOutputStream zipOut) + throws IOException { + try (PDDocument sourceDoc = pdfDocumentFactory.load(source)) { + int previousPageNumber = 0; + for (int splitIndex = 0; splitIndex < pageNumbers.size(); splitIndex++) { + int splitPoint = pageNumbers.get(splitIndex); + try (PDDocument splitDoc = + pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDoc)) { + for (int i = previousPageNumber; i <= splitPoint; i++) { + splitDoc.addPage(sourceDoc.getPage(i)); + } + previousPageNumber = splitPoint + 1; + writeEntry(zipOut, baseFilename, splitIndex + 1, splitDoc); + } catch (Exception e) { + ExceptionUtils.logException("document splitting and saving", e); + throw e; + } + } + } + } + + private void writeEntry(ZipOutputStream zipOut, String baseFilename, int index, PDDocument doc) + throws IOException { + zipOut.putNextEntry(new ZipEntry(baseFilename + "_" + index + ".pdf")); + doc.save(zipOut); + zipOut.closeEntry(); + } } diff --git a/app/core/src/main/java/stirling/software/SPDF/controller/api/SplitPdfBySizeController.java b/app/core/src/main/java/stirling/software/SPDF/controller/api/SplitPdfBySizeController.java index 697ee62f39..806771b2ba 100644 --- a/app/core/src/main/java/stirling/software/SPDF/controller/api/SplitPdfBySizeController.java +++ b/app/core/src/main/java/stirling/software/SPDF/controller/api/SplitPdfBySizeController.java @@ -1,8 +1,14 @@ package stirling.software.SPDF.controller.api; import java.io.ByteArrayOutputStream; +import java.io.File; import java.io.IOException; import java.nio.file.Files; +import java.nio.file.StandardCopyOption; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; import java.util.zip.ZipEntry; import java.util.zip.ZipOutputStream; @@ -16,7 +22,6 @@ import io.swagger.v3.oas.annotations.Operation; -import lombok.Getter; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; @@ -26,6 +31,7 @@ import stirling.software.common.annotations.api.GeneralApi; import stirling.software.common.service.CustomPDFDocumentFactory; import stirling.software.common.util.ExceptionUtils; +import stirling.software.common.util.FormUtils; import stirling.software.common.util.GeneralUtils; import stirling.software.common.util.TempFile; import stirling.software.common.util.TempFileManager; @@ -54,52 +60,40 @@ public class SplitPdfBySizeController { public ResponseEntity autoSplitPdf( @ModelAttribute SplitPdfBySizeOrCountRequest request) throws Exception { - log.debug("Starting PDF split process with request: {}", request); MultipartFile file = request.getFileInput(); - String filename = GeneralUtils.generateFilename(file.getOriginalFilename(), ""); - log.debug("Base filename for output: {}", filename); TempFile zipTempFile = new TempFile(tempFileManager, ".zip"); try { - log.debug("Created temporary managed zip file: {}", zipTempFile.getPath()); - log.debug("Creating ZIP output stream"); - try (ZipOutputStream zipOut = - new ZipOutputStream(Files.newOutputStream(zipTempFile.getPath())); - PDDocument sourceDocument = pdfDocumentFactory.load(file)) { - log.debug( - "Successfully loaded PDF with {} pages", sourceDocument.getNumberOfPages()); - - int type = request.getSplitType(); - String value = request.getSplitValue(); - log.debug("Split type: {}, Split value: {}", type, value); - - if (type == 0) { - log.debug("Processing split by size"); - long maxBytes = GeneralUtils.convertSizeToBytes(value); - log.debug("Max bytes per document: {}", maxBytes); - handleSplitBySize(sourceDocument, maxBytes, zipOut, filename); - } else if (type == 1) { - log.debug("Processing split by page count"); - int pageCount = Integer.parseInt(value); - log.debug("Pages per document: {}", pageCount); - handleSplitByPageCount(sourceDocument, pageCount, zipOut, filename); - } else if (type == 2) { - log.debug("Processing split by document count"); - int documentCount = Integer.parseInt(value); - log.debug("Total number of documents: {}", documentCount); - handleSplitByDocCount(sourceDocument, documentCount, zipOut, filename); - } else { - log.error("Invalid split type: {}", type); - throw ExceptionUtils.createIllegalArgumentException( - "error.invalidArgument", - "Invalid argument: {0}", - "split type: " + type); + try (TempFile sourceTempFile = new TempFile(tempFileManager, ".pdf"); + ZipOutputStream zipOut = + new ZipOutputStream(Files.newOutputStream(zipTempFile.getPath()))) { + Files.copy( + file.getInputStream(), + sourceTempFile.getPath(), + StandardCopyOption.REPLACE_EXISTING); + + try (PDDocument sourceDocument = + pdfDocumentFactory.load(sourceTempFile.getFile(), true)) { + boolean hasForm = sourceDocument.getDocumentCatalog().getAcroForm(null) != null; + List> ranges = computeRanges(request, sourceDocument); + + int fileIndex = 1; + for (List range : ranges) { + if (range.isEmpty()) { + continue; + } + if (hasForm) { + writeRangeViaReload( + sourceTempFile.getFile(), range, zipOut, filename, fileIndex++); + } else { + writeRangeViaSharedSource( + sourceDocument, range, zipOut, filename, fileIndex++); + } + } } - log.debug("PDF splitting completed successfully"); } - log.debug("Returning streaming response for zip file"); return WebResponseUtils.zipFileToWebResponse(zipTempFile, filename + ".zip"); } catch (Exception e) { ExceptionUtils.logException("PDF splitting process", e); @@ -108,387 +102,198 @@ public ResponseEntity autoSplitPdf( } } - private void handleSplitBySize( - PDDocument sourceDocument, long maxBytes, ZipOutputStream zipOut, String baseFilename) - throws IOException { - log.debug("Starting handleSplitBySize with maxBytes={}", maxBytes); - - @Getter - class DocHolder implements AutoCloseable { - private PDDocument doc; - - public DocHolder(PDDocument doc) { - this.doc = doc; - } + private List> computeRanges( + SplitPdfBySizeOrCountRequest request, PDDocument sourceDocument) throws IOException { + int type = request.getSplitType(); + String value = request.getSplitValue(); + if (type == 0) { + return computeSizeRanges(sourceDocument, GeneralUtils.convertSizeToBytes(value)); + } else if (type == 1) { + return computePageCountRanges(sourceDocument, Integer.parseInt(value)); + } else if (type == 2) { + return computeDocCountRanges(sourceDocument, Integer.parseInt(value)); + } + throw ExceptionUtils.createIllegalArgumentException( + "error.invalidArgument", "Invalid argument: {0}", "split type: " + type); + } - public void setDoc(PDDocument doc) { - if (this.doc != null) { - try { - this.doc.close(); - } catch (IOException e) { - log.error("Error closing document", e); - } + private void writeRangeViaReload( + File sourceFile, + List keepIndices, + ZipOutputStream zipOut, + String baseFilename, + int fileIndex) + throws IOException { + Set keep = new HashSet<>(keepIndices); + try (PDDocument doc = pdfDocumentFactory.load(sourceFile)) { + for (int i = doc.getNumberOfPages() - 1; i >= 0; i--) { + if (!keep.contains(i)) { + doc.removePage(i); } - this.doc = doc; } + FormUtils.pruneOrphanedFormFields(doc); + writeEntry(zipOut, baseFilename, fileIndex, doc); + } + } - @Override - public void close() throws IOException { - if (doc != null) { - doc.close(); - } + private void writeRangeViaSharedSource( + PDDocument sourceDocument, + List keepIndices, + ZipOutputStream zipOut, + String baseFilename, + int fileIndex) + throws IOException { + try (PDDocument doc = + pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument)) { + for (int p : keepIndices) { + doc.addPage(sourceDocument.getPage(p)); } + writeEntry(zipOut, baseFilename, fileIndex, doc); } + } - int fileIndex = 1; - try (DocHolder holder = - new DocHolder( - pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument))) { - int totalPages = sourceDocument.getNumberOfPages(); - int pageAdded = 0; + private void writeEntry( + ZipOutputStream zipOut, String baseFilename, int fileIndex, PDDocument doc) + throws IOException { + zipOut.putNextEntry(new ZipEntry(baseFilename + "_" + fileIndex + ".pdf")); + doc.save(zipOut); + zipOut.closeEntry(); + } - // Smart size check frequency - check more often with larger documents - int baseCheckFrequency = 5; + /** Page-index ranges each output should contain. AcroForm overhead isn't modeled. */ + private List> computeSizeRanges(PDDocument sourceDocument, long maxBytes) + throws IOException { + List> ranges = new ArrayList<>(); + List currentRange = new ArrayList<>(); + int totalPages = sourceDocument.getNumberOfPages(); + int baseCheckFrequency = 5; + PDDocument scratch = new PDDocument(); + try { for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) { PDPage page = sourceDocument.getPage(pageIndex); - log.debug("Processing page {} of {}", pageIndex + 1, totalPages); - - // Add the page to current document - PDPage newPage = new PDPage(page.getCOSObject()); - holder.getDoc().addPage(newPage); - pageAdded++; + scratch.addPage(new PDPage(page.getCOSObject())); + currentRange.add(pageIndex); - // Dynamic size checking based on document size and page count + int pageAdded = currentRange.size(); boolean shouldCheckSize = (pageAdded % baseCheckFrequency == 0) || (pageIndex == totalPages - 1) - || (pageAdded >= 20); // Always check after 20 pages - - if (shouldCheckSize) { - log.debug("Performing size check after {} pages", pageAdded); - long actualSize; - try (ByteArrayOutputStream checkSizeStream = new ByteArrayOutputStream()) { - holder.getDoc().save(checkSizeStream); - actualSize = checkSizeStream.size(); - } - log.debug( - "Current document size: {} bytes (max: {} bytes)", - actualSize, - maxBytes); - - if (actualSize > maxBytes) { - // We exceeded the limit - remove the last page and save - if (holder.getDoc().getNumberOfPages() > 1) { - holder.getDoc().removePage(holder.getDoc().getNumberOfPages() - 1); - pageIndex--; // Process this page again in the next document - log.debug("Size limit exceeded - removed last page"); - } - - log.debug( - "Saving document with {} pages as part {}", - holder.getDoc().getNumberOfPages(), - fileIndex); - saveDocumentToZip(holder.getDoc(), zipOut, baseFilename, fileIndex++); - holder.setDoc(new PDDocument()); - pageAdded = 0; - } else if (pageIndex < totalPages - 1) { - // We're under the limit, calculate if we might fit more pages - // Try to predict how many more similar pages might fit - if (actualSize < maxBytes * 0.75 && pageAdded > 0) { - // Rather than using a ratio, look ahead to test actual upcoming pages - int pagesToLookAhead = Math.min(5, totalPages - pageIndex - 1); - - if (pagesToLookAhead > 0) { - log.debug( - "Testing {} upcoming pages for potential addition", - pagesToLookAhead); - - // Create a temp document with current pages + look-ahead pages - try (PDDocument testDoc = new PDDocument()) { - // First copy existing pages - for (int i = 0; i < holder.getDoc().getNumberOfPages(); i++) { - testDoc.addPage( - new PDPage( - holder.getDoc().getPage(i).getCOSObject())); - } - - // Try adding look-ahead pages one by one - int extraPagesAdded = 0; - for (int i = 0; i < pagesToLookAhead; i++) { - int testPageIndex = pageIndex + 1 + i; - PDPage testPage = sourceDocument.getPage(testPageIndex); - testDoc.addPage(new PDPage(testPage.getCOSObject())); - - // Check if we're still under size - long testSize; - try (ByteArrayOutputStream testStream = - new ByteArrayOutputStream()) { - testDoc.save(testStream); - testSize = testStream.size(); - } - - if (testSize <= maxBytes) { - extraPagesAdded++; - log.debug( - "Test: Can add page {} (size would be {})", - testPageIndex + 1, - testSize); - } else { - log.debug( - "Test: Cannot add page {} (size would be {})", - testPageIndex + 1, - testSize); - break; - } - } - // Add the pages we verified would fit - if (extraPagesAdded > 0) { - log.debug( - "Adding {} verified pages ahead", extraPagesAdded); - for (int i = 0; i < extraPagesAdded; i++) { - int extraPageIndex = pageIndex + 1 + i; - PDPage extraPage = - sourceDocument.getPage(extraPageIndex); - holder.getDoc() - .addPage(new PDPage(extraPage.getCOSObject())); - } - pageIndex += extraPagesAdded; - pageAdded += extraPagesAdded; - } - } - } - } - } + || (pageAdded >= 20); + if (!shouldCheckSize) { + continue; } - } - - // Save final document if it has any pages - if (holder.getDoc() != null && holder.getDoc().getNumberOfPages() > 0) { - log.debug( - "Saving final document with {} pages as part {}", - holder.getDoc().getNumberOfPages(), - fileIndex); - saveDocumentToZip(holder.getDoc(), zipOut, baseFilename, fileIndex++); - holder.setDoc(null); - } - } - log.debug("Completed handleSplitBySize with {} document parts created", fileIndex - 1); - } - - private void handleSplitByPageCount( - PDDocument sourceDocument, int pageCount, ZipOutputStream zipOut, String baseFilename) - throws IOException { - log.debug("Starting handleSplitByPageCount with pageCount={}", pageCount); - int currentPageCount = 0; - PDDocument currentDoc = null; - int fileIndex = 1; - - try { - log.debug("Creating initial output document"); - try { - currentDoc = pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument); - log.debug("Successfully created initial output document"); - } catch (Exception e) { - ExceptionUtils.logException("initial output document creation", e); - throw ExceptionUtils.createFileProcessingException("split", e); - } - - int pageIndex = 0; - int totalPages = sourceDocument.getNumberOfPages(); - log.debug("Processing {} pages", totalPages); - - try { - for (PDPage page : sourceDocument.getPages()) { - pageIndex++; - log.debug("Processing page {} of {}", pageIndex, totalPages); + long actualSize; + try (ByteArrayOutputStream out = new ByteArrayOutputStream()) { + scratch.save(out); + actualSize = out.size(); + } - try { - log.debug("Adding page {} to current document", pageIndex); - currentDoc.addPage(page); - log.debug("Successfully added page {} to current document", pageIndex); - } catch (Exception e) { - log.error("Error adding page {} to current document", pageIndex, e); - throw ExceptionUtils.createFileProcessingException("split", e); + if (actualSize > maxBytes) { + if (scratch.getNumberOfPages() > 1) { + scratch.removePage(scratch.getNumberOfPages() - 1); + currentRange.remove(currentRange.size() - 1); + pageIndex--; // retry this page in the next chunk } - - currentPageCount++; - log.debug("Current page count: {}/{}", currentPageCount, pageCount); - - if (currentPageCount == pageCount) { - log.debug( - "Reached target page count ({}), saving current document as part {}", - pageCount, - fileIndex); - try { - saveDocumentToZip(currentDoc, zipOut, baseFilename, fileIndex++); - currentDoc = null; // Document is closed by saveDocumentToZip - log.debug("Successfully saved document part {}", fileIndex - 1); - } catch (Exception e) { - log.error("Error saving document part {}", fileIndex - 1, e); - throw e; - } - - try { - log.debug("Creating new document for next part"); - currentDoc = new PDDocument(); - log.debug("Successfully created new document"); - } catch (Exception e) { - log.error("Error creating new document for next part", e); - throw ExceptionUtils.createFileProcessingException("split", e); - } - - currentPageCount = 0; - log.debug("Reset current page count to 0"); + ranges.add(new ArrayList<>(currentRange)); + currentRange.clear(); + scratch.close(); + scratch = new PDDocument(); + } else if (pageIndex < totalPages - 1 && actualSize < maxBytes * 0.75) { + int extraPagesAdded = + lookAheadFit(scratch, sourceDocument, pageIndex, maxBytes); + for (int i = 0; i < extraPagesAdded; i++) { + int extra = pageIndex + 1 + i; + scratch.addPage(new PDPage(sourceDocument.getPage(extra).getCOSObject())); + currentRange.add(extra); } + pageIndex += extraPagesAdded; } - } catch (Exception e) { - log.error("Error iterating through pages", e); - throw ExceptionUtils.createFileProcessingException("split", e); } - // Add the last document if it contains any pages - try { - if (currentDoc != null && currentDoc.getPages().getCount() != 0) { - log.debug( - "Saving final document with {} pages as part {}", - currentDoc.getPages().getCount(), - fileIndex); - try { - saveDocumentToZip(currentDoc, zipOut, baseFilename, fileIndex++); - currentDoc = null; // Document is closed by saveDocumentToZip - log.debug("Successfully saved final document part {}", fileIndex - 1); - } catch (Exception e) { - log.error("Error saving final document part {}", fileIndex - 1, e); - throw e; - } - } else { - log.debug("Final document has no pages, skipping"); - } - } catch (Exception e) { - log.error("Error checking or saving final document", e); - throw ExceptionUtils.createFileProcessingException("split", e); + if (!currentRange.isEmpty()) { + ranges.add(new ArrayList<>(currentRange)); } } finally { - if (currentDoc != null) { - try { - log.debug("Closing remaining document"); - currentDoc.close(); - log.debug("Successfully closed remaining document"); - } catch (Exception e) { - log.error("Error closing remaining document", e); - } - } + scratch.close(); } - - log.debug("Completed handleSplitByPageCount with {} document parts created", fileIndex - 1); + return ranges; } - private void handleSplitByDocCount( - PDDocument sourceDocument, - int documentCount, - ZipOutputStream zipOut, - String baseFilename) + /** Speculatively tries up to 5 next pages; returns how many fit under {@code maxBytes}. */ + private int lookAheadFit(PDDocument scratch, PDDocument source, int pageIndex, long maxBytes) throws IOException { - log.debug("Starting handleSplitByDocCount with documentCount={}", documentCount); - int totalPageCount = sourceDocument.getNumberOfPages(); - log.debug("Total pages in source document: {}", totalPageCount); - - int pagesPerDocument = totalPageCount / documentCount; - int extraPages = totalPageCount % documentCount; - log.debug("Pages per document: {}, Extra pages: {}", pagesPerDocument, extraPages); - - int currentPageIndex = 0; - int fileIndex = 1; - - for (int i = 0; i < documentCount; i++) { - log.debug("Creating document {} of {}", i + 1, documentCount); - PDDocument currentDoc = null; - try { - currentDoc = pdfDocumentFactory.createNewDocumentBasedOnOldDocument(sourceDocument); - log.debug("Successfully created document {} of {}", i + 1, documentCount); - - int pagesToAdd = pagesPerDocument + (i < extraPages ? 1 : 0); - log.debug("Adding {} pages to document {}", pagesToAdd, i + 1); - - for (int j = 0; j < pagesToAdd; j++) { - try { - log.debug( - "Adding page {} (index {}) to document {}", - j + 1, - currentPageIndex, - i + 1); - currentDoc.addPage(sourceDocument.getPage(currentPageIndex)); - log.debug("Successfully added page {} to document {}", j + 1, i + 1); - currentPageIndex++; - } catch (Exception e) { - log.error("Error adding page {} to document {}", j + 1, i + 1, e); - throw ExceptionUtils.createFileProcessingException("split", e); - } - } + int totalPages = source.getNumberOfPages(); + int pagesToLookAhead = Math.min(5, totalPages - pageIndex - 1); + if (pagesToLookAhead == 0) { + return 0; + } - try { - log.debug("Saving document {} with {} pages", i + 1, pagesToAdd); - saveDocumentToZip(currentDoc, zipOut, baseFilename, fileIndex++); - // saveDocumentToZip closes the document - currentDoc = null; - log.debug("Successfully saved document {}", i + 1); - } catch (Exception e) { - log.error("Error saving document {}", i + 1, e); - throw e; + int extraPagesAdded = 0; + try (PDDocument testDoc = new PDDocument()) { + for (int i = 0; i < scratch.getNumberOfPages(); i++) { + testDoc.addPage(new PDPage(scratch.getPage(i).getCOSObject())); + } + for (int i = 0; i < pagesToLookAhead; i++) { + testDoc.addPage(new PDPage(source.getPage(pageIndex + 1 + i).getCOSObject())); + long testSize; + try (ByteArrayOutputStream out = new ByteArrayOutputStream()) { + testDoc.save(out); + testSize = out.size(); } - } catch (Exception e) { - log.error("Error creating document {} of {}", i + 1, documentCount, e); - throw ExceptionUtils.createFileProcessingException("split", e); - } finally { - if (currentDoc != null) { - try { - currentDoc.close(); - } catch (IOException e) { - log.error("Error closing document {} of {}", i + 1, documentCount, e); - } + if (testSize > maxBytes) { + break; } + extraPagesAdded++; } } - - log.debug("Completed handleSplitByDocCount with {} documents created", documentCount); + return extraPagesAdded; } - private void saveDocumentToZip( - PDDocument document, ZipOutputStream zipOut, String baseFilename, int index) - throws IOException { - log.debug("Starting saveDocumentToZip for document part {}", index); - try (ByteArrayOutputStream outStream = new ByteArrayOutputStream()) { - - try (PDDocument doc = document) { - log.debug("Saving document part {} to byte array", index); - doc.save(outStream); - log.debug( - "Successfully saved document part {} ({} bytes)", index, outStream.size()); - } catch (Exception e) { - log.error("Error saving document part {} to byte array", index, e); - throw ExceptionUtils.createFileProcessingException("split", e); + private List> computePageCountRanges(PDDocument sourceDocument, int pageCount) { + if (pageCount <= 0) { + throw ExceptionUtils.createIllegalArgumentException( + "error.invalidArgument", "Invalid argument: {0}", "page count: " + pageCount); + } + int totalPages = sourceDocument.getNumberOfPages(); + List> ranges = new ArrayList<>(); + List current = new ArrayList<>(pageCount); + for (int i = 0; i < totalPages; i++) { + current.add(i); + if (current.size() == pageCount) { + ranges.add(current); + current = new ArrayList<>(pageCount); } + } + if (!current.isEmpty()) { + ranges.add(current); + } + return ranges; + } - try { - // Create a new zip entry - String entryName = baseFilename + "_" + index + ".pdf"; - log.debug("Creating ZIP entry: {}", entryName); - ZipEntry zipEntry = new ZipEntry(entryName); - zipOut.putNextEntry(zipEntry); - - byte[] bytes = outStream.toByteArray(); - log.debug("Writing {} bytes to ZIP entry", bytes.length); - zipOut.write(bytes); + private List> computeDocCountRanges( + PDDocument sourceDocument, int documentCount) { + if (documentCount <= 0) { + throw ExceptionUtils.createIllegalArgumentException( + "error.invalidArgument", + "Invalid argument: {0}", + "document count: " + documentCount); + } + int totalPages = sourceDocument.getNumberOfPages(); + int pagesPerDocument = totalPages / documentCount; + int extraPages = totalPages % documentCount; - log.debug("Closing ZIP entry"); - zipOut.closeEntry(); - log.debug("Successfully added document part {} to ZIP", index); - } catch (Exception e) { - log.error("Error adding document part {} to ZIP", index, e); - throw ExceptionUtils.createFileProcessingException("split", e); + List> ranges = new ArrayList<>(); + int cursor = 0; + for (int i = 0; i < documentCount; i++) { + int pagesToAdd = pagesPerDocument + (i < extraPages ? 1 : 0); + List range = new ArrayList<>(pagesToAdd); + for (int j = 0; j < pagesToAdd; j++) { + range.add(cursor++); } + ranges.add(range); } + return ranges; } } diff --git a/app/core/src/test/java/stirling/software/SPDF/controller/api/SplitPDFControllerTest.java b/app/core/src/test/java/stirling/software/SPDF/controller/api/SplitPDFControllerTest.java index 01a2030147..49a8d8bfc4 100644 --- a/app/core/src/test/java/stirling/software/SPDF/controller/api/SplitPDFControllerTest.java +++ b/app/core/src/test/java/stirling/software/SPDF/controller/api/SplitPDFControllerTest.java @@ -3,8 +3,10 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Mockito.when; +import java.io.File; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; @@ -24,7 +26,6 @@ import org.springframework.http.HttpStatus; import org.springframework.http.MediaType; import org.springframework.mock.web.MockMultipartFile; -import org.springframework.web.multipart.MultipartFile; import stirling.software.SPDF.model.api.SplitPagesRequest; import stirling.software.common.service.CustomPDFDocumentFactory; @@ -60,8 +61,10 @@ private byte[] createPdf(int numPages) throws IOException { } private void setupFactory() throws IOException { - when(pdfDocumentFactory.load(any(MultipartFile.class))) - .thenAnswer(inv -> Loader.loadPDF(((MultipartFile) inv.getArgument(0)).getBytes())); + when(pdfDocumentFactory.load(any(File.class), eq(true))) + .thenAnswer(inv -> Loader.loadPDF((File) inv.getArgument(0))); + when(pdfDocumentFactory.load(any(File.class))) + .thenAnswer(inv -> Loader.loadPDF((File) inv.getArgument(0))); when(pdfDocumentFactory.createNewDocumentBasedOnOldDocument(any(PDDocument.class))) .thenAnswer(inv -> new PDDocument()); } diff --git a/app/core/src/test/java/stirling/software/SPDF/controller/api/SplitPdfBySizeControllerTest.java b/app/core/src/test/java/stirling/software/SPDF/controller/api/SplitPdfBySizeControllerTest.java index aa201a2290..1e47652185 100644 --- a/app/core/src/test/java/stirling/software/SPDF/controller/api/SplitPdfBySizeControllerTest.java +++ b/app/core/src/test/java/stirling/software/SPDF/controller/api/SplitPdfBySizeControllerTest.java @@ -3,8 +3,10 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Mockito.when; +import java.io.File; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; @@ -25,7 +27,6 @@ import org.springframework.http.MediaType; import org.springframework.http.ResponseEntity; import org.springframework.mock.web.MockMultipartFile; -import org.springframework.web.multipart.MultipartFile; import stirling.software.SPDF.model.api.general.SplitPdfBySizeOrCountRequest; import stirling.software.common.service.CustomPDFDocumentFactory; @@ -70,9 +71,8 @@ void shouldSplitByPageCount() throws Exception { request.setSplitType(1); // Page count request.setSplitValue("2"); - when(pdfDocumentFactory.load(any(MultipartFile.class))) - .thenAnswer(inv -> Loader.loadPDF(((MultipartFile) inv.getArgument(0)).getBytes())); - + when(pdfDocumentFactory.load(any(File.class), eq(true))) + .thenAnswer(inv -> Loader.loadPDF((File) inv.getArgument(0))); when(pdfDocumentFactory.createNewDocumentBasedOnOldDocument(any(PDDocument.class))) .thenAnswer(inv -> new PDDocument()); @@ -104,14 +104,8 @@ void shouldSplitByDocCount() throws Exception { request.setSplitType(2); // Document count request.setSplitValue("3"); // Split into 3 docs (2 pages each) - when(pdfDocumentFactory.load(any(org.springframework.web.multipart.MultipartFile.class))) - .thenAnswer( - inv -> - Loader.loadPDF( - ((org.springframework.web.multipart.MultipartFile) - inv.getArgument(0)) - .getBytes())); - + when(pdfDocumentFactory.load(any(File.class), eq(true))) + .thenAnswer(inv -> Loader.loadPDF((File) inv.getArgument(0))); when(pdfDocumentFactory.createNewDocumentBasedOnOldDocument(any(PDDocument.class))) .thenAnswer(inv -> new PDDocument());