Skip to content

Commit 6fcf1fd

Browse files
committed
Enhance auto-rename functionality with additional options
- Added ability to rename files based on the content of specific lines. - Implemented renaming based on text between specified markers. - Added option to rename files using content from specific line numbers. - Enabled renaming based on text before or after a specified keyword. - Included regex-based renaming to allow more complex patterns.
1 parent e07f73d commit 6fcf1fd

File tree

3 files changed

+106
-13
lines changed

3 files changed

+106
-13
lines changed

Diff for: src/main/java/stirling/software/SPDF/controller/api/misc/AutoRenameController.java

+78-13
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
import org.springframework.web.bind.annotation.RestController;
1919
import org.springframework.web.multipart.MultipartFile;
2020

21-
import io.github.pixee.security.Filenames;
2221
import io.swagger.v3.oas.annotations.Operation;
2322
import io.swagger.v3.oas.annotations.tags.Tag;
2423

@@ -33,19 +32,83 @@ public class AutoRenameController {
3332
private static final Logger logger = LoggerFactory.getLogger(AutoRenameController.class);
3433

3534
private static final float TITLE_FONT_SIZE_THRESHOLD = 20.0f;
36-
private static final int LINE_LIMIT = 11;
35+
private static final int DEFAULT_LINE_LIMIT = 11;
3736

3837
@PostMapping(consumes = "multipart/form-data", value = "/auto-rename")
3938
@Operation(
40-
summary = "Extract header from PDF file",
39+
summary = "Extract header from PDF file or Auto rename ",
4140
description =
42-
"This endpoint accepts a PDF file and attempts to extract its title or header based on heuristics. Input:PDF Output:PDF Type:SISO")
41+
"This endpoint accepts a PDF file and attempts to rename it based on various methods. Based on keyword or else extract its title or header based on heuristics. Input:PDF Output:PDF Type:SISO")
4342
public ResponseEntity<byte[]> extractHeader(@ModelAttribute ExtractHeaderRequest request)
44-
throws Exception {
43+
throws IOException {
4544
MultipartFile file = request.getFileInput();
4645
Boolean useFirstTextAsFallback = request.isUseFirstTextAsFallback();
4746

47+
String keyword = request.getKeyword();
48+
Boolean useAfter = request.getUseAfter();
49+
Integer linesToCheck =
50+
request.getLinesToCheck() != null ? request.getLinesToCheck() : DEFAULT_LINE_LIMIT;
51+
4852
PDDocument document = Loader.loadPDF(file.getBytes());
53+
boolean check = keyword != null && !keyword.isEmpty();
54+
55+
String newFileName;
56+
if (keyword != null && !keyword.isEmpty()) {
57+
newFileName = getTextByKeyword(document, keyword, useAfter, linesToCheck);
58+
if ("Untitled".equals(newFileName)) {
59+
newFileName =
60+
extractHeaderUsingFontSize(document, useFirstTextAsFallback, linesToCheck);
61+
}
62+
} else {
63+
newFileName =
64+
extractHeaderUsingFontSize(document, useFirstTextAsFallback, linesToCheck);
65+
}
66+
newFileName = sanitizeFileName(newFileName) + ".pdf";
67+
return WebResponseUtils.pdfDocToWebResponse(document, newFileName);
68+
}
69+
70+
private String getTextByKeyword(
71+
PDDocument document, String keyword, Boolean useAfter, int linesToCheck)
72+
throws IOException {
73+
PDFTextStripper stripper = new PDFTextStripper();
74+
stripper.setStartPage(1);
75+
stripper.setEndPage(1);
76+
String text = stripper.getText(document);
77+
78+
String[] lines = text.split("\n");
79+
keyword = keyword.toLowerCase().trim();
80+
for (int i = 0; i < Math.min(linesToCheck, lines.length); i++) {
81+
String line = lines[i].trim();
82+
String lineLower = line.toLowerCase();
83+
if (lineLower.contains(keyword)) {
84+
if (useAfter) {
85+
int index = lineLower.indexOf(keyword) + keyword.length();
86+
String afterKeyword = line.substring(index).trim();
87+
if (afterKeyword.isEmpty() || afterKeyword.equals(".")) {
88+
if (i + 1 < lines.length) {
89+
afterKeyword = lines[i + 1].trim();
90+
}
91+
}
92+
if (afterKeyword.isEmpty() || afterKeyword.equals(".")) {
93+
return "Untitled";
94+
} else {
95+
return afterKeyword;
96+
}
97+
} else {
98+
if (i + 1 < lines.length && !lines[i + 1].toLowerCase().contains(keyword)) {
99+
String result = (line + " " + lines[i + 1].trim()).trim();
100+
return result;
101+
}
102+
return line;
103+
}
104+
}
105+
}
106+
return "Untitled";
107+
}
108+
109+
private String extractHeaderUsingFontSize(
110+
PDDocument document, Boolean useFirstTextAsFallback, int linesToCheck)
111+
throws IOException {
49112
PDFTextStripper reader =
50113
new PDFTextStripper() {
51114
class LineInfo {
@@ -66,13 +129,13 @@ class LineInfo {
66129

67130
@Override
68131
protected void processTextPosition(TextPosition text) {
69-
if (lastY != text.getY() && lineCount < LINE_LIMIT) {
132+
if (lastY != text.getY() && lineCount < linesToCheck) {
70133
processLine();
71134
lineBuilder = new StringBuilder(text.getUnicode());
72135
maxFontSizeInLine = text.getFontSizeInPt();
73136
lastY = text.getY();
74137
lineCount++;
75-
} else if (lineCount < LINE_LIMIT) {
138+
} else if (lineCount < linesToCheck) {
76139
lineBuilder.append(text.getUnicode());
77140
if (text.getFontSizeInPt() > maxFontSizeInLine) {
78141
maxFontSizeInLine = text.getFontSizeInPt();
@@ -81,7 +144,7 @@ protected void processTextPosition(TextPosition text) {
81144
}
82145

83146
private void processLine() {
84-
if (lineBuilder.length() > 0 && lineCount < LINE_LIMIT) {
147+
if (lineBuilder.length() > 0 && lineCount < linesToCheck) {
85148
lineInfos.add(new LineInfo(lineBuilder.toString(), maxFontSizeInLine));
86149
}
87150
}
@@ -125,17 +188,19 @@ public String getText(PDDocument doc) throws IOException {
125188
: null);
126189
}
127190
};
191+
reader.setEndPage(1);
128192

129193
String header = reader.getText(document);
130194

131-
// Sanitize the header string by removing characters not allowed in a filename.
132195
if (header != null && header.length() < 255) {
133-
header = header.replaceAll("[/\\\\?%*:|\"<>]", "").trim();
134-
return WebResponseUtils.pdfDocToWebResponse(document, header + ".pdf");
196+
return header.trim();
135197
} else {
136198
logger.info("File has no good title to be found");
137-
return WebResponseUtils.pdfDocToWebResponse(
138-
document, Filenames.toSimpleFileName(file.getOriginalFilename()));
199+
return "Untitled";
139200
}
140201
}
202+
203+
private String sanitizeFileName(String fileName) {
204+
return fileName.replaceAll("[/\\\\?%*:|\"<>]", "").trim();
205+
}
141206
}

Diff for: src/main/java/stirling/software/SPDF/model/api/misc/ExtractHeaderRequest.java

+7
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
package stirling.software.SPDF.model.api.misc;
22

3+
import org.springframework.web.multipart.MultipartFile;
4+
35
import io.swagger.v3.oas.annotations.media.Schema;
46

57
import lombok.Data;
@@ -16,4 +18,9 @@ public class ExtractHeaderRequest extends PDFFile {
1618
required = false,
1719
defaultValue = "false")
1820
private boolean useFirstTextAsFallback;
21+
22+
private MultipartFile fileInput;
23+
private String keyword;
24+
private Boolean useAfter;
25+
private Integer linesToCheck;
1926
}

Diff for: src/main/resources/templates/misc/auto-rename.html

+21
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,29 @@
2020
<form method="post" enctype="multipart/form-data" th:action="@{'/api/v1/misc/auto-rename'}">
2121
<div th:replace="~{fragments/common :: fileSelector(name='fileInput', multiple=false, accept='application/pdf')}"></div>
2222
<br>
23+
<div class="form-group">
24+
<label for="keyword">Keyword:</label>
25+
<input type="text" class="form-control" id="keyword" name="keyword" placeholder="e.g., Company, Name, Invoice" required>
26+
</div>
27+
<br>
28+
<div class="form-group">
29+
<label for="useAfter">Text to use:</label>
30+
<select class="form-control" id="useAfter" name="useAfter">
31+
<option value="false">Entire line containing keyword</option>
32+
<option value="true">Text after keyword</option>
33+
</select>
34+
</div>
35+
<br>
36+
37+
<div class="form-group">
38+
<label for="linesToCheck">Number of Lines to Check:</label>
39+
<input type="number" class="form-control" id="linesToCheck" name="linesToCheck" value="10" min="1">
40+
</div>
41+
<br>
42+
2343
<button type="submit" id="submitBtn" class="btn btn-primary" th:text="#{auto-rename.submit}"></button>
2444
</form>
45+
2546
</div>
2647
</div>
2748
</div>

0 commit comments

Comments
 (0)