18
18
import org .springframework .web .bind .annotation .RestController ;
19
19
import org .springframework .web .multipart .MultipartFile ;
20
20
21
- import io .github .pixee .security .Filenames ;
22
21
import io .swagger .v3 .oas .annotations .Operation ;
23
22
import io .swagger .v3 .oas .annotations .tags .Tag ;
24
23
@@ -33,19 +32,83 @@ public class AutoRenameController {
33
32
private static final Logger logger = LoggerFactory .getLogger (AutoRenameController .class );
34
33
35
34
private static final float TITLE_FONT_SIZE_THRESHOLD = 20.0f ;
36
- private static final int LINE_LIMIT = 11 ;
35
+ private static final int DEFAULT_LINE_LIMIT = 11 ;
37
36
38
37
@ PostMapping (consumes = "multipart/form-data" , value = "/auto-rename" )
39
38
@ Operation (
40
- summary = "Extract header from PDF file" ,
39
+ summary = "Extract header from PDF file or Auto rename " ,
41
40
description =
42
- "This endpoint accepts a PDF file and attempts to extract its title or header based on heuristics. Input:PDF Output:PDF Type:SISO" )
41
+ "This endpoint accepts a PDF file and attempts to rename it based on various methods. Based on keyword or else extract its title or header based on heuristics. Input:PDF Output:PDF Type:SISO" )
43
42
public ResponseEntity <byte []> extractHeader (@ ModelAttribute ExtractHeaderRequest request )
44
- throws Exception {
43
+ throws IOException {
45
44
MultipartFile file = request .getFileInput ();
46
45
Boolean useFirstTextAsFallback = request .isUseFirstTextAsFallback ();
47
46
47
+ String keyword = request .getKeyword ();
48
+ Boolean useAfter = request .getUseAfter ();
49
+ Integer linesToCheck =
50
+ request .getLinesToCheck () != null ? request .getLinesToCheck () : DEFAULT_LINE_LIMIT ;
51
+
48
52
PDDocument document = Loader .loadPDF (file .getBytes ());
53
+ boolean check = keyword != null && !keyword .isEmpty ();
54
+
55
+ String newFileName ;
56
+ if (keyword != null && !keyword .isEmpty ()) {
57
+ newFileName = getTextByKeyword (document , keyword , useAfter , linesToCheck );
58
+ if ("Untitled" .equals (newFileName )) {
59
+ newFileName =
60
+ extractHeaderUsingFontSize (document , useFirstTextAsFallback , linesToCheck );
61
+ }
62
+ } else {
63
+ newFileName =
64
+ extractHeaderUsingFontSize (document , useFirstTextAsFallback , linesToCheck );
65
+ }
66
+ newFileName = sanitizeFileName (newFileName ) + ".pdf" ;
67
+ return WebResponseUtils .pdfDocToWebResponse (document , newFileName );
68
+ }
69
+
70
+ private String getTextByKeyword (
71
+ PDDocument document , String keyword , Boolean useAfter , int linesToCheck )
72
+ throws IOException {
73
+ PDFTextStripper stripper = new PDFTextStripper ();
74
+ stripper .setStartPage (1 );
75
+ stripper .setEndPage (1 );
76
+ String text = stripper .getText (document );
77
+
78
+ String [] lines = text .split ("\n " );
79
+ keyword = keyword .toLowerCase ().trim ();
80
+ for (int i = 0 ; i < Math .min (linesToCheck , lines .length ); i ++) {
81
+ String line = lines [i ].trim ();
82
+ String lineLower = line .toLowerCase ();
83
+ if (lineLower .contains (keyword )) {
84
+ if (useAfter ) {
85
+ int index = lineLower .indexOf (keyword ) + keyword .length ();
86
+ String afterKeyword = line .substring (index ).trim ();
87
+ if (afterKeyword .isEmpty () || afterKeyword .equals ("." )) {
88
+ if (i + 1 < lines .length ) {
89
+ afterKeyword = lines [i + 1 ].trim ();
90
+ }
91
+ }
92
+ if (afterKeyword .isEmpty () || afterKeyword .equals ("." )) {
93
+ return "Untitled" ;
94
+ } else {
95
+ return afterKeyword ;
96
+ }
97
+ } else {
98
+ if (i + 1 < lines .length && !lines [i + 1 ].toLowerCase ().contains (keyword )) {
99
+ String result = (line + " " + lines [i + 1 ].trim ()).trim ();
100
+ return result ;
101
+ }
102
+ return line ;
103
+ }
104
+ }
105
+ }
106
+ return "Untitled" ;
107
+ }
108
+
109
+ private String extractHeaderUsingFontSize (
110
+ PDDocument document , Boolean useFirstTextAsFallback , int linesToCheck )
111
+ throws IOException {
49
112
PDFTextStripper reader =
50
113
new PDFTextStripper () {
51
114
class LineInfo {
@@ -66,13 +129,13 @@ class LineInfo {
66
129
67
130
@ Override
68
131
protected void processTextPosition (TextPosition text ) {
69
- if (lastY != text .getY () && lineCount < LINE_LIMIT ) {
132
+ if (lastY != text .getY () && lineCount < linesToCheck ) {
70
133
processLine ();
71
134
lineBuilder = new StringBuilder (text .getUnicode ());
72
135
maxFontSizeInLine = text .getFontSizeInPt ();
73
136
lastY = text .getY ();
74
137
lineCount ++;
75
- } else if (lineCount < LINE_LIMIT ) {
138
+ } else if (lineCount < linesToCheck ) {
76
139
lineBuilder .append (text .getUnicode ());
77
140
if (text .getFontSizeInPt () > maxFontSizeInLine ) {
78
141
maxFontSizeInLine = text .getFontSizeInPt ();
@@ -81,7 +144,7 @@ protected void processTextPosition(TextPosition text) {
81
144
}
82
145
83
146
private void processLine () {
84
- if (lineBuilder .length () > 0 && lineCount < LINE_LIMIT ) {
147
+ if (lineBuilder .length () > 0 && lineCount < linesToCheck ) {
85
148
lineInfos .add (new LineInfo (lineBuilder .toString (), maxFontSizeInLine ));
86
149
}
87
150
}
@@ -125,17 +188,19 @@ public String getText(PDDocument doc) throws IOException {
125
188
: null );
126
189
}
127
190
};
191
+ reader .setEndPage (1 );
128
192
129
193
String header = reader .getText (document );
130
194
131
- // Sanitize the header string by removing characters not allowed in a filename.
132
195
if (header != null && header .length () < 255 ) {
133
- header = header .replaceAll ("[/\\ \\ ?%*:|\" <>]" , "" ).trim ();
134
- return WebResponseUtils .pdfDocToWebResponse (document , header + ".pdf" );
196
+ return header .trim ();
135
197
} else {
136
198
logger .info ("File has no good title to be found" );
137
- return WebResponseUtils .pdfDocToWebResponse (
138
- document , Filenames .toSimpleFileName (file .getOriginalFilename ()));
199
+ return "Untitled" ;
139
200
}
140
201
}
202
+
203
+ private String sanitizeFileName (String fileName ) {
204
+ return fileName .replaceAll ("[/\\ \\ ?%*:|\" <>]" , "" ).trim ();
205
+ }
141
206
}
0 commit comments