Skip to content

Commit d694d4f

Browse files
committed
Add structured PDF layout extraction alongside text extraction
This change adds a structured PDF layout extraction layer next to the existing text extraction flow. The new layout processing preserves geometric PDF information and builds a reusable structure model consisting of:

* key values
* table regions
* logical table entries
* text blocks

The existing text-based PDF import logic remains unchanged. The extracted structure is attached to `PDFInputFile` and can be accessed later by PDF extractors without affecting current import behavior. The PR also extends the debug output with compact structural evidence to make detected layout structures visible and easier to validate. No bank-specific extractor migration is included in this change.
1 parent 8e45ab1 commit d694d4f

41 files changed

Lines changed: 2546 additions & 51 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

name.abuchen.portfolio.pdfbox3/META-INF/MANIFEST.MF

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ Bundle-Name: %Bundle-Name
44
Bundle-SymbolicName: name.abuchen.portfolio.pdfbox3
55
Bundle-Version: 0.83.3.qualifier
66
Bundle-RequiredExecutionEnvironment: JavaSE-21
7-
Export-Package: name.abuchen.portfolio.pdfbox3
7+
Export-Package: name.abuchen.portfolio.pdfbox3, name.abuchen.portfolio.pdfbox3.layout
88
Import-Package: org.osgi.framework
99
Require-Bundle: org.apache.pdfbox;bundle-version="[3.0.3,4.0.0)"
1010
Bundle-ClassPath: .
Lines changed: 250 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,250 @@
1+
package name.abuchen.portfolio.pdfbox3.layout;
2+
3+
import java.io.File;
4+
import java.io.IOException;
5+
import java.util.ArrayList;
6+
import java.util.Comparator;
7+
import java.util.LinkedHashMap;
8+
import java.util.List;
9+
import java.util.Locale;
10+
import java.util.Map;
11+
12+
import org.apache.pdfbox.Loader;
13+
import org.apache.pdfbox.pdmodel.PDDocument;
14+
import org.apache.pdfbox.text.PDFTextStripper;
15+
import org.apache.pdfbox.text.TextPosition;
16+
17+
public final class PDFLayoutBcbcDebugTextExtractor
18+
{
19+
public String extract(File file) throws IOException
20+
{
21+
try (PDDocument document = Loader.loadPDF(file))
22+
{
23+
var stripper = new Stripper();
24+
stripper.setSortByPosition(true);
25+
stripper.getText(document);
26+
27+
return buildDebug(stripper.glyphs);
28+
}
29+
}
30+
31+
public PDFLayoutSegmentDocument extractDocument(File file) throws IOException
32+
{
33+
try (PDDocument document = Loader.loadPDF(file))
34+
{
35+
var stripper = new Stripper();
36+
stripper.setSortByPosition(true);
37+
stripper.getText(document);
38+
39+
List<PDFLayoutSegmentRow> rows = new ArrayList<>();
40+
41+
Map<Integer, List<PDFLayoutGlyph>> byPage = groupByPage(stripper.glyphs);
42+
43+
for (var page : byPage.entrySet())
44+
rows.addAll(buildSegmentRows(page.getValue()));
45+
46+
List<PDFLayoutSegmentBlock> blocks = new PDFLayoutSegmentBlockBuilder().build(rows);
47+
48+
return new PDFLayoutSegmentDocument(List.copyOf(rows), blocks);
49+
}
50+
}
51+
52+
private String buildDebug(List<PDFLayoutGlyph> glyphs)
53+
{
54+
StringBuilder out = new StringBuilder();
55+
56+
Map<Integer, List<PDFLayoutGlyph>> byPage = groupByPage(glyphs);
57+
58+
for (var page : byPage.entrySet())
59+
{
60+
out.append("PAGE ").append(page.getKey()).append('\n');
61+
out.append("==================================================\n");
62+
63+
List<PDFLayoutSegmentRow> segmentRows = buildSegmentRows(page.getValue());
64+
List<PDFLayoutSegmentBlock> blocks = new PDFLayoutSegmentBlockBuilder().build(segmentRows);
65+
writeBlocks(out, blocks);
66+
67+
out.append('\n');
68+
}
69+
70+
return out.toString();
71+
}
72+
73+
private Map<Integer, List<PDFLayoutGlyph>> groupByPage(List<PDFLayoutGlyph> glyphs)
74+
{
75+
Map<Integer, List<PDFLayoutGlyph>> answer = new LinkedHashMap<>();
76+
77+
glyphs.stream().sorted(Comparator.comparing(PDFLayoutGlyph::page).thenComparing(PDFLayoutGlyph::y)
78+
.thenComparing(PDFLayoutGlyph::x))
79+
.forEach(glyph -> answer.computeIfAbsent(glyph.page(), k -> new ArrayList<>()).add(glyph));
80+
81+
return answer;
82+
}
83+
84+
private List<List<PDFLayoutSegment>> buildRows(List<PDFLayoutGlyph> glyphs)
85+
{
86+
Map<Integer, List<PDFLayoutGlyph>> byY = new LinkedHashMap<>();
87+
88+
glyphs.stream().sorted(Comparator.comparing(PDFLayoutGlyph::y).thenComparing(PDFLayoutGlyph::x))
89+
.forEach(glyph -> byY.computeIfAbsent(yBucket(glyph.y()), k -> new ArrayList<>()).add(glyph));
90+
91+
List<List<PDFLayoutSegment>> rows = new ArrayList<>();
92+
93+
for (var entry : byY.entrySet())
94+
{
95+
List<PDFLayoutGlyph> rowGlyphs = entry.getValue();
96+
rowGlyphs.sort(Comparator.comparing(PDFLayoutGlyph::x));
97+
98+
rows.add(buildSegments(rowGlyphs));
99+
}
100+
101+
return List.copyOf(rows);
102+
}
103+
104+
private List<PDFLayoutSegment> buildSegments(List<PDFLayoutGlyph> glyphs)
105+
{
106+
List<PDFLayoutSegment> answer = new ArrayList<>();
107+
108+
StringBuilder text = new StringBuilder();
109+
110+
PDFLayoutGlyph first = null;
111+
PDFLayoutGlyph previous = null;
112+
113+
for (PDFLayoutGlyph glyph : glyphs)
114+
{
115+
if (previous != null && isSegmentGap(previous, glyph))
116+
{
117+
addSegment(answer, first, previous, text.toString());
118+
119+
text.setLength(0);
120+
first = null;
121+
}
122+
123+
if (first == null)
124+
first = glyph;
125+
126+
if (previous != null && isWordGap(previous, glyph))
127+
text.append(' ');
128+
129+
text.append(glyph.text());
130+
previous = glyph;
131+
}
132+
133+
if (first != null && previous != null && !text.isEmpty())
134+
addSegment(answer, first, previous, text.toString());
135+
136+
return List.copyOf(answer);
137+
}
138+
139+
private void addSegment(List<PDFLayoutSegment> segments, PDFLayoutGlyph first, PDFLayoutGlyph last, String text)
140+
{
141+
if (text == null || text.isBlank())
142+
return;
143+
144+
float xStart = first.x();
145+
float xEnd = last.x() + last.width();
146+
147+
segments.add(new PDFLayoutSegment(first.page(), xStart, xEnd, first.y(), text));
148+
}
149+
150+
private void writeBlocks(StringBuilder out, List<PDFLayoutSegmentBlock> blocks)
151+
{
152+
for (PDFLayoutSegmentBlock block : blocks)
153+
{
154+
out.append(block.id().toUpperCase(Locale.ROOT)).append('\n');
155+
out.append("page=").append(block.page()).append('\n');
156+
out.append("pattern=").append(block.pattern()).append('\n');
157+
out.append("rows=").append(block.rowCount()).append('\n');
158+
out.append("y=").append(String.format(Locale.US, "%.2f", block.yStart())).append("-")
159+
.append(String.format(Locale.US, "%.2f", block.yEnd())).append('\n');
160+
out.append("--------------------------------------------------\n");
161+
162+
for (PDFLayoutSegmentRow row : block.rows())
163+
{
164+
out.append("row=").append(row.index()).append(" ");
165+
out.append("y=").append(String.format(Locale.US, "%.2f", row.y())).append(" | ");
166+
167+
boolean first = true;
168+
for (PDFLayoutSegment segment : row.segments())
169+
{
170+
if (!first)
171+
out.append(" | ");
172+
173+
out.append('[').append(segment.xBucket()).append("] ");
174+
out.append(segment.text());
175+
first = false;
176+
}
177+
178+
out.append('\n');
179+
}
180+
181+
out.append('\n');
182+
}
183+
}
184+
185+
private int yBucket(float y)
186+
{
187+
return ((int) Math.floor(y / 3f)) * 3;
188+
}
189+
190+
private boolean isSegmentGap(PDFLayoutGlyph left, PDFLayoutGlyph right)
191+
{
192+
float gap = right.x() - (left.x() + left.width());
193+
194+
if (gap <= 0)
195+
return false;
196+
197+
float averageWidth = (left.width() + right.width()) / 2f;
198+
199+
return gap > Math.max(14f, averageWidth * 2.8f);
200+
}
201+
202+
private boolean isWordGap(PDFLayoutGlyph left, PDFLayoutGlyph right)
203+
{
204+
float gap = right.x() - (left.x() + left.width());
205+
206+
if (gap <= 0)
207+
return false;
208+
209+
float averageWidth = (left.width() + right.width()) / 2f;
210+
211+
return gap > Math.max(1.1f, averageWidth * 0.28f);
212+
}
213+
214+
private static final class Stripper extends PDFTextStripper
215+
{
216+
private final List<PDFLayoutGlyph> glyphs = new ArrayList<>();
217+
218+
private Stripper() throws IOException
219+
{
220+
221+
}
222+
223+
@Override
224+
protected void processTextPosition(TextPosition text)
225+
{
226+
if (text.getUnicode() == null || text.getUnicode().isBlank())
227+
return;
228+
229+
glyphs.add(new PDFLayoutGlyph(getCurrentPageNo(), text.getXDirAdj(), text.getYDirAdj(),
230+
text.getWidthDirAdj(), text.getHeightDir(), text.getUnicode()));
231+
}
232+
}
233+
234+
private List<PDFLayoutSegmentRow> buildSegmentRows(List<PDFLayoutGlyph> glyphs)
235+
{
236+
List<List<PDFLayoutSegment>> rows = buildRows(glyphs);
237+
List<PDFLayoutSegmentRow> answer = new ArrayList<>();
238+
239+
for (int ii = 0; ii < rows.size(); ii++)
240+
{
241+
List<PDFLayoutSegment> segments = rows.get(ii);
242+
if (segments.isEmpty())
243+
continue;
244+
245+
answer.add(new PDFLayoutSegmentRow(segments.getFirst().page(), ii, segments.getFirst().y(), segments));
246+
}
247+
248+
return List.copyOf(answer);
249+
}
250+
}
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
package name.abuchen.portfolio.pdfbox3.layout;
2+
3+
/**
 * A single glyph together with its position on a page.
 *
 * @param page
 *            number of the page the glyph was found on
 * @param x
 *            horizontal position of the glyph
 * @param y
 *            vertical position of the glyph
 * @param width
 *            width of the glyph
 * @param height
 *            height of the glyph
 * @param text
 *            the text represented by the glyph
 */
public record PDFLayoutGlyph(int page, float x, float y, float width, float height, String text)
{
}
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
package name.abuchen.portfolio.pdfbox3.layout;
2+
3+
/**
 * A horizontal run of text within one row of a page.
 *
 * @param page
 *            number of the page the segment belongs to
 * @param xStart
 *            left edge of the segment
 * @param xEnd
 *            right edge of the segment
 * @param y
 *            vertical position of the segment
 * @param text
 *            the text of the segment
 */
public record PDFLayoutSegment(int page, float xStart, float xEnd, float y, String text)
{
    /** Width of the horizontal buckets used to classify the segment start. */
    private static final int BUCKET_WIDTH = 10;

    /**
     * Returns the left edge rounded down to the next multiple of ten.
     *
     * @return the start of the bucket that contains {@code xStart}
     */
    public int xBucket()
    {
        float scaled = xStart / BUCKET_WIDTH;
        int bucketIndex = (int) Math.floor(scaled);
        return bucketIndex * BUCKET_WIDTH;
    }
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
package name.abuchen.portfolio.pdfbox3.layout;
2+
3+
import java.util.List;
4+
5+
public record PDFLayoutSegmentBlock(String id, int page, String pattern, List<PDFLayoutSegmentRow> rows)
6+
{
7+
public int rowCount()
8+
{
9+
return rows.size();
10+
}
11+
12+
public float yStart()
13+
{
14+
return rows.isEmpty() ? 0f : rows.getFirst().y();
15+
}
16+
17+
public float yEnd()
18+
{
19+
return rows.isEmpty() ? 0f : rows.getLast().y();
20+
}
21+
}
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
package name.abuchen.portfolio.pdfbox3.layout;
2+
3+
import java.util.ArrayList;
4+
import java.util.List;
5+
import java.util.Locale;
6+
7+
public final class PDFLayoutSegmentBlockBuilder
8+
{
9+
public List<PDFLayoutSegmentBlock> build(List<PDFLayoutSegmentRow> rows)
10+
{
11+
List<PDFLayoutSegmentBlock> answer = new ArrayList<>();
12+
13+
List<PDFLayoutSegmentRow> current = new ArrayList<>();
14+
String currentPattern = null;
15+
int blockIndex = 1;
16+
17+
for (PDFLayoutSegmentRow row : rows)
18+
{
19+
if (row.isEmpty())
20+
continue;
21+
22+
String pattern = row.pattern();
23+
24+
if (currentPattern != null && !currentPattern.equals(pattern) && !current.isEmpty())
25+
{
26+
answer.add(createBlock(blockIndex++, currentPattern, current));
27+
current = new ArrayList<>();
28+
}
29+
30+
current.add(row);
31+
currentPattern = pattern;
32+
}
33+
34+
if (!current.isEmpty())
35+
answer.add(createBlock(blockIndex, currentPattern, current));
36+
37+
return List.copyOf(answer);
38+
}
39+
40+
private PDFLayoutSegmentBlock createBlock(int index, String pattern, List<PDFLayoutSegmentRow> rows)
41+
{
42+
int page = rows.getFirst().page();
43+
44+
return new PDFLayoutSegmentBlock("block-" + String.format(Locale.ROOT, "%03d", index), page, pattern,
45+
List.copyOf(rows));
46+
}
47+
}
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
package name.abuchen.portfolio.pdfbox3.layout;
2+
3+
import java.util.List;
4+
5+
public record PDFLayoutSegmentDocument(List<PDFLayoutSegmentRow> rows, List<PDFLayoutSegmentBlock> blocks)
6+
{}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
package name.abuchen.portfolio.pdfbox3.layout;
2+
3+
import java.io.File;
4+
import java.io.IOException;
5+
6+
public final class PDFLayoutSegmentDocumentExtractor
7+
{
8+
public PDFLayoutSegmentDocument extract(File file) throws IOException
9+
{
10+
return new PDFLayoutBcbcDebugTextExtractor().extractDocument(file);
11+
}
12+
}
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
package name.abuchen.portfolio.pdfbox3.layout;
2+
3+
import java.util.List;
4+
import java.util.stream.Collectors;
5+
6+
public record PDFLayoutSegmentRow(int page, int index, float y, List<PDFLayoutSegment> segments)
7+
{
8+
public String pattern()
9+
{
10+
return segments.stream().map(segment -> Integer.toString(segment.xBucket()))
11+
.collect(Collectors.joining(",", "[", "]"));
12+
}
13+
14+
public String text()
15+
{
16+
return segments.stream().map(PDFLayoutSegment::text).collect(Collectors.joining(" "));
17+
}
18+
19+
public boolean isEmpty()
20+
{
21+
return segments.isEmpty();
22+
}
23+
}

0 commit comments

Comments
 (0)