Skip to content

Commit 46c08e1

Browse files
author
Dominika Tkaczyk
committed
code optimization
1 parent 4761404 commit 46c08e1

1 file changed

Lines changed: 24 additions & 16 deletions

File tree

cermine-impl/src/main/java/pl/edu/icm/cermine/content/headers/HeuristicContentHeadersExtractor.java

Lines changed: 24 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -19,10 +19,13 @@
1919
package pl.edu.icm.cermine.content.headers;
2020

2121
import com.google.common.collect.Lists;
22+
import com.google.common.collect.Sets;
23+
import java.util.Map.Entry;
2224
import java.util.*;
2325
import pl.edu.icm.cermine.content.model.BxContentStructure;
2426
import pl.edu.icm.cermine.exception.AnalysisException;
2527
import pl.edu.icm.cermine.structure.model.*;
28+
import pl.edu.icm.cermine.tools.CountMap;
2629
import pl.edu.icm.cermine.tools.statistics.Population;
2730

2831
/**
@@ -48,7 +51,7 @@ public BxContentStructure extractHeaders(BxDocument document) throws AnalysisExc
4851
Population distancePopulation = new Population();
4952
Population lengthPopulation = new Population();
5053
Population indentationPopulation = new Population();
51-
54+
5255
Set<BxLine> candidates = new HashSet<BxLine>();
5356
for (BxPage page : document) {
5457
for (BxZone zone : page) {
@@ -86,19 +89,19 @@ public BxContentStructure extractHeaders(BxDocument document) throws AnalysisExc
8689

8790
Set<String> headerFonts = new HashSet<String>();
8891
List<BxLine> candidatesList = Lists.newArrayList(candidates);
89-
90-
for (int x = 0; x < candidatesList.size(); x++) {
91-
BxLine line1 = candidatesList.get(x);
92-
for (int y = x+1; y < candidatesList.size(); y++) {
93-
BxLine line2 = candidatesList.get(y);
94-
for (int z = y+1; z < candidatesList.size(); z++) {
95-
BxLine line3 = candidatesList.get(z);
96-
if (line1.getMostPopularFontName().equals(line2.getMostPopularFontName())
97-
&& line3.getMostPopularFontName().equals(line2.getMostPopularFontName())
98-
&& Math.abs(fontPopulation.getZScore(getFontIndex(line1))) > outlFontZScore) {
99-
headerFonts.add(line1.getMostPopularFontName());
100-
}
101-
}
92+
93+
Set<String> docFontPopulation = Sets.newHashSet();
94+
if (!candidatesList.isEmpty()) {
95+
docFontPopulation = candidatesList.get(0).getParent().getParent().getParent().getFontNames();
96+
}
97+
CountMap<String> fontCandidates = new CountMap<String>();
98+
for (int x = 0; x < candidatesList.size(); x++) {
99+
fontCandidates.add(candidatesList.get(x).getMostPopularFontName());
100+
}
101+
for (Entry<String, Integer> entry : fontCandidates.getSortedEntries(3)) {
102+
if (Math.abs(fontPopulation.getZScore(getFontIndex(entry.getKey(), docFontPopulation)))
103+
> outlFontZScore) {
104+
headerFonts.add(entry.getKey());
102105
}
103106
}
104107

@@ -199,9 +202,14 @@ public BxContentStructure extractHeaders(BxDocument document) throws AnalysisExc
199202
}
200203

201204
private double getFontIndex(BxLine line) {
202-
List<String> fonts = Lists.newArrayList(line.getParent().getParent().getParent().getFontNames());
205+
return getFontIndex(line.getMostPopularFontName(),
206+
line.getParent().getParent().getParent().getFontNames());
207+
}
208+
209+
private double getFontIndex(String fontName, Set<String> population) {
210+
List<String> fonts = Lists.newArrayList(population);
203211
Collections.sort(fonts);
204-
return fonts.indexOf(line.getMostPopularFontName());
212+
return fonts.indexOf(fontName);
205213
}
206214

207215
private boolean isFirstInZone(BxLine line) {

0 commit comments

Comments
 (0)