1919package pl .edu .icm .cermine .content .headers ;
2020
2121import com .google .common .collect .Lists ;
22+ import com .google .common .collect .Sets ;
23+ import java .util .Map .Entry ;
2224import java .util .*;
2325import pl .edu .icm .cermine .content .model .BxContentStructure ;
2426import pl .edu .icm .cermine .exception .AnalysisException ;
2527import pl .edu .icm .cermine .structure .model .*;
28+ import pl .edu .icm .cermine .tools .CountMap ;
2629import pl .edu .icm .cermine .tools .statistics .Population ;
2730
2831/**
@@ -48,7 +51,7 @@ public BxContentStructure extractHeaders(BxDocument document) throws AnalysisExc
4851 Population distancePopulation = new Population ();
4952 Population lengthPopulation = new Population ();
5053 Population indentationPopulation = new Population ();
51-
54+
5255 Set <BxLine > candidates = new HashSet <BxLine >();
5356 for (BxPage page : document ) {
5457 for (BxZone zone : page ) {
@@ -86,19 +89,19 @@ public BxContentStructure extractHeaders(BxDocument document) throws AnalysisExc
8689
8790 Set <String > headerFonts = new HashSet <String >();
8891 List <BxLine > candidatesList = Lists .newArrayList (candidates );
89-
90- for ( int x = 0 ; x < candidatesList . size (); x ++) {
91- BxLine line1 = candidatesList .get ( x );
92- for ( int y = x + 1 ; y < candidatesList .size (); y ++) {
93- BxLine line2 = candidatesList . get ( y );
94- for ( int z = y + 1 ; z < candidatesList . size (); z ++) {
95- BxLine line3 = candidatesList . get ( z );
96- if ( line1 . getMostPopularFontName (). equals ( line2 . getMostPopularFontName ())
97- && line3 . getMostPopularFontName (). equals ( line2 . getMostPopularFontName ())
98- && Math . abs ( fontPopulation . getZScore ( getFontIndex ( line1 ))) > outlFontZScore ) {
99- headerFonts . add ( line1 . getMostPopularFontName ());
100- }
101- }
92+
93+ Set < String > docFontPopulation = Sets . newHashSet ();
94+ if (! candidatesList .isEmpty ()) {
95+ docFontPopulation = candidatesList .get ( 0 ). getParent (). getParent (). getParent (). getFontNames ();
96+ }
97+ CountMap < String > fontCandidates = new CountMap < String > ();
98+ for ( int x = 0 ; x < candidatesList . size (); x ++) {
99+ fontCandidates . add ( candidatesList . get ( x ). getMostPopularFontName ());
100+ }
101+ for ( Entry < String , Integer > entry : fontCandidates . getSortedEntries ( 3 ) ) {
102+ if ( Math . abs ( fontPopulation . getZScore ( getFontIndex ( entry . getKey (), docFontPopulation )))
103+ > outlFontZScore ) {
104+ headerFonts . add ( entry . getKey ());
102105 }
103106 }
104107
@@ -199,9 +202,14 @@ public BxContentStructure extractHeaders(BxDocument document) throws AnalysisExc
199202 }
200203
201204 private double getFontIndex (BxLine line ) {
202- List <String > fonts = Lists .newArrayList (line .getParent ().getParent ().getParent ().getFontNames ());
205+ return getFontIndex (line .getMostPopularFontName (),
206+ line .getParent ().getParent ().getParent ().getFontNames ());
207+ }
208+
209+ private double getFontIndex (String fontName , Set <String > population ) {
210+ List <String > fonts = Lists .newArrayList (population );
203211 Collections .sort (fonts );
204- return fonts .indexOf (line . getMostPopularFontName () );
212+ return fonts .indexOf (fontName );
205213 }
206214
207215 private boolean isFirstInZone (BxLine line ) {
0 commit comments