55import org .grobid .core .lang .SentenceDetector ;
66import org .grobid .core .lang .Language ;
77import org .grobid .core .utilities .OffsetPosition ;
8- import org .grobid .core .utilities .GrobidProperties ;
98
109import org .slf4j .Logger ;
1110import org .slf4j .LoggerFactory ;
1211
13- import java .io .File ;
1412import java .util .ArrayList ;
1513import java .util .List ;
1614
1715/**
1816 * Implementation of sentence segmentation via Microsoft BlingFire.
19- * BlingFire's sbd.bin model is language-agnostic.
17+ * Uses the built-in default model (no external sbd.bin file required).
18+ * BlingFire's sentence segmentation is language-agnostic.
2019 */
2120public class BlingFireSentenceDetector implements SentenceDetector {
2221 private static final Logger LOGGER = LoggerFactory .getLogger (BlingFireSentenceDetector .class );
2322
24- private final BlingFire .Model model ;
25-
26- public BlingFireSentenceDetector () {
27- this (GrobidProperties .getGrobidHomePath () + File .separator
28- + "sentence-segmentation" + File .separator + "blingfire" + File .separator + "sbd.bin" );
29- }
30-
31- BlingFireSentenceDetector (String modelPath ) {
32- LOGGER .info ("Loading BlingFire sentence segmentation model from: " + modelPath );
33- model = new BlingFire .Model (modelPath );
34- }
35-
3623 @ Override
3724 public List <OffsetPosition > detect (String text ) {
3825 return detect (text , new Language (Language .EN ));
@@ -44,13 +31,30 @@ public List<OffsetPosition> detect(String text, Language lang) {
4431 return new ArrayList <>();
4532 }
4633
47- List < BlingFire . TokenWithOffset > sentencesWithOffsets = model . textToSentencesWithOffsets (text );
34+ String [] sentences = BlingFire . textToSentences (text );
4835 List <OffsetPosition > result = new ArrayList <>();
49-
50- for (BlingFire .TokenWithOffset token : sentencesWithOffsets ) {
51- // BlingFire returns inclusive end offset, convert to exclusive to match OffsetPosition convention
52- int end = Math .min (token .getEndOffset () + 1 , text .length ());
53- result .add (new OffsetPosition (token .getStartOffset (), end ));
36+ int pos = 0 ;
37+
38+ for (String sentence : sentences ) {
39+ // BlingFire's native TextToSentences includes a trailing null byte in output;
40+ // the last sentence may contain a \0 character that prevents matching
41+ if (!sentence .isEmpty () && sentence .charAt (sentence .length () - 1 ) == '\0' ) {
42+ sentence = sentence .substring (0 , sentence .length () - 1 );
43+ }
44+ if (sentence .isEmpty ()) {
45+ continue ;
46+ }
47+ int start = text .indexOf (sentence , pos );
48+ if (start == -1 ) {
49+ LOGGER .warn ("Extracted sentence does not match original text - " + sentence );
50+ start = pos ;
51+ }
52+ int end = start + sentence .length ();
53+ if (end > text .length ()) {
54+ end = text .length ();
55+ }
56+ result .add (new OffsetPosition (start , end ));
57+ pos = end ;
5458 }
5559
5660 return result ;
0 commit comments