Skip to content

Commit d321d2f

Browse files
committed
feat: simplify BlingFireSentenceDetector by removing external model dependency
1 parent fd4967d commit d321d2f

File tree

4 files changed

+26
-33
lines changed

4 files changed

+26
-33
lines changed
911 Bytes
Binary file not shown.

grobid-core/src/main/java/org/grobid/core/lang/impl/BlingFireSentenceDetector.java

Lines changed: 25 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -5,34 +5,21 @@
55
import org.grobid.core.lang.SentenceDetector;
66
import org.grobid.core.lang.Language;
77
import org.grobid.core.utilities.OffsetPosition;
8-
import org.grobid.core.utilities.GrobidProperties;
98

109
import org.slf4j.Logger;
1110
import org.slf4j.LoggerFactory;
1211

13-
import java.io.File;
1412
import java.util.ArrayList;
1513
import java.util.List;
1614

1715
/**
1816
* Implementation of sentence segmentation via Microsoft BlingFire.
19-
* BlingFire's sbd.bin model is language-agnostic.
17+
* Uses the built-in default model (no external sbd.bin file required).
18+
* BlingFire's sentence segmentation is language-agnostic.
2019
*/
2120
public class BlingFireSentenceDetector implements SentenceDetector {
2221
private static final Logger LOGGER = LoggerFactory.getLogger(BlingFireSentenceDetector.class);
2322

24-
private final BlingFire.Model model;
25-
26-
public BlingFireSentenceDetector() {
27-
this(GrobidProperties.getGrobidHomePath() + File.separator
28-
+ "sentence-segmentation" + File.separator + "blingfire" + File.separator + "sbd.bin");
29-
}
30-
31-
BlingFireSentenceDetector(String modelPath) {
32-
LOGGER.info("Loading BlingFire sentence segmentation model from: " + modelPath);
33-
model = new BlingFire.Model(modelPath);
34-
}
35-
3623
@Override
3724
public List<OffsetPosition> detect(String text) {
3825
return detect(text, new Language(Language.EN));
@@ -44,13 +31,30 @@ public List<OffsetPosition> detect(String text, Language lang) {
4431
return new ArrayList<>();
4532
}
4633

47-
List<BlingFire.TokenWithOffset> sentencesWithOffsets = model.textToSentencesWithOffsets(text);
34+
String[] sentences = BlingFire.textToSentences(text);
4835
List<OffsetPosition> result = new ArrayList<>();
49-
50-
for (BlingFire.TokenWithOffset token : sentencesWithOffsets) {
51-
// BlingFire returns inclusive end offset, convert to exclusive to match OffsetPosition convention
52-
int end = Math.min(token.getEndOffset() + 1, text.length());
53-
result.add(new OffsetPosition(token.getStartOffset(), end));
36+
int pos = 0;
37+
38+
for (String sentence : sentences) {
39+
// BlingFire's native TextToSentences includes a trailing null byte in output;
40+
// the last sentence may end with a \0 character that prevents matching it against the original text
41+
if (!sentence.isEmpty() && sentence.charAt(sentence.length() - 1) == '\0') {
42+
sentence = sentence.substring(0, sentence.length() - 1);
43+
}
44+
if (sentence.isEmpty()) {
45+
continue;
46+
}
47+
int start = text.indexOf(sentence, pos);
48+
if (start == -1) {
49+
LOGGER.warn("Extracted sentence does not match original text - " + sentence);
50+
start = pos;
51+
}
52+
int end = start + sentence.length();
53+
if (end > text.length()) {
54+
end = text.length();
55+
}
56+
result.add(new OffsetPosition(start, end));
57+
pos = end;
5458
}
5559

5660
return result;

grobid-core/src/test/java/org/grobid/core/lang/impl/BlingFireSentenceDetectorTest.java

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,16 @@
11
package org.grobid.core.lang.impl;
22

33
import org.grobid.core.utilities.OffsetPosition;
4-
import org.junit.jupiter.api.BeforeAll;
54
import org.junit.jupiter.api.Test;
65

7-
import java.io.File;
86
import java.util.List;
97

108
import static org.hamcrest.MatcherAssert.assertThat;
119
import static org.hamcrest.Matchers.*;
1210

1311
public class BlingFireSentenceDetectorTest {
1412

15-
private static BlingFireSentenceDetector detector;
16-
17-
@BeforeAll
18-
static void setUp() {
19-
// Resolve from project root (parent of grobid-core module directory)
20-
File moduleDir = new File(System.getProperty("user.dir"));
21-
File projectRoot = moduleDir.getName().equals("grobid-core") ? moduleDir.getParentFile() : moduleDir;
22-
String modelPath = new File(projectRoot, "grobid-home/sentence-segmentation/blingfire/sbd.bin").getAbsolutePath();
23-
detector = new BlingFireSentenceDetector(modelPath);
24-
}
13+
private final BlingFireSentenceDetector detector = new BlingFireSentenceDetector();
2514

2615
@Test
2716
public void testDetect_singleSentence() {
-328 KB
Binary file not shown.

0 commit comments

Comments
 (0)