Skip to content

Commit 07a39d4

Browse files
committed
OPENNLP-1261: The Language Detector should not ignore ngram counts.
git push
1 parent cfa7bb6 commit 07a39d4

File tree

2 files changed

+12
-9
lines changed

2 files changed

+12
-9
lines changed

Diff for: opennlp-tools/src/main/java/opennlp/tools/langdetect/DefaultLanguageDetectorContextGenerator.java

+11 -7
Original file line number | Diff line number | Diff line change
@@ -20,8 +20,7 @@
2020
import java.util.ArrayList;
2121
import java.util.Collection;
2222

23-
import opennlp.tools.ngram.NGramModel;
24-
import opennlp.tools.util.StringList;
23+
import opennlp.tools.util.StringUtil;
2524
import opennlp.tools.util.normalizer.AggregateCharSequenceNormalizer;
2625
import opennlp.tools.util.normalizer.CharSequenceNormalizer;
2726

@@ -58,14 +57,19 @@ public DefaultLanguageDetectorContextGenerator(int minLength, int maxLength,
5857
public String[] getContext(CharSequence document) {
5958
Collection<String> context = new ArrayList<>();
6059

61-
NGramModel model = new NGramModel();
62-
model.add(normalizer.normalize(document), minLength, maxLength);
60+
CharSequence chars = normalizer.normalize(document);
6361

64-
for (StringList tokenList : model) {
65-
if (tokenList.size() > 0) {
66-
context.add(tokenList.getToken(0));
62+
for (int lengthIndex = minLength; lengthIndex < maxLength + 1; lengthIndex++) {
63+
for (int textIndex = 0;
64+
textIndex + lengthIndex - 1 < chars.length(); textIndex++) {
65+
66+
String gram = StringUtil.toLowerCase(
67+
chars.subSequence(textIndex, textIndex + lengthIndex));
68+
69+
context.add(gram);
6770
}
6871
}
72+
6973
return context.toArray(new String[context.size()]);
7074
}
7175
}

Diff for: opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorCrossValidatorTest.java

+1 -2
Original file line number | Diff line number | Diff line change
@@ -58,7 +58,6 @@ public void missclassified(LanguageSample reference,
5858
cv.evaluate(sampleStream, 2);
5959

6060
Assert.assertEquals(99, cv.getDocumentCount());
61-
Assert.assertEquals(0.98989898989899, cv.getDocumentAccuracy(), 0.01);
61+
Assert.assertEquals(1, cv.getDocumentAccuracy(), 0.01);
6262
}
63-
6463
}

0 commit comments

Comments
 (0)