Skip to content

Commit a11f5f5

Browse files
authored
Add lingua for language recognition (as an optional alternative) (#1239)
* add Lingua as language detection library * fix: improve language detection handling and update configuration comments
1 parent f41cd9f commit a11f5f5

File tree

8 files changed

+134
-6
lines changed

8 files changed

+134
-6
lines changed

build.gradle

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,7 @@ subprojects {
180180
implementation "com.google.guava:guava:31.0.1-jre"
181181
implementation "org.apache.httpcomponents:httpclient:4.5.3"
182182
implementation "black.ninia:jep:4.2.2"
183+
implementation 'com.github.pemistahl:lingua:1.2.2'
183184

184185
implementation "com.fasterxml.jackson.core:jackson-core:2.14.3"
185186
implementation "com.fasterxml.jackson.core:jackson-databind:2.14.3"
@@ -216,7 +217,7 @@ subprojects {
216217
"--add-opens", "java.base/java.lang=ALL-UNNAMED",
217218
"--add-opens", "java.base/java.util=ALL-UNNAMED"
218219
}
219-
220+
220221
systemProperty "java.library.path", "${javaLibraryPath}"
221222
}
222223
}
@@ -596,7 +597,7 @@ project(":grobid-trainer") {
596597
classpath = sourceSets.main.runtimeClasspath
597598
args 'tei', getArg('p2t', '.'), getArg('run', '0'), getArg('fileRatio', '1.0'), getArg('flavor', '')
598599
def javaLibraryPath = getJavaLibraryPath()
599-
600+
600601
if (JavaVersion.current().compareTo(JavaVersion.VERSION_1_8) > 0) {
601602
jvmArgs '-Xmx3072m', "--add-opens", "java.base/java.lang=ALL-UNNAMED"
602603
} else {

grobid-core/src/main/java/org/grobid/core/utilities/LanguageUtilities.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ public static LanguageUtilities getInstance() {
2222
if (instance == null) {
2323
synchronized (LanguageUtilities.class) {
2424
if (instance == null) {
25-
LOGGER.debug("synchronized getNewInstance");
2625
instance = new LanguageUtilities();
2726
}
2827
}
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
package org.grobid.core.lang.impl
2+
3+
import com.github.pemistahl.lingua.api.LanguageDetectorBuilder
4+
import org.grobid.core.lang.Language
5+
import org.grobid.core.lang.LanguageDetector
6+
import org.slf4j.Logger
7+
import org.slf4j.LoggerFactory
8+
9+
/**
 * [LanguageDetector] implementation that delegates to the Lingua library.
 *
 * The underlying Lingua detector is built once per instance, from all languages
 * Lingua supports, in low accuracy mode.
 */
class LinguaLanguageDetector : LanguageDetector {

    private val detector: com.github.pemistahl.lingua.api.LanguageDetector =
        LanguageDetectorBuilder.fromAllLanguages().withLowAccuracyMode().build()

    /**
     * Identifies the language of [text].
     *
     * @param text the text to analyse; may be null
     * @return a [Language] carrying the ISO 639-1 code and the confidence value of the first
     * entry reported by Lingua, or `null` when [text] is null or blank, or when Lingua
     * reports no confidence values at all
     */
    override fun detect(text: String?): Language? {
        if (text.isNullOrBlank()) return null

        val confidenceByLanguage = detector.computeLanguageConfidenceValues(text = text)

        if (LOGGER.isDebugEnabled) {
            LOGGER.debug(confidenceByLanguage.toString())
        }

        if (confidenceByLanguage.isEmpty()) return null

        // The first key of the map is used as the best candidate
        // (presumably Lingua orders it by descending confidence - confirm against the Lingua docs).
        val bestLanguage = confidenceByLanguage.firstKey()
        val confidence = confidenceByLanguage[bestLanguage] ?: 0.0

        return Language(bestLanguage.isoCode639_1.toString(), confidence)
    }

    companion object {
        private val LOGGER: Logger = LoggerFactory.getLogger(LinguaLanguageDetector::class.java)
    }
}
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
package org.grobid.core.lang.impl;
2+
3+
import org.grobid.core.lang.LanguageDetector;
4+
import org.grobid.core.lang.LanguageDetectorFactory;
5+
6+
/**
7+
* Implementation of a language detector factory with Lingua language identifier
8+
*/
9+
public class LinguaLanguageDetectorFactory implements LanguageDetectorFactory {
10+
private static volatile LanguageDetector instance = null;
11+
12+
private static void init() {
13+
14+
}
15+
16+
public LanguageDetector getInstance() {
17+
if (instance == null) {
18+
synchronized (this) {
19+
if(instance == null) {
20+
init();
21+
instance = new LinguaLanguageDetector();
22+
}
23+
}
24+
25+
}
26+
return instance;
27+
}
28+
29+
}
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
package org.grobid.core.lang.impl
2+
3+
import org.hamcrest.CoreMatchers.`is`
4+
import org.hamcrest.CoreMatchers.nullValue
5+
import org.hamcrest.MatcherAssert.assertThat
6+
import org.hamcrest.Matchers.greaterThan
7+
import kotlin.test.Test
8+
9+
/**
 * Tests for [LinguaLanguageDetector]: detection of a few sample sentences and
 * the null result for null, empty and blank input.
 */
class LinguaLanguageDetectorTest {

    private val target = LinguaLanguageDetector()

    @Test
    fun testDetect_englishText_shouldReturnEnglish() {
        assertDetected("This is a simple English sentence for language detection.", "en")
    }

    @Test
    fun testDetect_frenchText_shouldReturnFrench() {
        assertDetected("Ceci est une phrase en français pour la détection de langue.", "fr")
    }

    @Test
    fun testDetect_germanText_shouldReturnGerman() {
        assertDetected("Dies ist ein einfacher deutscher Satz zur Spracherkennung.", "de")
    }

    @Test
    fun testDetect_emptyText_shouldReturnNull() {
        assertNothingDetected("")
    }

    @Test
    fun testDetect_nullText_shouldReturnNull() {
        assertNothingDetected(null)
    }

    @Test
    fun testDetect_blankText_shouldReturnNull() {
        assertNothingDetected(" ")
    }

    /** Asserts that [sample] is detected as [expectedLang] with a confidence above zero. */
    private fun assertDetected(sample: String, expectedLang: String) {
        val detected = target.detect(sample)

        // A null result fails the test right here, before the confidence is checked.
        assertThat(detected!!.lang, `is`(expectedLang))
        assertThat(detected.conf, greaterThan(0.0))
    }

    /** Asserts that the detector returns null for [sample]. */
    private fun assertNothingDetected(sample: String?) {
        val detected = target.detect(sample)

        assertThat(detected, `is`(nullValue()))
    }
}

grobid-home/config/grobid-evaluation.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,9 @@ grobid:
5050
corsAllowedMethods: "OPTIONS,GET,PUT,POST,DELETE,HEAD"
5151
corsAllowedHeaders: "X-Requested-With,Content-Type,Accept,Origin"
5252

53-
# the actual implementation for language recognition to be used
53+
# the actual implementation for language recognition to be used (Cybozu or Lingua)
5454
languageDetectorFactory: "org.grobid.core.lang.impl.CybozuLanguageDetectorFactory"
55+
#languageDetectorFactory: "org.grobid.core.lang.impl.LinguaLanguageDetectorFactory"
5556

5657
# the actual implementation for optional sentence segmentation to be used (PragmaticSegmenter or OpenNLP)
5758
#sentenceDetectorFactory: "org.grobid.core.lang.impl.PragmaticSentenceDetectorFactory"

grobid-home/config/grobid-full.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,9 @@ grobid:
5050
corsAllowedMethods: "OPTIONS,GET,PUT,POST,DELETE,HEAD"
5151
corsAllowedHeaders: "X-Requested-With,Content-Type,Accept,Origin"
5252

53-
# the actual implementation for language recognition to be used
53+
# the actual implementation for language recognition to be used (Cybozu or Lingua)
5454
languageDetectorFactory: "org.grobid.core.lang.impl.CybozuLanguageDetectorFactory"
55+
#languageDetectorFactory: "org.grobid.core.lang.impl.LinguaLanguageDetectorFactory"
5556

5657
# the actual implementation for optional sentence segmentation to be used (PragmaticSegmenter or OpenNLP)
5758
#sentenceDetectorFactory: "org.grobid.core.lang.impl.PragmaticSentenceDetectorFactory"

grobid-home/config/grobid.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,9 @@ grobid:
5050
corsAllowedMethods: "OPTIONS,GET,PUT,POST,DELETE,HEAD"
5151
corsAllowedHeaders: "X-Requested-With,Content-Type,Accept,Origin"
5252

53-
# the actual implementation for language recognition to be used
53+
# the actual implementation for language recognition to be used (Cybozu or Lingua)
5454
languageDetectorFactory: "org.grobid.core.lang.impl.CybozuLanguageDetectorFactory"
55+
#languageDetectorFactory: "org.grobid.core.lang.impl.LinguaLanguageDetectorFactory"
5556

5657
# the actual implementation for optional sentence segmentation to be used (PragmaticSegmenter or OpenNLP)
5758
#sentenceDetectorFactory: "org.grobid.core.lang.impl.PragmaticSentenceDetectorFactory"

0 commit comments

Comments
 (0)