Skip to content

Commit 85f957a

Browse files
committed
Add native spaCy support
Change-Id: Ibd3d660d2fc27a142e8d5e013b8bbb400bff5b9c
1 parent 9d2dc81 commit 85f957a

File tree

3 files changed

+131
-28
lines changed

3 files changed

+131
-28
lines changed

app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@ interface AnnotationToolBridge {
1818

1919
class AnnotationToolBridgeFactory {
2020
companion object {
21-
const val taggerFoundries = "marmot|opennlp|corenlp|treetagger"
22-
const val parserFoundries = "malt|corenlp"
21+
const val taggerFoundries = "marmot|opennlp|corenlp|treetagger|spacy"
22+
const val parserFoundries = "malt|corenlp|spacy"
2323

2424
fun getAnnotationToolBridge(foundry: String, model: String, LOGGER: Logger): AnnotationToolBridge? {
2525
when (foundry) {
@@ -28,6 +28,7 @@ class AnnotationToolBridgeFactory {
2828
"malt" -> return MaltParserBridge(model, LOGGER)
2929
"corenlp" -> return CoreNLPBridge(model, LOGGER)
3030
"treetagger", "tree_tagger" -> return null
31+
"spacy" -> return null
3132
else -> LOGGER.severe("Unknown tagger/parser $foundry")
3233
}
3334
return null
@@ -40,6 +41,7 @@ class AnnotationToolBridgeFactory {
4041
"opennlp" -> return OpenNlpBridge(model, LOGGER)
4142
"corenlp" -> return CoreNLPTaggerBridge(model, LOGGER)
4243
"treetagger", "tree_tagger" -> return null
44+
"spacy" -> return null
4345
else -> LOGGER.severe("Unknown tagger $foundry")
4446
}
4547
return null
@@ -50,6 +52,7 @@ class AnnotationToolBridgeFactory {
5052
when (foundry) {
5153
"malt" -> return MaltParserBridge(model, LOGGER)
5254
"corenlp" -> return CoreNLPBridge(model, LOGGER, taggerModel)
55+
"spacy" -> return null
5356
else -> LOGGER.severe("Unknown parser $foundry")
5457
}
5558
return null

app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt

Lines changed: 80 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,13 @@ val ZIP_ENTRY_UNIX_MODE = parseInt("644", 8)
9292
" ./build/bin/korapxmltool -t zip -T marmot:de.marmot -P malt:german.mco app/src/test/resources/goe.zip",
9393
" # (uses KORAPXMLTOOL_MODELS_PATH if model not found in current directory)",
9494
"",
95-
" Use external spaCy annotation (without dependencies):",
95+
" Native Docker spaCy tagging (without dependencies):",
96+
" ./build/bin/korapxmltool -t zip -T spacy app/src/test/resources/goe.zip",
97+
"",
98+
" Native Docker spaCy tagging and dependency parsing:",
99+
" ./build/bin/korapxmltool -t zip -P spacy app/src/test/resources/goe.zip",
100+
"",
101+
" Use external spaCy annotation (legacy method):",
96102
" ./build/bin/korapxmltool -j4 -A \"docker run -e SPACY_USE_DEPENDENCIES=False --rm -i korap/conllu2spacy:latest\" -t zip ./app/src/test/resources/goe.zip",
97103
"",
98104
" Generate Krill tar from wud24_sample with multiple annotation foundries:",
@@ -323,12 +329,14 @@ class KorapXmlTool : Callable<Int> {
323329

324330
/**
 * Settings for an annotation tool that is run through `docker run`.
 *
 * @property image Docker image name placed in the generated `docker run` command.
 * @property defaultModel model used when the option value names no model.
 * @property defaultArgs arguments used by default; custom arguments given on the
 *   command line are appended to them when they are not blank.
 */
data class DockerTaggerConfig(
    val image: String,
    val defaultModel: String,
    val defaultArgs: String,
)
325331
private val dockerTaggers = mapOf(
326-
"treetagger" to DockerTaggerConfig("korap/conllu-treetagger", "german", "-p")
332+
"treetagger" to DockerTaggerConfig("korap/conllu-treetagger", "german", "-p"),
333+
"spacy" to DockerTaggerConfig("korap/conllu-spacy", "de_core_news_lg", "")
327334
)
328335

329336
private val defaultParserModels = mapOf(
330337
"malt" to "german.mco",
331-
"corenlp" to "germanSR.ser.gz"
338+
"corenlp" to "germanSR.ser.gz",
339+
"spacy" to "de_core_news_lg"
332340
)
333341

334342
// Calculate optimal thread count based on format, memory, and input characteristics
@@ -467,7 +475,7 @@ class KorapXmlTool : Callable<Int> {
467475
names = ["-T", "--tag-with"],
468476
paramLabel = "TAGGER[:MODEL]",
469477
description = ["Specify a tagger and optionally a model: ${taggerFoundries}[:<path/to/model>].",
470-
"If model is omitted, defaults are: marmot→de.marmot, opennlp→de-pos-maxent.bin, corenlp→german-fast.tagger"]
478+
"If model is omitted, defaults are: marmot→de.marmot, opennlp→de-pos-maxent.bin, corenlp→german-fast.tagger, treetagger→german, spacy→de_core_news_lg"]
471479
)
472480
fun setTagWith(tagWith: String) {
473481
// Pattern now makes the model part optional
@@ -518,7 +526,16 @@ class KorapXmlTool : Callable<Int> {
518526
// The user request said: "docker run -v $KORAPXMLTOOL_MODELS_PATH:/local/models ..."
519527
// AnnotationWorkerPool uses /bin/sh -c, so environment variables should be expanded by the shell.
520528

521-
annotateWith = "docker run -v \${KORAPXMLTOOL_MODELS_PATH:-.}:/local/models --rm -i ${config.image} $args -l $model"
529+
// Handle different Docker command formats
530+
if (taggerName == "spacy") {
531+
// spaCy uses -m for model and -d to disable dependencies (tagging only)
532+
annotateWith = "docker run -v \${KORAPXMLTOOL_MODELS_PATH:-.}:/local/models --rm -i ${config.image} -m $model -d"
533+
} else if (taggerName == "treetagger") {
534+
// TreeTagger uses -l for language/model and -p in args
535+
annotateWith = "docker run -v \${KORAPXMLTOOL_MODELS_PATH:-.}:/local/models --rm -i ${config.image} $args -l $model"
536+
} else {
537+
annotateWith = "docker run -v \${KORAPXMLTOOL_MODELS_PATH:-.}:/local/models --rm -i ${config.image} $args $model"
538+
}
522539
dockerLogMessage = "Configured Docker tagger '$taggerName' with command: $annotateWith"
523540

524541
} else {
@@ -558,7 +575,7 @@ class KorapXmlTool : Callable<Int> {
558575
names = ["-P", "--parse-with"],
559576
paramLabel = "PARSER[:MODEL]",
560577
description = ["Specify a parser and optionally a model: ${parserFoundries}[:<path/to/model>].",
561-
"If model is omitted, defaults are: malt→german.mco, corenlp→germanSR.ser.gz"]
578+
"If model is omitted, defaults are: malt→german.mco, corenlp→germanSR.ser.gz, spacy→de_core_news_lg"]
562579
)
563580
fun setParseWith(parseWith: String) {
564581
// Pattern now makes the model part optional
@@ -570,31 +587,68 @@ class KorapXmlTool : Callable<Int> {
570587
"value does not match the expected pattern ${parserFoundries}[:<path/to/model>]", parseWith))
571588
} else {
572589
parserName = matcher.group(1)
573-
val originalModelPath = matcher.group(2) ?: defaultParserModels[parserName]
574-
575-
if (originalModelPath == null) {
576-
throw ParameterException(spec.commandLine(),
577-
String.format(Locale.ROOT, "No default model available for parser '%s'", parserName))
578-
}
579-
580-
val resolvedModelPath = resolveModelPath(originalModelPath)
581590

582-
if (resolvedModelPath != null) {
583-
parserModel = resolvedModelPath
584-
if (resolvedModelPath != originalModelPath) {
585-
// Store for logging after logger initialization
586-
modelPathResolutions.add(originalModelPath to resolvedModelPath)
591+
// Handle Docker parsers (like spaCy)
592+
if (dockerTaggers.containsKey(parserName)) {
593+
val config = dockerTaggers[parserName]!!
594+
val modelPart = matcher.group(2)
595+
596+
var model = config.defaultModel
597+
var args = config.defaultArgs
598+
599+
if (modelPart != null) {
600+
val parts = modelPart.split(":", limit = 2)
601+
if (parts.isNotEmpty() && parts[0].isNotBlank()) {
602+
model = parts[0]
603+
}
604+
if (parts.size > 1) {
605+
val customArgs = parts[1]
606+
args = if (config.defaultArgs.isNotBlank()) {
607+
"${config.defaultArgs} $customArgs"
608+
} else {
609+
customArgs
610+
}
611+
}
587612
}
613+
614+
parserModel = model // For logging
615+
616+
// For spaCy parsing, do NOT add -d flag (parsing is enabled by default)
617+
if (parserName == "spacy") {
618+
// spaCy uses -m for model, no -d flag for parsing mode
619+
annotateWith = "docker run -v \${KORAPXMLTOOL_MODELS_PATH:-.}:/local/models --rm -i ${config.image} -m $model"
620+
} else {
621+
annotateWith = "docker run -v \${KORAPXMLTOOL_MODELS_PATH:-.}:/local/models --rm -i ${config.image} $args $model"
622+
}
623+
dockerLogMessage = "Configured Docker parser '$parserName' with command: $annotateWith"
624+
588625
} else {
589-
val defaultModelsPath = System.getenv("KORAPXMLTOOL_MODELS_PATH")
590-
val searchInfo = if (defaultModelsPath != null) {
591-
" (searched in current directory and KORAPXMLTOOL_MODELS_PATH='$defaultModelsPath')"
626+
val originalModelPath = matcher.group(2) ?: defaultParserModels[parserName]
627+
628+
if (originalModelPath == null) {
629+
throw ParameterException(spec.commandLine(),
630+
String.format(Locale.ROOT, "No default model available for parser '%s'", parserName))
631+
}
632+
633+
val resolvedModelPath = resolveModelPath(originalModelPath)
634+
635+
if (resolvedModelPath != null) {
636+
parserModel = resolvedModelPath
637+
if (resolvedModelPath != originalModelPath) {
638+
// Store for logging after logger initialization
639+
modelPathResolutions.add(originalModelPath to resolvedModelPath)
640+
}
592641
} else {
593-
" (searched in current directory; KORAPXMLTOOL_MODELS_PATH defaults to ../lib/models relative to executable)"
642+
val defaultModelsPath = System.getenv("KORAPXMLTOOL_MODELS_PATH")
643+
val searchInfo = if (defaultModelsPath != null) {
644+
" (searched in current directory and KORAPXMLTOOL_MODELS_PATH='$defaultModelsPath')"
645+
} else {
646+
" (searched in current directory; KORAPXMLTOOL_MODELS_PATH defaults to ../lib/models relative to executable)"
647+
}
648+
throw ParameterException(spec.commandLine(),
649+
String.format(Locale.ROOT, "Invalid value for option '--parse-with': "+
650+
"model file '%s' does not exist%s", originalModelPath, searchInfo))
594651
}
595-
throw ParameterException(spec.commandLine(),
596-
String.format(Locale.ROOT, "Invalid value for option '--parse-with': "+
597-
"model file '%s' does not exist%s", originalModelPath, searchInfo))
598652
}
599653
}
600654
}

app/src/test/kotlin/de/ids_mannheim/korapxmltools/DockerTaggerTest.kt

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,56 @@ package de.ids_mannheim.korapxmltools
22

33
import org.junit.Test
44
import kotlin.test.assertTrue
5+
import kotlin.test.assertFalse
56

67
class DockerTaggerTest {
78

9+
/**
 * `-T spacy` must configure the spaCy Docker image with the default model and
 * the `-d` flag, which switches dependency parsing off (tagging only).
 */
@Test
fun testSpacyTaggerConfiguration() {
    val tool = KorapXmlTool()

    // Test spaCy tagging (should add -d flag to disable parsing)
    tool.setTagWith("spacy")

    // The generated command is read reflectively from the `annotateWith` field.
    val annotateWithField = KorapXmlTool::class.java.getDeclaredField("annotateWith")
    annotateWithField.isAccessible = true
    val annotateWith = annotateWithField.get(tool) as String

    // Compare whole command words instead of raw substrings: a plain
    // contains("-d") would also be satisfied by any image, model or path that
    // merely contains "-d", letting the test pass without the flag being set.
    val commandWords = annotateWith.trim().split(Regex("\\s+"))

    assertTrue("-d" in commandWords, "spaCy tagger should contain -d flag to disable parsing. Output: $annotateWith")
    // The model must directly follow the -m option.
    assertTrue(
        ("-m" to "de_core_news_lg") in commandWords.zipWithNext(),
        "Should contain default model. Output: $annotateWith"
    )
    assertTrue(annotateWith.contains("korap/conllu-spacy"), "Should use correct Docker image")
}
824

25+
/**
 * `-P spacy` must configure the spaCy Docker image with the default model and
 * must NOT pass `-d`, so that dependency parsing stays enabled.
 */
@Test
fun testSpacyParserConfiguration() {
    val tool = KorapXmlTool()

    // Test spaCy parsing (should NOT add -d flag)
    tool.setParseWith("spacy")

    // The generated command is read reflectively from the `annotateWith` field.
    val annotateWithField = KorapXmlTool::class.java.getDeclaredField("annotateWith")
    annotateWithField.isAccessible = true
    val annotateWith = annotateWithField.get(tool) as String

    // Check for the flag as a standalone command word. A raw contains("-d")
    // would fail spuriously as soon as an image, model or path happens to
    // contain the characters "-d".
    val commandWords = annotateWith.trim().split(Regex("\\s+"))

    assertFalse("-d" in commandWords, "spaCy parser should NOT contain -d flag. Output: $annotateWith")
    // The model must directly follow the -m option.
    assertTrue(
        ("-m" to "de_core_news_lg") in commandWords.zipWithNext(),
        "Should contain default model. Output: $annotateWith"
    )
    assertTrue(annotateWith.contains("korap/conllu-spacy"), "Should use correct Docker image")
}
40+
41+
/**
 * `-T spacy:<model>` must pass the user-supplied model to `-m` and still add
 * the `-d` flag used for tagging-only mode.
 */
@Test
fun testSpacyCustomModel() {
    val tool = KorapXmlTool()

    // Test spaCy with custom model
    tool.setTagWith("spacy:de_core_news_sm")

    // The generated command is read reflectively from the `annotateWith` field.
    val annotateWithField = KorapXmlTool::class.java.getDeclaredField("annotateWith")
    annotateWithField.isAccessible = true
    val annotateWith = annotateWithField.get(tool) as String

    // Work on whole command words: substring checks would accept a longer
    // model name that merely starts with the expected one, and "-d" could be
    // matched inside an unrelated word.
    val commandWords = annotateWith.trim().split(Regex("\\s+"))

    assertTrue(
        ("-m" to "de_core_news_sm") in commandWords.zipWithNext(),
        "Should contain custom model. Output: $annotateWith"
    )
    assertTrue("-d" in commandWords, "Should still contain -d flag for tagging. Output: $annotateWith")
}
955

1056
@Test
1157
fun testTreeTaggerArgumentAppending() {

0 commit comments

Comments
 (0)