Commit a2db4c1

Fixed thousands separators not being handled consistently
Thanks @notesjor
Resolves #135
Change-Id: I2d1be0329af6729bdea51431cfcbf24b6dcbc3db
1 parent e030297 commit a2db4c1

3 files changed: +61 −1 lines changed


CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -12,6 +12,9 @@
   - Short forms for determiners, adjectives, pronouns: `eine(n)`, `gute:r`, `ihm/r`, `diese(r)`, `ein(e)`
 * Added `de_old` German tokenizer variant without gender-sensitive rules
   (use `-l de_old` to split forms like `Nutzer:in` into separate tokens)
+* Fixed thousands separators not being handled consistently (issue #135):
+  - Apostrophe `'` (Swiss format: `1'000'000`)
+  - Thin space U+2009 and narrow no-break space U+202F

 ## 2.3.1 [2026-01-28]
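
As a quick illustration of the fixed behaviour, here is a minimal usage sketch mirroring the regression tests added below. The tokenize(String) call returning a String[] is taken from TokenizerTest.java in this commit; the demo class name is made up, and the import path is inferred from the file layout.

import de.ids_mannheim.korap.tokenizer.DerekoDfaTokenizer_de;

// Hypothetical demo class; the tokenizer call mirrors the tests in this commit.
public class ThousandsSepDemo {
    public static void main(String[] args) {
        DerekoDfaTokenizer_de tok = new DerekoDfaTokenizer_de();
        String[] tokens = tok.tokenize("Das kostet 1'000'000 Franken");
        // With this fix, the Swiss-formatted number stays a single token:
        // Das | kostet | 1'000'000 | Franken
        for (String t : tokens) {
            System.out.println(t);
        }
    }
}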

src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex

Lines changed: 7 additions & 1 deletion
@@ -663,12 +663,17 @@ EMAIL = {EMAILlocalPart} ("@"|"["at"]") ({EMAILbracketedHost})
 
 // floating point, serial, model numbers, ip addresses, etc.
 // every other segment must have at least one digit
+// THOUSANDS_SEP for thousands separators: apostrophe (Swiss) and thin/narrow no-break space (issue #135)
+THOUSANDS_SEP = ("'"|"’"|[\u2009\u202F])
+
 NUM = ({ALPHANUM} {P} {HAS_DIGIT}
       | {HAS_DIGIT} {P} {ALPHANUM}
       | {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
       | {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
       | {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
-      | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)
+      | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
+      | {HAS_DIGIT} ({THOUSANDS_SEP} {HAS_DIGIT})+
+      | {HAS_DIGIT} ({THOUSANDS_SEP} {HAS_DIGIT})+ {P} {HAS_DIGIT})
 
 
 /* floating point literals */

@@ -682,6 +687,7 @@ Exponent = [eE] [+-]? [0-9]+
 // punctuation
 P = ("_"|"-"|"."|",")|{SLASH}
 
+
 Q = [\'`]
 
 PUNCT = ({P}|{Q}|[?!@#$%\^&*_:;\]\[\"»«\202\204\206\207\213\221\222\223\224\225\226\227\233])
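
To make the two new NUM alternatives concrete, the stand-alone sketch below approximates them with java.util.regex. It is not the generated tokenizer code: {HAS_DIGIT} is simplified to a plain digit run, {P} to the decimal punctuation "." or ",", and all names in the snippet are made up for illustration.

import java.util.regex.Pattern;

public class ThousandsSepSketch {
    // Rough equivalent of THOUSANDS_SEP: straight/typographic apostrophe,
    // thin space U+2009, narrow no-break space U+202F.
    private static final String SEP = "['\u2019\u2009\u202F]";

    // Approximates the two added alternatives
    //   {HAS_DIGIT} ({THOUSANDS_SEP} {HAS_DIGIT})+
    //   {HAS_DIGIT} ({THOUSANDS_SEP} {HAS_DIGIT})+ {P} {HAS_DIGIT}
    // with {HAS_DIGIT} -> \d+ and {P} -> [.,].
    private static final Pattern GROUPED_NUM =
            Pattern.compile("\\d+(?:" + SEP + "\\d+)+(?:[.,]\\d+)?");

    public static void main(String[] args) {
        String[] samples = {"1'000'000", "1'234'567.89", "1\u2009000\u2009000,50", "1000000"};
        for (String s : samples) {
            // The last sample has no separator, so it is left to the other NUM rules.
            System.out.println(s + " -> " + GROUPED_NUM.matcher(s).matches());
        }
    }
}

Running the sketch prints true for the three separated forms and false for the plain "1000000", which reflects the intent of the grammar change: digit groups joined by a thousands separator are kept together as one NUM token, optionally followed by a {P}-separated trailing segment (approximated here as a decimal part).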

src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java

Lines changed: 51 additions & 0 deletions
@@ -1215,4 +1215,55 @@ public void testEmoticonNotMatchBeforeLetter() {
         assertEquals("!", tokens[2]);
         assertEquals(3, tokens.length);
     }
+
+    // Regression test for https://github.com/KorAP/KorAP-Tokenizer/issues/135
+    @Test
+    public void testTokenizerThousandsSeparators() {
+        DerekoDfaTokenizer_de tok = new DerekoDfaTokenizer_de();
+
+        // Swiss apostrophe format (straight apostrophe)
+        String[] tokens = tok.tokenize("Das kostet 1'000'000 Franken");
+        assertEquals("Das", tokens[0]);
+        assertEquals("kostet", tokens[1]);
+        assertEquals("1'000'000", tokens[2]);
+        assertEquals("Franken", tokens[3]);
+        assertEquals(4, tokens.length);
+
+        // Swiss apostrophe format (curly apostrophe)
+        tokens = tok.tokenize("Der Preis ist 1’234’567 CHF");
+        assertEquals("Der", tokens[0]);
+        assertEquals("Preis", tokens[1]);
+        assertEquals("ist", tokens[2]);
+        assertEquals("1’234’567", tokens[3]);
+        assertEquals("CHF", tokens[4]);
+        assertEquals(5, tokens.length);
+
+        // Swiss format with decimal
+        tokens = tok.tokenize("Betrag: 1'234'567.89");
+        assertEquals("Betrag", tokens[0]);
+        assertEquals(":", tokens[1]);
+        assertEquals("1'234'567.89", tokens[2]);
+        assertEquals(3, tokens.length);
+
+        // Thin space format (U+2009)
+        tokens = tok.tokenize("Population: 1\u2009000\u2009000");
+        assertEquals("Population", tokens[0]);
+        assertEquals(":", tokens[1]);
+        assertEquals("1\u2009000\u2009000", tokens[2]);
+        assertEquals(3, tokens.length);
+
+        // Narrow no-break space format (U+202F)
+        tokens = tok.tokenize("Value: 1\u202F234\u202F567");
+        assertEquals("Value", tokens[0]);
+        assertEquals(":", tokens[1]);
+        assertEquals("1\u202F234\u202F567", tokens[2]);
+        assertEquals(3, tokens.length);
+
+        // Thin space with decimal
+        tokens = tok.tokenize("Result: 1\u2009000\u2009000,50");
+        assertEquals("Result", tokens[0]);
+        assertEquals(":", tokens[1]);
+        assertEquals("1\u2009000\u2009000,50", tokens[2]);
+        assertEquals(3, tokens.length);
+    }
 }
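
Assuming the project's standard Maven setup, the new regression test can be run on its own via the Surefire test filter:

mvn test -Dtest=TokenizerTest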
