Skip to content

Commit 4d59ee4

Browse files
committed
Fix emoticon matching before letters (e.g., Wikipedia:Diskussionen)
Added trailing context to the emoticon rule so that an emoticon such as ":D" only matches when it is NOT followed by a letter. This prevents false emoticon matches in patterns like "Wikipedia:Diskussionen", where the colon is a namespace separator. Before: "Wikipedia:Diskussionen" → "Wikipedia", ":D", "iskussionen". After: "Wikipedia:Diskussionen" → "Wikipedia", ":", "Diskussionen". Resolves #134. Change-Id: Ia9d6659e604eb514172e2182c94a206b5b45023f
1 parent 658e605 commit 4d59ee4

File tree

2 files changed

+21
-3
lines changed

2 files changed

+21
-3
lines changed

src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -910,7 +910,7 @@ d{Q} / ye {return currentT
910910
([.][.]+|+) {return currentToken("...");}
911911
{LONG_END_PUNCT} { return currentToken();}
912912
{PUNCT} { return currentToken();}
913-
{EMOTICON} { return currentToken();}
913+
{EMOTICON} / [^[:letter:]] { return currentToken();}
914914
{DASH}{DoubleLiteral} { return currentToken();}
915915
{EMOJI_COMPLEX} { return currentToken();}
916916
<<EOF>> { fileEnd(); return null;}

src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1195,6 +1195,24 @@ public void testVerdiAbbreviation() {
11951195
assertEquals(".", tokens[6]);
11961196
assertEquals(7, tokens.length);
11971197
}
1198-
}
1199-
12001198

1199+
// Regression test for emoticon not matching before letters
1200+
// Wikipedia:Diskussionen should NOT tokenize :D as an emoticon
1201+
@Test
1202+
public void testEmoticonNotMatchBeforeLetter() {
1203+
DerekoDfaTokenizer_de tok = new DerekoDfaTokenizer_de();
1204+
1205+
String[] tokens = tok.tokenize("Wikipedia:Diskussionen");
1206+
assertEquals("Wikipedia", tokens[0]);
1207+
assertEquals(":", tokens[1]);
1208+
assertEquals("Diskussionen", tokens[2]);
1209+
assertEquals(3, tokens.length);
1210+
1211+
// But emoticons followed by space/punct should still work
1212+
tokens = tok.tokenize("Great :D!");
1213+
assertEquals("Great", tokens[0]);
1214+
assertEquals(":D", tokens[1]);
1215+
assertEquals("!", tokens[2]);
1216+
assertEquals(3, tokens.length);
1217+
}
1218+
}

0 commit comments

Comments
 (0)