Skip to content

Commit b428755

Browse files
committed
Support punctuation after quotes
Change-Id: I33522a817f1e34f732d4d7131412e6d0e28e90a6
1 parent df27581 commit b428755

File tree

7 files changed

+12
-8
lines changed

7 files changed

+12
-8
lines changed

Changes

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
1-
0.1.4 2022-03-11
1+
0.1.4 2022-03-27
22
- Improved handling of ellipsis.
3+
- Make algorithm more robust to never fail.
4+
- Remove match option.
35

46
0.1.3 2022-03-08
57
- Introduced refined handling of sentences including speech.

datok_test.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -932,11 +932,12 @@ func TestDoubleArrayFullTokenizerSentenceSplitterBug1(t *testing.T) {
932932
w.Reset()
933933
assert.True(dat.Transduce(strings.NewReader(text), w))
934934
sentences = strings.Split(w.String(), "\n\n")
935-
assert.Equal(len(sentences), 5)
935+
assert.Equal(len(sentences), 6)
936936
assert.Equal("Wüllersdorf\nwar\naufgestanden\n.", sentences[0])
937937
assert.Equal(\nIch\nfinde\nes\nfurchtbar\n,\ndaß\nSie\nrecht\nhaben\n,\naber\nSie\nhaben\nrecht\n.", sentences[1])
938-
assert.Equal("Ich\nquäle\nSie\nnicht\nlänger\nmit\nmeinem\n'\nMuß\nes\nsein\n?\n'\n.\n \nDie\nWelt\nist\neinmal\n,\nwie\nsie\nist\n,\nund\ndie\nDinge\nverlaufen\nnicht\n,\nwie\nwir\nwollen\n,\nsondern\nwie\ndie\nandern\nwollen\n.", sentences[2])
939-
assert.Equal("Das\nmit\ndem\n'\nGottesgericht\n'\n,\nwie\nmanche\nhochtrabend\nversichern\n,\nist\nfreilich\nein\nUnsinn\n,\nnichts\ndavon\n,\numgekehrt\n,\nunser\nEhrenkultus\nist\nein\nGötzendienst\n,\naber\nwir\nmüssen\nuns\nihm\nunterwerfen\n,\nsolange\nder\nGötze\ngilt\n.\n«", sentences[3])
938+
assert.Equal("Ich\nquäle\nSie\nnicht\nlänger\nmit\nmeinem\n'\nMuß\nes\nsein\n?\n'\n.", sentences[2])
939+
assert.Equal("Die\nWelt\nist\neinmal\n,\nwie\nsie\nist\n,\nund\ndie\nDinge\nverlaufen\nnicht\n,\nwie\nwir\nwollen\n,\nsondern\nwie\ndie\nandern\nwollen\n.", sentences[3])
940+
assert.Equal("Das\nmit\ndem\n'\nGottesgericht\n'\n,\nwie\nmanche\nhochtrabend\nversichern\n,\nist\nfreilich\nein\nUnsinn\n,\nnichts\ndavon\n,\numgekehrt\n,\nunser\nEhrenkultus\nist\nein\nGötzendienst\n,\naber\nwir\nmüssen\nuns\nihm\nunterwerfen\n,\nsolange\nder\nGötze\ngilt\n.\n«", sentences[4])
940941
}
941942

942943
func TestDoubleArrayLoadFactor1(t *testing.T) {

matrix_test.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -394,11 +394,12 @@ func TestMatrixFullTokenizerMatrixSentenceSplitterBug1(t *testing.T) {
394394
w.Reset()
395395
assert.True(mat.Transduce(strings.NewReader(text), w))
396396
sentences = strings.Split(w.String(), "\n\n")
397-
assert.Equal(len(sentences), 5)
397+
assert.Equal(len(sentences), 6)
398398
assert.Equal("Wüllersdorf\nwar\naufgestanden\n.", sentences[0])
399399
assert.Equal(\nIch\nfinde\nes\nfurchtbar\n,\ndaß\nSie\nrecht\nhaben\n,\naber\nSie\nhaben\nrecht\n.", sentences[1])
400-
assert.Equal("Ich\nquäle\nSie\nnicht\nlänger\nmit\nmeinem\n'\nMuß\nes\nsein\n?\n'\n.\n \nDie\nWelt\nist\neinmal\n,\nwie\nsie\nist\n,\nund\ndie\nDinge\nverlaufen\nnicht\n,\nwie\nwir\nwollen\n,\nsondern\nwie\ndie\nandern\nwollen\n.", sentences[2])
401-
assert.Equal("Das\nmit\ndem\n'\nGottesgericht\n'\n,\nwie\nmanche\nhochtrabend\nversichern\n,\nist\nfreilich\nein\nUnsinn\n,\nnichts\ndavon\n,\numgekehrt\n,\nunser\nEhrenkultus\nist\nein\nGötzendienst\n,\naber\nwir\nmüssen\nuns\nihm\nunterwerfen\n,\nsolange\nder\nGötze\ngilt\n.\n«", sentences[3])
400+
assert.Equal("Ich\nquäle\nSie\nnicht\nlänger\nmit\nmeinem\n'\nMuß\nes\nsein\n?\n'\n.", sentences[2])
401+
assert.Equal("Die\nWelt\nist\neinmal\n,\nwie\nsie\nist\n,\nund\ndie\nDinge\nverlaufen\nnicht\n,\nwie\nwir\nwollen\n,\nsondern\nwie\ndie\nandern\nwollen\n.", sentences[3])
402+
assert.Equal("Das\nmit\ndem\n'\nGottesgericht\n'\n,\nwie\nmanche\nhochtrabend\nversichern\n,\nist\nfreilich\nein\nUnsinn\n,\nnichts\ndavon\n,\numgekehrt\n,\nunser\nEhrenkultus\nist\nein\nGötzendienst\n,\naber\nwir\nmüssen\nuns\nihm\nunterwerfen\n,\nsolange\nder\nGötze\ngilt\n.\n«", sentences[4])
402403
}
403404

404405
func TestMatrixFullTokenizerTokenSplitter(t *testing.T) {

src/tokenizer.xfst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,7 @@ define DQuotes ["”"|%"|"»"|"«"];
225225
define NotSmallCaps [? - a - b - c - d - e - f - g - h - i - j - k - l - m - n - o - p - q - r - s - t - u - v - w - x - y - z - ü - ö - ä];
226226

227227
read regex Token .o. [
228-
SP NLout [DQuotes | "›" (NLout DQuotes)| %‹ (NLout DQuotes)| %’ (NLout DQuotes)| "'" (NLout DQuotes)] @-> ... NLout \/ _ NLout \%,
228+
SP NLout [DQuotes | "›" (NLout DQuotes) | %‹ (NLout DQuotes) | %’ (NLout DQuotes) | "'" (NLout DQuotes)] (NLout SP) @-> ... NLout \/ _ NLout \%,
229229
] .o. [
230230
SP @-> ... NLout \/ NLout _ NLout [? - "”" - %" - "»" - "«" - "›" - %‹ - %’ - "'" - NLout]
231231
] .o. [

testdata/tokenizer.datok

-7.72 KB
Binary file not shown.

testdata/tokenizer.fst

1 Byte
Binary file not shown.

testdata/tokenizer.matok

757 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)