Skip to content

Commit f94b9ce

Browse files
committed
Check parentheses at the end of sentences
Change-Id: Ifa051a59b8a7de88e031a850d11ca95432a0b32e
1 parent b428755 commit f94b9ce

File tree

2 files changed

+9
-2
lines changed

2 files changed

+9
-2
lines changed

matrix_test.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -376,6 +376,13 @@ Innstetten!`
376376
assert.Equal(\nNun\n,\ngib\ndich\nzufrieden\n,\nich\nfange\nschon\nan\n...", sentences[0])
377377
assert.Equal("Also\nBaron\nInnstetten\n!", sentences[1])
378378

379+
// Check parentheses at the end of sentences.
380+
w.Reset()
381+
assert.True(mat.Transduce(strings.NewReader("(Er ging.) Und kam (später)."), w))
382+
sentences = strings.Split(w.String(), "\n\n")
383+
assert.Equal(len(sentences), 3)
384+
assert.Equal("(\nEr\nging\n.\n)", sentences[0])
385+
assert.Equal("Und\nkam\n(\nspäter\n)\n.", sentences[1])
379386
}
380387

381388
func TestMatrixFullTokenizerMatrixSentenceSplitterBug1(t *testing.T) {

src/tokenizer.xfst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -225,9 +225,9 @@ define DQuotes ["”"|%"|"»"|"«"];
225225
! NotSmallCaps: any single symbol except the lowercase letters a-z and the
! lowercase umlauts ü, ö, ä. Other lowercase characters (e.g. ß or accented
! letters) are not subtracted here and therefore still match.
! NOTE(review): presumably used to detect that the following token does not
! start in lowercase (i.e. a likely sentence start) - confirm against the rules
! that reference it.
define NotSmallCaps [? - a - b - c - d - e - f - g - h - i - j - k - l - m - n - o - p - q - r - s - t - u - v - w - x - y - z - ü - ö - ä];
226226

227227
read regex Token .o. [
228-
SP NLout [DQuotes | "›" (NLout DQuotes) | %‹ (NLout DQuotes) | %’ (NLout DQuotes) | "'" (NLout DQuotes)] (NLout SP) @-> ... NLout \/ _ NLout \%,
228+
SP NLout [DQuotes | "›" (NLout DQuotes) | %‹ (NLout DQuotes) | %’ (NLout DQuotes) | "'" (NLout DQuotes) | ")" ] (NLout SP) @-> ... NLout \/ _ NLout \%,
229229
] .o. [
230-
SP @-> ... NLout \/ NLout _ NLout [? - "”" - %" - "»" - "«" - "›" - %‹ - %’ - "'" - NLout]
230+
SP @-> ... NLout \/ NLout _ NLout [? - "”" - %" - "»" - "«" - "›" - %‹ - %’ - "'" - ")" - NLout]
231231
] .o. [
232232
[%. %. %.] @-> ... NLout \/ _ NLout WS+ NotSmallCaps
233233
] .o. [

0 commit comments

Comments
 (0)