Skip to content

Commit 0633dfc

Browse files
committed
Normalize question/exclamation marks
Fix #34
1 parent 47123a4 commit 0633dfc

File tree

5 files changed

+74
-17
lines changed

5 files changed

+74
-17
lines changed

.vscode/settings.json

+5-1
Original file line numberDiff line numberDiff line change
@@ -24,5 +24,9 @@
2424
"deno.enable": false,
2525
"deno.enablePaths": [
2626
"scripts/deno/"
27-
]
27+
],
28+
29+
"haskell.serverEnvironment": {
30+
"STACK_YAML": "stack-ghc-9.4.yaml"
31+
}
2832
}

CHANGES.md

+10
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,21 @@ Version 0.5.0
66

77
To be released.
88

9+
- Stops normalizer now normalizes question marks and exclamation marks too.
10+
The following additions were made. [[#34]]
11+
12+
Haskell API-wise, the below record fields were added to
13+
the `Text.Seonbi.Punctuation.Stops` data type:
14+
15+
- `questionMark` field
16+
- `exclamationMark` field
17+
918
- Added prebuilt executable binaries for Linux ARM64 (linux-arm64).
1019

1120
- The official Docker images are now multi-platform (linux/amd64 and
1221
linux/arm64). [[#35]]
1322

23+
[#34]: https://github.com/dahlia/seonbi/issues/34
1424
[#35]: https://github.com/dahlia/seonbi/issues/35
1525

1626

en.utf-8.add

+5
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,12 @@
11
deno
2+
guillemets
23
hanja
4+
inequal
5+
interpunct
6+
interpuncts
37
phoneticize
48
phoneticized
9+
punct
510
seonbi
611
Sino
712
stdict

src/Text/Seonbi/Punctuation.hs

+39-11
Original file line numberDiff line numberDiff line change
@@ -249,49 +249,51 @@ data TitlePunct
249249
deriving (Eq, Show)
250250

251251

252-
-- | A set of stops—'period', 'comma', and 'interpunct'—to be used by
253-
-- 'normalizeStops' function.
252+
-- | A set of stops—'period', 'comma', 'interpunct', 'questionMark', and
253+
-- 'exclamationMark'—to be used by 'normalizeStops' function.
254254
--
255255
-- There are three presets: 'horizontalStops', 'verticalStops', and
256256
-- 'horizontalStopsWithSlashes'.
257257
data Stops = Stops
258258
{ period :: Text
259259
, comma :: Text
260260
, interpunct :: Text
261+
, questionMark :: Text
262+
, exclamationMark :: Text
261263
} deriving (Eq, Show)
262264

263265
-- | Stop sentences in the modern Korean style which follows Western stops.
264266
-- E.g.:
265267
--
266-
-- > 봄·여름·가을·겨울. 어제, 오늘.
268+
-- > 봄·여름·가을·겨울. 어제, 오늘. 새벽? 아침!
267269
horizontalStops :: Stops
268270
horizontalStops = Stops
269271
{ period = ". "
270272
, comma = ", "
271273
, interpunct = "·"
274+
, questionMark = "? "
275+
, exclamationMark = "! "
272276
}
273277

274278
-- | Stop sentences in the pre-modern Korean style which follows Chinese stops.
275279
-- E.g.:
276280
--
277-
-- > 봄·여름·가을·겨울。어제、오늘。
281+
-- > 봄·여름·가을·겨울。어제、오늘。새벽？아침！
278282
verticalStops :: Stops
279283
verticalStops = Stops
280284
{ period = "。"
281285
, comma = "、"
282286
, interpunct = "·"
287+
, questionMark = "？"
288+
, exclamationMark = "！"
283289
}
284290

285291
-- | Similar to 'horizontalStops' except slashes are used instead of
286292
-- interpuncts. E.g.:
287293
--
288-
-- > 봄/여름/가을/겨울. 어제, 오늘.
294+
-- > 봄/여름/가을/겨울. 어제, 오늘. 새벽? 아침!
289295
horizontalStopsWithSlashes :: Stops
290-
horizontalStopsWithSlashes = Stops
291-
{ period = ". "
292-
, comma = ", "
293-
, interpunct = "/"
294-
}
296+
horizontalStopsWithSlashes = horizontalStops { interpunct = "/" }
295297

296298

297299
-- | Normalizes sentence stops (periods, commas, interpuncts, question marks,
-- and exclamation marks).
@@ -332,6 +334,12 @@ normalizeStops stops input = (`fmap` annotatedEntities) $ \ case
332334
, do { ending <- interpunct'
333335
; return (toEntity $ adjustEnding ending $ interpunct stops)
334336
}
337+
, do { ending <- questionMark'
338+
; return (toEntity $ adjustEnding ending $ questionMark stops)
339+
}
340+
, do { ending <- exclamationMark'
341+
; return (toEntity $ adjustEnding ending $ exclamationMark stops)
342+
}
335343
]
336344
adjustEnding :: Ending -> Text -> Text
337345
adjustEnding ending text
@@ -377,6 +385,26 @@ normalizeStops stops input = (`fmap` annotatedEntities) $ \ case
377385
, string "&#183;"
378386
, asciiCI "&#xb7;"
379387
] >> return Ending
388+
questionMark' :: Parser Ending
389+
questionMark' = choice
390+
[ char '?' >> boundary
391+
, char '？' >> trailingSpaces
392+
, string "&quest;" >> boundary
393+
, string "&#63;" >> boundary
394+
, asciiCI "&#x3f;" >> boundary
395+
, string "&#65311;" >> trailingSpaces
396+
, asciiCI "&#xff1f;" >> trailingSpaces
397+
]
398+
exclamationMark' :: Parser Ending
399+
exclamationMark' = choice
400+
[ char '!' >> boundary
401+
, char '！' >> trailingSpaces
402+
, string "&excl;" >> boundary
403+
, string "&#33;" >> boundary
404+
, asciiCI "&#x21;" >> boundary
405+
, string "&#65281;" >> trailingSpaces
406+
, asciiCI "&#xff01;" >> trailingSpaces
407+
]
380408
closingChars :: String
381409
closingChars =
382410
[ '"', '', '\'', '', ')', ']', '}', '', '', '', '', '', ''
@@ -419,7 +447,7 @@ normalizeStops stops input = (`fmap` annotatedEntities) $ \ case
419447
]
420448

421449

422-
data Ending = TrailingChars Text | TrailingSpaces Text | Ending
450+
data Ending = TrailingChars Text | TrailingSpaces Text | Ending deriving (Show)
423451

424452

425453
-- | Substitution options for 'transformArrow' function. These options can

test/Text/Seonbi/PunctuationSpec.hs

+15-5
Original file line numberDiff line numberDiff line change
@@ -475,15 +475,25 @@ spec = do
475475
[ "·", "&middot;", "&centerdot;", "&CenterDot;"
476476
, "&#xB7;", "&#xb7;", "&#183;"
477477
] :: [Text]
478+
let questionMarks =
479+
[ "? ", "&quest; ", "&#63; ", "&#x3f; "
480+
, "？", "&#65311;", "&#xFF1F;"
481+
] :: [Text]
482+
let exclamationMarks =
483+
[ "! ", "&excl; ", "&#33; ", "&#x21; "
484+
, "！", "&#65281;", "&#xFF01;"
485+
] :: [Text]
478486
let s = stripEnd
479487
let examples =
480-
[ [qc|봄{i1}여름{i2}가을{i3}겨울{p1}(括弧{s p3}) 어제{c}오늘{s p2}|]
488+
[ [qc|봄{i1}여름{i2}가을{i3}겨울{p1}(括弧{s p3}) 어제{c}오늘{p2}새벽{q}아침{s e}|]
481489
| p1 <- periods, p2 <- periods, p3 <- periods
482490
, c <- commas
483491
, i1 <- interpuncts, i2 <- interpuncts, i3 <- interpuncts
492+
, q <- questionMarks
493+
, e <- exclamationMarks
484494
] :: [Text]
485495
let gen = unsafePerformIO getStdGen :: StdGen
486-
let randomInts = randomRs (0, 499) gen :: [Int]
496+
let randomInts = randomRs (0, 999) gen :: [Int]
487497
let sampledExamples =
488498
[ e
489499
| (e, r) <- Prelude.zip examples randomInts
@@ -499,19 +509,19 @@ spec = do
499509
normalizeStops horizontalStops input `shouldBe`
500510
[ HtmlStartTag [] P ""
501511
, HtmlText [P]
502-
"봄&#xb7;여름&#xb7;가을&#xb7;겨울. (括弧.) 어제, 오늘."
512+
"봄&#xb7;여름&#xb7;가을&#xb7;겨울. (括弧.) 어제, 오늘. 새벽? 아침!"
503513
, HtmlEndTag [] P
504514
]
505515
normalizeStops verticalStops input `shouldBe`
506516
[ HtmlStartTag [] P ""
507517
, HtmlText [P]
508518
("봄&#xb7;여름&#xb7;가을&#xb7;겨울&#x3002;(括弧&#x3002;) " <>
509-
"어제&#x3001;오늘&#x3002;")
519+
"어제&#x3001;오늘&#x3002;새벽&#xff1f;아침&#xff01;")
510520
, HtmlEndTag [] P
511521
]
512522
normalizeStops horizontalStopsWithSlashes input `shouldBe`
513523
[ HtmlStartTag [] P ""
514-
, HtmlText [P] "봄/여름/가을/겨울. (括弧.) 어제, 오늘."
524+
, HtmlText [P] "봄/여름/가을/겨울. (括弧.) 어제, 오늘. 새벽? 아침!"
515525
, HtmlEndTag [] P
516526
]
517527
it "normalizes stops followed by boundaries as well" $ do

0 commit comments

Comments
 (0)