Skip to content

Commit cdeb57c

Browse files
authored
More punctuation normalization fixes for Chinese (#1107)
This change adds more flexibility to some of the regexes and adds normalization for parentheses. It also updates OpusCleaner, which now treats the OpusFilter regex filter as monolingual to avoid messing up columns or the English side.
1 parent e528bb3 commit cdeb57c

File tree

3 files changed

+36
-33
lines changed

3 files changed

+36
-33
lines changed

pipeline/clean/opuscleaner/configs/en-zh/default.filters.json

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,26 +28,50 @@
2828
"parameters": {
2929
"patterns": [
3030
[
31-
"([\\u3400-\\u4dbf\\u4e00-\\u9fff\\uf900-\\ufaff!\uff01\uff1f\\?])\\?",
31+
"([\\u3400-\\u4dbf\\u4e00-\\u9fff\\uf900-\\ufaff!\uff01\uff1f\\?])\\s*\\?",
3232
"\\1\uff1f",
3333
0,
3434
""
3535
],
3636
[
37-
"([\\u3400-\\u4dbf\\u4e00-\\u9fff\\uf900-\\ufaff!\uff01\uff1f\\?])\\!",
37+
"([\\u3400-\\u4dbf\\u4e00-\\u9fff\\uf900-\\ufaff!\uff01\uff1f\\?])\\s*\\!",
3838
"\\1\uff01",
3939
0,
4040
""
4141
],
4242
[
43-
"([\\u3400-\\u4dbf\\u4e00-\\u9fff\\uf900-\\ufaff])\\.\\s*(?!\\s*\\.)",
43+
"([\\u3400-\\u4dbf\\u4e00-\\u9fff\\uf900-\\ufaff])\\.\\s*(?!\\s*[\\.a-zA-Z])",
4444
"\\1\uff61",
4545
0,
4646
""
47+
],
48+
[
49+
"([\\u3400-\\u4dbf\\u4e00-\\u9fff\\uf900-\\ufaff])\\s*:",
50+
"\\1\uff1a",
51+
0,
52+
""
53+
],
54+
[
55+
"([\\u3400-\\u4dbf\\u4e00-\\u9fff\\uf900-\\ufaff])\\s*;",
56+
"\\1\uff1b",
57+
0,
58+
""
59+
],
60+
[
61+
"\\(",
62+
"\uff08",
63+
0,
64+
""
65+
],
66+
[
67+
"\\)",
68+
"\uff09",
69+
0,
70+
""
4771
]
4872
]
4973
},
50-
"language": null
74+
"language": "<trg>"
5175
},
5276
{
5377
"filter": "max_length",

pipeline/clean/requirements/clean.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
opuscleaner==0.4.3
1+
opuscleaner==0.5
22
fasttext==0.9.2
33
sacremoses==0.0.53
44
more_itertools==10.1.0

pipeline/clean/requirements/clean.txt

Lines changed: 7 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -347,73 +347,52 @@ lingua-language-detector==2.0.2 \
347347
--hash=sha256:14216ee3aeb0c9ab6a5665d71a1399653fe635ed66f208165ed67346feeb2a5c \
348348
--hash=sha256:15beab4230e16c38cc88e50548076943e960476b2681e9873d764770173c1d3d \
349349
--hash=sha256:177fd7d5073a96b885daa2059c55d19a306550fed7aadafcfd3037cd8ce44ae1 \
350-
--hash=sha256:183c335b8f73286bb67a8a1a780145335af7fd03cc4e4de926da792d1ed120fe \
351350
--hash=sha256:1bc34b9331fc2ded8a78610ac406fa0cb4772a43d6280ee7830cb507ff78a9d5 \
352351
--hash=sha256:1eeb4390c7b2b570013bbcbfb2292beda4b60e6c22631b27937160814fa38f8d \
353352
--hash=sha256:278fa16dcb6b595daae796606dade38f80bcbf630f4816489a3ce0d719a71214 \
354353
--hash=sha256:2c9588ed7e1dfbe06190d6946cd0a9c0f4f22018f09dc37bb6ac686bcfd67907 \
355-
--hash=sha256:3a9395f4030cf6eaaa7e432cb167fbaa577109c38114ab1dbdfdd75693b3048b \
356354
--hash=sha256:3d2e8a34e4d9830714f1de6728eec182a36d243f038d1b7b71a29cc63408ad2d \
357355
--hash=sha256:3e4a936dc42ce8c118afcba3ce5f964b9590c2ed84fb622b7b35c518e4ea1c90 \
358356
--hash=sha256:40629aeac21a3cbc6ea45b925c6adab66badd6c9fc1885285f2aa27658b86157 \
359357
--hash=sha256:436de99680cbe4418295f961d82958bed76c029ab10696a4c46ce0b8d33e369c \
360358
--hash=sha256:48203ec1fbd6be0b6af3888b9494d543b86f3cf8de6f9b1cd08867fa12cd673c \
361359
--hash=sha256:4e162f9aa34c4f78bc48a69b557b58f783e1ee1dd369e99ab7d2b14bdac3447f \
362-
--hash=sha256:51adc3ac5c39a1d245394e6f02a197896e380b908642f4a19bef7554029f6437 \
363360
--hash=sha256:5d39aa5ca1b2d51aee46c7c96fa7ad0463dd2471a1b9827019b71fa367c918be \
364-
--hash=sha256:5fcc06b49c65cf6083afc7b6bd6ea26a43da8107d849842e4647a19c0ce68f66 \
365361
--hash=sha256:652936f5e109784528f643062c704ad02572994cf05cfb7c609f96f0ae6259ed \
366362
--hash=sha256:72866175ff3d78b3d9244932ffbbb731471bca3758a2a825c60331ecdfb10851 \
367363
--hash=sha256:745befc3a1e4c9510d00ad34cac206b678944257fc8b5c1cd7b512310cd7fdbd \
368364
--hash=sha256:796ba624c026ef978819d124d0d47685f2aeeec5b92315827187126347e7f406 \
369365
--hash=sha256:7ca5e4643cbb229c4eaae198458a5fafdeb812576edb3a160a5c2e4951d3cd1c \
370366
--hash=sha256:801e5aec372f3f175838eaf9462f17028cf58924df737118845e3a9502a6d189 \
371-
--hash=sha256:80eaaf6db600303665ab13c582c7818894e28270fee1e1d6af2a36ccc4d5095a \
372-
--hash=sha256:815f3290ab251907e142b82969a1d251c1c03d9ec2c71c8fbe5dc87f46152590 \
373-
--hash=sha256:8286b031302e32ac7a81d4f4f0a379b8ce1031422424eba3b1c8d721b956c4cb \
374367
--hash=sha256:9012c74eea7d07c63c47fceaa3a6bc1e216954107b08beb421b64c717912be0d \
375368
--hash=sha256:972b76218a2d72095c372e8b592b6ba0295a47880387de2d7c9c38da64d76a10 \
376369
--hash=sha256:99e59214d7f9a7f11f812b416cabfccfcdb6bc52a0cf67aa59f4d984d5e1296f \
377-
--hash=sha256:9a256aadabf76a915910dd430724592f417942a443dc980a325a154c3f93f547 \
378370
--hash=sha256:9ae0c1fac75528db15c293160ff1d5feeba0dbc0d2f3a43e62ba07163cf81354 \
379371
--hash=sha256:9be364224fc088cf9b0e95fbe19dcc884238f9194ffd015d223400334f7f57c7 \
380372
--hash=sha256:9d26936378cf2d8c081be332f60d5b1b7e6ead66986cff85df00105f146f8aa2 \
381373
--hash=sha256:9eb520f8de3906db10df68c4dcd48c5ed9a3c6eb593d3d94a9875627eead010a \
382374
--hash=sha256:9fd26d458356942db1d92b2951f9f6fa3aca0ef843a939c2cdcd8779a5148912 \
383-
--hash=sha256:a7ae33db037acb47c6517a938885dab4b8bf14e6d6b27a9fa7e237e8680babbc \
384-
--hash=sha256:a8092529499ab8bb1beece1a9f4c241b1d0c3ab6c36b181833c4107b81ae5d1d \
385-
--hash=sha256:aa499e82d66e12a242f51de51517d86e4cf0969c29379a9bae16a12853519c80 \
386375
--hash=sha256:ac1ca58c8b273ac3ea1a0aa5dccb613c4539b1e4eaf236b565836898a70bd03d \
387376
--hash=sha256:acc85b33c2f5faa46c1f49d184b990c8e6ef9b8d9aabcc720a2f018c2392de44 \
388377
--hash=sha256:ade311c8f7f419e4ad79065b5d757a49550131fc3b18fccb76cb949c562e0705 \
389378
--hash=sha256:ae33435d749478623466aa6315917432de9114226ad0faa7dc02b5bf42faae77 \
390379
--hash=sha256:afb62a4ec7f758d1bc12e0bcb6178d762d4ca26cb5e005f5a24f79ef52f47dec \
391-
--hash=sha256:b30bd3e63d8f0df6527b626c6eaf3836427ebf67dbdd527affbcedad0aa62cc4 \
392-
--hash=sha256:b4c164c7c9e0a151a986ab52668f57e3de265f0b04fb804a8ff2a5cb8a2fd83d \
393380
--hash=sha256:b4e861e49e75d37d26eb5c62c384b473d0641390b5f5f52c5ca30667b6573425 \
394381
--hash=sha256:bd4ee12cb7e8e6b6a201617d2a45b5ca6bbae0c29396326ab247f574997f556b \
395-
--hash=sha256:c0a9fcaaf3a05b7b0b199414bfcdfc06a90b6779bfc37f784fafd45a0dad3201 \
396-
--hash=sha256:c2f788707484686e584e947bbc526fae13759c2bcfe57d8076e3b20c2278d0e0 \
397382
--hash=sha256:c568d1bc24ccf61a76d3b181139dbc519180f14e2eca437ba1e10a62efaf9e4a \
398-
--hash=sha256:c7d1ea36a200f5d7e72eca06a3f1648dcd15fa482bf3b075ec3ef90ed57120a0 \
399383
--hash=sha256:ca558c52130a3a2a1fa432504fc71d4c6e2370340008d4bad261b33c05f81b3f \
400384
--hash=sha256:ccb2aa354d659abddcaa067bc403fc32549d1f531ce99b5d4d336b7c796ed111 \
401385
--hash=sha256:ce3233f1c05c623eafdbfb751bd8ac47e20c4bfd744ba6dda5d5c47db7149425 \
402386
--hash=sha256:d012d95f863c627d4a57a69084ec768a7add7d8ca5f87eb0b51b05d7f4b17232 \
403387
--hash=sha256:d1502bb5e33a9f535b735cb8a898d96b81a63d19a1d2143f76e3b1ae7b7651ae \
404388
--hash=sha256:d24190b5e75c466fe3117310aeae2d11e30e62dfb531c288a85b9b6ab11c94e2 \
405-
--hash=sha256:d4115466f8dc4e63b6dc71fdd17ac39bc5ab334a0f2567237cff5a9f7515655c \
406389
--hash=sha256:d6b22ad7d05db4ae6ef8ff127ccee2afb456941ddd781dc8675f110f77de8337 \
407-
--hash=sha256:d72a650be1c44e69dacac26aa7ab354790f95191bc830b3d43ab32ec71388a25 \
408-
--hash=sha256:d9cf3b724c422c9902c8370c20725e321df5d6a53f98cba7210c02cd5d0fcb30 \
409390
--hash=sha256:daf792f938601d161e93ab8c46b0aee2facc187c2bb9e87a4ee314d7fc7471a1 \
410391
--hash=sha256:eb5dc7f0f867e52cc66b2fee861f7442f95a6be7fdca84fc375b8d57b2cae008 \
411392
--hash=sha256:ed54511cc9e4bb721c7f0530870494918985fabf3a30c1fe9a26649416ed83c7 \
412393
--hash=sha256:f1571a68601a60b3eaf246ce9c2ad7b9d515609116f3a01c7536f20e2f9e7437 \
413-
--hash=sha256:f1fd7d870148dcc73f871f05ad321884efb08d26325ffb5d2a12b03ab9d25043 \
414394
--hash=sha256:f5abfba01b9d1d4e23c647871203692f6e14cbd41a5d99a19ae3504e987a175c \
415-
--hash=sha256:fc6f2548fe6aa94ac0a0868cd67b468dc6b03982ecb9d6d04aeb6716d45995c2 \
416-
--hash=sha256:ffa8eff2bbfbf80469a6c4ee3864ccfd4dc7150717e9a1abcf4580b354ccea10
395+
--hash=sha256:fc6f2548fe6aa94ac0a0868cd67b468dc6b03982ecb9d6d04aeb6716d45995c2
417396
# via opusfilter
418397
matplotlib==3.10.0 \
419398
--hash=sha256:01d2b19f13aeec2e759414d3bfe19ddfb16b13a1250add08d46d5ff6f9be83c6 \
@@ -534,9 +513,9 @@ opus-fast-mosestokenizer==0.0.8.7 \
534513
--hash=sha256:f895d1238c44048d5de76f9b8c8b87f3a5164784b48c4c20f8d67295538ed931 \
535514
--hash=sha256:fa715afafc09f7e25fefc047fdaa50d9061b59422b4f5306e38677fd673b4b4a
536515
# via opusfilter
537-
opuscleaner==0.4.3 \
538-
--hash=sha256:ad2ac4cfd8a040e64c5393bf51c55e3fdc146c4799b8d84fd7c04492ca884041 \
539-
--hash=sha256:c038ccc6c11355588fb98bfec1fc35fa7cdac1c444d19c7063ca9ea88808150b
516+
opuscleaner==0.5 \
517+
--hash=sha256:a58e631179332ed5528c4be8df9dc702a0df146e3bce006d776c551a6dda21be \
518+
--hash=sha256:d01a543a92a6031dd4154e980cbb64d8701826059ca5942b080c45e6159bb833
540519
# via -r pipeline/clean/requirements/clean.in
541520
opusfilter==3.2.0 \
542521
--hash=sha256:63aedd4ce165d3113ef0a2ea6b4aed85598ac60686e3c7cae63c2e26c6be0fbd \
@@ -1257,9 +1236,9 @@ xxhash==3.2.0 \
12571236
# opusfilter
12581237

12591238
# The following packages are considered to be unsafe in a requirements file:
1260-
setuptools==75.8.0 \
1261-
--hash=sha256:c5afc8f407c626b8313a86e10311dd3f661c6cd9c09d4bf8c15c0e11f9f2b0e6 \
1262-
--hash=sha256:e3982f444617239225d675215d51f6ba05f845d4eec313da4418fdbb56fb27e3
1239+
setuptools==80.1.0 \
1240+
--hash=sha256:2e308396e1d83de287ada2c2fd6e64286008fe6aca5008e0b6a8cb0e2c86eedd \
1241+
--hash=sha256:ea0e7655c05b74819f82e76e11a85b31779fee7c4969e82f72bab0664e8317e4
12631242
# via
12641243
# fasttext
12651244
# opusfilter

0 commit comments

Comments
 (0)