Skip to content

Commit 27af48d

Browse files
ZJaume and evgenyrp authored
New vocab __sep__ symbol for DocMT and auxiliary symbols (#1217)
* Add sep and misc user defined symbols to the vocab * Upload docmt-vocab training configs * Update Japanese config --------- Co-authored-by: Evgeny Pavlov <epavlov@mozilla.com>
1 parent a05f70e commit 27af48d

File tree

5 files changed

+828
-3
lines changed

5 files changed

+828
-3
lines changed

configs/docmt-vocab/en-is.yml

Lines changed: 216 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,216 @@
1+
# The initial configuration was generated using:
2+
# task config-generator -- --name docmt-vocab --remote_branch docmt-vocab en is
3+
#
4+
# The documentation for this config can be found here:
5+
# https://github.com/mozilla/translations/blob/96f10a07dc0205db531e99ca05af5d8c70b6ab61/taskcluster/configs/config.prod.yml
6+
experiment:
7+
name: docmt-vocab
8+
src: en
9+
trg: is
10+
best-model: chrf
11+
opuscleaner-mode: defaults
12+
archive-corpora: true
13+
bicleaner:
14+
default-threshold: 0.5
15+
dataset-thresholds: {}
16+
monocleaner:
17+
mono-src:
18+
default-threshold: 0.0
19+
dataset-thresholds:
20+
hplt_mono_v2_0: 0.5
21+
opus_NLLB_v1: 0.5
22+
mono-trg:
23+
default-threshold: 0.0
24+
dataset-thresholds:
25+
hplt_mono_v2_0: 0.7
26+
opus_NLLB_v1: 0.8
27+
mono-max-sentences-src:
28+
total: 300_000_000
29+
per-dataset: 100_000_000
30+
mono-max-sentences-trg:
31+
total: 200_000_000
32+
per-dataset: 100_000_000
33+
hplt-min-doc-score:
34+
mono-src: 7.0
35+
mono-trg: 9.0
36+
spm-sample-size: 10_000_000
37+
spm-vocab-size: 32000
38+
spm-vocab-split: false
39+
teacher-ensemble: 1
40+
teacher-mode: two-stage
41+
teacher-decoder: ctranslate2
42+
student-model: base-memory
43+
datasets:
44+
devtest:
45+
- mtdata_aug-mix_ParIce-eea_dev-20.05-eng-isl
46+
- mtdata_aug-mix_ParIce-ema_dev-20.05-eng-isl
47+
- mtdata_aug-mix_ParIce-opensubtitles_dev-20.05-eng-isl
48+
- flores_aug-mix_dev
49+
- sacrebleu_aug-mix_wmt21
50+
test:
51+
- mtdata_ParIce-eea_test-20.05-eng-isl
52+
- mtdata_ParIce-opensubtitles_test-20.05-eng-isl
53+
- flores_devtest
54+
- flores_aug-mix_devtest
55+
- flores_aug-noise_devtest
56+
- flores_aug-inline-noise_devtest
57+
- flores_aug-punct_devtest
58+
- flores_aug-title_devtest
59+
- flores_aug-upper_devtest
60+
- flores_aug-typos_devtest
61+
- sacrebleu_wmt21/systems
62+
- sacrebleu_wmt21/dev
63+
64+
# The training data contains:
65+
# 25,112,409 sentences
66+
#
67+
# Skipped datasets:
68+
# - opus_CCMatrix/v1 - ignored datasets (8,723,145 sentences)
69+
# - opus_MultiHPLT/v2 - ignored datasets (2,694,541 sentences)
70+
# - opus_MultiMaCoCu/v2 - ignored datasets (267,366 sentences)
71+
# - opus_Ubuntu/v14.10 - not enough data (0 sentences)
72+
# - opus_WikiTitles/v3 - ignored datasets (0 sentences)
73+
# - mtdata_ELRC-www.iceida.is-1-eng-isl - duplicate with opus
74+
# - mtdata_ELRC-www.pfs.is-1-eng-isl - duplicate with opus
75+
# - mtdata_ELRC-www.lanamal.is-1-eng-isl - duplicate with opus
76+
# - mtdata_ELRC-gallery_iceland-1-eng-isl - duplicate with opus
77+
# - mtdata_ELRC-bokmenntaborgin_is-1-eng-isl - duplicate with opus
78+
# - mtdata_ELRC-icelandic_medicines-1-eng-isl - duplicate with opus
79+
# - mtdata_ELRC-www.nordisketax.net-1-eng-isl - duplicate with opus
80+
# - mtdata_ELRC-statistics_iceland-1-eng-isl - duplicate with opus
81+
# - mtdata_ELRC-www.norden.org-1-eng-isl - duplicate with opus
82+
# - mtdata_ELRC-emea-1-eng-isl - duplicate with opus
83+
# - mtdata_ELRC-antibiotic-1-eng-isl - duplicate with opus
84+
# - mtdata_ELRC-www.malfong.is-1-eng_GB-isl_IS - duplicate with opus
85+
# - mtdata_ELRC-ríkiskaup_2020-1-eng-isl - duplicate with opus
86+
# - mtdata_ELRC-university_iceland-1-eng_GB-isl_IS - duplicate with opus
87+
# - mtdata_ELRC-scipar-1-eng-isl - duplicate with opus
88+
# - mtdata_EU-ecdc-1-eng-isl - duplicate with opus
89+
# - mtdata_Facebook-wikimatrix-1-eng-isl - duplicate with opus
90+
# - mtdata_ParIce-eea_train-20.05-eng-isl - Error fetching (https://repository.clarin.is/repository/xmlui/bitstream/handle/20.500.12537/24/Parice_dev_test.20.05.zip)
91+
# - mtdata_ParIce-ema_train-20.05-eng-isl - Error fetching (https://repository.clarin.is/repository/xmlui/bitstream/handle/20.500.12537/24/Parice_dev_test.20.05.zip)
92+
# - mtdata_ParIce-ema_test-20.05-eng-isl - Error fetching (https://repository.clarin.is/repository/xmlui/bitstream/handle/20.500.12537/24/Parice_dev_test.20.05.zip)
93+
# - mtdata_ParaCrawl-paracrawl-6-eng-isl - duplicate with opus
94+
# - mtdata_ParaCrawl-paracrawl-7.1-eng-isl - duplicate with opus
95+
# - mtdata_ParaCrawl-paracrawl-8-eng-isl - duplicate with opus
96+
# - mtdata_ParaCrawl-paracrawl-9-eng-isl - duplicate with opus
97+
# - mtdata_Statmt-wikititles-3-isl-eng - duplicate with opus
98+
# - mtdata_Statmt-ccaligned-1-eng-isl_IS - duplicate with opus
99+
train:
100+
- opus_NLLB/v1 # 8,723,145 sentences
101+
- opus_OpenSubtitles/v2024 # 4,057,039 sentences
102+
- opus_ParaCrawl/v9 # 2,967,579 sentences
103+
- opus_HPLT/v2 # 2,694,541 sentences
104+
- opus_ParIce/v1 # 2,097,022 sentences
105+
- opus_CCAligned/v1 # 1,192,542 sentences
106+
- opus_XLEnt/v1.2 # 962,661 sentences
107+
- opus_ELRC-2718-EMEA/v1 # 542,624 sentences
108+
- opus_ELRC-EMEA/v1 # 542,624 sentences
109+
- opus_TildeMODEL/v2018 # 420,712 sentences
110+
- opus_MaCoCu/v2 # 267,366 sentences
111+
- opus_ELRC-5067-SciPar/v1 # 110,831 sentences
112+
- opus_KDE4/v2 # 98,989 sentences
113+
- opus_WikiMatrix/v1 # 85,992 sentences
114+
- opus_bible-uedin/v1 # 62,163 sentences
115+
- opus_ELRC-728-www.norden.org/v1 # 41,073 sentences
116+
- opus_ELRC-www.norden.org/v1 # 41,073 sentences
117+
- opus_ELRC-4327-Government_Offices_I/v1 # 36,290 sentences
118+
- opus_GNOME/v1 # 28,776 sentences
119+
- opus_QED/v2.0a # 27,611 sentences
120+
- opus_ELRC-4324-Government_Offices_I/v1 # 18,185 sentences
121+
- opus_ELRC-antibiotic/v1 # 13,070 sentences
122+
- opus_ELRC-4295-www.malfong.is/v1 # 12,634 sentences
123+
- opus_ELRC-4334-Rkiskaup_2020/v1 # 10,236 sentences
124+
- opus_ELRC-4338-University_Iceland/v1 # 10,164 sentences
125+
- opus_EUbookshop/v2 # 9,783 sentences
126+
- opus_Tatoeba/v2023-04-12 # 9,600 sentences
127+
- opus_wikimedia/v20230407 # 4,471 sentences
128+
- opus_ELRC-505-www.pfs.is/v1 # 2,866 sentences
129+
- opus_ECDC/v2016-03-16 # 2,512 sentences
130+
- opus_TED2020/v1 # 2,430 sentences
131+
- opus_ELRC-508-Tilde_Statistics_Ice/v1 # 2,427 sentences
132+
- opus_ELRC-718-Statistics_Iceland/v1 # 2,361 sentences
133+
- opus_ELRC-517-Icelandic_Directorat/v1 # 1,536 sentences
134+
- opus_ELRC-502-Icelandic_Financial_/v1 # 1,525 sentences
135+
- opus_ELRC-510-Harpa_Reykjavik_Conc/v1 # 1,197 sentences
136+
- opus_ELRC-506-www.lanamal.is/v1 # 1,140 sentences
137+
- opus_ELRC-597-www.nordisketax.net/v1 # 1,065 sentences
138+
- opus_ELRC-www.nordisketax.net/v1 # 1,065 sentences
139+
- opus_ELRC-504-www.iceida.is/v1 # 1,055 sentences
140+
- opus_ELRC-3206-antibiotic/v1 # 816 sentences
141+
- opus_ELRC-516-Icelandic_Medicines/v1 # 711 sentences
142+
- opus_ELRC-509-Gallery_Iceland/v1 # 577 sentences
143+
- opus_ELRC-511-bokmenntaborgin_is/v1 # 330 sentences
144+
- mtdata_ELRC-icelandic_financial_supervisory_authority-1-eng-isl # ~1,158 sentences (130.9 kB)
145+
- mtdata_ELRC-tilde_statistics_iceland-1-eng-isl # ~1,778 sentences (201.0 kB)
146+
- mtdata_ELRC-harpa_reykjavik_concert_hall_conference_centre-1-eng-isl # ~1,520 sentences (171.8 kB)
147+
- mtdata_ELRC-icelandic_directorate_immigration-1-eng-isl # ~1,013 sentences (114.5 kB)
148+
- mtdata_ELRC-government_offices_iceland_reports-1-eng-isl # ~19,340 sentences (2.2 MB)
149+
- mtdata_ELRC-government_offices_iceland_legislation_regulations-1-eng-isl # ~38,492 sentences (4.3 MB)
150+
- mtdata_EU-eac_forms-1-eng-isl # ~31,162 sentences (3.5 MB)
151+
- mtdata_EU-eac_reference-1-eng-isl # ~31,162 sentences (3.5 MB)
152+
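- mtdata_Statmt-newsdev_enis-2021-eng-isl # ~460,669 sentences (52.1 MB); NOTE(review): appears to be the WMT21 news dev set, which may overlap sacrebleu_wmt21/dev listed under test — confirm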
153+
- mtdata_Statmt-newsdev_isen-2021-isl-eng # ~460,669 sentences (52.1 MB)
154+
- mtdata_Tilde-eesc-2017-eng-isl # ~221 sentences (25.1 kB)
155+
- mtdata_Tilde-ema-2016-eng-isl # ~201,134 sentences (22.7 MB)
156+
- mtdata_Tilde-rapid-2016-eng-isl # ~173 sentences (19.6 kB)
157+
158+
# The monolingual data contains:
159+
# ~676,854,488 sentences
160+
# Up to 100,000,000 sentences from HPLT
161+
mono-src:
162+
- news-crawl_news.2007 # ~1,557,522 sentences
163+
- news-crawl_news.2008 # ~5,389,380 sentences
164+
- news-crawl_news.2009 # ~6,557,522 sentences
165+
- news-crawl_news.2010 # ~3,247,787 sentences
166+
- news-crawl_news.2011 # ~6,318,584 sentences
167+
- news-crawl_news.2012 # ~6,407,079 sentences
168+
- news-crawl_news.2013 # ~10,619,469 sentences
169+
- news-crawl_news.2014 # ~10,619,469 sentences
170+
- news-crawl_news.2015 # ~10,619,469 sentences
171+
- news-crawl_news.2016 # ~7,982,300 sentences
172+
- news-crawl_news.2017 # ~11,504,424 sentences
173+
- news-crawl_news.2018 # ~7,920,353 sentences
174+
- news-crawl_news.2019 # ~17,699,115 sentences
175+
- news-crawl_news.2020 # ~22,123,893 sentences
176+
- news-crawl_news.2021 # ~21,238,938 sentences
177+
- news-crawl_news.2022 # ~23,008,849 sentences
178+
- news-crawl_news.2023 # ~23,008,849 sentences
179+
- news-crawl_news.2024 # ~18,584,070 sentences
180+
- hplt_mono/v2.0 # Up to 100,000,000 sentences
181+
- opus_NLLB/v1 # ~462,447,416 sentences
182+
183+
# The monolingual data contains:
184+
# ~1,075,648 sentences
185+
# Up to 100,000,000 sentences from HPLT
186+
mono-trg:
187+
- hplt_mono/v2.0 # Up to 100,000,000 sentences
188+
- opus_NLLB/v1 # ~1,075,648 sentences
189+
marian-args:
190+
decoding-backward:
191+
beam-size: '12'
192+
mini-batch-words: '2000'
193+
decoding-teacher:
194+
mini-batch-words: '5000'
195+
maxi-batch: '10000'
196+
training-backward:
197+
early-stopping: '5'
198+
training-teacher:
199+
early-stopping: '20'
200+
training-student:
201+
early-stopping: '15'
202+
training-student-finetuned:
203+
early-stopping: '20'
204+
target-stage: all-pipeline
205+
wandb-publication: true
206+
continuation:
207+
models: {}
208+
taskcluster:
209+
split-chunks: 20
210+
upload-bucket: production
211+
worker-classes:
212+
default: gcp-spot
213+
corpus-align-parallel: gcp-standard
214+
corpus-align-backtranslations: gcp-standard
215+
corpus-align-distillation: gcp-standard
216+
distillation-corpus-build-shortlist: gcp-standard

0 commit comments

Comments
 (0)