Skip to content

Commit a05f70e

Browse files
authored
High resource training (#1239)
* Extract corpus sampling
* Always calculate priors on a sample first
* Parallelize aligning
* Add psutil
* Fix pylint errors
* Reduce number of workers
* Update config
* Fix OpusCleaner server
* Remove old dataset fixing scripts
* Add dataset specific OpusCleaner filters
* Add more filters
* Move to experiment folder
* Add high resource configs
* Enable debug logging
* Bump max action run time
* Update configs
* Fix merging
* Update configs
* Update configs
* Update configs
* Update configs
* Add autogenerated
* Increase opustrainer batch
* Add configs
* Add configs
* Remove test models and git lfs
* Update configs
* Fix pl config
* Revert "Enable debug logging"

This reverts commit a87d3c0.
1 parent 10732b3 commit a05f70e

30 files changed

+8814
-7
lines changed

configs/autogenerated/en-bg.yml

Lines changed: 267 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,267 @@
1+
# The initial configuration was generated using:
2+
# task config-generator -- --name autogenerated en bg
3+
#
4+
# The documentation for this config can be found here:
5+
# https://github.com/mozilla/translations/blob/57dedac0771104e02e8c179834e5e0f14aac3c18/taskcluster/configs/config.prod.yml
6+
experiment:
7+
name: autogenerated
8+
src: en
9+
trg: bg
10+
best-model: chrf
11+
opuscleaner-mode: defaults
12+
archive-corpora: true
13+
bicleaner:
14+
default-threshold: 0.5
15+
dataset-thresholds: {}
16+
monocleaner:
17+
mono-src:
18+
default-threshold: 0.0
19+
dataset-thresholds:
20+
hplt_mono_v2_0: 0.5
21+
opus_NLLB_v1: 0.5
22+
mono-trg:
23+
default-threshold: 0.0
24+
dataset-thresholds:
25+
hplt_mono_v2_0: 0.7
26+
opus_NLLB_v1: 0.8
27+
mono-max-sentences-src:
28+
total: 100_000_000
29+
per-dataset: 70_000_000
30+
mono-max-sentences-trg:
31+
total: 100_000_000
32+
per-dataset: 70_000_000
33+
hplt-min-doc-score:
34+
mono-src: 7.0
35+
mono-trg: 9.0
36+
spm-sample-size: 10_000_000
37+
spm-vocab-size: 32000
38+
spm-vocab-split: false
39+
teacher-ensemble: 1
40+
teacher-mode: two-stage
41+
teacher-decoder: ctranslate2
42+
student-model: base-memory
43+
datasets:
44+
devtest:
45+
- mtdata_aug-mix_Neulab-tedtalks_dev-1-eng-bul
46+
- flores_aug-mix_dev
47+
test:
48+
- mtdata_Neulab-tedtalks_test-1-eng-bul
49+
- flores_devtest
50+
- flores_aug-mix_devtest
51+
- flores_aug-noise_devtest
52+
- flores_aug-inline-noise_devtest
53+
- flores_aug-punct_devtest
54+
- flores_aug-title_devtest
55+
- flores_aug-upper_devtest
56+
- flores_aug-typos_devtest
57+
58+
# The training data contains:
59+
# 162,440,413 sentences
60+
#
61+
# Skipped datasets:
62+
# - opus_CCMatrix/v1 - ignored datasets (44,635,282 sentences)
63+
# - opus_MultiHPLT/v2 - ignored datasets (22,725,326 sentences)
64+
# - opus_MultiMaCoCu/v2 - ignored datasets (1,760,778 sentences)
65+
# - opus_GNOME/v1 - not enough data (150 sentences)
66+
# - opus_ELRA-W0308/v1 - not enough data (78 sentences)
67+
# - opus_ELRC-648-Letter_rights_person/v1 - not enough data (60 sentences)
68+
# - opus_ELRC-2546-Competition_Economic/v1 - not enough data (29 sentences)
69+
# - opus_ELRC-403-Rights_Arrested/v1 - not enough data (24 sentences)
70+
# - opus_ELRA-W0301/v1 - not enough data (18 sentences)
71+
# - opus_ELRC-EMEA/v1 - not enough data (0 sentences)
72+
# - opus_translatewiki/v2025-01-01 - not enough data (0 sentences)
73+
# - opus_Ubuntu/v14.10 - not enough data (0 sentences)
74+
# - mtdata_ELRC-bulgarian_strategic_innovations_digital_growth-1-bul-eng - Error fetching (https://elrc-share.eu/repository/download/2cb883806d1b11e7b7d400155d02670626c3d969c20e48cfb5a1a1c1cb0789b4/)
75+
# - mtdata_ELRC-rights_arrested-1-bul-eng - duplicate with opus
76+
# - mtdata_ELRC-transport-1-bul-eng - duplicate with opus
77+
# - mtdata_ELRC-ejtn_handbook-1-bul-eng - duplicate with opus
78+
# - mtdata_ELRC-bulgarian_strategic_telecommunications_broadband-1-bul-eng - Error fetching (https://elrc-share.eu/repository/download/96c49f8e6d1b11e7b7d400155d026706c27c80764e9641fb9d47196e3d4cb42f/)
79+
# - mtdata_ELRC-bugarian_revenue-1-bul-eng - duplicate with opus
80+
# - mtdata_ELRC-letter_rights_persons_arrested_or_detained-1-bul-eng - Error fetching (https://elrc-share.eu/repository/download/0102395e050811e8b7d400155d026706483f8695e5e94dc5beb5b835e17725bb/)
81+
# - mtdata_ELRC-administration-1-bul-eng - duplicate with opus
82+
# - mtdata_ELRC-corpora_state_administration_sites-1-bul-eng - Error fetching (https://elrc-share.eu/repository/download/8507b82eaab611e8b7d400155d026706a07f556156eb445f8b9e28af532acd3e/)
83+
# - 2018_proposal_a_climate_change_adaptation_strategy_action_plan_bulgarian_environment_water - corpus name is too long for tasks
84+
# - mtdata_ELRC-legislation_bulgaria_energy-1-bul-eng - Error fetching (https://elrc-share.eu/repository/download/8a5356bcee7711e8b7d400155d0267068e24cb784154488d86adffe317b6ae54/)
85+
# - mtdata_ELRC-health_care_social_policy-1-bul-eng - Error fetching (https://elrc-share.eu/repository/download/a73e7a36fd7711e8b7d400155d02670602129d46531a4b81bdb700c6ceb07b2c/)
86+
# - mtdata_ELRC-ict_transport-1-bul-eng - duplicate with opus
87+
# - mtdata_ELRC-open_broadband_information_society-1-bul-eng - Error fetching (https://elrc-share.eu/repository/download/85f58aaafdda11e8b7d400155d026706445a8d7a08f440c6bf21315b061373d7/)
88+
# - mtdata_ELRC-information_society-1-bul-eng - duplicate with opus
89+
# - mtdata_ELRC-official_tourism_portal_bulgaria-1-bul-eng - Error fetching (https://elrc-share.eu/repository/download/bfad45621e7d11e9b7d400155d026706545c2a791a6749d1acae2b87fc5d5da4/)
90+
# - mtdata_ELRC-euipo_2017-1-bul-eng - duplicate with opus
91+
# - mtdata_ELRC-emea-1-bul-eng - duplicate with opus
92+
# - mtdata_ELRC-vaccination-1-bul-eng - duplicate with opus
93+
# - mtdata_ELRC-wikipedia_health-1-bul-eng - duplicate with opus
94+
# - mtdata_ELRC-antibiotic-1-bul-eng - duplicate with opus
95+
# - mtdata_ELRC-europarl_covid-1-bul-eng - duplicate with opus
96+
# - mtdata_ELRC-ec_europa_covid-1-bul-eng - duplicate with opus
97+
# - mtdata_ELRC-eur_lex_covid-1-bul-eng - duplicate with opus
98+
# - mtdata_ELRC-presscorner_covid-1-bul-eng - duplicate with opus
99+
# - mtdata_ELRC-scipar-1-bul-eng - duplicate with opus
100+
# - mtdata_EU-ecdc-1-eng-bul - duplicate with opus
101+
# - mtdata_Facebook-wikimatrix-1-bul-eng - duplicate with opus
102+
# - mtdata_LinguaTools-wikititles-2014-bul-eng - duplicate with opus
103+
# - mtdata_Neulab-tedtalks_train-1-eng-bul - duplicate with opus
104+
# - mtdata_ParaCrawl-paracrawl-6-eng-bul - duplicate with opus
105+
# - mtdata_ParaCrawl-paracrawl-7.1-eng-bul - duplicate with opus
106+
# - mtdata_ParaCrawl-paracrawl-8-eng-bul - duplicate with opus
107+
# - mtdata_ParaCrawl-paracrawl-9-eng-bul - duplicate with opus
108+
# - mtdata_Statmt-europarl-7-bul-eng - duplicate with opus
109+
# - mtdata_Statmt-ccaligned-1-bul_BG-eng - duplicate with opus
110+
train:
111+
- opus_OpenSubtitles/v2024 # 54,970,271 sentences
112+
- opus_NLLB/v1 # 44,635,282 sentences
113+
- opus_HPLT/v2 # 22,725,326 sentences
114+
- opus_ParaCrawl/v9 # 13,264,563 sentences
115+
- opus_CCAligned/v1 # 10,418,140 sentences
116+
- opus_DGT/v2019 # 3,603,023 sentences
117+
- opus_XLEnt/v1.2 # 2,536,508 sentences
118+
- opus_LinguaTools-WikiTitles/v2014 # 2,528,551 sentences
119+
- opus_MaCoCu/v2 # 1,760,779 sentences
120+
- opus_EMEA/v3 # 1,001,147 sentences
121+
- opus_ELRC-2712-EMEA/v1 # 772,700 sentences
122+
- opus_ELRC_2682/v1 # 772,699 sentences
123+
- opus_QED/v2.0a # 521,872 sentences
124+
- opus_Europarl/v8 # 408,290 sentences
125+
- opus_WikiMatrix/v1 # 357,970 sentences
126+
- opus_ELITR-ECA/v1 # 268,229 sentences
127+
- opus_TED2020/v1 # 249,483 sentences
128+
- opus_EUbookshop/v2 # 221,229 sentences
129+
- opus_SETIMES/v2 # 213,160 sentences
130+
- opus_NeuLab-TedTalks/v1 # 182,370 sentences
131+
- opus_ELRC-presscorner_covid/v1 # 164,779 sentences
132+
- opus_Tanzil/v1 # 135,477 sentences
133+
- opus_KDE4/v2 # 113,467 sentences
134+
- opus_Wikipedia/v1.0 # 79,781 sentences
135+
- opus_wikimedia/v20230407 # 62,347 sentences
136+
- opus_bible-uedin/v1 # 62,123 sentences
137+
- opus_ELRC-768-corpora_State_Admini/v1 # 52,153 sentences
138+
- opus_JRC-Acquis/v3.0 # 45,850 sentences
139+
- opus_ELRC-3563-EUR_LEX_covid/v1 # 23,010 sentences
140+
- opus_ELRC-EUR_LEX/v1 # 23,010 sentences
141+
- opus_ELRC-EUROPARL_covid/v1 # 21,842 sentences
142+
- opus_ELRC-1176-EUIPO_2017/v1 # 19,610 sentences
143+
- opus_ELRC-EUIPO_2017/v1 # 19,610 sentences
144+
- opus_Tatoeba/v2023-04-12 # 17,860 sentences
145+
- opus_ELRC-antibiotic/v1 # 14,736 sentences
146+
- opus_ELRC-913-2018_Proposal_a/v1 # 14,196 sentences
147+
- opus_ELRA-W0263/v1 # 14,195 sentences
148+
- opus_ELRC-2873-EU_publications_medi/v1 # 13,150 sentences
149+
- opus_ELRC-EU_publications/v1 # 13,150 sentences
150+
- opus_ELRC-wikipedia_health/v1 # 11,788 sentences
151+
- opus_ELRC-664-administration/v1 # 11,263 sentences
152+
- opus_ELRA-W0211/v1 # 11,262 sentences
153+
- opus_ELRC-1158-Official_Tourism_Por/v1 # 10,617 sentences
154+
- opus_ELRC-3604-presscorner_covid/v1 # 7,195 sentences
155+
- opus_ELRA-W0297/v1 # 6,480 sentences
156+
- opus_GlobalVoices/v2018q4 # 6,003 sentences
157+
- opus_ELRC-1000-legislation_Bulgaria/v1 # 4,610 sentences
158+
- opus_ELRC_3382/v1 # 3,691 sentences
159+
- opus_ELRC-408-transport/v1 # 3,624 sentences
160+
- opus_ELRC-1060-Information_society/v1 # 3,257 sentences
161+
- opus_ELRC-437-Bulgarian_strategic_/v1 # 2,990 sentences
162+
- opus_TildeMODEL/v2018 # 2,865 sentences
163+
- opus_ECDC/v2016-03-16 # 2,568 sentences
164+
- opus_ELRC-3462-EC_EUROPA_covid/v1 # 2,422 sentences
165+
- opus_ELRC-EC_EUROPA/v1 # 2,422 sentences
166+
- opus_ELRC-1056-open_broadband_infor/v1 # 2,390 sentences
167+
- opus_ELRA-W0134/v1 # 2,389 sentences
168+
- opus_ELRC-1055-ICT_Transport/v1 # 2,377 sentences
169+
- opus_ELRA-W0133/v1 # 2,376 sentences
170+
- opus_ELRC-391-Bulgarian_strategic_/v1 # 2,350 sentences
171+
- opus_ELRC-5067-SciPar/v1 # 2,302 sentences
172+
- opus_ELRC-1054-health_care_social/v1 # 2,097 sentences
173+
- opus_ELRC-412-EJTN_Handbook/v1 # 1,944 sentences
174+
- opus_ELRC-471-Bugarian_Revenue/v1 # 1,293 sentences
175+
- opus_ELRA-W0173/v1 # 1,292 sentences
176+
- opus_ELRC-3200-antibiotic/v1 # 891 sentences
177+
- opus_ELRC-3291-EUROPARL_covid/v1 # 728 sentences
178+
- opus_ELRC-3060-wikipedia_health/v1 # 630 sentences
179+
- opus_ELRC_2922/v1 # 629 sentences
180+
- opus_ELRC-2733-vaccination/v1 # 524 sentences
181+
- opus_ELRC-vaccination/v1 # 524 sentences
182+
- opus_ELRC_2923/v1 # 467 sentences
183+
- opus_ELRC-2549-Report_Best_practice/v1 # 215 sentences
184+
- mtdata_ELRC-competition_economics_judges-1-bul-eng # ~61 sentences (7.0 kB)
185+
- mtdata_ELRC-report_best_practices_initial_training_programs-1-bul-eng # ~203 sentences (23.0 kB)
186+
- mtdata_ELRC-eu_publications_medical_v2-1-bul-eng # ~16,373 sentences (1.9 MB)
187+
- mtdata_ELRC-web_acquired_data_related_to_scientific_research-1-eng-bul # ~464,317 sentences (52.5 MB)
188+
- mtdata_ELRC-hrw_dataset_v1-1-eng-bul # ~631,760 sentences (71.4 MB)
189+
- mtdata_EU-eac_forms-1-bul-eng # ~31,162 sentences (3.5 MB)
190+
- mtdata_EU-eac_reference-1-bul-eng # ~31,162 sentences (3.5 MB)
191+
- mtdata_EU-dcep-1-bul-eng # ~404,653 sentences (45.7 MB)
192+
- mtdata_Tilde-eesc-2017-bul-eng # ~1,121,039 sentences (126.7 MB)
193+
- mtdata_Tilde-ema-2016-bul-eng # ~274,955 sentences (31.1 MB)
194+
- mtdata_Tilde-ecb-2017-bul-eng # ~1,951 sentences (220.6 kB)
195+
- mtdata_Tilde-rapid-2016-bul-eng # ~198,340 sentences (22.4 MB)
196+
197+
# The monolingual data contains:
198+
# ~676,854,488 sentences
199+
# Up to 70,000,000 sentences from HPLT
200+
mono-src:
201+
- news-crawl_news.2007 # ~1,557,522 sentences
202+
- news-crawl_news.2008 # ~5,389,380 sentences
203+
- news-crawl_news.2009 # ~6,557,522 sentences
204+
- news-crawl_news.2010 # ~3,247,787 sentences
205+
- news-crawl_news.2011 # ~6,318,584 sentences
206+
- news-crawl_news.2012 # ~6,407,079 sentences
207+
- news-crawl_news.2013 # ~10,619,469 sentences
208+
- news-crawl_news.2014 # ~10,619,469 sentences
209+
- news-crawl_news.2015 # ~10,619,469 sentences
210+
- news-crawl_news.2016 # ~7,982,300 sentences
211+
- news-crawl_news.2017 # ~11,504,424 sentences
212+
- news-crawl_news.2018 # ~7,920,353 sentences
213+
- news-crawl_news.2019 # ~17,699,115 sentences
214+
- news-crawl_news.2020 # ~22,123,893 sentences
215+
- news-crawl_news.2021 # ~21,238,938 sentences
216+
- news-crawl_news.2022 # ~23,008,849 sentences
217+
- news-crawl_news.2023 # ~23,008,849 sentences
218+
- news-crawl_news.2024 # ~18,584,070 sentences
219+
- hplt_mono/v2.0 # Up to 70,000,000 sentences
220+
- opus_NLLB/v1 # ~462,447,416 sentences
221+
222+
# The monolingual data contains:
223+
# ~57,427,433 sentences
224+
# Up to 70,000,000 sentences from HPLT
225+
mono-trg:
226+
- news-crawl_news.2013 # ~2,752,212 sentences
227+
- news-crawl_news.2014 # ~4,026,548 sentences
228+
- news-crawl_news.2015 # ~3,584,070 sentences
229+
- news-crawl_news.2016 # ~2,805,309 sentences
230+
- news-crawl_news.2017 # ~2,575,221 sentences
231+
- news-crawl_news.2018 # ~1,814,159 sentences
232+
- news-crawl_news.2019 # ~2,486,725 sentences
233+
- news-crawl_news.2020 # ~2,584,070 sentences
234+
- news-crawl_news.2021 # ~1,681,415 sentences
235+
- news-crawl_news.2022 # ~2,371,681 sentences
236+
- news-crawl_news.2023 # ~2,743,362 sentences
237+
- news-crawl_news.2024 # ~2,442,477 sentences
238+
- hplt_mono/v2.0 # Up to 70,000,000 sentences
239+
- opus_NLLB/v1 # ~25,560,184 sentences
240+
marian-args:
241+
decoding-backward:
242+
beam-size: '12'
243+
mini-batch-words: '2000'
244+
decoding-teacher:
245+
mini-batch-words: '5000'
246+
maxi-batch: '10000'
247+
training-backward:
248+
early-stopping: '5'
249+
training-teacher:
250+
early-stopping: '20'
251+
training-student:
252+
early-stopping: '15'
253+
training-student-finetuned:
254+
early-stopping: '20'
255+
target-stage: evaluate-teacher
256+
wandb-publication: true
257+
continuation:
258+
models: {}
259+
taskcluster:
260+
split-chunks: 20
261+
upload-bucket: production
262+
worker-classes:
263+
default: gcp-spot
264+
corpus-align-parallel: gcp-standard
265+
corpus-align-backtranslations: gcp-standard
266+
corpus-align-distillation: gcp-standard
267+
distillation-corpus-build-shortlist: gcp-standard

0 commit comments

Comments
 (0)