|
| 1 | +# The initial configuration was generated using: |
| 2 | +# task config-generator -- --name autogenerated en bg |
| 3 | +# |
| 4 | +# The documentation for this config can be found here: |
| 5 | +# https://github.com/mozilla/translations/blob/57dedac0771104e02e8c179834e5e0f14aac3c18/taskcluster/configs/config.prod.yml |
| 6 | +experiment: |
| 7 | + name: autogenerated |
| 8 | + src: en |
| 9 | + trg: bg |
| 10 | + best-model: chrf |
| 11 | + opuscleaner-mode: defaults |
| 12 | + archive-corpora: true |
| 13 | + bicleaner: |
| 14 | + default-threshold: 0.5 |
| 15 | + dataset-thresholds: {} |
| 16 | + monocleaner: |
| 17 | + mono-src: |
| 18 | + default-threshold: 0.0 |
| 19 | + dataset-thresholds: |
| 20 | + hplt_mono_v2_0: 0.5 |
| 21 | + opus_NLLB_v1: 0.5 |
| 22 | + mono-trg: |
| 23 | + default-threshold: 0.0 |
| 24 | + dataset-thresholds: |
| 25 | + hplt_mono_v2_0: 0.7 |
| 26 | + opus_NLLB_v1: 0.8 |
| 27 | + mono-max-sentences-src: |
| 28 | + total: 100_000_000 |
| 29 | + per-dataset: 70_000_000 |
| 30 | + mono-max-sentences-trg: |
| 31 | + total: 100_000_000 |
| 32 | + per-dataset: 70_000_000 |
| 33 | + hplt-min-doc-score: |
| 34 | + mono-src: 7.0 |
| 35 | + mono-trg: 9.0 |
| 36 | + spm-sample-size: 10_000_000 |
| 37 | + spm-vocab-size: 32000 |
| 38 | + spm-vocab-split: false |
| 39 | + teacher-ensemble: 1 |
| 40 | + teacher-mode: two-stage |
| 41 | + teacher-decoder: ctranslate2 |
| 42 | + student-model: base-memory |
| 43 | +datasets: |
| 44 | + devtest: |
| 45 | + - mtdata_aug-mix_Neulab-tedtalks_dev-1-eng-bul |
| 46 | + - flores_aug-mix_dev |
| 47 | + test: |
| 48 | + - mtdata_Neulab-tedtalks_test-1-eng-bul |
| 49 | + - flores_devtest |
| 50 | + - flores_aug-mix_devtest |
| 51 | + - flores_aug-noise_devtest |
| 52 | + - flores_aug-inline-noise_devtest |
| 53 | + - flores_aug-punct_devtest |
| 54 | + - flores_aug-title_devtest |
| 55 | + - flores_aug-upper_devtest |
| 56 | + - flores_aug-typos_devtest |
| 57 | + |
| 58 | + # The training data contains: |
| 59 | + # 162,440,413 sentences |
| 60 | + # |
| 61 | + # Skipped datasets: |
| 62 | + # - opus_CCMatrix/v1 - ignored datasets (44,635,282 sentences) |
| 63 | + # - opus_MultiHPLT/v2 - ignored datasets (22,725,326 sentences) |
| 64 | + # - opus_MultiMaCoCu/v2 - ignored datasets (1,760,778 sentences) |
| 65 | + # - opus_GNOME/v1 - not enough data (150 sentences) |
| 66 | + # - opus_ELRA-W0308/v1 - not enough data (78 sentences) |
| 67 | + # - opus_ELRC-648-Letter_rights_person/v1 - not enough data (60 sentences) |
| 68 | + # - opus_ELRC-2546-Competition_Economic/v1 - not enough data (29 sentences) |
| 69 | + # - opus_ELRC-403-Rights_Arrested/v1 - not enough data (24 sentences) |
| 70 | + # - opus_ELRA-W0301/v1 - not enough data (18 sentences) |
| 71 | + # - opus_ELRC-EMEA/v1 - not enough data (0 sentences) |
| 72 | + # - opus_translatewiki/v2025-01-01 - not enough data (0 sentences) |
| 73 | + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) |
| 74 | + # - mtdata_ELRC-bulgarian_strategic_innovations_digital_growth-1-bul-eng - Error fetching (https://elrc-share.eu/repository/download/2cb883806d1b11e7b7d400155d02670626c3d969c20e48cfb5a1a1c1cb0789b4/) |
| 75 | + # - mtdata_ELRC-rights_arrested-1-bul-eng - duplicate with opus |
| 76 | + # - mtdata_ELRC-transport-1-bul-eng - duplicate with opus |
| 77 | + # - mtdata_ELRC-ejtn_handbook-1-bul-eng - duplicate with opus |
| 78 | + # - mtdata_ELRC-bulgarian_strategic_telecommunications_broadband-1-bul-eng - Error fetching (https://elrc-share.eu/repository/download/96c49f8e6d1b11e7b7d400155d026706c27c80764e9641fb9d47196e3d4cb42f/) |
| 79 | + # - mtdata_ELRC-bugarian_revenue-1-bul-eng - duplicate with opus |
| 80 | + # - mtdata_ELRC-letter_rights_persons_arrested_or_detained-1-bul-eng - Error fetching (https://elrc-share.eu/repository/download/0102395e050811e8b7d400155d026706483f8695e5e94dc5beb5b835e17725bb/) |
| 81 | + # - mtdata_ELRC-administration-1-bul-eng - duplicate with opus |
| 82 | + # - mtdata_ELRC-corpora_state_administration_sites-1-bul-eng - Error fetching (https://elrc-share.eu/repository/download/8507b82eaab611e8b7d400155d026706a07f556156eb445f8b9e28af532acd3e/) |
| 83 | + # - 2018_proposal_a_climate_change_adaptation_strategy_action_plan_bulgarian_environment_water - corpus name is too long for tasks |
| 84 | + # - mtdata_ELRC-legislation_bulgaria_energy-1-bul-eng - Error fetching (https://elrc-share.eu/repository/download/8a5356bcee7711e8b7d400155d0267068e24cb784154488d86adffe317b6ae54/) |
| 85 | + # - mtdata_ELRC-health_care_social_policy-1-bul-eng - Error fetching (https://elrc-share.eu/repository/download/a73e7a36fd7711e8b7d400155d02670602129d46531a4b81bdb700c6ceb07b2c/) |
| 86 | + # - mtdata_ELRC-ict_transport-1-bul-eng - duplicate with opus |
| 87 | + # - mtdata_ELRC-open_broadband_information_society-1-bul-eng - Error fetching (https://elrc-share.eu/repository/download/85f58aaafdda11e8b7d400155d026706445a8d7a08f440c6bf21315b061373d7/) |
| 88 | + # - mtdata_ELRC-information_society-1-bul-eng - duplicate with opus |
| 89 | + # - mtdata_ELRC-official_tourism_portal_bulgaria-1-bul-eng - Error fetching (https://elrc-share.eu/repository/download/bfad45621e7d11e9b7d400155d026706545c2a791a6749d1acae2b87fc5d5da4/) |
| 90 | + # - mtdata_ELRC-euipo_2017-1-bul-eng - duplicate with opus |
| 91 | + # - mtdata_ELRC-emea-1-bul-eng - duplicate with opus |
| 92 | + # - mtdata_ELRC-vaccination-1-bul-eng - duplicate with opus |
| 93 | + # - mtdata_ELRC-wikipedia_health-1-bul-eng - duplicate with opus |
| 94 | + # - mtdata_ELRC-antibiotic-1-bul-eng - duplicate with opus |
| 95 | + # - mtdata_ELRC-europarl_covid-1-bul-eng - duplicate with opus |
| 96 | + # - mtdata_ELRC-ec_europa_covid-1-bul-eng - duplicate with opus |
| 97 | + # - mtdata_ELRC-eur_lex_covid-1-bul-eng - duplicate with opus |
| 98 | + # - mtdata_ELRC-presscorner_covid-1-bul-eng - duplicate with opus |
| 99 | + # - mtdata_ELRC-scipar-1-bul-eng - duplicate with opus |
| 100 | + # - mtdata_EU-ecdc-1-eng-bul - duplicate with opus |
| 101 | + # - mtdata_Facebook-wikimatrix-1-bul-eng - duplicate with opus |
| 102 | + # - mtdata_LinguaTools-wikititles-2014-bul-eng - duplicate with opus |
| 103 | + # - mtdata_Neulab-tedtalks_train-1-eng-bul - duplicate with opus |
| 104 | + # - mtdata_ParaCrawl-paracrawl-6-eng-bul - duplicate with opus |
| 105 | + # - mtdata_ParaCrawl-paracrawl-7.1-eng-bul - duplicate with opus |
| 106 | + # - mtdata_ParaCrawl-paracrawl-8-eng-bul - duplicate with opus |
| 107 | + # - mtdata_ParaCrawl-paracrawl-9-eng-bul - duplicate with opus |
| 108 | + # - mtdata_Statmt-europarl-7-bul-eng - duplicate with opus |
| 109 | + # - mtdata_Statmt-ccaligned-1-bul_BG-eng - duplicate with opus |
| 110 | + train: |
| 111 | + - opus_OpenSubtitles/v2024 # 54,970,271 sentences |
| 112 | + - opus_NLLB/v1 # 44,635,282 sentences |
| 113 | + - opus_HPLT/v2 # 22,725,326 sentences |
| 114 | + - opus_ParaCrawl/v9 # 13,264,563 sentences |
| 115 | + - opus_CCAligned/v1 # 10,418,140 sentences |
| 116 | + - opus_DGT/v2019 # 3,603,023 sentences |
| 117 | + - opus_XLEnt/v1.2 # 2,536,508 sentences |
| 118 | + - opus_LinguaTools-WikiTitles/v2014 # 2,528,551 sentences |
| 119 | + - opus_MaCoCu/v2 # 1,760,779 sentences |
| 120 | + - opus_EMEA/v3 # 1,001,147 sentences |
| 121 | + - opus_ELRC-2712-EMEA/v1 # 772,700 sentences |
| 122 | + - opus_ELRC_2682/v1 # 772,699 sentences |
| 123 | + - opus_QED/v2.0a # 521,872 sentences |
| 124 | + - opus_Europarl/v8 # 408,290 sentences |
| 125 | + - opus_WikiMatrix/v1 # 357,970 sentences |
| 126 | + - opus_ELITR-ECA/v1 # 268,229 sentences |
| 127 | + - opus_TED2020/v1 # 249,483 sentences |
| 128 | + - opus_EUbookshop/v2 # 221,229 sentences |
| 129 | + - opus_SETIMES/v2 # 213,160 sentences |
| 130 | + - opus_NeuLab-TedTalks/v1 # 182,370 sentences |
| 131 | + - opus_ELRC-presscorner_covid/v1 # 164,779 sentences |
| 132 | + - opus_Tanzil/v1 # 135,477 sentences |
| 133 | + - opus_KDE4/v2 # 113,467 sentences |
| 134 | + - opus_Wikipedia/v1.0 # 79,781 sentences |
| 135 | + - opus_wikimedia/v20230407 # 62,347 sentences |
| 136 | + - opus_bible-uedin/v1 # 62,123 sentences |
| 137 | + - opus_ELRC-768-corpora_State_Admini/v1 # 52,153 sentences |
| 138 | + - opus_JRC-Acquis/v3.0 # 45,850 sentences |
| 139 | + - opus_ELRC-3563-EUR_LEX_covid/v1 # 23,010 sentences |
| 140 | + - opus_ELRC-EUR_LEX/v1 # 23,010 sentences |
| 141 | + - opus_ELRC-EUROPARL_covid/v1 # 21,842 sentences |
| 142 | + - opus_ELRC-1176-EUIPO_2017/v1 # 19,610 sentences |
| 143 | + - opus_ELRC-EUIPO_2017/v1 # 19,610 sentences |
| 144 | + - opus_Tatoeba/v2023-04-12 # 17,860 sentences |
| 145 | + - opus_ELRC-antibiotic/v1 # 14,736 sentences |
| 146 | + - opus_ELRC-913-2018_Proposal_a/v1 # 14,196 sentences |
| 147 | + - opus_ELRA-W0263/v1 # 14,195 sentences |
| 148 | + - opus_ELRC-2873-EU_publications_medi/v1 # 13,150 sentences |
| 149 | + - opus_ELRC-EU_publications/v1 # 13,150 sentences |
| 150 | + - opus_ELRC-wikipedia_health/v1 # 11,788 sentences |
| 151 | + - opus_ELRC-664-administration/v1 # 11,263 sentences |
| 152 | + - opus_ELRA-W0211/v1 # 11,262 sentences |
| 153 | + - opus_ELRC-1158-Official_Tourism_Por/v1 # 10,617 sentences |
| 154 | + - opus_ELRC-3604-presscorner_covid/v1 # 7,195 sentences |
| 155 | + - opus_ELRA-W0297/v1 # 6,480 sentences |
| 156 | + - opus_GlobalVoices/v2018q4 # 6,003 sentences |
| 157 | + - opus_ELRC-1000-legislation_Bulgaria/v1 # 4,610 sentences |
| 158 | + - opus_ELRC_3382/v1 # 3,691 sentences |
| 159 | + - opus_ELRC-408-transport/v1 # 3,624 sentences |
| 160 | + - opus_ELRC-1060-Information_society/v1 # 3,257 sentences |
| 161 | + - opus_ELRC-437-Bulgarian_strategic_/v1 # 2,990 sentences |
| 162 | + - opus_TildeMODEL/v2018 # 2,865 sentences |
| 163 | + - opus_ECDC/v2016-03-16 # 2,568 sentences |
| 164 | + - opus_ELRC-3462-EC_EUROPA_covid/v1 # 2,422 sentences |
| 165 | + - opus_ELRC-EC_EUROPA/v1 # 2,422 sentences |
| 166 | + - opus_ELRC-1056-open_broadband_infor/v1 # 2,390 sentences |
| 167 | + - opus_ELRA-W0134/v1 # 2,389 sentences |
| 168 | + - opus_ELRC-1055-ICT_Transport/v1 # 2,377 sentences |
| 169 | + - opus_ELRA-W0133/v1 # 2,376 sentences |
| 170 | + - opus_ELRC-391-Bulgarian_strategic_/v1 # 2,350 sentences |
| 171 | + - opus_ELRC-5067-SciPar/v1 # 2,302 sentences |
| 172 | + - opus_ELRC-1054-health_care_social/v1 # 2,097 sentences |
| 173 | + - opus_ELRC-412-EJTN_Handbook/v1 # 1,944 sentences |
| 174 | + - opus_ELRC-471-Bugarian_Revenue/v1 # 1,293 sentences |
| 175 | + - opus_ELRA-W0173/v1 # 1,292 sentences |
| 176 | + - opus_ELRC-3200-antibiotic/v1 # 891 sentences |
| 177 | + - opus_ELRC-3291-EUROPARL_covid/v1 # 728 sentences |
| 178 | + - opus_ELRC-3060-wikipedia_health/v1 # 630 sentences |
| 179 | + - opus_ELRC_2922/v1 # 629 sentences |
| 180 | + - opus_ELRC-2733-vaccination/v1 # 524 sentences |
| 181 | + - opus_ELRC-vaccination/v1 # 524 sentences |
| 182 | + - opus_ELRC_2923/v1 # 467 sentences |
| 183 | + - opus_ELRC-2549-Report_Best_practice/v1 # 215 sentences |
| 184 | + - mtdata_ELRC-competition_economics_judges-1-bul-eng # ~61 sentences (7.0 kB) |
| 185 | + - mtdata_ELRC-report_best_practices_initial_training_programs-1-bul-eng # ~203 sentences (23.0 kB) |
| 186 | + - mtdata_ELRC-eu_publications_medical_v2-1-bul-eng # ~16,373 sentences (1.9 MB) |
| 187 | + - mtdata_ELRC-web_acquired_data_related_to_scientific_research-1-eng-bul # ~464,317 sentences (52.5 MB) |
| 188 | + - mtdata_ELRC-hrw_dataset_v1-1-eng-bul # ~631,760 sentences (71.4 MB) |
| 189 | + - mtdata_EU-eac_forms-1-bul-eng # ~31,162 sentences (3.5 MB) |
| 190 | + - mtdata_EU-eac_reference-1-bul-eng # ~31,162 sentences (3.5 MB) |
| 191 | + - mtdata_EU-dcep-1-bul-eng # ~404,653 sentences (45.7 MB) |
| 192 | + - mtdata_Tilde-eesc-2017-bul-eng # ~1,121,039 sentences (126.7 MB) |
| 193 | + - mtdata_Tilde-ema-2016-bul-eng # ~274,955 sentences (31.1 MB) |
| 194 | + - mtdata_Tilde-ecb-2017-bul-eng # ~1,951 sentences (220.6 kB) |
| 195 | + - mtdata_Tilde-rapid-2016-bul-eng # ~198,340 sentences (22.4 MB) |
| 196 | + |
| 197 | + # The source-language (en) monolingual data contains: |
| 198 | + # ~676,854,488 sentences, not counting HPLT |
| 199 | + # Up to 70,000,000 additional sentences from HPLT |
| 200 | + mono-src: |
| 201 | + - news-crawl_news.2007 # ~1,557,522 sentences |
| 202 | + - news-crawl_news.2008 # ~5,389,380 sentences |
| 203 | + - news-crawl_news.2009 # ~6,557,522 sentences |
| 204 | + - news-crawl_news.2010 # ~3,247,787 sentences |
| 205 | + - news-crawl_news.2011 # ~6,318,584 sentences |
| 206 | + - news-crawl_news.2012 # ~6,407,079 sentences |
| 207 | + - news-crawl_news.2013 # ~10,619,469 sentences |
| 208 | + - news-crawl_news.2014 # ~10,619,469 sentences |
| 209 | + - news-crawl_news.2015 # ~10,619,469 sentences |
| 210 | + - news-crawl_news.2016 # ~7,982,300 sentences |
| 211 | + - news-crawl_news.2017 # ~11,504,424 sentences |
| 212 | + - news-crawl_news.2018 # ~7,920,353 sentences |
| 213 | + - news-crawl_news.2019 # ~17,699,115 sentences |
| 214 | + - news-crawl_news.2020 # ~22,123,893 sentences |
| 215 | + - news-crawl_news.2021 # ~21,238,938 sentences |
| 216 | + - news-crawl_news.2022 # ~23,008,849 sentences |
| 217 | + - news-crawl_news.2023 # ~23,008,849 sentences |
| 218 | + - news-crawl_news.2024 # ~18,584,070 sentences |
| 219 | + - hplt_mono/v2.0 # Up to 70,000,000 sentences |
| 220 | + - opus_NLLB/v1 # ~462,447,416 sentences |
| 221 | + |
| 222 | + # The target-language (bg) monolingual data contains: |
| 223 | + # ~57,427,433 sentences, not counting HPLT |
| 224 | + # Up to 70,000,000 additional sentences from HPLT |
| 225 | + mono-trg: |
| 226 | + - news-crawl_news.2013 # ~2,752,212 sentences |
| 227 | + - news-crawl_news.2014 # ~4,026,548 sentences |
| 228 | + - news-crawl_news.2015 # ~3,584,070 sentences |
| 229 | + - news-crawl_news.2016 # ~2,805,309 sentences |
| 230 | + - news-crawl_news.2017 # ~2,575,221 sentences |
| 231 | + - news-crawl_news.2018 # ~1,814,159 sentences |
| 232 | + - news-crawl_news.2019 # ~2,486,725 sentences |
| 233 | + - news-crawl_news.2020 # ~2,584,070 sentences |
| 234 | + - news-crawl_news.2021 # ~1,681,415 sentences |
| 235 | + - news-crawl_news.2022 # ~2,371,681 sentences |
| 236 | + - news-crawl_news.2023 # ~2,743,362 sentences |
| 237 | + - news-crawl_news.2024 # ~2,442,477 sentences |
| 238 | + - hplt_mono/v2.0 # Up to 70,000,000 sentences |
| 239 | + - opus_NLLB/v1 # ~25,560,184 sentences |
| 240 | +marian-args: |
| 241 | + decoding-backward: |
| 242 | + beam-size: '12' |
| 243 | + mini-batch-words: '2000' |
| 244 | + decoding-teacher: |
| 245 | + mini-batch-words: '5000' |
| 246 | + maxi-batch: '10000' |
| 247 | + training-backward: |
| 248 | + early-stopping: '5' |
| 249 | + training-teacher: |
| 250 | + early-stopping: '20' |
| 251 | + training-student: |
| 252 | + early-stopping: '15' |
| 253 | + training-student-finetuned: |
| 254 | + early-stopping: '20' |
| 255 | +target-stage: evaluate-teacher |
| 256 | +wandb-publication: true |
| 257 | +continuation: |
| 258 | + models: {} |
| 259 | +taskcluster: |
| 260 | + split-chunks: 20 |
| 261 | + upload-bucket: production |
| 262 | + worker-classes: |
| 263 | + default: gcp-spot |
| 264 | + corpus-align-parallel: gcp-standard |
| 265 | + corpus-align-backtranslations: gcp-standard |
| 266 | + corpus-align-distillation: gcp-standard |
| 267 | + distillation-corpus-build-shortlist: gcp-standard |
0 commit comments