|
| 1 | +# The initial configuration was generated using: |
| 2 | +# task config-generator -- --name docmt-vocab --remote_branch docmt-vocab en is |
| 3 | +# |
| 4 | +# The documentation for this config can be found here: |
| 5 | +# https://github.com/mozilla/translations/blob/96f10a07dc0205db531e99ca05af5d8c70b6ab61/taskcluster/configs/config.prod.yml |
| 6 | +experiment: |
| 7 | + name: docmt-vocab |
| 8 | + src: en |
| 9 | + trg: is |
| 10 | + best-model: chrf |
| 11 | + opuscleaner-mode: defaults |
| 12 | + archive-corpora: true |
| 13 | + bicleaner: |
| 14 | + default-threshold: 0.5 |
| 15 | + dataset-thresholds: {} |
| 16 | + monocleaner: |
| 17 | + mono-src: |
| 18 | + default-threshold: 0.0 |
| 19 | + dataset-thresholds: |
| 20 | + hplt_mono_v2_0: 0.5 |
| 21 | + opus_NLLB_v1: 0.5 |
| 22 | + mono-trg: |
| 23 | + default-threshold: 0.0 |
| 24 | + dataset-thresholds: |
| 25 | + hplt_mono_v2_0: 0.7 |
| 26 | + opus_NLLB_v1: 0.8 |
| 27 | + mono-max-sentences-src: |
| 28 | + total: 300_000_000 |
| 29 | + per-dataset: 100_000_000 |
| 30 | + mono-max-sentences-trg: |
| 31 | + total: 200_000_000 |
| 32 | + per-dataset: 100_000_000 |
| 33 | + hplt-min-doc-score: |
| 34 | + mono-src: 7.0 |
| 35 | + mono-trg: 9.0 |
| 36 | + spm-sample-size: 10_000_000 |
| 37 | + spm-vocab-size: 32000 |
| 38 | + spm-vocab-split: false |
| 39 | + teacher-ensemble: 1 |
| 40 | + teacher-mode: two-stage |
| 41 | + teacher-decoder: ctranslate2 |
| 42 | + student-model: base-memory |
| 43 | +datasets: |
| 44 | + devtest: |
| 45 | + - mtdata_aug-mix_ParIce-eea_dev-20.05-eng-isl |
| 46 | + - mtdata_aug-mix_ParIce-ema_dev-20.05-eng-isl |
| 47 | + - mtdata_aug-mix_ParIce-opensubtitles_dev-20.05-eng-isl |
| 48 | + - flores_aug-mix_dev |
| 49 | + - sacrebleu_aug-mix_wmt21 |
| 50 | + test: |
| 51 | + - mtdata_ParIce-eea_test-20.05-eng-isl |
| 52 | + - mtdata_ParIce-opensubtitles_test-20.05-eng-isl |
| 53 | + - flores_devtest |
| 54 | + - flores_aug-mix_devtest |
| 55 | + - flores_aug-noise_devtest |
| 56 | + - flores_aug-inline-noise_devtest |
| 57 | + - flores_aug-punct_devtest |
| 58 | + - flores_aug-title_devtest |
| 59 | + - flores_aug-upper_devtest |
| 60 | + - flores_aug-typos_devtest |
| 61 | + - sacrebleu_wmt21/systems |
| 62 | + - sacrebleu_wmt21/dev |
| 63 | + |
| 64 | + # The training data contains: |
| 65 | + # 25,112,409 sentences (sum of the opus_* datasets only; the approximate mtdata_* counts are not included) |
| 66 | + # |
| 67 | + # Skipped datasets: |
| 68 | + # - opus_CCMatrix/v1 - ignored datasets (8,723,145 sentences) |
| 69 | + # - opus_MultiHPLT/v2 - ignored datasets (2,694,541 sentences) |
| 70 | + # - opus_MultiMaCoCu/v2 - ignored datasets (267,366 sentences) |
| 71 | + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) |
| 72 | + # - opus_WikiTitles/v3 - ignored datasets (0 sentences) |
| 73 | + # - mtdata_ELRC-www.iceida.is-1-eng-isl - duplicate with opus |
| 74 | + # - mtdata_ELRC-www.pfs.is-1-eng-isl - duplicate with opus |
| 75 | + # - mtdata_ELRC-www.lanamal.is-1-eng-isl - duplicate with opus |
| 76 | + # - mtdata_ELRC-gallery_iceland-1-eng-isl - duplicate with opus |
| 77 | + # - mtdata_ELRC-bokmenntaborgin_is-1-eng-isl - duplicate with opus |
| 78 | + # - mtdata_ELRC-icelandic_medicines-1-eng-isl - duplicate with opus |
| 79 | + # - mtdata_ELRC-www.nordisketax.net-1-eng-isl - duplicate with opus |
| 80 | + # - mtdata_ELRC-statistics_iceland-1-eng-isl - duplicate with opus |
| 81 | + # - mtdata_ELRC-www.norden.org-1-eng-isl - duplicate with opus |
| 82 | + # - mtdata_ELRC-emea-1-eng-isl - duplicate with opus |
| 83 | + # - mtdata_ELRC-antibiotic-1-eng-isl - duplicate with opus |
| 84 | + # - mtdata_ELRC-www.malfong.is-1-eng_GB-isl_IS - duplicate with opus |
| 85 | + # - mtdata_ELRC-ríkiskaup_2020-1-eng-isl - duplicate with opus |
| 86 | + # - mtdata_ELRC-university_iceland-1-eng_GB-isl_IS - duplicate with opus |
| 87 | + # - mtdata_ELRC-scipar-1-eng-isl - duplicate with opus |
| 88 | + # - mtdata_EU-ecdc-1-eng-isl - duplicate with opus |
| 89 | + # - mtdata_Facebook-wikimatrix-1-eng-isl - duplicate with opus |
| 90 | + # - mtdata_ParIce-eea_train-20.05-eng-isl - Error fetching (https://repository.clarin.is/repository/xmlui/bitstream/handle/20.500.12537/24/Parice_dev_test.20.05.zip) |
| 91 | + # - mtdata_ParIce-ema_train-20.05-eng-isl - Error fetching (https://repository.clarin.is/repository/xmlui/bitstream/handle/20.500.12537/24/Parice_dev_test.20.05.zip) |
| 92 | + # - mtdata_ParIce-ema_test-20.05-eng-isl - Error fetching (https://repository.clarin.is/repository/xmlui/bitstream/handle/20.500.12537/24/Parice_dev_test.20.05.zip) |
| 93 | + # - mtdata_ParaCrawl-paracrawl-6-eng-isl - duplicate with opus |
| 94 | + # - mtdata_ParaCrawl-paracrawl-7.1-eng-isl - duplicate with opus |
| 95 | + # - mtdata_ParaCrawl-paracrawl-8-eng-isl - duplicate with opus |
| 96 | + # - mtdata_ParaCrawl-paracrawl-9-eng-isl - duplicate with opus |
| 97 | + # - mtdata_Statmt-wikititles-3-isl-eng - duplicate with opus |
| 98 | + # - mtdata_Statmt-ccaligned-1-eng-isl_IS - duplicate with opus |
| 99 | + train: |
| 100 | + - opus_NLLB/v1 # 8,723,145 sentences |
| 101 | + - opus_OpenSubtitles/v2024 # 4,057,039 sentences |
| 102 | + - opus_ParaCrawl/v9 # 2,967,579 sentences |
| 103 | + - opus_HPLT/v2 # 2,694,541 sentences |
| 104 | + - opus_ParIce/v1 # 2,097,022 sentences |
| 105 | + - opus_CCAligned/v1 # 1,192,542 sentences |
| 106 | + - opus_XLEnt/v1.2 # 962,661 sentences |
| 107 | + - opus_ELRC-2718-EMEA/v1 # 542,624 sentences |
| 108 | + - opus_ELRC-EMEA/v1 # 542,624 sentences |
| 109 | + - opus_TildeMODEL/v2018 # 420,712 sentences |
| 110 | + - opus_MaCoCu/v2 # 267,366 sentences |
| 111 | + - opus_ELRC-5067-SciPar/v1 # 110,831 sentences |
| 112 | + - opus_KDE4/v2 # 98,989 sentences |
| 113 | + - opus_WikiMatrix/v1 # 85,992 sentences |
| 114 | + - opus_bible-uedin/v1 # 62,163 sentences |
| 115 | + - opus_ELRC-728-www.norden.org/v1 # 41,073 sentences |
| 116 | + - opus_ELRC-www.norden.org/v1 # 41,073 sentences |
| 117 | + - opus_ELRC-4327-Government_Offices_I/v1 # 36,290 sentences |
| 118 | + - opus_GNOME/v1 # 28,776 sentences |
| 119 | + - opus_QED/v2.0a # 27,611 sentences |
| 120 | + - opus_ELRC-4324-Government_Offices_I/v1 # 18,185 sentences |
| 121 | + - opus_ELRC-antibiotic/v1 # 13,070 sentences |
| 122 | + - opus_ELRC-4295-www.malfong.is/v1 # 12,634 sentences |
| 123 | + - opus_ELRC-4334-Rkiskaup_2020/v1 # 10,236 sentences |
| 124 | + - opus_ELRC-4338-University_Iceland/v1 # 10,164 sentences |
| 125 | + - opus_EUbookshop/v2 # 9,783 sentences |
| 126 | + - opus_Tatoeba/v2023-04-12 # 9,600 sentences |
| 127 | + - opus_wikimedia/v20230407 # 4,471 sentences |
| 128 | + - opus_ELRC-505-www.pfs.is/v1 # 2,866 sentences |
| 129 | + - opus_ECDC/v2016-03-16 # 2,512 sentences |
| 130 | + - opus_TED2020/v1 # 2,430 sentences |
| 131 | + - opus_ELRC-508-Tilde_Statistics_Ice/v1 # 2,427 sentences |
| 132 | + - opus_ELRC-718-Statistics_Iceland/v1 # 2,361 sentences |
| 133 | + - opus_ELRC-517-Icelandic_Directorat/v1 # 1,536 sentences |
| 134 | + - opus_ELRC-502-Icelandic_Financial_/v1 # 1,525 sentences |
| 135 | + - opus_ELRC-510-Harpa_Reykjavik_Conc/v1 # 1,197 sentences |
| 136 | + - opus_ELRC-506-www.lanamal.is/v1 # 1,140 sentences |
| 137 | + - opus_ELRC-597-www.nordisketax.net/v1 # 1,065 sentences |
| 138 | + - opus_ELRC-www.nordisketax.net/v1 # 1,065 sentences |
| 139 | + - opus_ELRC-504-www.iceida.is/v1 # 1,055 sentences |
| 140 | + - opus_ELRC-3206-antibiotic/v1 # 816 sentences |
| 141 | + - opus_ELRC-516-Icelandic_Medicines/v1 # 711 sentences |
| 142 | + - opus_ELRC-509-Gallery_Iceland/v1 # 577 sentences |
| 143 | + - opus_ELRC-511-bokmenntaborgin_is/v1 # 330 sentences |
| 144 | + - mtdata_ELRC-icelandic_financial_supervisory_authority-1-eng-isl # ~1,158 sentences (130.9 kB) |
| 145 | + - mtdata_ELRC-tilde_statistics_iceland-1-eng-isl # ~1,778 sentences (201.0 kB) |
| 146 | + - mtdata_ELRC-harpa_reykjavik_concert_hall_conference_centre-1-eng-isl # ~1,520 sentences (171.8 kB) |
| 147 | + - mtdata_ELRC-icelandic_directorate_immigration-1-eng-isl # ~1,013 sentences (114.5 kB) |
| 148 | + - mtdata_ELRC-government_offices_iceland_reports-1-eng-isl # ~19,340 sentences (2.2 MB) |
| 149 | + - mtdata_ELRC-government_offices_iceland_legislation_regulations-1-eng-isl # ~38,492 sentences (4.3 MB) |
| 150 | + - mtdata_EU-eac_forms-1-eng-isl # ~31,162 sentences (3.5 MB) |
| 151 | + - mtdata_EU-eac_reference-1-eng-isl # ~31,162 sentences (3.5 MB) |
| 152 | + - mtdata_Statmt-newsdev_enis-2021-eng-isl # ~460,669 sentences (52.1 MB) |
| 153 | + - mtdata_Statmt-newsdev_isen-2021-isl-eng # ~460,669 sentences (52.1 MB) |
| 154 | + - mtdata_Tilde-eesc-2017-eng-isl # ~221 sentences (25.1 kB) |
| 155 | + - mtdata_Tilde-ema-2016-eng-isl # ~201,134 sentences (22.7 MB) |
| 156 | + - mtdata_Tilde-rapid-2016-eng-isl # ~173 sentences (19.6 kB) |
| 157 | + |
| 158 | + # The source-language (en) monolingual data contains: |
| 159 | + # ~676,854,488 sentences (news-crawl and opus_NLLB, not counting HPLT) |
| 160 | + # plus up to 100,000,000 sentences from HPLT |
| 161 | + mono-src: |
| 162 | + - news-crawl_news.2007 # ~1,557,522 sentences |
| 163 | + - news-crawl_news.2008 # ~5,389,380 sentences |
| 164 | + - news-crawl_news.2009 # ~6,557,522 sentences |
| 165 | + - news-crawl_news.2010 # ~3,247,787 sentences |
| 166 | + - news-crawl_news.2011 # ~6,318,584 sentences |
| 167 | + - news-crawl_news.2012 # ~6,407,079 sentences |
| 168 | + - news-crawl_news.2013 # ~10,619,469 sentences |
| 169 | + - news-crawl_news.2014 # ~10,619,469 sentences |
| 170 | + - news-crawl_news.2015 # ~10,619,469 sentences |
| 171 | + - news-crawl_news.2016 # ~7,982,300 sentences |
| 172 | + - news-crawl_news.2017 # ~11,504,424 sentences |
| 173 | + - news-crawl_news.2018 # ~7,920,353 sentences |
| 174 | + - news-crawl_news.2019 # ~17,699,115 sentences |
| 175 | + - news-crawl_news.2020 # ~22,123,893 sentences |
| 176 | + - news-crawl_news.2021 # ~21,238,938 sentences |
| 177 | + - news-crawl_news.2022 # ~23,008,849 sentences |
| 178 | + - news-crawl_news.2023 # ~23,008,849 sentences |
| 179 | + - news-crawl_news.2024 # ~18,584,070 sentences |
| 180 | + - hplt_mono/v2.0 # Up to 100,000,000 sentences |
| 181 | + - opus_NLLB/v1 # ~462,447,416 sentences |
| 182 | + |
| 183 | + # The target-language (is) monolingual data contains: |
| 184 | + # ~1,075,648 sentences (opus_NLLB, not counting HPLT) |
| 185 | + # plus up to 100,000,000 sentences from HPLT |
| 186 | + mono-trg: |
| 187 | + - hplt_mono/v2.0 # Up to 100,000,000 sentences |
| 188 | + - opus_NLLB/v1 # ~1,075,648 sentences |
| 189 | +marian-args: |
| 190 | + decoding-backward: |
| 191 | + beam-size: '12' |
| 192 | + mini-batch-words: '2000' |
| 193 | + decoding-teacher: |
| 194 | + mini-batch-words: '5000' |
| 195 | + maxi-batch: '10000' |
| 196 | + training-backward: |
| 197 | + early-stopping: '5' |
| 198 | + training-teacher: |
| 199 | + early-stopping: '20' |
| 200 | + training-student: |
| 201 | + early-stopping: '15' |
| 202 | + training-student-finetuned: |
| 203 | + early-stopping: '20' |
| 204 | +target-stage: all-pipeline |
| 205 | +wandb-publication: true |
| 206 | +continuation: |
| 207 | + models: {} |
| 208 | +taskcluster: |
| 209 | + split-chunks: 20 |
| 210 | + upload-bucket: production |
| 211 | + worker-classes: |
| 212 | + default: gcp-spot |
| 213 | + corpus-align-parallel: gcp-standard |
| 214 | + corpus-align-backtranslations: gcp-standard |
| 215 | + corpus-align-distillation: gcp-standard |
| 216 | + distillation-corpus-build-shortlist: gcp-standard |
0 commit comments