Skip to content

Commit f81d492

Browse files
authored
Add NTREX dataset downloader support (#1399)
* Add NTREX dataset downloader support * Fix ntrex url
1 parent 9523a0d commit f81d492

File tree

11 files changed

+3690
-0
lines changed

11 files changed

+3690
-0
lines changed

docs/data-and-cleaning/datasets.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ Data source | Prefix | Name examples
1515
[OPUS](https://opus.nlpl.eu/) | opus | ParaCrawl/v7.1 | parallel | Many open source datasets. Go to the website, choose a language pair, check links under Moses column to see what names and version is used in a link.
1616
[SacreBLEU](https://github.com/mjpost/sacrebleu) | sacrebleu | wmt20 | parallel | Official evaluation datasets available in SacreBLEU tool. Recommended to use in `datasets:test` config section. Look up supported datasets and language pairs in `sacrebleu.dataset` python module.
1717
[Flores](https://github.com/facebookresearch/flores) | flores | dev, devtest | parallel | Evaluation dataset from Facebook that supports 100 languages.
18+
[NTREX-128](https://github.com/MicrosoftTranslator/NTREX) | ntrex | test | parallel | Evaluation dataset from Microsoft that supports 128 languages.
1819
Custom parallel | url | `https://storage.googleapis.com/releng-translations-dev/data/en-ru/pytest-dataset.[LANG].zst` | parallel | A custom zst compressed parallel dataset, for instance uploaded to GCS. The language pairs should be split into two files. the `[LANG]` will be replaced with the `to` and `from` language codes.
1920
[News crawl](http://data.statmt.org/news-crawl) | news-crawl | news.2019 | mono | Monolingual news datasets from [WMT](https://www.statmt.org/wmt21/translation-task.html)
2021
[OPUS](https://opus.nlpl.eu/) | opus | tldr-pages/v2023-08-29 | mono | Monolingual dataset from OPUS.

pipeline/data/parallel_downloaders.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ class Downloader(Enum):
2323
mtdata = "mtdata"
2424
sacrebleu = "sacrebleu"
2525
flores = "flores"
26+
ntrex = "ntrex"
2627
url = "url"
2728
tmx = "tmx"
2829

@@ -267,10 +268,35 @@ def flores(src: LangCode, trg: LangCode, dataset: str, output_prefix: Path):
267268
logger.info("Done: Downloading flores corpus")
268269

269270

271+
def ntrex(src: LangCode, trg: LangCode, dataset: str, output_prefix: Path):
    """
    Download the NTREX-128 evaluation dataset for a language pair.

    Fetches the raw text file for each side of the pair from a pinned GitHub
    revision, writes it next to ``output_prefix``, and zst-compresses the
    result (original uncompressed file is removed).

    https://github.com/MicrosoftTranslator/NTREX
    """
    # NTREX ships a single evaluation split; anything else is a config error.
    if dataset != "test":
        raise ValueError(f"Dataset subset '{dataset}' for NTREX does not exist")

    logger.info("Downloading ntrex corpus")

    # Pin a fixed revision so downloads stay reproducible over time.
    revision = "468c6b6"
    base_url = f"https://github.com/MicrosoftTranslator/NTREX/raw/{revision}/NTREX-128"

    for side in (src, trg):
        code = side.ntrex()
        # English files are the original source text ("src"); all other
        # languages are references ("ref") in the NTREX file naming scheme.
        kind = "ref" if code != "eng" else "src"
        url = f"{base_url}/newstest2019-{kind}.{code}.txt"
        destination = output_prefix.with_suffix(f".{side}")
        stream_download_to_file(url, destination)
        compress_file(destination, keep_original=False, compression="zst")

    logger.info("Done: Downloading ntrex corpus")
293+
294+
270295
mapping = {
271296
Downloader.opus: opus,
272297
Downloader.sacrebleu: sacrebleu,
273298
Downloader.flores: flores,
299+
Downloader.ntrex: ntrex,
274300
Downloader.url: url,
275301
Downloader.mtdata: mtdata,
276302
Downloader.tmx: tmx,

pipeline/langs/codes.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@
3636
PONTOON_LANGUAGES,
3737
FLORES_101_DEFAULTS_MAP,
3838
FLORES_PLUS_DEFAULTS_MAP,
39+
NTREX_LANGS,
40+
NTREX_DEFAULTS_MAP,
3941
BICLEANER_AI_DEFAULTS_MAP,
4042
BOUQUET_DEFAULTS_MAP,
4143
GOOGLE_LANGS,
@@ -231,6 +233,12 @@ def flores101(self) -> str:
231233
return FLORES_101_DEFAULTS_MAP[lang]
232234
return self._find_code(FLORES_101_LANGUAGES, check_script=True)
233235

236+
def ntrex(self) -> str:
    """Return the NTREX-128 language code for this language.

    Explicit overrides in NTREX_DEFAULTS_MAP win (e.g. "zh" -> "zho-CN");
    otherwise the code is matched against the full NTREX language list.
    """
    override = NTREX_DEFAULTS_MAP.get(str(self))
    if override is not None:
        return override
    return self._find_code(NTREX_LANGS, check_script=False)
241+
234242
def pontoon(self) -> str:
235243
# zh_hant -> zh-TW
236244
lang = str(self)

pipeline/langs/maps.py

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1096,6 +1096,146 @@
10961096
"zh_hant": "zho_trad",
10971097
}
10981098

1099+
# Language codes available in the NTREX-128 dataset, exactly as they appear in
# its file names (e.g. "newstest2019-ref.afr.txt").
# https://github.com/MicrosoftTranslator/NTREX
# Fix: the original set literal listed "spa" twice; the duplicate is removed
# (harmless in a set, but misleading). The deduplicated set has 128 entries,
# matching the "128" in NTREX-128.
NTREX_LANGS = {
    "afr",
    "amh",
    "arb",
    "aze-Latn",
    "bak",
    "bel",
    "bem",
    "ben",
    "bod",
    "bos",
    "bul",
    "cat",
    "ces",
    "ckb-Arab",
    "cym",
    "dan",
    "deu",
    "div",
    "dzo",
    "ell",
    "eng",
    "eng-GB",
    "eng-IN",
    "eng-US",
    "est",
    "eus",
    "ewe",
    "fao",
    "fas",
    "fij",
    "fil",
    "fin",
    "fra",
    "fra-CA",
    "fuc",
    "gle",
    "glg",
    "guj",
    "hau",
    "heb",
    "hin",
    "hmn",
    "hrv",
    "hun",
    "hye",
    "ibo",
    "ind",
    "isl",
    "ita",
    "jpn",
    "kan",
    "kat",
    "kaz",
    "khm",
    "kin",
    "kir",
    "kmr",
    "kor",
    "lao",
    "lav",
    "lit",
    "ltz",
    "mal",
    "mar",
    "mey",
    "mkd",
    "mlg",
    "mlt",
    "mon",
    "mri",
    "msa",
    "mya",
    "nde",
    "nep",
    "nld",
    "nno",
    "nob",
    "nso",
    "nya",
    "orm",
    "pan",
    "pol",
    "por",
    "por-BR",
    "prs",
    "pus",
    "ron",
    "rus",
    "shi",
    "sin",
    "slk",
    "slv",
    "smo",
    "sna-Latn",
    "snd-Arab",
    "som",
    "spa",
    "spa-MX",
    "sqi",
    "srp-Cyrl",
    "srp-Latn",
    "ssw",
    "swa",
    "swe",
    "tah",
    "tam",
    "tat",
    "tel",
    "tgk-Cyrl",
    "tha",
    "tir",
    "ton",
    "tsn",
    "tuk",
    "tur",
    "uig",
    "ukr",
    "urd",
    "uzb",
    "ven",
    "vie",
    "wol",
    "xho",
    "yor",
    "yue",
    "zho-CN",
    "zho-TW",
    "zul",
}

# Maps a pipeline language code to the NTREX code to use when the pipeline
# code is ambiguous about script/region (e.g. "zh" defaults to "zho-CN").
NTREX_DEFAULTS_MAP = {
    "az": "aze-Latn",
    "hbs": "hrv",
    "sr": "srp-Cyrl",
    "zh": "zho-CN",
    "zh_hant": "zho-TW",
}
1238+
10991239
# https://huggingface.co/bitextor/models
11001240
BICLEANER_AI_DEFAULTS_MAP = {
11011241
"zh_hant": "zh",

taskcluster/kinds/dataset/kind.yml

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,36 @@ tasks:
8282
--src {src_locale}
8383
--trg {trg_locale}
8484
85+
# Dataset-download task for NTREX-128; mirrors the sibling flores/sacrebleu
# task definitions and delegates the actual download to parallel_importer.py.
ntrex:
  description: Fetch a NTREX-128 dataset.
  label: dataset-ntrex-{dataset_sanitized}-{src_locale}-{trg_locale}
  worker-type: b-cpu
  dataset-config:
    provider: ntrex
  attributes:
    cache:
      # A change to any of these files invalidates cached ntrex tasks.
      resources:
        - pipeline/data/parallel_downloaders.py
        - pipeline/data/parallel_importer.py
        - pipeline/data/requirements/data.txt
        - pipeline/data/cjk.py
        - pipeline/langs/codes.py
        - pipeline/langs/maps.py
        - pipeline/langs/scripts.py
  run:
    command:
      - bash
      - -c
      # ">-" folds the multi-line scalar into a single shell command line.
      - >-
        pip3 install --upgrade pip setuptools &&
        pip3 install -r $VCS_PATH/pipeline/data/requirements/data.txt &&
        export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
        python3 $VCS_PATH/pipeline/data/parallel_importer.py
        --dataset {dataset}
        --output_prefix $TASK_WORKDIR/artifacts/{dataset_sanitized}
        --src {src_locale}
        --trg {trg_locale}
114+
85115
sacrebleu:
86116
description: Fetch a sacrebleu dataset.
87117
label: dataset-sacrebleu-{dataset_sanitized}-{src_locale}-{trg_locale}

0 commit comments

Comments
 (0)