Skip to content

Commit f81d492

Browse files
authored
Add NTREX dataset downloader support (#1399)
* Add NTREX dataset downloader support * Fix ntrex url
1 parent 9523a0d commit f81d492

File tree

11 files changed

+3690
-0
lines changed

11 files changed

+3690
-0
lines changed

docs/data-and-cleaning/datasets.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ Data source | Prefix | Name examples
1515
[OPUS](https://opus.nlpl.eu/) | opus | ParaCrawl/v7.1 | parallel | Many open source datasets. Go to the website, choose a language pair, check links under Moses column to see what names and version is used in a link.
1616
[SacreBLEU](https://github.com/mjpost/sacrebleu) | sacrebleu | wmt20 | parallel | Official evaluation datasets available in SacreBLEU tool. Recommended to use in `datasets:test` config section. Look up supported datasets and language pairs in `sacrebleu.dataset` python module.
1717
[Flores](https://github.com/facebookresearch/flores) | flores | dev, devtest | parallel | Evaluation dataset from Facebook that supports 100 languages.
18+
[NTREX-128](https://github.com/MicrosoftTranslator/NTREX) | ntrex | test | parallel | Evaluation dataset from Microsoft that supports 128 languages.
1819
Custom parallel | url | `https://storage.googleapis.com/releng-translations-dev/data/en-ru/pytest-dataset.[LANG].zst` | parallel | A custom zst compressed parallel dataset, for instance uploaded to GCS. The language pairs should be split into two files. the `[LANG]` will be replaced with the `to` and `from` language codes.
1920
[News crawl](http://data.statmt.org/news-crawl) | news-crawl | news.2019 | mono | Monolingual news datasets from [WMT](https://www.statmt.org/wmt21/translation-task.html)
2021
[OPUS](https://opus.nlpl.eu/) | opus | tldr-pages/v2023-08-29 | mono | Monolingual dataset from OPUS.

pipeline/data/parallel_downloaders.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ class Downloader(Enum):
2323
mtdata = "mtdata"
2424
sacrebleu = "sacrebleu"
2525
flores = "flores"
26+
ntrex = "ntrex"
2627
url = "url"
2728
tmx = "tmx"
2829

@@ -267,10 +268,35 @@ def flores(src: LangCode, trg: LangCode, dataset: str, output_prefix: Path):
267268
logger.info("Done: Downloading flores corpus")
268269

269270

271+
def ntrex(src: LangCode, trg: LangCode, dataset: str, output_prefix: Path):
    """
    Download the NTREX-128 evaluation dataset for a language pair.

    Fetches the raw text file for each side of the pair from a pinned GitHub
    revision, writes it next to ``output_prefix``, and zst-compresses the
    result (original uncompressed file is removed).

    https://github.com/MicrosoftTranslator/NTREX
    """
    # NTREX ships a single evaluation split; anything else is a config error.
    if dataset != "test":
        raise ValueError(f"Dataset subset '{dataset}' for NTREX does not exist")

    logger.info("Downloading ntrex corpus")

    # Pin a fixed revision so downloads stay reproducible over time.
    revision = "468c6b6"
    base_url = f"https://github.com/MicrosoftTranslator/NTREX/raw/{revision}/NTREX-128"

    for side in (src, trg):
        code = side.ntrex()
        # English files are the original source text ("src"); all other
        # languages are references ("ref") in the NTREX file naming scheme.
        kind = "ref" if code != "eng" else "src"
        url = f"{base_url}/newstest2019-{kind}.{code}.txt"
        destination = output_prefix.with_suffix(f".{side}")
        stream_download_to_file(url, destination)
        compress_file(destination, keep_original=False, compression="zst")

    logger.info("Done: Downloading ntrex corpus")
293+
294+
270295
mapping = {
271296
Downloader.opus: opus,
272297
Downloader.sacrebleu: sacrebleu,
273298
Downloader.flores: flores,
299+
Downloader.ntrex: ntrex,
274300
Downloader.url: url,
275301
Downloader.mtdata: mtdata,
276302
Downloader.tmx: tmx,

pipeline/langs/codes.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@
3636
PONTOON_LANGUAGES,
3737
FLORES_101_DEFAULTS_MAP,
3838
FLORES_PLUS_DEFAULTS_MAP,
39+
NTREX_LANGS,
40+
NTREX_DEFAULTS_MAP,
3941
BICLEANER_AI_DEFAULTS_MAP,
4042
BOUQUET_DEFAULTS_MAP,
4143
GOOGLE_LANGS,
@@ -231,6 +233,12 @@ def flores101(self) -> str:
231233
return FLORES_101_DEFAULTS_MAP[lang]
232234
return self._find_code(FLORES_101_LANGUAGES, check_script=True)
233235

236+
def ntrex(self) -> str:
    """Return the NTREX-128 language code for this language.

    Explicit overrides in NTREX_DEFAULTS_MAP win (e.g. "zh" -> "zho-CN");
    otherwise the code is matched against the full NTREX language list.
    """
    override = NTREX_DEFAULTS_MAP.get(str(self))
    if override is not None:
        return override
    return self._find_code(NTREX_LANGS, check_script=False)
241+
234242
def pontoon(self) -> str:
235243
# zh_hant -> zh-TW
236244
lang = str(self)

pipeline/langs/maps.py

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1096,6 +1096,146 @@
10961096
"zh_hant": "zho_trad",
10971097
}
10981098

1099+
# Language codes available in the NTREX-128 dataset, exactly as they appear in
# its file names (e.g. "newstest2019-ref.afr.txt").
# https://github.com/MicrosoftTranslator/NTREX
# Fix: the original set literal listed "spa" twice; the duplicate is removed
# (harmless in a set, but misleading). The deduplicated set has 128 entries,
# matching the "128" in NTREX-128.
NTREX_LANGS = {
    "afr",
    "amh",
    "arb",
    "aze-Latn",
    "bak",
    "bel",
    "bem",
    "ben",
    "bod",
    "bos",
    "bul",
    "cat",
    "ces",
    "ckb-Arab",
    "cym",
    "dan",
    "deu",
    "div",
    "dzo",
    "ell",
    "eng",
    "eng-GB",
    "eng-IN",
    "eng-US",
    "est",
    "eus",
    "ewe",
    "fao",
    "fas",
    "fij",
    "fil",
    "fin",
    "fra",
    "fra-CA",
    "fuc",
    "gle",
    "glg",
    "guj",
    "hau",
    "heb",
    "hin",
    "hmn",
    "hrv",
    "hun",
    "hye",
    "ibo",
    "ind",
    "isl",
    "ita",
    "jpn",
    "kan",
    "kat",
    "kaz",
    "khm",
    "kin",
    "kir",
    "kmr",
    "kor",
    "lao",
    "lav",
    "lit",
    "ltz",
    "mal",
    "mar",
    "mey",
    "mkd",
    "mlg",
    "mlt",
    "mon",
    "mri",
    "msa",
    "mya",
    "nde",
    "nep",
    "nld",
    "nno",
    "nob",
    "nso",
    "nya",
    "orm",
    "pan",
    "pol",
    "por",
    "por-BR",
    "prs",
    "pus",
    "ron",
    "rus",
    "shi",
    "sin",
    "slk",
    "slv",
    "smo",
    "sna-Latn",
    "snd-Arab",
    "som",
    "spa",
    "spa-MX",
    "sqi",
    "srp-Cyrl",
    "srp-Latn",
    "ssw",
    "swa",
    "swe",
    "tah",
    "tam",
    "tat",
    "tel",
    "tgk-Cyrl",
    "tha",
    "tir",
    "ton",
    "tsn",
    "tuk",
    "tur",
    "uig",
    "ukr",
    "urd",
    "uzb",
    "ven",
    "vie",
    "wol",
    "xho",
    "yor",
    "yue",
    "zho-CN",
    "zho-TW",
    "zul",
}

# Maps a pipeline language code to the NTREX code to use when the pipeline
# code is ambiguous about script/region (e.g. "zh" defaults to "zho-CN").
NTREX_DEFAULTS_MAP = {
    "az": "aze-Latn",
    "hbs": "hrv",
    "sr": "srp-Cyrl",
    "zh": "zho-CN",
    "zh_hant": "zho-TW",
}
1238+
10991239
# https://huggingface.co/bitextor/models
11001240
BICLEANER_AI_DEFAULTS_MAP = {
11011241
"zh_hant": "zh",

taskcluster/kinds/dataset/kind.yml

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,36 @@ tasks:
8282
--src {src_locale}
8383
--trg {trg_locale}
8484
85+
# Dataset-download task for NTREX-128; mirrors the sibling flores/sacrebleu
# task definitions and delegates the actual download to parallel_importer.py.
ntrex:
  description: Fetch a NTREX-128 dataset.
  label: dataset-ntrex-{dataset_sanitized}-{src_locale}-{trg_locale}
  worker-type: b-cpu
  dataset-config:
    provider: ntrex
  attributes:
    cache:
      # A change to any of these files invalidates cached ntrex tasks.
      resources:
        - pipeline/data/parallel_downloaders.py
        - pipeline/data/parallel_importer.py
        - pipeline/data/requirements/data.txt
        - pipeline/data/cjk.py
        - pipeline/langs/codes.py
        - pipeline/langs/maps.py
        - pipeline/langs/scripts.py
  run:
    command:
      - bash
      - -c
      # ">-" folds the multi-line scalar into a single shell command line.
      - >-
        pip3 install --upgrade pip setuptools &&
        pip3 install -r $VCS_PATH/pipeline/data/requirements/data.txt &&
        export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
        python3 $VCS_PATH/pipeline/data/parallel_importer.py
        --dataset {dataset}
        --output_prefix $TASK_WORKDIR/artifacts/{dataset_sanitized}
        --src {src_locale}
        --trg {trg_locale}
114+
85115
sacrebleu:
86116
description: Fetch a sacrebleu dataset.
87117
label: dataset-sacrebleu-{dataset_sanitized}-{src_locale}-{trg_locale}

0 commit comments

Comments
 (0)