"""Build ``shared_task_langs.md``, the shared-task languages summary.

The script reads per-language statistics and the list of shared-task
locales from CSV files, loads the datasheet of every locale (English first,
Spanish as fallback), and writes one markdown document made of a stats table
followed by one short section per language.
"""
import csv
import json
import re

from scripts.datasheet import CVDatasheet, DatasheetSection

# Phrases that mark a datasheet header as auto-generated (English / Spanish).
_AUTOGENERATED_MARKERS = ("generated automatically", "generada automáticamente")
# Phrases that get turned into a markdown link to the published dataset.
_LINKED_PHRASES = ("This datasheet", "Esta ficha técnica")
# Datasheet language folders, in lookup order.
_DATASHEET_LANGS = ("en", "es")


def read_json_file(file_name: str) -> dict:
    """Read a JSON file from disk.

    Parameters
    ----------
    file_name : str
        File name

    Returns
    -------
    dict
        JSON data as a python dict
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(file_name, "r", encoding="utf-8") as f:
        return json.load(f)


def read_csv_file(file_name: str, delimiter: str = ",") -> list[list[str]]:
    """Read a delimited text file, skipping its header row.

    Parameters
    ----------
    file_name : str
        File name
    delimiter : str
        Field delimiter passed to ``csv.reader`` (``","`` by default,
        use ``"\\t"`` for TSV files)

    Returns
    -------
    list
        One list of strings per data row; the first row of the file is
        treated as a header and dropped. An empty file yields ``[]``.
    """
    with open(file_name, "r", newline="", encoding="utf-8") as file:
        reader = csv.reader(file, delimiter=delimiter)
        # Skipping the header
        next(reader, None)
        return list(reader)


def extract_row_data(row: list) -> dict:
    """Map the first five columns of a languages row to named fields.

    Parameters
    ----------
    row : list
        Row with at least five columns, in the order locale, speakers,
        recorded, transcribed, validated

    Returns
    -------
    dict
        The five values keyed by field name

    Raises
    ------
    IndexError
        If the row has fewer than five columns
    """
    return {
        "locale": row[0],
        "speakers": row[1],
        "recorded": row[2],
        "transcribed": row[3],
        "validated": row[4],
    }


def add_section(
    ds: CVDatasheet,
    section: DatasheetSection,
    content: str,
    level: int,
):
    """Append a new section to ``ds`` unless one with the same title exists.

    Parameters
    ----------
    ds : CVDatasheet
        Datasheet that receives the section
    section : DatasheetSection
        Section whose title is reused; square brackets are removed from it
    content : str
        Markdown body of the new section
    level : int
        Heading level, i.e. the number of ``#`` characters
    """
    section_title = re.sub(
        r"[\[\]]",
        "",
        section.title,
    )
    if section_title in ds._section_map:
        print(f"Section {section_title} already exists. Skipping addition.")
        return
    new_section_md = "#" * level + f" {section_title}\n\n{content}"
    new_section = DatasheetSection(raw_text=new_section_md)
    # Keep the ordered list and the title lookup in sync.
    ds.sections.append(new_section)
    ds._section_map[section_title] = new_section


LOCALE_DATASET_LINKS = read_json_file("mdc_locale_datasets_links.json")
SHARED_TASK_LANGS = read_csv_file("shared_task_langs.csv")
SHARED_TASK_STATS = read_csv_file("shared_task_stats.csv")
DATASHEETS_PATH = "cv-corpus/sps/23.0-2025-09-05/final/{lang}"


def _load_datasheet_text(locale: str) -> str:
    """Return the datasheet text of ``locale``, trying each language folder."""
    for lang in _DATASHEET_LANGS:
        path = DATASHEETS_PATH.format(lang=lang)
        try:
            with open(f"{path}/{locale}.md", "r", encoding="utf-8") as f:
                return f.read()
        except FileNotFoundError:
            continue
    raise FileNotFoundError(
        f"No datasheet found for locale {locale!r} "
        f"(looked in: {', '.join(_DATASHEET_LANGS)})"
    )


def _build_section_content(header_content: str, datasheet_link: str) -> str:
    """Shorten an auto-generated header and link it to the dataset page."""
    content = header_content
    # Both markers must be tested with ``in``; a bare string literal on the
    # right of ``or`` is always truthy.
    if any(marker in content for marker in _AUTOGENERATED_MARKERS):
        tail = content.split("\n")[-3:]
        # Join with a space so words on either side of a line break do not
        # fuse together (e.g. "recorded" + "speech").
        content = " ".join(part.strip() for part in tail if part.strip())
    for phrase in _LINKED_PHRASES:
        content = content.replace(phrase, f"[{phrase}]({datasheet_link})")
    return content


def main():
    """Generate ``shared_task_langs.md`` from the stats and the datasheets."""
    output_ds = CVDatasheet("## Languages description\n\n### Languages Summary")

    stats_rows = "".join(
        f"|{row[0]}|{float(row[1]):.2f}|{float(row[2]):.2f}|\n"
        for row in SHARED_TASK_STATS
    )
    stats_table = "|Language|Train [hours]|Test [hours]|\n|-|-|-|\n" + stats_rows
    output_ds.append_content("Languages Summary", "#### Stats\n\n" + stats_table)

    for row in SHARED_TASK_LANGS:
        locale = extract_row_data(row)["locale"]
        ds = CVDatasheet(_load_datasheet_text(locale))
        section_content = _build_section_content(
            ds.header.content, LOCALE_DATASET_LINKS[locale]
        )
        add_section(output_ds, ds.header, section_content, level=4)

    # Language names contain non-ASCII characters; write UTF-8 explicitly.
    with open("shared_task_langs.md", "w", encoding="utf-8") as f:
        f.write(output_ds.to_markdown())


if __name__ == "__main__":
    main()
+ +#### *Cypriot Greek* — Cypriot Greek (`el-CY`) +[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz3xq2ok28b4o1ojacv) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Cypriot Greek (`el-CY`). The dataset contains 1284 clips representing 11 hours of recorded speech (11 hours validated) from 10 speakers. + +#### *Wixárika* — Wixárika (`hch`) +[Esta ficha técnica](https://datacollective.mozillafoundation.org/datasets/cmflnuzz30cvnzohbhwchy6im) corresponde a la versión 23.0 del conjunto de datos *Spontaneous Speech* (habla espontánea) de Mozilla Common Voice para Wixárika (`hch`). El conjunto de datos contiene 1553 clips representando 11 horas de grabaciones (11 horas validadas) de 10 hablantes. + +#### *Nubi* — Nubi (`kcn`) +[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz3yn6qcjgut63ttxgo) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Nubi (`kcn`). The dataset contains 2719 clips representing 15 hours of recorded speech (10 hours validated) from 26 speakers. + +#### *Konzo* — Konzo (`koo`) +[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz3uyclmpbj9qn6a9dk) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Konzo (`koo`). The dataset contains 3255 clips representing 15 hours of recorded speech (11 hours validated) from 28 speakers. + +#### *Lendu* — Lendu (`led`) +[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz397l4cueywdr90in7) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Lendu (`led`). The dataset contains 2882 clips representing 16 hours of recorded speech (11 hours validated) from 26 speakers. 
+ +#### *Kenyi* — Kenyi (`lke`) +[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz3m13im00aqv88yp1q) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Kenyi (`lke`). The dataset contains 2791 clips representing 13 hours of recorded speech (11 hours validated) from 26 speakers. + +#### *Thur* — Thur (`lth`) +[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz31xhnkko2c1r18op0) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Thur (`lth`). The dataset contains 3238 clips representing 34 hours of recorded speech (11 hours validated) from 29 speakers. + +#### *Mixteco Yucuhiti* — Southwestern Tlaxiaco Mixtec (`meh`) +[Esta ficha técnica](https://datacollective.mozillafoundation.org/datasets/cmflnuzz4k7dxqixum94ltueg) corresponde a la versión 23.0 del conjunto de datos *Spontaneous Speech* (habla espontánea) de Mozilla Common Voice para Southwestern Tlaxiaco Mixtec (`meh`). El conjunto de datos contiene 1057 clips representando 11 horas de grabaciones (11 horas validadas) de 16 hablantes. + +#### *Jñatjo* — Michoacán Mazahua (`mmc`) +[Esta ficha técnica](https://datacollective.mozillafoundation.org/datasets/cmflnuzz4o68p1jzp1i8bhcr9) corresponde a la versión 23.0 del conjunto de datos *Spontaneous Speech* (habla espontánea) de Mozilla Common Voice para Michoacán Mazahua (`mmc`). El conjunto de datos contiene 12 horas de grabaciones (12 horas validadas) de 12 hablantes. + +#### *Western Penan* — Western Penan (`pne`) +[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz46zej6j6cxog37yor) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Western Penan (`pne`). The dataset contains 2630 clips representing 13 hours of recorded speech (13 hours validated) from 24 speakers. 
+ +#### *Ruuli* — Ruuli (`ruc`) +[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz4onc7tkesjgh380bj) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Ruuli (`ruc`). The dataset contains 2868 clips representing 18 hours of recorded speech (11 hours validated) from 26 speakers. + +#### *Amba* — Amba (`rwm`) +[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz4gxtza0n7xwyhdqmb) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Amba (`rwm`). The dataset contains 2443 clips representing 14 hours of recorded speech (11 hours validated) from 21 speakers. + +#### *Scots* — Scots (`sco`) +[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz49b89ftqllox5o6e5) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Scots (`sco`). The dataset contains 715 clips representing 12 hours of recorded speech (11 hours validated) from 21 speakers. + +#### *Toba Qom* — Toba Qom (`tob`) +[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz4hdum6fm7fr1cc73u) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Toba Qom (`tob`). The dataset contains 1611 clips representing 11 hours of recorded speech (11 hours validated) from 25 speakers. + +#### *Papantla Totonac* — Papantla Totonac (`top`) +[Esta ficha técnica](https://datacollective.mozillafoundation.org/datasets/cmflnuzz4wyrkutthowtknout) corresponde a la versión 23.0 del conjunto de datos *Spontaneous Speech* (habla espontánea) de Mozilla Common Voice para Papantla Totonac (`top`). El conjunto de datos contiene 411 clips representando 11 horas de grabaciones (11 horas validadas) de 10 hablantes. 
+ +#### *Rutoro* — Rutoro (`ttj`) +[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz4vhppvj6wsbmdxr70) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Rutoro (`ttj`). The dataset contains 3113 clips representing 17 hours of recorded speech (11 hours validated) from 26 speakers. + +#### *Kuku* — Kuku (`ukv`) +[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz4tpqhjtjib1z0cxyn) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Kuku (`ukv`). The dataset contains 2586 clips representing 12 hours of recorded speech (11 hours validated) from 22 speakers. \ No newline at end of file