common-voice · umoqnier · Sep 23, 2025
diff --git a/shared_task_datasheets.py b/shared_task_datasheets.py
@@ -0,0 +1,125 @@
+import csv
+import json
+import re
+
+from scripts.datasheet import CVDatasheet, DatasheetSection
+
+
+def read_json_file(file_name: str) -> dict:
+    """Read a JSON file and return its decoded content.
+
+    Parameters
+    ----------
+    file_name : str
+        Path of the JSON file to read.
+
+    Returns
+    -------
+    dict
+        The decoded JSON data. The value produced by ``json.load`` is
+        returned unchanged; the ``dict`` annotation assumes the top-level
+        JSON value is an object.
+
+    Raises
+    ------
+    FileNotFoundError
+        If ``file_name`` does not exist.
+    json.JSONDecodeError
+        If the file content is not valid JSON.
+    """
+    # Decode explicitly as UTF-8 so the result does not depend on the
+    # platform's locale encoding.
+    with open(file_name, "r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+def read_csv_file(file_name: str, delimiter: str = ",") -> list[list[str]]:
+    """Read a delimited text file (CSV by default) without its header row.
+
+    Parameters
+    ----------
+    file_name : str
+        Path of the file to read.
+    delimiter : str
+        Field separator handed to ``csv.reader``. Defaults to ``","``;
+        pass ``"\\t"`` to read a TSV file.
+
+    Returns
+    -------
+    list[list[str]]
+        One inner list of strings per data row. The first row of the file
+        is treated as a header and dropped. An empty file yields ``[]``.
+    """
+    # newline="" lets the csv module do its own newline handling; UTF-8 is
+    # requested explicitly so non-ASCII cells do not depend on the locale.
+    with open(file_name, "r", newline="", encoding="utf-8") as file:
+        reader = csv.reader(file, delimiter=delimiter)
+        # Skip the header; the None default keeps an empty file from raising.
+        next(reader, None)
+        return list(reader)
+
+
+def extract_row_data(row: list):
+    """Label the first five cells of ``row`` with their column names.
+
+    Parameters
+    ----------
+    row : list
+        A row with at least five cells; extra cells are ignored.
+
+    Returns
+    -------
+    dict
+        Mapping with the keys ``locale``, ``speakers``, ``recorded``,
+        ``transcribed`` and ``validated``, taken from positions 0 to 4.
+
+    Raises
+    ------
+    IndexError
+        If ``row`` has fewer than five cells.
+    """
+    columns = ("locale", "speakers", "recorded", "transcribed", "validated")
+    # Index explicitly (instead of zip) so a short row still raises.
+    return {name: row[position] for position, name in enumerate(columns)}
+
+
+def add_section(
+    ds: CVDatasheet,
+    section: DatasheetSection,
+    content: str,
+    level: int,
+):
+    """Append a new section to ``ds`` unless its title is already registered.
+
+    Parameters
+    ----------
+    ds : CVDatasheet
+        Datasheet that receives the section; both ``ds.sections`` and
+        ``ds._section_map`` are updated in place.
+    section : DatasheetSection
+        Only ``section.title`` is read. Square brackets are removed from it
+        to obtain the title of the new section.
+    content : str
+        Body text placed under the heading.
+    level : int
+        Number of ``#`` characters used for the Markdown heading.
+
+    Returns
+    -------
+    None
+        When the cleaned title is already a key of ``ds._section_map`` a
+        message is printed and nothing is added.
+    """
+    section_title = re.sub(r"[\[\]]", "", section.title)
+    if section_title not in ds._section_map:
+        heading = f"{'#' * level} {section_title}"
+        appended = DatasheetSection(raw_text=f"{heading}\n\n{content}")
+        # Keep the ordered list and the title lookup in sync.
+        ds.sections.append(appended)
+        ds._section_map[section_title] = appended
+        return
+    print(f"Section {section_title} already exists. Skipping addition.")
+
+
+# NOTE: the three input files below are read when this module is imported,
+# using paths relative to the current working directory.
+# Mapping looked up by locale code in main() to obtain the dataset link.
+LOCALE_DATASET_LINKS = read_json_file("mdc_locale_datasets_links.json")
+# Rows of the shared-task languages (header skipped); column 0 is the locale.
+SHARED_TASK_LANGS = read_csv_file("shared_task_langs.csv")
+# Rows of statistics (header skipped); main() prints column 0 as the language
+# and columns 1 and 2 as floats under "Train [hours]" / "Test [hours]".
+SHARED_TASK_STATS = read_csv_file("shared_task_stats.csv")
+# Directory template of the per-locale datasheets; ``{lang}`` is filled with
+# "en" or "es" in main().
+DATASHEETS_PATH = "cv-corpus/sps/23.0-2025-09-05/final/{lang}"
+
+
+def _read_locale_datasheet(locale: str) -> str:
+    """Return the datasheet text for ``locale``, trying "en" and then "es".
+
+    Raises
+    ------
+    FileNotFoundError
+        If neither language directory contains ``{locale}.md``.
+    """
+    for lang in ("en", "es"):
+        datasheet_file = f"{DATASHEETS_PATH.format(lang=lang)}/{locale}.md"
+        try:
+            # Datasheets contain non-ASCII text, so decode explicitly.
+            with open(datasheet_file, "r", encoding="utf-8") as f:
+                return f.read()
+        except FileNotFoundError:
+            continue
+    raise FileNotFoundError(
+        f"No datasheet found for locale {locale!r} in 'en' or 'es'"
+    )
+
+
+def _summarize_header(header_content: str) -> str:
+    """Reduce an auto-generated header to its last three lines as one paragraph.
+
+    Headers without an "automatically generated" marker (English or Spanish)
+    are returned unchanged.
+    """
+    markers = ("generated automatically", "generada automáticamente")
+    # Each marker needs its own ``in`` test: a bare non-empty string literal
+    # on the right of ``or`` is always truthy and would make the check moot.
+    if not any(marker in header_content for marker in markers):
+        return header_content
+    tail = header_content.split("\n")[-3:]
+    # Join wrapped lines with a space so words on either side of a line
+    # break are not glued together (e.g. "recorded" + "speech").
+    return " ".join(line.strip() for line in tail if line.strip())
+
+
+def main():
+    """Build ``shared_task_langs.md`` from the shared-task inputs.
+
+    The output starts with a statistics table built from
+    ``SHARED_TASK_STATS`` and is followed by one level-4 section per row of
+    ``SHARED_TASK_LANGS``. Each section holds the header text of that
+    locale's datasheet, with the phrase "This datasheet" / "Esta ficha
+    técnica" turned into a link taken from ``LOCALE_DATASET_LINKS``.
+
+    Raises
+    ------
+    FileNotFoundError
+        If a locale has no datasheet in either language directory.
+    KeyError
+        If a locale has no entry in ``LOCALE_DATASET_LINKS``.
+    """
+    output_ds = CVDatasheet("## Languages description\n\n### Languages Summary")
+    stats_rows = "".join(
+        f"|{row[0]}|{float(row[1]):.2f}|{float(row[2]):.2f}|\n"
+        for row in SHARED_TASK_STATS
+    )
+    stats_table = "|Language|Train [hours]|Test [hours]|\n|-|-|-|\n" + stats_rows
+    output_ds.append_content("Languages Summary", "#### Stats\n\n" + stats_table)
+
+    for row in SHARED_TASK_LANGS:
+        locale = extract_row_data(row)["locale"]
+        ds = CVDatasheet(_read_locale_datasheet(locale))
+        section_content = _summarize_header(ds.header.content)
+        datasheet_link = LOCALE_DATASET_LINKS[locale]
+        # Only one of the two phrases is expected per datasheet, depending
+        # on its language; replacing both keeps the loop language-agnostic.
+        for phrase in ("This datasheet", "Esta ficha técnica"):
+            section_content = section_content.replace(
+                phrase, f"[{phrase}]({datasheet_link})"
+            )
+        add_section(output_ds, ds.header, section_content, level=4)
+
+    # The output contains non-ASCII language names; write it as UTF-8.
+    with open("shared_task_langs.md", "w", encoding="utf-8") as f:
+        f.write(output_ds.to_markdown())
+
+
+if __name__ == "__main__":
+    main()
diff --git a/shared_task_langs.md b/shared_task_langs.md
@@ -0,0 +1,87 @@
+### Languages Summary
+
+#### Stats
+
+|Language|Train [hours]|Test [hours]|
+|-|-|-|
+|aln|6.67|2.24|
+|bew|6.04|3.19|
+|cgg|8.10|1.99|
+|el-CY|7.17|2.22|
+|hch|6.28|1.72|
+|kcn|10.50|1.79|
+|koo|11.49|1.86|
+|led|12.03|1.63|
+|lke|8.61|2.14|
+|lth|12.26|1.68|
+|meh|6.72|1.75|
+|mmc|7.22|2.28|
+|pne|8.49|2.05|
+|ruc|14.24|1.84|
+|rwm|10.04|1.89|
+|sco|6.84|1.73|
+|tob|4.68|2.75|
+|top|5.90|2.52|
+|ttj|12.42|2.21|
+|ukv|8.51|1.60|
+
+
+#### *Gegnisht* &mdash; Gheg Albanian (`aln`)
+[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz3twrdiw8y4jet4o1z) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Gheg Albanian (`aln`). The dataset contains 11 hours of recorded speech (11 hours validated) from 14 speakers.
+
+#### *Betawi* &mdash; Betawi (`bew`)
+[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz3l9oqw5m1ezpzl062) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Betawi (`bew`). The dataset contains 11 hours of recorded speech (11 hours validated) from 21 speakers.
+
+#### *Bukusu* &mdash; Bukusu (`bxk`)
+[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz3xeu21j7rl81f5u2g) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Bukusu (`bxk`). The dataset contains 2934 clips representing 15 hours of recorded speech (11 hours validated) from 27 speakers.
+
+#### *Cypriot Greek* &mdash; Cypriot Greek (`el-CY`)
+[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz3xq2ok28b4o1ojacv) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Cypriot Greek (`el-CY`). The dataset contains 1284 clips representing 11 hours of recorded speech (11 hours validated) from 10 speakers.
+
+#### *Wixárika* &mdash; Wixárika (`hch`)
+[Esta ficha técnica](https://datacollective.mozillafoundation.org/datasets/cmflnuzz30cvnzohbhwchy6im) corresponde a la versión 23.0 del conjunto de datos *Spontaneous Speech* (habla espontánea) de Mozilla Common Voice para Wixárika (`hch`). El conjunto de datos contiene 1553 clips representando 11 horas de grabaciones (11 horas validadas) de 10 hablantes.
+
+#### *Nubi* &mdash; Nubi (`kcn`)
+[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz3yn6qcjgut63ttxgo) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Nubi (`kcn`). The dataset contains 2719 clips representing 15 hours of recorded speech (10 hours validated) from 26 speakers.
+
+#### *Konzo* &mdash; Konzo (`koo`)
+[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz3uyclmpbj9qn6a9dk) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Konzo (`koo`). The dataset contains 3255 clips representing 15 hours of recorded speech (11 hours validated) from 28 speakers.
+
+#### *Lendu* &mdash; Lendu (`led`)
+[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz397l4cueywdr90in7) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Lendu (`led`). The dataset contains 2882 clips representing 16 hours of recorded speech (11 hours validated) from 26 speakers.
+
+#### *Kenyi* &mdash; Kenyi (`lke`)
+[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz3m13im00aqv88yp1q) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Kenyi (`lke`). The dataset contains 2791 clips representing 13 hours of recorded speech (11 hours validated) from 26 speakers.
+
+#### *Thur* &mdash; Thur (`lth`)
+[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz31xhnkko2c1r18op0) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Thur (`lth`). The dataset contains 3238 clips representing 34 hours of recorded speech (11 hours validated) from 29 speakers.
+
+#### *Mixteco Yucuhiti* &mdash; Southwestern Tlaxiaco Mixtec (`meh`)
+[Esta ficha técnica](https://datacollective.mozillafoundation.org/datasets/cmflnuzz4k7dxqixum94ltueg) corresponde a la versión 23.0 del conjunto de datos *Spontaneous Speech* (habla espontánea) de Mozilla Common Voice para Southwestern Tlaxiaco Mixtec (`meh`). El conjunto de datos contiene 1057 clips representando 11 horas de grabaciones (11 horas validadas) de 16 hablantes.
+
+#### *Jñatjo* &mdash; Michoacán Mazahua (`mmc`)
+[Esta ficha técnica](https://datacollective.mozillafoundation.org/datasets/cmflnuzz4o68p1jzp1i8bhcr9) corresponde a la versión 23.0 del conjunto de datos *Spontaneous Speech* (habla espontánea) de Mozilla Common Voice para Michoacán Mazahua (`mmc`). El conjunto de datos contiene 12 horas de grabaciones (12 horas validadas) de 12 hablantes.
+
+#### *Western Penan* &mdash; Western Penan (`pne`)
+[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz46zej6j6cxog37yor) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Western Penan (`pne`). The dataset contains 2630 clips representing 13 hours of recorded speech (13 hours validated) from 24 speakers.
+
+#### *Ruuli* &mdash; Ruuli (`ruc`)
+[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz4onc7tkesjgh380bj) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Ruuli (`ruc`). The dataset contains 2868 clips representing 18 hours of recorded speech (11 hours validated) from 26 speakers.
+
+#### *Amba* &mdash; Amba (`rwm`)
+[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz4gxtza0n7xwyhdqmb) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Amba (`rwm`). The dataset contains 2443 clips representing 14 hours of recorded speech (11 hours validated) from 21 speakers.
+
+#### *Scots* &mdash; Scots (`sco`)
+[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz49b89ftqllox5o6e5) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Scots (`sco`). The dataset contains 715 clips representing 12 hours of recorded speech (11 hours validated) from 21 speakers.
+
+#### *Toba Qom* &mdash; Toba Qom (`tob`)
+[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz4hdum6fm7fr1cc73u) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Toba Qom (`tob`). The dataset contains 1611 clips representing 11 hours of recorded speech (11 hours validated) from 25 speakers.
+
+#### *Papantla Totonac* &mdash; Papantla Totonac (`top`)
+[Esta ficha técnica](https://datacollective.mozillafoundation.org/datasets/cmflnuzz4wyrkutthowtknout) corresponde a la versión 23.0 del conjunto de datos *Spontaneous Speech* (habla espontánea) de Mozilla Common Voice para Papantla Totonac (`top`). El conjunto de datos contiene 411 clips representando 11 horas de grabaciones (11 horas validadas) de 10 hablantes.
+
+#### *Rutoro* &mdash; Rutoro (`ttj`)
+[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz4vhppvj6wsbmdxr70) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Rutoro (`ttj`). The dataset contains 3113 clips representing 17 hours of recorded speech (11 hours validated) from 26 speakers.
+
+#### *Kuku* &mdash; Kuku (`ukv`)
+[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz4tpqhjtjib1z0cxyn) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Kuku (`ukv`). The dataset contains 2586 clips representing 12 hours of recorded speech (11 hours validated) from 22 speakers.