Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 125 additions & 0 deletions shared_task_datasheets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import csv
import json
import re

from scripts.datasheet import CVDatasheet, DatasheetSection


def read_json_file(file_name: str) -> dict:
    """Read a JSON file from disk.

    Parameters
    ----------
    file_name : str
        Path of the JSON file to read.

    Returns
    -------
    dict
        The parsed JSON document as returned by ``json.load``.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # locale-dependent default encoding.
    with open(file_name, "r", encoding="utf-8") as f:
        return json.load(f)


def read_csv_file(file_name: str, delimiter: str = ",") -> list[list[str]]:
    """Read a delimited text file (CSV by default), skipping its header row.

    Parameters
    ----------
    file_name : str
        Path of the file to read.
    delimiter : str, optional
        Field separator handed to ``csv.reader``. Defaults to a comma; pass
        a tab character to read TSV files.

    Returns
    -------
    list[list[str]]
        One inner list of strings per data row, in file order. The first
        row of the file (the header) is not included. An empty file yields
        an empty list.
    """
    # newline="" lets the csv module handle line endings itself; UTF-8 is
    # set explicitly so the result does not depend on the platform locale.
    with open(file_name, "r", newline="", encoding="utf-8") as file:
        reader = csv.reader(file, delimiter=delimiter)
        # Skip the header; the None default avoids StopIteration on an
        # empty file.
        next(reader, None)
        return list(reader)


def extract_row_data(row: list):
    """Label the first five columns of a row.

    Returns a dict with the keys ``locale``, ``speakers``, ``recorded``,
    ``transcribed`` and ``validated`` mapped to ``row[0]`` .. ``row[4]``.
    Any further columns are ignored; a row with fewer than five columns
    raises ``IndexError``.
    """
    column_names = ("locale", "speakers", "recorded", "transcribed", "validated")
    return {name: row[index] for index, name in enumerate(column_names)}


def add_section(
    ds: CVDatasheet,
    section: DatasheetSection,
    content: str,
    level: int,
):
    """Append a section built from ``section``'s title and ``content`` to ``ds``.

    The title is ``section.title`` with every ``[`` and ``]`` removed. If
    ``ds._section_map`` already has that title, a message is printed and
    ``ds`` is left unchanged. Otherwise a new ``DatasheetSection`` is built
    from a markdown heading of ``level`` ``#`` characters, the title, a
    blank line and ``content``; it is appended to ``ds.sections`` and
    registered in ``ds._section_map`` under the title.
    """
    section_title = re.sub(r"[\[\]]", "", section.title)
    if section_title not in ds._section_map:
        heading_marks = "#" * level
        created = DatasheetSection(
            raw_text=f"{heading_marks} {section_title}\n\n{content}"
        )
        ds.sections.append(created)
        ds._section_map[section_title] = created
        return
    print(f"Section {section_title} already exists. Skipping addition.")


# Input data, loaded once at import time; the relative paths are resolved
# against the current working directory.
# Lookup indexed by locale code in main() to obtain the datasheet link.
LOCALE_DATASET_LINKS = read_json_file("mdc_locale_datasets_links.json")
# Data rows (header skipped); extract_row_data() reads columns 0-4 of each row.
SHARED_TASK_LANGS = read_csv_file("shared_task_langs.csv")
# Data rows (header skipped); main() renders column 0 as the language and
# columns 1-2 as floats under the "Train [hours]" / "Test [hours]" headers.
SHARED_TASK_STATS = read_csv_file("shared_task_stats.csv")
# Directory template; main() fills {lang} with "en" first and "es" as fallback.
DATASHEETS_PATH = "cv-corpus/sps/23.0-2025-09-05/final/{lang}"


def _read_datasheet(locale: str) -> str:
    """Return the text of ``{locale}.md``, trying the "en" directory then "es".

    Raises ``FileNotFoundError`` if the file exists in neither directory.
    """
    en_dir = DATASHEETS_PATH.format(lang="en")
    es_dir = DATASHEETS_PATH.format(lang="es")
    try:
        with open(f"{en_dir}/{locale}.md", "r", encoding="utf-8") as f:
            return f.read()
    except FileNotFoundError:
        with open(f"{es_dir}/{locale}.md", "r", encoding="utf-8") as f:
            return f.read()


def main():
    """Generate ``shared_task_langs.md`` from the shared-task inputs.

    The output starts with a "Languages Summary" section holding a stats
    table built from ``SHARED_TASK_STATS`` (language, train hours, test
    hours, the latter two formatted to two decimals). Then, for every row
    of ``SHARED_TASK_LANGS``, the locale's datasheet is read (English
    first, Spanish as fallback), its header content is reduced to the
    summary lines when it carries the auto-generation notice, the phrase
    "This datasheet" / "Esta ficha técnica" is turned into a link taken
    from ``LOCALE_DATASET_LINKS``, and the result is added as a level-4
    section. The document is finally written to ``shared_task_langs.md``.

    Raises
    ------
    FileNotFoundError
        If a locale has neither an English nor a Spanish datasheet.
    KeyError
        If a locale has no entry in ``LOCALE_DATASET_LINKS``.
    """
    output_ds = CVDatasheet("## Languages description\n\n### Languages Summary")

    stats_rows = "".join(
        f"|{row[0]}|{float(row[1]):.2f}|{float(row[2]):.2f}|\n"
        for row in SHARED_TASK_STATS
    )
    stats_table = "|Language|Train [hours]|Test [hours]|\n|-|-|-|\n" + stats_rows
    output_ds.append_content("Languages Summary", "#### Stats\n\n" + stats_table)

    for row in SHARED_TASK_LANGS:
        locale = extract_row_data(row)["locale"]
        ds = CVDatasheet(_read_datasheet(locale))
        section_content = ds.header.content
        # Both membership tests must name the haystack: a bare non-empty
        # string on the right of `or` is always truthy and would make this
        # branch run for every datasheet.
        if (
            "generated automatically" in section_content
            or "generada automáticamente" in section_content
        ):
            # Keep only the last three lines of the header.
            # NOTE(review): joining with "" drops the line breaks without
            # inserting a space; this may be what fuses words such as
            # "recordedspeech" in the generated output - confirm against
            # the source datasheets before changing the separator.
            section_content = "".join(section_content.split("\n")[-3:])
        datasheet_link = LOCALE_DATASET_LINKS[locale]
        section_content = section_content.replace(
            "This datasheet", f"[This datasheet]({datasheet_link})"
        )
        section_content = section_content.replace(
            "Esta ficha técnica", f"[Esta ficha técnica]({datasheet_link})"
        )
        add_section(output_ds, ds.header, section_content, level=4)

    # The generated markdown contains non-ASCII text, so write it as UTF-8
    # regardless of the platform's default encoding.
    with open("shared_task_langs.md", "w", encoding="utf-8") as f:
        f.write(output_ds.to_markdown())


# Generate the markdown file only when run as a script, not when imported.
if __name__ == "__main__":
    main()
87 changes: 87 additions & 0 deletions shared_task_langs.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
### Languages Summary

#### Stats

|Language|Train [hours]|Test [hours]|
|-|-|-|
|aln|6.67|2.24|
|bew|6.04|3.19|
|cgg|8.10|1.99|
|el-CY|7.17|2.22|
|hch|6.28|1.72|
|kcn|10.50|1.79|
|koo|11.49|1.86|
|led|12.03|1.63|
|lke|8.61|2.14|
|lth|12.26|1.68|
|meh|6.72|1.75|
|mmc|7.22|2.28|
|pne|8.49|2.05|
|ruc|14.24|1.84|
|rwm|10.04|1.89|
|sco|6.84|1.73|
|tob|4.68|2.75|
|top|5.90|2.52|
|ttj|12.42|2.21|
|ukv|8.51|1.60|


#### *Gegnisht* — Gheg Albanian (`aln`)
[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz3twrdiw8y4jet4o1z) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Gheg Albanian (`aln`). The dataset contains 11 hours of recorded speech (11 hours validated) from 14 speakers.

#### *Betawi* — Betawi (`bew`)
[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz3l9oqw5m1ezpzl062) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Betawi (`bew`). The dataset contains 11 hours of recorded speech (11 hours validated) from 21 speakers.

#### *Bukusu* — Bukusu (`bxk`)
[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz3xeu21j7rl81f5u2g) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Bukusu (`bxk`). The dataset contains 2934 clips representing 15 hours of recorded speech (11 hours validated) from 27 speakers.

#### *Cypriot Greek* — Cypriot Greek (`el-CY`)
[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz3xq2ok28b4o1ojacv) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Cypriot Greek (`el-CY`). The dataset contains 1284 clips representing 11 hours of recorded speech (11 hours validated) from 10 speakers.

#### *Wixárika* — Wixárika (`hch`)
[Esta ficha técnica](https://datacollective.mozillafoundation.org/datasets/cmflnuzz30cvnzohbhwchy6im) corresponde a la versión 23.0 del conjunto de datos *Spontaneous Speech* (habla espontánea) de Mozilla Common Voice para Wixárika (`hch`). El conjunto de datos contiene 1553 clips representando 11 horas de grabaciones (11 horas validadas) de 10 hablantes.

#### *Nubi* — Nubi (`kcn`)
[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz3yn6qcjgut63ttxgo) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Nubi (`kcn`). The dataset contains 2719 clips representing 15 hours of recorded speech (10 hours validated) from 26 speakers.

#### *Konzo* — Konzo (`koo`)
[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz3uyclmpbj9qn6a9dk) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Konzo (`koo`). The dataset contains 3255 clips representing 15 hours of recorded speech (11 hours validated) from 28 speakers.

#### *Lendu* — Lendu (`led`)
[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz397l4cueywdr90in7) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Lendu (`led`). The dataset contains 2882 clips representing 16 hours of recorded speech (11 hours validated) from 26 speakers.

#### *Kenyi* — Kenyi (`lke`)
[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz3m13im00aqv88yp1q) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Kenyi (`lke`). The dataset contains 2791 clips representing 13 hours of recorded speech (11 hours validated) from 26 speakers.

#### *Thur* — Thur (`lth`)
[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz31xhnkko2c1r18op0) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Thur (`lth`). The dataset contains 3238 clips representing 34 hours of recorded speech (11 hours validated) from 29 speakers.

#### *Mixteco Yucuhiti* — Southwestern Tlaxiaco Mixtec (`meh`)
[Esta ficha técnica](https://datacollective.mozillafoundation.org/datasets/cmflnuzz4k7dxqixum94ltueg) corresponde a la versión 23.0 del conjunto de datos *Spontaneous Speech* (habla espontánea) de Mozilla Common Voice para Southwestern Tlaxiaco Mixtec (`meh`). El conjunto de datos contiene 1057 clips representando 11 horas de grabaciones (11 horas validadas) de 16 hablantes.

#### *Jñatjo* — Michoacán Mazahua (`mmc`)
[Esta ficha técnica](https://datacollective.mozillafoundation.org/datasets/cmflnuzz4o68p1jzp1i8bhcr9) corresponde a la versión 23.0 del conjunto de datos *Spontaneous Speech* (habla espontánea) de Mozilla Common Voice para Michoacán Mazahua (`mmc`). El conjunto de datos contiene 12 horas de grabaciones (12 horas validadas) de 12 hablantes.

#### *Western Penan* — Western Penan (`pne`)
[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz46zej6j6cxog37yor) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Western Penan (`pne`). The dataset contains 2630 clips representing 13 hours of recorded speech (13 hours validated) from 24 speakers.

#### *Ruuli* — Ruuli (`ruc`)
[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz4onc7tkesjgh380bj) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Ruuli (`ruc`). The dataset contains 2868 clips representing 18 hours of recorded speech (11 hours validated) from 26 speakers.

#### *Amba* — Amba (`rwm`)
[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz4gxtza0n7xwyhdqmb) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Amba (`rwm`). The dataset contains 2443 clips representing 14 hours of recorded speech (11 hours validated) from 21 speakers.

#### *Scots* — Scots (`sco`)
[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz49b89ftqllox5o6e5) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Scots (`sco`). The dataset contains 715 clips representing 12 hours of recorded speech (11 hours validated) from 21 speakers.

#### *Toba Qom* — Toba Qom (`tob`)
[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz4hdum6fm7fr1cc73u) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Toba Qom (`tob`). The dataset contains 1611 clips representing 11 hours of recorded speech (11 hours validated) from 25 speakers.

#### *Papantla Totonac* — Papantla Totonac (`top`)
[Esta ficha técnica](https://datacollective.mozillafoundation.org/datasets/cmflnuzz4wyrkutthowtknout) corresponde a la versión 23.0 del conjunto de datos *Spontaneous Speech* (habla espontánea) de Mozilla Common Voice para Papantla Totonac (`top`). El conjunto de datos contiene 411 clips representando 11 horas de grabaciones (11 horas validadas) de 10 hablantes.

#### *Rutoro* — Rutoro (`ttj`)
[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz4vhppvj6wsbmdxr70) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Rutoro (`ttj`). The dataset contains 3113 clips representing 17 hours of recorded speech (11 hours validated) from 26 speakers.

#### *Kuku* — Kuku (`ukv`)
[This datasheet](https://datacollective.mozillafoundation.org/datasets/cmflnuzz4tpqhjtjib1z0cxyn) is for version 23.0 of the Mozilla Common Voice *Spontaneous Speech* dataset for Kuku (`ukv`). The dataset contains 2586 clips representing 12 hours of recorded speech (11 hours validated) from 22 speakers.