"""
A script for cleaning the reinflection samples of every language. It is inspired by the cleaning
methods implemented in g2p_config.py of the baseline LSTM+Attention model.
Note that this script targets the files of UniMorph 3.0 and will not necessarily be needed for later releases.
"""
import os
from os.path import join, isdir, isfile
from itertools import product
from shutil import copy2
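
# The raw data is expected under .data/Reinflection/<lang>.<pos>/, with files named
# <lang>.<pos>.<split>.<file_type>.txt for each split and file type listed below.
# Run the script from the directory that contains ".data"; cleaned copies are written
# to .data/Reinflection/CleanedData/, mirroring the original layout.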
datasets_path = join(os.getcwd(), ".data", "Reinflection")
cleaned_datasets_path = join(datasets_path, "CleanedData")
if not isdir(cleaned_datasets_path):
    os.mkdir(cleaned_datasets_path)
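
# ISO 639-3 codes: Georgian, Congo Swahili, Albanian, Latvian, Bulgarian, Hungarian, Turkish, Finnish.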
languages = ['kat', 'swc', 'sqi', 'lav', 'bul', 'hun', 'tur', 'fin']
parts_of_speech = ['V', 'N', 'ADJ']
splits = ['form', 'lemma']
file_types = ['train', 'dev', 'test']
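
# A cleaner of None means the language's samples need no cleaning; its files are copied verbatim.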
kat_clean_sample = None # lambda x: x
swc_clean_sample = None # lambda x: x
bul_clean_sample = None # lambda x: x

def sqi_clean_sample(x: str) -> str:
    return x.replace("',", "")  # "'," appears in the data only as part of "për t'u ..." (NFIN)

def lav_clean_sample(x: str) -> str:
    # Replace the 3 occurrences of 'í' with 'ī', the 28 occurrences of 'ŗ' with 'r',
    # and fix the miscased "LgSPEC8" feature tag.
    return x.replace('í', 'ī').replace('ŗ', 'r').replace("LgSPEC8", "LGSPEC8")

def hun_clean_sample(x: str) -> str:
    # The " |or| " is a bug of the scraping from Wiktionary. It can appear at the end of a form
    # (search the data for "jósolj|or|") or between 2 forms (search for "jóslok |or| jósolok").
    # There are also stray pipes ("|"), alone or preceded by a space (" |").
    substrings_to_remove = [" |or| ", "|or|", " |", "|"]
    for p in substrings_to_remove:
        x = x.replace(p, "")
    return x

def tur_clean_sample(x: str) -> str:
    return x.replace('İ', 'i')  # 'İ' is the Turkish dotted capital I; its lowercase form is 'i'

def fin_clean_sample(x: str) -> str:
    chars_to_remove = ['\xa0', ":", "/"]  # '\xa0' is a non-breaking space
    for p in chars_to_remove:
        x = x.replace(p, "")
    # Map stray accented vowels to their base letters.
    x = x.replace("á", "a").replace("â", "a").replace("û", "u").replace("ü", "u")
    return x

# Directory names under datasets_path; non-matching entries such as CleanedData are filtered out below.
existing_combinations = os.listdir(datasets_path)
cleaners = {'kat': kat_clean_sample, 'swc': swc_clean_sample, 'sqi': sqi_clean_sample, 'lav': lav_clean_sample,
            'bul': bul_clean_sample, 'hun': hun_clean_sample, 'tur': tur_clean_sample, 'fin': fin_clean_sample}

def main():
    # For each language, pair its cleaner with the <lang>.<pos> combinations that actually exist on disk.
    lang_pos_combs_with_cleaners = {
        lang: {'combinations': [f"{lang}.{pos}" for pos in parts_of_speech
                                if f"{lang}.{pos}" in existing_combinations],
               'cleaner': cleaners[lang]}
        for lang in languages}
    for lang in languages:
        combinations = lang_pos_combs_with_cleaners[lang]['combinations']
        cleaner = lang_pos_combs_with_cleaners[lang]['cleaner']
        for lang_pos in combinations:
            cleaned_files_dir = join(cleaned_datasets_path, lang_pos)
            if not isdir(cleaned_files_dir):
                os.mkdir(cleaned_files_dir)
            for split, file_type in product(splits, file_types):
                relative_file_path = f"{lang_pos}.{split}.{file_type}.txt"
                original_file = join(datasets_path, lang_pos, relative_file_path)
                cleaned_file = join(cleaned_files_dir, relative_file_path)
                if isfile(original_file):
                    if cleaner is None:
                        copy2(original_file, cleaned_file)
                    else:
                        # Read -> clean -> write, assuming the reinflection format: each line holds
                        # 4 tab-separated fields (src features, src form, trg features, trg form).
                        # Empty lines (e.g. from a trailing newline) are skipped so the unpacking cannot fail.
                        with open(original_file, encoding='utf8') as f:
                            data = [line.split('\t') for line in f.read().split('\n') if line]
                        for i in range(len(data)):
                            src_features, src_form, trg_features, trg_form = data[i]
                            src_form, trg_form = cleaner(src_form), cleaner(trg_form)
                            data[i] = [src_features, src_form, trg_features, trg_form]
                        with open(cleaned_file, mode='w', encoding='utf8') as f:
                            f.write('\n'.join(['\t'.join(item) for item in data]))

if __name__ == '__main__':
    main()