-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrainingTablesPreprocessing.py
More file actions
210 lines (166 loc) · 7.46 KB
/
trainingTablesPreprocessing.py
File metadata and controls
210 lines (166 loc) · 7.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
import pandas as pd
from tqdm import tqdm
import os
import bz2
import pickle
import _pickle as cPickle
from graph import String_token_preprocessor
from graph import Graph
from node_embeddings import *
def decompress_pickle(file: str) -> list:
    """Load a pickle file that was compressed with bz2.

    Args:
        file (str): path to the bz2-compressed pickle file

    Returns:
        list: the unpickled object; in this pipeline it is expected to be a
            list of dictionaries, one per wikipedia table
    """
    # The context manager guarantees the compressed stream is closed, even
    # when unpickling raises; previously the handle was left open.
    # NOTE: unpickling can execute arbitrary code - only load trusted files.
    with bz2.BZ2File(file, 'rb') as compressed:
        return cPickle.load(compressed)
def compute_table_ids(triples_dataset_path: str, output_file: str) -> set:
    """Collect the ids of all the tables that appear in a triples file.

    The csv file is read with pandas and must contain the columns ``r_id``
    and ``s_id``. Every value of both columns is converted to ``str`` and
    the resulting set is also pickled to ``output_file``.

    Args:
        triples_dataset_path (str): path to the csv dataset containing the triples
        output_file (str): path to the file where to save the generated indexes

    Returns:
        set: set containing the table ids as strings
    """
    df = pd.read_csv(triples_dataset_path)
    # Iterate over the columns directly instead of doing a label lookup
    # (``df[col][i]``) per row: same values, far less overhead, and no
    # dependency on the frame having a default 0..n-1 index.
    tables_indexes = set(map(str, df['r_id'])) | set(map(str, df['s_id']))
    with open(output_file, 'wb') as f:
        pickle.dump(tables_indexes, f)
    return tables_indexes
def get_tables_ids(file: str) -> set:
    """Read back a pickled set of table ids.

    Args:
        file (str): path to the pickle file containing the indexes

    Returns:
        set: the unpickled object, expected to be the set of table ids
    """
    # NOTE: unpickling can execute arbitrary code - only load trusted files.
    with open(file, 'rb') as handle:
        stored_ids = pickle.load(handle)
    return stored_ids
def process_pickle(file: str, index_set: set) -> dict:
    """Build a table dictionary from one compressed pickle file.

    Only the records whose ``'_id'`` is contained in ``index_set`` are kept.
    For each of them the rows of ``'content'`` from position
    ``'num_header_rows'`` onwards are wrapped in a DataFrame.

    Args:
        file (str): path to the bz2-compressed pickle file containing the tables
        index_set (set): set of the desired table ids

    Returns:
        dict: maps each selected table id to its DataFrame; a record that
            lacks the ``'content'`` or ``'num_header_rows'`` key is mapped
            to an empty DataFrame
    """
    selected = {}
    for record in decompress_pickle(file):
        table_id = record['_id']
        if table_id not in index_set:
            continue
        try:
            frame = pd.DataFrame(record['content'][record['num_header_rows']:])
        except KeyError:
            # Record without content / header information: keep the id but
            # store an empty table.
            frame = pd.DataFrame()
        selected[table_id] = frame
    return selected
def process_all_pickles(directory_path: str, index_path: str, out_path: str) -> dict:
    """Process every pickle file of a directory and merge the results.

    Each entry of ``directory_path`` is passed to ``process_pickle`` and the
    returned dictionaries are merged into a single one, which is then
    pickled to ``out_path``.

    Args:
        directory_path (str): path to the directory containing the pickle files
        index_path (str): path to the file containing the indexes
        out_path (str): path to the file where to dump the dictionary

    Returns:
        dict: the generated dictionary of tables
    """
    ids = get_tables_ids(index_path)
    # os.listdir returns the entries in arbitrary order. Sorting makes the
    # result reproducible when the same table id occurs in more than one
    # pickle (the entry from the last processed file wins).
    pickle_names = sorted(os.listdir(directory_path))
    out = {}
    print('Pickle scan starts')
    for name in tqdm(pickle_names):
        out.update(process_pickle(os.path.join(directory_path, name), ids))
    print('Pickle scan ends')
    print('Saving output')
    with open(out_path, 'wb') as out_file:
        pickle.dump(out, out_file)
    print('Output saved')
    return out
def get_empty_tables_ids(dlist: dict) -> list:
    """Return the ids of the empty tables of a table dictionary.

    A table is considered empty when the first or the second entry of its
    ``shape`` is zero. The number of empty tables is also printed.

    Args:
        dlist (dict): a table dictionary, mapping ids to objects with a ``shape``

    Returns:
        list: the list of the ids of the empty tables
    """
    def _has_no_cells(table) -> bool:
        dims = table.shape
        # Same short-circuit as "dims[0] == 0 or dims[1] == 0".
        return not (dims[0] != 0 and dims[1] != 0)

    empty_ids = [
        table_id
        for table_id in tqdm(dlist.keys())
        if _has_no_cells(dlist[table_id])
    ]
    print(f'Number of empty tables = {len(empty_ids)}')
    return empty_ids
def drop_small_tables(table_file: str, old_triple_file: str, new_triple_file_out: str, dim_min: int = 3) -> pd.DataFrame:
    """Remove from a triples file every sample that involves a "small" table.

    A table is small when its number of rows or its number of columns is
    lower than ``dim_min``. A sample is dropped when its ``r_id`` or its
    ``s_id`` (compared as strings) is the id of a small table. The filtered
    dataset is written to ``new_triple_file_out`` without the index column.

    Args:
        table_file (str): path to the pickle file containing the table dictionary
        old_triple_file (str): path to the old triple csv file
        new_triple_file_out (str): path to the new triple csv file
        dim_min (int, optional): minimum number of rows and of columns a
            table must have to be kept. Defaults to 3.

    Returns:
        pd.DataFrame: new dataset containing only samples whose tables are not small
    """
    # NOTE: unpickling can execute arbitrary code - only load trusted files.
    with open(table_file, 'rb') as f:
        tables = pickle.load(f)

    small_table_ids = {
        table_id
        for table_id, table in tables.items()
        if table.shape[0] < dim_min or table.shape[1] < dim_min
    }

    df = pd.read_csv(old_triple_file)
    # Boolean mask instead of a per-row label lookup: the ids are converted
    # with str() exactly as before, but the scan no longer relies on the
    # frame having a default 0..n-1 index and avoids the slow scalar access.
    involves_small = (
        df['r_id'].map(str).isin(small_table_ids)
        | df['s_id'].map(str).isin(small_table_ids)
    )
    out = df[~involves_small]
    print(f'Dropped {int(involves_small.sum())} samples')
    out.to_csv(new_triple_file_out, index=False)
    return out
def generate_graph_dictionary(table_dict_path: str, out_path: str) -> dict:
    """Generate a graph dictionary from a table dictionary.

    A ``Graph`` is built for every table of the pickled table dictionary,
    sharing one embedding buffer and one string preprocessor. The resulting
    dictionary is pickled to ``out_path``.

    Args:
        table_dict_path (str): path to the pickled table dictionary
        out_path (str): path to the file where to save the new graph dictionary

    Returns:
        dict: maps each table id to its graph, or to ``None`` when the
            graph construction raised an exception
    """
    # NOTE: unpickling can execute arbitrary code - only load trusted files.
    with open(table_dict_path, 'rb') as f:
        table_dict = pickle.load(f)
    embedding_buffer = FasttextEmbeddingBuffer(model='fasttext-wiki-news-subwords-300')
    string_token_preprocessor = String_token_preprocessor()
    out = {}
    failed = 0
    print('Graphs generation starts')
    for k in tqdm(table_dict.keys()):
        try:
            out[k] = Graph(table_dict[k], k, embedding_buffer, string_token_preprocessor, token_length_limit=None)
        except Exception:
            # Best effort: a table that cannot be converted is kept with a
            # None placeholder. Catching Exception (instead of a bare
            # except) lets KeyboardInterrupt / SystemExit stop the run.
            out[k] = None
            failed += 1
    print('Graph generation ends')
    if failed:
        print(f'Graph generation failed for {failed} tables')
    print('Saving output')
    with open(out_path, 'wb') as f:
        pickle.dump(out, f)
    print('Output saved')
    return out
if __name__ == '__main__':
    # The commented calls below are the other stages of the preprocessing
    # pipeline, kept as usage examples; only the graph generation is active.

    # ids = compute_table_ids("/dati/home/francesco.pugnaloni/wikipedia_tables/processed_tables/test_samples_base.csv",
    #                         "/dati/home/francesco.pugnaloni/wikipedia_tables/processed_tables/table_id_set.pkl")
    # ids = get_tables_ids("/dati/home/francesco.pugnaloni/wikipedia_tables/processed_tables/table_id_set.pkl")
    # print(f'Found {len(ids)} different ids')
    # ids = get_tables_ids("/dati/home/francesco.pugnaloni/wikipedia_tables/processed_tables/table_id_set.pkl")
    # t = process_pickle("/dati/home/francesco.pugnaloni/wikipedia_tables/unprocessed_tables/wikipedia_tables_zip/enwiki-20190901-pages-meta-history7.xml-p972010p972235.output.pkl",ids)
    # process_all_pickles("/home/francesco.pugnaloni/wikipedia_tables/unprocessed_tables/wikipedia_tables_zip",
    #                     "/dati/home/francesco.pugnaloni/wikipedia_tables/processed_tables/table_id_set.pkl",
    #                     "/home/francesco.pugnaloni/wikipedia_tables/processed_tables/full_table_dict_with_id.pkl"
    #                     )

    # Build one graph per table of the full table dictionary.
    table_dict_path = "/dati/home/francesco.pugnaloni/wikipedia_tables/processed_tables/full_table_dict_with_id.pkl"
    graph_dict_path = "/dati/home/francesco.pugnaloni/wikipedia_tables/processed_tables/full_graphs_dict_with_id.pkl"
    gd = generate_graph_dictionary(table_dict_path, graph_dict_path)

    # drop_small_tables("/dati/home/francesco.pugnaloni/wikipedia_tables/processed_tables/full_table_dict_with_id.pkl",
    #                   "/dati/home/francesco.pugnaloni/wikipedia_tables/processed_tables/test_samples_no_ones.csv",
    #                   "/dati/home/francesco.pugnaloni/wikipedia_tables/processed_tables/test_samples_no_small_tables.csv")