# trainingTablesPreprocessing.py (TableEmbeddingsWithGNNs repository, master branch)

import pandas as pd
from tqdm import tqdm
import os
import bz2
import pickle
import _pickle as cPickle
from graph import String_token_preprocessor
from graph import Graph
from node_embeddings import *


def decompress_pickle(file: str) -> list:
    """Load a pickle file that is also compressed using bz2.

    Args:
        file (str): path to the bz2-compressed pickle file

    Returns:
        list: the unpickled content, expected to be a list of dictionaries
            associated to wikipedia tables
    """
    # NOTE: unpickling can execute arbitrary code; only use this on trusted files.
    # The context manager guarantees the compressed stream is closed, even
    # when unpickling fails.
    with bz2.BZ2File(file, 'rb') as compressed:
        return cPickle.load(compressed)

def compute_table_ids(triples_dataset_path: str, output_file: str) -> set:
    """Collect the ids of the tables that appear in a triples file.

    The csv file must provide the columns 'r_id' and 's_id'. Every value found
    in those columns is converted to a string, the distinct strings are
    gathered in a set, and the set is pickled to ``output_file``.

    Args:
        triples_dataset_path (str): path to the csv dataset containing the triples
        output_file (str): path to the file where to save the generated ids

    Returns:
        set: set containing the table ids, as strings

    Raises:
        KeyError: if the dataset has no 'r_id' or 's_id' column
    """
    df = pd.read_csv(triples_dataset_path)

    tables_indexes = set()
    # Walk the two columns in lockstep by position: this avoids one label
    # lookup per cell and does not rely on the index being 0..n-1.
    for r_id, s_id in tqdm(zip(df['r_id'], df['s_id']), total=df.shape[0]):
        tables_indexes.add(str(r_id))
        tables_indexes.add(str(s_id))

    with open(output_file, 'wb') as f:
        pickle.dump(tables_indexes, f)
    return tables_indexes

def get_tables_ids(file: str) -> set:
    """Read back a pickled set of table ids.

    Args:
        file (str): path to the pickle file holding the ids

    Returns:
        set: the unpickled object, expected to be the set of table ids
    """
    with open(file, 'rb') as handle:
        table_ids = pickle.load(handle)
    return table_ids

def process_pickle(file: str, index_set: set) -> dict:
    """Build a table dictionary from one compressed pickle file.

    Only the tables whose '_id' belongs to ``index_set`` are kept. The
    DataFrame of a table is built from its 'content' entries, skipping the
    first 'num_header_rows' of them. When one of those two keys is missing the
    table is still registered, but with an empty DataFrame.

    Args:
        file (str): path to the compressed pickle file containing the tables
        index_set (set): set of the ids of the tables to keep

    Returns:
        dict: mapping from table id to the DataFrame of that table
    """
    selected = {}
    for table in decompress_pickle(file):
        table_id = table['_id']
        if table_id not in index_set:
            continue
        try:
            body_rows = table['content'][table['num_header_rows']:]
            frame = pd.DataFrame(body_rows)
        except KeyError:
            # No content / header information available for this table.
            frame = pd.DataFrame()
        selected[table_id] = frame
    return selected

def process_all_pickles(directory_path: str, index_path: str, out_path: str) -> dict:
    """Process every pickle file of a directory into a single table dictionary.

    Each entry of ``directory_path`` is passed to ``process_pickle`` together
    with the ids loaded from ``index_path``; the per-file dictionaries are
    merged and the result is pickled to ``out_path``.

    Args:
        directory_path (str): path to the directory containing the pickle files
        index_path (str): path to the file containing the ids of the tables to keep
        out_path (str): path to the file where to dump the dictionary

    Returns:
        dict: the generated dictionary of tables
    """
    ids = get_tables_ids(index_path)
    # os.listdir returns the entries in arbitrary order. Sorting them makes the
    # merge reproducible: if an id occurs in more than one file, the table from
    # the file that sorts last always wins.
    pickle_list = sorted(os.listdir(directory_path))
    out = {}

    print('Pickle scan starts')
    for f in tqdm(pickle_list):
        out.update(process_pickle(os.path.join(directory_path, f), ids))
    print('Pickle scan ends')

    print('Saving output')
    with open(out_path, 'wb') as f:
        pickle.dump(out, f)
    print('Output saved')

    return out

def get_empty_tables_ids(dlist: dict) -> list:
    """Find the ids of the empty tables of a table dictionary.

    A table counts as empty when its shape has zero rows or zero columns. The
    number of empty tables found is printed before returning.

    Args:
        dlist (dict): a table dictionary, mapping ids to tables

    Returns:
        list: the ids of the empty tables
    """
    empty_ids = []
    for table_id, table in tqdm(dlist.items()):
        shape = table.shape
        if shape[0] != 0 and shape[1] != 0:
            # Both dimensions are populated: not an empty table.
            continue
        empty_ids.append(table_id)
    print(f'Number of empty tables = {len(empty_ids)}')
    return empty_ids

def drop_small_tables(table_file: str, old_triple_file: str, new_triple_file_out: str, dim_min: int = 3) -> pd.DataFrame:
    """Filter a triple file, dropping the samples that involve "small tables".

    A table is small when it has fewer than ``dim_min`` rows or fewer than
    ``dim_min`` columns. A sample is dropped when its 'r_id' or its 's_id',
    converted to a string, is the key of a small table in the table
    dictionary. The surviving samples are written to ``new_triple_file_out``
    without the index column.

    Args:
        table_file (str): path to the pickle file containing the table dictionary
        old_triple_file (str): path to the old triple csv file
        new_triple_file_out (str): path to the new triple csv file
        dim_min (int, optional): minimum number of rows and of columns a table
            needs in order to be kept. Defaults to 3.

    Returns:
        pd.DataFrame: new dataset containing only samples whose tables are not small

    Raises:
        KeyError: if the triple file has no 'r_id' or 's_id' column
    """
    with open(table_file, 'rb') as f:
        tables = pickle.load(f)

    small_table_ids = set()
    for table_id, table in tqdm(tables.items()):
        shape = table.shape
        if shape[0] < dim_min or shape[1] < dim_min:
            small_table_ids.add(table_id)

    df = pd.read_csv(old_triple_file)

    # Decide row by row, by position, whether the sample survives. This avoids
    # one label lookup per cell and does not rely on the index being 0..n-1.
    keep_flags = []
    for r_id, s_id in tqdm(zip(df['r_id'], df['s_id']), total=df.shape[0]):
        keep_flags.append(
            str(r_id) not in small_table_ids and str(s_id) not in small_table_ids
        )

    # An explicit boolean Series (rather than a plain list) keeps the selection
    # well defined for an empty dataset too.
    keep_mask = pd.Series(keep_flags, index=df.index, dtype=bool)
    out = df.loc[keep_mask]

    print(f'Dropped {len(keep_flags) - int(keep_mask.sum())} samples')

    out.to_csv(new_triple_file_out, index=False)

    return out


def generate_graph_dictionary(table_dict_path: str, out_path: str) -> dict:
    """Generate a graph dictionary from a table dictionary.

    One ``Graph`` is built for every table of the pickled table dictionary,
    sharing a single embedding buffer and string preprocessor. When the
    construction of a graph fails the table is still registered, with ``None``
    as value. The resulting dictionary is pickled to ``out_path``.

    Args:
        table_dict_path (str): path to the pickle file containing the table dictionary
        out_path (str): path to the file where to save the new graph dictionary

    Returns:
        dict: the generated graph dictionary, mapping every table id to its
            graph or to None when the graph could not be built
    """
    with open(table_dict_path, 'rb') as f:
        table_dict = pickle.load(f)

    embedding_buffer = FasttextEmbeddingBuffer(model='fasttext-wiki-news-subwords-300')
    string_token_preprocessor = String_token_preprocessor()

    out = {}
    n_failed = 0

    print('Graphs generation starts')
    for k, table in tqdm(table_dict.items()):
        try:
            out[k] = Graph(table, k, embedding_buffer, string_token_preprocessor, token_length_limit=None)
        except Exception:
            # Best effort: a table that cannot be turned into a graph is kept
            # as None. Only Exception is caught, so that KeyboardInterrupt and
            # SystemExit can still stop a long run.
            out[k] = None
            n_failed += 1
    print('Graph generation ends')
    print(f'Number of graphs that could not be built = {n_failed}')

    print('Saving output')
    with open(out_path, 'wb') as f:
        pickle.dump(out, f)
    print('Output saved')

    return out

if __name__ == '__main__':
    # Script entry point. Only the graph generation call below is active; the
    # other calls are commented out (presumably steps that were run once and
    # can be re-enabled when their outputs must be regenerated).

    # Collect / reload the set of table ids referenced by the triples.
    # ids = compute_table_ids("/dati/home/francesco.pugnaloni/wikipedia_tables/processed_tables/test_samples_base.csv",
    #                         "/dati/home/francesco.pugnaloni/wikipedia_tables/processed_tables/table_id_set.pkl")
    # ids = get_tables_ids("/dati/home/francesco.pugnaloni/wikipedia_tables/processed_tables/table_id_set.pkl")
    # print(f'Found {len(ids)} different ids')

    # Extract the tables of a single pickle file, or of the whole directory.
    # ids = get_tables_ids("/dati/home/francesco.pugnaloni/wikipedia_tables/processed_tables/table_id_set.pkl")
    # t = process_pickle("/dati/home/francesco.pugnaloni/wikipedia_tables/unprocessed_tables/wikipedia_tables_zip/enwiki-20190901-pages-meta-history7.xml-p972010p972235.output.pkl",ids)
    # process_all_pickles("/home/francesco.pugnaloni/wikipedia_tables/unprocessed_tables/wikipedia_tables_zip",
    #                     "/dati/home/francesco.pugnaloni/wikipedia_tables/processed_tables/table_id_set.pkl",
    #                     "/home/francesco.pugnaloni/wikipedia_tables/processed_tables/full_table_dict_with_id.pkl"
    #                     )

    # Active step: turn the table dictionary into a graph dictionary.
    gd = generate_graph_dictionary(
        "/dati/home/francesco.pugnaloni/wikipedia_tables/processed_tables/full_table_dict_with_id.pkl",
        "/dati/home/francesco.pugnaloni/wikipedia_tables/processed_tables/full_graphs_dict_with_id.pkl",
    )

    # Remove from the triples the samples that involve small tables.
    # drop_small_tables("/dati/home/francesco.pugnaloni/wikipedia_tables/processed_tables/full_table_dict_with_id.pkl",
    #                   "/dati/home/francesco.pugnaloni/wikipedia_tables/processed_tables/test_samples_no_ones.csv",
    #                   "/dati/home/francesco.pugnaloni/wikipedia_tables/processed_tables/test_samples_no_small_tables.csv")