IR_Project/stemming_task_clean_corpus.py at master · AmitKulkarni23/IR_Project · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
"""
Python file which creates the index based on stem queries
"""
import argparse
import json
import os
from pathlib import Path
from create_index import create_inverted_index
import errno


def read_json_document(json_file_name):
    """
    Helper function to read all the path variables form the json file
    Note: We are maintaining a json file to read all the input paths of our
    test collection and also the output_paths

    This will help us change the output/input paths of
    data easily(just change in the json file) instead of worrying about changing
    paths in the codebase
    :return: a dictionary after the reading the entire json file
    """

    with open(json_file_name) as fd:
        data = json.load(fd)

    return data


def parse_user_arguments():
    """
    Function that passes the arguments passed by the user on the command line
    :return: a dictionary with all the user arguments in the form of key
    value pairs
    """

    ap = argparse.ArgumentParser()

    ap.add_argument("-j", "--json_fname", help="Enter the path to the json "
                                               "filename containing"
                                               "all the paths to the "
                                               "test_collection", required=True)

    return vars(ap.parse_args())


# ToDO: This code is repeated from one the files. Remove redundant code
def ignore_table_of_numbers(all_html_data, html_fname):
    """
    Function that will remove the table of numbers from the HTML file
    :param all_html_data: The entire HTML data passed in as string
    :param html_fname: The name of the html file that we are parsing
    :return: string after ignoring / discarding the table of numbers
    """
    # Note: In each of the HTML files, the table of numbers begin
    # after the string "AM" or "PM"
    # Idea : Consider text only till last occurrence of "AM" or "PM"
    # Ignore the rest

    # Convert the entire string to a list
    list_html_data = all_html_data.split()

    if html_fname == "CACM-0189.html":
        last_index = len(list_html_data) - 1 - list_html_data[::-1].index(
            '57pm')
        # Ignore everything after this
        return " ".join(list_html_data[:last_index + 1])

    if html_fname == "CACM-1621.html":
        last_index = len(list_html_data) - 1 - list_html_data[::-1].index('pmb')
        # Ignore everything after this
        return " ".join(list_html_data[:last_index + 1])

    # Check for occurrence of PM first
    if "pm" in list_html_data:
        # Get the last index of PM
        index_of_PM = len(list_html_data) - 1 - list_html_data[::-1].index('pm')

        # Ignore everything after this
        return " ".join(list_html_data[:index_of_PM + 1])

    elif "am" in list_html_data:
        # Get the last index of PM
        index_of_AM = len(list_html_data) - 1 - list_html_data[::-1].index('am')

        # Ignore everything after this
        return " ".join(list_html_data[:index_of_AM + 1])

    else:
        print("The document ", html_fname, "doesn't have either PM or AM")

    return None


def parse_stemmed_version_of_corpus(stem_file_path):
    """
    Function that parses the stemmed version of the corpus
    :param stem_file_path: The path to the text file containing the
    :return: a dictionary of the form
    { doc_id : doc_id contents}
    """

    fp = open(stem_file_path)

    line = fp.readline()

    final_dict = {}

    doc_id = ""
    while line:
        line = line.strip("\n")
        line_list = line.split()

        if line_list[0] == "#":
            doc_id = get_proper_doc_id(line_list[1]) + ".html"
            line = fp.readline()
        while line[0] != "#":
            line_list.append(line.strip("\n"))
            line = fp.readline()

            if not line:
                break

        final_dict[doc_id] = " ".join(line_list[2:])

    # Note: This final dictionary will have all the doc_ids and their respective
    # contents including the the numbers. We have to excliude these numbers

    for item in final_dict:
        final_dict[item] = ignore_table_of_numbers(final_dict[item], item)

    result_dict = {}
    # Now, we have ignored the table of numbers
    for k, v in final_dict.items():
        # Ignore the .html part in all the keys of the final_dict
        result_dict[k[:-5]] = v

    return result_dict


def get_proper_doc_id(given_doc_id):
    """
    Helper function which returns the proper doc ID given a string
    :param given_doc_id: is a string
    :return: a string
    """

    # Note: The file format of the cacm.stem.txt is something as below:

    # # 1
    # ....

    # # 2
    # .....

    # 3204
    # ....

    # We need to extract these numbers
    # The input to the functions are these numbers
    doc_id_len = len(given_doc_id)
    if doc_id_len == 1:
        return "CACM-000" + given_doc_id
    elif doc_id_len == 2:
        return "CACM-00" + given_doc_id
    elif doc_id_len == 3:
        return "CACM-0" + given_doc_id
    else:
        return "CACM-" + given_doc_id


user_args = parse_user_arguments()

json_fname_relative_paths = user_args["json_fname"]

# Note the file "all_paths.json" has all the relative paths
# We will read this file and store the json file in a dictionary
all_paths_dict = read_json_document(json_fname_relative_paths)

# Get the path to the text file which conatins the stemmed version of the corpus
stemmed_text_file_path = Path(os.path.realpath(".") +
                              all_paths_dict["test_data"]["cacm_stem"])

# Parse this text file and store it in a dictionary of the form:
# {doc_id : doc_id_contents}
parsed_corpus = parse_stemmed_version_of_corpus(stemmed_text_file_path)

# get the json file where this will be written
stemmed_corpus_output_json_fname = Path(os.path.realpath(".") +
                                        all_paths_dict[
                                            "stemmed_corpus_json_fname"])

print("The stemmed corpus output file name is ",
      stemmed_corpus_output_json_fname)

if not os.path.exists(os.path.dirname(stemmed_corpus_output_json_fname)):
    try:
        os.makedirs(os.path.dirname(stemmed_corpus_output_json_fname))
    except OSError as exc:
        if exc.errno != errno.EEXIST:
            raise

# We will write this parsed_corpus to a json file
with open(stemmed_corpus_output_json_fname, "w+") as stem_out:
    json.dump(parsed_corpus, stem_out, indent=4)

# We will create an index out of this and write to a json file as well
stemmed_corpus_inverted_index_fname = Path(os.path.realpath(".") +
                                           all_paths_dict[
                                               "stemmed_inverted_index"])

print("The stemmed corpus output file name is ",
      stemmed_corpus_inverted_index_fname)


stemmed_inverted_index = create_inverted_index(stemmed_corpus_output_json_fname, stemmed_corpus_inverted_index_fname)