IR_Project/task_1_main.py at master · AmitKulkarni23/IR_Project · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
"""
Python file that will be used to perform all the tasks related to Task1
"""

import argparse
from baseline_runs import new_bm25_scores, write_top_100_scores_to_txt, tf_idf, jm_likelihood_scores
import json
from pathlib import Path
import os


def parse_user_args():
    """
    Function that parses the arguments passed by the user on the command line

    Two options are required:
    -m / --method     : the baseline run to perform (bm25, tf_idf or jm_qlm)
    -j / --json_fname : path to the json file containing all the paths to
                        the test collection

    :return: a dictionary with all the user arguments in the form of key
    value pairs, i.e. {"method": ..., "json_fname": ...}
    """

    ap = argparse.ArgumentParser()

    # The choices are exactly the strings the main script compares the method
    # against; restricting them here makes argparse reject a mistyped method
    # (e.g. "bm_25") instead of the script silently running no model.
    ap.add_argument("-m", "--method",
                    choices=("bm25", "tf_idf", "jm_qlm"),
                    help="Enter the type of baseline run, "
                         "bm25, tf_idf or jm_qlm", required=True)

    ap.add_argument("-j", "--json_fname", help="Enter the path to the json "
                                               "filename containing "
                                               "all the paths to the "
                                               "test_collection", required=True)

    return vars(ap.parse_args())


def display_first_n_items_in_dict(given_dict, n):
    """
    This is just a utility function to display the first n items in a
    dictionary, one "key ->  value" line per item, in iteration order
    :param given_dict: a dictionary
    :param n: the maximum number of items to print; nothing is printed when
    n is zero or negative, and the whole dictionary is printed when it has
    fewer than n items
    """
    for shown, key in enumerate(given_dict):
        # Check the limit BEFORE printing so that n <= 0 prints nothing
        if shown >= n:
            break
        print(key, "-> ", given_dict[key])


def write_output_to_json_file(inv_index, fname):
    """
    Serialize a dictionary to a json file, indented by 4 spaces
    The file is created if it does not exist and overwritten if it does
    :param inv_index: The dictionary to write. For the inverted index this
    is of the form
    {term_1 : {doc_1 : term_1_freq_in_doc_1, doc_2 : term_1_freq_in_doc_2},
    term_2 : {doc_1 : term2_freq_in_doc_1, doc_2 : term2_freq_in_doc_2} .....}
    :param fname: The name of the json file the dictionary is written into
    """

    with open(fname, mode="w+") as json_sink:
        json.dump(obj=inv_index, fp=json_sink, indent=4)


def read_json_document(json_file_name):
    """
    Helper function that loads a json file and hands back its contents

    Note: all the input paths of the test collection and all the output
    paths are kept in a single json file. Changing where data is read from
    or written to then only means editing that json file, not the codebase
    :param json_file_name: path to the json file to read
    :return: a dictionary built from the entire contents of the json file
    """

    with open(json_file_name) as json_handle:
        return json.loads(json_handle.read())


def convert_to_non_os_specific_path(fname):
    """
    This is a utility function that converts the given fname
    (which is a relative path in Windows path format) to a path which can be
    used on all OS'

    The path is resolved against the real path of the current working
    directory. Both \\ and / are accepted as separators in fname, so the
    same json entry works on Windows and on POSIX systems
    :param fname: a relative path in Windows path format(using \\), with or
    without a leading separator
    :return: a non-OS specific pathlib.Path rooted at the current working
    directory
    """
    # Plain string concatenation would leave the backslashes inside a single
    # path component on POSIX, so split on both separators and let Path join
    # the components with the separator of the running OS.
    components = [part for part in fname.replace("\\", "/").split("/") if part]
    return Path(os.path.realpath("."), *components)


def main():
    """
    Run one baseline retrieval model (bm25, tf_idf or jm_qlm) and write its
    scores to a text file

    The method and the name of the paths json file come from the command
    line (see parse_user_args). Every input and output location is read
    from that json file and resolved against the current working directory
    :raises SystemExit: if the requested method is not one of the supported
    baselines
    """

    # Get the user arguments
    user_args = parse_user_args()
    baseline = user_args["method"]
    json_fname_relative_paths = user_args["json_fname"]

    # Reject an unknown method before loading anything; otherwise the script
    # would read the whole collection and index and then silently do nothing
    supported_baselines = ("bm25", "tf_idf", "jm_qlm")
    if baseline not in supported_baselines:
        raise SystemExit(
            "Unknown baseline method {!r}; expected one of: {}".format(
                baseline, ", ".join(supported_baselines)))

    # Note the file "all_paths.json" has all the relative paths
    # We will read this file and store the json file in a dictionary
    all_paths_dict = read_json_document(json_fname_relative_paths)

    print("Running ", baseline, " model")

    # Create Index
    # To create index we first need to parse all the 3204 documents
    # NOTE: We have already parsed all the 3204 documents and stored it in
    # a json file( using the script create_collection_data_dict.py)

    # Now we will load this json file into a dictionary
    # Note: url_text_dict is of the form
    # {CACM_file_1 : parsed_tokenized_text_file_1,
    # CACM_file_2 : parsed_tokenized_text_file_2}
    collection_data_fname = convert_to_non_os_specific_path(
        all_paths_dict["parsed_tokenized_output_json_file"])

    print("THE COLLECTION DATA FNMAE IS ", collection_data_fname)

    url_text_dict = read_json_document(collection_data_fname)

    # Now that we have received a dictionary containing all the doc_IDs as keys
    # and their contents parsed as values, we need the inverted index
    # The inverted index is of the form
    # {term_1 : {doc_1 : term_1_freq_in_doc_1, doc_2 : term_1_freq_in_doc_2},
    # term_2 : {doc_1 : term2_freq_in_doc_1, doc_2 : term2_freq_in_doc_2} .....}

    # NOTE: WE HAVE ALREADY CREATED THE INVERTED INDEX AND
    # STORED IT IN A JSON FILE
    # using the script(create_index.py)

    # We will load the json file and read it into a dictionary
    inverted_index_json_fname = convert_to_non_os_specific_path(
        all_paths_dict["indexer_output_json_file"])

    print("The inverted index filename is ", inverted_index_json_fname)
    inverted_index = read_json_document(inverted_index_json_fname)

    # Get the non-OS dependent path to the query text file
    query_text_file = convert_to_non_os_specific_path(
        all_paths_dict["test_data"]["query_text_file"])
    print("the query text file is ", query_text_file)

    # Get the non-OS dependent path to the relevance text file
    relevance_text_file = convert_to_non_os_specific_path(
        all_paths_dict["test_data"]["relevance_text_file"])
    print("the relevenace text file is ", relevance_text_file)

    # Compute the scores with the requested model and write them to the
    # output text file configured for that model
    if baseline == "bm25":
        # BM25 is the only model here that is also given the relevance file
        bm_25_scores = new_bm25_scores(url_text_dict, inverted_index,
                                       query_text_file, relevance_text_file)
        output_text_fname = convert_to_non_os_specific_path(
            all_paths_dict["bm_25_score_output_text_file"])
        write_top_100_scores_to_txt(bm_25_scores, output_text_fname, "bm25")

    elif baseline == "tf_idf":
        tf_idf_scores = tf_idf(url_text_dict, inverted_index, query_text_file)
        tf_idf_output_text_fname = convert_to_non_os_specific_path(
            all_paths_dict["tf_idf_score_output_text_file"])
        write_top_100_scores_to_txt(tf_idf_scores, tf_idf_output_text_fname,
                                    "tf_idf")

    else:
        # Only "jm_qlm" can reach this branch because of the check above
        jm_qlm_scores = jm_likelihood_scores(url_text_dict, inverted_index,
                                             query_text_file)
        jm_qlm_score_output_text_file = convert_to_non_os_specific_path(
            all_paths_dict["jm_qlm_score_output_text_file"])
        write_top_100_scores_to_txt(jm_qlm_scores,
                                    jm_qlm_score_output_text_file, "jm_qlm")


if __name__ == "__main__":
    main()