-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathpre_lucene.py
More file actions
119 lines (83 loc) · 3.45 KB
/
Copy pathpre_lucene.py
File metadata and controls
119 lines (83 loc) · 3.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
"""
Python file which actually creates text files out of the parsed and tokenized
collection
"""
import os
import argparse
from parse_queries import parse_query_text_file
import json
from pathlib import Path
def read_json_document(json_file_name):
    """
    Helper function to read all the path variables from the json file

    Note: We are maintaining a json file to read all the input paths of our
    test collection and also the output paths. This lets us change the
    input/output paths of the data easily (just edit the json file) instead
    of worrying about changing paths in the codebase.

    :param json_file_name: path to the json file to read
    :return: the decoded content of the json file (a dictionary for the
             path-configuration files described above)
    """
    # JSON text is UTF-8 by specification; decoding it explicitly avoids
    # depending on the platform's default encoding (e.g. cp1252 on Windows).
    with open(json_file_name, encoding="utf-8") as fd:
        return json.load(fd)
def parse_user_arguments(argv=None):
    """
    Helper function to parse user arguments

    All four options are required; argparse exits with a usage message when
    one of them is missing.

    :param argv: optional list of argument strings to parse; when None (the
                 default) argparse reads the arguments from sys.argv
    :return: a dictionary containing the user arguments, keyed by
             "corpus_collection", "query_text_file", "corpus_out" and
             "query_output"
    """
    ap = argparse.ArgumentParser()
    ap.add_argument(
        "-c", "--corpus_collection",
        help="Enter the path to the json file containing the cleaned corpus",
        required=True,
    )
    ap.add_argument(
        "-q", "--query_text_file",
        help="Enter the path to the query text file with <DOC> tags",
        required=True,
    )
    ap.add_argument(
        "-co", "--corpus_out",
        help="Enter dirname where you want to store the corpus collection",
        required=True,
    )
    ap.add_argument(
        "-qo", "--query_output",
        help="Enter the path to the file where "
             "you want to store the query outputs",
        required=True,
    )
    # parse_args(None) falls back to sys.argv, so existing callers that pass
    # nothing keep the original command-line behaviour.
    return vars(ap.parse_args(argv))
def write_dict_to_text_file(fname, given_dict):
    """
    Helper function to write the items present in a dictionary to a text file

    Every value of the dictionary is written on its own line, in the
    dictionary's iteration order; the keys are not written.

    :param fname: The text file where we want to write; it is created, or
                  truncated when it already exists
    :param given_dict: The dictionary whose values (strings) are written
    :return: None
    """
    # The context manager closes the file even when a write fails part-way.
    # The encoding is left at the platform default so the bytes produced for
    # whatever reads this file afterwards are unchanged.
    with open(fname, "w") as fp:
        fp.writelines(item + "\n" for item in given_dict.values())
def write_collection_corpus_to_text_file(fname, corpus_dict):
    """
    Helper function to write corpus collection to text files

    One file named "<key>.txt" is written per entry of corpus_dict, holding
    that entry's value. The output directory is created when it is missing.

    :param fname: The directory where we want to write the collection to; a
                  relative name is resolved against the current working
                  directory
    :param corpus_dict: The corpus collection dictionary, mapping each
                        document name to the text written into its file
    """
    # Join with pathlib instead of a hard-coded "\\" separator so the
    # directory is resolved correctly on every platform, not only Windows.
    dir_path = Path(os.path.realpath(".")) / fname
    print("the directory path is ", dir_path)
    if not dir_path.exists():
        # exist_ok covers the directory appearing between check and creation.
        dir_path.mkdir(parents=True, exist_ok=True)
        print("Directory created")
    for item, text in corpus_dict.items():
        # String concatenation (not with_suffix) keeps names that already
        # contain a dot intact, e.g. "a.b" -> "a.b.txt".
        with open(dir_path / (item + ".txt"), "w") as f:
            f.write(text)
def main():
    """
    Entry point of the script: write the parsed queries and the corpus
    collection to text files at the locations given on the command line
    """
    # Capture the user arguments
    user_args = parse_user_arguments()
    query_text_file = user_args["query_text_file"]
    cleaned_corpus_file = user_args["corpus_collection"]
    query_out = user_args["query_output"]
    corpus_out = user_args["corpus_out"]

    # Create the query dictionary and write its values to the query output
    query_dict = parse_query_text_file(query_text_file)
    write_dict_to_text_file(query_out, query_dict)

    # Get the collection dictionary (reuse the json helper defined in this
    # file rather than repeating the open/json.load pair here)
    corpus = read_json_document(cleaned_corpus_file)

    # Create text files out of the collection
    write_collection_corpus_to_text_file(corpus_out, corpus)


# Only run when executed as a script, so importing this module for its helper
# functions does not parse the command line or write any files.
if __name__ == "__main__":
    main()