Skip to content

Commit 5ef5a07

Browse files
Merge pull request #2196 from Xceptions/searchengine
Adding a Search Engine to the repo
2 parents ddde32a + ab16fbc commit 5ef5a07

File tree

4 files changed

+189
-0
lines changed

4 files changed

+189
-0
lines changed

Search_Engine/README.md

+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
Python program that searches a collection of documents and returns the documents containing the search term. The algorithm builds a reverse (inverted) index that maps each word to the ids of the documents in which it appears, where every document is identified by an integer id. To get the documents that match a search term, we intersect the id lists of all the words in the search term and use the resulting ids to retrieve the document(s) that contain every one of these words.
2+
3+
To use directly, run
4+
5+
```python3 backend.py```
6+
7+
To use a gui, run
8+
9+
```python3 frontend.py```

Search_Engine/backend.py

+135
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
import sqlite3
2+
import test_data
3+
import ast
4+
import json
5+
6+
class SearchEngine:
    """
    Small word-based search engine persisted in a sqlite3 database.

    It works by building a reverse index store that maps each
    word to the ids of the documents that contain it. To find the
    document(s) that contain a certain search term, we take the
    intersection of the ids recorded for every word of the term.

    Storage layout
    ----------
    - IdToDoc(id INTEGER PRIMARY KEY, document TEXT): the documents
    - WordToId(name TEXT, value TEXT): one row named "index" whose
      value is the JSON-encoded reverse index {word: [document ids]}

    Words are the whitespace-separated tokens of a document and are
    matched exactly (case-sensitive, punctuation included).
    """

    # Upper bound on "?" placeholders per statement. Older SQLite builds
    # default to a limit of 999 bound variables, so stay well below it.
    _MAX_SQL_VARIABLES = 500

    def __init__(self, db_path="searchengine.sqlite3"):
        """
        Returns - None
        Input - str: path of the sqlite3 database (defaults to
                "searchengine.sqlite3" in the working directory)
        ----------
        - Initialize database. we use sqlite3
        - Check if the tables exist, if not create them
        - make sure the (initially empty) reverse index row exists
        - maintain a class level access to the database
          connection object
        """
        # isolation_level=None puts the connection in autocommit mode on every
        # Python 3 release; the `autocommit=` keyword only exists since 3.12.
        self.conn = sqlite3.connect(db_path, isolation_level=None)
        # IF NOT EXISTS repairs a half-created schema instead of relying on
        # the presence of IdToDoc alone.
        self.conn.execute(
            "CREATE TABLE IF NOT EXISTS IdToDoc(id INTEGER PRIMARY KEY, document TEXT)"
        )
        self.conn.execute(
            "CREATE TABLE IF NOT EXISTS WordToId (name TEXT, value TEXT)"
        )
        seeded = self.conn.execute(
            "SELECT 1 FROM WordToId WHERE name='index'"
        ).fetchone()
        if seeded is None:
            self.conn.execute(
                "INSERT INTO WordToId VALUES (?, ?)", ("index", "{}")
            )

    def close(self):
        """
        Returns - None
        Input - None
        ----------
        Close the underlying database connection.
        """
        self.conn.close()

    def __enter__(self):
        """Allow `with SearchEngine() as se:`; returns the engine itself."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the connection on leaving the `with` block; never swallows errors."""
        self.close()

    def _load_reverse_index(self):
        """
        Returns - dict: the reverse index {word: [document ids]}
        Input - None
        ---------
        - read the JSON text stored in the "index" row of WordToId
        - decode and return it
        """
        raw = self.conn.execute(
            "SELECT value FROM WordToId WHERE name='index'"
        ).fetchone()[0]
        return json.loads(raw)

    def index_document(self, document):
        """
        Returns - str: the literal "index successful"
        Input - str: a string of words called document
        ----------
        Indexes the document. It does this by performing two
        operations - add the document to the IdToDoc, then
        adds the words in the document to WordToId
        - takes in the document (str)
        - passes the document to a method to add the document
          to IdToDoc
        - retrieves the id of the inserted document
        - records that id under every distinct word of the document
          in the reverse index and writes the index back to WordToId
        """
        row_id = self._add_to_IdToDoc(document)
        reverse_idx = self._load_reverse_index()
        # dict.fromkeys drops repeated words while keeping first-seen order,
        # so each posting list is visited once per document.
        for word in dict.fromkeys(document.split()):
            postings = reverse_idx.setdefault(word, [])
            if row_id not in postings:
                postings.append(row_id)
        self.conn.execute(
            "UPDATE WordToId SET value = ? WHERE name='index'",
            (json.dumps(reverse_idx),),
        )
        return "index successful"

    def _add_to_IdToDoc(self, document):
        """
        Returns - int: the id of the inserted document
        Input - str: a string of words called `document`
        ---------
        - use the class-level connection object to insert the document
          into the db
        - retrieve and return the row id of the inserted document
        """
        inserted = self.conn.execute(
            "INSERT INTO IdToDoc (document) VALUES (?)", (document,)
        )
        return inserted.lastrowid

    def find_documents(self, search_term):
        """
        Returns - list: rows (1-tuples holding the document text) of the
                  documents that contain EVERY word of the search term;
                  an empty list when there is no such document
        Input - str: a string of words called `search_term`
        ---------
        - retrieve the reverse index
        - split the search term into words the same way documents are
          split when they are indexed
        - intersect the ids recorded for each word; a word that was
          never indexed means no document can match
        - use the common ids to call the _find_documents_with_idx method
        - return the result of the called method
        """
        # split() without an argument mirrors index_document and ignores
        # leading, trailing and repeated whitespace.
        terms = search_term.split()
        if not terms:
            return []

        reverse_idx = self._load_reverse_index()
        common_ids = None
        for term in terms:
            postings = reverse_idx.get(term)
            if not postings:  # unknown word: the intersection is empty
                return []
            if common_ids is None:
                common_ids = set(postings)
            else:
                common_ids.intersection_update(postings)
            if not common_ids:  # no document holds all words seen so far
                return []

        return self._find_documents_with_idx(common_ids)

    def _find_documents_with_idx(self, idxs):
        """
        Returns - list: one row per matching document, each row being a
                  1-tuple `(document,)`, ordered by document id
        Input - iterable of document ids
        ---------
        - use the class-level connection object to retrieve the documents that
          have the idx in the input list of idxs.
        - query in batches so the number of bound parameters stays below
          SQLite's per-statement limit
        - retrieve and return these documents as a list
        """
        ids = sorted(idxs)
        rows = []
        step = self._MAX_SQL_VARIABLES
        # ids are sorted and each batch is ordered by id, so the concatenated
        # result is globally ordered by id.
        for start in range(0, len(ids), step):
            batch = ids[start:start + step]
            placeholders = ",".join("?" * len(batch))
            sql = (
                "SELECT document FROM IdToDoc "
                "WHERE id in ({seq}) ORDER BY id".format(seq=placeholders)
            )
            rows.extend(self.conn.execute(sql, batch).fetchall())
        return rows
127+
128+
129+
if __name__ == "__main__":
    # Demo run: index four sample documents, echo the status of the second
    # indexing call, then print the documents matching the word "happy".
    engine = SearchEngine()
    engine.index_document("we should all strive to be happy and happy again")
    status = engine.index_document("happiness is all you need")
    print(status)
    for text in (
        "no way should we be sad",
        "a cheerful heart is a happy one even in Nigeria",
    ):
        engine.index_document(text)
    matches = engine.find_documents("happy")
    print(matches)

Search_Engine/frontend.py

+37
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
from tkinter import *
2+
from tkinter import messagebox
3+
import backend
4+
5+
6+
def add_document():
    """
    Button callback: index the text currently typed in the
    "Add Document" entry and print the status string returned by
    the search engine.

    A new SearchEngine (and therefore a new sqlite3 connection) is
    created on every click, so the connection is closed explicitly
    once the call finishes, even if indexing raises.
    """
    document = add_documents_entry.get()
    se = backend.SearchEngine()
    try:
        print(se.index_document(document))
    finally:
        se.conn.close()
10+
11+
def find_term():
    """
    Button callback: look up the text currently typed in the
    "Input term to search" entry and print the list returned by
    the search engine.

    A new SearchEngine (and therefore a new sqlite3 connection) is
    created on every click, so the connection is closed explicitly
    once the call finishes, even if the lookup raises.
    """
    term = find_term_entry.get()
    se = backend.SearchEngine()
    try:
        print(se.find_documents(term))
    finally:
        se.conn.close()
15+
16+
if __name__ == "__main__":
    # Build the single window of the GUI: one entry + button to add a
    # document, one entry + button to search, then hand control to Tk.
    root = Tk()
    # The window drives the search engine; the previous title
    # ("Registration Form") was a leftover from another program.
    root.title("Search Engine")
    root.geometry('300x300')

    # --- add a document -------------------------------------------------
    add_documents_label = Label(root, text="Add Document:")
    add_documents_label.pack()
    # read by add_document() through this module-level name
    add_documents_entry = Entry(root)
    add_documents_entry.pack()

    add_document_button = Button(root, text="add", command=add_document)
    add_document_button.pack()

    # --- search for a term ----------------------------------------------
    find_term_label = Label(root, text="Input term to search:")
    find_term_label.pack()
    # read by find_term() through this module-level name
    find_term_entry = Entry(root)
    find_term_entry.pack()

    search_term_button = Button(root, text="search", command=find_term)
    search_term_button.pack()

    root.mainloop()

Search_Engine/test_data.py

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Sample documents, one string per document.
documents = [
    "we should all strive to be happy",
    "happiness is all you need",
    "a cheerful heart is a happy one",
    "no way should we be sad",
]

# Sample search term.
search = "happy"

0 commit comments

Comments
 (0)