1
+ import sqlite3
2
+ import test_data
3
+ import ast
4
+ import json
5
+
6
class SearchEngine:
    """
    A small search engine persisted in a SQLite database.

    It works by building a reverse index that maps every word to the
    ids of the documents that contain it. To find the document(s) that
    match a search term, we look up the ids recorded for each word of
    the term and take the intersection of those ids.

    Storage layout
    ----------
    - IdToDoc(id INTEGER PRIMARY KEY, document TEXT): the raw documents
    - WordToId(name TEXT, value TEXT): a single row named "index" whose
      value is the whole reverse index serialised as a JSON object
      ({word: [document id, ...]})
    """

    def __init__(self, db_path="searchengine.sqlite3"):
        """
        Returns - None
        Input - str: path of the SQLite database file (optional,
                defaults to "searchengine.sqlite3"; ":memory:" gives a
                throw-away in-memory database)
        ----------
        - Initialize database. we use sqlite3
        - Create the tables and the reverse index row if any of them
          is missing
        - maintain a class level access to the database
          connection object
        """
        # isolation_level=None puts the connection in autocommit mode on
        # every supported Python version (the `autocommit` keyword only
        # exists from Python 3.12 onwards).
        self.conn = sqlite3.connect(db_path, isolation_level=None)

        # IF NOT EXISTS lets a partially initialised database (e.g. only
        # one of the two tables present) be repaired instead of crashing
        # later on a missing table.
        self.conn.execute(
            "CREATE TABLE IF NOT EXISTS IdToDoc(id INTEGER PRIMARY KEY, document TEXT)"
        )
        self.conn.execute(
            "CREATE TABLE IF NOT EXISTS WordToId (name TEXT, value TEXT)"
        )

        # Seed the empty reverse index exactly once.
        index_row = self.conn.execute(
            "SELECT 1 FROM WordToId WHERE name='index'"
        ).fetchone()
        if index_row is None:
            self.conn.execute(
                "INSERT INTO WordToId VALUES (?, ?)", ("index", "{}")
            )

    def _load_reverse_index(self):
        """
        Returns - dict: the reverse index, mapping word -> list of ids
        Input - None
        ----------
        - read the JSON text stored in the "index" row of WordToId
          (the row is created by __init__)
        - decode it and return the resulting dictionary
        """
        row = self.conn.execute(
            "SELECT value FROM WordToId WHERE name='index'"
        ).fetchone()
        return json.loads(row[0])

    def index_document(self, document):
        """
        Returns - str: the literal "index successful"
        Input - str: a string of words called document
        ----------
        Indexes the document. It does this by performing two
        operations - add the document to IdToDoc, then add the
        words in the document to WordToId
        - stores the document and retrieves the id of the inserted row
        - loads the reverse index
        - records the id under every whitespace-separated word of the
          document, unless that id is already recorded for the word
        - writes the updated reverse index back to WordToId
        """
        row_id = self._add_to_IdToDoc(document)
        reverse_idx = self._load_reverse_index()

        # dict.fromkeys drops repeated words while keeping first-seen
        # order, so each word is processed once per document.
        for word in dict.fromkeys(document.split()):
            doc_ids = reverse_idx.setdefault(word, [])
            if row_id not in doc_ids:
                doc_ids.append(row_id)

        self.conn.execute(
            "UPDATE WordToId SET value = (?) WHERE name='index'",
            (json.dumps(reverse_idx),),
        )
        return "index successful"

    def _add_to_IdToDoc(self, document):
        """
        Returns - int: the id of the inserted document
        Input - str: a string of words called `document`
        ---------
        - use the class-level connection object to insert the document
          into the db
        - retrieve and return the row id of the inserted document
        """
        cur = self.conn.execute(
            "INSERT INTO IdToDoc (document) VALUES (?)", (document,)
        )
        return cur.lastrowid

    def find_documents(self, search_term):
        """
        Returns - list: one row tuple `(document,)` for every document
                  that contains ALL the words of the search term; an
                  empty list when nothing matches
        Input - str: a string of words called `search_term`
        ---------
        - split the search term on whitespace, exactly like
          index_document does, so repeated spaces are harmless
        - retrieve the reverse index
        - intersect the ids recorded for every word of the search term;
          a word that was never indexed makes the intersection empty
        - use the surviving ids to call the _find_documents_with_idx
          method and return its result
        """
        terms = search_term.split()
        if not terms:  # empty or whitespace-only search term
            return []

        reverse_idx = self._load_reverse_index()
        common_ids = None
        for term in terms:
            doc_ids = reverse_idx.get(term)
            if not doc_ids:
                # No document contains this word, so no document can
                # contain every word of the search term.
                return []
            if common_ids is None:
                common_ids = set(doc_ids)
            else:
                common_ids.intersection_update(doc_ids)
            if not common_ids:
                return []

        return self._find_documents_with_idx(common_ids)

    def _find_documents_with_idx(self, idxs):
        """
        Returns - list: one row tuple `(document,)` per matching
                  document, ordered by id
        Input - iterable of idxs
        ---------
        - use the class-level connection object to retrieve the
          documents whose id is in the input idxs
        - retrieve and return these rows as a list
        """
        idxs = list(idxs)
        if not idxs:  # nothing to look up, skip the query
            return []

        # Only "?" placeholders are formatted into the statement; the
        # ids themselves are bound as parameters.
        placeholders = ",".join("?" * len(idxs))
        sql = (
            "SELECT document FROM IdToDoc "
            "WHERE id in ({seq}) ORDER BY id".format(seq=placeholders)
        )
        return self.conn.execute(sql, idxs).fetchall()
127
+
128
+
129
if __name__ == "__main__":
    # Small demo: index four sentences, then search for a single word.
    engine = SearchEngine()

    engine.index_document("we should all strive to be happy and happy again")

    # Only the status of the second insertion is echoed.
    status = engine.index_document("happiness is all you need")
    print(status)

    for sentence in (
        "no way should we be sad",
        "a cheerful heart is a happy one even in Nigeria",
    ):
        engine.index_document(sentence)

    matches = engine.find_documents("happy")
    print(matches)
0 commit comments