Merge pull request #2196 from Xceptions/searchengine

geekcomputers · web-flow · commit 5ef5a07dd33a · 2024-05-17T16:13:17.000+01:00
Adding a Search Engine to the repo
diff --git a/Search_Engine/README.md b/Search_Engine/README.md
@@ -0,0 +1,9 @@
+Python program that searches a collection of documents and returns the documents containing the search term. The algorithm builds a reverse index that maps each word to the ids of the documents in which it appears, where every document is identified by an integer id. To find the documents that contain a search term, we intersect the id lists of all the words in the search term and use the resulting ids to retrieve the document(s) that contain these words.
+
+To use directly, run
+
+```python3 backend.py```
+
+To use the GUI, run
+
+```python3 frontend.py```
diff --git a/Search_Engine/backend.py b/Search_Engine/backend.py
@@ -0,0 +1,135 @@
+import sqlite3
+import test_data
+import ast
+import json
+
+class SearchEngine:
+    """
+    Small search engine persisted in a SQLite database.
+
+    It works by building a reverse index that maps each word to
+    the ids of the documents containing it. To find the document(s)
+    matching a search term, we intersect the id lists of its words.
+    """
+
+    def __init__(self):
+        """
+        Returns - None
+        Input - None
+        ----------
+        - Open (or create) searchengine.sqlite3 in autocommit mode
+        - Create the IdToDoc and WordToId tables if they are missing
+        - Seed WordToId with an empty reverse index if it has none, so
+          a half-initialised database is repaired instead of making
+          later index lookups fail
+        - Keep the connection on the instance as `self.conn`
+        """
+        self.conn = sqlite3.connect("searchengine.sqlite3", autocommit=True)
+        self.conn.execute(
+            "CREATE TABLE IF NOT EXISTS IdToDoc(id INTEGER PRIMARY KEY, document TEXT)"
+        )
+        self.conn.execute("CREATE TABLE IF NOT EXISTS WordToId (name TEXT, value TEXT)")
+        seeded = self.conn.execute("SELECT 1 FROM WordToId WHERE name='index'").fetchone()
+        if seeded is None:
+            self.conn.execute("INSERT INTO WordToId VALUES (?, ?)", ("index", "{}"))
+
+    def index_document(self, document):
+        """
+        Returns - str: the literal "index successful"
+        Input - str: a string of words called `document`
+        ----------
+        Stores the document and records each of its words in the
+        reverse index:
+        - insert the document into IdToDoc and get its row id
+        - load the reverse index (a JSON object) from WordToId
+        - add the row id to the id list of every whitespace-separated
+          word of the document, at most once per word
+        - write the updated reverse index back to WordToId
+        """
+        row_id = self._add_to_IdToDoc(document)
+        reverse_idx = self._load_reverse_index()
+        for word in document.split():
+            doc_ids = reverse_idx.setdefault(word, [])
+            if row_id not in doc_ids:  # a word repeated in the document counts once
+                doc_ids.append(row_id)
+        updated = json.dumps(reverse_idx)
+        self.conn.execute("UPDATE WordToId SET value = (?) WHERE name='index'", (updated,))
+        return "index successful"
+
+    def _add_to_IdToDoc(self, document):
+        """
+        Returns - int: the id of the inserted document
+        Input - str: a string of words called `document`
+        ---------
+        - insert the document into IdToDoc through the shared connection
+        - return the row id SQLite assigned to it
+        """
+        cur = self.conn.execute("INSERT INTO IdToDoc (document) VALUES (?)", (document,))
+        return cur.lastrowid
+
+    def _load_reverse_index(self):
+        """
+        Returns - dict: maps each indexed word to the list of ids of the
+                  documents containing it
+        Input - None
+        ---------
+        - read the JSON text stored under name='index' in WordToId
+        - decode it and return the resulting dict
+        """
+        row = self.conn.execute("SELECT value FROM WordToId WHERE name='index'").fetchone()
+        return json.loads(row[0])
+
+    def find_documents(self, search_term):
+        """
+        Returns - list[tuple[str]]: matching rows, [] when nothing matches
+        Input - str: a string of words called `search_term`
+        ---------
+        - split the search term on whitespace, the same way
+          index_document splits documents, so extra spaces are ignored
+        - look every word up in the reverse index; a word that was never
+          indexed means no document contains all the words, so return []
+        - intersect the id lists of all the words
+        - fetch the surviving ids with _find_documents_with_idx
+        """
+        terms = search_term.split()
+        if not terms:  # empty or whitespace-only query
+            return []
+        reverse_idx = self._load_reverse_index()
+        common_idx_of_docs = None
+        for term in terms:
+            doc_ids = reverse_idx.get(term)
+            if not doc_ids:  # unknown word: the intersection is empty
+                return []
+            if common_idx_of_docs is None:
+                common_idx_of_docs = set(doc_ids)
+            else:
+                common_idx_of_docs.intersection_update(doc_ids)
+
+        if not common_idx_of_docs:  # no document contains every word
+            return []
+
+        return self._find_documents_with_idx(common_idx_of_docs)
+
+    def _find_documents_with_idx(self, idxs):
+        """
+        Returns - list[tuple[str]]: the IdToDoc rows whose id is in
+                  `idxs`; each row is a 1-tuple holding the document
+        Input - iterable of int ids
+        ---------
+        - build one `?` placeholder per id so the ids are bound as
+          query parameters instead of being formatted into the SQL
+        - run the query on the shared connection and return all rows
+        """
+        idxs = list(idxs)
+        placeholders = ",".join("?" * len(idxs))
+        sql = f"SELECT document FROM IdToDoc WHERE id in ({placeholders})"
+        return self.conn.execute(sql, idxs).fetchall()
+
+
+if __name__ == "__main__":  # demo run: index four sample documents, then look up one word
+    engine = SearchEngine()
+    engine.index_document("we should all strive to be happy and happy again")
+    print(engine.index_document("happiness is all you need"))
+    for sample in ("no way should we be sad", "a cheerful heart is a happy one even in Nigeria"):
+        engine.index_document(sample)
+    print(engine.find_documents("happy"))
diff --git a/Search_Engine/frontend.py b/Search_Engine/frontend.py
@@ -0,0 +1,37 @@
+from tkinter import *
+from tkinter import messagebox
+import backend
+
+
+def add_document():
+    """Index the text held by the module-level `add_documents_entry` widget and print the status."""
+    text = add_documents_entry.get()
+    print(backend.SearchEngine().index_document(text))
+
+def find_term():
+    """Search for the text held by the module-level `find_term_entry` widget and print the rows found."""
+    query = find_term_entry.get()
+    print(backend.SearchEngine().find_documents(query))
+
+if __name__ == "__main__":
+    root = Tk()
+    root.title("Search Engine")  # window title; this GUI drives the search engine backend
+    root.geometry('300x300')
+    # Entry holding the document text to index.
+    add_documents_label = Label(root, text="Add Document:")
+    add_documents_label.pack()
+    add_documents_entry = Entry(root)
+    add_documents_entry.pack()
+    # Clicking "add" runs add_document(), which reads add_documents_entry.
+    add_document_button = Button(root, text="add", command=add_document)
+    add_document_button.pack()
+    # Entry holding the term to search for.
+    find_term_label = Label(root, text="Input term to search:")
+    find_term_label.pack()
+    find_term_entry = Entry(root)
+    find_term_entry.pack()
+    # Clicking "search" runs find_term(), which reads find_term_entry.
+    search_term_button = Button(root, text="search", command=find_term)
+    search_term_button.pack()
+    # Hand control to the Tk event loop.
+    root.mainloop()
diff --git a/Search_Engine/test_data.py b/Search_Engine/test_data.py
@@ -0,0 +1,8 @@
+documents = [  # sample documents; presumably fixtures for the search engine (backend.py imports this module)
+    "we should all strive to be happy",
+    "happiness is all you need",
+    "a cheerful heart is a happy one",
+    "no way should we be sad"
+]
+
+search = "happy"  # sample search term; it appears as a word in two of the documents above