model_magnitude.py
import math
import string

import numpy as np
import pandas as pd
# The stopword list requires the NLTK corpus: nltk.download('stopwords')
from nltk.corpus import stopwords
def termFrequency(term, doc):
    # Strip punctuation and split the document into individual terms
    translator = str.maketrans('', '', string.punctuation)
    terms = doc.translate(translator).lower().split()
    # Number of times the term occurs in the document
    count = terms.count(term.lower())
    # Sublinear scaling: 1 + log(count), so repeated occurrences
    # contribute with diminishing returns; absent terms score 0
    if count > 0:
        count = 1 + math.log(count)
    return count
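# Worked example (hypothetical document, not from the dataset):
# termFrequency("cat", "The cat sat on the cat mat.") counts 2 occurrences,
# so it returns 1 + log(2) ≈ 1.693; a term that never appears returns 0.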
def inverseDocumentFrequency(term, allDocs):
    translator = str.maketrans('', '', string.punctuation)
    # Count how many documents contain the term at least once
    num_docs_with_given_term = 0
    for doc in allDocs:
        doc = doc.translate(translator).lower()
        if term.lower() in doc.split():
            num_docs_with_given_term += 1
    if num_docs_with_given_term > 0:
        # IDF: log of (total number of documents / documents containing the term)
        return math.log(len(allDocs) / num_docs_with_given_term)
    return 0
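# Worked example (hypothetical corpus, not from the dataset): with 10
# documents of which 2 contain the term, the IDF is log(10 / 2) ≈ 1.609;
# a term found in every document scores log(1) = 0.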
# Read the data into a dataframe; each row pairs a transcript with its URL
data = pd.read_csv('data.txt', sep=",", header=None)
data.columns = ["transcript", "url"]
# Get a list of transcripts to calculate the IDF values against
documents = data["transcript"].tolist()
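# Assumed layout of data.txt (hypothetical sample line, not from the real
# file): the transcript text first, then the talk's URL, e.g.
#   so here is a story about energy,https://www.ted.com/talks/example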
phrase = input("Please enter a query to search: ").lower()
# Build the list of query words, dropping English stopwords so they don't
# dominate the scores
stop_words = set(stopwords.words('english'))
phraseList = [word for word in phrase.split() if word not in stop_words]
print(phraseList)
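# Example (hypothetical query): "the future of energy" tokenizes to
# ['future', 'energy'] once the stopwords "the" and "of" are removed.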
# No tf-idf weights are needed for the query itself, since the ranking below
# only evaluates the magnitudes of the document vectors.
# Each IDF depends only on the corpus, not on any one document, so compute
# it once per query word and reuse it for every transcript.
phrase_idfs = [inverseDocumentFrequency(word, documents) for word in phraseList]
class Document():
    # Pairs a TED talk link with the magnitude of its tf-idf vector
    def __init__(self, link, magnitude):
        self.link = link
        self.magnitude = magnitude

# Holds one Document per talk, scored against the query
results = []
for transcript, link in data.values.tolist():
    document = transcript.lower()
    document_tfidfs = [None] * len(phraseList)
    for i in range(len(phraseList)):
        document_tf = termFrequency(phraseList[i], document)
        # Reuse the IDF precomputed for this query word
        document_tfidfs[i] = document_tf * phrase_idfs[i]
    # Score each talk by the Euclidean norm of its tf-idf vector
    magnitude = np.linalg.norm(document_tfidfs)
    results.append(Document(link, magnitude))
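# Worked example (hypothetical numbers): a two-word query whose tf-idf
# weights come out as [3.0, 4.0] gives np.linalg.norm([3.0, 4.0]) == 5.0,
# i.e. sqrt(3**2 + 4**2).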
# Sort the talks by magnitude, highest first, with a key lambda
results.sort(key=lambda x: x.magnitude, reverse=True)
# Print the top 10 most relevant talks, if that many matched the query
for result in results[:10]:
    if result.magnitude > 0.0:
        print("The talk {} had a magnitude of {}".format(result.link, result.magnitude))
    else:
        break