def similarity(str1: str, str2: str) -> float:
    """Return the TF-IDF cosine similarity of two texts as a percentage.

    Both texts are vectorised together with a default ``TfidfVectorizer``
    and the cosine of the angle between the two resulting vectors is
    scaled to the range 0-100.

    Args:
        str1: First text to compare.
        str2: Second text to compare.

    Returns:
        A float between 0.0 and 100.0. 0.0 is returned when either text is
        empty/blank, or when the texts contain no token the vectoriser can
        index (for example only punctuation or single characters), because
        there is then nothing to compare.
    """
    # Blank input carries no evidence of similarity; answer directly instead
    # of letting the vectoriser fail on an empty vocabulary.
    if not (str1 and str1.strip() and str2 and str2.strip()):
        return 0.0

    # Imported locally so the function is self-contained; redundant but
    # harmless when the module already imports these names at top level.
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    try:
        tfidf = TfidfVectorizer().fit_transform([str1, str2])
    except ValueError:
        # Raised by scikit-learn when no usable token is found in either
        # text ("empty vocabulary").
        return 0.0

    # Slice (rather than index) so each row stays a 2-D sparse matrix;
    # cosine_similarity accepts sparse input, so no dense copy is needed.
    score = float(cosine_similarity(tfidf[0:1], tfidf[1:2])[0, 0]) * 100

    # Floating-point rounding can push the cosine marginally outside [0, 1].
    return min(max(score, 0.0), 100.0)