# Search Engine
Project for the [Language Engineering] course, FCS Level 3
---
Install the required libraries. Note the BeautifulSoup package on PyPI is `beautifulsoup4`, not `b4`:
```
pip install requests beautifulsoup4 nltk
```
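The script also needs three NLTK resources: the `punkt` tokenizer plus the `wordnet` and `stopwords` corpora. All three are used by the imports below, so they must be downloaded once, from Python:
```
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
```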
###### Libraries Used
```
import requests                    # fetch the Google results page
from bs4 import BeautifulSoup      # parse the returned HTML
import re                          # clean up result links
import nltk                        # tokenization
from nltk.corpus import wordnet    # synsets for ranking
from nltk.corpus import stopwords  # stop-word filtering
```
Search by sending the query to Google:
```
query = input("Enter word to search: ")
url = f"https://google.com/search?q={query}"
# A browser-like User-Agent ("Mozilla/5.0", not "Mozilla/0.5") so Google
# serves the plain HTML results page
res = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
```
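One caveat the snippet above skips: a query containing spaces or special characters should be URL-encoded before being interpolated into the URL. A minimal sketch using only the standard library:
```
from urllib.parse import quote_plus

query = input("Enter word to search: ")
# quote_plus turns e.g. "credit union" into "credit+union"
url = f"https://google.com/search?q={quote_plus(query)}"
res = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
```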
Parse the HTML and collect every result `div` that contains a title, a description, and a link, identified by the `div`'s class. These class names (`ZINbbc`, `vvjwJb`, `s3v9rd`) are tied to Google's results markup at the time of writing and may change:
```
soup = BeautifulSoup(res.content, "html.parser")
result_div = soup.find_all('div', attrs={'class': 'ZINbbc'})
```
```
links = []         # store links
titles = []        # store titles
descriptions = []  # store descriptions
```
```
for r in result_div:
    try:
        link = r.find('a', href=True)
        title = r.find('div', attrs={'class': 'vvjwJb'}).get_text()
        description = r.find('div', attrs={'class': 's3v9rd'}).get_text()
        # Skip results with any missing part, otherwise store all three
        if link is not None and title != '' and description != '':
            links.append(link['href'])
            titles.append(title)
            descriptions.append(description)
    except AttributeError:
        # .find() returned None for title/description: not a full result div
        continue
```
```
to_remove = []    # indices of non-result links
clean_links = []  # cleaned result URLs
```
```
for i, l in enumerate(links):  # enumerate yields (index, value)
    # Real result links start with /url?q=...; extract the target URL
    clean = re.search(r'/url\?q=(.*)&sa', l)
    # None means this was not a useful result link
    if clean is None:
        to_remove.append(i)
        continue
    clean_links.append(clean.group(1))
# Remove titles and descriptions of the discarded links.
# Delete from the end so earlier indices stay valid.
for x in reversed(to_remove):
    del titles[x]
    del descriptions[x]
```
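For illustration, a raw `href` on this results page typically looks like the hypothetical value below, and the regex keeps only the target URL:
```
raw = '/url?q=https://en.wikipedia.org/wiki/Bank&sa=U'  # hypothetical href
m = re.search(r'/url\?q=(.*)&sa', raw)
print(m.group(1))  # https://en.wikipedia.org/wiki/Bank
```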
A helper that sorts a list of tuples by the element at position `ind` (here the second value, the score):
```
def Sort_Tuple(tup, ind):
    # Sort in place by the element at position ind, then return the list
    tup.sort(key=lambda x: x[ind])
    return tup
```
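A quick usage example with hypothetical values, sorting by the second element:
```
pairs = [(0, 3), (1, 1), (2, 2)]
print(Sort_Tuple(pairs, 1))  # [(1, 1), (2, 2), (0, 3)]
```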
A Lesk-style ranking: for each page description, count how many of its (stop-word-filtered) words appear in the definitions and first examples of the query's WordNet synsets, keep the best-matching synset per page, then print the pages from highest to lowest score.
```
List_all_rank = []  # best (page number, synset number, score) per page

def lesk(query, sentence, ind):
    # Tokenize the description in lowercase and drop stop words
    words = nltk.word_tokenize(sentence.lower())
    stop_words = stopwords.words("english")
    stop_words += ['can', 'will', 'use', 'one', 'using', 'used', 'also', 'see', 'first', 'like']
    stop_words += ['page', 'get', 'new', 'two', 'site', 'blog', 'many', 'may', "don't", 'dont', 'way']
    stop_words += ['last', 'best', 'able', 'even', 'next', 'let', 'none', 'every', 'three']
    stop_words += ['lot', 'well', 'chart', 'much', 'based', 'important', 'posts', 'reads', 'least']
    stop_words += ['still', 'follow', 'called', 'and', 'this', 'that', 'there', 'as', 'the', 'is']
    stop_words += ['/', '=', '.', ',']
    filtered_words = [w for w in words if w not in stop_words]
    synsets = wordnet.synsets(query)  # look the synsets up once, not per word
    if not synsets:  # no synsets for the query: nothing to rank
        return
    da = []         # definition words already counted (avoid repetition)
    ea = []         # example words already counted (avoid repetition)
    List_rank = []  # (synset number, score) for each synset
    # Count how many description words appear in each synset's definition
    # and first example; da/ea accumulate across synsets, so the score grows
    for x, synset in enumerate(synsets):
        for i in filtered_words:
            if i in synset.definition().lower() and i not in da:
                da.append(i)
            if synset.examples():
                example = synset.examples()[0].lower()
                if i in example and query in example and i not in ea:
                    ea.append(i)
        List_rank.append((x, len(da) + len(ea)))
    Sorted_list_rank = Sort_Tuple(List_rank, 1)  # sort by score
    best_syn, best_score = Sorted_list_rank[-1]  # highest-scoring synset
    List_all_rank.append((ind, best_syn, best_score))

for idx, val in enumerate(descriptions):
    lesk(query, val, idx)

s_li = Sort_Tuple(List_all_rank, 2)     # (page, synset, score): sort by score
for page, syn, rank in reversed(s_li):  # highest-ranked pages first
    print("Page Number", page + 1, ", Most Synset Num", syn, ", With Rank", rank)
    print("Title:", titles[page])
    print("Link:", clean_links[page])
    print("-------------------------\n")
```
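When run end to end, each ranked page is printed in the format below; the concrete values here are a hypothetical sample, since actual titles and links depend on the live Google results:
```
Page Number 3 , Most Synset Num 1 , With Rank 7
Title: Bank - Wikipedia
Link: https://en.wikipedia.org/wiki/Bank
-------------------------
```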