-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathkeyword_parse.py
40 lines (33 loc) · 1.53 KB
/
keyword_parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import re
import tldextract
from parse_curl import websites_with_timer, websites_without_timer
# Domain names longer than three characters read from keyword.txt,
# for use in main.py.
keywords = []
# Domain names of three characters or fewer read from keyword.txt.
# NOTE(review): presumably short names are treated as search-engine
# keywords -- confirm against main.py.
search_engine_keyword = []
# Filled further down from the parse_curl website lists.
search_engine_domain_tld = []

# A line of keyword.txt can be a URL, a host name or a plain keyword;
# tldextract reduces it to the domain part either way.
with open('keyword.txt', 'r') as f:
    for line in f:
        # Extract once and reuse the result for both the length test and
        # the append.
        domain = tldextract.extract(line.strip()).domain
        if not domain:
            # Blank lines yield an empty domain; skip them so an empty
            # string never ends up as a keyword.
            continue
        if len(domain) <= 3:
            search_engine_keyword.append(domain)
        else:
            keywords.append(domain)
# Walk websites_without_timer first, then websites_with_timer, and store
# the second URL found in each entry in search_engine_domain_tld, for use
# in main.py.
url_pattern = re.compile(r'(?P<url>https?://[^\s]+[^\s]+)')
for website_list in (websites_without_timer, websites_with_timer):
    for entry in website_list:
        # Index 1 selects the second match; an entry containing fewer than
        # two URLs raises IndexError.
        second_url = url_pattern.findall(entry)[1]
        # Remove the scheme text, then any single quotes at either end.
        without_scheme = second_url.replace('https://', '')
        without_scheme = without_scheme.replace('http://', '')
        search_engine_domain_tld.append(without_scheme.strip('\''))
if __name__ == '__main__':
    # Run directly: print each of the three collected lists on its own line.
    for collected in (
        keywords,
        search_engine_keyword,
        search_engine_domain_tld,
    ):
        print(collected)