-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrape_wiki.py
More file actions
59 lines (53 loc) · 1.84 KB
/
Copy pathscrape_wiki.py
File metadata and controls
59 lines (53 loc) · 1.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import json
import re
import urllib
import urllib.request
# Pieces of the Wikipedia (MediaWiki) API query URL used to fetch page content.
# They are joined with '&' when the request URL is built.
baseurl = 'https://en.wikipedia.org/w/api.php?'  # endpoint; the trailing '?' starts the query string
action = 'action=query'
title = 'titles='  # page-title parameter prefix
content = 'prop=revisions&rvprop=content'  # ask for the content of the page revision
dataformat = 'format=json'  # the response is parsed with json.loads
def look_up_decade(year: int) -> list:
    '''
    Fetch the raw content of the Wikipedia page for the decade containing ``year``.

    The queried page title is the first year of the decade followed by "s"
    (e.g. 1987 -> "1980s").

    Args:
        year: The year whose decade page should be looked up.

    Returns:
        A list with the content of the first returned revision of every page
        in the API response. Pages that carry no revision content are
        reported on stdout and skipped.

    Raises:
        Exception: If the API response contains no pages.
    '''
    decade_start = int(year / 10) * 10
    query = '%s%s&%s&%s&%s' % (
        baseurl, action, f'titles={decade_start}s', content, dataformat,
    )
    # Use a context manager so the HTTP response is always closed.
    with urllib.request.urlopen(query) as response:
        res = json.loads(response.read().decode('utf-8'))
    # A response without a 'query' key (presumably an API error payload) is
    # treated like an empty result instead of failing with AttributeError.
    pages = (res.get('query') or {}).get('pages')
    if not pages:
        raise Exception('No pages found')
    data = []
    for page_id, page in pages.items():
        try:
            data.append(page['revisions'][0]['*'])
        except (KeyError, IndexError, TypeError):
            # Best effort: a page without revision content is skipped.
            print(f"Failed on pages{page_id}")
    return data
def process_data(d: list, limit=3) -> str:
    '''
    Clean up to ``limit`` raw Wikipedia page strings and concatenate them.

    Args:
        d: Raw page contents (strings), processed in order.
        limit: Maximum number of leading entries of ``d`` to process.

    Returns:
        One string: the cleaned text of every processed page, joined
        without a separator. Empty when ``d`` is empty or ``limit`` <= 0.
    '''
    cleaned_pages = []
    for index, page in enumerate(d):
        if index >= limit:
            break
        # NOTE(review): this pattern is a single character class, so it
        # deletes the individual characters { < . * ? [ } > (full stops
        # included) rather than whole {...} / <...> spans. Kept unchanged
        # to preserve the existing output -- confirm the intent.
        text = re.sub(r"[\{\<.*?[\}\>]", "", page)
        # Drop "url=..." tokens up to the next whitespace.
        text = re.sub(r'url=.\S*', '', text)
        # Keep only ASCII letters, digits, spaces, newlines and dots.
        text = re.sub(r'[^a-zA-Z0-9 \n\.]', '', text)
        # Drop what is left of links once their punctuation is gone.
        text = re.sub(r'http.\S*', '', text)
        # Drop "redirect" markers together with the token that follows.
        text = re.sub(r'redirect.\S*', '', text)
        cleaned_pages.append(text)
    return ''.join(cleaned_pages)
def scrape_wiki(year):
    '''
    Return the cleaned-up text of the Wikipedia decade page(s) for ``year``.

    The pages are fetched with look_up_decade and then passed through
    process_data with its default limit.
    '''
    raw_pages = look_up_decade(year)
    return process_data(raw_pages)