get_all_page.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
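"""Crawl Special:AllPages on the Chinese and English Minecraft Gamepedia wikis
and write the href of every listed page to a local text file, one per line."""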
import requests
from bs4 import BeautifulSoup
import re


def getnextpagenamecn(text):
    """Return the title to continue from on the Chinese wiki, or None."""
    # The Chinese AllPages nav reads "下一页（<title>）" with full-width parentheses.
    pattern = re.compile('下一页（(.*)）')
    res_list = pattern.findall(text)
    if len(res_list) == 0:
        return None
    if len(res_list) == 1:
        return res_list[0]
    print('err: list length is greater than 1')
    return None


def getpagefromcn(from_name, filename):
    """Fetch one batch of Special:所有页面 starting at from_name, append the
    listed hrefs to filename, and return the next start title (or None)."""
    with open(filename, 'a', encoding='utf-8') as file_object:
        params = {'from': from_name, 'hideredirects': 1}
        ret = requests.get(
            url='https://minecraft-zh.gamepedia.com/index.php?title=Special:%E6%89%80%E6%9C%89%E9%A1%B5%E9%9D%A2',
            params=params)
        ret.encoding = 'utf8'
        soup = BeautifulSoup(ret.text, 'lxml')
        navdiv = soup.find(name='div', attrs={'class': 'mw-allpages-nav'})
        div = soup.find(name='div', attrs={'class': 'mw-allpages-body'})
        li_list = div.find_all(name='li')
        for li in li_list:
            if not li:
                print('err: li is empty')
                continue
            file_object.write(li.find(name='a').get('href') + '\n')
        return getnextpagenamecn(navdiv.text)


def getnextpagenameen(text):
    """Return the title to continue from on the English wiki, or None."""
    pattern = re.compile(r'Next page \((.*)\)')
    res_list = pattern.findall(text)
    if len(res_list) == 0:
        return None
    if len(res_list) == 1:
        return res_list[0]
    print('err: list length is greater than 1')
    return None


def getpagefromen(from_name, filename):
    """Fetch one batch of Special:AllPages starting at from_name, append the
    listed hrefs to filename, and return the next start title (or None)."""
    with open(filename, 'a', encoding='utf-8') as file_object:
        params = {'from': from_name, 'hideredirects': 1}
        ret = requests.get(
            url='https://minecraft.gamepedia.com/Special:AllPages',
            params=params)
        ret.encoding = 'utf8'
        soup = BeautifulSoup(ret.text, 'lxml')
        navdiv = soup.find(name='div', attrs={'class': 'mw-allpages-nav'})
        div = soup.find(name='div', attrs={'class': 'mw-allpages-body'})
        li_list = div.find_all(name='li')
        for li in li_list:
            if not li:
                print('err: li is empty')
                continue
            file_object.write(li.find(name='a').get('href') + '\n')
        return getnextpagenameen(navdiv.text)


def getallpagescn(filename):
    """Walk the whole Chinese Special:AllPages listing into filename."""
    now = getpagefromcn('', filename)
    while now:
        now = getpagefromcn(now, filename)
        print(now)


def getallpagesen(filename):
    """Walk the whole English Special:AllPages listing into filename."""
    now = getpagefromen('', filename)
    while now:
        now = getpagefromen(now, filename)
        print(now)


if __name__ == "__main__":
    getallpagescn('cnallpages.txt')
    getallpagesen('enallpages.txt')