-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathDouyuSpiderV2.py
More file actions
107 lines (95 loc) · 3.19 KB
/
DouyuSpiderV2.py
File metadata and controls
107 lines (95 loc) · 3.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/python3
from bs4 import BeautifulSoup
import re
import requests
from Host_info import *
from UnitMysql import Unit_Mysql
import time
import json
import datetime
def open_html(url, timeout=10):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole crawl.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    # Force UTF-8 instead of relying on the encoding requests guesses.
    response.encoding = 'utf-8'
    return response.text
def get_online_number(room_number):
    """Return the actual online viewer count Douyu reports for a room.

    The count is read from ``data.online`` in the JSON returned by the
    ``swf_api/h5room`` endpoint.

    Args:
        room_number: Room id; converted with ``str()`` before being appended
            to the endpoint URL, so both ``str`` and ``int`` ids work.

    Returns:
        The online count as an ``int``, or ``0`` when the response is not
        valid JSON, lacks the expected keys, or holds a non-numeric value.

    Raises:
        Whatever ``open_html`` raises for a failed request; network errors
        are not converted to ``0``.
    """
    url = "https://www.douyu.com/swf_api/h5room/" + str(room_number)
    html = open_html(url)
    try:
        return int(json.loads(html)['data']['online'])
    except (ValueError, KeyError, TypeError):
        # ValueError covers json.JSONDecodeError and non-numeric strings;
        # KeyError/TypeError cover a payload with an unexpected shape.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return 0
def caculate_rate(w_watching, online):
    """Compute the influence factor: listed viewers divided by online count.

    Args:
        w_watching: Viewer figure shown in the directory listing.
        online: Online count used as the divisor.

    Returns:
        ``w_watching / online``, or ``0.0`` when ``online`` is zero so the
        division is never attempted.
    """
    return 0.0 if online == 0 else w_watching / online
def _directory_url(category, page):
    """Build the room-directory API URL for one page of one category."""
    return "https://www.douyu.com/gapi/rkc/directory/2_" + str(category) + "/" + str(page)


def _build_host(data, rid):
    """Fill a ``Host`` record from one room entry of a directory listing."""
    username = data['nn']
    w_watching = data['ol']
    kind = data['c2name']
    online = get_online_number(rid)
    host = Host()
    host.username = username
    host.online = online
    host.w_number = w_watching
    host.kind = kind
    host.room_number = rid
    host.coefficient = caculate_rate(w_watching, online)
    host.link = "https://www.douyu.com/" + rid
    host.localtime = time.strftime("%Y-%m-%d %H:%M:%S")
    return host


def get_info(max_category=600):
    """Crawl the Douyu room directory and store one record per room.

    For every category id from 1 up to (but not including) ``max_category``
    the directory API is paged through; each room id not seen before is
    turned into a ``Host`` record and handed to ``Unit_Mysql.insert_db``.
    Progress is printed per stored room, followed by the last category id
    that reported at least one page and the elapsed time in seconds.

    Args:
        max_category: Exclusive upper bound of the category ids to visit.

    Raises:
        Propagates request errors from ``open_html`` and JSON/key errors
        from directory responses that do not have the expected layout.
    """
    starttime = datetime.datetime.now()  # start of the run, for the timing report
    count = 0
    save_a = 0
    Unit_Mysql.set_info()  # supply the database user name and password
    Unit_Mysql.create_structured()
    seen_rooms = set()  # the listings contain repeated rooms; store each once

    for category in range(1, max_category):
        first_page = json.loads(open_html(_directory_url(category, 1)))
        pages = first_page['data']['pgcnt']
        for page in range(1, pages + 1):
            # Page 1 was already downloaded to learn the page count; reuse it
            # instead of requesting it a second time.
            if page == 1:
                listing = first_page
            else:
                listing = json.loads(open_html(_directory_url(category, page)))
            for data in listing['data']['rl']:
                rid = str(data['rid'])
                if rid in seen_rooms:
                    # NOTE(review): the first already-seen room ends processing
                    # of the rest of this page; `continue` would skip only the
                    # duplicate. Confirm which behaviour is intended.
                    break
                seen_rooms.add(rid)
                Unit_Mysql.insert_db(_build_host(data, rid))
                count += 1
                print("正在抓取第%d条数据,%s(%s:第%d/%d页)" % (count, data['nn'], data['c2name'], page, pages))
        if pages != 0:
            save_a = category

    print(save_a)
    endtime = datetime.datetime.now()
    # The elapsed value is a number and must be converted before concatenation;
    # total_seconds() also stays correct for runs longer than a day.
    print("Running Time:" + str(int((endtime - starttime).total_seconds())))
# Script entry point: start the crawl. There is no ``__main__`` guard, so
# this call also runs whenever the module is imported.
get_info()