# peye.py
# coding: utf-8
import datetime
import hashlib
import json
import re
import time
from urllib import parse

import execjs
import jsonpath
import requests
from lxml import etree
from requests.adapters import HTTPAdapter
from w3lib.html import remove_tags

from utils import user_agent, config
from utils.log import get_logger
# Module-level logger obtained from the project's logging helper.
logger = get_logger(__name__)
# Wall-clock time at import, multiplied by 10000 and rounded to an int.
# NOTE(review): not referenced anywhere in the visible code, and the 10000
# factor yields neither seconds nor milliseconds -- confirm it is still needed.
timestamp = int(round(time.time() * 10000))
class PeyeSpider(object):
    """Crawler for article search results on p2peye.com (网贷天眼).

    For every keyword in ``config.keywords_list`` the spider queries the
    site's h5 search endpoint, solves the JavaScript cookie challenge when
    the server answers with HTTP 521, then visits each article found and
    prints a record for those published today or later.
    """

    def __init__(self, timeout=10):
        """Set up headers, the search URL and a retrying HTTP session.

        Args:
            timeout: Seconds passed as ``timeout`` to every HTTP request, so
                that a stalled connection fails instead of hanging forever.
        """
        self.headers = {"User-Agent": user_agent.UserAgent(mobile_ua=True)}
        self.url = 'https://www.p2peye.com/search.php?'
        # md5 digests of the titles already handled, used for de-duplication.
        self.title_hash = list()
        self.timeout = timeout
        self.session = requests.session()
        # Retry failed connections up to 3 times on both schemes.
        max_retries = 3
        adapter = HTTPAdapter(max_retries=max_retries)
        self.session.mount('http://', adapter)
        self.session.mount('https://', adapter)
        # NOTE(review): requests does not document a Session.keep_alive
        # attribute, so this assignment likely has no effect -- confirm.
        self.session.keep_alive = False

    def _parse_js(self, script):
        """Decode the challenge script body of a 521 response.

        The leading ``<script>`` tag and everything from ``while(z++)`` on
        are dropped, two accessor functions are appended, and the result is
        compiled with execjs. The decoder is then tried with ``z`` through
        ``z + 9`` until its output contains the redirect statement.

        Args:
            script: Text of the 521 response.

        Returns:
            The decoded JavaScript source containing the cookie assignment.

        Raises:
            Exception: If none of the ten attempts yields the expected code.
        """
        js = script.replace("<script>", "", 1).split("while(z++)", 1)[0]
        js += r'function get_z(){return z}function get_y(z){return y.replace(/\b\w+\b/g, function (y) {return x[f(y, z) - 1] || ("_" + y)})}'
        ctx = execjs.compile(js)
        z = ctx.call("get_z")
        for offset in range(10):
            decoded = ctx.call("get_y", z + offset)
            if "setTimeout('location.href=" in decoded:
                return decoded
        raise Exception("解析js失败")

    @staticmethod
    def _parse_cookie(js):
        """Extract the ``__jsl_clearance`` cookie from decoded challenge code.

        Args:
            js: Decoded JavaScript as returned by ``_parse_js``.

        Returns:
            A one-item dict mapping the cookie name to its value.

        Raises:
            ValueError: If the cookie expression is not present in ``js``.
        """
        match = re.search(
            r"(__jsl_clearance=\d+\.?\d+\|0\|)'\+(\(function\(\).+)\+';Expires=", js)
        if match is None:
            raise ValueError("__jsl_clearance expression not found in decoded js")
        cookie_string, anonymous_function = match.groups()
        # The tail of the cookie value is computed by an anonymous JS function.
        result = execjs.eval(anonymous_function)
        # Split on the first '=' only: the computed part may itself contain '='.
        key, _, value = f"{cookie_string}{result}".partition("=")
        return {key: value}

    def get_cookies(self, script):
        """Return the challenge cookie dict for a 521 response body."""
        js = self._parse_js(script)
        return self._parse_cookie(js)

    def efactoring_cookie(self):
        """Run the search for every configured keyword and crawl the results.

        A 521 answer triggers the cookie challenge and one retry; a 200
        answer is crawled directly; any other status stops the whole run.
        """
        for keyword in config.keywords_list:
            payload = {
                "mod": "h5",
                "keywords": keyword,
                "ajax": "1",
                "page": "1",
            }
            first_res = self.session.get(
                self.url, headers=self.headers, params=payload, timeout=self.timeout)
            if first_res.status_code == 521:
                # The site changes its page rules often; a 521 carries a JS
                # challenge that must be solved to obtain the access cookie.
                cookie_dict = self.get_cookies(first_res.text)
                requests.utils.add_dict_to_cookiejar(self.session.cookies, cookie_dict)
                res = self.session.get(
                    self.url, headers=self.headers, params=payload, timeout=self.timeout)
                self.main_crawler(res)
            elif first_res.status_code == 200:
                self.main_crawler(first_res)
            else:
                logger.error(
                    f"GET {self.url} keyword={keyword!r} returned "
                    f"{first_res.status_code}; stopping")
                break

    @staticmethod
    def _title_sign(title):
        """Return the hex md5 digest of ``title`` (UTF-8 encoded)."""
        return hashlib.md5(title.encode(encoding='utf-8')).hexdigest()

    @staticmethod
    def _published_time(ld_json_text):
        """Read ``pubDate`` from an ld+json payload, with 'T' turned into ' '.

        The payload comes from a remote page, so it is parsed with
        ``json.loads`` rather than ``eval``: eval would execute arbitrary code
        and also fails on the JSON literals true/false/null.

        Returns:
            The publication time string, or None when it is absent.
        """
        data = json.loads(ld_json_text, strict=False)
        if not isinstance(data, dict):
            return None
        pub_date = data.get('pubDate')
        if not isinstance(pub_date, str) or not pub_date:
            return None
        return pub_date.replace('T', ' ')

    @staticmethod
    def _is_current(published_time, today):
        """Tell whether ``published_time`` falls on ``today`` or later.

        Both values are expected to start with a zero-padded ``YYYY-MM-DD``
        date, which makes plain string comparison chronological. Only the
        date prefix is compared so that a date-only value equal to ``today``
        is accepted as well.
        """
        return published_time[:len(today)] >= today

    def _crawl_article(self, item):
        """Fetch one search hit and print its record if it is current."""
        if item.get('index') == 'post':  # skip platform forum posts
            return
        url = item.get('url')
        subject = item.get('subject')
        if not url or subject is None:
            return
        article_url = "https:" + url
        article_title = remove_tags(subject)
        title_sign = self._title_sign(article_title)
        if title_sign in self.title_hash:  # de-duplicate by title
            return
        self.title_hash.append(title_sign)
        response = self.session.get(
            article_url, headers=self.headers, timeout=self.timeout)
        if response.status_code != 200:
            return
        tree = etree.HTML(response.text)
        ld_json = tree.xpath(".//script[@type='application/ld+json']/text()")
        if not ld_json:
            return
        published_time = self._published_time(ld_json[0])
        if published_time is None:
            return
        today = datetime.datetime.today().strftime("%Y-%m-%d")
        if self._is_current(published_time, today):
            information_data = {
                "platform_name": "网贷天眼",
                "title_sign": title_sign,
                "published_time": published_time,
                "article_title": article_title,
                "article_url": article_url,
                "create_time": int(time.time())
            }
            print(information_data)

    def main_crawler(self, response):
        """Process one search response: visit and report each listed article.

        Never raises: failures are logged, and a failure on one article does
        not prevent the remaining articles from being processed.

        Args:
            response: The search endpoint's HTTP response (JSON body expected).
        """
        try:
            result = response.json()
            if result.get('message') != 'ok':
                return
            # jsonpath returns False (not an empty list) when nothing matches.
            matches = jsonpath.jsonpath(result, "$...list")
        except Exception as exc:
            logger.error(f"GET {self.url} unreadable search response: {exc!r}")
            return
        if not matches or not isinstance(matches[0], list):
            return
        for item in matches[0]:
            try:
                self._crawl_article(item)
            except Exception as exc:
                # Best effort: report the broken article and carry on.
                logger.error(f"GET {self.url} failed to process article: {exc!r}")
def main():
    """Create a PeyeSpider and run its keyword crawl once."""
    PeyeSpider().efactoring_cookie()


if __name__ == '__main__':
    main()