Skip to content

Commit b83430f

Browse files
committed
add bihuspider|test
1 parent 7b42231 commit b83430f

File tree

8 files changed

+110
-16
lines changed

8 files changed

+110
-16
lines changed

app/celery.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def init_sdks():
2828
s.app = capp
2929
name = s.__str__()
3030
log.info("load %s", name)
31-
tasks.__dict__[name] = s.ptask(name)
31+
tasks.__dict__[name] = s.ptask(name, rate_limit='10/m')
3232
queues.append(Queue(name, exchange=Exchange(name, type='direct'), routing_key=name))
3333

3434
capp.conf.update(

app/register.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,6 @@
66

77
# sdk 要先引入到这里
88

9-
from sdks.test import TestTask, LagouTask
9+
from sdks.test import TestTask, LagouTask, BihuTask
1010

11-
_all_sdk_ = [TestTask, LagouTask]
11+
_all_sdk_ = [TestTask, LagouTask, BihuTask]

app/task.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,10 @@ def _instance(*args, **kwargs):
4343
return _instance
4444

4545
@classmethod
46-
def send(cls, tasks: list, group: str):
46+
def send(cls, tasks: list, group: str, source: str):
4747
name = cls.__str__()
4848
log.info("%s send task", name)
49-
return cls.app.send_task(name, kwargs={'tasks':tasks, 'group':group}, queue=name, routing_key=name)
49+
return cls.app.send_task(name, kwargs={'tasks':tasks, 'group':group, 'source':source}, queue=name, routing_key=name)
5050

5151
def start(self):
5252
self.log_task()

conf/dev_config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import redis
99
import pymongo
1010

11-
LOGPATH = '/Users/mioji/Desktop/newpy/log' # os.environ.get('pspider_log')
11+
LOGPATH = "/home/ubuntu/log" # os.environ.get('pspider_log')
1212
PROXY = os.environ.get('psoider_proxy')
1313

1414
celery_broker = 'mongodb://127.0.0.1:27017/dev_broker'

example/testspider.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def parse_data(self, resp):
4141
class SiteSpider(Pspider):
4242

4343
def task(self):
44-
return "https://www.google.com"
44+
return "https://www.zhihu.com/people/pyy-69-54/following"
4545

4646
def req_resp(self):
4747

@@ -53,6 +53,7 @@ def first_page():
5353
},
5454
"response":{
5555
"handler": self.parse_data,
56+
'result_tag': 'test'
5657
}}
5758
yield first_page
5859

@@ -61,12 +62,15 @@ def parse_data(self, resp):
6162
return 'skr'
6263

6364
if __name__ == '__main__':
65+
t = SiteSpider()
66+
t.start()
67+
6468
# sp = SiteSpider()
6569
# sp.start()
66-
sp = LagouSpider()
67-
sp.tasks = ['https://www.lagou.com/zhaopin/Python/1']
68-
sp.start()
69-
70-
for s in sp.result['job'].export_sql('test.test'):
71-
print(s)
70+
# sp = LagouSpider()
71+
# sp.tasks = ['https://www.lagou.com/zhaopin/Python/1']
72+
# sp.start()
73+
#
74+
# for s in sp.result['job'].export_sql('test.test'):
75+
# print(s)
7276
# sp.result['job'].export_csvfile('/Users/mioji/Desktop/newpy/pspider/example/lagoutest.csv')

example/zhihuspider.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# -*- coding: utf-8 -*-
2+
# @Time : 2019/3/26 19:19
3+
# @Author : py
4+
5+
# @File : zhihuspider.py
6+
# @Software: PyCharm
7+
8+
from spider.pspider import Pspider, req
9+
from spider.model import BaseModel
10+
11+
class BihuSpider(Pspider):
12+
13+
def task(self):
14+
# p = 'pyy-69-54'
15+
# return 'https://www.zhihu.com/api/v4/members/{}/followees?offset=0&limit=20'.format(p)
16+
return self.tasks
17+
18+
def req_resp(self):
19+
@req()
20+
def pages():
21+
url = self.task()
22+
return {"request": {
23+
'url': url,
24+
},
25+
"response": {
26+
"handler": self.parse_data,
27+
"result_tag": 'data'
28+
}}
29+
yield pages
30+
31+
def parse_data(self, resp):
32+
if resp.json()['paging']['is_end']:
33+
npage = ''
34+
else:
35+
npage = resp.json()['paging']['next'].replace('members', 'api/v4/members')
36+
res = {
37+
'next_page': npage,
38+
'data': list(map(lambda x:{'name':x['name'], 'url_token':x['url_token']}, resp.json()['data']))
39+
}
40+
41+
return res
42+
43+
if __name__ == '__main__':
44+
sp = BihuSpider()
45+
sp.tasks = 'https://www.zhihu.com/api/v4/members/pyy-69-54/followees?limit=20&offset=40'
46+
sp.start()
47+
print(sp.result['data'])

sdks/test.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,12 @@
66

77
from app.task import Task
88
from example.testspider import LagouSpider
9+
from example.zhihuspider import BihuSpider
10+
from conf.config import redis_client, mongo_storage
11+
import time
12+
from plogger import get_logger
13+
14+
log = get_logger('testsdks')
915

1016
class TestTask(Task):
1117

@@ -22,5 +28,32 @@ def start(self):
2228
for d in spider.result['job'].export_sql('test.test'):
2329
print(d)
2430

31+
32+
class BihuTask(Task):
33+
34+
def create_task(self, data):
35+
for k in data:
36+
if redis_client.sadd('bihuset',k['url_token']):
37+
redis_client.lpush('bihutask',k['url_token'] )
38+
log.info('now task number {} {}'.format(redis_client.llen('bihutask'), redis_client.scard('bihuset')))
39+
40+
def execute(self):
41+
42+
sp = BihuSpider() # 按照用户为粒度
43+
sp.tasks = 'https://www.zhihu.com/api/v4/members/{}/followees?offset=0&limit=20'.format(self.tasks)
44+
sp.start()
45+
mongo_storage['dev']['bihu'].insert({'name':self.tasks, 'followee': sp.result['data']['data']})
46+
self.create_task(sp.result['data']['data'])
47+
for _ in range(100):
48+
time.sleep(3)
49+
if sp.result['data']['next_page']:
50+
sp.tasks = sp.result['data']['next_page']
51+
sp.start()
52+
mongo_storage['dev']['bihu'].insert({'name': self.tasks, 'followee': sp.result['data']['data']})
53+
self.create_task(sp.result['data']['data'])
54+
else:
55+
break
56+
57+
2558
if __name__ == '__main__':
26-
TestTask(tasks=[]).log_task()
59+
BihuTask(tasks='', group='test', source='ptest').execute()

sender.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,20 @@
77
from app.celery import capp
88
from app.task import Task
99
from app.register import _all_sdk_
10+
from conf.config import redis_client
11+
from functools import partial
12+
from apscheduler.schedulers.blocking import BlockingScheduler
13+
schedule = BlockingScheduler()
14+
15+
def bihusender():
16+
test = _all_sdk_[-1]
17+
t = redis_client.rpop('bihutask')
18+
test.send(tasks=t.decode(), source='follow', group='20190327')
19+
20+
schedule.add_job(bihusender,'cron', second='*/10', max_instances=1)
1021

1122

1223

1324
if __name__ == '__main__':
1425
# for i in range(10):
15-
test = _all_sdk_[1]
16-
test.send(tasks=['https://www.lagou.com/zhaopin/Python/1'])
26+
schedule.start()

0 commit comments

Comments
 (0)