@@ -26,11 +26,12 @@ def __init__(self, **kwargs):
2626 :return: :class:Producer object
2727 :rtype: Producer
2828 """
29+ self .context = kwargs .pop ('context' )
2930 self .__mongo_db = kwargs .pop ('mongo_db' )
3031 self .mongo_handle = None
3132 self .redis_handle = RedisUtils (db = kwargs .pop ('redis_db' ), tld = kwargs .pop ('tld' ))
3233
33- def produce (self , tspider_context ):
34+ def produce (self ):
3435 # mongodb with multipleprocessing must be init after fork
3536 self .mongo_handle = MongoUtils (db = self .__mongo_db )
3637 if not self .redis_handle .connected or not self .mongo_handle .connected :
@@ -40,8 +41,9 @@ def produce(self, tspider_context):
4041 while True :
4142 try :
4243 _ , req = self .redis_handle .fetch_one_result ()
43- remainder_result = self .redis_handle .result_counts
44- logger .debug ('got req, %d results left' % remainder_result )
44+ with self .context ['lock' ]:
45+ self .context ['result_counts' ].value -= 1
46+ logger .debug ('got req, %d results left' % self .context ['result_counts' ].value )
4547 self .proc_req (req )
4648 except :
4749 logger .exception ('produce exception!' )
@@ -53,11 +55,10 @@ def produce(self, tspider_context):
5355 self .mongo_handle .connect ()
5456 time .sleep (10 )
5557 finally :
56- if remainder_result == 0 and self .redis_handle .task_counts == 0 :
57- with tspider_context ['lock' ]:
58- live_spider_counts = tspider_context ['live_spider_counts' ].value
59- if live_spider_counts == 0 :
60- tspider_context ['task_done' ].set ()
58+ with self .context ['lock' ]:
59+ if self .context ['result_counts' ].value == 0 :
60+ if self .context ['live_spider_counts' ].value == 0 and self .context ['task_counts' ].value == 0 :
61+ self .context ['task_done' ].set ()
6162
6263 def proc_req (self , req ):
6364 try :
@@ -99,16 +100,18 @@ def proc_req(self, req):
99100 elif method == 'GET' :
100101 # new host found, add index page to task queue
101102 if self .redis_handle .get_hostname_reqcount (url .hostname ) == 0 :
102- self .redis_handle . create_task_from_url (URL (url .index_page ), add_whitelist = False )
103+ self .create_task_from_url (URL (url .index_page ), add_whitelist = False )
103104 # check url validation inside create_url_task
104- self .redis_handle . create_task_from_url (url , add_whitelist = False )
105+ self .create_task_from_url (url , add_whitelist = False )
105106 else :
106107 # not GET nor POST
107108 logger .error ('HTTP Verb %s found!' % method )
108109 logger .debug (data )
109110
110- def create_task_from_url (self , url ):
111- self .redis_handle .create_task_from_url (url )
111+ def create_task_from_url (self , url , ** kwargs ):
112+ with self .context ['lock' ]:
113+ if self .redis_handle .create_task_from_url (url , ** kwargs ):
114+ self .context ['task_counts' ].value += 1
112115
113116 def create_task_from_file (self , fileobj ):
114117 """
@@ -121,7 +124,7 @@ def create_task_from_file(self, fileobj):
121124 line = line .strip ()
122125 if not line : continue
123126 url = URL (line )
124- self .redis_handle . create_task_from_url (url )
127+ self .create_task_from_url (url )
125128
126129
127130if __name__ == '__main__' :
0 commit comments