From 5fed4aaa1e1b0216e7a724d57a9f128784d64c90 Mon Sep 17 00:00:00 2001 From: choocewhatulike <1901722105@qq.com> Date: Thu, 30 Mar 2017 16:41:17 +0800 Subject: [PATCH] fix parse_link in crawler --- crawler/code/crawling.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/crawler/code/crawling.py b/crawler/code/crawling.py index 01752ad83..65679bcf7 100644 --- a/crawler/code/crawling.py +++ b/crawler/code/crawling.py @@ -128,7 +128,11 @@ def parse_links(self, response): content_type = None encoding = None body = yield from response.read() - + if isinstance(response.url, str): + resp_url = response.url + else: + resp_url = response.url.scheme + '://' + \ + response.url.host + response.url.path if response.status == 200: content_type = response.headers.get('content-type') pdict = {} @@ -147,13 +151,13 @@ def parse_links(self, response): LOGGER.info('got %r distinct urls from %r', len(urls), response.url) for url in urls: - normalized = urllib.parse.urljoin(response.url, url) + normalized = urllib.parse.urljoin(resp_url, url) defragmented, frag = urllib.parse.urldefrag(normalized) if self.url_allowed(defragmented): links.add(defragmented) stat = FetchStatistic( - url=response.url, + url=resp_url, next_url=None, status=response.status, exception=None,