From 5fed4aaa1e1b0216e7a724d57a9f128784d64c90 Mon Sep 17 00:00:00 2001
From: choocewhatulike <1901722105@qq.com>
Date: Thu, 30 Mar 2017 16:41:17 +0800
Subject: [PATCH] fix parse_links in crawler

---
 crawler/code/crawling.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/crawler/code/crawling.py b/crawler/code/crawling.py
index 01752ad83..65679bcf7 100644
--- a/crawler/code/crawling.py
+++ b/crawler/code/crawling.py
@@ -128,7 +128,11 @@ def parse_links(self, response):
         content_type = None
         encoding = None
         body = yield from response.read()
-
+        if isinstance(response.url, str):
+            resp_url = response.url
+        else:
+            resp_url = response.url.scheme + '://' + \
+            response.url.host + response.url.path
         if response.status == 200:
             content_type = response.headers.get('content-type')
             pdict = {}
@@ -147,13 +151,13 @@ def parse_links(self, response):
                     LOGGER.info('got %r distinct urls from %r',
                                 len(urls), response.url)
                 for url in urls:
-                    normalized = urllib.parse.urljoin(response.url, url)
+                    normalized = urllib.parse.urljoin(resp_url, url)
                     defragmented, frag = urllib.parse.urldefrag(normalized)
                     if self.url_allowed(defragmented):
                         links.add(defragmented)
 
         stat = FetchStatistic(
-            url=response.url,
+            url=resp_url,
             next_url=None,
             status=response.status,
             exception=None,