fixed a crawler bug
@@ -217,22 +217,19 @@ def js_extractor(response):
def handle_anchor(parent_url, url):
    if parent_url.count('/') > 2:
        replacable = re.search(r'/[^/]*?$', parent_url).group()
        if replacable != '/':
            parent_url = parent_url.replace(replacable, '')
    scheme = urlparse(parent_url).scheme
    if url[:4] == 'http':
        return url
    elif url[:2] == '//':
        return scheme + ':' + url
    elif url[:1] == '/':
    elif url.startswith('/') and parent_url.endswith('/'):
        return parent_url[:-1] + url
    elif url.startswith('/') or parent_url.endswith('/'):
        return parent_url + url
    else:
        if parent_url.endswith('/') or url.startswith('/'):
            return parent_url + url
        else:
            return parent_url + '/' + url
        host = urlparse(parent_url).netloc
        scheme = urlparse(parent_url).scheme
        parent_url = scheme + '://' + host
        return parent_url + '/' + url


def deJSON(data):
||||