fixed a crawler bug

This commit is contained in:
Somdev Sangwan
2019-04-30 22:00:52 +05:30
committed by GitHub
parent d923cb99e3
commit 3e3b719157

View File

@@ -217,22 +217,19 @@ def js_extractor(response):
def handle_anchor(parent_url, url):
    """Resolve an anchor href found on *parent_url* into an absolute URL.

    Args:
        parent_url: absolute URL of the page the anchor was found on.
        url: raw href value (absolute, scheme-relative, root-relative,
            or a bare relative path).

    Returns:
        The absolute URL the anchor points to, as a string.
    """
    # Strip the last path segment of the parent (e.g. ".../a/b" -> ".../a")
    # so relative links resolve against the parent directory, not the page.
    if parent_url.count('/') > 2:
        match = re.search(r'/[^/]*?$', parent_url)
        if match.group() != '/':
            # Slice at the match position instead of str.replace(): replace()
            # would strip EVERY occurrence of the tail, corrupting URLs whose
            # host or earlier path repeats the final segment.
            parent_url = parent_url[:match.start()]
    scheme = urlparse(parent_url).scheme
    if url[:4] == 'http':
        # Already absolute.
        return url
    elif url[:2] == '//':
        # Scheme-relative: inherit the parent's scheme.
        return scheme + ':' + url
    elif url.startswith('/') and parent_url.endswith('/'):
        # Both sides carry a slash: drop one to avoid "//" in the path.
        return parent_url[:-1] + url
    elif url.startswith('/') or parent_url.endswith('/'):
        # Exactly one side carries the separating slash.
        return parent_url + url
    # Neither side has a slash: rebuild the site root from scheme + host and
    # join from there (fallback for anchors that don't fit the cases above).
    host = urlparse(parent_url).netloc
    scheme = urlparse(parent_url).scheme
    parent_url = scheme + '://' + host
    return parent_url + '/' + url
def deJSON(data):