#!/usr/bin/env python3
# fetch_URL.py - Recursively fetch every link reachable from a starting URL.
import sys
import os
import requests
import bs4
archived = set()  # filenames of pages already visited and files already downloaded
def back_to_parent(url):
    """Strip the last path component, leaving the trailing '/'."""
    while url[-1] != '/':
        url = url[:-1]
    return url
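
# e.g. back_to_parent('http://example.com/a/b.html') returns 'http://example.com/a/'
# (URL shown is hypothetical, purely to illustrate the helper).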
def find_all_url(origin_url):
    res = requests.get(origin_url)
    if res.status_code == 404:
        print("%s is 404..." % origin_url)
        return
    res.raise_for_status()
    prev = back_to_parent(origin_url)
    if origin_url != sys.argv[1]:
        archived.add(origin_url.split('/')[-1])
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    for link in soup.find_all('a', href=True):
        url = link.get('href')
        # Resolve relative links against the current page's directory.
        if not url.startswith('http:') and not url.startswith('https:'):
            if url == '#':
                continue
            elif url.startswith('../'):
                # Go up one directory level before appending the rest.
                url = back_to_parent(prev[:-1]) + url[3:]
            else:
                url = prev + url
        filename = url.split('/')[-1]
        if filename in archived:
            continue
        if not url.endswith('.html'):
            # Non-HTML link: download the file into the psi/ directory.
            res = requests.get(url)
            if res.status_code == 404:
                continue
            res.raise_for_status()
            print('downloading %s...' % filename)
            with open('psi/' + filename, 'wb') as page:
                for chunk in res.iter_content(100000):
                    page.write(chunk)
            archived.add(filename)
        else:
            # HTML link: recurse into the page and harvest its links too.
            print('opening %s...' % url)
            find_all_url(url)
if __name__ == '__main__':
    if len(sys.argv) != 2:
        print('Usage: python fetch_URL.py [URL]')
        sys.exit()
    os.makedirs('psi', exist_ok=True)
    find_all_url(sys.argv[1])
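
# Example invocation (URL is hypothetical; point it at any page whose relative
# links you want to crawl and mirror into ./psi/):
#   python fetch_URL.py http://example.com/psi/index.html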