#!python3
# fetch_URL.py - Fetch all the links starting at URL.

import sys
import os

import requests
import bs4

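# Filenames already visited or downloaded.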
achieve = set()


def back_to_parent(url):
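    """Trim the last path segment, leaving the parent directory URL."""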
    while url[-1] != '/':
        url = url[:-1]
    return url


def find_all_url(origin_url):
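    """Fetch origin_url, downloading files and recursing into HTML pages."""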
    res = requests.get(origin_url)
    if res.status_code == 404:
        print("%s is 404..." % origin_url)
        return
    res.raise_for_status()
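
    # Record this page's filename so it is not visited again
    # (the start URL itself is exempt).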
    prev = back_to_parent(origin_url)
    if origin_url != sys.argv[1]:
        achieve.add(origin_url.split('/')[-1])
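
    # Scan every anchor on the page; only relative links are followed,
    # absolute http(s) URLs are skipped.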
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    for link in soup.find_all('a', href=True):
        url = link.get('href')
        if not url.startswith('http:') and not url.startswith('https:'):
            if url == '#':
                continue
            elif url.startswith('../'):
                # Resolve a single '../' against the parent of prev.
                url = back_to_parent(prev[:-1]) + url[3:]
            else:
                url = prev + url
            filename = url.split('/')[-1]
            if filename in achieve:
                continue
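            # Non-HTML links are files to download; HTML links are pages
            # to recurse into.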
            if not url.endswith('.html'):
                res = requests.get(url)
                if res.status_code == 404:
                    continue
                res.raise_for_status()
                print('downloading %s...' % filename)
                with open('psi/' + filename, 'wb') as page:
                    for chunk in res.iter_content(100000):
                        page.write(chunk)
                achieve.add(filename)
            else:
                print('opening %s...' % url)
                find_all_url(url)


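# Simple CLI: expects exactly one argument, the starting URL.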
if len(sys.argv) != 2:
    print('Usage: python fetch_URL.py [URL]')
    sys.exit()
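# Downloaded files are collected in a local psi/ directory.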
os.makedirs('psi', exist_ok=True)
find_all_url(sys.argv[1])
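
# Example invocation (hypothetical URL):
#     python fetch_URL.py https://example.com/files/index.html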