#!/usr/bin/env python3
# fetch_URL.py - Recursively fetch every link reachable from a starting URL.
import sys
import os
import requests
import bs4
archived = set()  # filenames of pages already visited and files already downloaded
def back_to_parent(url):
    """Strip the last path component, leaving the trailing '/'."""
    while url[-1] != '/':
        url = url[:-1]
    return url
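
# e.g. back_to_parent('http://example.com/a/b.html') returns 'http://example.com/a/'
# (URL shown is hypothetical, purely to illustrate the helper).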
def find_all_url(origin_url):
    res = requests.get(origin_url)
    if res.status_code == 404:
        print("%s is 404..." % origin_url)
        return
    res.raise_for_status()
    prev = back_to_parent(origin_url)
    if origin_url != sys.argv[1]:
        archived.add(origin_url.split('/')[-1])
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    for link in soup.find_all('a', href=True):
        url = link.get('href')
        # Resolve relative links against the current page's directory.
        if not url.startswith('http:') and not url.startswith('https:'):
            if url == '#':
                continue
            elif url.startswith('../'):
                # Go up one directory level before appending the rest.
                url = back_to_parent(prev[:-1]) + url[3:]
            else:
                url = prev + url
        filename = url.split('/')[-1]
        if filename in archived:
            continue
        if not url.endswith('.html'):
            # Non-HTML link: download the file into the psi/ directory.
            res = requests.get(url)
            if res.status_code == 404:
                continue
            res.raise_for_status()
            print('downloading %s...' % filename)
            with open('psi/' + filename, 'wb') as page:
                for chunk in res.iter_content(100000):
                    page.write(chunk)
            archived.add(filename)
        else:
            # HTML link: recurse into the page and harvest its links too.
            print('opening %s...' % url)
            find_all_url(url)
if __name__ == '__main__':
    if len(sys.argv) != 2:
        print('Usage: python fetch_URL.py [URL]')
        sys.exit()
    os.makedirs('psi', exist_ok=True)
    find_all_url(sys.argv[1])
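
# Example invocation (URL is hypothetical; point it at any page whose relative
# links you want to crawl and mirror into ./psi/):
#   python fetch_URL.py http://example.com/psi/index.html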