This commit is contained in:
seaky
2019-06-20 17:11:05 +08:00
parent ee37a13fe6
commit 96ea50acee
276 changed files with 102064 additions and 0 deletions

notebook/fetch_cookbook.py Normal file

@@ -0,0 +1,175 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: Seaky
# @Date: 2019/6/19 14:40
# pip install requests beautifulsoup4
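#
# Fetch chapters of python3-cookbook from readthedocs and convert each
# chapter (and optionally each section) into Jupyter notebooks under ./ipynb/.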
import json
import re
from copy import deepcopy
from pathlib import Path
import requests
from bs4 import BeautifulSoup
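
# Skeleton of an nbformat 4 notebook; 'cells' is filled in before each
# file is written out.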
TEMPLATE = {
"cells": [],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": True,
"sideBar": True,
"skip_h1_title": True,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": False,
"toc_position": {},
"toc_section_display": True,
"toc_window_display": True
}
},
"nbformat": 4,
"nbformat_minor": 2
}


class Chapter:
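    """Fetch one chapter of python3-cookbook and convert it to notebooks."""
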
def __init__(self, chapter_address):
self.chapter_address = chapter_address
        # Base URL of the chapter page, i.e. everything up to the last path segment.
        self.path = re.sub(r'/[^/]+?$', '/', chapter_address)
self.ss = requests.session()

    def fetch(self, url):
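        """GET url and decode the body using the charset declared in its <head>."""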
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
print(url)
        raw = self.ss.get(url, headers=headers).content
        # Sniff the charset declared near the top of the document.
        m = re.search(r'charset=\W*(?P<charset>\w+)', raw[:200].decode(errors='ignore'))
        charset = m.group('charset') if m else 'utf-8'
        if charset == 'gb2312':
            # Pages declared as gb2312 often contain GBK characters; cp936 is a superset.
            charset = 'cp936'
        return raw.decode(encoding=charset)

    def fetch_list(self):
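        """Collect the chapter title, description and the list of section links."""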
content = self.fetch(self.chapter_address)
soup = BeautifulSoup(content, 'html.parser')
        # Drop the '\u00b6' permalink marker Sphinx appends to headings.
        self.chapter_title = soup.find('h1').text.replace('\u00b6', '')
self.chapter_desc = soup.find('p').text
self.sections = []
        for x in soup.find_all('a', class_='reference internal', href=re.compile(r'/p\d+_')):
if x['href'] not in self.sections:
self.sections.append(x['href'])

    def fetch_sections(self, sep=False):
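        """Fetch every section and write one notebook for the whole chapter;
        with sep=True also write one notebook per section."""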
cells = [{
"cell_type": "markdown",
"metadata": {},
"source": ['# {}\n {}'.format(self.chapter_title, self.chapter_desc)]
}]
dpath = Path('ipynb')
dpath.mkdir(exist_ok=True)
for href in self.sections[:]:
_cells = self.fetch_content(self.path + href)
if sep:
_dpath = dpath / self.chapter_title
_dpath.mkdir(exist_ok=True)
                # Also write this section as its own notebook.
                TEMPLATE['cells'] = _cells
                *_, section_name = href.split('/')
                with open(str(_dpath / '{}.ipynb'.format(section_name.split('.')[0])), 'w', encoding='utf-8') as f:
                    f.write(json.dumps(TEMPLATE, indent=2))
cells.extend(_cells)
        # Write the whole chapter as a single notebook.
        TEMPLATE['cells'] = cells
        with open(str(dpath / '{}.ipynb'.format(self.chapter_title)), 'w', encoding='utf-8') as f:
            f.write(json.dumps(TEMPLATE, indent=2))

def fetch_content(self, url):
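        """Turn one section page into a list of notebook cells."""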
content = self.fetch(url)
soup = BeautifulSoup(content, 'html.parser')
cell_markdown = {
"cell_type": "markdown",
"metadata": {},
"source": []
}
cell_code = {
"cell_type": "code",
"execution_count": None,
"metadata": {},
"outputs": [],
"source": []
}
cells = []
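        # Walk every tag in document order: headings and paragraphs become
        # markdown cells, <pre> blocks become code cells.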
        p_header = re.compile(r'^h(?P<level>\d)$')
for tag in [x for x in soup.descendants if x.name]:
if p_header.search(tag.name):
cell = deepcopy(cell_markdown)
cell['source'].append(
'{} {}\n'.format('#' * (int(p_header.search(tag.name).group('level')) + 1), tag.text))
cells.append(cell)
elif tag.name == 'p':
if 'Copyright' in tag.text:
continue
cell = deepcopy(cell_markdown)
cell['source'].append(tag.text)
cells.append(cell)
            elif tag.name == 'pre':
                if '>>>' not in tag.text:
                    # A plain code listing: keep the whole block as one code cell.
                    source = [re.sub(r'(^\n*|\n*$)', '', tag.text)]
                else:
                    # An interactive (REPL) transcript: keep the '>>>'/'...' input
                    # lines with the prompts stripped, and start a new cell
                    # whenever a run of input lines is interrupted by output.
                    source = []
                    for line in tag.text.split('\n'):
                        if re.search(r'^(>|\.){3}', line):
                            if re.search(r'^(>|\.){3}\s*$', line):
                                continue
                            source.append(re.sub(r'^(>|\.){3} ', '', line))
                        else:
                            if source:
                                cell = deepcopy(cell_code)
                                cell['source'].append(re.sub(r'(^\n*|\n*$)', '', '\n'.join(source)))
                                cells.append(cell)
                                source = []
                            else:
                                continue
if source:
cell = deepcopy(cell_code)
cell['source'].append('\n'.join(source))
cells.append(cell)
        # Strip leftover permalink markers from every cell.
        for cell in cells:
            for i, text in enumerate(cell['source']):
                cell['source'][i] = text.replace('\u00b6', '')
return cells


def fetch_all(sep=False):
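    """Fetch every chapter linked from the cookbook's index page."""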
    content = requests.get('https://python3-cookbook.readthedocs.io/zh_CN/latest/').content
    soup = BeautifulSoup(content, 'html.parser')
    # [2:15] selects the thirteen chapter links, skipping leading non-chapter entries.
    for x in soup.find_all('a', class_='reference internal', href=re.compile(r'chapters/p\d+'))[2:15]:
ch = Chapter('https://python3-cookbook.readthedocs.io/zh_CN/latest/' + x['href'])
ch.fetch_list()
ch.fetch_sections(sep=sep)


if __name__ == '__main__':
# ch = Chapter('https://python3-cookbook.readthedocs.io/zh_CN/latest/chapters/p01_data_structures_algorithms.html')
# ch.fetch_list()
# ch.fetch_sections()
fetch_all(sep=True)