From 463d6d7585784f4fb95cc5b63bb6b332fd328d75 Mon Sep 17 00:00:00 2001
From: wendell <727169395@qq.com>
Date: Tue, 7 May 2019 16:07:29 +0800
Subject: [PATCH] =?UTF-8?q?=E8=8E=B7=E5=8F=96=E7=BB=9F=E8=AE=A1=E7=94=A8?=
 =?UTF-8?q?=E5=8C=BA=E5=88=92=E4=BB=A3=E7=A0=81=E5=92=8C=E5=9F=8E=E4=B9=A1?=
 =?UTF-8?q?=E5=88=92=E5=88=86=E4=BB=A3=E7=A0=81=20(=E4=BA=8C=E7=BA=A7?=
 =?UTF-8?q?=EF=BC=9A=E5=9C=B0=E7=BA=A7=E5=B8=82)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review note: CitySpider.py below was revised during review, so the post-image
blob id on its index line no longer matches the content of the hunk.

 source/area/china/city/CitySpider.py     | 115 ++++++++++++++++++++++
 source/area/china/city/ProvinceSpider.py |   2 -
 source/area/china/city/__init__.py       |   0
 3 files changed, 115 insertions(+), 2 deletions(-)
 create mode 100644 source/area/china/city/CitySpider.py
 create mode 100644 source/area/china/city/__init__.py

diff --git a/source/area/china/city/CitySpider.py b/source/area/china/city/CitySpider.py
new file mode 100644
index 0000000..39f433e
--- /dev/null
+++ b/source/area/china/city/CitySpider.py
@@ -0,0 +1,115 @@
+# -*- coding: UTF-8 -*-
+"""
+@description Fetch statistical division codes and urban-rural classification codes (level 2: prefecture-level cities)
+@author wendell
+"""
+from source.area.china.util import DbUtil, RequestUtil
+from pyquery import PyQuery
+import time
+import random
+
+
+def _parse_cities(doc, province):
+    """
+    Build the city documents listed on one province page.
+    :param doc PyQuery document of the province page
+    :param province province document the page belongs to
+    :return: list of city dicts, empty when no row carries both a code and a name
+    """
+    cities = []
+    for tr in doc('.citytr').items():
+        tr.make_links_absolute()
+        links = tr('a')
+        data = links.text().split()
+        if len(data) < 2:
+            # A row without both code and name would raise IndexError below
+            continue
+        cities.append({
+            'code': data[0],  # statistical summary identification code / division code
+            'name': data[1],  # city name
+            'province_id': province.get('_id'),  # province id
+            'province_name': province.get('name'),  # province name
+            'url': links.attr('href'),  # link to the next level
+            'searched': False  # whether the next-level link has been crawled
+        })
+    return cities
+
+
+def _crawl_province(db, province, encoding, headers):
+    """
+    Fetch one province page and store the cities found on it.
+    :param db database handle returned by DbUtil.get_db()
+    :param province province document read from the 'province' collection
+    :param encoding page encoding
+    :param headers request headers
+    :return: True when the cities were stored and the province was flagged as searched
+    """
+    url = province.get('url')
+    if not url:
+        return False
+    print('开始获取[', province.get('name'), ']下的二级地级市...')
+    res = RequestUtil.get(url=url, headers=headers, encoding=encoding)
+    if not res:
+        print(province.get('name'), '请求失败...')
+        return False
+    doc = PyQuery(res, url=url, encoding=encoding)
+    cities = _parse_cities(doc, province) if doc else []
+    if not cities:
+        # insert_many() rejects an empty list, and flagging the province as
+        # searched here would silently drop its cities
+        print('二级地级市信息获取错误,检查页面变化...')
+        return False
+    print(cities)
+    # Store the cities first: if the insert fails the province stays pending and is retried
+    db['city'].insert_many(cities)
+    db['province'].update_one({'_id': province.get('_id')}, {'$set': {'searched': True}})
+    return True
+
+
+def start_requests(encoding, headers):
+    """
+    Crawl level 2 (prefecture-level cities) for every province still pending.
+    :param encoding page encoding
+    :param headers request headers
+    :return:
+    """
+    db = DbUtil.get_db()
+    try:
+        while True:
+            try:
+                # Provinces that have a URL and whose sub-level has not been crawled yet;
+                # materialised up front so the cursor cannot expire during slow requests
+                provinces = list(db['province'].find({'searched': False, 'url': {'$ne': None}}))
+                if not provinces:
+                    break
+                for province in provinces:
+                    _crawl_province(db, province, encoding, headers)
+            except Exception as e:
+                print(e)
+            # Pause before every new pass so retries never hammer the server
+            print('休眠中.....')
+            time.sleep(round(5 + random.uniform(1, 3), 2))
+        print('数据获取完毕')
+    finally:
+        # Release the connection even when the crawl is interrupted
+        DbUtil.close_client(db.client)
+
+
+def main():
+    """Run the city crawl against www.stats.gov.cn with browser-like headers."""
+    headers = {
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
+        'Accept-Encoding': 'gzip, deflate',
+        'Accept-Language': 'zh-CN,zh;q=0.9',
+        'Cache-Control': 'max-age=0',
+        'Connection': 'keep-alive',
+        'Host': 'www.stats.gov.cn',
+        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 '
+                      'Safari/537.36'
+    }
+    encoding = 'gb2312'
+    start_requests(encoding, headers)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/source/area/china/city/ProvinceSpider.py b/source/area/china/city/ProvinceSpider.py
index 1e11dc0..53c6b8c 100644
--- a/source/area/china/city/ProvinceSpider.py
+++ b/source/area/china/city/ProvinceSpider.py
@@ -27,8 +27,6 @@ def start_requests(domain_url, encoding, headers):
     client = DbUtil.get_client()
     # 选择数据库
     db = client["python"]
-    # 先删除表
-    db['province'].drop()
     provinces = []
     for td in doc('td').items():
         a_tag = td('a')
diff --git a/source/area/china/city/__init__.py b/source/area/china/city/__init__.py
new file mode 100644
index 0000000..e69de29