38 lines
985 B
Python
38 lines
985 B
Python
|
|
#!/usr/bin/env python
|
|||
|
|
# -*- encoding: utf-8 -*-
|
|||
|
|
"""
|
|||
|
|
Topic: 每天一句情话
|
|||
|
|
"""
|
|||
|
|
import requests
|
|||
|
|
import re
|
|||
|
|
from io import StringIO
|
|||
|
|
import json
|
|||
|
|
import xml.etree.ElementTree as ET
|
|||
|
|
|
|||
|
|
|
|||
|
|
def extract_content(xml):
|
|||
|
|
"""xpath解析,或者使用lxml库"""
|
|||
|
|
doc = ET.fromstring(xml)
|
|||
|
|
tt= doc.findall("//div[@class='articleText']")
|
|||
|
|
print(tt)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def lover_sentences_01():
|
|||
|
|
"""获取情话网的情话列表!"""
|
|||
|
|
urls = ['http://www.siandian.com/qinghua/510.html',
|
|||
|
|
'http://www.siandian.com/qinghua/510_2.html',
|
|||
|
|
'http://www.siandian.com/qinghua/1608.html']
|
|||
|
|
for url in urls:
|
|||
|
|
# 读取返回结果
|
|||
|
|
r = requests.get(url)
|
|||
|
|
# 改变r.encoding
|
|||
|
|
encoding = re.search('content="text/html;\s*charset=(.*?)"', r.text).group(1)
|
|||
|
|
r.encoding = encoding
|
|||
|
|
finds = re.finditer(r'<p>\s*(((?!</).)+)\s*</p>', r.text)
|
|||
|
|
for f in finds:
|
|||
|
|
print(f.group(1))
|
|||
|
|
|
|||
|
|
|
|||
|
|
if __name__ == '__main__':
|
|||
|
|
lover_sentences_01()
|