#coding=utf-8 import os import re import sys import MySQLdb from bs4 import BeautifulSoup path='bugs' #预编译 pattern0=re.compile(r'.*?漏洞标题:(.*)提交时间:(.*)') pattern2=re.compile(r'>(.*)') pattern3=re.compile(r'') pattern4=re.compile(r':(.*)') pattern5=re.compile(r'漏洞标题:(.*)') for docs in os.listdir(path): #打开文件,提取内容 if os.path.isdir('bugs/'+docs): print "目录跳过" continue doc=open('bugs/'+docs,'r') html=doc.read() doc.close() #提取信息 soup=BeautifulSoup(html,"html.parser") corps=soup.find_all('h3',class_='wybug_corp') corps=corps[0].find_all('a') corp=corps[0] corp=str(corp).replace(' ','').replace('\n','') authors=soup.find_all('h3',class_='wybug_author') authors=authors[0].find_all('a') author=authors[0] author=str(author).replace(' ','') types=soup.find_all('h3',class_='wybug_type') type0=str(types[0]).replace(' ','') title=re.findall(pattern0,html) if title: title1=title[0].replace(' ','').replace(' ','') else: title=soup.find_all('h3',class_='wybug_title') title0=title[0].text.encode('utf-8') title0=re.findall(pattern5,title0) title1=title0[0].replace('\n','').replace(' ','').replace(' ','') date=re.findall(pattern1,html) date1=date[0].replace(' ','') corp1=re.findall(pattern2,corp) author1=re.findall(pattern3,author) type1=re.findall(pattern4,type0) print title1,date1,author1[0],type1[0],corp1[0] #连接数据库 try: conn=MySQLdb.connect(host='localhost',port=3306,user='root',passwd='',db='wooyun',charset='utf8') cur=conn.cursor() reload(sys) sys.setdefaultencoding('utf-8') tmp=(title1,date1,author1[0],type1[0],corp1[0],docs) cur.execute("INSERT INTO `bugs`(`title`,`dates`,`author`,`type`,`corp`,`doc`) VALUES(%s,%s,%s,%s,%s,%s)",tmp) conn.commit() cur.close() conn.close() except MySQLdb.Error,e: print "Mysql Error %d: %s" % (e.args[0], e.args[1])