sqlmapapi_pi/search/baidu.py
#coding: utf-8
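# Search Baidu for a keyword, follow each result's rewritten redirect link to
# recover the real target URL, and collect GET-style URLs that sqlmap can
# test, appending every new target to data/targets.txt. (Python 2 module.)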
from __future__ import unicode_literals
import urllib2
import urllib
import re
import random
import logging
__all__ = ["geturl"]
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533+ (KHTML, like Gecko) Element Browser 5.0',
    'IBM WebExplorer /v0.94',
    'Galaxy/1.0 [en] (Mac OS X 10.5.6; U; en)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
    'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
    'Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; TheWorld)',
]
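# A User-Agent is drawn from this pool at random for each redirect-resolving
# request in geturl(), so the crawler's traffic looks less uniform.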
logger = logging.getLogger('app.baidu')

def baidu_search(keyword, pn):
    # rn is the number of results per page; pn is the offset of the first
    # result to show.
    p = {'wd': keyword}
    url = "http://www.baidu.com/s?" + urllib.urlencode(p) + "&pn={0}&cl=3&rn=10".format(pn)
    res = urllib2.urlopen(url)
    html = res.read()
    return html
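
# For illustration: baidu_search('test', 0) requests
#   http://www.baidu.com/s?wd=test&pn=0&cl=3&rn=10
# and returns the raw result-page HTML that the helpers below pick apart
# (the parsing regexes are tied to the 2016-era Baidu page layout).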

def getList(regex, text):
    # Collect every match of regex in text; duplicates are filtered out later
    # in geturl().
    return re.findall(regex, text)

def getMatch(regex, text):
    # Return the first match of regex in text, or '' when nothing matches.
    res = re.findall(regex, text)
    if res:
        return res[0]
    return ''
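
# Note: with a regex containing several capture groups, re.findall yields
# tuples, so getMatch returns a tuple of groups; geturl() relies on this to
# unpack the (title, url) pair from the data-tools attribute.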

def is_get(url):
    # Check whether url is a GET-style link that sqlmap can work with; return
    # its base path (everything before the query string) if so, else None.
    regex = r'(\S*?)\?.*=.*'
    res = re.match(regex, url)
    if res:
        return res.group(1)
    return None
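
# For illustration (example.com is a placeholder, not from the original code):
#   is_get('http://example.com/news.asp?id=1')  -> 'http://example.com/news.asp'
#   is_get('http://example.com/about.html')     -> None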

def geturl(keyword, pages):
    # Crawl `pages` result pages for `keyword`, resolve each result's Baidu
    # redirect URL to the real target, and return the list of unique targets.
    targets = []
    hosts = []
    for page in range(int(pages)):
        pn = page * 10  # offset of the first result on this page
        html = baidu_search(keyword, pn)
        content = unicode(html, 'utf-8', 'ignore')
        # Split the page into per-result blocks.
        arrList = getList(u"<div class=\"f13\">(.*)</div>", content)
        for item in arrList:
            # Each block's data-tools attribute holds the result title and
            # Baidu's rewritten (redirect) URL.
            regex = u"data-tools='\{\"title\":\"(.*)\",\"url\":\"(.*)\"\}'"
            link = getMatch(regex, item)
            if not link:
                continue
            url = link[1]  # Baidu's rewritten URL
            try:
                domain = urllib2.Request(url)
                domain.add_header('User-Agent', random.choice(USER_AGENTS))
                domain.add_header('Connection', 'keep-alive')
                response = urllib2.urlopen(domain)
                uri = response.geturl()  # real URL after the redirect
                urs = is_get(uri)  # base path if it is a GET-style link
                # Skip exact duplicates and GET endpoints whose base path was
                # already seen (non-GET URLs all share the None key, so only
                # the first of them is kept).
                if (uri in targets) or (urs in hosts):
                    continue
                targets.append(uri)
                hosts.append(urs)
                with open('data/targets.txt', 'a') as f1:  # persist targets
                    f1.write(uri + '\n')
            except Exception:
                # Skip results whose redirect cannot be resolved.
                continue
    logger.info("URLs have been grabbed.")
    return targets
if __name__ == '__main__':
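    # Minimal manual smoke test: a sketch that assumes the script is run from
    # the project root so the relative data/targets.txt path is writable, and
    # that the dork keyword below is purely illustrative, not part of the
    # original module.
    logging.basicConfig(level=logging.INFO)
    for u in geturl('inurl:news.php?id=', 1):
        print u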