# core/photon.py -- Bolt's lightweight crawler (originally ~63 lines, 2.4 KiB, Python)
# Let's import what we need
import concurrent.futures
from re import findall
from urllib.parse import urlparse  # for python3

from core.colors import run
from core.requester import requester
from core.utils import getUrl, getParams, remove_file
from core.zetanize import zetanize
def photon(seedUrl, headers, depth, threadCount):
    """Crawl the target starting from *seedUrl* and collect web forms.

    Performs a breadth-first crawl limited to the seed URL's host. Each
    crawled page is parsed with ``zetanize`` for forms, and URLs carrying
    GET parameters are recorded as synthetic GET forms as well.

    Args:
        seedUrl (str): URL to start crawling from; its scheme and host
            define the in-scope boundary.
        headers (dict): HTTP headers sent with every request.
        depth (int): number of crawl levels; each level processes every
            discovered-but-unprocessed URL.
        threadCount (int): worker threads used per crawl level.

    Returns:
        list: ``[forms, processed_count]`` where *forms* is a list of
        ``{url: parsed_forms}`` dicts and *processed_count* is the number
        of unique URLs crawled.
    """
    forms = []  # web forms
    processed = set()  # urls that have been crawled
    storage = set()  # urls that belong to the target i.e. in-scope
    parsed_seed = urlparse(seedUrl)  # parse once instead of twice
    scheme = parsed_seed.scheme
    host = parsed_seed.netloc
    main_url = scheme + '://' + host
    storage.add(seedUrl)

    def rec(url):
        """Fetch one URL, harvest its forms, and queue in-scope links."""
        processed.add(url)
        urlPrint = (url + (' ' * 60))[:60]
        print('%s Parsing %-40s' % (run, urlPrint), end='\r')
        url = getUrl(url, '', True)
        params = getParams(url, '', True)
        if '=' in url:
            # URL carries GET parameters: record them as a synthetic form
            inps = []
            for name, value in params.items():
                inps.append({'name': name, 'value': value})
            forms.append(
                {url: {0: {'action': url, 'method': 'get', 'inputs': inps}}})
        response = requester(url, params, headers, True, 0).text
        forms.append({url: zetanize(url, response)})
        matches = findall(
            r'<[aA][^>]*?(href|HREF)=["\']{0,1}(.*?)["\']', response)
        for link in matches:  # iterate over the matches
            # remove everything after a "#" to deal with in-page anchors
            link = link[1].split('#')[0].lstrip(' ')
            if link[:4] == 'http':
                if link.startswith(main_url):
                    storage.add(link)
            elif link[:2] == '//':
                # protocol-relative URL: prefix the scheme only.
                # (was scheme + '://' + link, which yielded the malformed
                # 'http:////host/...')
                if link.split('/')[2].startswith(host):
                    storage.add(scheme + ':' + link)
            elif link[:1] == '/':
                storage.add(remove_file(url) + link)
            else:
                usable_url = remove_file(url)
                # note: a dead `elif link.startswith('/')` branch was removed
                # here -- this arm is only reached when link[:1] != '/'
                if usable_url.endswith('/'):
                    storage.add(usable_url + link)
                else:
                    storage.add(usable_url + '/' + link)

    for x in range(depth):
        urls = storage - processed  # discovered but not yet crawled
        # honor the caller's threadCount (was hard-coded max_workers=10)
        # and shut the pool down cleanly after each level
        with concurrent.futures.ThreadPoolExecutor(
                max_workers=threadCount) as threadpool:
            futures = (threadpool.submit(rec, url) for url in urls)
            for i in concurrent.futures.as_completed(futures):
                pass
    return [forms, len(processed)]