# core/photon.py -- Bolt's lightweight crawler (originally ~63 lines, 2.4 KiB, Python)
# Let's import what we need
import concurrent.futures
from re import findall
from urllib.parse import urlparse  # for python3

from core.colors import run
from core.requester import requester
from core.utils import getUrl, getParams, remove_file
from core.zetanize import zetanize
def photon(seedUrl, headers, depth, threadCount):
    """Crawl the target starting from *seedUrl* and collect web forms.

    Performs a breadth-first crawl limited to the seed URL's host. Each
    crawled page is parsed with ``zetanize`` for forms, and URLs carrying
    GET parameters are recorded as synthetic GET forms as well.

    Args:
        seedUrl (str): URL to start crawling from; its scheme and host
            define the in-scope boundary.
        headers (dict): HTTP headers sent with every request.
        depth (int): number of crawl levels; each level processes every
            discovered-but-unprocessed URL.
        threadCount (int): worker threads used per crawl level.

    Returns:
        list: ``[forms, processed_count]`` where *forms* is a list of
        ``{url: parsed_forms}`` dicts and *processed_count* is the number
        of unique URLs crawled.
    """
    forms = []  # web forms
    processed = set()  # urls that have been crawled
    storage = set()  # urls that belong to the target i.e. in-scope
    parsed_seed = urlparse(seedUrl)  # parse once instead of twice
    scheme = parsed_seed.scheme
    host = parsed_seed.netloc
    main_url = scheme + '://' + host
    storage.add(seedUrl)

    def rec(url):
        """Fetch one URL, harvest its forms, and queue in-scope links."""
        processed.add(url)
        urlPrint = (url + (' ' * 60))[:60]
        print('%s Parsing %-40s' % (run, urlPrint), end='\r')
        url = getUrl(url, '', True)
        params = getParams(url, '', True)
        if '=' in url:
            # URL carries GET parameters: record them as a synthetic form
            inps = []
            for name, value in params.items():
                inps.append({'name': name, 'value': value})
            forms.append(
                {url: {0: {'action': url, 'method': 'get', 'inputs': inps}}})
        response = requester(url, params, headers, True, 0).text
        forms.append({url: zetanize(url, response)})
        matches = findall(
            r'<[aA][^>]*?(href|HREF)=["\']{0,1}(.*?)["\']', response)
        for link in matches:  # iterate over the matches
            # remove everything after a "#" to deal with in-page anchors
            link = link[1].split('#')[0].lstrip(' ')
            if link[:4] == 'http':
                if link.startswith(main_url):
                    storage.add(link)
            elif link[:2] == '//':
                # protocol-relative URL: prefix the scheme only.
                # (was scheme + '://' + link, which yielded the malformed
                # 'http:////host/...')
                if link.split('/')[2].startswith(host):
                    storage.add(scheme + ':' + link)
            elif link[:1] == '/':
                storage.add(remove_file(url) + link)
            else:
                usable_url = remove_file(url)
                # note: a dead `elif link.startswith('/')` branch was removed
                # here -- this arm is only reached when link[:1] != '/'
                if usable_url.endswith('/'):
                    storage.add(usable_url + link)
                else:
                    storage.add(usable_url + '/' + link)

    for x in range(depth):
        urls = storage - processed  # discovered but not yet crawled
        # honor the caller's threadCount (was hard-coded max_workers=10)
        # and shut the pool down cleanly after each level
        with concurrent.futures.ThreadPoolExecutor(
                max_workers=threadCount) as threadpool:
            futures = (threadpool.submit(rec, url) for url in urls)
            for i in concurrent.futures.as_completed(futures):
                pass
    return [forms, len(processed)]