XSStrike/core/photon.py

import re
import concurrent.futures
from urllib.parse import urlparse

from core.dom import dom
from core.log import setup_logger
from core.utils import getUrl, getParams
from core.requester import requester
from core.zetanize import zetanize
from plugins.retireJs import retireJs

logger = setup_logger(__name__)


def photon(seedUrl, headers, level, threadCount, delay, timeout, skipDOM):
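    """Crawl seedUrl up to `level` levels deep and collect testable entry points.

    Returns [forms, processed]: `forms` is a list of dicts describing the
    discovered forms and GET parameters, `processed` is the set of URLs that
    were crawled.
    """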
    forms = []  # web forms
    processed = set()  # urls that have been crawled
    storage = set()  # urls that belong to the target i.e. in-scope
    schema = urlparse(seedUrl).scheme  # extract the scheme e.g. http or https
    host = urlparse(seedUrl).netloc  # extract the host e.g. example.com
    main_url = schema + '://' + host  # join scheme and host to make the root url
    storage.add(seedUrl)  # add the url to storage
    checkedDOMs = []

    def rec(target):
        processed.add(target)
        printableTarget = '/'.join(target.split('/')[3:])
        if len(printableTarget) > 40:
            printableTarget = printableTarget[-40:]
        else:
            printableTarget = (printableTarget + (' ' * (40 - len(printableTarget))))
        logger.run('Parsing %s\r' % printableTarget)
        url = getUrl(target, True)
        params = getParams(target, '', True)
        if '=' in target:  # if there's a = in the url, there should be GET parameters
            inps = []
            for name, value in params.items():
                inps.append({'name': name, 'value': value})
            forms.append({0: {'action': url, 'method': 'get', 'inputs': inps}})
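            # Illustrative example (values assumed): for a target like
            # 'http://example.com/search?q=1&lang=en' the entry appended above
            # would look roughly like
            #   {0: {'action': 'http://example.com/search', 'method': 'get',
            #        'inputs': [{'name': 'q', 'value': '1'},
            #                   {'name': 'lang', 'value': 'en'}]}}
            # assuming getUrl() strips the query string and getParams() parses
            # it into a name -> value mapping.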
        response = requester(url, params, headers, True, delay, timeout).text
        retireJs(url, response)
        if not skipDOM:
            highlighted = dom(response)
            # strip the leading line numbers from dom()'s output so identical
            # findings on different pages are reported only once
            clean_highlighted = ''.join([re.sub(r'^\d+\s+', '', line) for line in highlighted])
            if highlighted and clean_highlighted not in checkedDOMs:
                checkedDOMs.append(clean_highlighted)
                logger.good('Potentially vulnerable objects found at %s' % url)
                logger.red_line(level='good')
                for line in highlighted:
                    logger.no_format(line, level='good')
                logger.red_line(level='good')
        forms.append(zetanize(response))
        matches = re.findall(r'<[aA].*href=["\']{0,1}(.*?)["\']', response)
        for link in matches:  # iterate over the matches
            # remove everything after a "#" to deal with in-page anchors
            link = link.split('#')[0]
            if link.endswith(('.pdf', '.png', '.jpg', '.jpeg', '.xls', '.xml', '.docx', '.doc')):
                pass
            else:
                if link[:4] == 'http':
                    if link.startswith(main_url):
                        storage.add(link)
                elif link[:2] == '//':
                    if link.split('/')[2].startswith(host):
                        # urlparse() returns the scheme without a trailing ':'
                        storage.add(schema + ':' + link)
                elif link[:1] == '/':
                    storage.add(main_url + link)
                else:
                    storage.add(main_url + '/' + link)
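        # Illustrative outcomes of the normalization above (assuming the seed
        # is 'https://example.com/'): an absolute in-scope link is stored
        # as-is, '//example.com/x' becomes 'https://example.com/x', '/x' and
        # 'x' both resolve against main_url, and absolute links to other
        # hosts are dropped.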
    try:
        for x in range(level):
            urls = storage - processed  # urls to crawl = all urls - urls that have been crawled
            # for url in urls:
            #     rec(url)
            threadpool = concurrent.futures.ThreadPoolExecutor(
                max_workers=threadCount)
            futures = (threadpool.submit(rec, url) for url in urls)
            for i in concurrent.futures.as_completed(futures):
                pass
    except KeyboardInterrupt:
        return [forms, processed]
    return [forms, processed]
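
# Usage sketch (illustrative; the seed URL, headers and crawl settings below
# are assumed values, not taken from the project):
#
#   headers = {'User-Agent': 'Mozilla/5.0'}
#   forms, processed = photon('https://example.com/', headers, level=2,
#                             threadCount=10, delay=0, timeout=10,
#                             skipDOM=False)
#   # forms     -> list of dicts describing discovered forms/parameters
#   # processed -> set of URLs that were crawled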