# XSStrike/core/htmlParser.py

import re

from core.config import badTags, xsschecker
from core.utils import isBadContext


def _reflectionContext(part, isOnlyReflection):
    """Classify the context in which one reflection of the probe appears.

    ``part`` is the response text starting at a reflection and running up
    to the next reflection (or the end of the response).
    ``isOnlyReflection`` is True when the response holds a single
    reflection. Returns 'script', 'html', 'comment' or 'attribute'.
    """
    deep = part.split('>')
    if '</script' in deep[0]:
        return 'script'
    if '</' in deep[0] or isOnlyReflection:
        return 'html'
    for num, chunk in enumerate(deep):
        # A chunk ending in '--' with no '<!--' opened since the reflection
        # means the comment was opened before the reflection, i.e. the
        # reflection itself sits inside that comment.
        if chunk.endswith('--') and '<!--' not in ''.join(deep[:num + 1]):
            return 'comment'
    # the only way to find out if it's attribute context is to see if '<'
    # is present anywhere after the reflection
    return 'attribute' if '<' in part else 'script'


def htmlParser(response, encoding):
    """Locate every reflection of ``xsschecker`` and describe its context.

    Parameters:
        response: response object exposing ``.text`` and ``.headers``
            (the raw response returned by requests).
        encoding: optional callable; when truthy, ``encoding(xsschecker)``
            is replaced by the plain probe in the response text before
            parsing.

    Returns:
        ``[occurences, positions]`` where ``positions`` lists the start
        offset of each reflection and ``occurences`` maps a running index
        to ``{'position': offset, 'context': [location, environment, tag,
        attribute]}``.
    """
    rawResponse = response  # raw response returned by requests
    response = response.text  # response content
    if encoding:  # if the user has specified an encoding, decode the probe
        response = response.replace(encoding(xsschecker), xsschecker)

    # The probe is a literal, so escape it before using it as a pattern;
    # this keeps the regex matches in step with str.split() below.
    checker = re.escape(xsschecker)
    # positions of all the reflections of the xsschecker
    positions = [match.start() for match in re.finditer(checker, response)]

    # --- contexts in which the input is reflected -------------------------
    # Drop the text before the first reflection and put the probe back in
    # front of every remaining piece.
    parts = [xsschecker + s for s in response.split(xsschecker)[1:]]
    # A response without any markup that is nevertheless served as HTML is
    # treated as html context. A missing Content-Type header simply means
    # "not html" instead of raising KeyError.
    forceHtml = (
        bool(parts)
        and '<' not in response
        and rawResponse.headers.get('Content-Type', '').startswith('text/html')
    )
    locations = []
    for part in parts:
        if forceHtml:
            locations.append('html')
        else:
            locations.append(_reflectionContext(part, len(parts) == 1))

    # --- reflections inside tags whose content is never executed ----------
    bad_contexts = re.finditer(r'''(?s)(?i)<(style|template|textarea|title|noembed|noscript)>[.\s\S]*(%s)[.\s\S]*</\1>''' % checker, response)
    non_executable_contexts = [
        [each.start(), each.end(), each.group(1)] for each in bad_contexts
    ]

    # --- environment of each reflection -----------------------------------
    # Is it within quotes? Which tag contains the reflection?
    # Tokens do not depend on the reflection being processed, so they are
    # computed once instead of once per quoted reflection.
    goodTokens = [token for token in response.split('<') if xsschecker in token]
    tokenAttributes = []  # (name, value) of the probe-bearing attribute of each good token
    for token in goodTokens:
        # attributes and their values are generally separated with space so...
        for attr in token.split(' '):
            if xsschecker in attr:
                pieces = attr.split('=')
                attributeName = pieces[0]
                attributeValue = ''.join(pieces[1:])
                if attributeValue.startswith('\'') or attributeValue.startswith('"'):
                    attributeValue = attributeValue[1:-1]
                tokenAttributes.append((attributeName, attributeValue))
                break

    tags = []  # tags in which the input is reflected
    attributes = []  # attribute names
    environments = []  # strings needed to break out of the context
    # Iterate over the very same positions used everywhere else so that all
    # per-reflection lists stay aligned (a separate case-insensitive scan
    # could find extra matches and index past the end of ``positions``).
    for num, position in enumerate(positions):
        end = position + len(xsschecker)
        # Only the character right after the reflection matters; it is ''
        # when the reflection ends the response, which is handled like any
        # other non-quote character so the reflection is not dropped.
        nextChar = response[end:end + 1]
        if nextChar in ('\'', '"', '`'):  # the reflection is followed by a quote
            environments.append(nextChar)
            # NOTE(review): one entry per probe-bearing token is appended
            # for *every* quoted reflection, so ``attributes`` can grow
            # faster than the other lists. Kept as-is because callers may
            # rely on the resulting positional pairing - confirm before
            # changing.
            attributes.extend({name: value} for name, value in tokenAttributes)
            try:
                # finds the tag "inside" which input is reflected
                tag = re.search(r'\w+', goodTokens[num]).group()
            except IndexError:
                # fewer probe-bearing tokens than reflections: fall back to
                # the previous token
                try:
                    tag = re.search(r'\w+', goodTokens[num - 1]).group()
                except IndexError:
                    tag = 'null'
            tags.append(tag)
        else:
            # presumably isBadContext returns the name of the enclosing
            # non-executable tag, or a falsy value - verify against core.utils
            badContext = isBadContext(position, non_executable_contexts)
            environments.append('</' + badContext + '>' if badContext else '')
            tags.append('')
            attributes.append('')

    # a dict to store all the collected information about the reflections
    occurences = {}
    combined = zip(locations, environments, tags, attributes, positions)
    for i, (loc, env, tag, attr, position) in enumerate(combined):
        if loc == 'comment':  # if context is html comment
            env = '-->'  # we need this to break out of the comment
        occurences[i] = {'position': position, 'context': [loc, env, tag, attr]}
    return [occurences, positions]
Add files via upload 2018-10-27 18:58:52 +05:30			`import re`
Fixed HTML comment context handling + Refactor 2018-11-15 15:41:01 +05:30
			`from core.config import badTags, xsschecker`
Update htmlParser.py 2019-04-10 18:05:56 +05:30			`from core.utils import isBadContext`
Add files via upload 2018-10-27 18:58:52 +05:30
Documentation and pep8 compilance 2018-11-16 21:13:45 +05:30
Ability to encode payloads, Fixed a bug in bruteforcer 2018-11-13 16:47:00 +05:30			`def htmlParser(response, encoding):`
Documentation and pep8 compilance 2018-11-16 21:13:45 +05:30			`rawResponse = response # raw response returned by requests`
			`response = response.text # response content`
			`if encoding: # if the user has specified an encoding, encode the probe in that`
Ability to encode payloads, Fixed a bug in bruteforcer 2018-11-13 16:47:00 +05:30			`response = response.replace(encoding(xsschecker), xsschecker)`
Documentation and pep8 compilance 2018-11-16 21:13:45 +05:30			`tags = [] # tags in which the input is reflected`
			`locations = [] # contexts in which the input is reflected`
			`attributes = [] # attribute names`
			`environments = [] # strings needed to break out of the context`
			`positions = [] # postions of all the reflections of the xsschecker`
Handle dynamic number of reflections (Fixes #78) 2018-10-30 16:28:56 +05:30			`for match in re.finditer(xsschecker, response):`
			`positions.append(match.start())`
Documentation and pep8 compilance 2018-11-16 21:13:45 +05:30
			`# It finds the contexts of the reflections`

Add files via upload 2018-10-27 18:58:52 +05:30			`parts = response.split(xsschecker)`
Documentation and pep8 compilance 2018-11-16 21:13:45 +05:30			`# remove first element since it doesn't contain xsschecker`
			`parts.remove(parts[0])`
			`# add xsschecker in front of all elements`
			`parts = [xsschecker + s for s in parts]`
			`for part in parts: # iterate over the parts`
Add files via upload 2018-10-27 18:58:52 +05:30			`deep = part.split('>')`
			`if '</script' in deep[0]:`
			`location = 'script'`
Potential fix for #226 2019-04-10 11:09:03 +05:30			`elif '</' in deep[0] or len(parts) == 1:`
Add files via upload 2018-10-27 18:58:52 +05:30			`location = 'html'`
			`else:`
fixed a major bug in htmlParser 2018-11-16 01:30:48 +05:30			`num = 0`
Fixes #79, Fixes 80, Fixes #81 2018-11-03 22:49:40 +05:30			`for i in deep:`
			`if i[-2:] == '--':`
fixed a major bug in htmlParser 2018-11-16 01:30:48 +05:30			`if '<!--' not in ''.join(deep[:num + 1]):`
Add files via upload 2018-11-11 14:56:19 +05:30			`location = 'comment'`
			`break`
			`continue`
handle wrong content type 2018-10-28 12:42:59 +05:30			`location = 'script'`
			`for char in part:`
Documentation and pep8 compilance 2018-11-16 21:13:45 +05:30			`# the only way to find out if it's attribute context is to see if '<' is present.`
			`if char == '<':`
			`location = 'attribute' # no, it doesn't match '<script>'`
handle wrong content type 2018-10-28 12:42:59 +05:30			`break`
fixed a major bug in htmlParser 2018-11-16 01:30:48 +05:30			`num += 1`
Fixes #79, Fixes 80, Fixes #81 2018-11-03 22:49:40 +05:30			`if '<' not in response:`
fix wrong content type handling 2018-11-27 15:59:01 +05:30			`if rawResponse.headers['Content-Type'].startswith('text/html'):`
handle wrong content type 2018-10-28 12:42:59 +05:30			`location = 'html'`
Documentation and pep8 compilance 2018-11-16 21:13:45 +05:30			`locations.append(location) # add location to locations list`

Update htmlParser.py 2019-04-10 18:05:56 +05:30			`bad_contexts = re.finditer(r'''(?s)(?i)<(style\|template\|textarea\|title\|noembed\|noscript)>[.\s\S](%s)[.\s\S]</\1>''' % xsschecker, response)`
			`non_executable_contexts = []`
			`for each in bad_contexts:`
			`non_executable_contexts.append([each.start(), each.end(), each.group(1)])`
Documentation and pep8 compilance 2018-11-16 21:13:45 +05:30			`# Finds the "environment" of reflections. is it within double quotes? Which tag contains the reflection?`
			`num = 0 # dummy value to keep record of occurence being processed`
			`# find xsschecker in response and return matches`
			`for occ in re.finditer(xsschecker, response, re.IGNORECASE):`
			`# convert "xsschecker to EOF" into a list`
			`toLook = list(response[occ.end():])`
			`for loc in range(len(toLook)): # interate over the chars`
			if toLook[loc] in ('\'', '"', '`'): # if the char is a quote
Fixes #226 2019-04-10 17:09:21 +05:30			`environments.append(toLook[loc]) # add it to environments list`
Add files via upload 2018-10-27 18:58:52 +05:30			`tokens = response.split('<')`
Documentation and pep8 compilance 2018-11-16 21:13:45 +05:30			`goodTokens = [] # tokens which contain xsschecker`
			`for token in tokens: # iterate over tokens`
			`if xsschecker in token: # if xsschecker is in token`
			`goodTokens.append(token) # add it to goodTokens list`
			`# attributes and their values are generally seperated with space so...`
Add files via upload 2018-10-27 18:58:52 +05:30			`attrs = token.split(' ')`
Documentation and pep8 compilance 2018-11-16 21:13:45 +05:30			`for attr in attrs: # iterate over the attribute`
			`if xsschecker in attr: # is xsschecker in this attribute?`
			`# alright, this is the one we need`
v3.1.0 - browser engine integration for zero false positives - coverage of event handler context - bug fixes 2018-11-21 19:20:10 +05:30			`attributeName = attr.split('=')[0]`
			`attributeValue = ''.join(attr.split('=')[1:])`
			`if attributeValue.startswith('\'') or attributeValue.startswith('"'):`
			`attributeValue = attributeValue[1:-1]`
			`attributes.append({attributeName:attributeValue})`
Add files via upload 2018-10-27 18:58:52 +05:30			`break`
			`try:`
Documentation and pep8 compilance 2018-11-16 21:13:45 +05:30			`# finds the tag "inside" which input is refelcted`
			`tag = re.search(r'\w+', goodTokens[num]).group()`
handle wrong content type 2018-10-28 12:42:59 +05:30			`except IndexError:`
			`try:`
Documentation and pep8 compilance 2018-11-16 21:13:45 +05:30			`# finds the tag "inside" which input is refelcted`
			`tag = re.search(r'\w+', goodTokens[num - 1]).group()`
handle wrong content type 2018-10-28 12:42:59 +05:30			`except IndexError:`
			`tag = 'null'`
Documentation and pep8 compilance 2018-11-16 21:13:45 +05:30			`tags.append(tag) # add the tag to the tags list`
Add files via upload 2018-10-27 18:58:52 +05:30			`break`
Update htmlParser.py 2019-04-10 18:05:56 +05:30			`else: # if we encounter a closing angular brackt`
Documentation and pep8 compilance 2018-11-16 21:13:45 +05:30			`# check if the next character to it is a / to make sure its a closing tag`
Update htmlParser.py 2019-04-10 18:05:56 +05:30			`badContext = isBadContext(positions[num], non_executable_contexts)`
			`if badContext:`
			`environments.append('</' + badContext + '>')`
Fixes #226 2019-04-10 17:09:21 +05:30			`else:`
			`environments.append('')`
Update htmlParser.py 2019-04-10 18:05:56 +05:30			`tags.append('')`
			`attributes.append('')`
Add files via upload 2018-10-27 18:58:52 +05:30			`break`
			`loc += 1`
			`num += 1`
Documentation and pep8 compilance 2018-11-16 21:13:45 +05:30			`occurences = {} # a dict to store all the collected information about the reflections`
Handle dynamic number of reflections (Fixes #78) 2018-10-30 16:28:56 +05:30			`for i, loc, env, tag, attr, position in zip(range(len(locations)), locations, environments, tags, attributes, positions):`
Add files via upload 2018-10-27 18:58:52 +05:30			`occurences[i] = {}`
Handle dynamic number of reflections (Fixes #78) 2018-10-30 16:28:56 +05:30			`occurences[i]['position'] = position`
Documentation and pep8 compilance 2018-11-16 21:13:45 +05:30			`if loc == 'comment': # if context is html comment`
			`env = '-->' # add --> as environment as we need this to break out`
Add files via upload 2018-10-27 18:58:52 +05:30			`occurences[i]['context'] = [loc, env, tag, attr]`
Update htmlParser.py 2018-11-10 22:55:04 +05:30			`return [occurences, positions]`