import re

from core.config import badTags, xsschecker
from core.utils import isBadContext, equalize


def _attribute_details(part, tag):
    """Describe how the probe sits inside one whitespace-delimited tag chunk.

    Args:
        part: A chunk of the matched tag that contains ``xsschecker``.
        tag: Name of the tag the chunk belongs to.

    Returns:
        dict with the keys ``tag``, ``type`` (``'name'``, ``'value'`` or
        ``'flag'``), ``quote``, ``value`` and ``name``.
    """
    kind, quote, name, value = 'flag', '', '', ''
    if '=' in part:
        # group(1) is None for an unquoted value; keep '' so the field is
        # always a string, like its default.
        quote = re.search(r'=([\'`"])?', part).group(1) or ''
        name, _, raw_value = part.partition('=')
        kind = 'name' if xsschecker == name else 'value'
        value = raw_value.rstrip('>').rstrip(quote).lstrip(quote)
    return {'tag': tag, 'type': kind, 'quote': quote,
            'value': value, 'name': name}


def htmlParser(response, encoding):
    """Locate every reflection of the probe and classify its context.

    Args:
        response: Object whose ``.text`` attribute holds the response body.
        encoding: Optional callable; when truthy, occurrences of
            ``encoding(xsschecker)`` in the body are replaced by the plain
            probe before parsing.

    Returns:
        dict keyed by reflection offset, in ascending order. Each value is
        ``{'position': int, 'context': str, 'details': dict}`` where
        ``context`` is ``'script'``, ``'attribute'``, ``'html'`` or
        ``'comment'``. ``details`` always carries ``'badTag'`` (``''`` when
        the reflection is not inside a non-executable tag); attribute
        reflections additionally carry ``tag``, ``type``, ``quote``,
        ``value`` and ``name``.
    """
    response = response.text  # response content
    if encoding:  # the probe was sent encoded; map it back to its plain form
        response = response.replace(encoding(xsschecker), xsschecker)

    # Escape the probe so it is always matched literally inside the patterns.
    probe = re.escape(xsschecker)
    position_and_context = {}
    environment_details = {}

    # NOTE(review): the comment and script patterns below (and the closing
    # tag of the non-executable pattern) were restored from a copy whose
    # markup had been stripped -- confirm them against the project history.
    clean_response = re.sub(r'<!--[.\s\S]*?-->', '', response)

    script_context = re.finditer(
        r'(?i)<script[^>]*>.*?(%s).*?</script>' % probe, clean_response)
    for occurence in script_context:
        position = occurence.start(1)
        position_and_context[position] = 'script'
        environment_details[position] = {'details': {}}

    attribute_context = re.finditer(
        r'<[^>]*?(%s)[^>]*?>' % probe, clean_response)
    for occurence in attribute_context:
        position = occurence.start(1)
        parts = re.split(r'\s', occurence.group(0))
        tag = parts[0][1:]  # drop the leading '<'
        for part in parts:
            if xsschecker in part:
                # When several chunks contain the probe, the last one wins.
                position_and_context[position] = 'attribute'
                environment_details[position] = {
                    'details': _attribute_details(part, tag)}

    # Whatever has not been classified yet is a plain HTML reflection.
    for occurence in re.finditer(probe, clean_response):
        position = occurence.start()
        if position not in position_and_context:
            position_and_context[position] = 'html'
            environment_details[position] = {'details': {}}

    # Comments are searched in the raw response, so these offsets are
    # relative to `response`, not to `clean_response`.
    comment_context = re.finditer(
        r'<!--(?![.\s\S]*-->)[.\s\S]*(%s)[.\s\S]*?-->' % probe, response)
    for occurence in comment_context:
        position = occurence.start(1)
        position_and_context[position] = 'comment'
        environment_details[position] = {'details': {}}

    database = {}
    for position in sorted(position_and_context):
        database[position] = {
            'position': position,
            'context': position_and_context[position],
            'details': environment_details[position]['details'],
        }

    bad_contexts = re.finditer(
        r'(?si)<(style|template|textarea|title|noembed|noscript)>'
        r'[.\s\S]*(%s)[.\s\S]*</\1>' % probe, response)
    non_executable_contexts = [
        [each.start(), each.end(), each.group(1)] for each in bad_contexts]

    # Always set 'badTag' so every entry has the same shape for callers.
    for entry in database.values():
        bad_tag = ''
        if non_executable_contexts:
            bad_tag = isBadContext(entry['position'], non_executable_contexts)
        entry['details']['badTag'] = bad_tag or ''
    return database