Files
XSStrike/core/htmlParser.py

113 lines
5.6 KiB
Python
Raw Normal View History

2018-10-27 18:58:52 +05:30
import re
from core.config import badTags, xsschecker
2019-04-10 18:05:56 +05:30
from core.utils import isBadContext
2018-10-27 18:58:52 +05:30
2018-11-16 21:13:45 +05:30
def htmlParser(response, encoding):
    """Find every reflection of the probe string and describe its context.

    Args:
        response: Response object exposing ``.text`` (the body) and
            ``.headers`` (only consulted when the body contains no ``<``).
        encoding: Optional callable. When truthy, it is applied to the probe
            and every encoded copy found in the body is replaced by the
            plain probe before parsing.

    Returns:
        A two-item list ``[occurences, positions]``:

        * ``occurences`` maps a running index to
          ``{'position': offset, 'context': [location, environment, tag,
          attribute]}`` where ``location`` is one of ``'script'``,
          ``'html'``, ``'comment'`` or ``'attribute'``; ``environment`` is
          the string needed to break out (a quote character, ``'</tag>'``,
          ``'-->'`` or ``''``); ``tag`` is a tag name, ``'null'`` or ``''``;
          ``attribute`` is a ``{name: value}`` dict or ``''``.
        * ``positions`` lists the offsets of every exact-case match of the
          probe in the body.
    """
    rawResponse = response  # keep the response object; headers are read later
    response = response.text
    if encoding:
        response = response.replace(encoding(xsschecker), xsschecker)

    # The probe is treated as a literal by split()/replace(), so escape it
    # before using it as a regular expression as well.
    probePattern = re.escape(xsschecker)

    tags = []  # tag in which each reflection sits
    locations = []  # context of each reflection
    attributes = []  # attribute holding each reflection
    environments = []  # string needed to break out of each context
    positions = [match.start()
                 for match in re.finditer(probePattern, response)]

    # ---- Context ("location") of every reflection ----
    # Each part starts at a probe and runs up to the next probe (or EOF).
    parts = [xsschecker + chunk for chunk in response.split(xsschecker)[1:]]
    noMarkup = '<' not in response
    for part in parts:
        deep = part.split('>')
        if '</script' in deep[0]:
            location = 'script'
        elif '</' in deep[0] or len(parts) == 1:
            location = 'html'
        else:
            # A '<' anywhere in the part is taken to mean attribute context.
            fallback = 'attribute' if '<' in part else 'script'
            # enumerate() keeps the index accurate even across ``continue``;
            # the comment decision is the same as with the manual counter,
            # because once a '<!--' is seen every longer prefix contains it.
            for index, piece in enumerate(deep):
                if piece[-2:] == '--':
                    if '<!--' not in ''.join(deep[:index + 1]):
                        # a comment is closed here without being opened
                        # after the probe, so the probe is inside it
                        location = 'comment'
                        break
                    continue
                location = fallback
        if noMarkup:
            # .get() avoids a KeyError when no Content-Type header is sent
            contentType = rawResponse.headers.get('Content-Type') or ''
            if contentType.startswith('text/html'):
                location = 'html'
        locations.append(location)

    # ---- Spans of elements whose content is not executed ----
    bad_contexts = re.finditer(
        r'''(?s)(?i)<(style|template|textarea|title|noembed|noscript)>[.\s\S]*(%s)[.\s\S]*</\1>''' % probePattern,
        response)
    non_executable_contexts = [
        [each.start(), each.end(), each.group(1)] for each in bad_contexts]

    # ---- Environment, tag and attribute of every reflection ----
    num = 0  # index of the occurrence being processed
    for occ in re.finditer(probePattern, response, re.IGNORECASE):
        # NOTE(review): only the character directly after the probe is
        # inspected, because both branches below end the scan. The original
        # comment mentions "a closing angular bracket", so a check for '<'
        # may have been intended here - confirm before changing.
        following = response[occ.end():occ.end() + 1]
        if following in ('\'', '"', '`'):
            environments.append(following)
            goodTokens = []  # '<'-separated tokens that contain the probe
            for token in response.split('<'):
                if xsschecker not in token:
                    continue
                goodTokens.append(token)
                # attributes are generally separated by spaces
                for attr in token.split(' '):
                    if xsschecker in attr:
                        pieces = attr.split('=')
                        attributeName = pieces[0]
                        attributeValue = ''.join(pieces[1:])
                        if attributeValue.startswith(('\'', '"')):
                            attributeValue = attributeValue[1:-1]
                        # NOTE(review): this appends once per matching token
                        # on every pass, not once per occurrence - kept as is.
                        attributes.append({attributeName: attributeValue})
                        break
            try:
                # first word of the token is the tag holding the reflection
                tag = re.search(r'\w+', goodTokens[num]).group()
            except IndexError:
                try:
                    tag = re.search(r'\w+', goodTokens[num - 1]).group()
                except IndexError:
                    tag = 'null'
            tags.append(tag)
        elif following:
            # Use this match's own offset: positions[] is collected
            # case-sensitively and could be misaligned with (or shorter
            # than) this case-insensitive scan, raising IndexError.
            # isBadContext presumably returns the enclosing tag name or a
            # falsy value - verify against core.utils.
            badContext = isBadContext(occ.start(), non_executable_contexts)
            if badContext:
                environments.append('</' + badContext + '>')
            else:
                environments.append('')
            tags.append('')
            attributes.append('')
        num += 1

    # ---- Assemble the per-reflection records ----
    occurences = {}
    collected = zip(locations, environments, tags, attributes, positions)
    for i, (loc, env, tag, attr, position) in enumerate(collected):
        if loc == 'comment':
            env = '-->'  # needed to break out of an HTML comment
        occurences[i] = {
            'position': position,
            'context': [loc, env, tag, attr],
        }
    return [occurences, positions]