From ea5950bdebcde5e1e4c348fea0d284f9ff5e82db Mon Sep 17 00:00:00 2001 From: Dan McInerney Date: Sat, 13 Dec 2014 18:40:23 -0700 Subject: [PATCH] added url path xss detection, refactored url param functions --- .gitignore | 1 + xsscrapy/middlewares.py | 7 +- xsscrapy/pipelines.py | 38 ++++---- xsscrapy/spiders/xss_spider.py | 154 +++++++++++++++++++++++++-------- 4 files changed, 138 insertions(+), 62 deletions(-) diff --git a/.gitignore b/.gitignore index 270fadb..082d288 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ *.pyc xsscrapy-vulns* +*.txt *.swp *.swo *.png diff --git a/xsscrapy/middlewares.py b/xsscrapy/middlewares.py index 07b9d12..29737b8 100644 --- a/xsscrapy/middlewares.py +++ b/xsscrapy/middlewares.py @@ -35,16 +35,17 @@ class InjectedDupeFilter(object): def process_request(self, request, spider): meta = request.meta - if 'xss_place' not in meta or 'delim' not in meta: + if 'xss_place' not in meta: return delim = meta['delim'] # Injected URL dupe handling if meta['xss_place'] == 'url': + url = request.url #replace the delim characters with nothing so we only test the URL #with the payload - url = request.url.replace(delim, '') - if url in URLS_SEEN: + no_delim_url = url.replace(delim, '') + if no_delim_url in URLS_SEEN: raise IgnoreRequest spider.log('Sending payloaded URL: %s' % url) URLS_SEEN.add(url) diff --git a/xsscrapy/pipelines.py b/xsscrapy/pipelines.py index 36802ae..39503e2 100644 --- a/xsscrapy/pipelines.py +++ b/xsscrapy/pipelines.py @@ -10,12 +10,10 @@ import lxml.etree import lxml.html from lxml.html import soupparser, fromstring import itertools -#from IPython import embed +from IPython import embed class XSSCharFinder(object): def __init__(self): - self.redir_pld = 'JaVAscRIPT:prompt(99)' - self.test_str = '\'"(){}:/' self.url_param_xss_items = [] def process_item(self, item, spider): @@ -24,6 +22,7 @@ class XSSCharFinder(object): payload = meta['payload'] delim = meta['delim'] + param = meta['xss_param'] resp_url = response.url body = response.body mismatch = False @@ -32,12 +31,9 @@ class XSSCharFinder(object): # Regex: ( ) mean group 1 is within the parens, . means any char, # {1,80} means match any char 0 to 80 times, 80 chosen because double URL encoding # ? makes the search nongreedy so it stops after hitting its limits - #full_match = '%s.*?%s' % (delim, delim) full_match = '%s.{0,80}?%s' % (delim, delim) # matches with semicolon which sometimes cuts results off sc_full_match = '%s.{0,80}?%s;9' % (delim, delim) - #chars_between_delims = '%s(.*?)%s' % (delim, delim) - #chars_between_delims = '%s(.{0,80}?)%s' % (delim, delim) # Quick sqli check based on DSSS dbms, regex = self.sqli_check(body, meta['orig_body']) @@ -52,11 +48,13 @@ class XSSCharFinder(object): # XSS detection starts here re_matches = sorted([(m.start(), m.group()) for m in re.finditer(full_match, body)]) - if re_matches: + if '/verifypasswd.php/1zqj' in resp_url: + embed() + + if len(re_matches) > 0: scolon_matches = sorted([(m.start(), m.group()) for m in re.finditer(sc_full_match, body)]) lxml_injs = self.get_lxml_matches(full_match, body, resp_url, delim) if lxml_injs: - err = None if len(re_matches) != len(lxml_injs): spider.log('Error: mismatch in injections found by lxml and regex. 
Higher chance of false positive for %s' % resp_url) @@ -64,7 +62,6 @@ class XSSCharFinder(object): mismatch = True inj_data = self.combine_regex_lxml(lxml_injs, re_matches, scolon_matches, body, mismatch) - # If mismatch is True, then "for offset in sorted(inj_data)" will fail with TypeError try: for offset in sorted(inj_data): @@ -239,8 +236,7 @@ class XSSCharFinder(object): tag_index, tag, attr, attr_val, payload, reflected_chars, line = injection pl_delim = payload[:7] - #full_match = '%s.*?%s' % (pl_delim, pl_delim) - full_match = '%s.{0,80}?%s' % (pl_delim, pl_delim) + full_match = '%s.{0,85}?%s' % (pl_delim, pl_delim) line = re.sub(full_match, 'INJECTION', line) all_chars_payloads = {} @@ -375,6 +371,9 @@ class XSSCharFinder(object): # javascript:alert(1) vulns # We do this slicing operation because ;9 might be at the end # although it's unnecessary for the payload + + # CHECK HERE, PASS DOWN THE ORIG ATTR VAL + #if delim+'subbed' in attr_val: if attr_val[:len(delim+'subbed')] == delim+'subbed': if tag == 'a' and attr == 'href': # Only need : ( and ) to use javascript:prompt(4) redir payload @@ -559,8 +558,7 @@ class XSSCharFinder(object): subbed_body = re.sub(full_match, sub, body) doc = self.html_parser(subbed_body, resp_url) lxml_injs = self.xpath_inj_points(sub, doc) - if lxml_injs: - return lxml_injs + return lxml_injs def html_parser(self, body, resp_url): try: @@ -657,8 +655,8 @@ class XSSCharFinder(object): #unfiltered_chars = self.get_unfiltered_chars(payload, pl_delim, scolon_matches, match_offset) reflected_chars = self.get_reflected_chars(tag, attr, payload, pl_delim, scolon_matches, match_offset) # Common false+ shows only "> as unfiltered if script parses the chars between 2 unrelated delim strs - if reflected_chars == '">': - reflected_chars = '' + #if reflected_chars == '">': + # reflected_chars = '' all_inj_data[match_offset] = [tag_index, tag, attr, attr_val, payload, reflected_chars, line] return all_inj_data @@ -869,20 +867,16 @@ class XSSCharFinder(object): # Make sure js payloads remove escaped ' and ", also remove ; # since ; will show up in html encoded entities. 
If ; is unfiltered # it will be added after this function - #escaped_chars = re.findall(r'\\(.)', chars) chars_between_delim = payload.replace(delim, '')#.replace("\\'", "").replace('\\"', '').replace(';', '').replace('\\>', '').replace('\\<', '').replace('\\/', '') + #If injection is inside script tag, remove the escaped chars if tag == 'script' or attr in self.event_attributes(): chars_between_delim = chars_between_delim.replace("\\'", "").replace('\\"', '').replace(';', '').replace('\\>', '').replace('\\<', '').replace('\\/', '') else: + # If it's not a script then just remove the \'s otherwise they show up in Unfiltered in the item chars_between_delim = chars_between_delim.replace("\\", "") - # List for just the inj point - #for c in chars_found: - # if c in self.test_str: - # unfiltered_chars.append(c) - - # # Check if a colon needs to be added to the unfiltered chars + # Check if a colon needs to be added to the unfiltered chars for scolon_match in scolon_matches: # Confirm the string offset of the match is the same # Since scolon_match will only exist when ;9 was found diff --git a/xsscrapy/spiders/xss_spider.py b/xsscrapy/spiders/xss_spider.py index 14c7554..48a4b62 100644 --- a/xsscrapy/spiders/xss_spider.py +++ b/xsscrapy/spiders/xss_spider.py @@ -6,7 +6,7 @@ from scrapy.http import FormRequest, Request from scrapy.selector import Selector from xsscrapy.items import inj_resp from xsscrapy.loginform import fill_login_form -from urlparse import urlparse, parse_qsl, urljoin +from urlparse import urlparse, parse_qsl, urljoin, urlunparse, urlunsplit from scrapy.http.cookies import CookieJar from cookielib import Cookie @@ -32,7 +32,8 @@ class XSSspider(CrawlSpider): # If you're logging into a site with a logout link, you'll want to # uncomment the rule below and comment the shorter one right after to # prevent yourself from being logged out automatically - rules = (Rule(LinkExtractor(), callback='parse_resp', follow=True), ) + #rules = (Rule(LinkExtractor(), callback='parse_resp', follow=True), ) + rules = (Rule(LinkExtractor(deny='contactus'), callback='parse_resp', follow=True), ) def __init__(self, *args, **kwargs): # run using: scrapy crawl xss_spider -a url='http://example.com' @@ -41,7 +42,7 @@ class XSSspider(CrawlSpider): hostname = urlparse(self.start_urls[0]).hostname # With subdomains self.allowed_domains = [hostname] # adding [] around the value seems to allow it to crawl subdomain of value - self.delim = '1zqjx' + self.delim = '1zqj' # semi colon goes on end because sometimes it cuts stuff off like # gruyere or the second cookie delim self.test_str = '\'"(){}:/' @@ -135,6 +136,10 @@ class XSSspider(CrawlSpider): reqs = [] orig_url = response.url body = response.body + parsed_url = urlparse(orig_url) + # parse_qsl rather than parse_qs in order to preserve order + # will always return a list + url_params = parse_qsl(parsed_url.query, keep_blank_values=True) try: # soupparser will handle broken HTML better (like identical attributes) but god damn will you pay for it @@ -177,11 +182,9 @@ class XSSspider(CrawlSpider): if form_reqs: reqs += form_reqs - # Test URL variables with xss strings - payloaded_urls, url_delim_str = self.make_URLs(orig_url, payload) # list of tuples where item[0]=url, and item[1]=changed param - print 'URL:', payloaded_urls, url_delim_str + payloaded_urls = self.make_URLs(orig_url, parsed_url, url_params) if payloaded_urls: - url_reqs = self.make_url_reqs(orig_url, payloaded_urls, url_delim_str) + url_reqs = self.make_url_reqs(orig_url, payloaded_urls) 
if url_reqs: reqs += url_reqs @@ -243,10 +246,8 @@ class XSSspider(CrawlSpider): ''' Payload each form input in each input's own request ''' reqs = [] vals_urls_meths = [] - - two_rand_letters = random.choice(string.lowercase) + random.choice(string.lowercase) - delim_str = self.delim + two_rand_letters - payload = delim_str + payload + delim_str + ';9' + + payload = self.make_payload() for form in forms: if form.inputs: @@ -283,7 +284,7 @@ class XSSspider(CrawlSpider): 'orig_url':orig_url, 'xss_place':'form', 'POST_to':url, - 'delim':delim_str}, + 'delim':payload[:len(self.delim)+2]}, dont_filter=True, callback=self.xss_chars_finder) reqs.append(req) @@ -300,9 +301,7 @@ class XSSspider(CrawlSpider): def make_cookie_reqs(self, url, payload, xss_param): ''' Generate payloaded cookie header requests ''' - two_rand_letters = random.choice(string.lowercase) + random.choice(string.lowercase) - delim_str = self.delim + two_rand_letters - payload = delim_str + payload + delim_str + ';9' + payload = self.make_payload() reqs = [Request(url, meta={'xss_place':'header', @@ -310,7 +309,7 @@ class XSSspider(CrawlSpider): 'xss_param':xss_param, 'orig_url':url, 'payload':payload, - 'delim':delim_str}, + 'delim':payload[:len(self.delim)+2]}, cookies={'userinput':payload}, callback=self.xss_chars_finder, dont_filter=True)] @@ -318,33 +317,116 @@ class XSSspider(CrawlSpider): if len(reqs) > 0: return reqs - def make_URLs(self, url, payload): - ''' Add links with variables in them to the queue again but with XSS testing payloads - Will return a tuple: (url, injection point, payload) ''' + def make_URLs(self, orig_url, parsed_url, url_params): + """ + Create the URL parameter payloaded URLs + """ + payloaded_urls = [] + # Create 1 URL per payloaded param + new_query_strings = self.get_single_payload_queries(url_params) + if new_query_strings: + # Payload the parameters + for query in new_query_strings: + + query_str = query[0] + params = query[1] + payload = query[2] + # scheme #netlo #path #params #query (url params) #fragment + payloaded_url = urlunparse((parsed_url[0], parsed_url[1], parsed_url[2], parsed_url[3], query_str, parsed_url[5])) + payloaded_url = urllib.unquote(payloaded_url) + payloaded_urls.append((payloaded_url, params, payload)) + + # Payload the URL path + payloaded_url_path = self.payload_url_path(parsed_url) + payloaded_urls.append(payloaded_url_path) + else: + # Payload end of URL if there's no parameters + payloaded_end_of_url = self.payload_end_of_url(orig_url) + payloaded_urls.append(payloaded_end_of_url) + + if len(payloaded_urls) > 0: + return payloaded_urls + + def payload_url_path(self, parsed_url): + """ + Payload the URL path like: + http://example.com/page1.php?x=1&y=2 --> + http://example.com/page1.php/FUZZ/?x=1&y=2 + """ + # Remove / so that it doesn't think it's 2 folders in the fuzz chars + payload = self.make_payload().replace('/', '') + path = parsed_url[2] + if path.endswith('/'): + path = path + payload + '/' + else: + path = path + '/' + payload + '/' + #scheme, netloc, path, params, query (url params), fragment + payloaded_url = urlunparse((parsed_url[0], parsed_url[1], path, parsed_url[3], parsed_url[4], parsed_url[5])) + payloaded_url = urllib.unquote(payloaded_url) + payloaded_data = (payloaded_url, 'URL path', payload) + + return payloaded_data + + def get_single_payload_queries(self, url_params): + """ + Make a list of lists of tuples where each secondary list has 1 payloaded + param and the rest are original value + """ + new_payloaded_params = [] + 
changed_params = [] + modified = False + # Create a list of lists where num of lists = len(params) + for x in xrange(0, len(url_params)): + single_url_params = [] + + # Make the payload + payload = self.make_payload() + + for p in url_params: + param, value = p + + # if param has not been modified and we haven't changed a parameter for this loop + if param not in changed_params and modified == False: + # Do we need the original value there? Might be helpful sometimes but think about testing for + # versus and the xss payload javascript:alert(1) + new_param_val = (param, payload) + #new_param_val = (param, value+payload) + single_url_params.append(new_param_val) + changed_params.append(param) + modified = param + else: + single_url_params.append(p) + + # Add the modified, urlencoded params to the master list + new_payloaded_params.append((urllib.urlencode(single_url_params), modified, payload)) + # Reset the changed parameter tracker + modified = False + + if len(new_payloaded_params) > 0: + # [(payloaded params, payloaded param, payload), (payloaded params, payloaded param, payload)] + return new_payloaded_params + + def make_payload(self): + """ + Make the payload with a unique delim + """ two_rand_letters = random.choice(string.lowercase) + random.choice(string.lowercase) delim_str = self.delim + two_rand_letters - payload = delim_str + payload + delim_str + ';9' + payload = delim_str + self.test_str + delim_str + ';9' + return payload - if '=' in url and '?' in url: - # If URL has variables, payload them - payloaded_urls = self.payload_url_vars(url, payload) - else: - # If URL has no variables, tack payload onto end of URL - payloaded_urls = self.payload_end_of_url(url, payload) - - return payloaded_urls, delim_str - - def payload_end_of_url(self, url, payload): + def payload_end_of_url(self, url): ''' Payload the end of the URL to catch some DOM(?) and other reflected XSSes ''' + payload = self.make_payload() # Make URL test and delim strings unique if url[-1] == '/': payloaded_url = url+payload else: payloaded_url = url+'/'+payload - return [(payloaded_url, 'end of url', payload)] + return (payloaded_url, 'end of url', payload) def payload_url_vars(self, url, payload): ''' Payload the URL variables ''' @@ -443,7 +525,7 @@ class XSSspider(CrawlSpider): return (netloc, protocol, doc_domain, path) - def make_url_reqs(self, orig_url, payloaded_urls, delim_str): + def make_url_reqs(self, orig_url, payloaded_urls): ''' Make the URL requests ''' reqs = [Request(url[0], @@ -451,7 +533,7 @@ class XSSspider(CrawlSpider): 'xss_param':url[1], 'orig_url':orig_url, 'payload':url[2], - 'delim':delim_str}, + 'delim':url[2][:len(self.delim)+2]}, callback = self.xss_chars_finder) for url in payloaded_urls] # Meta is the payload @@ -461,9 +543,7 @@ class XSSspider(CrawlSpider): def make_header_reqs(self, url, payload, inj_headers): ''' Generate header requests ''' - two_rand_letters = random.choice(string.lowercase) + random.choice(string.lowercase) - delim_str = self.delim + two_rand_letters - payload = delim_str + payload + delim_str + ';9' + payload = self.make_payload() reqs = [Request(url, headers={inj_header:payload}, @@ -471,7 +551,7 @@ class XSSspider(CrawlSpider): 'xss_param':inj_header, 'orig_url':url, 'payload':payload, - 'delim':delim_str, + 'delim':payload[:len(self.delim)+2], 'UA':self.get_user_agent(inj_header, payload)}, dont_filter=True, callback = self.xss_chars_finder)
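
Note: the sketches below are not part of the patch. First, the per-parameter URL payloading that the refactored make_URLs()/get_single_payload_queries() flow performs: one payloaded URL per query parameter, with parse_qsl preserving parameter order. This is a simplified, self-contained version assuming Python 2 (as the project uses) and a made-up example URL:

    # Illustrative sketch only: one payloaded URL per query parameter,
    # mirroring the idea of get_single_payload_queries() + make_URLs().
    import random
    import string
    import urllib
    from urlparse import urlparse, parse_qsl, urlunparse

    DELIM = '1zqj'          # base delimiter, as shortened in the patch
    TEST_STR = '\'"(){}:/'  # characters whose reflection is tested

    def make_payload():
        # Unique delimiter per request: base delim + two random letters
        delim_str = DELIM + ''.join(random.choice(string.lowercase) for _ in xrange(2))
        return delim_str + TEST_STR + delim_str + ';9'

    def single_param_urls(url):
        parsed = urlparse(url)
        # parse_qsl keeps parameter order and returns a list of (param, value)
        params = parse_qsl(parsed.query, keep_blank_values=True)
        payloaded = []
        for i, (param, _) in enumerate(params):
            payload = make_payload()
            # Replace only the i-th parameter's value; leave the rest untouched
            new_params = [(p, payload if j == i else v)
                          for j, (p, v) in enumerate(params)]
            query = urllib.urlencode(new_params)
            new_url = urllib.unquote(urlunparse(parsed[:4] + (query,) + parsed[5:]))
            payloaded.append((new_url, param, payload))
        return payloaded

    if __name__ == '__main__':
        # Hypothetical target URL, used only for illustration
        for u in single_param_urls('http://example.com/page1.php?x=1&y=2'):
            print u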
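
Second, the new URL-path injection point added as payload_url_path(): the delimited test string goes in as an extra path segment while the query string is left intact. Again a simplified sketch; the URL and payload values are hypothetical:

    # Illustrative sketch only: mirrors the logic of payload_url_path().
    import urllib
    from urlparse import urlparse, urlunparse

    def payload_url_path(url, payload):
        parsed = urlparse(url)
        # Strip '/' from the payload so it stays a single extra path segment
        payload = payload.replace('/', '')
        path = parsed[2]
        if path.endswith('/'):
            path = path + payload + '/'
        else:
            path = path + '/' + payload + '/'
        # scheme, netloc, path, params, query (url params), fragment
        return urllib.unquote(urlunparse((parsed[0], parsed[1], path,
                                          parsed[3], parsed[4], parsed[5])))

    # e.g. http://example.com/page1.php?x=1&y=2  ->
    #      http://example.com/page1.php/1zqjab'"(){}:1zqjab;9/?x=1&y=2
    print payload_url_path('http://example.com/page1.php?x=1&y=2',
                           '1zqjab\'"(){}:/1zqjab;9')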
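
Finally, how the pipeline locates reflections once a payloaded response comes back: it scans the body for anything bracketed by two copies of the unique delimiter (the {0,80}? bound keeps the match short and non-greedy, and the ;9 variant catches reflections that were cut off at a semicolon). The delim and body values below are made up for illustration:

    # Illustrative sketch only: the pipeline's reflection-finding regexes.
    import re

    delim = '1zqjab'
    # Hypothetical response body: one raw reflection, one HTML-encoded one
    body = ('<input value="1zqjab\'&quot;(){}:/1zqjab;9">'
            '<p>1zqjab&#39;&quot;(){}:/1zqjab;9</p>')

    full_match = '%s.{0,80}?%s' % (delim, delim)        # delim ... delim, non-greedy
    sc_full_match = '%s.{0,80}?%s;9' % (delim, delim)   # same, but keeping the ;9 tail

    for m in re.finditer(full_match, body):
        print m.start(), m.group()

    scolon_matches = [(m.start(), m.group()) for m in re.finditer(sc_full_match, body)]
    print scolon_matches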