better heuristics and refactoring

2021-06-15 21:56:55 +05:30
parent 5652855b55 aed767956d
commit a6c8da355b
7 changed files with 63 additions and 39 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,4 @@
+arjun.egg-info
+__pycache__
+build
+dist
--- a/arjun/main.py
+++ b/arjun/main.py
@@ -19,22 +19,22 @@ arjun_dir = compatible_path(mem.__file__.replace('/core/config.py', ''))

 parser = argparse.ArgumentParser() # defines the parser
 # Arguments that can be supplied
-parser.add_argument('-u', help='target url', dest='url')
-parser.add_argument('-o', '-oJ', help='path for json output file', dest='json_file')
-parser.add_argument('-oT', help='path for text output file', dest='text_file')
-parser.add_argument('-oB', help='port for burp suite proxy', dest='burp_port')
-parser.add_argument('-d', help='delay between requests', dest='delay', type=float, default=0)
-parser.add_argument('-t', help='number of threads', dest='threads', type=int, default=2)
-parser.add_argument('-w', help='wordlist path', dest='wordlist', default=arjun_dir+'/db/default.txt')
-parser.add_argument('-m', help='request method: GET/POST/XML/JSON', dest='method', default='GET')
-parser.add_argument('-i', help='import targets from file', dest='import_file', nargs='?', const=True)
-parser.add_argument('-T', help='http request timeout', dest='timeout', type=float, default=15)
-parser.add_argument('-c', help='chunk size/number of parameters to be sent at once', type=int, dest='chunks', default=500)
-parser.add_argument('-q', help='quiet mode, no output', dest='quiet', action='store_true')
-parser.add_argument('--headers', help='add headers', dest='headers', nargs='?', const=True)
-parser.add_argument('--passive', help='collect parameter names from passive sources', dest='passive')
-parser.add_argument('--stable', help='prefer stability over speed', dest='stable', action='store_true')
-parser.add_argument('--include', help='include this data in every request', dest='include', default={})
+parser.add_argument('-u', help='Target URL', dest='url')
+parser.add_argument('-o', '-oJ', help='Path for json output file.', dest='json_file')
+parser.add_argument('-oT', help='Path for text output file.', dest='text_file')
+parser.add_argument('-oB', help='Port for output to Burp Suite Proxy. Default port is 8080.', dest='burp_port', nargs='?', const=8080)
+parser.add_argument('-d', help='Delay between requests in seconds. (default: 0)', dest='delay', type=float, default=0)
+parser.add_argument('-t', help='Number of concurrent threads. (default: 2)', dest='threads', type=int, default=2)
+parser.add_argument('-w', help='Wordlist file path. (default: {arjundir}/db/default.txt)', dest='wordlist', default=arjun_dir+'/db/default.txt')
+parser.add_argument('-m', help='Request method to use: GET/POST/XML/JSON. (default: GET)', dest='method', default='GET')
+parser.add_argument('-i', help='Import target URLs from file.', dest='import_file', nargs='?', const=True)
+parser.add_argument('-T', help='HTTP request timeout in seconds. (default: 15)', dest='timeout', type=float, default=15)
+parser.add_argument('-c', help='Chunk size. The number of parameters to be sent at once', type=int, dest='chunks', default=500)
+parser.add_argument('-q', help='Quiet mode. No output.', dest='quiet', action='store_true')
+parser.add_argument('--headers', help='Add headers. Separate multiple headers with a new line.', dest='headers', nargs='?', const=True)
+parser.add_argument('--passive', help='Collect parameter names from passive sources like wayback, commoncrawl and otx.', dest='passive', nargs='?', const='-')
+parser.add_argument('--stable', help='Prefer stability over speed.', dest='stable', action='store_true')
+parser.add_argument('--include', help='Include this data in every request.', dest='include', default={})
 args = parser.parse_args() # arguments to be parsed

 if args.quiet:
--- a/arjun/core/utils.py
+++ b/arjun/core/utils.py
@@ -158,11 +158,12 @@ def reader(path, mode='string'):
            return ''.join([line for line in file])


+re_extract_js = re.compile(r'(?si)<script[^>]*>([^<].+?)</script')
 def extract_js(response):
    """
    extracts javascript from a given string
    """
-    return re.findall(r'(?s)<script[^>]+>([^<].+?)</script', response.lower(), re.I)
+    return re_extract_js.findall(response)


 def parse_headers(string):
--- a/arjun/plugins/commoncrawl.py
+++ b/arjun/plugins/commoncrawl.py
@@ -4,7 +4,7 @@ from urllib.parse import urlparse

 def commoncrawl(host, page=0):
 	these_params = set()
-	response = requests.get('http://index.commoncrawl.org/CC-MAIN-2020-29-index?url=*.%s&fl=url&page=%s&limit=10000' % (host, page)).text
+	response = requests.get('http://index.commoncrawl.org/CC-MAIN-2020-29-index?url=*.%s&fl=url&page=%s&limit=10000' % (host, page), verify=False).text
 	if response.startswith('<!DOCTYPE html>'):
 		return ([], False, 'commoncrawl')
 	urls = response.split('\n')
--- a/arjun/plugins/heuristic.py
+++ b/arjun/plugins/heuristic.py
@@ -2,25 +2,43 @@ import re

 from arjun.core.utils import extract_js

-def is_not_junk(string):
-    return re.match(r'^[A-Za-z0-9_]+$', string)
-
-def insert_words(words, wordlist, found):
-    if words:
-        for var in words:
-            if var not in found and is_not_junk(var):
-                found.append(var)
-                if var in wordlist:
-                    wordlist.remove(var)
-                wordlist.insert(0, var)
+re_not_junk = re.compile(r'^[A-Za-z0-9_]+$')
+def is_not_junk(param):
+    return (re_not_junk.match(param) is not None)

+# TODO: also match unquoted map keys; JavaScript accepts { param: "value" }, which re_map_keys currently misses
+re_input_names = re.compile(r'''(?i)<input.+?name=["']?([^"'\s>]+)''')
+re_input_ids = re.compile(r'''(?i)<input.+?id=["']?([^"'\s>]+)''')
+re_empty_vars = re.compile(r'''([^\s!=<>]+)\s*=\s*(?:['"`]{2}|true|false|null)''')
+re_map_keys = re.compile(r'''([^'"]+)['"]\s*:\s*['"`]''')
 def heuristic(response, wordlist):
-    found = []
-    inputs = re.findall(r'(?i)<input.+?name=["\']?([^"\'\s>]+)', response)
-    insert_words(inputs, wordlist, found)
+    potential_params = []
+
+    # Parse Inputs
+    input_names = re_input_names.findall(response)
+    potential_params += input_names
+
+    input_ids = re_input_ids.findall(response)
+    potential_params += input_ids
+
+    # Parse Scripts
    for script in extract_js(response):
-        empty_vars = re.findall(r'([^\s!=<>]+)\s*=\s*[\'"`][\'"`]', script)
-        insert_words(empty_vars, wordlist, found)
-        map_keys = re.findall(r'([^\'"]+)[\'"]:\s?[\'"]', script)
-        insert_words(map_keys, wordlist, found)
-    return found
+        empty_vars = re_empty_vars.findall(script)
+        potential_params += empty_vars
+
+        map_keys = re_map_keys.findall(script)
+        potential_params += map_keys
+
+    if len(potential_params) == 0:
+        return []
+
+    found = set()
+    for word in potential_params:
+        if is_not_junk(word) and (word not in found):
+            found.add(word)
+
+            if word in wordlist:
+                wordlist.remove(word)
+            wordlist.insert(0, word)
+
+    return list(found)
--- a/arjun/plugins/otx.py
+++ b/arjun/plugins/otx.py
@@ -4,7 +4,7 @@ from urllib.parse import urlparse

 def otx(host, page):
 	these_params = set()
-	data = requests.get('https://otx.alienvault.com/api/v1/indicators/hostname/%s/url_list?limit=50&page=%d' % (host, page)).json()
+	data = requests.get('https://otx.alienvault.com/api/v1/indicators/hostname/%s/url_list?limit=50&page=%d' % (host, page), verify=False).json()
 	if 'url_list' not in data:
 		return (these_params, False, 'otx')
 	for obj in data['url_list']:
--- a/arjun/plugins/wayback.py
+++ b/arjun/plugins/wayback.py
@@ -19,7 +19,8 @@ def wayback(host, page):
 		response = requests.get(
 			'http://web.archive.org/cdx/search?filter=mimetype:text/html&filter=statuscode:200',
 			params=payload,
-			headers=headers
+			headers=headers,
+			verify=False
 		).text
 		if not response:
 			return (these_params, False, 'wayback')