better heuristics and refactoring

Somdev Sangwan authored on 2021-06-15 21:56:55 +05:30 · committed by GitHub
7 changed files with 63 additions and 39 deletions

.gitignore (vendored, new file, 4 additions)
View File

@@ -0,0 +1,4 @@
+arjun.egg-info
+__pycache__
+build
+dist

View File

@@ -19,22 +19,22 @@ arjun_dir = compatible_path(mem.__file__.replace('/core/config.py', ''))
 parser = argparse.ArgumentParser() # defines the parser
 # Arguments that can be supplied
-parser.add_argument('-u', help='target url', dest='url')
+parser.add_argument('-u', help='Target URL', dest='url')
-parser.add_argument('-o', '-oJ', help='path for json output file', dest='json_file')
+parser.add_argument('-o', '-oJ', help='Path for json output file.', dest='json_file')
-parser.add_argument('-oT', help='path for text output file', dest='text_file')
+parser.add_argument('-oT', help='Path for text output file.', dest='text_file')
-parser.add_argument('-oB', help='port for burp suite proxy', dest='burp_port')
+parser.add_argument('-oB', help='Port for output to Burp Suite Proxy. Default port is 8080.', dest='burp_port', nargs='?', const=8080)
-parser.add_argument('-d', help='delay between requests', dest='delay', type=float, default=0)
+parser.add_argument('-d', help='Delay between requests in seconds. (default: 0)', dest='delay', type=float, default=0)
-parser.add_argument('-t', help='number of threads', dest='threads', type=int, default=2)
+parser.add_argument('-t', help='Number of concurrent threads. (default: 2)', dest='threads', type=int, default=2)
-parser.add_argument('-w', help='wordlist path', dest='wordlist', default=arjun_dir+'/db/default.txt')
+parser.add_argument('-w', help='Wordlist file path. (default: {arjundir}/db/default.txt)', dest='wordlist', default=arjun_dir+'/db/default.txt')
-parser.add_argument('-m', help='request method: GET/POST/XML/JSON', dest='method', default='GET')
+parser.add_argument('-m', help='Request method to use: GET/POST/XML/JSON. (default: GET)', dest='method', default='GET')
-parser.add_argument('-i', help='import targets from file', dest='import_file', nargs='?', const=True)
+parser.add_argument('-i', help='Import target URLs from file.', dest='import_file', nargs='?', const=True)
-parser.add_argument('-T', help='http request timeout', dest='timeout', type=float, default=15)
+parser.add_argument('-T', help='HTTP request timeout in seconds. (default: 15)', dest='timeout', type=float, default=15)
-parser.add_argument('-c', help='chunk size/number of parameters to be sent at once', type=int, dest='chunks', default=500)
+parser.add_argument('-c', help='Chunk size. The number of parameters to be sent at once', type=int, dest='chunks', default=500)
-parser.add_argument('-q', help='quiet mode, no output', dest='quiet', action='store_true')
+parser.add_argument('-q', help='Quiet mode. No output.', dest='quiet', action='store_true')
-parser.add_argument('--headers', help='add headers', dest='headers', nargs='?', const=True)
+parser.add_argument('--headers', help='Add headers. Separate multiple headers with a new line.', dest='headers', nargs='?', const=True)
-parser.add_argument('--passive', help='collect parameter names from passive sources', dest='passive')
+parser.add_argument('--passive', help='Collect parameter names from passive sources like wayback, commoncrawl and otx.', dest='passive', nargs='?', const='-')
-parser.add_argument('--stable', help='prefer stability over speed', dest='stable', action='store_true')
+parser.add_argument('--stable', help='Prefer stability over speed.', dest='stable', action='store_true')
-parser.add_argument('--include', help='include this data in every request', dest='include', default={})
+parser.add_argument('--include', help='Include this data in every request.', dest='include', default={})
 args = parser.parse_args() # arguments to be parsed
 if args.quiet:
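
The two options that gained nargs='?' behave differently depending on whether a value follows the flag. A minimal stand-alone sketch (not part of the commit; option values are illustrative) of the standard argparse semantics:

import argparse

# Hypothetical parser mirroring only the two changed options
parser = argparse.ArgumentParser()
parser.add_argument('-oB', dest='burp_port', nargs='?', const=8080)
parser.add_argument('--passive', dest='passive', nargs='?', const='-')

print(parser.parse_args([]))                     # burp_port=None, passive=None (flags omitted)
print(parser.parse_args(['-oB']))                # burp_port=8080 (const kicks in, as an int)
print(parser.parse_args(['-oB', '9090']))        # burp_port='9090' (value kept, as a string)
print(parser.parse_args(['--passive']))          # passive='-' (const kicks in)
print(parser.parse_args(['--passive', 'example.com']))  # passive='example.com'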

View File

@@ -158,11 +158,12 @@ def reader(path, mode='string'):
     return ''.join([line for line in file])

+re_extract_js = re.compile(r'(?si)<script[^>]*>([^<].+?)</script')
+
 def extract_js(response):
     """
     extracts javascript from a given string
     """
-    return re.findall(r'(?s)<script[^>]+>([^<].+?)</script', response.lower(), re.I)
+    return re_extract_js.findall(response)

 def parse_headers(string):
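
For reference, a small sketch (sample HTML and output are illustrative, not from the repo) of what the pre-compiled pattern returns; the inline (?si) flags already make it case-insensitive and dot-matches-newline, which is why the response.lower() and re.I arguments could be dropped:

import re

re_extract_js = re.compile(r'(?si)<script[^>]*>([^<].+?)</script')

html = '''<HTML><body>
<script type="text/javascript">var token = '';</script>
<SCRIPT>var debug = false;
console.log(debug);</SCRIPT>
<script src="app.js"></script>
</body></HTML>'''

print(re_extract_js.findall(html))
# ["var token = '';", 'var debug = false;\nconsole.log(debug);']
# the empty <script src=...></script> pair yields nothing because of [^<]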

View File

@@ -4,7 +4,7 @@ from urllib.parse import urlparse
 def commoncrawl(host, page=0):
     these_params = set()
-    response = requests.get('http://index.commoncrawl.org/CC-MAIN-2020-29-index?url=*.%s&fl=url&page=%s&limit=10000' % (host, page)).text
+    response = requests.get('http://index.commoncrawl.org/CC-MAIN-2020-29-index?url=*.%s&fl=url&page=%s&limit=10000' % (host, page), verify=False).text
     if response.startswith('<!DOCTYPE html>'):
         return ([], False, 'commoncrawl')
     urls = response.split('\n')
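
verify=False disables TLS certificate verification, and urllib3 then emits an InsecureRequestWarning on every request. A hedged sketch of a companion step (not part of this diff) that silences the warning once at startup:

import requests
import urllib3

# Assumption: warnings are silenced globally before the passive sources are queried
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Illustrative call only; 'example.com' is a placeholder host
response = requests.get(
    'http://index.commoncrawl.org/CC-MAIN-2020-29-index',
    params={'url': '*.example.com', 'fl': 'url', 'page': 0, 'limit': 10000},
    verify=False
).text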

View File

@@ -2,25 +2,43 @@ import re
 from arjun.core.utils import extract_js

-def is_not_junk(string):
-    return re.match(r'^[A-Za-z0-9_]+$', string)
+re_not_junk = re.compile(r'^[A-Za-z0-9_]+$')
+def is_not_junk(param):
+    return (re_not_junk.match(param) is not None)

-def insert_words(words, wordlist, found):
-    if words:
-        for var in words:
-            if var not in found and is_not_junk(var):
-                found.append(var)
-                if var in wordlist:
-                    wordlist.remove(var)
-                wordlist.insert(0, var)
+# TODO: for map keys, javascript tolerates { param: "value" }
+re_input_names = re.compile(r'''(?i)<input.+?name=["']?([^"'\s>]+)''')
+re_input_ids = re.compile(r'''(?i)<input.+?id=["']?([^"'\s>]+)''')
+re_empty_vars = re.compile(r'''([^\s!=<>]+)\s*=\s*(?:['"`]{2}|true|false|null)''')
+re_map_keys = re.compile(r'''([^'"]+)['"]\s*:\s*['"`]''')

 def heuristic(response, wordlist):
-    found = []
-    inputs = re.findall(r'(?i)<input.+?name=["\']?([^"\'\s>]+)', response)
-    insert_words(inputs, wordlist, found)
+    potential_params = []
+
+    # Parse Inputs
+    input_names = re_input_names.findall(response)
+    potential_params += input_names
+    input_ids = re_input_ids.findall(response)
+    potential_params += input_ids
+
+    # Parse Scripts
     for script in extract_js(response):
-        empty_vars = re.findall(r'([^\s!=<>]+)\s*=\s*[\'"`][\'"`]', script)
-        insert_words(empty_vars, wordlist, found)
-        map_keys = re.findall(r'([^\'"]+)[\'"]:\s?[\'"]', script)
-        insert_words(map_keys, wordlist, found)
-    return found
+        empty_vars = re_empty_vars.findall(script)
+        potential_params += empty_vars
+        map_keys = re_map_keys.findall(script)
+        potential_params += map_keys
+
+    if len(potential_params) == 0:
+        return []
+
+    found = set()
+    for word in potential_params:
+        if is_not_junk(word) and (word not in found):
+            found.add(word)
+            if word in wordlist:
+                wordlist.remove(word)
+            wordlist.insert(0, word)
+
+    return list(found)
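
To make the new heuristic concrete, here is a stand-alone sketch (sample markup and outputs are illustrative, not from the repo) of what each pre-compiled pattern extracts before the results are de-duplicated, filtered through is_not_junk, and promoted to the front of the wordlist:

import re

re_input_names = re.compile(r'''(?i)<input.+?name=["']?([^"'\s>]+)''')
re_input_ids = re.compile(r'''(?i)<input.+?id=["']?([^"'\s>]+)''')
re_empty_vars = re.compile(r'''([^\s!=<>]+)\s*=\s*(?:['"`]{2}|true|false|null)''')
re_map_keys = re.compile(r'''([^'"]+)['"]\s*:\s*['"`]''')

html = '<input type="text" name="username"><input id="csrf_token" type="hidden">'
script = 'var debug = false; var q = ""; settings = {"theme": "dark"};'

print(re_input_names.findall(html))   # ['username']
print(re_input_ids.findall(html))     # ['csrf_token']
print(re_empty_vars.findall(script))  # ['debug', 'q']
print(re_map_keys.findall(script))    # ['theme']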

View File

@@ -4,7 +4,7 @@ from urllib.parse import urlparse
 def otx(host, page):
     these_params = set()
-    data = requests.get('https://otx.alienvault.com/api/v1/indicators/hostname/%s/url_list?limit=50&page=%d' % (host, page)).json()
+    data = requests.get('https://otx.alienvault.com/api/v1/indicators/hostname/%s/url_list?limit=50&page=%d' % (host, page), verify=False).json()
     if 'url_list' not in data:
         return (these_params, False, 'otx')
     for obj in data['url_list']:

View File

@@ -19,7 +19,8 @@ def wayback(host, page):
     response = requests.get(
         'http://web.archive.org/cdx/search?filter=mimetype:text/html&filter=statuscode:200',
         params=payload,
-        headers=headers
+        headers=headers,
+        verify=False
     ).text
     if not response:
         return (these_params, False, 'wayback')