import re
def zetanize(response):
def e(string):
return string.encode('utf-8')
def d(string):
return string.decode('utf-8')
# remove the content between html comments
response = re.sub(r'(?s)', '', response)
forms = {}
matches = re.findall(r'(?i)(?s)
',
response) # extract all the forms
num = 0
for match in matches: # everything else is self explanatory if you know regex
page = re.search(r'(?i)action=[\'"](.*?)[\'"]', match)
method = re.search(r'(?i)method=[\'"](.*?)[\'"]', match)
forms[num] = {}
forms[num]['action'] = d(e(page.group(1))) if page else ''
forms[num]['method'] = d(
e(method.group(1)).lower()) if method else 'get'
forms[num]['inputs'] = []
inputs = re.findall(r'(?i)(?s)', response)
for inp in inputs:
inpName = re.search(r'(?i)name=[\'"](.*?)[\'"]', inp)
if inpName:
inpType = re.search(r'(?i)type=[\'"](.*?)[\'"]', inp)
inpValue = re.search(r'(?i)value=[\'"](.*?)[\'"]', inp)
inpName = d(e(inpName.group(1)))
inpType = d(e(inpType.group(1)))if inpType else ''
inpValue = d(e(inpValue.group(1))) if inpValue else ''
if inpType.lower() == 'submit' and inpValue == '':
inpValue = 'Submit Query'
inpDict = {
'name': inpName,
'type': inpType,
'value': inpValue
}
forms[num]['inputs'].append(inpDict)
num += 1
return forms