Files
Arjun/arjun/plugins/commoncrawl.py
Somdev Sangwan 1b11c3574e 2.2.7 build
2024-11-04 01:59:30 +05:30

16 lines
514 B
Python

import requests
from urllib.parse import urlparse
def commoncrawl(host, page=0):
    """Collect query-parameter names for *host* from the Common Crawl index.

    Requests one page of the ``CC-MAIN-2024-42`` index for URLs matching
    ``*.<host>`` and extracts the name part of every ``name=value`` pair
    found in the query strings of the returned URLs.

    Args:
        host: Host name substituted into the ``*.%s`` URL pattern.
        page: Index result page to request (defaults to the first page).

    Returns:
        A 3-tuple ``(params, ok, 'commoncrawl')``. When the response body
        starts with an HTML doctype the lookup is treated as failed and
        ``([], False, 'commoncrawl')`` is returned; otherwise ``params`` is
        the set of non-empty parameter names and ``ok`` is ``True``.
    """
    # NOTE(review): no timeout is passed, so this call can block for as long
    # as the server keeps the connection open; exceptions raised by requests
    # propagate to the caller unchanged.
    response = requests.get(
        'http://index.commoncrawl.org/CC-MAIN-2024-42-index?url=*.%s&fl=url&page=%s&limit=10000' % (host, page),
        verify=False,
    ).text

    # The index answers with plain text (one URL per line); an HTML document
    # is not a result listing.
    if response.startswith('<!DOCTYPE html>'):
        return ([], False, 'commoncrawl')

    these_params = set()
    for url in response.split('\n'):
        for param in urlparse(url).query.split('&'):
            name = param.split('=')[0]
            # URLs without a query string, blank lines and fragments such as
            # "&&" or "=value" produce an empty name; it is not a parameter.
            if name:
                these_params.add(name)
    return (these_params, True, 'commoncrawl')