Merge pull request #33 from ddworken/master

Added parameter to configure the bloom filter size
This commit is contained in:
Dan McInerney
2019-12-18 07:42:41 -07:00
committed by GitHub
4 changed files with 8 additions and 27 deletions

View File

@@ -1,23 +0,0 @@
from pybloom import BloomFilter
from scrapy.utils.job import job_dir
from scrapy.dupefilter import BaseDupeFilter
class BloomURLDupeFilter(BaseDupeFilter):
    """Duplicate-request filter that remembers seen URLs in a Bloom filter.

    Requests are identified by their raw ``request.url`` string rather
    than by a computed request fingerprint. Membership is tracked in a
    ``BloomFilter``, so memory use is bounded by the configured capacity.
    """

    def __init__(self, path=None, capacity=3000000, error_rate=0.0001):
        """Create an empty filter.

        Args:
            path: Job directory handed over by ``from_settings``. It is
                accepted for interface compatibility only; this class
                does not read from or write to it.
            capacity: First argument passed to ``BloomFilter`` (its
                sizing parameter). Defaults to the previously
                hard-coded value.
            error_rate: Second argument passed to ``BloomFilter``.
                Defaults to the previously hard-coded value.
        """
        # No on-disk state is kept; the attribute is set to None as before.
        self.file = None
        self.fingerprints = BloomFilter(capacity, error_rate)

    @classmethod
    def from_settings(cls, settings):
        """Build the filter from crawler settings using default sizing."""
        return cls(job_dir(settings))

    def request_seen(self, request):
        """Return True if ``request.url`` was already recorded, else False.

        An unseen URL is added to the filter before returning, so the
        next request with the same URL is reported as a duplicate.
        """
        url = request.url
        if url in self.fingerprints:
            return True
        self.fingerprints.add(url)
        # Explicit False instead of the implicit None the method used to
        # return; both are falsy, so existing callers behave the same.
        return False

    def close(self, reason):
        """Drop the reference to the Bloom filter; ``reason`` is unused."""
        self.fingerprints = None

View File

@@ -1,13 +1,14 @@
from pybloom import BloomFilter
from scrapy.utils.job import job_dir
from scrapy.dupefilters import BaseDupeFilter
from settings import bloomfilterSize
class BloomURLDupeFilter(BaseDupeFilter):
"""Request Fingerprint duplicates filter"""
def __init__(self, path=None):
self.file = None
self.fingerprints = BloomFilter(3000000, 0.0001)
self.fingerprints = BloomFilter(bloomfilterSize*10, 0.0001)
@classmethod
def from_settings(cls, settings):

View File

@@ -3,12 +3,13 @@ from urlparse import unquote
from pybloom import BloomFilter
import random
import re
from settings import bloomfilterSize
# Filter out duplicate requests with Bloom filters since they're much easier on memory
#URLS_FORMS_HEADERS = BloomFilter(3000000, 0.00001)
URLS_SEEN = BloomFilter(300000, .0001)
FORMS_SEEN = BloomFilter(300000, .0001)
HEADERS_SEEN = BloomFilter(300000, .0001)
URLS_SEEN = BloomFilter(bloomfilterSize, .0001)
FORMS_SEEN = BloomFilter(bloomfilterSize, .0001)
HEADERS_SEEN = BloomFilter(bloomfilterSize, .0001)
USER_AGENT_LIST = ['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14',

View File

@@ -38,3 +38,5 @@ ITEM_PIPELINES = {'xsscrapy.pipelines.XSSCharFinder':100}
CONCURRENT_REQUESTS = 30
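# Capacity passed to each Bloom filter used for duplicate detection (the
# request dupefilter uses 10x this value). If you get bloom filter problems,
# e.g. a filter running out of capacity on a large crawl, increase this number.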
bloomfilterSize = 300000