Merge pull request #33 from ddworken/master
Added parameter to configure the bloom filter size
This commit is contained in:
@@ -1,23 +0,0 @@
|
||||
from pybloom import BloomFilter
|
||||
from scrapy.utils.job import job_dir
|
||||
from scrapy.dupefilter import BaseDupeFilter
|
||||
|
||||
class BloomURLDupeFilter(BaseDupeFilter):
    """Duplicate-request filter that remembers seen URLs in a Bloom filter.

    Requests are keyed on their raw ``request.url`` string, not on a computed
    request fingerprint, so two requests with the same URL are treated as
    duplicates regardless of method, body or headers.
    """

    def __init__(self, path=None, capacity=3000000, error_rate=0.0001):
        """Create an empty filter.

        Args:
            path: Value produced by ``job_dir(settings)`` in ``from_settings``.
                Accepted for interface compatibility; this class does not use
                it and persists nothing.
            capacity: First argument handed to ``BloomFilter`` (the number of
                entries the filter is sized for).
            error_rate: Second argument handed to ``BloomFilter`` (the target
                false-positive probability).
        """
        self.file = None
        self.fingerprints = BloomFilter(capacity, error_rate)

    @classmethod
    def from_settings(cls, settings):
        """Build the filter from crawler settings using the default sizing."""
        return cls(job_dir(settings))

    def request_seen(self, request):
        """Return True if ``request.url`` was recorded before, else record it.

        Always returns a real bool: the original fell through and returned
        ``None`` for unseen URLs, which only worked because ``None`` is falsy.
        """
        url = request.url
        if url in self.fingerprints:
            return True
        self.fingerprints.add(url)
        return False

    def close(self, reason):
        """Drop the Bloom filter so its memory can be reclaimed."""
        self.fingerprints = None
|
||||
@@ -1,13 +1,14 @@
|
||||
from pybloom import BloomFilter
|
||||
from scrapy.utils.job import job_dir
|
||||
from scrapy.dupefilters import BaseDupeFilter
|
||||
from settings import bloomfilterSize
|
||||
|
||||
class BloomURLDupeFilter(BaseDupeFilter):
|
||||
"""Request Fingerprint duplicates filter"""
|
||||
|
||||
def __init__(self, path=None):
|
||||
self.file = None
|
||||
self.fingerprints = BloomFilter(3000000, 0.0001)
|
||||
self.fingerprints = BloomFilter(bloomfilterSize*10, 0.0001)
|
||||
|
||||
@classmethod
|
||||
def from_settings(cls, settings):
|
||||
|
||||
@@ -3,12 +3,13 @@ from urlparse import unquote
|
||||
from pybloom import BloomFilter
|
||||
import random
|
||||
import re
|
||||
from settings import bloomfilterSize
|
||||
|
||||
# Filter out duplicate requests with Bloom filters since they're much easier on memory
|
||||
#URLS_FORMS_HEADERS = BloomFilter(3000000, 0.00001)
|
||||
URLS_SEEN = BloomFilter(300000, .0001)
|
||||
FORMS_SEEN = BloomFilter(300000, .0001)
|
||||
HEADERS_SEEN = BloomFilter(300000, .0001)
|
||||
URLS_SEEN = BloomFilter(bloomfilterSize, .0001)
|
||||
FORMS_SEEN = BloomFilter(bloomfilterSize, .0001)
|
||||
HEADERS_SEEN = BloomFilter(bloomfilterSize, .0001)
|
||||
USER_AGENT_LIST = ['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14',
|
||||
|
||||
@@ -38,3 +38,5 @@ ITEM_PIPELINES = {'xsscrapy.pipelines.XSSCharFinder':100}
|
||||
|
||||
CONCURRENT_REQUESTS = 30
|
||||
|
||||
# Capacity (expected number of entries) used to size the Bloom filters; increase it if a filter fills up on large crawls
|
||||
bloomfilterSize = 300000
|
||||
|
||||
Reference in New Issue
Block a user