fixed bug in quote parser
This commit is contained in:
Binary file not shown.
@@ -1 +0,0 @@
|
||||
include ez_setup.py
|
||||
@@ -1,22 +0,0 @@
|
||||
Metadata-Version: 1.0
|
||||
Name: pybloom
|
||||
Version: 1.1
|
||||
Summary: PyBloom: A Probabilistic data structure
|
||||
Home-page: http://github.com/jaybaird/python-bloomfilter/
|
||||
Author: Jay Baird
|
||||
Author-email: jay.baird@me.com
|
||||
License: MIT License
|
||||
Description:
|
||||
pybloom is a Python implementation of the bloom filter probabilistic data
|
||||
structure. The module also provides a Scalable Bloom Filter that allows a
|
||||
bloom filter to grow without knowing the original set size.
|
||||
|
||||
Keywords: data structures,bloom filter,bloom,filter,probabilistic,set
|
||||
Platform: any
|
||||
Classifier: Intended Audience :: Developers
|
||||
Classifier: License :: OSI Approved :: MIT License
|
||||
Classifier: Programming Language :: Python
|
||||
Classifier: Operating System :: OS Independent
|
||||
Classifier: Topic :: Utilities
|
||||
Classifier: Topic :: Database :: Database Engines/Servers
|
||||
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
||||
@@ -1,270 +0,0 @@
|
||||
#!python
|
||||
"""Bootstrap setuptools installation
|
||||
|
||||
If you want to use setuptools in your package's setup.py, just include this
|
||||
file in the same directory with it, and add this to the top of your setup.py::
|
||||
|
||||
from ez_setup import use_setuptools
|
||||
use_setuptools()
|
||||
|
||||
If you want to require a specific version of setuptools, set a download
|
||||
mirror, or use an alternate download directory, you can do so by supplying
|
||||
the appropriate options to ``use_setuptools()``.
|
||||
|
||||
This file can also be run as a script to install or upgrade setuptools.
|
||||
"""
|
||||
import sys
|
||||
DEFAULT_VERSION = "0.6c9"
|
||||
DEFAULT_URL = "http://pypi.python.org/packages/%s/s/setuptools/" % sys.version[:3]
|
||||
|
||||
md5_data = {
|
||||
'setuptools-0.6b1-py2.3.egg': '8822caf901250d848b996b7f25c6e6ca',
|
||||
'setuptools-0.6b1-py2.4.egg': 'b79a8a403e4502fbb85ee3f1941735cb',
|
||||
'setuptools-0.6b2-py2.3.egg': '5657759d8a6d8fc44070a9d07272d99b',
|
||||
'setuptools-0.6b2-py2.4.egg': '4996a8d169d2be661fa32a6e52e4f82a',
|
||||
'setuptools-0.6b3-py2.3.egg': 'bb31c0fc7399a63579975cad9f5a0618',
|
||||
'setuptools-0.6b3-py2.4.egg': '38a8c6b3d6ecd22247f179f7da669fac',
|
||||
'setuptools-0.6b4-py2.3.egg': '62045a24ed4e1ebc77fe039aa4e6f7e5',
|
||||
'setuptools-0.6b4-py2.4.egg': '4cb2a185d228dacffb2d17f103b3b1c4',
|
||||
'setuptools-0.6c1-py2.3.egg': 'b3f2b5539d65cb7f74ad79127f1a908c',
|
||||
'setuptools-0.6c1-py2.4.egg': 'b45adeda0667d2d2ffe14009364f2a4b',
|
||||
'setuptools-0.6c2-py2.3.egg': 'f0064bf6aa2b7d0f3ba0b43f20817c27',
|
||||
'setuptools-0.6c2-py2.4.egg': '616192eec35f47e8ea16cd6a122b7277',
|
||||
'setuptools-0.6c3-py2.3.egg': 'f181fa125dfe85a259c9cd6f1d7b78fa',
|
||||
'setuptools-0.6c3-py2.4.egg': 'e0ed74682c998bfb73bf803a50e7b71e',
|
||||
'setuptools-0.6c3-py2.5.egg': 'abef16fdd61955514841c7c6bd98965e',
|
||||
'setuptools-0.6c4-py2.3.egg': 'b0b9131acab32022bfac7f44c5d7971f',
|
||||
'setuptools-0.6c4-py2.4.egg': '2a1f9656d4fbf3c97bf946c0a124e6e2',
|
||||
'setuptools-0.6c4-py2.5.egg': '8f5a052e32cdb9c72bcf4b5526f28afc',
|
||||
'setuptools-0.6c5-py2.3.egg': 'ee9fd80965da04f2f3e6b3576e9d8167',
|
||||
'setuptools-0.6c5-py2.4.egg': 'afe2adf1c01701ee841761f5bcd8aa64',
|
||||
'setuptools-0.6c5-py2.5.egg': 'a8d3f61494ccaa8714dfed37bccd3d5d',
|
||||
'setuptools-0.6c6-py2.3.egg': '35686b78116a668847237b69d549ec20',
|
||||
'setuptools-0.6c6-py2.4.egg': '3c56af57be3225019260a644430065ab',
|
||||
'setuptools-0.6c6-py2.5.egg': 'b2f8a7520709a5b34f80946de5f02f53',
|
||||
'setuptools-0.6c7-py2.3.egg': '209fdf9adc3a615e5115b725658e13e2',
|
||||
'setuptools-0.6c7-py2.4.egg': '5a8f954807d46a0fb67cf1f26c55a82e',
|
||||
'setuptools-0.6c7-py2.5.egg': '45d2ad28f9750e7434111fde831e8372',
|
||||
'setuptools-0.6c8-py2.3.egg': '50759d29b349db8cfd807ba8303f1902',
|
||||
'setuptools-0.6c8-py2.4.egg': 'cba38d74f7d483c06e9daa6070cce6de',
|
||||
'setuptools-0.6c8-py2.5.egg': '1721747ee329dc150590a58b3e1ac95b',
|
||||
'setuptools-0.6c9-py2.3.egg': 'a83c4020414807b496e4cfbe08507c03',
|
||||
'setuptools-0.6c9-py2.4.egg': '260a2be2e5388d66bdaee06abec6342a',
|
||||
'setuptools-0.6c9-py2.5.egg': 'fe67c3e5a17b12c0e7c541b7ea43a8e6',
|
||||
'setuptools-0.6c9-py2.6.egg': 'ca37b1ff16fa2ede6e19383e7b59245a',
|
||||
}
|
||||
|
||||
import sys, os
|
||||
try: from hashlib import md5
|
||||
except ImportError: from md5 import md5
|
||||
|
||||
def _validate_md5(egg_name, data):
|
||||
if egg_name in md5_data:
|
||||
digest = md5(data).hexdigest()
|
||||
if digest != md5_data[egg_name]:
|
||||
print >>sys.stderr, (
|
||||
"md5 validation of %s failed! (Possible download problem?)"
|
||||
% egg_name
|
||||
)
|
||||
sys.exit(2)
|
||||
return data
|
||||
|
||||
def use_setuptools(
|
||||
version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir,
|
||||
download_delay=15
|
||||
):
|
||||
"""Automatically find/download setuptools and make it available on sys.path
|
||||
|
||||
`version` should be a valid setuptools version number that is available
|
||||
as an egg for download under the `download_base` URL (which should end with
|
||||
a '/'). `to_dir` is the directory where setuptools will be downloaded, if
|
||||
it is not already available. If `download_delay` is specified, it should
|
||||
be the number of seconds that will be paused before initiating a download,
|
||||
should one be required. If an older version of setuptools is installed,
|
||||
this routine will print a message to ``sys.stderr`` and raise SystemExit in
|
||||
an attempt to abort the calling script.
|
||||
"""
|
||||
was_imported = 'pkg_resources' in sys.modules or 'setuptools' in sys.modules
|
||||
def do_download():
|
||||
egg = download_setuptools(version, download_base, to_dir, download_delay)
|
||||
sys.path.insert(0, egg)
|
||||
import setuptools; setuptools.bootstrap_install_from = egg
|
||||
try:
|
||||
import pkg_resources
|
||||
except ImportError:
|
||||
return do_download()
|
||||
try:
|
||||
pkg_resources.require("setuptools>="+version); return
|
||||
except pkg_resources.VersionConflict, e:
|
||||
if was_imported:
|
||||
print >>sys.stderr, (
|
||||
"The required version of setuptools (>=%s) is not available, and\n"
|
||||
"can't be installed while this script is running. Please install\n"
|
||||
" a more recent version first, using 'easy_install -U setuptools'."
|
||||
"\n\n(Currently using %r)"
|
||||
) % (version, e.args[0])
|
||||
sys.exit(2)
|
||||
else:
|
||||
del pkg_resources, sys.modules['pkg_resources'] # reload ok
|
||||
return do_download()
|
||||
except pkg_resources.DistributionNotFound:
|
||||
return do_download()
|
||||
|
||||
def download_setuptools(
|
||||
version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir,
|
||||
delay = 15
|
||||
):
|
||||
"""Download setuptools from a specified location and return its filename
|
||||
|
||||
`version` should be a valid setuptools version number that is available
|
||||
as an egg for download under the `download_base` URL (which should end
|
||||
with a '/'). `to_dir` is the directory where the egg will be downloaded.
|
||||
`delay` is the number of seconds to pause before an actual download attempt.
|
||||
"""
|
||||
import urllib2, shutil
|
||||
egg_name = "setuptools-%s-py%s.egg" % (version,sys.version[:3])
|
||||
url = download_base + egg_name
|
||||
saveto = os.path.join(to_dir, egg_name)
|
||||
src = dst = None
|
||||
if not os.path.exists(saveto): # Avoid repeated downloads
|
||||
try:
|
||||
from distutils import log
|
||||
if delay:
|
||||
log.warn("""
|
||||
---------------------------------------------------------------------------
|
||||
This script requires setuptools version %s to run (even to display
|
||||
help). I will attempt to download it for you (from
|
||||
%s), but
|
||||
you may need to enable firewall access for this script first.
|
||||
I will start the download in %d seconds.
|
||||
|
||||
(Note: if this machine does not have network access, please obtain the file
|
||||
|
||||
%s
|
||||
|
||||
and place it in this directory before rerunning this script.)
|
||||
---------------------------------------------------------------------------""",
|
||||
version, download_base, delay, url
|
||||
); from time import sleep; sleep(delay)
|
||||
log.warn("Downloading %s", url)
|
||||
src = urllib2.urlopen(url)
|
||||
# Read/write all in one block, so we don't create a corrupt file
|
||||
# if the download is interrupted.
|
||||
data = _validate_md5(egg_name, src.read())
|
||||
dst = open(saveto,"wb"); dst.write(data)
|
||||
finally:
|
||||
if src: src.close()
|
||||
if dst: dst.close()
|
||||
return os.path.realpath(saveto)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def main(argv, version=DEFAULT_VERSION):
|
||||
"""Install or upgrade setuptools and EasyInstall"""
|
||||
try:
|
||||
import setuptools
|
||||
except ImportError:
|
||||
egg = None
|
||||
try:
|
||||
egg = download_setuptools(version, delay=0)
|
||||
sys.path.insert(0,egg)
|
||||
from setuptools.command.easy_install import main
|
||||
return main(list(argv)+[egg]) # we're done here
|
||||
finally:
|
||||
if egg and os.path.exists(egg):
|
||||
os.unlink(egg)
|
||||
else:
|
||||
if setuptools.__version__ == '0.0.1':
|
||||
print >>sys.stderr, (
|
||||
"You have an obsolete version of setuptools installed. Please\n"
|
||||
"remove it from your system entirely before rerunning this script."
|
||||
)
|
||||
sys.exit(2)
|
||||
|
||||
req = "setuptools>="+version
|
||||
import pkg_resources
|
||||
try:
|
||||
pkg_resources.require(req)
|
||||
except pkg_resources.VersionConflict:
|
||||
try:
|
||||
from setuptools.command.easy_install import main
|
||||
except ImportError:
|
||||
from easy_install import main
|
||||
main(list(argv)+[download_setuptools(delay=0)])
|
||||
sys.exit(0) # try to force an exit
|
||||
else:
|
||||
if argv:
|
||||
from setuptools.command.easy_install import main
|
||||
main(argv)
|
||||
else:
|
||||
print "Setuptools version",version,"or greater has been installed."
|
||||
print '(Run "ez_setup.py -U setuptools" to reinstall or upgrade.)'
|
||||
|
||||
def update_md5(filenames):
|
||||
"""Update our built-in md5 registry"""
|
||||
|
||||
import re
|
||||
|
||||
for name in filenames:
|
||||
base = os.path.basename(name)
|
||||
f = open(name,'rb')
|
||||
md5_data[base] = md5(f.read()).hexdigest()
|
||||
f.close()
|
||||
|
||||
data = [" %r: %r,\n" % it for it in md5_data.items()]
|
||||
data.sort()
|
||||
repl = "".join(data)
|
||||
|
||||
import inspect
|
||||
srcfile = inspect.getsourcefile(sys.modules[__name__])
|
||||
f = open(srcfile, 'rb'); src = f.read(); f.close()
|
||||
|
||||
match = re.search("\nmd5_data = {\n([^}]+)}", src)
|
||||
if not match:
|
||||
print >>sys.stderr, "Internal error!"
|
||||
sys.exit(2)
|
||||
|
||||
src = src[:match.start(1)] + repl + src[match.end(1):]
|
||||
f = open(srcfile,'w')
|
||||
f.write(src)
|
||||
f.close()
|
||||
|
||||
|
||||
if __name__=='__main__':
|
||||
if len(sys.argv)>2 and sys.argv[1]=='--md5update':
|
||||
update_md5(sys.argv[2:])
|
||||
else:
|
||||
main(sys.argv[1:])
|
||||
@@ -1,22 +0,0 @@
|
||||
Metadata-Version: 1.1
|
||||
Name: pybloom
|
||||
Version: 1.1
|
||||
Summary: PyBloom: A Probabilistic data structure
|
||||
Home-page: http://github.com/jaybaird/python-bloomfilter/
|
||||
Author: Jay Baird
|
||||
Author-email: jay.baird@me.com
|
||||
License: MIT License
|
||||
Description:
|
||||
pybloom is a Python implementation of the bloom filter probabilistic data
|
||||
structure. The module also provides a Scalable Bloom Filter that allows a
|
||||
bloom filter to grow without knowing the original set size.
|
||||
|
||||
Keywords: data structures,bloom filter,bloom,filter,probabilistic,set
|
||||
Platform: any
|
||||
Classifier: Intended Audience :: Developers
|
||||
Classifier: License :: OSI Approved :: MIT License
|
||||
Classifier: Programming Language :: Python
|
||||
Classifier: Operating System :: OS Independent
|
||||
Classifier: Topic :: Utilities
|
||||
Classifier: Topic :: Database :: Database Engines/Servers
|
||||
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
||||
@@ -1 +0,0 @@
|
||||
|
||||
@@ -1,22 +0,0 @@
|
||||
Metadata-Version: 1.0
|
||||
Name: pybloom
|
||||
Version: 1.1
|
||||
Summary: PyBloom: A Probabilistic data structure
|
||||
Home-page: http://github.com/jaybaird/python-bloomfilter/
|
||||
Author: Jay Baird
|
||||
Author-email: jay.baird@me.com
|
||||
License: MIT License
|
||||
Description:
|
||||
pybloom is a Python implementation of the bloom filter probabilistic data
|
||||
structure. The module also provides a Scalable Bloom Filter that allows a
|
||||
bloom filter to grow without knowing the original set size.
|
||||
|
||||
Keywords: data structures,bloom filter,bloom,filter,probabilistic,set
|
||||
Platform: any
|
||||
Classifier: Intended Audience :: Developers
|
||||
Classifier: License :: OSI Approved :: MIT License
|
||||
Classifier: Programming Language :: Python
|
||||
Classifier: Operating System :: OS Independent
|
||||
Classifier: Topic :: Utilities
|
||||
Classifier: Topic :: Database :: Database Engines/Servers
|
||||
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
||||
@@ -1 +0,0 @@
|
||||
|
||||
Binary file not shown.
@@ -1,5 +0,0 @@
|
||||
"""pybloom
|
||||
|
||||
"""
|
||||
from pybloom import BloomFilter, ScalableBloomFilter, __version__, __author__
|
||||
|
||||
@@ -1,409 +0,0 @@
|
||||
# -*- encoding: utf-8 -*-
|
||||
"""This module implements a bloom filter probabilistic data structure and
|
||||
an a Scalable Bloom Filter that grows in size as your add more items to it
|
||||
without increasing the false positive error_rate.
|
||||
|
||||
Requires the bitarray library: http://pypi.python.org/pypi/bitarray/
|
||||
|
||||
>>> from pybloom import BloomFilter
|
||||
>>> f = BloomFilter(capacity=10000, error_rate=0.001)
|
||||
>>> for i in xrange(0, f.capacity):
|
||||
... _ = f.add(i)
|
||||
...
|
||||
>>> 0 in f
|
||||
True
|
||||
>>> f.capacity in f
|
||||
False
|
||||
>>> len(f) <= f.capacity
|
||||
True
|
||||
>>> abs((len(f) / float(f.capacity)) - 1.0) <= f.error_rate
|
||||
True
|
||||
|
||||
>>> from pybloom import ScalableBloomFilter
|
||||
>>> sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
|
||||
>>> count = 10000
|
||||
>>> for i in xrange(0, count):
|
||||
... _ = sbf.add(i)
|
||||
...
|
||||
>>> sbf.capacity > count
|
||||
True
|
||||
>>> len(sbf) <= count
|
||||
True
|
||||
>>> abs((len(sbf) / float(count)) - 1.0) <= sbf.error_rate
|
||||
True
|
||||
|
||||
"""
|
||||
import math
|
||||
import hashlib
|
||||
from struct import unpack, pack, calcsize
|
||||
|
||||
try:
|
||||
import bitarray
|
||||
except ImportError:
|
||||
raise ImportError('pybloom requires bitarray >= 0.3.4')
|
||||
|
||||
__version__ = '1.1'
|
||||
__author__ = "Jay Baird <jay@mochimedia.com>, Bob Ippolito <bob@redivi.com>,\
|
||||
Marius Eriksen <marius@monkey.org>,\
|
||||
Alex Brasetvik <alex@brasetvik.com>"
|
||||
|
||||
def make_hashfuncs(num_slices, num_bits):
|
||||
if num_bits >= (1 << 31):
|
||||
fmt_code, chunk_size = 'Q', 8
|
||||
elif num_bits >= (1 << 15):
|
||||
fmt_code, chunk_size = 'I', 4
|
||||
else:
|
||||
fmt_code, chunk_size = 'H', 2
|
||||
total_hash_bits = 8 * num_slices * chunk_size
|
||||
if total_hash_bits > 384:
|
||||
hashfn = hashlib.sha512
|
||||
elif total_hash_bits > 256:
|
||||
hashfn = hashlib.sha384
|
||||
elif total_hash_bits > 160:
|
||||
hashfn = hashlib.sha256
|
||||
elif total_hash_bits > 128:
|
||||
hashfn = hashlib.sha1
|
||||
else:
|
||||
hashfn = hashlib.md5
|
||||
fmt = fmt_code * (hashfn().digest_size // chunk_size)
|
||||
num_salts, extra = divmod(num_slices, len(fmt))
|
||||
if extra:
|
||||
num_salts += 1
|
||||
salts = [hashfn(hashfn(pack('I', i)).digest()) for i in xrange(num_salts)]
|
||||
def _make_hashfuncs(key):
|
||||
if isinstance(key, unicode):
|
||||
key = key.encode('utf-8')
|
||||
else:
|
||||
key = str(key)
|
||||
rval = []
|
||||
for salt in salts:
|
||||
h = salt.copy()
|
||||
h.update(key)
|
||||
rval.extend(uint % num_bits for uint in unpack(fmt, h.digest()))
|
||||
del rval[num_slices:]
|
||||
return rval
|
||||
return _make_hashfuncs
|
||||
|
||||
|
||||
class BloomFilter(object):
|
||||
FILE_FMT = '<dQQQQ'
|
||||
|
||||
def __init__(self, capacity, error_rate=0.001):
|
||||
"""Implements a space-efficient probabilistic data structure
|
||||
|
||||
capacity
|
||||
this BloomFilter must be able to store at least *capacity* elements
|
||||
while maintaining no more than *error_rate* chance of false
|
||||
positives
|
||||
error_rate
|
||||
the error_rate of the filter returning false positives. This
|
||||
determines the filters capacity. Inserting more than capacity
|
||||
elements greatly increases the chance of false positives.
|
||||
|
||||
>>> b = BloomFilter(capacity=100000, error_rate=0.001)
|
||||
>>> b.add("test")
|
||||
False
|
||||
>>> "test" in b
|
||||
True
|
||||
|
||||
"""
|
||||
if not (0 < error_rate < 1):
|
||||
raise ValueError("Error_Rate must be between 0 and 1.")
|
||||
if not capacity > 0:
|
||||
raise ValueError("Capacity must be > 0")
|
||||
# given M = num_bits, k = num_slices, p = error_rate, n = capacity
|
||||
# solving for m = bits_per_slice
|
||||
# n ~= M * ((ln(2) ** 2) / abs(ln(P)))
|
||||
# n ~= (k * m) * ((ln(2) ** 2) / abs(ln(P)))
|
||||
# m ~= n * abs(ln(P)) / (k * (ln(2) ** 2))
|
||||
num_slices = int(math.ceil(math.log(1 / error_rate, 2)))
|
||||
# the error_rate constraint assumes a fill rate of 1/2
|
||||
# so we double the capacity to simplify the API
|
||||
bits_per_slice = int(math.ceil(
|
||||
(2 * capacity * abs(math.log(error_rate))) /
|
||||
(num_slices * (math.log(2) ** 2))))
|
||||
self._setup(error_rate, num_slices, bits_per_slice, capacity, 0)
|
||||
self.bitarray = bitarray.bitarray(self.num_bits, endian='little')
|
||||
self.bitarray.setall(False)
|
||||
|
||||
def _setup(self, error_rate, num_slices, bits_per_slice, capacity, count):
|
||||
self.error_rate = error_rate
|
||||
self.num_slices = num_slices
|
||||
self.bits_per_slice = bits_per_slice
|
||||
self.capacity = capacity
|
||||
self.num_bits = num_slices * bits_per_slice
|
||||
self.count = count
|
||||
self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice)
|
||||
|
||||
def __contains__(self, key):
|
||||
"""Tests a key's membership in this bloom filter.
|
||||
|
||||
>>> b = BloomFilter(capacity=100)
|
||||
>>> b.add("hello")
|
||||
False
|
||||
>>> "hello" in b
|
||||
True
|
||||
|
||||
"""
|
||||
bits_per_slice = self.bits_per_slice
|
||||
bitarray = self.bitarray
|
||||
if not isinstance(key, list):
|
||||
hashes = self.make_hashes(key)
|
||||
else:
|
||||
hashes = key
|
||||
offset = 0
|
||||
for k in hashes:
|
||||
if not bitarray[offset + k]:
|
||||
return False
|
||||
offset += bits_per_slice
|
||||
return True
|
||||
|
||||
def __len__(self):
|
||||
"""Return the number of keys stored by this bloom filter."""
|
||||
return self.count
|
||||
|
||||
def add(self, key, skip_check=False):
|
||||
""" Adds a key to this bloom filter. If the key already exists in this
|
||||
filter it will return True. Otherwise False.
|
||||
|
||||
>>> b = BloomFilter(capacity=100)
|
||||
>>> b.add("hello")
|
||||
False
|
||||
>>> b.add("hello")
|
||||
True
|
||||
|
||||
"""
|
||||
bitarray = self.bitarray
|
||||
bits_per_slice = self.bits_per_slice
|
||||
hashes = self.make_hashes(key)
|
||||
if not skip_check and hashes in self:
|
||||
return True
|
||||
if self.count > self.capacity:
|
||||
raise IndexError("BloomFilter is at capacity")
|
||||
offset = 0
|
||||
for k in hashes:
|
||||
self.bitarray[offset + k] = True
|
||||
offset += bits_per_slice
|
||||
self.count += 1
|
||||
return False
|
||||
|
||||
def copy(self):
|
||||
"""Return a copy of this bloom filter.
|
||||
"""
|
||||
new_filter = BloomFilter(self.capacity, self.error_rate)
|
||||
new_filter.bitarray = self.bitarray.copy()
|
||||
return new_filter
|
||||
|
||||
def union(self, other):
|
||||
""" Calculates the union of the two underlying bitarrays and returns
|
||||
a new bloom filter object."""
|
||||
if self.capacity != other.capacity or \
|
||||
self.error_rate != other.error_rate:
|
||||
raise ValueError("Unioning filters requires both filters to have \
|
||||
both the same capacity and error rate")
|
||||
new_bloom = self.copy()
|
||||
new_bloom.bitarray = new_bloom.bitarray | other.bitarray
|
||||
return new_bloom
|
||||
|
||||
def __or__(self, other):
|
||||
return self.union(other)
|
||||
|
||||
def intersection(self, other):
|
||||
""" Calculates the union of the two underlying bitarrays and returns
|
||||
a new bloom filter object."""
|
||||
if self.capacity != other.capacity or \
|
||||
self.error_rate != other.error_rate:
|
||||
raise ValueError("Intersecting filters requires both filters to \
|
||||
have equal capacity and error rate")
|
||||
new_bloom = self.copy()
|
||||
new_bloom.bitarray = new_bloom.bitarray & other.bitarray
|
||||
return new_bloom
|
||||
|
||||
def __and__(self, other):
|
||||
return self.intersection(other)
|
||||
|
||||
def tofile(self, f):
|
||||
"""Write the bloom filter to file object `f'. Underlying bits
|
||||
are written as machine values. This is much more space
|
||||
efficient than pickling the object."""
|
||||
f.write(pack(self.FILE_FMT, self.error_rate, self.num_slices,
|
||||
self.bits_per_slice, self.capacity, self.count))
|
||||
self.bitarray.tofile(f)
|
||||
|
||||
@classmethod
|
||||
def fromfile(cls, f, n=-1):
|
||||
"""Read a bloom filter from file-object `f' serialized with
|
||||
``BloomFilter.tofile''. If `n' > 0 read only so many bytes."""
|
||||
headerlen = calcsize(cls.FILE_FMT)
|
||||
|
||||
if 0 < n < headerlen:
|
||||
raise ValueError, 'n too small!'
|
||||
|
||||
filter = cls(1) # Bogus instantiation, we will `_setup'.
|
||||
filter._setup(*unpack(cls.FILE_FMT, f.read(headerlen)))
|
||||
filter.bitarray = bitarray.bitarray(endian='little')
|
||||
if n > 0:
|
||||
filter.bitarray.fromfile(f, n - headerlen)
|
||||
else:
|
||||
filter.bitarray.fromfile(f)
|
||||
if filter.num_bits != filter.bitarray.length() and \
|
||||
(filter.num_bits + (8 - filter.num_bits % 8)
|
||||
!= filter.bitarray.length()):
|
||||
raise ValueError, 'Bit length mismatch!'
|
||||
|
||||
return filter
|
||||
|
||||
def __getstate__(self):
|
||||
d = self.__dict__.copy()
|
||||
del d['make_hashes']
|
||||
return d
|
||||
|
||||
def __setstate__(self, d):
|
||||
self.__dict__.update(d)
|
||||
self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice)
|
||||
|
||||
class ScalableBloomFilter(object):
|
||||
SMALL_SET_GROWTH = 2 # slower, but takes up less memory
|
||||
LARGE_SET_GROWTH = 4 # faster, but takes up more memory faster
|
||||
FILE_FMT = '<idQd'
|
||||
|
||||
def __init__(self, initial_capacity=100, error_rate=0.001,
|
||||
mode=SMALL_SET_GROWTH):
|
||||
"""Implements a space-efficient probabilistic data structure that
|
||||
grows as more items are added while maintaining a steady false
|
||||
positive rate
|
||||
|
||||
initial_capacity
|
||||
the initial capacity of the filter
|
||||
error_rate
|
||||
the error_rate of the filter returning false positives. This
|
||||
determines the filters capacity. Going over capacity greatly
|
||||
increases the chance of false positives.
|
||||
mode
|
||||
can be either ScalableBloomFilter.SMALL_SET_GROWTH or
|
||||
ScalableBloomFilter.LARGE_SET_GROWTH. SMALL_SET_GROWTH is slower
|
||||
but uses less memory. LARGE_SET_GROWTH is faster but consumes
|
||||
memory faster.
|
||||
|
||||
>>> b = ScalableBloomFilter(initial_capacity=512, error_rate=0.001, \
|
||||
mode=ScalableBloomFilter.SMALL_SET_GROWTH)
|
||||
>>> b.add("test")
|
||||
False
|
||||
>>> "test" in b
|
||||
True
|
||||
>>> unicode_string = u'¡'
|
||||
>>> b.add(unicode_string)
|
||||
False
|
||||
>>> unicode_string in b
|
||||
True
|
||||
"""
|
||||
if not error_rate or error_rate < 0:
|
||||
raise ValueError("Error_Rate must be a decimal less than 0.")
|
||||
self._setup(mode, 0.9, initial_capacity, error_rate)
|
||||
self.filters = []
|
||||
|
||||
def _setup(self, mode, ratio, initial_capacity, error_rate):
|
||||
self.scale = mode
|
||||
self.ratio = ratio
|
||||
self.initial_capacity = initial_capacity
|
||||
self.error_rate = error_rate
|
||||
|
||||
def __contains__(self, key):
|
||||
"""Tests a key's membership in this bloom filter.
|
||||
|
||||
>>> b = ScalableBloomFilter(initial_capacity=100, error_rate=0.001, \
|
||||
mode=ScalableBloomFilter.SMALL_SET_GROWTH)
|
||||
>>> b.add("hello")
|
||||
False
|
||||
>>> "hello" in b
|
||||
True
|
||||
|
||||
"""
|
||||
for f in reversed(self.filters):
|
||||
if key in f:
|
||||
return True
|
||||
return False
|
||||
|
||||
def add(self, key):
|
||||
"""Adds a key to this bloom filter.
|
||||
If the key already exists in this filter it will return True.
|
||||
Otherwise False.
|
||||
|
||||
>>> b = ScalableBloomFilter(initial_capacity=100, error_rate=0.001, \
|
||||
mode=ScalableBloomFilter.SMALL_SET_GROWTH)
|
||||
>>> b.add("hello")
|
||||
False
|
||||
>>> b.add("hello")
|
||||
True
|
||||
|
||||
"""
|
||||
if key in self:
|
||||
return True
|
||||
filter = self.filters[-1] if self.filters else None
|
||||
if filter is None or filter.count >= filter.capacity:
|
||||
num_filters = len(self.filters)
|
||||
filter = BloomFilter(
|
||||
capacity=self.initial_capacity * (self.scale ** num_filters),
|
||||
error_rate=self.error_rate * (self.ratio ** num_filters))
|
||||
self.filters.append(filter)
|
||||
filter.add(key, skip_check=True)
|
||||
return False
|
||||
|
||||
@property
|
||||
def capacity(self):
|
||||
"""Returns the total capacity for all filters in this SBF"""
|
||||
return sum([f.capacity for f in self.filters])
|
||||
|
||||
@property
|
||||
def count(self):
|
||||
return len(self)
|
||||
|
||||
def tofile(self, f):
|
||||
"""Serialize this ScalableBloomFilter into the file-object
|
||||
`f'."""
|
||||
f.write(pack(self.FILE_FMT, self.scale, self.ratio,
|
||||
self.initial_capacity, self.error_rate))
|
||||
|
||||
# Write #-of-filters
|
||||
f.write(pack('<l', len(self.filters)))
|
||||
|
||||
if len(self.filters) > 0:
|
||||
# Then each filter directly, with a header describing
|
||||
# their lengths.
|
||||
headerpos = f.tell()
|
||||
headerfmt = '<' + 'Q'*(len(self.filters))
|
||||
f.write('.' * calcsize(headerfmt))
|
||||
filter_sizes = []
|
||||
for filter in self.filters:
|
||||
begin = f.tell()
|
||||
filter.tofile(f)
|
||||
filter_sizes.append(f.tell() - begin)
|
||||
|
||||
f.seek(headerpos)
|
||||
f.write(pack(headerfmt, *filter_sizes))
|
||||
|
||||
@classmethod
|
||||
def fromfile(cls, f):
|
||||
"""Deserialize the ScalableBloomFilter in file object `f'."""
|
||||
filter = cls()
|
||||
filter._setup(*unpack(cls.FILE_FMT, f.read(calcsize(cls.FILE_FMT))))
|
||||
nfilters, = unpack('<l', f.read(calcsize('<l')))
|
||||
if nfilters > 0:
|
||||
header_fmt = '<' + 'Q'*nfilters
|
||||
bytes = f.read(calcsize(header_fmt))
|
||||
filter_lengths = unpack(header_fmt, bytes)
|
||||
for fl in filter_lengths:
|
||||
filter.filters.append(BloomFilter.fromfile(f, fl))
|
||||
else:
|
||||
filter.filters = []
|
||||
|
||||
return filter
|
||||
|
||||
def __len__(self):
|
||||
"""Returns the total number of elements stored in this SBF"""
|
||||
return sum([f.count for f in self.filters])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
doctest.testmod()
|
||||
@@ -1,96 +0,0 @@
|
||||
import os
|
||||
import doctest
|
||||
import unittest
|
||||
import random
|
||||
import tempfile
|
||||
from pybloom import BloomFilter, ScalableBloomFilter
|
||||
from unittest import TestSuite
|
||||
|
||||
def additional_tests():
|
||||
proj_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
readme_fn = os.path.join(proj_dir, 'README.txt')
|
||||
suite = TestSuite([doctest.DocTestSuite('pybloom.pybloom')])
|
||||
if os.path.exists(readme_fn):
|
||||
suite.addTest(doctest.DocFileSuite(readme_fn, module_relative=False))
|
||||
return suite
|
||||
|
||||
class TestUnionIntersection(unittest.TestCase):
|
||||
def test_union(self):
|
||||
bloom_one = BloomFilter(100, 0.001)
|
||||
bloom_two = BloomFilter(100, 0.001)
|
||||
chars = [chr(i) for i in range(97, 123)]
|
||||
for char in chars[len(chars)/2:]:
|
||||
bloom_one.add(char)
|
||||
for char in chars[:len(chars)/2]:
|
||||
bloom_two.add(char)
|
||||
new_bloom = bloom_one.union(bloom_two)
|
||||
for char in chars:
|
||||
self.assert_(char in new_bloom)
|
||||
|
||||
def test_intersection(self):
|
||||
bloom_one = BloomFilter(100, 0.001)
|
||||
bloom_two = BloomFilter(100, 0.001)
|
||||
chars = [chr(i) for i in range(97, 123)]
|
||||
for char in chars:
|
||||
bloom_one.add(char)
|
||||
for char in chars[:len(chars)/2]:
|
||||
bloom_two.add(char)
|
||||
new_bloom = bloom_one.intersection(bloom_two)
|
||||
for char in chars[:len(chars)/2]:
|
||||
self.assert_(char in new_bloom)
|
||||
for char in chars[len(chars)/2:]:
|
||||
self.assert_(char not in new_bloom)
|
||||
|
||||
def test_intersection_capacity_fail(self):
|
||||
bloom_one = BloomFilter(1000, 0.001)
|
||||
bloom_two = BloomFilter(100, 0.001)
|
||||
def _run():
|
||||
new_bloom = bloom_one.intersection(bloom_two)
|
||||
self.assertRaises(ValueError, _run)
|
||||
|
||||
def test_union_capacity_fail(self):
|
||||
bloom_one = BloomFilter(1000, 0.001)
|
||||
bloom_two = BloomFilter(100, 0.001)
|
||||
def _run():
|
||||
new_bloom = bloom_one.union(bloom_two)
|
||||
self.assertRaises(ValueError, _run)
|
||||
|
||||
def test_intersection_k_fail(self):
|
||||
bloom_one = BloomFilter(100, 0.001)
|
||||
bloom_two = BloomFilter(100, 0.01)
|
||||
def _run():
|
||||
new_bloom = bloom_one.intersection(bloom_two)
|
||||
self.assertRaises(ValueError, _run)
|
||||
|
||||
def test_union_k_fail(self):
|
||||
bloom_one = BloomFilter(100, 0.01)
|
||||
bloom_two = BloomFilter(100, 0.001)
|
||||
def _run():
|
||||
new_bloom = bloom_one.union(bloom_two)
|
||||
self.assertRaises(ValueError, _run)
|
||||
|
||||
class Serialization(unittest.TestCase):
|
||||
SIZE = 12345
|
||||
EXPECTED = set([random.randint(0, 10000100) for _ in xrange(SIZE)])
|
||||
|
||||
def test_serialization(self):
|
||||
for klass, args in [(BloomFilter, (self.SIZE,)),
|
||||
(ScalableBloomFilter, ())]:
|
||||
filter = klass(*args)
|
||||
for item in self.EXPECTED:
|
||||
filter.add(item)
|
||||
|
||||
# It seems bitarray is finicky about the object being an
|
||||
# actual file, so we can't just use StringIO. Grr.
|
||||
f = tempfile.TemporaryFile()
|
||||
filter.tofile(f)
|
||||
del filter
|
||||
|
||||
f.seek(0)
|
||||
filter = klass.fromfile(f)
|
||||
|
||||
for item in self.EXPECTED:
|
||||
self.assert_(item in filter)
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
@@ -1,5 +0,0 @@
|
||||
[egg_info]
|
||||
tag_build =
|
||||
tag_date = 0
|
||||
tag_svn_revision = 0
|
||||
|
||||
@@ -1,45 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
from ez_setup import use_setuptools
|
||||
use_setuptools()
|
||||
|
||||
import os
|
||||
|
||||
from setuptools import setup, find_packages, Extension
|
||||
|
||||
VERSION = '1.1'
|
||||
DESCRIPTION = "PyBloom: A Probabilistic data structure"
|
||||
LONG_DESCRIPTION = """
|
||||
pybloom is a Python implementation of the bloom filter probabilistic data
|
||||
structure. The module also provides a Scalable Bloom Filter that allows a
|
||||
bloom filter to grow without knowing the original set size.
|
||||
"""
|
||||
|
||||
CLASSIFIERS = filter(None, map(str.strip,
|
||||
"""
|
||||
Intended Audience :: Developers
|
||||
License :: OSI Approved :: MIT License
|
||||
Programming Language :: Python
|
||||
Operating System :: OS Independent
|
||||
Topic :: Utilities
|
||||
Topic :: Database :: Database Engines/Servers
|
||||
Topic :: Software Development :: Libraries :: Python Modules
|
||||
""".splitlines()))
|
||||
|
||||
setup(
|
||||
name="pybloom",
|
||||
version=VERSION,
|
||||
description=DESCRIPTION,
|
||||
long_description=LONG_DESCRIPTION,
|
||||
classifiers=CLASSIFIERS,
|
||||
keywords=('data structures', 'bloom filter', 'bloom', 'filter',
|
||||
'probabilistic', 'set'),
|
||||
author="Jay Baird",
|
||||
author_email="jay.baird@me.com",
|
||||
url="http://github.com/jaybaird/python-bloomfilter/",
|
||||
license="MIT License",
|
||||
packages=find_packages(exclude=['ez_setup']),
|
||||
platforms=['any'],
|
||||
test_suite="pybloom.tests",
|
||||
zip_safe=True,
|
||||
install_requires=['bitarray>=0.3.4']
|
||||
)
|
||||
Reference in New Issue
Block a user