fixed bug in quote parser

This commit is contained in:
Dan McInerney
2014-08-31 11:52:30 -04:00
parent 566b4c68f0
commit c9a8e41cb6
14 changed files with 0 additions and 899 deletions

Binary file not shown.

View File

@@ -1 +0,0 @@
include ez_setup.py

View File

@@ -1,22 +0,0 @@
Metadata-Version: 1.0
Name: pybloom
Version: 1.1
Summary: PyBloom: A Probabilistic data structure
Home-page: http://github.com/jaybaird/python-bloomfilter/
Author: Jay Baird
Author-email: jay.baird@me.com
License: MIT License
Description:
pybloom is a Python implementation of the bloom filter probabilistic data
structure. The module also provides a Scalable Bloom Filter that allows a
bloom filter to grow without knowing the original set size.
Keywords: data structures,bloom filter,bloom,filter,probabilistic,set
Platform: any
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python
Classifier: Operating System :: OS Independent
Classifier: Topic :: Utilities
Classifier: Topic :: Database :: Database Engines/Servers
Classifier: Topic :: Software Development :: Libraries :: Python Modules

View File

@@ -1,270 +0,0 @@
#!python
"""Bootstrap setuptools installation
If you want to use setuptools in your package's setup.py, just include this
file in the same directory with it, and add this to the top of your setup.py::
from ez_setup import use_setuptools
use_setuptools()
If you want to require a specific version of setuptools, set a download
mirror, or use an alternate download directory, you can do so by supplying
the appropriate options to ``use_setuptools()``.
This file can also be run as a script to install or upgrade setuptools.
"""
import sys
DEFAULT_VERSION = "0.6c9"
DEFAULT_URL = "http://pypi.python.org/packages/%s/s/setuptools/" % sys.version[:3]
md5_data = {
'setuptools-0.6b1-py2.3.egg': '8822caf901250d848b996b7f25c6e6ca',
'setuptools-0.6b1-py2.4.egg': 'b79a8a403e4502fbb85ee3f1941735cb',
'setuptools-0.6b2-py2.3.egg': '5657759d8a6d8fc44070a9d07272d99b',
'setuptools-0.6b2-py2.4.egg': '4996a8d169d2be661fa32a6e52e4f82a',
'setuptools-0.6b3-py2.3.egg': 'bb31c0fc7399a63579975cad9f5a0618',
'setuptools-0.6b3-py2.4.egg': '38a8c6b3d6ecd22247f179f7da669fac',
'setuptools-0.6b4-py2.3.egg': '62045a24ed4e1ebc77fe039aa4e6f7e5',
'setuptools-0.6b4-py2.4.egg': '4cb2a185d228dacffb2d17f103b3b1c4',
'setuptools-0.6c1-py2.3.egg': 'b3f2b5539d65cb7f74ad79127f1a908c',
'setuptools-0.6c1-py2.4.egg': 'b45adeda0667d2d2ffe14009364f2a4b',
'setuptools-0.6c2-py2.3.egg': 'f0064bf6aa2b7d0f3ba0b43f20817c27',
'setuptools-0.6c2-py2.4.egg': '616192eec35f47e8ea16cd6a122b7277',
'setuptools-0.6c3-py2.3.egg': 'f181fa125dfe85a259c9cd6f1d7b78fa',
'setuptools-0.6c3-py2.4.egg': 'e0ed74682c998bfb73bf803a50e7b71e',
'setuptools-0.6c3-py2.5.egg': 'abef16fdd61955514841c7c6bd98965e',
'setuptools-0.6c4-py2.3.egg': 'b0b9131acab32022bfac7f44c5d7971f',
'setuptools-0.6c4-py2.4.egg': '2a1f9656d4fbf3c97bf946c0a124e6e2',
'setuptools-0.6c4-py2.5.egg': '8f5a052e32cdb9c72bcf4b5526f28afc',
'setuptools-0.6c5-py2.3.egg': 'ee9fd80965da04f2f3e6b3576e9d8167',
'setuptools-0.6c5-py2.4.egg': 'afe2adf1c01701ee841761f5bcd8aa64',
'setuptools-0.6c5-py2.5.egg': 'a8d3f61494ccaa8714dfed37bccd3d5d',
'setuptools-0.6c6-py2.3.egg': '35686b78116a668847237b69d549ec20',
'setuptools-0.6c6-py2.4.egg': '3c56af57be3225019260a644430065ab',
'setuptools-0.6c6-py2.5.egg': 'b2f8a7520709a5b34f80946de5f02f53',
'setuptools-0.6c7-py2.3.egg': '209fdf9adc3a615e5115b725658e13e2',
'setuptools-0.6c7-py2.4.egg': '5a8f954807d46a0fb67cf1f26c55a82e',
'setuptools-0.6c7-py2.5.egg': '45d2ad28f9750e7434111fde831e8372',
'setuptools-0.6c8-py2.3.egg': '50759d29b349db8cfd807ba8303f1902',
'setuptools-0.6c8-py2.4.egg': 'cba38d74f7d483c06e9daa6070cce6de',
'setuptools-0.6c8-py2.5.egg': '1721747ee329dc150590a58b3e1ac95b',
'setuptools-0.6c9-py2.3.egg': 'a83c4020414807b496e4cfbe08507c03',
'setuptools-0.6c9-py2.4.egg': '260a2be2e5388d66bdaee06abec6342a',
'setuptools-0.6c9-py2.5.egg': 'fe67c3e5a17b12c0e7c541b7ea43a8e6',
'setuptools-0.6c9-py2.6.egg': 'ca37b1ff16fa2ede6e19383e7b59245a',
}
import sys, os
try: from hashlib import md5
except ImportError: from md5 import md5
def _validate_md5(egg_name, data):
if egg_name in md5_data:
digest = md5(data).hexdigest()
if digest != md5_data[egg_name]:
print >>sys.stderr, (
"md5 validation of %s failed! (Possible download problem?)"
% egg_name
)
sys.exit(2)
return data
def use_setuptools(
version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir,
download_delay=15
):
"""Automatically find/download setuptools and make it available on sys.path
`version` should be a valid setuptools version number that is available
as an egg for download under the `download_base` URL (which should end with
a '/'). `to_dir` is the directory where setuptools will be downloaded, if
it is not already available. If `download_delay` is specified, it should
be the number of seconds that will be paused before initiating a download,
should one be required. If an older version of setuptools is installed,
this routine will print a message to ``sys.stderr`` and raise SystemExit in
an attempt to abort the calling script.
"""
was_imported = 'pkg_resources' in sys.modules or 'setuptools' in sys.modules
def do_download():
egg = download_setuptools(version, download_base, to_dir, download_delay)
sys.path.insert(0, egg)
import setuptools; setuptools.bootstrap_install_from = egg
try:
import pkg_resources
except ImportError:
return do_download()
try:
pkg_resources.require("setuptools>="+version); return
except pkg_resources.VersionConflict, e:
if was_imported:
print >>sys.stderr, (
"The required version of setuptools (>=%s) is not available, and\n"
"can't be installed while this script is running. Please install\n"
" a more recent version first, using 'easy_install -U setuptools'."
"\n\n(Currently using %r)"
) % (version, e.args[0])
sys.exit(2)
else:
del pkg_resources, sys.modules['pkg_resources'] # reload ok
return do_download()
except pkg_resources.DistributionNotFound:
return do_download()
def download_setuptools(
version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir,
delay = 15
):
"""Download setuptools from a specified location and return its filename
`version` should be a valid setuptools version number that is available
as an egg for download under the `download_base` URL (which should end
with a '/'). `to_dir` is the directory where the egg will be downloaded.
`delay` is the number of seconds to pause before an actual download attempt.
"""
import urllib2, shutil
egg_name = "setuptools-%s-py%s.egg" % (version,sys.version[:3])
url = download_base + egg_name
saveto = os.path.join(to_dir, egg_name)
src = dst = None
if not os.path.exists(saveto): # Avoid repeated downloads
try:
from distutils import log
if delay:
log.warn("""
---------------------------------------------------------------------------
This script requires setuptools version %s to run (even to display
help). I will attempt to download it for you (from
%s), but
you may need to enable firewall access for this script first.
I will start the download in %d seconds.
(Note: if this machine does not have network access, please obtain the file
%s
and place it in this directory before rerunning this script.)
---------------------------------------------------------------------------""",
version, download_base, delay, url
); from time import sleep; sleep(delay)
log.warn("Downloading %s", url)
src = urllib2.urlopen(url)
# Read/write all in one block, so we don't create a corrupt file
# if the download is interrupted.
data = _validate_md5(egg_name, src.read())
dst = open(saveto,"wb"); dst.write(data)
finally:
if src: src.close()
if dst: dst.close()
return os.path.realpath(saveto)
def main(argv, version=DEFAULT_VERSION):
"""Install or upgrade setuptools and EasyInstall"""
try:
import setuptools
except ImportError:
egg = None
try:
egg = download_setuptools(version, delay=0)
sys.path.insert(0,egg)
from setuptools.command.easy_install import main
return main(list(argv)+[egg]) # we're done here
finally:
if egg and os.path.exists(egg):
os.unlink(egg)
else:
if setuptools.__version__ == '0.0.1':
print >>sys.stderr, (
"You have an obsolete version of setuptools installed. Please\n"
"remove it from your system entirely before rerunning this script."
)
sys.exit(2)
req = "setuptools>="+version
import pkg_resources
try:
pkg_resources.require(req)
except pkg_resources.VersionConflict:
try:
from setuptools.command.easy_install import main
except ImportError:
from easy_install import main
main(list(argv)+[download_setuptools(delay=0)])
sys.exit(0) # try to force an exit
else:
if argv:
from setuptools.command.easy_install import main
main(argv)
else:
print "Setuptools version",version,"or greater has been installed."
print '(Run "ez_setup.py -U setuptools" to reinstall or upgrade.)'
def update_md5(filenames):
"""Update our built-in md5 registry"""
import re
for name in filenames:
base = os.path.basename(name)
f = open(name,'rb')
md5_data[base] = md5(f.read()).hexdigest()
f.close()
data = [" %r: %r,\n" % it for it in md5_data.items()]
data.sort()
repl = "".join(data)
import inspect
srcfile = inspect.getsourcefile(sys.modules[__name__])
f = open(srcfile, 'rb'); src = f.read(); f.close()
match = re.search("\nmd5_data = {\n([^}]+)}", src)
if not match:
print >>sys.stderr, "Internal error!"
sys.exit(2)
src = src[:match.start(1)] + repl + src[match.end(1):]
f = open(srcfile,'w')
f.write(src)
f.close()
if __name__=='__main__':
if len(sys.argv)>2 and sys.argv[1]=='--md5update':
update_md5(sys.argv[2:])
else:
main(sys.argv[1:])

View File

@@ -1,22 +0,0 @@
Metadata-Version: 1.1
Name: pybloom
Version: 1.1
Summary: PyBloom: A Probabilistic data structure
Home-page: http://github.com/jaybaird/python-bloomfilter/
Author: Jay Baird
Author-email: jay.baird@me.com
License: MIT License
Description:
pybloom is a Python implementation of the bloom filter probabilistic data
structure. The module also provides a Scalable Bloom Filter that allows a
bloom filter to grow without knowing the original set size.
Keywords: data structures,bloom filter,bloom,filter,probabilistic,set
Platform: any
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python
Classifier: Operating System :: OS Independent
Classifier: Topic :: Utilities
Classifier: Topic :: Database :: Database Engines/Servers
Classifier: Topic :: Software Development :: Libraries :: Python Modules

View File

@@ -1,22 +0,0 @@
Metadata-Version: 1.0
Name: pybloom
Version: 1.1
Summary: PyBloom: A Probabilistic data structure
Home-page: http://github.com/jaybaird/python-bloomfilter/
Author: Jay Baird
Author-email: jay.baird@me.com
License: MIT License
Description:
pybloom is a Python implementation of the bloom filter probabilistic data
structure. The module also provides a Scalable Bloom Filter that allows a
bloom filter to grow without knowing the original set size.
Keywords: data structures,bloom filter,bloom,filter,probabilistic,set
Platform: any
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python
Classifier: Operating System :: OS Independent
Classifier: Topic :: Utilities
Classifier: Topic :: Database :: Database Engines/Servers
Classifier: Topic :: Software Development :: Libraries :: Python Modules

View File

@@ -1 +0,0 @@

Binary file not shown.

View File

@@ -1,5 +0,0 @@
"""pybloom
"""
from pybloom import BloomFilter, ScalableBloomFilter, __version__, __author__

View File

@@ -1,409 +0,0 @@
# -*- encoding: utf-8 -*-
"""This module implements a bloom filter probabilistic data structure and
an a Scalable Bloom Filter that grows in size as your add more items to it
without increasing the false positive error_rate.
Requires the bitarray library: http://pypi.python.org/pypi/bitarray/
>>> from pybloom import BloomFilter
>>> f = BloomFilter(capacity=10000, error_rate=0.001)
>>> for i in xrange(0, f.capacity):
... _ = f.add(i)
...
>>> 0 in f
True
>>> f.capacity in f
False
>>> len(f) <= f.capacity
True
>>> abs((len(f) / float(f.capacity)) - 1.0) <= f.error_rate
True
>>> from pybloom import ScalableBloomFilter
>>> sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
>>> count = 10000
>>> for i in xrange(0, count):
... _ = sbf.add(i)
...
>>> sbf.capacity > count
True
>>> len(sbf) <= count
True
>>> abs((len(sbf) / float(count)) - 1.0) <= sbf.error_rate
True
"""
import math
import hashlib
from struct import unpack, pack, calcsize
try:
import bitarray
except ImportError:
raise ImportError('pybloom requires bitarray >= 0.3.4')
__version__ = '1.1'
__author__ = "Jay Baird <jay@mochimedia.com>, Bob Ippolito <bob@redivi.com>,\
Marius Eriksen <marius@monkey.org>,\
Alex Brasetvik <alex@brasetvik.com>"
def make_hashfuncs(num_slices, num_bits):
if num_bits >= (1 << 31):
fmt_code, chunk_size = 'Q', 8
elif num_bits >= (1 << 15):
fmt_code, chunk_size = 'I', 4
else:
fmt_code, chunk_size = 'H', 2
total_hash_bits = 8 * num_slices * chunk_size
if total_hash_bits > 384:
hashfn = hashlib.sha512
elif total_hash_bits > 256:
hashfn = hashlib.sha384
elif total_hash_bits > 160:
hashfn = hashlib.sha256
elif total_hash_bits > 128:
hashfn = hashlib.sha1
else:
hashfn = hashlib.md5
fmt = fmt_code * (hashfn().digest_size // chunk_size)
num_salts, extra = divmod(num_slices, len(fmt))
if extra:
num_salts += 1
salts = [hashfn(hashfn(pack('I', i)).digest()) for i in xrange(num_salts)]
def _make_hashfuncs(key):
if isinstance(key, unicode):
key = key.encode('utf-8')
else:
key = str(key)
rval = []
for salt in salts:
h = salt.copy()
h.update(key)
rval.extend(uint % num_bits for uint in unpack(fmt, h.digest()))
del rval[num_slices:]
return rval
return _make_hashfuncs
class BloomFilter(object):
FILE_FMT = '<dQQQQ'
def __init__(self, capacity, error_rate=0.001):
"""Implements a space-efficient probabilistic data structure
capacity
this BloomFilter must be able to store at least *capacity* elements
while maintaining no more than *error_rate* chance of false
positives
error_rate
the error_rate of the filter returning false positives. This
determines the filters capacity. Inserting more than capacity
elements greatly increases the chance of false positives.
>>> b = BloomFilter(capacity=100000, error_rate=0.001)
>>> b.add("test")
False
>>> "test" in b
True
"""
if not (0 < error_rate < 1):
raise ValueError("Error_Rate must be between 0 and 1.")
if not capacity > 0:
raise ValueError("Capacity must be > 0")
# given M = num_bits, k = num_slices, p = error_rate, n = capacity
# solving for m = bits_per_slice
# n ~= M * ((ln(2) ** 2) / abs(ln(P)))
# n ~= (k * m) * ((ln(2) ** 2) / abs(ln(P)))
# m ~= n * abs(ln(P)) / (k * (ln(2) ** 2))
num_slices = int(math.ceil(math.log(1 / error_rate, 2)))
# the error_rate constraint assumes a fill rate of 1/2
# so we double the capacity to simplify the API
bits_per_slice = int(math.ceil(
(2 * capacity * abs(math.log(error_rate))) /
(num_slices * (math.log(2) ** 2))))
self._setup(error_rate, num_slices, bits_per_slice, capacity, 0)
self.bitarray = bitarray.bitarray(self.num_bits, endian='little')
self.bitarray.setall(False)
def _setup(self, error_rate, num_slices, bits_per_slice, capacity, count):
self.error_rate = error_rate
self.num_slices = num_slices
self.bits_per_slice = bits_per_slice
self.capacity = capacity
self.num_bits = num_slices * bits_per_slice
self.count = count
self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice)
def __contains__(self, key):
"""Tests a key's membership in this bloom filter.
>>> b = BloomFilter(capacity=100)
>>> b.add("hello")
False
>>> "hello" in b
True
"""
bits_per_slice = self.bits_per_slice
bitarray = self.bitarray
if not isinstance(key, list):
hashes = self.make_hashes(key)
else:
hashes = key
offset = 0
for k in hashes:
if not bitarray[offset + k]:
return False
offset += bits_per_slice
return True
def __len__(self):
"""Return the number of keys stored by this bloom filter."""
return self.count
def add(self, key, skip_check=False):
""" Adds a key to this bloom filter. If the key already exists in this
filter it will return True. Otherwise False.
>>> b = BloomFilter(capacity=100)
>>> b.add("hello")
False
>>> b.add("hello")
True
"""
bitarray = self.bitarray
bits_per_slice = self.bits_per_slice
hashes = self.make_hashes(key)
if not skip_check and hashes in self:
return True
if self.count > self.capacity:
raise IndexError("BloomFilter is at capacity")
offset = 0
for k in hashes:
self.bitarray[offset + k] = True
offset += bits_per_slice
self.count += 1
return False
def copy(self):
"""Return a copy of this bloom filter.
"""
new_filter = BloomFilter(self.capacity, self.error_rate)
new_filter.bitarray = self.bitarray.copy()
return new_filter
def union(self, other):
""" Calculates the union of the two underlying bitarrays and returns
a new bloom filter object."""
if self.capacity != other.capacity or \
self.error_rate != other.error_rate:
raise ValueError("Unioning filters requires both filters to have \
both the same capacity and error rate")
new_bloom = self.copy()
new_bloom.bitarray = new_bloom.bitarray | other.bitarray
return new_bloom
def __or__(self, other):
return self.union(other)
def intersection(self, other):
""" Calculates the union of the two underlying bitarrays and returns
a new bloom filter object."""
if self.capacity != other.capacity or \
self.error_rate != other.error_rate:
raise ValueError("Intersecting filters requires both filters to \
have equal capacity and error rate")
new_bloom = self.copy()
new_bloom.bitarray = new_bloom.bitarray & other.bitarray
return new_bloom
def __and__(self, other):
return self.intersection(other)
def tofile(self, f):
"""Write the bloom filter to file object `f'. Underlying bits
are written as machine values. This is much more space
efficient than pickling the object."""
f.write(pack(self.FILE_FMT, self.error_rate, self.num_slices,
self.bits_per_slice, self.capacity, self.count))
self.bitarray.tofile(f)
@classmethod
def fromfile(cls, f, n=-1):
"""Read a bloom filter from file-object `f' serialized with
``BloomFilter.tofile''. If `n' > 0 read only so many bytes."""
headerlen = calcsize(cls.FILE_FMT)
if 0 < n < headerlen:
raise ValueError, 'n too small!'
filter = cls(1) # Bogus instantiation, we will `_setup'.
filter._setup(*unpack(cls.FILE_FMT, f.read(headerlen)))
filter.bitarray = bitarray.bitarray(endian='little')
if n > 0:
filter.bitarray.fromfile(f, n - headerlen)
else:
filter.bitarray.fromfile(f)
if filter.num_bits != filter.bitarray.length() and \
(filter.num_bits + (8 - filter.num_bits % 8)
!= filter.bitarray.length()):
raise ValueError, 'Bit length mismatch!'
return filter
def __getstate__(self):
d = self.__dict__.copy()
del d['make_hashes']
return d
def __setstate__(self, d):
self.__dict__.update(d)
self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice)
class ScalableBloomFilter(object):
SMALL_SET_GROWTH = 2 # slower, but takes up less memory
LARGE_SET_GROWTH = 4 # faster, but takes up more memory faster
FILE_FMT = '<idQd'
def __init__(self, initial_capacity=100, error_rate=0.001,
mode=SMALL_SET_GROWTH):
"""Implements a space-efficient probabilistic data structure that
grows as more items are added while maintaining a steady false
positive rate
initial_capacity
the initial capacity of the filter
error_rate
the error_rate of the filter returning false positives. This
determines the filters capacity. Going over capacity greatly
increases the chance of false positives.
mode
can be either ScalableBloomFilter.SMALL_SET_GROWTH or
ScalableBloomFilter.LARGE_SET_GROWTH. SMALL_SET_GROWTH is slower
but uses less memory. LARGE_SET_GROWTH is faster but consumes
memory faster.
>>> b = ScalableBloomFilter(initial_capacity=512, error_rate=0.001, \
mode=ScalableBloomFilter.SMALL_SET_GROWTH)
>>> b.add("test")
False
>>> "test" in b
True
>>> unicode_string = u'¡'
>>> b.add(unicode_string)
False
>>> unicode_string in b
True
"""
if not error_rate or error_rate < 0:
raise ValueError("Error_Rate must be a decimal less than 0.")
self._setup(mode, 0.9, initial_capacity, error_rate)
self.filters = []
def _setup(self, mode, ratio, initial_capacity, error_rate):
self.scale = mode
self.ratio = ratio
self.initial_capacity = initial_capacity
self.error_rate = error_rate
def __contains__(self, key):
"""Tests a key's membership in this bloom filter.
>>> b = ScalableBloomFilter(initial_capacity=100, error_rate=0.001, \
mode=ScalableBloomFilter.SMALL_SET_GROWTH)
>>> b.add("hello")
False
>>> "hello" in b
True
"""
for f in reversed(self.filters):
if key in f:
return True
return False
def add(self, key):
"""Adds a key to this bloom filter.
If the key already exists in this filter it will return True.
Otherwise False.
>>> b = ScalableBloomFilter(initial_capacity=100, error_rate=0.001, \
mode=ScalableBloomFilter.SMALL_SET_GROWTH)
>>> b.add("hello")
False
>>> b.add("hello")
True
"""
if key in self:
return True
filter = self.filters[-1] if self.filters else None
if filter is None or filter.count >= filter.capacity:
num_filters = len(self.filters)
filter = BloomFilter(
capacity=self.initial_capacity * (self.scale ** num_filters),
error_rate=self.error_rate * (self.ratio ** num_filters))
self.filters.append(filter)
filter.add(key, skip_check=True)
return False
@property
def capacity(self):
"""Returns the total capacity for all filters in this SBF"""
return sum([f.capacity for f in self.filters])
@property
def count(self):
return len(self)
def tofile(self, f):
"""Serialize this ScalableBloomFilter into the file-object
`f'."""
f.write(pack(self.FILE_FMT, self.scale, self.ratio,
self.initial_capacity, self.error_rate))
# Write #-of-filters
f.write(pack('<l', len(self.filters)))
if len(self.filters) > 0:
# Then each filter directly, with a header describing
# their lengths.
headerpos = f.tell()
headerfmt = '<' + 'Q'*(len(self.filters))
f.write('.' * calcsize(headerfmt))
filter_sizes = []
for filter in self.filters:
begin = f.tell()
filter.tofile(f)
filter_sizes.append(f.tell() - begin)
f.seek(headerpos)
f.write(pack(headerfmt, *filter_sizes))
@classmethod
def fromfile(cls, f):
"""Deserialize the ScalableBloomFilter in file object `f'."""
filter = cls()
filter._setup(*unpack(cls.FILE_FMT, f.read(calcsize(cls.FILE_FMT))))
nfilters, = unpack('<l', f.read(calcsize('<l')))
if nfilters > 0:
header_fmt = '<' + 'Q'*nfilters
bytes = f.read(calcsize(header_fmt))
filter_lengths = unpack(header_fmt, bytes)
for fl in filter_lengths:
filter.filters.append(BloomFilter.fromfile(f, fl))
else:
filter.filters = []
return filter
def __len__(self):
"""Returns the total number of elements stored in this SBF"""
return sum([f.count for f in self.filters])
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@@ -1,96 +0,0 @@
import os
import doctest
import unittest
import random
import tempfile
from pybloom import BloomFilter, ScalableBloomFilter
from unittest import TestSuite
def additional_tests():
proj_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
readme_fn = os.path.join(proj_dir, 'README.txt')
suite = TestSuite([doctest.DocTestSuite('pybloom.pybloom')])
if os.path.exists(readme_fn):
suite.addTest(doctest.DocFileSuite(readme_fn, module_relative=False))
return suite
class TestUnionIntersection(unittest.TestCase):
def test_union(self):
bloom_one = BloomFilter(100, 0.001)
bloom_two = BloomFilter(100, 0.001)
chars = [chr(i) for i in range(97, 123)]
for char in chars[len(chars)/2:]:
bloom_one.add(char)
for char in chars[:len(chars)/2]:
bloom_two.add(char)
new_bloom = bloom_one.union(bloom_two)
for char in chars:
self.assert_(char in new_bloom)
def test_intersection(self):
bloom_one = BloomFilter(100, 0.001)
bloom_two = BloomFilter(100, 0.001)
chars = [chr(i) for i in range(97, 123)]
for char in chars:
bloom_one.add(char)
for char in chars[:len(chars)/2]:
bloom_two.add(char)
new_bloom = bloom_one.intersection(bloom_two)
for char in chars[:len(chars)/2]:
self.assert_(char in new_bloom)
for char in chars[len(chars)/2:]:
self.assert_(char not in new_bloom)
def test_intersection_capacity_fail(self):
bloom_one = BloomFilter(1000, 0.001)
bloom_two = BloomFilter(100, 0.001)
def _run():
new_bloom = bloom_one.intersection(bloom_two)
self.assertRaises(ValueError, _run)
def test_union_capacity_fail(self):
bloom_one = BloomFilter(1000, 0.001)
bloom_two = BloomFilter(100, 0.001)
def _run():
new_bloom = bloom_one.union(bloom_two)
self.assertRaises(ValueError, _run)
def test_intersection_k_fail(self):
bloom_one = BloomFilter(100, 0.001)
bloom_two = BloomFilter(100, 0.01)
def _run():
new_bloom = bloom_one.intersection(bloom_two)
self.assertRaises(ValueError, _run)
def test_union_k_fail(self):
bloom_one = BloomFilter(100, 0.01)
bloom_two = BloomFilter(100, 0.001)
def _run():
new_bloom = bloom_one.union(bloom_two)
self.assertRaises(ValueError, _run)
class Serialization(unittest.TestCase):
SIZE = 12345
EXPECTED = set([random.randint(0, 10000100) for _ in xrange(SIZE)])
def test_serialization(self):
for klass, args in [(BloomFilter, (self.SIZE,)),
(ScalableBloomFilter, ())]:
filter = klass(*args)
for item in self.EXPECTED:
filter.add(item)
# It seems bitarray is finicky about the object being an
# actual file, so we can't just use StringIO. Grr.
f = tempfile.TemporaryFile()
filter.tofile(f)
del filter
f.seek(0)
filter = klass.fromfile(f)
for item in self.EXPECTED:
self.assert_(item in filter)
if __name__ == '__main__':
unittest.main()

View File

@@ -1,5 +0,0 @@
[egg_info]
tag_build =
tag_date = 0
tag_svn_revision = 0

View File

@@ -1,45 +0,0 @@
#!/usr/bin/env python
from ez_setup import use_setuptools
use_setuptools()
import os
from setuptools import setup, find_packages, Extension
VERSION = '1.1'
DESCRIPTION = "PyBloom: A Probabilistic data structure"
LONG_DESCRIPTION = """
pybloom is a Python implementation of the bloom filter probabilistic data
structure. The module also provides a Scalable Bloom Filter that allows a
bloom filter to grow without knowing the original set size.
"""
CLASSIFIERS = filter(None, map(str.strip,
"""
Intended Audience :: Developers
License :: OSI Approved :: MIT License
Programming Language :: Python
Operating System :: OS Independent
Topic :: Utilities
Topic :: Database :: Database Engines/Servers
Topic :: Software Development :: Libraries :: Python Modules
""".splitlines()))
setup(
name="pybloom",
version=VERSION,
description=DESCRIPTION,
long_description=LONG_DESCRIPTION,
classifiers=CLASSIFIERS,
keywords=('data structures', 'bloom filter', 'bloom', 'filter',
'probabilistic', 'set'),
author="Jay Baird",
author_email="jay.baird@me.com",
url="http://github.com/jaybaird/python-bloomfilter/",
license="MIT License",
packages=find_packages(exclude=['ez_setup']),
platforms=['any'],
test_suite="pybloom.tests",
zip_safe=True,
install_requires=['bitarray>=0.3.4']
)