32 refactor bloom filters (#44)
* Refactored BloomFilterEncoder

Made the code in the class much simpler by inlining functions.
Changed defaults to size=1024, num_hashes=2, and offset=0
Removed the unexplained 'size - 1' in __init__

* Removed redundant params from 'convert_dataframe_to_bf'

* Updated README for doctest
matweldon authored Apr 3, 2024
1 parent 58e3559 commit 50b6643
Showing 7 changed files with 45 additions and 88 deletions.
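
For orientation, a minimal sketch of what the refactored defaults mean in practice. It assumes the module path shown in the diff below (`pprl.embedder.bloom_filters`) and uses made-up tokens; it illustrates the new behaviour rather than reproducing code from the commit.

```python
from pprl.embedder.bloom_filters import BloomFilterEncoder

# Refactored defaults: size=1024, num_hashes=2, offset=0, SHA-256 hashing.
bfe = BloomFilterEncoder()

# Each token is hashed num_hashes times and mapped to an index modulo size.
indices = bfe.bloom_filter_vector(["jo", "oh", "hn"])

# With offset=0 and self.size == size (the old 'size - 1' is gone),
# every index lies in [0, 1023].
assert all(0 <= i < 1024 for i in indices)
```
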
6 changes: 3 additions & 3 deletions README.md
@@ -133,9 +133,9 @@ uses the Soft Cosine Measure to calculate record-wise similarity scores.
```python
>>> similarities = embedder.compare(edf1, edf2)
>>> similarities
SimilarityArray([[0.80074101, 0.18160957, 0.09722178],
[0.40124732, 0.1877348 , 0.58792979],
[0.13147656, 0.51426533, 0.11772856]])
SimilarityArray([[0.81229552, 0.1115206 , 0.09557733],
[0.35460909, 0.16368072, 0.60428527],
[0.11720977, 0.50957391, 0.10343462]])

```

5 changes: 2 additions & 3 deletions docs/tutorials/example-febrl.qmd
@@ -72,11 +72,10 @@ feature_factory = dict(

## Initialise the embedder instance

This instance embeds each feature twice into a Bloom filter of length 1025
(with the offset).
This instance embeds each feature twice into a Bloom filter of length 1024.

```{python}
embedder = Embedder(feature_factory, bf_size=2**10, num_hashes=2)
embedder = Embedder(feature_factory, bf_size=1024, num_hashes=2)
```

## Embed the datasets
2 changes: 1 addition & 1 deletion docs/tutorials/run-through.qmd
@@ -72,7 +72,7 @@ ff_args = dict(name={}, sex={}, dob={})
## Embedding

Now we can create an `Embedder` object. We want our Bloom filter vectors to
have a length of 1024 elements (actually 1025 because of an offset), and we
have a length of 1024 elements, and we
choose to hash each feature two times. These choices seem to work ok, but we
haven't explored them systematically.
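
In concrete terms, the set-up described here reduces to something like the sketch below, assuming the `feature_factory` and `ff_args` objects defined earlier in the tutorial:

```python
# Sketch only: feature_factory and ff_args come from earlier tutorial steps.
embedder = Embedder(feature_factory, ff_args=ff_args, bf_size=1024, num_hashes=2)
```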

6 changes: 0 additions & 6 deletions src/pprl/app/utils.py
@@ -138,11 +138,8 @@ def convert_dataframe_to_bf(
other_columns = []

output_columns = other_columns + ["bf_indices", "bf_norms", "thresholds"]
NUMHASHES = 2
OFFSET = 1
NGRAMS = [1, 2, 3, 4]
FFARGS = {"name": {"ngram_length": NGRAMS, "use_gen_skip_grams": True}}
BFSIZE = 2**10

column_types_dict = {
"name": features.gen_name_features,
@@ -155,9 +152,6 @@
embedder = Embedder(
feature_factory=column_types_dict,
ff_args=FFARGS,
bf_size=BFSIZE,
num_hashes=NUMHASHES,
offset=OFFSET,
salt=salt,
)

104 changes: 34 additions & 70 deletions src/pprl/embedder/bloom_filters.py
@@ -11,38 +11,39 @@ class BloomFilterEncoder:
1. Compute the hash digest for your tokens
2. Convert the digest bytes into integers
3. Map the integer to a bloom filter vector (modulo `b`, where `b`
represents the length of the vector)
3. Map the integer to a bloom filter vector (modulo the length of the vector)
Parameters
----------
size: int
Size of the Bloom filter.
Size of the Bloom filter. Defaults to 1024
num_hashes: int
Number of hashes to perform. Defaults to three.
Number of hashes to perform. Defaults to two.
offset: int
Offset for Bloom filter indices to allow for masking. Defaults
to one.
to zero.
salt: str, optional
Cryptographic salt appended to tokens prior to hashing.
Attributes
----------
hash_function: func
Hashing function (`hashlib.sha1`).
Hashing function (`hashlib.sha256`).
"""

def __init__(
self, size: int, num_hashes: int = 3, offset: int = 1, salt: str | None = None
self, size: int = 1024, num_hashes: int = 2, offset: int = 0, salt: str | None = None
) -> None:
self.size = size - 1
self.size = size
self.num_hashes = num_hashes
self.offset = offset
self.salt = salt or ""

self.hash_function = hashlib.sha1
self.hash_function = hashlib.sha256

def bloom_filter_vector_collision_fraction(self, feature: list) -> tuple[list, float]:
def bloom_filter_vector_collision_fraction(
self, feature: list[str]
) -> tuple[list[int], float]:
"""Convert a feature vector and return its collision fraction.
The index vector uses an optional offset for masking.
@@ -58,15 +59,28 @@ def bloom_filter_vector_collision_fraction(self, feature: list) -> tuple[list, f
Index values used to create the Bloom filter vector.
collision_fraction: float
Proportion of repeated indices.
Examples
--------
>>> bfe = BloomFilterEncoder()
>>> bfe.bloom_filter_vector_collision_fraction(["a","b","c"])
([334, 1013, 192, 381, 18, 720], 0.0)
"""
feature_int_repr = self.feature_to_big_int_repr(feature)
vec_idx = self.big_int_to_vec(feature_int_repr, offset=self.offset)
vec_idx: list = []

for gram in feature:
for i in range(self.num_hashes):
utf_string_with_salt = (str(gram) + str(i) + str(self.salt)).encode("UTF-8")
digest = self.hash_function(utf_string_with_salt).digest()
digest_as_int = (int.from_bytes(digest, "little") % self.size) + self.offset
vec_idx.append(digest_as_int)

vec_idx_deduped = [*set(vec_idx)]
collision_fraction = 1 - len(vec_idx_deduped) / len(vec_idx)

return vec_idx_deduped, collision_fraction

def bloom_filter_vector(self, feature: list) -> list[int]:
def bloom_filter_vector(self, feature: list[str]) -> list[int]:
"""Convert a feature vector into indices for a Bloom vector.
The index vector uses an optional offset for masking.
@@ -80,63 +94,13 @@ def bloom_filter_vector(self, feature: list) -> list[int]:
-------
vector_idxs: list
Index values used to create the Bloom filter vector.
"""
feature_int_repr = self.feature_to_big_int_repr(feature)
vec_idx = self.big_int_to_vec(feature_int_repr, offset=self.offset)
vec_idx_deduped = [*set(vec_idx)]

return vec_idx_deduped

def big_int_to_vec(self, feature_ints: list, offset: int = 1) -> list[int]:
"""Convert an integer vector into indices for a Bloom vector.
This conversion inserts 1 at the location derived from the
integer vector, which is an integer representation of a
deterministic hash value, modulo to the size of the Bloom
filter.
Parameters
----------
feature_ints: list
List of integer values representing the feature.
offset: int
An offset to indices to allow for masking. Defaults to one.
Returns
-------
vector_idxs: list
List of integers representing an index on the Bloom filter.
"""
return list(map(lambda x: x % self.size + offset, feature_ints))

def feature_to_big_int_repr(self, feature: list) -> list[int]:
"""Convert a feature vector into an integer vector.
This conversion first generates a hash digest for each member of
the feature vector and then converts them to an integer.
Parameters
----------
feature: list
List of features to be processed.
Returns
-------
feature_ints: list
List of features as integers.
Examples
--------
>>> bfe = BloomFilterEncoder()
>>> bfe.bloom_filter_vector(["a","b","c"])
[334, 1013, 192, 381, 18, 720]
"""
feature_int_repr: list = []
# hash function will create a 256-bit integer
# under the random oracle model this integer will be deterministic
# depending on the token passed to
# the hash function
vec_idx_deduped, _ = self.bloom_filter_vector_collision_fraction(feature)

for gram in feature:
for i in range(self.num_hashes):
utf_string_with_salt = (str(gram) + str(i) + str(self.salt)).encode("UTF-8")
digest = self.hash_function(utf_string_with_salt).digest()
# integer value uses little endianness for amd64 architecture
int_repr = int.from_bytes(digest, "little")
feature_int_repr.append(int_repr)

return feature_int_repr
return vec_idx_deduped
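
To make the refactored flow easier to follow, here is a standalone sketch of the three steps the class docstring lists (hash the token, convert the digest to an integer, reduce it modulo the filter length), using only `hashlib` and values that mirror the new defaults; it is an illustration, not code from this commit.

```python
import hashlib

size, num_hashes, offset, salt = 1024, 2, 0, ""

indices = []
for gram in ["a", "b", "c"]:
    for i in range(num_hashes):
        token = (str(gram) + str(i) + str(salt)).encode("UTF-8")
        digest = hashlib.sha256(token).digest()    # step 1: hash the token
        as_int = int.from_bytes(digest, "little")  # step 2: digest -> integer
        indices.append(as_int % size + offset)     # step 3: reduce modulo the size

# Deduplicate, as bloom_filter_vector does; per the doctest added in this
# commit, these tokens should give [18, 192, 334, 381, 720, 1013] once sorted.
print(sorted(set(indices)))
```
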
8 changes: 4 additions & 4 deletions src/pprl/embedder/embedder.py
@@ -279,11 +279,11 @@ class Embedder:
Mapping from dataset columns to keyword arguments for their
respective feature generation functions.
bf_size: int
Size of the Bloom filter. Default is `2**10`.
Size of the Bloom filter. Default is 1024.
num_hashes: int
Number of hashes to perform. Default is two.
offset: int
Offset for Bloom filter to enable masking. Default is one.
Offset for Bloom filter to enable masking. Default is zero.
salt: str, optional
Cryptographic salt added to tokens from the data before hashing.
@@ -324,9 +324,9 @@ def __init__(
self,
feature_factory: dict,
ff_args: dict[str, dict] | None = None,
bf_size: int = 2**10,
bf_size: int = 1024,
num_hashes: int = 2,
offset: int = 1,
offset: int = 0,
salt: str | None = None,
) -> None:
# Get embedding from model
2 changes: 1 addition & 1 deletion test/embedder/test_bloom_filters.py
@@ -23,7 +23,7 @@ def test_bloom_filter_vector_collision_fraction(feature, size, num_hashes, offse
vec_idx_deduped, collision_fraction = bfencoder.bloom_filter_vector_collision_fraction(feature)

assert all(isinstance(element, int) for element in vec_idx_deduped)
assert all(element <= (size + offset - 2) for element in vec_idx_deduped)
assert all(element <= (size + offset - 1) for element in vec_idx_deduped)
assert all(element >= offset for element in vec_idx_deduped)

assert collision_fraction <= 1
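
The loosened bound follows directly from dropping the `size - 1` assignment: each index is now `(digest_int % size) + offset`, so the largest value the encoder can emit is `size + offset - 1` rather than `size + offset - 2`. A quick sanity check of that arithmetic:

```python
# The residue of `% size` tops out at size - 1; the offset is added afterwards.
size, offset = 1024, 0
assert max(x % size + offset for x in range(5 * size)) == size + offset - 1  # 1023
```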
