diff --git a/README.md b/README.md
index d4a0543..34ab35b 100644
--- a/README.md
+++ b/README.md
@@ -133,9 +133,9 @@ uses the Soft Cosine Measure to calculate record-wise similarity scores.
 
 ```python
 >>> similarities = embedder.compare(edf1, edf2)
 >>> similarities
-SimilarityArray([[0.80074101, 0.18160957, 0.09722178],
-                 [0.40124732, 0.1877348 , 0.58792979],
-                 [0.13147656, 0.51426533, 0.11772856]])
+SimilarityArray([[0.81229552, 0.1115206 , 0.09557733],
+                 [0.35460909, 0.16368072, 0.60428527],
+                 [0.11720977, 0.50957391, 0.10343462]])
 ```
 
diff --git a/docs/tutorials/example-febrl.qmd b/docs/tutorials/example-febrl.qmd
index 5c674cf..bbd26e2 100644
--- a/docs/tutorials/example-febrl.qmd
+++ b/docs/tutorials/example-febrl.qmd
@@ -72,11 +72,10 @@ feature_factory = dict(
 
 ## Initialise the embedder instance
 
-This instance embeds each feature twice into a Bloom filter of length 1025
-(with the offset).
+This instance embeds each feature twice into a Bloom filter of length 1024.
 
 ```{python}
-embedder = Embedder(feature_factory, bf_size=2**10, num_hashes=2)
+embedder = Embedder(feature_factory, bf_size=1024, num_hashes=2)
 ```
 
 ## Embed the datasets
diff --git a/docs/tutorials/run-through.qmd b/docs/tutorials/run-through.qmd
index 39201f6..1835cd5 100644
--- a/docs/tutorials/run-through.qmd
+++ b/docs/tutorials/run-through.qmd
@@ -72,7 +72,7 @@ ff_args = dict(name={}, sex={}, dob={})
 
 ## Embedding
 
 Now we can create an `Embedder` object. We want our Bloom filter vectors to
-have a length of 1024 elements (actually 1025 because of an offset), and we
+have a length of 1024 elements, and we
 choose to hash each feature two times. These choices seem to work ok, but we
 haven't explored them systematically.
diff --git a/src/pprl/app/utils.py b/src/pprl/app/utils.py
index 6e16970..83e5e43 100644
--- a/src/pprl/app/utils.py
+++ b/src/pprl/app/utils.py
@@ -138,11 +138,8 @@ def convert_dataframe_to_bf(
         other_columns = []
     output_columns = other_columns + ["bf_indices", "bf_norms", "thresholds"]
 
-    NUMHASHES = 2
-    OFFSET = 1
     NGRAMS = [1, 2, 3, 4]
     FFARGS = {"name": {"ngram_length": NGRAMS, "use_gen_skip_grams": True}}
-    BFSIZE = 2**10
 
     column_types_dict = {
         "name": features.gen_name_features,
@@ -155,9 +152,6 @@
 
     embedder = Embedder(
         feature_factory=column_types_dict,
         ff_args=FFARGS,
-        bf_size=BFSIZE,
-        num_hashes=NUMHASHES,
-        offset=OFFSET,
         salt=salt,
     )
diff --git a/src/pprl/embedder/bloom_filters.py b/src/pprl/embedder/bloom_filters.py
index d12a52d..6e21abd 100644
--- a/src/pprl/embedder/bloom_filters.py
+++ b/src/pprl/embedder/bloom_filters.py
@@ -11,38 +11,39 @@ class BloomFilterEncoder:
 
     1. Compute the hash digest for your tokens
     2. Convert the digest bytes into integers
-    3. Map the integer to a bloom filter vector (modulo `b`, where `b`
-       represents the length of the vector)
+    3. Map the integer to a Bloom filter vector (modulo the length of the vector)
 
     Parameters
     ----------
     size: int
-        Size of the Bloom filter.
+        Size of the Bloom filter. Defaults to 1024.
     num_hashes: int
-        Number of hashes to perform. Defaults to three.
+        Number of hashes to perform. Defaults to two.
     offset: int
         Offset for Bloom filter indices to allow for masking. Defaults
-        to one.
+        to zero.
     salt: str, optional
         Cryptographic salt appended to tokens prior to hashing.
 
     Attributes
     ----------
     hash_function: func
-        Hashing function (`hashlib.sha1`).
+        Hashing function (`hashlib.sha256`).
""" def __init__( - self, size: int, num_hashes: int = 3, offset: int = 1, salt: str | None = None + self, size: int = 1024, num_hashes: int = 2, offset: int = 0, salt: str | None = None ) -> None: - self.size = size - 1 + self.size = size self.num_hashes = num_hashes self.offset = offset self.salt = salt or "" - self.hash_function = hashlib.sha1 + self.hash_function = hashlib.sha256 - def bloom_filter_vector_collision_fraction(self, feature: list) -> tuple[list, float]: + def bloom_filter_vector_collision_fraction( + self, feature: list[str] + ) -> tuple[list[int], float]: """Convert a feature vector and return its collision fraction. The index vector uses an optional offset for masking. @@ -58,15 +59,28 @@ def bloom_filter_vector_collision_fraction(self, feature: list) -> tuple[list, f Index values used to create the Bloom filter vector. collision_fraction: float Proportion of repeated indices. + + Examples + -------- + >>> bfe = BloomFilterEncoder() + >>> bfe.bloom_filter_vector_collision_fraction(["a","b","c"]) + ([334, 1013, 192, 381, 18, 720], 0.0) """ - feature_int_repr = self.feature_to_big_int_repr(feature) - vec_idx = self.big_int_to_vec(feature_int_repr, offset=self.offset) + vec_idx: list = [] + + for gram in feature: + for i in range(self.num_hashes): + utf_string_with_salt = (str(gram) + str(i) + str(self.salt)).encode("UTF-8") + digest = self.hash_function(utf_string_with_salt).digest() + digest_as_int = (int.from_bytes(digest, "little") % self.size) + self.offset + vec_idx.append(digest_as_int) + vec_idx_deduped = [*set(vec_idx)] collision_fraction = 1 - len(vec_idx_deduped) / len(vec_idx) return vec_idx_deduped, collision_fraction - def bloom_filter_vector(self, feature: list) -> list[int]: + def bloom_filter_vector(self, feature: list[str]) -> list[int]: """Convert a feature vector into indices for a Bloom vector. The index vector uses an optional offset for masking. @@ -80,63 +94,13 @@ def bloom_filter_vector(self, feature: list) -> list[int]: ------- vector_idxs: list Index values used to create the Bloom filter vector. - """ - feature_int_repr = self.feature_to_big_int_repr(feature) - vec_idx = self.big_int_to_vec(feature_int_repr, offset=self.offset) - vec_idx_deduped = [*set(vec_idx)] - - return vec_idx_deduped - - def big_int_to_vec(self, feature_ints: list, offset: int = 1) -> list[int]: - """Convert an integer vector into indices for a Bloom vector. - - This conversion inserts 1 at the location derived from the - integer vector, which is an integer representation of a - deterministic hash value, modulo to the size of the Bloom - filter. - - Parameters - ---------- - feature_ints: list - List of integer values representing the feature. - offset: int - An offset to indices to allow for masking. Defaults to one. - - Returns - ------- - vector_idxs: list - List of integers representing an index on the Bloom filter. - """ - return list(map(lambda x: x % self.size + offset, feature_ints)) - - def feature_to_big_int_repr(self, feature: list) -> list[int]: - """Convert a feature vector into an integer vector. - - This conversion first generates a hash digest for each member of - the feature vector and then converts them to an integer. - - Parameters - ---------- - feature: list - List of features to be processed. - Returns - ------- - feature_ints: list - List of features as integers. 
-        """
-        feature_int_repr: list = []
-        # hash function will create a 256-bit integer
-        # under the random oracle model this integer will be deterministic
-        # depending on the token passed to
-        # the hash function
-
-        for gram in feature:
-            for i in range(self.num_hashes):
-                utf_string_with_salt = (str(gram) + str(i) + str(self.salt)).encode("UTF-8")
-                digest = self.hash_function(utf_string_with_salt).digest()
-                # integer value uses little endianness for amd64 architecture
-                int_repr = int.from_bytes(digest, "little")
-                feature_int_repr.append(int_repr)
-
-        return feature_int_repr
+        vec_idx_deduped, _ = self.bloom_filter_vector_collision_fraction(feature)
+
+        return vec_idx_deduped
diff --git a/src/pprl/embedder/embedder.py b/src/pprl/embedder/embedder.py
index e5844be..22fd68a 100644
--- a/src/pprl/embedder/embedder.py
+++ b/src/pprl/embedder/embedder.py
@@ -279,11 +279,11 @@ class Embedder:
         Mapping from dataset columns to keyword arguments for their
         respective feature generation functions.
     bf_size: int
-        Size of the Bloom filter. Default is `2**10`.
+        Size of the Bloom filter. Default is 1024.
     num_hashes: int
         Number of hashes to perform. Default is two.
     offset: int
-        Offset for Bloom filter to enable masking. Default is one.
+        Offset for Bloom filter to enable masking. Default is zero.
     salt: str, optional
         Cryptographic salt added to tokens from the data before
         hashing.
@@ -324,9 +324,9 @@ def __init__(
         self,
         feature_factory: dict,
         ff_args: dict[str, dict] | None = None,
-        bf_size: int = 2**10,
+        bf_size: int = 1024,
         num_hashes: int = 2,
-        offset: int = 1,
+        offset: int = 0,
         salt: str | None = None,
     ) -> None:
         # Get embedding from model
diff --git a/test/embedder/test_bloom_filters.py b/test/embedder/test_bloom_filters.py
index f6e4476..cdd6487 100644
--- a/test/embedder/test_bloom_filters.py
+++ b/test/embedder/test_bloom_filters.py
@@ -23,7 +23,7 @@ def test_bloom_filter_vector_collision_fraction(feature, size, num_hashes, offse
     vec_idx_deduped, collision_fraction = bfencoder.bloom_filter_vector_collision_fraction(feature)
 
     assert all(isinstance(element, int) for element in vec_idx_deduped)
-    assert all(element <= (size + offset - 2) for element in vec_idx_deduped)
+    assert all(element <= (size + offset - 1) for element in vec_idx_deduped)
     assert all(element >= offset for element in vec_idx_deduped)
     assert collision_fraction <= 1
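For reviewers who want to sanity-check the new defaults without installing the package, here is a minimal standalone sketch of the encoding loop this patch inlines into `bloom_filter_vector_collision_fraction`. The construction (SHA-256, little-endian byte order, `size=1024`, `num_hashes=2`, `offset=0`) is taken from the diff; the function name `sketch_bloom_filter_vector` and the closing assertion are illustrative only, not part of the patch.

```python
import hashlib


def sketch_bloom_filter_vector(
    feature: list[str],
    size: int = 1024,
    num_hashes: int = 2,
    offset: int = 0,
    salt: str = "",
) -> list[int]:
    """Mirror the patched loop: map each token to `num_hashes` filter indices."""
    vec_idx: list[int] = []
    for gram in feature:
        for i in range(num_hashes):
            # Same construction as the patch: token + hash counter + salt,
            # hashed with SHA-256 and read as a little-endian integer.
            digest = hashlib.sha256((str(gram) + str(i) + salt).encode("UTF-8")).digest()
            vec_idx.append((int.from_bytes(digest, "little") % size) + offset)
    return [*set(vec_idx)]  # dedupe, as bloom_filter_vector does


indices = sketch_bloom_filter_vector(["a", "b", "c"])
# With offset=0 the indices span the full range [0, size - 1], which is exactly
# the bound the updated test asserts via `element <= (size + offset - 1)`.
assert all(0 <= idx < 1024 for idx in indices)
```

The new defaults also encode a behavioural fix: the old constructor stored `size - 1` and offset by one, so the advertised `bf_size`, the modulus, and the usable index range all disagreed, which is why the tutorials needed the "(actually 1025 because of an offset)" caveat. With `self.size = size` and `offset=0`, a 1024-bit filter means exactly 1024 usable positions, indices 0 through 1023.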