-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #35 from input-output-hk/jdral/bloomfilters
Add bloom filters with false-negative tests and FPR measurements
- Loading branch information
Showing
6 changed files
with
469 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
{-# LANGUAGE NumericUnderscores #-} | ||
{-# LANGUAGE TypeApplications #-} | ||
{- HLINT ignore "Use camelCase" -} | ||
|
||
module Database.LSMTree.Extras ( | ||
-- * Bloom filter construction | ||
BloomMaker | ||
, mkBloomST | ||
, mkBloomST_Monkey | ||
, mkBloomEasy | ||
) where | ||
|
||
import Control.Monad.ST (runST) | ||
import qualified Data.BloomFilter.Easy as Bloom.Easy (easyList) | ||
import Database.LSMTree.Internal.Run.BloomFilter (Bloom, Hashable) | ||
import qualified Database.LSMTree.Internal.Run.BloomFilter as Bloom | ||
|
||
{------------------------------------------------------------------------------- | ||
Bloom filter construction | ||
-------------------------------------------------------------------------------} | ||
|
||
-- | A function that builds a bloom filter from a list of elements. | ||
type BloomMaker a = [a] -> Bloom a | ||
|
||
-- | Build a bloom filter via the mutable 'MBloom' interface inside 'runST'.
--
-- The number of bits and the number of hash functions are obtained from
-- 'Bloom.suggestSizing', given the number of elements in the input list and
-- the requested false positive rate (FPR).
mkBloomST :: Hashable a => Double -> BloomMaker a
mkBloomST requestedFPR xs = runST $ do
    -- Size the filter for exactly the elements that will be inserted.
    let (nBits, nHashes) = Bloom.suggestSizing (length xs) requestedFPR
    mb <- Bloom.new (Bloom.cheapHashes nHashes) nBits
    mapM_ (Bloom.insert mb) xs
    Bloom.freeze mb
|
||
-- | Build a bloom filter via the mutable 'MBloom' interface inside 'runST',
-- tuned a la Monkey: the number of bits comes from 'Bloom.monkeyBits' and the
-- number of hash functions from 'Bloom.monkeyHashFuncs'.
--
-- === TODO
--
-- The measured FPR exceeds the requested FPR by a few percent. Example:
-- @withNewStdGen $ measureApproximateFPR (Proxy \@Word64)
-- (mkBloomST_Monkey 0.37) 1000000@. The cause is not yet known; some
-- hypotheses:
--
-- * The FPR (and bits/hash functions) calculations are approximations.
-- * Rounding errors in the Haskell implementation of the FPR calculations.
-- * The Monkey tuning is incompatible with @bloomfilter@'s /next power of 2/
--   rounding of the bits.
mkBloomST_Monkey :: Hashable a => Double -> BloomMaker a
mkBloomST_Monkey requestedFPR xs = runST $ do
    let nEntries = length xs
        nBits    = Bloom.monkeyBits nEntries requestedFPR
        -- The hash function count is derived from the chosen number of bits.
        nHashes  = Bloom.monkeyHashFuncs nBits nEntries
    mb <- Bloom.new (Bloom.cheapHashes nHashes) nBits
    mapM_ (Bloom.insert mb) xs
    Bloom.freeze mb
|
||
-- | Build a bloom filter through the "Data.BloomFilter.Easy" interface by
-- delegating to 'Bloom.Easy.easyList', which tunes the filter itself (using
-- 'suggestSizing') from the requested false positive rate and the elements.
mkBloomEasy :: Hashable a => Double -> BloomMaker a
mkBloomEasy requestedFPR xs = Bloom.Easy.easyList requestedFPR xs
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
{-# LANGUAGE BangPatterns #-} | ||
|
||
module System.Random.Extras ( | ||
-- * Sampling from uniform distributions | ||
uniformWithoutReplacement | ||
, uniformWithReplacement | ||
, sampleUniformWithoutReplacement | ||
, sampleUniformWithReplacement | ||
) where | ||
|
||
import Data.List (unfoldr) | ||
import qualified Data.Set as Set | ||
import System.Random (StdGen, Uniform, uniform, uniformR) | ||
import Text.Printf (printf) | ||
|
||
{------------------------------------------------------------------------------- | ||
Sampling from uniform distributions | ||
-------------------------------------------------------------------------------} | ||
|
||
-- | Lazily produce up to @n0@ pairwise distinct values, each drawn with
-- 'uniform'.
--
-- A drawn value that was already produced is discarded and a new one is drawn
-- with the advanced generator, so the output never contains duplicates.
--
-- NOTE: if the type @a@ has fewer than @n0@ distinct values, the search for a
-- fresh value can never succeed once all of them have been produced, and
-- consuming the full result does not terminate.
uniformWithoutReplacement :: (Ord a, Uniform a) => StdGen -> Int -> [a]
uniformWithoutReplacement rng0 n0 = take n0 $ go Set.empty rng0
  where
    -- @seen@ holds every value emitted so far; 'take' above bounds the output,
    -- so no separate element counter is needed here.
    go !seen !rng
      | x `Set.member` seen = go seen rng'
      | otherwise           = x : go (Set.insert x seen) rng'
      where
        (!x, !rng') = uniform rng
|
||
-- | Lazily produce @n0@ values drawn with 'uniform', threading the generator
-- from one draw to the next. The same value may occur more than once.
uniformWithReplacement :: Uniform a => StdGen -> Int -> [a]
uniformWithReplacement rng0 n0 = take n0 (unfoldr draw rng0)
  where
    -- Always yields another element: the stream is infinite until 'take' cuts
    -- it off.
    draw rng = Just (uniform rng)
|
||
-- | Lazily sample @n@ elements from @xs0@ without replacement.
--
-- The input list is first turned into a 'Set.Set', so duplicates in @xs0@ are
-- collapsed: the sampling pool consists of the /distinct/ elements of @xs0@.
-- Each step picks an index uniformly with 'uniformR' and removes the chosen
-- element from the pool.
--
-- Calls 'error' if more elements are demanded than there are distinct
-- elements in @xs0@. Because the result is lazy, the error is only raised
-- when the offending element is actually forced.
sampleUniformWithoutReplacement :: Ord a => StdGen -> Int -> [a] -> [a]
sampleUniformWithoutReplacement rng0 n xs0 = take n $ go pool0 rng0
  where
    pool0 = Set.fromList xs0

    go !pool !rng
      | Set.null pool = error $
          printf "sampleUniformWithoutReplacement: n > number of distinct \
                 \elements of xs0 for n=%d, distinct elements=%d, length xs0=%d"
                 n
                 (Set.size pool0)
                 (length xs0)
      | otherwise =
          -- The strict bindings live in this branch only, so that an empty
          -- pool reaches the error above instead of failing in 'Set.elemAt'.
          let (i, rng') = uniformR (0, Set.size pool - 1) rng
              !x        = Set.elemAt i pool
              !pool'    = Set.deleteAt i pool
          in  x : go pool' rng'
|
||
-- | Lazily sample @n@ elements from @xs0@ with replacement.
--
-- The input list is first turned into a 'Set.Set', so duplicates in @xs0@ are
-- collapsed: every draw picks uniformly (via 'uniformR') among the /distinct/
-- elements of @xs0@, and the pool is never shrunk.
--
-- Calls 'error' if @xs0@ is empty and at least one element is demanded. For
-- @n <= 0@ the result is @[]@ regardless of @xs0@.
sampleUniformWithReplacement :: Ord a => StdGen -> Int -> [a] -> [a]
sampleUniformWithReplacement rng0 n xs0 = take n samples
  where
    pool   = Set.fromList xs0
    lastIx = Set.size pool - 1

    -- Checked once, and only when an element is actually demanded, so that
    -- requesting zero samples from an empty list still yields @[]@.
    samples
      | Set.null pool = error
          "sampleUniformWithReplacement: cannot sample from an empty list"
      | otherwise     = go rng0

    go !rng =
      let (i, rng') = uniformR (0, lastIx) rng
          !x        = Set.elemAt i pool
      in  x : go rng'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.