Skip to content

Commit

Permalink
Merge pull request #35 from input-output-hk/jdral/bloomfilters
Browse files Browse the repository at this point in the history
Add bloom filters with false-negative tests and FPR measurements
  • Loading branch information
dcoutts authored Nov 22, 2023
2 parents c7473d3 + edfc420 commit 4facb38
Show file tree
Hide file tree
Showing 6 changed files with 469 additions and 6 deletions.
29 changes: 24 additions & 5 deletions lsm-tree.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ common warnings

library
import: warnings
hs-source-dirs: src
default-language: Haskell2010
exposed-modules:
Data.Map.Range
Database.LSMTree.Common
Expand All @@ -42,6 +44,7 @@ library

build-depends:
, base >=4.14 && <4.19
, bloomfilter ^>=2.0.1.2
, bytestring
, containers
, filepath
Expand All @@ -52,8 +55,21 @@ library
, strict-mvar ^>=1.2
, strict-stm ^>=1.2

hs-source-dirs: src
library lsm-tree-utils
import: warnings
visibility: private
hs-source-dirs: src/utils
default-language: Haskell2010
exposed-modules:
Database.LSMTree.Extras
System.Random.Extras

build-depends:
, base
, bloomfilter
, containers
, lsm-tree
, random

test-suite lsm-tree-test
import: warnings
Expand All @@ -69,6 +85,7 @@ test-suite lsm-tree-test
Database.LSMTree.ModelIO.Normal
Database.LSMTree.ModelIO.Session
Test.Database.LSMTree.Common
Test.Database.LSMTree.Internal.Run.BloomFilter
Test.Database.LSMTree.Model.Monoidal
Test.Database.LSMTree.Model.Normal
Test.Database.LSMTree.ModelIO.Class
Expand All @@ -81,23 +98,25 @@ test-suite lsm-tree-test
Test.Util.TypeFamilyWrappers

build-depends:
, base >=4.14 && <4.19
, base >=4.14 && <4.19
, bloomfilter
, bytestring
, constraints
, containers
, cryptohash-sha256
, deepseq
, directory
, fs-api
, fs-sim >=0.2
, fs-sim >=0.2
, io-classes
, io-sim >=1.2
, lsm-tree
, io-sim >=1.2
, lsm-tree:{lsm-tree, lsm-tree-utils}
, mtl
, QuickCheck
, quickcheck-dynamic
, quickcheck-instances
, quickcheck-lockstep
, random
, stm
, tasty
, tasty-hunit
Expand Down
84 changes: 83 additions & 1 deletion src/Database/LSMTree/Internal/Run/BloomFilter.hs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
{-# LANGUAGE TypeApplications #-}

-- | Bloom filters for probing runs during lookups.
--
-- === TODO
Expand All @@ -22,4 +24,84 @@
-- The above list is a sketch. Functionality may move around, and the list is
-- not exhaustive.
--
module Database.LSMTree.Internal.Run.BloomFilter () where
module Database.LSMTree.Internal.Run.BloomFilter (
-- * Tuning
-- ** @bloomfilter@ package
Easy.suggestSizing
, Easy.safeSuggestSizing
-- ** Monkey
-- $tuning
, monkeyFPR
, monkeyBits
, monkeyHashFuncs
-- * Bloom
, Bloom
, Bloom.elem
-- ** Hashes
, Hash
, Hash.Hashable (..)
, Hash.cheapHashes
-- ** Incremental construction
, MBloom
, Mutable.new
, Mutable.insert
, Bloom.freeze
, Bloom.unsafeFreeze
) where

import Data.BloomFilter (Bloom, Hash)
import qualified Data.BloomFilter as Bloom
import qualified Data.BloomFilter.Easy as Easy
import qualified Data.BloomFilter.Hash as Hash
import Data.BloomFilter.Mutable (MBloom)
import qualified Data.BloomFilter.Mutable as Mutable
import Prelude hiding (elem)

{-------------------------------------------------------------------------------
Tuning a la Monkey
-------------------------------------------------------------------------------}

-- $tuning
--
-- These functions are experimental, and will not yet guarantee correct false
-- positive rates. For now, use 'Easy.suggestSizing' and 'Easy.safeSuggestSizing' instead.
--
-- TODO: un-experimental these functions.

-- | Compute the false positive rate for a bloom filter.
--
-- Assumes that the bloom filter uses 'monkeyHashFuncs' hash functions.
--
-- REF: Equation 2 from the paper /Optimal Bloom Filters and Adaptive Merging
-- for LSM-Trees/.
monkeyFPR ::
Int -- ^ Number of bits assigned to the bloom filter.
-> Int -- ^ Number of entries inserted into the bloom filter.
-> Double
monkeyFPR numBits numEntries =
exp ((-(fromIntegral numBits / fromIntegral numEntries)) * (log 2 ** 2))

-- | Compute the number of bits in a bloom filter.
--
-- Assumes that the bloom filter uses 'monkeyHashFuncs' hash functions.
--
-- REF: Equation 2 from the paper /Optimal Bloom Filters and Adaptive Merging
-- for LSM-Trees/, rewritten in terms of @bits@ on page 11.
monkeyBits ::
Int -- ^ Number of entries inserted into the bloom filter.
-> Double -- ^ False positive rate.
-> Int
monkeyBits numEntries fpr = ceiling $
(- fromIntegral numEntries) * (log fpr / (log 2 ** 2))

-- | Computes the optimal number of hash functions that minimses the false
-- positive rate for a bloom filter.
--
-- REF: Footnote 2, page 6 from the paper /Optimal Bloom Filters and Adaptive
-- Merging for LSM-Trees/.
monkeyHashFuncs ::
Int -- ^ Number of bits assigned to the bloom filter.
-> Int -- ^ Number of entries inserted into the bloom filter.
-> Int
monkeyHashFuncs numBits numEntries = truncate @Double $
(fromIntegral numBits / fromIntegral numEntries) * log 2
62 changes: 62 additions & 0 deletions src/utils/Database/LSMTree/Extras.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
{-# LANGUAGE NumericUnderscores #-}
{-# LANGUAGE TypeApplications #-}
{- HLINT ignore "Use camelCase" -}

module Database.LSMTree.Extras (
-- * Bloom filter construction
BloomMaker
, mkBloomST
, mkBloomST_Monkey
, mkBloomEasy
) where

import Control.Monad.ST (runST)
import qualified Data.BloomFilter.Easy as Bloom.Easy (easyList)
import Database.LSMTree.Internal.Run.BloomFilter (Bloom, Hashable)
import qualified Database.LSMTree.Internal.Run.BloomFilter as Bloom

{-------------------------------------------------------------------------------
Bloom filter construction
-------------------------------------------------------------------------------}

type BloomMaker a = [a] -> Bloom a

-- | Create a bloom filter through the 'MBloom' interface. Tunes the bloom
-- filter using 'suggestSizing'.
mkBloomST :: Hashable a => Double -> BloomMaker a
mkBloomST requestedFPR xs = runST $ do
b <- Bloom.new (Bloom.cheapHashes numHashFuncs) numBits
mapM_ (Bloom.insert b) xs
Bloom.freeze b
where
numEntries = length xs
(numBits, numHashFuncs) = Bloom.suggestSizing numEntries requestedFPR

-- | Create a bloom filter through the 'MBloom' interface. Tunes the bloom
-- filter a la Monkey.
--
-- === TODO
--
-- The measured FPR exceeds the requested FPR by a number of percentages.
-- Example: @withNewStdGen $ measureApproximateFPR (Proxy @Word64) (mkBloomST'
-- 0.37) 1000000@. I'm unsure why, but I have a number of ideas
--
-- * The FPR (and bits/hash functions) calculations are approximations.
-- * Rounding errors in the Haskell implementation of FPR calculations
-- * The Monkey tuning is incompatible with @bloomfilter@'s /next power of 2/
-- rounding of th ebits.
mkBloomST_Monkey :: Hashable a => Double -> BloomMaker a
mkBloomST_Monkey requestedFPR xs = runST $ do
b <- Bloom.new (Bloom.cheapHashes numHashFuncs) numBits
mapM_ (Bloom.insert b) xs
Bloom.freeze b
where
numEntries = length xs
numBits = Bloom.monkeyBits numEntries requestedFPR
numHashFuncs = Bloom.monkeyHashFuncs numBits numEntries

-- | Create a bloom filter through the "Data.BloomFilter.Easy" interface. Tunes
-- the bloom filter using 'suggestSizing'.
mkBloomEasy :: Hashable a => Double -> BloomMaker a
mkBloomEasy = Bloom.Easy.easyList

59 changes: 59 additions & 0 deletions src/utils/System/Random/Extras.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
{-# LANGUAGE BangPatterns #-}

module System.Random.Extras (
-- * Sampling from uniform distributions
uniformWithoutReplacement
, uniformWithReplacement
, sampleUniformWithoutReplacement
, sampleUniformWithReplacement
) where

import Data.List (unfoldr)
import qualified Data.Set as Set
import System.Random (StdGen, Uniform, uniform, uniformR)
import Text.Printf (printf)

{-------------------------------------------------------------------------------
Sampling from uniform distributions
-------------------------------------------------------------------------------}

uniformWithoutReplacement :: (Ord a, Uniform a) => StdGen -> Int -> [a]
uniformWithoutReplacement rng0 n0 = take n0 $
go Set.empty (0 :: Int) rng0
where
go !seen !n !rng
| Set.member x seen = go seen n rng'
| otherwise = x : go (Set.insert x seen) (n+1) rng'
where
(!x, !rng') = uniform rng

uniformWithReplacement :: Uniform a => StdGen -> Int -> [a]
uniformWithReplacement rng0 n0 = take n0 $
unfoldr (Just . uniform) rng0

sampleUniformWithoutReplacement :: Ord a => StdGen -> Int -> [a] -> [a]
sampleUniformWithoutReplacement rng0 n xs0 = take n $
go (Set.fromList xs0) rng0
where
go !xs !_rng | Set.null xs = error $
printf "sampleUniformWithoutReplacement: n > length xs0 for n=%d, \
\ length xs0=%d"
n
(length xs0)

go !xs !rng = x : go xs' rng'
where
(i, rng') = uniformR (0, Set.size xs - 1) rng
!x = Set.elemAt i xs
!xs' = Set.deleteAt i xs

sampleUniformWithReplacement :: Ord a => StdGen -> Int -> [a] -> [a]
sampleUniformWithReplacement rng0 n xs0 = take n $
go rng0
where
xs = Set.fromList xs0

go !rng = x : go rng'
where
(i, rng') = uniformR (0, Set.size xs - 1) rng
!x = Set.elemAt i xs
2 changes: 2 additions & 0 deletions test/Main.hs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
module Main (main) where

import qualified Test.Database.LSMTree.Common
import qualified Test.Database.LSMTree.Internal.Run.BloomFilter
import qualified Test.Database.LSMTree.Model.Monoidal
import qualified Test.Database.LSMTree.Model.Normal
import qualified Test.Database.LSMTree.ModelIO.Monoidal
Expand All @@ -18,4 +19,5 @@ main = defaultMain $ testGroup "lsm-tree"
, Test.Database.LSMTree.ModelIO.Normal.tests
, Test.Database.LSMTree.ModelIO.Monoidal.tests
, Test.Database.LSMTree.Normal.StateMachine.tests
, Test.Database.LSMTree.Internal.Run.BloomFilter.tests
]
Loading

0 comments on commit 4facb38

Please sign in to comment.