From edfc420374ff622c21a53d24d572343c7cad24ca Mon Sep 17 00:00:00 2001 From: Joris Dral Date: Fri, 17 Nov 2023 11:19:57 +0100 Subject: [PATCH] Bloom filters and tests --- lsm-tree.cabal | 29 ++- .../LSMTree/Internal/Run/BloomFilter.hs | 84 +++++- src/utils/Database/LSMTree/Extras.hs | 62 +++++ src/utils/System/Random/Extras.hs | 59 +++++ test/Main.hs | 2 + .../LSMTree/Internal/Run/BloomFilter.hs | 239 ++++++++++++++++++ 6 files changed, 469 insertions(+), 6 deletions(-) create mode 100644 src/utils/Database/LSMTree/Extras.hs create mode 100644 src/utils/System/Random/Extras.hs create mode 100644 test/Test/Database/LSMTree/Internal/Run/BloomFilter.hs diff --git a/lsm-tree.cabal b/lsm-tree.cabal index 9cadbd07c..125e143fa 100644 --- a/lsm-tree.cabal +++ b/lsm-tree.cabal @@ -28,6 +28,8 @@ common warnings library import: warnings + hs-source-dirs: src + default-language: Haskell2010 exposed-modules: Data.Map.Range Database.LSMTree.Common @@ -42,6 +44,7 @@ library build-depends: , base >=4.14 && <4.19 + , bloomfilter ^>=2.0.1.2 , bytestring , containers , filepath @@ -52,8 +55,21 @@ library , strict-mvar ^>=1.2 , strict-stm ^>=1.2 - hs-source-dirs: src +library lsm-tree-utils + import: warnings + visibility: private + hs-source-dirs: src/utils default-language: Haskell2010 + exposed-modules: + Database.LSMTree.Extras + System.Random.Extras + + build-depends: + , base + , bloomfilter + , containers + , lsm-tree + , random test-suite lsm-tree-test import: warnings @@ -69,6 +85,7 @@ test-suite lsm-tree-test Database.LSMTree.ModelIO.Normal Database.LSMTree.ModelIO.Session Test.Database.LSMTree.Common + Test.Database.LSMTree.Internal.Run.BloomFilter Test.Database.LSMTree.Model.Monoidal Test.Database.LSMTree.Model.Normal Test.Database.LSMTree.ModelIO.Class @@ -81,7 +98,8 @@ test-suite lsm-tree-test Test.Util.TypeFamilyWrappers build-depends: - , base >=4.14 && <4.19 + , base >=4.14 && <4.19 + , bloomfilter , bytestring , constraints , containers @@ -89,15 +107,16 @@ test-suite lsm-tree-test , deepseq , directory , fs-api - , fs-sim >=0.2 + , fs-sim >=0.2 , io-classes - , io-sim >=1.2 - , lsm-tree + , io-sim >=1.2 + , lsm-tree:{lsm-tree, lsm-tree-utils} , mtl , QuickCheck , quickcheck-dynamic , quickcheck-instances , quickcheck-lockstep + , random , stm , tasty , tasty-hunit diff --git a/src/Database/LSMTree/Internal/Run/BloomFilter.hs b/src/Database/LSMTree/Internal/Run/BloomFilter.hs index e3815b921..82319ffb2 100644 --- a/src/Database/LSMTree/Internal/Run/BloomFilter.hs +++ b/src/Database/LSMTree/Internal/Run/BloomFilter.hs @@ -1,3 +1,5 @@ +{-# LANGUAGE TypeApplications #-} + -- | Bloom filters for probing runs during lookups. -- -- === TODO @@ -22,4 +24,84 @@ -- The above list is a sketch. Functionality may move around, and the list is -- not exhaustive. -- -module Database.LSMTree.Internal.Run.BloomFilter () where +module Database.LSMTree.Internal.Run.BloomFilter ( + -- * Tuning + -- ** @bloomfilter@ package + Easy.suggestSizing + , Easy.safeSuggestSizing + -- ** Monkey + -- $tuning + , monkeyFPR + , monkeyBits + , monkeyHashFuncs + -- * Bloom + , Bloom + , Bloom.elem + -- ** Hashes + , Hash + , Hash.Hashable (..) + , Hash.cheapHashes + -- ** Incremental construction + , MBloom + , Mutable.new + , Mutable.insert + , Bloom.freeze + , Bloom.unsafeFreeze + ) where + +import Data.BloomFilter (Bloom, Hash) +import qualified Data.BloomFilter as Bloom +import qualified Data.BloomFilter.Easy as Easy +import qualified Data.BloomFilter.Hash as Hash +import Data.BloomFilter.Mutable (MBloom) +import qualified Data.BloomFilter.Mutable as Mutable +import Prelude hiding (elem) + +{------------------------------------------------------------------------------- + Tuning a la Monkey +-------------------------------------------------------------------------------} + +-- $tuning +-- +-- These functions are experimental, and will not yet guarantee correct false +-- positive rates. For now, use 'Easy.suggestSizing' and 'Easy.safeSuggestSizing' instead. +-- +-- TODO: un-experimental these functions. + +-- | Compute the false positive rate for a bloom filter. +-- +-- Assumes that the bloom filter uses 'monkeyHashFuncs' hash functions. +-- +-- REF: Equation 2 from the paper /Optimal Bloom Filters and Adaptive Merging +-- for LSM-Trees/. +monkeyFPR :: + Int -- ^ Number of bits assigned to the bloom filter. + -> Int -- ^ Number of entries inserted into the bloom filter. + -> Double +monkeyFPR numBits numEntries = + exp ((-(fromIntegral numBits / fromIntegral numEntries)) * (log 2 ** 2)) + +-- | Compute the number of bits in a bloom filter. +-- +-- Assumes that the bloom filter uses 'monkeyHashFuncs' hash functions. +-- +-- REF: Equation 2 from the paper /Optimal Bloom Filters and Adaptive Merging +-- for LSM-Trees/, rewritten in terms of @bits@ on page 11. +monkeyBits :: + Int -- ^ Number of entries inserted into the bloom filter. + -> Double -- ^ False positive rate. + -> Int +monkeyBits numEntries fpr = ceiling $ + (- fromIntegral numEntries) * (log fpr / (log 2 ** 2)) + +-- | Computes the optimal number of hash functions that minimses the false +-- positive rate for a bloom filter. +-- +-- REF: Footnote 2, page 6 from the paper /Optimal Bloom Filters and Adaptive +-- Merging for LSM-Trees/. +monkeyHashFuncs :: + Int -- ^ Number of bits assigned to the bloom filter. + -> Int -- ^ Number of entries inserted into the bloom filter. + -> Int +monkeyHashFuncs numBits numEntries = truncate @Double $ + (fromIntegral numBits / fromIntegral numEntries) * log 2 diff --git a/src/utils/Database/LSMTree/Extras.hs b/src/utils/Database/LSMTree/Extras.hs new file mode 100644 index 000000000..03890dbc8 --- /dev/null +++ b/src/utils/Database/LSMTree/Extras.hs @@ -0,0 +1,62 @@ +{-# LANGUAGE NumericUnderscores #-} +{-# LANGUAGE TypeApplications #-} +{- HLINT ignore "Use camelCase" -} + +module Database.LSMTree.Extras ( + -- * Bloom filter construction + BloomMaker + , mkBloomST + , mkBloomST_Monkey + , mkBloomEasy + ) where + +import Control.Monad.ST (runST) +import qualified Data.BloomFilter.Easy as Bloom.Easy (easyList) +import Database.LSMTree.Internal.Run.BloomFilter (Bloom, Hashable) +import qualified Database.LSMTree.Internal.Run.BloomFilter as Bloom + +{------------------------------------------------------------------------------- + Bloom filter construction +-------------------------------------------------------------------------------} + +type BloomMaker a = [a] -> Bloom a + +-- | Create a bloom filter through the 'MBloom' interface. Tunes the bloom +-- filter using 'suggestSizing'. +mkBloomST :: Hashable a => Double -> BloomMaker a +mkBloomST requestedFPR xs = runST $ do + b <- Bloom.new (Bloom.cheapHashes numHashFuncs) numBits + mapM_ (Bloom.insert b) xs + Bloom.freeze b + where + numEntries = length xs + (numBits, numHashFuncs) = Bloom.suggestSizing numEntries requestedFPR + +-- | Create a bloom filter through the 'MBloom' interface. Tunes the bloom +-- filter a la Monkey. +-- +-- === TODO +-- +-- The measured FPR exceeds the requested FPR by a number of percentages. +-- Example: @withNewStdGen $ measureApproximateFPR (Proxy @Word64) (mkBloomST' +-- 0.37) 1000000@. I'm unsure why, but I have a number of ideas +-- +-- * The FPR (and bits/hash functions) calculations are approximations. +-- * Rounding errors in the Haskell implementation of FPR calculations +-- * The Monkey tuning is incompatible with @bloomfilter@'s /next power of 2/ +-- rounding of th ebits. +mkBloomST_Monkey :: Hashable a => Double -> BloomMaker a +mkBloomST_Monkey requestedFPR xs = runST $ do + b <- Bloom.new (Bloom.cheapHashes numHashFuncs) numBits + mapM_ (Bloom.insert b) xs + Bloom.freeze b + where + numEntries = length xs + numBits = Bloom.monkeyBits numEntries requestedFPR + numHashFuncs = Bloom.monkeyHashFuncs numBits numEntries + +-- | Create a bloom filter through the "Data.BloomFilter.Easy" interface. Tunes +-- the bloom filter using 'suggestSizing'. +mkBloomEasy :: Hashable a => Double -> BloomMaker a +mkBloomEasy = Bloom.Easy.easyList + diff --git a/src/utils/System/Random/Extras.hs b/src/utils/System/Random/Extras.hs new file mode 100644 index 000000000..f0e502859 --- /dev/null +++ b/src/utils/System/Random/Extras.hs @@ -0,0 +1,59 @@ +{-# LANGUAGE BangPatterns #-} + +module System.Random.Extras ( + -- * Sampling from uniform distributions + uniformWithoutReplacement + , uniformWithReplacement + , sampleUniformWithoutReplacement + , sampleUniformWithReplacement + ) where + +import Data.List (unfoldr) +import qualified Data.Set as Set +import System.Random (StdGen, Uniform, uniform, uniformR) +import Text.Printf (printf) + +{------------------------------------------------------------------------------- + Sampling from uniform distributions +-------------------------------------------------------------------------------} + +uniformWithoutReplacement :: (Ord a, Uniform a) => StdGen -> Int -> [a] +uniformWithoutReplacement rng0 n0 = take n0 $ + go Set.empty (0 :: Int) rng0 + where + go !seen !n !rng + | Set.member x seen = go seen n rng' + | otherwise = x : go (Set.insert x seen) (n+1) rng' + where + (!x, !rng') = uniform rng + +uniformWithReplacement :: Uniform a => StdGen -> Int -> [a] +uniformWithReplacement rng0 n0 = take n0 $ + unfoldr (Just . uniform) rng0 + +sampleUniformWithoutReplacement :: Ord a => StdGen -> Int -> [a] -> [a] +sampleUniformWithoutReplacement rng0 n xs0 = take n $ + go (Set.fromList xs0) rng0 + where + go !xs !_rng | Set.null xs = error $ + printf "sampleUniformWithoutReplacement: n > length xs0 for n=%d, \ + \ length xs0=%d" + n + (length xs0) + + go !xs !rng = x : go xs' rng' + where + (i, rng') = uniformR (0, Set.size xs - 1) rng + !x = Set.elemAt i xs + !xs' = Set.deleteAt i xs + +sampleUniformWithReplacement :: Ord a => StdGen -> Int -> [a] -> [a] +sampleUniformWithReplacement rng0 n xs0 = take n $ + go rng0 + where + xs = Set.fromList xs0 + + go !rng = x : go rng' + where + (i, rng') = uniformR (0, Set.size xs - 1) rng + !x = Set.elemAt i xs diff --git a/test/Main.hs b/test/Main.hs index 1c2825600..ff2e8d51d 100644 --- a/test/Main.hs +++ b/test/Main.hs @@ -3,6 +3,7 @@ module Main (main) where import qualified Test.Database.LSMTree.Common +import qualified Test.Database.LSMTree.Internal.Run.BloomFilter import qualified Test.Database.LSMTree.Model.Monoidal import qualified Test.Database.LSMTree.Model.Normal import qualified Test.Database.LSMTree.ModelIO.Monoidal @@ -18,4 +19,5 @@ main = defaultMain $ testGroup "lsm-tree" , Test.Database.LSMTree.ModelIO.Normal.tests , Test.Database.LSMTree.ModelIO.Monoidal.tests , Test.Database.LSMTree.Normal.StateMachine.tests + , Test.Database.LSMTree.Internal.Run.BloomFilter.tests ] diff --git a/test/Test/Database/LSMTree/Internal/Run/BloomFilter.hs b/test/Test/Database/LSMTree/Internal/Run/BloomFilter.hs new file mode 100644 index 000000000..2fd3201dc --- /dev/null +++ b/test/Test/Database/LSMTree/Internal/Run/BloomFilter.hs @@ -0,0 +1,239 @@ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE GeneralisedNewtypeDeriving #-} +{-# LANGUAGE InstanceSigs #-} +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE NumericUnderscores #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE TypeApplications #-} +{- HLINT ignore "Use camelCase" -} + +module Test.Database.LSMTree.Internal.Run.BloomFilter ( + -- * Main test tree + tests + -- * Bloom filter construction + -- + -- A common interface to bloom filter construction, based on expected false + -- positive rates. + , BloomMaker + , mkBloomST + , mkBloomST_Monkey + , mkBloomEasy + -- * Verifying FPRs + , measureApproximateFPR + , measureExactFPR + ) where + +import Control.Exception (assert) +import Data.Foldable (Foldable (..)) +import Data.Proxy (Proxy (..)) +import Data.Set (Set) +import qualified Data.Set as Set +import Data.Word (Word64) +import Database.LSMTree.Extras +import Database.LSMTree.Internal.Run.BloomFilter (Bloom) +import qualified Database.LSMTree.Internal.Run.BloomFilter as Bloom +import System.Random +import System.Random.Extras +import Test.QuickCheck +import Test.Tasty (TestTree, testGroup) +import Test.Tasty.QuickCheck (testProperty) +import Text.Printf (printf) + +tests :: TestTree +tests = testGroup "Database.LSMTree.Internal.Run.BloomFilter" [ + testGroup "No false negatives" [ + testProperty "mkBloomEasy" $ prop_noFalseNegatives (Proxy @Word64) mkBloomEasy + , testProperty "mkBloomST" $ prop_noFalseNegatives (Proxy @Word64) mkBloomST + , testProperty "mkBloomST_Monkey" $ prop_noFalseNegatives (Proxy @Word64) mkBloomST_Monkey + ] + , testGroup "Verify FPR" [ + testProperty "mkBloomEasy" $ prop_verifyFPR (Proxy @Word64) mkBloomEasy + , testProperty "mkBloomST" $ prop_verifyFPR (Proxy @Word64) mkBloomST + , testProperty "mkBloomST_Monkey" $ expectFailure -- TODO: see 'mkBloomST_Monkey'. + $ prop_verifyFPR (Proxy @Word64) mkBloomST_Monkey + ] + ] + +{------------------------------------------------------------------------------- + Properties +-------------------------------------------------------------------------------} + +prop_noFalseNegatives :: forall a proxy. + proxy a + -> (Double -> BloomMaker a) + -> FPR -- ^ Requested FPR + -> UniformWithoutReplacement a + -> Property +prop_noFalseNegatives _ mkBloom (FPR requestedFPR) (UniformWithoutReplacement xs) = + let xsBloom = mkBloom requestedFPR xs + in property $ all (`Bloom.elem` xsBloom) xs + +prop_verifyFPR :: + (Ord a, Uniform a) + => proxy a + -> (Double -> BloomMaker a) + -> FPR -- ^ Requested FPR + -> NumEntries -- ^ @numEntries@ + -> Seed -- ^ 'StdGen' seed + -> Property +prop_verifyFPR p mkBloom (FPR requestedFPR) (NumEntries numEntries) (Seed seed) = + let stdgen = mkStdGen seed + measuredFPR = measureApproximateFPR p (mkBloom requestedFPR) numEntries stdgen + in counterexample (printf "expected %f <= %f" measuredFPR requestedFPR) $ + FPR measuredFPR <= FPR (requestedFPR + 0.01) + +{------------------------------------------------------------------------------- + Modifiers +-------------------------------------------------------------------------------} + +-- +-- FPR +-- + +newtype FPR = FPR { getFPR :: Double } + deriving (Show, Eq, Ord, Num, Fractional, Floating) + +instance Arbitrary FPR where + arbitrary = FPR <$> arbitrary `suchThat` fprInvariant + shrink (FPR x) = [FPR x' | x' <- shrink x, fprInvariant x'] + +fprInvariant :: Double -> Bool +fprInvariant x = x >= 0.01 && x <= 0.99 + +-- +-- NumEntries +-- + +newtype NumEntries = NumEntries { getNumEntries :: Int } + deriving Show + +instance Arbitrary NumEntries where + arbitrary = NumEntries <$> chooseInt (numEntriesLB, numEntriesUB) + shrink (NumEntries x) = [NumEntries x' | x' <- shrink x, numEntriesInvariant x'] + +numEntriesLB :: Int +numEntriesLB = 1_000 + +numEntriesUB :: Int +numEntriesUB = 100_000 + +numEntriesInvariant :: Int -> Bool +numEntriesInvariant x = x >= numEntriesLB && x <= numEntriesUB + +-- +-- Seed +-- + +newtype Seed = Seed { getSeed :: Int } + deriving Show + +instance Arbitrary Seed where + arbitrary = Seed <$> arbitraryBoundedIntegral + shrink (Seed x) = Seed <$> shrink x + +-- +-- UniformWithoutReplacement +-- + +newtype UniformWithoutReplacement a = UniformWithoutReplacement [a] + +instance Show (UniformWithoutReplacement a) where + show (UniformWithoutReplacement xs) = "UniformWithoutReplacement " <> show (length xs) + +instance (Ord a, Uniform a) => Arbitrary (UniformWithoutReplacement a) where + arbitrary = do + stdgen <- mkStdGen . getSeed <$> arbitrary + numEntries <- getNumEntries <$> arbitrary + pure $ UniformWithoutReplacement $ uniformWithoutReplacement stdgen numEntries + +{------------------------------------------------------------------------------- + Verifying FPRs +-------------------------------------------------------------------------------} + +-- | Measure the /approximate/ FPR for a bloom filter. +-- +-- Ensure that @a@ is large enough to draw @2 * numEntries@ uniformly random +-- values, or the computation will get stuck. +-- +-- REF: based on https://stackoverflow.com/questions/74999807/how-to-measure-the-rate-of-false-positives-in-a-bloom-filter +-- +-- REF: https://en.wikipedia.org/wiki/False_positive_rate +measureApproximateFPR :: + forall a proxy. (Ord a, Uniform a) + => proxy a -- ^ The types of values to generate. + -> BloomMaker a -- ^ How to construct the bloom filter. + -> Int -- ^ @numEntries@: number of entries to put into the bloom filter. + -> StdGen + -> Double +measureApproximateFPR _ mkBloom numEntries stdgen = + let !xs = uniformWithoutReplacement @a stdgen (2 * numEntries) + (!ys, !zs) = splitAt numEntries xs + !ysBloom = mkBloom ys + !ysSet = Set.fromList ys + oneIfElem z = assert (not $ Set.member z ysSet) + $ if Bloom.elem z ysBloom then 1 else 0 + !fp = foldl' (\acc x -> acc + oneIfElem x) (0 :: Int) zs + !fp' = fromIntegral fp :: Double + in fp' / fromIntegral numEntries -- FPR = FP / FP + TN + +-- | Measure the /exact/ FPR for a bloom filter. +-- +-- Ensure that @a@ is small enough that we can enumare it within reasonable +-- time. For example, a 'Word16' would be fine, but a 'Word32' would take much +-- too long. +measureExactFPR :: + forall a proxy. (Ord a, Enum a, Bounded a, Uniform a) + => proxy a -- ^ The types of values to generate. + -> BloomMaker a -- ^ How to construct the bloom filter. + -> Int -- ^ @numEntries@: number of entries to put into the bloom filter. + -> StdGen + -> Double +measureExactFPR _ mkBloom numEntries stdgen = + let !xs = uniformWithoutReplacement @a stdgen numEntries + !xsBloom = mkBloom xs + !xsSet = Set.fromList xs + !aEnumerated = [minBound .. maxBound] + Counts _ !fp !tn _ = foldMap' (fromTest . analyse xsBloom xsSet) aEnumerated + fp' = fromIntegral fp :: Double + tn' = fromIntegral tn :: Double + in fp' / (fp' + tn') -- FPR = FP / FP + TN + +data Test = + TruePositive + | FalsePositive + | TrueNegative + | FalseNegative + +analyse :: Ord a => Bloom a -> Set a -> a -> Test +analyse xsBloom xsSet y + | isBloomMember && isTrueMember = TruePositive + | isBloomMember && not isTrueMember = FalsePositive + | not isBloomMember && not isTrueMember = TrueNegative + | otherwise = FalseNegative + where + isBloomMember = Bloom.elem y xsBloom + isTrueMember = Set.member y xsSet + +fromTest :: Test -> Counts +fromTest = \case + TruePositive -> Counts 1 0 0 0 + FalsePositive -> Counts 0 1 0 0 + TrueNegative -> Counts 0 0 1 0 + FalseNegative -> Counts 0 0 0 1 + + +data Counts = Counts { + _cTruePositives :: !Int + , _cFalsePositives :: !Int + , _cTrueNegatives :: !Int + , _cFalseNegatives :: !Int + } + +instance Semigroup Counts where + (<>) :: Counts -> Counts -> Counts + (Counts tp1 fp1 tn1 fn1) <> (Counts tp2 fp2 tn2 fn2) = + Counts (tp1 + tp2) (fp1 + fp2) (tn1 + tn2) (fn1 + fn2) + +instance Monoid Counts where + mempty :: Counts + mempty = Counts 0 0 0 0