Skip to content

Commit

Permalink
Merge pull request #52 from input-output-hk/jdral/inmem-lookups
Browse files Browse the repository at this point in the history
The in-memory aspect of `lookups` and comparative benchmarks
  • Loading branch information
jorisdral authored Jan 8, 2024
2 parents 2a99c3e + e438fb4 commit a2f5ff2
Show file tree
Hide file tree
Showing 9 changed files with 354 additions and 7 deletions.
268 changes: 268 additions & 0 deletions bench/micro/Bench/Database/LSMTree/Internal/Integration.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,268 @@
{-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE DeriveFunctor #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE NumericUnderscores #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeApplications #-}
{- HLINT ignore "Use const" -}

module Bench.Database.LSMTree.Internal.Integration (benchmarks, analysis) where

import Bench.Database.LSMTree.Internal.Run.BloomFilter (elems)
import Bench.Database.LSMTree.Internal.Run.Index.Compact (searches)
import Control.DeepSeq (NFData)
import Control.Monad
import Criterion.Main (Benchmark, bench, bgroup, env, nf, whnf)
import Data.List (sort)
import Data.List.Extra (nubSort)
import Data.List.NonEmpty (NonEmpty (..))
import qualified Data.List.NonEmpty as NonEmpty
import Data.Maybe (fromMaybe)
import Data.Proxy (Proxy (..))
import Database.LSMTree.Generators (RFPrecision (..), UTxOKey)
import Database.LSMTree.Internal.Integration (prepLookups)
import Database.LSMTree.Internal.Run.BloomFilter (Bloom)
import qualified Database.LSMTree.Internal.Run.BloomFilter as Bloom
import Database.LSMTree.Internal.Run.Index.Compact (CompactIndex)
import qualified Database.LSMTree.Internal.Run.Index.Compact as Index
import Database.LSMTree.Internal.Serialise (Serialise (..),
SerialisedKey, topBits16)
import Database.LSMTree.Util.Orphans ()
import GHC.Generics (Generic)
import Prelude hiding (getContents)
import System.Random (Uniform, newStdGen)
import System.Random.Extras (sampleUniformWithReplacement,
uniformWithoutReplacement)
import Test.QuickCheck (generate, shuffle)
import Text.Printf (printf)

benchmarks :: Benchmark
benchmarks = bgroup "Bench.Database.LSMTree.Internal.Integration" [
bgroup "prepLookups for a single run" [
benchPrepLookups defaultConfig
, benchPrepLookups (defaultConfig {
name = "default onlyPos"
, nneg = 0
})
, benchPrepLookups (defaultConfig {
name = "default onlyNeg"
, npos = 0
})
, benchPrepLookups (defaultConfig {
name = "default high fpr"
, fpr = 0.9
})
, benchPrepLookups (defaultConfig {
name = "default small"
, npages = 10_000
, npos = 1_000
, nneg = 1_000
})
]
]
where
benchPrepLookups :: Config -> Benchmark
benchPrepLookups conf@Config{name} =
env (prepLookupsEnv (Proxy @UTxOKey) conf) $ \ ~(b, ci, ks) ->
-- see 'npages'.
bgroup (printf "%s, %d actual pages" name (Index.sizeInPages ci)) [
-- the bloomfilter is queried for all lookup keys
bench "Bloomfilter query" $ whnf (elems b) ks
-- the compact index is only searched for (true and false) positive
-- lookup keys
, env (pure $ filter (`Bloom.elem` b) ks) $ \ks' ->
bench "Compact index search" $ whnf (searches ci) ks'
-- All prepped lookups are going to be used eventually so we use
-- @nf@ on the result of 'prepLookups' to ensure that we actually
-- compute the full list.
, bench "In-memory lookup" $ nf (prepLookups [((), b, ci)]) ks
]

{-------------------------------------------------------------------------------
Analysis
-------------------------------------------------------------------------------}

-- In this analysis, around @15%@ to @20%@ of the measured time for
-- 'prepLookups' is not accounted for by bloom filter queries and compact index
-- searches.
analysis :: IO ()
analysis = do
-- (name, bloomfilter query, compact index search, prepLookups)
let def = ("default", 1.722 , 0.966 , 3.294)
onlyPos = ("onlyPos", 0.9108, 0.8873 , 2.139)
onlyNeg = ("onlyNeg", 0.6784, 0.009573, 0.8683)
highFpr = ("highFpr", 1.155 , 1.652 , 3.417)
small = ("small" , 0.1602, 0.06589 , 0.2823)

results :: [(String, Double, Double, Double)]
results = [def, onlyPos, onlyNeg, highFpr, small]

forM_ results $ \(name, query, search, prep) -> do
-- the measured time for 'prepLookups' should be close to the time spent on
-- bloom filter queries and compact index searches
let diff = prep - (query + search)
diffPercent = diff / prep
print (name, query, search, prep, diff, diffPercent)

{-------------------------------------------------------------------------------
Environments
-------------------------------------------------------------------------------}

-- | Config options describing a benchmarking scenario
data Config = Config {
-- | Name for the benchmark scenario described by this config.
name :: !String
-- | If 'Nothing', use 'suggestRangeFinderPrecision'.
, rfprecDef :: !(Maybe Int)
-- | Chunk size for compact index construction
, csize :: !Int
-- | Number of pages in total
--
-- Note: the actual number of pages can be higher, because of the
-- partitioned pages restriction.
, npages :: !Int
-- | Number of entries per page
, npageEntries :: !Int
-- | Number of positive lookups
, npos :: !Int
-- | Number of negative lookups
, nneg :: !Int
, fpr :: !Double
}

defaultConfig :: Config
defaultConfig = Config {
name = "default"
, rfprecDef = Nothing
, csize = 100
, npages = 50_000
, npageEntries = 40
, npos = 10_000
, nneg = 10_000
, fpr = 0.1
}

-- | Use 'lookupsEnv' to set up an environment for the in-memory aspect of
-- lookups.
prepLookupsEnv ::
forall k. (Ord k, Uniform k, Serialise k)
=> Proxy k
-> Config
-> IO (Bloom SerialisedKey, CompactIndex, [SerialisedKey])
prepLookupsEnv _ Config {..} = do
(storedKeys, lookupKeys) <- lookupsEnv @k totalEntries npos nneg
let b = Bloom.fromList fpr $ fmap serialise storedKeys
ps = mkPages (RFPrecision rfprec) $ NonEmpty.fromList storedKeys
ps' = fmap serialise ps
psMinMax = (\p -> (minKey p, maxKey p)) <$> getPages ps'
ci = Index.fromList rfprec csize psMinMax
pure (b, ci, fmap serialise lookupKeys)
where
totalEntries = npages * npageEntries
rfprec = fromMaybe (Index.suggestRangeFinderPrecision npages) rfprecDef

-- | Generate keys to store and keys to lookup
lookupsEnv ::
(Ord k, Uniform k)
=> Int -- ^ Number of stored keys
-> Int -- ^ Number of positive lookups
-> Int -- ^ Number of negative lookups
-> IO ([k], [k])
lookupsEnv nkeys npos nneg = do
stdgen <- newStdGen
stdgen' <- newStdGen
let (xs, ys1) = splitAt nkeys
$ uniformWithoutReplacement stdgen (nkeys + nneg)
ys2 = sampleUniformWithReplacement stdgen' npos xs
zs <- generate $ shuffle (ys1 ++ ys2)
pure (xs, zs)

{-------------------------------------------------------------------------------
Pages
-------------------------------------------------------------------------------}

-- TODO: either remove the @f@ parameter and specialise for 'NonEmpty', or merge
-- this code with the @Pages@ type from "Database.LSMTree.Generators".

pageResidency :: Int
pageResidency = 40

class MinMax f where
minKey :: Page f k -> k
maxKey :: Page f k -> k

instance MinMax NonEmpty where
minKey = NonEmpty.head . getContents
maxKey = NonEmpty.last . getContents

newtype Page f k = Page { getContents :: f k }
deriving stock (Show, Generic, Functor)
deriving anyclass NFData

-- | We model a disk page in a run as a pair of its minimum and maximum key.
--
-- A run consists of multiple pages in sorted order, and keys are unique. Pages
-- are partitioned, meaning all keys inside a page have the same range-finder
-- bits. A run can not be empty, and a page can not be empty.
data Pages f k = Pages {
getRangeFinderPrecision :: RFPrecision
, getPages :: [Page f k]
}
deriving stock (Show, Generic, Functor)
deriving anyclass NFData

mkPages ::
forall k. (Ord k, Serialise k)
=> RFPrecision
-> NonEmpty k
-> Pages NonEmpty k
mkPages rfprec@(RFPrecision n) =
Pages rfprec . go . nubSort . NonEmpty.toList
where
go :: [k] -> [Page NonEmpty k]
go [] = []
go [k] = [Page $ k :| []]
go (k:ks) = Page (NonEmpty.fromList (k:ks1)) : go ks2
where
(ks1, ks2) = spanN
(pageResidency - 1)
(\k' -> topBits16 n (serialise k) == topBits16 n (serialise k'))
ks

_pagesInvariant :: (Ord k, Serialise k) => Pages NonEmpty k -> Bool
_pagesInvariant (Pages (RFPrecision rfprec) ks) =
sort ks' == ks'
&& nubSort ks' == ks'
&& not (null ks)
&& all partitioned ks
where
ks' = flatten ks
partitioned p =
-- keys should be sorted within pages, so it's sufficient to check
-- the minimum key against the maximum key
topBits16 rfprec (serialise $ minKey p)
== topBits16 rfprec (serialise $ maxKey p)

flatten :: Eq k => [Page NonEmpty k] -> [k]
flatten [] = []
flatten (Page ks'':kss) = NonEmpty.toList ks'' ++ flatten kss

-- | @'spanN' n p xs@ finds the longest prefix of at most length @n@ of @xs@ of
-- elements that satisfy @p@.
--
-- Note: this is a frankenstein fusion of 'span' and 'splitAt', which is
-- hopefully slightly faster than using 'take', 'takeWhile', 'drop' and
-- 'dropWhile' to achieve the same result.
spanN :: Int -> (a -> Bool) -> [a] -> ([a], [a])
spanN n p ls
| n <= 0 = ([], ls)
| otherwise = spanN' n ls
where
spanN' _ xs@[] = (xs, xs)
spanN' m xs@(x:xs') | 0 <- m = ([], xs)
| p x = let (ys, zs) = spanN' (m-1) xs'
in (x:ys, zs)
| otherwise = ([], xs)
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@
{- HLINT ignore "Use camelCase" -}
{- HLINT ignore "Eta reduce" -}

module Bench.Database.LSMTree.Internal.Run.BloomFilter (benchmarks) where
module Bench.Database.LSMTree.Internal.Run.BloomFilter (
benchmarks
-- * Benchmarked functions
, elems
) where

import Criterion.Main
import qualified Data.BloomFilter.Easy as Bloom.Easy
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@
{-# LANGUAGE TypeApplications #-}
{- HLINT ignore "Eta reduce" -}

module Bench.Database.LSMTree.Internal.Run.Index.Compact (benchmarks) where
module Bench.Database.LSMTree.Internal.Run.Index.Compact (
benchmarks
-- * Benchmarked functions
, searches
) where

import Control.DeepSeq (deepseq)
import Criterion.Main
Expand Down
6 changes: 4 additions & 2 deletions bench/micro/Main.hs
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,14 @@
--
module Main (main) where

import qualified Bench.Database.LSMTree.Internal.Integration
import qualified Bench.Database.LSMTree.Internal.Run.BloomFilter
import qualified Bench.Database.LSMTree.Internal.Run.Index.Compact
import Criterion.Main
import Criterion.Main (defaultMain)

main :: IO ()
main = defaultMain [
Bench.Database.LSMTree.Internal.Run.BloomFilter.benchmarks
Bench.Database.LSMTree.Internal.Integration.benchmarks
, Bench.Database.LSMTree.Internal.Run.BloomFilter.benchmarks
, Bench.Database.LSMTree.Internal.Run.Index.Compact.benchmarks
]
2 changes: 2 additions & 0 deletions lsm-tree.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ benchmark lsm-tree-micro-bench
hs-source-dirs: bench/micro
main-is: Main.hs
other-modules:
Bench.Database.LSMTree.Internal.Integration
Bench.Database.LSMTree.Internal.Run.BloomFilter
Bench.Database.LSMTree.Internal.Run.Index.Compact

Expand All @@ -179,6 +180,7 @@ benchmark lsm-tree-micro-bench
, containers
, criterion
, deepseq
, extra
, lsm-tree:{lsm-tree, bloomfilter, lsm-tree-utils}
, QuickCheck
, random
Expand Down
34 changes: 33 additions & 1 deletion src/Database/LSMTree/Internal/Integration.hs
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
{-# LANGUAGE TupleSections #-}
{- HLINT ignore "Eta reduce" -}

-- | Integration of LSM-Tree components into a full levels structure.
--
-- === TODO
Expand Down Expand Up @@ -26,4 +29,33 @@
-- The above list is a sketch. Functionality may move around, and the list is
-- not exhaustive.
--
module Database.LSMTree.Internal.Integration () where
module Database.LSMTree.Internal.Integration (
Run
, prepLookups
) where

import Data.Maybe
import Database.LSMTree.Internal.Run.BloomFilter (Bloom)
import qualified Database.LSMTree.Internal.Run.BloomFilter as Bloom
import Database.LSMTree.Internal.Run.Index.Compact (CompactIndex)
import qualified Database.LSMTree.Internal.Run.Index.Compact as Index
import Database.LSMTree.Internal.Serialise

type Run fd = (fd, Bloom SerialisedKey, CompactIndex)

-- | Prepare disk lookups by doing bloom filter queries and index searches.
--
-- Note: results are grouped by key instead of file descriptor, because this
-- means that results for a single key are close together.
prepLookups :: [Run fd] -> [SerialisedKey] -> [(SerialisedKey, [(fd, Int)])]
prepLookups runs ks = fmap f ks
where f k = (k, prepLookupMany runs k)

prepLookupMany :: [Run fd] -> SerialisedKey -> [(fd, Int)]
prepLookupMany runs k = mapMaybe f runs
where f run@(fd,_,_) = (fd,) <$> prepLookupOne run k

prepLookupOne :: Run fd -> SerialisedKey -> Maybe Int
prepLookupOne (_fd, b, fpix) k
| Bloom.elem k b = Index.search k fpix
| otherwise = Nothing
Loading

0 comments on commit a2f5ff2

Please sign in to comment.