Skip to content

Commit

Permalink
Merge pull request #441 from IntersectMBO/jeltsch/growing-vector/tests
Browse files Browse the repository at this point in the history
Add tests for growing vectors
  • Loading branch information
jorisdral authored Oct 31, 2024
2 parents 361f4dc + 5a041c8 commit 460f257
Show file tree
Hide file tree
Showing 3 changed files with 225 additions and 0 deletions.
1 change: 1 addition & 0 deletions lsm-tree.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,7 @@ test-suite lsm-tree-test
Test.Database.LSMTree.Internal.Serialise
Test.Database.LSMTree.Internal.Serialise.Class
Test.Database.LSMTree.Internal.Vector
Test.Database.LSMTree.Internal.Vector.Growing
Test.Database.LSMTree.Model.Table
Test.Database.LSMTree.Monoidal
Test.Database.LSMTree.Normal.StateMachine
Expand Down
2 changes: 2 additions & 0 deletions test/Main.hs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import qualified Test.Database.LSMTree.Internal.RunReaders
import qualified Test.Database.LSMTree.Internal.Serialise
import qualified Test.Database.LSMTree.Internal.Serialise.Class
import qualified Test.Database.LSMTree.Internal.Vector
import qualified Test.Database.LSMTree.Internal.Vector.Growing
import qualified Test.Database.LSMTree.Model.Table
import qualified Test.Database.LSMTree.Monoidal
import qualified Test.Database.LSMTree.Normal.StateMachine
Expand Down Expand Up @@ -63,6 +64,7 @@ main = defaultMain $ testGroup "lsm-tree"
, Test.Database.LSMTree.Internal.Serialise.tests
, Test.Database.LSMTree.Internal.Serialise.Class.tests
, Test.Database.LSMTree.Internal.Vector.tests
, Test.Database.LSMTree.Internal.Vector.Growing.tests
, Test.Database.LSMTree.Model.Table.tests
, Test.Database.LSMTree.Monoidal.tests
, Test.Database.LSMTree.Normal.UnitTests.tests
Expand Down
222 changes: 222 additions & 0 deletions test/Test/Database/LSMTree/Internal/Vector/Growing.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
module Test.Database.LSMTree.Internal.Vector.Growing (tests) where

import Prelude hiding (head, length, tail)

import Control.Category ((>>>))
import Control.Monad.ST.Strict (ST, runST)
import qualified Data.List as List (length)
import Data.Vector (Vector, toList)
import Database.LSMTree.Internal.Vector.Growing (GrowingVector,
append, freeze, new)
import Test.QuickCheck (Arbitrary (arbitrary, shrink), Gen,
NonNegative (NonNegative, getNonNegative),
Positive (Positive), Property, Small (Small), Testable,
chooseInt, coverTable, shrinkList, shrinkMap, shrinkMapBy,
sized, tabulate, (===))
import Test.Tasty (TestTree, testGroup)
import Test.Tasty.QuickCheck (testProperty)

-- * Tests

tests :: TestTree
tests = testGroup "Test.Database.LSMTree.Internal.Vector.Growing" $
[
testProperty "Final vector is correct"
prop_finalVectorIsCorrect
]

-- * Utilities

-- ** Segments

data Segment a = Replicate Int a deriving stock Show

segmentLength :: Segment a -> Int
segmentLength (Replicate count _) = max 0 count

segmentToList :: Segment a -> [a]
segmentToList (Replicate count val) = replicate count val

appendSegment :: GrowingVector s a -> Segment a -> ST s ()
appendSegment vec (Replicate count val) = append vec count val

-- ** Vector construction

{-
Constructs an ordinary vector by creating a growing vector, appending
segments to it, and finally freezing it.
-}
finalVector :: Int -- Initial buffer size
-> [Segment a] -- Segments
-> Vector a -- Final vector
finalVector initialBufferSize segments = runST $ do
vec <- new initialBufferSize
mapM_ (appendSegment vec) segments
freeze vec

{-
Supplies the final contents of a growing vector for constructing a property.
The resulting property additionally provides information about the
occurrence of buffer size scaling exponents, where a buffer size scaling
exponent is the binary logarithm of the ratio between the buffer size after
and before appending a segment. If buffer enlargement for an append would
happen in cycles where in each cycle the buffer size is doubled, then the
buffer size scaling exponent would be equal to the number of such cycles.
Therefore, a buffer size scaling exponent tells if the buffer is enlarged
and, if yes, how much.
-}
withFinalVector
:: Testable prop
=> Int -- Initial buffer size
-> [Segment a] -- Segments
-> (Vector a -> prop) -- Property from final vector
-> Property -- Resulting property
withFinalVector initialBufferSize segments consumer
= coverTable
"Buffer size scaling exponents"
[(show scalingExp, 10) | scalingExp <- [0 .. 3] :: [Int]]
$
tabulate
"Buffer size scaling exponents"
(show <$> bufferSizeScalingExponents availableBufferSizes 0 segments)
$
consumer (finalVector initialBufferSize segments)
where

availableBufferSizes :: [Int]
availableBufferSizes = iterate (* 2) initialBufferSize
{-
We do not need to account for overflow here, because the lengths of the
vectors we construct are small compared to @maxBound :: Int@.
-}

bufferSizeScalingExponents :: [Int] -> Int -> [Segment a] -> [Int]
bufferSizeScalingExponents _ _ []
= []
bufferSizeScalingExponents availableSizes length (head : tail)
= List.length insufficient :
bufferSizeScalingExponents sufficient length' tail
where

length' :: Int
!length' = length + segmentLength head

insufficient, sufficient :: [Int]
(insufficient, sufficient) = span (< length') availableSizes

-- * Properties to test

prop_finalVectorIsCorrect :: Positive (Small Int)
-> Segments Integer
-> Property
prop_finalVectorIsCorrect (Positive (Small initialBufferSize))
(Segments segments)
= withFinalVector initialBufferSize segments $ \ vec ->
toList vec === concatMap segmentToList segments

-- * Test case generation and shrinking

generateSegment :: Arbitrary a => Int -> Gen (Segment a)
generateSegment maxCount = Replicate <$> chooseInt (0, maxCount) <*> arbitrary

shrinkSegment :: Arbitrary a => Segment a -> [Segment a]
shrinkSegment (Replicate count val)
= [Replicate count' val | count' <- shrinkNat count] ++
[Replicate count val' | val' <- shrink val]
where

shrinkNat :: Int -> [Int]
shrinkNat = shrinkMap getNonNegative NonNegative

{-
A list of segments to be appended to an initially empty vector. 'Segments a'
is isomorphic to '[Segment a]' but has a special way of generating test
cases, which avoids skewing with respect to buffer size scaling exponents.
The key issue of segment list generation is how to generate the length of an
individual segment. It seems worthwhile to randomly pick a natural number
smaller than or equal to some maximum length, using a uniform distribution.
The question to be answered is how to choose the maximum lengths for the
different segments.
A naïve approach is to use a common maximum length for all segments in a
particular list, possibly computed from the size parameter. However, this
results in most appends not enlarging the buffer, meaning that the buffer
size scaling exponent is zero in most cases. This is because, as the buffer
size increases, the number of elements needed to cause the next buffer
enlargement increases too.
Note that it does not help much to choose the maximum segment length large
compared to the initial buffer size. If this is done, then the buffer is
enlarged very quickly, likely during the first append, and the distributions
of buffer size scaling exponents up to this first enlargement may actually
be very good. However, after the first enlargement, the buffer size is
typically of the same order of magnitude as the maximum segment length, so
that the buffer size scaling exponents of the following appends are likely
to be zero or close to it, with further buffer enlargements making the
situation only worse.
A better idea is to choose the maximum length of each segment but the first
proportional to the length of the vector just before appending it (for the
first segment, such a choice is not appropriate, because it would lead to a
maximum length of zero). With this approach, the distribution of buffer size
scaling exponents stays roughly the same as soon as the buffer has been
enlarged for the first time, because then the ratio between the current
buffer size and the current vector length is always in the interval \([1,
2)\) and thus varies only slightly. Reasonable distributions of buffer size
scaling exponents can be achieved by choosing the ratio between maximum
segment lengths and corresponding current-vector lengths appropriately.
A downside of this approach is that it requires tracking the current vector
length. This tracking can be avoided by considering only the /expected/
current vector length and choosing the maximum segment length proportional
to it. The expected length of a vector is the sum of the expected lengths of
the segments constituting it, and the expected length of a segment is
proportional to the maximum length used for generating it. Therefore, this
modified solution boils down to choosing the maximum length of each segment
but the first proportional to the sum of the maximum lengths of the segments
preceding it.
Note that, with the approach just set out, the maximum lengths of the
segments in a segment list almost form an exponential sequence, whose base
is determined by the chosen ratio between maximum segment lengths and
corresponding sums of maximum predecessor segment lengths. Therefore, a
similarly good, but simpler, solution is to have the maximum segment lengths
precisely form an exponential sequence. This is the approach that we use in
our segment list generation algorithm.
The concrete exponential sequences that we employ start with the size
parameter and use 3 as the base of the exponentiation. Since initial buffer
sizes are generated using a uniform distribution with the size parameter as
the maximum, the choice of the size parameter as the first segment’s maximum
length leads to non-trivial distributions of buffer size scaling exponents
for the appends up to the one that causes the first buffer enlargement. The
choice of 3 as the exponentiation’s base leads to a good overall
distribution of buffer size scaling exponents, as experiments have shown.
Because of the exponential growth of maximum segment lengths, vectors
constructed during testing get very large for longer segment lists.
Therefore, we do not pick the lengths of segment lists in the usual way but
instead make all segment lists have a common, small length. Concretely, we
use 9 as this common length, because this leads to vectors that are not
trivial but still consume reasonable amounts of memory.
-}
newtype Segments a = Segments [Segment a] deriving stock Show

instance Arbitrary a => Arbitrary (Segments a) where

arbitrary = sized $ iterate (* scalingFactor) >>>
take count >>>
mapM generateSegment >>>
fmap Segments
where

scalingFactor :: Int
scalingFactor = 3

count :: Int
count = 9

shrink = shrinkMapBy Segments (\ (Segments segments) -> segments) $
shrinkList shrinkSegment

0 comments on commit 460f257

Please sign in to comment.