From c5e3bc862a3c70119d818a83998a84b741230c8c Mon Sep 17 00:00:00 2001
From: Owen Diehl
Date: Mon, 1 Jul 2024 13:18:21 -0700
Subject: [PATCH 1/2] fix(blooms): ensure tokenizer cache is reset between
 series

Signed-off-by: Owen Diehl
---
 pkg/storage/bloom/v1/bloom_tokenizer.go | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pkg/storage/bloom/v1/bloom_tokenizer.go b/pkg/storage/bloom/v1/bloom_tokenizer.go
index 7d2ba41b7f49c..f0365f7cc78d8 100644
--- a/pkg/storage/bloom/v1/bloom_tokenizer.go
+++ b/pkg/storage/bloom/v1/bloom_tokenizer.go
@@ -97,11 +97,14 @@ func (bt *BloomTokenizer) newBloom() *Bloom {
 	}
 }
 
+// Populate fills one or more blooms with the tokens from the given chunks.
+// It is called once per series.
 func (bt *BloomTokenizer) Populate(
 	blooms SizedIterator[*Bloom],
 	chks Iterator[ChunkRefWithIter],
 	ch chan *BloomCreation,
 ) {
+	clear(bt.cache) // MUST always clear the cache before starting a new series
 	var next bool
 
 	// All but the last bloom are considered full -- send back unaltered

From f95eab188408650d0e590c0aec90a0970add6dd9 Mon Sep 17 00:00:00 2001
From: Owen Diehl
Date: Mon, 1 Jul 2024 16:01:11 -0700
Subject: [PATCH 2/2] bloom tokenizer cache reset test

Signed-off-by: Owen Diehl
---
 pkg/storage/bloom/v1/bloom_tokenizer_test.go | 39 ++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/pkg/storage/bloom/v1/bloom_tokenizer_test.go b/pkg/storage/bloom/v1/bloom_tokenizer_test.go
index 9bef1ab2ca202..0f837fbee27bc 100644
--- a/pkg/storage/bloom/v1/bloom_tokenizer_test.go
+++ b/pkg/storage/bloom/v1/bloom_tokenizer_test.go
@@ -288,6 +288,45 @@ func BenchmarkPopulateSeriesWithBloom(b *testing.B) {
 	}
 }
 
+func TestTokenizerClearsCacheBetweenPopulateCalls(t *testing.T) {
+	bt := NewBloomTokenizer(DefaultNGramLength, DefaultNGramSkip, 0, NewMetrics(nil))
+	line := "foobarbazz"
+	var blooms []*Bloom
+
+	for i := 0; i < 2; i++ {
+		ch := make(chan *BloomCreation)
+		itr, err := chunkRefItrFromLines(line)
+		require.NoError(t, err)
+		go bt.Populate(
+			NewEmptyIter[*Bloom](),
+			NewSliceIter([]ChunkRefWithIter{
+				{
+					Ref: ChunkRef{},
+					Itr: itr,
+				},
+			}),
+			ch,
+		)
+		var ct int
+		for created := range ch {
+			blooms = append(blooms, created.Bloom)
+			ct++
+		}
+		// ensure we created exactly one bloom per Populate call
+		require.Equal(t, 1, ct)
+
+	}
+
+	for _, bloom := range blooms {
+		toks := bt.lineTokenizer.Tokens(line)
+		for toks.Next() {
+			token := toks.At()
+			require.True(t, bloom.Test(token))
+		}
+		require.NoError(t, toks.Err())
+	}
+}
+
 func BenchmarkMapClear(b *testing.B) {
 	bt := NewBloomTokenizer(DefaultNGramLength, DefaultNGramSkip, 0, metrics)
 	for i := 0; i < b.N; i++ {
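
Reviewer note (not part of the patches above): the bug class here is a per-series
dedup cache that outlives the series it was built for. Below is a minimal,
self-contained Go sketch of that failure mode; the names (tokenizer, populate,
and the map-based stand-in for a bloom filter) are hypothetical illustrations,
not Loki's types. It shows how stale cache entries make the second series'
bloom silently miss tokens, and how clearing the cache at the top of each call,
as PATCH 1/2 does with clear(bt.cache), restores correct behavior.

package main

import "fmt"

// tokenizer mirrors the shape of the bug: a dedup cache that is meant
// to be scoped to a single series but is reused across calls.
type tokenizer struct {
	cache map[string]struct{}
}

// populate adds each unseen token to bloom, using cache to skip
// duplicates. resetCache toggles the fix from PATCH 1/2.
func (t *tokenizer) populate(bloom map[string]bool, tokens []string, resetCache bool) {
	if resetCache {
		clear(t.cache) // the fix: start every series with an empty cache
	}
	for _, tok := range tokens {
		if _, seen := t.cache[tok]; seen {
			continue // stale entries from a previous series get skipped too
		}
		t.cache[tok] = struct{}{}
		bloom[tok] = true
	}
}

func main() {
	tokens := []string{"foo", "bar", "baz"}

	// Buggy behavior: the second series ends up with an empty bloom
	// because the first series already populated the shared cache.
	buggy := &tokenizer{cache: map[string]struct{}{}}
	bloomA, bloomB := map[string]bool{}, map[string]bool{}
	buggy.populate(bloomA, tokens, false)
	buggy.populate(bloomB, tokens, false)
	fmt.Println(len(bloomA), len(bloomB)) // 3 0

	// Fixed behavior: each series gets a complete bloom.
	fixed := &tokenizer{cache: map[string]struct{}{}}
	bloomC, bloomD := map[string]bool{}, map[string]bool{}
	fixed.populate(bloomC, tokens, true)
	fixed.populate(bloomD, tokens, true)
	fmt.Println(len(bloomC), len(bloomD)) // 3 3
}

This mirrors what TestTokenizerClearsCacheBetweenPopulateCalls in PATCH 2/2
asserts: every bloom produced by consecutive Populate calls must still contain
every token of its own series.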