From 2eec2754cd2a21da8d9d2af2fad4466a951eae51 Mon Sep 17 00:00:00 2001
From: Paul Rogers
Date: Fri, 3 Nov 2023 11:20:53 -0400
Subject: [PATCH] Tokenize line with a skip tokenizer properly

---
 pkg/storage/bloom/v1/bloom_tokenizer.go      | 17 ++++-
 pkg/storage/bloom/v1/bloom_tokenizer_test.go | 49 +++++++++++++++++++-
 2 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/pkg/storage/bloom/v1/bloom_tokenizer.go b/pkg/storage/bloom/v1/bloom_tokenizer.go
index c1e4fbddb1825..9aacc7ce97ed0 100644
--- a/pkg/storage/bloom/v1/bloom_tokenizer.go
+++ b/pkg/storage/bloom/v1/bloom_tokenizer.go
@@ -118,6 +118,21 @@ func (bt *BloomTokenizer) PopulateSeriesWithBloom(seriesWithBloom *SeriesWithBlo
 	} // for each chunk
 }
 
+// TokenizeLine returns a slice of tokens for the given line, based on the current value of the tokenizer
+// If the tokenizer has a skip value, then the line will be tokenized multiple times,
+// starting at the beginning of the line, with "skip" number of iterations, offset by one each time
 func (bt *BloomTokenizer) TokenizeLine(line string) []Token {
-	return bt.lineTokenizer.Tokens(line)
+	tokens := make([]Token, 0, 100)
+	if len(line) >= bt.lineTokenizer.GetMin() && len(line) >= bt.lineTokenizer.GetSkip() {
+		for i := 0; i <= bt.lineTokenizer.GetSkip(); i++ {
+			tmp := bt.lineTokenizer.Tokens(line[i:])
+			for _, token := range tmp {
+				tmpToken := Token{}
+				tmpToken.Key = make([]byte, len(token.Key), len(token.Key))
+				copy(tmpToken.Key, token.Key)
+				tokens = append(tokens, tmpToken)
+			}
+		}
+	}
+	return tokens
 }
diff --git a/pkg/storage/bloom/v1/bloom_tokenizer_test.go b/pkg/storage/bloom/v1/bloom_tokenizer_test.go
index 49d7edc2e4236..10618ccd82cd7 100644
--- a/pkg/storage/bloom/v1/bloom_tokenizer_test.go
+++ b/pkg/storage/bloom/v1/bloom_tokenizer_test.go
@@ -39,7 +39,7 @@ func TestSetLineTokenizer(t *testing.T) {
 	require.Equal(t, bt.chunkIDTokenizer.GetSkip(), 2)
 }
 
-func TestTokenizeLine(t *testing.T) {
+func TestDefaultTokenizeLine(t *testing.T) {
 	bt, _ := NewBloomTokenizer(prometheus.DefaultRegisterer)
 
 	for _, tc := range []struct {
@@ -88,6 +88,53 @@
 	}
 }
 
+func TestTokenizeLineWithSkips(t *testing.T) {
+	bt, _ := NewBloomTokenizer(prometheus.DefaultRegisterer)
+	bt.SetLineTokenizer(NewNGramTokenizer(DefaultNGramLength, DefaultNGramLength+1, 2))
+
+	for _, tc := range []struct {
+		desc  string
+		input string
+		exp   []Token
+	}{
+		{
+			desc:  "empty",
+			input: "",
+			exp:   []Token{},
+		},
+		{
+			desc:  "single char",
+			input: "a",
+			exp:   []Token{},
+		},
+		{
+			desc:  "four chars",
+			input: "abcd",
+			exp: []Token{
+				{Key: []byte("abcd")}},
+		},
+		{
+			desc:  "longer string",
+			input: "abcdefghijkl",
+			exp: []Token{
+				{Key: []byte("abcd")},
+				{Key: []byte("defg")},
+				{Key: []byte("ghij")},
+				{Key: []byte("bcde")},
+				{Key: []byte("efgh")},
+				{Key: []byte("hijk")},
+				{Key: []byte("cdef")},
+				{Key: []byte("fghi")},
+				{Key: []byte("ijkl")},
+			},
+		},
+	} {
+		t.Run(tc.desc, func(t *testing.T) {
+			require.Equal(t, tc.exp, bt.TokenizeLine(tc.input))
+		})
+	}
+}
+
 func TestPopulateSeriesWithBloom(t *testing.T) {
 	var testLine = "this is a log line"
 	bt, _ := NewBloomTokenizer(prometheus.DefaultRegisterer)
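
Note for reviewers: below is a self-contained sketch of the tokenization pattern this patch implements. The `ngramsWithSkip` and `tokenizeLine` names are illustrative stand-ins, not the repo's `NGramTokenizer` API; the outer loop mirrors the offset-by-one passes added to `BloomTokenizer.TokenizeLine`.

```go
package main

import "fmt"

// ngramsWithSkip is a simplified stand-in for the repo's skip tokenizer
// (an assumption, not the actual NGramTokenizer): it emits the length-n
// substrings of s that start at offsets 0, skip+1, 2*(skip+1), ...
func ngramsWithSkip(s string, n, skip int) []string {
	var out []string
	for i := 0; i+n <= len(s); i += skip + 1 {
		out = append(out, s[i:i+n])
	}
	return out
}

// tokenizeLine mirrors the loop this patch adds to TokenizeLine: run the
// skip tokenizer skip+1 times, each pass starting one byte further into
// the line, so that collectively every n-gram of the line is emitted once.
func tokenizeLine(line string, n, skip int) []string {
	tokens := []string{}
	if len(line) < n || len(line) < skip {
		return tokens
	}
	for off := 0; off <= skip; off++ {
		tokens = append(tokens, ngramsWithSkip(line[off:], n, skip)...)
	}
	return tokens
}

func main() {
	// With n=4 and skip=2 this prints:
	// [abcd defg ghij bcde efgh hijk cdef fghi ijkl]
	fmt.Println(tokenizeLine("abcdefghijkl", 4, 2))
}
```

Running the sketch prints the same nine tokens, in the same order, that the "longer string" case in TestTokenizeLineWithSkips expects.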