From 0c94702395124de872490baf127322f0b6ca0d71 Mon Sep 17 00:00:00 2001
From: Paul Rogers <129207811+paul1r@users.noreply.github.com>
Date: Mon, 20 Nov 2023 16:46:15 -0500
Subject: [PATCH] tokenizer v1 cleanup (#11272)

**What this PR does / why we need it**:
Removes all usage of the v1 tokenizers and renames v2 to v1, since the v2 tokenizer was never released in a production capacity.

**Which issue(s) this PR fixes**:
Fixes #

**Special notes for your reviewer**:

**Checklist**
- [ ] Reviewed the [`CONTRIBUTING.md`](https://github.com/grafana/loki/blob/main/CONTRIBUTING.md) guide (**required**)
- [ ] Documentation added
- [ ] Tests updated
- [ ] `CHANGELOG.md` updated
- [ ] If the change is worth mentioning in the release notes, add `add-to-release-notes` label
- [ ] Changes that require user attention or interaction to upgrade are documented in `docs/sources/setup/upgrade/_index.md`
- [ ] For Helm chart changes bump the Helm chart version in `production/helm/loki/Chart.yaml` and update `production/helm/loki/CHANGELOG.md` and `production/helm/loki/README.md`. [Example PR](https://github.com/grafana/loki/commit/d10549e3ece02120974929894ee333d07755d213)
- [ ] If the change is deprecating or removing a configuration option, update the `deprecated-config.yaml` and `deleted-config.yaml` files respectively in the `tools/deprecated-config-checker` directory. [Example PR](https://github.com/grafana/loki/pull/10840/commits/0d4416a4b03739583349934b96f272fb4f685d15)
---
 pkg/storage/bloom/v1/bloom_tokenizer.go      |  87 +--
 pkg/storage/bloom/v1/bloom_tokenizer_test.go | 100 +--
 pkg/storage/bloom/v1/tokenizer.go            | 175 +----
 pkg/storage/bloom/v1/tokenizer_test.go       | 621 ++-----
 tools/tsdb/bloom-tester/lib.go               |  20 +-
 tools/tsdb/bloom-tester/lib_test.go          | 262 ++------
 tools/tsdb/bloom-tester/metrics.go           |   4 +-
 tools/tsdb/bloom-tester/readlib.go           |  36 +-
 tools/tsdb/bloom-tester/readlib_test.go      |  15 +-
 9 files changed, 183 insertions(+), 1137 deletions(-)

diff --git a/pkg/storage/bloom/v1/bloom_tokenizer.go b/pkg/storage/bloom/v1/bloom_tokenizer.go
index 26ebd6300638..c5dd5e514507 100644
--- a/pkg/storage/bloom/v1/bloom_tokenizer.go
+++ b/pkg/storage/bloom/v1/bloom_tokenizer.go
@@ -2,6 +2,7 @@ package v1
 
 import (
 	"context"
+	"encoding/binary"
 	"math"
 	"time"
 
@@ -27,9 +28,8 @@ Bloom filters are utilized for faster lookups of log lines.
type BloomTokenizer struct { metrics *metrics - lineTokenizer Tokenizer - chunkIDTokenizer *WrappedTokenizer - cache map[string]interface{} + lineTokenizer *NGramTokenizer + cache map[string]interface{} } const CacheSize = 150000 @@ -46,17 +46,15 @@ func NewBloomTokenizer(reg prometheus.Registerer) (*BloomTokenizer, error) { metrics: newMetrics(reg), } t.cache = make(map[string]interface{}, CacheSize) - t.lineTokenizer = NewNGramTokenizer(DefaultNGramLength, DefaultNGramLength+1, DefaultNGramSkip) // default to 4-grams, no skip - t.chunkIDTokenizer = ChunkIDTokenizer(t.lineTokenizer) + t.lineTokenizer = NewNGramTokenizer(DefaultNGramLength, DefaultNGramSkip) // default to 4-grams, no skip level.Info(util_log.Logger).Log("bloom tokenizer created") return t, nil } -func (bt *BloomTokenizer) SetLineTokenizer(t Tokenizer) { +func (bt *BloomTokenizer) SetLineTokenizer(t *NGramTokenizer) { bt.lineTokenizer = t - bt.chunkIDTokenizer = ChunkIDTokenizer(bt.lineTokenizer) } // TODO: Something real here with metrics @@ -70,12 +68,27 @@ func clearCache(cache map[string]interface{}) { } } +func calculatePrefix(chk logproto.ChunkRef) []byte { + i64buf := make([]byte, binary.MaxVarintLen64) + i32buf := make([]byte, 4) + prefix := make([]byte, 32) + + binary.PutVarint(i64buf, int64(chk.From)) + prefix = append(prefix, i64buf...) + binary.PutVarint(i64buf, int64(chk.Through)) + prefix = append(prefix, i64buf...) + binary.LittleEndian.PutUint32(i32buf, chk.Checksum) + prefix = append(prefix, i32buf...) + + return prefix +} + // PopulateSeriesWithBloom is intended to be called on the write path, and is used to populate the bloom filter for a given series. func (bt *BloomTokenizer) PopulateSeriesWithBloom(seriesWithBloom *SeriesWithBloom, chunks []chunk.Chunk) { clearCache(bt.cache) for idx := range chunks { lc := chunks[idx].Data.(*chunkenc.Facade).LokiChunk() - bt.chunkIDTokenizer.Reinit(chunks[idx].ChunkRef) + prefix := calculatePrefix(chunks[idx].ChunkRef) // TODO: error handling itr, err := lc.Iterator( @@ -93,16 +106,33 @@ func (bt *BloomTokenizer) PopulateSeriesWithBloom(seriesWithBloom *SeriesWithBlo defer itr.Close() for itr.Next() && itr.Error() == nil { - toks := bt.chunkIDTokenizer.Tokens(itr.Entry().Line) + chunkTokenizer := NewPrefixedTokenIter(prefix, bt.lineTokenizer.Tokens(itr.Entry().Line)) + for chunkTokenizer.Next() { + tok := chunkTokenizer.At() + if tok != nil { + str := string(tok) + _, found := bt.cache[str] // A cache is used ahead of the SBF, as it cuts out the costly operations of scaling bloom filters + if !found { + bt.cache[str] = nil + + seriesWithBloom.Bloom.ScalableBloomFilter.TestAndAdd(tok) - for _, tok := range toks { - if tok.Key != nil { - str := string(tok.Key) + if len(bt.cache) >= CacheSize { // While crude, this has proven efficient in performance testing. This speaks to the similarity in log lines near each other + clearCache(bt.cache) + } + } + } + } + lineTokenizer := bt.lineTokenizer.Tokens(itr.Entry().Line) + for lineTokenizer.Next() { + tok := lineTokenizer.At() + if tok != nil { + str := string(tok) _, found := bt.cache[str] // A cache is used ahead of the SBF, as it cuts out the costly operations of scaling bloom filters if !found { bt.cache[str] = nil - seriesWithBloom.Bloom.ScalableBloomFilter.TestAndAdd(tok.Key) + seriesWithBloom.Bloom.ScalableBloomFilter.TestAndAdd(tok) if len(bt.cache) >= CacheSize { // While crude, this has proven efficient in performance testing. 
This speaks to the similarity in log lines near each other clearCache(bt.cache) @@ -110,6 +140,7 @@ func (bt *BloomTokenizer) PopulateSeriesWithBloom(seriesWithBloom *SeriesWithBlo } } } + } seriesWithBloom.Series.Chunks = append(seriesWithBloom.Series.Chunks, ChunkRef{ Start: chunks[idx].From, @@ -118,33 +149,3 @@ func (bt *BloomTokenizer) PopulateSeriesWithBloom(seriesWithBloom *SeriesWithBlo }) } // for each chunk } - -// SearchesForTokenizerAndLine is for taking a given search string (ex: on the read/query path) and returning -// all the possible tokens, given a tokenizer. -// This is a multi-dimensional slice where the first slice is the offset into the line, and the -// second slice is the tokens for that offset. If an offset into the line returns no tokens, this first dimension -// will be less than 1 + the number of skips specified in the tokenizer -// The offset is used if the Tokenizer has a skip value being utilized. -func SearchesForTokenizerAndLine(t Tokenizer, line string) (res [][]Token) { - res = make([][]Token, 0, 10) - for i := range line { // iterate by runes - if i >= t.GetSkip()+1 { - break - } - tmpTokens := make([]Token, 0, 100) - tokens := t.Tokens(line[i:]) - // As the way the tokenizer is coded, it will reuse its internal buffers, - // but we need to save the data, hence the need for copying - for _, token := range tokens { - tmpToken := Token{} - tmpToken.Key = make([]byte, len(token.Key)) - copy(tmpToken.Key, token.Key) - tmpTokens = append(tmpTokens, tmpToken) - } - if len(tokens) > 0 { - res = append(res, tmpTokens) - } - } - - return res -} diff --git a/pkg/storage/bloom/v1/bloom_tokenizer_test.go b/pkg/storage/bloom/v1/bloom_tokenizer_test.go index 034301f88c1a..104524da479f 100644 --- a/pkg/storage/bloom/v1/bloom_tokenizer_test.go +++ b/pkg/storage/bloom/v1/bloom_tokenizer_test.go @@ -20,95 +20,21 @@ import ( "github.com/prometheus/client_golang/prometheus" ) +var ( + four = NewNGramTokenizer(4, 0) +) + func TestSetLineTokenizer(t *testing.T) { bt, _ := NewBloomTokenizer(prometheus.DefaultRegisterer) // Validate defaults - require.Equal(t, bt.lineTokenizer.GetMin(), DefaultNGramLength) - require.Equal(t, bt.lineTokenizer.GetMax(), DefaultNGramLength+1) - require.Equal(t, bt.lineTokenizer.GetSkip(), DefaultNGramSkip) - - require.Equal(t, bt.chunkIDTokenizer.GetMin(), DefaultNGramLength) - require.Equal(t, bt.chunkIDTokenizer.GetMax(), DefaultNGramLength+1) - require.Equal(t, bt.chunkIDTokenizer.GetSkip(), DefaultNGramSkip) + require.Equal(t, bt.lineTokenizer.N, DefaultNGramLength) + require.Equal(t, bt.lineTokenizer.Skip, DefaultNGramSkip) // Set new tokenizer, and validate against that - bt.SetLineTokenizer(NewNGramTokenizer(6, 7, 2)) - require.Equal(t, bt.lineTokenizer.GetMin(), 6) - require.Equal(t, bt.lineTokenizer.GetMax(), 7) - require.Equal(t, bt.lineTokenizer.GetSkip(), 2) - - require.Equal(t, bt.chunkIDTokenizer.GetMin(), 6) - require.Equal(t, bt.chunkIDTokenizer.GetMax(), 7) - require.Equal(t, bt.chunkIDTokenizer.GetSkip(), 2) -} - -func TestSearchesForTokenizerAndLine(t *testing.T) { - for _, tc := range []struct { - desc string - input string - t Tokenizer - exp [][]Token - }{ - { - desc: "empty", - input: "", - t: four, - exp: [][]Token{}, - }, - { - desc: "single char", - input: "a", - t: four, - exp: [][]Token{}, - }, - { - desc: "four chars", - input: "abcd", - t: four, - exp: [][]Token{ - {{Key: []byte("abcd")}}}, - }, - { - desc: "uuid partial", - input: "2b1a5e46-36a2-4", - t: four, - exp: [][]Token{{ - {Key: []byte("2b1a")}, - {Key: 
[]byte("b1a5")}, - {Key: []byte("1a5e")}, - {Key: []byte("a5e4")}, - {Key: []byte("5e46")}, - {Key: []byte("e46-")}, - {Key: []byte("46-3")}, - {Key: []byte("6-36")}, - {Key: []byte("-36a")}, - {Key: []byte("36a2")}, - {Key: []byte("6a2-")}, - {Key: []byte("a2-4")}}, - }, - }, - { - desc: "short special chars", - t: four, - input: "日本語", - exp: [][]Token{}, - }, - { - desc: "longer special chars", - t: four, - input: "日本語日本語", - exp: [][]Token{{ - {Key: []byte("日本語日")}, - {Key: []byte("本語日本")}, - {Key: []byte("語日本語")}}}, - }, - } { - t.Run(tc.desc, func(t *testing.T) { - require.Equal(t, tc.exp, SearchesForTokenizerAndLine(tc.t, tc.input)) - }) - } - + bt.SetLineTokenizer(NewNGramTokenizer(6, 7)) + require.Equal(t, bt.lineTokenizer.N, 6) + require.Equal(t, bt.lineTokenizer.Skip, 7) } func TestPopulateSeriesWithBloom(t *testing.T) { @@ -149,9 +75,11 @@ func TestPopulateSeriesWithBloom(t *testing.T) { } bt.PopulateSeriesWithBloom(&swb, chunks) - tokens := SearchesForTokenizerAndLine(four, testLine) - for _, token := range tokens[0] { - require.True(t, swb.Bloom.Test(token.Key)) + tokenizer := NewNGramTokenizer(DefaultNGramLength, DefaultNGramSkip) + itr := tokenizer.Tokens(testLine) + for itr.Next() { + token := itr.At() + require.True(t, swb.Bloom.Test(token)) } } diff --git a/pkg/storage/bloom/v1/tokenizer.go b/pkg/storage/bloom/v1/tokenizer.go index e27fa04e312f..458231780944 100644 --- a/pkg/storage/bloom/v1/tokenizer.go +++ b/pkg/storage/bloom/v1/tokenizer.go @@ -1,100 +1,9 @@ package v1 import ( - "encoding/binary" "unicode/utf8" - - "github.com/grafana/loki/pkg/logproto" ) -type Token struct { - Key []byte -} - -type Tokenizer interface { - Tokens(line string) []Token - GetSkip() int - GetMin() int - GetMax() int -} - -const TokenBufferSize = 4096 -const TokenKeySize = 132 - -type NgramTokenizer struct { - // [min,max) exclusivity - min, max, skip int - buffers [][]rune // circular buffers used for ngram generation - runeBuffer []byte // buffer used for token generation - internalTokenBuffer []Token // circular buffer for tokens -} - -/* -N-Grams (https://en.wikipedia.org/wiki/N-gram) are a series of 'n' adjacent characters in a string. -These will be utilized for the bloom filters to allow for fuzzy searching. 
-*/ -func NewNGramTokenizer(min, max, skip int) *NgramTokenizer { - capacity := max - min - t := &NgramTokenizer{ - min: min, - max: max, - skip: skip, - buffers: make([][]rune, capacity), - runeBuffer: make([]byte, 0, max*4), - internalTokenBuffer: make([]Token, 0, TokenBufferSize), - } - - for i := range t.buffers { - t.buffers[i] = make([]rune, t.min+i) - } - - for i := 0; i < cap(t.internalTokenBuffer); i++ { - t.internalTokenBuffer = append(t.internalTokenBuffer, Token{Key: make([]byte, 0, TokenKeySize)}) - } - - return t -} - -func (t *NgramTokenizer) GetSkip() int { - return t.skip -} - -func (t *NgramTokenizer) GetMin() int { - return t.min -} - -func (t *NgramTokenizer) GetMax() int { - return t.max -} - -func (t *NgramTokenizer) Tokens(line string) []Token { - var i int // rune index (not position that is measured in the range loop) - numToks := 0 - for _, r := range line { - - // j is the index of the buffer to use - for j := 0; j < (t.max - t.min); j++ { - // n is the length of the ngram - n := j + t.min - // pos is the position in the buffer to overwrite - pos := i % n - t.buffers[j][pos] = r - - if i >= n-1 && (i+1-n)%(t.skip+1) == 0 { - t.runeBuffer = reassemble(t.buffers[j], len(t.buffers[j]), (i+1)%n, t.runeBuffer) - if numToks >= cap(t.internalTokenBuffer) || numToks == len(t.internalTokenBuffer) { - t.internalTokenBuffer = append(t.internalTokenBuffer, Token{Key: make([]byte, 0, TokenKeySize)}) - } - t.internalTokenBuffer[numToks].Key = t.internalTokenBuffer[numToks].Key[:0] - t.internalTokenBuffer[numToks].Key = append(t.internalTokenBuffer[numToks].Key, t.runeBuffer...) - numToks++ - } - } - i++ - } - return t.internalTokenBuffer[0:numToks] -} - func reassemble(buf []rune, ln, pos int, result []byte) []byte { result = result[:0] // Reset the result slice for i := 0; i < ln; i++ { @@ -104,75 +13,9 @@ func reassemble(buf []rune, ln, pos int, result []byte) []byte { return result } -func chunkIDTransformer(tok Token, prefix []byte) Token { - tok.Key = append(append(tok.Key, prefix...), tok.Key...)[len(tok.Key):] - return tok -} - -type WrappedTokenizer struct { - t Tokenizer - tokenBuffer []Token - prefix []byte - i64buf []byte - i32buf []byte -} - -func (w *WrappedTokenizer) Tokens(line string) []Token { - w.tokenBuffer = w.tokenBuffer[:0] // Reset the result slice - toks := w.t.Tokens(line) - for _, tok := range toks { - w.tokenBuffer = append(w.tokenBuffer, chunkIDTransformer(tok, w.prefix), tok) - } - - return w.tokenBuffer -} - -func (w *WrappedTokenizer) GetSkip() int { - return w.t.GetSkip() -} - -func (w *WrappedTokenizer) GetMin() int { - return w.t.GetMin() -} - -func (w *WrappedTokenizer) GetMax() int { - return w.t.GetMax() -} - -func ChunkIDTokenizer(t Tokenizer) *WrappedTokenizer { - p := make([]byte, 0, 256) - return &WrappedTokenizer{ - t: t, - tokenBuffer: make([]Token, 0, TokenBufferSize), - prefix: p, - i64buf: make([]byte, binary.MaxVarintLen64), - i32buf: make([]byte, 4), - } -} - -func zeroBuffer(buf []byte) { - for i := range buf { - buf[i] = 0 - } -} - -func (w *WrappedTokenizer) Reinit(chk logproto.ChunkRef) { - w.prefix = w.prefix[:0] - zeroBuffer(w.i64buf) - zeroBuffer(w.i32buf) - - binary.PutVarint(w.i64buf, int64(chk.From)) - w.prefix = append(w.prefix, w.i64buf...) - binary.PutVarint(w.i64buf, int64(chk.Through)) - w.prefix = append(w.prefix, w.i64buf...) - binary.LittleEndian.PutUint32(w.i32buf, chk.Checksum) - w.prefix = append(w.prefix, w.i32buf...) 
-} - // Iterable variants (more performant, less space) - -type NGramTokenizerV2 struct { - n, skip int +type NGramTokenizer struct { + N, Skip int buffer []rune // circular buffer used for ngram generation res []byte // buffer used for token generation } @@ -181,10 +24,10 @@ type NGramTokenizerV2 struct { N-Grams (https://en.wikipedia.org/wiki/N-gram) are a series of 'n' adjacent characters in a string. These will be utilized for the bloom filters to allow for fuzzy searching. */ -func NewNGramTokenizerV2(n, skip int) *NGramTokenizerV2 { - t := &NGramTokenizerV2{ - n: n, - skip: skip, +func NewNGramTokenizer(n, skip int) *NGramTokenizer { + t := &NGramTokenizer{ + N: n, + Skip: skip, buffer: make([]rune, n+skip), res: make([]byte, 0, n*4), // maximum 4 bytes per rune } @@ -194,10 +37,10 @@ func NewNGramTokenizerV2(n, skip int) *NGramTokenizerV2 { // The Token iterator uses shared buffers for performance. The []byte returned by At() // is not safe for use after subsequent calls to Next() -func (t *NGramTokenizerV2) Tokens(line string) NGramTokenIter { +func (t *NGramTokenizer) Tokens(line string) NGramTokenIter { return NGramTokenIter{ - n: t.n, - skip: t.skip, + n: t.N, + skip: t.Skip, line: line, diff --git a/pkg/storage/bloom/v1/tokenizer_test.go b/pkg/storage/bloom/v1/tokenizer_test.go index a0becd464646..3532c28a4f60 100644 --- a/pkg/storage/bloom/v1/tokenizer_test.go +++ b/pkg/storage/bloom/v1/tokenizer_test.go @@ -1,43 +1,36 @@ package v1 import ( - "bufio" - "encoding/binary" - "os" "testing" - "github.com/grafana/loki/pkg/logproto" - "github.com/stretchr/testify/require" ) const BigFile = "../../../logql/sketch/testdata/war_peace.txt" -var ( - twoSkipOne = NewNGramTokenizer(2, 3, 1) - three = NewNGramTokenizer(3, 4, 0) - threeSkip1 = NewNGramTokenizer(3, 4, 1) - threeSkip2 = NewNGramTokenizer(3, 4, 2) - four = NewNGramTokenizer(4, 5, 0) - fourSkip1 = NewNGramTokenizer(4, 5, 1) - fourSkip2 = NewNGramTokenizer(4, 5, 2) - five = NewNGramTokenizer(5, 6, 0) - six = NewNGramTokenizer(6, 7, 0) -) - func TestNGramIterator(t *testing.T) { var ( - three = NewNGramTokenizerV2(3, 0) - threeSkip1 = NewNGramTokenizerV2(3, 1) - threeSkip3 = NewNGramTokenizerV2(3, 3) + three = NewNGramTokenizer(3, 0) + threeSkip1 = NewNGramTokenizer(3, 1) + threeSkip3 = NewNGramTokenizer(3, 3) ) for _, tc := range []struct { desc string - t *NGramTokenizerV2 + t *NGramTokenizer input string exp []string }{ + { + t: three, + input: "", + exp: []string{}, + }, + { + t: three, + input: "ab", + exp: []string{}, + }, { t: three, input: "abcdefg", @@ -53,6 +46,19 @@ func TestNGramIterator(t *testing.T) { input: "abcdefgh", exp: []string{"abc", "efg"}, }, + { + t: three, + input: "日本語", + exp: []string{"日本語"}, + }, + { + t: four, + input: "日本語日本語", + exp: []string{ + "日本語日", + "本語日本", + "語日本語"}, + }, } { t.Run(tc.desc, func(t *testing.T) { itr := tc.t.Tokens(tc.input) @@ -65,518 +71,42 @@ func TestNGramIterator(t *testing.T) { } } -func TestNGrams(t *testing.T) { - tokenizer := NewNGramTokenizer(2, 4, 0) - for _, tc := range []struct { - desc string - input string - exp []Token - }{ - { - desc: "empty", - input: "", - exp: []Token{}, - }, - { - desc: "single char", - input: "a", - exp: []Token{}, - }, - { - desc: "two chars", - input: "ab", - exp: []Token{{Key: []byte("ab")}}, - }, - { - desc: "three chars", - input: "abc", - exp: []Token{{Key: []byte("ab")}, {Key: []byte("bc")}, {Key: []byte("abc")}}, - }, - { - desc: "four chars", - input: "abcd", - exp: []Token{{Key: []byte("ab")}, {Key: []byte("bc")}, {Key: 
[]byte("abc")}, {Key: []byte("cd")}, {Key: []byte("bcd")}}, - }, - { - desc: "foo", - input: "日本語", - exp: []Token{{Key: []byte("日本")}, {Key: []byte("本語")}, {Key: []byte("日本語")}}, - }, - } { - t.Run(tc.desc, func(t *testing.T) { - require.Equal(t, tc.exp, tokenizer.Tokens(tc.input)) - }) - } -} - -func TestNGramsSkip(t *testing.T) { - - for _, tc := range []struct { - desc string - tokenizer *NgramTokenizer - input string - exp []Token - }{ - { - desc: "four chars", - tokenizer: twoSkipOne, - input: "abcd", - exp: []Token{{Key: []byte("ab")}, {Key: []byte("cd")}}, - }, - { - desc: "special chars", - tokenizer: twoSkipOne, - input: "日本語", - exp: []Token{{Key: []byte("日本")}}, - }, - { - desc: "multi", - tokenizer: NewNGramTokenizer(2, 4, 1), - input: "abcdefghij", - exp: []Token{ - {Key: []byte("ab")}, - {Key: []byte("abc")}, - {Key: []byte("cd")}, - {Key: []byte("cde")}, - {Key: []byte("ef")}, - {Key: []byte("efg")}, - {Key: []byte("gh")}, - {Key: []byte("ghi")}, - {Key: []byte("ij")}, - }, - }, - } { - t.Run(tc.desc, func(t *testing.T) { - require.Equal(t, tc.exp, tc.tokenizer.Tokens(tc.input)) - }) - } -} - -func Test3GramSkip0Tokenizer(t *testing.T) { - tokenizer := three - for _, tc := range []struct { - desc string - input string - exp []Token - }{ - { - desc: "empty", - input: "", - exp: []Token{}, - }, - { - desc: "single char", - input: "a", - exp: []Token{}, - }, - { - desc: "three char", - input: "abc", - exp: []Token{{Key: []byte("abc")}}, - }, - { - desc: "four chars", - input: "abcd", - exp: []Token{{Key: []byte("abc")}, {Key: []byte("bcd")}}, - }, - } { - t.Run(tc.desc, func(t *testing.T) { - require.Equal(t, tc.exp, tokenizer.Tokens(tc.input)) - }) - } -} - -func Test3GramSkip1Tokenizer(t *testing.T) { - tokenizer := threeSkip1 - for _, tc := range []struct { - desc string - input string - exp []Token - }{ - { - desc: "empty", - input: "", - exp: []Token{}, - }, - { - desc: "single char", - input: "a", - exp: []Token{}, - }, - { - desc: "three char", - input: "abc", - exp: []Token{{Key: []byte("abc")}}, - }, - { - desc: "four chars", - input: "abcd", - exp: []Token{{Key: []byte("abc")}}, - }, - { - desc: "five chars", - input: "abcde", - exp: []Token{{Key: []byte("abc")}, {Key: []byte("cde")}}, - }, - } { - t.Run(tc.desc, func(t *testing.T) { - require.Equal(t, tc.exp, tokenizer.Tokens(tc.input)) - }) - } -} - -func Test3GramSkip2Tokenizer(t *testing.T) { - tokenizer := threeSkip2 - for _, tc := range []struct { - desc string - input string - exp []Token - }{ - { - desc: "empty", - input: "", - exp: []Token{}, - }, - { - desc: "single char", - input: "a", - exp: []Token{}, - }, - { - desc: "four chars", - input: "abcd", - exp: []Token{{Key: []byte("abc")}}, - }, - } { - t.Run(tc.desc, func(t *testing.T) { - require.Equal(t, tc.exp, tokenizer.Tokens(tc.input)) - }) - } -} - -func Test4GramSkip0Tokenizer(t *testing.T) { - tokenizer := four - for _, tc := range []struct { - desc string - input string - exp []Token - }{ - { - desc: "empty", - input: "", - exp: []Token{}, - }, - { - desc: "single char", - input: "a", - exp: []Token{}, - }, - { - desc: "three char", - input: "abc", - exp: []Token{}, - }, - { - desc: "four chars", - input: "abcd", - exp: []Token{{Key: []byte("abcd")}}, - }, - { - desc: "five chars", - input: "abcde", - exp: []Token{{Key: []byte("abcd")}, {Key: []byte("bcde")}}, - }, - } { - t.Run(tc.desc, func(t *testing.T) { - require.Equal(t, tc.exp, tokenizer.Tokens(tc.input)) - }) - } -} - -func Test4GramSkip1Tokenizer(t *testing.T) { - tokenizer := fourSkip1 - 
for _, tc := range []struct { - desc string - input string - exp []Token - }{ - { - desc: "empty", - input: "", - exp: []Token{}, - }, - { - desc: "single char", - input: "a", - exp: []Token{}, - }, - { - desc: "three char", - input: "abc", - exp: []Token{}, - }, - { - desc: "four chars", - input: "abcd", - exp: []Token{{Key: []byte("abcd")}}, - }, - { - desc: "five chars", - input: "abcde", - exp: []Token{{Key: []byte("abcd")}}, - }, - { - desc: "six chars", - input: "abcdef", - exp: []Token{{Key: []byte("abcd")}, {Key: []byte("cdef")}}, - }, - { - desc: "seven chars", - input: "abcdefg", - exp: []Token{{Key: []byte("abcd")}, {Key: []byte("cdef")}}, - }, - { - desc: "eight chars", - input: "abcdefgh", - exp: []Token{{Key: []byte("abcd")}, {Key: []byte("cdef")}, {Key: []byte("efgh")}}, - }, - } { - t.Run(tc.desc, func(t *testing.T) { - require.Equal(t, tc.exp, tokenizer.Tokens(tc.input)) - }) - } -} - -func Test4GramSkip2Tokenizer(t *testing.T) { - tokenizer := fourSkip2 - for _, tc := range []struct { - desc string - input string - exp []Token - }{ - { - desc: "empty", - input: "", - exp: []Token{}, - }, - { - desc: "single char", - input: "a", - exp: []Token{}, - }, - { - desc: "three char", - input: "abc", - exp: []Token{}, - }, - { - desc: "four chars", - input: "abcd", - exp: []Token{{Key: []byte("abcd")}}, - }, - { - desc: "five chars", - input: "abcde", - exp: []Token{{Key: []byte("abcd")}}, - }, - { - desc: "six chars", - input: "abcdef", - exp: []Token{{Key: []byte("abcd")}}, - }, - { - desc: "seven chars", - input: "abcdefg", - exp: []Token{{Key: []byte("abcd")}, {Key: []byte("defg")}}, - }, - { - desc: "eight chars", - input: "abcdefgh", - exp: []Token{{Key: []byte("abcd")}, {Key: []byte("defg")}}, - }, - { - desc: "nine chars", - input: "abcdefghi", - exp: []Token{{Key: []byte("abcd")}, {Key: []byte("defg")}}, - }, - { - desc: "ten chars", - input: "abcdefghij", - exp: []Token{{Key: []byte("abcd")}, {Key: []byte("defg")}, {Key: []byte("ghij")}}, - }, - } { - t.Run(tc.desc, func(t *testing.T) { - require.Equal(t, tc.exp, tokenizer.Tokens(tc.input)) - }) - } -} - -func Test5GramSkip0Tokenizer(t *testing.T) { - tokenizer := five - for _, tc := range []struct { - desc string - input string - exp []Token - }{ - { - desc: "empty", - input: "", - exp: []Token{}, - }, - { - desc: "single char", - input: "a", - exp: []Token{}, - }, - { - desc: "three char", - input: "abc", - exp: []Token{}, - }, - { - desc: "four chars", - input: "abcd", - exp: []Token{}, - }, - { - desc: "five chars", - input: "abcde", - exp: []Token{{Key: []byte("abcde")}}, - }, - { - desc: "six chars", - input: "abcdef", - exp: []Token{{Key: []byte("abcde")}, {Key: []byte("bcdef")}}, - }, - } { - t.Run(tc.desc, func(t *testing.T) { - require.Equal(t, tc.exp, tokenizer.Tokens(tc.input)) - }) - } -} +func TestPrefixedIterator(t *testing.T) { + var ( + three = NewNGramTokenizer(3, 0) + ) -func Test6GramSkip0Tokenizer(t *testing.T) { - tokenizer := six for _, tc := range []struct { desc string input string - exp []Token + exp []string }{ { - desc: "empty", input: "", - exp: []Token{}, - }, - { - desc: "single char", - input: "a", - exp: []Token{}, - }, - { - desc: "three char", - input: "abc", - exp: []Token{}, - }, - { - desc: "four chars", - input: "abcd", - exp: []Token{}, + exp: []string{}, }, { - desc: "five chars", - input: "abcde", - exp: []Token{}, - }, - { - desc: "six chars", - input: "abcdef", - exp: []Token{{Key: []byte("abcdef")}}, + input: "ab", + exp: []string{}, }, { - desc: "seven chars", input: 
"abcdefg", - exp: []Token{{Key: []byte("abcdef")}, {Key: []byte("bcdefg")}}, + exp: []string{"0123abc", "0123bcd", "0123cde", "0123def", "0123efg"}, }, - } { - t.Run(tc.desc, func(t *testing.T) { - require.Equal(t, tc.exp, tokenizer.Tokens(tc.input)) - }) - } -} - -func makeBuf(from, through, checksum int) []byte { - p := make([]byte, 0, 256) - i64buf := make([]byte, binary.MaxVarintLen64) - i32buf := make([]byte, 4) - binary.PutVarint(i64buf, int64(from)) - p = append(p, i64buf...) - binary.PutVarint(i64buf, int64(through)) - p = append(p, i64buf...) - binary.LittleEndian.PutUint32(i32buf, uint32(checksum)) - p = append(p, i32buf...) - return p -} - -func TestWrappedTokenizer(t *testing.T) { - tokenizer := threeSkip2 - for _, tc := range []struct { - desc string - input string - exp []Token - }{ - { - desc: "empty", - input: "", - exp: []Token{}, - }, - { - desc: "single char", - input: "a", - exp: []Token{}, - }, - { - desc: "four chars", - input: "abcd", - exp: []Token{ - {Key: append(makeBuf(0, 999999, 1), []byte("abc")...)}, - {Key: []byte("abc")}}, - }, { - desc: "uuid", - input: "2b1a5e46-36a2-4694-a4b1-f34cc7bdfc45", - exp: []Token{ - {Key: append(makeBuf(0, 999999, 1), []byte("2b1")...)}, - {Key: []byte("2b1")}, - {Key: append(makeBuf(0, 999999, 1), []byte("a5e")...)}, - {Key: []byte("a5e")}, - {Key: append(makeBuf(0, 999999, 1), []byte("46-")...)}, - {Key: []byte("46-")}, - {Key: append(makeBuf(0, 999999, 1), []byte("36a")...)}, - {Key: []byte("36a")}, - {Key: append(makeBuf(0, 999999, 1), []byte("2-4")...)}, - {Key: []byte("2-4")}, - {Key: append(makeBuf(0, 999999, 1), []byte("694")...)}, - {Key: []byte("694")}, - {Key: append(makeBuf(0, 999999, 1), []byte("-a4")...)}, - {Key: []byte("-a4")}, - {Key: append(makeBuf(0, 999999, 1), []byte("b1-")...)}, - {Key: []byte("b1-")}, - {Key: append(makeBuf(0, 999999, 1), []byte("f34")...)}, - {Key: []byte("f34")}, - {Key: append(makeBuf(0, 999999, 1), []byte("cc7")...)}, - {Key: []byte("cc7")}, - {Key: append(makeBuf(0, 999999, 1), []byte("bdf")...)}, - {Key: []byte("bdf")}, - {Key: append(makeBuf(0, 999999, 1), []byte("c45")...)}, - {Key: []byte("c45")}, - }, + input: "日本語", + exp: []string{"0123日本語"}, }, } { + prefix := []byte("0123") t.Run(tc.desc, func(t *testing.T) { - chunkTokenizer := ChunkIDTokenizer(tokenizer) - chunkTokenizer.Reinit(logproto.ChunkRef{From: 0, Through: 999999, Checksum: 1}) - require.Equal(t, tc.exp, chunkTokenizer.Tokens(tc.input)) + itr := NewPrefixedTokenIter(prefix, three.Tokens(tc.input)) + for _, exp := range tc.exp { + require.True(t, itr.Next()) + require.Equal(t, exp, string(itr.At())) + } + require.False(t, itr.Next()) }) } } @@ -594,8 +124,8 @@ sint occaecat cupidatat non proident sunt in culpa qui officia deserunt mollit a func BenchmarkTokens(b *testing.B) { var ( - v2Three = NewNGramTokenizerV2(3, 0) - v2ThreeSkip1 = NewNGramTokenizerV2(3, 1) + v2Three = NewNGramTokenizer(3, 0) + v2ThreeSkip1 = NewNGramTokenizer(3, 1) // fp + from + through + checksum chunkPrefixLen = 8 + 8 + 8 + 4 @@ -613,14 +143,6 @@ func BenchmarkTokens(b *testing.B) { { desc: "three", impls: []impl{ - { - desc: "v1", - f: func() { - for _, tok := range three.Tokens(lorem) { - _ = tok - } - }, - }, { desc: "v2", f: func() { @@ -635,14 +157,6 @@ func BenchmarkTokens(b *testing.B) { { desc: "threeSkip1", impls: []impl{ - { - desc: "v1", - f: func() { - for _, tok := range threeSkip1.Tokens(lorem) { - _ = tok - } - }, - }, { desc: "v2", f: func() { @@ -657,18 +171,6 @@ func BenchmarkTokens(b *testing.B) { { desc: "threeChunk", impls: 
[]impl{ - { - desc: "v1", - f: func() func() { - chunkTokenizer := ChunkIDTokenizer(three) - chunkTokenizer.Reinit(logproto.ChunkRef{}) - return func() { - for _, tok := range chunkTokenizer.Tokens(lorem) { - _ = tok - } - } - }(), - }, { desc: "v2", f: func() func() { @@ -686,18 +188,6 @@ func BenchmarkTokens(b *testing.B) { { desc: "threeSkip1Chunk", impls: []impl{ - { - desc: "v1", - f: func() func() { - chunkTokenizer := ChunkIDTokenizer(threeSkip1) - chunkTokenizer.Reinit(logproto.ChunkRef{}) - return func() { - for _, tok := range chunkTokenizer.Tokens(lorem) { - _ = tok - } - } - }(), - }, { desc: "v2", f: func() func() { @@ -724,20 +214,3 @@ func BenchmarkTokens(b *testing.B) { }) } } - -func BenchmarkWrappedTokens(b *testing.B) { - chunkTokenizer := ChunkIDTokenizer(three) - chunkTokenizer.Reinit(logproto.ChunkRef{From: 0, Through: 999999, Checksum: 1}) - for i := 0; i < b.N; i++ { - b.StopTimer() - file, _ := os.Open(BigFile) - defer file.Close() - scanner := bufio.NewScanner(file) - - b.StartTimer() - for scanner.Scan() { - line := scanner.Text() - _ = chunkTokenizer.Tokens(line) - } - } -} diff --git a/tools/tsdb/bloom-tester/lib.go b/tools/tsdb/bloom-tester/lib.go index 7eefb56342c4..36926bcd3034 100644 --- a/tools/tsdb/bloom-tester/lib.go +++ b/tools/tsdb/bloom-tester/lib.go @@ -89,18 +89,10 @@ func execute() { } var ( - three = bt.NewNGramTokenizer(3, 4, 0) - threeSkip1 = bt.NewNGramTokenizer(3, 4, 1) - threeSkip2 = bt.NewNGramTokenizer(3, 4, 2) - threeSkip3 = bt.NewNGramTokenizer(3, 4, 3) - four = bt.NewNGramTokenizer(4, 5, 0) - fourSkip1 = bt.NewNGramTokenizer(4, 5, 1) - fourSkip2 = bt.NewNGramTokenizer(4, 5, 2) - five = bt.NewNGramTokenizer(5, 6, 0) - six = bt.NewNGramTokenizer(6, 7, 0) - - onePctError = func() *filter.ScalableBloomFilter { return filter.NewScalableBloomFilter(1024, 0.01, 0.8) } - fivePctError = func() *filter.ScalableBloomFilter { return filter.NewScalableBloomFilter(1024, 0.05, 0.8) } + three = bt.NewNGramTokenizer(3, 0) + four = bt.NewNGramTokenizer(4, 0) + + onePctError = func() *filter.ScalableBloomFilter { return filter.NewScalableBloomFilter(1024, 0.01, 0.8) } ) var experiments = []Experiment{ @@ -116,7 +108,7 @@ var experiments = []Experiment{ */ NewExperiment( "token=4skip0_error=1%_indexchunks=true", - four, + *four, true, onePctError, ), @@ -344,7 +336,7 @@ func analyze(metrics *Metrics, sampler Sampler, indexShipper indexshipper.IndexS tenant, ls.String(), objectClient) { - bloomTokenizer.SetLineTokenizer(experiment.tokenizer) + bloomTokenizer.SetLineTokenizer(&experiment.tokenizer) level.Info(util_log.Logger).Log("Starting work on: ", ls.String(), "'", FNV32a(ls.String()), "'", experiment.name, tenant) startTime := time.Now().UnixMilli() diff --git a/tools/tsdb/bloom-tester/lib_test.go b/tools/tsdb/bloom-tester/lib_test.go index 419ff44f5900..3269592f4abc 100644 --- a/tools/tsdb/bloom-tester/lib_test.go +++ b/tools/tsdb/bloom-tester/lib_test.go @@ -16,7 +16,7 @@ func BenchmarkSBFTestAndAdd(b *testing.B) { scanner := bufio.NewScanner(file) experiment := NewExperiment( "token=3skip0_error=1%_indexchunks=true", - three, + *three, true, onePctError, ) @@ -25,8 +25,10 @@ func BenchmarkSBFTestAndAdd(b *testing.B) { for scanner.Scan() { line := scanner.Text() tokens := experiment.tokenizer.Tokens(line) - for _, token := range tokens { - sbf.TestAndAdd(token.Key) + + for tokens.Next() { + tok := tokens.At() + sbf.TestAndAdd(tok) } } } @@ -40,7 +42,7 @@ func BenchmarkSBFAdd(b *testing.B) { scanner := bufio.NewScanner(file) experiment := NewExperiment( 
"token=3skip0_error=1%_indexchunks=true", - three, + *three, true, onePctError, ) @@ -49,8 +51,10 @@ func BenchmarkSBFAdd(b *testing.B) { for scanner.Scan() { line := scanner.Text() tokens := experiment.tokenizer.Tokens(line) - for _, token := range tokens { - sbf.Add(token.Key) + + for tokens.Next() { + tok := tokens.At() + sbf.TestAndAdd(tok) } } } @@ -64,7 +68,7 @@ func BenchmarkSBFSeparateTestAndAdd(b *testing.B) { scanner := bufio.NewScanner(file) experiment := NewExperiment( "token=3skip0_error=1%_indexchunks=true", - three, + *three, true, onePctError, ) @@ -73,45 +77,16 @@ func BenchmarkSBFSeparateTestAndAdd(b *testing.B) { for scanner.Scan() { line := scanner.Text() tokens := experiment.tokenizer.Tokens(line) - for _, token := range tokens { - found := sbf.Test(token.Key) - if !found { - sbf.Add(token.Key) - } - } - } - } -} -func BenchmarkSBFTestAndAddWithLRU(b *testing.B) { - for i := 0; i < b.N; i++ { - b.StopTimer() - file, _ := os.Open(BigFile) - defer file.Close() - scanner := bufio.NewScanner(file) - experiment := NewExperiment( - "token=3skip0_error=1%_indexchunks=true", - three, - true, - onePctError, - ) - sbf := experiment.bloom() - cache := NewLRUCache4(150000) - b.StartTimer() - for scanner.Scan() { - line := scanner.Text() - tokens := experiment.tokenizer.Tokens(line) - for _, token := range tokens { - if !cache.Get(token.Key) { - cache.Put(token.Key) - sbf.TestAndAdd(token.Key) - } + for tokens.Next() { + tok := tokens.At() + sbf.TestAndAdd(tok) } } } } -func BenchmarkSBFSeparateTestAndAddWithLRU(b *testing.B) { +func BenchmarkSBFTestAndAddWithLRU(b *testing.B) { for i := 0; i < b.N; i++ { b.StopTimer() file, _ := os.Open(BigFile) @@ -119,7 +94,7 @@ func BenchmarkSBFSeparateTestAndAddWithLRU(b *testing.B) { scanner := bufio.NewScanner(file) experiment := NewExperiment( "token=3skip0_error=1%_indexchunks=true", - three, + *three, true, onePctError, ) @@ -129,151 +104,20 @@ func BenchmarkSBFSeparateTestAndAddWithLRU(b *testing.B) { for scanner.Scan() { line := scanner.Text() tokens := experiment.tokenizer.Tokens(line) - for _, token := range tokens { - if !cache.Get(token.Key) { - cache.Put(token.Key) - - found := sbf.Test(token.Key) - if !found { - sbf.Add(token.Key) - } - //sbf.TestAndAdd(token.Key) - } - } - } - } -} - -func BenchmarkSBFSeparateTestAndAddWithLRU5(b *testing.B) { - for i := 0; i < b.N; i++ { - b.StopTimer() - file, _ := os.Open(BigFile) - defer file.Close() - scanner := bufio.NewScanner(file) - experiment := NewExperiment( - "token=3skip0_error=1%_indexchunks=true", - three, - true, - onePctError, - ) - sbf := experiment.bloom() - cache := NewLRUCache5(150000) - - b.StartTimer() - for scanner.Scan() { - line := scanner.Text() - tokens := experiment.tokenizer.Tokens(line) - for _, token := range tokens { - str := string(token.Key) - if !cache.Get(str) { - cache.Put(str) - - found := sbf.Test(token.Key) - if !found { - sbf.Add(token.Key) - } - } - } - } - } -} - -func BenchmarkSBFTestAndAddWithLRU5(b *testing.B) { - for i := 0; i < b.N; i++ { - b.StopTimer() - file, _ := os.Open(BigFile) - defer file.Close() - scanner := bufio.NewScanner(file) - experiment := NewExperiment( - "token=3skip0_error=1%_indexchunks=true", - three, - true, - onePctError, - ) - sbf := experiment.bloom() - cache := NewLRUCache5(150000) - - b.StartTimer() - for scanner.Scan() { - line := scanner.Text() - tokens := experiment.tokenizer.Tokens(line) - for _, token := range tokens { - str := string(token.Key) - if !cache.Get(str) { - cache.Put(str) - - sbf.TestAndAdd(token.Key) - 
} - } - } - } -} - -func BenchmarkSBFTestAndAddWithByteKeyLRU(b *testing.B) { - for i := 0; i < b.N; i++ { - b.StopTimer() - file, _ := os.Open(BigFile) - defer file.Close() - scanner := bufio.NewScanner(file) - experiment := NewExperiment( - "token=4skip0_error=1%_indexchunks=false", - four, - false, - onePctError, - ) - sbf := experiment.bloom() - cache := NewByteKeyLRUCache(150000) - b.StartTimer() - for scanner.Scan() { - line := scanner.Text() - tokens := experiment.tokenizer.Tokens(line) - for _, token := range tokens { - - array := NewFourByteKeyFromSlice(token.Key) - if !cache.Get(array) { - cache.Put(array) - sbf.TestAndAdd(token.Key) - } - } - } - } -} - -func BenchmarkSBFTestAndAddWithFourByteKeyLRU(b *testing.B) { - for i := 0; i < b.N; i++ { - b.StopTimer() - file, _ := os.Open(BigFile) - defer file.Close() - scanner := bufio.NewScanner(file) - experiment := NewExperiment( - "token=4skip0_error=1%_indexchunks=false", - four, - false, - onePctError, - ) - sbf := experiment.bloom() - cache := NewFourByteKeyLRUCache(150000) - b.StartTimer() - for scanner.Scan() { - line := scanner.Text() - tokens := experiment.tokenizer.Tokens(line) - for _, token := range tokens { - if !cache.Get([4]byte(token.Key)) { - cache.Put([4]byte(token.Key)) - found := sbf.Test(token.Key) - if !found { - sbf.Add(token.Key) - } - //sbf.TestAndAdd(token.Key) + for tokens.Next() { + tok := tokens.At() + if !cache.Get(tok) { + cache.Put(tok) + sbf.TestAndAdd(tok) } - + sbf.TestAndAdd(tok) } } } } -func BenchmarkSBFAddWithLRU(b *testing.B) { +func BenchmarkSBFSeparateTestAndAddWithLRU(b *testing.B) { for i := 0; i < b.N; i++ { b.StopTimer() file, _ := os.Open(BigFile) @@ -281,7 +125,7 @@ func BenchmarkSBFAddWithLRU(b *testing.B) { scanner := bufio.NewScanner(file) experiment := NewExperiment( "token=3skip0_error=1%_indexchunks=true", - three, + *three, true, onePctError, ) @@ -291,44 +135,16 @@ func BenchmarkSBFAddWithLRU(b *testing.B) { for scanner.Scan() { line := scanner.Text() tokens := experiment.tokenizer.Tokens(line) - for _, token := range tokens { - if !cache.Get(token.Key) { - cache.Put(token.Key) - sbf.Add(token.Key) - } - } - } - } -} - -func BenchmarkSBFSeparateTestAndAddWithLRU1(b *testing.B) { - for i := 0; i < b.N; i++ { - b.StopTimer() - file, _ := os.Open(BigFile) - defer file.Close() - scanner := bufio.NewScanner(file) - experiment := NewExperiment( - "token=3skip0_error=1%_indexchunks=true", - three, - true, - onePctError, - ) - sbf := experiment.bloom() - cache := NewLRUCache(150000) - b.StartTimer() - for scanner.Scan() { - line := scanner.Text() - tokens := experiment.tokenizer.Tokens(line) - for _, token := range tokens { - str := string(token.Key) - if !cache.Get(str) { - cache.Put(str) - found := sbf.Test(token.Key) + for tokens.Next() { + tok := tokens.At() + if !cache.Get(tok) { + cache.Put(tok) + found := sbf.Test(tok) if !found { - sbf.Add(token.Key) + sbf.Add(tok) } - //sbf.Add(token.Key) } + sbf.TestAndAdd(tok) } } } @@ -342,7 +158,7 @@ func BenchmarkSBFSeparateTestAndAddWithMap(b *testing.B) { scanner := bufio.NewScanner(file) experiment := NewExperiment( "token=3skip0_error=1%_indexchunks=true", - three, + *three, true, onePctError, ) @@ -352,15 +168,15 @@ func BenchmarkSBFSeparateTestAndAddWithMap(b *testing.B) { for scanner.Scan() { line := scanner.Text() tokens := experiment.tokenizer.Tokens(line) - for _, token := range tokens { - str := string(token.Key) - - _, found := cache[str] + for tokens.Next() { + tok := tokens.At() + tokStr := string(tok) + _, found := 
cache[tokStr] if !found { - cache[str] = "" - f := sbf.Test(token.Key) + cache[tokStr] = "" + f := sbf.Test(tok) if !f { - sbf.Add(token.Key) + sbf.Add(tok) } if len(cache) > 150000 { diff --git a/tools/tsdb/bloom-tester/metrics.go b/tools/tsdb/bloom-tester/metrics.go index 193f829063db..2805901a3b9c 100644 --- a/tools/tsdb/bloom-tester/metrics.go +++ b/tools/tsdb/bloom-tester/metrics.go @@ -10,12 +10,12 @@ import ( type Experiment struct { name string - tokenizer bt.Tokenizer + tokenizer bt.NGramTokenizer bloom func() *filter.ScalableBloomFilter encodeChunkID bool } -func NewExperiment(name string, tokenizer bt.Tokenizer, encodeChunkID bool, bloom func() *filter.ScalableBloomFilter) Experiment { +func NewExperiment(name string, tokenizer bt.NGramTokenizer, encodeChunkID bool, bloom func() *filter.ScalableBloomFilter) Experiment { return Experiment{ name: name, tokenizer: tokenizer, diff --git a/tools/tsdb/bloom-tester/readlib.go b/tools/tsdb/bloom-tester/readlib.go index eaca7a38c15b..93b0ba75b6d1 100644 --- a/tools/tsdb/bloom-tester/readlib.go +++ b/tools/tsdb/bloom-tester/readlib.go @@ -4,7 +4,6 @@ import ( "context" "flag" "fmt" - "github.com/grafana/dskit/services" "github.com/grafana/loki/pkg/chunkenc" @@ -200,10 +199,10 @@ func analyzeRead(metrics *Metrics, sampler Sampler, shipper indexshipper.IndexSh tenant, ls.String(), objectClient) - bloomTokenizer.SetLineTokenizer(experiment.tokenizer) + bloomTokenizer.SetLineTokenizer(&experiment.tokenizer) for gotIdx := range got { // for every chunk for _, queryExperiment := range queryExperiments { // for each search string - if len(queryExperiment.searchString) >= experiment.tokenizer.GetMin()+experiment.tokenizer.GetSkip() { + if len(queryExperiment.searchString) >= experiment.tokenizer.N+experiment.tokenizer.Skip { foundInChunk := false foundInSbf := false @@ -245,11 +244,6 @@ func analyzeRead(metrics *Metrics, sampler Sampler, shipper indexshipper.IndexSh helpers.ExitErr("iterating chunks ", itr.Error()) } - /*else // if search string is long enough - { - // fmt.Println("Skipping", queryExperiment.name, "because it's too short", experiment.name) - }*/ - } // for each search string } // for every chunk @@ -306,21 +300,21 @@ func readSBFFromObjectStorage(location, prefix, period, tenant, series string, o return sbf } -func searchSbf(sbf *filter.ScalableBloomFilter, tokenizer bt.Tokenizer, searchString string) bool { - tokens := bt.SearchesForTokenizerAndLine(tokenizer, searchString) - for _, tokenSet := range tokens { - numMatches := 0 - for _, token := range tokenSet { - if sbf.Test(token.Key) { - numMatches++ - } +func searchSbf(sbf *filter.ScalableBloomFilter, tokenizer bt.NGramTokenizer, searchString string) bool { + itr := tokenizer.Tokens(searchString) + numMatches := 0 + numTokens := 0 + for itr.Next() { + token := itr.At() + numTokens++ + if sbf.Test(token) { + numMatches++ } - if numMatches > 0 { - if numMatches == len(tokenSet) { - return true - } + } + if numMatches > 0 { + if numMatches == numTokens { + return true } - } return false diff --git a/tools/tsdb/bloom-tester/readlib_test.go b/tools/tsdb/bloom-tester/readlib_test.go index 5216918010bc..edec2c37fe59 100644 --- a/tools/tsdb/bloom-tester/readlib_test.go +++ b/tools/tsdb/bloom-tester/readlib_test.go @@ -1,7 +1,6 @@ package main import ( - bt "github.com/grafana/loki/pkg/storage/bloom/v1" "testing" "github.com/stretchr/testify/require" @@ -10,7 +9,7 @@ import ( func TestSearchSbf(t *testing.T) { experiment := NewExperiment( "token=4skip0_error=1%_indexchunks=true", - 
four, + *four, true, onePctError, ) @@ -66,13 +65,13 @@ func TestSearchSbf(t *testing.T) { } { t.Run(tc.desc, func(t *testing.T) { sbf := experiment.bloom() - tokens := bt.SearchesForTokenizerAndLine(four, tc.inputLine) - for _, tokenSet := range tokens { - for _, token := range tokenSet { - sbf.Add(token.Key) - } + tokens := four.Tokens(tc.inputLine) + for tokens.Next() { + tok := tokens.At() + sbf.Add(tok) } - require.Equal(t, tc.exp, searchSbf(sbf, four, tc.inputSearch)) + + require.Equal(t, tc.exp, searchSbf(sbf, *four, tc.inputSearch)) }) } }
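
For reviewers who want the post-rename surface in one place, here is a minimal, self-contained sketch of the iterator-style API this patch promotes to v1 (`NewNGramTokenizer`, `Tokens`, `Next`/`At`). The example program is illustrative only and not part of the patch; the expected token sequences are taken from `TestNGramIterator` above.

```go
package main

import (
	"fmt"

	v1 "github.com/grafana/loki/pkg/storage/bloom/v1"
)

func main() {
	// 3-grams with no skip: "abcdefg" -> abc, bcd, cde, def, efg.
	// 3-grams with skip=1:  "abcdefg" -> abc, cde, efg.
	// (Both expectations mirror TestNGramIterator above.)
	for _, tok := range []*v1.NGramTokenizer{
		v1.NewNGramTokenizer(3, 0),
		v1.NewNGramTokenizer(3, 1),
	} {
		itr := tok.Tokens("abcdefg")
		var out []string
		for itr.Next() {
			// At() reuses an internal buffer, so copy the token before keeping it.
			out = append(out, string(itr.At()))
		}
		fmt.Printf("n=%d skip=%d tokens=%v\n", tok.N, tok.Skip, out)
	}
}
```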
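The write path in `PopulateSeriesWithBloom` now builds the chunk prefix once per chunk (`calculatePrefix`) and makes two passes over each line: one through `NewPrefixedTokenIter` for chunk-scoped tokens and one with the bare tokenizer, deduplicating through the map cache before touching the scalable bloom filter. The helper below is a hypothetical sketch of that flow, assuming it sits inside `pkg/storage/bloom/v1` (it uses the unexported `calculatePrefix`, `clearCache`, and `CacheSize` from this patch); `addChunkTokens` itself is not part of the change.

```go
package v1

import "github.com/grafana/loki/pkg/logproto"

// addChunkTokens is a hypothetical helper (not part of this patch) showing the
// flow PopulateSeriesWithBloom implements inline: every n-gram of a line is
// inserted twice, once prefixed with the chunk identifier and once bare, with
// the map cache deduplicating ahead of the scalable bloom filter.
func addChunkTokens(swb *SeriesWithBloom, t *NGramTokenizer, ref logproto.ChunkRef, line string, cache map[string]interface{}) {
	// varint(From) + varint(Through) + little-endian uint32(Checksum), as in calculatePrefix.
	prefix := calculatePrefix(ref)

	insert := func(tok []byte) {
		str := string(tok)
		if _, found := cache[str]; found {
			return // already pushed into the SBF for this series
		}
		cache[str] = nil
		swb.Bloom.ScalableBloomFilter.TestAndAdd(tok)
		if len(cache) >= CacheSize {
			clearCache(cache)
		}
	}

	prefixed := NewPrefixedTokenIter(prefix, t.Tokens(line))
	for prefixed.Next() {
		insert(prefixed.At())
	}

	bare := t.Tokens(line)
	for bare.Next() {
		insert(bare.At())
	}
}
```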
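On the read path, `searchSbf` only reports a match when the search string yields at least one n-gram and every n-gram tests positive in the filter. The sketch below mirrors that all-tokens-must-match rule against a toy in-memory filter; `fakeBloom` and `allNGramsPresent` are illustrative stand-ins (a real caller tests against `*filter.ScalableBloomFilter`), not code from this patch.

```go
package main

import (
	"fmt"

	v1 "github.com/grafana/loki/pkg/storage/bloom/v1"
)

// fakeBloom is a toy stand-in for the scalable bloom filter: membership is
// exact, which is enough to demonstrate the matching rule.
type fakeBloom map[string]struct{}

func (f fakeBloom) Test(b []byte) bool { _, ok := f[string(b)]; return ok }

// allNGramsPresent mirrors searchSbf in readlib.go: a search string matches
// only if it produces at least one n-gram and every n-gram tests positive.
func allNGramsPresent(test func([]byte) bool, tok *v1.NGramTokenizer, search string) bool {
	itr := tok.Tokens(search)
	matched := 0
	for itr.Next() {
		if !test(itr.At()) {
			return false
		}
		matched++
	}
	return matched > 0
}

func main() {
	tok := v1.NewNGramTokenizer(4, 0) // 4-grams, no skip, matching the bloom tokenizer defaults

	// Index the 4-grams of a single "log line" into the fake filter, copying
	// each token out of the iterator's shared buffer via string().
	bloom := fakeBloom{}
	line := tok.Tokens("error fetching chunk 2b1a5e46")
	for line.Next() {
		bloom[string(line.At())] = struct{}{}
	}

	fmt.Println(allNGramsPresent(bloom.Test, tok, "fetching")) // true: every 4-gram was indexed
	fmt.Println(allNGramsPresent(bloom.Test, tok, "flushing")) // false: "flus" was never indexed
}
```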