Removed MatchPrefix, fixed other Match functions (#11345)
closes #7855

---------

Co-authored-by: JkLondon <[email protected]>
Co-authored-by: alex.sharov <[email protected]>
3 people authored Jul 28, 2024
1 parent be1a349 commit 0bb8970
Showing 6 changed files with 16 additions and 252 deletions.
22 changes: 0 additions & 22 deletions erigon-lib/seg/compress_test.go
@@ -125,9 +125,6 @@ func TestCompressDict1(t *testing.T) {
require.True(t, g.MatchPrefix([]byte("")))
require.True(t, g.MatchPrefix([]byte{}))

require.Equal(t, 1, g.MatchPrefixCmp([]byte("long")))
require.Equal(t, 0, g.MatchPrefixCmp([]byte("")))
require.Equal(t, 0, g.MatchPrefixCmp([]byte{}))
word, _ := g.Next(nil)
require.NotNil(t, word)
require.Zero(t, len(word))
@@ -139,11 +136,6 @@ func TestCompressDict1(t *testing.T) {
require.False(t, g.MatchPrefix([]byte("longnotmatch")))
require.True(t, g.MatchPrefix([]byte{}))

require.Equal(t, 0, g.MatchPrefixCmp([]byte("long")))
require.Equal(t, 1, g.MatchPrefixCmp([]byte("longlong")))
require.Equal(t, 1, g.MatchPrefixCmp([]byte("wordnotmatch")))
require.Equal(t, 1, g.MatchPrefixCmp([]byte("longnotmatch")))
require.Equal(t, 0, g.MatchPrefixCmp([]byte{}))
_, _ = g.Next(nil)

// next word is `word`
@@ -155,13 +147,6 @@ func TestCompressDict1(t *testing.T) {
require.False(t, g.MatchPrefix([]byte("wordnotmatch")))
require.False(t, g.MatchPrefix([]byte("longnotmatch")))

require.Equal(t, -1, g.MatchPrefixCmp([]byte("long")))
require.Equal(t, -1, g.MatchPrefixCmp([]byte("longlong")))
require.Equal(t, 0, g.MatchPrefixCmp([]byte("word")))
require.Equal(t, 0, g.MatchPrefixCmp([]byte("")))
require.Equal(t, 0, g.MatchPrefixCmp(nil))
require.Equal(t, 1, g.MatchPrefixCmp([]byte("wordnotmatch")))
require.Equal(t, -1, g.MatchPrefixCmp([]byte("longnotmatch")))
_, _ = g.Next(nil)

// next word is `longlongword %d`
@@ -175,13 +160,6 @@ func TestCompressDict1(t *testing.T) {
require.False(t, g.MatchPrefix([]byte("longnotmatch")))
require.True(t, g.MatchPrefix([]byte{}))

require.Equal(t, 0, g.MatchPrefixCmp([]byte(fmt.Sprintf("%d", i))))
require.Equal(t, 0, g.MatchPrefixCmp([]byte(expectPrefix)))
require.Equal(t, 0, g.MatchPrefixCmp([]byte(expectPrefix+"long")))
require.Equal(t, 0, g.MatchPrefixCmp([]byte(expectPrefix+"longword ")))
require.Equal(t, 1, g.MatchPrefixCmp([]byte("wordnotmatch")))
require.Equal(t, 1, g.MatchPrefixCmp([]byte("longnotmatch")))
require.Equal(t, 0, g.MatchPrefixCmp([]byte{}))
savePos := g.dataP
word, nextPos := g.Next(nil)
expected := fmt.Sprintf("%d longlongword %d", i, i)
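The assertions kept above rely only on the boolean MatchPrefix. As a tiny hedged sketch of the semantics they exercise (not part of the commit), assume a getter g positioned on a stored word "long"; per the MatchPrefix doc comment kept in decompress.go below, none of these calls move the offset:

    // Hypothetical position: g sits on the stored word "long".
    _ = g.MatchPrefix([]byte("long"))         // true: the word starts with "long"
    _ = g.MatchPrefix([]byte{})               // true: an empty prefix matches any word
    _ = g.MatchPrefix([]byte("longnotmatch")) // false: the word does not continue that way
    word, _ := g.Next(nil)                    // only Next/Skip advance to the next word
    _ = word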
174 changes: 11 additions & 163 deletions erigon-lib/seg/decompress.go
@@ -817,105 +817,6 @@ func (g *Getter) SkipUncompressed() (uint64, int) {
return g.dataP, int(wordLen)
}

// Match returns
//
// 1 if the word at current offset is greater than the buf
//
// -1 if it is less than the buf
//
// 0 if they are equal.
func (g *Getter) Match(buf []byte) int {
savePos := g.dataP
wordLen := g.nextPos(true)
wordLen-- // because when create huffman tree we do ++ , because 0 is terminator
lenBuf := len(buf)
if wordLen == 0 || int(wordLen) != lenBuf {
if g.dataBit > 0 {
g.dataP++
g.dataBit = 0
}
if lenBuf != 0 || lenBuf != int(wordLen) {
g.dataP, g.dataBit = savePos, 0
}
if lenBuf == int(wordLen) {
return 0
}
if lenBuf < int(wordLen) {
return -1
}
if lenBuf > int(wordLen) {
return 1
}
}

var bufPos int
// In the first pass, we only check patterns
for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) {
bufPos += int(pos) - 1
pattern := g.nextPattern()
compared := bytes.Compare(buf[bufPos:bufPos+len(pattern)], pattern)
if compared != 0 {
g.dataP, g.dataBit = savePos, 0
return compared
}
if lenBuf < bufPos+len(pattern) {
g.dataP, g.dataBit = savePos, 0
return -1
}
}
if g.dataBit > 0 {
g.dataP++
g.dataBit = 0
}
postLoopPos := g.dataP
g.dataP, g.dataBit = savePos, 0
g.nextPos(true /* clean */) // Reset the state of huffman decoder
// Second pass - we check spaces not covered by the patterns
var lastUncovered int
bufPos = 0
for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) {
bufPos += int(pos) - 1
if bufPos > lastUncovered {
dif := uint64(bufPos - lastUncovered)
compared := bytes.Compare(buf[lastUncovered:bufPos], g.data[postLoopPos:postLoopPos+dif])
if compared != 0 {
g.dataP, g.dataBit = savePos, 0
return compared
}
if lenBuf < bufPos {
g.dataP, g.dataBit = savePos, 0
return -1
}
postLoopPos += dif
}
lastUncovered = bufPos + len(g.nextPattern())
}
if int(wordLen) > lastUncovered {
dif := wordLen - uint64(lastUncovered)

compared := bytes.Compare(buf[lastUncovered:wordLen], g.data[postLoopPos:postLoopPos+dif])
if compared != 0 {
g.dataP, g.dataBit = savePos, 0
return compared
}
if lenBuf < int(wordLen) {
g.dataP, g.dataBit = savePos, 0
return -1
}
postLoopPos += dif
}
if lenBuf < int(wordLen) {
g.dataP, g.dataBit = savePos, 0
return -1
}
if lenBuf > int(wordLen) {
g.dataP, g.dataBit = savePos, 0
return 1
}
g.dataP, g.dataBit = postLoopPos, 0
return 0
}

// MatchPrefix only checks if the word at the current offset has a buf prefix. Does not move offset to the next word.
func (g *Getter) MatchPrefix(prefix []byte) bool {
savePos := g.dataP
@@ -1060,9 +961,7 @@ func (g *Getter) MatchCmp(buf []byte) int {
return cmp
}

// MatchPrefixCmp lexicographically compares given prefix with the word at the current offset in the file.
// returns 0 if buf == word, -1 if buf < word, 1 if buf > word
func (g *Getter) MatchPrefixCmp(prefix []byte) int {
func (g *Getter) MatchPrefixUncompressed(prefix []byte) bool {
savePos := g.dataP
defer func() {
g.dataP, g.dataBit = savePos, 0
@@ -1072,87 +971,36 @@ func (g *Getter) MatchPrefixCmp(prefix []byte) int {
wordLen-- // because when create huffman tree we do ++ , because 0 is terminator
prefixLen := len(prefix)
if wordLen == 0 && prefixLen != 0 {
return 1
return true
}
if prefixLen == 0 {
return 0
}

decoded := make([]byte, wordLen)
var bufPos int
// In the first pass, we only check patterns
// Only run this loop as far as the prefix goes, there is no need to check further
for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) {
bufPos += int(pos) - 1
if bufPos > prefixLen {
break
}
pattern := g.nextPattern()
copy(decoded[bufPos:], pattern)
return false
}

if g.dataBit > 0 {
g.dataP++
g.dataBit = 0
}
postLoopPos := g.dataP
g.dataP, g.dataBit = savePos, 0
g.nextPos(true /* clean */) // Reset the state of huffman decoder
// Second pass - we check spaces not covered by the patterns
var lastUncovered int
bufPos = 0
for pos := g.nextPos(false /* clean */); pos != 0 && lastUncovered < prefixLen; pos = g.nextPos(false) {
bufPos += int(pos) - 1
if bufPos > lastUncovered {
dif := uint64(bufPos - lastUncovered)
copy(decoded[lastUncovered:bufPos], g.data[postLoopPos:postLoopPos+dif])
postLoopPos += dif
}
lastUncovered = bufPos + len(g.nextPattern())
}
if prefixLen > lastUncovered && int(wordLen) > lastUncovered {
dif := wordLen - uint64(lastUncovered)
copy(decoded[lastUncovered:wordLen], g.data[postLoopPos:postLoopPos+dif])
// postLoopPos += dif
}
var cmp int
if prefixLen > int(wordLen) {
// TODO(racytech): handle this case
// e.g: prefix = 'aaacb'
// word = 'aaa'
cmp = bytes.Compare(prefix, decoded)
} else {
cmp = bytes.Compare(prefix, decoded[:prefixLen])
}
g.nextPos(true)

return cmp
return bytes.HasPrefix(g.data[g.dataP:g.dataP+wordLen], prefix)
}

func (g *Getter) MatchPrefixUncompressed(prefix []byte) int {
func (g *Getter) MatchCmpUncompressed(buf []byte) int {
savePos := g.dataP
defer func() {
g.dataP, g.dataBit = savePos, 0
}()

wordLen := g.nextPos(true /* clean */)
wordLen-- // because when create huffman tree we do ++ , because 0 is terminator
prefixLen := len(prefix)
if wordLen == 0 && prefixLen != 0 {
bufLen := len(buf)
if wordLen == 0 && bufLen != 0 {
return 1
}
if prefixLen == 0 {
return 0
if bufLen == 0 {
return -1
}

g.nextPos(true)

// if prefixLen > int(wordLen) {
// // TODO(racytech): handle this case
// // e.g: prefix = 'aaacb'
// // word = 'aaa'
// }

return bytes.Compare(prefix, g.data[g.dataP:g.dataP+wordLen])
return bytes.Compare(buf, g.data[g.dataP:g.dataP+wordLen])
}

// FastNext extracts a compressed word from current offset in the file
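For callers migrating off the removed Match and MatchPrefixCmp, here is a minimal hedged sketch of the surviving API, not part of the commit. It assumes d is a *Decompressor opened elsewhere; every function used appears in this diff or its tests, the string literals are illustrative, and the offset behaviour noted in the comments is inferred from the updated tests rather than guaranteed here:

    g := d.MakeGetter()
    for g.HasNext() {
        // Old: g.MatchPrefixCmp(prefix) == 0  ->  new: boolean g.MatchPrefix(prefix).
        // MatchPrefix only inspects the word at the current offset and never moves it.
        if !g.MatchPrefix([]byte("long")) {
            g.Skip() // cannot match, advance without decompressing the word
            continue
        }
        // Old: g.Match(buf) int  ->  new: g.MatchCmp(buf) int with the same -1/0/1 contract.
        // Judging by the updated tests, a 0 result leaves the offset just past the
        // matched word, while any other result restores it, hence the Skip below.
        if g.MatchCmp([]byte("longlongword 1")) != 0 {
            g.Skip()
        }
    }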
21 changes: 0 additions & 21 deletions erigon-lib/seg/decompress_bench_test.go
@@ -65,16 +65,6 @@ func BenchmarkDecompressSkip(b *testing.B) {
}
}

func BenchmarkDecompressMatch(b *testing.B) {
t := new(testing.T)
d := prepareDict(t)
defer d.Close()
g := d.MakeGetter()
for i := 0; i < b.N; i++ {
_ = g.Match([]byte("longlongword"))
}
}

func BenchmarkDecompressMatchCmp(b *testing.B) {
t := new(testing.T)
d := prepareDict(t)
@@ -99,17 +89,6 @@ func BenchmarkDecompressMatchPrefix(b *testing.B) {
}
}

func BenchmarkDecompressMatchPrefixCmp(b *testing.B) {
t := new(testing.T)
d := prepareDict(t)
defer d.Close()
g := d.MakeGetter()

for i := 0; i < b.N; i++ {
_ = g.MatchPrefixCmp([]byte("longlongword"))
}
}

func BenchmarkDecompressTorrent(t *testing.B) {
t.Skip()

2 changes: 1 addition & 1 deletion erigon-lib/seg/decompress_fuzz_test.go
@@ -83,7 +83,7 @@ func FuzzDecompressMatch(f *testing.F) {
t.Fatalf("MatchCmp: expected match: %v\n", expected)
}
g.Reset(savePos)
ok := g.Match(expected)
ok := g.MatchCmp(expected)
pos2 := g.dataP
if ok != 0 {
t.Fatalf("MatchBool: expected match: %v\n", expected)
47 changes: 3 additions & 44 deletions erigon-lib/seg/decompress_test.go
@@ -88,7 +88,7 @@ func TestDecompressMatchOK(t *testing.T) {
w := loremStrings[i]
if i%2 != 0 {
expected := fmt.Sprintf("%s %d", w, i)
cmp := g.Match([]byte(expected))
cmp := g.MatchCmp([]byte(expected))
if cmp != 0 {
t.Errorf("expexted match with %s", expected)
}
@@ -164,7 +164,7 @@ func TestDecompressMatchOKCondensed(t *testing.T) {
for g.HasNext() {
if i%2 != 0 {
expected := fmt.Sprintf("word-%d", i)
cmp := g.Match([]byte(expected))
cmp := g.MatchCmp([]byte(expected))
if cmp != 0 {
t.Errorf("expexted match with %s", expected)
}
@@ -188,7 +188,7 @@ func TestDecompressMatchNotOK(t *testing.T) {
for g.HasNext() {
w := loremStrings[i]
expected := fmt.Sprintf("%s %d", w, i+1)
cmp := g.Match([]byte(expected))
cmp := g.MatchCmp([]byte(expected))
if cmp == 0 {
t.Errorf("not expexted match with %s", expected)
} else {
@@ -241,47 +241,6 @@ func TestDecompressMatchPrefix(t *testing.T) {
}
}

func TestDecompressMatchPrefixCmp(t *testing.T) {
d := prepareLoremDict(t)
defer d.Close()
g := d.MakeGetter()
i := 0
skipCount := 0
for g.HasNext() {
w := loremStrings[i]
expected := []byte(fmt.Sprintf("%s %d", w, i+1))
expected = expected[:len(expected)/2]
cmp := g.MatchPrefixCmp(expected)
if cmp != 0 {
t.Errorf("expexted match with %s", expected)
}
g.Skip()
skipCount++
i++
}
if skipCount != i {
t.Errorf("something wrong with match logic")
}
g.Reset(0)
skipCount = 0
i = 0
for g.HasNext() {
w := loremStrings[i]
expected := []byte(fmt.Sprintf("%s %d", w, i+1))
expected = expected[:len(expected)/2]
if len(expected) > 0 {
expected[len(expected)-1]++
cmp := g.MatchPrefixCmp(expected)
if cmp == 0 {
t.Errorf("not expexted match with %s", expected)
}
}
g.Skip()
skipCount++
i++
}
}

func prepareLoremDictUncompressed(t *testing.T) *Decompressor {
t.Helper()
logger := log.New()
2 changes: 1 addition & 1 deletion erigon-lib/state/archive.go
@@ -61,7 +61,7 @@ func (g *getter) MatchPrefix(prefix []byte) bool {
if g.c&CompressKeys != 0 {
return g.Getter.MatchPrefix(prefix)
}
return g.Getter.MatchPrefixUncompressed(prefix) == 0
return g.Getter.MatchPrefixUncompressed(prefix)
}

func (g *getter) Next(buf []byte) ([]byte, uint64) {
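Putting the two new uncompressed helpers together, here is a hedged, test-style sketch (not part of the commit) that would sit alongside the existing tests in erigon-lib/seg. It builds on the prepareLoremDictUncompressed helper shown above, the literal words are purely illustrative, and both helpers restore the getter offset before returning, as their deferred resets in decompress.go show:

    func TestUncompressedMatchersSketch(t *testing.T) {
        d := prepareLoremDictUncompressed(t) // words stored without compression
        defer d.Close()
        g := d.MakeGetter()
        for g.HasNext() {
            // Boolean prefix probe over the raw word bytes (the bytes.HasPrefix path).
            if g.MatchPrefixUncompressed([]byte("lorem")) {
                // Full lexicographic compare against the raw word; 0 means equal.
                if g.MatchCmpUncompressed([]byte("lorem 1")) == 0 {
                    t.Log("exact word found") // illustrative only
                }
            }
            g.Skip() // neither helper advances the offset, so move on explicitly
        }
    }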
