Removed MatchPrefix, fixed other Match functions (#11345)
closes #7855

---------

Co-authored-by: JkLondon <[email protected]>
Co-authored-by: alex.sharov <[email protected]>
3 people authored Jul 28, 2024
1 parent be1a349 commit 0bb8970
Showing 6 changed files with 16 additions and 252 deletions.
22 changes: 0 additions & 22 deletions erigon-lib/seg/compress_test.go
@@ -125,9 +125,6 @@ func TestCompressDict1(t *testing.T) {
require.True(t, g.MatchPrefix([]byte("")))
require.True(t, g.MatchPrefix([]byte{}))

require.Equal(t, 1, g.MatchPrefixCmp([]byte("long")))
require.Equal(t, 0, g.MatchPrefixCmp([]byte("")))
require.Equal(t, 0, g.MatchPrefixCmp([]byte{}))
word, _ := g.Next(nil)
require.NotNil(t, word)
require.Zero(t, len(word))
@@ -139,11 +136,6 @@ func TestCompressDict1(t *testing.T) {
require.False(t, g.MatchPrefix([]byte("longnotmatch")))
require.True(t, g.MatchPrefix([]byte{}))

require.Equal(t, 0, g.MatchPrefixCmp([]byte("long")))
require.Equal(t, 1, g.MatchPrefixCmp([]byte("longlong")))
require.Equal(t, 1, g.MatchPrefixCmp([]byte("wordnotmatch")))
require.Equal(t, 1, g.MatchPrefixCmp([]byte("longnotmatch")))
require.Equal(t, 0, g.MatchPrefixCmp([]byte{}))
_, _ = g.Next(nil)

// next word is `word`
@@ -155,13 +147,6 @@ func TestCompressDict1(t *testing.T) {
require.False(t, g.MatchPrefix([]byte("wordnotmatch")))
require.False(t, g.MatchPrefix([]byte("longnotmatch")))

require.Equal(t, -1, g.MatchPrefixCmp([]byte("long")))
require.Equal(t, -1, g.MatchPrefixCmp([]byte("longlong")))
require.Equal(t, 0, g.MatchPrefixCmp([]byte("word")))
require.Equal(t, 0, g.MatchPrefixCmp([]byte("")))
require.Equal(t, 0, g.MatchPrefixCmp(nil))
require.Equal(t, 1, g.MatchPrefixCmp([]byte("wordnotmatch")))
require.Equal(t, -1, g.MatchPrefixCmp([]byte("longnotmatch")))
_, _ = g.Next(nil)

// next word is `longlongword %d`
@@ -175,13 +160,6 @@ func TestCompressDict1(t *testing.T) {
require.False(t, g.MatchPrefix([]byte("longnotmatch")))
require.True(t, g.MatchPrefix([]byte{}))

require.Equal(t, 0, g.MatchPrefixCmp([]byte(fmt.Sprintf("%d", i))))
require.Equal(t, 0, g.MatchPrefixCmp([]byte(expectPrefix)))
require.Equal(t, 0, g.MatchPrefixCmp([]byte(expectPrefix+"long")))
require.Equal(t, 0, g.MatchPrefixCmp([]byte(expectPrefix+"longword ")))
require.Equal(t, 1, g.MatchPrefixCmp([]byte("wordnotmatch")))
require.Equal(t, 1, g.MatchPrefixCmp([]byte("longnotmatch")))
require.Equal(t, 0, g.MatchPrefixCmp([]byte{}))
savePos := g.dataP
word, nextPos := g.Next(nil)
expected := fmt.Sprintf("%d longlongword %d", i, i)
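The assertions kept above rely only on the boolean MatchPrefix. As a tiny hedged sketch of the semantics they exercise (not part of the commit), assume a getter g positioned on a stored word "long"; per the MatchPrefix doc comment kept in decompress.go below, none of these calls move the offset:

    // Hypothetical position: g sits on the stored word "long".
    _ = g.MatchPrefix([]byte("long"))         // true: the word starts with "long"
    _ = g.MatchPrefix([]byte{})               // true: an empty prefix matches any word
    _ = g.MatchPrefix([]byte("longnotmatch")) // false: the word does not continue that way
    word, _ := g.Next(nil)                    // only Next/Skip advance to the next word
    _ = word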
174 changes: 11 additions & 163 deletions erigon-lib/seg/decompress.go
@@ -817,105 +817,6 @@ func (g *Getter) SkipUncompressed() (uint64, int) {
return g.dataP, int(wordLen)
}

// Match returns
//
// 1 if the word at current offset is greater than the buf
//
// -1 if it is less than the buf
//
// 0 if they are equal.
func (g *Getter) Match(buf []byte) int {
savePos := g.dataP
wordLen := g.nextPos(true)
wordLen-- // because when create huffman tree we do ++ , because 0 is terminator
lenBuf := len(buf)
if wordLen == 0 || int(wordLen) != lenBuf {
if g.dataBit > 0 {
g.dataP++
g.dataBit = 0
}
if lenBuf != 0 || lenBuf != int(wordLen) {
g.dataP, g.dataBit = savePos, 0
}
if lenBuf == int(wordLen) {
return 0
}
if lenBuf < int(wordLen) {
return -1
}
if lenBuf > int(wordLen) {
return 1
}
}

var bufPos int
// In the first pass, we only check patterns
for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) {
bufPos += int(pos) - 1
pattern := g.nextPattern()
compared := bytes.Compare(buf[bufPos:bufPos+len(pattern)], pattern)
if compared != 0 {
g.dataP, g.dataBit = savePos, 0
return compared
}
if lenBuf < bufPos+len(pattern) {
g.dataP, g.dataBit = savePos, 0
return -1
}
}
if g.dataBit > 0 {
g.dataP++
g.dataBit = 0
}
postLoopPos := g.dataP
g.dataP, g.dataBit = savePos, 0
g.nextPos(true /* clean */) // Reset the state of huffman decoder
// Second pass - we check spaces not covered by the patterns
var lastUncovered int
bufPos = 0
for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) {
bufPos += int(pos) - 1
if bufPos > lastUncovered {
dif := uint64(bufPos - lastUncovered)
compared := bytes.Compare(buf[lastUncovered:bufPos], g.data[postLoopPos:postLoopPos+dif])
if compared != 0 {
g.dataP, g.dataBit = savePos, 0
return compared
}
if lenBuf < bufPos {
g.dataP, g.dataBit = savePos, 0
return -1
}
postLoopPos += dif
}
lastUncovered = bufPos + len(g.nextPattern())
}
if int(wordLen) > lastUncovered {
dif := wordLen - uint64(lastUncovered)

compared := bytes.Compare(buf[lastUncovered:wordLen], g.data[postLoopPos:postLoopPos+dif])
if compared != 0 {
g.dataP, g.dataBit = savePos, 0
return compared
}
if lenBuf < int(wordLen) {
g.dataP, g.dataBit = savePos, 0
return -1
}
postLoopPos += dif
}
if lenBuf < int(wordLen) {
g.dataP, g.dataBit = savePos, 0
return -1
}
if lenBuf > int(wordLen) {
g.dataP, g.dataBit = savePos, 0
return 1
}
g.dataP, g.dataBit = postLoopPos, 0
return 0
}

// MatchPrefix only checks if the word at the current offset has a buf prefix. Does not move offset to the next word.
func (g *Getter) MatchPrefix(prefix []byte) bool {
savePos := g.dataP
@@ -1060,9 +961,7 @@ func (g *Getter) MatchCmp(buf []byte) int {
return cmp
}

// MatchPrefixCmp lexicographically compares given prefix with the word at the current offset in the file.
// returns 0 if buf == word, -1 if buf < word, 1 if buf > word
func (g *Getter) MatchPrefixCmp(prefix []byte) int {
func (g *Getter) MatchPrefixUncompressed(prefix []byte) bool {
savePos := g.dataP
defer func() {
g.dataP, g.dataBit = savePos, 0
@@ -1072,87 +971,36 @@ func (g *Getter) MatchPrefixCmp(prefix []byte) int {
wordLen-- // because when create huffman tree we do ++ , because 0 is terminator
prefixLen := len(prefix)
if wordLen == 0 && prefixLen != 0 {
return 1
return true
}
if prefixLen == 0 {
return 0
}

decoded := make([]byte, wordLen)
var bufPos int
// In the first pass, we only check patterns
// Only run this loop as far as the prefix goes, there is no need to check further
for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) {
bufPos += int(pos) - 1
if bufPos > prefixLen {
break
}
pattern := g.nextPattern()
copy(decoded[bufPos:], pattern)
return false
}

if g.dataBit > 0 {
g.dataP++
g.dataBit = 0
}
postLoopPos := g.dataP
g.dataP, g.dataBit = savePos, 0
g.nextPos(true /* clean */) // Reset the state of huffman decoder
// Second pass - we check spaces not covered by the patterns
var lastUncovered int
bufPos = 0
for pos := g.nextPos(false /* clean */); pos != 0 && lastUncovered < prefixLen; pos = g.nextPos(false) {
bufPos += int(pos) - 1
if bufPos > lastUncovered {
dif := uint64(bufPos - lastUncovered)
copy(decoded[lastUncovered:bufPos], g.data[postLoopPos:postLoopPos+dif])
postLoopPos += dif
}
lastUncovered = bufPos + len(g.nextPattern())
}
if prefixLen > lastUncovered && int(wordLen) > lastUncovered {
dif := wordLen - uint64(lastUncovered)
copy(decoded[lastUncovered:wordLen], g.data[postLoopPos:postLoopPos+dif])
// postLoopPos += dif
}
var cmp int
if prefixLen > int(wordLen) {
// TODO(racytech): handle this case
// e.g: prefix = 'aaacb'
// word = 'aaa'
cmp = bytes.Compare(prefix, decoded)
} else {
cmp = bytes.Compare(prefix, decoded[:prefixLen])
}
g.nextPos(true)

return cmp
return bytes.HasPrefix(g.data[g.dataP:g.dataP+wordLen], prefix)
}

func (g *Getter) MatchPrefixUncompressed(prefix []byte) int {
func (g *Getter) MatchCmpUncompressed(buf []byte) int {
savePos := g.dataP
defer func() {
g.dataP, g.dataBit = savePos, 0
}()

wordLen := g.nextPos(true /* clean */)
wordLen-- // because when create huffman tree we do ++ , because 0 is terminator
prefixLen := len(prefix)
if wordLen == 0 && prefixLen != 0 {
bufLen := len(buf)
if wordLen == 0 && bufLen != 0 {
return 1
}
if prefixLen == 0 {
return 0
if bufLen == 0 {
return -1
}

g.nextPos(true)

// if prefixLen > int(wordLen) {
// // TODO(racytech): handle this case
// // e.g: prefix = 'aaacb'
// // word = 'aaa'
// }

return bytes.Compare(prefix, g.data[g.dataP:g.dataP+wordLen])
return bytes.Compare(buf, g.data[g.dataP:g.dataP+wordLen])
}

// FastNext extracts a compressed word from current offset in the file
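For callers migrating off the removed Match and MatchPrefixCmp, here is a minimal hedged sketch of the surviving API, not part of the commit. It assumes d is a *Decompressor opened elsewhere; every function used appears in this diff or its tests, the string literals are illustrative, and the offset behaviour noted in the comments is inferred from the updated tests rather than guaranteed here:

    g := d.MakeGetter()
    for g.HasNext() {
        // Old: g.MatchPrefixCmp(prefix) == 0  ->  new: boolean g.MatchPrefix(prefix).
        // MatchPrefix only inspects the word at the current offset and never moves it.
        if !g.MatchPrefix([]byte("long")) {
            g.Skip() // cannot match, advance without decompressing the word
            continue
        }
        // Old: g.Match(buf) int  ->  new: g.MatchCmp(buf) int with the same -1/0/1 contract.
        // Judging by the updated tests, a 0 result leaves the offset just past the
        // matched word, while any other result restores it, hence the Skip below.
        if g.MatchCmp([]byte("longlongword 1")) != 0 {
            g.Skip()
        }
    }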
21 changes: 0 additions & 21 deletions erigon-lib/seg/decompress_bench_test.go
@@ -65,16 +65,6 @@ func BenchmarkDecompressSkip(b *testing.B) {
}
}

func BenchmarkDecompressMatch(b *testing.B) {
t := new(testing.T)
d := prepareDict(t)
defer d.Close()
g := d.MakeGetter()
for i := 0; i < b.N; i++ {
_ = g.Match([]byte("longlongword"))
}
}

func BenchmarkDecompressMatchCmp(b *testing.B) {
t := new(testing.T)
d := prepareDict(t)
@@ -99,17 +89,6 @@ func BenchmarkDecompressMatchPrefix(b *testing.B) {
}
}

func BenchmarkDecompressMatchPrefixCmp(b *testing.B) {
t := new(testing.T)
d := prepareDict(t)
defer d.Close()
g := d.MakeGetter()

for i := 0; i < b.N; i++ {
_ = g.MatchPrefixCmp([]byte("longlongword"))
}
}

func BenchmarkDecompressTorrent(t *testing.B) {
t.Skip()

2 changes: 1 addition & 1 deletion erigon-lib/seg/decompress_fuzz_test.go
@@ -83,7 +83,7 @@ func FuzzDecompressMatch(f *testing.F) {
t.Fatalf("MatchCmp: expected match: %v\n", expected)
}
g.Reset(savePos)
ok := g.Match(expected)
ok := g.MatchCmp(expected)
pos2 := g.dataP
if ok != 0 {
t.Fatalf("MatchBool: expected match: %v\n", expected)
47 changes: 3 additions & 44 deletions erigon-lib/seg/decompress_test.go
@@ -88,7 +88,7 @@ func TestDecompressMatchOK(t *testing.T) {
w := loremStrings[i]
if i%2 != 0 {
expected := fmt.Sprintf("%s %d", w, i)
cmp := g.Match([]byte(expected))
cmp := g.MatchCmp([]byte(expected))
if cmp != 0 {
t.Errorf("expexted match with %s", expected)
}
@@ -164,7 +164,7 @@ func TestDecompressMatchOKCondensed(t *testing.T) {
for g.HasNext() {
if i%2 != 0 {
expected := fmt.Sprintf("word-%d", i)
cmp := g.Match([]byte(expected))
cmp := g.MatchCmp([]byte(expected))
if cmp != 0 {
t.Errorf("expexted match with %s", expected)
}
@@ -188,7 +188,7 @@ func TestDecompressMatchNotOK(t *testing.T) {
for g.HasNext() {
w := loremStrings[i]
expected := fmt.Sprintf("%s %d", w, i+1)
cmp := g.Match([]byte(expected))
cmp := g.MatchCmp([]byte(expected))
if cmp == 0 {
t.Errorf("not expexted match with %s", expected)
} else {
@@ -241,47 +241,6 @@ func TestDecompressMatchPrefix(t *testing.T) {
}
}

func TestDecompressMatchPrefixCmp(t *testing.T) {
d := prepareLoremDict(t)
defer d.Close()
g := d.MakeGetter()
i := 0
skipCount := 0
for g.HasNext() {
w := loremStrings[i]
expected := []byte(fmt.Sprintf("%s %d", w, i+1))
expected = expected[:len(expected)/2]
cmp := g.MatchPrefixCmp(expected)
if cmp != 0 {
t.Errorf("expexted match with %s", expected)
}
g.Skip()
skipCount++
i++
}
if skipCount != i {
t.Errorf("something wrong with match logic")
}
g.Reset(0)
skipCount = 0
i = 0
for g.HasNext() {
w := loremStrings[i]
expected := []byte(fmt.Sprintf("%s %d", w, i+1))
expected = expected[:len(expected)/2]
if len(expected) > 0 {
expected[len(expected)-1]++
cmp := g.MatchPrefixCmp(expected)
if cmp == 0 {
t.Errorf("not expexted match with %s", expected)
}
}
g.Skip()
skipCount++
i++
}
}

func prepareLoremDictUncompressed(t *testing.T) *Decompressor {
t.Helper()
logger := log.New()
2 changes: 1 addition & 1 deletion erigon-lib/state/archive.go
@@ -61,7 +61,7 @@ func (g *getter) MatchPrefix(prefix []byte) bool {
if g.c&CompressKeys != 0 {
return g.Getter.MatchPrefix(prefix)
}
return g.Getter.MatchPrefixUncompressed(prefix) == 0
return g.Getter.MatchPrefixUncompressed(prefix)
}

func (g *getter) Next(buf []byte) ([]byte, uint64) {
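Putting the two new uncompressed helpers together, here is a hedged, test-style sketch (not part of the commit) that would sit alongside the existing tests in erigon-lib/seg. It builds on the prepareLoremDictUncompressed helper shown above, the literal words are purely illustrative, and both helpers restore the getter offset before returning, as their deferred resets in decompress.go show:

    func TestUncompressedMatchersSketch(t *testing.T) {
        d := prepareLoremDictUncompressed(t) // words stored without compression
        defer d.Close()
        g := d.MakeGetter()
        for g.HasNext() {
            // Boolean prefix probe over the raw word bytes (the bytes.HasPrefix path).
            if g.MatchPrefixUncompressed([]byte("lorem")) {
                // Full lexicographic compare against the raw word; 0 means equal.
                if g.MatchCmpUncompressed([]byte("lorem 1")) == 0 {
                    t.Log("exact word found") // illustrative only
                }
            }
            g.Skip() // neither helper advances the offset, so move on explicitly
        }
    }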
