From 7c81fa4f70d4c17b2dd8bbb6128ac9b1b90b062e Mon Sep 17 00:00:00 2001 From: Steven Hartland Date: Tue, 6 Dec 2022 16:05:37 +0000 Subject: [PATCH] fix: ecma ranges with set terminator Fix ECMAScript un-escaped literal '-' when followed by predefined character sets. Also: * Fixed missing error check on parseProperty() call. * Use addChar(ch) helper instead of addRange(ch, ch). Fixes #54 --- go.mod | 2 ++ go.sum | 17 +++++++++++++++ regexp_ecma_test.go | 50 +++++++++++++++++++++++++++++++++++++++++++++ syntax/parser.go | 33 +++++++++++++++++++++++------- 4 files changed, 95 insertions(+), 7 deletions(-) create mode 100644 go.sum create mode 100644 regexp_ecma_test.go diff --git a/go.mod b/go.mod index 9f7f391..d5db1e4 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,5 @@ module github.com/dlclark/regexp2 go 1.13 + +require github.com/stretchr/testify v1.8.1 diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..2ec90f7 --- /dev/null +++ b/go.sum @@ -0,0 +1,17 @@ +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk= +github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/regexp_ecma_test.go b/regexp_ecma_test.go new file mode 100644 index 0000000..11f8827 --- /dev/null +++ b/regexp_ecma_test.go @@ -0,0 +1,50 @@ +package regexp2_test + +import ( + "testing" + + "github.com/dlclark/regexp2" + "github.com/stretchr/testify/require" +) + +func TestECMA_basic(t *testing.T) { + tests := map[string]struct { + expr string + data string + want []string + }{ + "charset": { + expr: `[a-c]`, + data: "abcd", + want: []string{"a", "b", "c"}, + }, + "charset-set": { + expr: `[a-\s]`, + data: "a-b cd", + want: []string{"a", "-", " "}, + }, + } + + for name, tt := range tests { + t.Run(name, func(t *testing.T) { + re, err := regexp2.Compile(tt.expr, regexp2.ECMAScript) + require.NoError(t, err) + + match, err := re.FindStringMatch(tt.data) + require.NoError(t, err) + + var res []string + for match != nil { + for _, g := range match.Groups() { + for _, c := range g.Captures { + res = append(res, c.String()) + } + } + + match, err = re.FindNextMatch(match) + require.NoError(t, err) + } + require.Equal(t, tt.want, res) + }) + } +} diff --git a/syntax/parser.go b/syntax/parser.go index 9dc6e31..4e1d4ad 100644 --- a/syntax/parser.go +++ b/syntax/parser.go @@ -1427,7 +1427,7 @@ func (p *parser) scanCapname() string { return string(p.pattern[startpos:p.textpos()]) } -//Scans contents of [] (not including []'s), and converts to a set. +// Scans contents of [] (not including []'s), and converts to a set. func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) { ch := '\x00' chPrev := '\x00' @@ -1467,7 +1467,11 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) { case 'D', 'd': if !scanOnly { if inRange { - return nil, p.getErr(ErrBadClassInCharRange, ch) + if !p.useOptionE() { + return nil, p.getErr(ErrBadClassInCharRange, ch) + } + cc.addChar('-') + cc.addChar(chPrev) } cc.addDigit(p.useOptionE() || p.useRE2(), ch == 'D', p.patternRaw) } @@ -1476,7 +1480,11 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) { case 'S', 's': if !scanOnly { if inRange { - return nil, p.getErr(ErrBadClassInCharRange, ch) + if !p.useOptionE() { + return nil, p.getErr(ErrBadClassInCharRange, ch) + } + cc.addChar('-') + cc.addChar(chPrev) } cc.addSpace(p.useOptionE(), p.useRE2(), ch == 'S') } @@ -1485,7 +1493,11 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) { case 'W', 'w': if !scanOnly { if inRange { - return nil, p.getErr(ErrBadClassInCharRange, ch) + if !p.useOptionE() { + return nil, p.getErr(ErrBadClassInCharRange, ch) + } + cc.addChar('-') + cc.addChar(chPrev) } cc.addWord(p.useOptionE() || p.useRE2(), ch == 'W') @@ -1495,7 +1507,11 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) { case 'p', 'P': if !scanOnly { if inRange { - return nil, p.getErr(ErrBadClassInCharRange, ch) + if !p.useOptionE() { + return nil, p.getErr(ErrBadClassInCharRange, ch) + } + cc.addChar('-') + cc.addChar(chPrev) } prop, err := p.parseProperty() if err != nil { @@ -1503,14 +1519,17 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) { } cc.addCategory(prop, (ch != 'p'), caseInsensitive, p.patternRaw) } else { - p.parseProperty() + _, err := p.parseProperty() + if err != nil { + return nil, err + } } continue case '-': if !scanOnly { - cc.addRange(ch, ch) + cc.addChar(ch) } continue