From a332ca6d26a98872abef466e7b56430c9bc2954c Mon Sep 17 00:00:00 2001
From: adam-azarchs <22033990+adam-azarchs@users.noreply.github.com>
Date: Thu, 3 Jan 2019 14:02:46 -0800
Subject: [PATCH] Support escaping in mro string literals.

Use json syntax for encoding strings. Allow decoding from golang escaping
format, which is more permissive.
---
 martian/syntax/formatter.go          |  93 ++++++++++++++++++++++-
 martian/syntax/formatter_test.go     |   6 +-
 martian/syntax/parsenum.go           |  17 +++++
 martian/syntax/string_intern.go      | 109 ++++++++++++++++++++++++++-
 martian/syntax/string_intern_test.go |  79 +++++++++++++++++++
 martian/syntax/tokenizer.go          |   3 +-
 martian/syntax/tokenizer_test.go     |  13 ++++
 7 files changed, 311 insertions(+), 9 deletions(-)

diff --git a/martian/syntax/formatter.go b/martian/syntax/formatter.go
index dca21543..66d44d50 100644
--- a/martian/syntax/formatter.go
+++ b/martian/syntax/formatter.go
@@ -14,6 +14,7 @@ import (
 	"path/filepath"
 	"sort"
 	"strings"
+	"unicode/utf8"
 )
 
 const (
@@ -109,6 +110,91 @@ func (self *printer) String() string {
 	return self.buf.String()
 }
 
+// QuoteString writes a string, quoted and escaped as json.
+//
+// The reason we don't just use json.Marshal here is because the default
+// encoder html-escapes strings, and disabling that by using json.Encoder
+// puts a trailing newline at the end of the string, which is also bad for
+// this use case. Plus this way we can bypass a lot of reflection junk.
+//
+// This function is mostly copy/pasted from the unexported go standard library
+// json encoder implementation (see
+// https://github.com/golang/go/blob/release-branch.go1.11/src/encoding/json/encode.go#L884)
+func quoteString(w stringWriter, s string) {
+	w.WriteByte('"')
+	const hex = "0123456789abcdef"
+	start := 0
+	for i := 0; i < len(s); {
+		// Single-byte code points.
+		if b := s[i]; b < utf8.RuneSelf {
+			if b >= ' ' && b != '"' && b != '\\' {
+				i++
+				continue
+			}
+			if start < i {
+				w.WriteString(s[start:i])
+			}
+			switch b {
+			case '\\', '"':
+				w.WriteByte('\\')
+				w.WriteByte(b)
+			case '\n':
+				w.WriteByte('\\')
+				w.WriteByte('n')
+			case '\r':
+				w.WriteByte('\\')
+				w.WriteByte('r')
+			case '\t':
+				w.WriteByte('\\')
+				w.WriteByte('t')
+			default:
+				// This encodes bytes < 0x20 except for \t, \n and \r.
+				w.WriteString(`\u00`)
+				w.WriteByte(hex[b>>4])
+				w.WriteByte(hex[b&0xF])
+			}
+			i++
+			start = i
+			continue
+		}
+		// Multi-byte code points.
+		c, size := utf8.DecodeRuneInString(s[i:])
+		if c == utf8.RuneError && size == 1 {
+			// Transform invalid code points into unicode
+			// "replacement character".
+			if start < i {
+				w.WriteString(s[start:i])
+			}
+			w.WriteString(`\ufffd`)
+			i += size
+			start = i
+			continue
+		}
+		// U+2028 is LINE SEPARATOR.
+		// U+2029 is PARAGRAPH SEPARATOR.
+		// They are both technically valid characters in JSON strings,
+		// but don't work in JSONP, which has to be evaluated as JavaScript,
+		// and can lead to security holes there. It is valid JSON to
+		// escape them, so we do so unconditionally.
+		// See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion.
+		if c == '\u2028' || c == '\u2029' {
+			if start < i {
+				w.WriteString(s[start:i])
+			}
+			w.WriteString(`\u202`)
+			w.WriteByte(hex[c&0xF])
+			i += size
+			start = i
+			continue
+		}
+		i += size
+	}
+	if start < len(s) {
+		w.WriteString(s[start:])
+	}
+	w.WriteByte('"')
+}
+
 //
 // Expression
 //
@@ -120,7 +206,12 @@ func (self *ValExp) format(w stringWriter, prefix string) {
 	} else if self.Kind == KindFloat {
 		fmt.Fprintf(w, "%g", self.Value)
 	} else if self.Kind == KindString {
-		fmt.Fprintf(w, "\"%s\"", self.Value)
+		switch s := self.Value.(type) {
+		case string:
+			quoteString(w, s)
+		default:
+			fmt.Fprintf(w, "%q", self.Value)
+		}
 	} else if self.Kind == KindMap {
 		self.formatMap(w, prefix)
 	} else if self.Kind == KindArray {
diff --git a/martian/syntax/formatter_test.go b/martian/syntax/formatter_test.go
index ccdc7a9c..dcb24572 100644
--- a/martian/syntax/formatter_test.go
+++ b/martian/syntax/formatter_test.go
@@ -92,9 +92,9 @@ func TestFormatValueExpression(t *testing.T) {
 	Equal(t, buff.String(), "\"blah\"", "Double quote a string.")
 	buff.Reset()
 
-	ve.Value = "\"blah\""
+	ve.Value = `"blah"`
 	ve.format(&buff, "")
-	Equal(t, buff.String(), "\"\"blah\"\"", "Double quote a double-quoted string.")
+	Equal(t, buff.String(), `"\"blah\""`, "Double quote a double-quoted string.")
 	buff.Reset()
 
 	//
@@ -218,7 +218,7 @@ pipeline AWESOME(
 call ADD_KEY1(
     key = self.key1,
     value = self.value1,
-    failfile = "fail1",
+    failfile = "fail \n\"1\"",
     start = null,
 ) using (
     local = true,
diff --git a/martian/syntax/parsenum.go b/martian/syntax/parsenum.go
index 75eba8fb..1e3fcbee 100644
--- a/martian/syntax/parsenum.go
+++ b/martian/syntax/parsenum.go
@@ -60,3 +60,20 @@ func parseFloat(s []byte) float64 {
 	}
 	return f
 }
+
+func unhex(c byte) byte {
+	switch {
+	case '0' <= c && c <= '9':
+		return c - '0'
+	case 'a' <= c && c <= 'f':
+		return c - 'a' + 10
+	case 'A' <= c && c <= 'F':
+		return c - 'A' + 10
+	default:
+		panic(string(append([]byte("Invalid character "), c)))
+	}
+}
+
+func parseHexByte(c0, c1 byte) byte {
+	return (unhex(c0) << 4) + unhex(c1)
+}
diff --git a/martian/syntax/string_intern.go b/martian/syntax/string_intern.go
index 66044a8d..255eacb6 100644
--- a/martian/syntax/string_intern.go
+++ b/martian/syntax/string_intern.go
@@ -4,7 +4,10 @@
 
 package syntax
 
-import "bytes"
+import (
+	"bytes"
+	"unicode/utf8"
+)
 
 type stringIntern struct {
 	internSet map[string]string
@@ -65,12 +68,110 @@ func (store *stringIntern) Get(value []byte) string {
 	}
 }
 
-var quoteBytes = []byte(`"`)
+func runeError() []byte {
+	b := make([]byte, 3)
+	utf8.EncodeRune(b, utf8.RuneError)
+	return b
+}
+
+func unquoteBytes(value []byte) []byte {
+	n := len(value)
+	if n < 2 || value[0] != '"' || value[n-1] != '"' {
+		// Should be prevented by the tokenizer.
+		panic("string was not quoted: " + string(value))
+	}
+	value = value[1 : n-1]
+	if !bytes.ContainsAny(value, `\"`) {
+		// Trivial value, avoid allocation.
+		return value
+	}
+
+	buf := make([]byte, 0, len(value)+2*utf8.UTFMax)
+	for len(value) > 0 {
+		switch c := value[0]; {
+		case c >= utf8.RuneSelf:
+			// Multibyte character.
+			_, size := utf8.DecodeRune(value)
+			buf = append(buf, value[:size]...)
+			value = value[size:]
+		case c != '\\':
+			buf = append(buf, value[0])
+			value = value[1:]
+		default:
+			// Escape
+			c2 := value[1]
+			value = value[2:]
+			switch c2 {
+			// easy cases
+			case 'a':
+				buf = append(buf, '\a')
+			case 'b':
+				buf = append(buf, '\b')
+			case 'f':
+				buf = append(buf, '\f')
+			case 'n':
+				buf = append(buf, '\n')
+			case 'r':
+				buf = append(buf, '\r')
+			case 't':
+				buf = append(buf, '\t')
+			case 'v':
+				buf = append(buf, '\v')
+			// Harder cases
+			case 'x':
+				// one-byte hex-encoded unicode.
+				buf = append(buf, parseHexByte(value[0], value[1]))
+				value = value[2:]
+			case 'u':
+				// two-byte hex-encoded unicode.
+				if len(value) < 4 {
+					buf = append(buf, runeError()...)
+					value = value[len(value):]
+				} else {
+					var enc [4]byte
+					n := utf8.EncodeRune(enc[:],
+						rune(parseHexByte(value[2], value[3]))+
+							(rune(parseHexByte(value[0], value[1]))<<8))
+					buf = append(buf, enc[:n]...)
+					value = value[4:]
+				}
+			case 'U':
+				// four-byte hex-encoded unicode.
+				if len(value) < 8 {
+					buf = append(buf, runeError()...)
+					value = value[len(value):]
+				} else {
+					var enc [4]byte
+					n := utf8.EncodeRune(enc[:],
+						rune(parseHexByte(value[6], value[7]))+
+							(rune(parseHexByte(value[4], value[5]))<<8)+
+							(rune(parseHexByte(value[2], value[3]))<<16)+
+							(rune(parseHexByte(value[0], value[1]))<<24))
+					buf = append(buf, enc[:n]...)
+					value = value[8:]
+				}
+			case '0', '1', '2', '3', '4', '5', '6', '7':
+				// one-byte octal unicode
+				if value[1] < '0' || value[1] > '7' || value[0] < '0' || value[0] > '7' {
+					buf = append(buf, runeError()...)
+					value = value[len(value):]
+				} else {
+					buf = append(buf, ((c2-'0')<<6)+((value[0]-'0')<<3)+(value[1]-'0'))
+					value = value[2:]
+				}
+			default:
+				// \, ", etc.
+				buf = append(buf, c2)
+			}
+		}
+	}
+	return buf
+}
 
 func (store *stringIntern) unquote(value []byte) string {
-	return store.Get(bytes.Replace(value, quoteBytes, nil, -1))
+	return store.Get(unquoteBytes(value))
 }
 
 func unquote(qs []byte) string {
-	return string(bytes.Replace(qs, quoteBytes, nil, -1))
+	return string(unquoteBytes(qs))
 }
diff --git a/martian/syntax/string_intern_test.go b/martian/syntax/string_intern_test.go
index 8e509414..f0c035cd 100644
--- a/martian/syntax/string_intern_test.go
+++ b/martian/syntax/string_intern_test.go
@@ -3,7 +3,10 @@
 package syntax
 
 import (
+	"bytes"
+	"encoding/json"
 	"testing"
+	"testing/quick"
 )
 
 func TestStringIntern(t *testing.T) {
@@ -27,3 +30,79 @@ func TestStringIntern(t *testing.T) {
 		t.Errorf("Bytes key lookup AllocsPerRun = %f, want 0", n)
 	}
 }
+
+func TestUnquote(t *testing.T) {
+	check := func(t *testing.T, input, expect string) {
+		t.Helper()
+		if s := unquote([]byte(input)); s != expect {
+			t.Errorf("Expected: %q, got %q",
+				expect, s)
+		}
+	}
+	check(t,
+		`"\"hey\" is\\\n\tfor \U0001f40es"`,
+		"\"hey\" is\\\n\tfor \U0001f40es")
+	check(t,
+		`"\xf2Y\xbb\x8a,\xd0(\xf0\xff=\x8c\xbd"`,
+		"\xf2Y\xbb\x8a,\xd0(\xf0\xff=\x8c\xbd")
+	check(t, `"multibyte \"ဤ\" character"`, "multibyte \"\xe1\x80\xa4\" character")
+	check(t, `"Octal is \167eird"`, "Octal is weird")
+	check(t, `"Hex is \x6eormal"`, "Hex is normal")
+	check(t, `"Hex is \x6Eormal"`, "Hex is normal")
+	check(t, `"Hex is \u0146ormal"`, "Hex is \u0146ormal")
+	check(t, `"We căn use anỿ valid utf-8 ☺"`, "We căn use anỿ valid utf-8 ☺")
+	check(t, `"Case sensitivity is \U0001f4A9"`, "Case sensitivity is \U0001f4A9")
+	check(t, `"Control\a\b\f\n\r\t\v characters"`, "Control\a\b\f\n\r\t\v characters")
+	check(t, `"Invalid \u123"`, "Invalid \ufffd")
+}
+
+// Fuzz test for unquote.
+func TestUnquoteFuzz(t *testing.T) {
+	t.Parallel()
+	if err := quick.CheckEqual(func(s string) string {
+		return s
+	}, func(s string) string {
+		var buf bytes.Buffer
+		enc := json.NewEncoder(&buf)
+		enc.SetEscapeHTML(false)
+		enc.Encode(s)
+		return unquote(buf.Bytes()[:buf.Len()-1])
+	}, nil); err != nil {
+		t.Error(err)
+	}
+}
+
+// Fuzzer test for format/decode round trip.
+func TestUnquoteFormat(t *testing.T) {
+	t.Parallel()
+	enc := func(s string) []byte {
+		var buf bytes.Buffer
+		quoteString(&buf, s)
+		return buf.Bytes()
+	}
+	roundTrip := func(s string) []byte {
+		return enc(unquote(enc(s)))
+	}
+	if err := quick.CheckEqual(enc, roundTrip, nil); err != nil {
+		t.Error(err)
+	}
+	jsonEnc := func(s string) []byte {
+		var buf bytes.Buffer
+		enc := json.NewEncoder(&buf)
+		enc.SetEscapeHTML(false)
+		enc.Encode(s)
+		return buf.Bytes()[:buf.Len()-1]
+	}
+	if err := quick.CheckEqual(enc, jsonEnc, nil); err != nil {
+		t.Error(err)
+	}
+	check := func(t *testing.T, s string) {
+		t.Helper()
+		if e, a := jsonEnc(s), enc(s); !bytes.Equal(e, a) {
+			t.Errorf("Expected %q -> %q, got %q", s, e, a)
+		}
+	}
+	check(t, "\"hey\" is\\\n\tfor \U0001f40es")
+	check(t, "Control\a\b\f\n\r\t\v \u2029 characters")
+	check(t, "Invalid character \x88\xee")
+}
diff --git a/martian/syntax/tokenizer.go b/martian/syntax/tokenizer.go
index de71c621..153e9f89 100644
--- a/martian/syntax/tokenizer.go
+++ b/martian/syntax/tokenizer.go
@@ -32,7 +32,8 @@ var rules = [...]rule{
 	{regexp.MustCompile(`^;`), SEMICOLON},
 	{regexp.MustCompile(`^,`), COMMA},
 	{regexp.MustCompile(`^\.`), DOT},
-	{regexp.MustCompile(`^"[^\"]*"`), LITSTRING}, // double-quoted strings. escapes not supported
+	// double-quoted strings with escaping.
+	{regexp.MustCompile(`^"(?:[^\\"]|\\[abfnrtv\\"]|\\[0-7]{3}|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})*"`), LITSTRING},
 	{regexp.MustCompile(`^filetype\b`), FILETYPE},
 	{regexp.MustCompile(`^stage\b`), STAGE},
 	{regexp.MustCompile(`^pipeline\b`), PIPELINE},
diff --git a/martian/syntax/tokenizer_test.go b/martian/syntax/tokenizer_test.go
index 67fa5220..cee93ee3 100644
--- a/martian/syntax/tokenizer_test.go
+++ b/martian/syntax/tokenizer_test.go
@@ -19,6 +19,19 @@ func TestNextToken(t *testing.T) {
 	}
 	check("# this is a comment\n", COMMENT)
 	check(`"this/is/a/string"`, LITSTRING)
+	check(`""`, LITSTRING)
+	check(`"this\nis\na\nstring"`, LITSTRING)
+	check(`"this \"is\" a string"`, LITSTRING)
+	check(`"\"this \"is\" a string\""`, LITSTRING)
+	check(`"this is a \U0001F9F5"`, LITSTRING)
+	check(`"this is \u0146ot bad"`, LITSTRING)
+	check(`"this is \167eird but fine"`, LITSTRING)
+	check(`\"this is not a string"`, INVALID)
+	check(`"\"this\" is not a string\"`, INVALID)
+	check(`"Invalid unicode\u001!"`, INVALID)
+	check(`"Invalid unicode\U0001!"`, INVALID)
+	check(`"Invalid unicode\x1!"`, INVALID)
+	check(`"Invalid unicode\xaz"`, INVALID)
 	check(`@include`, INCLUDE_DIRECTIVE)
 	check(`_INTERNAL_PIPELINE`, ID)
 	check(`_type_name`, ID)
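
Editor's note: the fuzz tests in this patch rely on one compatibility property: every escape the json-style encoder emits must be accepted by the more permissive golang-style decoder. Because quoteString and unquoteBytes are unexported, the following is a minimal standalone sketch of that round trip using standard-library stand-ins (json.Encoder with HTML escaping disabled in place of quoteString, strconv.Unquote in place of unquoteBytes); the sample string is invented for illustration and is not taken from the patch.

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"strconv"
)

func main() {
	// Arbitrary sample covering a quote, control characters, U+2029, and a
	// multi-byte rune; chosen only to exercise several escape forms.
	original := "fail \n\"1\"\twith \u2029 and \U0001F40E"

	// Encode the way the formatter does: JSON string syntax, no HTML escaping.
	var buf bytes.Buffer
	enc := json.NewEncoder(&buf)
	enc.SetEscapeHTML(false)
	if err := enc.Encode(original); err != nil {
		panic(err)
	}
	// Encode appends a trailing newline; trim it to leave just the quoted literal.
	quoted := string(bytes.TrimRight(buf.Bytes(), "\n"))
	fmt.Println("quoted:", quoted)

	// Decode with Go-style unquoting, which accepts a superset of the escapes
	// the JSON encoder can produce (\n, \r, \t, \", \\, \uNNNN, ...).
	decoded, err := strconv.Unquote(quoted)
	if err != nil {
		panic(err)
	}
	fmt.Println("round trip ok:", decoded == original)
}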