Skip to content

Commit

Permalink
Support escaping in mro string literals.
Browse files Browse the repository at this point in the history
Use json syntax for encoding strings.  Allow decoding from golang
escaping format, which is more permissive.
  • Loading branch information
adam-azarchs authored Jan 3, 2019
1 parent 847dc18 commit a332ca6
Show file tree
Hide file tree
Showing 7 changed files with 311 additions and 9 deletions.
93 changes: 92 additions & 1 deletion martian/syntax/formatter.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"path/filepath"
"sort"
"strings"
"unicode/utf8"
)

const (
Expand Down Expand Up @@ -109,6 +110,91 @@ func (self *printer) String() string {
return self.buf.String()
}

// quoteString emits s to w as a double-quoted string literal, escaped
// according to JSON rules.
//
// json.Marshal is deliberately avoided: its default encoder HTML-escapes
// the text, and the json.Encoder alternative (with HTML escaping disabled)
// terminates its output with a newline, which is unwanted here.  Writing
// the bytes directly also skips the reflection machinery.
//
// The logic mirrors the unexported string encoder of the Go standard
// library's encoding/json package (see
// https://github.com/golang/go/blob/release-branch.go1.11/src/encoding/json/encode.go#L884)
func quoteString(w stringWriter, s string) {
	const hexDigits = "0123456789abcdef"

	// s[flushed:pos] is literal text that needs no escaping and has not
	// been written yet; it is copied through in one piece whenever an
	// escape (or the end of the string) is reached.
	flushed := 0
	flush := func(upTo int) {
		if flushed < upTo {
			w.WriteString(s[flushed:upTo])
		}
	}

	w.WriteByte('"')
	for pos := 0; pos < len(s); {
		ch := s[pos]

		if ch >= utf8.RuneSelf {
			// Start of a multi-byte sequence.
			r, width := utf8.DecodeRuneInString(s[pos:])
			switch {
			case r == utf8.RuneError && width == 1:
				// Invalid encoding: substitute the unicode
				// "replacement character".
				flush(pos)
				w.WriteString(`\ufffd`)
				flushed = pos + width
			case r == '\u2028' || r == '\u2029':
				// LINE SEPARATOR and PARAGRAPH SEPARATOR are legal
				// inside JSON strings but break JSONP, which is
				// evaluated as JavaScript and can open security
				// holes there.  Escaping them is valid JSON, so it
				// is done unconditionally.
				// See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion.
				flush(pos)
				w.WriteString(`\u202`)
				w.WriteByte(hexDigits[r&0xF])
				flushed = pos + width
			}
			pos += width
			continue
		}

		// Single-byte code point that can be passed through untouched.
		if ch >= ' ' && ch != '"' && ch != '\\' {
			pos++
			continue
		}

		// Single-byte code point that must be escaped.
		flush(pos)
		var short byte
		switch ch {
		case '"', '\\':
			short = ch
		case '\n':
			short = 'n'
		case '\r':
			short = 'r'
		case '\t':
			short = 't'
		}
		if short != 0 {
			w.WriteByte('\\')
			w.WriteByte(short)
		} else {
			// Remaining bytes below 0x20 (everything except \t, \n
			// and \r) have no short form.
			w.WriteString(`\u00`)
			w.WriteByte(hexDigits[ch>>4])
			w.WriteByte(hexDigits[ch&0xF])
		}
		pos++
		flushed = pos
	}
	flush(len(s))
	w.WriteByte('"')
}

//
// Expression
//
Expand All @@ -120,7 +206,12 @@ func (self *ValExp) format(w stringWriter, prefix string) {
} else if self.Kind == KindFloat {
fmt.Fprintf(w, "%g", self.Value)
} else if self.Kind == KindString {
fmt.Fprintf(w, "\"%s\"", self.Value)
switch s := self.Value.(type) {
case string:
quoteString(w, s)
default:
fmt.Fprintf(w, "%q", self.Value)
}
} else if self.Kind == KindMap {
self.formatMap(w, prefix)
} else if self.Kind == KindArray {
Expand Down
6 changes: 3 additions & 3 deletions martian/syntax/formatter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,9 @@ func TestFormatValueExpression(t *testing.T) {
Equal(t, buff.String(), "\"blah\"", "Double quote a string.")
buff.Reset()

ve.Value = "\"blah\""
ve.Value = `"blah"`
ve.format(&buff, "")
Equal(t, buff.String(), "\"\"blah\"\"", "Double quote a double-quoted string.")
Equal(t, buff.String(), `"\"blah\""`, "Double quote a double-quoted string.")
buff.Reset()

//
Expand Down Expand Up @@ -218,7 +218,7 @@ pipeline AWESOME(
call ADD_KEY1(
key = self.key1,
value = self.value1,
failfile = "fail1",
failfile = "fail \n\"1\"",
start = null,
) using (
local = true,
Expand Down
17 changes: 17 additions & 0 deletions martian/syntax/parsenum.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,20 @@ func parseFloat(s []byte) float64 {
}
return f
}

// unhex returns the numeric value (0-15) of the ASCII hexadecimal digit c.
// Both lower-case and upper-case letters are accepted.  Any other byte
// causes a panic whose message is "Invalid character " followed by the
// offending byte.
func unhex(c byte) byte {
	if c >= '0' && c <= '9' {
		return c - '0'
	}
	if c >= 'a' && c <= 'f' {
		return 10 + (c - 'a')
	}
	if c >= 'A' && c <= 'F' {
		return 10 + (c - 'A')
	}
	panic("Invalid character " + string([]byte{c}))
}

// parseHexByte combines two ASCII hexadecimal digits into one byte: c0
// supplies the high nibble and c1 the low nibble.  Each digit is converted
// with unhex, c0 first.
func parseHexByte(c0, c1 byte) byte {
	high := unhex(c0)
	low := unhex(c1)
	return (high << 4) + low
}
109 changes: 105 additions & 4 deletions martian/syntax/string_intern.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@

package syntax

import "bytes"
import (
"bytes"
"unicode/utf8"
)

type stringIntern struct {
internSet map[string]string
Expand Down Expand Up @@ -65,12 +68,110 @@ func (store *stringIntern) Get(value []byte) string {
}
}

var quoteBytes = []byte(`"`)
func runeError() []byte {
b := make([]byte, 3)
utf8.EncodeRune(b, utf8.RuneError)
return b
}

// unquoteBytes removes the surrounding double quotes from a string literal
// token and decodes the backslash escapes inside it.
//
// Recognised escapes are \a \b \f \n \r \t \v, three-digit octal
// (\000 through \377, one raw byte), \xHH (one raw byte), and \uHHHH /
// \UHHHHHHHH (a code point, appended as UTF-8).  A backslash followed by
// any other byte yields that byte, which covers \\ and \".
//
// Malformed escapes decode to U+FFFD instead of panicking or wrapping: a
// trailing backslash, a truncated escape (which consumes the rest of the
// input), non-hexadecimal digits, or an octal value above \377.  Code
// points that are not valid (surrogates, values above U+10FFFF) also become
// U+FFFD.
//
// When the literal contains neither a backslash nor an embedded quote the
// returned slice aliases value; otherwise a newly allocated slice is
// returned.
//
// Panics if value is not wrapped in double quotes.
func unquoteBytes(value []byte) []byte {
	n := len(value)
	if n < 2 || value[0] != '"' || value[n-1] != '"' {
		// Should be prevented by the tokenizer.
		panic("string was not quoted: " + string(value))
	}
	value = value[1 : n-1]
	if !bytes.ContainsAny(value, `\"`) {
		// Trivial value, avoid allocation.
		return value
	}

	buf := make([]byte, 0, len(value)+2*utf8.UTFMax)
	for len(value) > 0 {
		c := value[0]
		switch {
		case c >= utf8.RuneSelf:
			// Multibyte character: copy the whole sequence through.
			// Invalid bytes have size 1 and are copied unchanged.
			_, size := utf8.DecodeRune(value)
			buf = append(buf, value[:size]...)
			value = value[size:]
		case c != '\\':
			buf = append(buf, c)
			value = value[1:]
		default:
			buf, value = appendUnescaped(buf, value[1:])
		}
	}
	return buf
}

// appendUnescaped decodes a single escape sequence.  rest holds the input
// immediately after the backslash.  It returns buf with the decoded bytes
// appended, and the input that remains after the escape.
func appendUnescaped(buf, rest []byte) ([]byte, []byte) {
	// UTF-8 encoding of U+FFFD, used for every malformed escape.
	const replacement = "\ufffd"

	if len(rest) == 0 {
		// Backslash was the last byte of the literal.
		return append(buf, replacement...), rest
	}
	c := rest[0]
	rest = rest[1:]

	switch c {
	case 'a':
		return append(buf, '\a'), rest
	case 'b':
		return append(buf, '\b'), rest
	case 'f':
		return append(buf, '\f'), rest
	case 'n':
		return append(buf, '\n'), rest
	case 'r':
		return append(buf, '\r'), rest
	case 't':
		return append(buf, '\t'), rest
	case 'v':
		return append(buf, '\v'), rest

	case 'x', 'u', 'U':
		width := 2
		switch c {
		case 'u':
			width = 4
		case 'U':
			width = 8
		}
		if len(rest) < width {
			// Truncated: nothing after it can be trusted.
			return append(buf, replacement...), rest[len(rest):]
		}
		v, ok := decodeHexDigits(rest[:width])
		rest = rest[width:]
		if !ok {
			return append(buf, replacement...), rest
		}
		if c == 'x' {
			// \x denotes a raw byte, not a code point.
			return append(buf, byte(v)), rest
		}
		// utf8.UTFMax bytes are always enough; \uHHHH alone can need 3.
		// EncodeRune maps invalid code points to U+FFFD by itself.
		var enc [utf8.UTFMax]byte
		size := utf8.EncodeRune(enc[:], rune(v))
		return append(buf, enc[:size]...), rest

	case '0', '1', '2', '3', '4', '5', '6', '7':
		if len(rest) < 2 {
			return append(buf, replacement...), rest[len(rest):]
		}
		d1, d2 := rest[0], rest[1]
		rest = rest[2:]
		// A leading digit above 3 would exceed \377 and overflow a byte.
		if c > '3' || d1 < '0' || d1 > '7' || d2 < '0' || d2 > '7' {
			return append(buf, replacement...), rest
		}
		octal := (c-'0')<<6 | (d1-'0')<<3 | (d2 - '0')
		return append(buf, octal), rest

	default:
		// \, ", etc.
		return append(buf, c), rest
	}
}

// decodeHexDigits interprets digits (at most 8 bytes) as a big-endian
// hexadecimal number.  The second result is false if any byte is not an
// ASCII hexadecimal digit.
func decodeHexDigits(digits []byte) (uint32, bool) {
	var v uint32
	for _, d := range digits {
		var nibble byte
		switch {
		case '0' <= d && d <= '9':
			nibble = d - '0'
		case 'a' <= d && d <= 'f':
			nibble = d - 'a' + 10
		case 'A' <= d && d <= 'F':
			nibble = d - 'A' + 10
		default:
			return 0, false
		}
		v = v<<4 | uint32(nibble)
	}
	return v, true
}

// unquote decodes the quoted string literal in value with unquoteBytes
// (stripping the surrounding quotes and resolving escape sequences) and
// returns the string that store.Get yields for the decoded bytes.
//
// Only the escape-aware path is kept: merely deleting quote characters
// would leave backslash escapes undecoded.
func (store *stringIntern) unquote(value []byte) string {
	return store.Get(unquoteBytes(value))
}

func unquote(qs []byte) string {
return string(bytes.Replace(qs, quoteBytes, nil, -1))
return string(unquoteBytes(qs))
}
79 changes: 79 additions & 0 deletions martian/syntax/string_intern_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@
package syntax

import (
"bytes"
"encoding/json"
"testing"
"testing/quick"
)

func TestStringIntern(t *testing.T) {
Expand All @@ -27,3 +30,79 @@ func TestStringIntern(t *testing.T) {
t.Errorf("Bytes key lookup AllocsPerRun = %f, want 0", n)
}
}

// TestUnquote feeds quoted literals to unquote and compares the decoded
// text with the expected string, covering simple escapes, raw hex bytes,
// octal, \u and \U code points, literal multibyte text and a truncated \u
// escape.
func TestUnquote(t *testing.T) {
	expectDecoded := func(t *testing.T, quoted, want string) {
		t.Helper()
		got := unquote([]byte(quoted))
		if got == want {
			return
		}
		t.Errorf("Expected: %q, got %q",
			want, got)
	}

	expectDecoded(t,
		`"\"hey\" is\\\n\tfor \U0001f40es"`,
		"\"hey\" is\\\n\tfor \U0001f40es")
	expectDecoded(t,
		`"\xf2Y\xbb\x8a,\xd0(\xf0\xff=\x8c\xbd"`,
		"\xf2Y\xbb\x8a,\xd0(\xf0\xff=\x8c\xbd")
	expectDecoded(t, `"multibyte \"ဤ\" character"`, "multibyte \"\xe1\x80\xa4\" character")
	expectDecoded(t, `"Octal is \167eird"`, "Octal is weird")
	expectDecoded(t, `"Hex is \x6eormal"`, "Hex is normal")
	expectDecoded(t, `"Hex is \x6Eormal"`, "Hex is normal")
	expectDecoded(t, `"Hex is \u0146ormal"`, "Hex is \u0146ormal")
	expectDecoded(t, `"We căn use anỿ valid utf-8 ☺"`, "We căn use anỿ valid utf-8 ☺")
	expectDecoded(t, `"Case sensitivity is \U0001f4A9"`, "Case sensitivity is \U0001f4A9")
	expectDecoded(t, `"Control\a\b\f\n\r\t\v characters"`, "Control\a\b\f\n\r\t\v characters")
	expectDecoded(t, `"Invalid \u123"`, "Invalid \ufffd")
}

// TestUnquoteFuzz is a randomized check that unquote inverts the standard
// library's JSON string encoding (with HTML escaping disabled): for
// generated strings, encoding then unquoting must give back the input.
func TestUnquoteFuzz(t *testing.T) {
	t.Parallel()

	identity := func(s string) string {
		return s
	}
	viaJSON := func(s string) string {
		var encoded bytes.Buffer
		encoder := json.NewEncoder(&encoded)
		encoder.SetEscapeHTML(false)
		encoder.Encode(s)
		// Drop the final byte that the encoder writes after the value.
		raw := encoded.Bytes()
		return unquote(raw[:encoded.Len()-1])
	}

	err := quick.CheckEqual(identity, viaJSON, nil)
	if err != nil {
		t.Error(err)
	}
}

// TestUnquoteFormat checks the formatter against the decoder and against
// encoding/json:
//
//   - randomized: quoting, unquoting and quoting again reproduces the first
//     quoted form;
//   - randomized: quoteString output equals the json.Encoder output (HTML
//     escaping disabled, trailing byte removed);
//   - fixed inputs: the same json comparison for a few hand-picked strings.
func TestUnquoteFormat(t *testing.T) {
	t.Parallel()

	format := func(s string) []byte {
		var out bytes.Buffer
		quoteString(&out, s)
		return out.Bytes()
	}
	reformat := func(s string) []byte {
		decoded := unquote(format(s))
		return format(decoded)
	}
	if err := quick.CheckEqual(format, reformat, nil); err != nil {
		t.Error(err)
	}

	viaJSON := func(s string) []byte {
		var out bytes.Buffer
		encoder := json.NewEncoder(&out)
		encoder.SetEscapeHTML(false)
		encoder.Encode(s)
		// Drop the final byte that the encoder writes after the value.
		return out.Bytes()[:out.Len()-1]
	}
	if err := quick.CheckEqual(format, viaJSON, nil); err != nil {
		t.Error(err)
	}

	expectSameAsJSON := func(t *testing.T, s string) {
		t.Helper()
		want, got := viaJSON(s), format(s)
		if bytes.Equal(want, got) {
			return
		}
		t.Errorf("Expected %q -> %q, got %q", s, want, got)
	}
	expectSameAsJSON(t, "\"hey\" is\\\n\tfor \U0001f40es")
	expectSameAsJSON(t, "Control\a\b\f\n\r\t\v \u2029 characters")
	expectSameAsJSON(t, "Invalid character \x88\xee")
}
3 changes: 2 additions & 1 deletion martian/syntax/tokenizer.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ var rules = [...]rule{
{regexp.MustCompile(`^;`), SEMICOLON},
{regexp.MustCompile(`^,`), COMMA},
{regexp.MustCompile(`^\.`), DOT},
{regexp.MustCompile(`^"[^\"]*"`), LITSTRING}, // double-quoted strings. escapes not supported
// double-quoted strings with escaping.  Accepted escapes: the single
// characters abfnrtv, backslash and double quote; three octal digits;
// \x, \u and \U followed by 2, 4 and 8 hexadecimal digits respectively.
// The hex class must be [0-9a-fA-F]: a range ending in lower-case f would
// also admit G-Z and several punctuation characters.
{regexp.MustCompile(`^"(?:[^\\"]|\\[abfnrtv\\"]|\\[0-7]{3}|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})*"`), LITSTRING},
{regexp.MustCompile(`^filetype\b`), FILETYPE},
{regexp.MustCompile(`^stage\b`), STAGE},
{regexp.MustCompile(`^pipeline\b`), PIPELINE},
Expand Down
Loading

0 comments on commit a332ca6

Please sign in to comment.