From a332ca6d26a98872abef466e7b56430c9bc2954c Mon Sep 17 00:00:00 2001
From: adam-azarchs <22033990+adam-azarchs@users.noreply.github.com>
Date: Thu, 3 Jan 2019 14:02:46 -0800
Subject: [PATCH] Support escaping in mro string literals.

Use json syntax for encoding strings. Allow decoding from golang escaping
format, which is more permissive.
---
 martian/syntax/formatter.go          |  93 ++++++++++++++++++++++-
 martian/syntax/formatter_test.go     |   6 +-
 martian/syntax/parsenum.go           |  17 +++++
 martian/syntax/string_intern.go      | 109 ++++++++++++++++++++++++++-
 martian/syntax/string_intern_test.go |  79 +++++++++++++++++++
 martian/syntax/tokenizer.go          |   3 +-
 martian/syntax/tokenizer_test.go     |  13 ++++
 7 files changed, 311 insertions(+), 9 deletions(-)

diff --git a/martian/syntax/formatter.go b/martian/syntax/formatter.go
index dca21543..66d44d50 100644
--- a/martian/syntax/formatter.go
+++ b/martian/syntax/formatter.go
@@ -14,6 +14,7 @@ import (
 	"path/filepath"
 	"sort"
 	"strings"
+	"unicode/utf8"
 )
 
 const (
@@ -109,6 +110,91 @@ func (self *printer) String() string {
 	return self.buf.String()
 }
 
+// QuoteString writes a string, quoted and escaped as json.
+//
+// The reason we don't just use json.Marshal here is because the default
+// encoder html-escapes strings, and disabling that by using json.Encoder
+// puts a trailing newline at the end of the string, which is also bad for
+// this use case. Plus this way we can bypass a lot of reflection junk.
+//
+// This function is mostly copy/pasted from the unexported go standard library
+// json encoder implementation (see
+// https://github.com/golang/go/blob/release-branch.go1.11/src/encoding/json/encode.go#L884)
+func quoteString(w stringWriter, s string) {
+	w.WriteByte('"')
+	const hex = "0123456789abcdef"
+	start := 0
+	for i := 0; i < len(s); {
+		// Single-byte code points.
+		if b := s[i]; b < utf8.RuneSelf {
+			if b >= ' ' && b != '"' && b != '\\' {
+				i++
+				continue
+			}
+			if start < i {
+				w.WriteString(s[start:i])
+			}
+			switch b {
+			case '\\', '"':
+				w.WriteByte('\\')
+				w.WriteByte(b)
+			case '\n':
+				w.WriteByte('\\')
+				w.WriteByte('n')
+			case '\r':
+				w.WriteByte('\\')
+				w.WriteByte('r')
+			case '\t':
+				w.WriteByte('\\')
+				w.WriteByte('t')
+			default:
+				// This encodes bytes < 0x20 except for \t, \n and \r.
+				w.WriteString(`\u00`)
+				w.WriteByte(hex[b>>4])
+				w.WriteByte(hex[b&0xF])
+			}
+			i++
+			start = i
+			continue
+		}
+		// Multi-byte code points.
+		c, size := utf8.DecodeRuneInString(s[i:])
+		if c == utf8.RuneError && size == 1 {
+			// Transform invalid code points into unicode
+			// "replacement character".
+			if start < i {
+				w.WriteString(s[start:i])
+			}
+			w.WriteString(`\ufffd`)
+			i += size
+			start = i
+			continue
+		}
+		// U+2028 is LINE SEPARATOR.
+		// U+2029 is PARAGRAPH SEPARATOR.
+		// They are both technically valid characters in JSON strings,
+		// but don't work in JSONP, which has to be evaluated as JavaScript,
+		// and can lead to security holes there. It is valid JSON to
+		// escape them, so we do so unconditionally.
+		// See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion.
+		if c == '\u2028' || c == '\u2029' {
+			if start < i {
+				w.WriteString(s[start:i])
+			}
+			w.WriteString(`\u202`)
+			w.WriteByte(hex[c&0xF])
+			i += size
+			start = i
+			continue
+		}
+		i += size
+	}
+	if start < len(s) {
+		w.WriteString(s[start:])
+	}
+	w.WriteByte('"')
+}
+
 //
 // Expression
 //
@@ -120,7 +206,12 @@ func (self *ValExp) format(w stringWriter, prefix string) {
 	} else if self.Kind == KindFloat {
 		fmt.Fprintf(w, "%g", self.Value)
 	} else if self.Kind == KindString {
-		fmt.Fprintf(w, "\"%s\"", self.Value)
+		switch s := self.Value.(type) {
+		case string:
+			quoteString(w, s)
+		default:
+			fmt.Fprintf(w, "%q", self.Value)
+		}
 	} else if self.Kind == KindMap {
 		self.formatMap(w, prefix)
 	} else if self.Kind == KindArray {
diff --git a/martian/syntax/formatter_test.go b/martian/syntax/formatter_test.go
index ccdc7a9c..dcb24572 100644
--- a/martian/syntax/formatter_test.go
+++ b/martian/syntax/formatter_test.go
@@ -92,9 +92,9 @@ func TestFormatValueExpression(t *testing.T) {
 	Equal(t, buff.String(), "\"blah\"", "Double quote a string.")
 	buff.Reset()
 
-	ve.Value = "\"blah\""
+	ve.Value = `"blah"`
 	ve.format(&buff, "")
-	Equal(t, buff.String(), "\"\"blah\"\"", "Double quote a double-quoted string.")
+	Equal(t, buff.String(), `"\"blah\""`, "Double quote a double-quoted string.")
 	buff.Reset()
 
 	//
@@ -218,7 +218,7 @@ pipeline AWESOME(
 call ADD_KEY1(
     key = self.key1,
     value = self.value1,
-    failfile = "fail1",
+    failfile = "fail \n\"1\"",
     start = null,
 ) using (
     local = true,
diff --git a/martian/syntax/parsenum.go b/martian/syntax/parsenum.go
index 75eba8fb..1e3fcbee 100644
--- a/martian/syntax/parsenum.go
+++ b/martian/syntax/parsenum.go
@@ -60,3 +60,20 @@ func parseFloat(s []byte) float64 {
 	}
 	return f
 }
+
+func unhex(c byte) byte {
+	switch {
+	case '0' <= c && c <= '9':
+		return c - '0'
+	case 'a' <= c && c <= 'f':
+		return c - 'a' + 10
+	case 'A' <= c && c <= 'F':
+		return c - 'A' + 10
+	default:
+		panic(string(append([]byte("Invalid character "), c)))
+	}
+}
+
+func parseHexByte(c0, c1 byte) byte {
+	return (unhex(c0) << 4) + unhex(c1)
+}
diff --git a/martian/syntax/string_intern.go b/martian/syntax/string_intern.go
index 66044a8d..255eacb6 100644
--- a/martian/syntax/string_intern.go
+++ b/martian/syntax/string_intern.go
@@ -4,7 +4,10 @@
 
 package syntax
 
-import "bytes"
+import (
+	"bytes"
+	"unicode/utf8"
+)
 
 type stringIntern struct {
 	internSet map[string]string
@@ -65,12 +68,110 @@ func (store *stringIntern) Get(value []byte) string {
 	}
 }
 
-var quoteBytes = []byte(`"`)
+func runeError() []byte {
+	b := make([]byte, 3)
+	utf8.EncodeRune(b, utf8.RuneError)
+	return b
+}
+
+func unquoteBytes(value []byte) []byte {
+	n := len(value)
+	if n < 2 || value[0] != '"' || value[n-1] != '"' {
+		// Should be prevented by the tokenizer.
+		panic("string was not quoted: " + string(value))
+	}
+	value = value[1 : n-1]
+	if !bytes.ContainsAny(value, `\"`) {
+		// Trivial value, avoid allocation.
+		return value
+	}
+
+	buf := make([]byte, 0, len(value)+2*utf8.UTFMax)
+	for len(value) > 0 {
+		switch c := value[0]; {
+		case c >= utf8.RuneSelf:
+			// Multibyte character.
+			_, size := utf8.DecodeRune(value)
+			buf = append(buf, value[:size]...)
+			value = value[size:]
+		case c != '\\':
+			buf = append(buf, value[0])
+			value = value[1:]
+		default:
+			// Escape
+			c2 := value[1]
+			value = value[2:]
+			switch c2 {
+			// easy cases
+			case 'a':
+				buf = append(buf, '\a')
+			case 'b':
+				buf = append(buf, '\b')
+			case 'f':
+				buf = append(buf, '\f')
+			case 'n':
+				buf = append(buf, '\n')
+			case 'r':
+				buf = append(buf, '\r')
+			case 't':
+				buf = append(buf, '\t')
+			case 'v':
+				buf = append(buf, '\v')
+			// Harder cases
+			case 'x':
+				// one-byte hex-encoded unicode.
+				buf = append(buf, parseHexByte(value[0], value[1]))
+				value = value[2:]
+			case 'u':
+				// two-byte hex-encoded unicode.
+				if len(value) < 4 {
+					buf = append(buf, runeError()...)
+					value = value[len(value):]
+				} else {
+					var enc [4]byte
+					n := utf8.EncodeRune(enc[:],
+						rune(parseHexByte(value[2], value[3]))+
+							(rune(parseHexByte(value[0], value[1]))<<8))
+					buf = append(buf, enc[:n]...)
+					value = value[4:]
+				}
+			case 'U':
+				// four-byte hex-encoded unicode.
+				if len(value) < 8 {
+					buf = append(buf, runeError()...)
+					value = value[len(value):]
+				} else {
+					var enc [4]byte
+					n := utf8.EncodeRune(enc[:],
+						rune(parseHexByte(value[6], value[7]))+
+							(rune(parseHexByte(value[4], value[5]))<<8)+
+							(rune(parseHexByte(value[2], value[3]))<<16)+
+							(rune(parseHexByte(value[0], value[1]))<<24))
+					buf = append(buf, enc[:n]...)
+					value = value[8:]
+				}
+			case '0', '1', '2', '3', '4', '5', '6', '7':
+				// one-byte octal unicode
+				if value[1] < '0' || value[1] > '7' || value[0] < '0' || value[0] > '7' {
+					buf = append(buf, runeError()...)
+					value = value[len(value):]
+				} else {
+					buf = append(buf, ((c2-'0')<<6)+((value[0]-'0')<<3)+(value[1]-'0'))
+					value = value[2:]
+				}
+			default:
+				// \, ", etc.
+				buf = append(buf, c2)
+			}
+		}
+	}
+	return buf
+}
 
 func (store *stringIntern) unquote(value []byte) string {
-	return store.Get(bytes.Replace(value, quoteBytes, nil, -1))
+	return store.Get(unquoteBytes(value))
 }
 
 func unquote(qs []byte) string {
-	return string(bytes.Replace(qs, quoteBytes, nil, -1))
+	return string(unquoteBytes(qs))
 }
diff --git a/martian/syntax/string_intern_test.go b/martian/syntax/string_intern_test.go
index 8e509414..f0c035cd 100644
--- a/martian/syntax/string_intern_test.go
+++ b/martian/syntax/string_intern_test.go
@@ -3,7 +3,10 @@
 package syntax
 
 import (
+	"bytes"
+	"encoding/json"
 	"testing"
+	"testing/quick"
 )
 
 func TestStringIntern(t *testing.T) {
@@ -27,3 +30,79 @@ func TestStringIntern(t *testing.T) {
 		t.Errorf("Bytes key lookup AllocsPerRun = %f, want 0", n)
 	}
 }
+
+func TestUnquote(t *testing.T) {
+	check := func(t *testing.T, input, expect string) {
+		t.Helper()
+		if s := unquote([]byte(input)); s != expect {
+			t.Errorf("Expected: %q, got %q",
+				expect, s)
+		}
+	}
+	check(t,
+		`"\"hey\" is\\\n\tfor \U0001f40es"`,
+		"\"hey\" is\\\n\tfor \U0001f40es")
+	check(t,
+		`"\xf2Y\xbb\x8a,\xd0(\xf0\xff=\x8c\xbd"`,
+		"\xf2Y\xbb\x8a,\xd0(\xf0\xff=\x8c\xbd")
+	check(t, `"multibyte \"ဤ\" character"`, "multibyte \"\xe1\x80\xa4\" character")
+	check(t, `"Octal is \167eird"`, "Octal is weird")
+	check(t, `"Hex is \x6eormal"`, "Hex is normal")
+	check(t, `"Hex is \x6Eormal"`, "Hex is normal")
+	check(t, `"Hex is \u0146ormal"`, "Hex is \u0146ormal")
+	check(t, `"We căn use anỿ valid utf-8 ☺"`, "We căn use anỿ valid utf-8 ☺")
+	check(t, `"Case sensitivity is \U0001f4A9"`, "Case sensitivity is \U0001f4A9")
+	check(t, `"Control\a\b\f\n\r\t\v characters"`, "Control\a\b\f\n\r\t\v characters")
+	check(t, `"Invalid \u123"`, "Invalid \ufffd")
+}
+
+// Fuzz test for unquote.
+func TestUnquoteFuzz(t *testing.T) {
+	t.Parallel()
+	if err := quick.CheckEqual(func(s string) string {
+		return s
+	}, func(s string) string {
+		var buf bytes.Buffer
+		enc := json.NewEncoder(&buf)
+		enc.SetEscapeHTML(false)
+		enc.Encode(s)
+		return unquote(buf.Bytes()[:buf.Len()-1])
+	}, nil); err != nil {
+		t.Error(err)
+	}
+}
+
+// Fuzzer test for format/decode round trip.
+func TestUnquoteFormat(t *testing.T) {
+	t.Parallel()
+	enc := func(s string) []byte {
+		var buf bytes.Buffer
+		quoteString(&buf, s)
+		return buf.Bytes()
+	}
+	roundTrip := func(s string) []byte {
+		return enc(unquote(enc(s)))
+	}
+	if err := quick.CheckEqual(enc, roundTrip, nil); err != nil {
+		t.Error(err)
+	}
+	jsonEnc := func(s string) []byte {
+		var buf bytes.Buffer
+		enc := json.NewEncoder(&buf)
+		enc.SetEscapeHTML(false)
+		enc.Encode(s)
+		return buf.Bytes()[:buf.Len()-1]
+	}
+	if err := quick.CheckEqual(enc, jsonEnc, nil); err != nil {
+		t.Error(err)
+	}
+	check := func(t *testing.T, s string) {
+		t.Helper()
+		if e, a := jsonEnc(s), enc(s); !bytes.Equal(e, a) {
+			t.Errorf("Expected %q -> %q, got %q", s, e, a)
+		}
+	}
+	check(t, "\"hey\" is\\\n\tfor \U0001f40es")
+	check(t, "Control\a\b\f\n\r\t\v \u2029 characters")
+	check(t, "Invalid character \x88\xee")
+}
diff --git a/martian/syntax/tokenizer.go b/martian/syntax/tokenizer.go
index de71c621..153e9f89 100644
--- a/martian/syntax/tokenizer.go
+++ b/martian/syntax/tokenizer.go
@@ -32,7 +32,8 @@ var rules = [...]rule{
 	{regexp.MustCompile(`^;`), SEMICOLON},
 	{regexp.MustCompile(`^,`), COMMA},
 	{regexp.MustCompile(`^\.`), DOT},
-	{regexp.MustCompile(`^"[^\"]*"`), LITSTRING}, // double-quoted strings. escapes not supported
+	// double-quoted strings with escaping.
+	{regexp.MustCompile(`^"(?:[^\\"]|\\[abfnrtv\\"]|\\[0-7]{3}|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})*"`), LITSTRING},
 	{regexp.MustCompile(`^filetype\b`), FILETYPE},
 	{regexp.MustCompile(`^stage\b`), STAGE},
 	{regexp.MustCompile(`^pipeline\b`), PIPELINE},
diff --git a/martian/syntax/tokenizer_test.go b/martian/syntax/tokenizer_test.go
index 67fa5220..cee93ee3 100644
--- a/martian/syntax/tokenizer_test.go
+++ b/martian/syntax/tokenizer_test.go
@@ -19,6 +19,19 @@ func TestNextToken(t *testing.T) {
 	}
 	check("# this is a comment\n", COMMENT)
 	check(`"this/is/a/string"`, LITSTRING)
+	check(`""`, LITSTRING)
+	check(`"this\nis\na\nstring"`, LITSTRING)
+	check(`"this \"is\" a string"`, LITSTRING)
+	check(`"\"this \"is\" a string\""`, LITSTRING)
+	check(`"this is a \U0001F9F5"`, LITSTRING)
+	check(`"this is \u0146ot bad"`, LITSTRING)
+	check(`"this is \167eird but fine"`, LITSTRING)
+	check(`\"this is not a string"`, INVALID)
+	check(`"\"this\" is not a string\"`, INVALID)
+	check(`"Invalid unicode\u001!"`, INVALID)
+	check(`"Invalid unicode\U0001!"`, INVALID)
+	check(`"Invalid unicode\x1!"`, INVALID)
+	check(`"Invalid unicode\xaz"`, INVALID)
 	check(`@include`, INCLUDE_DIRECTIVE)
 	check(`_INTERNAL_PIPELINE`, ID)
 	check(`_type_name`, ID)
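
Editor's note: the fuzz tests in this patch rely on one compatibility property: every escape the json-style encoder emits must be accepted by the more permissive golang-style decoder. Because quoteString and unquoteBytes are unexported, the following is a minimal standalone sketch of that round trip using standard-library stand-ins (json.Encoder with HTML escaping disabled in place of quoteString, strconv.Unquote in place of unquoteBytes); the sample string is invented for illustration and is not taken from the patch.

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"strconv"
)

func main() {
	// Arbitrary sample covering a quote, control characters, U+2029, and a
	// multi-byte rune; chosen only to exercise several escape forms.
	original := "fail \n\"1\"\twith \u2029 and \U0001F40E"

	// Encode the way the formatter does: JSON string syntax, no HTML escaping.
	var buf bytes.Buffer
	enc := json.NewEncoder(&buf)
	enc.SetEscapeHTML(false)
	if err := enc.Encode(original); err != nil {
		panic(err)
	}
	// Encode appends a trailing newline; trim it to leave just the quoted literal.
	quoted := string(bytes.TrimRight(buf.Bytes(), "\n"))
	fmt.Println("quoted:", quoted)

	// Decode with Go-style unquoting, which accepts a superset of the escapes
	// the JSON encoder can produce (\n, \r, \t, \", \\, \uNNNN, ...).
	decoded, err := strconv.Unquote(quoted)
	if err != nil {
		panic(err)
	}
	fmt.Println("round trip ok:", decoded == original)
}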