From e3de553bc56d69b8f364e20d74a2bee98bb87382 Mon Sep 17 00:00:00 2001
From: Florian Pommerening <florian.pommerening@unibas.ch>
Date: Thu, 16 Nov 2023 19:01:55 +0100
Subject: [PATCH] support escaped quotes in strings

---
 src/search/parser/abstract_syntax_tree.cc     | 18 ++++++---
 .../parser/decorated_abstract_syntax_tree.cc  |  2 +-
 src/search/parser/lexical_analyzer.cc         | 40 ++++++++++---------
 src/search/parser/token_stream.cc             | 12 +++++-
 src/search/parser/token_stream.h              |  1 +
 src/search/utils/strings.cc                   | 38 ++++++++++++++++++
 src/search/utils/strings.h                    |  2 +
 7 files changed, 86 insertions(+), 27 deletions(-)
diff --git a/src/search/parser/abstract_syntax_tree.cc b/src/search/parser/abstract_syntax_tree.cc
index 49c1635a93..b77855a934 100644
--- a/src/search/parser/abstract_syntax_tree.cc
+++ b/src/search/parser/abstract_syntax_tree.cc
@@ -408,13 +408,21 @@ LiteralNode::LiteralNode(Token value)
 }
 
 DecoratedASTNodePtr LiteralNode::decorate(DecorateContext &context) const {
-    utils::TraceBlock block(context, "Checking Literal: " + value.content);
+    utils::TraceBlock block(context, "Checking Literal: " + value.repr());
     if (context.has_variable(value.content)) {
-        if (value.type != TokenType::IDENTIFIER) {
+        if (value.type == TokenType::IDENTIFIER) {
+            string variable_name = value.content;
+            return utils::make_unique_ptr<VariableNode>(variable_name);
+        } else if (value.type != TokenType::STRING) {
+            /*
+              Variable names may be identical to a string literal but not
+              identical to any other token, e.g., a boolean:
+                  "let(true, blind(), astar(true))"
+              This kind of mistake is handled earlier, so ending up here is a
+              programming mistake, not an input error.
+            */
             ABORT("A non-identifier token was defined as variable.");
         }
-        string variable_name = value.content;
-        return utils::make_unique_ptr<VariableNode>(variable_name);
     }
 
     switch (value.type) {
@@ -436,7 +444,7 @@ DecoratedASTNodePtr LiteralNode::decorate(DecorateContext &context) const {
 
 void LiteralNode::dump(string indent) const {
     cout << indent << token_type_name(value.type) << ": "
-         << value.content << endl;
+         << value.repr() << endl;
 }
 
 const plugins::Type &LiteralNode::get_type(DecorateContext &context) const {
diff --git a/src/search/parser/decorated_abstract_syntax_tree.cc b/src/search/parser/decorated_abstract_syntax_tree.cc
index 51576f20b8..98ff88b14b 100644
--- a/src/search/parser/decorated_abstract_syntax_tree.cc
+++ b/src/search/parser/decorated_abstract_syntax_tree.cc
@@ -228,7 +228,7 @@ plugins::Any StringLiteralNode::construct(ConstructContext &context) const {
 }
 
 void StringLiteralNode::dump(string indent) const {
-    cout << indent << "STRING: " << value << endl;
+    cout << indent << "STRING: \"" << utils::escape(value) << "\"" << endl;
 }
 
 IntLiteralNode::IntLiteralNode(const string &value)
diff --git a/src/search/parser/lexical_analyzer.cc b/src/search/parser/lexical_analyzer.cc
index 94bd594cad..d69f3b7c35 100644
--- a/src/search/parser/lexical_analyzer.cc
+++ b/src/search/parser/lexical_analyzer.cc
@@ -29,8 +29,7 @@ static vector<pair<TokenType, regex>> construct_token_type_expressions() {
         {TokenType::INTEGER,
          R"([+-]?(infinity|\d+([kmg]\b)?))"},
         {TokenType::BOOLEAN, R"(true|false)"},
-        // TODO: support quoted strings.
-        {TokenType::STRING, R"("([^"]*)\")"},
+        {TokenType::STRING, R"(\"((\\\\|\\"|\\n|[^"\\])*)\")"},
         {TokenType::LET, R"(let)"},
         {TokenType::IDENTIFIER, R"([a-zA-Z_]\w*)"}
     };
@@ -44,6 +43,23 @@ static vector<pair<TokenType, regex>> construct_token_type_expressions() {
 static const vector<pair<TokenType, regex>> token_type_expressions =
     construct_token_type_expressions();
 
+// Render text line by line, prefixing the line that contains pos with "> "
+// and putting a "^" under that position; the result goes through rstrip.
+static string highlight_position(const string &text, string::const_iterator pos) {
+    ostringstream out;
+    int offset = pos - text.begin();
+    for (const string &line : utils::split(text, "\n")) {
+        bool hit = offset >= 0 && offset < static_cast<int>(line.size());
+        out << (hit ? "> " : "  ") << line << endl;
+        if (hit)
+            out << string(offset + 2, ' ') << "^" << endl;
+        // Advance past this line and the "\n" that ended it.
+        offset -= line.size() + 1;
+    }
+    string result = out.str();
+    utils::rstrip(result);
+    return result;
+}
 
 TokenStream split_tokens(const string &text) {
     utils::Context context;
@@ -63,7 +79,7 @@ TokenStream split_tokens(const string &text) {
             if (regex_search(start, end, match, expression, regex_constants::match_continuous)) {
                 string value;
                 if (token_type == TokenType::STRING) {
-                    value = match[2];
+                    value = utils::unescape(match[2]);
                 } else {
                     value = utils::tolower(match[1]);
                 }
@@ -74,22 +90,8 @@ TokenStream split_tokens(const string &text) {
             }
         }
         if (!has_match) {
-            ostringstream error;
-            error << "Unable to recognize next token:" << endl;
-            int distance_to_error = start - text.begin();
-            for (const string &line : utils::split(text, "\n")) {
-                int line_length = line.size();
-                bool error_in_line =
-                    distance_to_error < line_length && distance_to_error >= 0;
-                error << (error_in_line ? "> " : "  ") << line << endl;
-                if (error_in_line)
-                    error << string(distance_to_error + 2, ' ') << "^" << endl;
-
-                distance_to_error -= line.size() + 1;
-            }
-            string message = error.str();
-            utils::rstrip(message);
-            context.error(message);
+            context.error("Unable to recognize next token:\n" +
+                          highlight_position(text, start));
         }
     }
     return TokenStream(move(tokens));
diff --git a/src/search/parser/token_stream.cc b/src/search/parser/token_stream.cc
index c8ff79f2ca..9685ccc233 100644
--- a/src/search/parser/token_stream.cc
+++ b/src/search/parser/token_stream.cc
@@ -15,6 +15,14 @@ Token::Token(const string &content, TokenType type)
     : content(content), type(type) {
 }
 
+// Printable form of the token: STRING tokens are escaped and wrapped in
+// double quotes, every other token type is shown as its raw content.
+string Token::repr() const {
+    if (type != TokenType::STRING)
+        return content;
+    return "\"" + utils::escape(content) + "\"";
+}
+
 TokenStream::TokenStream(vector<Token> &&tokens)
     : tokens(move(tokens)), pos(0) {
 }
@@ -70,7 +78,7 @@ string TokenStream::str(int from, int to) const {
     int max_position = min(static_cast<int>(tokens.size()), to);
     ostringstream message;
     while (curr_position < max_position) {
-        message << tokens[curr_position].content;
+        message << tokens[curr_position].repr();
         curr_position++;
     }
     return message.str();
@@ -113,7 +121,7 @@ ostream &operator<<(ostream &out, TokenType token_type) {
 }
 
 ostream &operator<<(ostream &out, const Token &token) {
-    out << "<Type: '" << token.type << "', Value: '" << token.content << "'>";
+    out << "<Type: '" << token.type << "', Value: '" << token.repr() << "'>";
     return out;
 }
 }
diff --git a/src/search/parser/token_stream.h b/src/search/parser/token_stream.h
index 027946b2d7..9512eb2ca6 100644
--- a/src/search/parser/token_stream.h
+++ b/src/search/parser/token_stream.h
@@ -29,6 +29,7 @@ struct Token {
     TokenType type;
 
     Token(const std::string &content, TokenType type);
+    std::string repr() const;
 };
 
 class TokenStream {
diff --git a/src/search/utils/strings.cc b/src/search/utils/strings.cc
index 46ea4c8d5f..93ff1a3a14 100644
--- a/src/search/utils/strings.cc
+++ b/src/search/utils/strings.cc
@@ -8,6 +8,44 @@
 using namespace std;
 
 namespace utils {
+// Escape s for display inside double quotes: backslash, double quote and
+// newline become the two-character sequences \\, \" and \n.
+// All other characters are copied unchanged.
+string escape(const string &s) {
+    string escaped;
+    escaped.reserve(s.length());  // lower bound; escapes add characters
+    for (char symbol : s) {
+        switch (symbol) {
+        case '\\': escaped += "\\\\"; break;
+        case '"': escaped += "\\\""; break;
+        case '\n': escaped += "\\n"; break;
+        default: escaped += symbol; break;
+        }
+    }
+    return escaped;
+}
+
+// Inverse of escape: "\n" becomes a newline, any other "\c" becomes c.
+// A trailing lone backslash is kept instead of being silently dropped.
+string unescape(const string &s) {
+    string result;
+    result.reserve(s.length());
+    bool escaped = false;
+    for (char c : s) {
+        if (escaped) {
+            escaped = false;
+            result += (c == 'n') ? '\n' : c;
+        } else if (c == '\\') {
+            escaped = true;
+        } else {
+            result += c;
+        }
+    }
+    if (escaped)
+        result += '\\';  // nothing followed the final backslash
+    return result;
+}
+
 void lstrip(string &s) {
     s.erase(s.begin(), find_if(s.begin(), s.end(), [](int ch) {
                                    return !isspace(ch);
diff --git a/src/search/utils/strings.h b/src/search/utils/strings.h
index 5b0e34a9d9..c5bb226105 100644
--- a/src/search/utils/strings.h
+++ b/src/search/utils/strings.h
@@ -8,6 +8,8 @@
 #include <vector>
 
 namespace utils {
+extern std::string escape(const std::string &s);
+extern std::string unescape(const std::string &s);
 extern void lstrip(std::string &s);
 extern void rstrip(std::string &s);
 extern void strip(std::string &s);