From e3de553bc56d69b8f364e20d74a2bee98bb87382 Mon Sep 17 00:00:00 2001 From: Florian Pommerening Date: Thu, 16 Nov 2023 19:01:55 +0100 Subject: [PATCH] support escaped quotes in strings --- src/search/parser/abstract_syntax_tree.cc | 18 ++++++--- .../parser/decorated_abstract_syntax_tree.cc | 2 +- src/search/parser/lexical_analyzer.cc | 40 ++++++++++--------- src/search/parser/token_stream.cc | 12 +++++- src/search/parser/token_stream.h | 1 + src/search/utils/strings.cc | 38 ++++++++++++++++++ src/search/utils/strings.h | 2 + 7 files changed, 86 insertions(+), 27 deletions(-) diff --git a/src/search/parser/abstract_syntax_tree.cc b/src/search/parser/abstract_syntax_tree.cc index 49c1635a93..b77855a934 100644 --- a/src/search/parser/abstract_syntax_tree.cc +++ b/src/search/parser/abstract_syntax_tree.cc @@ -408,13 +408,21 @@ LiteralNode::LiteralNode(Token value) } DecoratedASTNodePtr LiteralNode::decorate(DecorateContext &context) const { - utils::TraceBlock block(context, "Checking Literal: " + value.content); + utils::TraceBlock block(context, "Checking Literal: " + value.repr()); if (context.has_variable(value.content)) { - if (value.type != TokenType::IDENTIFIER) { + if (value.type == TokenType::IDENTIFIER) { + string variable_name = value.content; + return utils::make_unique_ptr(variable_name); + } else if (value.type != TokenType::STRING) { + /* + Variable names may be identical to a string literal but not + identical to any other token, e.g., a boolean: + "let(true, blind(), astar(true))" + This kind of mistake is handled earlier, so ending up here is a + programming mistake, not an input error. + */ ABORT("A non-identifier token was defined as variable."); } - string variable_name = value.content; - return utils::make_unique_ptr(variable_name); } switch (value.type) { @@ -436,7 +444,7 @@ DecoratedASTNodePtr LiteralNode::decorate(DecorateContext &context) const { void LiteralNode::dump(string indent) const { cout << indent << token_type_name(value.type) << ": " - << value.content << endl; + << value.repr() << endl; } const plugins::Type &LiteralNode::get_type(DecorateContext &context) const { diff --git a/src/search/parser/decorated_abstract_syntax_tree.cc b/src/search/parser/decorated_abstract_syntax_tree.cc index 51576f20b8..98ff88b14b 100644 --- a/src/search/parser/decorated_abstract_syntax_tree.cc +++ b/src/search/parser/decorated_abstract_syntax_tree.cc @@ -228,7 +228,7 @@ plugins::Any StringLiteralNode::construct(ConstructContext &context) const { } void StringLiteralNode::dump(string indent) const { - cout << indent << "STRING: " << value << endl; + cout << indent << "STRING: \"" << utils::escape(value) << "\"" << endl; } IntLiteralNode::IntLiteralNode(const string &value) diff --git a/src/search/parser/lexical_analyzer.cc b/src/search/parser/lexical_analyzer.cc index 94bd594cad..d69f3b7c35 100644 --- a/src/search/parser/lexical_analyzer.cc +++ b/src/search/parser/lexical_analyzer.cc @@ -29,8 +29,7 @@ static vector> construct_token_type_expressions() { {TokenType::INTEGER, R"([+-]?(infinity|\d+([kmg]\b)?))"}, {TokenType::BOOLEAN, R"(true|false)"}, - // TODO: support quoted strings. - {TokenType::STRING, R"("([^"]*)\")"}, + {TokenType::STRING, R"(\"((\\\\|\\"|\\n|[^"\\])*)\")"}, {TokenType::LET, R"(let)"}, {TokenType::IDENTIFIER, R"([a-zA-Z_]\w*)"} }; @@ -44,6 +43,23 @@ static vector> construct_token_type_expressions() { static const vector> token_type_expressions = construct_token_type_expressions(); +static string highlight_position(const string &text, string::const_iterator pos) { + ostringstream error; + int distance_to_error = pos - text.begin(); + for (const string &line : utils::split(text, "\n")) { + int line_length = line.size(); + bool error_in_line = + distance_to_error < line_length && distance_to_error >= 0; + error << (error_in_line ? "> " : " ") << line << endl; + if (error_in_line) + error << string(distance_to_error + 2, ' ') << "^" << endl; + + distance_to_error -= line.size() + 1; + } + string message = error.str(); + utils::rstrip(message); + return message; +} TokenStream split_tokens(const string &text) { utils::Context context; @@ -63,7 +79,7 @@ TokenStream split_tokens(const string &text) { if (regex_search(start, end, match, expression, regex_constants::match_continuous)) { string value; if (token_type == TokenType::STRING) { - value = match[2]; + value = utils::unescape(match[2]); } else { value = utils::tolower(match[1]); } @@ -74,22 +90,8 @@ TokenStream split_tokens(const string &text) { } } if (!has_match) { - ostringstream error; - error << "Unable to recognize next token:" << endl; - int distance_to_error = start - text.begin(); - for (const string &line : utils::split(text, "\n")) { - int line_length = line.size(); - bool error_in_line = - distance_to_error < line_length && distance_to_error >= 0; - error << (error_in_line ? "> " : " ") << line << endl; - if (error_in_line) - error << string(distance_to_error + 2, ' ') << "^" << endl; - - distance_to_error -= line.size() + 1; - } - string message = error.str(); - utils::rstrip(message); - context.error(message); + context.error("Unable to recognize next token:\n" + + highlight_position(text, start)); } } return TokenStream(move(tokens)); diff --git a/src/search/parser/token_stream.cc b/src/search/parser/token_stream.cc index c8ff79f2ca..9685ccc233 100644 --- a/src/search/parser/token_stream.cc +++ b/src/search/parser/token_stream.cc @@ -15,6 +15,14 @@ Token::Token(const string &content, TokenType type) : content(content), type(type) { } +string Token::repr() const { + if (type == TokenType::STRING) { + return "\"" + utils::escape(content) + "\""; + } else { + return content; + } +} + TokenStream::TokenStream(vector &&tokens) : tokens(move(tokens)), pos(0) { } @@ -70,7 +78,7 @@ string TokenStream::str(int from, int to) const { int max_position = min(static_cast(tokens.size()), to); ostringstream message; while (curr_position < max_position) { - message << tokens[curr_position].content; + message << tokens[curr_position].repr(); curr_position++; } return message.str(); @@ -113,7 +121,7 @@ ostream &operator<<(ostream &out, TokenType token_type) { } ostream &operator<<(ostream &out, const Token &token) { - out << ""; + out << ""; return out; } } diff --git a/src/search/parser/token_stream.h b/src/search/parser/token_stream.h index 027946b2d7..9512eb2ca6 100644 --- a/src/search/parser/token_stream.h +++ b/src/search/parser/token_stream.h @@ -29,6 +29,7 @@ struct Token { TokenType type; Token(const std::string &content, TokenType type); + std::string repr() const; }; class TokenStream { diff --git a/src/search/utils/strings.cc b/src/search/utils/strings.cc index 46ea4c8d5f..93ff1a3a14 100644 --- a/src/search/utils/strings.cc +++ b/src/search/utils/strings.cc @@ -8,6 +8,44 @@ using namespace std; namespace utils { +string escape(const string &s) { + string result; + result.reserve(s.length()); + for (char c : s) { + if (c == '\\') { + result += "\\\\"; + } else if (c == '"') { + result += "\\\""; + } else if (c == '\n') { + result += "\\n"; + } else { + result += c; + } + } + return result; +} + +string unescape(const string &s) { + string result; + result.reserve(s.length()); + bool escaped = false; + for (char c : s) { + if (escaped) { + escaped = false; + if (c == 'n') { + result += "\n"; + } else { + result += c; + } + } else if (c == '\\') { + escaped = true; + } else { + result += c; + } + } + return result; +} + void lstrip(string &s) { s.erase(s.begin(), find_if(s.begin(), s.end(), [](int ch) { return !isspace(ch); diff --git a/src/search/utils/strings.h b/src/search/utils/strings.h index 5b0e34a9d9..c5bb226105 100644 --- a/src/search/utils/strings.h +++ b/src/search/utils/strings.h @@ -8,6 +8,8 @@ #include namespace utils { +extern std::string escape(const std::string &s); +extern std::string unescape(const std::string &s); extern void lstrip(std::string &s); extern void rstrip(std::string &s); extern void strip(std::string &s);