diff --git a/velox/type/Subfield.cpp b/velox/type/Subfield.cpp
index 2df88b5a3a73e..c30d4cf4ea492 100644
--- a/velox/type/Subfield.cpp
+++ b/velox/type/Subfield.cpp
@@ -18,8 +18,10 @@
 
 namespace facebook::velox::common {
 
-Subfield::Subfield(const std::string& path) {
-  Tokenizer tokenizer(path);
+Subfield::Subfield(
+    const std::string& path,
+    const std::vector<char>& regularChars) {
+  Tokenizer tokenizer(path, regularChars);
   VELOX_CHECK(tokenizer.hasNext(), "Column name is missing: {}", path);
 
   auto firstElement = tokenizer.next();
diff --git a/velox/type/Subfield.h b/velox/type/Subfield.h
index 407c38f060e34..bf79f8a17af36 100644
--- a/velox/type/Subfield.h
+++ b/velox/type/Subfield.h
@@ -191,7 +191,13 @@ class Subfield {
   };
 
  public:
-  explicit Subfield(const std::string& path);
+  /// Builds a Subfield by tokenizing 'path'. 'regularChars' is forwarded to
+  /// the Tokenizer, which accepts those characters in unquoted path elements
+  /// and subscripts. Defaults to empty so that existing single-argument
+  /// callers keep compiling.
+  explicit Subfield(
+      const std::string& path,
+      const std::vector<char>& regularChars = {});
 
   explicit Subfield(std::vector>&& path);
 
diff --git a/velox/type/Tokenizer.cpp b/velox/type/Tokenizer.cpp
index 53c4d9d595b23..dc80408a07c4a 100644
--- a/velox/type/Tokenizer.cpp
+++ b/velox/type/Tokenizer.cpp
@@ -17,7 +17,10 @@
 
 namespace facebook::velox::common {
 
-Tokenizer::Tokenizer(const std::string& path) : path_(path) {
+Tokenizer::Tokenizer(
+    const std::string& path,
+    const std::vector<char>& regularChars)
+    : path_(path), regularChars_(regularChars) {
   state = State::kNotReady;
   index_ = 0;
 }
@@ -87,6 +90,11 @@ void Tokenizer::match(char expected) {
 }
 
 bool Tokenizer::tryMatch(char expected) {
+  // A character registered as regular is ordinary path text, so it is never
+  // consumed as the expected token.
+  if (isRegularCharacter(expected)) {
+    return false;
+  }
   if (!hasNextCharacter() || peekCharacter() != expected) {
     return false;
   }
@@ -143,13 +151,23 @@ std::unique_ptr Tokenizer::matchUnquotedSubscript() {
   return std::make_unique(index);
 }
 
-bool Tokenizer::isUnquotedPathCharacter(char c) {
-  return c == ':' || c == '$' || c == '-' || c == '/' || c == '@' || c == '|' ||
+// Returns true if 'c' is one of the characters in 'regularChars_'.
+bool Tokenizer::isRegularCharacter(char c) {
+  return std::find(regularChars_.begin(), regularChars_.end(), c) !=
+      regularChars_.end();
+}
+
+bool Tokenizer::isUnquotedPathCharacter(char c) {
+  return isRegularCharacter(c) || c == ':' || c == '$' || c == '-' ||
+      c == '/' || c == '@' || c == '|' ||
       c == '#' || isUnquotedSubscriptCharacter(c);
 }
 
 bool Tokenizer::isUnquotedSubscriptCharacter(char c) {
-  return c == '-' || c == '_' || isalnum(c);
+  // isalnum has undefined behavior for negative char values, so convert to
+  // unsigned char first.
+  return isRegularCharacter(c) || c == '-' || c == '_' ||
+      isalnum(static_cast<unsigned char>(c));
 }
 
 std::unique_ptr Tokenizer::matchQuotedSubscript() {
diff --git a/velox/type/Tokenizer.h b/velox/type/Tokenizer.h
index 56380f0aead26..687f671bb89d6 100644
--- a/velox/type/Tokenizer.h
+++ b/velox/type/Tokenizer.h
@@ -35,7 +35,13 @@ class Tokenizer {
     kFailed,
   };
 
-  explicit Tokenizer(const std::string& path);
+  /// Tokenizes 'path'. Characters in 'regularChars' are accepted in unquoted
+  /// path elements and subscripts and are never matched by tryMatch().
+  /// Defaults to empty so that existing single-argument callers keep
+  /// compiling.
+  explicit Tokenizer(
+      const std::string& path,
+      const std::vector<char>& regularChars = {});
 
   bool hasNext();
 
@@ -51,6 +57,8 @@ class Tokenizer {
   const char UNICODE_CARET = '^';
 
   const std::string path_;
+  // Characters accepted as part of unquoted path elements and subscripts.
+  const std::vector<char> regularChars_;
   int index_;
   State state;
   bool firstSegment = true;
@@ -74,6 +82,9 @@ class Tokenizer {
 
   bool tryToComputeNext();
 
+  // Returns true if 'c' is one of 'regularChars_'.
+  bool isRegularCharacter(char c);
+
   void invalidSubfieldPath();
 
   bool isUnquotedPathCharacter(char c);