From 4808c9844898fdd2be9c4f5c519da5ca01b697c1 Mon Sep 17 00:00:00 2001 From: rui-mo Date: Fri, 4 Aug 2023 13:18:58 +0000 Subject: [PATCH] add regular chars --- velox/type/Subfield.cpp | 6 ++++-- velox/type/Subfield.h | 4 +++- velox/type/Tokenizer.cpp | 33 +++++++++++++++++++++++-------- velox/type/Tokenizer.h | 10 +++++++++- velox/type/tests/SubfieldTest.cpp | 13 ++++++++---- 5 files changed, 50 insertions(+), 16 deletions(-) diff --git a/velox/type/Subfield.cpp b/velox/type/Subfield.cpp index 2df88b5a3a73e..b24f8b8ef442b 100644 --- a/velox/type/Subfield.cpp +++ b/velox/type/Subfield.cpp @@ -18,8 +18,10 @@ namespace facebook::velox::common { -Subfield::Subfield(const std::string& path) { - Tokenizer tokenizer(path); +Subfield::Subfield( + const std::string& path, + const std::optional<std::vector<char>>& separators) { + Tokenizer tokenizer(path, separators); VELOX_CHECK(tokenizer.hasNext(), "Column name is missing: {}", path); auto firstElement = tokenizer.next(); diff --git a/velox/type/Subfield.h b/velox/type/Subfield.h index 407c38f060e34..628bb2e622e5d 100644 --- a/velox/type/Subfield.h +++ b/velox/type/Subfield.h @@ -191,7 +191,9 @@ class Subfield { }; public: - explicit Subfield(const std::string& path); + explicit Subfield( + const std::string& path, + const std::optional<std::vector<char>>& separators = std::nullopt); explicit Subfield(std::vector<std::unique_ptr<PathElement>>&& path); diff --git a/velox/type/Tokenizer.cpp b/velox/type/Tokenizer.cpp index 53c4d9d595b23..7e7f7477e1b03 100644 --- a/velox/type/Tokenizer.cpp +++ b/velox/type/Tokenizer.cpp @@ -17,9 +17,15 @@ namespace facebook::velox::common { -Tokenizer::Tokenizer(const std::string& path) : path_(path) { +Tokenizer::Tokenizer( + const std::string& path, + const std::optional<std::vector<char>>& separators) + : path_(path) { state = State::kNotReady; index_ = 0; + if (separators.has_value()) { + separators_ = separators.value(); + } } bool Tokenizer::hasNext() { @@ -54,17 +60,17 @@ std::unique_ptr<Subfield::PathElement> Tokenizer::computeNext() { return nullptr; } - if (tryMatch(DOT)) { + if 
(tryMatchSeparator(DOT)) { std::unique_ptr<Subfield::PathElement> token = matchPathSegment(); firstSegment = false; return token; } - if (tryMatch(OPEN_BRACKET)) { - std::unique_ptr<Subfield::PathElement> token = tryMatch(QUOTE) + if (tryMatchSeparator(OPEN_BRACKET)) { + std::unique_ptr<Subfield::PathElement> token = tryMatchSeparator(QUOTE) ? matchQuotedSubscript() - : tryMatch(WILDCARD) ? matchWildcardSubscript() - : matchUnquotedSubscript(); + : tryMatchSeparator(WILDCARD) ? matchWildcardSubscript() + : matchUnquotedSubscript(); match(CLOSE_BRACKET); firstSegment = false; @@ -80,6 +86,10 @@ std::unique_ptr<Subfield::PathElement> Tokenizer::computeNext() { VELOX_UNREACHABLE(); } +bool Tokenizer::tryMatchSeparator(char expected) { + return isSeparator(expected) && tryMatch(expected); +} + void Tokenizer::match(char expected) { if (!tryMatch(expected)) { invalidSubfieldPath(); @@ -105,7 +115,9 @@ char Tokenizer::peekCharacter() { std::unique_ptr<Subfield::PathElement> Tokenizer::matchPathSegment() { // seek until we see a special character or whitespace int start = index_; - while (hasNextCharacter() && isUnquotedPathCharacter(peekCharacter())) { + while (hasNextCharacter() && + (!isSeparator(peekCharacter()) && + isUnquotedPathCharacter(peekCharacter()))) { nextCharacter(); } int end = index_; @@ -143,9 +155,14 @@ std::unique_ptr<Subfield::PathElement> Tokenizer::matchUnquotedSubscript() { return std::make_unique<Subfield::LongSubscript>(index); } +bool Tokenizer::isSeparator(char c) { + return std::find(separators_.begin(), separators_.end(), c) != + separators_.end(); +} + bool Tokenizer::isUnquotedPathCharacter(char c) { return c == ':' || c == '$' || c == '-' || c == '/' || c == '@' || c == '|' || - c == '#' || isUnquotedSubscriptCharacter(c); + c == '#' || c == '.' 
|| isUnquotedSubscriptCharacter(c); } bool Tokenizer::isUnquotedSubscriptCharacter(char c) { diff --git a/velox/type/Tokenizer.h b/velox/type/Tokenizer.h index 56380f0aead26..6c8418e7c6e02 100644 --- a/velox/type/Tokenizer.h +++ b/velox/type/Tokenizer.h @@ -35,7 +35,9 @@ class Tokenizer { kFailed, }; - explicit Tokenizer(const std::string& path); + explicit Tokenizer( + const std::string& path, + const std::optional<std::vector<char>>& separators); bool hasNext(); @@ -51,6 +53,8 @@ class Tokenizer { const char UNICODE_CARET = '^'; const std::string path_; + std::vector<char> separators_ = + {DOT, OPEN_BRACKET, QUOTE, WILDCARD, CLOSE_BRACKET}; int index_; State state; bool firstSegment = true; @@ -60,6 +64,8 @@ class Tokenizer { std::unique_ptr<Subfield::PathElement> computeNext(); + bool tryMatchSeparator(char expected); + void match(char expected); bool tryMatch(char expected); @@ -74,6 +80,8 @@ class Tokenizer { bool tryToComputeNext(); + bool isSeparator(char c); + void invalidSubfieldPath(); bool isUnquotedPathCharacter(char c); diff --git a/velox/type/tests/SubfieldTest.cpp b/velox/type/tests/SubfieldTest.cpp index 91252ae134dd1..c1c083cf02919 100644 --- a/velox/type/tests/SubfieldTest.cpp +++ b/velox/type/tests/SubfieldTest.cpp @@ -20,9 +20,10 @@ using namespace facebook::velox::common; std::vector<std::unique_ptr<Subfield::PathElement>> tokenize( - const std::string& path) { + const std::string& path, + const std::optional<std::vector<char>>& separators = std::nullopt) { std::vector<std::unique_ptr<Subfield::PathElement>> elements; - Tokenizer tokenizer(path); + Tokenizer tokenizer(path, separators); while (tokenizer.hasNext()) { elements.push_back(tokenizer.next()); } @@ -47,8 +48,10 @@ TEST(SubfieldTest, invalidPaths) { assertInvalidSubfield("a[2].[3].", "Invalid subfield path: a[2].^[3]."); } -void testColumnName(const std::string& name) { - auto elements = tokenize(name); +void testColumnName( + const std::string& name, + const std::optional<std::vector<char>>& separators = std::nullopt) { + auto elements = tokenize(name, separators); EXPECT_EQ(elements.size(), 1); EXPECT_EQ(*elements[0].get(), 
Subfield::NestedField(name)); } @@ -59,6 +62,8 @@ TEST(SubfieldTest, columnNamesWithSpecialCharacters) { testColumnName("a/b/c:12"); testColumnName("@basis"); testColumnName("@basis|city_id"); + std::vector<char> separators = {'[', ']', '\"', '*'}; + testColumnName("city.id@address:number/date|day$a-b$10_bucket", separators); } std::vector<std::unique_ptr<Subfield::PathElement>> createElements() {