Skip to content

Commit

Permalink
Revert "[7484 ] Add tokenizer factory to support plugin custom tokeni…
Browse files Browse the repository at this point in the history
…zer (7484)"

This reverts commit 61cf65f.
  • Loading branch information
rui-mo committed Sep 4, 2024
1 parent 92a88e9 commit b46dfc9
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 122 deletions.
14 changes: 8 additions & 6 deletions velox/type/Subfield.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,21 @@

namespace facebook::velox::common {

// Builds a Subfield by tokenizing 'path' and storing the resulting path
// elements in path_.
//
// Fails with "Column name is missing" when the tokenizer yields no token at
// all, and with "Subfield path must start with a name" when the first token
// is not of kind kNestedField. All remaining tokens are appended in order.
Subfield::Subfield(const std::string& path) {
auto tokenizer = Tokenizer::getInstance(path);
VELOX_CHECK(tokenizer->hasNext(), "Column name is missing: {}", path);
Subfield::Subfield(
const std::string& path,
const std::shared_ptr<Separators>& separators) {
Tokenizer tokenizer(path, separators);
VELOX_CHECK(tokenizer.hasNext(), "Column name is missing: {}", path);

auto firstElement = tokenizer->next();
auto firstElement = tokenizer.next();
VELOX_CHECK(
firstElement->kind() == kNestedField,
"Subfield path must start with a name: {}",
path);
std::vector<std::unique_ptr<PathElement>> pathElements;
pathElements.push_back(std::move(firstElement));
// Drain the tokenizer; every further token becomes one path element.
while (tokenizer->hasNext()) {
pathElements.push_back(tokenizer->next());
while (tokenizer.hasNext()) {
pathElements.push_back(tokenizer.next());
}
path_ = std::move(pathElements);
}
Expand Down
5 changes: 4 additions & 1 deletion velox/type/Subfield.h
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,10 @@ class Subfield {
};

public:
explicit Subfield(const std::string& path);
// Constructs a Subfield from a textual path.
// 'separators' holds the customized separators used to tokenize the field
// name; when omitted, Separators::get() is used.
explicit Subfield(
const std::string& path,
const std::shared_ptr<Separators>& separators = Separators::get());

explicit Subfield(std::vector<std::unique_ptr<PathElement>>&& path);

Expand Down
65 changes: 22 additions & 43 deletions velox/type/Tokenizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,15 @@

namespace facebook::velox::common {

// Stores the path to tokenize together with the separators to use, and
// positions the tokenizer at the start of the path (index_ = 0) in the
// kNotReady state, i.e. no token has been computed yet.
DefaultTokenizer::DefaultTokenizer(const std::string& path)
: path_(path), separators_(Separators::get()) {
Tokenizer::Tokenizer(
const std::string& path,
const std::shared_ptr<Separators>& separators)
: path_(path), separators_(separators) {
state = State::kNotReady;
index_ = 0;
}

bool DefaultTokenizer::hasNext() {
bool Tokenizer::hasNext() {
switch (state) {
case State::kDone:
return false;
Expand All @@ -37,19 +39,19 @@ bool DefaultTokenizer::hasNext() {
return tryToComputeNext();
}

// Returns the pending path element and hands ownership to the caller.
// Fails with "No more tokens" when hasNext() reports that nothing is left.
std::unique_ptr<Subfield::PathElement> DefaultTokenizer::next() {
std::unique_ptr<Subfield::PathElement> Tokenizer::next() {
if (!hasNext()) {
VELOX_FAIL("No more tokens");
}
// Mark the cached element as consumed before moving it out; presumably this
// makes the following hasNext() call compute a fresh token.
state = State::kNotReady;
return std::move(next_);
}

// Returns true while the read position index_ is still inside path_.
bool DefaultTokenizer::hasNextCharacter() {
bool Tokenizer::hasNextCharacter() {
return index_ < path_.length();
}

std::unique_ptr<Subfield::PathElement> DefaultTokenizer::computeNext() {
std::unique_ptr<Subfield::PathElement> Tokenizer::computeNext() {
if (!hasNextCharacter()) {
state = State::kDone;
return nullptr;
Expand Down Expand Up @@ -81,33 +83,33 @@ std::unique_ptr<Subfield::PathElement> DefaultTokenizer::computeNext() {
VELOX_UNREACHABLE();
}

// Consumes 'expected' and returns true only if separators_ reports it as a
// separator AND it is the next character in the path. Nothing is consumed
// when either condition fails.
bool DefaultTokenizer::tryMatchSeparator(char expected) {
bool Tokenizer::tryMatchSeparator(char expected) {
return separators_->isSeparator(expected) && tryMatch(expected);
}

// Consumes 'expected', which must be the next character in the path.
// Reports an invalid subfield path (see invalidSubfieldPath) otherwise.
void DefaultTokenizer::match(char expected) {
void Tokenizer::match(char expected) {
if (!tryMatch(expected)) {
invalidSubfieldPath();
}
}

// Advances past the next character and returns true if that character equals
// 'expected'. Returns false, leaving the position untouched, when the path is
// exhausted or the next character differs.
bool DefaultTokenizer::tryMatch(char expected) {
bool Tokenizer::tryMatch(char expected) {
if (!hasNextCharacter() || peekCharacter() != expected) {
return false;
}
index_++;
return true;
}

// Advances the read position by one character. Performs no bounds check of
// its own.
void DefaultTokenizer::nextCharacter() {
void Tokenizer::nextCharacter() {
index_++;
}

// Returns the character at the current read position without consuming it.
// Does not itself call hasNextCharacter(); visible callers check it first.
char DefaultTokenizer::peekCharacter() {
char Tokenizer::peekCharacter() {
return path_[index_];
}

std::unique_ptr<Subfield::PathElement> DefaultTokenizer::matchPathSegment() {
std::unique_ptr<Subfield::PathElement> Tokenizer::matchPathSegment() {
// seek until we see a special character or whitespace
int start = index_;
while (hasNextCharacter() && !separators_->isSeparator(peekCharacter()) &&
Expand All @@ -126,8 +128,7 @@ std::unique_ptr<Subfield::PathElement> DefaultTokenizer::matchPathSegment() {
return std::make_unique<Subfield::NestedField>(token);
}

std::unique_ptr<Subfield::PathElement>
DefaultTokenizer::matchUnquotedSubscript() {
std::unique_ptr<Subfield::PathElement> Tokenizer::matchUnquotedSubscript() {
// seek until we see a special character or whitespace
int start = index_;
while (hasNextCharacter() && isUnquotedSubscriptCharacter(peekCharacter())) {
Expand All @@ -150,17 +151,16 @@ DefaultTokenizer::matchUnquotedSubscript() {
return std::make_unique<Subfield::LongSubscript>(index);
}

// Returns true if 'c' is one of ':', '$', '-', '/', '@', '|', '#', '.', or is
// accepted by isUnquotedSubscriptCharacter().
bool DefaultTokenizer::isUnquotedPathCharacter(char c) {
bool Tokenizer::isUnquotedPathCharacter(char c) {
return c == ':' || c == '$' || c == '-' || c == '/' || c == '@' || c == '|' ||
c == '#' || c == '.' || isUnquotedSubscriptCharacter(c);
}

// Returns true if 'c' is '-', '_', a space, or an alphanumeric character.
// NOTE(review): isalnum() is called with a plain char. The C library requires
// the argument to be representable as unsigned char (or EOF), so a negative
// char value (e.g. a non-ASCII byte where char is signed) is undefined
// behaviour -- consider static_cast<unsigned char>(c).
bool DefaultTokenizer::isUnquotedSubscriptCharacter(char c) {
bool Tokenizer::isUnquotedSubscriptCharacter(char c) {
return c == '-' || c == '_' || c == ' ' || isalnum(c);
}

std::unique_ptr<Subfield::PathElement>
DefaultTokenizer::matchQuotedSubscript() {
std::unique_ptr<Subfield::PathElement> Tokenizer::matchQuotedSubscript() {
// quote has already been matched

// seek until we see the close quote
Expand Down Expand Up @@ -200,21 +200,20 @@ DefaultTokenizer::matchQuotedSubscript() {
return std::make_unique<Subfield::StringSubscript>(token);
}

// Creates the path element representing a wildcard subscript
// (Subfield::AllSubscripts). Consumes no input itself.
std::unique_ptr<Subfield::PathElement>
DefaultTokenizer::matchWildcardSubscript() {
std::unique_ptr<Subfield::PathElement> Tokenizer::matchWildcardSubscript() {
return std::make_unique<Subfield::AllSubscripts>();
}

// Fails via VELOX_FAIL with "Invalid subfield path: ..." where the path is
// rendered by toString(), i.e. with a caret marking the current position.
void DefaultTokenizer::invalidSubfieldPath() {
void Tokenizer::invalidSubfieldPath() {
VELOX_FAIL("Invalid subfield path: {}", this->toString());
}

// Returns the path with separators_->unicodeCaret inserted at the current
// read position: everything before index_, the caret, then the remainder.
std::string DefaultTokenizer::toString() {
std::string Tokenizer::toString() {
return path_.substr(0, index_) + separators_->unicodeCaret +
path_.substr(index_);
}

bool DefaultTokenizer::tryToComputeNext() {
bool Tokenizer::tryToComputeNext() {
state = State::kFailed; // temporary pessimism
next_ = computeNext();
if (state != State::kDone) {
Expand All @@ -223,24 +222,4 @@ bool DefaultTokenizer::tryToComputeNext() {
}
return false;
}

// Definition of the static factory used by Tokenizer::getInstance(). Starts
// out empty (nullptr); getInstance() installs a default on first use.
std::function<std::unique_ptr<Tokenizer>(const std::string&)>
Tokenizer::tokenizerFactory_ = nullptr;

// static
// Creates a tokenizer for 'path' using the currently registered factory. If
// no factory has been registered yet, one that produces DefaultTokenizer is
// installed first.
// NOTE(review): the lazy assignment to the static tokenizerFactory_ is not
// synchronized here -- confirm callers do not race on first use.
std::unique_ptr<Tokenizer> Tokenizer::getInstance(const std::string& path) {
if (!tokenizerFactory_) {
tokenizerFactory_ = [](const std::string& p) {
return std::make_unique<DefaultTokenizer>(p);
};
}
return tokenizerFactory_(path);
}

// static
// Replaces the factory used by getInstance() with 'tokenizerFactory'. The
// previous factory, if any, is overwritten.
void Tokenizer::registerInstanceFactory(
std::function<std::unique_ptr<Tokenizer>(const std::string&)>
tokenizerFactory) {
tokenizerFactory_ = tokenizerFactory;
}
} // namespace facebook::velox::common
28 changes: 6 additions & 22 deletions velox/type/Tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,30 +35,14 @@ class Tokenizer {
kFailed,
};

virtual ~Tokenizer() = default;
// Separators: the customized separators to tokenize field name.
explicit Tokenizer(
const std::string& path,
const std::shared_ptr<Separators>& separators);

virtual bool hasNext() = 0;
bool hasNext();

virtual std::unique_ptr<Subfield::PathElement> next() = 0;

static std::unique_ptr<Tokenizer> getInstance(const std::string& path);

static void registerInstanceFactory(
std::function<std::unique_ptr<Tokenizer>(const std::string&)>
tokenizerFactory);

private:
static std::function<std::unique_ptr<Tokenizer>(const std::string&)>
tokenizerFactory_;
};

class DefaultTokenizer : public Tokenizer {
public:
explicit DefaultTokenizer(const std::string& path);

bool hasNext() override;

std::unique_ptr<Subfield::PathElement> next() override;
std::unique_ptr<Subfield::PathElement> next();

private:
const std::string path_;
Expand Down
62 changes: 12 additions & 50 deletions velox/type/tests/SubfieldTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,12 @@
using namespace facebook::velox::common;

// Test helper: runs a tokenizer over 'path' until it is exhausted and returns
// every produced path element in order.
std::vector<std::unique_ptr<Subfield::PathElement>> tokenize(
const std::string& path) {
const std::string& path,
const std::shared_ptr<Separators>& separators = Separators::get()) {
std::vector<std::unique_ptr<Subfield::PathElement>> elements;
auto tokenizer = Tokenizer::getInstance(path);
while (tokenizer->hasNext()) {
elements.push_back(tokenizer->next());
Tokenizer tokenizer(path, separators);
while (tokenizer.hasNext()) {
elements.push_back(tokenizer.next());
}
return elements;
}
Expand All @@ -47,8 +48,10 @@ TEST(SubfieldTest, invalidPaths) {
assertInvalidSubfield("a[2].[3].", "Invalid subfield path: a[2].^[3].");
}

// Test helper: expects 'name' to tokenize into exactly one element, and that
// element to equal Subfield::NestedField(name) -- i.e. the whole string is
// treated as a single column name.
void testColumnName(const std::string& name) {
auto elements = tokenize(name);
void testColumnName(
const std::string& name,
const std::shared_ptr<Separators>& separators = Separators::get()) {
auto elements = tokenize(name, separators);
EXPECT_EQ(elements.size(), 1);
EXPECT_EQ(*elements[0].get(), Subfield::NestedField(name));
}
Expand All @@ -62,6 +65,9 @@ TEST(SubfieldTest, columnNamesWithSpecialCharacters) {
testColumnName("a/b/c:12");
testColumnName("@basis");
testColumnName("@basis|city_id");
auto separators = std::make_shared<Separators>();
separators->dot = '\0';
testColumnName("city.id@address:number/date|day$a-b$10_bucket", separators);
}

std::vector<std::unique_ptr<Subfield::PathElement>> createElements() {
Expand Down Expand Up @@ -151,47 +157,3 @@ TEST(SubfieldTest, longSubscript) {
ASSERT_TRUE(longSubscript);
ASSERT_EQ(longSubscript->index(), 3309189884973035076);
}

// Test-only Tokenizer that performs no real tokenization: it yields the whole
// input path as a single NestedField and is then done.
class FakeTokenizer : public Tokenizer {
public:
explicit FakeTokenizer(const std::string& path) : path_(path) {
state = State::kNotReady;
}

// True until next() has been called once; fails with "Illegal state" for any
// state other than kDone / kNotReady.
bool hasNext() override {
if (state == State::kDone) {
return false;
} else if (state == State::kNotReady) {
return true;
}
VELOX_FAIL("Illegal state");
}

// Returns the full path as one NestedField and switches to kDone, so a second
// call fails with "No more tokens".
std::unique_ptr<Subfield::PathElement> next() override {
if (!hasNext()) {
VELOX_USER_FAIL("No more tokens");
}
state = State::kDone;
return std::make_unique<Subfield::NestedField>(path_);
}

private:
const std::string path_;
State state;
};

// Registers a factory producing FakeTokenizer, checks that several names
// containing special characters each tokenize to a single NestedField, then
// registers a DefaultTokenizer factory again so the fake one does not stay
// installed.
TEST(SubfieldTest, CustomTokenizer) {
Tokenizer::registerInstanceFactory(
[](const std::string& p) { return std::make_unique<FakeTokenizer>(p); });

testColumnName("$bucket");
testColumnName("apollo-11");
testColumnName("a/b/c:12");
testColumnName("@basis");
testColumnName("@basis|city_id");
testColumnName("city.id@address*:number/date|day$a-b$10_bucket");

// Put the default tokenizer factory back.
Tokenizer::registerInstanceFactory([](const std::string& p) {
return std::make_unique<DefaultTokenizer>(p);
});
}

0 comments on commit b46dfc9

Please sign in to comment.