Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revert "[7484 ] Add tokenizer factory to support plugin custom tokenizer (7484)" #498

Merged
merged 1 commit into from
Sep 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 8 additions & 6 deletions velox/type/Subfield.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,21 @@

namespace facebook::velox::common {

// Builds a Subfield by tokenizing `path` and storing the resulting elements in
// `path_`.
//
// NOTE: this span is a diff rendered without +/- markers, so both sides are
// interleaved. The lines that call `Tokenizer::getInstance(path)` and use
// `tokenizer->` belong to the factory-based version being reverted; the lines
// that construct `Tokenizer tokenizer(path, separators)` and use `tokenizer.`
// belong to the restored version that takes explicit separators.
//
// Fails through VELOX_CHECK when `path` yields no token at all, or when the
// first token is not a kNestedField element.
Subfield::Subfield(const std::string& path) {
auto tokenizer = Tokenizer::getInstance(path);
VELOX_CHECK(tokenizer->hasNext(), "Column name is missing: {}", path);
Subfield::Subfield(
const std::string& path,
const std::shared_ptr<Separators>& separators) {
Tokenizer tokenizer(path, separators);
VELOX_CHECK(tokenizer.hasNext(), "Column name is missing: {}", path);

auto firstElement = tokenizer->next();
auto firstElement = tokenizer.next();
VELOX_CHECK(
firstElement->kind() == kNestedField,
"Subfield path must start with a name: {}",
path);
// Collect the validated first element, then every remaining token in order.
std::vector<std::unique_ptr<PathElement>> pathElements;
pathElements.push_back(std::move(firstElement));
while (tokenizer->hasNext()) {
pathElements.push_back(tokenizer->next());
while (tokenizer.hasNext()) {
pathElements.push_back(tokenizer.next());
}
path_ = std::move(pathElements);
}
Expand Down
5 changes: 4 additions & 1 deletion velox/type/Subfield.h
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,10 @@ class Subfield {
};

public:
explicit Subfield(const std::string& path);
// Separators: the customized separators to tokenize field name.
explicit Subfield(
const std::string& path,
const std::shared_ptr<Separators>& separators = Separators::get());

explicit Subfield(std::vector<std::unique_ptr<PathElement>>&& path);

Expand Down
65 changes: 22 additions & 43 deletions velox/type/Tokenizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,15 @@

namespace facebook::velox::common {

DefaultTokenizer::DefaultTokenizer(const std::string& path)
: path_(path), separators_(Separators::get()) {
// Creates a tokenizer over `path` that consults `separators` to decide which
// characters split the path. The tokenizer starts in the kNotReady state with
// its cursor at offset 0.
Tokenizer::Tokenizer(
    const std::string& path,
    const std::shared_ptr<Separators>& separators)
    : path_(path), separators_(separators) {
  index_ = 0;
  state = State::kNotReady;
}

bool DefaultTokenizer::hasNext() {
bool Tokenizer::hasNext() {
switch (state) {
case State::kDone:
return false;
Expand All @@ -37,19 +39,19 @@ bool DefaultTokenizer::hasNext() {
return tryToComputeNext();
}

std::unique_ptr<Subfield::PathElement> DefaultTokenizer::next() {
// Returns the pending path element and transfers its ownership to the caller.
// Raises through VELOX_FAIL("No more tokens") when hasNext() reports that
// nothing is left.
std::unique_ptr<Subfield::PathElement> Tokenizer::next() {
  const bool tokenAvailable = hasNext();
  if (!tokenAvailable) {
    VELOX_FAIL("No more tokens");
  }
  // Reset the state before the cached element is moved out of `next_`.
  state = State::kNotReady;
  auto token = std::move(next_);
  return token;
}

bool DefaultTokenizer::hasNextCharacter() {
// Reports whether the cursor `index_` still points inside `path_`.
bool Tokenizer::hasNextCharacter() {
  return path_.length() > index_;
}

std::unique_ptr<Subfield::PathElement> DefaultTokenizer::computeNext() {
std::unique_ptr<Subfield::PathElement> Tokenizer::computeNext() {
if (!hasNextCharacter()) {
state = State::kDone;
return nullptr;
Expand Down Expand Up @@ -81,33 +83,33 @@ std::unique_ptr<Subfield::PathElement> DefaultTokenizer::computeNext() {
VELOX_UNREACHABLE();
}

bool DefaultTokenizer::tryMatchSeparator(char expected) {
// Consumes `expected` only when the configured separators treat it as a
// separator and tryMatch() accepts it. Returns whether it was consumed.
bool Tokenizer::tryMatchSeparator(char expected) {
  if (!separators_->isSeparator(expected)) {
    return false;
  }
  return tryMatch(expected);
}

void DefaultTokenizer::match(char expected) {
// Requires `expected` to be the next character: consumes it on success and
// reports an invalid subfield path otherwise.
void Tokenizer::match(char expected) {
  if (tryMatch(expected)) {
    return;
  }
  invalidSubfieldPath();
}

bool DefaultTokenizer::tryMatch(char expected) {
// Advances the cursor by one and returns true when the next character exists
// and equals `expected`; otherwise leaves the cursor untouched and returns
// false.
bool Tokenizer::tryMatch(char expected) {
  if (hasNextCharacter() && peekCharacter() == expected) {
    ++index_;
    return true;
  }
  return false;
}

void DefaultTokenizer::nextCharacter() {
// Moves the cursor forward by one character without inspecting it.
void Tokenizer::nextCharacter() {
  index_ += 1;
}

char DefaultTokenizer::peekCharacter() {
// Returns the character under the cursor without advancing. No bounds check
// happens here.
char Tokenizer::peekCharacter() {
  const char current = path_[index_];
  return current;
}

std::unique_ptr<Subfield::PathElement> DefaultTokenizer::matchPathSegment() {
std::unique_ptr<Subfield::PathElement> Tokenizer::matchPathSegment() {
// seek until we see a special character or whitespace
int start = index_;
while (hasNextCharacter() && !separators_->isSeparator(peekCharacter()) &&
Expand All @@ -126,8 +128,7 @@ std::unique_ptr<Subfield::PathElement> DefaultTokenizer::matchPathSegment() {
return std::make_unique<Subfield::NestedField>(token);
}

std::unique_ptr<Subfield::PathElement>
DefaultTokenizer::matchUnquotedSubscript() {
std::unique_ptr<Subfield::PathElement> Tokenizer::matchUnquotedSubscript() {
// seek until we see a special character or whitespace
int start = index_;
while (hasNextCharacter() && isUnquotedSubscriptCharacter(peekCharacter())) {
Expand All @@ -150,17 +151,16 @@ DefaultTokenizer::matchUnquotedSubscript() {
return std::make_unique<Subfield::LongSubscript>(index);
}

bool DefaultTokenizer::isUnquotedPathCharacter(char c) {
// Returns true for characters that may appear in an unquoted path segment:
// the punctuation listed below plus everything accepted by
// isUnquotedSubscriptCharacter().
bool Tokenizer::isUnquotedPathCharacter(char c) {
  switch (c) {
    case ':':
    case '$':
    case '-':
    case '/':
    case '@':
    case '|':
    case '#':
    case '.':
      return true;
    default:
      return isUnquotedSubscriptCharacter(c);
  }
}

bool DefaultTokenizer::isUnquotedSubscriptCharacter(char c) {
// Returns true for characters that may appear in an unquoted subscript:
// '-', '_', a space, or any alphanumeric character.
bool Tokenizer::isUnquotedSubscriptCharacter(char c) {
  // The <cctype> classifiers require a value representable as unsigned char
  // (or EOF). A plain `char` holding a byte >= 0x80 can be negative, which is
  // undefined behavior, so convert before classifying.
  return c == '-' || c == '_' || c == ' ' ||
      isalnum(static_cast<unsigned char>(c)) != 0;
}

std::unique_ptr<Subfield::PathElement>
DefaultTokenizer::matchQuotedSubscript() {
std::unique_ptr<Subfield::PathElement> Tokenizer::matchQuotedSubscript() {
// quote has already been matched

// seek until we see the close quote
Expand Down Expand Up @@ -200,21 +200,20 @@ DefaultTokenizer::matchQuotedSubscript() {
return std::make_unique<Subfield::StringSubscript>(token);
}

std::unique_ptr<Subfield::PathElement>
DefaultTokenizer::matchWildcardSubscript() {
std::unique_ptr<Subfield::PathElement> Tokenizer::matchWildcardSubscript() {
return std::make_unique<Subfield::AllSubscripts>();
}

void DefaultTokenizer::invalidSubfieldPath() {
void Tokenizer::invalidSubfieldPath() {
VELOX_FAIL("Invalid subfield path: {}", this->toString());
}

std::string DefaultTokenizer::toString() {
// Renders the path with the separators' caret marker inserted at the current
// cursor position: everything before `index_`, the caret, then the rest.
std::string Tokenizer::toString() {
  std::string annotated = path_.substr(0, index_);
  annotated += separators_->unicodeCaret;
  annotated += path_.substr(index_);
  return annotated;
}

bool DefaultTokenizer::tryToComputeNext() {
bool Tokenizer::tryToComputeNext() {
state = State::kFailed; // temporary pessimism
next_ = computeNext();
if (state != State::kDone) {
Expand All @@ -223,24 +222,4 @@ bool DefaultTokenizer::tryToComputeNext() {
}
return false;
}

// Factory consulted by Tokenizer::getInstance(). It stays empty until
// registerInstanceFactory() installs one.
std::function<std::unique_ptr<Tokenizer>(const std::string&)>
    Tokenizer::tokenizerFactory_ = nullptr;

// static
// Creates a tokenizer for `path` using the registered factory, or a
// DefaultTokenizer when no factory is registered.
std::unique_ptr<Tokenizer> Tokenizer::getInstance(const std::string& path) {
  // Fall back directly instead of lazily storing a default factory, so this
  // read path never writes to the shared static.
  if (!tokenizerFactory_) {
    return std::make_unique<DefaultTokenizer>(path);
  }
  return tokenizerFactory_(path);
}

// static
// Installs `tokenizerFactory` as the factory used by subsequent getInstance()
// calls. Passing an empty function restores the DefaultTokenizer fallback.
void Tokenizer::registerInstanceFactory(
    std::function<std::unique_ptr<Tokenizer>(const std::string&)>
        tokenizerFactory) {
  // The argument is already a by-value copy, so move it into place rather
  // than copying the callable a second time.
  tokenizerFactory_ = std::move(tokenizerFactory);
}
} // namespace facebook::velox::common
28 changes: 6 additions & 22 deletions velox/type/Tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,30 +35,14 @@ class Tokenizer {
kFailed,
};

virtual ~Tokenizer() = default;
// Separators: the customized separators to tokenize field name.
explicit Tokenizer(
const std::string& path,
const std::shared_ptr<Separators>& separators);

virtual bool hasNext() = 0;
bool hasNext();

virtual std::unique_ptr<Subfield::PathElement> next() = 0;

static std::unique_ptr<Tokenizer> getInstance(const std::string& path);

static void registerInstanceFactory(
std::function<std::unique_ptr<Tokenizer>(const std::string&)>
tokenizerFactory);

private:
static std::function<std::unique_ptr<Tokenizer>(const std::string&)>
tokenizerFactory_;
};

class DefaultTokenizer : public Tokenizer {
public:
explicit DefaultTokenizer(const std::string& path);

bool hasNext() override;

std::unique_ptr<Subfield::PathElement> next() override;
std::unique_ptr<Subfield::PathElement> next();

private:
const std::string path_;
Expand Down
62 changes: 12 additions & 50 deletions velox/type/tests/SubfieldTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,12 @@
using namespace facebook::velox::common;

// Test helper: tokenizes `path` and returns every element the tokenizer
// yields, in order.
//
// NOTE: this span is a diff rendered without +/- markers. The lines using
// `Tokenizer::getInstance(path)` / `tokenizer->` are the factory-based side
// being reverted; the lines taking `separators` and using `tokenizer.` are
// the restored side.
std::vector<std::unique_ptr<Subfield::PathElement>> tokenize(
const std::string& path) {
const std::string& path,
const std::shared_ptr<Separators>& separators = Separators::get()) {
std::vector<std::unique_ptr<Subfield::PathElement>> elements;
auto tokenizer = Tokenizer::getInstance(path);
while (tokenizer->hasNext()) {
elements.push_back(tokenizer->next());
Tokenizer tokenizer(path, separators);
while (tokenizer.hasNext()) {
elements.push_back(tokenizer.next());
}
return elements;
}
Expand All @@ -47,8 +48,10 @@ TEST(SubfieldTest, invalidPaths) {
assertInvalidSubfield("a[2].[3].", "Invalid subfield path: a[2].^[3].");
}

// Test helper: expects `name` to tokenize into exactly one element that
// equals Subfield::NestedField(name).
//
// NOTE: this span is a diff rendered without +/- markers. The single-argument
// signature and `tokenize(name)` call are the side being reverted; the
// signature with `separators` and the `tokenize(name, separators)` call are
// the restored side.
void testColumnName(const std::string& name) {
auto elements = tokenize(name);
void testColumnName(
const std::string& name,
const std::shared_ptr<Separators>& separators = Separators::get()) {
auto elements = tokenize(name, separators);
EXPECT_EQ(elements.size(), 1);
EXPECT_EQ(*elements[0].get(), Subfield::NestedField(name));
}
Expand All @@ -62,6 +65,9 @@ TEST(SubfieldTest, columnNamesWithSpecialCharacters) {
testColumnName("a/b/c:12");
testColumnName("@basis");
testColumnName("@basis|city_id");
auto separators = std::make_shared<Separators>();
separators->dot = '\0';
testColumnName("city.id@address:number/date|day$a-b$10_bucket", separators);
}

std::vector<std::unique_ptr<Subfield::PathElement>> createElements() {
Expand Down Expand Up @@ -151,47 +157,3 @@ TEST(SubfieldTest, longSubscript) {
ASSERT_TRUE(longSubscript);
ASSERT_EQ(longSubscript->index(), 3309189884973035076);
}

class FakeTokenizer : public Tokenizer {
public:
explicit FakeTokenizer(const std::string& path) : path_(path) {
state = State::kNotReady;
}

bool hasNext() override {
if (state == State::kDone) {
return false;
} else if (state == State::kNotReady) {
return true;
}
VELOX_FAIL("Illegal state");
}

std::unique_ptr<Subfield::PathElement> next() override {
if (!hasNext()) {
VELOX_USER_FAIL("No more tokens");
}
state = State::kDone;
return std::make_unique<Subfield::NestedField>(path_);
}

private:
const std::string path_;
State state;
};

// Verifies that a registered factory is honored: with FakeTokenizer installed,
// each name below (including ones containing '.' and '*') must come back as a
// single NestedField element.
TEST(SubfieldTest, CustomTokenizer) {
  Tokenizer::registerInstanceFactory([](const std::string& path) {
    return std::make_unique<FakeTokenizer>(path);
  });

  const std::vector<std::string> columnNames = {
      "$bucket",
      "apollo-11",
      "a/b/c:12",
      "@basis",
      "@basis|city_id",
      "city.id@address*:number/date|day$a-b$10_bucket",
  };
  for (const auto& columnName : columnNames) {
    testColumnName(columnName);
  }

  // Put the default tokenizer back by registering a DefaultTokenizer factory.
  Tokenizer::registerInstanceFactory([](const std::string& path) {
    return std::make_unique<DefaultTokenizer>(path);
  });
}
Loading