From 9b4d5af6d3a9f0066b2db968a9a996d719697c67 Mon Sep 17 00:00:00 2001 From: Rui Mo Date: Fri, 24 Nov 2023 09:14:50 +0800 Subject: [PATCH] Add tokenizer factory to support plugin custom tokenizer (7484) (#444) * Revert "Fix subfield separator" This reverts commit 48886bd83621e1e11220a64f50076a052fd5d36c. * Add tokenizer factory to support plugin custom tokenizer (7484) --- velox/type/Subfield.cpp | 14 +++---- velox/type/Subfield.h | 7 +--- velox/type/Tokenizer.cpp | 65 ++++++++++++++++++++----------- velox/type/Tokenizer.h | 28 ++++++++++--- velox/type/tests/SubfieldTest.cpp | 62 +++++++++++++++++++++++------ 5 files changed, 123 insertions(+), 53 deletions(-) diff --git a/velox/type/Subfield.cpp b/velox/type/Subfield.cpp index 87e3de4a3490..9f9310e8cdd3 100644 --- a/velox/type/Subfield.cpp +++ b/velox/type/Subfield.cpp @@ -18,21 +18,19 @@ namespace facebook::velox::common { -Subfield::Subfield( - const std::string& path, - const std::shared_ptr& separators) { - Tokenizer tokenizer(path, separators); - VELOX_CHECK(tokenizer.hasNext(), "Column name is missing: {}", path); +Subfield::Subfield(const std::string& path) { + auto tokenizer = Tokenizer::getInstance(path); + VELOX_CHECK(tokenizer->hasNext(), "Column name is missing: {}", path); - auto firstElement = tokenizer.next(); + auto firstElement = tokenizer->next(); VELOX_CHECK( firstElement->kind() == kNestedField, "Subfield path must start with a name: {}", path); std::vector> pathElements; pathElements.push_back(std::move(firstElement)); - while (tokenizer.hasNext()) { - pathElements.push_back(tokenizer.next()); + while (tokenizer->hasNext()) { + pathElements.push_back(tokenizer->next()); } path_ = std::move(pathElements); } diff --git a/velox/type/Subfield.h b/velox/type/Subfield.h index b49c7dcaa51d..fb79ab64d534 100644 --- a/velox/type/Subfield.h +++ b/velox/type/Subfield.h @@ -45,7 +45,7 @@ struct Separators { char backSlash = '\\'; char closeBracket = ']'; - char dot = '\0'; + char dot = '.'; char openBracket = '['; char quote = '\"'; char wildCard = '*'; @@ -218,10 +218,7 @@ class Subfield { }; public: - // Separators: the customized separators to tokenize field name. - explicit Subfield( - const std::string& path, - const std::shared_ptr& separators = Separators::get()); + explicit Subfield(const std::string& path); explicit Subfield(std::vector>&& path); diff --git a/velox/type/Tokenizer.cpp b/velox/type/Tokenizer.cpp index f740596cede7..c37369799b98 100644 --- a/velox/type/Tokenizer.cpp +++ b/velox/type/Tokenizer.cpp @@ -17,15 +17,13 @@ namespace facebook::velox::common { -Tokenizer::Tokenizer( - const std::string& path, - const std::shared_ptr& separators) - : path_(path), separators_(separators) { +DefaultTokenizer::DefaultTokenizer(const std::string& path) + : path_(path), separators_(Separators::get()) { state = State::kNotReady; index_ = 0; } -bool Tokenizer::hasNext() { +bool DefaultTokenizer::hasNext() { switch (state) { case State::kDone: return false; @@ -39,7 +37,7 @@ bool Tokenizer::hasNext() { return tryToComputeNext(); } -std::unique_ptr Tokenizer::next() { +std::unique_ptr DefaultTokenizer::next() { if (!hasNext()) { VELOX_FAIL("No more tokens"); } @@ -47,11 +45,11 @@ std::unique_ptr Tokenizer::next() { return std::move(next_); } -bool Tokenizer::hasNextCharacter() { +bool DefaultTokenizer::hasNextCharacter() { return index_ < path_.length(); } -std::unique_ptr Tokenizer::computeNext() { +std::unique_ptr DefaultTokenizer::computeNext() { if (!hasNextCharacter()) { state = State::kDone; return nullptr; @@ -83,17 +81,17 @@ std::unique_ptr Tokenizer::computeNext() { VELOX_UNREACHABLE(); } -bool Tokenizer::tryMatchSeparator(char expected) { +bool DefaultTokenizer::tryMatchSeparator(char expected) { return separators_->isSeparator(expected) && tryMatch(expected); } -void Tokenizer::match(char expected) { +void DefaultTokenizer::match(char expected) { if (!tryMatch(expected)) { invalidSubfieldPath(); } } -bool Tokenizer::tryMatch(char expected) { +bool DefaultTokenizer::tryMatch(char expected) { if (!hasNextCharacter() || peekCharacter() != expected) { return false; } @@ -101,15 +99,15 @@ bool Tokenizer::tryMatch(char expected) { return true; } -void Tokenizer::nextCharacter() { +void DefaultTokenizer::nextCharacter() { index_++; } -char Tokenizer::peekCharacter() { +char DefaultTokenizer::peekCharacter() { return path_[index_]; } -std::unique_ptr Tokenizer::matchPathSegment() { +std::unique_ptr DefaultTokenizer::matchPathSegment() { // seek until we see a special character or whitespace int start = index_; while (hasNextCharacter() && !separators_->isSeparator(peekCharacter()) && @@ -128,7 +126,8 @@ std::unique_ptr Tokenizer::matchPathSegment() { return std::make_unique(token); } -std::unique_ptr Tokenizer::matchUnquotedSubscript() { +std::unique_ptr +DefaultTokenizer::matchUnquotedSubscript() { // seek until we see a special character or whitespace int start = index_; while (hasNextCharacter() && isUnquotedSubscriptCharacter(peekCharacter())) { @@ -151,16 +150,17 @@ std::unique_ptr Tokenizer::matchUnquotedSubscript() { return std::make_unique(index); } -bool Tokenizer::isUnquotedPathCharacter(char c) { +bool DefaultTokenizer::isUnquotedPathCharacter(char c) { return c == ':' || c == '$' || c == '-' || c == '/' || c == '@' || c == '|' || c == '#' || c == '.' || isUnquotedSubscriptCharacter(c); } -bool Tokenizer::isUnquotedSubscriptCharacter(char c) { +bool DefaultTokenizer::isUnquotedSubscriptCharacter(char c) { return c == '-' || c == '_' || isalnum(c); } -std::unique_ptr Tokenizer::matchQuotedSubscript() { +std::unique_ptr +DefaultTokenizer::matchQuotedSubscript() { // quote has already been matched // seek until we see the close quote @@ -200,20 +200,21 @@ std::unique_ptr Tokenizer::matchQuotedSubscript() { return std::make_unique(token); } -std::unique_ptr Tokenizer::matchWildcardSubscript() { +std::unique_ptr +DefaultTokenizer::matchWildcardSubscript() { return std::make_unique(); } -void Tokenizer::invalidSubfieldPath() { +void DefaultTokenizer::invalidSubfieldPath() { VELOX_FAIL("Invalid subfield path: {}", this->toString()); } -std::string Tokenizer::toString() { +std::string DefaultTokenizer::toString() { return path_.substr(0, index_) + separators_->unicodeCaret + path_.substr(index_); } -bool Tokenizer::tryToComputeNext() { +bool DefaultTokenizer::tryToComputeNext() { state = State::kFailed; // temporary pessimism next_ = computeNext(); if (state != State::kDone) { @@ -222,4 +223,24 @@ bool Tokenizer::tryToComputeNext() { } return false; } + +std::function(const std::string&)> + Tokenizer::tokenizerFactory_ = nullptr; + +// static +std::unique_ptr Tokenizer::getInstance(const std::string& path) { + if (!tokenizerFactory_) { + tokenizerFactory_ = [](const std::string& p) { + return std::make_unique(p); + }; + } + return tokenizerFactory_(path); +} + +// static +void Tokenizer::registerInstanceFactory( + std::function(const std::string&)> + tokenizerFactory) { + tokenizerFactory_ = tokenizerFactory; +} } // namespace facebook::velox::common diff --git a/velox/type/Tokenizer.h b/velox/type/Tokenizer.h index 9b14d532449c..c2f55be0d614 100644 --- a/velox/type/Tokenizer.h +++ b/velox/type/Tokenizer.h @@ -35,14 +35,30 @@ class Tokenizer { kFailed, }; - // Separators: the customized separators to tokenize field name. - explicit Tokenizer( - const std::string& path, - const std::shared_ptr& separators); + virtual ~Tokenizer() = default; - bool hasNext(); + virtual bool hasNext() = 0; - std::unique_ptr next(); + virtual std::unique_ptr next() = 0; + + static std::unique_ptr getInstance(const std::string& path); + + static void registerInstanceFactory( + std::function(const std::string&)> + tokenizerFactory); + + private: + static std::function(const std::string&)> + tokenizerFactory_; +}; + +class DefaultTokenizer : public Tokenizer { + public: + explicit DefaultTokenizer(const std::string& path); + + bool hasNext() override; + + std::unique_ptr next() override; private: const std::string path_; diff --git a/velox/type/tests/SubfieldTest.cpp b/velox/type/tests/SubfieldTest.cpp index ea2393d2e17d..df9bb43694eb 100644 --- a/velox/type/tests/SubfieldTest.cpp +++ b/velox/type/tests/SubfieldTest.cpp @@ -20,12 +20,11 @@ using namespace facebook::velox::common; std::vector> tokenize( - const std::string& path, - const std::shared_ptr& separators = Separators::get()) { + const std::string& path) { std::vector> elements; - Tokenizer tokenizer(path, separators); - while (tokenizer.hasNext()) { - elements.push_back(tokenizer.next()); + auto tokenizer = Tokenizer::getInstance(path); + while (tokenizer->hasNext()) { + elements.push_back(tokenizer->next()); } return elements; } @@ -48,10 +47,8 @@ TEST(SubfieldTest, invalidPaths) { assertInvalidSubfield("a[2].[3].", "Invalid subfield path: a[2].^[3]."); } -void testColumnName( - const std::string& name, - const std::shared_ptr& separators = Separators::get()) { - auto elements = tokenize(name, separators); +void testColumnName(const std::string& name) { + auto elements = tokenize(name); EXPECT_EQ(elements.size(), 1); EXPECT_EQ(*elements[0].get(), Subfield::NestedField(name)); } @@ -62,9 +59,6 @@ TEST(SubfieldTest, columnNamesWithSpecialCharacters) { testColumnName("a/b/c:12"); testColumnName("@basis"); testColumnName("@basis|city_id"); - auto separators = std::make_shared(); - separators->dot = '\0'; - testColumnName("city.id@address:number/date|day$a-b$10_bucket", separators); } std::vector> createElements() { @@ -154,3 +148,47 @@ TEST(SubfieldTest, longSubscript) { ASSERT_TRUE(longSubscript); ASSERT_EQ(longSubscript->index(), 3309189884973035076); } + +class FakeTokenizer : public Tokenizer { + public: + explicit FakeTokenizer(const std::string& path) : path_(path) { + state = State::kNotReady; + } + + bool hasNext() override { + if (state == State::kDone) { + return false; + } else if (state == State::kNotReady) { + return true; + } + VELOX_FAIL("Illegal state"); + } + + std::unique_ptr next() override { + if (!hasNext()) { + VELOX_USER_FAIL("No more tokens"); + } + state = State::kDone; + return std::make_unique(path_); + } + + private: + const std::string path_; + State state; +}; + +TEST(SubfieldTest, CustomTokenizer) { + Tokenizer::registerInstanceFactory( + [](const std::string& p) { return std::make_unique(p); }); + + testColumnName("$bucket"); + testColumnName("apollo-11"); + testColumnName("a/b/c:12"); + testColumnName("@basis"); + testColumnName("@basis|city_id"); + testColumnName("city.id@address*:number/date|day$a-b$10_bucket"); + + Tokenizer::registerInstanceFactory([](const std::string& p) { + return std::make_unique(p); + }); +}