Skip to content

Commit

Permalink
[VL] Register Spark tokenizer (#3810)
Browse files Browse the repository at this point in the history
  • Loading branch information
rui-mo authored Nov 24, 2023
1 parent c96d438 commit 4f63ddc
Show file tree
Hide file tree
Showing 5 changed files with 85 additions and 2 deletions.
1 change: 1 addition & 0 deletions cpp/velox/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,7 @@ set(VELOX_SRCS
memory/VeloxMemoryManager.cc
operators/functions/RegistrationAllFunctions.cc
operators/functions/RowConstructorWithNull.cc
operators/functions/SparkTokenizer.cc
operators/serializer/VeloxColumnarToRowConverter.cc
operators/serializer/VeloxColumnarBatchSerializer.cc
operators/serializer/VeloxRowToColumnarConverter.cc
Expand Down
2 changes: 2 additions & 0 deletions cpp/velox/compute/VeloxBackend.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#endif
#include "config/GlutenConfig.h"
#include "jni/JniFileSystem.h"
#include "operators/functions/SparkTokenizer.h"
#include "udf/UdfLoader.h"
#include "utils/ConfigExtractor.h"
#include "utils/exception.h"
Expand Down Expand Up @@ -270,6 +271,7 @@ void VeloxBackend::init(const std::unordered_map<std::string, std::string>& conf
if (veloxcfg->get<bool>(kDebugModeEnabled, false)) {
LOG(INFO) << "VeloxBackend config:" << printConfig(veloxcfg->valuesCopy());
}
registerSparkTokenizer();
}

facebook::velox::cache::AsyncDataCache* VeloxBackend::getAsyncDataCache() const {
Expand Down
58 changes: 58 additions & 0 deletions cpp/velox/operators/functions/SparkTokenizer.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "operators/functions/SparkTokenizer.h"
#include "velox/type/Tokenizer.h"

namespace gluten {
namespace {

class SparkTokenizer : public facebook::velox::common::Tokenizer {
public:
explicit SparkTokenizer(const std::string& path) : path_(path) {
state_ = State::kNotReady;
}

bool hasNext() override {
if (state_ == State::kDone) {
return false;
} else if (state_ == State::kNotReady) {
return true;
}
VELOX_FAIL("Illegal state.");
}

std::unique_ptr<facebook::velox::common::Subfield::PathElement> next() override {
if (!hasNext()) {
VELOX_USER_FAIL("No more tokens.");
}
state_ = State::kDone;
return std::make_unique<facebook::velox::common::Subfield::NestedField>(path_);
}

private:
const std::string path_;
State state_;
};
} // namespace

void registerSparkTokenizer() {
facebook::velox::common::Tokenizer::registerInstanceFactory(
[](const std::string& p) { return std::make_unique<SparkTokenizer>(p); });
}

} // namespace gluten
24 changes: 24 additions & 0 deletions cpp/velox/operators/functions/SparkTokenizer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

namespace gluten {

/// Registers the Spark tokenizer factory with the Velox Tokenizer.
/// Defined in SparkTokenizer.cc; takes no arguments and returns nothing.
void registerSparkTokenizer();

} // namespace gluten
2 changes: 0 additions & 2 deletions cpp/velox/substrait/SubstraitToVeloxPlanValidator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
#include "velox/exec/Aggregate.h"
#include "velox/expression/Expr.h"
#include "velox/expression/SignatureBinder.h"
#include "velox/type/Tokenizer.h"

namespace gluten {

Expand Down Expand Up @@ -62,7 +61,6 @@ bool validateColNames(const ::substrait::NamedStruct& schema) {
};

for (const auto& name : schema.names()) {
common::Tokenizer token(name, common::Separators::get());
for (auto i = 0; i < name.size(); i++) {
auto c = name[i];
if (!isUnquotedPathCharacter(c)) {
Expand Down

0 comments on commit 4f63ddc

Please sign in to comment.