diff --git a/cpp/velox/CMakeLists.txt b/cpp/velox/CMakeLists.txt index 1ea9b38386a4..33aea2552a4b 100644 --- a/cpp/velox/CMakeLists.txt +++ b/cpp/velox/CMakeLists.txt @@ -266,6 +266,7 @@ set(VELOX_SRCS memory/VeloxMemoryManager.cc operators/functions/RegistrationAllFunctions.cc operators/functions/RowConstructorWithNull.cc + operators/functions/SparkTokenizer.cc operators/serializer/VeloxColumnarToRowConverter.cc operators/serializer/VeloxColumnarBatchSerializer.cc operators/serializer/VeloxRowToColumnarConverter.cc diff --git a/cpp/velox/compute/VeloxBackend.cc b/cpp/velox/compute/VeloxBackend.cc index e69155356010..6bc2aeaabae4 100644 --- a/cpp/velox/compute/VeloxBackend.cc +++ b/cpp/velox/compute/VeloxBackend.cc @@ -36,6 +36,7 @@ #endif #include "config/GlutenConfig.h" #include "jni/JniFileSystem.h" +#include "operators/functions/SparkTokenizer.h" #include "udf/UdfLoader.h" #include "utils/ConfigExtractor.h" #include "utils/exception.h" @@ -270,6 +271,7 @@ void VeloxBackend::init(const std::unordered_map& conf if (veloxcfg->get(kDebugModeEnabled, false)) { LOG(INFO) << "VeloxBackend config:" << printConfig(veloxcfg->valuesCopy()); } + registerSparkTokenizer(); } facebook::velox::cache::AsyncDataCache* VeloxBackend::getAsyncDataCache() const { diff --git a/cpp/velox/operators/functions/SparkTokenizer.cc b/cpp/velox/operators/functions/SparkTokenizer.cc new file mode 100644 index 000000000000..952abc0c6e9c --- /dev/null +++ b/cpp/velox/operators/functions/SparkTokenizer.cc @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "operators/functions/SparkTokenizer.h"
+#include "velox/type/Tokenizer.h"
+
+namespace gluten {
+namespace {
+// Tokenizer that does no splitting: it yields the whole path string as one NestedField, then is exhausted.
+class SparkTokenizer final : public facebook::velox::common::Tokenizer {
+ public:
+  explicit SparkTokenizer(const std::string& path) : path_(path) {}
+
+  // Returns true until next() has handed out the single token; raises VELOX_FAIL on any other state.
+  bool hasNext() override {
+    if (state_ == State::kDone) {
+      return false;
+    } else if (state_ == State::kNotReady) {
+      return true;
+    }
+    VELOX_FAIL("Illegal state.");
+  }
+
+  // Returns the stored path as a single NestedField element; raises VELOX_USER_FAIL if called again.
+  std::unique_ptr<facebook::velox::common::Subfield::PathElement> next() override {
+    if (!hasNext()) {
+      VELOX_USER_FAIL("No more tokens.");
+    }
+    state_ = State::kDone;
+    return std::make_unique<facebook::velox::common::Subfield::NestedField>(path_);
+  }
+ private:
+  const std::string path_;
+  State state_{State::kNotReady}; // kNotReady until next() runs, kDone afterwards.
+};
+} // namespace
+
+// Registers a Tokenizer instance factory that builds a SparkTokenizer for each path string it is given.
+void registerSparkTokenizer() {
+  facebook::velox::common::Tokenizer::registerInstanceFactory(
+      [](const std::string& p) { return std::make_unique<SparkTokenizer>(p); });
+}
+
+} // namespace gluten
diff --git a/cpp/velox/operators/functions/SparkTokenizer.h b/cpp/velox/operators/functions/SparkTokenizer.h
new file mode 100644
index 000000000000..9ed267f47898
--- /dev/null
+++ b/cpp/velox/operators/functions/SparkTokenizer.h
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace gluten {
+// Registers a Velox Tokenizer instance factory that creates a SparkTokenizer for each path string.
+void registerSparkTokenizer();
+
+} // namespace gluten
diff --git a/cpp/velox/substrait/SubstraitToVeloxPlanValidator.cc b/cpp/velox/substrait/SubstraitToVeloxPlanValidator.cc
index e5a45951a6bd..939985e1ca57 100644
--- a/cpp/velox/substrait/SubstraitToVeloxPlanValidator.cc
+++ b/cpp/velox/substrait/SubstraitToVeloxPlanValidator.cc
@@ -25,7 +25,6 @@
 #include "velox/exec/Aggregate.h"
 #include "velox/expression/Expr.h"
 #include "velox/expression/SignatureBinder.h"
-#include "velox/type/Tokenizer.h"
 
 namespace gluten {
 
@@ -62,7 +61,6 @@ bool validateColNames(const ::substrait::NamedStruct& schema) {
   };
 
   for (const auto& name : schema.names()) {
-    common::Tokenizer token(name, common::Separators::get());
     for (auto i = 0; i < name.size(); i++) {
       auto c = name[i];
       if (!isUnquotedPathCharacter(c)) {