Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GLUTEN-7896][CH]Fix to_date diff for time parser policy config #7923

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
Original file line number Diff line number Diff line change
Expand Up @@ -2192,7 +2192,7 @@ class GlutenClickHouseTPCHSaltNullParquetSuite extends GlutenClickHouseTPCHAbstr
}
}

test("GLUTEN-3135: Bug fix to_date") {
test("GLUTEN-3135/GLUTEN-7896: Bug fix to_date") {
val create_table_sql =
"""
| create table test_tbl_3135(id bigint, data string) using parquet
Expand All @@ -2216,6 +2216,18 @@ class GlutenClickHouseTPCHSaltNullParquetSuite extends GlutenClickHouseTPCHAbstr

val select_sql = "select id, to_date(data) from test_tbl_3135"
compareResultsAgainstVanillaSpark(select_sql, true, { _ => })
withSQLConf(("spark.sql.legacy.timeParserPolicy" -> "corrected")) {
compareResultsAgainstVanillaSpark(
"select id, to_date('2025-07-22 10:00:00', 'yyyy-MM-dd') from test_tbl_3135 where id = 1",
true,
{ _ => })
}
withSQLConf(("spark.sql.legacy.timeParserPolicy" -> "legacy")) {
compareResultsAgainstVanillaSpark(
"select id, to_date('2025-07-22 10:00:00', 'yyyy-MM-dd') from test_tbl_3135 where id = 1",
true,
{ _ => })
}
spark.sql("drop table test_tbl_3135")
}

Expand Down
5 changes: 5 additions & 0 deletions cpp-ch/local-engine/Common/CHUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -776,6 +776,11 @@ void BackendInitializerUtil::initSettings(const SparkConfigs::ConfigMap & spark_
settings.set(key, toField(key, value));
LOG_DEBUG(&Poco::Logger::get("CHUtil"), "Set settings key:{} value:{}", key, value);
}
else if (key == TIMER_PARSER_POLICY)
{
settings.set(key, value);
LOG_DEBUG(&Poco::Logger::get("CHUtil"), "Set settings key:{} value:{}", key, value);
}
}

/// Finally apply some fixed kvs to settings.
Expand Down
1 change: 1 addition & 0 deletions cpp-ch/local-engine/Common/CHUtil.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ namespace local_engine
static const String MERGETREE_INSERT_WITHOUT_LOCAL_STORAGE = "mergetree.insert_without_local_storage";
static const String MERGETREE_MERGE_AFTER_INSERT = "mergetree.merge_after_insert";
static const std::string DECIMAL_OPERATIONS_ALLOW_PREC_LOSS = "spark.sql.decimalOperations.allowPrecisionLoss";
static const std::string TIMER_PARSER_POLICY = "spark.sql.legacy.timeParserPolicy";

static const std::unordered_set<String> BOOL_VALUE_SETTINGS{
MERGETREE_MERGE_AFTER_INSERT, MERGETREE_INSERT_WITHOUT_LOCAL_STORAGE, DECIMAL_OPERATIONS_ALLOW_PREC_LOSS};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Not, not, not );
REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Xor, xor, xor);

REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Cast, cast, CAST);
REGISTER_COMMON_SCALAR_FUNCTION_PARSER(GetTimestamp, get_timestamp, parseDateTime64InJodaSyntaxOrNull);
// REGISTER_COMMON_SCALAR_FUNCTION_PARSER(GetTimestamp, get_timestamp, parseDateTime64InJodaSyntaxOrNull);
REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Quarter, quarter, toQuarter);

// math functions
Expand Down
23 changes: 23 additions & 0 deletions cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <Parser/scalar_function_parser/getTimestamp.h>

namespace local_engine
{
static FunctionParserRegister<FunctionParserGetTimestamp> register_get_timestamp;
}
106 changes: 106 additions & 0 deletions cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <DataTypes/DataTypeNullable.h>
#include <Parser/FunctionParser.h>
#include <Core/Settings.h>
#include <Core/Field.h>
#include <Common/CHUtil.h>
#include <boost/algorithm/string/case_conv.hpp>

namespace DB
{

namespace ErrorCodes
{
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
}


namespace local_engine
{
class FunctionParserGetTimestamp : public FunctionParser
{
public:
explicit FunctionParserGetTimestamp(ParserContextPtr parser_context_) : FunctionParser(parser_context_) {}
~FunctionParserGetTimestamp() override = default;

static constexpr auto name = "get_timestamp";
String getName() const override { return name; }

const ActionsDAG::Node * parse(
const substrait::Expression_ScalarFunction & substrait_func,
ActionsDAG & actions_dag) const override
{
/*
spark function: get_timestamp(expr, fmt)
1. If timeParserPolicy is LEGACY
1) fmt has 0 'S', ch function = parseDateTime64InJodaSyntaxOrNull(substr(expr,1,length(fmt)), fmt);
2) fmt has 'S' more than 0, make the fmt has 3 'S', ch function = parseDateTime64InJodaSyntaxOrNull(expr, fmt)
2. Else ch function = parseDateTime64InJodaSyntaxOrNull(expr, fmt)
*/
auto parsed_args = parseFunctionArguments(substrait_func, actions_dag);
if (parsed_args.size() != 2)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires exactly two arguments", getName());
const auto * expr_arg = parsed_args[0];
const auto * fmt_arg = parsed_args[1];

const auto & args = substrait_func.arguments();
bool fmt_string_literal = args[1].value().has_literal();
String fmt;
if (fmt_string_literal)
{
const auto & literal_fmt_expr = args[1].value().literal();
fmt_string_literal = literal_fmt_expr.has_string();
fmt = fmt_string_literal ? literal_fmt_expr.string() : "";
}
if (!fmt_string_literal)
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "The second of function {} must be const String.", name);

UInt32 s_count = std::count(fmt.begin(), fmt.end(), 'S');
String time_parser_policy = getContext()->getSettingsRef().has(TIMER_PARSER_POLICY) ? toString(getContext()->getSettingsRef().get(TIMER_PARSER_POLICY)) : "";
boost::to_lower(time_parser_policy);
if (time_parser_policy == "legacy")
{
if (s_count == 0)
{
const auto * index_begin_node = addColumnToActionsDAG(actions_dag, std::make_shared<DataTypeUInt64>(), 1);
const auto * index_end_node = addColumnToActionsDAG(actions_dag, std::make_shared<DataTypeUInt64>(), fmt.size());
const auto * substr_node = toFunctionNode(actions_dag, "substringUTF8", {expr_arg, index_begin_node, index_end_node});
const auto * fmt_node = addColumnToActionsDAG(actions_dag, std::make_shared<DataTypeString>(), fmt);
const auto * result_node = toFunctionNode(actions_dag, "parseDateTime64InJodaSyntaxOrNull", {substr_node, fmt_node});
return convertNodeTypeIfNeeded(substrait_func, result_node, actions_dag);
}
else if (s_count < 3)
fmt += String(3 - s_count, 'S');
else
fmt = fmt.substr(0, fmt.size() - (s_count - 3));

const auto * fmt_node = addColumnToActionsDAG(actions_dag, std::make_shared<DataTypeString>(), fmt);
const auto * result_node = toFunctionNode(actions_dag, "parseDateTime64InJodaSyntaxOrNull", {expr_arg, fmt_node});
return convertNodeTypeIfNeeded(substrait_func, result_node, actions_dag);
}
else
{
const auto * result_node = toFunctionNode(actions_dag, "parseDateTime64InJodaSyntaxOrNull", {expr_arg, fmt_arg});
return convertNodeTypeIfNeeded(substrait_func, result_node, actions_dag);
}
}
};
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

#include <DataTypes/DataTypeNullable.h>
#include <Parser/FunctionParser.h>

#include <Parser/scalar_function_parser/getTimestamp.h>

namespace DB
{
Expand All @@ -34,10 +34,10 @@ namespace local_engine
{

template<typename Name>
class FunctionParserUnixTimestamp : public FunctionParser
class FunctionParserUnixTimestamp : public FunctionParserGetTimestamp
{
public:
explicit FunctionParserUnixTimestamp(ParserContextPtr parser_context_) : FunctionParser(parser_context_) {}
explicit FunctionParserUnixTimestamp(ParserContextPtr parser_context_) : FunctionParserGetTimestamp(parser_context_) {}
~FunctionParserUnixTimestamp() override = default;

static constexpr auto name = Name::name;
Expand All @@ -60,13 +60,13 @@ class FunctionParserUnixTimestamp : public FunctionParser
const auto * expr_arg = parsed_args[0];
const auto * fmt_arg = parsed_args[1];
auto expr_type = removeNullable(expr_arg->result_type);
if (isString(expr_type))
return FunctionParserGetTimestamp::parse(substrait_func, actions_dag);

const DateLUTImpl * date_lut = &DateLUT::instance();
const auto * time_zone_node = addColumnToActionsDAG(actions_dag, std::make_shared<DataTypeString>(), date_lut->getTimeZone());

const DB::ActionsDAG::Node * result_node = nullptr;
if (isString(expr_type))
result_node = toFunctionNode(actions_dag, "parseDateTime64InJodaSyntaxOrNull", {expr_arg, fmt_arg, time_zone_node});
else if (isDateOrDate32(expr_type))
if (isDateOrDate32(expr_type))
result_node = toFunctionNode(actions_dag, "sparkDateToUnixTimestamp", {expr_arg, time_zone_node});
else if (isDateTime(expr_type) || isDateTime64(expr_type))
result_node = toFunctionNode(actions_dag, "toUnixTimestamp", {expr_arg, time_zone_node});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -797,7 +797,8 @@ object GlutenConfig {
SPARK_OFFHEAP_ENABLED,
SESSION_LOCAL_TIMEZONE.key,
DECIMAL_OPERATIONS_ALLOW_PREC_LOSS.key,
SPARK_REDACTION_REGEX
SPARK_REDACTION_REGEX,
LEGACY_TIME_PARSER_POLICY.key
)
nativeConfMap.putAll(conf.filter(e => keys.contains(e._1)).asJava)

Expand Down
Loading