diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHSuite.scala index a56f45d1ba3d..8dc178e46ce5 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHSuite.scala @@ -563,5 +563,12 @@ class GlutenClickHouseTPCHSuite extends GlutenClickHouseTPCHAbstractSuite { compareResultsAgainstVanillaSpark(sql, true, { _ => }) spark.sql("drop table t1") } + + test("GLUTEN-7780 fix split diff") { + val sql = "select split(concat('a|b|c', cast(id as string)), '\\|')" + + ", split(concat('a|b|c', cast(id as string)), '\\\\|')" + + ", split(concat('a|b|c', cast(id as string)), '|') from range(10)" + compareResultsAgainstVanillaSpark(sql, true, { _ => }) + } } // scalastyle:off line.size.limit diff --git a/cpp-ch/local-engine/Functions/SparkFunctionSplitByRegexp.cpp b/cpp-ch/local-engine/Functions/SparkFunctionSplitByRegexp.cpp new file mode 100644 index 000000000000..66f37c62033f --- /dev/null +++ b/cpp-ch/local-engine/Functions/SparkFunctionSplitByRegexp.cpp @@ -0,0 +1,239 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <Columns/ColumnConst.h>
+#include <Columns/ColumnString.h>
+#include <DataTypes/DataTypeString.h>
+#include <Functions/FunctionFactory.h>
+#include <Functions/FunctionHelpers.h>
+#include <Functions/FunctionTokens.h>
+#include <Functions/IFunctionAdaptors.h>
+#include <Functions/Regexps.h>
+#include <base/map.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int ILLEGAL_COLUMN;
+}
+
+
+/** Functions that split strings into an array of strings or vice versa.
+  *
+  * splitByRegexp(regexp, s[, max_substrings])
+  */
+namespace
+{
+
+using Pos = const char *;
+
+class SparkSplitByRegexpImpl
+{
+private:
+    Regexps::RegexpPtr re;
+    OptimizedRegularExpression::MatchVec matches;
+
+    Pos pos;
+    Pos end;
+
+    std::optional<size_t> max_splits;
+    size_t splits;
+    bool max_substrings_includes_remaining_string;
+
+public:
+    static constexpr auto name = "splitByRegexpSpark";
+
+    static bool isVariadic() { return true; }
+    static size_t getNumberOfArguments() { return 0; }
+
+    static ColumnNumbers getArgumentsThatAreAlwaysConstant() { return {0, 2}; }
+
+    static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments)
+    {
+        checkArgumentsWithSeparatorAndOptionalMaxSubstrings(func, arguments);
+    }
+
+    static constexpr auto strings_argument_position = 1uz;
+
+    void init(const ColumnsWithTypeAndName & arguments, bool max_substrings_includes_remaining_string_)
+    {
+        const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get());
+
+        if (!col)
+            throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}. "
+                "Must be constant string.", arguments[0].column->getName(), name);
+
+        if (!col->getValue<String>().empty())
+            re = std::make_shared<Regexps::Regexp>(Regexps::createRegexp<false, false, false>(col->getValue<String>()));
+
+        max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_;
+        max_splits = extractMaxSplits(arguments, 2);
+    }
+
+    /// Called for each next string.
+    void set(Pos pos_, Pos end_)
+    {
+        pos = pos_;
+        end = end_;
+        splits = 0;
+    }
+
+    /// Get the next token, if any, or return false.
bool get(Pos & token_begin, Pos & token_end)
+    {
+        if (!re)
+        {
+            if (pos == end)
+                return false;
+
+            token_begin = pos;
+
+            if (max_splits)
+            {
+                if (max_substrings_includes_remaining_string)
+                {
+                    if (splits == *max_splits - 1)
+                    {
+                        token_end = end;
+                        pos = end;
+                        return true;
+                    }
+                }
+                else
+                    if (splits == *max_splits)
+                        return false;
+            }
+
+            ++pos;
+            token_end = pos;
+            ++splits;
+        }
+        else
+        {
+            if (!pos || pos > end)
+                return false;
+
+            token_begin = pos;
+
+            if (max_splits)
+            {
+                if (max_substrings_includes_remaining_string)
+                {
+                    if (splits == *max_splits - 1)
+                    {
+                        token_end = end;
+                        pos = nullptr;
+                        return true;
+                    }
+                }
+                else
+                    if (splits == *max_splits)
+                        return false;
+            }
+
+            auto res = re->match(pos, end - pos, matches);
+            if (!res)
+            {
+                token_end = end;
+                pos = end + 1;
+            }
+            else if (!matches[0].length)
+            {
+                /// If match part is empty, increment position to avoid infinite loop.
+                token_end = (pos == end ? end : pos + 1);
+                ++pos;
+                ++splits;
+            }
+            else
+            {
+                token_end = pos + matches[0].offset;
+                pos = token_end + matches[0].length;
+                ++splits;
+            }
+        }
+
+        return true;
+    }
+};
+
+using SparkFunctionSplitByRegexp = FunctionTokens<SparkSplitByRegexpImpl>;
+
+/// Fallback splitByRegexp to splitByChar when its 1st argument is a trivial char for better performance
+class SparkSplitByRegexpOverloadResolver : public IFunctionOverloadResolver
+{
+public:
+    static constexpr auto name = "splitByRegexpSpark";
+    static FunctionOverloadResolverPtr create(ContextPtr context) { return std::make_unique<SparkSplitByRegexpOverloadResolver>(context); }
+
+    explicit SparkSplitByRegexpOverloadResolver(ContextPtr context_)
+        : context(context_)
+        , split_by_regexp(SparkFunctionSplitByRegexp::create(context)) {}
+
+    String getName() const override { return name; }
+    size_t getNumberOfArguments() const override { return SparkSplitByRegexpImpl::getNumberOfArguments(); }
+    bool isVariadic() const override { return SparkSplitByRegexpImpl::isVariadic(); }
+
+    FunctionBasePtr buildImpl(const ColumnsWithTypeAndName & arguments, const
DataTypePtr & return_type) const override
+    {
+        if (patternIsTrivialChar(arguments))
+            return FunctionFactory::instance().getImpl("splitByChar", context)->build(arguments);
+        return std::make_unique<FunctionToFunctionBaseAdaptor>(
+            split_by_regexp, collections::map(arguments, [](const auto & elem) { return elem.type; }), return_type);
+    }
+
+    DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
+    {
+        return split_by_regexp->getReturnTypeImpl(arguments);
+    }
+
+private:
+    bool patternIsTrivialChar(const ColumnsWithTypeAndName & arguments) const
+    {
+        if (!arguments[0].column.get())
+            return false;
+        const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get());
+        if (!col)
+            return false;
+
+        String pattern = col->getValue<String>();
+        if (pattern.size() == 1)
+        {
+            OptimizedRegularExpression re = Regexps::createRegexp<false, false, false>(pattern);
+
+            std::string required_substring;
+            bool is_trivial;
+            bool required_substring_is_prefix;
+            re.getAnalyzeResult(required_substring, is_trivial, required_substring_is_prefix);
+            return is_trivial && required_substring == pattern;
+        }
+        return false;
+    }
+
+    ContextPtr context;
+    FunctionPtr split_by_regexp;
+};
+}
+
+REGISTER_FUNCTION(SparkSplitByRegexp)
+{
+    factory.registerFunction<SparkSplitByRegexpOverloadResolver>();
+}
+
+}
diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/split.cpp b/cpp-ch/local-engine/Parser/scalar_function_parser/split.cpp
index ed17c27eade9..3ffd64decb92 100644
--- a/cpp-ch/local-engine/Parser/scalar_function_parser/split.cpp
+++ b/cpp-ch/local-engine/Parser/scalar_function_parser/split.cpp
@@ -19,14 +19,14 @@
 namespace local_engine
 {
-class SparkFunctionSplitParser : public FunctionParser
+class FunctionSplitParser : public FunctionParser
 {
 public:
-    SparkFunctionSplitParser(ParserContextPtr parser_context_) : FunctionParser(parser_context_) {}
-    ~SparkFunctionSplitParser() override = default;
+    FunctionSplitParser(ParserContextPtr parser_context_) : FunctionParser(parser_context_) {}
+
~FunctionSplitParser() override = default;
     static constexpr auto name = "split";
     String getName() const override { return name; }
-    String getCHFunctionName(const substrait::Expression_ScalarFunction &) const override { return "splitByRegexp"; }
+    String getCHFunctionName(const substrait::Expression_ScalarFunction &) const override { return "splitByRegexpSpark"; }
 
     const DB::ActionsDAG::Node * parse(const substrait::Expression_ScalarFunction & substrait_func, DB::ActionsDAG & actions_dag) const override
     {
@@ -35,7 +35,7 @@ class SparkFunctionSplitParser : public FunctionParser
         for (const auto & arg : args)
             parsed_args.emplace_back(parseExpression(actions_dag, arg.value()));
         /// In Spark: split(str, regex [, limit] )
-        /// In CH: splitByRegexp(regexp, str [, limit])
+        /// In CH: splitByRegexpSpark(regexp, str [, limit])
         if (parsed_args.size() >= 2)
             std::swap(parsed_args[0], parsed_args[1]);
         auto ch_function_name = getCHFunctionName(substrait_func);
@@ -43,6 +43,6 @@ class SparkFunctionSplitParser : public FunctionParser
         return convertNodeTypeIfNeeded(substrait_func, func_node, actions_dag);
     }
 };
-static FunctionParserRegister<SparkFunctionSplitParser> register_split;
+static FunctionParserRegister<FunctionSplitParser> register_split;
 
 }
diff --git a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
index 27e26606f653..50110f15d457 100644
--- a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
+++ b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
@@ -844,8 +844,6 @@ class ClickHouseTestSettings extends BackendTestSettings {
       .exclude("SPARK-32110: compare special double/float values in struct")
     enableSuite[GlutenRandomSuite].exclude("random").exclude("SPARK-9127 codegen with long seed")
     enableSuite[GlutenRegexpExpressionsSuite]
-      .exclude("LIKE ALL")
-      .exclude("LIKE ANY")
.exclude("LIKE Pattern") .exclude("LIKE Pattern ESCAPE '/'") .exclude("LIKE Pattern ESCAPE '#'") @@ -854,8 +852,6 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("RegexReplace") .exclude("RegexExtract") .exclude("RegexExtractAll") - .exclude("SPLIT") - .exclude("SPARK-34814: LikeSimplification should handle NULL") enableSuite[GlutenSortOrderExpressionsSuite].exclude("SortPrefix") enableSuite[GlutenStringExpressionsSuite] .exclude("StringComparison") diff --git a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index da950e2fc1ee..9b3b090e326d 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -817,8 +817,6 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("SPARK-32110: compare special double/float values in struct") enableSuite[GlutenRandomSuite].exclude("random").exclude("SPARK-9127 codegen with long seed") enableSuite[GlutenRegexpExpressionsSuite] - .exclude("LIKE ALL") - .exclude("LIKE ANY") .exclude("LIKE Pattern") .exclude("LIKE Pattern ESCAPE '/'") .exclude("LIKE Pattern ESCAPE '#'") @@ -827,8 +825,6 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("RegexReplace") .exclude("RegexExtract") .exclude("RegexExtractAll") - .exclude("SPLIT") - .exclude("SPARK - 34814: LikeSimplification should handleNULL") enableSuite[GlutenSortOrderExpressionsSuite].exclude("SortPrefix") enableSuite[GlutenStringExpressionsSuite] .exclude("StringComparison") diff --git a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index ac08fc5a80cc..e91f1495fbe9 100644 --- 
a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -740,8 +740,6 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("SPARK-32110: compare special double/float values in struct") enableSuite[GlutenRandomSuite].exclude("random").exclude("SPARK-9127 codegen with long seed") enableSuite[GlutenRegexpExpressionsSuite] - .exclude("LIKE ALL") - .exclude("LIKE ANY") .exclude("LIKE Pattern") .exclude("LIKE Pattern ESCAPE '/'") .exclude("LIKE Pattern ESCAPE '#'") @@ -750,8 +748,6 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("RegexReplace") .exclude("RegexExtract") .exclude("RegexExtractAll") - .exclude("SPLIT") - .exclude("SPARK - 34814: LikeSimplification should handleNULL") enableSuite[GlutenSortOrderExpressionsSuite].exclude("SortPrefix") enableSuite[GlutenStringExpressionsSuite] .exclude("StringComparison") diff --git a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index 9e4c81081de1..f0637839a762 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -740,8 +740,6 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("SPARK-32110: compare special double/float values in struct") enableSuite[GlutenRandomSuite].exclude("random").exclude("SPARK-9127 codegen with long seed") enableSuite[GlutenRegexpExpressionsSuite] - .exclude("LIKE ALL") - .exclude("LIKE ANY") .exclude("LIKE Pattern") .exclude("LIKE Pattern ESCAPE '/'") .exclude("LIKE Pattern ESCAPE '#'") @@ -750,8 +748,6 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("RegexReplace") 
.exclude("RegexExtract") .exclude("RegexExtractAll") - .exclude("SPLIT") - .exclude("SPARK - 34814: LikeSimplification should handleNULL") enableSuite[GlutenSortOrderExpressionsSuite].exclude("SortPrefix") enableSuite[GlutenStringExpressionsSuite] .exclude("StringComparison")