From 69ffdffdeda94343e5e30e7adfe7e913d7b1bf9b Mon Sep 17 00:00:00 2001
From: "Ma, Rong"
Date: Tue, 21 Nov 2023 09:25:51 +0800
Subject: [PATCH] Make SQL query test lists Spark-version-aware and add Spark
 3.3 lists for Velox

getSupportedSQLQueryTests and getOverwriteSQLQueryTests now take a
TestApi.SparkVersion argument so each backend can supply per-version
lists. The Velox backend keeps its existing Spark 3.2 lists and gains
equivalent Spark 3.3 lists; the overwritten group-by inputs and golden
files are copied into gluten-ut/spark33.

---
 .../backendsapi/clickhouse/CHTestApi.scala    |    6 +-
 .../backendsapi/velox/TestApiImpl.scala       |  212 +++-
 .../glutenproject/backendsapi/TestApi.scala   |   10 +-
 .../spark/sql/GlutenSQLQueryTestSuite.scala   |    7 +-
 .../resources/sql-tests/inputs/group-by.sql   |  293 +++++
 .../sql-tests/inputs/udf/udf-group-by.sql     |  152 +++
 .../sql-tests/results/group-by.sql.out        | 1021 +++++++++++++++++
 .../results/udf/udf-group-by.sql.out          |  514 +++++++++
 .../spark/sql/GlutenSQLQueryTestSuite.scala   |  259 +----
 9 files changed, 2241 insertions(+), 233 deletions(-)
 create mode 100644 gluten-ut/spark33/src/test/resources/sql-tests/inputs/group-by.sql
 create mode 100644 gluten-ut/spark33/src/test/resources/sql-tests/inputs/udf/udf-group-by.sql
 create mode 100644 gluten-ut/spark33/src/test/resources/sql-tests/results/group-by.sql.out
 create mode 100644 gluten-ut/spark33/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out

diff --git a/backends-clickhouse/src/main/scala/io/glutenproject/backendsapi/clickhouse/CHTestApi.scala b/backends-clickhouse/src/main/scala/io/glutenproject/backendsapi/clickhouse/CHTestApi.scala
index 910eb91bf3b42..d141eb3b0f9bd 100644
--- a/backends-clickhouse/src/main/scala/io/glutenproject/backendsapi/clickhouse/CHTestApi.scala
+++ b/backends-clickhouse/src/main/scala/io/glutenproject/backendsapi/clickhouse/CHTestApi.scala
@@ -19,7 +19,9 @@ package io.glutenproject.backendsapi.clickhouse
 import io.glutenproject.backendsapi.TestApi
 
 class CHTestApi extends TestApi {
-  override def getSupportedSQLQueryTests: Set[String] = Set[String]()
+  override def getSupportedSQLQueryTests(sparkVersion: TestApi.SparkVersion): Set[String] =
+    Set[String]()
 
-  override def getOverwriteSQLQueryTests: Set[String] = Set[String]()
+  override def getOverwriteSQLQueryTests(sparkVersion: TestApi.SparkVersion): Set[String] =
+    Set[String]()
 }
diff --git a/backends-velox/src/main/scala/io/glutenproject/backendsapi/velox/TestApiImpl.scala b/backends-velox/src/main/scala/io/glutenproject/backendsapi/velox/TestApiImpl.scala
index 40f6d8b1a560f..ec234f7d623dd 100644
--- a/backends-velox/src/main/scala/io/glutenproject/backendsapi/velox/TestApiImpl.scala
+++ b/backends-velox/src/main/scala/io/glutenproject/backendsapi/velox/TestApiImpl.scala
@@ -17,6 +17,7 @@
 package io.glutenproject.backendsapi.velox
 
 import io.glutenproject.backendsapi.TestApi
+import io.glutenproject.backendsapi.TestApi.{SparkVersion, SparkVersion32, SparkVersion33}
 
 class TestApiImpl extends TestApi {
 
@@ -24,7 +25,25 @@
   /**
    * List of supported cases to run with Velox backend, in lower case. Please add to the supported
    * list after enabling a sql test.
*/ - override def getSupportedSQLQueryTests: Set[String] = { + override def getSupportedSQLQueryTests(sparkVersion: SparkVersion): Set[String] = { + sparkVersion match { + case SparkVersion32() => + SUPPORTED_SQL_QUERY_LIST_SPARK32 + case SparkVersion33() => + SUPPORTED_SQL_QUERY_LIST_SPARK33 + } + } + + override def getOverwriteSQLQueryTests(sparkVersion: SparkVersion): Set[String] = { + sparkVersion match { + case SparkVersion32() => + OVERWRITE_SQL_QUERY_LIST_SPARK32 + case SparkVersion33() => + OVERWRITE_SQL_QUERY_LIST_SPARK33 + } + } + + private val SUPPORTED_SQL_QUERY_LIST_SPARK32: Set[String] = Set( "bitwise.sql", "cast.sql", @@ -201,13 +220,196 @@ class TestApiImpl extends TestApi { "udf/udf-union.sql", "udf/udf-window.sql" ) - } - override def getOverwriteSQLQueryTests: Set[String] = Set[String]( + private val OVERWRITE_SQL_QUERY_LIST_SPARK32: Set[String] = Set( + // Velox corr has better computation logic but it fails Spark's precision check. + // Remove -- SPARK-24369 multiple distinct aggregations having the same argument set + "group-by.sql", + // Remove -- SPARK-24369 multiple distinct aggregations having the same argument set + "udf/udf-group-by.sql" + ) + + private val SUPPORTED_SQL_QUERY_LIST_SPARK33: Set[String] = Set( + "bitwise.sql", + "cast.sql", + "change-column.sql", + "charvarchar.sql", + "columnresolution-negative.sql", + "columnresolution-views.sql", + "columnresolution.sql", + "comments.sql", + "comparator.sql", + "count.sql", + "cross-join.sql", + "csv-functions.sql", + "cte-legacy.sql", + "cte-nested.sql", + "cte-nonlegacy.sql", + "cte.sql", + "current_database_catalog.sql", + "date.sql", + "datetime-formatting-invalid.sql", + // Velox had different handling for some illegal cases. + // "datetime-formatting-legacy.sql", + // "datetime-formatting.sql", + "datetime-legacy.sql", + "datetime-parsing-invalid.sql", + "datetime-parsing-legacy.sql", + "datetime-parsing.sql", + "datetime-special.sql", + "decimalArithmeticOperations.sql", + "describe-part-after-analyze.sql", + "describe-query.sql", + "describe-table-after-alter-table.sql", + "describe-table-column.sql", + "describe.sql", + "except-all.sql", + "except.sql", + "extract.sql", + "group-by-filter.sql", + "group-by-ordinal.sql", + "grouping_set.sql", + "having.sql", + "ignored.sql", + "inline-table.sql", + "inner-join.sql", + "intersect-all.sql", + "interval.sql", + "join-empty-relation.sql", + "join-lateral.sql", + "json-functions.sql", + "like-all.sql", + "like-any.sql", + "limit.sql", + "literals.sql", + "map.sql", + "misc-functions.sql", + "natural-join.sql", + "null-handling.sql", + "null-propagation.sql", + "operators.sql", + "order-by-nulls-ordering.sql", + "order-by-ordinal.sql", + "outer-join.sql", + "parse-schema-string.sql", + "pivot.sql", + "pred-pushdown.sql", + "predicate-functions.sql", + "query_regex_column.sql", + "random.sql", + "regexp-functions.sql", + "show-create-table.sql", + "show-tables.sql", + "show-tblproperties.sql", + "show-views.sql", + "show_columns.sql", + "sql-compatibility-functions.sql", + "string-functions.sql", + "struct.sql", + "subexp-elimination.sql", + "table-aliases.sql", + "table-valued-functions.sql", + "tablesample-negative.sql", + "subquery/exists-subquery/exists-aggregate.sql", + "subquery/exists-subquery/exists-basic.sql", + "subquery/exists-subquery/exists-cte.sql", + "subquery/exists-subquery/exists-having.sql", + "subquery/exists-subquery/exists-joins-and-set-ops.sql", + "subquery/exists-subquery/exists-orderby-limit.sql", + 
"subquery/exists-subquery/exists-within-and-or.sql", + "subquery/in-subquery/in-basic.sql", + "subquery/in-subquery/in-group-by.sql", + "subquery/in-subquery/in-having.sql", + "subquery/in-subquery/in-joins.sql", + "subquery/in-subquery/in-limit.sql", + "subquery/in-subquery/in-multiple-columns.sql", + "subquery/in-subquery/in-order-by.sql", + "subquery/in-subquery/in-set-operations.sql", + "subquery/in-subquery/in-with-cte.sql", + "subquery/in-subquery/nested-not-in.sql", + "subquery/in-subquery/not-in-group-by.sql", + "subquery/in-subquery/not-in-joins.sql", + "subquery/in-subquery/not-in-unit-tests-multi-column.sql", + "subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql", + "subquery/in-subquery/not-in-unit-tests-single-column.sql", + "subquery/in-subquery/not-in-unit-tests-single-column-literal.sql", + "subquery/in-subquery/simple-in.sql", + "subquery/negative-cases/invalid-correlation.sql", + "subquery/negative-cases/subq-input-typecheck.sql", + "subquery/scalar-subquery/scalar-subquery-predicate.sql", + "subquery/scalar-subquery/scalar-subquery-select.sql", + "subquery/subquery-in-from.sql", + "postgreSQL/aggregates_part1.sql", + "postgreSQL/aggregates_part2.sql", + "postgreSQL/aggregates_part3.sql", + "postgreSQL/aggregates_part4.sql", + "postgreSQL/boolean.sql", + "postgreSQL/case.sql", + "postgreSQL/comments.sql", + "postgreSQL/create_view.sql", + "postgreSQL/date.sql", + "postgreSQL/float4.sql", + "postgreSQL/insert.sql", + "postgreSQL/int2.sql", + "postgreSQL/int4.sql", + "postgreSQL/int8.sql", + "postgreSQL/interval.sql", + "postgreSQL/join.sql", + "postgreSQL/limit.sql", + "postgreSQL/numeric.sql", + "postgreSQL/select.sql", + "postgreSQL/select_distinct.sql", + "postgreSQL/select_having.sql", + "postgreSQL/select_implicit.sql", + "postgreSQL/strings.sql", + "postgreSQL/text.sql", + "postgreSQL/timestamp.sql", + "postgreSQL/union.sql", + "postgreSQL/window_part1.sql", + "postgreSQL/window_part2.sql", + "postgreSQL/window_part3.sql", + "postgreSQL/window_part4.sql", + "postgreSQL/with.sql", + "datetime-special.sql", + "timestamp-ansi.sql", + "timestamp.sql", + "arrayJoin.sql", + "binaryComparison.sql", + "booleanEquality.sql", + "caseWhenCoercion.sql", + "concat.sql", + "dateTimeOperations.sql", + "decimalPrecision.sql", + "division.sql", + "elt.sql", + "ifCoercion.sql", + "implicitTypeCasts.sql", + "inConversion.sql", + "mapZipWith.sql", + "mapconcat.sql", + "promoteStrings.sql", + "stringCastAndExpressions.sql", + "widenSetOperationTypes.sql", + "windowFrameCoercion.sql", + "timestamp-ltz.sql", + "timestamp-ntz.sql", + "timezone.sql", + "transform.sql", + "try_arithmetic.sql", + "try_cast.sql", + "udaf.sql", + "union.sql", + "using-join.sql", + "window.sql", + "udf-union.sql", + "udf-window.sql" + ) + + private val OVERWRITE_SQL_QUERY_LIST_SPARK33: Set[String] = Set( // Velox corr has better computation logic but it fails Spark's precision check. 
- // Remove corr in group-by.sql + // Remove -- SPARK-24369 multiple distinct aggregations having the same argument set "group-by.sql", - // Remove corr in udf/udf-group-by.sql + // Remove -- SPARK-24369 multiple distinct aggregations having the same argument set "udf/udf-group-by.sql" ) } diff --git a/gluten-core/src/main/scala/io/glutenproject/backendsapi/TestApi.scala b/gluten-core/src/main/scala/io/glutenproject/backendsapi/TestApi.scala index 8866167046387..bb41b649d8bc8 100644 --- a/gluten-core/src/main/scala/io/glutenproject/backendsapi/TestApi.scala +++ b/gluten-core/src/main/scala/io/glutenproject/backendsapi/TestApi.scala @@ -17,7 +17,13 @@ package io.glutenproject.backendsapi trait TestApi { - def getSupportedSQLQueryTests: Set[String] + def getSupportedSQLQueryTests(sparkVersion: TestApi.SparkVersion): Set[String] - def getOverwriteSQLQueryTests: Set[String] + def getOverwriteSQLQueryTests(sparkVersion: TestApi.SparkVersion): Set[String] +} + +object TestApi { + trait SparkVersion + case class SparkVersion32() extends SparkVersion + case class SparkVersion33() extends SparkVersion } diff --git a/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenSQLQueryTestSuite.scala b/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenSQLQueryTestSuite.scala index 1a48ad1b381eb..cfeb11b3a189e 100644 --- a/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenSQLQueryTestSuite.scala +++ b/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenSQLQueryTestSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql import io.glutenproject.GlutenConfig -import io.glutenproject.backendsapi.BackendsApiManager +import io.glutenproject.backendsapi.{BackendsApiManager, TestApi} import io.glutenproject.utils.{BackendTestUtils, SystemParameters} import org.apache.spark.SparkConf @@ -158,7 +158,6 @@ class GlutenSQLQueryTestSuite protected val overwriteResourcePath = getClass.getResource("/").getPath + "../../../src/test/resources/sql-tests" - protected val overwriteInputFilePath = new File(overwriteResourcePath, "inputs").getAbsolutePath protected val overwriteGoldenFilePath = new File(overwriteResourcePath, "results").getAbsolutePath @@ -225,8 +224,8 @@ class GlutenSQLQueryTestSuite // List of supported cases to run with a certain backend, in lower case. private val supportedList: Set[String] = - BackendsApiManager.getTestApiInstance.getSupportedSQLQueryTests ++ - BackendsApiManager.getTestApiInstance.getOverwriteSQLQueryTests + BackendsApiManager.getTestApiInstance.getSupportedSQLQueryTests(TestApi.SparkVersion32()) ++ + BackendsApiManager.getTestApiInstance.getOverwriteSQLQueryTests(TestApi.SparkVersion32()) // Create all the test cases. listTestCases.foreach(createScalaTestCase) diff --git a/gluten-ut/spark33/src/test/resources/sql-tests/inputs/group-by.sql b/gluten-ut/spark33/src/test/resources/sql-tests/inputs/group-by.sql new file mode 100644 index 0000000000000..52af0b7b5e553 --- /dev/null +++ b/gluten-ut/spark33/src/test/resources/sql-tests/inputs/group-by.sql @@ -0,0 +1,293 @@ +-- Test aggregate operator with codegen on and off. +--CONFIG_DIM1 spark.sql.codegen.wholeStage=true +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=CODEGEN_ONLY +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=NO_CODEGEN + +-- Test data. 
+CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES +(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, null) +AS testData(a, b); +CREATE OR REPLACE TEMPORARY VIEW testRegression AS SELECT * FROM VALUES +(1, 10, null), (2, 10, 11), (2, 20, 22), (2, 25, null), (2, 30, 35) +AS testRegression(k, y, x); +CREATE OR REPLACE TEMPORARY VIEW aggr AS SELECT * FROM VALUES +(0, 0), (0, 10), (0, 20), (0, 30), (0, 40), (1, 10), (1, 20), (2, 10), (2, 20), (2, 25), (2, 30), (3, 60), (4, null) +AS aggr(k, v); + +-- Aggregate with empty GroupBy expressions. +SELECT a, COUNT(b) FROM testData; +SELECT COUNT(a), COUNT(b) FROM testData; + +-- Aggregate with non-empty GroupBy expressions. +SELECT a, COUNT(b) FROM testData GROUP BY a; +SELECT a, COUNT(b) FROM testData GROUP BY b; +SELECT COUNT(a), COUNT(b) FROM testData GROUP BY a; + +-- Aggregate grouped by literals. +SELECT 'foo', COUNT(a) FROM testData GROUP BY 1; + +-- Aggregate grouped by literals (whole stage code generation). +SELECT 'foo' FROM testData WHERE a = 0 GROUP BY 1; + +-- Aggregate grouped by literals (hash aggregate). +SELECT 'foo', APPROX_COUNT_DISTINCT(a) FROM testData WHERE a = 0 GROUP BY 1; + +-- Aggregate grouped by literals (sort aggregate). +SELECT 'foo', MAX(STRUCT(a)) FROM testData WHERE a = 0 GROUP BY 1; + +-- Aggregate with complex GroupBy expressions. +SELECT a + b, COUNT(b) FROM testData GROUP BY a + b; +SELECT a + 2, COUNT(b) FROM testData GROUP BY a + 1; +SELECT a + 1 + 1, COUNT(b) FROM testData GROUP BY a + 1; + +-- Aggregate with nulls. +SELECT SKEWNESS(a), KURTOSIS(a), MIN(a), MAX(a), AVG(a), VARIANCE(a), STDDEV(a), SUM(a), COUNT(a) +FROM testData; + +-- Aggregate with foldable input and multiple distinct groups. +SELECT COUNT(DISTINCT b), COUNT(DISTINCT b, c) FROM (SELECT 1 AS a, 2 AS b, 3 AS c) GROUP BY a; + +-- Aliases in SELECT could be used in GROUP BY +SELECT a AS k, COUNT(b) FROM testData GROUP BY k; +SELECT a AS k, COUNT(b) FROM testData GROUP BY k HAVING k > 1; + +-- GROUP BY alias with invalid col in SELECT list +SELECT a AS k, COUNT(non_existing) FROM testData GROUP BY k; + +-- Aggregate functions cannot be used in GROUP BY +SELECT COUNT(b) AS k FROM testData GROUP BY k; + +-- Test data. +CREATE OR REPLACE TEMPORARY VIEW testDataHasSameNameWithAlias AS SELECT * FROM VALUES +(1, 1, 3), (1, 2, 1) AS testDataHasSameNameWithAlias(k, a, v); +SELECT k AS a, COUNT(v) FROM testDataHasSameNameWithAlias GROUP BY a; + +-- turn off group by aliases +set spark.sql.groupByAliases=false; + +-- Check analysis exceptions +SELECT a AS k, COUNT(b) FROM testData GROUP BY k; + +-- Aggregate with empty input and non-empty GroupBy expressions. +SELECT a, COUNT(1) FROM testData WHERE false GROUP BY a; + +-- Aggregate with empty input and empty GroupBy expressions. 
+SELECT COUNT(1) FROM testData WHERE false; +SELECT 1 FROM (SELECT COUNT(1) FROM testData WHERE false) t; + +-- Aggregate with empty GroupBy expressions and filter on top +SELECT 1 from ( + SELECT 1 AS z, + MIN(a.x) + FROM (select 1 as x) a + WHERE false +) b +where b.z != b.z; + +-- SPARK-25708 HAVING without GROUP BY means global aggregate +SELECT 1 FROM range(10) HAVING true; + +SELECT 1 FROM range(10) HAVING MAX(id) > 0; + +SELECT id FROM range(10) HAVING id > 0; + +SET spark.sql.legacy.parser.havingWithoutGroupByAsWhere=true; + +SELECT 1 FROM range(10) HAVING true; + +SELECT 1 FROM range(10) HAVING MAX(id) > 0; + +SELECT id FROM range(10) HAVING id > 0; + +SET spark.sql.legacy.parser.havingWithoutGroupByAsWhere=false; + +-- Test data +CREATE OR REPLACE TEMPORARY VIEW test_agg AS SELECT * FROM VALUES + (1, true), (1, false), + (2, true), + (3, false), (3, null), + (4, null), (4, null), + (5, null), (5, true), (5, false) AS test_agg(k, v); + +-- empty table +SELECT every(v), some(v), any(v), bool_and(v), bool_or(v) FROM test_agg WHERE 1 = 0; + +-- all null values +SELECT every(v), some(v), any(v), bool_and(v), bool_or(v) FROM test_agg WHERE k = 4; + +-- aggregates are null Filtering +SELECT every(v), some(v), any(v), bool_and(v), bool_or(v) FROM test_agg WHERE k = 5; + +-- group by +SELECT k, every(v), some(v), any(v), bool_and(v), bool_or(v) FROM test_agg GROUP BY k; + +-- having +SELECT k, every(v) FROM test_agg GROUP BY k HAVING every(v) = false; +SELECT k, every(v) FROM test_agg GROUP BY k HAVING every(v) IS NULL; + +-- basic subquery path to make sure rewrite happens in both parent and child plans. +SELECT k, + Every(v) AS every +FROM test_agg +WHERE k = 2 + AND v IN (SELECT Any(v) + FROM test_agg + WHERE k = 1) +GROUP BY k; + +-- basic subquery path to make sure rewrite happens in both parent and child plans. +SELECT k, + Every(v) AS every +FROM test_agg +WHERE k = 2 + AND v IN (SELECT Every(v) + FROM test_agg + WHERE k = 1) +GROUP BY k; + +-- input type checking Int +SELECT every(1); + +-- input type checking Short +SELECT some(1S); + +-- input type checking Long +SELECT any(1L); + +-- input type checking String +SELECT every("true"); + +-- input type checking Decimal +SELECT bool_and(1.0); + +-- input type checking double +SELECT bool_or(1.0D); + +-- every/some/any aggregates/bool_and/bool_or are supported as windows expression. +SELECT k, v, every(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg; +SELECT k, v, some(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg; +SELECT k, v, any(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg; +SELECT k, v, bool_and(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg; +SELECT k, v, bool_or(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg; + +-- Having referencing aggregate expressions is ok. 
+SELECT count(*) FROM test_agg HAVING count(*) > 1L; +SELECT k, max(v) FROM test_agg GROUP BY k HAVING max(v) = true; + +-- Aggrgate expressions can be referenced through an alias +SELECT * FROM (SELECT COUNT(*) AS cnt FROM test_agg) WHERE cnt > 1L; + +-- Error when aggregate expressions are in where clause directly +SELECT count(*) FROM test_agg WHERE count(*) > 1L; +SELECT count(*) FROM test_agg WHERE count(*) + 1L > 1L; +SELECT count(*) FROM test_agg WHERE k = 1 or k = 2 or count(*) + 1L > 1L or max(k) > 1; + +-- Aggregate with multiple distinct decimal columns +SELECT AVG(DISTINCT decimal_col), SUM(DISTINCT decimal_col) FROM VALUES (CAST(1 AS DECIMAL(9, 0))) t(decimal_col); + +-- SPARK-34581: Don't optimize out grouping expressions from aggregate expressions without aggregate function +SELECT not(a IS NULL), count(*) AS c +FROM testData +GROUP BY a IS NULL; + +SELECT if(not(a IS NULL), rand(0), 1), count(*) AS c +FROM testData +GROUP BY a IS NULL; + + +-- Histogram aggregates with different numeric input types +SELECT + histogram_numeric(col, 2) as histogram_2, + histogram_numeric(col, 3) as histogram_3, + histogram_numeric(col, 5) as histogram_5, + histogram_numeric(col, 10) as histogram_10 +FROM VALUES + (1), (2), (3), (4), (5), (6), (7), (8), (9), (10), + (11), (12), (13), (14), (15), (16), (17), (18), (19), (20), + (21), (22), (23), (24), (25), (26), (27), (28), (29), (30), + (31), (32), (33), (34), (35), (3), (37), (38), (39), (40), + (41), (42), (43), (44), (45), (46), (47), (48), (49), (50) AS tab(col); +SELECT histogram_numeric(col, 3) FROM VALUES (1), (2), (3) AS tab(col); +SELECT histogram_numeric(col, 3) FROM VALUES (1L), (2L), (3L) AS tab(col); +SELECT histogram_numeric(col, 3) FROM VALUES (1F), (2F), (3F) AS tab(col); +SELECT histogram_numeric(col, 3) FROM VALUES (1D), (2D), (3D) AS tab(col); +SELECT histogram_numeric(col, 3) FROM VALUES (1S), (2S), (3S) AS tab(col); +SELECT histogram_numeric(col, 3) FROM VALUES + (CAST(1 AS BYTE)), (CAST(2 AS BYTE)), (CAST(3 AS BYTE)) AS tab(col); +SELECT histogram_numeric(col, 3) FROM VALUES + (CAST(1 AS TINYINT)), (CAST(2 AS TINYINT)), (CAST(3 AS TINYINT)) AS tab(col); +SELECT histogram_numeric(col, 3) FROM VALUES + (CAST(1 AS SMALLINT)), (CAST(2 AS SMALLINT)), (CAST(3 AS SMALLINT)) AS tab(col); +SELECT histogram_numeric(col, 3) FROM VALUES + (CAST(1 AS BIGINT)), (CAST(2 AS BIGINT)), (CAST(3 AS BIGINT)) AS tab(col); +SELECT histogram_numeric(col, 3) FROM VALUES (TIMESTAMP '2017-03-01 00:00:00'), + (TIMESTAMP '2017-04-01 00:00:00'), (TIMESTAMP '2017-05-01 00:00:00') AS tab(col); +SELECT histogram_numeric(col, 3) FROM VALUES (INTERVAL '100-00' YEAR TO MONTH), + (INTERVAL '110-00' YEAR TO MONTH), (INTERVAL '120-00' YEAR TO MONTH) AS tab(col); +SELECT histogram_numeric(col, 3) FROM VALUES (INTERVAL '12 20:4:0' DAY TO SECOND), + (INTERVAL '12 21:4:0' DAY TO SECOND), (INTERVAL '12 22:4:0' DAY TO SECOND) AS tab(col); +SELECT histogram_numeric(col, 3) +FROM VALUES (NULL), (NULL), (NULL) AS tab(col); +SELECT histogram_numeric(col, 3) +FROM VALUES (CAST(NULL AS DOUBLE)), (CAST(NULL AS DOUBLE)), (CAST(NULL AS DOUBLE)) AS tab(col); +SELECT histogram_numeric(col, 3) +FROM VALUES (CAST(NULL AS INT)), (CAST(NULL AS INT)), (CAST(NULL AS INT)) AS tab(col); + + +-- SPARK-37613: Support ANSI Aggregate Function: regr_count +SELECT regr_count(y, x) FROM testRegression; +SELECT regr_count(y, x) FROM testRegression WHERE x IS NOT NULL; +SELECT k, count(*), regr_count(y, x) FROM testRegression GROUP BY k; +SELECT k, count(*) FILTER (WHERE x IS NOT NULL), 
regr_count(y, x) FROM testRegression GROUP BY k; + +-- SPARK-37613: Support ANSI Aggregate Function: regr_r2 +SELECT regr_r2(y, x) FROM testRegression; +SELECT regr_r2(y, x) FROM testRegression WHERE x IS NOT NULL; +SELECT k, corr(y, x), regr_r2(y, x) FROM testRegression GROUP BY k; +SELECT k, corr(y, x) FILTER (WHERE x IS NOT NULL), regr_r2(y, x) FROM testRegression GROUP BY k; + +-- SPARK-27974: Support ANSI Aggregate Function: array_agg +SELECT + collect_list(col), + array_agg(col) +FROM VALUES + (1), (2), (1) AS tab(col); +SELECT + a, + collect_list(b), + array_agg(b) +FROM VALUES + (1,4),(2,3),(1,4),(2,4) AS v(a,b) +GROUP BY a; + +-- SPARK-37614: Support ANSI Aggregate Function: regr_avgx & regr_avgy +SELECT regr_avgx(y, x), regr_avgy(y, x) FROM testRegression; +SELECT regr_avgx(y, x), regr_avgy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL; +SELECT k, avg(x), avg(y), regr_avgx(y, x), regr_avgy(y, x) FROM testRegression GROUP BY k; +SELECT k, avg(x) FILTER (WHERE x IS NOT NULL AND y IS NOT NULL), avg(y) FILTER (WHERE x IS NOT NULL AND y IS NOT NULL), regr_avgx(y, x), regr_avgy(y, x) FROM testRegression GROUP BY k; + +-- SPARK-37676: Support ANSI Aggregation Function: percentile_cont +SELECT + percentile_cont(0.25) WITHIN GROUP (ORDER BY v), + percentile_cont(0.25) WITHIN GROUP (ORDER BY v DESC) +FROM aggr; +SELECT + k, + percentile_cont(0.25) WITHIN GROUP (ORDER BY v), + percentile_cont(0.25) WITHIN GROUP (ORDER BY v DESC) +FROM aggr +GROUP BY k +ORDER BY k; + +-- SPARK-37691: Support ANSI Aggregation Function: percentile_disc +SELECT + percentile_disc(0.25) WITHIN GROUP (ORDER BY v), + percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC) +FROM aggr; +SELECT + k, + percentile_disc(0.25) WITHIN GROUP (ORDER BY v), + percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC) +FROM aggr +GROUP BY k +ORDER BY k; diff --git a/gluten-ut/spark33/src/test/resources/sql-tests/inputs/udf/udf-group-by.sql b/gluten-ut/spark33/src/test/resources/sql-tests/inputs/udf/udf-group-by.sql new file mode 100644 index 0000000000000..a4df72f44ebfc --- /dev/null +++ b/gluten-ut/spark33/src/test/resources/sql-tests/inputs/udf/udf-group-by.sql @@ -0,0 +1,152 @@ +-- This test file was converted from group-by.sql. +-- Test data. +CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES +(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, null) +AS testData(a, b); + +-- Aggregate with empty GroupBy expressions. +SELECT udf(a), udf(COUNT(b)) FROM testData; +SELECT COUNT(udf(a)), udf(COUNT(b)) FROM testData; + +-- Aggregate with non-empty GroupBy expressions. +SELECT udf(a), COUNT(udf(b)) FROM testData GROUP BY a; +SELECT udf(a), udf(COUNT(udf(b))) FROM testData GROUP BY b; +SELECT COUNT(udf(a)), COUNT(udf(b)) FROM testData GROUP BY udf(a); + +-- Aggregate grouped by literals. +SELECT 'foo', COUNT(udf(a)) FROM testData GROUP BY 1; + +-- Aggregate grouped by literals (whole stage code generation). +SELECT 'foo' FROM testData WHERE a = 0 GROUP BY udf(1); + +-- Aggregate grouped by literals (hash aggregate). +SELECT 'foo', udf(APPROX_COUNT_DISTINCT(udf(a))) FROM testData WHERE a = 0 GROUP BY udf(1); + +-- Aggregate grouped by literals (sort aggregate). +SELECT 'foo', MAX(STRUCT(udf(a))) FROM testData WHERE a = 0 GROUP BY udf(1); + +-- Aggregate with complex GroupBy expressions. 
+SELECT udf(a + b), udf(COUNT(b)) FROM testData GROUP BY a + b; +SELECT udf(a + 2), udf(COUNT(b)) FROM testData GROUP BY a + 1; +SELECT udf(a + 1) + 1, udf(COUNT(b)) FROM testData GROUP BY udf(a + 1); + +-- Aggregate with nulls. +SELECT SKEWNESS(udf(a)), udf(KURTOSIS(a)), udf(MIN(a)), MAX(udf(a)), udf(AVG(udf(a))), udf(VARIANCE(a)), STDDEV(udf(a)), udf(SUM(a)), udf(COUNT(a)) +FROM testData; + +-- Aggregate with foldable input and multiple distinct groups. +SELECT COUNT(DISTINCT udf(b)), udf(COUNT(DISTINCT b, c)) FROM (SELECT 1 AS a, 2 AS b, 3 AS c) GROUP BY udf(a); + +-- Aliases in SELECT could be used in GROUP BY +SELECT udf(a) AS k, COUNT(udf(b)) FROM testData GROUP BY k; +SELECT a AS k, udf(COUNT(b)) FROM testData GROUP BY k HAVING k > 1; + +-- Aggregate functions cannot be used in GROUP BY +SELECT udf(COUNT(b)) AS k FROM testData GROUP BY k; + +-- Test data. +CREATE OR REPLACE TEMPORARY VIEW testDataHasSameNameWithAlias AS SELECT * FROM VALUES +(1, 1, 3), (1, 2, 1) AS testDataHasSameNameWithAlias(k, a, v); +SELECT k AS a, udf(COUNT(udf(v))) FROM testDataHasSameNameWithAlias GROUP BY udf(a); + +-- turn off group by aliases +set spark.sql.groupByAliases=false; + +-- Check analysis exceptions +SELECT a AS k, udf(COUNT(udf(b))) FROM testData GROUP BY k; + +-- Aggregate with empty input and non-empty GroupBy expressions. +SELECT udf(a), COUNT(udf(1)) FROM testData WHERE false GROUP BY udf(a); + +-- Aggregate with empty input and empty GroupBy expressions. +SELECT udf(COUNT(1)) FROM testData WHERE false; +SELECT 1 FROM (SELECT udf(COUNT(1)) FROM testData WHERE false) t; + +-- Aggregate with empty GroupBy expressions and filter on top +SELECT 1 from ( + SELECT 1 AS z, + udf(MIN(a.x)) + FROM (select 1 as x) a + WHERE false +) b +where b.z != b.z; + +-- SPARK-25708 HAVING without GROUP BY means global aggregate +SELECT udf(1) FROM range(10) HAVING true; + +SELECT udf(udf(1)) FROM range(10) HAVING MAX(id) > 0; + +SELECT udf(id) FROM range(10) HAVING id > 0; + +-- Test data +CREATE OR REPLACE TEMPORARY VIEW test_agg AS SELECT * FROM VALUES + (1, true), (1, false), + (2, true), + (3, false), (3, null), + (4, null), (4, null), + (5, null), (5, true), (5, false) AS test_agg(k, v); + +-- empty table +SELECT udf(every(v)), udf(some(v)), any(v) FROM test_agg WHERE 1 = 0; + +-- all null values +SELECT udf(every(udf(v))), some(v), any(v) FROM test_agg WHERE k = 4; + +-- aggregates are null Filtering +SELECT every(v), udf(some(v)), any(v) FROM test_agg WHERE k = 5; + +-- group by +SELECT udf(k), every(v), udf(some(v)), any(v) FROM test_agg GROUP BY udf(k); + +-- having +SELECT udf(k), every(v) FROM test_agg GROUP BY k HAVING every(v) = false; +SELECT udf(k), udf(every(v)) FROM test_agg GROUP BY udf(k) HAVING every(v) IS NULL; + +-- basic subquery path to make sure rewrite happens in both parent and child plans. +SELECT udf(k), + udf(Every(v)) AS every +FROM test_agg +WHERE k = 2 + AND v IN (SELECT Any(v) + FROM test_agg + WHERE k = 1) +GROUP BY udf(k); + +-- basic subquery path to make sure rewrite happens in both parent and child plans. +SELECT udf(udf(k)), + Every(v) AS every +FROM test_agg +WHERE k = 2 + AND v IN (SELECT Every(v) + FROM test_agg + WHERE k = 1) +GROUP BY udf(udf(k)); + +-- input type checking Int +SELECT every(udf(1)); + +-- input type checking Short +SELECT some(udf(1S)); + +-- input type checking Long +SELECT any(udf(1L)); + +-- input type checking String +SELECT udf(every("true")); + +-- every/some/any aggregates are supported as windows expression. 
+SELECT k, v, every(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg; +SELECT k, udf(udf(v)), some(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg; +SELECT udf(udf(k)), v, any(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg; + +-- Having referencing aggregate expressions is ok. +SELECT udf(count(*)) FROM test_agg HAVING count(*) > 1L; +SELECT k, udf(max(v)) FROM test_agg GROUP BY k HAVING max(v) = true; + +-- Aggrgate expressions can be referenced through an alias +SELECT * FROM (SELECT udf(COUNT(*)) AS cnt FROM test_agg) WHERE cnt > 1L; + +-- Error when aggregate expressions are in where clause directly +SELECT udf(count(*)) FROM test_agg WHERE count(*) > 1L; +SELECT udf(count(*)) FROM test_agg WHERE count(*) + 1L > 1L; +SELECT udf(count(*)) FROM test_agg WHERE k = 1 or k = 2 or count(*) + 1L > 1L or max(k) > 1; diff --git a/gluten-ut/spark33/src/test/resources/sql-tests/results/group-by.sql.out b/gluten-ut/spark33/src/test/resources/sql-tests/results/group-by.sql.out new file mode 100644 index 0000000000000..a78cccb8f7aae --- /dev/null +++ b/gluten-ut/spark33/src/test/resources/sql-tests/results/group-by.sql.out @@ -0,0 +1,1021 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 101 + + +-- !query +CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES +(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, null) +AS testData(a, b) +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE OR REPLACE TEMPORARY VIEW testRegression AS SELECT * FROM VALUES +(1, 10, null), (2, 10, 11), (2, 20, 22), (2, 25, null), (2, 30, 35) +AS testRegression(k, y, x) +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE OR REPLACE TEMPORARY VIEW aggr AS SELECT * FROM VALUES +(0, 0), (0, 10), (0, 20), (0, 30), (0, 40), (1, 10), (1, 20), (2, 10), (2, 20), (2, 25), (2, 30), (3, 60), (4, null) +AS aggr(k, v) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT a, COUNT(b) FROM testData +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +grouping expressions sequence is empty, and 'testdata.a' is not an aggregate function. Wrap '(count(testdata.b) AS `count(b)`)' in windowing function(s) or wrap 'testdata.a' in first() (or first_value) if you don't care which value you get. + + +-- !query +SELECT COUNT(a), COUNT(b) FROM testData +-- !query schema +struct +-- !query output +7 7 + + +-- !query +SELECT a, COUNT(b) FROM testData GROUP BY a +-- !query schema +struct +-- !query output +1 2 +2 2 +3 2 +NULL 1 + + +-- !query +SELECT a, COUNT(b) FROM testData GROUP BY b +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +expression 'testdata.a' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get. 
+ + +-- !query +SELECT COUNT(a), COUNT(b) FROM testData GROUP BY a +-- !query schema +struct +-- !query output +0 1 +2 2 +2 2 +3 2 + + +-- !query +SELECT 'foo', COUNT(a) FROM testData GROUP BY 1 +-- !query schema +struct +-- !query output +foo 7 + + +-- !query +SELECT 'foo' FROM testData WHERE a = 0 GROUP BY 1 +-- !query schema +struct +-- !query output + + + +-- !query +SELECT 'foo', APPROX_COUNT_DISTINCT(a) FROM testData WHERE a = 0 GROUP BY 1 +-- !query schema +struct +-- !query output + + + +-- !query +SELECT 'foo', MAX(STRUCT(a)) FROM testData WHERE a = 0 GROUP BY 1 +-- !query schema +struct> +-- !query output + + + +-- !query +SELECT a + b, COUNT(b) FROM testData GROUP BY a + b +-- !query schema +struct<(a + b):int,count(b):bigint> +-- !query output +2 1 +3 2 +4 2 +5 1 +NULL 1 + + +-- !query +SELECT a + 2, COUNT(b) FROM testData GROUP BY a + 1 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +expression 'testdata.a' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get. + + +-- !query +SELECT a + 1 + 1, COUNT(b) FROM testData GROUP BY a + 1 +-- !query schema +struct<((a + 1) + 1):int,count(b):bigint> +-- !query output +3 2 +4 2 +5 2 +NULL 1 + + +-- !query +SELECT SKEWNESS(a), KURTOSIS(a), MIN(a), MAX(a), AVG(a), VARIANCE(a), STDDEV(a), SUM(a), COUNT(a) +FROM testData +-- !query schema +struct +-- !query output +-0.2723801058145729 -1.5069204152249134 1 3 2.142857142857143 0.8095238095238094 0.8997354108424372 15 7 + + +-- !query +SELECT COUNT(DISTINCT b), COUNT(DISTINCT b, c) FROM (SELECT 1 AS a, 2 AS b, 3 AS c) GROUP BY a +-- !query schema +struct +-- !query output +1 1 + + +-- !query +SELECT a AS k, COUNT(b) FROM testData GROUP BY k +-- !query schema +struct +-- !query output +1 2 +2 2 +3 2 +NULL 1 + + +-- !query +SELECT a AS k, COUNT(b) FROM testData GROUP BY k HAVING k > 1 +-- !query schema +struct +-- !query output +2 2 +3 2 + + +-- !query +SELECT a AS k, COUNT(non_existing) FROM testData GROUP BY k +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Column 'non_existing' does not exist. Did you mean one of the following? [testdata.a, testdata.b]; line 1 pos 21 + + +-- !query +SELECT COUNT(b) AS k FROM testData GROUP BY k +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +aggregate functions are not allowed in GROUP BY, but found count(testdata.b) + + +-- !query +CREATE OR REPLACE TEMPORARY VIEW testDataHasSameNameWithAlias AS SELECT * FROM VALUES +(1, 1, 3), (1, 2, 1) AS testDataHasSameNameWithAlias(k, a, v) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT k AS a, COUNT(v) FROM testDataHasSameNameWithAlias GROUP BY a +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +expression 'testdatahassamenamewithalias.k' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get. + + +-- !query +set spark.sql.groupByAliases=false +-- !query schema +struct +-- !query output +spark.sql.groupByAliases false + + +-- !query +SELECT a AS k, COUNT(b) FROM testData GROUP BY k +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Column 'k' does not exist. Did you mean one of the following? 
[testdata.a, testdata.b]; line 1 pos 47 + + +-- !query +SELECT a, COUNT(1) FROM testData WHERE false GROUP BY a +-- !query schema +struct +-- !query output + + + +-- !query +SELECT COUNT(1) FROM testData WHERE false +-- !query schema +struct +-- !query output +0 + + +-- !query +SELECT 1 FROM (SELECT COUNT(1) FROM testData WHERE false) t +-- !query schema +struct<1:int> +-- !query output +1 + + +-- !query +SELECT 1 from ( + SELECT 1 AS z, + MIN(a.x) + FROM (select 1 as x) a + WHERE false +) b +where b.z != b.z +-- !query schema +struct<1:int> +-- !query output + + + +-- !query +SELECT 1 FROM range(10) HAVING true +-- !query schema +struct<1:int> +-- !query output +1 + + +-- !query +SELECT 1 FROM range(10) HAVING MAX(id) > 0 +-- !query schema +struct<1:int> +-- !query output +1 + + +-- !query +SELECT id FROM range(10) HAVING id > 0 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +grouping expressions sequence is empty, and 'id' is not an aggregate function. Wrap '()' in windowing function(s) or wrap 'id' in first() (or first_value) if you don't care which value you get. + + +-- !query +SET spark.sql.legacy.parser.havingWithoutGroupByAsWhere=true +-- !query schema +struct +-- !query output +spark.sql.legacy.parser.havingWithoutGroupByAsWhere true + + +-- !query +SELECT 1 FROM range(10) HAVING true +-- !query schema +struct<1:int> +-- !query output +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 + + +-- !query +SELECT 1 FROM range(10) HAVING MAX(id) > 0 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException + +Aggregate/Window/Generate expressions are not valid in where clause of the query. +Expression in where clause: [(max(id) > CAST(0 AS BIGINT))] +Invalid expressions: [max(id)] + + +-- !query +SELECT id FROM range(10) HAVING id > 0 +-- !query schema +struct +-- !query output +1 +2 +3 +4 +5 +6 +7 +8 +9 + + +-- !query +SET spark.sql.legacy.parser.havingWithoutGroupByAsWhere=false +-- !query schema +struct +-- !query output +spark.sql.legacy.parser.havingWithoutGroupByAsWhere false + + +-- !query +CREATE OR REPLACE TEMPORARY VIEW test_agg AS SELECT * FROM VALUES + (1, true), (1, false), + (2, true), + (3, false), (3, null), + (4, null), (4, null), + (5, null), (5, true), (5, false) AS test_agg(k, v) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT every(v), some(v), any(v), bool_and(v), bool_or(v) FROM test_agg WHERE 1 = 0 +-- !query schema +struct +-- !query output +NULL NULL NULL NULL NULL + + +-- !query +SELECT every(v), some(v), any(v), bool_and(v), bool_or(v) FROM test_agg WHERE k = 4 +-- !query schema +struct +-- !query output +NULL NULL NULL NULL NULL + + +-- !query +SELECT every(v), some(v), any(v), bool_and(v), bool_or(v) FROM test_agg WHERE k = 5 +-- !query schema +struct +-- !query output +false true true false true + + +-- !query +SELECT k, every(v), some(v), any(v), bool_and(v), bool_or(v) FROM test_agg GROUP BY k +-- !query schema +struct +-- !query output +1 false true true false true +2 true true true true true +3 false false false false false +4 NULL NULL NULL NULL NULL +5 false true true false true + + +-- !query +SELECT k, every(v) FROM test_agg GROUP BY k HAVING every(v) = false +-- !query schema +struct +-- !query output +1 false +3 false +5 false + + +-- !query +SELECT k, every(v) FROM test_agg GROUP BY k HAVING every(v) IS NULL +-- !query schema +struct +-- !query output +4 NULL + + +-- !query +SELECT k, + Every(v) AS every +FROM test_agg +WHERE k = 2 + AND v IN (SELECT Any(v) + FROM test_agg + 
WHERE k = 1) +GROUP BY k +-- !query schema +struct +-- !query output +2 true + + +-- !query +SELECT k, + Every(v) AS every +FROM test_agg +WHERE k = 2 + AND v IN (SELECT Every(v) + FROM test_agg + WHERE k = 1) +GROUP BY k +-- !query schema +struct +-- !query output + + + +-- !query +SELECT every(1) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'every(1)' due to data type mismatch: argument 1 requires boolean type, however, '1' is of int type.; line 1 pos 7 + + +-- !query +SELECT some(1S) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'some(1S)' due to data type mismatch: argument 1 requires boolean type, however, '1S' is of smallint type.; line 1 pos 7 + + +-- !query +SELECT any(1L) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'any(1L)' due to data type mismatch: argument 1 requires boolean type, however, '1L' is of bigint type.; line 1 pos 7 + + +-- !query +SELECT every("true") +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'every('true')' due to data type mismatch: argument 1 requires boolean type, however, ''true'' is of string type.; line 1 pos 7 + + +-- !query +SELECT bool_and(1.0) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'bool_and(1.0BD)' due to data type mismatch: argument 1 requires boolean type, however, '1.0BD' is of decimal(2,1) type.; line 1 pos 7 + + +-- !query +SELECT bool_or(1.0D) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'bool_or(1.0D)' due to data type mismatch: argument 1 requires boolean type, however, '1.0D' is of double type.; line 1 pos 7 + + +-- !query +SELECT k, v, every(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg +-- !query schema +struct +-- !query output +1 false false +1 true false +2 true true +3 NULL NULL +3 false false +4 NULL NULL +4 NULL NULL +5 NULL NULL +5 false false +5 true false + + +-- !query +SELECT k, v, some(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg +-- !query schema +struct +-- !query output +1 false false +1 true true +2 true true +3 NULL NULL +3 false false +4 NULL NULL +4 NULL NULL +5 NULL NULL +5 false false +5 true true + + +-- !query +SELECT k, v, any(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg +-- !query schema +struct +-- !query output +1 false false +1 true true +2 true true +3 NULL NULL +3 false false +4 NULL NULL +4 NULL NULL +5 NULL NULL +5 false false +5 true true + + +-- !query +SELECT k, v, bool_and(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg +-- !query schema +struct +-- !query output +1 false false +1 true false +2 true true +3 NULL NULL +3 false false +4 NULL NULL +4 NULL NULL +5 NULL NULL +5 false false +5 true false + + +-- !query +SELECT k, v, bool_or(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg +-- !query schema +struct +-- !query output +1 false false +1 true true +2 true true +3 NULL NULL +3 false false +4 NULL NULL +4 NULL NULL +5 NULL NULL +5 false false +5 true true + + +-- !query +SELECT count(*) FROM test_agg HAVING count(*) > 1L +-- !query schema +struct +-- !query output +10 + + +-- !query +SELECT k, max(v) FROM test_agg GROUP BY k HAVING max(v) = true +-- !query schema +struct +-- !query output +1 true +2 true +5 true + + +-- !query +SELECT * FROM (SELECT COUNT(*) AS cnt FROM test_agg) WHERE cnt > 1L +-- !query schema +struct +-- !query output +10 
+ + +-- !query +SELECT count(*) FROM test_agg WHERE count(*) > 1L +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException + +Aggregate/Window/Generate expressions are not valid in where clause of the query. +Expression in where clause: [(count(1) > 1L)] +Invalid expressions: [count(1)] + + +-- !query +SELECT count(*) FROM test_agg WHERE count(*) + 1L > 1L +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException + +Aggregate/Window/Generate expressions are not valid in where clause of the query. +Expression in where clause: [((count(1) + 1L) > 1L)] +Invalid expressions: [count(1)] + + +-- !query +SELECT count(*) FROM test_agg WHERE k = 1 or k = 2 or count(*) + 1L > 1L or max(k) > 1 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException + +Aggregate/Window/Generate expressions are not valid in where clause of the query. +Expression in where clause: [(((test_agg.k = 1) OR (test_agg.k = 2)) OR (((count(1) + 1L) > 1L) OR (max(test_agg.k) > 1)))] +Invalid expressions: [count(1), max(test_agg.k)] + + +-- !query +SELECT AVG(DISTINCT decimal_col), SUM(DISTINCT decimal_col) FROM VALUES (CAST(1 AS DECIMAL(9, 0))) t(decimal_col) +-- !query schema +struct +-- !query output +1.0000 1 + + +-- !query +SELECT not(a IS NULL), count(*) AS c +FROM testData +GROUP BY a IS NULL +-- !query schema +struct<(NOT (a IS NULL)):boolean,c:bigint> +-- !query output +false 2 +true 7 + + +-- !query +SELECT if(not(a IS NULL), rand(0), 1), count(*) AS c +FROM testData +GROUP BY a IS NULL +-- !query schema +struct<(IF((NOT (a IS NULL)), rand(0), 1)):double,c:bigint> +-- !query output +0.7604953758285915 7 +1.0 2 + + +-- !query +SELECT + histogram_numeric(col, 2) as histogram_2, + histogram_numeric(col, 3) as histogram_3, + histogram_numeric(col, 5) as histogram_5, + histogram_numeric(col, 10) as histogram_10 +FROM VALUES + (1), (2), (3), (4), (5), (6), (7), (8), (9), (10), + (11), (12), (13), (14), (15), (16), (17), (18), (19), (20), + (21), (22), (23), (24), (25), (26), (27), (28), (29), (30), + (31), (32), (33), (34), (35), (3), (37), (38), (39), (40), + (41), (42), (43), (44), (45), (46), (47), (48), (49), (50) AS tab(col) +-- !query schema +struct>,histogram_3:array>,histogram_5:array>,histogram_10:array>> +-- !query output +[{"x":12,"y":26.0},{"x":38,"y":24.0}] [{"x":9,"y":20.0},{"x":25,"y":11.0},{"x":40,"y":19.0}] [{"x":5,"y":11.0},{"x":14,"y":8.0},{"x":22,"y":7.0},{"x":30,"y":10.0},{"x":43,"y":14.0}] [{"x":3,"y":6.0},{"x":8,"y":6.0},{"x":13,"y":4.0},{"x":17,"y":3.0},{"x":20,"y":4.0},{"x":25,"y":6.0},{"x":31,"y":7.0},{"x":39,"y":5.0},{"x":43,"y":4.0},{"x":48,"y":5.0}] + + +-- !query +SELECT histogram_numeric(col, 3) FROM VALUES (1), (2), (3) AS tab(col) +-- !query schema +struct>> +-- !query output +[{"x":1,"y":1.0},{"x":2,"y":1.0},{"x":3,"y":1.0}] + + +-- !query +SELECT histogram_numeric(col, 3) FROM VALUES (1L), (2L), (3L) AS tab(col) +-- !query schema +struct>> +-- !query output +[{"x":1,"y":1.0},{"x":2,"y":1.0},{"x":3,"y":1.0}] + + +-- !query +SELECT histogram_numeric(col, 3) FROM VALUES (1F), (2F), (3F) AS tab(col) +-- !query schema +struct>> +-- !query output +[{"x":1.0,"y":1.0},{"x":2.0,"y":1.0},{"x":3.0,"y":1.0}] + + +-- !query +SELECT histogram_numeric(col, 3) FROM VALUES (1D), (2D), (3D) AS tab(col) +-- !query schema +struct>> +-- !query output +[{"x":1.0,"y":1.0},{"x":2.0,"y":1.0},{"x":3.0,"y":1.0}] + + +-- !query +SELECT histogram_numeric(col, 3) FROM VALUES (1S), (2S), (3S) AS tab(col) +-- !query schema +struct>> +-- 
!query output +[{"x":1,"y":1.0},{"x":2,"y":1.0},{"x":3,"y":1.0}] + + +-- !query +SELECT histogram_numeric(col, 3) FROM VALUES + (CAST(1 AS BYTE)), (CAST(2 AS BYTE)), (CAST(3 AS BYTE)) AS tab(col) +-- !query schema +struct>> +-- !query output +[{"x":1,"y":1.0},{"x":2,"y":1.0},{"x":3,"y":1.0}] + + +-- !query +SELECT histogram_numeric(col, 3) FROM VALUES + (CAST(1 AS TINYINT)), (CAST(2 AS TINYINT)), (CAST(3 AS TINYINT)) AS tab(col) +-- !query schema +struct>> +-- !query output +[{"x":1,"y":1.0},{"x":2,"y":1.0},{"x":3,"y":1.0}] + + +-- !query +SELECT histogram_numeric(col, 3) FROM VALUES + (CAST(1 AS SMALLINT)), (CAST(2 AS SMALLINT)), (CAST(3 AS SMALLINT)) AS tab(col) +-- !query schema +struct>> +-- !query output +[{"x":1,"y":1.0},{"x":2,"y":1.0},{"x":3,"y":1.0}] + + +-- !query +SELECT histogram_numeric(col, 3) FROM VALUES + (CAST(1 AS BIGINT)), (CAST(2 AS BIGINT)), (CAST(3 AS BIGINT)) AS tab(col) +-- !query schema +struct>> +-- !query output +[{"x":1,"y":1.0},{"x":2,"y":1.0},{"x":3,"y":1.0}] + + +-- !query +SELECT histogram_numeric(col, 3) FROM VALUES (TIMESTAMP '2017-03-01 00:00:00'), + (TIMESTAMP '2017-04-01 00:00:00'), (TIMESTAMP '2017-05-01 00:00:00') AS tab(col) +-- !query schema +struct>> +-- !query output +[{"x":2017-03-01 00:00:00,"y":1.0},{"x":2017-04-01 00:00:00,"y":1.0},{"x":2017-05-01 00:00:00,"y":1.0}] + + +-- !query +SELECT histogram_numeric(col, 3) FROM VALUES (INTERVAL '100-00' YEAR TO MONTH), + (INTERVAL '110-00' YEAR TO MONTH), (INTERVAL '120-00' YEAR TO MONTH) AS tab(col) +-- !query schema +struct>> +-- !query output +[{"x":100-0,"y":1.0},{"x":110-0,"y":1.0},{"x":120-0,"y":1.0}] + + +-- !query +SELECT histogram_numeric(col, 3) FROM VALUES (INTERVAL '12 20:4:0' DAY TO SECOND), + (INTERVAL '12 21:4:0' DAY TO SECOND), (INTERVAL '12 22:4:0' DAY TO SECOND) AS tab(col) +-- !query schema +struct>> +-- !query output +[{"x":12 20:04:00.000000000,"y":1.0},{"x":12 21:04:00.000000000,"y":1.0},{"x":12 22:04:00.000000000,"y":1.0}] + + +-- !query +SELECT histogram_numeric(col, 3) +FROM VALUES (NULL), (NULL), (NULL) AS tab(col) +-- !query schema +struct>> +-- !query output +NULL + + +-- !query +SELECT histogram_numeric(col, 3) +FROM VALUES (CAST(NULL AS DOUBLE)), (CAST(NULL AS DOUBLE)), (CAST(NULL AS DOUBLE)) AS tab(col) +-- !query schema +struct>> +-- !query output +NULL + + +-- !query +SELECT histogram_numeric(col, 3) +FROM VALUES (CAST(NULL AS INT)), (CAST(NULL AS INT)), (CAST(NULL AS INT)) AS tab(col) +-- !query schema +struct>> +-- !query output +NULL + + +-- !query +SELECT regr_count(y, x) FROM testRegression +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT regr_count(y, x) FROM testRegression WHERE x IS NOT NULL +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT k, count(*), regr_count(y, x) FROM testRegression GROUP BY k +-- !query schema +struct +-- !query output +1 1 0 +2 4 3 + + +-- !query +SELECT k, count(*) FILTER (WHERE x IS NOT NULL), regr_count(y, x) FROM testRegression GROUP BY k +-- !query schema +struct +-- !query output +1 0 0 +2 3 3 + + +-- !query +SELECT regr_r2(y, x) FROM testRegression +-- !query schema +struct +-- !query output +0.997690531177829 + + +-- !query +SELECT regr_r2(y, x) FROM testRegression WHERE x IS NOT NULL +-- !query schema +struct +-- !query output +0.997690531177829 + + +-- !query +SELECT k, corr(y, x), regr_r2(y, x) FROM testRegression GROUP BY k +-- !query schema +struct +-- !query output +1 NULL NULL +2 0.9988445981121533 0.997690531177829 + + +-- !query +SELECT k, corr(y, x) FILTER (WHERE x IS NOT NULL), 
regr_r2(y, x) FROM testRegression GROUP BY k +-- !query schema +struct +-- !query output +1 NULL NULL +2 0.9988445981121533 0.997690531177829 + + +-- !query +SELECT + collect_list(col), + array_agg(col) +FROM VALUES + (1), (2), (1) AS tab(col) +-- !query schema +struct,collect_list(col):array> +-- !query output +[1,2,1] [1,2,1] + + +-- !query +SELECT + a, + collect_list(b), + array_agg(b) +FROM VALUES + (1,4),(2,3),(1,4),(2,4) AS v(a,b) +GROUP BY a +-- !query schema +struct,collect_list(b):array> +-- !query output +1 [4,4] [4,4] +2 [3,4] [3,4] + + +-- !query +SELECT regr_avgx(y, x), regr_avgy(y, x) FROM testRegression +-- !query schema +struct +-- !query output +22.666666666666668 20.0 + + +-- !query +SELECT regr_avgx(y, x), regr_avgy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL +-- !query schema +struct +-- !query output +22.666666666666668 20.0 + + +-- !query +SELECT k, avg(x), avg(y), regr_avgx(y, x), regr_avgy(y, x) FROM testRegression GROUP BY k +-- !query schema +struct +-- !query output +1 NULL 10.0 NULL NULL +2 22.666666666666668 21.25 22.666666666666668 20.0 + + +-- !query +SELECT k, avg(x) FILTER (WHERE x IS NOT NULL AND y IS NOT NULL), avg(y) FILTER (WHERE x IS NOT NULL AND y IS NOT NULL), regr_avgx(y, x), regr_avgy(y, x) FROM testRegression GROUP BY k +-- !query schema +struct +-- !query output +1 NULL NULL NULL NULL +2 22.666666666666668 20.0 22.666666666666668 20.0 + + +-- !query +SELECT + percentile_cont(0.25) WITHIN GROUP (ORDER BY v), + percentile_cont(0.25) WITHIN GROUP (ORDER BY v DESC) +FROM aggr +-- !query schema +struct +-- !query output +10.0 30.0 + + +-- !query +SELECT + k, + percentile_cont(0.25) WITHIN GROUP (ORDER BY v), + percentile_cont(0.25) WITHIN GROUP (ORDER BY v DESC) +FROM aggr +GROUP BY k +ORDER BY k +-- !query schema +struct +-- !query output +0 10.0 30.0 +1 12.5 17.5 +2 17.5 26.25 +3 60.0 60.0 +4 NULL NULL + + +-- !query +SELECT + percentile_disc(0.25) WITHIN GROUP (ORDER BY v), + percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC) +FROM aggr +-- !query schema +struct +-- !query output +10.0 30.0 + + +-- !query +SELECT + k, + percentile_disc(0.25) WITHIN GROUP (ORDER BY v), + percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC) +FROM aggr +GROUP BY k +ORDER BY k +-- !query schema +struct +-- !query output +0 10.0 30.0 +1 10.0 20.0 +2 10.0 30.0 +3 60.0 60.0 +4 NULL NULL diff --git a/gluten-ut/spark33/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out b/gluten-ut/spark33/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out new file mode 100644 index 0000000000000..14b2780210516 --- /dev/null +++ b/gluten-ut/spark33/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out @@ -0,0 +1,514 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 52 + + +-- !query +CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES +(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, null) +AS testData(a, b) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT udf(a), udf(COUNT(b)) FROM testData +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +grouping expressions sequence is empty, and 'testdata.a' is not an aggregate function. Wrap '(CAST(udf(cast(count(b) as string)) AS BIGINT) AS `udf(count(b))`)' in windowing function(s) or wrap 'testdata.a' in first() (or first_value) if you don't care which value you get. 
+
+
+-- !query
+SELECT COUNT(udf(a)), udf(COUNT(b)) FROM testData
+-- !query schema
+struct<count(udf(a)):bigint,udf(count(b)):bigint>
+-- !query output
+7 7
+
+
+-- !query
+SELECT udf(a), COUNT(udf(b)) FROM testData GROUP BY a
+-- !query schema
+struct<udf(a):int,count(udf(b)):bigint>
+-- !query output
+1 2
+2 2
+3 2
+NULL 1
+
+
+-- !query
+SELECT udf(a), udf(COUNT(udf(b))) FROM testData GROUP BY b
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+expression 'testdata.a' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.
+
+
+-- !query
+SELECT COUNT(udf(a)), COUNT(udf(b)) FROM testData GROUP BY udf(a)
+-- !query schema
+struct<count(udf(a)):bigint,count(udf(b)):bigint>
+-- !query output
+0 1
+2 2
+2 2
+3 2
+
+
+-- !query
+SELECT 'foo', COUNT(udf(a)) FROM testData GROUP BY 1
+-- !query schema
+struct<foo:string,count(udf(a)):bigint>
+-- !query output
+foo 7
+
+
+-- !query
+SELECT 'foo' FROM testData WHERE a = 0 GROUP BY udf(1)
+-- !query schema
+struct<foo:string>
+-- !query output
+
+
+
+-- !query
+SELECT 'foo', udf(APPROX_COUNT_DISTINCT(udf(a))) FROM testData WHERE a = 0 GROUP BY udf(1)
+-- !query schema
+struct<foo:string,udf(approx_count_distinct(udf(a))):bigint>
+-- !query output
+
+
+
+-- !query
+SELECT 'foo', MAX(STRUCT(udf(a))) FROM testData WHERE a = 0 GROUP BY udf(1)
+-- !query schema
+struct<foo:string,max(struct(udf(a))):struct<udf(a):int>>
+-- !query output
+
+
+
+-- !query
+SELECT udf(a + b), udf(COUNT(b)) FROM testData GROUP BY a + b
+-- !query schema
+struct<udf((a + b)):int,udf(count(b)):bigint>
+-- !query output
+2 1
+3 2
+4 2
+5 1
+NULL 1
+
+
+-- !query
+SELECT udf(a + 2), udf(COUNT(b)) FROM testData GROUP BY a + 1
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+expression 'testdata.a' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.
+
+
+-- !query
+SELECT udf(a + 1) + 1, udf(COUNT(b)) FROM testData GROUP BY udf(a + 1)
+-- !query schema
+struct<(udf((a + 1)) + 1):int,udf(count(b)):bigint>
+-- !query output
+3 2
+4 2
+5 2
+NULL 1
+
+
+-- !query
+SELECT SKEWNESS(udf(a)), udf(KURTOSIS(a)), udf(MIN(a)), MAX(udf(a)), udf(AVG(udf(a))), udf(VARIANCE(a)), STDDEV(udf(a)), udf(SUM(a)), udf(COUNT(a))
+FROM testData
+-- !query schema
+struct<skewness(udf(a)):double,udf(kurtosis(a)):double,udf(min(a)):int,max(udf(a)):int,udf(avg(udf(a))):double,udf(variance(a)):double,stddev(udf(a)):double,udf(sum(a)):bigint,udf(count(a)):bigint>
+-- !query output
+-0.2723801058145729 -1.5069204152249134 1 3 2.142857142857143 0.8095238095238094 0.8997354108424372 15 7
+
+
+-- !query
+SELECT COUNT(DISTINCT udf(b)), udf(COUNT(DISTINCT b, c)) FROM (SELECT 1 AS a, 2 AS b, 3 AS c) GROUP BY udf(a)
+-- !query schema
+struct<count(DISTINCT udf(b)):bigint,udf(count(DISTINCT b, c)):bigint>
+-- !query output
+1 1
+
+
+-- !query
+SELECT udf(a) AS k, COUNT(udf(b)) FROM testData GROUP BY k
+-- !query schema
+struct<k:int,count(udf(b)):bigint>
+-- !query output
+1 2
+2 2
+3 2
+NULL 1
+
+
+-- !query
+SELECT a AS k, udf(COUNT(b)) FROM testData GROUP BY k HAVING k > 1
+-- !query schema
+struct<k:int,udf(count(b)):bigint>
+-- !query output
+2 2
+3 2
+
+
+-- !query
+SELECT udf(COUNT(b)) AS k FROM testData GROUP BY k
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+aggregate functions are not allowed in GROUP BY, but found CAST(udf(cast(count(b) as string)) AS BIGINT)
+
+
+-- !query
+CREATE OR REPLACE TEMPORARY VIEW testDataHasSameNameWithAlias AS SELECT * FROM VALUES
+(1, 1, 3), (1, 2, 1) AS testDataHasSameNameWithAlias(k, a, v)
+-- !query schema
+struct<>
+-- !query output
+
+
+
+-- !query
+SELECT k AS a, udf(COUNT(udf(v))) FROM testDataHasSameNameWithAlias GROUP BY udf(a)
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+expression 'testdatahassamenamewithalias.k' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.
+
+
+-- !query
+set spark.sql.groupByAliases=false
+-- !query schema
+struct<key:string,value:string>
+-- !query output
+spark.sql.groupByAliases false
+
+
+-- !query
+SELECT a AS k, udf(COUNT(udf(b))) FROM testData GROUP BY k
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+Column 'k' does not exist. Did you mean one of the following? [testdata.a, testdata.b]; line 1 pos 57
+
+
+-- !query
+SELECT udf(a), COUNT(udf(1)) FROM testData WHERE false GROUP BY udf(a)
+-- !query schema
+struct<udf(a):int,count(udf(1)):bigint>
+-- !query output
+
+
+
+-- !query
+SELECT udf(COUNT(1)) FROM testData WHERE false
+-- !query schema
+struct<udf(count(1)):bigint>
+-- !query output
+0
+
+
+-- !query
+SELECT 1 FROM (SELECT udf(COUNT(1)) FROM testData WHERE false) t
+-- !query schema
+struct<1:int>
+-- !query output
+1
+
+
+-- !query
+SELECT 1 from (
+  SELECT 1 AS z,
+  udf(MIN(a.x))
+  FROM (select 1 as x) a
+  WHERE false
+) b
+where b.z != b.z
+-- !query schema
+struct<1:int>
+-- !query output
+
+
+
+-- !query
+SELECT udf(1) FROM range(10) HAVING true
+-- !query schema
+struct<udf(1):int>
+-- !query output
+1
+
+
+-- !query
+SELECT udf(udf(1)) FROM range(10) HAVING MAX(id) > 0
+-- !query schema
+struct<udf(udf(1)):int>
+-- !query output
+1
+
+
+-- !query
+SELECT udf(id) FROM range(10) HAVING id > 0
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+grouping expressions sequence is empty, and 'id' is not an aggregate function. Wrap '()' in windowing function(s) or wrap 'id' in first() (or first_value) if you don't care which value you get.
+
+
+-- !query
+CREATE OR REPLACE TEMPORARY VIEW test_agg AS SELECT * FROM VALUES
+  (1, true), (1, false),
+  (2, true),
+  (3, false), (3, null),
+  (4, null), (4, null),
+  (5, null), (5, true), (5, false) AS test_agg(k, v)
+-- !query schema
+struct<>
+-- !query output
+
+
+
+-- !query
+SELECT udf(every(v)), udf(some(v)), any(v) FROM test_agg WHERE 1 = 0
+-- !query schema
+struct<udf(every(v)):boolean,udf(some(v)):boolean,any(v):boolean>
+-- !query output
+NULL NULL NULL
+
+
+-- !query
+SELECT udf(every(udf(v))), some(v), any(v) FROM test_agg WHERE k = 4
+-- !query schema
+struct<udf(every(udf(v))):boolean,some(v):boolean,any(v):boolean>
+-- !query output
+NULL NULL NULL
+
+
+-- !query
+SELECT every(v), udf(some(v)), any(v) FROM test_agg WHERE k = 5
+-- !query schema
+struct<every(v):boolean,udf(some(v)):boolean,any(v):boolean>
+-- !query output
+false true true
+
+
+-- !query
+SELECT udf(k), every(v), udf(some(v)), any(v) FROM test_agg GROUP BY udf(k)
+-- !query schema
+struct<udf(k):int,every(v):boolean,udf(some(v)):boolean,any(v):boolean>
+-- !query output
+1 false true true
+2 true true true
+3 false false false
+4 NULL NULL NULL
+5 false true true
+
+
+-- !query
+SELECT udf(k), every(v) FROM test_agg GROUP BY k HAVING every(v) = false
+-- !query schema
+struct<udf(k):int,every(v):boolean>
+-- !query output
+1 false
+3 false
+5 false
+
+
+-- !query
+SELECT udf(k), udf(every(v)) FROM test_agg GROUP BY udf(k) HAVING every(v) IS NULL
+-- !query schema
+struct<udf(k):int,udf(every(v)):boolean>
+-- !query output
+4 NULL
+
+
+-- !query
+SELECT udf(k),
+       udf(Every(v)) AS every
+FROM test_agg
+WHERE k = 2
+  AND v IN (SELECT Any(v)
+            FROM test_agg
+            WHERE k = 1)
+GROUP BY udf(k)
+-- !query schema
+struct<udf(k):int,every:boolean>
+-- !query output
+2 true
+
+
+-- !query
+SELECT udf(udf(k)),
+       Every(v) AS every
+FROM test_agg
+WHERE k = 2
+  AND v IN (SELECT Every(v)
+            FROM test_agg
+            WHERE k = 1)
+GROUP BY udf(udf(k))
+-- !query schema
+struct<udf(udf(k)):int,every:boolean>
+-- !query output
+
+
+
+-- !query
+SELECT every(udf(1))
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+cannot resolve 'every(CAST(udf(cast(1 as string)) AS INT))' due to data type mismatch: argument 1 requires boolean type, however, 'CAST(udf(cast(1 as string)) AS INT)' is of int type.; line 1 pos 7
+
+
+-- !query
+SELECT some(udf(1S))
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+cannot resolve 'some(CAST(udf(cast(1 as string)) AS SMALLINT))' due to data type mismatch: argument 1 requires boolean type, however, 'CAST(udf(cast(1 as string)) AS SMALLINT)' is of smallint type.; line 1 pos 7
+
+
+-- !query
+SELECT any(udf(1L))
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+cannot resolve 'any(CAST(udf(cast(1 as string)) AS BIGINT))' due to data type mismatch: argument 1 requires boolean type, however, 'CAST(udf(cast(1 as string)) AS BIGINT)' is of bigint type.; line 1 pos 7
+
+
+-- !query
+SELECT udf(every("true"))
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+cannot resolve 'every('true')' due to data type mismatch: argument 1 requires boolean type, however, ''true'' is of string type.; line 1 pos 11
+
+
+-- !query
+SELECT k, v, every(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg
+-- !query schema
+struct<k:int,v:boolean,every(v) OVER (PARTITION BY k ORDER BY v ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW):boolean>
+-- !query output
+1 false false
+1 true false
+2 true true
+3 NULL NULL
+3 false false
+4 NULL NULL
+4 NULL NULL
+5 NULL NULL
+5 false false
+5 true false
+
+
+-- !query
+SELECT k, udf(udf(v)), some(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg
+-- !query schema
+struct<k:int,udf(udf(v)):boolean,some(v) OVER (PARTITION BY k ORDER BY v ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW):boolean>
+-- !query output
+1 false false
+1 true true
+2 true true
+3 NULL NULL
+3 false false
+4 NULL NULL
+4 NULL NULL
+5 NULL NULL
+5 false false
+5 true true
+
+
+-- !query
+SELECT udf(udf(k)), v, any(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg
+-- !query schema
+struct<udf(udf(k)):int,v:boolean,any(v) OVER (PARTITION BY k ORDER BY v ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW):boolean>
+-- !query output
+1 false false
+1 true true
+2 true true
+3 NULL NULL
+3 false false
+4 NULL NULL
+4 NULL NULL
+5 NULL NULL
+5 false false
+5 true true
+
+
+-- !query
+SELECT udf(count(*)) FROM test_agg HAVING count(*) > 1L
+-- !query schema
+struct<udf(count(1)):bigint>
+-- !query output
+10
+
+
+-- !query
+SELECT k, udf(max(v)) FROM test_agg GROUP BY k HAVING max(v) = true
+-- !query schema
+struct<k:int,udf(max(v)):boolean>
+-- !query output
+1 true
+2 true
+5 true
+
+
+-- !query
+SELECT * FROM (SELECT udf(COUNT(*)) AS cnt FROM test_agg) WHERE cnt > 1L
+-- !query schema
+struct<cnt:bigint>
+-- !query output
+10
+
+
+-- !query
+SELECT udf(count(*)) FROM test_agg WHERE count(*) > 1L
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+
+Aggregate/Window/Generate expressions are not valid in where clause of the query.
+Expression in where clause: [(count(1) > 1L)]
+Invalid expressions: [count(1)]
+
+
+-- !query
+SELECT udf(count(*)) FROM test_agg WHERE count(*) + 1L > 1L
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+
+Aggregate/Window/Generate expressions are not valid in where clause of the query.
+Expression in where clause: [((count(1) + 1L) > 1L)]
+Invalid expressions: [count(1)]
+
+
+-- !query
+SELECT udf(count(*)) FROM test_agg WHERE k = 1 or k = 2 or count(*) + 1L > 1L or max(k) > 1
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+
+Aggregate/Window/Generate expressions are not valid in where clause of the query.
+Expression in where clause: [(((test_agg.k = 1) OR (test_agg.k = 2)) OR (((count(1) + 1L) > 1L) OR (max(test_agg.k) > 1)))] +Invalid expressions: [count(1), max(test_agg.k)] diff --git a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenSQLQueryTestSuite.scala b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenSQLQueryTestSuite.scala index 90046c0b15b26..a0ed9441ef71d 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenSQLQueryTestSuite.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenSQLQueryTestSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql import io.glutenproject.GlutenConfig +import io.glutenproject.backendsapi.{BackendsApiManager, TestApi} import io.glutenproject.utils.{BackendTestUtils, SystemParameters} import org.apache.spark.SparkConf @@ -155,6 +156,11 @@ class GlutenSQLQueryTestSuite protected val goldenFilePath = new File(baseResourcePath, "results").getAbsolutePath protected val testDataPath = new File(resourcesPath, "test-data").getAbsolutePath + protected val overwriteResourcePath = + getClass.getResource("/").getPath + "../../../src/test/resources/sql-tests" + protected val overwriteInputFilePath = new File(overwriteResourcePath, "inputs").getAbsolutePath + protected val overwriteGoldenFilePath = new File(overwriteResourcePath, "results").getAbsolutePath + protected val validFileExtensions = ".sql" /** Test if a command is available. */ @@ -217,200 +223,10 @@ class GlutenSQLQueryTestSuite "window.sql" // Local window fixes are not added. ) ++ otherIgnoreList - /** - * List of supported cases to run with Velox backend, in lower case. Please add to the supported - * list after enabling a sql test. - */ - - private val veloxSupportedList: Set[String] = Set( - "bitwise.sql", - "cast.sql", - "change-column.sql", - "charvarchar.sql", - "columnresolution-negative.sql", - "columnresolution-views.sql", - "columnresolution.sql", - "comments.sql", - "comparator.sql", - "count.sql", - "cross-join.sql", - "csv-functions.sql", - "cte-legacy.sql", - "cte-nested.sql", - "cte-nonlegacy.sql", - "cte.sql", - "current_database_catalog.sql", - "date.sql", - "datetime-formatting-invalid.sql", - // Velox had different handling for some illegal cases. 
-// "datetime-formatting-legacy.sql", -// "datetime-formatting.sql", - "datetime-legacy.sql", - "datetime-parsing-invalid.sql", - "datetime-parsing-legacy.sql", - "datetime-parsing.sql", - "datetime-special.sql", - "decimalArithmeticOperations.sql", - "describe-part-after-analyze.sql", - "describe-query.sql", - "describe-table-after-alter-table.sql", - "describe-table-column.sql", - "describe.sql", - "except-all.sql", - "except.sql", - "extract.sql", - "group-by-filter.sql", - "group-by-ordinal.sql", - "group-by.sql", - "grouping_set.sql", - "having.sql", - "ignored.sql", - "inline-table.sql", - "inner-join.sql", - "intersect-all.sql", - "interval.sql", - "join-empty-relation.sql", - "join-lateral.sql", - "json-functions.sql", - "like-all.sql", - "like-any.sql", - "limit.sql", - "literals.sql", - "map.sql", - "misc-functions.sql", - "natural-join.sql", - "null-handling.sql", - "null-propagation.sql", - "operators.sql", - "order-by-nulls-ordering.sql", - "order-by-ordinal.sql", - "outer-join.sql", - "parse-schema-string.sql", - "pivot.sql", - "pred-pushdown.sql", - "predicate-functions.sql", - "query_regex_column.sql", - "random.sql", - "regexp-functions.sql", - "show-create-table.sql", - "show-tables.sql", - "show-tblproperties.sql", - "show-views.sql", - "show_columns.sql", - "sql-compatibility-functions.sql", - "string-functions.sql", - "struct.sql", - "subexp-elimination.sql", - "table-aliases.sql", - "table-valued-functions.sql", - "tablesample-negative.sql", - "subquery/exists-subquery/exists-aggregate.sql", - "subquery/exists-subquery/exists-basic.sql", - "subquery/exists-subquery/exists-cte.sql", - "subquery/exists-subquery/exists-having.sql", - "subquery/exists-subquery/exists-joins-and-set-ops.sql", - "subquery/exists-subquery/exists-orderby-limit.sql", - "subquery/exists-subquery/exists-within-and-or.sql", - "subquery/in-subquery/in-basic.sql", - "subquery/in-subquery/in-group-by.sql", - "subquery/in-subquery/in-having.sql", - "subquery/in-subquery/in-joins.sql", - "subquery/in-subquery/in-limit.sql", - "subquery/in-subquery/in-multiple-columns.sql", - "subquery/in-subquery/in-order-by.sql", - "subquery/in-subquery/in-set-operations.sql", - "subquery/in-subquery/in-with-cte.sql", - "subquery/in-subquery/nested-not-in.sql", - "subquery/in-subquery/not-in-group-by.sql", - "subquery/in-subquery/not-in-joins.sql", - "subquery/in-subquery/not-in-unit-tests-multi-column.sql", - "subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql", - "subquery/in-subquery/not-in-unit-tests-single-column.sql", - "subquery/in-subquery/not-in-unit-tests-single-column-literal.sql", - "subquery/in-subquery/simple-in.sql", - "subquery/negative-cases/invalid-correlation.sql", - "subquery/negative-cases/subq-input-typecheck.sql", - "subquery/scalar-subquery/scalar-subquery-predicate.sql", - "subquery/scalar-subquery/scalar-subquery-select.sql", - "subquery/subquery-in-from.sql", - "postgreSQL/aggregates_part1.sql", - "postgreSQL/aggregates_part2.sql", - "postgreSQL/aggregates_part3.sql", - "postgreSQL/aggregates_part4.sql", - "postgreSQL/boolean.sql", - "postgreSQL/case.sql", - "postgreSQL/comments.sql", - "postgreSQL/create_view.sql", - "postgreSQL/date.sql", - "postgreSQL/float4.sql", - "postgreSQL/insert.sql", - "postgreSQL/int2.sql", - "postgreSQL/int4.sql", - "postgreSQL/int8.sql", - "postgreSQL/interval.sql", - "postgreSQL/join.sql", - "postgreSQL/limit.sql", - "postgreSQL/numeric.sql", - "postgreSQL/select.sql", - "postgreSQL/select_distinct.sql", - "postgreSQL/select_having.sql", - 
"postgreSQL/select_implicit.sql", - "postgreSQL/strings.sql", - "postgreSQL/text.sql", - "postgreSQL/timestamp.sql", - "postgreSQL/union.sql", - "postgreSQL/window_part1.sql", - "postgreSQL/window_part2.sql", - "postgreSQL/window_part3.sql", - "postgreSQL/window_part4.sql", - "postgreSQL/with.sql", - "datetime-special.sql", - "timestamp-ansi.sql", - "timestamp.sql", - "arrayJoin.sql", - "binaryComparison.sql", - "booleanEquality.sql", - "caseWhenCoercion.sql", - "concat.sql", - "dateTimeOperations.sql", - "decimalPrecision.sql", - "division.sql", - "elt.sql", - "ifCoercion.sql", - "implicitTypeCasts.sql", - "inConversion.sql", - "mapZipWith.sql", - "mapconcat.sql", - "promoteStrings.sql", - "stringCastAndExpressions.sql", - "widenSetOperationTypes.sql", - "windowFrameCoercion.sql", - "timestamp-ltz.sql", - "timestamp-ntz.sql", - "timezone.sql", - "transform.sql", - "try_arithmetic.sql", - "try_cast.sql", - "udaf.sql", - "union.sql", - "using-join.sql", - "window.sql", - "udf-union.sql", - "udf-window.sql" - ) - - /** - * List of supported cases to run with Clickhouse backend, in lower case. Please add to the - * supported list after enabling a sql test. - */ - private val CHSupportedList: Set[String] = Set() - // List of supported cases to run with a certain backend, in lower case. - private val supportedList: Set[String] = if (isCHBackend) { - CHSupportedList - } else { - veloxSupportedList - } + private val supportedList: Set[String] = + BackendsApiManager.getTestApiInstance.getSupportedSQLQueryTests(TestApi.SparkVersion33()) ++ + BackendsApiManager.getTestApiInstance.getOverwriteSQLQueryTests(TestApi.SparkVersion33()) // Create all the test cases. listTestCases.foreach(createScalaTestCase) @@ -768,35 +584,38 @@ class GlutenSQLQueryTestSuite } protected lazy val listTestCases: Seq[TestCase] = { - listFilesRecursively(new File(inputFilePath)).flatMap { - file => - val resultFile = file.getAbsolutePath.replace(inputFilePath, goldenFilePath) + ".out" - val absPath = file.getAbsolutePath - val testCaseName = absPath.stripPrefix(inputFilePath).stripPrefix(File.separator) - - if ( - file.getAbsolutePath.startsWith( - s"$inputFilePath${File.separator}udf${File.separator}postgreSQL") - ) { - Seq(TestScalaUDF("udf"), TestPythonUDF("udf"), TestScalarPandasUDF("udf")).map { - udf => UDFPgSQLTestCase(s"$testCaseName - ${udf.prettyName}", absPath, resultFile, udf) - } - } else if (file.getAbsolutePath.startsWith(s"$inputFilePath${File.separator}udf")) { - Seq(TestScalaUDF("udf"), TestPythonUDF("udf"), TestScalarPandasUDF("udf")).map { - udf => UDFTestCase(s"$testCaseName - ${udf.prettyName}", absPath, resultFile, udf) - } - } else if (file.getAbsolutePath.startsWith(s"$inputFilePath${File.separator}postgreSQL")) { - PgSQLTestCase(testCaseName, absPath, resultFile) :: Nil - } else if (file.getAbsolutePath.startsWith(s"$inputFilePath${File.separator}ansi")) { - AnsiTestCase(testCaseName, absPath, resultFile) :: Nil - } else if ( - file.getAbsolutePath.startsWith(s"$inputFilePath${File.separator}timestampNTZ") - ) { - TimestampNTZTestCase(testCaseName, absPath, resultFile) :: Nil - } else { - RegularTestCase(testCaseName, absPath, resultFile) :: Nil + val createTestCase = (file: File, parentDir: String, resultPath: String) => { + val resultFile = file.getAbsolutePath.replace(parentDir, resultPath) + ".out" + val absPath = file.getAbsolutePath + val testCaseName = absPath.stripPrefix(parentDir).stripPrefix(File.separator) + + if ( + file.getAbsolutePath.startsWith( + 
s"$parentDir${File.separator}udf${File.separator}postgreSQL") + ) { + Seq(TestScalaUDF("udf"), TestPythonUDF("udf"), TestScalarPandasUDF("udf")).map { + udf => UDFPgSQLTestCase(s"$testCaseName - ${udf.prettyName}", absPath, resultFile, udf) + } + } else if (file.getAbsolutePath.startsWith(s"$parentDir${File.separator}udf")) { + Seq(TestScalaUDF("udf"), TestPythonUDF("udf"), TestScalarPandasUDF("udf")).map { + udf => UDFTestCase(s"$testCaseName - ${udf.prettyName}", absPath, resultFile, udf) } + } else if (file.getAbsolutePath.startsWith(s"$parentDir${File.separator}postgreSQL")) { + PgSQLTestCase(testCaseName, absPath, resultFile) :: Nil + } else if (file.getAbsolutePath.startsWith(s"$parentDir${File.separator}ansi")) { + AnsiTestCase(testCaseName, absPath, resultFile) :: Nil + } else if (file.getAbsolutePath.startsWith(s"$parentDir${File.separator}timestampNTZ")) { + TimestampNTZTestCase(testCaseName, absPath, resultFile) :: Nil + } else { + RegularTestCase(testCaseName, absPath, resultFile) :: Nil + } } + val overwriteTestCases = listFilesRecursively(new File(overwriteInputFilePath)) + .flatMap(createTestCase(_, overwriteInputFilePath, overwriteGoldenFilePath)) + val overwriteTestCaseNames = overwriteTestCases.map(_.name) + listFilesRecursively(new File(inputFilePath)) + .flatMap(createTestCase(_, inputFilePath, goldenFilePath)) + .filterNot(testCase => overwriteTestCaseNames.contains(testCase.name)) ++ overwriteTestCases } /** Returns all the files (not directories) in a directory, recursively. */