From 050069e22f7a29a50e01dc1824bd8cd0cabbf479 Mon Sep 17 00:00:00 2001
From: "Ma, Rong"
Date: Mon, 20 Nov 2023 12:11:22 +0800
Subject: [PATCH] Move the supported SQL query test lists into backend TestApi
 and remove corr from group-by.sql

---
 .../backendsapi/clickhouse/CHBackend.scala    |   2 +
 .../backendsapi/clickhouse/CHTestApi.scala    |  28 +
 .../backendsapi/velox/TestApiImpl.scala       | 216 ++++++
 .../backendsapi/velox/VeloxBackend.scala      |   2 +
 .../resources/sql-tests/inputs/group-by.sql   | 187 +++++
 .../sql-tests/inputs/udf/udf-group-by.sql     | 152 ++++
 .../sql-tests/results/group-by.sql.out        | 657 ++++++++++++++++++
 .../results/udf/udf-group-by.sql.out          | 514 ++++++++++++++
 .../glutenproject/backendsapi/Backend.scala   |   2 +
 .../backendsapi/BackendsApiManager.scala      |   4 +
 .../glutenproject/backendsapi/TestApi.scala   |  25 +
 .../spark/sql/GlutenSQLQueryTestSuite.scala   | 257 ++-----
 12 files changed, 1830 insertions(+), 216 deletions(-)
 create mode 100644 backends-clickhouse/src/main/scala/io/glutenproject/backendsapi/clickhouse/CHTestApi.scala
 create mode 100644 backends-velox/src/main/scala/io/glutenproject/backendsapi/velox/TestApiImpl.scala
 create mode 100644 backends-velox/src/test/resources/sql-tests/inputs/group-by.sql
 create mode 100644 backends-velox/src/test/resources/sql-tests/inputs/udf/udf-group-by.sql
 create mode 100644 backends-velox/src/test/resources/sql-tests/results/group-by.sql.out
 create mode 100644 backends-velox/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out
 create mode 100644 gluten-core/src/main/scala/io/glutenproject/backendsapi/TestApi.scala

diff --git a/backends-clickhouse/src/main/scala/io/glutenproject/backendsapi/clickhouse/CHBackend.scala b/backends-clickhouse/src/main/scala/io/glutenproject/backendsapi/clickhouse/CHBackend.scala
index ad65c65e64e07..d5af65aa89502 100644
--- a/backends-clickhouse/src/main/scala/io/glutenproject/backendsapi/clickhouse/CHBackend.scala
+++ b/backends-clickhouse/src/main/scala/io/glutenproject/backendsapi/clickhouse/CHBackend.scala
@@ -46,6 +46,8 @@ class CHBackend extends Backend {
   override def listenerApi(): ListenerApi = new CHListenerApi
   override def broadcastApi(): BroadcastApi = new CHBroadcastApi
   override def settings(): BackendSettingsApi = CHBackendSettings
+
+  override def testApi(): TestApi = new CHTestApi
 }
 
 object CHBackend {
diff --git a/backends-clickhouse/src/main/scala/io/glutenproject/backendsapi/clickhouse/CHTestApi.scala b/backends-clickhouse/src/main/scala/io/glutenproject/backendsapi/clickhouse/CHTestApi.scala
new file mode 100644
index 0000000000000..2a7a9a4785e32
--- /dev/null
+++ b/backends-clickhouse/src/main/scala/io/glutenproject/backendsapi/clickhouse/CHTestApi.scala
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.glutenproject.backendsapi.clickhouse
+
+import io.glutenproject.backendsapi.TestApi
+
+class CHTestApi extends TestApi {
+  override def getSupportedSQLQueryTests: Set[String] = Set[String]()
+
+  override def getOverwriteSQLQueryTests: Set[String] = Set[String]()
+
+  override def getOverwriteSQLQueryResourcePath: String =
+    getClass.getResource("/").getPath + "../../test/resources/resources/sql-tests"
+}
diff --git a/backends-velox/src/main/scala/io/glutenproject/backendsapi/velox/TestApiImpl.scala b/backends-velox/src/main/scala/io/glutenproject/backendsapi/velox/TestApiImpl.scala
new file mode 100644
index 0000000000000..f1ed05cb28194
--- /dev/null
+++ b/backends-velox/src/main/scala/io/glutenproject/backendsapi/velox/TestApiImpl.scala
@@ -0,0 +1,216 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.glutenproject.backendsapi.velox
+
+import io.glutenproject.backendsapi.TestApi
+
+class TestApiImpl extends TestApi {
+
+  /**
+   * List of supported cases to run with the Velox backend, in lower case. Please add to this
+   * list after enabling a SQL test.
+ */ + override def getSupportedSQLQueryTests: Set[String] = { + Set( + "bitwise.sql", + "cast.sql", + "change-column.sql", + "charvarchar.sql", + "columnresolution-negative.sql", + "columnresolution-views.sql", + "columnresolution.sql", + "comments.sql", + "comparator.sql", + "count.sql", + "cross-join.sql", + "csv-functions.sql", + "cte-legacy.sql", + "cte-nested.sql", + "cte-nonlegacy.sql", + "cte.sql", + "current_database_catalog.sql", + "date.sql", + "datetime-formatting-invalid.sql", + "datetime-formatting-legacy.sql", + "datetime-formatting.sql", + "datetime-legacy.sql", + "datetime-parsing-invalid.sql", + "datetime-parsing-legacy.sql", + "datetime-parsing.sql", + "datetime-special.sql", + "decimalArithmeticOperations.sql", + "describe-part-after-analyze.sql", + "describe-query.sql", + "describe-table-after-alter-table.sql", + // result match, but the order is not right + // "describe-table-column.sql", + "describe.sql", + "except-all.sql", + "except.sql", + "extract.sql", + "group-by-filter.sql", + "group-by-ordinal.sql", + "grouping_set.sql", + "having.sql", + "ignored.sql", + "inline-table.sql", + "inner-join.sql", + "intersect-all.sql", + "interval.sql", + "join-empty-relation.sql", + "join-lateral.sql", + "json-functions.sql", + "like-all.sql", + "like-any.sql", + "limit.sql", + "literals.sql", + "map.sql", + "misc-functions.sql", + "natural-join.sql", + "null-handling.sql", + "null-propagation.sql", + "operators.sql", + "order-by-nulls-ordering.sql", + "order-by-ordinal.sql", + "outer-join.sql", + "parse-schema-string.sql", + "pivot.sql", + "pred-pushdown.sql", + "predicate-functions.sql", + "query_regex_column.sql", + "random.sql", + "regexp-functions.sql", + "show-create-table.sql", + "show-tables.sql", + "show-tblproperties.sql", + "show-views.sql", + "show_columns.sql", + "sql-compatibility-functions.sql", + "string-functions.sql", + "struct.sql", + "subexp-elimination.sql", + "table-aliases.sql", + "table-valued-functions.sql", + "tablesample-negative.sql", + "subquery/exists-subquery/exists-aggregate.sql", + "subquery/exists-subquery/exists-basic.sql", + "subquery/exists-subquery/exists-cte.sql", + "subquery/exists-subquery/exists-having.sql", + "subquery/exists-subquery/exists-joins-and-set-ops.sql", + "subquery/exists-subquery/exists-orderby-limit.sql", + "subquery/exists-subquery/exists-within-and-or.sql", + "subquery/in-subquery/in-basic.sql", + "subquery/in-subquery/in-group-by.sql", + "subquery/in-subquery/in-having.sql", + "subquery/in-subquery/in-joins.sql", + "subquery/in-subquery/in-limit.sql", + "subquery/in-subquery/in-multiple-columns.sql", + "subquery/in-subquery/in-order-by.sql", + "subquery/in-subquery/in-set-operations.sql", + "subquery/in-subquery/in-with-cte.sql", + "subquery/in-subquery/nested-not-in.sql", + "subquery/in-subquery/not-in-group-by.sql", + "subquery/in-subquery/not-in-joins.sql", + "subquery/in-subquery/not-in-unit-tests-multi-column.sql", + "subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql", + "subquery/in-subquery/not-in-unit-tests-single-column.sql", + "subquery/in-subquery/not-in-unit-tests-single-column-literal.sql", + "subquery/in-subquery/simple-in.sql", + "subquery/negative-cases/invalid-correlation.sql", + "subquery/negative-cases/subq-input-typecheck.sql", + "subquery/scalar-subquery/scalar-subquery-predicate.sql", + "subquery/scalar-subquery/scalar-subquery-select.sql", + "subquery/subquery-in-from.sql", + "postgreSQL/aggregates_part1.sql", + "postgreSQL/aggregates_part2.sql", + 
"postgreSQL/aggregates_part3.sql", + "postgreSQL/aggregates_part4.sql", + "postgreSQL/boolean.sql", + "postgreSQL/case.sql", + "postgreSQL/comments.sql", + "postgreSQL/create_view.sql", + "postgreSQL/date.sql", + "postgreSQL/float4.sql", + "postgreSQL/insert.sql", + "postgreSQL/int2.sql", + "postgreSQL/int4.sql", + "postgreSQL/int8.sql", + "postgreSQL/interval.sql", + "postgreSQL/join.sql", + "postgreSQL/limit.sql", + "postgreSQL/numeric.sql", + "postgreSQL/select.sql", + "postgreSQL/select_distinct.sql", + "postgreSQL/select_having.sql", + "postgreSQL/select_implicit.sql", + "postgreSQL/strings.sql", + "postgreSQL/text.sql", + "postgreSQL/timestamp.sql", + "postgreSQL/union.sql", + "postgreSQL/window_part1.sql", + "postgreSQL/window_part2.sql", + "postgreSQL/window_part3.sql", + "postgreSQL/window_part4.sql", + "postgreSQL/with.sql", + "datetime-special.sql", + "timestamp-ansi.sql", + "timestamp.sql", + "arrayJoin.sql", + "binaryComparison.sql", + "booleanEquality.sql", + "caseWhenCoercion.sql", + "concat.sql", + "dateTimeOperations.sql", + "decimalPrecision.sql", + "division.sql", + "elt.sql", + "ifCoercion.sql", + "implicitTypeCasts.sql", + "inConversion.sql", + "mapZipWith.sql", + "mapconcat.sql", + "promoteStrings.sql", + "stringCastAndExpressions.sql", + "widenSetOperationTypes.sql", + "windowFrameCoercion.sql", + "timestamp-ltz.sql", + "timestamp-ntz.sql", + "timezone.sql", + "transform.sql", + "try_arithmetic.sql", + "try_cast.sql", + "udaf.sql", + "union.sql", + "using-join.sql", + // result match, but the order is not right + // "window.sql", + "udf/udf-union.sql", + "udf/udf-window.sql" + ) + } + + override def getOverwriteSQLQueryTests: Set[String] = Set[String]( + // Velox corr has better computation logic but it fails Spark's precision check. + // Remove corr in group-by.sql + "group-by.sql", + // Remove corr in udf/udf-group-by.sql + "udf/udf-group-by.sql" + ) + + override def getOverwriteSQLQueryResourcePath: String = + getClass.getResource("/") + "../../test/resources/resources/sql-tests" +} diff --git a/backends-velox/src/main/scala/io/glutenproject/backendsapi/velox/VeloxBackend.scala b/backends-velox/src/main/scala/io/glutenproject/backendsapi/velox/VeloxBackend.scala index 40d42b90873ec..bc34604495a7e 100644 --- a/backends-velox/src/main/scala/io/glutenproject/backendsapi/velox/VeloxBackend.scala +++ b/backends-velox/src/main/scala/io/glutenproject/backendsapi/velox/VeloxBackend.scala @@ -47,6 +47,8 @@ class VeloxBackend extends Backend { override def listenerApi(): ListenerApi = new ListenerApiImpl override def broadcastApi(): BroadcastApi = new BroadcastApiImpl override def settings(): BackendSettingsApi = BackendSettings + + override def testApi(): TestApi = new TestApiImpl } object VeloxBackend { diff --git a/backends-velox/src/test/resources/sql-tests/inputs/group-by.sql b/backends-velox/src/test/resources/sql-tests/inputs/group-by.sql new file mode 100644 index 0000000000000..4b2e12975a329 --- /dev/null +++ b/backends-velox/src/test/resources/sql-tests/inputs/group-by.sql @@ -0,0 +1,187 @@ +-- Test aggregate operator with codegen on and off. +--CONFIG_DIM1 spark.sql.codegen.wholeStage=true +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=CODEGEN_ONLY +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=NO_CODEGEN + +-- Test data. 
+CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES +(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, null) +AS testData(a, b); + +-- Aggregate with empty GroupBy expressions. +SELECT a, COUNT(b) FROM testData; +SELECT COUNT(a), COUNT(b) FROM testData; + +-- Aggregate with non-empty GroupBy expressions. +SELECT a, COUNT(b) FROM testData GROUP BY a; +SELECT a, COUNT(b) FROM testData GROUP BY b; +SELECT COUNT(a), COUNT(b) FROM testData GROUP BY a; + +-- Aggregate grouped by literals. +SELECT 'foo', COUNT(a) FROM testData GROUP BY 1; + +-- Aggregate grouped by literals (whole stage code generation). +SELECT 'foo' FROM testData WHERE a = 0 GROUP BY 1; + +-- Aggregate grouped by literals (hash aggregate). +SELECT 'foo', APPROX_COUNT_DISTINCT(a) FROM testData WHERE a = 0 GROUP BY 1; + +-- Aggregate grouped by literals (sort aggregate). +SELECT 'foo', MAX(STRUCT(a)) FROM testData WHERE a = 0 GROUP BY 1; + +-- Aggregate with complex GroupBy expressions. +SELECT a + b, COUNT(b) FROM testData GROUP BY a + b; +SELECT a + 2, COUNT(b) FROM testData GROUP BY a + 1; +SELECT a + 1 + 1, COUNT(b) FROM testData GROUP BY a + 1; + +-- Aggregate with nulls. +SELECT SKEWNESS(a), KURTOSIS(a), MIN(a), MAX(a), AVG(a), VARIANCE(a), STDDEV(a), SUM(a), COUNT(a) +FROM testData; + +-- Aggregate with foldable input and multiple distinct groups. +SELECT COUNT(DISTINCT b), COUNT(DISTINCT b, c) FROM (SELECT 1 AS a, 2 AS b, 3 AS c) GROUP BY a; + +-- Aliases in SELECT could be used in GROUP BY +SELECT a AS k, COUNT(b) FROM testData GROUP BY k; +SELECT a AS k, COUNT(b) FROM testData GROUP BY k HAVING k > 1; + +-- Aggregate functions cannot be used in GROUP BY +SELECT COUNT(b) AS k FROM testData GROUP BY k; + +-- Test data. +CREATE OR REPLACE TEMPORARY VIEW testDataHasSameNameWithAlias AS SELECT * FROM VALUES +(1, 1, 3), (1, 2, 1) AS testDataHasSameNameWithAlias(k, a, v); +SELECT k AS a, COUNT(v) FROM testDataHasSameNameWithAlias GROUP BY a; + +-- turn off group by aliases +set spark.sql.groupByAliases=false; + +-- Check analysis exceptions +SELECT a AS k, COUNT(b) FROM testData GROUP BY k; + +-- Aggregate with empty input and non-empty GroupBy expressions. +SELECT a, COUNT(1) FROM testData WHERE false GROUP BY a; + +-- Aggregate with empty input and empty GroupBy expressions. 
+SELECT COUNT(1) FROM testData WHERE false; +SELECT 1 FROM (SELECT COUNT(1) FROM testData WHERE false) t; + +-- Aggregate with empty GroupBy expressions and filter on top +SELECT 1 from ( + SELECT 1 AS z, + MIN(a.x) + FROM (select 1 as x) a + WHERE false +) b +where b.z != b.z; + +-- SPARK-25708 HAVING without GROUP BY means global aggregate +SELECT 1 FROM range(10) HAVING true; + +SELECT 1 FROM range(10) HAVING MAX(id) > 0; + +SELECT id FROM range(10) HAVING id > 0; + +SET spark.sql.legacy.parser.havingWithoutGroupByAsWhere=true; + +SELECT 1 FROM range(10) HAVING true; + +SELECT 1 FROM range(10) HAVING MAX(id) > 0; + +SELECT id FROM range(10) HAVING id > 0; + +SET spark.sql.legacy.parser.havingWithoutGroupByAsWhere=false; + +-- Test data +CREATE OR REPLACE TEMPORARY VIEW test_agg AS SELECT * FROM VALUES + (1, true), (1, false), + (2, true), + (3, false), (3, null), + (4, null), (4, null), + (5, null), (5, true), (5, false) AS test_agg(k, v); + +-- empty table +SELECT every(v), some(v), any(v), bool_and(v), bool_or(v) FROM test_agg WHERE 1 = 0; + +-- all null values +SELECT every(v), some(v), any(v), bool_and(v), bool_or(v) FROM test_agg WHERE k = 4; + +-- aggregates are null Filtering +SELECT every(v), some(v), any(v), bool_and(v), bool_or(v) FROM test_agg WHERE k = 5; + +-- group by +SELECT k, every(v), some(v), any(v), bool_and(v), bool_or(v) FROM test_agg GROUP BY k; + +-- having +SELECT k, every(v) FROM test_agg GROUP BY k HAVING every(v) = false; +SELECT k, every(v) FROM test_agg GROUP BY k HAVING every(v) IS NULL; + +-- basic subquery path to make sure rewrite happens in both parent and child plans. +SELECT k, + Every(v) AS every +FROM test_agg +WHERE k = 2 + AND v IN (SELECT Any(v) + FROM test_agg + WHERE k = 1) +GROUP BY k; + +-- basic subquery path to make sure rewrite happens in both parent and child plans. +SELECT k, + Every(v) AS every +FROM test_agg +WHERE k = 2 + AND v IN (SELECT Every(v) + FROM test_agg + WHERE k = 1) +GROUP BY k; + +-- input type checking Int +SELECT every(1); + +-- input type checking Short +SELECT some(1S); + +-- input type checking Long +SELECT any(1L); + +-- input type checking String +SELECT every("true"); + +-- input type checking Decimal +SELECT bool_and(1.0); + +-- input type checking double +SELECT bool_or(1.0D); + +-- every/some/any aggregates/bool_and/bool_or are supported as windows expression. +SELECT k, v, every(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg; +SELECT k, v, some(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg; +SELECT k, v, any(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg; +SELECT k, v, bool_and(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg; +SELECT k, v, bool_or(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg; + +-- Having referencing aggregate expressions is ok. 
+SELECT count(*) FROM test_agg HAVING count(*) > 1L; +SELECT k, max(v) FROM test_agg GROUP BY k HAVING max(v) = true; + +-- Aggrgate expressions can be referenced through an alias +SELECT * FROM (SELECT COUNT(*) AS cnt FROM test_agg) WHERE cnt > 1L; + +-- Error when aggregate expressions are in where clause directly +SELECT count(*) FROM test_agg WHERE count(*) > 1L; +SELECT count(*) FROM test_agg WHERE count(*) + 1L > 1L; +SELECT count(*) FROM test_agg WHERE k = 1 or k = 2 or count(*) + 1L > 1L or max(k) > 1; + +-- Aggregate with multiple distinct decimal columns +SELECT AVG(DISTINCT decimal_col), SUM(DISTINCT decimal_col) FROM VALUES (CAST(1 AS DECIMAL(9, 0))) t(decimal_col); + +-- SPARK-34581: Don't optimize out grouping expressions from aggregate expressions without aggregate function +SELECT not(a IS NULL), count(*) AS c +FROM testData +GROUP BY a IS NULL; + +SELECT if(not(a IS NULL), rand(0), 1), count(*) AS c +FROM testData +GROUP BY a IS NULL; + diff --git a/backends-velox/src/test/resources/sql-tests/inputs/udf/udf-group-by.sql b/backends-velox/src/test/resources/sql-tests/inputs/udf/udf-group-by.sql new file mode 100644 index 0000000000000..a4df72f44ebfc --- /dev/null +++ b/backends-velox/src/test/resources/sql-tests/inputs/udf/udf-group-by.sql @@ -0,0 +1,152 @@ +-- This test file was converted from group-by.sql. +-- Test data. +CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES +(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, null) +AS testData(a, b); + +-- Aggregate with empty GroupBy expressions. +SELECT udf(a), udf(COUNT(b)) FROM testData; +SELECT COUNT(udf(a)), udf(COUNT(b)) FROM testData; + +-- Aggregate with non-empty GroupBy expressions. +SELECT udf(a), COUNT(udf(b)) FROM testData GROUP BY a; +SELECT udf(a), udf(COUNT(udf(b))) FROM testData GROUP BY b; +SELECT COUNT(udf(a)), COUNT(udf(b)) FROM testData GROUP BY udf(a); + +-- Aggregate grouped by literals. +SELECT 'foo', COUNT(udf(a)) FROM testData GROUP BY 1; + +-- Aggregate grouped by literals (whole stage code generation). +SELECT 'foo' FROM testData WHERE a = 0 GROUP BY udf(1); + +-- Aggregate grouped by literals (hash aggregate). +SELECT 'foo', udf(APPROX_COUNT_DISTINCT(udf(a))) FROM testData WHERE a = 0 GROUP BY udf(1); + +-- Aggregate grouped by literals (sort aggregate). +SELECT 'foo', MAX(STRUCT(udf(a))) FROM testData WHERE a = 0 GROUP BY udf(1); + +-- Aggregate with complex GroupBy expressions. +SELECT udf(a + b), udf(COUNT(b)) FROM testData GROUP BY a + b; +SELECT udf(a + 2), udf(COUNT(b)) FROM testData GROUP BY a + 1; +SELECT udf(a + 1) + 1, udf(COUNT(b)) FROM testData GROUP BY udf(a + 1); + +-- Aggregate with nulls. +SELECT SKEWNESS(udf(a)), udf(KURTOSIS(a)), udf(MIN(a)), MAX(udf(a)), udf(AVG(udf(a))), udf(VARIANCE(a)), STDDEV(udf(a)), udf(SUM(a)), udf(COUNT(a)) +FROM testData; + +-- Aggregate with foldable input and multiple distinct groups. +SELECT COUNT(DISTINCT udf(b)), udf(COUNT(DISTINCT b, c)) FROM (SELECT 1 AS a, 2 AS b, 3 AS c) GROUP BY udf(a); + +-- Aliases in SELECT could be used in GROUP BY +SELECT udf(a) AS k, COUNT(udf(b)) FROM testData GROUP BY k; +SELECT a AS k, udf(COUNT(b)) FROM testData GROUP BY k HAVING k > 1; + +-- Aggregate functions cannot be used in GROUP BY +SELECT udf(COUNT(b)) AS k FROM testData GROUP BY k; + +-- Test data. 
+CREATE OR REPLACE TEMPORARY VIEW testDataHasSameNameWithAlias AS SELECT * FROM VALUES +(1, 1, 3), (1, 2, 1) AS testDataHasSameNameWithAlias(k, a, v); +SELECT k AS a, udf(COUNT(udf(v))) FROM testDataHasSameNameWithAlias GROUP BY udf(a); + +-- turn off group by aliases +set spark.sql.groupByAliases=false; + +-- Check analysis exceptions +SELECT a AS k, udf(COUNT(udf(b))) FROM testData GROUP BY k; + +-- Aggregate with empty input and non-empty GroupBy expressions. +SELECT udf(a), COUNT(udf(1)) FROM testData WHERE false GROUP BY udf(a); + +-- Aggregate with empty input and empty GroupBy expressions. +SELECT udf(COUNT(1)) FROM testData WHERE false; +SELECT 1 FROM (SELECT udf(COUNT(1)) FROM testData WHERE false) t; + +-- Aggregate with empty GroupBy expressions and filter on top +SELECT 1 from ( + SELECT 1 AS z, + udf(MIN(a.x)) + FROM (select 1 as x) a + WHERE false +) b +where b.z != b.z; + +-- SPARK-25708 HAVING without GROUP BY means global aggregate +SELECT udf(1) FROM range(10) HAVING true; + +SELECT udf(udf(1)) FROM range(10) HAVING MAX(id) > 0; + +SELECT udf(id) FROM range(10) HAVING id > 0; + +-- Test data +CREATE OR REPLACE TEMPORARY VIEW test_agg AS SELECT * FROM VALUES + (1, true), (1, false), + (2, true), + (3, false), (3, null), + (4, null), (4, null), + (5, null), (5, true), (5, false) AS test_agg(k, v); + +-- empty table +SELECT udf(every(v)), udf(some(v)), any(v) FROM test_agg WHERE 1 = 0; + +-- all null values +SELECT udf(every(udf(v))), some(v), any(v) FROM test_agg WHERE k = 4; + +-- aggregates are null Filtering +SELECT every(v), udf(some(v)), any(v) FROM test_agg WHERE k = 5; + +-- group by +SELECT udf(k), every(v), udf(some(v)), any(v) FROM test_agg GROUP BY udf(k); + +-- having +SELECT udf(k), every(v) FROM test_agg GROUP BY k HAVING every(v) = false; +SELECT udf(k), udf(every(v)) FROM test_agg GROUP BY udf(k) HAVING every(v) IS NULL; + +-- basic subquery path to make sure rewrite happens in both parent and child plans. +SELECT udf(k), + udf(Every(v)) AS every +FROM test_agg +WHERE k = 2 + AND v IN (SELECT Any(v) + FROM test_agg + WHERE k = 1) +GROUP BY udf(k); + +-- basic subquery path to make sure rewrite happens in both parent and child plans. +SELECT udf(udf(k)), + Every(v) AS every +FROM test_agg +WHERE k = 2 + AND v IN (SELECT Every(v) + FROM test_agg + WHERE k = 1) +GROUP BY udf(udf(k)); + +-- input type checking Int +SELECT every(udf(1)); + +-- input type checking Short +SELECT some(udf(1S)); + +-- input type checking Long +SELECT any(udf(1L)); + +-- input type checking String +SELECT udf(every("true")); + +-- every/some/any aggregates are supported as windows expression. +SELECT k, v, every(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg; +SELECT k, udf(udf(v)), some(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg; +SELECT udf(udf(k)), v, any(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg; + +-- Having referencing aggregate expressions is ok. 
+SELECT udf(count(*)) FROM test_agg HAVING count(*) > 1L; +SELECT k, udf(max(v)) FROM test_agg GROUP BY k HAVING max(v) = true; + +-- Aggrgate expressions can be referenced through an alias +SELECT * FROM (SELECT udf(COUNT(*)) AS cnt FROM test_agg) WHERE cnt > 1L; + +-- Error when aggregate expressions are in where clause directly +SELECT udf(count(*)) FROM test_agg WHERE count(*) > 1L; +SELECT udf(count(*)) FROM test_agg WHERE count(*) + 1L > 1L; +SELECT udf(count(*)) FROM test_agg WHERE k = 1 or k = 2 or count(*) + 1L > 1L or max(k) > 1; diff --git a/backends-velox/src/test/resources/sql-tests/results/group-by.sql.out b/backends-velox/src/test/resources/sql-tests/results/group-by.sql.out new file mode 100644 index 0000000000000..8986ca9b0504b --- /dev/null +++ b/backends-velox/src/test/resources/sql-tests/results/group-by.sql.out @@ -0,0 +1,657 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 64 + + +-- !query +CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES +(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, null) +AS testData(a, b) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT a, COUNT(b) FROM testData +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +grouping expressions sequence is empty, and 'testdata.a' is not an aggregate function. Wrap '(count(testdata.b) AS `count(b)`)' in windowing function(s) or wrap 'testdata.a' in first() (or first_value) if you don't care which value you get. + + +-- !query +SELECT COUNT(a), COUNT(b) FROM testData +-- !query schema +struct +-- !query output +7 7 + + +-- !query +SELECT a, COUNT(b) FROM testData GROUP BY a +-- !query schema +struct +-- !query output +1 2 +2 2 +3 2 +NULL 1 + + +-- !query +SELECT a, COUNT(b) FROM testData GROUP BY b +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +expression 'testdata.a' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get. + + +-- !query +SELECT COUNT(a), COUNT(b) FROM testData GROUP BY a +-- !query schema +struct +-- !query output +0 1 +2 2 +2 2 +3 2 + + +-- !query +SELECT 'foo', COUNT(a) FROM testData GROUP BY 1 +-- !query schema +struct +-- !query output +foo 7 + + +-- !query +SELECT 'foo' FROM testData WHERE a = 0 GROUP BY 1 +-- !query schema +struct +-- !query output + + + +-- !query +SELECT 'foo', APPROX_COUNT_DISTINCT(a) FROM testData WHERE a = 0 GROUP BY 1 +-- !query schema +struct +-- !query output + + + +-- !query +SELECT 'foo', MAX(STRUCT(a)) FROM testData WHERE a = 0 GROUP BY 1 +-- !query schema +struct> +-- !query output + + + +-- !query +SELECT a + b, COUNT(b) FROM testData GROUP BY a + b +-- !query schema +struct<(a + b):int,count(b):bigint> +-- !query output +2 1 +3 2 +4 2 +5 1 +NULL 1 + + +-- !query +SELECT a + 2, COUNT(b) FROM testData GROUP BY a + 1 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +expression 'testdata.a' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get. 
+ + +-- !query +SELECT a + 1 + 1, COUNT(b) FROM testData GROUP BY a + 1 +-- !query schema +struct<((a + 1) + 1):int,count(b):bigint> +-- !query output +3 2 +4 2 +5 2 +NULL 1 + + +-- !query +SELECT SKEWNESS(a), KURTOSIS(a), MIN(a), MAX(a), AVG(a), VARIANCE(a), STDDEV(a), SUM(a), COUNT(a) +FROM testData +-- !query schema +struct +-- !query output +-0.2723801058145729 -1.5069204152249134 1 3 2.142857142857143 0.8095238095238094 0.8997354108424372 15 7 + + +-- !query +SELECT COUNT(DISTINCT b), COUNT(DISTINCT b, c) FROM (SELECT 1 AS a, 2 AS b, 3 AS c) GROUP BY a +-- !query schema +struct +-- !query output +1 1 + + +-- !query +SELECT a AS k, COUNT(b) FROM testData GROUP BY k +-- !query schema +struct +-- !query output +1 2 +2 2 +3 2 +NULL 1 + + +-- !query +SELECT a AS k, COUNT(b) FROM testData GROUP BY k HAVING k > 1 +-- !query schema +struct +-- !query output +2 2 +3 2 + + +-- !query +SELECT COUNT(b) AS k FROM testData GROUP BY k +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +aggregate functions are not allowed in GROUP BY, but found count(testdata.b) + + +-- !query +CREATE OR REPLACE TEMPORARY VIEW testDataHasSameNameWithAlias AS SELECT * FROM VALUES +(1, 1, 3), (1, 2, 1) AS testDataHasSameNameWithAlias(k, a, v) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT k AS a, COUNT(v) FROM testDataHasSameNameWithAlias GROUP BY a +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +expression 'testdatahassamenamewithalias.k' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get. + + +-- !query +set spark.sql.groupByAliases=false +-- !query schema +struct +-- !query output +spark.sql.groupByAliases false + + +-- !query +SELECT a AS k, COUNT(b) FROM testData GROUP BY k +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'k' given input columns: [testdata.a, testdata.b]; line 1 pos 47 + + +-- !query +SELECT a, COUNT(1) FROM testData WHERE false GROUP BY a +-- !query schema +struct +-- !query output + + + +-- !query +SELECT COUNT(1) FROM testData WHERE false +-- !query schema +struct +-- !query output +0 + + +-- !query +SELECT 1 FROM (SELECT COUNT(1) FROM testData WHERE false) t +-- !query schema +struct<1:int> +-- !query output +1 + + +-- !query +SELECT 1 from ( + SELECT 1 AS z, + MIN(a.x) + FROM (select 1 as x) a + WHERE false +) b +where b.z != b.z +-- !query schema +struct<1:int> +-- !query output + + + +-- !query +SELECT 1 FROM range(10) HAVING true +-- !query schema +struct<1:int> +-- !query output +1 + + +-- !query +SELECT 1 FROM range(10) HAVING MAX(id) > 0 +-- !query schema +struct<1:int> +-- !query output +1 + + +-- !query +SELECT id FROM range(10) HAVING id > 0 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +grouping expressions sequence is empty, and 'id' is not an aggregate function. Wrap '()' in windowing function(s) or wrap 'id' in first() (or first_value) if you don't care which value you get. 
+ + +-- !query +SET spark.sql.legacy.parser.havingWithoutGroupByAsWhere=true +-- !query schema +struct +-- !query output +spark.sql.legacy.parser.havingWithoutGroupByAsWhere true + + +-- !query +SELECT 1 FROM range(10) HAVING true +-- !query schema +struct<1:int> +-- !query output +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 + + +-- !query +SELECT 1 FROM range(10) HAVING MAX(id) > 0 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException + +Aggregate/Window/Generate expressions are not valid in where clause of the query. +Expression in where clause: [(max(id) > CAST(0 AS BIGINT))] +Invalid expressions: [max(id)] + + +-- !query +SELECT id FROM range(10) HAVING id > 0 +-- !query schema +struct +-- !query output +1 +2 +3 +4 +5 +6 +7 +8 +9 + + +-- !query +SET spark.sql.legacy.parser.havingWithoutGroupByAsWhere=false +-- !query schema +struct +-- !query output +spark.sql.legacy.parser.havingWithoutGroupByAsWhere false + + +-- !query +CREATE OR REPLACE TEMPORARY VIEW test_agg AS SELECT * FROM VALUES + (1, true), (1, false), + (2, true), + (3, false), (3, null), + (4, null), (4, null), + (5, null), (5, true), (5, false) AS test_agg(k, v) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT every(v), some(v), any(v), bool_and(v), bool_or(v) FROM test_agg WHERE 1 = 0 +-- !query schema +struct +-- !query output +NULL NULL NULL NULL NULL + + +-- !query +SELECT every(v), some(v), any(v), bool_and(v), bool_or(v) FROM test_agg WHERE k = 4 +-- !query schema +struct +-- !query output +NULL NULL NULL NULL NULL + + +-- !query +SELECT every(v), some(v), any(v), bool_and(v), bool_or(v) FROM test_agg WHERE k = 5 +-- !query schema +struct +-- !query output +false true true false true + + +-- !query +SELECT k, every(v), some(v), any(v), bool_and(v), bool_or(v) FROM test_agg GROUP BY k +-- !query schema +struct +-- !query output +1 false true true false true +2 true true true true true +3 false false false false false +4 NULL NULL NULL NULL NULL +5 false true true false true + + +-- !query +SELECT k, every(v) FROM test_agg GROUP BY k HAVING every(v) = false +-- !query schema +struct +-- !query output +1 false +3 false +5 false + + +-- !query +SELECT k, every(v) FROM test_agg GROUP BY k HAVING every(v) IS NULL +-- !query schema +struct +-- !query output +4 NULL + + +-- !query +SELECT k, + Every(v) AS every +FROM test_agg +WHERE k = 2 + AND v IN (SELECT Any(v) + FROM test_agg + WHERE k = 1) +GROUP BY k +-- !query schema +struct +-- !query output +2 true + + +-- !query +SELECT k, + Every(v) AS every +FROM test_agg +WHERE k = 2 + AND v IN (SELECT Every(v) + FROM test_agg + WHERE k = 1) +GROUP BY k +-- !query schema +struct +-- !query output + + + +-- !query +SELECT every(1) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'every(1)' due to data type mismatch: Input to function 'every' should have been boolean, but it's [int].; line 1 pos 7 + + +-- !query +SELECT some(1S) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'some(1S)' due to data type mismatch: Input to function 'some' should have been boolean, but it's [smallint].; line 1 pos 7 + + +-- !query +SELECT any(1L) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'any(1L)' due to data type mismatch: Input to function 'any' should have been boolean, but it's [bigint].; line 1 pos 7 + + +-- !query +SELECT every("true") +-- !query schema +struct<> +-- !query output 
+org.apache.spark.sql.AnalysisException +cannot resolve 'every('true')' due to data type mismatch: Input to function 'every' should have been boolean, but it's [string].; line 1 pos 7 + + +-- !query +SELECT bool_and(1.0) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'bool_and(1.0BD)' due to data type mismatch: Input to function 'bool_and' should have been boolean, but it's [decimal(2,1)].; line 1 pos 7 + + +-- !query +SELECT bool_or(1.0D) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'bool_or(1.0D)' due to data type mismatch: Input to function 'bool_or' should have been boolean, but it's [double].; line 1 pos 7 + + +-- !query +SELECT k, v, every(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg +-- !query schema +struct +-- !query output +1 false false +1 true false +2 true true +3 NULL NULL +3 false false +4 NULL NULL +4 NULL NULL +5 NULL NULL +5 false false +5 true false + + +-- !query +SELECT k, v, some(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg +-- !query schema +struct +-- !query output +1 false false +1 true true +2 true true +3 NULL NULL +3 false false +4 NULL NULL +4 NULL NULL +5 NULL NULL +5 false false +5 true true + + +-- !query +SELECT k, v, any(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg +-- !query schema +struct +-- !query output +1 false false +1 true true +2 true true +3 NULL NULL +3 false false +4 NULL NULL +4 NULL NULL +5 NULL NULL +5 false false +5 true true + + +-- !query +SELECT k, v, bool_and(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg +-- !query schema +struct +-- !query output +1 false false +1 true false +2 true true +3 NULL NULL +3 false false +4 NULL NULL +4 NULL NULL +5 NULL NULL +5 false false +5 true false + + +-- !query +SELECT k, v, bool_or(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg +-- !query schema +struct +-- !query output +1 false false +1 true true +2 true true +3 NULL NULL +3 false false +4 NULL NULL +4 NULL NULL +5 NULL NULL +5 false false +5 true true + + +-- !query +SELECT count(*) FROM test_agg HAVING count(*) > 1L +-- !query schema +struct +-- !query output +10 + + +-- !query +SELECT k, max(v) FROM test_agg GROUP BY k HAVING max(v) = true +-- !query schema +struct +-- !query output +1 true +2 true +5 true + + +-- !query +SELECT * FROM (SELECT COUNT(*) AS cnt FROM test_agg) WHERE cnt > 1L +-- !query schema +struct +-- !query output +10 + + +-- !query +SELECT count(*) FROM test_agg WHERE count(*) > 1L +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException + +Aggregate/Window/Generate expressions are not valid in where clause of the query. +Expression in where clause: [(count(1) > 1L)] +Invalid expressions: [count(1)] + + +-- !query +SELECT count(*) FROM test_agg WHERE count(*) + 1L > 1L +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException + +Aggregate/Window/Generate expressions are not valid in where clause of the query. +Expression in where clause: [((count(1) + 1L) > 1L)] +Invalid expressions: [count(1)] + + +-- !query +SELECT count(*) FROM test_agg WHERE k = 1 or k = 2 or count(*) + 1L > 1L or max(k) > 1 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException + +Aggregate/Window/Generate expressions are not valid in where clause of the query. 
+Expression in where clause: [(((test_agg.k = 1) OR (test_agg.k = 2)) OR (((count(1) + 1L) > 1L) OR (max(test_agg.k) > 1)))] +Invalid expressions: [count(1), max(test_agg.k)] + + +-- !query +SELECT AVG(DISTINCT decimal_col), SUM(DISTINCT decimal_col) FROM VALUES (CAST(1 AS DECIMAL(9, 0))) t(decimal_col) +-- !query schema +struct +-- !query output +1.0000 1 + + +-- !query +SELECT not(a IS NULL), count(*) AS c +FROM testData +GROUP BY a IS NULL +-- !query schema +struct<(NOT (a IS NULL)):boolean,c:bigint> +-- !query output +false 2 +true 7 + + +-- !query +SELECT if(not(a IS NULL), rand(0), 1), count(*) AS c +FROM testData +GROUP BY a IS NULL +-- !query schema +struct<(IF((NOT (a IS NULL)), rand(0), 1)):double,c:bigint> +-- !query output +0.7604953758285915 7 +1.0 2 diff --git a/backends-velox/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out b/backends-velox/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out new file mode 100644 index 0000000000000..26d55d341ae72 --- /dev/null +++ b/backends-velox/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out @@ -0,0 +1,514 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 52 + + +-- !query +CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES +(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, null) +AS testData(a, b) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT udf(a), udf(COUNT(b)) FROM testData +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +grouping expressions sequence is empty, and 'testdata.a' is not an aggregate function. Wrap '(CAST(udf(cast(count(b) as string)) AS BIGINT) AS `udf(count(b))`)' in windowing function(s) or wrap 'testdata.a' in first() (or first_value) if you don't care which value you get. + + +-- !query +SELECT COUNT(udf(a)), udf(COUNT(b)) FROM testData +-- !query schema +struct +-- !query output +7 7 + + +-- !query +SELECT udf(a), COUNT(udf(b)) FROM testData GROUP BY a +-- !query schema +struct +-- !query output +1 2 +2 2 +3 2 +NULL 1 + + +-- !query +SELECT udf(a), udf(COUNT(udf(b))) FROM testData GROUP BY b +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +expression 'testdata.a' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get. + + +-- !query +SELECT COUNT(udf(a)), COUNT(udf(b)) FROM testData GROUP BY udf(a) +-- !query schema +struct +-- !query output +0 1 +2 2 +2 2 +3 2 + + +-- !query +SELECT 'foo', COUNT(udf(a)) FROM testData GROUP BY 1 +-- !query schema +struct +-- !query output +foo 7 + + +-- !query +SELECT 'foo' FROM testData WHERE a = 0 GROUP BY udf(1) +-- !query schema +struct +-- !query output + + + +-- !query +SELECT 'foo', udf(APPROX_COUNT_DISTINCT(udf(a))) FROM testData WHERE a = 0 GROUP BY udf(1) +-- !query schema +struct +-- !query output + + + +-- !query +SELECT 'foo', MAX(STRUCT(udf(a))) FROM testData WHERE a = 0 GROUP BY udf(1) +-- !query schema +struct> +-- !query output + + + +-- !query +SELECT udf(a + b), udf(COUNT(b)) FROM testData GROUP BY a + b +-- !query schema +struct +-- !query output +2 1 +3 2 +4 2 +5 1 +NULL 1 + + +-- !query +SELECT udf(a + 2), udf(COUNT(b)) FROM testData GROUP BY a + 1 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +expression 'testdata.a' is neither present in the group by, nor is it an aggregate function. 
Add to group by or wrap in first() (or first_value) if you don't care which value you get. + + +-- !query +SELECT udf(a + 1) + 1, udf(COUNT(b)) FROM testData GROUP BY udf(a + 1) +-- !query schema +struct<(udf((a + 1)) + 1):int,udf(count(b)):bigint> +-- !query output +3 2 +4 2 +5 2 +NULL 1 + + +-- !query +SELECT SKEWNESS(udf(a)), udf(KURTOSIS(a)), udf(MIN(a)), MAX(udf(a)), udf(AVG(udf(a))), udf(VARIANCE(a)), STDDEV(udf(a)), udf(SUM(a)), udf(COUNT(a)) +FROM testData +-- !query schema +struct +-- !query output +-0.2723801058145729 -1.5069204152249134 1 3 2.142857142857143 0.8095238095238094 0.8997354108424372 15 7 + + +-- !query +SELECT COUNT(DISTINCT udf(b)), udf(COUNT(DISTINCT b, c)) FROM (SELECT 1 AS a, 2 AS b, 3 AS c) GROUP BY udf(a) +-- !query schema +struct +-- !query output +1 1 + + +-- !query +SELECT udf(a) AS k, COUNT(udf(b)) FROM testData GROUP BY k +-- !query schema +struct +-- !query output +1 2 +2 2 +3 2 +NULL 1 + + +-- !query +SELECT a AS k, udf(COUNT(b)) FROM testData GROUP BY k HAVING k > 1 +-- !query schema +struct +-- !query output +2 2 +3 2 + + +-- !query +SELECT udf(COUNT(b)) AS k FROM testData GROUP BY k +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +aggregate functions are not allowed in GROUP BY, but found CAST(udf(cast(count(b) as string)) AS BIGINT) + + +-- !query +CREATE OR REPLACE TEMPORARY VIEW testDataHasSameNameWithAlias AS SELECT * FROM VALUES +(1, 1, 3), (1, 2, 1) AS testDataHasSameNameWithAlias(k, a, v) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT k AS a, udf(COUNT(udf(v))) FROM testDataHasSameNameWithAlias GROUP BY udf(a) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +expression 'testdatahassamenamewithalias.k' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get. + + +-- !query +set spark.sql.groupByAliases=false +-- !query schema +struct +-- !query output +spark.sql.groupByAliases false + + +-- !query +SELECT a AS k, udf(COUNT(udf(b))) FROM testData GROUP BY k +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'k' given input columns: [testdata.a, testdata.b]; line 1 pos 57 + + +-- !query +SELECT udf(a), COUNT(udf(1)) FROM testData WHERE false GROUP BY udf(a) +-- !query schema +struct +-- !query output + + + +-- !query +SELECT udf(COUNT(1)) FROM testData WHERE false +-- !query schema +struct +-- !query output +0 + + +-- !query +SELECT 1 FROM (SELECT udf(COUNT(1)) FROM testData WHERE false) t +-- !query schema +struct<1:int> +-- !query output +1 + + +-- !query +SELECT 1 from ( + SELECT 1 AS z, + udf(MIN(a.x)) + FROM (select 1 as x) a + WHERE false +) b +where b.z != b.z +-- !query schema +struct<1:int> +-- !query output + + + +-- !query +SELECT udf(1) FROM range(10) HAVING true +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT udf(udf(1)) FROM range(10) HAVING MAX(id) > 0 +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT udf(id) FROM range(10) HAVING id > 0 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +grouping expressions sequence is empty, and 'id' is not an aggregate function. Wrap '()' in windowing function(s) or wrap 'id' in first() (or first_value) if you don't care which value you get. 
+ + +-- !query +CREATE OR REPLACE TEMPORARY VIEW test_agg AS SELECT * FROM VALUES + (1, true), (1, false), + (2, true), + (3, false), (3, null), + (4, null), (4, null), + (5, null), (5, true), (5, false) AS test_agg(k, v) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT udf(every(v)), udf(some(v)), any(v) FROM test_agg WHERE 1 = 0 +-- !query schema +struct +-- !query output +NULL NULL NULL + + +-- !query +SELECT udf(every(udf(v))), some(v), any(v) FROM test_agg WHERE k = 4 +-- !query schema +struct +-- !query output +NULL NULL NULL + + +-- !query +SELECT every(v), udf(some(v)), any(v) FROM test_agg WHERE k = 5 +-- !query schema +struct +-- !query output +false true true + + +-- !query +SELECT udf(k), every(v), udf(some(v)), any(v) FROM test_agg GROUP BY udf(k) +-- !query schema +struct +-- !query output +1 false true true +2 true true true +3 false false false +4 NULL NULL NULL +5 false true true + + +-- !query +SELECT udf(k), every(v) FROM test_agg GROUP BY k HAVING every(v) = false +-- !query schema +struct +-- !query output +1 false +3 false +5 false + + +-- !query +SELECT udf(k), udf(every(v)) FROM test_agg GROUP BY udf(k) HAVING every(v) IS NULL +-- !query schema +struct +-- !query output +4 NULL + + +-- !query +SELECT udf(k), + udf(Every(v)) AS every +FROM test_agg +WHERE k = 2 + AND v IN (SELECT Any(v) + FROM test_agg + WHERE k = 1) +GROUP BY udf(k) +-- !query schema +struct +-- !query output +2 true + + +-- !query +SELECT udf(udf(k)), + Every(v) AS every +FROM test_agg +WHERE k = 2 + AND v IN (SELECT Every(v) + FROM test_agg + WHERE k = 1) +GROUP BY udf(udf(k)) +-- !query schema +struct +-- !query output + + + +-- !query +SELECT every(udf(1)) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'every(CAST(udf(cast(1 as string)) AS INT))' due to data type mismatch: Input to function 'every' should have been boolean, but it's [int].; line 1 pos 7 + + +-- !query +SELECT some(udf(1S)) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'some(CAST(udf(cast(1 as string)) AS SMALLINT))' due to data type mismatch: Input to function 'some' should have been boolean, but it's [smallint].; line 1 pos 7 + + +-- !query +SELECT any(udf(1L)) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'any(CAST(udf(cast(1 as string)) AS BIGINT))' due to data type mismatch: Input to function 'any' should have been boolean, but it's [bigint].; line 1 pos 7 + + +-- !query +SELECT udf(every("true")) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'every('true')' due to data type mismatch: Input to function 'every' should have been boolean, but it's [string].; line 1 pos 11 + + +-- !query +SELECT k, v, every(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg +-- !query schema +struct +-- !query output +1 false false +1 true false +2 true true +3 NULL NULL +3 false false +4 NULL NULL +4 NULL NULL +5 NULL NULL +5 false false +5 true false + + +-- !query +SELECT k, udf(udf(v)), some(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg +-- !query schema +struct +-- !query output +1 false false +1 true true +2 true true +3 NULL NULL +3 false false +4 NULL NULL +4 NULL NULL +5 NULL NULL +5 false false +5 true true + + +-- !query +SELECT udf(udf(k)), v, any(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg +-- !query schema +struct +-- !query output +1 false false +1 true true +2 true true +3 NULL 
NULL +3 false false +4 NULL NULL +4 NULL NULL +5 NULL NULL +5 false false +5 true true + + +-- !query +SELECT udf(count(*)) FROM test_agg HAVING count(*) > 1L +-- !query schema +struct +-- !query output +10 + + +-- !query +SELECT k, udf(max(v)) FROM test_agg GROUP BY k HAVING max(v) = true +-- !query schema +struct +-- !query output +1 true +2 true +5 true + + +-- !query +SELECT * FROM (SELECT udf(COUNT(*)) AS cnt FROM test_agg) WHERE cnt > 1L +-- !query schema +struct +-- !query output +10 + + +-- !query +SELECT udf(count(*)) FROM test_agg WHERE count(*) > 1L +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException + +Aggregate/Window/Generate expressions are not valid in where clause of the query. +Expression in where clause: [(count(1) > 1L)] +Invalid expressions: [count(1)] + + +-- !query +SELECT udf(count(*)) FROM test_agg WHERE count(*) + 1L > 1L +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException + +Aggregate/Window/Generate expressions are not valid in where clause of the query. +Expression in where clause: [((count(1) + 1L) > 1L)] +Invalid expressions: [count(1)] + + +-- !query +SELECT udf(count(*)) FROM test_agg WHERE k = 1 or k = 2 or count(*) + 1L > 1L or max(k) > 1 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException + +Aggregate/Window/Generate expressions are not valid in where clause of the query. +Expression in where clause: [(((test_agg.k = 1) OR (test_agg.k = 2)) OR (((count(1) + 1L) > 1L) OR (max(test_agg.k) > 1)))] +Invalid expressions: [count(1), max(test_agg.k)] diff --git a/gluten-core/src/main/scala/io/glutenproject/backendsapi/Backend.scala b/gluten-core/src/main/scala/io/glutenproject/backendsapi/Backend.scala index 438194a3619be..c91d5527ace86 100644 --- a/gluten-core/src/main/scala/io/glutenproject/backendsapi/Backend.scala +++ b/gluten-core/src/main/scala/io/glutenproject/backendsapi/Backend.scala @@ -38,4 +38,6 @@ trait Backend { def broadcastApi(): BroadcastApi def settings(): BackendSettingsApi + + def testApi(): TestApi } diff --git a/gluten-core/src/main/scala/io/glutenproject/backendsapi/BackendsApiManager.scala b/gluten-core/src/main/scala/io/glutenproject/backendsapi/BackendsApiManager.scala index bd896c3ddb5e3..ddc4c580c7789 100644 --- a/gluten-core/src/main/scala/io/glutenproject/backendsapi/BackendsApiManager.scala +++ b/gluten-core/src/main/scala/io/glutenproject/backendsapi/BackendsApiManager.scala @@ -92,4 +92,8 @@ object BackendsApiManager { def getSettings: BackendSettingsApi = { backend.settings } + + def getTestApiInstance: TestApi = { + backend.testApi() + } } diff --git a/gluten-core/src/main/scala/io/glutenproject/backendsapi/TestApi.scala b/gluten-core/src/main/scala/io/glutenproject/backendsapi/TestApi.scala new file mode 100644 index 0000000000000..40ef5b25477c0 --- /dev/null +++ b/gluten-core/src/main/scala/io/glutenproject/backendsapi/TestApi.scala @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.glutenproject.backendsapi
+
+trait TestApi {
+  def getSupportedSQLQueryTests: Set[String]
+
+  def getOverwriteSQLQueryTests: Set[String]
+
+  def getOverwriteSQLQueryResourcePath: String
+}
diff --git a/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenSQLQueryTestSuite.scala b/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenSQLQueryTestSuite.scala
index 24a44b802d68e..540d126b48b87 100644
--- a/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenSQLQueryTestSuite.scala
+++ b/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenSQLQueryTestSuite.scala
@@ -17,6 +17,7 @@
 package org.apache.spark.sql
 
 import io.glutenproject.GlutenConfig
+import io.glutenproject.backendsapi.BackendsApiManager
 import io.glutenproject.utils.{BackendTestUtils, SystemParameters}
 
 import org.apache.spark.SparkConf
@@ -155,6 +156,12 @@ class GlutenSQLQueryTestSuite
   protected val goldenFilePath = new File(baseResourcePath, "results").getAbsolutePath
   protected val testDataPath = new File(resourcesPath, "test-data").getAbsolutePath
 
+  protected val overwriteResourcePath =
+    BackendsApiManager.getTestApiInstance.getOverwriteSQLQueryResourcePath
+
+  protected val overwriteInputFilePath = new File(overwriteResourcePath, "inputs").getAbsolutePath
+  protected val overwriteGoldenFilePath = new File(overwriteResourcePath, "results").getAbsolutePath
+
   protected val validFileExtensions = ".sql"
 
   /** Test if a command is available. */
@@ -217,187 +224,5 @@ class GlutenSQLQueryTestSuite
     ) ++ otherIgnoreList
 
-  /**
-   * List of supported cases to run with Velox backend, in lower case. Please add to the supported
-   * list after enabling a sql test.
-   */
-  private val veloxSupportedList: Set[String] = Set(
-    "bitwise.sql",
-    "cast.sql",
-    "change-column.sql",
-    "charvarchar.sql",
-    "columnresolution-negative.sql",
-    "columnresolution-views.sql",
-    "columnresolution.sql",
-    "comments.sql",
-    "comparator.sql",
-    "count.sql",
-    "cross-join.sql",
-    "csv-functions.sql",
-    "cte-legacy.sql",
-    "cte-nested.sql",
-    "cte-nonlegacy.sql",
-    "cte.sql",
-    "current_database_catalog.sql",
-    "date.sql",
-    "datetime-formatting-invalid.sql",
-    // Velox had different handling for some illegal cases.
-// "datetime-formatting-legacy.sql", -// "datetime-formatting.sql", - "datetime-legacy.sql", - "datetime-parsing-invalid.sql", - "datetime-parsing-legacy.sql", - "datetime-parsing.sql", - "datetime-special.sql", - "decimalArithmeticOperations.sql", - "describe-part-after-analyze.sql", - "describe-query.sql", - "describe-table-after-alter-table.sql", - // result match, but the order is not right - // "describe-table-column.sql", - "describe.sql", - "except-all.sql", - "except.sql", - "extract.sql", - "group-by-filter.sql", - "group-by-ordinal.sql", - "group-by.sql", - "grouping_set.sql", - "having.sql", - "ignored.sql", - "inline-table.sql", - "inner-join.sql", - "intersect-all.sql", - "interval.sql", - "join-empty-relation.sql", - "join-lateral.sql", - "json-functions.sql", - "like-all.sql", - "like-any.sql", - "limit.sql", - "literals.sql", - "map.sql", - "misc-functions.sql", - "natural-join.sql", - "null-handling.sql", - "null-propagation.sql", - "operators.sql", - "order-by-nulls-ordering.sql", - "order-by-ordinal.sql", - "outer-join.sql", - "parse-schema-string.sql", - "pivot.sql", - "pred-pushdown.sql", - "predicate-functions.sql", - "query_regex_column.sql", - "random.sql", - "regexp-functions.sql", - "show-create-table.sql", - "show-tables.sql", - "show-tblproperties.sql", - "show-views.sql", - "show_columns.sql", - "sql-compatibility-functions.sql", - "string-functions.sql", - "struct.sql", - "subexp-elimination.sql", - "table-aliases.sql", - "table-valued-functions.sql", - "tablesample-negative.sql", - "subquery/exists-subquery/exists-aggregate.sql", - "subquery/exists-subquery/exists-basic.sql", - "subquery/exists-subquery/exists-cte.sql", - "subquery/exists-subquery/exists-having.sql", - "subquery/exists-subquery/exists-joins-and-set-ops.sql", - "subquery/exists-subquery/exists-orderby-limit.sql", - "subquery/exists-subquery/exists-within-and-or.sql", - "subquery/in-subquery/in-basic.sql", - "subquery/in-subquery/in-group-by.sql", - "subquery/in-subquery/in-having.sql", - "subquery/in-subquery/in-joins.sql", - "subquery/in-subquery/in-limit.sql", - "subquery/in-subquery/in-multiple-columns.sql", - "subquery/in-subquery/in-order-by.sql", - "subquery/in-subquery/in-set-operations.sql", - "subquery/in-subquery/in-with-cte.sql", - "subquery/in-subquery/nested-not-in.sql", - "subquery/in-subquery/not-in-group-by.sql", - "subquery/in-subquery/not-in-joins.sql", - "subquery/in-subquery/not-in-unit-tests-multi-column.sql", - "subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql", - "subquery/in-subquery/not-in-unit-tests-single-column.sql", - "subquery/in-subquery/not-in-unit-tests-single-column-literal.sql", - "subquery/in-subquery/simple-in.sql", - "subquery/negative-cases/invalid-correlation.sql", - "subquery/negative-cases/subq-input-typecheck.sql", - "subquery/scalar-subquery/scalar-subquery-predicate.sql", - "subquery/scalar-subquery/scalar-subquery-select.sql", - "subquery/subquery-in-from.sql", - "postgreSQL/aggregates_part1.sql", - "postgreSQL/aggregates_part2.sql", - "postgreSQL/aggregates_part3.sql", - "postgreSQL/aggregates_part4.sql", - "postgreSQL/boolean.sql", - "postgreSQL/case.sql", - "postgreSQL/comments.sql", - "postgreSQL/create_view.sql", - "postgreSQL/date.sql", - "postgreSQL/float4.sql", - "postgreSQL/insert.sql", - "postgreSQL/int2.sql", - "postgreSQL/int4.sql", - "postgreSQL/int8.sql", - "postgreSQL/interval.sql", - "postgreSQL/join.sql", - "postgreSQL/limit.sql", - "postgreSQL/numeric.sql", - "postgreSQL/select.sql", - 
"postgreSQL/select_distinct.sql", - "postgreSQL/select_having.sql", - "postgreSQL/select_implicit.sql", - "postgreSQL/strings.sql", - "postgreSQL/text.sql", - "postgreSQL/timestamp.sql", - "postgreSQL/union.sql", - "postgreSQL/window_part1.sql", - "postgreSQL/window_part2.sql", - "postgreSQL/window_part3.sql", - "postgreSQL/window_part4.sql", - "postgreSQL/with.sql", - "datetime-special.sql", - "timestamp-ansi.sql", - "timestamp.sql", - "arrayJoin.sql", - "binaryComparison.sql", - "booleanEquality.sql", - "caseWhenCoercion.sql", - "concat.sql", - "dateTimeOperations.sql", - "decimalPrecision.sql", - "division.sql", - "elt.sql", - "ifCoercion.sql", - "implicitTypeCasts.sql", - "inConversion.sql", - "mapZipWith.sql", - "mapconcat.sql", - "promoteStrings.sql", - "stringCastAndExpressions.sql", - "widenSetOperationTypes.sql", - "windowFrameCoercion.sql", - "timestamp-ltz.sql", - "timestamp-ntz.sql", - "timezone.sql", - "transform.sql", - "try_arithmetic.sql", - "try_cast.sql", - "udaf.sql", - "union.sql", - "using-join.sql", - // result match, but the order is not right - // "window.sql", - "udf-union.sql", - "udf-window.sql" - ) /** * List of supported cases to run with Clickhouse backend, in lower case. Please add to the @@ -406,11 +236,9 @@ class GlutenSQLQueryTestSuite private val CHSupportedList: Set[String] = Set() // List of supported cases to run with a certain backend, in lower case. - private val supportedList: Set[String] = if (isCHBackend) { - CHSupportedList - } else { - veloxSupportedList - } + private val supportedList: Set[String] = + BackendsApiManager.getTestApiInstance.getSupportedSQLQueryTests ++ + BackendsApiManager.getTestApiInstance.getOverwriteSQLQueryTests // Create all the test cases. listTestCases.foreach(createScalaTestCase) @@ -492,9 +320,9 @@ class GlutenSQLQueryTestSuite // If a test case is not in the test list, or it is in the ignore list, ignore this test case. if ( !supportedList.exists( - t => testCase.name.toLowerCase(Locale.ROOT).contains(t.toLowerCase(Locale.ROOT))) || + t => testCase.name.toLowerCase(Locale.ROOT).equals(t.toLowerCase(Locale.ROOT))) || ignoreList.exists( - t => testCase.name.toLowerCase(Locale.ROOT).contains(t.toLowerCase(Locale.ROOT))) + t => testCase.name.toLowerCase(Locale.ROOT).equals(t.toLowerCase(Locale.ROOT))) ) { // Create a test case to ignore this case. 
       ignore(testCase.name) { /* Do nothing */ }
@@ -768,35 +591,37 @@ class GlutenSQLQueryTestSuite
   }
 
   protected lazy val listTestCases: Seq[TestCase] = {
-    listFilesRecursively(new File(inputFilePath)).flatMap {
-      file =>
-        val resultFile = file.getAbsolutePath.replace(inputFilePath, goldenFilePath) + ".out"
-        val absPath = file.getAbsolutePath
-        val testCaseName = absPath.stripPrefix(inputFilePath).stripPrefix(File.separator)
-
-        if (
-          file.getAbsolutePath.startsWith(
-            s"$inputFilePath${File.separator}udf${File.separator}postgreSQL")
-        ) {
-          Seq(TestScalaUDF("udf"), TestPythonUDF("udf"), TestScalarPandasUDF("udf")).map {
-            udf => UDFPgSQLTestCase(s"$testCaseName - ${udf.prettyName}", absPath, resultFile, udf)
-          }
-        } else if (file.getAbsolutePath.startsWith(s"$inputFilePath${File.separator}udf")) {
-          Seq(TestScalaUDF("udf"), TestPythonUDF("udf"), TestScalarPandasUDF("udf")).map {
-            udf => UDFTestCase(s"$testCaseName - ${udf.prettyName}", absPath, resultFile, udf)
-          }
-        } else if (file.getAbsolutePath.startsWith(s"$inputFilePath${File.separator}postgreSQL")) {
-          PgSQLTestCase(testCaseName, absPath, resultFile) :: Nil
-        } else if (file.getAbsolutePath.startsWith(s"$inputFilePath${File.separator}ansi")) {
-          AnsiTestCase(testCaseName, absPath, resultFile) :: Nil
-        } else if (
-          file.getAbsolutePath.startsWith(s"$inputFilePath${File.separator}timestampNTZ")
-        ) {
-          TimestampNTZTestCase(testCaseName, absPath, resultFile) :: Nil
-        } else {
-          RegularTestCase(testCaseName, absPath, resultFile) :: Nil
+    val createTestCase = (file: File, parentDir: String, resultPath: String) => {
+      val resultFile = file.getAbsolutePath.replace(parentDir, resultPath) + ".out"
+      val absPath = file.getAbsolutePath
+      val testCaseName = absPath.stripPrefix(parentDir).stripPrefix(File.separator)
+
+      if (
+        file.getAbsolutePath.startsWith(
+          s"$parentDir${File.separator}udf${File.separator}postgreSQL")
+      ) {
+        Seq(TestScalaUDF("udf"), TestPythonUDF("udf"), TestScalarPandasUDF("udf")).map {
+          udf => UDFPgSQLTestCase(s"$testCaseName - ${udf.prettyName}", absPath, resultFile, udf)
+        }
+      } else if (file.getAbsolutePath.startsWith(s"$parentDir${File.separator}udf")) {
+        Seq(TestScalaUDF("udf"), TestPythonUDF("udf"), TestScalarPandasUDF("udf")).map {
+          udf => UDFTestCase(s"$testCaseName - ${udf.prettyName}", absPath, resultFile, udf)
         }
+      } else if (file.getAbsolutePath.startsWith(s"$parentDir${File.separator}postgreSQL")) {
+        PgSQLTestCase(testCaseName, absPath, resultFile) :: Nil
+      } else if (file.getAbsolutePath.startsWith(s"$parentDir${File.separator}ansi")) {
+        AnsiTestCase(testCaseName, absPath, resultFile) :: Nil
+      } else if (file.getAbsolutePath.startsWith(s"$parentDir${File.separator}timestampNTZ")) {
+        TimestampNTZTestCase(testCaseName, absPath, resultFile) :: Nil
+      } else {
+        RegularTestCase(testCaseName, absPath, resultFile) :: Nil
+      }
     }
+
+    listFilesRecursively(new File(inputFilePath))
+      .flatMap(createTestCase(_, inputFilePath, goldenFilePath)) ++ listFilesRecursively(
+      new File(overwriteInputFilePath))
+      .flatMap(createTestCase(_, overwriteInputFilePath, overwriteGoldenFilePath))
   }
 
   /** Returns all the files (not directories) in a directory, recursively. */
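
---

How the pieces above fit together (illustrative sketch only, not part of the diff): each backend now publishes three things through TestApi -- the stock Spark SQL tests it supports, the tests whose inputs and golden files it overrides, and the resource directory holding those overrides. GlutenSQLQueryTestSuite unions the two name sets and, after this change, matches test case names with equals rather than contains, so entries must spell out exact relative file names (hence "udf/udf-union.sql" instead of "udf-union.sql"). The Scala sketch below restates that contract; DummyTestApi, its file lists, and the /tmp path are hypothetical stand-ins, not code from this patch.

trait TestApi {
  def getSupportedSQLQueryTests: Set[String]
  def getOverwriteSQLQueryTests: Set[String]
  def getOverwriteSQLQueryResourcePath: String
}

// Hypothetical backend: group-by.sql is rerouted to overridden golden files
// under getOverwriteSQLQueryResourcePath/results; the rest use Spark's own.
class DummyTestApi extends TestApi {
  override def getSupportedSQLQueryTests: Set[String] = Set("count.sql", "limit.sql")
  override def getOverwriteSQLQueryTests: Set[String] = Set("group-by.sql")
  override def getOverwriteSQLQueryResourcePath: String = "/tmp/overwrite/sql-tests"
}

object TestApiSketch {
  def main(args: Array[String]): Unit = {
    val api: TestApi = new DummyTestApi
    // The suite builds one supported set from both lists ...
    val supported = api.getSupportedSQLQueryTests ++ api.getOverwriteSQLQueryTests
    // ... and matches case names exactly (lower-cased equals, not contains).
    def enabled(name: String): Boolean = supported.exists(_.equalsIgnoreCase(name))
    println(enabled("group-by.sql")) // true
    println(enabled("group-by"))     // false: substring matches no longer count
  }
}

One design note: having the backend return the overwrite resource path, instead of hard-coding it in gluten-ut, is what lets the ClickHouse and Velox modules ship different golden files while sharing a single GlutenSQLQueryTestSuite.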