From db5ed73880edfc35a29e0d7714b01e2551af7901 Mon Sep 17 00:00:00 2001 From: lgbo-ustc Date: Tue, 29 Oct 2024 11:11:38 +0800 Subject: [PATCH] update --- .../backendsapi/clickhouse/CHRuleApi.scala | 3 +- ...le.scala => LazyAggregateExpandRule.scala} | 290 +++++++++++------- .../execution/GlutenClickHouseTPCHSuite.scala | 58 ---- ...enClickHouseTPCHSaltNullParquetSuite.scala | 64 ++++ 4 files changed, 248 insertions(+), 167 deletions(-) rename backends-clickhouse/src/main/scala/org/apache/gluten/extension/{LazyExpandRule.scala => LazyAggregateExpandRule.scala} (50%) diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHRuleApi.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHRuleApi.scala index f07e80a177ff..8beaa0aecfbb 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHRuleApi.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHRuleApi.scala @@ -49,7 +49,7 @@ private object CHRuleApi { // Inject the regular Spark rules directly. injector.injectQueryStagePrepRule(FallbackBroadcastHashJoinPrepQueryStage.apply) injector.injectQueryStagePrepRule(spark => CHAQEPropagateEmptyRelation(spark)) - injector.injectQueryStagePrepRule(spark => LazyExpandRule(spark)) + // injector.injectQueryStagePrepRule(spark => LazyExpandRule(spark)) injector.injectParser( (spark, parserInterface) => new GlutenCacheFilesSqlParser(spark, parserInterface)) injector.injectParser( @@ -84,6 +84,7 @@ private object CHRuleApi { injector.injectTransform(_ => CollapseProjectExecTransformer) injector.injectTransform(c => RewriteSortMergeJoinToHashJoinRule.apply(c.session)) injector.injectTransform(c => PushdownAggregatePreProjectionAheadExpand.apply(c.session)) + injector.injectTransform(c => LazyAggregateExpandRule.apply(c.session)) injector.injectTransform( c => intercept( diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/extension/LazyExpandRule.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/extension/LazyAggregateExpandRule.scala similarity index 50% rename from backends-clickhouse/src/main/scala/org/apache/gluten/extension/LazyExpandRule.scala rename to backends-clickhouse/src/main/scala/org/apache/gluten/extension/LazyAggregateExpandRule.scala index b78feb78b193..ad5893d801cb 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/extension/LazyExpandRule.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/extension/LazyAggregateExpandRule.scala @@ -17,16 +17,17 @@ package org.apache.gluten.extension import org.apache.gluten.backendsapi.clickhouse.CHBackendSettings +import org.apache.gluten.execution._ import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.SparkPlan -import org.apache.spark.sql.execution.aggregate._ -import org.apache.spark.sql.execution.exchange._ +import org.apache.spark.sql.types._ /* * For aggregation with grouping sets, we need to expand the grouping sets @@ -49,48 +50,39 @@ import org.apache.spark.sql.execution.exchange._ * If the aggregation involves distinct, we can't do this optimization. 
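+ *
+ * A rough sketch of the rewrite this rule performs (plan shapes simplified; the variant with a
+ * FilterExecTransformer between the aggregate and the expand is handled analogously):
+ *   before: child -> ExpandExecTransformer -> CHHashAggregateExecTransformer(partial) -> ColumnarShuffleExchangeExec
+ *   after:  child -> CHHashAggregateExecTransformer(partial) -> ExpandExecTransformer -> ColumnarShuffleExchangeExec
+ * so the expand duplicates partially aggregated rows instead of the raw input rows.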
*/ -case class LazyExpandRule(session: SparkSession) extends Rule[SparkPlan] with Logging { +case class LazyAggregateExpandRule(session: SparkSession) extends Rule[SparkPlan] with Logging { override def apply(plan: SparkPlan): SparkPlan = { - logDebug(s"xxx enable lazy aggregate expand: ${CHBackendSettings.enableLazyAggregateExpand}") + logError( + s"xxx 1031 enable lazy aggregate expand: " + + s"${CHBackendSettings.enableLazyAggregateExpand}") if (!CHBackendSettings.enableLazyAggregateExpand) { return plan } plan.transformUp { - case finalAggregate @ HashAggregateExec( - _, - _, - _, - _, - _, - _, + case shuffle @ ColumnarShuffleExchangeExec( + HashPartitioning(hashExpressions, _), + CHHashAggregateExecTransformer( + _, + groupingExpressions, + aggregateExpressions, + _, + _, + resultExpressions, + ExpandExecTransformer(projections, output, child)), _, _, - ShuffleExchangeExec( - HashPartitioning(hashExpressions, _), - HashAggregateExec( - _, - _, - _, - groupingExpressions, - aggregateExpressions, - _, - _, - resultExpressions, - ExpandExec(projections, output, child)), - _ - ) + _ ) => - logError(s"xxx match plan:$finalAggregate") - // move expand node after shuffle node - if ( - groupingExpressions.forall(_.isInstanceOf[Attribute]) && - hashExpressions.forall(_.isInstanceOf[Attribute]) && - aggregateExpressions.forall(_.filter.isEmpty) - ) { - val shuffle = - finalAggregate.asInstanceOf[HashAggregateExec].child.asInstanceOf[ShuffleExchangeExec] - val partialAggregate = shuffle.child.asInstanceOf[HashAggregateExec] - val expand = partialAggregate.child.asInstanceOf[ExpandExec] + logError(s"xxx match plan:$shuffle") + val partialAggregate = shuffle.child.asInstanceOf[CHHashAggregateExecTransformer] + val expand = partialAggregate.child.asInstanceOf[ExpandExecTransformer] + logError( + s"xxx partialAggregate: groupingExpressions:" + + s"${partialAggregate.groupingExpressions}\n" + + s"aggregateAttributes:${partialAggregate.aggregateAttributes}\n" + + s"aggregateExpressions:${partialAggregate.aggregateExpressions}\n" + + s"resultExpressions:${partialAggregate.resultExpressions}") + if (isSupportedAggregate(partialAggregate, expand, shuffle)) { val attributesToReplace = buildReplaceAttributeMapForAggregate( groupingExpressions, @@ -113,54 +105,41 @@ case class LazyExpandRule(session: SparkSession) extends Rule[SparkPlan] with Lo ) val newShuffle = shuffle.copy(child = newExpand) - val newFinalAggregate = finalAggregate.copy(child = newShuffle) - logError(s"xxx new plan: $newFinalAggregate") - newFinalAggregate + logError(s"xxx new plan: $newShuffle") + newShuffle } else { - finalAggregate + shuffle } - case finalAggregate @ HashAggregateExec( - _, - _, - _, - _, + case shuffle @ ColumnarShuffleExchangeExec( + HashPartitioning(hashExpressions, _), + CHHashAggregateExecTransformer( + _, + groupingExpressions, + aggregateExpressions, + _, + _, + resultExpressions, + FilterExecTransformer(_, ExpandExecTransformer(projections, output, child))), _, _, - _, - _, - ShuffleExchangeExec( - HashPartitioning(hashExpressions, _), - HashAggregateExec( - _, - _, - _, - groupingExpressions, - aggregateExpressions, - _, - _, - resultExpressions, - FilterExec(_, ExpandExec(projections, output, child))), - _ - ) + _ ) => - logError(s"xxx match plan:$finalAggregate") - if ( - groupingExpressions.forall(_.isInstanceOf[Attribute]) && - hashExpressions.forall(_.isInstanceOf[Attribute]) && - aggregateExpressions.forall(_.filter.isEmpty) - ) { - val shuffle = - 
finalAggregate.asInstanceOf[HashAggregateExec].child.asInstanceOf[ShuffleExchangeExec] - val partialAggregate = shuffle.child.asInstanceOf[HashAggregateExec] - val filter = partialAggregate.child.asInstanceOf[FilterExec] - val expand = filter.child.asInstanceOf[ExpandExec] - + val partialAggregate = shuffle.child.asInstanceOf[CHHashAggregateExecTransformer] + val filter = partialAggregate.child.asInstanceOf[FilterExecTransformer] + val expand = filter.child.asInstanceOf[ExpandExecTransformer] + logError( + s"xxx partialAggregate: groupingExpressions:" + + s"${partialAggregate.groupingExpressions}\n" + + s"aggregateAttributes:${partialAggregate.aggregateAttributes}\n" + + s"aggregateExpressions:${partialAggregate.aggregateExpressions}\n" + + s"resultExpressions:${partialAggregate.resultExpressions}") + if (isSupportedAggregate(partialAggregate, expand, shuffle)) { val attributesToReplace = buildReplaceAttributeMapForAggregate( groupingExpressions, projections, output ) - logDebug(s"xxx attributesToReplace: $attributesToReplace") + logError(s"xxx attributesToReplace: $attributesToReplace") val newPartialAggregate = buildNewAggregateExec( partialAggregate, @@ -176,16 +155,125 @@ case class LazyExpandRule(session: SparkSession) extends Rule[SparkPlan] with Lo ) val newFilter = filter.copy(child = newExpand) + val newShuffle = shuffle.copy(child = newFilter) - val newFinalAggregate = finalAggregate.copy(child = newShuffle) - logError(s"xxx new plan: $newFinalAggregate") - newFinalAggregate + logError(s"xxx new plan: $newShuffle") + newShuffle + } else { - finalAggregate + shuffle } } } + // Only enabled for simple cases. Some cases that are not supported: + // 1. select count(a),count(b), count(1), count(distinct(a)), count(distinct(b)) from values + // (1, null), (2,2) as data(a,b); + // 2.
select n_name, count(distinct n_regionkey) as col1, + // count(distinct concat(n_regionkey, n_nationkey)) as col2 from + // nation group by n_name; + def isSupportedAggregate( + aggregate: CHHashAggregateExecTransformer, + expand: ExpandExecTransformer, + shuffle: ColumnarShuffleExchangeExec): Boolean = { + // all grouping keys must be attribute references + val expandOutputAttributes = expand.child.output.toSet + if (aggregate.groupingExpressions.exists(!_.isInstanceOf[Attribute])) { + logError(s"xxx Not all grouping expressions are attribute references") + return false + } + // all shuffle keys must be attribute references + if ( + shuffle.outputPartitioning + .asInstanceOf[HashPartitioning] + .expressions + .exists(!_.isInstanceOf[Attribute]) + ) { + logError(s"xxx Not all shuffle hash expressions are attribute references") + return false + } + + // For safety, only enable for some aggregate functions + // All parameters of the aggregate functions must be references to the output of expand's + // child + if ( + !aggregate.aggregateExpressions.forall( + e => + isSupportedAggregateFunction(e) && e.aggregateFunction.references.forall( + expandOutputAttributes.contains(_))) + ) { + logError(s"xxx Some aggregate functions are not supported") + return false + } + + // ensure there is exactly one grouping id column in the expand output + val groupIdIndex = findGroupingIdIndex(expand) + logError(s"xxx Find group id at index: $groupIdIndex") + if (groupIdIndex == -1) { + return false + } + val groupIdAttribute = expand.output(groupIdIndex) + if ( + !groupIdAttribute.name.startsWith("grouping_id") && !groupIdAttribute.name.startsWith("gid") + && !groupIdAttribute.name.startsWith("spark_grouping_id") + ) { + logError(s"xxx Column at index $groupIdIndex is not a grouping id column") + return false + } + expand.projections.forall { + projection => + val groupId = projection(groupIdIndex) + groupId + .isInstanceOf[Literal] && (groupId.dataType.isInstanceOf[LongType] || groupId.dataType + .isInstanceOf[IntegerType]) + } + } + + def findGroupingIdIndex(expand: ExpandExecTransformer): Int = { + var groupIdIndexes = Seq[Int]() + for (col <- 0 until expand.output.length) { + val expandCol = expand.projections(0)(col) + if ( + expandCol.isInstanceOf[Literal] && (expandCol.dataType + .isInstanceOf[LongType] || expandCol.dataType.isInstanceOf[IntegerType]) + ) { + if ( + expand.projections.forall { + projection => + val e = projection(col) + e.isInstanceOf[Literal] && + (e.dataType.isInstanceOf[LongType] || e.dataType.isInstanceOf[IntegerType]) + } + ) { + groupIdIndexes +:= col + } + } + } + if (groupIdIndexes.length == 1) { + groupIdIndexes(0) + } else { + -1 + } + } + + // Some aggregate functions' output columns are not consistent with the output of gluten. + // - average: in partial aggregation, the outputs are sum and count, but gluten only generates one + // column, avg. + // - sum: if the input's type is decimal, the outputs are sum and isEmpty, but gluten doesn't use + // the isEmpty column.
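+ // For example, for a partial `avg(x)` vanilla Spark declares two buffer columns (sum, count),
+ // while gluten produces a single intermediate avg column, so an expand template rebuilt from the
+ // declared aggregate output may not match what the native plan actually produces. This is
+ // presumably why only count/max/min and non-decimal sum are accepted below.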
+ def isSupportedAggregateFunction(aggregateExpression: AggregateExpression): Boolean = { + if (aggregateExpression.filter.isDefined) { + return false + } + aggregateExpression.aggregateFunction match { + case _: Count => true + case _: Max => true + case _: Min => true + case sum: Sum => !sum.dataType.isInstanceOf[DecimalType] + case _ => false + } + } + def getReplaceAttribute( toReplace: Attribute, attributesToReplace: Map[Attribute, Attribute]): Attribute = { @@ -215,7 +303,6 @@ case class LazyExpandRule(session: SparkSession) extends Rule[SparkPlan] with Lo if (attr != null) { attributeMap += (e.toAttribute -> attr) } - // attributeMap +=(e.toAttribute -> fullExpandProjection(index).asInstanceOf[Attribute]) } attributeMap } @@ -225,33 +312,17 @@ case class LazyExpandRule(session: SparkSession) extends Rule[SparkPlan] with Lo originalExpandProjections: Seq[Seq[Expression]], originalExpandOutput: Seq[Attribute], newExpandOutput: Seq[Attribute]): Seq[Seq[Expression]] = { - var groupingKeysPosition = Map[String, Int]() - originalGroupingExpressions.foreach { - e => - e match { - case ne: NamedExpression => - val index = originalExpandOutput.indexWhere(_.semanticEquals(ne.toAttribute)) - if (index != -1) { - groupingKeysPosition += (ne.name -> index) - } - case _ => - } - } - val newExpandProjections = originalExpandProjections.map { projection => - val res = newExpandOutput.map { + newExpandOutput.map { attr => - if (attr.isInstanceOf[Attribute]) { - groupingKeysPosition.get(attr.name) match { - case Some(attrPos) => projection(attrPos) - case None => attr - } + val index = originalExpandOutput.indexWhere(_.semanticEquals(attr)) + if (index != -1) { + projection(index) } else { attr } } - res } newExpandProjections } @@ -259,11 +330,10 @@ case class LazyExpandRule(session: SparkSession) extends Rule[SparkPlan] with Lo // Make the child of expand be the child of aggregate // Need to replace some attributes def buildNewAggregateExec( - partialAggregate: HashAggregateExec, - expand: ExpandExec, + partialAggregate: CHHashAggregateExecTransformer, + expand: ExpandExecTransformer, attributesToReplace: Map[Attribute, Attribute]): SparkPlan = { val expandOutput = expand.output - // As far as know, the last attribute in the output is the groupId attribute. 
val groupIdAttribute = expandOutput(expandOutput.length - 1) // if the grouping keys contains literal, they should not be in attributesToReplace @@ -274,6 +344,7 @@ case class LazyExpandRule(session: SparkSession) extends Rule[SparkPlan] with Lo .filter( e => e.toAttribute != groupIdAttribute && attributesToReplace.contains(e.toAttribute)) .map(e => getReplaceAttribute(e.toAttribute, attributesToReplace)) + .distinct logError( s"xxx newGroupingExpresion: $newGroupingExpresion,\n" + s"groupingExpressions: $groupingExpressions") @@ -289,6 +360,7 @@ case class LazyExpandRule(session: SparkSession) extends Rule[SparkPlan] with Lo .isEmpty || attributesToReplace.contains(e.toAttribute)) } .map(e => getReplaceAttribute(e.toAttribute, attributesToReplace)) + .distinct logError( s"xxx newResultExpressions: $newResultExpressions\n" + s"resultExpressions:$resultExpressions") @@ -299,13 +371,15 @@ case class LazyExpandRule(session: SparkSession) extends Rule[SparkPlan] with Lo } def buildNewExpandExec( - expand: ExpandExec, - partialAggregate: HashAggregateExec, + expand: ExpandExecTransformer, + partialAggregate: CHHashAggregateExecTransformer, child: SparkPlan, attributesToReplace: Map[Attribute, Attribute]): SparkPlan = { - val newExpandProjectionTemplate = - partialAggregate.output - .map(e => getReplaceAttribute(e, attributesToReplace)) + // The output of the native plan is not completely consistent with Spark. + val aggregateOutput = partialAggregate.output + val newExpandProjectionTemplate = aggregateOutput + // aggregateOutput.map(e => getReplaceAttribute(e, attributesToReplace)) + logError(s"xxx aggregateResultAttributes: ${partialAggregate.aggregateResultAttributes}") logError(s"xxx newExpandProjectionTemplate: $newExpandProjectionTemplate") val newExpandProjections = buildNewExpandProjections( @@ -315,7 +389,7 @@ case class LazyExpandRule(session: SparkSession) extends Rule[SparkPlan] with Lo newExpandProjectionTemplate ) logError(s"xxx newExpandProjections: $newExpandProjections\nprojections:${expand.projections}") - ExpandExec(newExpandProjections, partialAggregate.output, child) + ExpandExecTransformer(newExpandProjections, aggregateOutput, child) } } diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHSuite.scala index 2f9b2a78b4a7..a56f45d1ba3d 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHSuite.scala @@ -563,63 +563,5 @@ class GlutenClickHouseTPCHSuite extends GlutenClickHouseTPCHAbstractSuite { compareResultsAgainstVanillaSpark(sql, true, { _ => }) spark.sql("drop table t1") } - - test("GLLUTEN-7647 lazy expand") { - var sql = - """ - |select n_regionkey, n_nationkey, sum(n_regionkey), count(n_name) - |from nation group by n_regionkey, n_nationkey with cube - |order by n_regionkey, n_nationkey - |""".stripMargin - compareResultsAgainstVanillaSpark(sql, true, { _ => }) - - sql = """ - |select n_regionkey, n_nationkey, sum(n_regionkey), count(distinct n_name) - |from nation group by n_regionkey, n_nationkey with cube - |order by n_regionkey, n_nationkey - |""".stripMargin - compareResultsAgainstVanillaSpark(sql, true, { _ => }) - - sql = """ - |select n_regionkey, n_nationkey, sum(distinct n_regionkey), count(distinct n_name) - |from nation group by n_regionkey, n_nationkey with cube - |order by 
n_regionkey, n_nationkey - |""".stripMargin - compareResultsAgainstVanillaSpark(sql, true, { _ => }) - - sql = - """ - |select n_regionkey, n_nationkey, sum(distinct n_regionkey), count(distinct n_name) - |from nation group by n_regionkey, n_nationkey grouping sets((n_regionkey), (n_nationkey)) - |order by n_regionkey, n_nationkey - |""".stripMargin - compareResultsAgainstVanillaSpark(sql, true, { _ => }) - - sql = """ - |select n_regionkey, n_nationkey, sum(distinct n_regionkey), count(distinct n_name) - |from nation group by n_regionkey, n_nationkey - |grouping sets((n_regionkey, null), (null, n_nationkey)) - |order by n_regionkey, n_nationkey - |""".stripMargin - compareResultsAgainstVanillaSpark(sql, true, { _ => }) - - sql = """ - |select * from( - |select n_regionkey, n_nationkey, sum(n_regionkey), count(n_name) - |from nation group by n_regionkey, n_nationkey with cube - |) where n_regionkey != 0 - |order by n_regionkey, n_nationkey - |""".stripMargin - compareResultsAgainstVanillaSpark(sql, true, { _ => }) - - sql = """ - |select * from( - |select n_regionkey, n_nationkey, sum(n_regionkey), count(distinct n_name) - |from nation group by n_regionkey, n_nationkey with cube - |) where n_regionkey != 0 - |order by n_regionkey, n_nationkey - |""".stripMargin - compareResultsAgainstVanillaSpark(sql, true, { _ => }) - } } // scalastyle:off line.size.limit diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/tpch/GlutenClickHouseTPCHSaltNullParquetSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/tpch/GlutenClickHouseTPCHSaltNullParquetSuite.scala index 739b040dba1d..191325c88195 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/tpch/GlutenClickHouseTPCHSaltNullParquetSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/tpch/GlutenClickHouseTPCHSaltNullParquetSuite.scala @@ -3022,5 +3022,69 @@ class GlutenClickHouseTPCHSaltNullParquetSuite extends GlutenClickHouseTPCHAbstr compareResultsAgainstVanillaSpark(query_sql, true, { _ => }) spark.sql("drop table test_tbl_7220") } + + test("GLLUTEN-7647 lazy expand") { + var sql = + """ + |select n_regionkey, n_nationkey, + |sum(n_regionkey), count(n_name), max(n_regionkey), min(n_regionkey) + |from nation group by n_regionkey, n_nationkey with cube + |order by n_regionkey, n_nationkey + |""".stripMargin + compareResultsAgainstVanillaSpark(sql, true, { _ => }) + + sql = """ + |select n_regionkey, n_nationkey, sum(n_regionkey), count(distinct n_name) + |from nation group by n_regionkey, n_nationkey with cube + |order by n_regionkey, n_nationkey + |""".stripMargin + compareResultsAgainstVanillaSpark(sql, true, { _ => }) + + sql = """ + |select n_regionkey, n_nationkey, + |sum(distinct n_regionkey), count(distinct n_name), max(n_regionkey), min(n_regionkey) + |from nation group by n_regionkey, n_nationkey with cube + |order by n_regionkey, n_nationkey + |""".stripMargin + compareResultsAgainstVanillaSpark(sql, true, { _ => }) + + sql = + """ + |select n_regionkey, n_nationkey, + |sum(distinct n_regionkey), count(distinct n_name), max(n_regionkey), min(n_regionkey) + |from nation group by n_regionkey, n_nationkey grouping sets((n_regionkey), (n_nationkey)) + |order by n_regionkey, n_nationkey + |""".stripMargin + compareResultsAgainstVanillaSpark(sql, true, { _ => }) + + sql = """ + |select n_regionkey, n_nationkey, + |sum(distinct n_regionkey), count(distinct n_name), max(n_regionkey), min(n_regionkey) + |from nation group by n_regionkey, 
n_nationkey + |grouping sets((n_regionkey, null), (null, n_nationkey)) + |order by n_regionkey, n_nationkey + |""".stripMargin + compareResultsAgainstVanillaSpark(sql, true, { _ => }) + + sql = """ + |select * from( + |select n_regionkey, n_nationkey, + |sum(n_regionkey), count(n_name), max(n_regionkey), min(n_regionkey) + |from nation group by n_regionkey, n_nationkey with cube + |) where n_regionkey != 0 + |order by n_regionkey, n_nationkey + |""".stripMargin + compareResultsAgainstVanillaSpark(sql, true, { _ => }) + + sql = """ + |select * from( + |select n_regionkey, n_nationkey, + |sum(n_regionkey), count(distinct n_name), max(n_regionkey), min(n_regionkey) + |from nation group by n_regionkey, n_nationkey with cube + |) where n_regionkey != 0 + |order by n_regionkey, n_nationkey + |""".stripMargin + compareResultsAgainstVanillaSpark(sql, true, { _ => }) + } } // scalastyle:on line.size.limit