[VL] Activate CI test for distinct aggregation spill (#3762)
zhztheplayer authored Nov 22, 2023
1 parent 5ab5646 commit c799238
Showing 2 changed files with 22 additions and 7 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/velox_be.yml
@@ -459,7 +459,7 @@ jobs:
-d=PARTIAL_MODE:ABANDONED,spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct=0,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows=0 \
-d=PARTIAL_MODE:CACHED,spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct=100,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows=0 \
-d=PARTIAL_MODE:FLUSHED,spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio=0.05,spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio=0.1,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct=100,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows=0'
- name: (To be fixed) TPC-DS SF30.0 Parquet local spark3.2 Q97 low memory
- name: (To be fixed) TPC-DS SF30.0 Parquet local spark3.2 Q97 low memory # The case currently causes crash with "free: invalid size".
run: |
docker exec centos7-test-$GITHUB_RUN_ID bash -c 'cd /opt/gluten/tools/gluten-it && \
GLUTEN_IT_JVM_ARGS=-Xmx50G sbin/gluten-it.sh parameterized \
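
For reference, the -d=PARTIAL_MODE dimensions above only toggle Velox partial-aggregation settings. Below is a minimal, hypothetical sketch of how the FLUSHED dimension could be reproduced in a standalone SparkSession: the four spark.gluten.* values are copied verbatim from the workflow line, while the plugin class, the off-heap settings, and the object name are assumptions and not part of this commit.

// Sketch only: standalone SparkSession mirroring the PARTIAL_MODE:FLUSHED dimension above.
// The spark.gluten.* values are taken verbatim from the workflow; everything else is assumed.
import org.apache.spark.sql.SparkSession

object PartialModeFlushedSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("partial-mode-flushed-sketch")
      .config("spark.plugins", "io.glutenproject.GlutenPlugin") // assumed Gluten plugin class for this version
      .config("spark.memory.offHeap.enabled", "true")           // assumed: Gluten runs on off-heap memory
      .config("spark.memory.offHeap.size", "2g")                 // assumed size for a local run
      .config("spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio", "0.05")
      .config("spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio", "0.1")
      .config("spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct", "100")
      .config("spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows", "0")
      .getOrCreate()

    // With these ratios the Velox partial aggregation is expected to flush early,
    // which is the intent of the FLUSHED dimension. Run any aggregation query here.
    spark.stop()
  }
}
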
@@ -19,7 +19,7 @@ package io.glutenproject.execution
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Row, TestUtils}

abstract class VeloxTPCHSuite extends VeloxWholeStageTransformerSuite {
abstract class VeloxTPCHTableSupport extends VeloxWholeStageTransformerSuite {
  protected val rootPath: String = getClass.getResource("/").getPath
  override protected val backend: String = "velox"
  override protected val resourcePath: String = "/tpch-data-parquet-velox"
@@ -32,11 +32,6 @@ abstract class VeloxTPCHSuite extends VeloxWholeStageTransformerSuite {
  // TODO: result comparison is not supported currently.
  protected val queriesResults: String = rootPath + "queries-output"

  override def beforeAll(): Unit = {
    super.beforeAll()
    createTPCHNotNullTables()
  }

  override protected def sparkConf: SparkConf = {
    super.sparkConf
      .set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.ColumnarShuffleManager")
@@ -47,6 +42,13 @@ abstract class VeloxTPCHSuite extends VeloxWholeStageTransformerSuite {
.set("spark.sql.autoBroadcastJoinThreshold", "-1")
}

override def beforeAll(): Unit = {
super.beforeAll()
createTPCHNotNullTables()
}
}

abstract class VeloxTPCHSuite extends VeloxTPCHTableSupport {
test("TPC-H q1") {
runTPCHQuery(1, veloxTPCHQueries, queriesResults, compareResult = false, noFallBack = false) {
_ =>
@@ -195,6 +197,19 @@ abstract class VeloxTPCHSuite extends VeloxWholeStageTransformerSuite {
  }
}

class VeloxTPCHDistinctSpill extends VeloxTPCHTableSupport {
  override protected def sparkConf: SparkConf = {
    super.sparkConf
      .set("spark.memory.offHeap.size", "50m")
      .set("spark.gluten.memory.overAcquiredMemoryRatio", "0.9") // to trigger distinct spill early
  }

  test("distinct spill") {
    val df = spark.sql("select count(distinct *) from lineitem limit 1")
    TestUtils.compareAnswers(df.collect(), Seq(Row(60175)))
  }
}

class VeloxTPCHV1Suite extends VeloxTPCHSuite {
  override protected def sparkConf: SparkConf = {
    super.sparkConf
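
To make the new test concrete, here is a minimal, hypothetical sketch of reproducing the distinct-spill scenario outside the test harness. The two memory settings mirror VeloxTPCHDistinctSpill above; the plugin class, the off-heap toggle, the Parquet path, and the object name are assumptions and not part of this commit.

// Sketch only: standalone reproduction of the "distinct spill" test above.
// The memory settings mirror VeloxTPCHDistinctSpill; everything else is assumed.
import org.apache.spark.sql.SparkSession

object DistinctSpillSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("velox-distinct-spill-sketch")
      .config("spark.plugins", "io.glutenproject.GlutenPlugin")      // assumed Gluten plugin class
      .config("spark.memory.offHeap.enabled", "true")                // assumed: off-heap must be enabled
      .config("spark.memory.offHeap.size", "50m")                    // small pool, as in the suite
      .config("spark.gluten.memory.overAcquiredMemoryRatio", "0.9")  // as in the suite, to trigger spill early
      .getOrCreate()

    // Hypothetical path to a TPC-H lineitem Parquet dataset; the suite uses its
    // bundled /tpch-data-parquet-velox sample, for which it expects a count of 60175.
    spark.read.parquet("/path/to/tpch/lineitem").createOrReplaceTempView("lineitem")
    spark.sql("select count(distinct *) from lineitem limit 1").show()
    spark.stop()
  }
}
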
