From bb08d4cf4b4f94e03b1eb7d6aa559bae5c452ee7 Mon Sep 17 00:00:00 2001
From: "joey.ljy"
Date: Wed, 29 Nov 2023 12:15:35 +0800
Subject: [PATCH] Use a ServiceLoader to register data source v2 scan
 transformers

---
 .../execution/BasicScanExecTransformer.scala  |  5 +-
 .../DataSourceV2TransformerRegister.scala     | 42 ++++++++++
 .../execution/ScanTransformerFactory.scala    | 52 ++++++++----
 ....execution.DataSourceV2TransformerRegister |  1 +
 .../IcebergTransformerProvider.scala          | 31 +++++++
 .../source/GlutenIcebergSourceUtil.scala      | 84 ++++++++++---------
 .../execution/VeloxIcebergSuite.scala         | 56 +++++++++++++
 .../execution/VeloxTPCHIcebergSuite.scala     | 27 ++++++
 8 files changed, 241 insertions(+), 57 deletions(-)
 create mode 100644 gluten-core/src/main/scala/io/glutenproject/execution/DataSourceV2TransformerRegister.scala
 create mode 100644 gluten-iceberg/src/main/resources/META-INF/services/io.glutenproject.execution.DataSourceV2TransformerRegister
 create mode 100644 gluten-iceberg/src/main/scala/io/glutenproject/execution/IcebergTransformerProvider.scala
 create mode 100644 gluten-iceberg/src/test/scala/io/glutenproject/execution/VeloxIcebergSuite.scala

diff --git a/gluten-core/src/main/scala/io/glutenproject/execution/BasicScanExecTransformer.scala b/gluten-core/src/main/scala/io/glutenproject/execution/BasicScanExecTransformer.scala
index bd1ea255f82d..822d656ff182 100644
--- a/gluten-core/src/main/scala/io/glutenproject/execution/BasicScanExecTransformer.scala
+++ b/gluten-core/src/main/scala/io/glutenproject/execution/BasicScanExecTransformer.scala
@@ -23,14 +23,15 @@ import io.glutenproject.substrait.`type`.ColumnTypeNode
 import io.glutenproject.substrait.SubstraitContext
 import io.glutenproject.substrait.plan.PlanBuilder
 import io.glutenproject.substrait.rel.{ReadRelNode, RelBuilder, SplitInfo}
+import io.glutenproject.substrait.rel.LocalFilesNode.ReadFileFormat
 
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.expressions.{And, Attribute, Expression}
 import org.apache.spark.sql.vectorized.ColumnarBatch
+
 import com.google.common.collect.Lists
 
-import scala.collection.JavaConverters._
-import io.glutenproject.substrait.rel.LocalFilesNode.ReadFileFormat
+import scala.collection.JavaConverters._
 
 trait BasicScanExecTransformer extends LeafTransformSupport with BaseDataSource {
 
diff --git a/gluten-core/src/main/scala/io/glutenproject/execution/DataSourceV2TransformerRegister.scala b/gluten-core/src/main/scala/io/glutenproject/execution/DataSourceV2TransformerRegister.scala
new file mode 100644
index 000000000000..53dabf06130a
--- /dev/null
+++ b/gluten-core/src/main/scala/io/glutenproject/execution/DataSourceV2TransformerRegister.scala
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.glutenproject.execution
+
+import org.apache.spark.sql.catalyst.expressions.Expression
+import org.apache.spark.sql.execution.datasources.v2.BatchScanExec
+
+/**
+ * Data source v2 transformers implement this trait to register themselves with
+ * [[ScanTransformerFactory]]. The factory looks a transformer provider up by the fully
+ * qualified class name of the data source v2 scan that the provider handles.
+ */
+trait DataSourceV2TransformerRegister {
+
+  /**
+   * The fully qualified class name of the data source v2 scan that this transformer provider
+   * handles. Implementations override this to declare the scan they support. For example:
+   *
+   * {{{
+   *   override def scanClassName(): String = "org.apache.iceberg.spark.source.SparkBatchQueryScan"
+   * }}}
+   */
+  def scanClassName(): String
+
+  def createDataSourceV2Transformer(
+      batchScan: BatchScanExec,
+      partitionFilters: Seq[Expression]): BatchScanExecTransformer
+}
diff --git a/gluten-core/src/main/scala/io/glutenproject/execution/ScanTransformerFactory.scala b/gluten-core/src/main/scala/io/glutenproject/execution/ScanTransformerFactory.scala
index 9b21c8e7e096..2e381d2e2b2c 100644
--- a/gluten-core/src/main/scala/io/glutenproject/execution/ScanTransformerFactory.scala
+++ b/gluten-core/src/main/scala/io/glutenproject/execution/ScanTransformerFactory.scala
@@ -24,12 +24,14 @@ import org.apache.spark.sql.connector.read.Scan
 import org.apache.spark.sql.execution.{FileSourceScanExec, FilterExec}
 import org.apache.spark.sql.execution.datasources.v2.{BatchScanExec, FileScan}
 
-import scala.reflect.runtime.{universe => ru}
+import java.util.ServiceLoader
+import java.util.concurrent.ConcurrentHashMap
+
+import scala.collection.JavaConverters._
 
 object ScanTransformerFactory {
 
-  private val IcebergScanClassName = "org.apache.iceberg.spark.source.SparkBatchQueryScan"
-  private val IcebergTransformerClassName = "io.glutenproject.execution.IcebergScanTransformer"
+  private val dataSourceV2TransformerMap = new ConcurrentHashMap[String, Class[_]]()
 
   def createFileSourceScanTransformer(
       scanExec: FileSourceScanExec,
@@ -87,31 +89,55 @@
     }
     val scan = batchScanExec.scan
     scan match {
-      case _ if scan.getClass.getName == IcebergScanClassName =>
-        createBatchScanTransformer(IcebergTransformerClassName, batchScanExec, newPartitionFilters)
-      case _ =>
+      case _: FileScan =>
         new BatchScanExecTransformer(
           batchScanExec.output,
           batchScanExec.scan,
           newPartitionFilters,
           table = SparkShimLoader.getSparkShims.getBatchScanExecTable(batchScanExec))
+      case _ =>
+        val cls = lookupDataSourceV2Transformer(scan.getClass.getName)
+        if (cls == null) {
+          throw new UnsupportedOperationException(
+            s"Cannot find the data source v2 transformer for ${scan.getClass.getName}")
+        }
+        cls
+          .getDeclaredConstructor()
+          .newInstance()
+          .asInstanceOf[DataSourceV2TransformerRegister]
+          .createDataSourceV2Transformer(batchScanExec, newPartitionFilters)
     }
   }
 
   def supportedBatchScan(scan: Scan): Boolean = scan match {
     case _: FileScan => true
-    case _ if scan.getClass.getName == IcebergScanClassName => true
+    case _ if dataSourceV2TransformerExists(scan.getClass.getName) => true
     case _ => false
   }
 
-  private def createBatchScanTransformer(
-      className: String,
-      params: Any*): BatchScanExecTransformer = {
-    val classMirror = ru.runtimeMirror(getClass.getClassLoader)
-    val classModule = classMirror.staticModule(className)
-    val mirror = classMirror.reflectModule(classModule)
-    val apply = mirror.symbol.typeSignature.member(ru.TermName("apply")).asMethod
-    val objMirror = classMirror.reflect(mirror.instance)
-    objMirror.reflectMethod(apply)(params: _*).asInstanceOf[BatchScanExecTransformer]
+  private def lookupDataSourceV2Transformer(scanClassName: 
String): Class[_] = {
+    dataSourceV2TransformerMap.computeIfAbsent(
+      scanClassName,
+      _ => {
+        val loader = Option(Thread.currentThread().getContextClassLoader)
+          .getOrElse(getClass.getClassLoader)
+        val serviceLoader = ServiceLoader.load(classOf[DataSourceV2TransformerRegister], loader)
+        serviceLoader.asScala
+          .filter(_.scanClassName().equalsIgnoreCase(scanClassName))
+          .toList match {
+          case head :: Nil =>
+            // Exactly one provider is registered for this scan class.
+            head.getClass
+          case _ =>
+            // No unique provider is registered. computeIfAbsent does not cache a null
+            // mapping, so supportedBatchScan can report false instead of failing the query.
+            null
+        }
+      }
+    )
+  }
+
+  private def dataSourceV2TransformerExists(scanClassName: String): Boolean = {
+    lookupDataSourceV2Transformer(scanClassName) != null
+  }
 }
diff --git a/gluten-iceberg/src/main/resources/META-INF/services/io.glutenproject.execution.DataSourceV2TransformerRegister b/gluten-iceberg/src/main/resources/META-INF/services/io.glutenproject.execution.DataSourceV2TransformerRegister
new file mode 100644
index 000000000000..658967bb99b6
--- /dev/null
+++ b/gluten-iceberg/src/main/resources/META-INF/services/io.glutenproject.execution.DataSourceV2TransformerRegister
@@ -0,0 +1 @@
+io.glutenproject.execution.IcebergTransformerProvider
\ No newline at end of file
diff --git a/gluten-iceberg/src/main/scala/io/glutenproject/execution/IcebergTransformerProvider.scala b/gluten-iceberg/src/main/scala/io/glutenproject/execution/IcebergTransformerProvider.scala
new file mode 100644
index 000000000000..17d146da2021
--- /dev/null
+++ b/gluten-iceberg/src/main/scala/io/glutenproject/execution/IcebergTransformerProvider.scala
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package io.glutenproject.execution + +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.execution.datasources.v2.BatchScanExec + +class IcebergTransformerProvider extends DataSourceV2TransformerRegister { + + override def scanClassName(): String = "org.apache.iceberg.spark.source.SparkBatchQueryScan" + + override def createDataSourceV2Transformer( + batchScan: BatchScanExec, + partitionFilters: Seq[Expression]): BatchScanExecTransformer = { + IcebergScanTransformer.apply(batchScan, partitionFilters) + } +} diff --git a/gluten-iceberg/src/main/scala/org/apache/iceberg/spark/source/GlutenIcebergSourceUtil.scala b/gluten-iceberg/src/main/scala/org/apache/iceberg/spark/source/GlutenIcebergSourceUtil.scala index 5ce4d837c791..2dcfe832e157 100644 --- a/gluten-iceberg/src/main/scala/org/apache/iceberg/spark/source/GlutenIcebergSourceUtil.scala +++ b/gluten-iceberg/src/main/scala/org/apache/iceberg/spark/source/GlutenIcebergSourceUtil.scala @@ -22,7 +22,7 @@ import io.glutenproject.substrait.rel.LocalFilesNode.ReadFileFormat import org.apache.spark.softaffinity.SoftAffinityUtil import org.apache.spark.sql.connector.read.{InputPartition, Scan} -import org.apache.iceberg.{FileFormat, FileScanTask, ScanTask} +import org.apache.iceberg.{CombinedScanTask, FileFormat, FileScanTask, ScanTask} import java.lang.{Long => JLong} import java.util.{ArrayList => JArrayList, HashMap => JHashMap, Map => JMap} @@ -30,6 +30,7 @@ import java.util.{ArrayList => JArrayList, HashMap => JHashMap, Map => JMap} import scala.collection.JavaConverters._ object GlutenIcebergSourceUtil { + def genSplitInfo(inputPartition: InputPartition, index: Int): SplitInfo = inputPartition match { case partition: SparkInputPartition => val paths = new JArrayList[String]() @@ -39,43 +40,39 @@ object GlutenIcebergSourceUtil { var fileFormat = ReadFileFormat.UnknownFormat val tasks = partition.taskGroup[ScanTask]().tasks().asScala - if (tasks.forall(_.isInstanceOf[FileScanTask])) { - tasks.map(_.asInstanceOf[FileScanTask]).foreach { - task => - paths.add(task.file().path().toString) - starts.add(task.start()) - lengths.add(task.length()) - partitionColumns.add(new JHashMap[String, String]()) - val currentFileFormat = task.file().format() match { - case FileFormat.PARQUET => ReadFileFormat.ParquetReadFormat - case FileFormat.ORC => ReadFileFormat.OrcReadFormat - case _ => - throw new UnsupportedOperationException( - "Iceberg Only support parquet and orc file format.") - } - if (fileFormat == ReadFileFormat.UnknownFormat) { - fileFormat = currentFileFormat - } else if (fileFormat != currentFileFormat) { + asFileScanTask(tasks.toList).foreach { + task => + paths.add(task.file().path().toString) + starts.add(task.start()) + lengths.add(task.length()) + partitionColumns.add(new JHashMap[String, String]()) + val currentFileFormat = task.file().format() match { + case FileFormat.PARQUET => ReadFileFormat.ParquetReadFormat + case FileFormat.ORC => ReadFileFormat.OrcReadFormat + case _ => throw new UnsupportedOperationException( - s"Only one file format is supported, " + - s"find different file format $fileFormat and $currentFileFormat") - } - } - val preferredLoc = SoftAffinityUtil.getFilePartitionLocations( - paths.asScala.toArray, - inputPartition.preferredLocations()) - IcebergLocalFilesBuilder.makeIcebergLocalFiles( - index, - paths, - starts, - lengths, - partitionColumns, - fileFormat, - preferredLoc.toList.asJava - ) - } else { - throw new UnsupportedOperationException("Only support iceberg 
FileScanTask.") + "Iceberg Only support parquet and orc file format.") + } + if (fileFormat == ReadFileFormat.UnknownFormat) { + fileFormat = currentFileFormat + } else if (fileFormat != currentFileFormat) { + throw new UnsupportedOperationException( + s"Only one file format is supported, " + + s"find different file format $fileFormat and $currentFileFormat") + } } + val preferredLoc = SoftAffinityUtil.getFilePartitionLocations( + paths.asScala.toArray, + inputPartition.preferredLocations()) + IcebergLocalFilesBuilder.makeIcebergLocalFiles( + index, + paths, + starts, + lengths, + partitionColumns, + fileFormat, + preferredLoc.toList.asJava + ) case _ => throw new UnsupportedOperationException("Only support iceberg SparkInputPartition.") } @@ -83,10 +80,9 @@ object GlutenIcebergSourceUtil { def getFileFormat(sparkScan: Scan): ReadFileFormat = sparkScan match { case scan: SparkBatchQueryScan => val tasks = scan.tasks().asScala - tasks.map(_.asCombinedScanTask()).foreach { + asFileScanTask(tasks.toList).foreach { task => - val file = task.files().asScala.head.file() - file.format() match { + task.file().format() match { case FileFormat.PARQUET => return ReadFileFormat.ParquetReadFormat case FileFormat.ORC => return ReadFileFormat.OrcReadFormat case _ => @@ -97,4 +93,14 @@ object GlutenIcebergSourceUtil { throw new UnsupportedOperationException("Only support iceberg SparkBatchQueryScan.") } + private def asFileScanTask(tasks: List[ScanTask]): List[FileScanTask] = { + if (tasks.forall(_.isFileScanTask)) { + tasks.map(_.asFileScanTask()) + } else if (tasks.forall(_.isInstanceOf[CombinedScanTask])) { + tasks.flatMap(_.asCombinedScanTask().tasks().asScala) + } else { + throw new UnsupportedOperationException( + "Only support iceberg CombinedScanTask and FileScanTask.") + } + } } diff --git a/gluten-iceberg/src/test/scala/io/glutenproject/execution/VeloxIcebergSuite.scala b/gluten-iceberg/src/test/scala/io/glutenproject/execution/VeloxIcebergSuite.scala new file mode 100644 index 000000000000..97c590dce212 --- /dev/null +++ b/gluten-iceberg/src/test/scala/io/glutenproject/execution/VeloxIcebergSuite.scala @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package io.glutenproject.execution
+
+import org.apache.spark.SparkConf
+
+class VeloxIcebergSuite extends WholeStageTransformerSuite {
+
+  protected val rootPath: String = getClass.getResource("/").getPath
+  override protected val backend: String = "velox"
+  override protected val resourcePath: String = "/tpch-data-parquet-velox"
+  override protected val fileFormat: String = "parquet"
+
+  override protected def sparkConf: SparkConf = {
+    super.sparkConf
+      .set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.ColumnarShuffleManager")
+      .set("spark.sql.files.maxPartitionBytes", "1g")
+      .set("spark.sql.shuffle.partitions", "1")
+      .set("spark.memory.offHeap.size", "2g")
+      .set("spark.unsafe.exceptionOnMemoryLeak", "true")
+      .set("spark.sql.autoBroadcastJoinThreshold", "-1")
+      .set(
+        "spark.sql.extensions",
+        "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
+      .set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkCatalog")
+      .set("spark.sql.catalog.spark_catalog.type", "hadoop")
+      .set("spark.sql.catalog.spark_catalog.warehouse", s"file://$rootPath/tpch-data-iceberg-velox")
+  }
+
+  test("iceberg transformer exists") {
+    spark.sql("""
+                |create table iceberg_tb using iceberg as
+                |(select 1 as col1, 2 as col2, 3 as col3)
+                |""".stripMargin)
+
+    runQueryAndCompare("""
+                         |select * from iceberg_tb;
+                         |""".stripMargin) {
+      checkOperatorMatch[IcebergScanTransformer]
+    }
+  }
+}
diff --git a/gluten-iceberg/src/test/scala/io/glutenproject/execution/VeloxTPCHIcebergSuite.scala b/gluten-iceberg/src/test/scala/io/glutenproject/execution/VeloxTPCHIcebergSuite.scala
index 0a76a30e0dfb..b8693a48ccab 100644
--- a/gluten-iceberg/src/test/scala/io/glutenproject/execution/VeloxTPCHIcebergSuite.scala
+++ b/gluten-iceberg/src/test/scala/io/glutenproject/execution/VeloxTPCHIcebergSuite.scala
@@ -53,4 +53,24 @@ class VeloxTPCHIcebergSuite extends VeloxTPCHSuite {
       (table, tableDF)
     }.toMap
   }
+
+  test("iceberg transformer exists") {
+    runQueryAndCompare("""
+                         |SELECT
+                         |  l_orderkey,
+                         |  o_orderdate
+                         |FROM
+                         |  orders,
+                         |  lineitem
+                         |WHERE
+                         |  l_orderkey = o_orderkey
+                         |ORDER BY
+                         |  l_orderkey,
+                         |  o_orderdate
+                         |LIMIT
+                         |  10;
+                         |""".stripMargin) {
+      df => assert(getExecutedPlan(df).count(_.isInstanceOf[IcebergScanTransformer]) == 2)
+    }
+  }
 }
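
Reviewer note: with the ServiceLoader hook above, supporting another data source v2
scan no longer requires editing ScanTransformerFactory. As a minimal sketch of the
extension point (every com.example name below is a hypothetical placeholder, not part
of this patch), a plugin would ship one provider class:

    package com.example.execution

    import io.glutenproject.execution.{BatchScanExecTransformer, DataSourceV2TransformerRegister}

    import org.apache.spark.sql.catalyst.expressions.Expression
    import org.apache.spark.sql.execution.datasources.v2.BatchScanExec

    class ExampleTransformerProvider extends DataSourceV2TransformerRegister {

      // Must equal the fully qualified class name of the scan to intercept; the
      // factory compares it case-insensitively against scan.getClass.getName.
      override def scanClassName(): String = "com.example.spark.source.ExampleBatchQueryScan"

      override def createDataSourceV2Transformer(
          batchScan: BatchScanExec,
          partitionFilters: Seq[Expression]): BatchScanExecTransformer = {
        // ExampleScanTransformer is assumed to subclass BatchScanExecTransformer,
        // mirroring what IcebergScanTransformer does for Iceberg scans.
        new ExampleScanTransformer(batchScan, partitionFilters)
      }
    }

plus a service registration file at
src/main/resources/META-INF/services/io.glutenproject.execution.DataSourceV2TransformerRegister
containing the single line:

    com.example.execution.ExampleTransformerProvider

The provider also needs a public no-arg constructor, since the factory instantiates it
reflectively before delegating to createDataSourceV2Transformer.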