[SPARK-49249][SPARK-49122] Artifact isolation in Spark Classic #48120

Open · wants to merge 43 commits into base: master

Commits (43):
fe143d6 - make it work for sql (xupefei, Sep 12, 2024)
e2597c1 - REPL (xupefei, Sep 16, 2024)
827e01e - . (xupefei, Sep 16, 2024)
73d13f9 - . (xupefei, Sep 19, 2024)
caa4251 - revert fmt change (xupefei, Sep 23, 2024)
5043ce3 - try fix (xupefei, Sep 25, 2024)
3c8fef5 - . (xupefei, Sep 25, 2024)
255e85d - the other way around (xupefei, Sep 25, 2024)
7e3ecfe - mima (xupefei, Sep 25, 2024)
a9c20e0 - OOPS (xupefei, Sep 25, 2024)
355bea8 - fmt (xupefei, Sep 25, 2024)
6564ccf - Merge remote-tracking branch 'databricks/master' into session-artifac… (xupefei, Sep 25, 2024)
06a658c - handle hive (xupefei, Sep 25, 2024)
225ec6f - fix addjar, break connect repl (xupefei, Sep 26, 2024)
e4f5a5c - ugly fix for streaming (xupefei, Sep 26, 2024)
7630e2f - clone (xupefei, Oct 2, 2024)
7b8f1da - . (xupefei, Oct 2, 2024)
786d48f - . (xupefei, Oct 2, 2024)
1849ac5 - Merge branch 'clone-artifact-manager' into session-artifact-apply (xupefei, Oct 2, 2024)
a0ae922 - undo (xupefei, Oct 2, 2024)
bfa6d85 - address comments (xupefei, Oct 3, 2024)
fe7947f - address comments (xupefei, Oct 4, 2024)
04a5bb2 - Merge branch 'clone-artifact-manager' into session-artifact-apply (xupefei, Oct 4, 2024)
24f99a5 - . (xupefei, Oct 4, 2024)
39a8086 - wip (xupefei, Oct 7, 2024)
7cce314 - address comment (xupefei, Oct 7, 2024)
80289b8 - . (xupefei, Oct 8, 2024)
4542a21 - rvt (xupefei, Oct 8, 2024)
0b021d9 - fix (hopefully) all tests (xupefei, Oct 9, 2024)
97c7d6c - remove reuse code (xupefei, Oct 9, 2024)
aa9c21d - . (xupefei, Oct 9, 2024)
a2849f8 - fix pyspark (xupefei, Oct 9, 2024)
508ee7b - disable hive (xupefei, Oct 9, 2024)
fdcb05b - omg (xupefei, Oct 9, 2024)
c9cf1a2 - why so slow (xupefei, Oct 10, 2024)
be49405 - why so slow try 2 (xupefei, Oct 10, 2024)
5c15612 - Merge remote-tracking branch 'origin/clone-artifact-manager' into ses… (xupefei, Oct 10, 2024)
3899b22 - make streaming great again (xupefei, Oct 10, 2024)
d8ec1d3 - . (xupefei, Oct 10, 2024)
3bcda6d - optimizzzzze (xupefei, Oct 11, 2024)
216b467 - lemme try if this can make things faster (xupefei, Oct 11, 2024)
4de3ce8 - cache AppClassLoader (xupefei, Oct 12, 2024)
7a0910b - . (xupefei, Oct 12, 2024)
Binary file added repl/src/test/resources/IntSumUdf.class
22 changes: 22 additions & 0 deletions repl/src/test/resources/IntSumUdf.scala
@@ -0,0 +1,22 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import org.apache.spark.sql.api.java.UDF2

class IntSumUdf extends UDF2[Long, Long, Long] {
override def call(t1: Long, t2: Long): Long = t1 + t2
}
37 changes: 37 additions & 0 deletions repl/src/test/scala/org/apache/spark/repl/ReplSuite.scala
@@ -396,4 +396,41 @@ class ReplSuite extends SparkFunSuite {
Main.sparkContext.stop()
System.clearProperty("spark.driver.port")
}

test("register artifacts via SparkSession.addArtifact") {
Contributor: Can you use a UDF defined in the REPL? If so, how does this work with a JobArtifactSet? Do we layer the globally defined classpath over the session-specific classpath? (It would be nice to document this somewhere.)

Author (xupefei): I added one more test, which defines a UDF that initialises an external class added as an artifact.

> how does this work with a JobArtifactSet?

Can you elaborate? AFAIK JobArtifactSet is not involved here, since it's the session's artifact path that is applied whenever that SparkSession is active.

Classpath - It's the other way around: the session classpath is laid over the global one.

val artifactPath = new File("src/test/resources").toPath
val intSumUdfPath = artifactPath.resolve("IntSumUdf.class")
val output = runInterpreterInPasteMode("local",
s"""
|import org.apache.spark.sql.api.java.UDF2
|import org.apache.spark.sql.types.DataTypes
|
|spark.addArtifact("${intSumUdfPath.toString}")
|
|spark.udf.registerJava("intSum", "IntSumUdf", DataTypes.LongType)
|
|val r = spark.range(5)
| .withColumn("id2", col("id") + 1)
| .selectExpr("intSum(id, id2)")
| .collect()
|assert(r.map(_.getLong(0)).toSeq == Seq(1, 3, 5, 7, 9))
|
""".stripMargin)
assertDoesNotContain("error:", output)
assertDoesNotContain("Exception", output)
assertDoesNotContain("assertion failed", output)

// The UDF should not work in a new REPL session.
val anotherOutput = runInterpreterInPasteMode("local",
s"""
|val r = spark.range(5)
| .withColumn("id2", col("id") + 1)
| .selectExpr("intSum(id, id2)")
| .collect()
|
""".stripMargin)
assertContains(
"[UNRESOLVED_ROUTINE] Cannot resolve routine `intSum` on search path",
anotherOutput)
}
}
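To illustrate the classpath layering discussed in the review thread above ("the session classpath is laid over the global one"): conceptually, each session gets a class loader whose parent is the global application loader, so globally available classes stay visible while classes added as session artifacts are visible only inside that session. A minimal sketch of that idea, using a hypothetical helper and made-up paths rather than the actual ArtifactManager internals:

import java.net.URLClassLoader
import java.nio.file.{Path, Paths}

// Hypothetical helper: a session-scoped loader layered over the global loader.
// Classes on the global classpath remain resolvable; classes shipped via
// spark.addArtifact resolve only through this session's loader.
def sessionClassLoader(sessionClassDirs: Seq[Path], globalLoader: ClassLoader): ClassLoader =
  new URLClassLoader(sessionClassDirs.map(_.toUri.toURL).toArray, globalLoader)

// Illustrative usage with an invented session directory:
val loader = sessionClassLoader(Seq(Paths.get("/tmp/spark-session-1234/classes")), getClass.getClassLoader)
val udfClass = Class.forName("IntSumUdf", true, loader)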
UDFRegistration.scala
@@ -44,7 +44,7 @@ import org.apache.spark.util.Utils
* @since 1.3.0
*/
@Stable
class UDFRegistration private[sql] (functionRegistry: FunctionRegistry)
class UDFRegistration private[sql] (session: SparkSession, functionRegistry: FunctionRegistry)
extends api.UDFRegistration
with Logging {
protected[sql] def registerPython(name: String, udf: UserDefinedPythonFunction): Unit = {
@@ -121,7 +121,9 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry)
*/
private[sql] def registerJavaUDAF(name: String, className: String): Unit = {
try {
val clazz = Utils.classForName[AnyRef](className)
val clazz = session.artifactManager.withResources {
Utils.classForName[AnyRef](className, noSparkClassLoader = true)
}
if (!classOf[UserDefinedAggregateFunction].isAssignableFrom(clazz)) {
throw QueryCompilationErrors
.classDoesNotImplementUserDefinedAggregateFunctionError(className)
@@ -145,9 +147,11 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry)
* @param returnDataType return type of udf. If it is null, spark would try to infer
* via reflection.
*/
private[sql] def registerJava(name: String, className: String, returnDataType: DataType): Unit = {
def registerJava(name: String, className: String, returnDataType: DataType): Unit = {
Comment on lines -148 to +151
Author (xupefei): I have to make this method public so I can call it from the REPL.

Contributor: I am not against this. I am trying to understand the user-facing consequences, though. I'd probably prefer that we also add support for Scala UDFs; that can be done in a follow-up.

Contributor: Can you file a follow-up?

Author (xupefei): Will do.

try {
val clazz = Utils.classForName[AnyRef](className)
val clazz = session.artifactManager.withResources {
Utils.classForName[AnyRef](className)
}
val udfInterfaces = clazz.getGenericInterfaces
.filter(_.isInstanceOf[ParameterizedType])
.map(_.asInstanceOf[ParameterizedType])
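With registerJava now public, the user-facing flow the reviewers ask about mirrors the new ReplSuite test: add the compiled class as a session artifact, then register it by class name so it is resolved through the session's artifact class loader. A short usage sketch (the artifact path is illustrative):

import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.DataTypes

// Ship the compiled UDF class to the current session, then register it by class name.
spark.addArtifact("src/test/resources/IntSumUdf.class")
spark.udf.registerJava("intSum", "IntSumUdf", DataTypes.LongType)

// The registered UDF is usable in SQL expressions of this session only.
spark.range(5)
  .withColumn("id2", col("id") + 1)
  .selectExpr("intSum(id, id2)")
  .show()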
ArtifactManager.scala
@@ -71,7 +71,7 @@ class ArtifactManager(session: SparkSession) extends Logging {
(ArtifactUtils.concatenatePaths(artifactPath, "classes"),
s"$artifactURI${File.separator}classes${File.separator}")

protected[artifact] val state: JobArtifactState =
protected[sql] val state: JobArtifactState =
JobArtifactState(session.sessionUUID, Option(classURI))

def withResources[T](f: => T): T = {
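For context on withResources as used in UDFRegistration above: conceptually it runs a block with the session's artifact class loader installed as the thread's context class loader, so class-name lookups inside the block can see session artifacts. A generic sketch of that set-and-restore pattern, assuming nothing about the real ArtifactManager beyond what the diff shows:

// Simplified stand-in for "run this block against the session's class loader".
def withSessionClassLoader[T](sessionLoader: ClassLoader)(body: => T): T = {
  val thread = Thread.currentThread()
  val previous = thread.getContextClassLoader
  thread.setContextClassLoader(sessionLoader)
  try body
  finally thread.setContextClassLoader(previous) // always restore the previous loader
}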
SQLExecution.scala
@@ -115,93 +115,95 @@ object SQLExecution extends Logging {
}
val redactedConfigs = sparkSession.sessionState.conf.redactOptions(modifiedConfigs)

withSQLConfPropagated(sparkSession) {
var ex: Option[Throwable] = None
var isExecutedPlanAvailable = false
val startTime = System.nanoTime()
val startEvent = SparkListenerSQLExecutionStart(
executionId = executionId,
rootExecutionId = Some(rootExecutionId),
description = desc,
details = callSite.longForm,
physicalPlanDescription = "",
sparkPlanInfo = SparkPlanInfo.EMPTY,
time = System.currentTimeMillis(),
modifiedConfigs = redactedConfigs,
jobTags = sc.getJobTags(),
jobGroupId = Option(sc.getLocalProperty(SparkContext.SPARK_JOB_GROUP_ID))
)
try {
body match {
case Left(e) =>
sc.listenerBus.post(startEvent)
JobArtifactSet.withActiveJobArtifactState(sparkSession.artifactManager.state) {
Contributor: Can you check how this interacts with all the stuff we do in Connect to make this work? I feel that we are duplicating code now. cc @vicennial

Contributor: An FYI to other reviewers: review this file with whitespace changes hidden.

Contributor: Hmm, with this in the execution code path, we may not need SessionHolder#withSession in a few places; those could be cleaned up.

Author (xupefei): @vicennial Is there an end-to-end test for this? I made some modifications and want to make sure they don't break anything.

Contributor: @xupefei The ReplE2ESuite has some tests covering the overall client->artifact->execution flow. The Python client package may have some E2E tests as well, but I am not familiar with their current status.

withSQLConfPropagated(sparkSession) {
var ex: Option[Throwable] = None
var isExecutedPlanAvailable = false
val startTime = System.nanoTime()
val startEvent = SparkListenerSQLExecutionStart(
executionId = executionId,
rootExecutionId = Some(rootExecutionId),
description = desc,
details = callSite.longForm,
physicalPlanDescription = "",
sparkPlanInfo = SparkPlanInfo.EMPTY,
time = System.currentTimeMillis(),
modifiedConfigs = redactedConfigs,
jobTags = sc.getJobTags(),
jobGroupId = Option(sc.getLocalProperty(SparkContext.SPARK_JOB_GROUP_ID))
)
try {
body match {
case Left(e) =>
sc.listenerBus.post(startEvent)
throw e
case Right(f) =>
val planDescriptionMode =
ExplainMode.fromString(sparkSession.sessionState.conf.uiExplainMode)
val planDesc = queryExecution.explainString(planDescriptionMode)
val planInfo = try {
SparkPlanInfo.fromSparkPlan(queryExecution.executedPlan)
} catch {
case NonFatal(e) =>
logDebug("Failed to generate SparkPlanInfo", e)
// If the queryExecution already failed before this, we are not able to generate
// the the plan info, so we use and empty graphviz node to make the UI happy
SparkPlanInfo.EMPTY
}
sc.listenerBus.post(
startEvent.copy(physicalPlanDescription = planDesc, sparkPlanInfo = planInfo))
isExecutedPlanAvailable = true
f()
}
} catch {
case e: Throwable =>
ex = Some(e)
throw e
case Right(f) =>
val planDescriptionMode =
ExplainMode.fromString(sparkSession.sessionState.conf.uiExplainMode)
val planDesc = queryExecution.explainString(planDescriptionMode)
val planInfo = try {
SparkPlanInfo.fromSparkPlan(queryExecution.executedPlan)
} catch {
case NonFatal(e) =>
logDebug("Failed to generate SparkPlanInfo", e)
// If the queryExecution already failed before this, we are not able to generate
// the the plan info, so we use and empty graphviz node to make the UI happy
SparkPlanInfo.EMPTY
}
sc.listenerBus.post(
startEvent.copy(physicalPlanDescription = planDesc, sparkPlanInfo = planInfo))
isExecutedPlanAvailable = true
f()
}
} catch {
case e: Throwable =>
ex = Some(e)
throw e
} finally {
val endTime = System.nanoTime()
val errorMessage = ex.map {
case e: SparkThrowable =>
SparkThrowableHelper.getMessage(e, ErrorMessageFormat.PRETTY)
case e =>
Utils.exceptionString(e)
}
if (queryExecution.shuffleCleanupMode != DoNotCleanup
&& isExecutedPlanAvailable) {
val shuffleIds = queryExecution.executedPlan match {
case ae: AdaptiveSparkPlanExec =>
ae.context.shuffleIds.asScala.keys
case _ =>
Iterable.empty
} finally {
val endTime = System.nanoTime()
val errorMessage = ex.map {
case e: SparkThrowable =>
SparkThrowableHelper.getMessage(e, ErrorMessageFormat.PRETTY)
case e =>
Utils.exceptionString(e)
}
shuffleIds.foreach { shuffleId =>
queryExecution.shuffleCleanupMode match {
case RemoveShuffleFiles =>
// Same as what we do in ContextCleaner.doCleanupShuffle, but do not unregister
// the shuffle on MapOutputTracker, so that stage retries would be triggered.
// Set blocking to Utils.isTesting to deflake unit tests.
sc.shuffleDriverComponents.removeShuffle(shuffleId, Utils.isTesting)
case SkipMigration =>
SparkEnv.get.blockManager.migratableResolver.addShuffleToSkip(shuffleId)
case _ => // this should not happen
if (queryExecution.shuffleCleanupMode != DoNotCleanup
&& isExecutedPlanAvailable) {
val shuffleIds = queryExecution.executedPlan match {
case ae: AdaptiveSparkPlanExec =>
ae.context.shuffleIds.asScala.keys
case _ =>
Iterable.empty
}
shuffleIds.foreach { shuffleId =>
queryExecution.shuffleCleanupMode match {
case RemoveShuffleFiles =>
// Same as what we do in ContextCleaner.doCleanupShuffle, but do not unregister
// the shuffle on MapOutputTracker, so that stage retries would be triggered.
// Set blocking to Utils.isTesting to deflake unit tests.
sc.shuffleDriverComponents.removeShuffle(shuffleId, Utils.isTesting)
case SkipMigration =>
SparkEnv.get.blockManager.migratableResolver.addShuffleToSkip(shuffleId)
case _ => // this should not happen
}
}
}
val event = SparkListenerSQLExecutionEnd(
executionId,
System.currentTimeMillis(),
// Use empty string to indicate no error, as None may mean events generated by old
// versions of Spark.
errorMessage.orElse(Some("")))
// Currently only `Dataset.withAction` and `DataFrameWriter.runCommand` specify the
// `name` parameter. The `ExecutionListenerManager` only watches SQL executions with
// name. We can specify the execution name in more places in the future, so that
// `QueryExecutionListener` can track more cases.
event.executionName = name
event.duration = endTime - startTime
event.qe = queryExecution
event.executionFailure = ex
sc.listenerBus.post(event)
}
val event = SparkListenerSQLExecutionEnd(
executionId,
System.currentTimeMillis(),
// Use empty string to indicate no error, as None may mean events generated by old
// versions of Spark.
errorMessage.orElse(Some("")))
// Currently only `Dataset.withAction` and `DataFrameWriter.runCommand` specify the `name`
// parameter. The `ExecutionListenerManager` only watches SQL executions with name. We
// can specify the execution name in more places in the future, so that
// `QueryExecutionListener` can track more cases.
event.executionName = name
event.duration = endTime - startTime
event.qe = queryExecution
event.executionFailure = ex
sc.listenerBus.post(event)
}
}
} finally {
@@ -281,7 +283,10 @@
val activeSession = sparkSession
val sc = sparkSession.sparkContext
val localProps = Utils.cloneProperties(sc.getLocalProperties)
val artifactState = JobArtifactSet.getCurrentJobArtifactState.orNull
// `getCurrentJobArtifactState` will return a state only in Spark Connect mode. In non-Connect
Contributor: I think it should be safe to use the SparkSession's jobArtifactState. They should be the same. cc @vicennial.

// mode, we default back to the resources of the current Spark session.
val artifactState = JobArtifactSet.getCurrentJobArtifactState.getOrElse(
activeSession.artifactManager.state)
exec.submit(() => JobArtifactSet.withActiveJobArtifactState(artifactState) {
val originalSession = SparkSession.getActiveSession
val originalLocalProps = sc.getLocalProperties
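The core change in the SQLExecution diff above is that the execution body is now wrapped in JobArtifactSet.withActiveJobArtifactState(sparkSession.artifactManager.state), so the session's artifacts are active for the duration of the query in both Connect and non-Connect mode. The scoping itself is a set-and-restore over a per-thread "active state"; a simplified sketch of that pattern (not Spark's actual JobArtifactSet implementation):

object ActiveArtifactState {
  // Simplified stand-in for Spark's JobArtifactState: a session id plus an optional class URI.
  final case class State(sessionUUID: String, replClassDirUri: Option[String])

  private val active = new ThreadLocal[Option[State]] {
    override def initialValue(): Option[State] = None
  }

  def current: Option[State] = active.get()

  // Install `state` while `body` runs, then restore whatever was active before.
  def withActive[T](state: State)(body: => T): T = {
    val previous = active.get()
    active.set(Some(state))
    try body
    finally active.set(previous)
  }
}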
BaseSessionStateBuilder.scala
@@ -181,7 +181,7 @@ abstract class BaseSessionStateBuilder(
* Note 1: The user-defined functions must be deterministic.
* Note 2: This depends on the `functionRegistry` field.
*/
protected def udfRegistration: UDFRegistration = new UDFRegistration(functionRegistry)
protected def udfRegistration: UDFRegistration = new UDFRegistration(session, functionRegistry)

protected def udtfRegistration: UDTFRegistration = new UDTFRegistration(tableFunctionRegistry)

ArtifactManagerSuite.scala
@@ -342,13 +342,11 @@ class ArtifactManagerSuite extends SharedSparkSession {
.asInstanceOf[UDF2[Long, Long, Long]]
spark.udf.register("intSum", instance, DataTypes.LongType)

artifactManager.withResources {
val r = spark.range(5)
.withColumn("id2", col("id") + 1)
.selectExpr("intSum(id, id2)")
.collect()
assert(r.map(_.getLong(0)).toSeq == Seq(1, 3, 5, 7, 9))
}
val r = spark.range(5)
.withColumn("id2", col("id") + 1)
.selectExpr("intSum(id, id2)")
.collect()
assert(r.map(_.getLong(0)).toSeq == Seq(1, 3, 5, 7, 9))
}

private def testAddArtifactToLocalSession(