[SPARK-49249][SPARK-49122] Artifact isolation in Spark Classic #48120

Open

wants to merge 43 commits into base: master

Changes from all commits
43 commits
fe143d6
make it work for sql
xupefei Sep 12, 2024
e2597c1
REPL
xupefei Sep 16, 2024
827e01e
.
xupefei Sep 16, 2024
73d13f9
.
xupefei Sep 19, 2024
caa4251
revert fmt change
xupefei Sep 23, 2024
5043ce3
try fix
xupefei Sep 25, 2024
3c8fef5
.
xupefei Sep 25, 2024
255e85d
the other way around
xupefei Sep 25, 2024
7e3ecfe
mima
xupefei Sep 25, 2024
a9c20e0
OOPS
xupefei Sep 25, 2024
355bea8
fmt
xupefei Sep 25, 2024
6564ccf
Merge remote-tracking branch 'databricks/master' into session-artifac…
xupefei Sep 25, 2024
06a658c
handle hive
xupefei Sep 25, 2024
225ec6f
fix addjar, break connect repl
xupefei Sep 26, 2024
e4f5a5c
ugly fix for streaming
xupefei Sep 26, 2024
7630e2f
clone
xupefei Oct 2, 2024
7b8f1da
.
xupefei Oct 2, 2024
786d48f
.
xupefei Oct 2, 2024
1849ac5
Merge branch 'clone-artifact-manager' into session-artifact-apply
xupefei Oct 2, 2024
a0ae922
undo
xupefei Oct 2, 2024
bfa6d85
address comments
xupefei Oct 3, 2024
fe7947f
address comments
xupefei Oct 4, 2024
04a5bb2
Merge branch 'clone-artifact-manager' into session-artifact-apply
xupefei Oct 4, 2024
24f99a5
.
xupefei Oct 4, 2024
39a8086
wip
xupefei Oct 7, 2024
7cce314
address comment
xupefei Oct 7, 2024
80289b8
.
xupefei Oct 8, 2024
4542a21
rvt
xupefei Oct 8, 2024
0b021d9
fix (hopefully) all tests
xupefei Oct 9, 2024
97c7d6c
remove reuse code
xupefei Oct 9, 2024
aa9c21d
.
xupefei Oct 9, 2024
a2849f8
fix pyspark
xupefei Oct 9, 2024
508ee7b
disable hive
xupefei Oct 9, 2024
fdcb05b
omg
xupefei Oct 9, 2024
c9cf1a2
why so slow
xupefei Oct 10, 2024
be49405
why so slow try 2
xupefei Oct 10, 2024
5c15612
Merge remote-tracking branch 'origin/clone-artifact-manager' into ses…
xupefei Oct 10, 2024
3899b22
make streaming great again
xupefei Oct 10, 2024
d8ec1d3
.
xupefei Oct 10, 2024
3bcda6d
optimizzzzze
xupefei Oct 11, 2024
216b467
lemme try if this can make things faster
xupefei Oct 11, 2024
4de3ce8
cache AppClassLoader
xupefei Oct 12, 2024
7a0910b
.
xupefei Oct 12, 2024
@@ -297,6 +297,8 @@ object CheckConnectJvmClientCompatibility {
"org.apache.spark.sql.UDFRegistration.initializeLogIfNecessary"),
ProblemFilters.exclude[DirectMissingMethodProblem](
"org.apache.spark.sql.UDFRegistration.initializeLogIfNecessary$default$2"),
ProblemFilters.exclude[DirectMissingMethodProblem](
"org.apache.spark.sql.UDFRegistration.registerJava"),

// Protected DataFrameReader methods...
ProblemFilters.exclude[DirectMissingMethodProblem](
8 changes: 6 additions & 2 deletions core/src/main/scala/org/apache/spark/SparkFiles.scala
@@ -27,8 +27,12 @@ object SparkFiles {
   /**
    * Get the absolute path of a file added through `SparkContext.addFile()`.
    */
-  def get(filename: String): String =
-    new File(getRootDirectory(), filename).getAbsolutePath()
+  def get(filename: String): String = {
+    val jobArtifactUUID = JobArtifactSet
+      .getCurrentJobArtifactState.map(_.uuid).getOrElse("default")
+    val withUuid = if (jobArtifactUUID == "default") filename else s"$jobArtifactUUID/$filename"
+    new File(getRootDirectory(), withUuid).getAbsolutePath
+  }
 
   /**
    * Get the root directory that contains files added through `SparkContext.addFile()`.
@@ -463,7 +463,7 @@ private[spark] class BlockManager(
   * '''Important!''' Callers must not mutate or release the data buffer underlying `bytes`. Doing
   * so may corrupt or change the data stored by the `BlockManager`.
   */
-  private case class ByteBufferBlockStoreUpdater[T](
+  private[spark] case class ByteBufferBlockStoreUpdater[T](
      blockId: BlockId,
      level: StorageLevel,
      classTag: ClassTag[T],
22 changes: 16 additions & 6 deletions core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -176,15 +176,25 @@ private[spark] object Utils
   }
 
   /**
-   * Run a segment of code using a different context class loader in the current thread
-   */
-  def withContextClassLoader[T](ctxClassLoader: ClassLoader)(fn: => T): T = {
-    val oldClassLoader = Thread.currentThread().getContextClassLoader()
+   * Run a segment of code using a different context class loader in the current thread.
+   *
+   * If `retainChange` is `true` and `fn` changed the context class loader during execution,
+   * the class loader will not be reverted to the original one when this method returns.
+   */
+  def withContextClassLoader[T](
+      ctxClassLoader: ClassLoader,
+      retainChange: Boolean = false)(fn: => T): T = {
+    val oldClassLoader = Thread.currentThread().getContextClassLoader
+    var classLoaderAfterFn: ClassLoader = null
     try {
       Thread.currentThread().setContextClassLoader(ctxClassLoader)
-      fn
+      val ret = fn
+      classLoaderAfterFn = Thread.currentThread().getContextClassLoader
+      ret
     } finally {
-      Thread.currentThread().setContextClassLoader(oldClassLoader)
+      if (!retainChange || classLoaderAfterFn == ctxClassLoader) {
+        Thread.currentThread().setContextClassLoader(oldClassLoader)
+      }
     }
   }
 
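A minimal, self-contained sketch of how the new `retainChange` flag behaves (editor illustration, not part of this PR): `withContextClassLoader` below is a local copy of the helper from the diff above, and `SessionLoader` is a made-up classloader standing in for a session-specific artifact classloader.

object RetainChangeSketch {
  // Local copy of the helper shown in the diff above, so the sketch is self-contained.
  def withContextClassLoader[T](
      ctxClassLoader: ClassLoader,
      retainChange: Boolean = false)(fn: => T): T = {
    val oldClassLoader = Thread.currentThread().getContextClassLoader
    var classLoaderAfterFn: ClassLoader = null
    try {
      Thread.currentThread().setContextClassLoader(ctxClassLoader)
      val ret = fn
      classLoaderAfterFn = Thread.currentThread().getContextClassLoader
      ret
    } finally {
      // Revert unless the caller asked to retain a change and fn actually installed another loader.
      if (!retainChange || classLoaderAfterFn == ctxClassLoader) {
        Thread.currentThread().setContextClassLoader(oldClassLoader)
      }
    }
  }

  def main(args: Array[String]): Unit = {
    class SessionLoader extends ClassLoader(Thread.currentThread().getContextClassLoader)
    val sessionLoader = new SessionLoader

    withContextClassLoader(sessionLoader, retainChange = true) {
      // Simulate user code (e.g. the REPL) installing its own wrapper loader inside fn.
      Thread.currentThread().setContextClassLoader(new SessionLoader)
    }
    // retainChange = true and fn swapped loaders, so the wrapper is still active here;
    // with the default retainChange = false the original loader would have been restored.
    println(Thread.currentThread().getContextClassLoader)
  }
}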
2 changes: 2 additions & 0 deletions python/pyspark/core/context.py
@@ -84,6 +84,8 @@
DEFAULT_CONFIGS: Dict[str, Any] = {
"spark.serializer.objectStreamReset": 100,
"spark.rdd.compress": True,
# Disable artifact isolation in PySpark, otherwise user-added .py files won't work
"spark.session.isolate.artifacts": "false",
}

T = TypeVar("T")
6 changes: 5 additions & 1 deletion python/pyspark/sql/connect/session.py
@@ -1027,7 +1027,11 @@ def _start_connect_server(master: str, opts: Dict[str, Any]) -> None:
     os.environ["SPARK_LOCAL_CONNECT"] = "1"
 
     # Configurations to be set if unset.
-    default_conf = {"spark.plugins": "org.apache.spark.sql.connect.SparkConnectPlugin"}
+    default_conf = {
+        "spark.plugins": "org.apache.spark.sql.connect.SparkConnectPlugin",
+        "spark.repl.isolate.artifacts": "true",
+        "spark.session.isolate.artifacts": "true",
+    }
 
     if "SPARK_TESTING" in os.environ:
         # For testing, we use 0 to use an ephemeral port to allow parallel testing.
Binary file added repl/src/test/resources/IntSumUdf.class
Binary file not shown.
22 changes: 22 additions & 0 deletions repl/src/test/resources/IntSumUdf.scala
@@ -0,0 +1,22 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import org.apache.spark.sql.api.java.UDF2

class IntSumUdf extends UDF2[Long, Long, Long] {
override def call(t1: Long, t2: Long): Long = t1 + t2
}
63 changes: 63 additions & 0 deletions repl/src/test/scala/org/apache/spark/repl/ReplSuite.scala
@@ -396,4 +396,67 @@ class ReplSuite extends SparkFunSuite {
Main.sparkContext.stop()
System.clearProperty("spark.driver.port")
}

test("register UDF via SparkSession.addArtifact") {
val artifactPath = new File("src/test/resources").toPath
val intSumUdfPath = artifactPath.resolve("IntSumUdf.class")
val output = runInterpreterInPasteMode("local",
s"""
|import org.apache.spark.sql.api.java.UDF2
|import org.apache.spark.sql.types.DataTypes
|
|spark.addArtifact("${intSumUdfPath.toString}")
|
|spark.udf.registerJava("intSum", "IntSumUdf", DataTypes.LongType)
|
|val r = spark.range(5)
| .withColumn("id2", col("id") + 1)
| .selectExpr("intSum(id, id2)")
| .collect()
|assert(r.map(_.getLong(0)).toSeq == Seq(1, 3, 5, 7, 9))
|
""".stripMargin)
assertContains("Array([1], [3], [5], [7], [9])", output)
assertDoesNotContain("error:", output)
assertDoesNotContain("Exception", output)
assertDoesNotContain("assertion failed", output)

// The UDF should not work in a new REPL session.
val anotherOutput = runInterpreterInPasteMode("local",
s"""
|val r = spark.range(5)
| .withColumn("id2", col("id") + 1)
| .selectExpr("intSum(id, id2)")
| .collect()
|
""".stripMargin)
assertContains(
"[UNRESOLVED_ROUTINE] Cannot resolve routine `intSum` on search path",
anotherOutput)
}

test("register a class via SparkSession.addArtifact") {
val artifactPath = new File("src/test/resources").toPath
val intSumUdfPath = artifactPath.resolve("IntSumUdf.class")
val output = runInterpreterInPasteMode("local",
s"""
|import org.apache.spark.sql.functions.udf
|
|spark.addArtifact("${intSumUdfPath.toString}")
|
|val intSumUdf = udf((x: Long, y: Long) => new IntSumUdf().call(x, y))
|spark.udf.register("intSum", intSumUdf)
|
|val r = spark.range(5)
| .withColumn("id2", col("id") + 1)
| .selectExpr("intSum(id, id2)")
| .collect()
|assert(r.map(_.getLong(0)).toSeq == Seq(1, 3, 5, 7, 9))
|
""".stripMargin)
assertContains("Array([1], [3], [5], [7], [9])", output)
assertDoesNotContain("error:", output)
assertDoesNotContain("Exception", output)
assertDoesNotContain("assertion failed", output)
}
}
@@ -40,6 +40,8 @@ private[sql] object SimpleSparkConnectService {
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
.set("spark.plugins", "org.apache.spark.sql.connect.SparkConnectPlugin")
.set("spark.repl.isolate.artifacts", "true")
.set("spark.session.isolate.artifacts", "true")
val sparkSession = SparkSession.builder().config(conf).getOrCreate()
val sparkContext = sparkSession.sparkContext // init spark context
// scalastyle:off println
@@ -28,7 +28,11 @@ object SparkConnectServer extends Logging {
   def main(args: Array[String]): Unit = {
     // Set the active Spark Session, and starts SparkEnv instance (via Spark Context)
     logInfo("Starting Spark session.")
-    val session = SparkSession.builder().getOrCreate()
+    val session = SparkSession
+      .builder()
+      .config("spark.repl.isolate.artifacts", "true")
+      .config("spark.session.isolate.artifacts", "true")
+      .getOrCreate()
     try {
       try {
         SparkConnectService.start(session.sparkContext)
@@ -275,6 +275,7 @@ class SparkSession private(
Map.empty,
managedJobTags.asScala.toMap)
result.sessionState // force copy of SessionState
result.sessionState.artifactManager // force copy of ArtifactManager and its resources
result.managedJobTags // force copy of userDefinedToRealTagsMap
result
}
@@ -898,6 +899,7 @@ object SparkSession extends api.SparkSessionCompanion with Logging {
override def enableHiveSupport(): this.type = synchronized {
if (hiveClassesArePresent) {
super.enableHiveSupport()
.config("spark.session.isolate.artifacts", "false")
} else {
throw new IllegalArgumentException(
"Unable to instantiate SparkSession with Hive support because " +
21 changes: 12 additions & 9 deletions sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala
@@ -32,7 +32,6 @@ import org.apache.spark.sql.execution.python.UserDefinedPythonFunction
 import org.apache.spark.sql.expressions.{SparkUserDefinedFunction, UserDefinedAggregateFunction, UserDefinedAggregator, UserDefinedFunction}
 import org.apache.spark.sql.internal.UserDefinedFunctionUtils.toScalaUDF
 import org.apache.spark.sql.types.DataType
-import org.apache.spark.util.Utils
 
 /**
  * Functions for registering user-defined functions. Use `SparkSession.udf` to access this:
@@ -44,7 +43,7 @@ import org.apache.spark.util.Utils
  * @since 1.3.0
  */
 @Stable
-class UDFRegistration private[sql] (functionRegistry: FunctionRegistry)
+class UDFRegistration private[sql] (session: SparkSession, functionRegistry: FunctionRegistry)
   extends api.UDFRegistration
   with Logging {
   protected[sql] def registerPython(name: String, udf: UserDefinedPythonFunction): Unit = {
@@ -121,7 +120,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry)
    */
   private[sql] def registerJavaUDAF(name: String, className: String): Unit = {
     try {
-      val clazz = Utils.classForName[AnyRef](className)
+      val clazz = session.artifactManager.classloader.loadClass(className)

Contributor commented:
One follow-up here would be to cache the ArtifactManager classloader. I think we create that thing over and over.

xupefei (Contributor Author) commented on Sep 23, 2024:
Agree. We can invalidate the cache when a new JAR is added.

       if (!classOf[UserDefinedAggregateFunction].isAssignableFrom(clazz)) {
         throw QueryCompilationErrors
           .classDoesNotImplementUserDefinedAggregateFunctionError(className)
@@ -138,16 +137,20 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry)
 
   // scalastyle:off line.size.limit
   /**
-   * Register a Java UDF class using reflection, for use from pyspark
+   * Register a Java UDF class using its class name. The class must implement one of the UDF
+   * interfaces in the [[org.apache.spark.sql.api.java]] package, and be discoverable by the current
+   * session's class loader.
    *
-   * @param name udf name
-   * @param className fully qualified class name of udf
-   * @param returnDataType return type of udf. If it is null, spark would try to infer
+   * @param name Name of the UDF.
+   * @param className Fully qualified class name of the UDF.
+   * @param returnDataType Return type of UDF. If it is `null`, Spark would try to infer
    *                       via reflection.
+   *
+   * @since 4.0.0
    */
-  private[sql] def registerJava(name: String, className: String, returnDataType: DataType): Unit = {
+  def registerJava(name: String, className: String, returnDataType: DataType): Unit = {
Comment on lines -148 to +151
xupefei (Contributor Author) commented:
I have to make this method public so I can call it from REPL.

Contributor commented:
I am not against this. I am trying to understand the user facing consequences though. I'd probably prefer that we add support for Scala UDFs as well. That can be done in a follow-up though.

Contributor commented:
Can you file a follow-up?

xupefei (Contributor Author) commented:
Will do.

     try {
-      val clazz = Utils.classForName[AnyRef](className)
+      val clazz = session.artifactManager.classloader.loadClass(className)
       val udfInterfaces = clazz.getGenericInterfaces
         .filter(_.isInstanceOf[ParameterizedType])
         .map(_.asInstanceOf[ParameterizedType])
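A rough sketch of the caching follow-up discussed in the review thread above, assuming a cached loader that is invalidated whenever a new JAR is added (editor illustration only; `CachedSessionClassLoader` and its members are hypothetical names, not the actual ArtifactManager API):

import java.net.{URL, URLClassLoader}

// Cache the session classloader and drop the cached instance whenever a new JAR is added,
// so repeated registerJava/registerJavaUDAF calls do not rebuild the loader every time.
final class CachedSessionClassLoader(parent: ClassLoader) {
  private var jars: Vector[URL] = Vector.empty
  private var cached: ClassLoader = _

  // Returns the cached loader, building it lazily on first use.
  def classloader: ClassLoader = synchronized {
    if (cached == null) {
      cached = new URLClassLoader(jars.toArray, parent)
    }
    cached
  }

  // Adding a JAR invalidates the cache; the next lookup will see the new artifact.
  def addJar(url: URL): Unit = synchronized {
    jars = jars :+ url
    cached = null
  }
}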