From e77b633a75a074879712ccf4a841125f329d4602 Mon Sep 17 00:00:00 2001 From: albertshau Date: Thu, 20 Jul 2023 12:08:52 -0700 Subject: [PATCH 1/2] CDAP-20674 exclude scala jars from twill jar in Dataproc Exclude scala classes from the twill jar, as they are not used in Dataproc jobs. This ensures that the scala version from the cluster is used instead, which fixes scala incompatibility issues across various Dataproc versions. --- .../spi/runtimejob/DataprocJarUtil.java | 6 +- .../cdap/runtime/spi/DataprocJarUtilTest.java | 65 +++++++++++++++++++ 2 files changed, 70 insertions(+), 1 deletion(-) create mode 100644 cdap-runtime-ext-dataproc/src/test/java/io/cdap/cdap/runtime/spi/DataprocJarUtilTest.java diff --git a/cdap-runtime-ext-dataproc/src/main/java/io/cdap/cdap/runtime/spi/runtimejob/DataprocJarUtil.java b/cdap-runtime-ext-dataproc/src/main/java/io/cdap/cdap/runtime/spi/runtimejob/DataprocJarUtil.java index 25680b6aa388..a5a66bfff7b0 100644 --- a/cdap-runtime-ext-dataproc/src/main/java/io/cdap/cdap/runtime/spi/runtimejob/DataprocJarUtil.java +++ b/cdap-runtime-ext-dataproc/src/main/java/io/cdap/cdap/runtime/spi/runtimejob/DataprocJarUtil.java @@ -55,11 +55,15 @@ public static synchronized LocalFile getTwillJar(LocationFactory locationFactory return getLocalFile(location, true); } + // scala gets bundled in the twill jar because it's a Kafka dependency, + // but Kafka is not used in Dataproc jobs at all. Exclude it to make sure it doesn't + // clash with the scala on the cluster. + // For example, Dataproc 1.5 uses scala-libary 2.12.10, which is incompatible with 2.12.15 ApplicationBundler bundler = new ApplicationBundler(new ClassAcceptor() { @Override public boolean accept(String className, URL classUrl, URL classPathUrl) { return !className.startsWith("org.apache.hadoop") && !classPathUrl.toString() - .contains("spark-assembly"); + .contains("spark-assembly") && !classPathUrl.toString().contains("scala-library"); } }); bundler.createBundle(location, ImmutableList.of(ApplicationMasterMain.class, diff --git a/cdap-runtime-ext-dataproc/src/test/java/io/cdap/cdap/runtime/spi/DataprocJarUtilTest.java b/cdap-runtime-ext-dataproc/src/test/java/io/cdap/cdap/runtime/spi/DataprocJarUtilTest.java new file mode 100644 index 000000000000..88da296c03f4 --- /dev/null +++ b/cdap-runtime-ext-dataproc/src/test/java/io/cdap/cdap/runtime/spi/DataprocJarUtilTest.java @@ -0,0 +1,65 @@ +/* + * Copyright © 2023 Cask Data, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package io.cdap.cdap.runtime.spi; + +import io.cdap.cdap.runtime.spi.runtimejob.DataprocJarUtil; +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.util.HashSet; +import java.util.Set; +import java.util.jar.JarEntry; +import java.util.jar.JarInputStream; +import java.util.stream.Collectors; +import org.apache.twill.api.LocalFile; +import org.apache.twill.filesystem.LocalLocationFactory; +import org.apache.twill.filesystem.LocationFactory; +import org.junit.Assert; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +public class DataprocJarUtilTest { + + @ClassRule + public static final TemporaryFolder TMP_FOLDER = new TemporaryFolder(); + + @Test + public void testScalaLibraryNotPackaged() throws IOException { + LocationFactory locationFactory = new LocalLocationFactory(TMP_FOLDER.newFolder()); + LocalFile localFile = DataprocJarUtil.getTwillJar(locationFactory); + File f = new File(localFile.getURI()); + + Set scalaLibraries = new HashSet<>(); + try (JarInputStream jarInput = new JarInputStream(Files.newInputStream(f.toPath()))) { + JarEntry entry; + while ((entry = jarInput.getNextJarEntry()) != null) { + // CDAP-20674: when this test was added, scala-library and scala-parser-combinators were + // both getting pulled into the twill.jar (with just scala-library causing problems) + // It is possible for this test to start failing in the future if twill changes. + // If that happens, it may be ok to add more scala libraries as expected libraries, + // but it should be a conscious, tested, decision + if (entry.getName().contains("scala") && + !entry.getName().contains("scala-parser-combinators")) { + scalaLibraries.add(entry.getName()); + } + } + } + Assert.assertTrue(String.format("Unexpected scala libraries in twill jar: %s", + scalaLibraries.stream().collect(Collectors.joining())), scalaLibraries.isEmpty()); + } +} From eb4500f019ad79c22d4f8e4575e15259cfdfb51b Mon Sep 17 00:00:00 2001 From: albertshau Date: Wed, 26 Jul 2023 13:20:31 -0700 Subject: [PATCH 2/2] fix checkstyle variable naming --- .../cdap/cdap/runtime/spi/runtimejob/DataprocJarUtil.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cdap-runtime-ext-dataproc/src/main/java/io/cdap/cdap/runtime/spi/runtimejob/DataprocJarUtil.java b/cdap-runtime-ext-dataproc/src/main/java/io/cdap/cdap/runtime/spi/runtimejob/DataprocJarUtil.java index a5a66bfff7b0..0ef91a851578 100644 --- a/cdap-runtime-ext-dataproc/src/main/java/io/cdap/cdap/runtime/spi/runtimejob/DataprocJarUtil.java +++ b/cdap-runtime-ext-dataproc/src/main/java/io/cdap/cdap/runtime/spi/runtimejob/DataprocJarUtil.java @@ -106,10 +106,10 @@ public boolean accept(String className, URL classUrl, URL classPathUrl) { }, DataprocJobMain.class.getName()); // Add the logback-console.xml from resources - URL logbackURL = classLoader.getResource("logback-console.xml"); - if (logbackURL != null) { + URL logbackUrl = classLoader.getResource("logback-console.xml"); + if (logbackUrl != null) { jarOut.putNextEntry(new JarEntry("logback-console.xml")); - Resources.copy(logbackURL, jarOut); + Resources.copy(logbackUrl, jarOut); } } return getLocalFile(location, false);