diff --git a/cdap-runtime-ext-dataproc/src/main/java/io/cdap/cdap/runtime/spi/runtimejob/DataprocJarUtil.java b/cdap-runtime-ext-dataproc/src/main/java/io/cdap/cdap/runtime/spi/runtimejob/DataprocJarUtil.java index 25680b6aa388..0ef91a851578 100644 --- a/cdap-runtime-ext-dataproc/src/main/java/io/cdap/cdap/runtime/spi/runtimejob/DataprocJarUtil.java +++ b/cdap-runtime-ext-dataproc/src/main/java/io/cdap/cdap/runtime/spi/runtimejob/DataprocJarUtil.java @@ -55,11 +55,15 @@ public static synchronized LocalFile getTwillJar(LocationFactory locationFactory return getLocalFile(location, true); } + // scala gets bundled in the twill jar because it's a Kafka dependency, + // but Kafka is not used in Dataproc jobs at all. Exclude it to make sure it doesn't + // clash with the scala on the cluster. + // For example, Dataproc 1.5 uses scala-libary 2.12.10, which is incompatible with 2.12.15 ApplicationBundler bundler = new ApplicationBundler(new ClassAcceptor() { @Override public boolean accept(String className, URL classUrl, URL classPathUrl) { return !className.startsWith("org.apache.hadoop") && !classPathUrl.toString() - .contains("spark-assembly"); + .contains("spark-assembly") && !classPathUrl.toString().contains("scala-library"); } }); bundler.createBundle(location, ImmutableList.of(ApplicationMasterMain.class, @@ -102,10 +106,10 @@ public boolean accept(String className, URL classUrl, URL classPathUrl) { }, DataprocJobMain.class.getName()); // Add the logback-console.xml from resources - URL logbackURL = classLoader.getResource("logback-console.xml"); - if (logbackURL != null) { + URL logbackUrl = classLoader.getResource("logback-console.xml"); + if (logbackUrl != null) { jarOut.putNextEntry(new JarEntry("logback-console.xml")); - Resources.copy(logbackURL, jarOut); + Resources.copy(logbackUrl, jarOut); } } return getLocalFile(location, false); diff --git a/cdap-runtime-ext-dataproc/src/test/java/io/cdap/cdap/runtime/spi/DataprocJarUtilTest.java b/cdap-runtime-ext-dataproc/src/test/java/io/cdap/cdap/runtime/spi/DataprocJarUtilTest.java new file mode 100644 index 000000000000..88da296c03f4 --- /dev/null +++ b/cdap-runtime-ext-dataproc/src/test/java/io/cdap/cdap/runtime/spi/DataprocJarUtilTest.java @@ -0,0 +1,65 @@ +/* + * Copyright © 2023 Cask Data, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package io.cdap.cdap.runtime.spi; + +import io.cdap.cdap.runtime.spi.runtimejob.DataprocJarUtil; +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.util.HashSet; +import java.util.Set; +import java.util.jar.JarEntry; +import java.util.jar.JarInputStream; +import java.util.stream.Collectors; +import org.apache.twill.api.LocalFile; +import org.apache.twill.filesystem.LocalLocationFactory; +import org.apache.twill.filesystem.LocationFactory; +import org.junit.Assert; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +public class DataprocJarUtilTest { + + @ClassRule + public static final TemporaryFolder TMP_FOLDER = new TemporaryFolder(); + + @Test + public void testScalaLibraryNotPackaged() throws IOException { + LocationFactory locationFactory = new LocalLocationFactory(TMP_FOLDER.newFolder()); + LocalFile localFile = DataprocJarUtil.getTwillJar(locationFactory); + File f = new File(localFile.getURI()); + + Set scalaLibraries = new HashSet<>(); + try (JarInputStream jarInput = new JarInputStream(Files.newInputStream(f.toPath()))) { + JarEntry entry; + while ((entry = jarInput.getNextJarEntry()) != null) { + // CDAP-20674: when this test was added, scala-library and scala-parser-combinators were + // both getting pulled into the twill.jar (with just scala-library causing problems) + // It is possible for this test to start failing in the future if twill changes. + // If that happens, it may be ok to add more scala libraries as expected libraries, + // but it should be a conscious, tested, decision + if (entry.getName().contains("scala") && + !entry.getName().contains("scala-parser-combinators")) { + scalaLibraries.add(entry.getName()); + } + } + } + Assert.assertTrue(String.format("Unexpected scala libraries in twill jar: %s", + scalaLibraries.stream().collect(Collectors.joining())), scalaLibraries.isEmpty()); + } +}