From c01edaba281aa66de32ad3cc7cbafcfedaba09be Mon Sep 17 00:00:00 2001 From: mchades Date: Fri, 30 Aug 2024 13:23:24 +0800 Subject: [PATCH] [#4528] improvement(hive-catalog): reduce hive catalog libs size from 146MB to 43MB (#4531) ### What changes were proposed in this pull request? remove some unnecessary dependencies ### Why are the changes needed? Fix: #4528 ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? CI passed --- LICENSE.bin | 3 ++ .../authorization-ranger/build.gradle.kts | 16 ++++++--- catalogs/catalog-hive/build.gradle.kts | 36 ++++++++++++++++--- .../gravitino/catalog/hive/HiveTable.java | 5 ++- .../catalog/hive/HiveTableOperations.java | 13 ++++--- gradle/libs.versions.toml | 23 +++++++----- 6 files changed, 69 insertions(+), 27 deletions(-) diff --git a/LICENSE.bin b/LICENSE.bin index ca9218f771a..b56a0fdf42b 100644 --- a/LICENSE.bin +++ b/LICENSE.bin @@ -358,6 +358,8 @@ WildFly Confluent Kafka Streams Examples Apache Arrow + Rome + Jettison This product bundles various third-party components also under the Apache Software Foundation License 1.1 @@ -404,6 +406,7 @@ Common Development and Distribution License 1.0 Javax Activation + Javax Mail Stax API Java Servlet API JSR311 API diff --git a/authorizations/authorization-ranger/build.gradle.kts b/authorizations/authorization-ranger/build.gradle.kts index 51dc592a7ae..b197dc20cf4 100644 --- a/authorizations/authorization-ranger/build.gradle.kts +++ b/authorizations/authorization-ranger/build.gradle.kts @@ -31,11 +31,17 @@ dependencies { implementation(project(":core")) { exclude(group = "*") } + implementation(libs.bundles.log4j) implementation(libs.commons.lang3) implementation(libs.guava) - + implementation(libs.javax.jaxb.api) { + exclude("*") + } + implementation(libs.javax.ws.rs.api) + implementation(libs.jettison) compileOnly(libs.lombok) + implementation(libs.mail) implementation(libs.ranger.intg) { exclude("org.apache.hadoop", "hadoop-common") exclude("org.apache.hive", "hive-storage-api") @@ -50,11 +56,9 @@ dependencies { exclude("org.apache.ranger", "ranger-plugin-classloader") exclude("net.java.dev.jna") exclude("javax.ws.rs") + exclude("org.eclipse.jetty") } - implementation(libs.javax.ws.rs.api) - implementation(libs.javax.jaxb.api) { - exclude("*") - } + implementation(libs.rome) testImplementation(project(":common")) testImplementation(project(":clients:client-java")) @@ -70,6 +74,7 @@ dependencies { exclude("org.apache.lucene") exclude("org.apache.solr") exclude("org.apache.kafka") + exclude("org.eclipse.jetty") exclude("org.elasticsearch") exclude("org.elasticsearch.client") exclude("org.elasticsearch.plugin") @@ -78,6 +83,7 @@ dependencies { } testImplementation(libs.hive2.jdbc) { exclude("org.slf4j") + exclude("org.eclipse.jetty.aggregate") } testImplementation(libs.mysql.driver) } diff --git a/catalogs/catalog-hive/build.gradle.kts b/catalogs/catalog-hive/build.gradle.kts index 720428e0adc..776e9bf39c9 100644 --- a/catalogs/catalog-hive/build.gradle.kts +++ b/catalogs/catalog-hive/build.gradle.kts @@ -30,12 +30,24 @@ val icebergVersion: String = libs.versions.iceberg.get() val scalaCollectionCompatVersion: String = libs.versions.scala.collection.compat.get() dependencies { - implementation(project(":api")) - implementation(project(":catalogs:catalog-common")) - implementation(project(":core")) + implementation(project(":api")) { + exclude("*") + } + implementation(project(":catalogs:catalog-common")) { + exclude("*") + } + implementation(project(":core")) { + exclude("*") + } implementation(libs.caffeine) + implementation(libs.commons.collections3) + implementation(libs.commons.configuration1) + implementation(libs.htrace.core4) implementation(libs.guava) + implementation(libs.hadoop2.auth) { + exclude("*") + } implementation(libs.hive2.exec) { artifact { classifier = "core" @@ -43,19 +55,28 @@ dependencies { exclude("com.google.code.findbugs", "jsr305") exclude("com.google.protobuf") exclude("org.apache.avro") + exclude("org.apache.ant") exclude("org.apache.calcite") exclude("org.apache.calcite.avatica") exclude("org.apache.curator") + exclude("org.apache.derby") exclude("org.apache.hadoop", "hadoop-yarn-server-resourcemanager") + exclude("org.apache.hive", "hive-llap-tez") + exclude("org.apache.hive", "hive-vector-code-gen") + exclude("org.apache.ivy") exclude("org.apache.logging.log4j") exclude("org.apache.zookeeper") + exclude("org.codehaus.groovy", "groovy-all") + exclude("org.datanucleus", "datanucleus-core") exclude("org.eclipse.jetty.aggregate", "jetty-all") exclude("org.eclipse.jetty.orbit", "javax.servlet") exclude("org.openjdk.jol") exclude("org.pentaho") exclude("org.slf4j") } + implementation(libs.woodstox.core) implementation(libs.hive2.metastore) { + exclude("ant") exclude("co.cask.tephra") exclude("com.github.joshelser") exclude("com.google.code.findbugs", "jsr305") @@ -64,13 +85,16 @@ dependencies { exclude("com.zaxxer", "HikariCP") exclude("io.dropwizard.metricss") exclude("javax.transaction", "transaction-api") + exclude("org.apache.ant") exclude("org.apache.avro") exclude("org.apache.curator") + exclude("org.apache.derby") exclude("org.apache.hadoop", "hadoop-yarn-server-resourcemanager") exclude("org.apache.hbase") exclude("org.apache.logging.log4j") exclude("org.apache.parquet", "parquet-hadoop-bundle") exclude("org.apache.zookeeper") + exclude("org.datanucleus") exclude("org.eclipse.jetty.aggregate", "jetty-all") exclude("org.eclipse.jetty.orbit", "javax.servlet") exclude("org.openjdk.jol") @@ -135,7 +159,11 @@ tasks { val copyCatalogLibs by registering(Copy::class) { dependsOn("jar", "runtimeJars") - from("build/libs") + from("build/libs") { + exclude("guava-*.jar") + exclude("log4j-*.jar") + exclude("slf4j-*.jar") + } into("$rootDir/distribution/package/catalogs/hive/libs") } diff --git a/catalogs/catalog-hive/src/main/java/org/apache/gravitino/catalog/hive/HiveTable.java b/catalogs/catalog-hive/src/main/java/org/apache/gravitino/catalog/hive/HiveTable.java index f1c5f45fb4a..2108390c86e 100644 --- a/catalogs/catalog-hive/src/main/java/org/apache/gravitino/catalog/hive/HiveTable.java +++ b/catalogs/catalog-hive/src/main/java/org/apache/gravitino/catalog/hive/HiveTable.java @@ -31,7 +31,6 @@ import java.util.stream.Collectors; import java.util.stream.Stream; import lombok.ToString; -import org.apache.commons.collections.CollectionUtils; import org.apache.commons.lang3.ArrayUtils; import org.apache.gravitino.catalog.hive.converter.HiveDataTypeConverter; import org.apache.gravitino.connector.BaseTable; @@ -87,7 +86,7 @@ public static HiveTable.Builder fromHiveTable(Table table) { StorageDescriptor sd = table.getSd(); Distribution distribution = Distributions.NONE; - if (CollectionUtils.isNotEmpty(sd.getBucketCols())) { + if (sd.getBucketCols() != null && !sd.getBucketCols().isEmpty()) { // Hive table use hash strategy as bucketing strategy distribution = Distributions.hash( @@ -96,7 +95,7 @@ public static HiveTable.Builder fromHiveTable(Table table) { } SortOrder[] sortOrders = new SortOrder[0]; - if (CollectionUtils.isNotEmpty(sd.getSortCols())) { + if (sd.getSortCols() != null && !sd.getSortCols().isEmpty()) { sortOrders = sd.getSortCols().stream() .map( diff --git a/catalogs/catalog-hive/src/main/java/org/apache/gravitino/catalog/hive/HiveTableOperations.java b/catalogs/catalog-hive/src/main/java/org/apache/gravitino/catalog/hive/HiveTableOperations.java index 4ade7cdc7c9..a5ca0778da1 100644 --- a/catalogs/catalog-hive/src/main/java/org/apache/gravitino/catalog/hive/HiveTableOperations.java +++ b/catalogs/catalog-hive/src/main/java/org/apache/gravitino/catalog/hive/HiveTableOperations.java @@ -45,7 +45,6 @@ import org.apache.hadoop.hive.metastore.api.StorageDescriptor; import org.apache.hadoop.hive.metastore.api.Table; import org.apache.hadoop.hive.metastore.api.UnknownTableException; -import org.apache.parquet.Strings; import org.apache.thrift.TException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -163,19 +162,19 @@ public Partition addPartition(Partition partition) throws PartitionAlreadyExists Preconditions.checkArgument( transformFields.size() == identityPartition.fieldNames().length, "Hive partition field names must be the same as table partitioning field names: %s, but got %s", - Strings.join(transformFields, ","), - Strings.join( + String.join(",", transformFields), + String.join( + ",", Arrays.stream(identityPartition.fieldNames()) - .map(f -> Strings.join(f, ".")) - .collect(Collectors.toList()), - ",")); + .map(f -> String.join(".", f)) + .collect(Collectors.toList()))); Arrays.stream(identityPartition.fieldNames()) .forEach( f -> Preconditions.checkArgument( transformFields.contains(f[0]), "Hive partition field name must be in table partitioning field names: %s, but got %s", - Strings.join(transformFields, ","), + String.join(",", transformFields), f[0])); try { diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 8cc2b0f4813..f278e95a416 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -28,18 +28,19 @@ jetty = "9.4.51.v20230217" jersey = "2.41" mockito = "4.11.0" airlift-json = "237" -airlift-log = "231" airlift-resolver = "1.6" -airlift-units = "1.8" hive2 = "2.3.9" hadoop2 = "2.10.2" hadoop3 = "3.1.0" hadoop-minikdc = "3.3.6" +htrace-core4 = "4.1.0-incubating" httpclient5 = "5.2.1" mockserver = "5.15.0" commons-lang3 = "3.14.0" commons-io = "2.15.0" commons-collections4 = "4.4" +commons-collections3 = "3.2.2" +commons-configuration1 = "1.6" commons-dbcp2 = "2.11.0" caffeine = "2.9.3" rocksdbjni = "7.10.2" @@ -62,7 +63,6 @@ jline = "3.21.0" okhttp3 = "4.11.0" metrics = "4.2.25" prometheus = "0.16.0" -jsqlparser = "4.2" mysql = "8.0.23" postgresql = "42.6.0" immutables-value = "2.10.0" @@ -91,6 +91,10 @@ node-plugin = "7.0.1" commons-cli = "1.2" sun-activation-version = "1.2.0" error-prone = "3.1.0" +woodstox-core = "5.3.0" +mail = "1.4.1" +rome = "1.0" +jettison = "1.1" [libraries] protobuf-java = { group = "com.google.protobuf", name = "protobuf-java", version.ref = "protoc" } @@ -130,18 +134,17 @@ hive2-metastore = { group = "org.apache.hive", name = "hive-metastore", version. hive2-exec = { group = "org.apache.hive", name = "hive-exec", version.ref = "hive2"} hive2-common = { group = "org.apache.hive", name = "hive-common", version.ref = "hive2"} hive2-jdbc = { group = "org.apache.hive", name = "hive-jdbc", version.ref = "hive2"} +hadoop2-auth = { group = "org.apache.hadoop", name = "hadoop-auth", version.ref = "hadoop2" } hadoop2-hdfs = { group = "org.apache.hadoop", name = "hadoop-hdfs", version.ref = "hadoop2" } hadoop2-common = { group = "org.apache.hadoop", name = "hadoop-common", version.ref = "hadoop2"} hadoop2-mapreduce-client-core = { group = "org.apache.hadoop", name = "hadoop-mapreduce-client-core", version.ref = "hadoop2"} hadoop3-hdfs = { group = "org.apache.hadoop", name = "hadoop-hdfs", version.ref = "hadoop3" } hadoop3-common = { group = "org.apache.hadoop", name = "hadoop-common", version.ref = "hadoop3"} hadoop3-client = { group = "org.apache.hadoop", name = "hadoop-client", version.ref = "hadoop3"} -hadoop3-mapreduce-client-core = { group = "org.apache.hadoop", name = "hadoop-mapreduce-client-core", version.ref = "hadoop3"} hadoop3-minicluster = { group = "org.apache.hadoop", name = "hadoop-minicluster", version.ref = "hadoop-minikdc"} +htrace-core4 = { group = "org.apache.htrace", name = "htrace-core4", version.ref = "htrace-core4" } airlift-json = { group = "io.airlift", name = "json", version.ref = "airlift-json"} airlift-resolver = { group = "io.airlift.resolver", name = "resolver", version.ref = "airlift-resolver"} -airlift-units = { group = "io.airlift", name = "units", version.ref = "airlift-units"} -airlift-log = { group = "io.airlift", name = "log", version.ref = "airlift-log"} httpclient5 = { group = "org.apache.httpcomponents.client5", name = "httpclient5", version.ref = "httpclient5" } mockserver-netty = { group = "org.mock-server", name = "mockserver-netty", version.ref = "mockserver" } mockserver-client-java = { group = "org.mock-server", name = "mockserver-client-java", version.ref = "mockserver" } @@ -150,6 +153,8 @@ commons-io = { group = "commons-io", name = "commons-io", version.ref = "commons caffeine = { group = "com.github.ben-manes.caffeine", name = "caffeine", version.ref = "caffeine" } rocksdbjni = { group = "org.rocksdb", name = "rocksdbjni", version.ref = "rocksdbjni" } commons-collections4 = { group = "org.apache.commons", name = "commons-collections4", version.ref = "commons-collections4" } +commons-collections3 = { group = "commons-collections", name = "commons-collections", version.ref = "commons-collections3" } +commons-configuration1 = { group = "commons-configuration", name = "commons-configuration", version.ref = "commons-configuration1" } iceberg-aws = { group = "org.apache.iceberg", name = "iceberg-aws", version.ref = "iceberg" } iceberg-core = { group = "org.apache.iceberg", name = "iceberg-core", version.ref = "iceberg" } iceberg-api = { group = "org.apache.iceberg", name = "iceberg-api", version.ref = "iceberg" } @@ -158,7 +163,6 @@ paimon-core = { group = "org.apache.paimon", name = "paimon-core", version.ref = paimon-format = { group = "org.apache.paimon", name = "paimon-format", version.ref = "paimon" } paimon-hive-catalog = { group = "org.apache.paimon", name = "paimon-hive-catalog", version.ref = "paimon" } trino-spi= { group = "io.trino", name = "trino-spi", version.ref = "trino" } -trino-toolkit= { group = "io.trino", name = "trino-plugin-toolkit", version.ref = "trino" } trino-testing= { group = "io.trino", name = "trino-testing", version.ref = "trino" } trino-memory= { group = "io.trino", name = "trino-memory", version.ref = "trino" } trino-cli= { group = "io.trino", name = "trino-cli", version.ref = "trino" } @@ -183,7 +187,6 @@ metrics-servlets = { group = "io.dropwizard.metrics", name = "metrics-servlets", prometheus-client = { group = "io.prometheus", name = "simpleclient", version.ref = "prometheus" } prometheus-dropwizard = { group = "io.prometheus", name = "simpleclient_dropwizard", version.ref = "prometheus" } prometheus-servlet = { group = "io.prometheus", name = "simpleclient_servlet", version.ref = "prometheus" } -jsqlparser = { group = "com.github.jsqlparser", name = "jsqlparser", version.ref = "jsqlparser" } mysql-driver = { group = "mysql", name = "mysql-connector-java", version.ref = "mysql" } postgresql-driver = { group = "org.postgresql", name = "postgresql", version.ref = "postgresql" } minikdc = { group = "org.apache.hadoop", name = "hadoop-minikdc", version.ref = "hadoop-minikdc"} @@ -194,6 +197,7 @@ kafka-clients = { group = "org.apache.kafka", name = "kafka-clients", version.re kafka = { group = "org.apache.kafka", name = "kafka_2.12", version.ref = "kafka" } curator-test = { group = "org.apache.curator", name = "curator-test", version.ref = "curator"} cglib = { group = "cglib", name = "cglib", version.ref = "cglib"} +woodstox-core = { group = "com.fasterxml.woodstox", name = "woodstox-core", version.ref = "woodstox-core"} ranger-intg = { group = "org.apache.ranger", name = "ranger-intg", version.ref = "ranger" } javax-jaxb-api = { group = "javax.xml.bind", name = "jaxb-api", version.ref = "javax-jaxb-api" } @@ -204,6 +208,9 @@ mybatis = { group = "org.mybatis", name = "mybatis", version.ref = "mybatis"} h2db = { group = "com.h2database", name = "h2", version.ref = "h2db"} awaitility = { group = "org.awaitility", name = "awaitility", version.ref = "awaitility" } servlet = { group = "javax.servlet", name = "javax.servlet-api", version.ref = "servlet" } +mail = { group = "javax.mail", name = "mail", version.ref = "mail" } +rome = { group = "rome", name = "rome", version.ref = "rome" } +jettison = { group = "org.codehaus.jettison", name = "jettison", version.ref = "jettison" } [bundles] log4j = ["slf4j-api", "log4j-slf4j2-impl", "log4j-api", "log4j-core", "log4j-12-api"]