Skip to content

Commit

Permalink
[#4367] feat(iceberg): add OSS support for Gravitino Iceberg REST ser…
Browse files Browse the repository at this point in the history
…ver (#4597)

### What changes were proposed in this pull request?
add OSS support for Gravitino Iceberg REST server

### Why are the changes needed?

Fix: #4367

### Does this PR introduce _any_ user-facing change?
Yes; documentation will be added in a separate PR.

### How was this patch tested?
1. set up an Iceberg REST server with the following configurations
```
gravitino.iceberg-rest.catalog-backend = jdbc
gravitino.iceberg-rest.jdbc-driver = org.postgresql.Driver
gravitino.iceberg-rest.uri = jdbc:postgresql://127.0.0.1:5432/postgres
gravitino.iceberg-rest.jdbc-user = postgres
gravitino.iceberg-rest.jdbc-password = xxx
gravitino.iceberg-rest.jdbc-initialize = true
# change to s3a://test/my/key/prefix for Hive catalog backend
gravitino.iceberg-rest.warehouse = oss://xxx/key/prefix
gravitino.iceberg-rest.io-impl= org.apache.iceberg.aliyun.oss.OSSFileIO
gravitino.iceberg-rest.oss-access-key-id = xx
gravitino.iceberg-rest.oss-access-key-secret = xx
gravitino.iceberg-rest.oss-endpoint = https://oss-cn-beijing.aliyuncs.com
```
2. test with Spark SQL
  • Loading branch information
FANNG1 authored Sep 2, 2024
1 parent 5e0d772 commit 3ba355e
Show file tree
Hide file tree
Showing 11 changed files with 107 additions and 6 deletions.
5 changes: 5 additions & 0 deletions LICENSE.bin
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,11 @@
Apache HttpCore
Apache HttpClient
Apache Iceberg
Apache Iceberg Aliyun
Apache Iceberg api
Apache Iceberg AWS
Apache Iceberg core
Apache Iceberg Hive metastore
Apache Ivy
Apache Log4j 1.x Compatibility API
Apache Log4j API
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,13 @@ public class IcebergConstants {
public static final String GRAVITINO_S3_REGION = "s3-region";
public static final String AWS_S3_REGION = "client.region";

public static final String GRAVITINO_OSS_ENDPOINT = "oss-endpoint";
public static final String ICEBERG_OSS_ENDPOINT = "oss.endpoint";
public static final String GRAVITINO_OSS_ACCESS_KEY_ID = "oss-access-key-id";
public static final String ICEBERG_OSS_ACCESS_KEY_ID = "client.access-key-id";
public static final String GRAVITINO_OSS_ACCESS_KEY_SECRET = "oss-access-key-secret";
public static final String ICEBERG_OSS_ACCESS_KEY_SECRET = "client.access-key-secret";

// Iceberg Table properties constants

public static final String COMMENT = "comment";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,20 @@ public class IcebergPropertiesUtils {
map.put(IcebergConstants.WAREHOUSE, IcebergConstants.WAREHOUSE);
map.put(IcebergConstants.CATALOG_BACKEND_NAME, IcebergConstants.CATALOG_BACKEND_NAME);
map.put(IcebergConstants.IO_IMPL, IcebergConstants.IO_IMPL);
// S3
map.put(IcebergConstants.GRAVITINO_S3_ENDPOINT, IcebergConstants.ICEBERG_S3_ENDPOINT);
map.put(IcebergConstants.GRAVITINO_S3_REGION, IcebergConstants.AWS_S3_REGION);
map.put(IcebergConstants.GRAVITINO_S3_ACCESS_KEY_ID, IcebergConstants.ICEBERG_S3_ACCESS_KEY_ID);
map.put(
IcebergConstants.GRAVITINO_S3_SECRET_ACCESS_KEY,
IcebergConstants.ICEBERG_S3_SECRET_ACCESS_KEY);
// OSS
map.put(IcebergConstants.GRAVITINO_OSS_ENDPOINT, IcebergConstants.ICEBERG_OSS_ENDPOINT);
map.put(
IcebergConstants.GRAVITINO_OSS_ACCESS_KEY_ID, IcebergConstants.ICEBERG_OSS_ACCESS_KEY_ID);
map.put(
IcebergConstants.GRAVITINO_OSS_ACCESS_KEY_SECRET,
IcebergConstants.ICEBERG_OSS_ACCESS_KEY_SECRET);
GRAVITINO_CONFIG_TO_ICEBERG = Collections.unmodifiableMap(map);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,18 @@ public class IcebergCatalogPropertiesMetadata extends BaseCatalogPropertiesMetad
"s3 secret-access-key",
false /* immutable */,
null /* defaultValue */,
true /* hidden */),
stringOptionalPropertyEntry(
IcebergConstants.GRAVITINO_OSS_ACCESS_KEY_ID,
"OSS access-key-id",
false /* immutable */,
null /* defaultValue */,
true /* hidden */),
stringOptionalPropertyEntry(
IcebergConstants.GRAVITINO_OSS_ACCESS_KEY_SECRET,
"OSS access-key-secret",
false /* immutable */,
null /* defaultValue */,
true /* hidden */));
HashMap<String, PropertyEntry<?>> result = Maps.newHashMap();
result.putAll(Maps.uniqueIndex(propertyEntries, PropertyEntry::getName));
Expand Down
24 changes: 21 additions & 3 deletions docs/iceberg-rest-service.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,10 @@ The Apache Gravitino Iceberg REST Server follows the [Apache Iceberg REST API sp
- multi table transaction
- pagination
- Works as a catalog proxy, supporting `Hive` and `JDBC` as catalog backend.
- Supports HDFS and S3 storage.
- Supports multiple storage systems.
- HDFS
- S3
- OSS
- Supports OAuth2 and HTTPS.
- Provides a pluggable metrics store interface to store and delete Iceberg metrics.

Expand Down Expand Up @@ -100,8 +103,6 @@ The detailed configuration items are as follows:

### Storage

Gravitino Iceberg REST server supports S3 and HDFS for storage.

#### S3 configuration

Gravitino Iceberg REST service supports using static access-key-id and secret-access-key to access S3 data.
Expand All @@ -120,6 +121,23 @@ For other Iceberg s3 properties not managed by Gravitino like `s3.sse.type`, you
To configure the JDBC catalog backend, set the `gravitino.iceberg-rest.warehouse` parameter to `s3://{bucket_name}/${prefix_name}`. For the Hive catalog backend, set `gravitino.iceberg-rest.warehouse` to `s3a://{bucket_name}/${prefix_name}`. Additionally, download the [Iceberg AWS bundle](https://mvnrepository.com/artifact/org.apache.iceberg/iceberg-aws-bundle) and place it in the classpath of Iceberg REST server.
:::

#### OSS configuration

Gravitino Iceberg REST service supports using static access-key-id and secret-access-key to access OSS data.

| Configuration item | Description | Default value | Required | Since Version |
|------------------------------------------------|-------------------------------------------------------------------------------------------------------|---------------|----------|---------------|
| `gravitino.iceberg-rest.io-impl` | The IO implementation for `FileIO` in Iceberg, use `org.apache.iceberg.aliyun.oss.OSSFileIO` for OSS. | (none) | No | 0.6.0 |
| `gravitino.iceberg-rest.oss-access-key-id` | The static access key ID used to access OSS data. | (none) | No | 0.7.0 |
| `gravitino.iceberg-rest.oss-access-key-secret` | The static secret access key used to access OSS data.                                                   | (none)        | No       | 0.7.0         |
| `gravitino.iceberg-rest.oss-endpoint` | The endpoint of Aliyun OSS service. | (none) | No | 0.7.0 |

For other Iceberg OSS properties not managed by Gravitino like `client.security-token`, you could config it directly by `gravitino.iceberg-rest.client.security-token`.

:::info
Please set the `gravitino.iceberg-rest.warehouse` parameter to `oss://{bucket_name}/${prefix_name}`. Additionally, download the [Aliyun OSS SDK](https://gosspublic.alicdn.com/sdks/java/aliyun_java_sdk_3.10.2.zip) and copy `aliyun-sdk-oss-3.10.2.jar`, `hamcrest-core-1.1.jar`, `jdom2-2.0.6.jar` in the classpath of Iceberg REST server, `iceberg-rest-server/libs` for the auxiliary server, `libs` for the standalone server.
:::

#### HDFS configuration

You should place HDFS configuration file to the classpath of the Iceberg REST server, `iceberg-rest-server/conf` for Gravitino server package, `conf` for standalone Gravitino Iceberg REST server package. When writing to HDFS, the Gravitino Iceberg REST catalog service can only operate as the specified HDFS user and doesn't support proxying to other HDFS users. See [How to access Apache Hadoop](gravitino-server-config.md#how-to-access-apache-hadoop) for more details.
Expand Down
22 changes: 21 additions & 1 deletion docs/lakehouse-iceberg-catalog.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,10 @@ Builds with Apache Iceberg `1.5.2`. The Apache Iceberg table format version is `
- Works as a catalog proxy, supporting `Hive`, `JDBC` and `REST` as catalog backend.
- Supports DDL operations for Iceberg schemas and tables.
- Doesn't support snapshot or table management operations.
- Supports S3 and HDFS storage.
- Supports multiple storage systems.
- S3
- HDFS
- OSS
- Supports Kerberos or simple authentication for Iceberg catalog with Hive backend.

### Catalog properties
Expand Down Expand Up @@ -83,6 +86,23 @@ For other Iceberg s3 properties not managed by Gravitino like `s3.sse.type`, you
To configure the JDBC catalog backend, set the `warehouse` parameter to `s3://{bucket_name}/${prefix_name}`. For the Hive catalog backend, set `warehouse` to `s3a://{bucket_name}/${prefix_name}`. Additionally, download the [Iceberg AWS bundle](https://mvnrepository.com/artifact/org.apache.iceberg/iceberg-aws-bundle) and place it in the `catalogs/lakehouse-iceberg/libs/` directory.
:::

#### OSS

The Gravitino Iceberg catalog supports using a static access-key-id and access-key-secret to access OSS data.

| Configuration item | Description | Default value | Required | Since Version |
|-------------------------|-------------------------------------------------------------------------------------------------------|---------------|----------|---------------|
| `io-impl` | The IO implementation for `FileIO` in Iceberg, use `org.apache.iceberg.aliyun.oss.OSSFileIO` for OSS. | (none) | No | 0.6.0 |
| `oss-access-key-id` | The static access key ID used to access OSS data. | (none) | No | 0.7.0 |
| `oss-access-key-secret` | The static secret access key used to access OSS data.                                                 | (none)        | No       | 0.7.0         |
| `oss-endpoint` | The endpoint of Aliyun OSS service. | (none) | No | 0.7.0 |

For other Iceberg OSS properties not managed by Gravitino like `client.security-token`, you could config it directly by `gravitino.bypass.client.security-token`.

:::info
Please set the `warehouse` parameter to `oss://{bucket_name}/${prefix_name}`. Additionally, download the [Aliyun OSS SDK](https://gosspublic.alicdn.com/sdks/java/aliyun_java_sdk_3.10.2.zip) and copy `aliyun-sdk-oss-3.10.2.jar`, `hamcrest-core-1.1.jar`, `jdom2-2.0.6.jar` in the `catalogs/lakehouse-iceberg/libs/` directory.
:::

#### Catalog backend security

Users can use the following properties to configure the security of the catalog backend if needed. For example, if you are using a Kerberos Hive catalog backend, you must set `authentication.type` to `Kerberos` and provide `authentication.kerberos.principal` and `authentication.kerberos.keytab-uri`.
Expand Down
7 changes: 6 additions & 1 deletion docs/spark-connector/spark-catalog-iceberg.md
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ Gravitino spark connector will transform below property names which are defined
| `io-impl` | `io-impl` | The io implementation for `FileIO` in Iceberg. | 0.6.0 |
| `s3-endpoint` | `s3.endpoint` | An alternative endpoint of the S3 service, This could be used for S3FileIO with any s3-compatible object storage service that has a different endpoint, or access a private S3 endpoint in a virtual private cloud. | 0.6.0 |
| `s3-region` | `client.region` | The region of the S3 service, like `us-west-2`. | 0.6.0 |
| `oss-endpoint` | `oss.endpoint` | The endpoint of Aliyun OSS service. | 0.7.0 |

Gravitino catalog property names with the prefix `spark.bypass.` are passed to Spark Iceberg connector. For example, using `spark.bypass.clients` to pass the `clients` to the Spark Iceberg connector.

Expand All @@ -122,4 +123,8 @@ Iceberg catalog property `cache-enabled` is setting to `false` internally and no

### S3

You need to add s3 secret to the Spark configuration using `spark.sql.catalog.${iceberg_catalog_name}.s3.access-key-id` and `spark.sql.catalog.${iceberg_catalog_name}.s3.secret-access-key`. Additionally, download the [Iceberg AWS bundle](https://mvnrepository.com/artifact/org.apache.iceberg/iceberg-aws-bundle) and place it in the classpath of Spark.
You need to add s3 secret to the Spark configuration using `spark.sql.catalog.${iceberg_catalog_name}.s3.access-key-id` and `spark.sql.catalog.${iceberg_catalog_name}.s3.secret-access-key`. Additionally, download the [Iceberg AWS bundle](https://mvnrepository.com/artifact/org.apache.iceberg/iceberg-aws-bundle) and place it in the classpath of Spark.

### OSS

You need to add OSS secret key to the Spark configuration using `spark.sql.catalog.${iceberg_catalog_name}.client.access-key-id` and `spark.sql.catalog.${iceberg_catalog_name}.client.access-key-secret`. Additionally, download the [Aliyun OSS SDK](https://gosspublic.alicdn.com/sdks/java/aliyun_java_sdk_3.10.2.zip) and copy `aliyun-sdk-oss-3.10.2.jar`, `hamcrest-core-1.1.jar`, `jdom2-2.0.6.jar` in the classpath of Spark.
1 change: 1 addition & 0 deletions gradle/libs.versions.toml
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ rocksdbjni = { group = "org.rocksdb", name = "rocksdbjni", version.ref = "rocksd
commons-collections4 = { group = "org.apache.commons", name = "commons-collections4", version.ref = "commons-collections4" }
commons-collections3 = { group = "commons-collections", name = "commons-collections", version.ref = "commons-collections3" }
commons-configuration1 = { group = "commons-configuration", name = "commons-configuration", version.ref = "commons-configuration1" }
iceberg-aliyun = { group = "org.apache.iceberg", name = "iceberg-aliyun", version.ref = "iceberg" }
iceberg-aws = { group = "org.apache.iceberg", name = "iceberg-aws", version.ref = "iceberg" }
iceberg-core = { group = "org.apache.iceberg", name = "iceberg-core", version.ref = "iceberg" }
iceberg-api = { group = "org.apache.iceberg", name = "iceberg-api", version.ref = "iceberg" }
Expand Down
1 change: 1 addition & 0 deletions iceberg/iceberg-common/build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ dependencies {
implementation(libs.caffeine)
implementation(libs.commons.lang3)
implementation(libs.guava)
implementation(libs.iceberg.aliyun)
implementation(libs.iceberg.aws)
implementation(libs.iceberg.hive.metastore)
implementation(libs.hadoop2.common) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,27 @@ public class IcebergConfig extends Config implements OverwriteDefaultConfig {
.stringConf()
.create();

public static final ConfigEntry<String> OSS_ENDPOINT =
new ConfigBuilder(IcebergConstants.GRAVITINO_OSS_ENDPOINT)
.doc("The endpoint of Aliyun OSS service")
.version(ConfigConstants.VERSION_0_7_0)
.stringConf()
.create();

public static final ConfigEntry<String> OSS_ACCESS_KEY_ID =
new ConfigBuilder(IcebergConstants.GRAVITINO_OSS_ACCESS_KEY_ID)
.doc("The static access key ID used to access OSS data")
.version(ConfigConstants.VERSION_0_7_0)
.stringConf()
.create();

public static final ConfigEntry<String> OSS_ACCESS_KEY_SECRET =
new ConfigBuilder(IcebergConstants.GRAVITINO_OSS_ACCESS_KEY_SECRET)
.doc("The static secret access key used to access OSS data")
.version(ConfigConstants.VERSION_0_7_0)
.stringConf()
.create();

public static final ConfigEntry<String> ICEBERG_METRICS_STORE =
new ConfigBuilder(IcebergConstants.ICEBERG_METRICS_STORE)
.doc("The store to save Iceberg metrics")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,10 @@ public class IcebergTableOps implements AutoCloseable {
IcebergConstants.AWS_S3_REGION,
IcebergConstants.ICEBERG_S3_ACCESS_KEY_ID,
IcebergConstants.ICEBERG_S3_SECRET_ACCESS_KEY,
IcebergConstants.ICEBERG_S3_ENDPOINT);
IcebergConstants.ICEBERG_S3_ENDPOINT,
IcebergConstants.ICEBERG_OSS_ENDPOINT,
IcebergConstants.ICEBERG_OSS_ACCESS_KEY_ID,
IcebergConstants.ICEBERG_OSS_ACCESS_KEY_SECRET);

public IcebergTableOps(IcebergConfig icebergConfig) {
this.catalogBackend =
Expand Down

0 comments on commit 3ba355e

Please sign in to comment.