[fix](glue) Support accessing Glue Iceberg with a credential list
Support accessing Glue and S3 Iceberg with a credential list.
Support the Iceberg Hadoop catalog on S3.

(cherry picked from commit ef85d0d)
wsjz committed Mar 18, 2024
1 parent b0a9aaf commit c7dbe62
Showing 16 changed files with 234 additions and 50 deletions.
42 changes: 26 additions & 16 deletions be/src/util/s3_util.cpp
@@ -19,6 +19,7 @@

#include <aws/core/auth/AWSAuthSigner.h>
#include <aws/core/auth/AWSCredentials.h>
#include <aws/core/auth/AWSCredentialsProviderChain.h>
#include <aws/core/utils/logging/LogLevel.h>
#include <aws/core/utils/logging/LogSystemInterface.h>
#include <aws/core/utils/memory/stl/AWSStringStream.h>
@@ -111,8 +112,7 @@ S3ClientFactory& S3ClientFactory::instance() {

bool S3ClientFactory::is_s3_conf_valid(const std::map<std::string, std::string>& prop) {
StringCaseMap<std::string> properties(prop.begin(), prop.end());
if (properties.find(S3_AK) == properties.end() || properties.find(S3_SK) == properties.end() ||
properties.find(S3_ENDPOINT) == properties.end() ||
if (properties.find(S3_ENDPOINT) == properties.end() ||
properties.find(S3_REGION) == properties.end()) {
DCHECK(false) << "aws properties is incorrect.";
LOG(ERROR) << "aws properties is incorrect.";
@@ -122,7 +122,7 @@ bool S3ClientFactory::is_s3_conf_valid(const std::map<std::string, std::string>&
}

bool S3ClientFactory::is_s3_conf_valid(const S3Conf& s3_conf) {
return !s3_conf.ak.empty() && !s3_conf.sk.empty() && !s3_conf.endpoint.empty();
return !s3_conf.endpoint.empty();
}

std::shared_ptr<Aws::S3::S3Client> S3ClientFactory::create(const S3Conf& s3_conf) {
@@ -139,12 +139,6 @@ std::shared_ptr<Aws::S3::S3Client> S3ClientFactory::create(const S3Conf& s3_conf
}
}

Aws::Auth::AWSCredentials aws_cred(s3_conf.ak, s3_conf.sk);
DCHECK(!aws_cred.IsExpiredOrEmpty());
if (!s3_conf.token.empty()) {
aws_cred.SetSessionToken(s3_conf.token);
}

Aws::Client::ClientConfiguration aws_config = S3ClientFactory::getClientConfiguration();
aws_config.endpointOverride = s3_conf.endpoint;
aws_config.region = s3_conf.region;
@@ -167,11 +161,25 @@ std::shared_ptr<Aws::S3::S3Client> S3ClientFactory::create(const S3Conf& s3_conf
if (s3_conf.connect_timeout_ms > 0) {
aws_config.connectTimeoutMs = s3_conf.connect_timeout_ms;
}

std::shared_ptr<Aws::S3::S3Client> new_client = std::make_shared<Aws::S3::S3Client>(
std::move(aws_cred), std::move(aws_config),
Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never,
s3_conf.use_virtual_addressing);
std::shared_ptr<Aws::S3::S3Client> new_client;
if (!s3_conf.ak.empty() && !s3_conf.sk.empty()) {
Aws::Auth::AWSCredentials aws_cred(s3_conf.ak, s3_conf.sk);
DCHECK(!aws_cred.IsExpiredOrEmpty());
if (!s3_conf.token.empty()) {
aws_cred.SetSessionToken(s3_conf.token);
}
new_client = std::make_shared<Aws::S3::S3Client>(
std::move(aws_cred), std::move(aws_config),
Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never,
s3_conf.use_virtual_addressing);
} else {
std::shared_ptr<Aws::Auth::AWSCredentialsProvider> aws_provider_chain =
std::make_shared<Aws::Auth::DefaultAWSCredentialsProviderChain>();
new_client = std::make_shared<Aws::S3::S3Client>(
std::move(aws_provider_chain), std::move(aws_config),
Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never,
s3_conf.use_virtual_addressing);
}

{
std::lock_guard l(_lock);
@@ -186,8 +194,10 @@ Status S3ClientFactory::convert_properties_to_s3_conf(
return Status::InvalidArgument("S3 properties are incorrect, please check properties.");
}
StringCaseMap<std::string> properties(prop.begin(), prop.end());
s3_conf->ak = properties.find(S3_AK)->second;
s3_conf->sk = properties.find(S3_SK)->second;
if (properties.find(S3_AK) != properties.end() && properties.find(S3_SK) != properties.end()) {
s3_conf->ak = properties.find(S3_AK)->second;
s3_conf->sk = properties.find(S3_SK)->second;
}
if (properties.find(S3_TOKEN) != properties.end()) {
s3_conf->token = properties.find(S3_TOKEN)->second;
}
13 changes: 1 addition & 12 deletions bin/start_be.sh
@@ -36,18 +36,13 @@ OPTS="$(getopt \
eval set -- "${OPTS}"

RUN_DAEMON=0
RUN_IN_AWS=0
RUN_CONSOLE=0
while true; do
case "$1" in
--daemon)
RUN_DAEMON=1
shift
;;
--aws)
RUN_IN_AWS=1
shift
;;
--console)
RUN_CONSOLE=1
shift
@@ -242,10 +237,7 @@ else
LIMIT="/bin/limit3 -c 0 -n 65536"
fi

## If you are not running in aws cloud, disable this env since https://github.com/aws/aws-sdk-cpp/issues/1410.
if [[ "${RUN_IN_AWS}" -eq 0 ]]; then
export AWS_EC2_METADATA_DISABLED=true
fi
export AWS_MAX_ATTEMPTS=2

## set asan and ubsan env to generate core file
export ASAN_OPTIONS=symbolize=1:abort_on_error=1:disable_coredump=0:unmap_shadow_on_exit=1:detect_container_overflow=0
@@ -349,9 +341,6 @@ else
export JEMALLOC_CONF="${JEMALLOC_CONF},prof_prefix:${JEMALLOC_PROF_PRFIX}"
fi

export AWS_EC2_METADATA_DISABLED=true
export AWS_MAX_ATTEMPTS=2

if [[ "${RUN_DAEMON}" -eq 1 ]]; then
nohup ${LIMIT:+${LIMIT}} "${DORIS_HOME}/lib/doris_be" "$@" >>"${LOG_DIR}/be.out" 2>&1 </dev/null &
elif [[ "${RUN_CONSOLE}" -eq 1 ]]; then
13 changes: 13 additions & 0 deletions conf/be.conf
@@ -83,3 +83,16 @@ enable_auth = false
# sys_log_verbose_modules = *
# log_buffer_level = -1
# palo_cgroups

# aws sdk log level
# Off = 0,
# Fatal = 1,
# Error = 2,
# Warn = 3,
# Info = 4,
# Debug = 5,
# Trace = 6
# Defaults to off (0), because AWS SDK errors that need attention are output through Doris logs
aws_log_level=0
## If you are not running in the AWS cloud, you can disable EC2 metadata access
AWS_EC2_METADATA_DISABLED=true
4 changes: 4 additions & 0 deletions docs/dev.json
@@ -311,6 +311,10 @@
"message": "多源数据目录",
"description": "The label for category Lakehouse.Multi Catalog in sidebar docs"
},
"sidebar.docs.category.Cloud Service Authentication": {
"message": "云服务认证接入",
"description": "The label for category Lakehouse.Cloud Service Authentication in sidebar docs"
},
"sidebar.docs.category.External Table": {
"message": "外部表",
"description": "The label for category Lakehouse.External Table in sidebar docs"
61 changes: 61 additions & 0 deletions docs/en/docs/lakehouse/cloud-auth/cloud-auth.md
@@ -0,0 +1,61 @@
---
{
"title": "Cloud Service Authentication",
"language": "en"
}
---

<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->

# Overview

When accessing a cloud service, we need to provide the credentials required by that service so that the request can be authenticated by the cloud vendor's IAM.

## AWS

Doris currently supports two types of authentication for accessing AWS services.

### Catalog Credentials

The Catalog supports filling in basic Credentials properties, such as:
1. For S3: `s3.endpoint`, `s3.access_key`, `s3.secret_key`
2. For Glue: `glue.endpoint`, `glue.access_key`, `glue.secret_key`

When accessing Glue through an Iceberg Catalog, we can access tables on Glue by filling in the following properties:

```sql
CREATE CATALOG glue PROPERTIES (
"type"="iceberg",
"iceberg.catalog.type" = "glue",
"glue.endpoint" = "https://glue.us-east-1.amazonaws.com",
"glue.access_key" = "ak",
"glue.secret_key" = "sk"
);
```

### System Credentials

For applications running on AWS resources, such as EC2 instances, this approach enhances security by avoiding hardcoded credentials.

If we create the Catalog without filling in any Credentials properties, the `DefaultAWSCredentialsProviderChain` will be used, which reads credentials from the system environment variables or the instance profile.

For details about how to configure environment variables and system properties, see [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-envvars.html).
- The configurable environment variables include `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, `AWS_SESSION_TOKEN`, `AWS_ROLE_ARN`, `AWS_WEB_IDENTITY_TOKEN_FILE`, and so on.
- In addition, you can use [aws configure](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html) to configure Credentials; the credentials file will be written to the `~/.aws` directory.
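
As a minimal, hypothetical sketch (the catalog name, warehouse path, and endpoint are placeholders), an Iceberg catalog on S3 that omits `s3.access_key` and `s3.secret_key`, and therefore falls back to the default provider chain, might look like this:

```sql
-- Hypothetical example: no access key or secret key is supplied, so the
-- DefaultAWSCredentialsProviderChain resolves credentials from environment
-- variables, the ~/.aws credentials file, or the EC2 instance profile.
CREATE CATALOG iceberg_s3_no_cred PROPERTIES (
    'type'='iceberg',
    'iceberg.catalog.type' = 'hadoop',
    'warehouse' = 's3://bucket/dir/key',
    's3.endpoint' = 's3.us-east-1.amazonaws.com'
);
```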
15 changes: 14 additions & 1 deletion docs/en/docs/lakehouse/multi-catalog/iceberg.md
@@ -78,6 +78,17 @@ CREATE CATALOG iceberg_hadoop_ha PROPERTIES (
);
```

```sql
CREATE CATALOG iceberg_s3 PROPERTIES (
'type'='iceberg',
'iceberg.catalog.type' = 'hadoop',
'warehouse' = 's3://bucket/dir/key',
's3.endpoint' = 's3.us-east-1.amazonaws.com',
's3.access_key' = 'ak',
's3.secret_key' = 'sk'
);
```

#### Hive Metastore

```sql
@@ -106,7 +117,9 @@ CREATE CATALOG glue PROPERTIES (
);
```

For Iceberg properties, see [Iceberg Glue Catalog](https://iceberg.apache.org/docs/latest/aws/#glue-catalog)
1. For Iceberg properties, see [Iceberg Glue Catalog](https://iceberg.apache.org/docs/latest/aws/#glue-catalog).

2. If you do not fill in the credentials (`glue.access_key` and `glue.secret_key`) for the Glue catalog, the default `DefaultAWSCredentialsProviderChain` will be used; it reads credentials from the system environment variables or the instance profile on AWS EC2, as in the sketch below.
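
A minimal, hypothetical sketch (the catalog name and endpoint are placeholders) of a Glue catalog that relies on the default provider chain:

```sql
-- Hypothetical sketch: glue.access_key and glue.secret_key are omitted, so the
-- DefaultAWSCredentialsProviderChain resolves credentials from environment
-- variables or the EC2 instance profile.
CREATE CATALOG glue_no_cred PROPERTIES (
    "type"="iceberg",
    "iceberg.catalog.type" = "glue",
    "glue.endpoint" = "https://glue.us-east-1.amazonaws.com"
);
```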

#### Alibaba Cloud DLF

7 changes: 7 additions & 0 deletions docs/sidebars.json
@@ -219,6 +219,13 @@
"lakehouse/multi-catalog/faq-multi-catalog"
]
},
{
"type": "category",
"label": "Cloud Service Authentication",
"items": [
"lakehouse/cloud-auth/cloud-auth"
]
},
"lakehouse/file",
"lakehouse/filecache",
"lakehouse/compute-node",
61 changes: 61 additions & 0 deletions docs/zh-CN/docs/lakehouse/cloud-auth/cloud-auth.md
@@ -0,0 +1,61 @@
---
{
"title": "云服务认证接入",
"language": "zh-CN"
}
---

<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->

# Overview

When accessing a cloud service, we need to provide the credentials required by that service so that the request can be authenticated by each cloud vendor's IAM.

## AWS

Doris currently supports two types of authentication when accessing AWS services.

### Authentication via Catalog Properties

The Catalog supports filling in basic Credentials properties, for example:
1. For S3: `s3.endpoint`, `s3.access_key`, `s3.secret_key`
2. For Glue: `glue.endpoint`, `glue.access_key`, `glue.secret_key`

Taking an Iceberg Catalog accessing Glue as an example, we can fill in the following properties to access tables hosted on Glue:

```sql
CREATE CATALOG glue PROPERTIES (
"type"="iceberg",
"iceberg.catalog.type" = "glue",
"glue.endpoint" = "https://glue.us-east-1.amazonaws.com",
"glue.access_key" = "ak",
"glue.secret_key" = "sk"
);
```

### Authentication via System Properties

This is intended for applications running on AWS resources such as EC2 instances. It avoids hard-coding credentials and improves data security.

If no Credentials properties are filled in when creating the Catalog, the `DefaultAWSCredentialsProviderChain` is used, which reads the properties configured in system environment variables or the instance profile.

For how to configure environment variables and system properties, see [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-envvars.html).
- The configurable environment variables include `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, `AWS_SESSION_TOKEN`, `AWS_ROLE_ARN`, `AWS_WEB_IDENTITY_TOKEN_FILE`, and so on.
- In addition, you can use [aws configure](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html) to configure Credentials directly; the credentials file is generated in the `~/.aws` directory.
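
As a minimal, hypothetical sketch (the catalog name, warehouse path, and endpoint are placeholders), an Iceberg catalog on S3 that omits `s3.access_key` and `s3.secret_key`, and therefore falls back to the default provider chain, might look like this:

```sql
-- Hypothetical example: no access key or secret key is supplied, so the
-- DefaultAWSCredentialsProviderChain resolves credentials from environment
-- variables, the ~/.aws credentials file, or the EC2 instance profile.
CREATE CATALOG iceberg_s3_no_cred PROPERTIES (
    'type'='iceberg',
    'iceberg.catalog.type' = 'hadoop',
    'warehouse' = 's3://bucket/dir/key',
    's3.endpoint' = 's3.us-east-1.amazonaws.com'
);
```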
15 changes: 14 additions & 1 deletion docs/zh-CN/docs/lakehouse/multi-catalog/iceberg.md
@@ -78,6 +78,17 @@ CREATE CATALOG iceberg_hadoop_ha PROPERTIES (
);
```

```sql
CREATE CATALOG iceberg_s3 PROPERTIES (
'type'='iceberg',
'iceberg.catalog.type' = 'hadoop',
'warehouse' = 's3://bucket/dir/key',
's3.endpoint' = 's3.us-east-1.amazonaws.com',
's3.access_key' = 'ak',
's3.secret_key' = 'sk'
);
```

#### Hive Metastore

```sql
@@ -106,7 +117,9 @@ CREATE CATALOG glue PROPERTIES (
);
```

For details on Iceberg properties, see [Iceberg Glue Catalog](https://iceberg.apache.org/docs/latest/aws/#glue-catalog)
1. For details on Iceberg properties, see [Iceberg Glue Catalog](https://iceberg.apache.org/docs/latest/aws/#glue-catalog).

2. If you are on an AWS service (such as EC2) and do not fill in the credential information (`glue.access_key`, `glue.secret_key`), Doris will use the default `DefaultAWSCredentialsProviderChain`, which reads the properties configured in system environment variables or the instance profile, as in the sketch below.
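
A minimal, hypothetical sketch (the catalog name and endpoint are placeholders) of a Glue catalog that relies on the default provider chain:

```sql
-- Hypothetical sketch: glue.access_key and glue.secret_key are omitted, so the
-- DefaultAWSCredentialsProviderChain resolves credentials from environment
-- variables or the EC2 instance profile.
CREATE CATALOG glue_no_cred PROPERTIES (
    "type"="iceberg",
    "iceberg.catalog.type" = "glue",
    "glue.endpoint" = "https://glue.us-east-1.amazonaws.com"
);
```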

#### Alibaba Cloud DLF

20 changes: 18 additions & 2 deletions fe/fe-core/src/main/java/org/apache/doris/common/util/S3Util.java
@@ -21,8 +21,15 @@

import software.amazon.awssdk.auth.credentials.AwsBasicCredentials;
import software.amazon.awssdk.auth.credentials.AwsCredentials;
import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider;
import software.amazon.awssdk.auth.credentials.AwsCredentialsProviderChain;
import software.amazon.awssdk.auth.credentials.AwsSessionCredentials;
import software.amazon.awssdk.auth.credentials.EnvironmentVariableCredentialsProvider;
import software.amazon.awssdk.auth.credentials.InstanceProfileCredentialsProvider;
import software.amazon.awssdk.auth.credentials.ProfileCredentialsProvider;
import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider;
import software.amazon.awssdk.auth.credentials.SystemPropertyCredentialsProvider;
import software.amazon.awssdk.auth.credentials.WebIdentityTokenFileCredentialsProvider;
import software.amazon.awssdk.auth.signer.AwsS3V4Signer;
import software.amazon.awssdk.core.client.config.ClientOverrideConfiguration;
import software.amazon.awssdk.core.client.config.SdkAdvancedClientOption;
@@ -39,15 +46,24 @@
public class S3Util {

public static S3Client buildS3Client(URI endpoint, String region, CloudCredential credential) {
StaticCredentialsProvider scp;
AwsCredentialsProvider scp;
AwsCredentials awsCredential;
if (!credential.isTemporary()) {
awsCredential = AwsBasicCredentials.create(credential.getAccessKey(), credential.getSecretKey());
} else {
awsCredential = AwsSessionCredentials.create(credential.getAccessKey(), credential.getSecretKey(),
credential.getSessionToken());
}
scp = StaticCredentialsProvider.create(awsCredential);
if (!credential.isWhole()) {
scp = AwsCredentialsProviderChain.of(
SystemPropertyCredentialsProvider.create(),
EnvironmentVariableCredentialsProvider.create(),
WebIdentityTokenFileCredentialsProvider.create(),
ProfileCredentialsProvider.create(),
InstanceProfileCredentialsProvider.create());
} else {
scp = StaticCredentialsProvider.create(awsCredential);
}
EqualJitterBackoffStrategy backoffStrategy = EqualJitterBackoffStrategy
.builder()
.baseDelay(Duration.ofSeconds(1))