Skip to content

Commit

Permalink
[opt](fqdn) Add DNS Cache for FE and BE (#32869)
Browse files Browse the repository at this point in the history
Previously, when FQDN was enabled, Doris called the DNS resolver to get the IP from the hostname
each time 1) FE gets a BE's grpc client, or 2) BE gets another BE's brpc client.
So under high concurrency, the DNS resolver could become overloaded and fail to resolve hostnames.

This PR mainly changes:

1. Add DNSCache for both FE and BE.
    The DNSCache will run on every FE and BE node. It has a cache, key is hostname and value is IP.
    Caller can get IP by hostname from this cache, and if hostname does not exist, it will try to resolve it
    and update the cache.
    In addition, DNSCache has a daemon thread that refreshes the cache every 1 min, in case the IP
    changes at any time.

There are other implementations of this DNS cache:

1.  kaka11chen@36fed13
    This is for BE side, but it does not handle the IP change case.

2. #28479
    This is for the FE side, but it only works with the Master FE. Other FE nodes will not be aware of the IP change.
    Also, there are a bunch of BackendServiceProxy instances, and that PR only handles the cache in one of them.
  • Loading branch information
morningman committed Mar 28, 2024
1 parent 4aa2111 commit 2039016
Show file tree
Hide file tree
Showing 12 changed files with 303 additions and 7 deletions.
4 changes: 4 additions & 0 deletions be/src/common/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1146,6 +1146,10 @@ DEFINE_mString(ca_cert_file_paths,
"/etc/pki/tls/certs/ca-bundle.crt;/etc/ssl/certs/ca-certificates.crt;"
"/etc/ssl/ca-bundle.pem");

// Number of attempts made when opening a thrift client connection.
// The default of 1 means the open is tried only once (no retry).
// When greater than 1, a failed open is retried, waiting 100 milliseconds between attempts.
DEFINE_mInt32(thrift_client_open_num_tries, "1");

// clang-format off
#ifdef BE_TEST
// test s3
Expand Down
4 changes: 4 additions & 0 deletions be/src/common/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -1198,6 +1198,10 @@ DECLARE_String(trino_connector_plugin_dir);
// the file paths(one or more) of CA cert, splite using ";" aws s3 lib use it to init s3client
DECLARE_mString(ca_cert_file_paths);

// Number of attempts made when opening a thrift client connection.
// The default of 1 means the open is tried only once (no retry).
// When greater than 1, a failed open is retried, waiting 100 milliseconds between attempts.
DECLARE_mInt32(thrift_client_open_num_tries);

#ifdef BE_TEST
// test s3
DECLARE_String(test_s3_resource);
Expand Down
2 changes: 1 addition & 1 deletion be/src/runtime/client_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ Status ClientCacheHelper::_create_client(const TNetworkAddress& hostport,

client_impl->set_conn_timeout(config::thrift_connect_timeout_seconds * 1000);

Status status = client_impl->open();
Status status = client_impl->open_with_retry(config::thrift_client_open_num_tries, 100);

if (!status.ok()) {
*client_key = nullptr;
Expand Down
4 changes: 4 additions & 0 deletions be/src/runtime/exec_env.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ class ClientCache;
class HeartbeatFlags;
class FrontendServiceClient;
class FileMetaCache;
class DNSCache;

// Execution environment for queries/plan fragments.
// Contains all required global structures, and handles to
Expand Down Expand Up @@ -182,6 +183,7 @@ class ExecEnv {
HeartbeatFlags* heartbeat_flags() { return _heartbeat_flags; }
doris::vectorized::ScannerScheduler* scanner_scheduler() { return _scanner_scheduler; }
FileMetaCache* file_meta_cache() { return _file_meta_cache; }
// Hostname -> IP cache of this process; the pointer is nullptr until _init() creates the cache.
DNSCache* dns_cache() { return _dns_cache; }

// only for unit test
void set_master_info(TMasterInfo* master_info) { this->_master_info = master_info; }
Expand Down Expand Up @@ -266,6 +268,8 @@ class ExecEnv {
BlockSpillManager* _block_spill_mgr = nullptr;
// To save meta info of external file, such as parquet footer.
FileMetaCache* _file_meta_cache = nullptr;
DNSCache* _dns_cache = nullptr;

RuntimeQueryStatiticsMgr* _runtime_query_statistics_mgr = nullptr;
};

Expand Down
5 changes: 5 additions & 0 deletions be/src/runtime/exec_env_init.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@
#include "util/bit_util.h"
#include "util/brpc_client_cache.h"
#include "util/cpu_info.h"
#include "util/dns_cache.h"
#include "util/doris_metrics.h"
#include "util/mem_info.h"
#include "util/metrics.h"
Expand Down Expand Up @@ -169,6 +170,8 @@ Status ExecEnv::_init(const std::vector<StorePath>& store_paths) {
_block_spill_mgr = new BlockSpillManager(_store_paths);
_file_meta_cache = new FileMetaCache(config::max_external_file_meta_cache_num);

_dns_cache = new DNSCache();
_spill_stream_mgr = new vectorized::SpillStreamManager(spill_store_paths);
_backend_client_cache->init_metrics("backend");
_frontend_client_cache->init_metrics("frontend");
_broker_client_cache->init_metrics("broker");
Expand Down Expand Up @@ -384,6 +387,8 @@ void ExecEnv::_destroy() {
if (!_is_init) {
return;
}
SAFE_DELETE(_dns_cache);

_deregister_metrics();
SAFE_DELETE(_internal_client_cache);
SAFE_DELETE(_function_client_cache);
Expand Down
4 changes: 3 additions & 1 deletion be/src/util/brpc_client_cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@
// IWYU pragma: no_include <opentelemetry/common/threadlocal.h>
#include "common/compiler_util.h" // IWYU pragma: keep
#include "common/config.h"
#include "runtime/exec_env.h"
#include "util/dns_cache.h"
#include "util/network_util.h"

namespace doris {
Expand Down Expand Up @@ -80,7 +82,7 @@ class BrpcClientCache {
std::string realhost;
realhost = host;
if (!is_valid_ip(host)) {
Status status = hostname_to_ip(host, realhost);
Status status = ExecEnv::GetInstance()->dns_cache()->get(host, &realhost);
if (!status.ok()) {
LOG(WARNING) << "failed to get ip from host:" << status.to_string();
return nullptr;
Expand Down
84 changes: 84 additions & 0 deletions be/src/util/dns_cache.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "util/dns_cache.h"

#include "service/backend_options.h"
#include "util/network_util.h"

namespace doris {

DNSCache::DNSCache() {
refresh_thread = std::thread(&DNSCache::_refresh_cache, this);
refresh_thread.detach();
}

DNSCache::~DNSCache() {
stop_refresh = true;
if (refresh_thread.joinable()) {
refresh_thread.join();
}
}

Status DNSCache::get(const std::string& hostname, std::string* ip) {
{
std::shared_lock<std::shared_mutex> lock(mutex);
auto it = cache.find(hostname);
if (it != cache.end()) {
*ip = it->second;
return Status::OK();
}
}
// Update if not found
RETURN_IF_ERROR(_update(hostname));
{
std::shared_lock<std::shared_mutex> lock(mutex);
*ip = cache[hostname];
return Status::OK();
}
}

Status DNSCache::_update(const std::string& hostname) {
std::string real_ip = "";
RETURN_IF_ERROR(hostname_to_ip(hostname, real_ip, BackendOptions::is_bind_ipv6()));
std::unique_lock<std::shared_mutex> lock(mutex);
auto it = cache.find(hostname);
if (it == cache.end() || it->second != real_ip) {
cache[hostname] = real_ip;
LOG(INFO) << "update hostname " << hostname << "'s ip to " << real_ip;
}
return Status::OK();
}

void DNSCache::_refresh_cache() {
while (!stop_refresh) {
// refresh every 1 min
std::this_thread::sleep_for(std::chrono::minutes(1));
std::unordered_set<std::string> keys;
{
std::shared_lock<std::shared_mutex> lock(mutex);
std::transform(cache.begin(), cache.end(), std::inserter(keys, keys.end()),
[](const auto& pair) { return pair.first; });
}
Status st;
for (auto& key : keys) {
st = _update(key);
}
}
}

} // end of namespace doris
57 changes: 57 additions & 0 deletions be/src/util/dns_cache.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <atomic>
#include <chrono>
#include <iostream>
#include <shared_mutex>
#include <string>
#include <thread>
#include <unordered_map>

#include "common/status.h"

namespace doris {

// Same as
// fe/fe-core/src/main/java/org/apache/doris/common/DNSCache.java
// Process-wide cache that maps a hostname to its resolved ip.
// A background thread (see _refresh_cache) periodically re-resolves the cached
// hostnames so that ip changes are picked up.
class DNSCache {
public:
    // Starts the refresh thread.
    DNSCache();
    // Signals the refresh thread to stop.
    ~DNSCache();

    // get ip by hostname
    // On a cache miss the hostname is resolved and the result is added to the cache.
    // Returns a non-OK status if the hostname cannot be resolved.
    Status get(const std::string& hostname, std::string* ip);

private:
    // update the ip of hostname in cache
    Status _update(const std::string& hostname);

    // a function for refresh daemon thread
    // update cache at a fixed interval
    void _refresh_cache();

private:
    // hostname -> ip
    std::unordered_map<std::string, std::string> cache;
    // Guards `cache`: shared lock for lookups, exclusive lock for writes.
    mutable std::shared_mutex mutex;
    std::thread refresh_thread;
    // Written by the destructor and read by the refresh thread, hence atomic:
    // a plain bool accessed from two threads without synchronization is a data race.
    std::atomic<bool> stop_refresh {false};
};

} // end of namespace doris
31 changes: 31 additions & 0 deletions fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@
import org.apache.doris.common.Config;
import org.apache.doris.common.ConfigBase;
import org.apache.doris.common.ConfigException;
import org.apache.doris.common.DNSCache;
import org.apache.doris.common.DdlException;
import org.apache.doris.common.ErrorCode;
import org.apache.doris.common.ErrorReport;
Expand Down Expand Up @@ -475,6 +476,8 @@ public class Env {

private HiveTransactionMgr hiveTransactionMgr;

private DNSCache dnsCache;

public List<Frontend> getFrontends(FrontendNodeType nodeType) {
if (nodeType == null) {
// get all
Expand Down Expand Up @@ -688,6 +691,15 @@ private Env(boolean isCheckpointCatalog) {
this.binlogManager = new BinlogManager();
this.binlogGcer = new BinlogGcer();
this.columnIdFlusher = new ColumnIdFlushDaemon();
<<<<<<< HEAD
=======
this.queryCancelWorker = new QueryCancelWorker(systemInfo);
this.topicPublisherThread = new TopicPublisherThread(
"TopicPublisher", Config.publish_topic_info_interval_ms, systemInfo);
this.mtmvService = new MTMVService();
this.insertOverwriteManager = new InsertOverwriteManager();
this.dnsCache = new DNSCache();
>>>>>>> 39d695c05c ([opt](fqdn) Add DNS Cache for FE and BE (#32869))
}

public static void destroyCheckpoint() {
Expand Down Expand Up @@ -819,6 +831,10 @@ public static HiveTransactionMgr getCurrentHiveTransactionMgr() {
return getCurrentEnv().getHiveTransactionMgr();
}

/**
 * Returns the DNS cache held by this Env.
 *
 * @return the value of the {@code dnsCache} field
 */
public DNSCache getDnsCache() {
    return this.dnsCache;
}

// Use tryLock to avoid potential dead lock
private boolean tryLock(boolean mustLock) {
while (true) {
Expand Down Expand Up @@ -1533,8 +1549,13 @@ private void startMasterOnlyDaemonThreads() {
columnIdFlusher.start();
}

<<<<<<< HEAD
// start threads that should running on all FE
private void startNonMasterDaemonThreads() {
=======
// start threads that should run on all FE
protected void startNonMasterDaemonThreads() {
>>>>>>> 39d695c05c ([opt](fqdn) Add DNS Cache for FE and BE (#32869))
// start load manager thread
loadManager.start();
tabletStatMgr.start();
Expand All @@ -1544,6 +1565,16 @@ private void startNonMasterDaemonThreads() {
getInternalCatalog().getEsRepository().start();
// domain resolver
domainResolver.start();
<<<<<<< HEAD
=======
// fe disk updater
feDiskUpdater.start();
if (Config.enable_hms_events_incremental_sync) {
metastoreEventsProcessor.start();
}

dnsCache.start();
>>>>>>> 39d695c05c ([opt](fqdn) Add DNS Cache for FE and BE (#32869))
}

private void transferToNonMaster(FrontendNodeType newType) {
Expand Down
Loading

0 comments on commit 2039016

Please sign in to comment.