diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index f006fa43342b58..6954de836cafcc 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1164,6 +1164,9 @@ DEFINE_mInt32(report_query_statistics_interval_ms, "3000"); // 30s DEFINE_mInt32(query_statistics_reserve_timeout_ms, "30000"); +// usage gap between available levels for high-usage disks: a disk whose usage is at least this much above the lowest usage of the current level starts a new level. +DEFINE_mDouble(high_disk_avail_level_diff_usages, "0.15"); + // create tablet in partition random robin idx lru size, default 10000 DEFINE_Int32(partition_disk_index_lru_size, "10000"); diff --git a/be/src/common/config.h b/be/src/common/config.h index 64555adbbb7f4f..8a33c8c19d167f 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -1240,6 +1240,9 @@ DECLARE_Int32(ignore_invalid_partition_id_rowset_num); DECLARE_mInt32(report_query_statistics_interval_ms); DECLARE_mInt32(query_statistics_reserve_timeout_ms); +// usage gap between available levels for high-usage disks: a disk whose usage is at least this much above the lowest usage of the current level starts a new level. 
+DECLARE_mDouble(high_disk_avail_level_diff_usages); + // create tablet in partition random robin idx lru size, default 10000 DECLARE_Int32(partition_disk_index_lru_size); diff --git a/be/src/olap/storage_engine.cpp b/be/src/olap/storage_engine.cpp index 2090127f41c3b9..069734d8acd7c3 100644 --- a/be/src/olap/storage_engine.cpp +++ b/be/src/olap/storage_engine.cpp @@ -456,16 +456,6 @@ Status StorageEngine::set_cluster_id(int32_t cluster_id) { return Status::OK(); } -StorageEngine::DiskRemainingLevel get_available_level(double disk_usage_percent) { - assert(disk_usage_percent <= 1); - if (disk_usage_percent < 0.7) { - return StorageEngine::DiskRemainingLevel::LOW; - } else if (disk_usage_percent < 0.85) { - return StorageEngine::DiskRemainingLevel::MID; - } - return StorageEngine::DiskRemainingLevel::HIGH; -} - int StorageEngine::_get_and_set_next_disk_index(int64 partition_id, TStorageMedium::type storage_medium) { auto key = CreateTabletIdxCache::get_key(partition_id, storage_medium); @@ -481,6 +471,7 @@ int StorageEngine::_get_and_set_next_disk_index(int64 partition_id, void StorageEngine::_get_candidate_stores(TStorageMedium::type storage_medium, std::vector& dir_infos) { + std::vector usages; for (auto& it : _store_map) { DataDir* data_dir = it.second.get(); if (data_dir->is_used()) { @@ -489,11 +480,51 @@ void StorageEngine::_get_candidate_stores(TStorageMedium::type storage_medium, !data_dir->reach_capacity_limit(0)) { DirInfo dir_info; dir_info.data_dir = data_dir; - dir_info.available_level = get_available_level(data_dir->get_usage(0)); + dir_info.available_level = 0; + usages.push_back(data_dir->get_usage(0)); dir_infos.push_back(dir_info); } } } + + if (dir_infos.size() <= 1) { + return; + } + + std::sort(usages.begin(), usages.end()); + if (usages.back() < 0.7) { + return; + } + + std::vector level_min_usages; + level_min_usages.push_back(usages[0]); + for (auto usage : usages) { + // usages below 0.7, lowered by a small skew of half the level diff, never start a new level + if (usage < 0.7 
- (config::high_disk_avail_level_diff_usages / 2.0)) { + continue; + } + + // at high usages, each level spans 15% of usage by default + // for example: three disks with usages 0.66, 0.72, 0.83 + // then level_min_usages = [0.66, 0.83], dividing the disks into 2 levels: [0.66, 0.72], [0.83] + if (usage >= level_min_usages.back() + config::high_disk_avail_level_diff_usages) { + level_min_usages.push_back(usage); + } + } + for (auto& dir_info : dir_infos) { + double usage = dir_info.data_dir->get_usage(0); + for (size_t i = 1; i < level_min_usages.size() && usage >= level_min_usages[i]; i++) { + dir_info.available_level++; + } + + // when usage exceeds the flood stage, balance no longer matters: + // push the disk one level higher. + // for example, with two disks at usages 0.85 and 0.92, let tablets fall on the first disk. + // by default, storage_flood_stage_usage_percent = 90 + if (usage > config::storage_flood_stage_usage_percent / 100.0) { + dir_info.available_level++; + } + } } std::vector StorageEngine::get_stores_for_create_tablet( diff --git a/be/src/olap/storage_engine.h b/be/src/olap/storage_engine.h index b2b72b6d523ded..bc581aa329ad4d 100644 --- a/be/src/olap/storage_engine.h +++ b/be/src/olap/storage_engine.h @@ -150,8 +150,6 @@ class StorageEngine final : public BaseStorageEngine { StorageEngine(const EngineOptions& options); ~StorageEngine() override; - enum class DiskRemainingLevel { LOW, MID, HIGH }; - Status open() override; Status create_tablet(const TCreateTabletReq& request, RuntimeProfile* profile); @@ -541,7 +539,7 @@ class CreateTabletIdxCache : public LRUCachePolicy { struct DirInfo { DataDir* data_dir; - StorageEngine::DiskRemainingLevel available_level; + int available_level = 0; bool operator<(const DirInfo& other) const { if (available_level != other.available_level) {