From c38fa0272a7e8440145fcdd13b622e198ee0aaf0 Mon Sep 17 00:00:00 2001 From: Mryange <59914473+Mryange@users.noreply.github.com> Date: Mon, 9 Sep 2024 11:45:18 +0800 Subject: [PATCH 01/44] [opt](function)Optimize the performance of the pad function under UTF-8. (#40162) ## Proposed changes 1. Removed the calculation of str_index. 2. If pad is constant, calculate pad_index only once. 3. Do not insert res_chars inside the loop; instead, insert them all together after the loop completes. ``` mysql [test]>select count(lpad(Title, 100, "abc")) from hits_10m; +--------------------------------+ | count(lpad(Title, 100, 'abc')) | +--------------------------------+ | 10000000 | +--------------------------------+ 1 row in set (3.97 sec) mysql [test]>select count(lpad(Title, 100, "abc")) from hits_10m; +--------------------------------+ | count(lpad(Title, 100, 'abc')) | +--------------------------------+ | 10000000 | +--------------------------------+ 1 row in set (2.87 sec) ``` --- be/src/util/simd/vstring_function.h | 17 ++++ be/src/vec/functions/function_string.h | 110 +++++++++++++++---------- 2 files changed, 84 insertions(+), 43 deletions(-) diff --git a/be/src/util/simd/vstring_function.h b/be/src/util/simd/vstring_function.h index 579da50d2df230..99313132382e5c 100644 --- a/be/src/util/simd/vstring_function.h +++ b/be/src/util/simd/vstring_function.h @@ -187,6 +187,23 @@ class VStringFunctions { return p; } + // Iterate a UTF-8 string without exceeding a given length n. + // The function returns two values: + // the first represents the byte length traversed, and the second represents the char length traversed. + static inline std::pair iterate_utf8_with_limit_length(const char* begin, + const char* end, + size_t n) { + const char* p = begin; + int char_size = 0; + + size_t i = 0; + for (; i < n && p < end; ++i, p += char_size) { + char_size = UTF8_BYTE_LENGTH[static_cast(*p)]; + } + + return {p - begin, i}; + } + // Gcc will do auto simd in this function static bool is_ascii(const StringRef& str) { #ifdef __AVX2__ diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h index 618641dcfb227e..2e33dba3f332fe 100644 --- a/be/src/vec/functions/function_string.h +++ b/be/src/vec/functions/function_string.h @@ -1556,64 +1556,92 @@ class FunctionStringPad : public IFunction { const auto* padcol = assert_cast(col[2].get()); const auto& padcol_offsets = padcol->get_offsets(); const auto& padcol_chars = padcol->get_chars(); + std::visit( + [&](auto str_const, auto len_const, auto pad_const) { + execute_utf8( + strcol_offsets, strcol_chars, col_len_data, padcol_offsets, + padcol_chars, res_offsets, res_chars, null_map_data, input_rows_count); + }, + vectorized::make_bool_variant(col_const[0]), + vectorized::make_bool_variant(col_const[1]), + vectorized::make_bool_variant(col_const[2])); - std::vector str_index; + block.get_by_position(result).column = + ColumnNullable::create(std::move(res), std::move(null_map)); + return Status::OK(); + } + + template + void execute_utf8(const ColumnString::Offsets& strcol_offsets, + const ColumnString::Chars& strcol_chars, + const ColumnInt32::Container& col_len_data, + const ColumnString::Offsets& padcol_offsets, + const ColumnString::Chars& padcol_chars, ColumnString::Offsets& res_offsets, + ColumnString::Chars& res_chars, ColumnUInt8::Container& null_map_data, + size_t input_rows_count) const { std::vector pad_index; + size_t const_pad_char_size = 0; + // If pad_const = true, initialize pad_index only once. 
+ // The same logic applies to the if constexpr (!pad_const) condition below. + if constexpr (pad_const) { + const_pad_char_size = simd::VStringFunctions::get_char_len( + (const char*)padcol_chars.data(), padcol_offsets[0], pad_index); + } fmt::memory_buffer buffer; - const bool str_const = col_const[0]; - const bool len_const = col_const[1]; - const bool pad_const = col_const[2]; + buffer.reserve(strcol_chars.size()); + size_t buffer_len = 0; + for (size_t i = 0; i < input_rows_count; ++i) { - str_index.clear(); - pad_index.clear(); + if constexpr (!pad_const) { + pad_index.clear(); + } buffer.clear(); - const auto len = col_len_data[index_check_const(i, len_const)]; + const auto len = col_len_data[index_check_const(i)]; if (len < 0) { // return NULL when input length is invalid number null_map_data[i] = true; - StringOP::push_empty_string(i, res_chars, res_offsets); + res_offsets[i] = buffer_len; } else { - const auto str_idx = index_check_const(i, str_const); + const auto str_idx = index_check_const(i); const int str_len = strcol_offsets[str_idx] - strcol_offsets[str_idx - 1]; const auto* str_data = &strcol_chars[strcol_offsets[str_idx - 1]]; - const auto pad_idx = index_check_const(i, pad_const); + const auto pad_idx = index_check_const(i); const int pad_len = padcol_offsets[pad_idx] - padcol_offsets[pad_idx - 1]; const auto* pad_data = &padcol_chars[padcol_offsets[pad_idx - 1]]; - // get utf8 len - size_t str_char_size = simd::VStringFunctions::get_char_len((const char*)str_data, - str_len, str_index); - size_t pad_char_size = simd::VStringFunctions::get_char_len((const char*)pad_data, - pad_len, pad_index); - - if (len <= str_char_size) { - // truncate the input string - if (len < str_char_size) { - buffer.append(str_data, str_data + str_index[len]); - } else { - buffer.append(str_data, str_data + str_len); - } - StringOP::push_value_string(std::string_view(buffer.data(), buffer.size()), i, - res_chars, res_offsets); + auto [iterate_byte_len, iterate_char_len] = + simd::VStringFunctions::iterate_utf8_with_limit_length( + (const char*)str_data, (const char*)str_data + str_len, len); + // If iterate_char_len equals len, it indicates that the str length is greater than or equal to len + if (iterate_char_len == len) { + buffer.reserve(buffer_len + iterate_byte_len); + memcpy(buffer.data() + buffer_len, str_data, iterate_byte_len); + buffer_len += iterate_byte_len; + res_offsets[i] = buffer_len; continue; } + size_t pad_char_size; + if constexpr (!pad_const) { + pad_char_size = simd::VStringFunctions::get_char_len((const char*)pad_data, + pad_len, pad_index); + } else { + pad_char_size = const_pad_char_size; + } // make compatible with mysql. 
return empty string if pad is empty if (pad_char_size == 0) { - StringOP::push_empty_string(i, res_chars, res_offsets); + res_offsets[i] = buffer_len; continue; } - - const int32_t pad_times = (len - str_char_size) / pad_char_size; - const int32_t pad_remainder = (len - str_char_size) % pad_char_size; - size_t new_capacity = str_len + size_t(pad_times + 1) * pad_len; + const size_t str_char_size = iterate_char_len; + const size_t pad_times = (len - str_char_size) / pad_char_size; + const size_t pad_remainder_len = pad_index[(len - str_char_size) % pad_char_size]; + const size_t new_capacity = str_len + size_t(pad_times + 1) * pad_len; ColumnString::check_chars_length(new_capacity, 0); - buffer.reserve(new_capacity); - auto* buffer_data = buffer.data(); - int32_t buffer_len = 0; + buffer.reserve(buffer_len + new_capacity); if constexpr (!Impl::is_lpad) { - memcpy(buffer_data, str_data, str_len); + memcpy(buffer.data() + buffer_len, str_data, str_len); buffer_len += str_len; } // Prepend chars of pad. @@ -1621,21 +1649,17 @@ class FunctionStringPad : public IFunction { pad_times); buffer_len += pad_times * pad_len; - memcpy(buffer_data + buffer_len, pad_data, pad_index[pad_remainder]); - buffer_len += pad_index[pad_remainder]; + memcpy(buffer.data() + buffer_len, pad_data, pad_remainder_len); + buffer_len += pad_remainder_len; if constexpr (Impl::is_lpad) { - memcpy(buffer_data + buffer_len, str_data, str_len); + memcpy(buffer.data() + buffer_len, str_data, str_len); buffer_len += str_len; } - StringOP::push_value_string(std::string_view(buffer_data, buffer_len), i, res_chars, - res_offsets); + res_offsets[i] = buffer_len; } } - - block.get_by_position(result).column = - ColumnNullable::create(std::move(res), std::move(null_map)); - return Status::OK(); + res_chars.insert(buffer.data(), buffer.data() + buffer_len); } }; From 84ce9451c6ebdf290d5c1b401e0a282f5acb6577 Mon Sep 17 00:00:00 2001 From: Xinyi Zou Date: Mon, 9 Sep 2024 12:15:37 +0800 Subject: [PATCH 02/44] [opt](memory) Refactor memory maintenance thread (#40344) step 1. Refresh process memory metrics. step 2. Refresh allocator memory metrics. step 3. Update and print memory stat when the memory changes by 256M. step 4. Asyn Refresh cache capacity step 5. Cancel top memory task when process memory exceed hard limit. step 6. Refresh weighted memory ratio of workload groups. step 7. Analyze blocking queries. step 8. Flush memtable. step 9. Jemalloc purge all arena dirty pages. 
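The steps above hand the slower work off to dedicated threads through a notify/wait pattern instead of fixed-interval polling (see the new condition variables in `GlobalMemoryArbitrator` and `Daemon::cache_adjust_capacity_thread` in the diff below). A minimal standalone sketch of that handoff, using simplified variants of the names in the patch, for illustration only and not the actual Doris implementation:

```cpp
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

std::mutex cache_adjust_lock;
std::condition_variable cache_adjust_cv;
std::atomic<bool> cache_adjust_notify {false};
std::atomic<bool> stop {false};

// Producer side: the maintenance loop only sets a flag and signals the
// condition variable instead of doing the (possibly slow) adjustment inline.
void notify_cache_adjust_capacity() {
    cache_adjust_notify.store(true, std::memory_order_relaxed);
    cache_adjust_cv.notify_all();
}

// Consumer side: a dedicated thread sleeps until notified (or a 1s timeout),
// then performs the capacity adjustment outside the maintenance loop.
void cache_adjust_capacity_thread() {
    while (!stop.load()) {
        {
            std::unique_lock<std::mutex> l(cache_adjust_lock);
            cache_adjust_cv.wait_for(l, std::chrono::seconds(1), [] {
                return cache_adjust_notify.load(std::memory_order_relaxed) || stop.load();
            });
        }
        if (stop.load()) {
            break;
        }
        if (cache_adjust_notify.exchange(false, std::memory_order_relaxed)) {
            // Placeholder for the real work, e.g. refreshing cache capacity.
            std::cout << "adjusting cache capacity..." << std::endl;
        }
    }
}

int main() {
    std::thread worker(cache_adjust_capacity_thread);
    notify_cache_adjust_capacity(); // e.g. called from the maintenance loop
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
    stop.store(true);
    cache_adjust_cv.notify_all();
    worker.join();
    return 0;
}
```

The atomic flag plus the predicate passed to `wait_for` guard against missed notifications, so the worker wakes promptly even if the signal arrives before it starts waiting.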
`memory_maintenance_thread` execute once cost: - 3ms (cluster idle) - 20ms (cluster high concurrency, CPU full) `memory_maintenance_thread` CPU usage: - 10%-20% (default memory_maintenance_sleep_time_ms=20ms) - 20%-30% (memory_maintenance_sleep_time_ms=10ms) - 30%+ (memory_maintenance_sleep_time_ms=5ms) --- be/src/common/config.cpp | 14 +- be/src/common/config.h | 9 +- be/src/common/daemon.cpp | 195 +++++++++++++----- be/src/common/daemon.h | 2 +- be/src/olap/lru_cache.cpp | 81 +++++++- be/src/olap/lru_cache.h | 28 +-- be/src/runtime/memory/cache_manager.cpp | 21 +- be/src/runtime/memory/cache_manager.h | 3 + be/src/runtime/memory/cache_policy.cpp | 8 +- be/src/runtime/memory/cache_policy.h | 28 ++- .../memory/global_memory_arbitrator.cpp | 7 + .../runtime/memory/global_memory_arbitrator.h | 17 ++ be/src/runtime/memory/lru_cache_policy.h | 95 ++++++--- be/src/runtime/memory/mem_tracker_limiter.cpp | 6 +- be/src/runtime/memory/memory_reclamation.cpp | 12 -- be/src/service/point_query_executor.h | 4 +- be/src/vec/common/allocator.cpp | 3 - be/test/olap/lru_cache_test.cpp | 144 +++++++++++-- 18 files changed, 514 insertions(+), 163 deletions(-) diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index 00f8a042cbcbb7..0c00bd1a38f0da 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -95,6 +95,9 @@ DEFINE_String(mem_limit, "90%"); // Soft memory limit as a fraction of hard memory limit. DEFINE_Double(soft_mem_limit_frac, "0.9"); +// Cache capacity reduce mem limit as a fraction of soft mem limit. +DEFINE_mDouble(cache_capacity_reduce_mem_limit_frac, "0.6"); + // Schema change memory limit as a fraction of soft memory limit. DEFINE_Double(schema_change_mem_limit_frac, "0.6"); @@ -286,7 +289,7 @@ DEFINE_mInt32(exchg_buffer_queue_capacity_factor, "64"); DEFINE_mInt64(memory_limitation_per_thread_for_schema_change_bytes, "2147483648"); DEFINE_mInt32(cache_prune_interval_sec, "10"); -DEFINE_mInt32(cache_periodic_prune_stale_sweep_sec, "300"); +DEFINE_mInt32(cache_periodic_prune_stale_sweep_sec, "60"); // the clean interval of tablet lookup cache DEFINE_mInt32(tablet_lookup_cache_stale_sweep_time_sec, "30"); DEFINE_mInt32(point_query_row_cache_stale_sweep_time_sec, "300"); @@ -565,7 +568,7 @@ DEFINE_String(pprof_profile_dir, "${DORIS_HOME}/log"); // for jeprofile in jemalloc DEFINE_mString(jeprofile_dir, "${DORIS_HOME}/log"); DEFINE_mBool(enable_je_purge_dirty_pages, "true"); -DEFINE_mString(je_dirty_pages_mem_limit_percent, "5%"); +DEFINE_mString(je_dirty_pages_mem_limit_percent, "2%"); // to forward compatibility, will be removed later DEFINE_mBool(enable_token_check, "true"); @@ -582,17 +585,12 @@ DEFINE_Int32(num_cores, "0"); DEFINE_Bool(ignore_broken_disk, "false"); // Sleep time in milliseconds between memory maintenance iterations -DEFINE_mInt32(memory_maintenance_sleep_time_ms, "100"); +DEFINE_mInt32(memory_maintenance_sleep_time_ms, "20"); // After full gc, no longer full gc and minor gc during sleep. // After minor gc, no minor gc during sleep, but full gc is possible. 
DEFINE_mInt32(memory_gc_sleep_time_ms, "500"); -// Sleep time in milliseconds between memtbale flush mgr refresh iterations -DEFINE_mInt64(memtable_mem_tracker_refresh_interval_ms, "5"); - -DEFINE_mInt64(wg_weighted_memory_ratio_refresh_interval_ms, "50"); - // percent of (active memtables size / all memtables size) when reach hard limit DEFINE_mInt32(memtable_hard_limit_active_percent, "50"); diff --git a/be/src/common/config.h b/be/src/common/config.h index bd2aa4f51be1a9..720f4f72cb4bf7 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -133,6 +133,9 @@ DECLARE_String(mem_limit); // Soft memory limit as a fraction of hard memory limit. DECLARE_Double(soft_mem_limit_frac); +// Cache capacity reduce mem limit as a fraction of soft mem limit. +DECLARE_mDouble(cache_capacity_reduce_mem_limit_frac); + // Schema change memory limit as a fraction of soft memory limit. DECLARE_Double(schema_change_mem_limit_frac); @@ -641,12 +644,6 @@ DECLARE_mInt32(memory_maintenance_sleep_time_ms); // After minor gc, no minor gc during sleep, but full gc is possible. DECLARE_mInt32(memory_gc_sleep_time_ms); -// Sleep time in milliseconds between memtbale flush mgr memory refresh iterations -DECLARE_mInt64(memtable_mem_tracker_refresh_interval_ms); - -// Sleep time in milliseconds between refresh iterations of workload group weighted memory ratio -DECLARE_mInt64(wg_weighted_memory_ratio_refresh_interval_ms); - // percent of (active memtables size / all memtables size) when reach hard limit DECLARE_mInt32(memtable_hard_limit_active_percent); diff --git a/be/src/common/daemon.cpp b/be/src/common/daemon.cpp index d8245f4045ce81..713813b4a334f9 100644 --- a/be/src/common/daemon.cpp +++ b/be/src/common/daemon.cpp @@ -73,6 +73,12 @@ namespace doris { namespace { +int64_t last_print_proc_mem = 0; +int32_t refresh_cache_capacity_sleep_time_ms = 0; +#ifdef USE_JEMALLOC +int32_t je_purge_dirty_pages_sleep_time_ms = 0; +#endif + void update_rowsets_and_segments_num_metrics() { if (config::is_cloud_mode()) { // TODO(plat1ko): CloudStorageEngine @@ -204,42 +210,104 @@ void Daemon::tcmalloc_gc_thread() { #endif } -void Daemon::memory_maintenance_thread() { - int32_t interval_milliseconds = config::memory_maintenance_sleep_time_ms; - int64_t last_print_proc_mem = PerfCounters::get_vm_rss(); - while (!_stop_background_threads_latch.wait_for( - std::chrono::milliseconds(interval_milliseconds))) { - // Refresh process memory metrics. - doris::PerfCounters::refresh_proc_status(); - doris::MemInfo::refresh_proc_meminfo(); - doris::GlobalMemoryArbitrator::reset_refresh_interval_memory_growth(); - ExecEnv::GetInstance()->brpc_iobuf_block_memory_tracker()->set_consumption( - butil::IOBuf::block_memory()); - // Refresh allocator memory metrics. 
+void refresh_process_memory_metrics() { + doris::PerfCounters::refresh_proc_status(); + doris::MemInfo::refresh_proc_meminfo(); + doris::GlobalMemoryArbitrator::reset_refresh_interval_memory_growth(); + ExecEnv::GetInstance()->brpc_iobuf_block_memory_tracker()->set_consumption( + butil::IOBuf::block_memory()); +} + +void refresh_common_allocator_metrics() { #if !defined(ADDRESS_SANITIZER) && !defined(LEAK_SANITIZER) && !defined(THREAD_SANITIZER) - doris::MemInfo::refresh_allocator_mem(); -#ifdef USE_JEMALLOC - if (doris::MemInfo::je_dirty_pages_mem() > doris::MemInfo::je_dirty_pages_mem_limit() && - GlobalMemoryArbitrator::is_exceed_soft_mem_limit()) { - doris::MemInfo::notify_je_purge_dirty_pages(); - } + doris::MemInfo::refresh_allocator_mem(); + if (config::enable_system_metrics) { + DorisMetrics::instance()->system_metrics()->update_allocator_metrics(); + } #endif - if (config::enable_system_metrics) { - DorisMetrics::instance()->system_metrics()->update_allocator_metrics(); + MemInfo::refresh_memory_bvar(); +} + +void refresh_memory_state_after_memory_change() { + if (abs(last_print_proc_mem - PerfCounters::get_vm_rss()) > 268435456) { + last_print_proc_mem = PerfCounters::get_vm_rss(); + doris::MemTrackerLimiter::clean_tracker_limiter_group(); + doris::MemTrackerLimiter::enable_print_log_process_usage(); + // Refresh mem tracker each type counter. + doris::MemTrackerLimiter::refresh_global_counter(); + LOG(INFO) << doris::GlobalMemoryArbitrator:: + process_mem_log_str(); // print mem log when memory state by 256M + } +} + +void refresh_cache_capacity() { + if (refresh_cache_capacity_sleep_time_ms <= 0) { + auto cache_capacity_reduce_mem_limit = uint64_t( + doris::MemInfo::soft_mem_limit() * config::cache_capacity_reduce_mem_limit_frac); + int64_t process_memory_usage = doris::GlobalMemoryArbitrator::process_memory_usage(); + double new_cache_capacity_adjust_weighted = + process_memory_usage <= cache_capacity_reduce_mem_limit + ? 1 + : std::min( + 1 - (process_memory_usage - cache_capacity_reduce_mem_limit) / + (doris::MemInfo::soft_mem_limit() - + cache_capacity_reduce_mem_limit), + 0); + if (new_cache_capacity_adjust_weighted != + doris::GlobalMemoryArbitrator::last_cache_capacity_adjust_weighted) { + doris::GlobalMemoryArbitrator::last_cache_capacity_adjust_weighted = + new_cache_capacity_adjust_weighted; + doris::GlobalMemoryArbitrator::notify_cache_adjust_capacity(); + refresh_cache_capacity_sleep_time_ms = config::memory_gc_sleep_time_ms; } + } + refresh_cache_capacity_sleep_time_ms -= config::memory_maintenance_sleep_time_ms; +} + +void je_purge_dirty_pages() { +#ifdef USE_JEMALLOC + if (je_purge_dirty_pages_sleep_time_ms <= 0 && + doris::MemInfo::je_dirty_pages_mem() > doris::MemInfo::je_dirty_pages_mem_limit() && + GlobalMemoryArbitrator::is_exceed_soft_mem_limit()) { + doris::MemInfo::notify_je_purge_dirty_pages(); + je_purge_dirty_pages_sleep_time_ms = config::memory_gc_sleep_time_ms; + } + je_purge_dirty_pages_sleep_time_ms -= config::memory_maintenance_sleep_time_ms; #endif - MemInfo::refresh_memory_bvar(); - - // Update and print memory stat when the memory changes by 256M. - if (abs(last_print_proc_mem - PerfCounters::get_vm_rss()) > 268435456) { - last_print_proc_mem = PerfCounters::get_vm_rss(); - doris::MemTrackerLimiter::clean_tracker_limiter_group(); - doris::MemTrackerLimiter::enable_print_log_process_usage(); - // Refresh mem tracker each type counter. 
- doris::MemTrackerLimiter::refresh_global_counter(); - LOG(INFO) << doris::GlobalMemoryArbitrator:: - process_mem_log_str(); // print mem log when memory state by 256M - } +} + +void Daemon::memory_maintenance_thread() { + while (!_stop_background_threads_latch.wait_for( + std::chrono::milliseconds(config::memory_maintenance_sleep_time_ms))) { + // step 1. Refresh process memory metrics. + refresh_process_memory_metrics(); + + // step 2. Refresh jemalloc/tcmalloc metrics. + refresh_common_allocator_metrics(); + + // step 3. Update and print memory stat when the memory changes by 256M. + refresh_memory_state_after_memory_change(); + + // step 4. Asyn Refresh cache capacity + // TODO adjust cache capacity based on smoothstep (smooth gradient). + refresh_cache_capacity(); + + // step 5. Cancel top memory task when process memory exceed hard limit. + // TODO replace memory_gc_thread. + + // step 6. Refresh weighted memory ratio of workload groups. + doris::ExecEnv::GetInstance()->workload_group_mgr()->refresh_wg_weighted_memory_limit(); + + // step 7. Analyze blocking queries. + // TODO sort the operators that can spill, wake up the pipeline task spill + // or continue execution according to certain rules or cancel query. + + // step 8. Flush memtable + doris::GlobalMemoryArbitrator::notify_memtable_memory_refresh(); + // TODO notify flush memtable + + // step 9. Jemalloc purge all arena dirty pages + je_purge_dirty_pages(); } } @@ -301,10 +369,21 @@ void Daemon::memory_gc_thread() { void Daemon::memtable_memory_refresh_thread() { // Refresh the memory statistics of the load channel tracker more frequently, // which helps to accurately control the memory of LoadChannelMgr. - while (!_stop_background_threads_latch.wait_for( - std::chrono::milliseconds(config::memtable_mem_tracker_refresh_interval_ms))) { + do { + std::unique_lock l(doris::GlobalMemoryArbitrator::memtable_memory_refresh_lock); + while (_stop_background_threads_latch.count() != 0 && + !doris::GlobalMemoryArbitrator::memtable_memory_refresh_notify.load( + std::memory_order_relaxed)) { + doris::GlobalMemoryArbitrator::memtable_memory_refresh_cv.wait_for( + l, std::chrono::seconds(1)); + } + if (_stop_background_threads_latch.count() == 0) { + break; + } doris::ExecEnv::GetInstance()->memtable_memory_limiter()->refresh_mem_tracker(); - } + doris::GlobalMemoryArbitrator::memtable_memory_refresh_notify.store( + false, std::memory_order_relaxed); + } while (true); } /* @@ -396,6 +475,35 @@ void Daemon::je_purge_dirty_pages_thread() const { } while (true); } +void Daemon::cache_adjust_capacity_thread() { + do { + std::unique_lock l(doris::GlobalMemoryArbitrator::cache_adjust_capacity_lock); + while (_stop_background_threads_latch.count() != 0 && + !doris::GlobalMemoryArbitrator::cache_adjust_capacity_notify.load( + std::memory_order_relaxed)) { + doris::GlobalMemoryArbitrator::cache_adjust_capacity_cv.wait_for( + l, std::chrono::seconds(1)); + } + double adjust_weighted = GlobalMemoryArbitrator::last_cache_capacity_adjust_weighted; + if (_stop_background_threads_latch.count() == 0) { + break; + } + if (config::disable_memory_gc) { + continue; + } + std::unique_ptr profile = std::make_unique(""); + auto freed_mem = CacheManager::instance()->for_each_cache_refresh_capacity(adjust_weighted, + profile.get()); + std::stringstream ss; + profile->pretty_print(&ss); + LOG(INFO) << fmt::format( + "[MemoryGC] refresh cache capacity end, free memory {}, details: {}", + PrettyPrinter::print(freed_mem, TUnit::BYTES), ss.str()); + 
doris::GlobalMemoryArbitrator::cache_adjust_capacity_notify.store( + false, std::memory_order_relaxed); + } while (true); +} + void Daemon::cache_prune_stale_thread() { int32_t interval = config::cache_periodic_prune_stale_sweep_sec; while (!_stop_background_threads_latch.wait_for(std::chrono::seconds(interval))) { @@ -411,14 +519,6 @@ void Daemon::cache_prune_stale_thread() { } } -void Daemon::wg_weighted_memory_ratio_refresh_thread() { - // Refresh weighted memory ratio of workload groups - while (!_stop_background_threads_latch.wait_for( - std::chrono::milliseconds(config::wg_weighted_memory_ratio_refresh_interval_ms))) { - doris::ExecEnv::GetInstance()->workload_group_mgr()->refresh_wg_weighted_memory_limit(); - } -} - void Daemon::be_proc_monitor_thread() { while (!_stop_background_threads_latch.wait_for( std::chrono::milliseconds(config::be_proc_monitor_interval_ms))) { @@ -455,6 +555,10 @@ void Daemon::start() { "Daemon", "je_purge_dirty_pages_thread", [this]() { this->je_purge_dirty_pages_thread(); }, &_threads.emplace_back()); CHECK(st.ok()) << st; + st = Thread::create( + "Daemon", "cache_adjust_capacity_thread", + [this]() { this->cache_adjust_capacity_thread(); }, &_threads.emplace_back()); + CHECK(st.ok()) << st; st = Thread::create( "Daemon", "cache_prune_stale_thread", [this]() { this->cache_prune_stale_thread(); }, &_threads.emplace_back()); @@ -464,11 +568,6 @@ void Daemon::start() { [this]() { this->report_runtime_query_statistics_thread(); }, &_threads.emplace_back()); CHECK(st.ok()) << st; - st = Thread::create( - "Daemon", "wg_weighted_memory_ratio_refresh_thread", - [this]() { this->wg_weighted_memory_ratio_refresh_thread(); }, - &_threads.emplace_back()); - if (config::enable_be_proc_monitor) { st = Thread::create( "Daemon", "be_proc_monitor_thread", [this]() { this->be_proc_monitor_thread(); }, diff --git a/be/src/common/daemon.h b/be/src/common/daemon.h index 64c9f0c8993ae3..fe723877dcd027 100644 --- a/be/src/common/daemon.h +++ b/be/src/common/daemon.h @@ -43,9 +43,9 @@ class Daemon { void memtable_memory_refresh_thread(); void calculate_metrics_thread(); void je_purge_dirty_pages_thread() const; + void cache_adjust_capacity_thread(); void cache_prune_stale_thread(); void report_runtime_query_statistics_thread(); - void wg_weighted_memory_ratio_refresh_thread(); void be_proc_monitor_thread(); CountDownLatch _stop_background_threads_latch; diff --git a/be/src/olap/lru_cache.cpp b/be/src/olap/lru_cache.cpp index 741c2423915ede..6e5bb2fa31578f 100644 --- a/be/src/olap/lru_cache.cpp +++ b/be/src/olap/lru_cache.cpp @@ -177,6 +177,51 @@ LRUCache::~LRUCache() { prune(); } +PrunedInfo LRUCache::set_capacity(size_t capacity) { + LRUHandle* last_ref_list = nullptr; + { + std::lock_guard l(_mutex); + _capacity = capacity; + _evict_from_lru(0, &last_ref_list); + } + + int64_t pruned_count = 0; + int64_t pruned_size = 0; + while (last_ref_list != nullptr) { + ++pruned_count; + pruned_size += last_ref_list->total_size; + LRUHandle* next = last_ref_list->next; + last_ref_list->free(); + last_ref_list = next; + } + return {pruned_count, pruned_size}; +} + +uint64_t LRUCache::get_lookup_count() { + std::lock_guard l(_mutex); + return _lookup_count; +} + +uint64_t LRUCache::get_hit_count() { + std::lock_guard l(_mutex); + return _hit_count; +} + +size_t LRUCache::get_usage() { + std::lock_guard l(_mutex); + return _usage; +} + +size_t LRUCache::get_capacity() { + std::lock_guard l(_mutex); + return _capacity; +} + +size_t LRUCache::get_element_count() { + std::lock_guard l(_mutex); 
+ return _table.element_count(); +} + bool LRUCache::_unref(LRUHandle* e) { DCHECK(e->refs > 0); e->refs--; @@ -515,19 +560,19 @@ inline uint32_t ShardedLRUCache::_hash_slice(const CacheKey& s) { return s.hash(s.data(), s.size(), 0); } -ShardedLRUCache::ShardedLRUCache(const std::string& name, size_t total_capacity, LRUCacheType type, +ShardedLRUCache::ShardedLRUCache(const std::string& name, size_t capacity, LRUCacheType type, uint32_t num_shards, uint32_t total_element_count_capacity) : _name(name), _num_shard_bits(Bits::FindLSBSetNonZero(num_shards)), _num_shards(num_shards), _shards(nullptr), _last_id(1), - _total_capacity(total_capacity) { + _capacity(capacity) { CHECK(num_shards > 0) << "num_shards cannot be 0"; CHECK_EQ((num_shards & (num_shards - 1)), 0) << "num_shards should be power of two, but got " << num_shards; - const size_t per_shard = (total_capacity + (_num_shards - 1)) / _num_shards; + const size_t per_shard = (capacity + (_num_shards - 1)) / _num_shards; const size_t per_shard_element_count_capacity = (total_element_count_capacity + (_num_shards - 1)) / _num_shards; LRUCache** shards = new (std::nothrow) LRUCache*[_num_shards]; @@ -557,12 +602,12 @@ ShardedLRUCache::ShardedLRUCache(const std::string& name, size_t total_capacity, "doris_cache", _name + "_persecond", _lookup_count_bvar.get(), 60)); } -ShardedLRUCache::ShardedLRUCache(const std::string& name, size_t total_capacity, LRUCacheType type, +ShardedLRUCache::ShardedLRUCache(const std::string& name, size_t capacity, LRUCacheType type, uint32_t num_shards, CacheValueTimeExtractor cache_value_time_extractor, bool cache_value_check_timestamp, uint32_t total_element_count_capacity) - : ShardedLRUCache(name, total_capacity, type, num_shards, total_element_count_capacity) { + : ShardedLRUCache(name, capacity, type, num_shards, total_element_count_capacity) { for (int s = 0; s < _num_shards; s++) { _shards[s]->set_cache_value_time_extractor(cache_value_time_extractor); _shards[s]->set_cache_value_check_timestamp(cache_value_check_timestamp); @@ -580,6 +625,24 @@ ShardedLRUCache::~ShardedLRUCache() { } } +PrunedInfo ShardedLRUCache::set_capacity(size_t capacity) { + std::lock_guard l(_mutex); + PrunedInfo pruned_info; + const size_t per_shard = (capacity + (_num_shards - 1)) / _num_shards; + for (int s = 0; s < _num_shards; s++) { + PrunedInfo info = _shards[s]->set_capacity(per_shard); + pruned_info.pruned_count += info.pruned_count; + pruned_info.pruned_size += info.pruned_size; + } + _capacity = capacity; + return pruned_info; +} + +size_t ShardedLRUCache::get_capacity() { + std::lock_guard l(_mutex); + return _capacity; +} + Cache::Handle* ShardedLRUCache::insert(const CacheKey& key, void* value, size_t charge, CachePriority priority) { const uint32_t hash = _hash_slice(key); @@ -638,25 +701,25 @@ int64_t ShardedLRUCache::get_usage() { } void ShardedLRUCache::update_cache_metrics() const { - size_t total_capacity = 0; + size_t capacity = 0; size_t total_usage = 0; size_t total_lookup_count = 0; size_t total_hit_count = 0; size_t total_element_count = 0; for (int i = 0; i < _num_shards; i++) { - total_capacity += _shards[i]->get_capacity(); + capacity += _shards[i]->get_capacity(); total_usage += _shards[i]->get_usage(); total_lookup_count += _shards[i]->get_lookup_count(); total_hit_count += _shards[i]->get_hit_count(); total_element_count += _shards[i]->get_element_count(); } - cache_capacity->set_value(total_capacity); + cache_capacity->set_value(capacity); cache_usage->set_value(total_usage); 
cache_element_count->set_value(total_element_count); cache_lookup_count->set_value(total_lookup_count); cache_hit_count->set_value(total_hit_count); - cache_usage_ratio->set_value(total_capacity == 0 ? 0 : ((double)total_usage / total_capacity)); + cache_usage_ratio->set_value(capacity == 0 ? 0 : ((double)total_usage / capacity)); cache_hit_ratio->set_value( total_lookup_count == 0 ? 0 : ((double)total_hit_count / total_lookup_count)); } diff --git a/be/src/olap/lru_cache.h b/be/src/olap/lru_cache.h index 059020deab58f5..de7084382d7398 100644 --- a/be/src/olap/lru_cache.h +++ b/be/src/olap/lru_cache.h @@ -227,7 +227,8 @@ class Cache { virtual int64_t get_usage() = 0; - virtual size_t get_total_capacity() = 0; + virtual PrunedInfo set_capacity(size_t capacity) = 0; + virtual size_t get_capacity() = 0; private: DISALLOW_COPY_AND_ASSIGN(Cache); @@ -327,7 +328,7 @@ class LRUCache { ~LRUCache(); // Separate from constructor so caller can easily make an array of LRUCache - void set_capacity(size_t capacity) { _capacity = capacity; } + PrunedInfo set_capacity(size_t capacity); void set_element_count_capacity(uint32_t element_count_capacity) { _element_count_capacity = element_count_capacity; } @@ -345,11 +346,11 @@ class LRUCache { void set_cache_value_time_extractor(CacheValueTimeExtractor cache_value_time_extractor); void set_cache_value_check_timestamp(bool cache_value_check_timestamp); - uint64_t get_lookup_count() const { return _lookup_count; } - uint64_t get_hit_count() const { return _hit_count; } - size_t get_usage() const { return _usage; } - size_t get_capacity() const { return _capacity; } - size_t get_element_count() const { return _table.element_count(); } + uint64_t get_lookup_count(); + uint64_t get_hit_count(); + size_t get_usage(); + size_t get_capacity(); + size_t get_element_count(); private: void _lru_remove(LRUHandle* e); @@ -403,15 +404,16 @@ class ShardedLRUCache : public Cache { PrunedInfo prune() override; PrunedInfo prune_if(CachePrunePredicate pred, bool lazy_mode = false) override; int64_t get_usage() override; - size_t get_total_capacity() override { return _total_capacity; }; + PrunedInfo set_capacity(size_t capacity) override; + size_t get_capacity() override; private: // LRUCache can only be created and managed with LRUCachePolicy. 
friend class LRUCachePolicy; - explicit ShardedLRUCache(const std::string& name, size_t total_capacity, LRUCacheType type, + explicit ShardedLRUCache(const std::string& name, size_t capacity, LRUCacheType type, uint32_t num_shards, uint32_t element_count_capacity); - explicit ShardedLRUCache(const std::string& name, size_t total_capacity, LRUCacheType type, + explicit ShardedLRUCache(const std::string& name, size_t capacity, LRUCacheType type, uint32_t num_shards, CacheValueTimeExtractor cache_value_time_extractor, bool cache_value_check_timestamp, uint32_t element_count_capacity); @@ -429,7 +431,8 @@ class ShardedLRUCache : public Cache { const uint32_t _num_shards; LRUCache** _shards = nullptr; std::atomic _last_id; - size_t _total_capacity; + std::mutex _mutex; + size_t _capacity {0}; std::shared_ptr _entity; IntGauge* cache_capacity = nullptr; @@ -462,7 +465,8 @@ class DummyLRUCache : public Cache { return {0, 0}; }; int64_t get_usage() override { return 0; }; - size_t get_total_capacity() override { return 0; }; + PrunedInfo set_capacity(size_t capacity) override { return {0, 0}; }; + size_t get_capacity() override { return 0; }; }; } // namespace doris diff --git a/be/src/runtime/memory/cache_manager.cpp b/be/src/runtime/memory/cache_manager.cpp index a6516c40a35770..ec57ffba50d318 100644 --- a/be/src/runtime/memory/cache_manager.cpp +++ b/be/src/runtime/memory/cache_manager.cpp @@ -59,11 +59,26 @@ int64_t CacheManager::for_each_cache_prune_all(RuntimeProfile* profile, bool for int64_t CacheManager::cache_prune_all(CachePolicy::CacheType type, bool force) { std::lock_guard l(_caches_lock); auto* cache_policy = _caches[type]; - if (!cache_policy->enable_prune()) { - return -1; - } cache_policy->prune_all(force); return cache_policy->profile()->get_counter("FreedMemory")->value(); } +int64_t CacheManager::for_each_cache_refresh_capacity(double adjust_weighted, + RuntimeProfile* profile) { + int64_t freed_size = 0; + std::lock_guard l(_caches_lock); + for (const auto& pair : _caches) { + auto* cache_policy = pair.second; + if (!cache_policy->enable_prune()) { + continue; + } + cache_policy->adjust_capacity_weighted(adjust_weighted); + freed_size += cache_policy->profile()->get_counter("FreedMemory")->value(); + if (cache_policy->profile()->get_counter("FreedMemory")->value() != 0 && profile) { + profile->add_child(cache_policy->profile(), true, nullptr); + } + } + return freed_size; +} + } // namespace doris diff --git a/be/src/runtime/memory/cache_manager.h b/be/src/runtime/memory/cache_manager.h index d94dca501670bf..a2a089b929dbdf 100644 --- a/be/src/runtime/memory/cache_manager.h +++ b/be/src/runtime/memory/cache_manager.h @@ -81,6 +81,9 @@ class CacheManager { return false; } + int64_t for_each_cache_refresh_capacity(double adjust_weighted, + RuntimeProfile* profile = nullptr); + private: std::mutex _caches_lock; std::unordered_map _caches; diff --git a/be/src/runtime/memory/cache_policy.cpp b/be/src/runtime/memory/cache_policy.cpp index 4e50d64d88eed1..46b9db1b35ad5f 100644 --- a/be/src/runtime/memory/cache_policy.cpp +++ b/be/src/runtime/memory/cache_policy.cpp @@ -21,8 +21,12 @@ namespace doris { -CachePolicy::CachePolicy(CacheType type, uint32_t stale_sweep_time_s, bool enable_prune) - : _type(type), _stale_sweep_time_s(stale_sweep_time_s), _enable_prune(enable_prune) { +CachePolicy::CachePolicy(CacheType type, size_t capacity, uint32_t stale_sweep_time_s, + bool enable_prune) + : _type(type), + _initial_capacity(capacity), + _stale_sweep_time_s(stale_sweep_time_s), + 
_enable_prune(enable_prune) { CacheManager::instance()->register_cache(this); init_profile(); } diff --git a/be/src/runtime/memory/cache_policy.h b/be/src/runtime/memory/cache_policy.h index c457afd86898f2..c43ca0b2fb7e0a 100644 --- a/be/src/runtime/memory/cache_policy.h +++ b/be/src/runtime/memory/cache_policy.h @@ -17,13 +17,12 @@ #pragma once -#include "runtime/exec_env.h" #include "util/runtime_profile.h" namespace doris { -static constexpr int32_t CACHE_MIN_FREE_SIZE = 67108864; // 64M -static constexpr int32_t CACHE_MIN_FREE_NUMBER = 1024; +static constexpr int32_t CACHE_MIN_PRUNE_SIZE = 67108864; // 64M +static constexpr int32_t CACHE_MIN_PRUNE_NUMBER = 1024; // Base of all caches. register to CacheManager when cache is constructed. class CachePolicy { @@ -42,12 +41,13 @@ class CachePolicy { TABLET_VERSION_CACHE = 10, LAST_SUCCESS_CHANNEL_CACHE = 11, COMMON_OBJ_LRU_CACHE = 12, - FOR_UT = 13, + FOR_UT_CACHE_SIZE = 13, TABLET_SCHEMA_CACHE = 14, CREATE_TABLET_RR_IDX_CACHE = 15, CLOUD_TABLET_CACHE = 16, CLOUD_TXN_DELETE_BITMAP_CACHE = 17, NONE = 18, // not be used + FOR_UT_CACHE_NUMBER = 19, }; static std::string type_string(CacheType type) { @@ -78,8 +78,8 @@ class CachePolicy { return "LastSuccessChannelCache"; case CacheType::COMMON_OBJ_LRU_CACHE: return "CommonObjLRUCache"; - case CacheType::FOR_UT: - return "ForUT"; + case CacheType::FOR_UT_CACHE_SIZE: + return "ForUTCacheSize"; case CacheType::TABLET_SCHEMA_CACHE: return "TabletSchemaCache"; case CacheType::CREATE_TABLET_RR_IDX_CACHE: @@ -88,6 +88,8 @@ class CachePolicy { return "CloudTabletCache"; case CacheType::CLOUD_TXN_DELETE_BITMAP_CACHE: return "CloudTxnDeleteBitmapCache"; + case CacheType::FOR_UT_CACHE_NUMBER: + return "ForUTCacheNumber"; default: LOG(FATAL) << "not match type of cache policy :" << static_cast(type); } @@ -109,11 +111,12 @@ class CachePolicy { {"MowTabletVersionCache", CacheType::TABLET_VERSION_CACHE}, {"LastSuccessChannelCache", CacheType::LAST_SUCCESS_CHANNEL_CACHE}, {"CommonObjLRUCache", CacheType::COMMON_OBJ_LRU_CACHE}, - {"ForUT", CacheType::FOR_UT}, + {"ForUTCacheSize", CacheType::FOR_UT_CACHE_SIZE}, {"TabletSchemaCache", CacheType::TABLET_SCHEMA_CACHE}, {"CreateTabletRRIdxCache", CacheType::CREATE_TABLET_RR_IDX_CACHE}, {"CloudTabletCache", CacheType::CLOUD_TABLET_CACHE}, - {"CloudTxnDeleteBitmapCache", CacheType::CLOUD_TXN_DELETE_BITMAP_CACHE}}; + {"CloudTxnDeleteBitmapCache", CacheType::CLOUD_TXN_DELETE_BITMAP_CACHE}, + {"ForUTCacheNumber", CacheType::FOR_UT_CACHE_NUMBER}}; static CacheType string_to_type(std::string type) { if (StringToType.contains(type)) { @@ -123,13 +126,16 @@ class CachePolicy { } } - CachePolicy(CacheType type, uint32_t stale_sweep_time_s, bool enable_prune); + CachePolicy(CacheType type, size_t capacity, uint32_t stale_sweep_time_s, bool enable_prune); virtual ~CachePolicy(); virtual void prune_stale() = 0; virtual void prune_all(bool force) = 0; + virtual int64_t adjust_capacity_weighted(double adjust_weighted) = 0; + virtual size_t get_capacity() = 0; CacheType type() { return _type; } + size_t initial_capacity() const { return _initial_capacity; } bool enable_prune() const { return _enable_prune; } RuntimeProfile* profile() { return _profile.get(); } @@ -139,16 +145,20 @@ class CachePolicy { std::make_unique(fmt::format("Cache type={}", type_string(_type))); _prune_stale_number_counter = ADD_COUNTER(_profile, "PruneStaleNumber", TUnit::UNIT); _prune_all_number_counter = ADD_COUNTER(_profile, "PruneAllNumber", TUnit::UNIT); + _adjust_capacity_weighted_number_counter = + 
ADD_COUNTER(_profile, "SetCapacityNumber", TUnit::UNIT); _freed_memory_counter = ADD_COUNTER(_profile, "FreedMemory", TUnit::BYTES); _freed_entrys_counter = ADD_COUNTER(_profile, "FreedEntrys", TUnit::UNIT); _cost_timer = ADD_TIMER(_profile, "CostTime"); } CacheType _type; + size_t _initial_capacity {0}; std::unique_ptr _profile; RuntimeProfile::Counter* _prune_stale_number_counter = nullptr; RuntimeProfile::Counter* _prune_all_number_counter = nullptr; + RuntimeProfile::Counter* _adjust_capacity_weighted_number_counter = nullptr; // Reset before each gc RuntimeProfile::Counter* _freed_memory_counter = nullptr; RuntimeProfile::Counter* _freed_entrys_counter = nullptr; diff --git a/be/src/runtime/memory/global_memory_arbitrator.cpp b/be/src/runtime/memory/global_memory_arbitrator.cpp index 344bcbc59846d9..76a414a6ebdc74 100644 --- a/be/src/runtime/memory/global_memory_arbitrator.cpp +++ b/be/src/runtime/memory/global_memory_arbitrator.cpp @@ -38,6 +38,13 @@ bvar::PassiveStatus g_sys_mem_avail( std::atomic GlobalMemoryArbitrator::_s_process_reserved_memory = 0; std::atomic GlobalMemoryArbitrator::refresh_interval_memory_growth = 0; +std::mutex GlobalMemoryArbitrator::cache_adjust_capacity_lock; +std::condition_variable GlobalMemoryArbitrator::cache_adjust_capacity_cv; +std::atomic GlobalMemoryArbitrator::cache_adjust_capacity_notify {false}; +std::atomic GlobalMemoryArbitrator::last_cache_capacity_adjust_weighted {1}; +std::mutex GlobalMemoryArbitrator::memtable_memory_refresh_lock; +std::condition_variable GlobalMemoryArbitrator::memtable_memory_refresh_cv; +std::atomic GlobalMemoryArbitrator::memtable_memory_refresh_notify {false}; bool GlobalMemoryArbitrator::try_reserve_process_memory(int64_t bytes) { if (sys_mem_available() - bytes < MemInfo::sys_mem_available_warning_water_mark()) { diff --git a/be/src/runtime/memory/global_memory_arbitrator.h b/be/src/runtime/memory/global_memory_arbitrator.h index f8fda18d0e9a0c..5fbcf232ce4d24 100644 --- a/be/src/runtime/memory/global_memory_arbitrator.h +++ b/be/src/runtime/memory/global_memory_arbitrator.h @@ -173,6 +173,23 @@ class GlobalMemoryArbitrator { // avoid multiple threads starting at the same time and causing OOM. 
static std::atomic refresh_interval_memory_growth; + static std::mutex cache_adjust_capacity_lock; + static std::condition_variable cache_adjust_capacity_cv; + static std::atomic cache_adjust_capacity_notify; + static std::atomic last_cache_capacity_adjust_weighted; + static void notify_cache_adjust_capacity() { + cache_adjust_capacity_notify.store(true, std::memory_order_relaxed); + cache_adjust_capacity_cv.notify_all(); + } + + static std::mutex memtable_memory_refresh_lock; + static std::condition_variable memtable_memory_refresh_cv; + static std::atomic memtable_memory_refresh_notify; + static void notify_memtable_memory_refresh() { + memtable_memory_refresh_notify.store(true, std::memory_order_relaxed); + memtable_memory_refresh_cv.notify_all(); + } + private: static std::atomic _s_process_reserved_memory; diff --git a/be/src/runtime/memory/lru_cache_policy.h b/be/src/runtime/memory/lru_cache_policy.h index 1b6c9ead6d0086..419825c85c4538 100644 --- a/be/src/runtime/memory/lru_cache_policy.h +++ b/be/src/runtime/memory/lru_cache_policy.h @@ -37,7 +37,8 @@ class LRUCachePolicy : public CachePolicy { uint32_t stale_sweep_time_s, uint32_t num_shards = DEFAULT_LRU_CACHE_NUM_SHARDS, uint32_t element_count_capacity = DEFAULT_LRU_CACHE_ELEMENT_COUNT_CAPACITY, bool enable_prune = true) - : CachePolicy(type, stale_sweep_time_s, enable_prune), _lru_cache_type(lru_cache_type) { + : CachePolicy(type, capacity, stale_sweep_time_s, enable_prune), + _lru_cache_type(lru_cache_type) { if (check_capacity(capacity, num_shards)) { _cache = std::shared_ptr( new ShardedLRUCache(type_string(type), capacity, lru_cache_type, num_shards, @@ -53,7 +54,8 @@ class LRUCachePolicy : public CachePolicy { uint32_t element_count_capacity, CacheValueTimeExtractor cache_value_time_extractor, bool cache_value_check_timestamp, bool enable_prune = true) - : CachePolicy(type, stale_sweep_time_s, enable_prune), _lru_cache_type(lru_cache_type) { + : CachePolicy(type, capacity, stale_sweep_time_s, enable_prune), + _lru_cache_type(lru_cache_type) { if (check_capacity(capacity, num_shards)) { _cache = std::shared_ptr( new ShardedLRUCache(type_string(type), capacity, lru_cache_type, num_shards, @@ -106,18 +108,19 @@ class LRUCachePolicy : public CachePolicy { int64_t get_usage() { return _cache->get_usage(); } - size_t get_total_capacity() { return _cache->get_total_capacity(); } + size_t get_capacity() override { return _cache->get_capacity(); } uint64_t new_id() { return _cache->new_id(); }; // Subclass can override this method to determine whether to do the minor or full gc virtual bool exceed_prune_limit() { - return _lru_cache_type == LRUCacheType::SIZE ? mem_consumption() > CACHE_MIN_FREE_SIZE - : get_usage() > CACHE_MIN_FREE_NUMBER; + return _lru_cache_type == LRUCacheType::SIZE ? mem_consumption() > CACHE_MIN_PRUNE_SIZE + : get_usage() > CACHE_MIN_PRUNE_NUMBER; } // Try to prune the cache if expired. 
void prune_stale() override { + std::lock_guard l(_lock); COUNTER_SET(_freed_entrys_counter, (int64_t)0); COUNTER_SET(_freed_memory_counter, (int64_t)0); if (_stale_sweep_time_s <= 0 && _cache == ExecEnv::GetInstance()->get_dummy_lru_cache()) { @@ -125,7 +128,6 @@ class LRUCachePolicy : public CachePolicy { } if (exceed_prune_limit()) { COUNTER_SET(_cost_timer, (int64_t)0); - SCOPED_TIMER(_cost_timer); const int64_t curtime = UnixMillis(); auto pred = [this, curtime](const LRUHandle* handle) -> bool { return static_cast((handle->last_visit_time + _stale_sweep_time_s * 1000) < @@ -134,33 +136,38 @@ class LRUCachePolicy : public CachePolicy { LOG(INFO) << fmt::format("[MemoryGC] {} prune stale start, consumption {}, usage {}", type_string(_type), mem_consumption(), get_usage()); - // Prune cache in lazy mode to save cpu and minimize the time holding write lock - PrunedInfo pruned_info = _cache->prune_if(pred, true); - COUNTER_SET(_freed_entrys_counter, pruned_info.pruned_count); - COUNTER_SET(_freed_memory_counter, pruned_info.pruned_size); + { + SCOPED_TIMER(_cost_timer); + // Prune cache in lazy mode to save cpu and minimize the time holding write lock + PrunedInfo pruned_info = _cache->prune_if(pred, true); + COUNTER_SET(_freed_entrys_counter, pruned_info.pruned_count); + COUNTER_SET(_freed_memory_counter, pruned_info.pruned_size); + } COUNTER_UPDATE(_prune_stale_number_counter, 1); LOG(INFO) << fmt::format( - "[MemoryGC] {} prune stale {} entries, {} bytes, {} times prune", + "[MemoryGC] {} prune stale {} entries, {} bytes, cost {}, {} times prune", type_string(_type), _freed_entrys_counter->value(), - _freed_memory_counter->value(), _prune_stale_number_counter->value()); + _freed_memory_counter->value(), _cost_timer->value(), + _prune_stale_number_counter->value()); } else { if (_lru_cache_type == LRUCacheType::SIZE) { LOG(INFO) << fmt::format( "[MemoryGC] {} not need prune stale, LRUCacheType::SIZE consumption {} " "less " - "than CACHE_MIN_FREE_SIZE {}", - type_string(_type), mem_consumption(), CACHE_MIN_FREE_SIZE); + "than CACHE_MIN_PRUNE_SIZE {}", + type_string(_type), mem_consumption(), CACHE_MIN_PRUNE_SIZE); } else if (_lru_cache_type == LRUCacheType::NUMBER) { LOG(INFO) << fmt::format( "[MemoryGC] {} not need prune stale, LRUCacheType::NUMBER usage {} less " "than " - "CACHE_MIN_FREE_NUMBER {}", - type_string(_type), get_usage(), CACHE_MIN_FREE_NUMBER); + "CACHE_MIN_PRUNE_NUMBER {}", + type_string(_type), get_usage(), CACHE_MIN_PRUNE_NUMBER); } } } void prune_all(bool force) override { + std::lock_guard l(_lock); COUNTER_SET(_freed_entrys_counter, (int64_t)0); COUNTER_SET(_freed_memory_counter, (int64_t)0); if (_cache == ExecEnv::GetInstance()->get_dummy_lru_cache()) { @@ -168,37 +175,73 @@ class LRUCachePolicy : public CachePolicy { } if ((force && mem_consumption() != 0) || exceed_prune_limit()) { COUNTER_SET(_cost_timer, (int64_t)0); - SCOPED_TIMER(_cost_timer); LOG(INFO) << fmt::format("[MemoryGC] {} prune all start, consumption {}, usage {}", type_string(_type), mem_consumption(), get_usage()); - PrunedInfo pruned_info = _cache->prune(); - COUNTER_SET(_freed_entrys_counter, pruned_info.pruned_count); - COUNTER_SET(_freed_memory_counter, pruned_info.pruned_size); + { + SCOPED_TIMER(_cost_timer); + PrunedInfo pruned_info = _cache->prune(); + COUNTER_SET(_freed_entrys_counter, pruned_info.pruned_count); + COUNTER_SET(_freed_memory_counter, pruned_info.pruned_size); + } COUNTER_UPDATE(_prune_all_number_counter, 1); LOG(INFO) << fmt::format( - "[MemoryGC] {} prune all {} entries, 
{} bytes, {} times prune, is force: {}", + "[MemoryGC] {} prune all {} entries, {} bytes, cost {}, {} times prune, is " + "force: {}", type_string(_type), _freed_entrys_counter->value(), - _freed_memory_counter->value(), _prune_all_number_counter->value(), force); + _freed_memory_counter->value(), _cost_timer->value(), + _prune_all_number_counter->value(), force); } else { if (_lru_cache_type == LRUCacheType::SIZE) { LOG(INFO) << fmt::format( "[MemoryGC] {} not need prune all, force is {}, LRUCacheType::SIZE " "consumption {}, " - "CACHE_MIN_FREE_SIZE {}", - type_string(_type), force, mem_consumption(), CACHE_MIN_FREE_SIZE); + "CACHE_MIN_PRUNE_SIZE {}", + type_string(_type), force, mem_consumption(), CACHE_MIN_PRUNE_SIZE); } else if (_lru_cache_type == LRUCacheType::NUMBER) { LOG(INFO) << fmt::format( "[MemoryGC] {} not need prune all, force is {}, LRUCacheType::NUMBER " - "usage {}, CACHE_MIN_FREE_NUMBER {}", - type_string(_type), force, get_usage(), CACHE_MIN_FREE_NUMBER); + "usage {}, CACHE_MIN_PRUNE_NUMBER {}", + type_string(_type), force, get_usage(), CACHE_MIN_PRUNE_NUMBER); } } } + int64_t adjust_capacity_weighted(double adjust_weighted) override { + std::lock_guard l(_lock); + auto capacity = static_cast(_initial_capacity * adjust_weighted); + COUNTER_SET(_freed_entrys_counter, (int64_t)0); + COUNTER_SET(_freed_memory_counter, (int64_t)0); + COUNTER_SET(_cost_timer, (int64_t)0); + if (_cache == ExecEnv::GetInstance()->get_dummy_lru_cache()) { + return 0; + } + + size_t old_capacity = get_capacity(); + int64_t old_mem_consumption = mem_consumption(); + int64_t old_usage = get_usage(); + { + SCOPED_TIMER(_cost_timer); + PrunedInfo pruned_info = _cache->set_capacity(capacity); + COUNTER_SET(_freed_entrys_counter, pruned_info.pruned_count); + COUNTER_SET(_freed_memory_counter, pruned_info.pruned_size); + } + COUNTER_UPDATE(_adjust_capacity_weighted_number_counter, 1); + LOG(INFO) << fmt::format( + "[MemoryGC] {} update capacity, old , " + "adjust_weighted {}, new , prune {} " + "entries, {} bytes, cost {}, {} times prune", + type_string(_type), old_capacity, old_mem_consumption, old_usage, adjust_weighted, + get_capacity(), mem_consumption(), get_usage(), _freed_entrys_counter->value(), + _freed_memory_counter->value(), _cost_timer->value(), + _adjust_capacity_weighted_number_counter->value()); + return _freed_entrys_counter->value(); + } + protected: // if check_capacity failed, will return dummy lru cache, // compatible with ShardedLRUCache usage, but will not actually cache. 
std::shared_ptr _cache; + std::mutex _lock; LRUCacheType _lru_cache_type; }; diff --git a/be/src/runtime/memory/mem_tracker_limiter.cpp b/be/src/runtime/memory/mem_tracker_limiter.cpp index a8aa44414ebf87..59546b11d51a8a 100644 --- a/be/src/runtime/memory/mem_tracker_limiter.cpp +++ b/be/src/runtime/memory/mem_tracker_limiter.cpp @@ -739,10 +739,10 @@ int64_t MemTrackerLimiter::free_top_overcommit_query( LOG(INFO) << log_prefix << "finished, no task need be canceled."; return 0; } - if (query_consumption.size() == 1) { + if (small_num == 0 && canceling_task.empty() && query_consumption.size() == 1) { auto iter = query_consumption.begin(); - LOG(INFO) << log_prefix << "finished, only one task: " << iter->first - << ", memory consumption: " << iter->second << ", no cancel."; + LOG(INFO) << log_prefix << "finished, only one overcommit task: " << iter->first + << ", memory consumption: " << iter->second << ", no other tasks, so no cancel."; return 0; } diff --git a/be/src/runtime/memory/memory_reclamation.cpp b/be/src/runtime/memory/memory_reclamation.cpp index 3adf1d1ac75718..17f5a41f462b50 100644 --- a/be/src/runtime/memory/memory_reclamation.cpp +++ b/be/src/runtime/memory/memory_reclamation.cpp @@ -37,7 +37,6 @@ bool MemoryReclamation::process_minor_gc(std::string mem_info) { std::unique_ptr profile = std::make_unique(""); Defer defer {[&]() { - MemInfo::notify_je_purge_dirty_pages(); std::stringstream ss; profile->pretty_print(&ss); LOG(INFO) << fmt::format( @@ -46,11 +45,6 @@ bool MemoryReclamation::process_minor_gc(std::string mem_info) { ss.str()); }}; - freed_mem += CacheManager::instance()->for_each_cache_prune_stale(profile.get()); - if (freed_mem > MemInfo::process_minor_gc_size()) { - return true; - } - if (config::enable_workload_group_memory_gc) { RuntimeProfile* tg_profile = profile->create_child("WorkloadGroup", true, true); freed_mem += tg_enable_overcommit_group_gc(MemInfo::process_minor_gc_size() - freed_mem, @@ -87,7 +81,6 @@ bool MemoryReclamation::process_full_gc(std::string mem_info) { std::unique_ptr profile = std::make_unique(""); Defer defer {[&]() { - MemInfo::notify_je_purge_dirty_pages(); std::stringstream ss; profile->pretty_print(&ss); LOG(INFO) << fmt::format( @@ -96,11 +89,6 @@ bool MemoryReclamation::process_full_gc(std::string mem_info) { ss.str()); }}; - freed_mem += CacheManager::instance()->for_each_cache_prune_all(profile.get()); - if (freed_mem > MemInfo::process_full_gc_size()) { - return true; - } - if (config::enable_workload_group_memory_gc) { RuntimeProfile* tg_profile = profile->create_child("WorkloadGroup", true, true); freed_mem += tg_enable_overcommit_group_gc(MemInfo::process_full_gc_size() - freed_mem, diff --git a/be/src/service/point_query_executor.h b/be/src/service/point_query_executor.h index 19954479c97ec7..7503fd2c102a68 100644 --- a/be/src/service/point_query_executor.h +++ b/be/src/service/point_query_executor.h @@ -246,8 +246,8 @@ class LookupConnectionCache : public LRUCachePolicyTrackingManual { auto* value = new CacheValue; value->item = item; LOG(INFO) << "Add item mem" - << ", cache_capacity: " << get_total_capacity() - << ", cache_usage: " << get_usage() << ", mem_consum: " << mem_consumption(); + << ", cache_capacity: " << get_capacity() << ", cache_usage: " << get_usage() + << ", mem_consum: " << mem_consumption(); auto* lru_handle = insert(key, value, 1, sizeof(Reusable), CachePriority::NORMAL); release(lru_handle); } diff --git a/be/src/vec/common/allocator.cpp b/be/src/vec/common/allocator.cpp index 
dff1330888f82d..2619c0bafffb16 100644 --- a/be/src/vec/common/allocator.cpp +++ b/be/src/vec/common/allocator.cpp @@ -106,9 +106,6 @@ void Allocator::sys_mem return; } - // no significant impact on performance is expected. - doris::MemInfo::notify_je_purge_dirty_pages(); - if (doris::thread_context()->thread_mem_tracker_mgr->is_attach_query() && doris::thread_context()->thread_mem_tracker_mgr->wait_gc()) { int64_t wait_milliseconds = 0; diff --git a/be/test/olap/lru_cache_test.cpp b/be/test/olap/lru_cache_test.cpp index 4fc096380c754b..9adb30b93054f4 100644 --- a/be/test/olap/lru_cache_test.cpp +++ b/be/test/olap/lru_cache_test.cpp @@ -88,25 +88,46 @@ class CacheTest : public testing::Test { void* value; }; - class CacheTestPolicy : public LRUCachePolicyTrackingManual { + class CacheTestSizePolicy : public LRUCachePolicyTrackingManual { public: - CacheTestPolicy(size_t capacity) - : LRUCachePolicyTrackingManual(CachePolicy::CacheType::FOR_UT, capacity, + CacheTestSizePolicy(size_t capacity) + : LRUCachePolicyTrackingManual(CachePolicy::CacheType::FOR_UT_CACHE_SIZE, capacity, LRUCacheType::SIZE, -1) {} }; + class CacheTestNumberPolicy : public LRUCachePolicyTrackingManual { + public: + CacheTestNumberPolicy(size_t capacity, uint32_t num_shards) + : LRUCachePolicyTrackingManual(CachePolicy::CacheType::FOR_UT_CACHE_NUMBER, + capacity, LRUCacheType::NUMBER, -1, num_shards) {} + }; + // there is 16 shards in ShardedLRUCache // And the LRUHandle size is about 100B. So the cache size should big enough // to run the UT. static const int kCacheSize = 1000 * 16; std::vector _deleted_keys; std::vector _deleted_values; - CacheTestPolicy* _cache; + LRUCachePolicy* _cache = nullptr; - CacheTest() : _cache(new CacheTestPolicy(kCacheSize)) { _s_current = this; } + CacheTest() { _s_current = this; } ~CacheTest() override { delete _cache; } + void init_size_cache(size_t capacity = kCacheSize) { + if (_cache != nullptr) { + delete _cache; + } + _cache = new CacheTestSizePolicy(capacity); + } + + void init_number_cache(size_t capacity = kCacheSize, uint32_t num_shards = 1) { + if (_cache != nullptr) { + delete _cache; + } + _cache = new CacheTestNumberPolicy(capacity, num_shards); + } + LRUCachePolicy* cache() const { return _cache; } int Lookup(int key) const { @@ -149,7 +170,25 @@ class CacheTest : public testing::Test { }; CacheTest* CacheTest::_s_current; +static void insert_LRUCache(LRUCache& cache, const CacheKey& key, int value, + CachePriority priority) { + uint32_t hash = key.hash(key.data(), key.size(), 0); + auto* cache_value = new CacheTest::CacheValue(EncodeValue(value)); + cache.release(cache.insert(key, hash, cache_value, value, priority)); +} + +static void insert_number_LRUCache(LRUCache& cache, const CacheKey& key, int value, int charge, + CachePriority priority) { + uint32_t hash = key.hash(key.data(), key.size(), 0); + auto* cache_value = new CacheTest::CacheValue(EncodeValue(value)); + cache.release(cache.insert(key, hash, cache_value, charge, priority)); +} + +// https://stackoverflow.com/questions/42756443/undefined-reference-with-gtest +const int CacheTest::kCacheSize; + TEST_F(CacheTest, HitAndMiss) { + init_size_cache(); EXPECT_EQ(-1, Lookup(100)); Insert(100, 101, 1); @@ -173,6 +212,7 @@ TEST_F(CacheTest, HitAndMiss) { } TEST_F(CacheTest, Erase) { + init_size_cache(); Erase(200); EXPECT_EQ(0, _deleted_keys.size()); @@ -192,6 +232,7 @@ TEST_F(CacheTest, Erase) { } TEST_F(CacheTest, EntriesArePinned) { + init_size_cache(); Insert(100, 101, 1); std::string result1; Cache::Handle* h1 = 
cache()->lookup(EncodeKey(&result1, 100)); @@ -219,6 +260,7 @@ TEST_F(CacheTest, EntriesArePinned) { } TEST_F(CacheTest, EvictionPolicy) { + init_size_cache(); Insert(100, 101, 1); Insert(200, 201, 1); @@ -234,6 +276,7 @@ TEST_F(CacheTest, EvictionPolicy) { } TEST_F(CacheTest, EvictionPolicyWithDurable) { + init_size_cache(); Insert(100, 101, 1); InsertDurable(200, 201, 1); Insert(300, 101, 1); @@ -250,20 +293,6 @@ TEST_F(CacheTest, EvictionPolicyWithDurable) { EXPECT_EQ(201, Lookup(200)); } -static void insert_LRUCache(LRUCache& cache, const CacheKey& key, int value, - CachePriority priority) { - uint32_t hash = key.hash(key.data(), key.size(), 0); - auto* cache_value = new CacheTest::CacheValue(EncodeValue(value)); - cache.release(cache.insert(key, hash, cache_value, value, priority)); -} - -static void insert_number_LRUCache(LRUCache& cache, const CacheKey& key, int value, int charge, - CachePriority priority) { - uint32_t hash = key.hash(key.data(), key.size(), 0); - auto* cache_value = new CacheTest::CacheValue(EncodeValue(value)); - cache.release(cache.insert(key, hash, cache_value, charge, priority)); -} - TEST_F(CacheTest, Usage) { LRUCache cache(LRUCacheType::SIZE); cache.set_capacity(1040); @@ -463,6 +492,7 @@ TEST_F(CacheTest, Number) { } TEST_F(CacheTest, HeavyEntries) { + init_size_cache(); // Add a bunch of light and heavy entries and then count the combined // size of items still in the cache, which must be approximately the // same as the total capacity. @@ -494,12 +524,14 @@ TEST_F(CacheTest, HeavyEntries) { } TEST_F(CacheTest, NewId) { + init_size_cache(); uint64_t a = cache()->new_id(); uint64_t b = cache()->new_id(); EXPECT_NE(a, b); } TEST_F(CacheTest, SimpleBenchmark) { + init_size_cache(); for (int i = 0; i < kCacheSize * LOOP_LESS_OR_MORE(10, 10000); i++) { Insert(1000 + i, 2000 + i, 1); EXPECT_EQ(2000 + i, Lookup(1000 + i)); @@ -598,4 +630,78 @@ TEST(CacheHandleTest, HandleTableTest) { } } +TEST_F(CacheTest, SetCapacity) { + init_number_cache(); + for (int i = 0; i < kCacheSize; i++) { + Insert(i, 1000 + i, 1); + EXPECT_EQ(1000 + i, Lookup(i)); + } + ASSERT_EQ(kCacheSize, cache()->get_capacity()); + ASSERT_EQ(kCacheSize, cache()->get_usage()); + + int64_t prune_num = cache()->adjust_capacity_weighted(2); + ASSERT_EQ(prune_num, 0); + ASSERT_EQ(kCacheSize * 2, cache()->get_capacity()); + ASSERT_EQ(kCacheSize, cache()->get_usage()); + + prune_num = cache()->adjust_capacity_weighted(0.5); + ASSERT_EQ(prune_num, kCacheSize / 2); + ASSERT_EQ(kCacheSize / 2, cache()->get_capacity()); + ASSERT_EQ(kCacheSize / 2, cache()->get_usage()); + + std::vector handles(kCacheSize, nullptr); + for (int i = 0; i < kCacheSize; i++) { + std::string result; + CacheKey cache_key = EncodeKey(&result, kCacheSize + i); + auto* cache_value = new CacheValueWithKey(DecodeKey(cache_key), EncodeValue(i)); + handles[i] = cache()->insert(cache_key, cache_value, 1, 1); + } + ASSERT_EQ(kCacheSize / 2, cache()->get_capacity()); + ASSERT_EQ(kCacheSize, + cache()->get_usage()); // Handle not be released, so key cannot be evicted. + + for (int i = 0; i < kCacheSize; i++) { + Insert(i + kCacheSize, 2000 + i, 1); + EXPECT_EQ(-1, Lookup(i + kCacheSize)); // Cache is full, insert failed. 
+ } + ASSERT_EQ(kCacheSize / 2, cache()->get_capacity()); + ASSERT_EQ(kCacheSize, cache()->get_usage()); + + cache()->adjust_capacity_weighted(2); + ASSERT_EQ(kCacheSize * 2, cache()->get_capacity()); + ASSERT_EQ(kCacheSize, cache()->get_usage()); + + for (int i = 0; i < kCacheSize; i++) { + Insert(i, 3000 + i, 1); + EXPECT_EQ(3000 + i, Lookup(i)); + } + ASSERT_EQ(kCacheSize * 2, cache()->get_capacity()); + ASSERT_EQ(kCacheSize * 2, cache()->get_usage()); + + cache()->adjust_capacity_weighted(0); + ASSERT_EQ(0, cache()->get_capacity()); + ASSERT_EQ(kCacheSize, cache()->get_usage()); + + for (auto it : handles) { + cache()->release(it); + } + ASSERT_EQ(0, cache()->get_capacity()); + ASSERT_EQ(0, cache()->get_usage()); + + cache()->adjust_capacity_weighted(1); + ASSERT_EQ(kCacheSize, cache()->get_capacity()); + ASSERT_EQ(0, cache()->get_usage()); + + cache()->adjust_capacity_weighted(0); + ASSERT_EQ(0, cache()->get_capacity()); + ASSERT_EQ(0, cache()->get_usage()); + + for (int i = 0; i < kCacheSize; i++) { + Insert(i, 4000 + i, 1); + EXPECT_EQ(-1, Lookup(i)); + } + ASSERT_EQ(0, cache()->get_capacity()); + ASSERT_EQ(0, cache()->get_usage()); +} + } // namespace doris From 5dd08b6b4603ff8ef3c8a76d6a160fbce79fc953 Mon Sep 17 00:00:00 2001 From: morrySnow <101034200+morrySnow@users.noreply.github.com> Date: Mon, 9 Sep 2024 12:19:56 +0800 Subject: [PATCH 03/44] [opt](Nereids) fix several insert into related issues (#40467) - http_stream TVF should always generate one fragment plan - http_stream TVF plan should not check the root as a scan node - distinguish group_commit TVF from normal insert statements - index and generated slots should be based on the type-cast base slot - agg_state can be cast from nullable to non-nullable - colocated and bucket scan range computation should only happen on scan nodes --- .../java/org/apache/doris/catalog/Type.java | 4 - .../nereids/parser/LogicalPlanBuilder.java | 2 +- .../nereids/rules/analysis/BindSink.java | 81 ++++++++----- .../functions/table/HttpStream.java | 11 ++ .../plans/commands/info/DMLCommandType.java | 2 + .../java/org/apache/doris/qe/Coordinator.java | 5 +- .../org/apache/doris/qe/StmtExecutor.java | 17 ++- .../insert_into_table/insert_use_table_id.out | 48 -------- .../agg_state/max/test_agg_state_max.groovy | 2 +- .../insert_group_commit_into_unique.groovy | 9 +- .../insert_use_table_id.groovy | 107 ------------------ 11 files changed, 90 insertions(+), 198 deletions(-) delete mode 100644 regression-test/data/nereids_p0/insert_into_table/insert_use_table_id.out delete mode 100644 regression-test/suites/nereids_p0/insert_into_table/insert_use_table_id.groovy diff --git a/fe/fe-common/src/main/java/org/apache/doris/catalog/Type.java b/fe/fe-common/src/main/java/org/apache/doris/catalog/Type.java index 1dcd062261b2b8..7a8dda5aabedef 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/catalog/Type.java +++ b/fe/fe-common/src/main/java/org/apache/doris/catalog/Type.java @@ -866,10 +866,6 @@ public static boolean canCastTo(Type sourceType, Type targetType) { return false; } for (int i = 0; i < sourceAggState.getSubTypes().size(); i++) { - // target subtype is not null but source subtype is nullable - if (!targetAggState.getSubTypeNullables().get(i) && sourceAggState.getSubTypeNullables().get(i)) { - return false; - } if (!canCastTo(sourceAggState.getSubTypes().get(i), targetAggState.getSubTypes().get(i))) { return false; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java index 324ab808226930..67ef1ca48c2f02 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java @@ -593,7 +593,7 @@ public LogicalPlan visitInsertTable(InsertTableContext ctx) { isAutoDetect, isOverwrite, ConnectContext.get().getSessionVariable().isEnableUniqueKeyPartialUpdate(), - DMLCommandType.INSERT, + ctx.tableId == null ? DMLCommandType.INSERT : DMLCommandType.GROUP_COMMIT, plan); Optional cte = Optional.empty(); if (ctx.cte() != null) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/BindSink.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/BindSink.java index c9e7f07f5d08e3..6d8ad94242b53c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/BindSink.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/BindSink.java @@ -68,6 +68,7 @@ import org.apache.doris.nereids.types.DataType; import org.apache.doris.nereids.types.StringType; import org.apache.doris.nereids.types.coercion.CharacterType; +import org.apache.doris.nereids.util.ExpressionUtils; import org.apache.doris.nereids.util.RelationUtil; import org.apache.doris.nereids.util.TypeCoercionUtils; @@ -126,7 +127,8 @@ private Plan bindOlapTableSink(MatchingContext> ctx) { && table.getSequenceMapCol() != null && sink.getColNames().contains(table.getSequenceMapCol()); Pair, Integer> bindColumnsResult = - bindTargetColumns(table, sink.getColNames(), childHasSeqCol, needExtraSeqCol); + bindTargetColumns(table, sink.getColNames(), childHasSeqCol, needExtraSeqCol, + sink.getDMLCommandType() == DMLCommandType.GROUP_COMMIT); List bindColumns = bindColumnsResult.first; int extraColumnsNum = bindColumnsResult.second; @@ -176,8 +178,12 @@ private Plan bindOlapTableSink(MatchingContext> ctx) { .filter(col -> col.getName().equalsIgnoreCase(table.getSequenceMapCol())) .findFirst(); } else { - if (!sink.getColNames().isEmpty()) { - if (sink.getColNames().stream() + // ATTN: must use bindColumns here. Because of insert into from group_commit tvf submitted by BE + // do not follow any column list with target table, but it contains all inviable data in sink's + // child. THis is different with other insert action that contain non-inviable data by default. + if (!bindColumns.isEmpty()) { + if (bindColumns.stream() + .map(Column::getName) .anyMatch(c -> c.equalsIgnoreCase(Column.SEQUENCE_COL))) { haveInputSeqCol = true; // case2.a } // else case2.b @@ -205,7 +211,8 @@ private Plan bindOlapTableSink(MatchingContext> ctx) { Map columnToOutput = getColumnToOutput( ctx, table, isPartialUpdate, boundSink, child); - LogicalProject fullOutputProject = getOutputProjectByCoercion(table.getFullSchema(), child, columnToOutput); + LogicalProject fullOutputProject = getOutputProjectByCoercion( + table.getFullSchema(), child, columnToOutput); return boundSink.withChildAndUpdateOutput(fullOutputProject); } @@ -267,15 +274,14 @@ private static Map getColumnToOutput( // we need to insert all the columns of the target table // although some columns are not mentions. // so we add a projects to supply the default value. 
- Map columnToChildOutput = Maps.newHashMap(); for (int i = 0; i < child.getOutput().size(); ++i) { columnToChildOutput.put(boundSink.getCols().get(i), child.getOutput().get(i)); } - Map columnToOutput = Maps.newTreeMap(String.CASE_INSENSITIVE_ORDER); + Map columnToReplaced = Maps.newTreeMap(String.CASE_INSENSITIVE_ORDER); + Map replaceMap = Maps.newHashMap(); NereidsParser expressionParser = new NereidsParser(); - List generatedColumns = Lists.newArrayList(); List materializedViewColumn = Lists.newArrayList(); // generate slots not mentioned in sql, mv slots and shaded slots. @@ -291,7 +297,12 @@ private static Map getColumnToOutput( // do not process explicitly use DEFAULT value here: // insert into table t values(DEFAULT) && !(columnToChildOutput.get(column) instanceof DefaultValueSlot)) { - columnToOutput.put(column.getName(), columnToChildOutput.get(column)); + Alias output = new Alias(TypeCoercionUtils.castIfNotSameType( + columnToChildOutput.get(column), DataType.fromCatalogType(column.getType())), + column.getName()); + columnToOutput.put(column.getName(), output); + columnToReplaced.put(column.getName(), output.toSlot()); + replaceMap.put(output.toSlot(), output.child()); } else { if (table instanceof OlapTable && ((OlapTable) table).hasSequenceCol() && column.getName().equals(Column.SEQUENCE_COL) @@ -312,6 +323,8 @@ private static Map getColumnToOutput( seqColumn = new Alias(seqColumn, column.getName()); } columnToOutput.put(column.getName(), seqColumn); + columnToReplaced.put(column.getName(), seqColumn.toSlot()); + replaceMap.put(seqColumn.toSlot(), seqColumn.child(0)); } } else if (isPartialUpdate) { // If the current load is a partial update, the values of unmentioned @@ -328,9 +341,12 @@ private static Map getColumnToOutput( Expression defualtValueExpression = ExpressionAnalyzer.analyzeFunction( boundSink, ctx.cascadesContext, unboundFunctionDefaultValue ); - columnToOutput.put(column.getName(), - new Alias(defualtValueExpression, column.getName()) - ); + Alias output = new Alias(TypeCoercionUtils.castIfNotSameType( + defualtValueExpression, DataType.fromCatalogType(column.getType())), + column.getName()); + columnToOutput.put(column.getName(), output); + columnToReplaced.put(column.getName(), output.toSlot()); + replaceMap.put(output.toSlot(), output.child()); } else { continue; } @@ -343,10 +359,11 @@ private static Map getColumnToOutput( } // Otherwise, the unmentioned columns should be filled with default values // or null values - columnToOutput.put(column.getName(), new Alias( - new NullLiteral(DataType.fromCatalogType(column.getType())), - column.getName() - )); + Alias output = new Alias(new NullLiteral(DataType.fromCatalogType(column.getType())), + column.getName()); + columnToOutput.put(column.getName(), output); + columnToReplaced.put(column.getName(), output.toSlot()); + replaceMap.put(output.toSlot(), output.child()); } else { try { // it comes from the original planner, if default value expression is @@ -365,8 +382,12 @@ private static Map getColumnToOutput( if (defualtValueExpression instanceof Alias) { defualtValueExpression = ((Alias) defualtValueExpression).child(); } - columnToOutput.put(column.getName(), - new Alias(defualtValueExpression, column.getName())); + Alias output = new Alias((TypeCoercionUtils.castIfNotSameType( + defualtValueExpression, DataType.fromCatalogType(column.getType()))), + column.getName()); + columnToOutput.put(column.getName(), output); + columnToReplaced.put(column.getName(), output.toSlot()); + replaceMap.put(output.toSlot(), 
output.child()); } } catch (Exception e) { throw new AnalysisException(e.getMessage(), e.getCause()); @@ -380,13 +401,16 @@ private static Map getColumnToOutput( for (Column column : generatedColumns) { GeneratedColumnInfo info = column.getGeneratedColumnInfo(); Expression parsedExpression = new NereidsParser().parseExpression(info.getExpr().toSqlWithoutTbl()); - Expression boundExpression = new CustomExpressionAnalyzer(boundSink, ctx.cascadesContext, columnToOutput) + Expression boundExpression = new CustomExpressionAnalyzer(boundSink, ctx.cascadesContext, columnToReplaced) .analyze(parsedExpression); if (boundExpression instanceof Alias) { boundExpression = ((Alias) boundExpression).child(); } - NamedExpression slot = new Alias(boundExpression, info.getExprSql()); - columnToOutput.put(column.getName(), slot); + boundExpression = ExpressionUtils.replace(boundExpression, replaceMap); + Alias output = new Alias(boundExpression, info.getExprSql()); + columnToOutput.put(column.getName(), output); + columnToReplaced.put(column.getName(), output.toSlot()); + replaceMap.put(output.toSlot(), output.child()); } for (Column column : materializedViewColumn) { if (column.isMaterializedViewColumn()) { @@ -400,12 +424,15 @@ private static Map getColumnToOutput( // may not be bound, we have to bind it again. // for example: to_bitmap. Expression boundExpression = new CustomExpressionAnalyzer( - boundSink, ctx.cascadesContext, columnToOutput).analyze(parsedExpression); + boundSink, ctx.cascadesContext, columnToReplaced).analyze(parsedExpression); if (boundExpression instanceof Alias) { boundExpression = ((Alias) boundExpression).child(); } - NamedExpression slot = new Alias(boundExpression, column.getDefineExpr().toSqlWithoutTbl()); - columnToOutput.put(column.getName(), slot); + boundExpression = ExpressionUtils.replace(boundExpression, replaceMap); + boundExpression = TypeCoercionUtils.castIfNotSameType(boundExpression, + DataType.fromCatalogType(column.getType())); + Alias output = new Alias(boundExpression, column.getDefineExpr().toSqlWithoutTbl()); + columnToOutput.put(column.getName(), output); } } return columnToOutput; @@ -554,12 +581,14 @@ private List bindPartitionIds(OlapTable table, List partitions, bo } private Pair, Integer> bindTargetColumns(OlapTable table, List colsName, - boolean childHasSeqCol, boolean needExtraSeqCol) { + boolean childHasSeqCol, boolean needExtraSeqCol, boolean isGroupCommit) { // if the table set sequence column in stream load phase, the sequence map column is null, we query it. if (colsName.isEmpty()) { + // ATTN: group commit without column list should return all base index column + // because it already prepares data for these columns. return Pair.of(table.getBaseSchema(true).stream() - .filter(c -> validColumn(c, childHasSeqCol)) - .collect(ImmutableList.toImmutableList()), 0); + .filter(c -> isGroupCommit || validColumn(c, childHasSeqCol)) + .collect(ImmutableList.toImmutableList()), 0); } else { int extraColumnsNum = (needExtraSeqCol ? 
1 : 0); List processedColsName = Lists.newArrayList(colsName); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/HttpStream.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/HttpStream.java index de052b078db43e..8e35e25240e6ca 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/HttpStream.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/HttpStream.java @@ -19,12 +19,17 @@ import org.apache.doris.catalog.FunctionSignature; import org.apache.doris.nereids.exceptions.AnalysisException; +import org.apache.doris.nereids.properties.DistributionSpecHash; +import org.apache.doris.nereids.properties.DistributionSpecHash.ShuffleType; +import org.apache.doris.nereids.properties.PhysicalProperties; import org.apache.doris.nereids.trees.expressions.Properties; import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; import org.apache.doris.nereids.types.coercion.AnyDataType; import org.apache.doris.tablefunction.HttpStreamTableValuedFunction; import org.apache.doris.tablefunction.TableValuedFunctionIf; +import com.google.common.collect.ImmutableList; + import java.util.Map; /** http_stream */ @@ -49,6 +54,12 @@ protected TableValuedFunctionIf toCatalogFunction() { } } + @Override + public PhysicalProperties getPhysicalProperties() { + return PhysicalProperties.createHash(new DistributionSpecHash(ImmutableList.of(), + ShuffleType.EXECUTION_BUCKETED)); + } + @Override public R accept(ExpressionVisitor visitor, C context) { return visitor.visitHttpStream(this, context); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/DMLCommandType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/DMLCommandType.java index 18d8179abe4d0e..aa97f26df18c58 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/DMLCommandType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/DMLCommandType.java @@ -27,6 +27,8 @@ public enum DMLCommandType { NONE, // for INSERT INTO or INSERT INTO SELECT INSERT, + // for group_commit tvf + GROUP_COMMIT, // for UPDATE UPDATE, // for DELETE diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java b/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java index 0fff8e9f23c09f..5e3a59d9a54d96 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java @@ -2166,9 +2166,10 @@ protected void computeScanRangeAssignment() throws Exception { FragmentScanRangeAssignment assignment = fragmentExecParamsMap.get(scanNode.getFragmentId()).scanRangeAssignment; boolean fragmentContainsColocateJoin = isColocateFragment(scanNode.getFragment(), - scanNode.getFragment().getPlanRoot()); + scanNode.getFragment().getPlanRoot()) && (scanNode instanceof OlapScanNode); boolean fragmentContainsBucketShuffleJoin = bucketShuffleJoinController - .isBucketShuffleJoin(scanNode.getFragmentId().asInt(), scanNode.getFragment().getPlanRoot()); + .isBucketShuffleJoin(scanNode.getFragmentId().asInt(), scanNode.getFragment().getPlanRoot()) + && (scanNode instanceof OlapScanNode); // A fragment may contain both colocate join and bucket shuffle join // on need both compute scanRange to init basic data for query coordinator diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java b/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java index 71614770afb70f..6aa2d6d1e0bf7b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java @@ -3440,8 +3440,16 @@ private HttpStreamParams generateHttpStreamNereidsPlan(TUniqueId queryId) { httpStreamParams.setLabel(insertExecutor.getLabelName()); PlanNode planRoot = planner.getFragments().get(0).getPlanRoot(); - Preconditions.checkState(planRoot instanceof TVFScanNode || planRoot instanceof GroupCommitScanNode, - "Nereids' planNode cannot be converted to " + planRoot.getClass().getName()); + boolean isValidPlan = !planner.getScanNodes().isEmpty(); + for (ScanNode scanNode : planner.getScanNodes()) { + if (!(scanNode instanceof TVFScanNode || planRoot instanceof GroupCommitScanNode)) { + isValidPlan = false; + break; + } + } + if (!isValidPlan) { + throw new AnalysisException("plan is invalid: " + planRoot.getExplainString()); + } } catch (QueryStateException e) { LOG.debug("Command(" + originStmt.originStmt + ") process failed.", e); context.setState(e.getQueryState()); @@ -3512,11 +3520,8 @@ public HttpStreamParams generateHttpStreamPlan(TUniqueId queryId) throws Excepti LOG.warn("Analyze failed. {}", context.getQueryIdentifier(), e); throw ((NereidsException) e).getException(); } - boolean isInsertIntoCommand = parsedStmt != null && parsedStmt instanceof LogicalPlanAdapter - && ((LogicalPlanAdapter) parsedStmt).getLogicalPlan() instanceof InsertIntoTableCommand; if (e instanceof NereidsException - && !context.getSessionVariable().enableFallbackToOriginalPlanner - && !isInsertIntoCommand) { + && !context.getSessionVariable().enableFallbackToOriginalPlanner) { LOG.warn("Analyze failed. {}", context.getQueryIdentifier(), e); throw ((NereidsException) e).getException(); } diff --git a/regression-test/data/nereids_p0/insert_into_table/insert_use_table_id.out b/regression-test/data/nereids_p0/insert_into_table/insert_use_table_id.out deleted file mode 100644 index d0020443bf67df..00000000000000 --- a/regression-test/data/nereids_p0/insert_into_table/insert_use_table_id.out +++ /dev/null @@ -1,48 +0,0 @@ --- This file is automatically generated. 
You should know what you did if you want to edit this --- !sql_cross_join -- -1 10 1 1 1.0 2000-01-01 1 10 10 10.0 2000-01-10 1 -1 10 1 1 1.0 2000-01-01 1 10 10 10.0 2000-01-10 4 -1 10 1 1 1.0 2000-01-01 1 10 10 10.0 2000-01-10 5 -1 10 1 1 1.0 2000-01-01 2 20 20 20.0 2000-01-20 1 -1 10 1 1 1.0 2000-01-01 2 20 20 20.0 2000-01-20 4 -1 10 1 1 1.0 2000-01-01 2 20 20 20.0 2000-01-20 5 -1 10 1 1 1.0 2000-01-01 3 30 30 30.0 2000-01-30 1 -1 10 1 1 1.0 2000-01-01 3 30 30 30.0 2000-01-30 4 -1 10 1 1 1.0 2000-01-01 3 30 30 30.0 2000-01-30 5 -1 10 1 1 1.0 2000-01-01 4 4 4 4.0 2000-01-04 1 -1 10 1 1 1.0 2000-01-01 4 4 4 4.0 2000-01-04 4 -1 10 1 1 1.0 2000-01-01 4 4 4 4.0 2000-01-04 5 -1 10 1 1 1.0 2000-01-01 5 5 5 5.0 2000-01-05 1 -1 10 1 1 1.0 2000-01-01 5 5 5 5.0 2000-01-05 4 -1 10 1 1 1.0 2000-01-01 5 5 5 5.0 2000-01-05 5 -2 20 2 2 2.0 2000-01-02 1 10 10 10.0 2000-01-10 1 -2 20 2 2 2.0 2000-01-02 1 10 10 10.0 2000-01-10 4 -2 20 2 2 2.0 2000-01-02 1 10 10 10.0 2000-01-10 5 -2 20 2 2 2.0 2000-01-02 2 20 20 20.0 2000-01-20 1 -2 20 2 2 2.0 2000-01-02 2 20 20 20.0 2000-01-20 4 -2 20 2 2 2.0 2000-01-02 2 20 20 20.0 2000-01-20 5 -2 20 2 2 2.0 2000-01-02 3 30 30 30.0 2000-01-30 1 -2 20 2 2 2.0 2000-01-02 3 30 30 30.0 2000-01-30 4 -2 20 2 2 2.0 2000-01-02 3 30 30 30.0 2000-01-30 5 -2 20 2 2 2.0 2000-01-02 4 4 4 4.0 2000-01-04 1 -2 20 2 2 2.0 2000-01-02 4 4 4 4.0 2000-01-04 4 -2 20 2 2 2.0 2000-01-02 4 4 4 4.0 2000-01-04 5 -2 20 2 2 2.0 2000-01-02 5 5 5 5.0 2000-01-05 1 -2 20 2 2 2.0 2000-01-02 5 5 5 5.0 2000-01-05 4 -2 20 2 2 2.0 2000-01-02 5 5 5 5.0 2000-01-05 5 -3 30 3 3 3.0 2000-01-03 1 10 10 10.0 2000-01-10 1 -3 30 3 3 3.0 2000-01-03 1 10 10 10.0 2000-01-10 4 -3 30 3 3 3.0 2000-01-03 1 10 10 10.0 2000-01-10 5 -3 30 3 3 3.0 2000-01-03 2 20 20 20.0 2000-01-20 1 -3 30 3 3 3.0 2000-01-03 2 20 20 20.0 2000-01-20 4 -3 30 3 3 3.0 2000-01-03 2 20 20 20.0 2000-01-20 5 -3 30 3 3 3.0 2000-01-03 3 30 30 30.0 2000-01-30 1 -3 30 3 3 3.0 2000-01-03 3 30 30 30.0 2000-01-30 4 -3 30 3 3 3.0 2000-01-03 3 30 30 30.0 2000-01-30 5 -3 30 3 3 3.0 2000-01-03 4 4 4 4.0 2000-01-04 1 -3 30 3 3 3.0 2000-01-03 4 4 4 4.0 2000-01-04 4 -3 30 3 3 3.0 2000-01-03 4 4 4 4.0 2000-01-04 5 -3 30 3 3 3.0 2000-01-03 5 5 5 5.0 2000-01-05 1 -3 30 3 3 3.0 2000-01-03 5 5 5 5.0 2000-01-05 4 -3 30 3 3 3.0 2000-01-03 5 5 5 5.0 2000-01-05 5 - diff --git a/regression-test/suites/datatype_p0/agg_state/max/test_agg_state_max.groovy b/regression-test/suites/datatype_p0/agg_state/max/test_agg_state_max.groovy index 983f51beed1f3c..a71da554afb5df 100644 --- a/regression-test/suites/datatype_p0/agg_state/max/test_agg_state_max.groovy +++ b/regression-test/suites/datatype_p0/agg_state/max/test_agg_state_max.groovy @@ -30,7 +30,7 @@ suite("test_agg_state_max") { test { sql "insert into a_table values(100,max_state(null));" - exception "can not cast from origin type agg_state" + exception "illegal for non_nullable" } sql """insert into a_table diff --git a/regression-test/suites/insert_p0/insert_group_commit_into_unique.groovy b/regression-test/suites/insert_p0/insert_group_commit_into_unique.groovy index ca280cd17d83eb..8ae0d41565d488 100644 --- a/regression-test/suites/insert_p0/insert_group_commit_into_unique.groovy +++ b/regression-test/suites/insert_p0/insert_group_commit_into_unique.groovy @@ -86,7 +86,8 @@ suite("insert_group_commit_into_unique") { UNIQUE KEY(`id`, `name`) DISTRIBUTED BY HASH(`id`) BUCKETS 1 PROPERTIES ( - "replication_num" = "1" + "replication_num" = "1", + "group_commit_interval_ms" = "100" ); """ @@ -171,7 +172,8 @@ 
suite("insert_group_commit_into_unique") { DISTRIBUTED BY HASH(`id`) BUCKETS 1 PROPERTIES ( "replication_num" = "1", - "function_column.sequence_col" = "score" + "function_column.sequence_col" = "score", + "group_commit_interval_ms" = "100" ); """ @@ -257,7 +259,8 @@ suite("insert_group_commit_into_unique") { DISTRIBUTED BY HASH(`id`) BUCKETS 1 PROPERTIES ( "replication_num" = "1", - "function_column.sequence_type" = "int" + "function_column.sequence_type" = "int", + "group_commit_interval_ms" = "100" ); """ diff --git a/regression-test/suites/nereids_p0/insert_into_table/insert_use_table_id.groovy b/regression-test/suites/nereids_p0/insert_into_table/insert_use_table_id.groovy deleted file mode 100644 index 930fe35b60ed65..00000000000000 --- a/regression-test/suites/nereids_p0/insert_into_table/insert_use_table_id.groovy +++ /dev/null @@ -1,107 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - - - - -suite('nereids_insert_use_table_id') { - sql 'set enable_nereids_planner=true' - sql 'set enable_fallback_to_original_planner=false' - sql 'set enable_nereids_dml=true' - sql 'set enable_strict_consistency_dml=true' - - // sql 'CREATE DATABASE IF NOT EXISTS dnereids_insert_use_table_id_test' - // sql 'use nereids_insert_use_table_id_test' - - def t1 = 'table_id_value_t1' - def t2 = 'table_id_value_t2' - def t3 = 'table_id_value_t3' - - sql "drop table if exists ${t1}" - sql "drop table if exists ${t2}" - sql "drop table if exists ${t3}" - - sql """ - create table ${t1} ( - id int, - id1 int, - c1 bigint, - c2 string, - c3 double, - c4 date - ) unique key (id, id1) - distributed by hash(id, id1) buckets 13 - properties( - 'replication_num'='1', - "function_column.sequence_col" = "c4" - ); - """ - - sql """ - create table ${t2} ( - id int, - c1 bigint, - c2 string, - c3 double, - c4 date - ) unique key (id) - distributed by hash(id) buckets 13 - properties( - 'replication_num'='1' - ); - """ - - sql """ - create table ${t3} ( - id int - ) distributed by hash(id) buckets 13 - properties( - 'replication_num'='1' - ); - """ - - - sql """ - INSERT INTO DORIS_INTERNAL_TABLE_ID(${getTableId(t1)}) VALUES - (1, (1 + 9) * (10 - 9), 1, '1', 1.0, '2000-01-01'), - (2, 20, 2, '2', 2.0, days_add('2000-01-01', 1)), - (3, 30, 3, '3', 3.0, makedate(2000, 3)); - """ - - sql """ - INSERT INTO DORIS_INTERNAL_TABLE_ID(${getTableId(t2)}) VALUES - (1, 10, '10', 10.0, '2000-01-10'), - (2, 20, '20', 20.0, '2000-01-20'), - (3, 30, '30', 30.0, '2000-01-30'), - (4, 4, '4', 4.0, '2000-01-04'), - (5, 5, '5', 5.0, '2000-01-05'); - """ - - sql """ - INSERT INTO DORIS_INTERNAL_TABLE_ID(${getTableId(t3)}) VALUES - (1), - (4), - (5); - """ - - sql "sync" - qt_sql_cross_join "select * from ${t1}, ${t2}, ${t3} order by ${t1}.id, ${t1}.id1, ${t2}.id, ${t3}.id" - - -} - - From 
cd35604271968612798b483a92a44ebc2ec0c717 Mon Sep 17 00:00:00 2001 From: minghong Date: Mon, 9 Sep 2024 15:18:09 +0800 Subject: [PATCH 04/44] [feat](nereids)set actual row count in physical plan according to merged profile (#40361) ## Proposed changes The physical plan is already printed in the profile; however, it is hard to compare an operator's estimated row count with its actual row count. In this PR, we read the actual row count from the merged profile and set it on the corresponding physical node in the physical plan. Here is an example: "PhysicalHashJoin[13890]@115 ( stats=17,964.27 actualRows=20499, type=INNER_JOIN, hashCondition=[(l_suppkey#19 = s_suppkey#33)]" The estimated row count is 17,964, while the actual row count is 20,499. Issue Number: close #xxx --- .../common/profile/ExecutionProfile.java | 1 + .../apache/doris/common/profile/Profile.java | 78 +++++++++++++++---- .../doris/common/profile/SummaryProfile.java | 2 - .../doris/common/util/RuntimeProfile.java | 15 ++++ .../nereids/trees/plans/AbstractPlan.java | 4 + .../plans/physical/AbstractPhysicalJoin.java | 3 +- .../plans/physical/PhysicalCTEProducer.java | 1 + .../plans/physical/PhysicalHashAggregate.java | 2 +- .../plans/physical/PhysicalQuickSort.java | 4 +- .../trees/plans/physical/PhysicalTopN.java | 1 + .../trees/plans/physical/PhysicalUnion.java | 4 +- .../trees/plans/physical/PhysicalWindow.java | 3 +- .../org/apache/doris/qe/StmtExecutor.java | 4 + .../apache/doris/statistics/Statistics.java | 35 ++++++--- 14 files changed, 121 insertions(+), 36 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/profile/ExecutionProfile.java b/fe/fe-core/src/main/java/org/apache/doris/common/profile/ExecutionProfile.java index d3d6826174f91d..7828a38e6eb242 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/profile/ExecutionProfile.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/profile/ExecutionProfile.java @@ -182,6 +182,7 @@ private RuntimeProfile getPipelineAggregatedProfile(Map planNod } newFragmentProfile.addChild(mergedpipelineProfile); pipelineIdx++; + fragmentsProfile.rowsProducedMap.putAll(mergedpipelineProfile.rowsProducedMap); } } return fragmentsProfile; diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/profile/Profile.java b/fe/fe-core/src/main/java/org/apache/doris/common/profile/Profile.java index 76414677d0a05b..88fd317879451e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/profile/Profile.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/profile/Profile.java @@ -22,8 +22,11 @@ import org.apache.doris.common.util.ProfileManager; import org.apache.doris.common.util.RuntimeProfile; import org.apache.doris.nereids.NereidsPlanner; +import org.apache.doris.nereids.trees.plans.AbstractPlan; +import org.apache.doris.nereids.trees.plans.Plan; import org.apache.doris.nereids.trees.plans.distribute.DistributedPlan; import org.apache.doris.nereids.trees.plans.distribute.FragmentIdMapping; +import org.apache.doris.nereids.trees.plans.physical.PhysicalPlan; import org.apache.doris.nereids.trees.plans.physical.PhysicalRelation; import org.apache.doris.planner.Planner; @@ -45,6 +48,8 @@ import java.io.FileOutputStream; import java.io.IOException; import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.zip.Deflater; @@ -107,6 +112,10 @@ public class Profile { // Profile size is the size of profile file private long profileSize = 0; + private PhysicalPlan physicalPlan; + public Map 
rowsProducedMap = new HashMap<>(); + private List physicalRelations = new ArrayList<>(); + // Need default constructor for read from storage public Profile() {} @@ -273,20 +282,8 @@ public synchronized void updateSummary(Map summaryInfo, boolean if (planner instanceof NereidsPlanner) { NereidsPlanner nereidsPlanner = ((NereidsPlanner) planner); - StringBuilder builder = new StringBuilder(); - builder.append("\n"); - builder.append(nereidsPlanner.getPhysicalPlan() - .treeString()); - builder.append("\n"); - for (PhysicalRelation relation : nereidsPlanner.getPhysicalRelations()) { - if (relation.getStats() != null) { - builder.append(relation).append("\n") - .append(relation.getStats().printColumnStats()); - } - } - summaryInfo.put(SummaryProfile.PHYSICAL_PLAN, - builder.toString().replace("\n", "\n ")); - + physicalPlan = nereidsPlanner.getPhysicalPlan(); + physicalRelations.addAll(nereidsPlanner.getPhysicalRelations()); FragmentIdMapping distributedPlans = nereidsPlanner.getDistributedPlans(); if (distributedPlans != null) { summaryInfo.put(SummaryProfile.DISTRIBUTED_PLAN, @@ -414,15 +411,43 @@ public void getExecutionProfileContent(StringBuilder builder) { // Only generate merged profile for select, insert into select. // Not support broker load now. + RuntimeProfile mergedProfile = null; if (this.profileLevel == MergedProfileLevel && this.executionProfiles.size() == 1) { try { - builder.append("\n MergedProfile \n"); - this.executionProfiles.get(0).getAggregatedFragmentsProfile(planNodeMap).prettyPrint(builder, " "); + mergedProfile = this.executionProfiles.get(0).getAggregatedFragmentsProfile(planNodeMap); + this.rowsProducedMap.putAll(mergedProfile.rowsProducedMap); + if (physicalPlan != null) { + updateActualRowCountOnPhysicalPlan(physicalPlan); + } } catch (Throwable aggProfileException) { LOG.warn("build merged simple profile {} failed", this.id, aggProfileException); + } + } + + if (physicalPlan != null) { + builder.append("\nPhysical Plan \n"); + StringBuilder physcialPlanBuilder = new StringBuilder(); + physcialPlanBuilder.append(physicalPlan.treeString()); + physcialPlanBuilder.append("\n"); + for (PhysicalRelation relation : physicalRelations) { + if (relation.getStats() != null) { + physcialPlanBuilder.append(relation).append("\n") + .append(relation.getStats().printColumnStats()); + } + } + builder.append( + physcialPlanBuilder.toString().replace("\n", "\n ")); + } + + if (this.profileLevel == MergedProfileLevel && this.executionProfiles.size() == 1) { + builder.append("\nMergedProfile \n"); + if (mergedProfile != null) { + mergedProfile.prettyPrint(builder, " "); + } else { builder.append("build merged simple profile failed"); } } + try { // For load task, they will have multiple execution_profiles. 
for (ExecutionProfile executionProfile : executionProfiles) { @@ -646,4 +671,25 @@ public boolean shouldBeRemoveFromMemory() { return true; } + + public PhysicalPlan getPhysicalPlan() { + return physicalPlan; + } + + public void setPhysicalPlan(PhysicalPlan physicalPlan) { + this.physicalPlan = physicalPlan; + } + + private void updateActualRowCountOnPhysicalPlan(Plan plan) { + if (plan == null || rowsProducedMap.isEmpty()) { + return; + } + Long actualRowCount = rowsProducedMap.get(String.valueOf(((AbstractPlan) plan).getId())); + if (actualRowCount != null) { + ((AbstractPlan) plan).updateActualRowCount(actualRowCount); + } + for (Plan child : plan.children()) { + updateActualRowCountOnPhysicalPlan(child); + } + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/profile/SummaryProfile.java b/fe/fe-core/src/main/java/org/apache/doris/common/profile/SummaryProfile.java index df4e73be0483ed..20e41b18d6955f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/profile/SummaryProfile.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/profile/SummaryProfile.java @@ -66,7 +66,6 @@ public class SummaryProfile { public static final String PARALLEL_FRAGMENT_EXEC_INSTANCE = "Parallel Fragment Exec Instance Num"; public static final String TRACE_ID = "Trace ID"; public static final String WORKLOAD_GROUP = "Workload Group"; - public static final String PHYSICAL_PLAN = "Physical Plan"; public static final String DISTRIBUTED_PLAN = "Distributed Plan"; public static final String SYSTEM_MESSAGE = "System Message"; public static final String EXECUTED_BY_FRONTEND = "Executed By Frontend"; @@ -129,7 +128,6 @@ public class SummaryProfile { START_TIME, END_TIME, TOTAL_TIME, TASK_STATE, USER, DEFAULT_CATALOG, DEFAULT_DB, SQL_STATEMENT); public static final ImmutableList SUMMARY_KEYS = new ImmutableList.Builder() .addAll(SUMMARY_CAPTIONS) - .add(PHYSICAL_PLAN) .add(DISTRIBUTED_PLAN) .build(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/util/RuntimeProfile.java b/fe/fe-core/src/main/java/org/apache/doris/common/util/RuntimeProfile.java index 60207b49172ba0..3ffc303a6db89d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/util/RuntimeProfile.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/util/RuntimeProfile.java @@ -40,12 +40,15 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Formatter; +import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeSet; import java.util.concurrent.locks.ReentrantReadWriteLock; +import java.util.regex.Matcher; +import java.util.regex.Pattern; /** * It is accessed by two kinds of thread, one is to create this RuntimeProfile @@ -100,6 +103,8 @@ public class RuntimeProfile { @SerializedName(value = "nodeid") private int nodeid = -1; + public Map rowsProducedMap = new HashMap<>(); + public RuntimeProfile() { init(); } @@ -494,6 +499,7 @@ public static void mergeProfiles(List profiles, // RuntimeProfile has at least one counter named TotalTime, should exclude it. 
if (newCreatedMergedChildProfile.counterMap.size() > 1) { simpleProfile.addChildWithCheck(newCreatedMergedChildProfile, planNodeMap); + simpleProfile.rowsProducedMap.putAll(newCreatedMergedChildProfile.rowsProducedMap); } } } @@ -504,6 +510,12 @@ private static void mergeCounters(String parentCounterName, List return; } RuntimeProfile templateProfile = profiles.get(0); + Pattern pattern = Pattern.compile("nereids_id=(\\d+)"); + Matcher matcher = pattern.matcher(templateProfile.getName()); + String nereidsId = null; + if (matcher.find()) { + nereidsId = matcher.group(1); + } Set childCounterSet = templateProfile.childCounterMap.get(parentCounterName); if (childCounterSet == null) { return; @@ -517,6 +529,9 @@ private static void mergeCounters(String parentCounterName, List Counter orgCounter = profile.counterMap.get(childCounterName); aggCounter.addCounter(orgCounter); } + if (nereidsId != null && childCounterName.equals("RowsProduced")) { + simpleProfile.rowsProducedMap.put(nereidsId, aggCounter.sum.getValue()); + } if (simpleProfile.counterMap.containsKey(parentCounterName)) { simpleProfile.addCounter(childCounterName, aggCounter, parentCounterName); } else { diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/AbstractPlan.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/AbstractPlan.java index 9dfca3195d691e..eb65048050fda1 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/AbstractPlan.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/AbstractPlan.java @@ -226,4 +226,8 @@ public List getAncestors() { } return ancestors; } + + public void updateActualRowCount(long actualRowCount) { + statistics.setActualRowCount(actualRowCount); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/AbstractPhysicalJoin.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/AbstractPhysicalJoin.java index 56c18908ad69da..a68da1a5b3d12d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/AbstractPhysicalJoin.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/AbstractPhysicalJoin.java @@ -267,8 +267,9 @@ public Set getConditionSlot() { @Override public String toString() { - List args = Lists.newArrayList("type", joinType, + List args = Lists.newArrayList( "stats", statistics, + "type", joinType, "hashCondition", hashJoinConjuncts, "otherCondition", otherJoinConjuncts, "markCondition", markJoinConjuncts); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalCTEProducer.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalCTEProducer.java index 53ff3e3025742d..568b8e6660ab39 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalCTEProducer.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalCTEProducer.java @@ -89,6 +89,7 @@ public int hashCode() { @Override public String toString() { return Utils.toSqlString("PhysicalCTEProducer[" + id.asInt() + "]", + "stats", statistics, "cteId", cteId); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalHashAggregate.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalHashAggregate.java index fb3087e260869c..c8187727da47f2 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalHashAggregate.java +++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalHashAggregate.java @@ -193,9 +193,9 @@ public String toString() { TopnPushInfo topnPushInfo = (TopnPushInfo) getMutableState( MutableState.KEY_PUSH_TOPN_TO_AGG).orElseGet(() -> null); return Utils.toSqlString("PhysicalHashAggregate[" + id.asInt() + "]" + getGroupIdWithPrefix(), + "stats", statistics, "aggPhase", aggregateParam.aggPhase, "aggMode", aggregateParam.aggMode, - "stats", statistics, "maybeUseStreaming", maybeUsingStream, "groupByExpr", groupByExpressions, "outputExpr", outputExpressions, diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalQuickSort.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalQuickSort.java index c1973668c7d919..0e377b46d238ea 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalQuickSort.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalQuickSort.java @@ -107,8 +107,8 @@ public String shapeInfo() { @Override public String toString() { return Utils.toSqlString("PhysicalQuickSort[" + id.asInt() + "]" + getGroupIdWithPrefix(), - "orderKeys", orderKeys, - "phase", phase.toString(), "stats", statistics + "stats", statistics, "orderKeys", orderKeys, + "phase", phase.toString() ); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalTopN.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalTopN.java index 96dc709bbde8b6..c387a58dd0c993 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalTopN.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalTopN.java @@ -143,6 +143,7 @@ public String shapeInfo() { @Override public String toString() { return Utils.toSqlString("PhysicalTopN[" + id.asInt() + "]" + getGroupIdWithPrefix(), + "stats", statistics, "limit", limit, "offset", offset, "orderKeys", orderKeys, diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalUnion.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalUnion.java index ba20c9267059f1..2a81698812a3c7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalUnion.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalUnion.java @@ -89,11 +89,11 @@ public R accept(PlanVisitor visitor, C context) { @Override public String toString() { return Utils.toSqlString("PhysicalUnion" + "[" + id.asInt() + "]" + getGroupIdWithPrefix(), + "stats", statistics, "qualifier", qualifier, "outputs", outputs, "regularChildrenOutputs", regularChildrenOutputs, - "constantExprsList", constantExprsList, - "stats", statistics); + "constantExprsList", constantExprsList); } @Override diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalWindow.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalWindow.java index b1703f47496706..7e6fd48f02da6d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalWindow.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalWindow.java @@ -105,8 +105,9 @@ public List getExpressions() { @Override public String toString() { return Utils.toSqlString("PhysicalWindow[" + id.asInt() + "]" + getGroupIdWithPrefix(), + "stats", statistics, 
"windowFrameGroup", windowFrameGroup, - "requiredProperties", requireProperties, "stats", statistics + "requiredProperties", requireProperties ); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java b/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java index 6aa2d6d1e0bf7b..6bf09f2229bd8d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java @@ -1211,6 +1211,10 @@ public void updateProfile(boolean isFinished) { // failed, the insert stmt should be success try { profile.updateSummary(getSummaryInfo(isFinished), isFinished, this.planner); + if (planner instanceof NereidsPlanner) { + NereidsPlanner nereidsPlanner = ((NereidsPlanner) planner); + profile.setPhysicalPlan(nereidsPlanner.getPhysicalPlan()); + } } catch (Throwable t) { LOG.warn("failed to update profile, ignore this error", t); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java index 162dab5d13601c..6883eb0b54208a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java @@ -46,6 +46,8 @@ public class Statistics { private double deltaRowCount = 0.0; + private long actualRowCount = -1L; + public Statistics(Statistics another) { this.rowCount = another.rowCount; this.widthInJoinCluster = another.widthInJoinCluster; @@ -193,21 +195,24 @@ public double dataSizeFactor(List slots) { @Override public String toString() { + StringBuilder builder = new StringBuilder(); if (Double.isNaN(rowCount)) { - return "NaN"; - } - if (Double.POSITIVE_INFINITY == rowCount) { - return "Infinite"; - } - if (Double.NEGATIVE_INFINITY == rowCount) { - return "-Infinite"; + builder.append("NaN"); + } else if (Double.POSITIVE_INFINITY == rowCount) { + builder.append("Infinite"); + } else if (Double.NEGATIVE_INFINITY == rowCount) { + builder.append("-Infinite"); + } else { + DecimalFormat format = new DecimalFormat("#,###.##"); + builder.append(format.format(rowCount)); } - DecimalFormat format = new DecimalFormat("#,###.##"); - String rows = format.format(rowCount); if (deltaRowCount > 0) { - rows = rows + "(" + format.format(deltaRowCount) + ")"; + builder.append("(").append((long) deltaRowCount).append(")"); + } + if (actualRowCount != -1) { + builder.append(" actualRows=").append(actualRowCount); } - return rows; + return builder.toString(); } public String printColumnStats() { @@ -292,4 +297,12 @@ public double getDeltaRowCount() { public void setDeltaRowCount(double deltaRowCount) { this.deltaRowCount = deltaRowCount; } + + public long getActualRowCount() { + return actualRowCount; + } + + public void setActualRowCount(long actualRowCount) { + this.actualRowCount = actualRowCount; + } } From 9d5da50b7a43f8e4c26a3e0cb3e039d86c6f093d Mon Sep 17 00:00:00 2001 From: Gabriel Date: Mon, 9 Sep 2024 15:34:43 +0800 Subject: [PATCH 05/44] [metrics](shuffle) Add necessary metrics (#40476) --- be/src/vec/runtime/vdata_stream_mgr.cpp | 8 +++++--- be/src/vec/runtime/vdata_stream_recvr.cpp | 13 ++++++++++--- be/src/vec/runtime/vdata_stream_recvr.h | 7 +++++-- 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/be/src/vec/runtime/vdata_stream_mgr.cpp b/be/src/vec/runtime/vdata_stream_mgr.cpp index 80cc2d93f8e103..a5db9a6150dcfa 100644 --- a/be/src/vec/runtime/vdata_stream_mgr.cpp +++ b/be/src/vec/runtime/vdata_stream_mgr.cpp @@ -109,6 
+109,8 @@ Status VDataStreamMgr::transmit_block(const PTransmitDataParams* request, t_finst_id.hi = finst_id.hi(); t_finst_id.lo = finst_id.lo(); std::shared_ptr recvr = nullptr; + ThreadCpuStopWatch cpu_time_stop_watch; + cpu_time_stop_watch.start(); static_cast(find_recvr(t_finst_id, request->node_id(), &recvr)); if (recvr == nullptr) { // The receiver may remove itself from the receiver map via deregister_recvr() @@ -137,9 +139,9 @@ Status VDataStreamMgr::transmit_block(const PTransmitDataParams* request, bool eos = request->eos(); if (request->has_block()) { - RETURN_IF_ERROR(recvr->add_block(request->block(), request->sender_id(), - request->be_number(), request->packet_seq(), - eos ? nullptr : done, wait_for_worker)); + RETURN_IF_ERROR(recvr->add_block( + request->block(), request->sender_id(), request->be_number(), request->packet_seq(), + eos ? nullptr : done, wait_for_worker, cpu_time_stop_watch.elapsed_time())); } if (eos) { diff --git a/be/src/vec/runtime/vdata_stream_recvr.cpp b/be/src/vec/runtime/vdata_stream_recvr.cpp index 5326f2b7d0ab0a..1ca6bb7f2c5931 100644 --- a/be/src/vec/runtime/vdata_stream_recvr.cpp +++ b/be/src/vec/runtime/vdata_stream_recvr.cpp @@ -134,7 +134,8 @@ void VDataStreamRecvr::SenderQueue::try_set_dep_ready_without_lock() { Status VDataStreamRecvr::SenderQueue::add_block(const PBlock& pblock, int be_number, int64_t packet_seq, ::google::protobuf::Closure** done, - const int64_t wait_for_worker) { + const int64_t wait_for_worker, + const uint64_t time_to_find_recvr) { { std::lock_guard l(_lock); if (_is_cancelled) { @@ -189,6 +190,10 @@ Status VDataStreamRecvr::SenderQueue::add_block(const PBlock& pblock, int be_num _recvr->_max_wait_worker_time->set(wait_for_worker); } + if (_recvr->_max_find_recvr_time->value() < time_to_find_recvr) { + _recvr->_max_find_recvr_time->set((int64_t)time_to_find_recvr); + } + _block_queue.emplace_back(std::move(block), block_byte_size); COUNTER_UPDATE(_recvr->_remote_bytes_received_counter, block_byte_size); _record_debug_info(); @@ -363,6 +368,7 @@ VDataStreamRecvr::VDataStreamRecvr(VDataStreamMgr* stream_mgr, RuntimeState* sta _blocks_produced_counter = ADD_COUNTER(_profile, "BlocksProduced", TUnit::UNIT); _max_wait_worker_time = ADD_COUNTER(_profile, "MaxWaitForWorkerTime", TUnit::UNIT); _max_wait_to_process_time = ADD_COUNTER(_profile, "MaxWaitToProcessTime", TUnit::UNIT); + _max_find_recvr_time = ADD_COUNTER(_profile, "MaxFindRecvrTime(NS)", TUnit::UNIT); } VDataStreamRecvr::~VDataStreamRecvr() { @@ -391,11 +397,12 @@ Status VDataStreamRecvr::create_merger(const VExprContextSPtrs& ordering_expr, Status VDataStreamRecvr::add_block(const PBlock& pblock, int sender_id, int be_number, int64_t packet_seq, ::google::protobuf::Closure** done, - const int64_t wait_for_worker) { + const int64_t wait_for_worker, + const uint64_t time_to_find_recvr) { SCOPED_ATTACH_TASK(_query_thread_context); int use_sender_id = _is_merging ? 
sender_id : 0; return _sender_queues[use_sender_id]->add_block(pblock, be_number, packet_seq, done, - wait_for_worker); + wait_for_worker, time_to_find_recvr); } void VDataStreamRecvr::add_block(Block* block, int sender_id, bool use_move) { diff --git a/be/src/vec/runtime/vdata_stream_recvr.h b/be/src/vec/runtime/vdata_stream_recvr.h index 7eebdf0249b958..e8dcfdedba5fb9 100644 --- a/be/src/vec/runtime/vdata_stream_recvr.h +++ b/be/src/vec/runtime/vdata_stream_recvr.h @@ -83,7 +83,8 @@ class VDataStreamRecvr : public HasTaskExecutionCtx { std::vector sender_queues() const { return _sender_queues; } Status add_block(const PBlock& pblock, int sender_id, int be_number, int64_t packet_seq, - ::google::protobuf::Closure** done, const int64_t wait_for_worker); + ::google::protobuf::Closure** done, const int64_t wait_for_worker, + const uint64_t time_to_find_recvr); void add_block(Block* block, int sender_id, bool use_move); @@ -160,6 +161,7 @@ class VDataStreamRecvr : public HasTaskExecutionCtx { RuntimeProfile::Counter* _blocks_produced_counter = nullptr; RuntimeProfile::Counter* _max_wait_worker_time = nullptr; RuntimeProfile::Counter* _max_wait_to_process_time = nullptr; + RuntimeProfile::Counter* _max_find_recvr_time = nullptr; std::vector> _sender_to_local_channel_dependency; }; @@ -178,7 +180,8 @@ class VDataStreamRecvr::SenderQueue { Status get_batch(Block* next_block, bool* eos); Status add_block(const PBlock& pblock, int be_number, int64_t packet_seq, - ::google::protobuf::Closure** done, const int64_t wait_for_worker); + ::google::protobuf::Closure** done, const int64_t wait_for_worker, + const uint64_t time_to_find_recvr); void add_block(Block* block, bool use_move); From e25560e056bcd1d882039fe8ed38c85021dd78be Mon Sep 17 00:00:00 2001 From: Pxl Date: Mon, 9 Sep 2024 16:03:42 +0800 Subject: [PATCH 06/44] [Improvement](sort) do not sort partial when spill disabled (#40528) ## Proposed changes Do not do a partial sort when spill is disabled. SELECT count() from (select BrowserLanguage from hits_10m order by BrowserLanguage limit 10000000)t; 2s -> 0.3s --- be/src/vec/common/sort/sorter.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/be/src/vec/common/sort/sorter.h b/be/src/vec/common/sort/sorter.h index aa7d88dfbc2a3a..a290de65bb6534 100644 --- a/be/src/vec/common/sort/sorter.h +++ b/be/src/vec/common/sort/sorter.h @@ -177,8 +177,8 @@ class FullSorter final : public Sorter { private: bool _reach_limit() { - return _state->unsorted_block_->rows() > buffered_block_size_ || - _state->unsorted_block_->bytes() > buffered_block_bytes_; + return _enable_spill && (_state->unsorted_block_->rows() > buffered_block_size_ || - _state->unsorted_block_->bytes() > buffered_block_bytes_); } Status _do_sort(); From 6a19a37d36f772972b3d729faebf8d4c8f380d59 Mon Sep 17 00:00:00 2001 From: Xinyi Zou Date: Mon, 9 Sep 2024 17:01:44 +0800 Subject: [PATCH 07/44] [fix](memory) Revert "[opt](memory) Refactor memory maintenance thread (#40344)" (#40545) This reverts commit 84ce9451c6ebdf290d5c1b401e0a282f5acb6577. 
``` SIGABRT unknown detail explain (@0x4220) received by PID 16928 (TID 18624 OR 0x7fa0a663e700) from PID 16928; stack trace: *** 15:21:03 0# doris::signal::(anonymous namespace)::FailureSignalHandler(int, siginfo_t*, void*) at /root/doris/be/src/common/signal_handler.h:421 15:21:03 1# 0x00007FA7C6131090 in /lib/x86_64-linux-gnu/libc.so.6 15:21:03 2# raise at ../sysdeps/unix/sysv/linux/raise.c:51 15:21:03 3# abort at /build/glibc-SzIz7B/glibc-2.31/stdlib/abort.c:81 15:21:03 4# 0x0000558ACC0918AD in /mnt/ssd01/pipline/OpenSourceDoris/clusterEnv/P0/Cluster0/be/lib/doris_be 15:21:03 5# google::LogMessage::SendToLog() in /mnt/ssd01/pipline/OpenSourceDoris/clusterEnv/P0/Cluster0/be/lib/doris_be 15:21:03 6# google::LogMessage::Flush() in /mnt/ssd01/pipline/OpenSourceDoris/clusterEnv/P0/Cluster0/be/lib/doris_be 15:21:03 7# google::LogMessageFatal::~LogMessageFatal() in /mnt/ssd01/pipline/OpenSourceDoris/clusterEnv/P0/Cluster0/be/lib/doris_be 15:21:03 8# doris::ThreadContext::consume_memory(long) const at /root/doris/be/src/runtime/thread_context.h:244 15:21:03 9# Allocator::release_memory(unsigned long) const in /mnt/ssd01/pipline/OpenSourceDoris/clusterEnv/P0/Cluster0/be/lib/doris_be 15:21:03 10# doris::vectorized::ColumnVector::~ColumnVector() at /root/doris/be/src/vec/columns/column_vector.h:131 15:21:03 11# doris::vectorized::ColumnNullable::~ColumnNullable() at /root/doris/be/src/vec/columns/column_nullable.h:62 15:21:03 12# doris::vectorized::Block::~Block() at /root/doris/be/src/vec/core/block.h:92 15:21:03 13# std::vector >, std::allocator > > >::~vector() at /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/stl_vector.h:680 15:21:03 14# doris::Reusable::~Reusable() at /root/doris/be/src/service/point_query_executor.cpp:65 15:21:03 15# std::Sp_counted_base<(_gnu_cxx::_Lock_policy)2>::_M_release() at /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/shared_ptr_base.h:180 15:21:03 16# doris::LookupConnectionCache::CacheValue::~CacheValue() at /root/doris/be/src/service/point_query_executor.h:266 15:21:03 17# doris::LRUCache::set_capacity(unsigned long) at /root/doris/be/src/olap/lru_cache.cpp:194 15:21:03 18# doris::ShardedLRUCache::set_capacity(unsigned long) in /mnt/ssd01/pipline/OpenSourceDoris/clusterEnv/P0/Cluster0/be/lib/doris_be 15:21:03 19# doris::LRUCachePolicy::adjust_capacity_weighted(double) in /mnt/ssd01/pipline/OpenSourceDoris/clusterEnv/P0/Cluster0/be/lib/doris_be 15:21:03 20# doris::CacheManager::for_each_cache_refresh_capacity(double, doris::RuntimeProfile*) at /root/doris/be/src/runtime/memory/cache_manager.cpp:76 15:21:03 21# doris::Daemon::cache_adjust_capacity_thread() in /mnt/ssd01/pipline/OpenSourceDoris/clusterEnv/P0/Cluster0/be/lib/doris_be 15:21:03 22# doris::Thread::supervise_thread(void*) at /root/doris/be/src/util/thread.cpp:499 15:21:03 23# start_thread at /build/glibc-SzIz7B/glibc-2.31/nptl/pthread_create.c:478 15:21:03 24# __clone at ../sysdeps/unix/sysv/linux/x86_64/clone.S:97 ``` --- be/src/common/config.cpp | 14 +- be/src/common/config.h | 9 +- be/src/common/daemon.cpp | 195 +++++------------- be/src/common/daemon.h | 2 +- be/src/olap/lru_cache.cpp | 81 +------- be/src/olap/lru_cache.h | 28 ++- be/src/runtime/memory/cache_manager.cpp | 21 +- be/src/runtime/memory/cache_manager.h | 3 - be/src/runtime/memory/cache_policy.cpp | 8 +- be/src/runtime/memory/cache_policy.h | 28 +-- .../memory/global_memory_arbitrator.cpp | 7 - .../runtime/memory/global_memory_arbitrator.h | 17 -- 
be/src/runtime/memory/lru_cache_policy.h | 95 +++------ be/src/runtime/memory/mem_tracker_limiter.cpp | 6 +- be/src/runtime/memory/memory_reclamation.cpp | 12 ++ be/src/service/point_query_executor.h | 4 +- be/src/vec/common/allocator.cpp | 3 + be/test/olap/lru_cache_test.cpp | 144 ++----------- 18 files changed, 163 insertions(+), 514 deletions(-) diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index 0c00bd1a38f0da..00f8a042cbcbb7 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -95,9 +95,6 @@ DEFINE_String(mem_limit, "90%"); // Soft memory limit as a fraction of hard memory limit. DEFINE_Double(soft_mem_limit_frac, "0.9"); -// Cache capacity reduce mem limit as a fraction of soft mem limit. -DEFINE_mDouble(cache_capacity_reduce_mem_limit_frac, "0.6"); - // Schema change memory limit as a fraction of soft memory limit. DEFINE_Double(schema_change_mem_limit_frac, "0.6"); @@ -289,7 +286,7 @@ DEFINE_mInt32(exchg_buffer_queue_capacity_factor, "64"); DEFINE_mInt64(memory_limitation_per_thread_for_schema_change_bytes, "2147483648"); DEFINE_mInt32(cache_prune_interval_sec, "10"); -DEFINE_mInt32(cache_periodic_prune_stale_sweep_sec, "60"); +DEFINE_mInt32(cache_periodic_prune_stale_sweep_sec, "300"); // the clean interval of tablet lookup cache DEFINE_mInt32(tablet_lookup_cache_stale_sweep_time_sec, "30"); DEFINE_mInt32(point_query_row_cache_stale_sweep_time_sec, "300"); @@ -568,7 +565,7 @@ DEFINE_String(pprof_profile_dir, "${DORIS_HOME}/log"); // for jeprofile in jemalloc DEFINE_mString(jeprofile_dir, "${DORIS_HOME}/log"); DEFINE_mBool(enable_je_purge_dirty_pages, "true"); -DEFINE_mString(je_dirty_pages_mem_limit_percent, "2%"); +DEFINE_mString(je_dirty_pages_mem_limit_percent, "5%"); // to forward compatibility, will be removed later DEFINE_mBool(enable_token_check, "true"); @@ -585,12 +582,17 @@ DEFINE_Int32(num_cores, "0"); DEFINE_Bool(ignore_broken_disk, "false"); // Sleep time in milliseconds between memory maintenance iterations -DEFINE_mInt32(memory_maintenance_sleep_time_ms, "20"); +DEFINE_mInt32(memory_maintenance_sleep_time_ms, "100"); // After full gc, no longer full gc and minor gc during sleep. // After minor gc, no minor gc during sleep, but full gc is possible. DEFINE_mInt32(memory_gc_sleep_time_ms, "500"); +// Sleep time in milliseconds between memtbale flush mgr refresh iterations +DEFINE_mInt64(memtable_mem_tracker_refresh_interval_ms, "5"); + +DEFINE_mInt64(wg_weighted_memory_ratio_refresh_interval_ms, "50"); + // percent of (active memtables size / all memtables size) when reach hard limit DEFINE_mInt32(memtable_hard_limit_active_percent, "50"); diff --git a/be/src/common/config.h b/be/src/common/config.h index 720f4f72cb4bf7..bd2aa4f51be1a9 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -133,9 +133,6 @@ DECLARE_String(mem_limit); // Soft memory limit as a fraction of hard memory limit. DECLARE_Double(soft_mem_limit_frac); -// Cache capacity reduce mem limit as a fraction of soft mem limit. -DECLARE_mDouble(cache_capacity_reduce_mem_limit_frac); - // Schema change memory limit as a fraction of soft memory limit. DECLARE_Double(schema_change_mem_limit_frac); @@ -644,6 +641,12 @@ DECLARE_mInt32(memory_maintenance_sleep_time_ms); // After minor gc, no minor gc during sleep, but full gc is possible. 
DECLARE_mInt32(memory_gc_sleep_time_ms); +// Sleep time in milliseconds between memtbale flush mgr memory refresh iterations +DECLARE_mInt64(memtable_mem_tracker_refresh_interval_ms); + +// Sleep time in milliseconds between refresh iterations of workload group weighted memory ratio +DECLARE_mInt64(wg_weighted_memory_ratio_refresh_interval_ms); + // percent of (active memtables size / all memtables size) when reach hard limit DECLARE_mInt32(memtable_hard_limit_active_percent); diff --git a/be/src/common/daemon.cpp b/be/src/common/daemon.cpp index 713813b4a334f9..d8245f4045ce81 100644 --- a/be/src/common/daemon.cpp +++ b/be/src/common/daemon.cpp @@ -73,12 +73,6 @@ namespace doris { namespace { -int64_t last_print_proc_mem = 0; -int32_t refresh_cache_capacity_sleep_time_ms = 0; -#ifdef USE_JEMALLOC -int32_t je_purge_dirty_pages_sleep_time_ms = 0; -#endif - void update_rowsets_and_segments_num_metrics() { if (config::is_cloud_mode()) { // TODO(plat1ko): CloudStorageEngine @@ -210,104 +204,42 @@ void Daemon::tcmalloc_gc_thread() { #endif } -void refresh_process_memory_metrics() { - doris::PerfCounters::refresh_proc_status(); - doris::MemInfo::refresh_proc_meminfo(); - doris::GlobalMemoryArbitrator::reset_refresh_interval_memory_growth(); - ExecEnv::GetInstance()->brpc_iobuf_block_memory_tracker()->set_consumption( - butil::IOBuf::block_memory()); -} - -void refresh_common_allocator_metrics() { +void Daemon::memory_maintenance_thread() { + int32_t interval_milliseconds = config::memory_maintenance_sleep_time_ms; + int64_t last_print_proc_mem = PerfCounters::get_vm_rss(); + while (!_stop_background_threads_latch.wait_for( + std::chrono::milliseconds(interval_milliseconds))) { + // Refresh process memory metrics. + doris::PerfCounters::refresh_proc_status(); + doris::MemInfo::refresh_proc_meminfo(); + doris::GlobalMemoryArbitrator::reset_refresh_interval_memory_growth(); + ExecEnv::GetInstance()->brpc_iobuf_block_memory_tracker()->set_consumption( + butil::IOBuf::block_memory()); + // Refresh allocator memory metrics. #if !defined(ADDRESS_SANITIZER) && !defined(LEAK_SANITIZER) && !defined(THREAD_SANITIZER) - doris::MemInfo::refresh_allocator_mem(); - if (config::enable_system_metrics) { - DorisMetrics::instance()->system_metrics()->update_allocator_metrics(); - } + doris::MemInfo::refresh_allocator_mem(); +#ifdef USE_JEMALLOC + if (doris::MemInfo::je_dirty_pages_mem() > doris::MemInfo::je_dirty_pages_mem_limit() && + GlobalMemoryArbitrator::is_exceed_soft_mem_limit()) { + doris::MemInfo::notify_je_purge_dirty_pages(); + } #endif - MemInfo::refresh_memory_bvar(); -} - -void refresh_memory_state_after_memory_change() { - if (abs(last_print_proc_mem - PerfCounters::get_vm_rss()) > 268435456) { - last_print_proc_mem = PerfCounters::get_vm_rss(); - doris::MemTrackerLimiter::clean_tracker_limiter_group(); - doris::MemTrackerLimiter::enable_print_log_process_usage(); - // Refresh mem tracker each type counter. - doris::MemTrackerLimiter::refresh_global_counter(); - LOG(INFO) << doris::GlobalMemoryArbitrator:: - process_mem_log_str(); // print mem log when memory state by 256M - } -} - -void refresh_cache_capacity() { - if (refresh_cache_capacity_sleep_time_ms <= 0) { - auto cache_capacity_reduce_mem_limit = uint64_t( - doris::MemInfo::soft_mem_limit() * config::cache_capacity_reduce_mem_limit_frac); - int64_t process_memory_usage = doris::GlobalMemoryArbitrator::process_memory_usage(); - double new_cache_capacity_adjust_weighted = - process_memory_usage <= cache_capacity_reduce_mem_limit - ? 
1 - : std::min( - 1 - (process_memory_usage - cache_capacity_reduce_mem_limit) / - (doris::MemInfo::soft_mem_limit() - - cache_capacity_reduce_mem_limit), - 0); - if (new_cache_capacity_adjust_weighted != - doris::GlobalMemoryArbitrator::last_cache_capacity_adjust_weighted) { - doris::GlobalMemoryArbitrator::last_cache_capacity_adjust_weighted = - new_cache_capacity_adjust_weighted; - doris::GlobalMemoryArbitrator::notify_cache_adjust_capacity(); - refresh_cache_capacity_sleep_time_ms = config::memory_gc_sleep_time_ms; + if (config::enable_system_metrics) { + DorisMetrics::instance()->system_metrics()->update_allocator_metrics(); } - } - refresh_cache_capacity_sleep_time_ms -= config::memory_maintenance_sleep_time_ms; -} - -void je_purge_dirty_pages() { -#ifdef USE_JEMALLOC - if (je_purge_dirty_pages_sleep_time_ms <= 0 && - doris::MemInfo::je_dirty_pages_mem() > doris::MemInfo::je_dirty_pages_mem_limit() && - GlobalMemoryArbitrator::is_exceed_soft_mem_limit()) { - doris::MemInfo::notify_je_purge_dirty_pages(); - je_purge_dirty_pages_sleep_time_ms = config::memory_gc_sleep_time_ms; - } - je_purge_dirty_pages_sleep_time_ms -= config::memory_maintenance_sleep_time_ms; #endif -} - -void Daemon::memory_maintenance_thread() { - while (!_stop_background_threads_latch.wait_for( - std::chrono::milliseconds(config::memory_maintenance_sleep_time_ms))) { - // step 1. Refresh process memory metrics. - refresh_process_memory_metrics(); - - // step 2. Refresh jemalloc/tcmalloc metrics. - refresh_common_allocator_metrics(); - - // step 3. Update and print memory stat when the memory changes by 256M. - refresh_memory_state_after_memory_change(); - - // step 4. Asyn Refresh cache capacity - // TODO adjust cache capacity based on smoothstep (smooth gradient). - refresh_cache_capacity(); - - // step 5. Cancel top memory task when process memory exceed hard limit. - // TODO replace memory_gc_thread. - - // step 6. Refresh weighted memory ratio of workload groups. - doris::ExecEnv::GetInstance()->workload_group_mgr()->refresh_wg_weighted_memory_limit(); - - // step 7. Analyze blocking queries. - // TODO sort the operators that can spill, wake up the pipeline task spill - // or continue execution according to certain rules or cancel query. - - // step 8. Flush memtable - doris::GlobalMemoryArbitrator::notify_memtable_memory_refresh(); - // TODO notify flush memtable - - // step 9. Jemalloc purge all arena dirty pages - je_purge_dirty_pages(); + MemInfo::refresh_memory_bvar(); + + // Update and print memory stat when the memory changes by 256M. + if (abs(last_print_proc_mem - PerfCounters::get_vm_rss()) > 268435456) { + last_print_proc_mem = PerfCounters::get_vm_rss(); + doris::MemTrackerLimiter::clean_tracker_limiter_group(); + doris::MemTrackerLimiter::enable_print_log_process_usage(); + // Refresh mem tracker each type counter. + doris::MemTrackerLimiter::refresh_global_counter(); + LOG(INFO) << doris::GlobalMemoryArbitrator:: + process_mem_log_str(); // print mem log when memory state by 256M + } } } @@ -369,21 +301,10 @@ void Daemon::memory_gc_thread() { void Daemon::memtable_memory_refresh_thread() { // Refresh the memory statistics of the load channel tracker more frequently, // which helps to accurately control the memory of LoadChannelMgr. 
- do { - std::unique_lock l(doris::GlobalMemoryArbitrator::memtable_memory_refresh_lock); - while (_stop_background_threads_latch.count() != 0 && - !doris::GlobalMemoryArbitrator::memtable_memory_refresh_notify.load( - std::memory_order_relaxed)) { - doris::GlobalMemoryArbitrator::memtable_memory_refresh_cv.wait_for( - l, std::chrono::seconds(1)); - } - if (_stop_background_threads_latch.count() == 0) { - break; - } + while (!_stop_background_threads_latch.wait_for( + std::chrono::milliseconds(config::memtable_mem_tracker_refresh_interval_ms))) { doris::ExecEnv::GetInstance()->memtable_memory_limiter()->refresh_mem_tracker(); - doris::GlobalMemoryArbitrator::memtable_memory_refresh_notify.store( - false, std::memory_order_relaxed); - } while (true); + } } /* @@ -475,35 +396,6 @@ void Daemon::je_purge_dirty_pages_thread() const { } while (true); } -void Daemon::cache_adjust_capacity_thread() { - do { - std::unique_lock l(doris::GlobalMemoryArbitrator::cache_adjust_capacity_lock); - while (_stop_background_threads_latch.count() != 0 && - !doris::GlobalMemoryArbitrator::cache_adjust_capacity_notify.load( - std::memory_order_relaxed)) { - doris::GlobalMemoryArbitrator::cache_adjust_capacity_cv.wait_for( - l, std::chrono::seconds(1)); - } - double adjust_weighted = GlobalMemoryArbitrator::last_cache_capacity_adjust_weighted; - if (_stop_background_threads_latch.count() == 0) { - break; - } - if (config::disable_memory_gc) { - continue; - } - std::unique_ptr profile = std::make_unique(""); - auto freed_mem = CacheManager::instance()->for_each_cache_refresh_capacity(adjust_weighted, - profile.get()); - std::stringstream ss; - profile->pretty_print(&ss); - LOG(INFO) << fmt::format( - "[MemoryGC] refresh cache capacity end, free memory {}, details: {}", - PrettyPrinter::print(freed_mem, TUnit::BYTES), ss.str()); - doris::GlobalMemoryArbitrator::cache_adjust_capacity_notify.store( - false, std::memory_order_relaxed); - } while (true); -} - void Daemon::cache_prune_stale_thread() { int32_t interval = config::cache_periodic_prune_stale_sweep_sec; while (!_stop_background_threads_latch.wait_for(std::chrono::seconds(interval))) { @@ -519,6 +411,14 @@ void Daemon::cache_prune_stale_thread() { } } +void Daemon::wg_weighted_memory_ratio_refresh_thread() { + // Refresh weighted memory ratio of workload groups + while (!_stop_background_threads_latch.wait_for( + std::chrono::milliseconds(config::wg_weighted_memory_ratio_refresh_interval_ms))) { + doris::ExecEnv::GetInstance()->workload_group_mgr()->refresh_wg_weighted_memory_limit(); + } +} + void Daemon::be_proc_monitor_thread() { while (!_stop_background_threads_latch.wait_for( std::chrono::milliseconds(config::be_proc_monitor_interval_ms))) { @@ -555,10 +455,6 @@ void Daemon::start() { "Daemon", "je_purge_dirty_pages_thread", [this]() { this->je_purge_dirty_pages_thread(); }, &_threads.emplace_back()); CHECK(st.ok()) << st; - st = Thread::create( - "Daemon", "cache_adjust_capacity_thread", - [this]() { this->cache_adjust_capacity_thread(); }, &_threads.emplace_back()); - CHECK(st.ok()) << st; st = Thread::create( "Daemon", "cache_prune_stale_thread", [this]() { this->cache_prune_stale_thread(); }, &_threads.emplace_back()); @@ -568,6 +464,11 @@ void Daemon::start() { [this]() { this->report_runtime_query_statistics_thread(); }, &_threads.emplace_back()); CHECK(st.ok()) << st; + st = Thread::create( + "Daemon", "wg_weighted_memory_ratio_refresh_thread", + [this]() { this->wg_weighted_memory_ratio_refresh_thread(); }, + &_threads.emplace_back()); + if 
(config::enable_be_proc_monitor) { st = Thread::create( "Daemon", "be_proc_monitor_thread", [this]() { this->be_proc_monitor_thread(); }, diff --git a/be/src/common/daemon.h b/be/src/common/daemon.h index fe723877dcd027..64c9f0c8993ae3 100644 --- a/be/src/common/daemon.h +++ b/be/src/common/daemon.h @@ -43,9 +43,9 @@ class Daemon { void memtable_memory_refresh_thread(); void calculate_metrics_thread(); void je_purge_dirty_pages_thread() const; - void cache_adjust_capacity_thread(); void cache_prune_stale_thread(); void report_runtime_query_statistics_thread(); + void wg_weighted_memory_ratio_refresh_thread(); void be_proc_monitor_thread(); CountDownLatch _stop_background_threads_latch; diff --git a/be/src/olap/lru_cache.cpp b/be/src/olap/lru_cache.cpp index 6e5bb2fa31578f..741c2423915ede 100644 --- a/be/src/olap/lru_cache.cpp +++ b/be/src/olap/lru_cache.cpp @@ -177,51 +177,6 @@ LRUCache::~LRUCache() { prune(); } -PrunedInfo LRUCache::set_capacity(size_t capacity) { - LRUHandle* last_ref_list = nullptr; - { - std::lock_guard l(_mutex); - _capacity = capacity; - _evict_from_lru(0, &last_ref_list); - } - - int64_t pruned_count = 0; - int64_t pruned_size = 0; - while (last_ref_list != nullptr) { - ++pruned_count; - pruned_size += last_ref_list->total_size; - LRUHandle* next = last_ref_list->next; - last_ref_list->free(); - last_ref_list = next; - } - return {pruned_count, pruned_size}; -} - -uint64_t LRUCache::get_lookup_count() { - std::lock_guard l(_mutex); - return _lookup_count; -} - -uint64_t LRUCache::get_hit_count() { - std::lock_guard l(_mutex); - return _hit_count; -} - -size_t LRUCache::get_usage() { - std::lock_guard l(_mutex); - return _usage; -} - -size_t LRUCache::get_capacity() { - std::lock_guard l(_mutex); - return _capacity; -} - -size_t LRUCache::get_element_count() { - std::lock_guard l(_mutex); - return _table.element_count(); -} - bool LRUCache::_unref(LRUHandle* e) { DCHECK(e->refs > 0); e->refs--; @@ -560,19 +515,19 @@ inline uint32_t ShardedLRUCache::_hash_slice(const CacheKey& s) { return s.hash(s.data(), s.size(), 0); } -ShardedLRUCache::ShardedLRUCache(const std::string& name, size_t capacity, LRUCacheType type, +ShardedLRUCache::ShardedLRUCache(const std::string& name, size_t total_capacity, LRUCacheType type, uint32_t num_shards, uint32_t total_element_count_capacity) : _name(name), _num_shard_bits(Bits::FindLSBSetNonZero(num_shards)), _num_shards(num_shards), _shards(nullptr), _last_id(1), - _capacity(capacity) { + _total_capacity(total_capacity) { CHECK(num_shards > 0) << "num_shards cannot be 0"; CHECK_EQ((num_shards & (num_shards - 1)), 0) << "num_shards should be power of two, but got " << num_shards; - const size_t per_shard = (capacity + (_num_shards - 1)) / _num_shards; + const size_t per_shard = (total_capacity + (_num_shards - 1)) / _num_shards; const size_t per_shard_element_count_capacity = (total_element_count_capacity + (_num_shards - 1)) / _num_shards; LRUCache** shards = new (std::nothrow) LRUCache*[_num_shards]; @@ -602,12 +557,12 @@ ShardedLRUCache::ShardedLRUCache(const std::string& name, size_t capacity, LRUCa "doris_cache", _name + "_persecond", _lookup_count_bvar.get(), 60)); } -ShardedLRUCache::ShardedLRUCache(const std::string& name, size_t capacity, LRUCacheType type, +ShardedLRUCache::ShardedLRUCache(const std::string& name, size_t total_capacity, LRUCacheType type, uint32_t num_shards, CacheValueTimeExtractor cache_value_time_extractor, bool cache_value_check_timestamp, uint32_t total_element_count_capacity) - : ShardedLRUCache(name, 
capacity, type, num_shards, total_element_count_capacity) { + : ShardedLRUCache(name, total_capacity, type, num_shards, total_element_count_capacity) { for (int s = 0; s < _num_shards; s++) { _shards[s]->set_cache_value_time_extractor(cache_value_time_extractor); _shards[s]->set_cache_value_check_timestamp(cache_value_check_timestamp); @@ -625,24 +580,6 @@ ShardedLRUCache::~ShardedLRUCache() { } } -PrunedInfo ShardedLRUCache::set_capacity(size_t capacity) { - std::lock_guard l(_mutex); - PrunedInfo pruned_info; - const size_t per_shard = (capacity + (_num_shards - 1)) / _num_shards; - for (int s = 0; s < _num_shards; s++) { - PrunedInfo info = _shards[s]->set_capacity(per_shard); - pruned_info.pruned_count += info.pruned_count; - pruned_info.pruned_size += info.pruned_size; - } - _capacity = capacity; - return pruned_info; -} - -size_t ShardedLRUCache::get_capacity() { - std::lock_guard l(_mutex); - return _capacity; -} - Cache::Handle* ShardedLRUCache::insert(const CacheKey& key, void* value, size_t charge, CachePriority priority) { const uint32_t hash = _hash_slice(key); @@ -701,25 +638,25 @@ int64_t ShardedLRUCache::get_usage() { } void ShardedLRUCache::update_cache_metrics() const { - size_t capacity = 0; + size_t total_capacity = 0; size_t total_usage = 0; size_t total_lookup_count = 0; size_t total_hit_count = 0; size_t total_element_count = 0; for (int i = 0; i < _num_shards; i++) { - capacity += _shards[i]->get_capacity(); + total_capacity += _shards[i]->get_capacity(); total_usage += _shards[i]->get_usage(); total_lookup_count += _shards[i]->get_lookup_count(); total_hit_count += _shards[i]->get_hit_count(); total_element_count += _shards[i]->get_element_count(); } - cache_capacity->set_value(capacity); + cache_capacity->set_value(total_capacity); cache_usage->set_value(total_usage); cache_element_count->set_value(total_element_count); cache_lookup_count->set_value(total_lookup_count); cache_hit_count->set_value(total_hit_count); - cache_usage_ratio->set_value(capacity == 0 ? 0 : ((double)total_usage / capacity)); + cache_usage_ratio->set_value(total_capacity == 0 ? 0 : ((double)total_usage / total_capacity)); cache_hit_ratio->set_value( total_lookup_count == 0 ? 
0 : ((double)total_hit_count / total_lookup_count)); } diff --git a/be/src/olap/lru_cache.h b/be/src/olap/lru_cache.h index de7084382d7398..059020deab58f5 100644 --- a/be/src/olap/lru_cache.h +++ b/be/src/olap/lru_cache.h @@ -227,8 +227,7 @@ class Cache { virtual int64_t get_usage() = 0; - virtual PrunedInfo set_capacity(size_t capacity) = 0; - virtual size_t get_capacity() = 0; + virtual size_t get_total_capacity() = 0; private: DISALLOW_COPY_AND_ASSIGN(Cache); @@ -328,7 +327,7 @@ class LRUCache { ~LRUCache(); // Separate from constructor so caller can easily make an array of LRUCache - PrunedInfo set_capacity(size_t capacity); + void set_capacity(size_t capacity) { _capacity = capacity; } void set_element_count_capacity(uint32_t element_count_capacity) { _element_count_capacity = element_count_capacity; } @@ -346,11 +345,11 @@ class LRUCache { void set_cache_value_time_extractor(CacheValueTimeExtractor cache_value_time_extractor); void set_cache_value_check_timestamp(bool cache_value_check_timestamp); - uint64_t get_lookup_count(); - uint64_t get_hit_count(); - size_t get_usage(); - size_t get_capacity(); - size_t get_element_count(); + uint64_t get_lookup_count() const { return _lookup_count; } + uint64_t get_hit_count() const { return _hit_count; } + size_t get_usage() const { return _usage; } + size_t get_capacity() const { return _capacity; } + size_t get_element_count() const { return _table.element_count(); } private: void _lru_remove(LRUHandle* e); @@ -404,16 +403,15 @@ class ShardedLRUCache : public Cache { PrunedInfo prune() override; PrunedInfo prune_if(CachePrunePredicate pred, bool lazy_mode = false) override; int64_t get_usage() override; - PrunedInfo set_capacity(size_t capacity) override; - size_t get_capacity() override; + size_t get_total_capacity() override { return _total_capacity; }; private: // LRUCache can only be created and managed with LRUCachePolicy. 
friend class LRUCachePolicy; - explicit ShardedLRUCache(const std::string& name, size_t capacity, LRUCacheType type, + explicit ShardedLRUCache(const std::string& name, size_t total_capacity, LRUCacheType type, uint32_t num_shards, uint32_t element_count_capacity); - explicit ShardedLRUCache(const std::string& name, size_t capacity, LRUCacheType type, + explicit ShardedLRUCache(const std::string& name, size_t total_capacity, LRUCacheType type, uint32_t num_shards, CacheValueTimeExtractor cache_value_time_extractor, bool cache_value_check_timestamp, uint32_t element_count_capacity); @@ -431,8 +429,7 @@ class ShardedLRUCache : public Cache { const uint32_t _num_shards; LRUCache** _shards = nullptr; std::atomic _last_id; - std::mutex _mutex; - size_t _capacity {0}; + size_t _total_capacity; std::shared_ptr _entity; IntGauge* cache_capacity = nullptr; @@ -465,8 +462,7 @@ class DummyLRUCache : public Cache { return {0, 0}; }; int64_t get_usage() override { return 0; }; - PrunedInfo set_capacity(size_t capacity) override { return {0, 0}; }; - size_t get_capacity() override { return 0; }; + size_t get_total_capacity() override { return 0; }; }; } // namespace doris diff --git a/be/src/runtime/memory/cache_manager.cpp b/be/src/runtime/memory/cache_manager.cpp index ec57ffba50d318..a6516c40a35770 100644 --- a/be/src/runtime/memory/cache_manager.cpp +++ b/be/src/runtime/memory/cache_manager.cpp @@ -59,26 +59,11 @@ int64_t CacheManager::for_each_cache_prune_all(RuntimeProfile* profile, bool for int64_t CacheManager::cache_prune_all(CachePolicy::CacheType type, bool force) { std::lock_guard l(_caches_lock); auto* cache_policy = _caches[type]; + if (!cache_policy->enable_prune()) { + return -1; + } cache_policy->prune_all(force); return cache_policy->profile()->get_counter("FreedMemory")->value(); } -int64_t CacheManager::for_each_cache_refresh_capacity(double adjust_weighted, - RuntimeProfile* profile) { - int64_t freed_size = 0; - std::lock_guard l(_caches_lock); - for (const auto& pair : _caches) { - auto* cache_policy = pair.second; - if (!cache_policy->enable_prune()) { - continue; - } - cache_policy->adjust_capacity_weighted(adjust_weighted); - freed_size += cache_policy->profile()->get_counter("FreedMemory")->value(); - if (cache_policy->profile()->get_counter("FreedMemory")->value() != 0 && profile) { - profile->add_child(cache_policy->profile(), true, nullptr); - } - } - return freed_size; -} - } // namespace doris diff --git a/be/src/runtime/memory/cache_manager.h b/be/src/runtime/memory/cache_manager.h index a2a089b929dbdf..d94dca501670bf 100644 --- a/be/src/runtime/memory/cache_manager.h +++ b/be/src/runtime/memory/cache_manager.h @@ -81,9 +81,6 @@ class CacheManager { return false; } - int64_t for_each_cache_refresh_capacity(double adjust_weighted, - RuntimeProfile* profile = nullptr); - private: std::mutex _caches_lock; std::unordered_map _caches; diff --git a/be/src/runtime/memory/cache_policy.cpp b/be/src/runtime/memory/cache_policy.cpp index 46b9db1b35ad5f..4e50d64d88eed1 100644 --- a/be/src/runtime/memory/cache_policy.cpp +++ b/be/src/runtime/memory/cache_policy.cpp @@ -21,12 +21,8 @@ namespace doris { -CachePolicy::CachePolicy(CacheType type, size_t capacity, uint32_t stale_sweep_time_s, - bool enable_prune) - : _type(type), - _initial_capacity(capacity), - _stale_sweep_time_s(stale_sweep_time_s), - _enable_prune(enable_prune) { +CachePolicy::CachePolicy(CacheType type, uint32_t stale_sweep_time_s, bool enable_prune) + : _type(type), _stale_sweep_time_s(stale_sweep_time_s), 
_enable_prune(enable_prune) { CacheManager::instance()->register_cache(this); init_profile(); } diff --git a/be/src/runtime/memory/cache_policy.h b/be/src/runtime/memory/cache_policy.h index c43ca0b2fb7e0a..c457afd86898f2 100644 --- a/be/src/runtime/memory/cache_policy.h +++ b/be/src/runtime/memory/cache_policy.h @@ -17,12 +17,13 @@ #pragma once +#include "runtime/exec_env.h" #include "util/runtime_profile.h" namespace doris { -static constexpr int32_t CACHE_MIN_PRUNE_SIZE = 67108864; // 64M -static constexpr int32_t CACHE_MIN_PRUNE_NUMBER = 1024; +static constexpr int32_t CACHE_MIN_FREE_SIZE = 67108864; // 64M +static constexpr int32_t CACHE_MIN_FREE_NUMBER = 1024; // Base of all caches. register to CacheManager when cache is constructed. class CachePolicy { @@ -41,13 +42,12 @@ class CachePolicy { TABLET_VERSION_CACHE = 10, LAST_SUCCESS_CHANNEL_CACHE = 11, COMMON_OBJ_LRU_CACHE = 12, - FOR_UT_CACHE_SIZE = 13, + FOR_UT = 13, TABLET_SCHEMA_CACHE = 14, CREATE_TABLET_RR_IDX_CACHE = 15, CLOUD_TABLET_CACHE = 16, CLOUD_TXN_DELETE_BITMAP_CACHE = 17, NONE = 18, // not be used - FOR_UT_CACHE_NUMBER = 19, }; static std::string type_string(CacheType type) { @@ -78,8 +78,8 @@ class CachePolicy { return "LastSuccessChannelCache"; case CacheType::COMMON_OBJ_LRU_CACHE: return "CommonObjLRUCache"; - case CacheType::FOR_UT_CACHE_SIZE: - return "ForUTCacheSize"; + case CacheType::FOR_UT: + return "ForUT"; case CacheType::TABLET_SCHEMA_CACHE: return "TabletSchemaCache"; case CacheType::CREATE_TABLET_RR_IDX_CACHE: @@ -88,8 +88,6 @@ class CachePolicy { return "CloudTabletCache"; case CacheType::CLOUD_TXN_DELETE_BITMAP_CACHE: return "CloudTxnDeleteBitmapCache"; - case CacheType::FOR_UT_CACHE_NUMBER: - return "ForUTCacheNumber"; default: LOG(FATAL) << "not match type of cache policy :" << static_cast(type); } @@ -111,12 +109,11 @@ class CachePolicy { {"MowTabletVersionCache", CacheType::TABLET_VERSION_CACHE}, {"LastSuccessChannelCache", CacheType::LAST_SUCCESS_CHANNEL_CACHE}, {"CommonObjLRUCache", CacheType::COMMON_OBJ_LRU_CACHE}, - {"ForUTCacheSize", CacheType::FOR_UT_CACHE_SIZE}, + {"ForUT", CacheType::FOR_UT}, {"TabletSchemaCache", CacheType::TABLET_SCHEMA_CACHE}, {"CreateTabletRRIdxCache", CacheType::CREATE_TABLET_RR_IDX_CACHE}, {"CloudTabletCache", CacheType::CLOUD_TABLET_CACHE}, - {"CloudTxnDeleteBitmapCache", CacheType::CLOUD_TXN_DELETE_BITMAP_CACHE}, - {"ForUTCacheNumber", CacheType::FOR_UT_CACHE_NUMBER}}; + {"CloudTxnDeleteBitmapCache", CacheType::CLOUD_TXN_DELETE_BITMAP_CACHE}}; static CacheType string_to_type(std::string type) { if (StringToType.contains(type)) { @@ -126,16 +123,13 @@ class CachePolicy { } } - CachePolicy(CacheType type, size_t capacity, uint32_t stale_sweep_time_s, bool enable_prune); + CachePolicy(CacheType type, uint32_t stale_sweep_time_s, bool enable_prune); virtual ~CachePolicy(); virtual void prune_stale() = 0; virtual void prune_all(bool force) = 0; - virtual int64_t adjust_capacity_weighted(double adjust_weighted) = 0; - virtual size_t get_capacity() = 0; CacheType type() { return _type; } - size_t initial_capacity() const { return _initial_capacity; } bool enable_prune() const { return _enable_prune; } RuntimeProfile* profile() { return _profile.get(); } @@ -145,20 +139,16 @@ class CachePolicy { std::make_unique(fmt::format("Cache type={}", type_string(_type))); _prune_stale_number_counter = ADD_COUNTER(_profile, "PruneStaleNumber", TUnit::UNIT); _prune_all_number_counter = ADD_COUNTER(_profile, "PruneAllNumber", TUnit::UNIT); - _adjust_capacity_weighted_number_counter = - 
ADD_COUNTER(_profile, "SetCapacityNumber", TUnit::UNIT); _freed_memory_counter = ADD_COUNTER(_profile, "FreedMemory", TUnit::BYTES); _freed_entrys_counter = ADD_COUNTER(_profile, "FreedEntrys", TUnit::UNIT); _cost_timer = ADD_TIMER(_profile, "CostTime"); } CacheType _type; - size_t _initial_capacity {0}; std::unique_ptr _profile; RuntimeProfile::Counter* _prune_stale_number_counter = nullptr; RuntimeProfile::Counter* _prune_all_number_counter = nullptr; - RuntimeProfile::Counter* _adjust_capacity_weighted_number_counter = nullptr; // Reset before each gc RuntimeProfile::Counter* _freed_memory_counter = nullptr; RuntimeProfile::Counter* _freed_entrys_counter = nullptr; diff --git a/be/src/runtime/memory/global_memory_arbitrator.cpp b/be/src/runtime/memory/global_memory_arbitrator.cpp index 76a414a6ebdc74..344bcbc59846d9 100644 --- a/be/src/runtime/memory/global_memory_arbitrator.cpp +++ b/be/src/runtime/memory/global_memory_arbitrator.cpp @@ -38,13 +38,6 @@ bvar::PassiveStatus g_sys_mem_avail( std::atomic GlobalMemoryArbitrator::_s_process_reserved_memory = 0; std::atomic GlobalMemoryArbitrator::refresh_interval_memory_growth = 0; -std::mutex GlobalMemoryArbitrator::cache_adjust_capacity_lock; -std::condition_variable GlobalMemoryArbitrator::cache_adjust_capacity_cv; -std::atomic GlobalMemoryArbitrator::cache_adjust_capacity_notify {false}; -std::atomic GlobalMemoryArbitrator::last_cache_capacity_adjust_weighted {1}; -std::mutex GlobalMemoryArbitrator::memtable_memory_refresh_lock; -std::condition_variable GlobalMemoryArbitrator::memtable_memory_refresh_cv; -std::atomic GlobalMemoryArbitrator::memtable_memory_refresh_notify {false}; bool GlobalMemoryArbitrator::try_reserve_process_memory(int64_t bytes) { if (sys_mem_available() - bytes < MemInfo::sys_mem_available_warning_water_mark()) { diff --git a/be/src/runtime/memory/global_memory_arbitrator.h b/be/src/runtime/memory/global_memory_arbitrator.h index 5fbcf232ce4d24..f8fda18d0e9a0c 100644 --- a/be/src/runtime/memory/global_memory_arbitrator.h +++ b/be/src/runtime/memory/global_memory_arbitrator.h @@ -173,23 +173,6 @@ class GlobalMemoryArbitrator { // avoid multiple threads starting at the same time and causing OOM. 
static std::atomic refresh_interval_memory_growth; - static std::mutex cache_adjust_capacity_lock; - static std::condition_variable cache_adjust_capacity_cv; - static std::atomic cache_adjust_capacity_notify; - static std::atomic last_cache_capacity_adjust_weighted; - static void notify_cache_adjust_capacity() { - cache_adjust_capacity_notify.store(true, std::memory_order_relaxed); - cache_adjust_capacity_cv.notify_all(); - } - - static std::mutex memtable_memory_refresh_lock; - static std::condition_variable memtable_memory_refresh_cv; - static std::atomic memtable_memory_refresh_notify; - static void notify_memtable_memory_refresh() { - memtable_memory_refresh_notify.store(true, std::memory_order_relaxed); - memtable_memory_refresh_cv.notify_all(); - } - private: static std::atomic _s_process_reserved_memory; diff --git a/be/src/runtime/memory/lru_cache_policy.h b/be/src/runtime/memory/lru_cache_policy.h index 419825c85c4538..1b6c9ead6d0086 100644 --- a/be/src/runtime/memory/lru_cache_policy.h +++ b/be/src/runtime/memory/lru_cache_policy.h @@ -37,8 +37,7 @@ class LRUCachePolicy : public CachePolicy { uint32_t stale_sweep_time_s, uint32_t num_shards = DEFAULT_LRU_CACHE_NUM_SHARDS, uint32_t element_count_capacity = DEFAULT_LRU_CACHE_ELEMENT_COUNT_CAPACITY, bool enable_prune = true) - : CachePolicy(type, capacity, stale_sweep_time_s, enable_prune), - _lru_cache_type(lru_cache_type) { + : CachePolicy(type, stale_sweep_time_s, enable_prune), _lru_cache_type(lru_cache_type) { if (check_capacity(capacity, num_shards)) { _cache = std::shared_ptr( new ShardedLRUCache(type_string(type), capacity, lru_cache_type, num_shards, @@ -54,8 +53,7 @@ class LRUCachePolicy : public CachePolicy { uint32_t element_count_capacity, CacheValueTimeExtractor cache_value_time_extractor, bool cache_value_check_timestamp, bool enable_prune = true) - : CachePolicy(type, capacity, stale_sweep_time_s, enable_prune), - _lru_cache_type(lru_cache_type) { + : CachePolicy(type, stale_sweep_time_s, enable_prune), _lru_cache_type(lru_cache_type) { if (check_capacity(capacity, num_shards)) { _cache = std::shared_ptr( new ShardedLRUCache(type_string(type), capacity, lru_cache_type, num_shards, @@ -108,19 +106,18 @@ class LRUCachePolicy : public CachePolicy { int64_t get_usage() { return _cache->get_usage(); } - size_t get_capacity() override { return _cache->get_capacity(); } + size_t get_total_capacity() { return _cache->get_total_capacity(); } uint64_t new_id() { return _cache->new_id(); }; // Subclass can override this method to determine whether to do the minor or full gc virtual bool exceed_prune_limit() { - return _lru_cache_type == LRUCacheType::SIZE ? mem_consumption() > CACHE_MIN_PRUNE_SIZE - : get_usage() > CACHE_MIN_PRUNE_NUMBER; + return _lru_cache_type == LRUCacheType::SIZE ? mem_consumption() > CACHE_MIN_FREE_SIZE + : get_usage() > CACHE_MIN_FREE_NUMBER; } // Try to prune the cache if expired. 
void prune_stale() override { - std::lock_guard l(_lock); COUNTER_SET(_freed_entrys_counter, (int64_t)0); COUNTER_SET(_freed_memory_counter, (int64_t)0); if (_stale_sweep_time_s <= 0 && _cache == ExecEnv::GetInstance()->get_dummy_lru_cache()) { @@ -128,6 +125,7 @@ class LRUCachePolicy : public CachePolicy { } if (exceed_prune_limit()) { COUNTER_SET(_cost_timer, (int64_t)0); + SCOPED_TIMER(_cost_timer); const int64_t curtime = UnixMillis(); auto pred = [this, curtime](const LRUHandle* handle) -> bool { return static_cast((handle->last_visit_time + _stale_sweep_time_s * 1000) < @@ -136,38 +134,33 @@ class LRUCachePolicy : public CachePolicy { LOG(INFO) << fmt::format("[MemoryGC] {} prune stale start, consumption {}, usage {}", type_string(_type), mem_consumption(), get_usage()); - { - SCOPED_TIMER(_cost_timer); - // Prune cache in lazy mode to save cpu and minimize the time holding write lock - PrunedInfo pruned_info = _cache->prune_if(pred, true); - COUNTER_SET(_freed_entrys_counter, pruned_info.pruned_count); - COUNTER_SET(_freed_memory_counter, pruned_info.pruned_size); - } + // Prune cache in lazy mode to save cpu and minimize the time holding write lock + PrunedInfo pruned_info = _cache->prune_if(pred, true); + COUNTER_SET(_freed_entrys_counter, pruned_info.pruned_count); + COUNTER_SET(_freed_memory_counter, pruned_info.pruned_size); COUNTER_UPDATE(_prune_stale_number_counter, 1); LOG(INFO) << fmt::format( - "[MemoryGC] {} prune stale {} entries, {} bytes, cost {}, {} times prune", + "[MemoryGC] {} prune stale {} entries, {} bytes, {} times prune", type_string(_type), _freed_entrys_counter->value(), - _freed_memory_counter->value(), _cost_timer->value(), - _prune_stale_number_counter->value()); + _freed_memory_counter->value(), _prune_stale_number_counter->value()); } else { if (_lru_cache_type == LRUCacheType::SIZE) { LOG(INFO) << fmt::format( "[MemoryGC] {} not need prune stale, LRUCacheType::SIZE consumption {} " "less " - "than CACHE_MIN_PRUNE_SIZE {}", - type_string(_type), mem_consumption(), CACHE_MIN_PRUNE_SIZE); + "than CACHE_MIN_FREE_SIZE {}", + type_string(_type), mem_consumption(), CACHE_MIN_FREE_SIZE); } else if (_lru_cache_type == LRUCacheType::NUMBER) { LOG(INFO) << fmt::format( "[MemoryGC] {} not need prune stale, LRUCacheType::NUMBER usage {} less " "than " - "CACHE_MIN_PRUNE_NUMBER {}", - type_string(_type), get_usage(), CACHE_MIN_PRUNE_NUMBER); + "CACHE_MIN_FREE_NUMBER {}", + type_string(_type), get_usage(), CACHE_MIN_FREE_NUMBER); } } } void prune_all(bool force) override { - std::lock_guard l(_lock); COUNTER_SET(_freed_entrys_counter, (int64_t)0); COUNTER_SET(_freed_memory_counter, (int64_t)0); if (_cache == ExecEnv::GetInstance()->get_dummy_lru_cache()) { @@ -175,73 +168,37 @@ class LRUCachePolicy : public CachePolicy { } if ((force && mem_consumption() != 0) || exceed_prune_limit()) { COUNTER_SET(_cost_timer, (int64_t)0); + SCOPED_TIMER(_cost_timer); LOG(INFO) << fmt::format("[MemoryGC] {} prune all start, consumption {}, usage {}", type_string(_type), mem_consumption(), get_usage()); - { - SCOPED_TIMER(_cost_timer); - PrunedInfo pruned_info = _cache->prune(); - COUNTER_SET(_freed_entrys_counter, pruned_info.pruned_count); - COUNTER_SET(_freed_memory_counter, pruned_info.pruned_size); - } + PrunedInfo pruned_info = _cache->prune(); + COUNTER_SET(_freed_entrys_counter, pruned_info.pruned_count); + COUNTER_SET(_freed_memory_counter, pruned_info.pruned_size); COUNTER_UPDATE(_prune_all_number_counter, 1); LOG(INFO) << fmt::format( - "[MemoryGC] {} prune all {} entries, 
{} bytes, cost {}, {} times prune, is " - "force: {}", + "[MemoryGC] {} prune all {} entries, {} bytes, {} times prune, is force: {}", type_string(_type), _freed_entrys_counter->value(), - _freed_memory_counter->value(), _cost_timer->value(), - _prune_all_number_counter->value(), force); + _freed_memory_counter->value(), _prune_all_number_counter->value(), force); } else { if (_lru_cache_type == LRUCacheType::SIZE) { LOG(INFO) << fmt::format( "[MemoryGC] {} not need prune all, force is {}, LRUCacheType::SIZE " "consumption {}, " - "CACHE_MIN_PRUNE_SIZE {}", - type_string(_type), force, mem_consumption(), CACHE_MIN_PRUNE_SIZE); + "CACHE_MIN_FREE_SIZE {}", + type_string(_type), force, mem_consumption(), CACHE_MIN_FREE_SIZE); } else if (_lru_cache_type == LRUCacheType::NUMBER) { LOG(INFO) << fmt::format( "[MemoryGC] {} not need prune all, force is {}, LRUCacheType::NUMBER " - "usage {}, CACHE_MIN_PRUNE_NUMBER {}", - type_string(_type), force, get_usage(), CACHE_MIN_PRUNE_NUMBER); + "usage {}, CACHE_MIN_FREE_NUMBER {}", + type_string(_type), force, get_usage(), CACHE_MIN_FREE_NUMBER); } } } - int64_t adjust_capacity_weighted(double adjust_weighted) override { - std::lock_guard l(_lock); - auto capacity = static_cast(_initial_capacity * adjust_weighted); - COUNTER_SET(_freed_entrys_counter, (int64_t)0); - COUNTER_SET(_freed_memory_counter, (int64_t)0); - COUNTER_SET(_cost_timer, (int64_t)0); - if (_cache == ExecEnv::GetInstance()->get_dummy_lru_cache()) { - return 0; - } - - size_t old_capacity = get_capacity(); - int64_t old_mem_consumption = mem_consumption(); - int64_t old_usage = get_usage(); - { - SCOPED_TIMER(_cost_timer); - PrunedInfo pruned_info = _cache->set_capacity(capacity); - COUNTER_SET(_freed_entrys_counter, pruned_info.pruned_count); - COUNTER_SET(_freed_memory_counter, pruned_info.pruned_size); - } - COUNTER_UPDATE(_adjust_capacity_weighted_number_counter, 1); - LOG(INFO) << fmt::format( - "[MemoryGC] {} update capacity, old , " - "adjust_weighted {}, new , prune {} " - "entries, {} bytes, cost {}, {} times prune", - type_string(_type), old_capacity, old_mem_consumption, old_usage, adjust_weighted, - get_capacity(), mem_consumption(), get_usage(), _freed_entrys_counter->value(), - _freed_memory_counter->value(), _cost_timer->value(), - _adjust_capacity_weighted_number_counter->value()); - return _freed_entrys_counter->value(); - } - protected: // if check_capacity failed, will return dummy lru cache, // compatible with ShardedLRUCache usage, but will not actually cache. 
std::shared_ptr _cache; - std::mutex _lock; LRUCacheType _lru_cache_type; }; diff --git a/be/src/runtime/memory/mem_tracker_limiter.cpp b/be/src/runtime/memory/mem_tracker_limiter.cpp index 59546b11d51a8a..a8aa44414ebf87 100644 --- a/be/src/runtime/memory/mem_tracker_limiter.cpp +++ b/be/src/runtime/memory/mem_tracker_limiter.cpp @@ -739,10 +739,10 @@ int64_t MemTrackerLimiter::free_top_overcommit_query( LOG(INFO) << log_prefix << "finished, no task need be canceled."; return 0; } - if (small_num == 0 && canceling_task.empty() && query_consumption.size() == 1) { + if (query_consumption.size() == 1) { auto iter = query_consumption.begin(); - LOG(INFO) << log_prefix << "finished, only one overcommit task: " << iter->first - << ", memory consumption: " << iter->second << ", no other tasks, so no cancel."; + LOG(INFO) << log_prefix << "finished, only one task: " << iter->first + << ", memory consumption: " << iter->second << ", no cancel."; return 0; } diff --git a/be/src/runtime/memory/memory_reclamation.cpp b/be/src/runtime/memory/memory_reclamation.cpp index 17f5a41f462b50..3adf1d1ac75718 100644 --- a/be/src/runtime/memory/memory_reclamation.cpp +++ b/be/src/runtime/memory/memory_reclamation.cpp @@ -37,6 +37,7 @@ bool MemoryReclamation::process_minor_gc(std::string mem_info) { std::unique_ptr profile = std::make_unique(""); Defer defer {[&]() { + MemInfo::notify_je_purge_dirty_pages(); std::stringstream ss; profile->pretty_print(&ss); LOG(INFO) << fmt::format( @@ -45,6 +46,11 @@ bool MemoryReclamation::process_minor_gc(std::string mem_info) { ss.str()); }}; + freed_mem += CacheManager::instance()->for_each_cache_prune_stale(profile.get()); + if (freed_mem > MemInfo::process_minor_gc_size()) { + return true; + } + if (config::enable_workload_group_memory_gc) { RuntimeProfile* tg_profile = profile->create_child("WorkloadGroup", true, true); freed_mem += tg_enable_overcommit_group_gc(MemInfo::process_minor_gc_size() - freed_mem, @@ -81,6 +87,7 @@ bool MemoryReclamation::process_full_gc(std::string mem_info) { std::unique_ptr profile = std::make_unique(""); Defer defer {[&]() { + MemInfo::notify_je_purge_dirty_pages(); std::stringstream ss; profile->pretty_print(&ss); LOG(INFO) << fmt::format( @@ -89,6 +96,11 @@ bool MemoryReclamation::process_full_gc(std::string mem_info) { ss.str()); }}; + freed_mem += CacheManager::instance()->for_each_cache_prune_all(profile.get()); + if (freed_mem > MemInfo::process_full_gc_size()) { + return true; + } + if (config::enable_workload_group_memory_gc) { RuntimeProfile* tg_profile = profile->create_child("WorkloadGroup", true, true); freed_mem += tg_enable_overcommit_group_gc(MemInfo::process_full_gc_size() - freed_mem, diff --git a/be/src/service/point_query_executor.h b/be/src/service/point_query_executor.h index 7503fd2c102a68..19954479c97ec7 100644 --- a/be/src/service/point_query_executor.h +++ b/be/src/service/point_query_executor.h @@ -246,8 +246,8 @@ class LookupConnectionCache : public LRUCachePolicyTrackingManual { auto* value = new CacheValue; value->item = item; LOG(INFO) << "Add item mem" - << ", cache_capacity: " << get_capacity() << ", cache_usage: " << get_usage() - << ", mem_consum: " << mem_consumption(); + << ", cache_capacity: " << get_total_capacity() + << ", cache_usage: " << get_usage() << ", mem_consum: " << mem_consumption(); auto* lru_handle = insert(key, value, 1, sizeof(Reusable), CachePriority::NORMAL); release(lru_handle); } diff --git a/be/src/vec/common/allocator.cpp b/be/src/vec/common/allocator.cpp index 
2619c0bafffb16..dff1330888f82d 100644 --- a/be/src/vec/common/allocator.cpp +++ b/be/src/vec/common/allocator.cpp @@ -106,6 +106,9 @@ void Allocator::sys_mem return; } + // no significant impact on performance is expected. + doris::MemInfo::notify_je_purge_dirty_pages(); + if (doris::thread_context()->thread_mem_tracker_mgr->is_attach_query() && doris::thread_context()->thread_mem_tracker_mgr->wait_gc()) { int64_t wait_milliseconds = 0; diff --git a/be/test/olap/lru_cache_test.cpp b/be/test/olap/lru_cache_test.cpp index 9adb30b93054f4..4fc096380c754b 100644 --- a/be/test/olap/lru_cache_test.cpp +++ b/be/test/olap/lru_cache_test.cpp @@ -88,46 +88,25 @@ class CacheTest : public testing::Test { void* value; }; - class CacheTestSizePolicy : public LRUCachePolicyTrackingManual { + class CacheTestPolicy : public LRUCachePolicyTrackingManual { public: - CacheTestSizePolicy(size_t capacity) - : LRUCachePolicyTrackingManual(CachePolicy::CacheType::FOR_UT_CACHE_SIZE, capacity, + CacheTestPolicy(size_t capacity) + : LRUCachePolicyTrackingManual(CachePolicy::CacheType::FOR_UT, capacity, LRUCacheType::SIZE, -1) {} }; - class CacheTestNumberPolicy : public LRUCachePolicyTrackingManual { - public: - CacheTestNumberPolicy(size_t capacity, uint32_t num_shards) - : LRUCachePolicyTrackingManual(CachePolicy::CacheType::FOR_UT_CACHE_NUMBER, - capacity, LRUCacheType::NUMBER, -1, num_shards) {} - }; - // there is 16 shards in ShardedLRUCache // And the LRUHandle size is about 100B. So the cache size should big enough // to run the UT. static const int kCacheSize = 1000 * 16; std::vector _deleted_keys; std::vector _deleted_values; - LRUCachePolicy* _cache = nullptr; + CacheTestPolicy* _cache; - CacheTest() { _s_current = this; } + CacheTest() : _cache(new CacheTestPolicy(kCacheSize)) { _s_current = this; } ~CacheTest() override { delete _cache; } - void init_size_cache(size_t capacity = kCacheSize) { - if (_cache != nullptr) { - delete _cache; - } - _cache = new CacheTestSizePolicy(capacity); - } - - void init_number_cache(size_t capacity = kCacheSize, uint32_t num_shards = 1) { - if (_cache != nullptr) { - delete _cache; - } - _cache = new CacheTestNumberPolicy(capacity, num_shards); - } - LRUCachePolicy* cache() const { return _cache; } int Lookup(int key) const { @@ -170,25 +149,7 @@ class CacheTest : public testing::Test { }; CacheTest* CacheTest::_s_current; -static void insert_LRUCache(LRUCache& cache, const CacheKey& key, int value, - CachePriority priority) { - uint32_t hash = key.hash(key.data(), key.size(), 0); - auto* cache_value = new CacheTest::CacheValue(EncodeValue(value)); - cache.release(cache.insert(key, hash, cache_value, value, priority)); -} - -static void insert_number_LRUCache(LRUCache& cache, const CacheKey& key, int value, int charge, - CachePriority priority) { - uint32_t hash = key.hash(key.data(), key.size(), 0); - auto* cache_value = new CacheTest::CacheValue(EncodeValue(value)); - cache.release(cache.insert(key, hash, cache_value, charge, priority)); -} - -// https://stackoverflow.com/questions/42756443/undefined-reference-with-gtest -const int CacheTest::kCacheSize; - TEST_F(CacheTest, HitAndMiss) { - init_size_cache(); EXPECT_EQ(-1, Lookup(100)); Insert(100, 101, 1); @@ -212,7 +173,6 @@ TEST_F(CacheTest, HitAndMiss) { } TEST_F(CacheTest, Erase) { - init_size_cache(); Erase(200); EXPECT_EQ(0, _deleted_keys.size()); @@ -232,7 +192,6 @@ TEST_F(CacheTest, Erase) { } TEST_F(CacheTest, EntriesArePinned) { - init_size_cache(); Insert(100, 101, 1); std::string result1; Cache::Handle* h1 = 
cache()->lookup(EncodeKey(&result1, 100)); @@ -260,7 +219,6 @@ TEST_F(CacheTest, EntriesArePinned) { } TEST_F(CacheTest, EvictionPolicy) { - init_size_cache(); Insert(100, 101, 1); Insert(200, 201, 1); @@ -276,7 +234,6 @@ TEST_F(CacheTest, EvictionPolicy) { } TEST_F(CacheTest, EvictionPolicyWithDurable) { - init_size_cache(); Insert(100, 101, 1); InsertDurable(200, 201, 1); Insert(300, 101, 1); @@ -293,6 +250,20 @@ TEST_F(CacheTest, EvictionPolicyWithDurable) { EXPECT_EQ(201, Lookup(200)); } +static void insert_LRUCache(LRUCache& cache, const CacheKey& key, int value, + CachePriority priority) { + uint32_t hash = key.hash(key.data(), key.size(), 0); + auto* cache_value = new CacheTest::CacheValue(EncodeValue(value)); + cache.release(cache.insert(key, hash, cache_value, value, priority)); +} + +static void insert_number_LRUCache(LRUCache& cache, const CacheKey& key, int value, int charge, + CachePriority priority) { + uint32_t hash = key.hash(key.data(), key.size(), 0); + auto* cache_value = new CacheTest::CacheValue(EncodeValue(value)); + cache.release(cache.insert(key, hash, cache_value, charge, priority)); +} + TEST_F(CacheTest, Usage) { LRUCache cache(LRUCacheType::SIZE); cache.set_capacity(1040); @@ -492,7 +463,6 @@ TEST_F(CacheTest, Number) { } TEST_F(CacheTest, HeavyEntries) { - init_size_cache(); // Add a bunch of light and heavy entries and then count the combined // size of items still in the cache, which must be approximately the // same as the total capacity. @@ -524,14 +494,12 @@ TEST_F(CacheTest, HeavyEntries) { } TEST_F(CacheTest, NewId) { - init_size_cache(); uint64_t a = cache()->new_id(); uint64_t b = cache()->new_id(); EXPECT_NE(a, b); } TEST_F(CacheTest, SimpleBenchmark) { - init_size_cache(); for (int i = 0; i < kCacheSize * LOOP_LESS_OR_MORE(10, 10000); i++) { Insert(1000 + i, 2000 + i, 1); EXPECT_EQ(2000 + i, Lookup(1000 + i)); @@ -630,78 +598,4 @@ TEST(CacheHandleTest, HandleTableTest) { } } -TEST_F(CacheTest, SetCapacity) { - init_number_cache(); - for (int i = 0; i < kCacheSize; i++) { - Insert(i, 1000 + i, 1); - EXPECT_EQ(1000 + i, Lookup(i)); - } - ASSERT_EQ(kCacheSize, cache()->get_capacity()); - ASSERT_EQ(kCacheSize, cache()->get_usage()); - - int64_t prune_num = cache()->adjust_capacity_weighted(2); - ASSERT_EQ(prune_num, 0); - ASSERT_EQ(kCacheSize * 2, cache()->get_capacity()); - ASSERT_EQ(kCacheSize, cache()->get_usage()); - - prune_num = cache()->adjust_capacity_weighted(0.5); - ASSERT_EQ(prune_num, kCacheSize / 2); - ASSERT_EQ(kCacheSize / 2, cache()->get_capacity()); - ASSERT_EQ(kCacheSize / 2, cache()->get_usage()); - - std::vector handles(kCacheSize, nullptr); - for (int i = 0; i < kCacheSize; i++) { - std::string result; - CacheKey cache_key = EncodeKey(&result, kCacheSize + i); - auto* cache_value = new CacheValueWithKey(DecodeKey(cache_key), EncodeValue(i)); - handles[i] = cache()->insert(cache_key, cache_value, 1, 1); - } - ASSERT_EQ(kCacheSize / 2, cache()->get_capacity()); - ASSERT_EQ(kCacheSize, - cache()->get_usage()); // Handle not be released, so key cannot be evicted. - - for (int i = 0; i < kCacheSize; i++) { - Insert(i + kCacheSize, 2000 + i, 1); - EXPECT_EQ(-1, Lookup(i + kCacheSize)); // Cache is full, insert failed. 
- } - ASSERT_EQ(kCacheSize / 2, cache()->get_capacity()); - ASSERT_EQ(kCacheSize, cache()->get_usage()); - - cache()->adjust_capacity_weighted(2); - ASSERT_EQ(kCacheSize * 2, cache()->get_capacity()); - ASSERT_EQ(kCacheSize, cache()->get_usage()); - - for (int i = 0; i < kCacheSize; i++) { - Insert(i, 3000 + i, 1); - EXPECT_EQ(3000 + i, Lookup(i)); - } - ASSERT_EQ(kCacheSize * 2, cache()->get_capacity()); - ASSERT_EQ(kCacheSize * 2, cache()->get_usage()); - - cache()->adjust_capacity_weighted(0); - ASSERT_EQ(0, cache()->get_capacity()); - ASSERT_EQ(kCacheSize, cache()->get_usage()); - - for (auto it : handles) { - cache()->release(it); - } - ASSERT_EQ(0, cache()->get_capacity()); - ASSERT_EQ(0, cache()->get_usage()); - - cache()->adjust_capacity_weighted(1); - ASSERT_EQ(kCacheSize, cache()->get_capacity()); - ASSERT_EQ(0, cache()->get_usage()); - - cache()->adjust_capacity_weighted(0); - ASSERT_EQ(0, cache()->get_capacity()); - ASSERT_EQ(0, cache()->get_usage()); - - for (int i = 0; i < kCacheSize; i++) { - Insert(i, 4000 + i, 1); - EXPECT_EQ(-1, Lookup(i)); - } - ASSERT_EQ(0, cache()->get_capacity()); - ASSERT_EQ(0, cache()->get_usage()); -} - } // namespace doris From 5392cb13dfbb582cd726c8a6aeb3e33a01f0d1eb Mon Sep 17 00:00:00 2001 From: zhangdong <493738387@qq.com> Date: Mon, 9 Sep 2024 19:06:18 +0800 Subject: [PATCH 08/44] [fix](mtmv)fix nested mtmv not refresh (#40433) Fix nested MTMV not refreshing: after an insert overwrite on the underlying materialized view, the partition version remains unchanged, so the upper-level materialized view could not detect the change. We now also record the partition id in the snapshot. --- .../org/apache/doris/catalog/OlapTable.java | 3 +- .../mtmv/MTMVRefreshPartitionSnapshot.java | 45 +++++++++++- .../doris/mtmv/MTMVVersionSnapshot.java | 25 ++++++- .../data/mtmv_p0/test_multi_level_mtmv.out | 11 +++ .../test_upgrade_downgrade_olap_mtmv.out | 9 +++ .../mtmv_p0/test_multi_level_mtmv.groovy | 16 +++++ .../suites/mtmv_up_down_olap_p0/load.groovy | 71 +++++++++++++++++++ .../test_upgrade_downgrade_olap_mtmv.groovy | 32 +++++++++ 8 files changed, 208 insertions(+), 4 deletions(-) create mode 100644 regression-test/data/mtmv_up_down_olap_p0/test_upgrade_downgrade_olap_mtmv.out create mode 100644 regression-test/suites/mtmv_up_down_olap_p0/load.groovy create mode 100644 regression-test/suites/mtmv_up_down_olap_p0/test_upgrade_downgrade_olap_mtmv.groovy diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java index a60da71b299328..2f5eb35ad757fb 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java @@ -3147,9 +3147,10 @@ public List getPartitionColumns() { public MTMVSnapshotIf getPartitionSnapshot(String partitionName, MTMVRefreshContext context) throws AnalysisException { Map partitionVersions = context.getBaseVersions().getPartitionVersions(); + long partitionId = getPartitionOrAnalysisException(partitionName).getId(); long visibleVersion = partitionVersions.containsKey(partitionName) ?
partitionVersions.get(partitionName) : getPartitionOrAnalysisException(partitionName).getVisibleVersion(); - return new MTMVVersionSnapshot(visibleVersion); + return new MTMVVersionSnapshot(visibleVersion, partitionId); } @Override diff --git a/fe/fe-core/src/main/java/org/apache/doris/mtmv/MTMVRefreshPartitionSnapshot.java b/fe/fe-core/src/main/java/org/apache/doris/mtmv/MTMVRefreshPartitionSnapshot.java index 63bbfc2e037084..fa17ed766661d0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/mtmv/MTMVRefreshPartitionSnapshot.java +++ b/fe/fe-core/src/main/java/org/apache/doris/mtmv/MTMVRefreshPartitionSnapshot.java @@ -18,6 +18,9 @@ package org.apache.doris.mtmv; import org.apache.doris.catalog.MTMV; +import org.apache.doris.catalog.OlapTable; +import org.apache.doris.catalog.Partition; +import org.apache.doris.common.AnalysisException; import com.google.common.collect.Maps; import com.google.gson.annotations.SerializedName; @@ -74,6 +77,46 @@ public String toString() { } public void compatible(MTMV mtmv) { + try { + // add partitionId to the snapshot to resolve the insert overwrite problem + compatiblePartitions(mtmv); + } catch (Throwable e) { + LOG.warn("MTMV compatiblePartitions failed, mtmv: {}", mtmv.getName(), e); + } + try { + // change table id to BaseTableInfo + compatibleTables(mtmv); + } catch (Throwable e) { + LOG.warn("MTMV compatibleTables failed, mtmv: {}", mtmv.getName(), e); + } + } + + private void compatiblePartitions(MTMV mtmv) throws AnalysisException { + if (!checkHasDataWithoutPartitionId()) { + return; + } + OlapTable relatedTable = (OlapTable) mtmv.getMvPartitionInfo().getRelatedTable(); + for (Entry entry : partitions.entrySet()) { + MTMVVersionSnapshot versionSnapshot = (MTMVVersionSnapshot) entry.getValue(); + if (versionSnapshot.getId() == 0) { + Partition partition = relatedTable.getPartition(entry.getKey()); + if (partition != null) { + (versionSnapshot).setId(partition.getId()); + } + } + } + } + + private boolean checkHasDataWithoutPartitionId() { + for (MTMVSnapshotIf snapshot : partitions.values()) { + if (snapshot instanceof MTMVVersionSnapshot && ((MTMVVersionSnapshot) snapshot).getId() == 0) { + return true; + } + } + return false; + } + + private void compatibleTables(MTMV mtmv) { if (tables.size() == tablesInfo.size()) { return; } @@ -87,7 +130,7 @@ public void compatible(MTMV mtmv) { if (tableInfo.isPresent()) { tablesInfo.put(tableInfo.get(), entry.getValue()); } else { - LOG.warn("MTMV compatible failed, tableId: {}, relationTables: {}", entry.getKey(), + LOG.warn("MTMV compatibleTables failed, tableId: {}, relationTables: {}", entry.getKey(), relation.getBaseTablesOneLevel()); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/mtmv/MTMVVersionSnapshot.java b/fe/fe-core/src/main/java/org/apache/doris/mtmv/MTMVVersionSnapshot.java index 0eb7860bc54ee0..2440649462ebf3 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/mtmv/MTMVVersionSnapshot.java +++ b/fe/fe-core/src/main/java/org/apache/doris/mtmv/MTMVVersionSnapshot.java @@ -24,10 +24,30 @@ public class MTMVVersionSnapshot implements MTMVSnapshotIf { @SerializedName("v") private long version; + // The partition version after insert overwrite is 1, + // which may cause the upper level materialized view to be unaware of changes in the data at the bottom level. + // However, the partition ID after overwrite will change, so the partition ID should be added.
+ // only used for partitions; for tables this is always 0 + @SerializedName("id") + private long id; + public MTMVVersionSnapshot(long version) { this.version = version; } + public MTMVVersionSnapshot(long version, long id) { + this.version = version; + this.id = id; + } + + public long getId() { + return id; + } + + public void setId(long id) { + this.id = id; + } + @Override public boolean equals(Object o) { if (this == o) { @@ -37,18 +57,19 @@ public boolean equals(Object o) { return false; } MTMVVersionSnapshot that = (MTMVVersionSnapshot) o; - return version == that.version; + return version == that.version && id == that.id; } @Override public int hashCode() { - return Objects.hashCode(version); + return Objects.hashCode(version, id); } @Override public String toString() { return "MTMVVersionSnapshot{" + "version=" + version + + ", id=" + id + '}'; } } diff --git a/regression-test/data/mtmv_p0/test_multi_level_mtmv.out b/regression-test/data/mtmv_p0/test_multi_level_mtmv.out index 7543b21ffa7bbc..7d44e381cc8fc2 100644 --- a/regression-test/data/mtmv_p0/test_multi_level_mtmv.out +++ b/regression-test/data/mtmv_p0/test_multi_level_mtmv.out @@ -11,6 +11,17 @@ -- !mv2_should_one_partition -- ["p_2"] +-- !mv1_should_one_partition_again -- +["p_2"] + +-- !mv2_should_one_partition_again -- +["p_2"] + +-- !mv2_again -- +1 1 +2 2 +2 3 + -- !status1 -- multi_level_mtmv1 SCHEMA_CHANGE SUCCESS diff --git a/regression-test/data/mtmv_up_down_olap_p0/test_upgrade_downgrade_olap_mtmv.out b/regression-test/data/mtmv_up_down_olap_p0/test_upgrade_downgrade_olap_mtmv.out new file mode 100644 index 00000000000000..760e94479a82b0 --- /dev/null +++ b/regression-test/data/mtmv_up_down_olap_p0/test_upgrade_downgrade_olap_mtmv.out @@ -0,0 +1,9 @@ +-- This file is automatically generated.
You should know what you did if you want to edit this +-- !refresh_init -- +1 2017-01-15 1 +2 2017-02-15 2 +3 2017-03-15 3 + +-- !mtmv_sync -- +true + diff --git a/regression-test/suites/mtmv_p0/test_multi_level_mtmv.groovy b/regression-test/suites/mtmv_p0/test_multi_level_mtmv.groovy index 55689b741489eb..33a876c46d4ef2 100644 --- a/regression-test/suites/mtmv_p0/test_multi_level_mtmv.groovy +++ b/regression-test/suites/mtmv_p0/test_multi_level_mtmv.groovy @@ -87,6 +87,22 @@ suite("test_multi_level_mtmv") { waitingMTMVTaskFinishedByMvName(mv2) order_qt_mv2_should_one_partition "select NeedRefreshPartitions from tasks('type'='mv') where MvName = '${mv2}' order by CreateTime desc limit 1" + // insert into p2 again, check partition version if change + sql """ + INSERT INTO ${tableName} VALUES(2,3); + """ + sql """ + REFRESH MATERIALIZED VIEW ${mv1} AUTO + """ + waitingMTMVTaskFinishedByMvName(mv1) + order_qt_mv1_should_one_partition_again "select NeedRefreshPartitions from tasks('type'='mv') where MvName = '${mv1}' order by CreateTime desc limit 1" + sql """ + REFRESH MATERIALIZED VIEW ${mv2} AUTO + """ + waitingMTMVTaskFinishedByMvName(mv2) + order_qt_mv2_should_one_partition_again "select NeedRefreshPartitions from tasks('type'='mv') where MvName = '${mv2}' order by CreateTime desc limit 1" + order_qt_mv2_again "select * from ${mv2}" + // drop table sql """ drop table ${tableName} diff --git a/regression-test/suites/mtmv_up_down_olap_p0/load.groovy b/regression-test/suites/mtmv_up_down_olap_p0/load.groovy new file mode 100644 index 00000000000000..f909b33064d457 --- /dev/null +++ b/regression-test/suites/mtmv_up_down_olap_p0/load.groovy @@ -0,0 +1,71 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_upgrade_downgrade_prepare_olap_mtmv","p0,mtmv,restart_fe") { + String suiteName = "mtmv_up_down_olap" + String mvName = "${suiteName}_mtmv" + String tableName = "${suiteName}_table" + String tableName2 = "${suiteName}_table2" + + sql """drop materialized view if exists ${mvName};""" + sql """drop table if exists `${tableName}`""" + sql """drop table if exists `${tableName2}`""" + + sql """ + CREATE TABLE `${tableName}` ( + `user_id` LARGEINT NOT NULL COMMENT '\"用户id\"', + `date` DATE NOT NULL COMMENT '\"数据灌入日期时间\"', + `num` SMALLINT NOT NULL COMMENT '\"数量\"' + ) ENGINE=OLAP + DUPLICATE KEY(`user_id`, `date`, `num`) + COMMENT 'OLAP' + PARTITION BY RANGE(`date`) + (PARTITION p201701_1000 VALUES [('0000-01-01'), ('2017-02-01')), + PARTITION p201702_2000 VALUES [('2017-02-01'), ('2017-03-01')), + PARTITION p201703_all VALUES [('2017-03-01'), ('2017-04-01'))) + DISTRIBUTED BY HASH(`user_id`) BUCKETS 2 + PROPERTIES ('replication_num' = '1') ; + """ + sql """ + insert into ${tableName} values(1,"2017-01-15",1),(2,"2017-02-15",2),(3,"2017-03-15",3); + """ + + sql """ + CREATE TABLE `${tableName2}` ( + `user_id` LARGEINT NOT NULL COMMENT '\"用户id\"', + `age` SMALLINT NOT NULL COMMENT '\"年龄\"' + ) ENGINE=OLAP + DUPLICATE KEY(`user_id`, `age`) + COMMENT 'OLAP' + DISTRIBUTED BY HASH(`user_id`) BUCKETS 2 + PROPERTIES ('replication_num' = '1') ; + """ + sql """ + insert into ${tableName2} values(1,1),(2,2),(3,3); + """ + + sql """ + CREATE MATERIALIZED VIEW ${mvName} + REFRESH AUTO ON MANUAL + partition by(`date`) + DISTRIBUTED BY RANDOM BUCKETS 2 + PROPERTIES ('replication_num' = '1') + AS + SELECT a.* FROM ${tableName} a inner join ${tableName2} b on a.user_id=b.user_id; + """ + waitingMTMVTaskFinishedByMvName(mvName) +} diff --git a/regression-test/suites/mtmv_up_down_olap_p0/test_upgrade_downgrade_olap_mtmv.groovy b/regression-test/suites/mtmv_up_down_olap_p0/test_upgrade_downgrade_olap_mtmv.groovy new file mode 100644 index 00000000000000..253908ff4ae8ce --- /dev/null +++ b/regression-test/suites/mtmv_up_down_olap_p0/test_upgrade_downgrade_olap_mtmv.groovy @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_upgrade_downgrade_olap_mtmv","p0,mtmv,restart_fe") { + String suiteName = "mtmv_up_down_olap" + String dbName = context.config.getDbNameByFile(context.file) + String mvName = "${suiteName}_mtmv" + String tableName = "${suiteName}_table" + // test data is normal + order_qt_refresh_init "SELECT * FROM ${mvName}" + // test is sync + order_qt_mtmv_sync "select SyncWithBaseTables from mv_infos('database'='${dbName}') where Name='${mvName}'" + sql """ + REFRESH MATERIALIZED VIEW ${mvName} complete + """ + // test can refresh success + waitingMTMVTaskFinishedByMvName(mvName) +} From 0d3374bea554c7b4140f56666fb11dad7424b609 Mon Sep 17 00:00:00 2001 From: minghong Date: Mon, 9 Sep 2024 20:17:56 +0800 Subject: [PATCH 09/44] [opt](nereids) tabe row count priority: user injected > BE report > analyzed (#40529) ## Proposed changes there are 2 sources for table row count, one is analyzed result, another is from BE report. But neither of them is accurate. the priorities of them from high to low: user injected > BE report > analyzed Issue Number: close #xxx --- .../doris/nereids/stats/StatsCalculator.java | 50 ++++++++++--------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java index 9ea5811502293a..5946192a27eff9 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java @@ -364,19 +364,28 @@ private void checkIfUnknownStatsUsedAsKey(StatisticsBuilder builder) { } } - private Statistics computeOlapScan(OlapScan olapScan) { + private double getOlapTableRowCount(OlapScan olapScan) { OlapTable olapTable = olapScan.getTable(); - double tableRowCount = olapTable.getRowCountForIndex(olapScan.getSelectedIndexId(), true); - if (tableRowCount <= 0) { - AnalysisManager analysisManager = Env.getCurrentEnv().getAnalysisManager(); - TableStatsMeta tableMeta = analysisManager.findTableStatsStatus(olapScan.getTable().getId()); - if (tableMeta != null) { - // create-view after analyzing, we may get -1 for this view row count - tableRowCount = Math.max(1, tableMeta.getRowCount(olapScan.getSelectedIndexId())); - } else { - tableRowCount = 1; + AnalysisManager analysisManager = Env.getCurrentEnv().getAnalysisManager(); + TableStatsMeta tableMeta = analysisManager.findTableStatsStatus(olapScan.getTable().getId()); + double rowCount = -1; + if (tableMeta != null && tableMeta.userInjected) { + rowCount = tableMeta.getRowCount(olapScan.getSelectedIndexId()); + } else { + rowCount = olapTable.getRowCountForIndex(olapScan.getSelectedIndexId(), true); + if (rowCount == -1) { + if (tableMeta != null) { + rowCount = tableMeta.getRowCount(olapScan.getSelectedIndexId()); + } } } + return rowCount; + } + + private Statistics computeOlapScan(OlapScan olapScan) { + OlapTable olapTable = olapScan.getTable(); + double tableRowCount = getOlapTableRowCount(olapScan); + tableRowCount = Math.max(1, tableRowCount); if (olapScan.getSelectedIndexId() != olapScan.getTable().getBaseIndexId() || olapTable instanceof MTMV) { // mv is selected, return its estimated stats @@ -441,10 +450,13 @@ private Statistics computeOlapScan(OlapScan olapScan) { } } + boolean useTableLevelStats = true; if (olapScan.getSelectedPartitionIds().size() < olapScan.getTable().getPartitionNum()) { // partition pruned + // try to use selected partition stats, if failed, fall back 
to table stats double selectedPartitionsRowCount = getSelectedPartitionRowCount(olapScan); - if (selectedPartitionsRowCount > 0) { + if (selectedPartitionsRowCount >= 0) { + useTableLevelStats = false; List selectedPartitionNames = new ArrayList<>(olapScan.getSelectedPartitionIds().size()); olapScan.getSelectedPartitionIds().forEach(id -> { selectedPartitionNames.add(olapScan.getTable().getPartition(id).getName()); @@ -458,19 +470,11 @@ private Statistics computeOlapScan(OlapScan olapScan) { } checkIfUnknownStatsUsedAsKey(builder); builder.setRowCount(selectedPartitionsRowCount + deltaRowCount); - } else { - // if partition row count is invalid (-1), fallback to table stats - for (SlotReference slot : visibleOutputSlots) { - ColumnStatistic cache = getColumnStatsFromTableCache((CatalogRelation) olapScan, slot); - ColumnStatisticBuilder colStatsBuilder = new ColumnStatisticBuilder(cache); - colStatsBuilder.setCount(tableRowCount); - colStatsBuilder.normalizeAvgSizeByte(slot); - builder.putColumnStatistics(slot, colStatsBuilder.build()); - } - checkIfUnknownStatsUsedAsKey(builder); - builder.setRowCount(tableRowCount + deltaRowCount); } - } else { + } + // 1. no partition is pruned, or + // 2. fall back to table stats + if (useTableLevelStats) { // get table level stats for (SlotReference slot : visibleOutputSlots) { ColumnStatistic cache = getColumnStatsFromTableCache((CatalogRelation) olapScan, slot); From 258ec3db385ae82c5b37e27f7c19e993092fb242 Mon Sep 17 00:00:00 2001 From: hui lai <1353307710@qq.com> Date: Mon, 9 Sep 2024 23:47:39 +0800 Subject: [PATCH 10/44] [fix](cloud) should do check before abort transaction (#40463) When routine load task transaction is abort, it should do check before abort transaction, otherwise, it may cause concurrent modifications to the `routineLoadTaskInfoList`, which in the Java language may result in elements in the list being null, leading to a loop throwing NullPointerException during scheduling and making it impossible to schedule routine load task to consume Kafka stream. 
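For reference, a minimal Java sketch of the ordering this patch enforces: the abort path asks the owning job to validate the abort under the job's own synchronization before any transaction state or task bookkeeping is touched. The class and method names below are hypothetical stand-ins (only `beforeAborted` corresponds to the callback used in the diff that follows), so treat this as an illustration of the pattern rather than the actual FE code.

```java
import java.util.ArrayList;
import java.util.List;

// Hypothetical stand-in for a routine load job; all mutation and the pre-abort
// check go through the same object lock, so they cannot race with the scheduler.
class RoutineLoadJobSketch {
    private final List<String> taskInfoList = new ArrayList<>();

    synchronized void addTask(long txnId) {
        taskInfoList.add("txn-" + txnId);
    }

    synchronized void removeTask(long txnId) {
        taskInfoList.remove("txn-" + txnId);
    }

    // Plays the role of TxnStateChangeCallback.beforeAborted(): reject the abort
    // up front if the transaction no longer belongs to this job.
    synchronized void beforeAborted(long txnId) {
        if (!taskInfoList.contains("txn-" + txnId)) {
            throw new IllegalStateException("txn " + txnId + " does not belong to this job");
        }
    }
}

public class AbortPathSketch {
    // Rough shape of the patched abort path: check with the job first, then send
    // the abort request and update bookkeeping.
    static void abortTransaction(RoutineLoadJobSketch job, long txnId) {
        job.beforeAborted(txnId); // throws before any shared state is modified
        // ... send AbortTxnRequest to the meta service here ...
        job.removeTask(txnId);
    }

    public static void main(String[] args) {
        RoutineLoadJobSketch job = new RoutineLoadJobSketch();
        job.addTask(42L);
        abortTransaction(job, 42L);
        System.out.println("txn 42 aborted after passing the pre-abort check");
    }
}
```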
--- .../transaction/CloudGlobalTransactionMgr.java | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/transaction/CloudGlobalTransactionMgr.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/transaction/CloudGlobalTransactionMgr.java index f51454ad269c51..f224d2929a65c1 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/cloud/transaction/CloudGlobalTransactionMgr.java +++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/transaction/CloudGlobalTransactionMgr.java @@ -990,6 +990,20 @@ public void abortTransaction(Long dbId, Long transactionId, String reason, TxnCommitAttachment txnCommitAttachment, List tableList) throws UserException { LOG.info("try to abort transaction, dbId:{}, transactionId:{}", dbId, transactionId); + if (txnCommitAttachment != null) { + if (txnCommitAttachment instanceof RLTaskTxnCommitAttachment) { + RLTaskTxnCommitAttachment rlTaskTxnCommitAttachment = (RLTaskTxnCommitAttachment) txnCommitAttachment; + TxnStateChangeCallback cb = callbackFactory.getCallback(rlTaskTxnCommitAttachment.getJobId()); + if (cb != null) { + // use a temporary transaction state to do before commit check, + // what actually works is the transactionId + TransactionState tmpTxnState = new TransactionState(); + tmpTxnState.setTransactionId(transactionId); + cb.beforeAborted(tmpTxnState); + } + } + } + AbortTxnRequest.Builder builder = AbortTxnRequest.newBuilder(); builder.setDbId(dbId); builder.setTxnId(transactionId); From f91463c2d308688743fd314892b326244e1e1e4e Mon Sep 17 00:00:00 2001 From: Xin Liao Date: Tue, 10 Sep 2024 10:44:17 +0800 Subject: [PATCH 11/44] [Fix](regression-test) fix error url check in test_etl_failed case for cloud p0 (#40298) --- .../suites/load_p0/broker_load/test_etl_failed.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regression-test/suites/load_p0/broker_load/test_etl_failed.groovy b/regression-test/suites/load_p0/broker_load/test_etl_failed.groovy index 4049fdadb1f1f8..70d2a42166dac0 100644 --- a/regression-test/suites/load_p0/broker_load/test_etl_failed.groovy +++ b/regression-test/suites/load_p0/broker_load/test_etl_failed.groovy @@ -67,7 +67,7 @@ suite("test_etl_failed", "load_p0") { assertTrue(1 == 2, "etl should be failed") break; } - if (result[0][2].equals("CANCELLED") && result[0][13].contains("_load_error_log")) { + if (result[0][2].equals("CANCELLED") && result[0][13].contains("error_log")) { break; } Thread.sleep(1000) From 5f1fe8143d18a8dc676aa58c9cc739c2c5a0e291 Mon Sep 17 00:00:00 2001 From: zhiqiang Date: Tue, 10 Sep 2024 11:00:56 +0800 Subject: [PATCH 12/44] [fix](scanner) Fix deadlock when scanner submit failed (#40495) We have dead lock when submit scanner to scheduler failed. 
pstack looks like ```txt Thread 2012 (Thread 0x7f87363fb700 (LWP 4179707) "Pipe_normal [wo"): #0 0x00007f8b8f3dc82d in __lll_lock_wait () from /lib64/libpthread.so.0 #1 0x00007f8b8f3d5ad9 in pthread_mutex_lock () from /lib64/libpthread.so.0 #2 0x000055b20f333e7a in __gthread_mutex_lock (__mutex=0x7f8733d960a8) at /mnt/disk1/hezhiqiang/toolchains/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/x86_64-linux-gnu/c++/11/bits/gthr-default .h:749 #3 std::mutex::lock (this=0x7f8733d960a8) at /mnt/disk1/hezhiqiang/toolchains/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/std_mutex.h:100 #4 std::lock_guard::lock_guard (__m=..., this=) at /mnt/disk1/hezhiqiang/toolchains/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/std_mutex.h:229 #5 doris::vectorized::ScannerContext::append_block_to_queue (this=, scan_task=...) at /mnt/disk1/hezhiqiang/doris/be/src/vec/exec/scan/scanner_context.cpp:234 #6 0x000055b20f32c0f9 in doris::vectorized::ScannerScheduler::submit (this=, ctx=..., scan_task=...) at /mnt/disk1/hezhiqiang/doris/be/src/vec/exec/scan/scanner_scheduler.cpp:209 #7 0x000055b20f3338fc in doris::vectorized::ScannerContext::submit_scan_task (this=this@entry=0x7f8733d96010, scan_task=...) at /mnt/disk1/hezhiqiang/doris/be/src/vec/exec/scan/scanner_context.cpp:217 #8 0x000055b20f3346cd in doris::vectorized::ScannerContext::get_block_from_queue (this=0x7f8733d96010, state=, block=0x7f871f728de0, eos=0x7f871abce470, id=) at /mnt/disk1/hezhiqiang/doris/be/src/vec/exec/scan/scanner_context.cpp:290 #9 0x000055b214cb4f13 in doris::pipeline::ScanOperatorX::get_block (this=, state=0x7f872f0eb400, block=0x7f8b8f3dc82d <__lll_lock_wait+29>, eos=0x7f871abce470) at /mnt/disk1/hezhiqiang/doris/be/src/pipeline/exec/scan_operator.cpp:1292 #10 0x000055b2142b5772 in doris::pipeline::ScanOperatorX::get_block_after_projects (this=0x80, state=0x0, block=0x7f8b8f3dc82d <__lll_lock_wait+29>, eos=0x7f8733d960a8) at /mnt/disk1/hezhiqiang/doris/be/src/pipeline/exec/scan_operator.h:363 #11 0x000055b2142e7880 in doris::pipeline::StatefulOperatorX::get_block (this=0x7f871f9bee00, state=0x7f872f0eb400, block=0x7f8716d49060, eos=0x7f87363f4937) at /mnt/disk1/hezhiqiang/doris/be/src/pipeline/exec/operator.cpp:587 ``` Deallock happens with following ```cpp Status ScannerContext::get_block_from_queue { std::unique_lock l(_transfer_lock); ... if (scan_task->is_eos()) { ... } else { // resubmit current running scanner to read the next block submit_scan_task(scan_task); } } ScannerContext::submit_scan_task(std::shared_ptr scan_task) { _scanner_scheduler->submit(shared_from_this(), scan_task); } void ScannerScheduler::submit(std::shared_ptr ctx, std::shared_ptr scan_task) { ... if (auto ret = sumbit_task(); !ret) { scan_task->set_status(Status::InternalError( "Failed to submit scanner to scanner pool reason:" + std::string(ret.msg()) + "|type:" + std::to_string(type))); ctx->append_block_to_queue(scan_task); return; } } void ScannerContext::append_block_to_queue(std::shared_ptr scan_task) { ... std::lock_guard l(_transfer_lock); ... } ``` Since mutex in cpp is not re-enterable, so the scanner thread will deadlock with itself. This pr fix the problem by making `ScannerScheduler::submit` return a Status instead of doing append failed task to the ScannerContext. The caller itself will decide where resubmit the scanner or just abort the execution of the query. 
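To show the resulting shape in isolation, here is a minimal, self-contained C++ sketch (assumed names, not the real `ScannerContext`/`ScannerScheduler`): the transfer lock is taken exactly once in the caller, and the submit path only reports failure through a status value, so nothing ever re-locks a mutex the current thread already holds.

```cpp
#include <iostream>
#include <mutex>
#include <string>

// std::mutex is not re-entrant: the old pattern, where a failed submit called back
// into a method that re-locked the mutex the caller already held, hangs the
// scanner thread on itself. The sketch below follows the patched pattern instead.
struct Status {
    bool ok = true;
    std::string msg;
};

struct ScannerContextSketch {
    std::mutex transfer_lock;

    // Submit never touches transfer_lock; it only reports success or failure.
    Status submit_scan_task(bool pool_full) {
        if (pool_full) {
            return {false, "TooManyTasks: failed to submit scanner"};
        }
        return {};
    }

    Status get_block_from_queue(bool pool_full) {
        std::lock_guard<std::mutex> l(transfer_lock); // the only place the lock is taken
        Status st = submit_scan_task(pool_full);
        if (!st.ok) {
            // mark the scanner done / propagate the error while still holding the lock
            return st;
        }
        return {};
    }
};

int main() {
    ScannerContextSketch ctx;
    Status st = ctx.get_block_from_queue(/*pool_full=*/true);
    std::cout << (st.ok ? "ok" : st.msg) << '\n';
    return 0;
}
```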
--- be/src/vec/exec/scan/scanner_context.cpp | 44 ++++++++++++++++------ be/src/vec/exec/scan/scanner_context.h | 4 +- be/src/vec/exec/scan/scanner_scheduler.cpp | 31 ++++++++------- be/src/vec/exec/scan/scanner_scheduler.h | 2 +- 4 files changed, 53 insertions(+), 28 deletions(-) diff --git a/be/src/vec/exec/scan/scanner_context.cpp b/be/src/vec/exec/scan/scanner_context.cpp index 5cc20c214c103b..cbb3d0f572365b 100644 --- a/be/src/vec/exec/scan/scanner_context.cpp +++ b/be/src/vec/exec/scan/scanner_context.cpp @@ -152,7 +152,7 @@ Status ScannerContext::init() { for (int i = 0; i < _max_thread_num; ++i) { std::weak_ptr next_scanner; if (_scanners.try_dequeue(next_scanner)) { - submit_scan_task(std::make_shared(next_scanner)); + RETURN_IF_ERROR(submit_scan_task(std::make_shared(next_scanner))); _num_running_scanners++; } } @@ -196,10 +196,10 @@ bool ScannerContext::empty_in_queue(int id) { return _blocks_queue.empty(); } -void ScannerContext::submit_scan_task(std::shared_ptr scan_task) { +Status ScannerContext::submit_scan_task(std::shared_ptr scan_task) { _scanner_sched_counter->update(1); _num_scheduled_scanners++; - _scanner_scheduler->submit(shared_from_this(), scan_task); + return _scanner_scheduler->submit(shared_from_this(), scan_task); } void ScannerContext::append_block_to_queue(std::shared_ptr scan_task) { @@ -247,10 +247,15 @@ Status ScannerContext::get_block_from_queue(RuntimeState* state, vectorized::Blo auto scan_task = _blocks_queue.front(); DCHECK(scan_task); + // The abnormal status of scanner may come from the execution of the scanner itself, + // or come from the scanner scheduler, such as TooManyTasks. if (!scan_task->status_ok()) { + // TODO: If the scanner status is TooManyTasks, maybe we can retry the scanner after a while. + _process_status = scan_task->get_status(); _set_scanner_done(); - return scan_task->get_status(); + return _process_status; } + if (!scan_task->cached_blocks.empty()) { auto [current_block, block_size] = std::move(scan_task->cached_blocks.front()); scan_task->cached_blocks.pop_front(); @@ -263,13 +268,20 @@ Status ScannerContext::get_block_from_queue(RuntimeState* state, vectorized::Blo block->swap(*current_block); return_free_block(std::move(current_block)); } else { + // This scan task do not have any cached blocks. 
_blocks_queue.pop_front(); - if (scan_task->is_eos()) { // current scanner is finished, and no more data to read + // current scanner is finished, and no more data to read + if (scan_task->is_eos()) { _num_finished_scanners++; std::weak_ptr next_scanner; // submit one of the remaining scanners if (_scanners.try_dequeue(next_scanner)) { - submit_scan_task(std::make_shared(next_scanner)); + auto submit_status = submit_scan_task(std::make_shared(next_scanner)); + if (!submit_status.ok()) { + _process_status = submit_status; + _set_scanner_done(); + return _process_status; + } } else { // no more scanner to be scheduled // `_free_blocks` serve all running scanners, maybe it's too large for the remaining scanners @@ -284,11 +296,16 @@ Status ScannerContext::get_block_from_queue(RuntimeState* state, vectorized::Blo } } else { // resubmit current running scanner to read the next block - submit_scan_task(scan_task); + Status submit_status = submit_scan_task(scan_task); + if (!submit_status.ok()) { + _process_status = submit_status; + _set_scanner_done(); + return _process_status; + } } } // scale up - _try_to_scale_up(); + RETURN_IF_ERROR(_try_to_scale_up()); } if (_num_finished_scanners == _all_scanners.size() && _blocks_queue.empty()) { @@ -303,7 +320,7 @@ Status ScannerContext::get_block_from_queue(RuntimeState* state, vectorized::Blo return Status::OK(); } -void ScannerContext::_try_to_scale_up() { +Status ScannerContext::_try_to_scale_up() { // Four criteria to determine whether to increase the parallelism of the scanners // 1. It ran for at least `SCALE_UP_DURATION` ms after last scale up // 2. Half(`WAIT_BLOCK_DURATION_RATIO`) of the duration is waiting to get blocks @@ -320,7 +337,7 @@ void ScannerContext::_try_to_scale_up() { // when _last_wait_duration_ratio > 0, it has scaled up before. // we need to determine if the scale-up is effective: // the wait duration ratio after last scaling up should less than 80% of `_last_wait_duration_ratio` - return; + return Status::OK(); } bool is_scale_up = false; @@ -335,7 +352,10 @@ void ScannerContext::_try_to_scale_up() { // get enough memory to launch one more scanner. std::weak_ptr scale_up_scanner; if (_scanners.try_dequeue(scale_up_scanner)) { - submit_scan_task(std::make_shared(scale_up_scanner)); + // Just return error to caller. + // Because _try_to_scale_up is called under _transfer_lock locked, if we add the scanner + // to the block queue, we will get a deadlock. 
+ RETURN_IF_ERROR(submit_scan_task(std::make_shared(scale_up_scanner))); _num_running_scanners++; _scale_up_scanners_counter->update(1); is_scale_up = true; @@ -350,6 +370,8 @@ void ScannerContext::_try_to_scale_up() { _total_wait_block_time = 0; } } + + return Status::OK(); } Status ScannerContext::validate_block_schema(Block* block) { diff --git a/be/src/vec/exec/scan/scanner_context.h b/be/src/vec/exec/scan/scanner_context.h index f93d01eef88427..03c4e5a4f1bba7 100644 --- a/be/src/vec/exec/scan/scanner_context.h +++ b/be/src/vec/exec/scan/scanner_context.h @@ -139,7 +139,7 @@ class ScannerContext : public std::enable_shared_from_this, // set the next scanned block to `ScanTask::current_block` // set the error state to `ScanTask::status` // set the `eos` to `ScanTask::eos` if there is no more data in current scanner - void submit_scan_task(std::shared_ptr scan_task); + Status submit_scan_task(std::shared_ptr scan_task); // append the running scanner and its cached block to `_blocks_queue` void append_block_to_queue(std::shared_ptr scan_task); @@ -186,7 +186,7 @@ class ScannerContext : public std::enable_shared_from_this, /// 3. `_free_blocks_memory_usage` < `_max_bytes_in_queue`, remains enough memory to scale up /// 4. At most scale up `MAX_SCALE_UP_RATIO` times to `_max_thread_num` void _set_scanner_done(); - void _try_to_scale_up(); + Status _try_to_scale_up(); RuntimeState* _state = nullptr; pipeline::ScanLocalStateBase* _local_state = nullptr; diff --git a/be/src/vec/exec/scan/scanner_scheduler.cpp b/be/src/vec/exec/scan/scanner_scheduler.cpp index e30983932ee244..444ff4dbb0cd9f 100644 --- a/be/src/vec/exec/scan/scanner_scheduler.cpp +++ b/be/src/vec/exec/scan/scanner_scheduler.cpp @@ -120,23 +120,23 @@ Status ScannerScheduler::init(ExecEnv* env) { return Status::OK(); } -void ScannerScheduler::submit(std::shared_ptr ctx, - std::shared_ptr scan_task) { +Status ScannerScheduler::submit(std::shared_ptr ctx, + std::shared_ptr scan_task) { scan_task->last_submit_time = GetCurrentTimeNanos(); if (ctx->done()) { - return; + return Status::OK(); } auto task_lock = ctx->task_exec_ctx(); if (task_lock == nullptr) { LOG(INFO) << "could not lock task execution context, query " << ctx->debug_string() << " maybe finished"; - return; + return Status::OK(); } if (ctx->thread_token != nullptr) { std::shared_ptr scanner_delegate = scan_task->scanner.lock(); if (scanner_delegate == nullptr) { - return; + return Status::OK(); } scanner_delegate->_scanner->start_wait_worker_timer(); @@ -153,13 +153,12 @@ void ScannerScheduler::submit(std::shared_ptr ctx, }); if (!s.ok()) { scan_task->set_status(s); - ctx->append_block_to_queue(scan_task); - return; + return s; } } else { std::shared_ptr scanner_delegate = scan_task->scanner.lock(); if (scanner_delegate == nullptr) { - return; + return Status::OK(); } scanner_delegate->_scanner->start_wait_worker_timer(); @@ -187,14 +186,18 @@ void ScannerScheduler::submit(std::shared_ptr ctx, return scan_sched->submit_scan_task(simple_scan_task); }; - if (auto ret = sumbit_task(); !ret) { - scan_task->set_status(Status::InternalError( - "Failed to submit scanner to scanner pool reason:" + std::string(ret.msg()) + - "|type:" + std::to_string(type))); - ctx->append_block_to_queue(scan_task); - return; + Status submit_status = sumbit_task(); + if (!submit_status.ok()) { + // User will see TooManyTasks error. It looks like a more reasonable error. 
+ Status scan_task_status = Status::TooManyTasks( + "Failed to submit scanner to scanner pool reason:" + + std::string(submit_status.msg()) + "|type:" + std::to_string(type)); + scan_task->set_status(scan_task_status); + return scan_task_status; } } + + return Status::OK(); } std::unique_ptr ScannerScheduler::new_limited_scan_pool_token( diff --git a/be/src/vec/exec/scan/scanner_scheduler.h b/be/src/vec/exec/scan/scanner_scheduler.h index ddc61396e23f15..439291f2107185 100644 --- a/be/src/vec/exec/scan/scanner_scheduler.h +++ b/be/src/vec/exec/scan/scanner_scheduler.h @@ -57,7 +57,7 @@ class ScannerScheduler { [[nodiscard]] Status init(ExecEnv* env); - void submit(std::shared_ptr ctx, std::shared_ptr scan_task); + Status submit(std::shared_ptr ctx, std::shared_ptr scan_task); void stop(); From 140008fd0e6ff47e2613d5d0bc69fb33d1ce856f Mon Sep 17 00:00:00 2001 From: Xinyi Zou Date: Tue, 10 Sep 2024 11:03:25 +0800 Subject: [PATCH 13/44] [opt](memory) Refactor memory maintenance thread (retry) (#40551) step 1. Refresh process memory metrics. step 2. Refresh allocator memory metrics. step 3. Update and print memory stat when the memory changes by 256M. step 4. Asyn Refresh cache capacity step 5. Cancel top memory task when process memory exceed hard limit. step 6. Refresh weighted memory ratio of workload groups. step 7. Analyze blocking queries. step 8. Flush memtable. step 9. Jemalloc purge all arena dirty pages. `memory_maintenance_thread` execute once cost: - 3ms (cluster idle) - 20ms (cluster high concurrency, CPU full) `memory_maintenance_thread` CPU usage: - 10%-20% (default memory_maintenance_sleep_time_ms=20ms) - 20%-30% (memory_maintenance_sleep_time_ms=10ms) - 30%+ (memory_maintenance_sleep_time_ms=5ms) --- be/src/common/config.cpp | 14 +- be/src/common/config.h | 9 +- be/src/common/daemon.cpp | 195 +++++++++++++----- be/src/common/daemon.h | 2 +- be/src/olap/lru_cache.cpp | 81 +++++++- be/src/olap/lru_cache.h | 28 +-- be/src/runtime/memory/cache_manager.cpp | 21 +- be/src/runtime/memory/cache_manager.h | 3 + be/src/runtime/memory/cache_policy.cpp | 8 +- be/src/runtime/memory/cache_policy.h | 28 ++- .../memory/global_memory_arbitrator.cpp | 7 + .../runtime/memory/global_memory_arbitrator.h | 17 ++ be/src/runtime/memory/lru_cache_policy.h | 95 ++++++--- be/src/runtime/memory/mem_tracker_limiter.cpp | 6 +- be/src/runtime/memory/memory_reclamation.cpp | 12 -- be/src/service/point_query_executor.cpp | 6 + be/src/service/point_query_executor.h | 5 +- be/src/vec/common/allocator.cpp | 3 - be/test/olap/lru_cache_test.cpp | 144 +++++++++++-- 19 files changed, 521 insertions(+), 163 deletions(-) diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index 00f8a042cbcbb7..0c00bd1a38f0da 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -95,6 +95,9 @@ DEFINE_String(mem_limit, "90%"); // Soft memory limit as a fraction of hard memory limit. DEFINE_Double(soft_mem_limit_frac, "0.9"); +// Cache capacity reduce mem limit as a fraction of soft mem limit. +DEFINE_mDouble(cache_capacity_reduce_mem_limit_frac, "0.6"); + // Schema change memory limit as a fraction of soft memory limit. 
DEFINE_Double(schema_change_mem_limit_frac, "0.6"); @@ -286,7 +289,7 @@ DEFINE_mInt32(exchg_buffer_queue_capacity_factor, "64"); DEFINE_mInt64(memory_limitation_per_thread_for_schema_change_bytes, "2147483648"); DEFINE_mInt32(cache_prune_interval_sec, "10"); -DEFINE_mInt32(cache_periodic_prune_stale_sweep_sec, "300"); +DEFINE_mInt32(cache_periodic_prune_stale_sweep_sec, "60"); // the clean interval of tablet lookup cache DEFINE_mInt32(tablet_lookup_cache_stale_sweep_time_sec, "30"); DEFINE_mInt32(point_query_row_cache_stale_sweep_time_sec, "300"); @@ -565,7 +568,7 @@ DEFINE_String(pprof_profile_dir, "${DORIS_HOME}/log"); // for jeprofile in jemalloc DEFINE_mString(jeprofile_dir, "${DORIS_HOME}/log"); DEFINE_mBool(enable_je_purge_dirty_pages, "true"); -DEFINE_mString(je_dirty_pages_mem_limit_percent, "5%"); +DEFINE_mString(je_dirty_pages_mem_limit_percent, "2%"); // to forward compatibility, will be removed later DEFINE_mBool(enable_token_check, "true"); @@ -582,17 +585,12 @@ DEFINE_Int32(num_cores, "0"); DEFINE_Bool(ignore_broken_disk, "false"); // Sleep time in milliseconds between memory maintenance iterations -DEFINE_mInt32(memory_maintenance_sleep_time_ms, "100"); +DEFINE_mInt32(memory_maintenance_sleep_time_ms, "20"); // After full gc, no longer full gc and minor gc during sleep. // After minor gc, no minor gc during sleep, but full gc is possible. DEFINE_mInt32(memory_gc_sleep_time_ms, "500"); -// Sleep time in milliseconds between memtbale flush mgr refresh iterations -DEFINE_mInt64(memtable_mem_tracker_refresh_interval_ms, "5"); - -DEFINE_mInt64(wg_weighted_memory_ratio_refresh_interval_ms, "50"); - // percent of (active memtables size / all memtables size) when reach hard limit DEFINE_mInt32(memtable_hard_limit_active_percent, "50"); diff --git a/be/src/common/config.h b/be/src/common/config.h index bd2aa4f51be1a9..720f4f72cb4bf7 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -133,6 +133,9 @@ DECLARE_String(mem_limit); // Soft memory limit as a fraction of hard memory limit. DECLARE_Double(soft_mem_limit_frac); +// Cache capacity reduce mem limit as a fraction of soft mem limit. +DECLARE_mDouble(cache_capacity_reduce_mem_limit_frac); + // Schema change memory limit as a fraction of soft memory limit. DECLARE_Double(schema_change_mem_limit_frac); @@ -641,12 +644,6 @@ DECLARE_mInt32(memory_maintenance_sleep_time_ms); // After minor gc, no minor gc during sleep, but full gc is possible. 
DECLARE_mInt32(memory_gc_sleep_time_ms); -// Sleep time in milliseconds between memtbale flush mgr memory refresh iterations -DECLARE_mInt64(memtable_mem_tracker_refresh_interval_ms); - -// Sleep time in milliseconds between refresh iterations of workload group weighted memory ratio -DECLARE_mInt64(wg_weighted_memory_ratio_refresh_interval_ms); - // percent of (active memtables size / all memtables size) when reach hard limit DECLARE_mInt32(memtable_hard_limit_active_percent); diff --git a/be/src/common/daemon.cpp b/be/src/common/daemon.cpp index d8245f4045ce81..713813b4a334f9 100644 --- a/be/src/common/daemon.cpp +++ b/be/src/common/daemon.cpp @@ -73,6 +73,12 @@ namespace doris { namespace { +int64_t last_print_proc_mem = 0; +int32_t refresh_cache_capacity_sleep_time_ms = 0; +#ifdef USE_JEMALLOC +int32_t je_purge_dirty_pages_sleep_time_ms = 0; +#endif + void update_rowsets_and_segments_num_metrics() { if (config::is_cloud_mode()) { // TODO(plat1ko): CloudStorageEngine @@ -204,42 +210,104 @@ void Daemon::tcmalloc_gc_thread() { #endif } -void Daemon::memory_maintenance_thread() { - int32_t interval_milliseconds = config::memory_maintenance_sleep_time_ms; - int64_t last_print_proc_mem = PerfCounters::get_vm_rss(); - while (!_stop_background_threads_latch.wait_for( - std::chrono::milliseconds(interval_milliseconds))) { - // Refresh process memory metrics. - doris::PerfCounters::refresh_proc_status(); - doris::MemInfo::refresh_proc_meminfo(); - doris::GlobalMemoryArbitrator::reset_refresh_interval_memory_growth(); - ExecEnv::GetInstance()->brpc_iobuf_block_memory_tracker()->set_consumption( - butil::IOBuf::block_memory()); - // Refresh allocator memory metrics. +void refresh_process_memory_metrics() { + doris::PerfCounters::refresh_proc_status(); + doris::MemInfo::refresh_proc_meminfo(); + doris::GlobalMemoryArbitrator::reset_refresh_interval_memory_growth(); + ExecEnv::GetInstance()->brpc_iobuf_block_memory_tracker()->set_consumption( + butil::IOBuf::block_memory()); +} + +void refresh_common_allocator_metrics() { #if !defined(ADDRESS_SANITIZER) && !defined(LEAK_SANITIZER) && !defined(THREAD_SANITIZER) - doris::MemInfo::refresh_allocator_mem(); -#ifdef USE_JEMALLOC - if (doris::MemInfo::je_dirty_pages_mem() > doris::MemInfo::je_dirty_pages_mem_limit() && - GlobalMemoryArbitrator::is_exceed_soft_mem_limit()) { - doris::MemInfo::notify_je_purge_dirty_pages(); - } + doris::MemInfo::refresh_allocator_mem(); + if (config::enable_system_metrics) { + DorisMetrics::instance()->system_metrics()->update_allocator_metrics(); + } #endif - if (config::enable_system_metrics) { - DorisMetrics::instance()->system_metrics()->update_allocator_metrics(); + MemInfo::refresh_memory_bvar(); +} + +void refresh_memory_state_after_memory_change() { + if (abs(last_print_proc_mem - PerfCounters::get_vm_rss()) > 268435456) { + last_print_proc_mem = PerfCounters::get_vm_rss(); + doris::MemTrackerLimiter::clean_tracker_limiter_group(); + doris::MemTrackerLimiter::enable_print_log_process_usage(); + // Refresh mem tracker each type counter. 
+ doris::MemTrackerLimiter::refresh_global_counter(); + LOG(INFO) << doris::GlobalMemoryArbitrator:: + process_mem_log_str(); // print mem log when memory state by 256M + } +} + +void refresh_cache_capacity() { + if (refresh_cache_capacity_sleep_time_ms <= 0) { + auto cache_capacity_reduce_mem_limit = uint64_t( + doris::MemInfo::soft_mem_limit() * config::cache_capacity_reduce_mem_limit_frac); + int64_t process_memory_usage = doris::GlobalMemoryArbitrator::process_memory_usage(); + double new_cache_capacity_adjust_weighted = + process_memory_usage <= cache_capacity_reduce_mem_limit + ? 1 + : std::min( + 1 - (process_memory_usage - cache_capacity_reduce_mem_limit) / + (doris::MemInfo::soft_mem_limit() - + cache_capacity_reduce_mem_limit), + 0); + if (new_cache_capacity_adjust_weighted != + doris::GlobalMemoryArbitrator::last_cache_capacity_adjust_weighted) { + doris::GlobalMemoryArbitrator::last_cache_capacity_adjust_weighted = + new_cache_capacity_adjust_weighted; + doris::GlobalMemoryArbitrator::notify_cache_adjust_capacity(); + refresh_cache_capacity_sleep_time_ms = config::memory_gc_sleep_time_ms; } + } + refresh_cache_capacity_sleep_time_ms -= config::memory_maintenance_sleep_time_ms; +} + +void je_purge_dirty_pages() { +#ifdef USE_JEMALLOC + if (je_purge_dirty_pages_sleep_time_ms <= 0 && + doris::MemInfo::je_dirty_pages_mem() > doris::MemInfo::je_dirty_pages_mem_limit() && + GlobalMemoryArbitrator::is_exceed_soft_mem_limit()) { + doris::MemInfo::notify_je_purge_dirty_pages(); + je_purge_dirty_pages_sleep_time_ms = config::memory_gc_sleep_time_ms; + } + je_purge_dirty_pages_sleep_time_ms -= config::memory_maintenance_sleep_time_ms; #endif - MemInfo::refresh_memory_bvar(); - - // Update and print memory stat when the memory changes by 256M. - if (abs(last_print_proc_mem - PerfCounters::get_vm_rss()) > 268435456) { - last_print_proc_mem = PerfCounters::get_vm_rss(); - doris::MemTrackerLimiter::clean_tracker_limiter_group(); - doris::MemTrackerLimiter::enable_print_log_process_usage(); - // Refresh mem tracker each type counter. - doris::MemTrackerLimiter::refresh_global_counter(); - LOG(INFO) << doris::GlobalMemoryArbitrator:: - process_mem_log_str(); // print mem log when memory state by 256M - } +} + +void Daemon::memory_maintenance_thread() { + while (!_stop_background_threads_latch.wait_for( + std::chrono::milliseconds(config::memory_maintenance_sleep_time_ms))) { + // step 1. Refresh process memory metrics. + refresh_process_memory_metrics(); + + // step 2. Refresh jemalloc/tcmalloc metrics. + refresh_common_allocator_metrics(); + + // step 3. Update and print memory stat when the memory changes by 256M. + refresh_memory_state_after_memory_change(); + + // step 4. Asyn Refresh cache capacity + // TODO adjust cache capacity based on smoothstep (smooth gradient). + refresh_cache_capacity(); + + // step 5. Cancel top memory task when process memory exceed hard limit. + // TODO replace memory_gc_thread. + + // step 6. Refresh weighted memory ratio of workload groups. + doris::ExecEnv::GetInstance()->workload_group_mgr()->refresh_wg_weighted_memory_limit(); + + // step 7. Analyze blocking queries. + // TODO sort the operators that can spill, wake up the pipeline task spill + // or continue execution according to certain rules or cancel query. + + // step 8. Flush memtable + doris::GlobalMemoryArbitrator::notify_memtable_memory_refresh(); + // TODO notify flush memtable + + // step 9. 
Jemalloc purge all arena dirty pages + je_purge_dirty_pages(); } } @@ -301,10 +369,21 @@ void Daemon::memory_gc_thread() { void Daemon::memtable_memory_refresh_thread() { // Refresh the memory statistics of the load channel tracker more frequently, // which helps to accurately control the memory of LoadChannelMgr. - while (!_stop_background_threads_latch.wait_for( - std::chrono::milliseconds(config::memtable_mem_tracker_refresh_interval_ms))) { + do { + std::unique_lock l(doris::GlobalMemoryArbitrator::memtable_memory_refresh_lock); + while (_stop_background_threads_latch.count() != 0 && + !doris::GlobalMemoryArbitrator::memtable_memory_refresh_notify.load( + std::memory_order_relaxed)) { + doris::GlobalMemoryArbitrator::memtable_memory_refresh_cv.wait_for( + l, std::chrono::seconds(1)); + } + if (_stop_background_threads_latch.count() == 0) { + break; + } doris::ExecEnv::GetInstance()->memtable_memory_limiter()->refresh_mem_tracker(); - } + doris::GlobalMemoryArbitrator::memtable_memory_refresh_notify.store( + false, std::memory_order_relaxed); + } while (true); } /* @@ -396,6 +475,35 @@ void Daemon::je_purge_dirty_pages_thread() const { } while (true); } +void Daemon::cache_adjust_capacity_thread() { + do { + std::unique_lock l(doris::GlobalMemoryArbitrator::cache_adjust_capacity_lock); + while (_stop_background_threads_latch.count() != 0 && + !doris::GlobalMemoryArbitrator::cache_adjust_capacity_notify.load( + std::memory_order_relaxed)) { + doris::GlobalMemoryArbitrator::cache_adjust_capacity_cv.wait_for( + l, std::chrono::seconds(1)); + } + double adjust_weighted = GlobalMemoryArbitrator::last_cache_capacity_adjust_weighted; + if (_stop_background_threads_latch.count() == 0) { + break; + } + if (config::disable_memory_gc) { + continue; + } + std::unique_ptr profile = std::make_unique(""); + auto freed_mem = CacheManager::instance()->for_each_cache_refresh_capacity(adjust_weighted, + profile.get()); + std::stringstream ss; + profile->pretty_print(&ss); + LOG(INFO) << fmt::format( + "[MemoryGC] refresh cache capacity end, free memory {}, details: {}", + PrettyPrinter::print(freed_mem, TUnit::BYTES), ss.str()); + doris::GlobalMemoryArbitrator::cache_adjust_capacity_notify.store( + false, std::memory_order_relaxed); + } while (true); +} + void Daemon::cache_prune_stale_thread() { int32_t interval = config::cache_periodic_prune_stale_sweep_sec; while (!_stop_background_threads_latch.wait_for(std::chrono::seconds(interval))) { @@ -411,14 +519,6 @@ void Daemon::cache_prune_stale_thread() { } } -void Daemon::wg_weighted_memory_ratio_refresh_thread() { - // Refresh weighted memory ratio of workload groups - while (!_stop_background_threads_latch.wait_for( - std::chrono::milliseconds(config::wg_weighted_memory_ratio_refresh_interval_ms))) { - doris::ExecEnv::GetInstance()->workload_group_mgr()->refresh_wg_weighted_memory_limit(); - } -} - void Daemon::be_proc_monitor_thread() { while (!_stop_background_threads_latch.wait_for( std::chrono::milliseconds(config::be_proc_monitor_interval_ms))) { @@ -455,6 +555,10 @@ void Daemon::start() { "Daemon", "je_purge_dirty_pages_thread", [this]() { this->je_purge_dirty_pages_thread(); }, &_threads.emplace_back()); CHECK(st.ok()) << st; + st = Thread::create( + "Daemon", "cache_adjust_capacity_thread", + [this]() { this->cache_adjust_capacity_thread(); }, &_threads.emplace_back()); + CHECK(st.ok()) << st; st = Thread::create( "Daemon", "cache_prune_stale_thread", [this]() { this->cache_prune_stale_thread(); }, &_threads.emplace_back()); @@ -464,11 +568,6 @@ 
void Daemon::start() { [this]() { this->report_runtime_query_statistics_thread(); }, &_threads.emplace_back()); CHECK(st.ok()) << st; - st = Thread::create( - "Daemon", "wg_weighted_memory_ratio_refresh_thread", - [this]() { this->wg_weighted_memory_ratio_refresh_thread(); }, - &_threads.emplace_back()); - if (config::enable_be_proc_monitor) { st = Thread::create( "Daemon", "be_proc_monitor_thread", [this]() { this->be_proc_monitor_thread(); }, diff --git a/be/src/common/daemon.h b/be/src/common/daemon.h index 64c9f0c8993ae3..fe723877dcd027 100644 --- a/be/src/common/daemon.h +++ b/be/src/common/daemon.h @@ -43,9 +43,9 @@ class Daemon { void memtable_memory_refresh_thread(); void calculate_metrics_thread(); void je_purge_dirty_pages_thread() const; + void cache_adjust_capacity_thread(); void cache_prune_stale_thread(); void report_runtime_query_statistics_thread(); - void wg_weighted_memory_ratio_refresh_thread(); void be_proc_monitor_thread(); CountDownLatch _stop_background_threads_latch; diff --git a/be/src/olap/lru_cache.cpp b/be/src/olap/lru_cache.cpp index 741c2423915ede..6e5bb2fa31578f 100644 --- a/be/src/olap/lru_cache.cpp +++ b/be/src/olap/lru_cache.cpp @@ -177,6 +177,51 @@ LRUCache::~LRUCache() { prune(); } +PrunedInfo LRUCache::set_capacity(size_t capacity) { + LRUHandle* last_ref_list = nullptr; + { + std::lock_guard l(_mutex); + _capacity = capacity; + _evict_from_lru(0, &last_ref_list); + } + + int64_t pruned_count = 0; + int64_t pruned_size = 0; + while (last_ref_list != nullptr) { + ++pruned_count; + pruned_size += last_ref_list->total_size; + LRUHandle* next = last_ref_list->next; + last_ref_list->free(); + last_ref_list = next; + } + return {pruned_count, pruned_size}; +} + +uint64_t LRUCache::get_lookup_count() { + std::lock_guard l(_mutex); + return _lookup_count; +} + +uint64_t LRUCache::get_hit_count() { + std::lock_guard l(_mutex); + return _hit_count; +} + +size_t LRUCache::get_usage() { + std::lock_guard l(_mutex); + return _usage; +} + +size_t LRUCache::get_capacity() { + std::lock_guard l(_mutex); + return _capacity; +} + +size_t LRUCache::get_element_count() { + std::lock_guard l(_mutex); + return _table.element_count(); +} + bool LRUCache::_unref(LRUHandle* e) { DCHECK(e->refs > 0); e->refs--; @@ -515,19 +560,19 @@ inline uint32_t ShardedLRUCache::_hash_slice(const CacheKey& s) { return s.hash(s.data(), s.size(), 0); } -ShardedLRUCache::ShardedLRUCache(const std::string& name, size_t total_capacity, LRUCacheType type, +ShardedLRUCache::ShardedLRUCache(const std::string& name, size_t capacity, LRUCacheType type, uint32_t num_shards, uint32_t total_element_count_capacity) : _name(name), _num_shard_bits(Bits::FindLSBSetNonZero(num_shards)), _num_shards(num_shards), _shards(nullptr), _last_id(1), - _total_capacity(total_capacity) { + _capacity(capacity) { CHECK(num_shards > 0) << "num_shards cannot be 0"; CHECK_EQ((num_shards & (num_shards - 1)), 0) << "num_shards should be power of two, but got " << num_shards; - const size_t per_shard = (total_capacity + (_num_shards - 1)) / _num_shards; + const size_t per_shard = (capacity + (_num_shards - 1)) / _num_shards; const size_t per_shard_element_count_capacity = (total_element_count_capacity + (_num_shards - 1)) / _num_shards; LRUCache** shards = new (std::nothrow) LRUCache*[_num_shards]; @@ -557,12 +602,12 @@ ShardedLRUCache::ShardedLRUCache(const std::string& name, size_t total_capacity, "doris_cache", _name + "_persecond", _lookup_count_bvar.get(), 60)); } -ShardedLRUCache::ShardedLRUCache(const std::string& name, 
size_t total_capacity, LRUCacheType type, +ShardedLRUCache::ShardedLRUCache(const std::string& name, size_t capacity, LRUCacheType type, uint32_t num_shards, CacheValueTimeExtractor cache_value_time_extractor, bool cache_value_check_timestamp, uint32_t total_element_count_capacity) - : ShardedLRUCache(name, total_capacity, type, num_shards, total_element_count_capacity) { + : ShardedLRUCache(name, capacity, type, num_shards, total_element_count_capacity) { for (int s = 0; s < _num_shards; s++) { _shards[s]->set_cache_value_time_extractor(cache_value_time_extractor); _shards[s]->set_cache_value_check_timestamp(cache_value_check_timestamp); @@ -580,6 +625,24 @@ ShardedLRUCache::~ShardedLRUCache() { } } +PrunedInfo ShardedLRUCache::set_capacity(size_t capacity) { + std::lock_guard l(_mutex); + PrunedInfo pruned_info; + const size_t per_shard = (capacity + (_num_shards - 1)) / _num_shards; + for (int s = 0; s < _num_shards; s++) { + PrunedInfo info = _shards[s]->set_capacity(per_shard); + pruned_info.pruned_count += info.pruned_count; + pruned_info.pruned_size += info.pruned_size; + } + _capacity = capacity; + return pruned_info; +} + +size_t ShardedLRUCache::get_capacity() { + std::lock_guard l(_mutex); + return _capacity; +} + Cache::Handle* ShardedLRUCache::insert(const CacheKey& key, void* value, size_t charge, CachePriority priority) { const uint32_t hash = _hash_slice(key); @@ -638,25 +701,25 @@ int64_t ShardedLRUCache::get_usage() { } void ShardedLRUCache::update_cache_metrics() const { - size_t total_capacity = 0; + size_t capacity = 0; size_t total_usage = 0; size_t total_lookup_count = 0; size_t total_hit_count = 0; size_t total_element_count = 0; for (int i = 0; i < _num_shards; i++) { - total_capacity += _shards[i]->get_capacity(); + capacity += _shards[i]->get_capacity(); total_usage += _shards[i]->get_usage(); total_lookup_count += _shards[i]->get_lookup_count(); total_hit_count += _shards[i]->get_hit_count(); total_element_count += _shards[i]->get_element_count(); } - cache_capacity->set_value(total_capacity); + cache_capacity->set_value(capacity); cache_usage->set_value(total_usage); cache_element_count->set_value(total_element_count); cache_lookup_count->set_value(total_lookup_count); cache_hit_count->set_value(total_hit_count); - cache_usage_ratio->set_value(total_capacity == 0 ? 0 : ((double)total_usage / total_capacity)); + cache_usage_ratio->set_value(capacity == 0 ? 0 : ((double)total_usage / capacity)); cache_hit_ratio->set_value( total_lookup_count == 0 ? 
0 : ((double)total_hit_count / total_lookup_count)); } diff --git a/be/src/olap/lru_cache.h b/be/src/olap/lru_cache.h index 059020deab58f5..de7084382d7398 100644 --- a/be/src/olap/lru_cache.h +++ b/be/src/olap/lru_cache.h @@ -227,7 +227,8 @@ class Cache { virtual int64_t get_usage() = 0; - virtual size_t get_total_capacity() = 0; + virtual PrunedInfo set_capacity(size_t capacity) = 0; + virtual size_t get_capacity() = 0; private: DISALLOW_COPY_AND_ASSIGN(Cache); @@ -327,7 +328,7 @@ class LRUCache { ~LRUCache(); // Separate from constructor so caller can easily make an array of LRUCache - void set_capacity(size_t capacity) { _capacity = capacity; } + PrunedInfo set_capacity(size_t capacity); void set_element_count_capacity(uint32_t element_count_capacity) { _element_count_capacity = element_count_capacity; } @@ -345,11 +346,11 @@ class LRUCache { void set_cache_value_time_extractor(CacheValueTimeExtractor cache_value_time_extractor); void set_cache_value_check_timestamp(bool cache_value_check_timestamp); - uint64_t get_lookup_count() const { return _lookup_count; } - uint64_t get_hit_count() const { return _hit_count; } - size_t get_usage() const { return _usage; } - size_t get_capacity() const { return _capacity; } - size_t get_element_count() const { return _table.element_count(); } + uint64_t get_lookup_count(); + uint64_t get_hit_count(); + size_t get_usage(); + size_t get_capacity(); + size_t get_element_count(); private: void _lru_remove(LRUHandle* e); @@ -403,15 +404,16 @@ class ShardedLRUCache : public Cache { PrunedInfo prune() override; PrunedInfo prune_if(CachePrunePredicate pred, bool lazy_mode = false) override; int64_t get_usage() override; - size_t get_total_capacity() override { return _total_capacity; }; + PrunedInfo set_capacity(size_t capacity) override; + size_t get_capacity() override; private: // LRUCache can only be created and managed with LRUCachePolicy. 
friend class LRUCachePolicy; - explicit ShardedLRUCache(const std::string& name, size_t total_capacity, LRUCacheType type, + explicit ShardedLRUCache(const std::string& name, size_t capacity, LRUCacheType type, uint32_t num_shards, uint32_t element_count_capacity); - explicit ShardedLRUCache(const std::string& name, size_t total_capacity, LRUCacheType type, + explicit ShardedLRUCache(const std::string& name, size_t capacity, LRUCacheType type, uint32_t num_shards, CacheValueTimeExtractor cache_value_time_extractor, bool cache_value_check_timestamp, uint32_t element_count_capacity); @@ -429,7 +431,8 @@ class ShardedLRUCache : public Cache { const uint32_t _num_shards; LRUCache** _shards = nullptr; std::atomic _last_id; - size_t _total_capacity; + std::mutex _mutex; + size_t _capacity {0}; std::shared_ptr _entity; IntGauge* cache_capacity = nullptr; @@ -462,7 +465,8 @@ class DummyLRUCache : public Cache { return {0, 0}; }; int64_t get_usage() override { return 0; }; - size_t get_total_capacity() override { return 0; }; + PrunedInfo set_capacity(size_t capacity) override { return {0, 0}; }; + size_t get_capacity() override { return 0; }; }; } // namespace doris diff --git a/be/src/runtime/memory/cache_manager.cpp b/be/src/runtime/memory/cache_manager.cpp index a6516c40a35770..ec57ffba50d318 100644 --- a/be/src/runtime/memory/cache_manager.cpp +++ b/be/src/runtime/memory/cache_manager.cpp @@ -59,11 +59,26 @@ int64_t CacheManager::for_each_cache_prune_all(RuntimeProfile* profile, bool for int64_t CacheManager::cache_prune_all(CachePolicy::CacheType type, bool force) { std::lock_guard l(_caches_lock); auto* cache_policy = _caches[type]; - if (!cache_policy->enable_prune()) { - return -1; - } cache_policy->prune_all(force); return cache_policy->profile()->get_counter("FreedMemory")->value(); } +int64_t CacheManager::for_each_cache_refresh_capacity(double adjust_weighted, + RuntimeProfile* profile) { + int64_t freed_size = 0; + std::lock_guard l(_caches_lock); + for (const auto& pair : _caches) { + auto* cache_policy = pair.second; + if (!cache_policy->enable_prune()) { + continue; + } + cache_policy->adjust_capacity_weighted(adjust_weighted); + freed_size += cache_policy->profile()->get_counter("FreedMemory")->value(); + if (cache_policy->profile()->get_counter("FreedMemory")->value() != 0 && profile) { + profile->add_child(cache_policy->profile(), true, nullptr); + } + } + return freed_size; +} + } // namespace doris diff --git a/be/src/runtime/memory/cache_manager.h b/be/src/runtime/memory/cache_manager.h index d94dca501670bf..a2a089b929dbdf 100644 --- a/be/src/runtime/memory/cache_manager.h +++ b/be/src/runtime/memory/cache_manager.h @@ -81,6 +81,9 @@ class CacheManager { return false; } + int64_t for_each_cache_refresh_capacity(double adjust_weighted, + RuntimeProfile* profile = nullptr); + private: std::mutex _caches_lock; std::unordered_map _caches; diff --git a/be/src/runtime/memory/cache_policy.cpp b/be/src/runtime/memory/cache_policy.cpp index 4e50d64d88eed1..46b9db1b35ad5f 100644 --- a/be/src/runtime/memory/cache_policy.cpp +++ b/be/src/runtime/memory/cache_policy.cpp @@ -21,8 +21,12 @@ namespace doris { -CachePolicy::CachePolicy(CacheType type, uint32_t stale_sweep_time_s, bool enable_prune) - : _type(type), _stale_sweep_time_s(stale_sweep_time_s), _enable_prune(enable_prune) { +CachePolicy::CachePolicy(CacheType type, size_t capacity, uint32_t stale_sweep_time_s, + bool enable_prune) + : _type(type), + _initial_capacity(capacity), + _stale_sweep_time_s(stale_sweep_time_s), + 
_enable_prune(enable_prune) { CacheManager::instance()->register_cache(this); init_profile(); } diff --git a/be/src/runtime/memory/cache_policy.h b/be/src/runtime/memory/cache_policy.h index c457afd86898f2..c43ca0b2fb7e0a 100644 --- a/be/src/runtime/memory/cache_policy.h +++ b/be/src/runtime/memory/cache_policy.h @@ -17,13 +17,12 @@ #pragma once -#include "runtime/exec_env.h" #include "util/runtime_profile.h" namespace doris { -static constexpr int32_t CACHE_MIN_FREE_SIZE = 67108864; // 64M -static constexpr int32_t CACHE_MIN_FREE_NUMBER = 1024; +static constexpr int32_t CACHE_MIN_PRUNE_SIZE = 67108864; // 64M +static constexpr int32_t CACHE_MIN_PRUNE_NUMBER = 1024; // Base of all caches. register to CacheManager when cache is constructed. class CachePolicy { @@ -42,12 +41,13 @@ class CachePolicy { TABLET_VERSION_CACHE = 10, LAST_SUCCESS_CHANNEL_CACHE = 11, COMMON_OBJ_LRU_CACHE = 12, - FOR_UT = 13, + FOR_UT_CACHE_SIZE = 13, TABLET_SCHEMA_CACHE = 14, CREATE_TABLET_RR_IDX_CACHE = 15, CLOUD_TABLET_CACHE = 16, CLOUD_TXN_DELETE_BITMAP_CACHE = 17, NONE = 18, // not be used + FOR_UT_CACHE_NUMBER = 19, }; static std::string type_string(CacheType type) { @@ -78,8 +78,8 @@ class CachePolicy { return "LastSuccessChannelCache"; case CacheType::COMMON_OBJ_LRU_CACHE: return "CommonObjLRUCache"; - case CacheType::FOR_UT: - return "ForUT"; + case CacheType::FOR_UT_CACHE_SIZE: + return "ForUTCacheSize"; case CacheType::TABLET_SCHEMA_CACHE: return "TabletSchemaCache"; case CacheType::CREATE_TABLET_RR_IDX_CACHE: @@ -88,6 +88,8 @@ class CachePolicy { return "CloudTabletCache"; case CacheType::CLOUD_TXN_DELETE_BITMAP_CACHE: return "CloudTxnDeleteBitmapCache"; + case CacheType::FOR_UT_CACHE_NUMBER: + return "ForUTCacheNumber"; default: LOG(FATAL) << "not match type of cache policy :" << static_cast(type); } @@ -109,11 +111,12 @@ class CachePolicy { {"MowTabletVersionCache", CacheType::TABLET_VERSION_CACHE}, {"LastSuccessChannelCache", CacheType::LAST_SUCCESS_CHANNEL_CACHE}, {"CommonObjLRUCache", CacheType::COMMON_OBJ_LRU_CACHE}, - {"ForUT", CacheType::FOR_UT}, + {"ForUTCacheSize", CacheType::FOR_UT_CACHE_SIZE}, {"TabletSchemaCache", CacheType::TABLET_SCHEMA_CACHE}, {"CreateTabletRRIdxCache", CacheType::CREATE_TABLET_RR_IDX_CACHE}, {"CloudTabletCache", CacheType::CLOUD_TABLET_CACHE}, - {"CloudTxnDeleteBitmapCache", CacheType::CLOUD_TXN_DELETE_BITMAP_CACHE}}; + {"CloudTxnDeleteBitmapCache", CacheType::CLOUD_TXN_DELETE_BITMAP_CACHE}, + {"ForUTCacheNumber", CacheType::FOR_UT_CACHE_NUMBER}}; static CacheType string_to_type(std::string type) { if (StringToType.contains(type)) { @@ -123,13 +126,16 @@ class CachePolicy { } } - CachePolicy(CacheType type, uint32_t stale_sweep_time_s, bool enable_prune); + CachePolicy(CacheType type, size_t capacity, uint32_t stale_sweep_time_s, bool enable_prune); virtual ~CachePolicy(); virtual void prune_stale() = 0; virtual void prune_all(bool force) = 0; + virtual int64_t adjust_capacity_weighted(double adjust_weighted) = 0; + virtual size_t get_capacity() = 0; CacheType type() { return _type; } + size_t initial_capacity() const { return _initial_capacity; } bool enable_prune() const { return _enable_prune; } RuntimeProfile* profile() { return _profile.get(); } @@ -139,16 +145,20 @@ class CachePolicy { std::make_unique(fmt::format("Cache type={}", type_string(_type))); _prune_stale_number_counter = ADD_COUNTER(_profile, "PruneStaleNumber", TUnit::UNIT); _prune_all_number_counter = ADD_COUNTER(_profile, "PruneAllNumber", TUnit::UNIT); + _adjust_capacity_weighted_number_counter = + 
ADD_COUNTER(_profile, "SetCapacityNumber", TUnit::UNIT); _freed_memory_counter = ADD_COUNTER(_profile, "FreedMemory", TUnit::BYTES); _freed_entrys_counter = ADD_COUNTER(_profile, "FreedEntrys", TUnit::UNIT); _cost_timer = ADD_TIMER(_profile, "CostTime"); } CacheType _type; + size_t _initial_capacity {0}; std::unique_ptr _profile; RuntimeProfile::Counter* _prune_stale_number_counter = nullptr; RuntimeProfile::Counter* _prune_all_number_counter = nullptr; + RuntimeProfile::Counter* _adjust_capacity_weighted_number_counter = nullptr; // Reset before each gc RuntimeProfile::Counter* _freed_memory_counter = nullptr; RuntimeProfile::Counter* _freed_entrys_counter = nullptr; diff --git a/be/src/runtime/memory/global_memory_arbitrator.cpp b/be/src/runtime/memory/global_memory_arbitrator.cpp index 344bcbc59846d9..76a414a6ebdc74 100644 --- a/be/src/runtime/memory/global_memory_arbitrator.cpp +++ b/be/src/runtime/memory/global_memory_arbitrator.cpp @@ -38,6 +38,13 @@ bvar::PassiveStatus g_sys_mem_avail( std::atomic GlobalMemoryArbitrator::_s_process_reserved_memory = 0; std::atomic GlobalMemoryArbitrator::refresh_interval_memory_growth = 0; +std::mutex GlobalMemoryArbitrator::cache_adjust_capacity_lock; +std::condition_variable GlobalMemoryArbitrator::cache_adjust_capacity_cv; +std::atomic GlobalMemoryArbitrator::cache_adjust_capacity_notify {false}; +std::atomic GlobalMemoryArbitrator::last_cache_capacity_adjust_weighted {1}; +std::mutex GlobalMemoryArbitrator::memtable_memory_refresh_lock; +std::condition_variable GlobalMemoryArbitrator::memtable_memory_refresh_cv; +std::atomic GlobalMemoryArbitrator::memtable_memory_refresh_notify {false}; bool GlobalMemoryArbitrator::try_reserve_process_memory(int64_t bytes) { if (sys_mem_available() - bytes < MemInfo::sys_mem_available_warning_water_mark()) { diff --git a/be/src/runtime/memory/global_memory_arbitrator.h b/be/src/runtime/memory/global_memory_arbitrator.h index f8fda18d0e9a0c..5fbcf232ce4d24 100644 --- a/be/src/runtime/memory/global_memory_arbitrator.h +++ b/be/src/runtime/memory/global_memory_arbitrator.h @@ -173,6 +173,23 @@ class GlobalMemoryArbitrator { // avoid multiple threads starting at the same time and causing OOM. 
static std::atomic refresh_interval_memory_growth; + static std::mutex cache_adjust_capacity_lock; + static std::condition_variable cache_adjust_capacity_cv; + static std::atomic cache_adjust_capacity_notify; + static std::atomic last_cache_capacity_adjust_weighted; + static void notify_cache_adjust_capacity() { + cache_adjust_capacity_notify.store(true, std::memory_order_relaxed); + cache_adjust_capacity_cv.notify_all(); + } + + static std::mutex memtable_memory_refresh_lock; + static std::condition_variable memtable_memory_refresh_cv; + static std::atomic memtable_memory_refresh_notify; + static void notify_memtable_memory_refresh() { + memtable_memory_refresh_notify.store(true, std::memory_order_relaxed); + memtable_memory_refresh_cv.notify_all(); + } + private: static std::atomic _s_process_reserved_memory; diff --git a/be/src/runtime/memory/lru_cache_policy.h b/be/src/runtime/memory/lru_cache_policy.h index 1b6c9ead6d0086..419825c85c4538 100644 --- a/be/src/runtime/memory/lru_cache_policy.h +++ b/be/src/runtime/memory/lru_cache_policy.h @@ -37,7 +37,8 @@ class LRUCachePolicy : public CachePolicy { uint32_t stale_sweep_time_s, uint32_t num_shards = DEFAULT_LRU_CACHE_NUM_SHARDS, uint32_t element_count_capacity = DEFAULT_LRU_CACHE_ELEMENT_COUNT_CAPACITY, bool enable_prune = true) - : CachePolicy(type, stale_sweep_time_s, enable_prune), _lru_cache_type(lru_cache_type) { + : CachePolicy(type, capacity, stale_sweep_time_s, enable_prune), + _lru_cache_type(lru_cache_type) { if (check_capacity(capacity, num_shards)) { _cache = std::shared_ptr( new ShardedLRUCache(type_string(type), capacity, lru_cache_type, num_shards, @@ -53,7 +54,8 @@ class LRUCachePolicy : public CachePolicy { uint32_t element_count_capacity, CacheValueTimeExtractor cache_value_time_extractor, bool cache_value_check_timestamp, bool enable_prune = true) - : CachePolicy(type, stale_sweep_time_s, enable_prune), _lru_cache_type(lru_cache_type) { + : CachePolicy(type, capacity, stale_sweep_time_s, enable_prune), + _lru_cache_type(lru_cache_type) { if (check_capacity(capacity, num_shards)) { _cache = std::shared_ptr( new ShardedLRUCache(type_string(type), capacity, lru_cache_type, num_shards, @@ -106,18 +108,19 @@ class LRUCachePolicy : public CachePolicy { int64_t get_usage() { return _cache->get_usage(); } - size_t get_total_capacity() { return _cache->get_total_capacity(); } + size_t get_capacity() override { return _cache->get_capacity(); } uint64_t new_id() { return _cache->new_id(); }; // Subclass can override this method to determine whether to do the minor or full gc virtual bool exceed_prune_limit() { - return _lru_cache_type == LRUCacheType::SIZE ? mem_consumption() > CACHE_MIN_FREE_SIZE - : get_usage() > CACHE_MIN_FREE_NUMBER; + return _lru_cache_type == LRUCacheType::SIZE ? mem_consumption() > CACHE_MIN_PRUNE_SIZE + : get_usage() > CACHE_MIN_PRUNE_NUMBER; } // Try to prune the cache if expired. 
void prune_stale() override { + std::lock_guard l(_lock); COUNTER_SET(_freed_entrys_counter, (int64_t)0); COUNTER_SET(_freed_memory_counter, (int64_t)0); if (_stale_sweep_time_s <= 0 && _cache == ExecEnv::GetInstance()->get_dummy_lru_cache()) { @@ -125,7 +128,6 @@ class LRUCachePolicy : public CachePolicy { } if (exceed_prune_limit()) { COUNTER_SET(_cost_timer, (int64_t)0); - SCOPED_TIMER(_cost_timer); const int64_t curtime = UnixMillis(); auto pred = [this, curtime](const LRUHandle* handle) -> bool { return static_cast((handle->last_visit_time + _stale_sweep_time_s * 1000) < @@ -134,33 +136,38 @@ class LRUCachePolicy : public CachePolicy { LOG(INFO) << fmt::format("[MemoryGC] {} prune stale start, consumption {}, usage {}", type_string(_type), mem_consumption(), get_usage()); - // Prune cache in lazy mode to save cpu and minimize the time holding write lock - PrunedInfo pruned_info = _cache->prune_if(pred, true); - COUNTER_SET(_freed_entrys_counter, pruned_info.pruned_count); - COUNTER_SET(_freed_memory_counter, pruned_info.pruned_size); + { + SCOPED_TIMER(_cost_timer); + // Prune cache in lazy mode to save cpu and minimize the time holding write lock + PrunedInfo pruned_info = _cache->prune_if(pred, true); + COUNTER_SET(_freed_entrys_counter, pruned_info.pruned_count); + COUNTER_SET(_freed_memory_counter, pruned_info.pruned_size); + } COUNTER_UPDATE(_prune_stale_number_counter, 1); LOG(INFO) << fmt::format( - "[MemoryGC] {} prune stale {} entries, {} bytes, {} times prune", + "[MemoryGC] {} prune stale {} entries, {} bytes, cost {}, {} times prune", type_string(_type), _freed_entrys_counter->value(), - _freed_memory_counter->value(), _prune_stale_number_counter->value()); + _freed_memory_counter->value(), _cost_timer->value(), + _prune_stale_number_counter->value()); } else { if (_lru_cache_type == LRUCacheType::SIZE) { LOG(INFO) << fmt::format( "[MemoryGC] {} not need prune stale, LRUCacheType::SIZE consumption {} " "less " - "than CACHE_MIN_FREE_SIZE {}", - type_string(_type), mem_consumption(), CACHE_MIN_FREE_SIZE); + "than CACHE_MIN_PRUNE_SIZE {}", + type_string(_type), mem_consumption(), CACHE_MIN_PRUNE_SIZE); } else if (_lru_cache_type == LRUCacheType::NUMBER) { LOG(INFO) << fmt::format( "[MemoryGC] {} not need prune stale, LRUCacheType::NUMBER usage {} less " "than " - "CACHE_MIN_FREE_NUMBER {}", - type_string(_type), get_usage(), CACHE_MIN_FREE_NUMBER); + "CACHE_MIN_PRUNE_NUMBER {}", + type_string(_type), get_usage(), CACHE_MIN_PRUNE_NUMBER); } } } void prune_all(bool force) override { + std::lock_guard l(_lock); COUNTER_SET(_freed_entrys_counter, (int64_t)0); COUNTER_SET(_freed_memory_counter, (int64_t)0); if (_cache == ExecEnv::GetInstance()->get_dummy_lru_cache()) { @@ -168,37 +175,73 @@ class LRUCachePolicy : public CachePolicy { } if ((force && mem_consumption() != 0) || exceed_prune_limit()) { COUNTER_SET(_cost_timer, (int64_t)0); - SCOPED_TIMER(_cost_timer); LOG(INFO) << fmt::format("[MemoryGC] {} prune all start, consumption {}, usage {}", type_string(_type), mem_consumption(), get_usage()); - PrunedInfo pruned_info = _cache->prune(); - COUNTER_SET(_freed_entrys_counter, pruned_info.pruned_count); - COUNTER_SET(_freed_memory_counter, pruned_info.pruned_size); + { + SCOPED_TIMER(_cost_timer); + PrunedInfo pruned_info = _cache->prune(); + COUNTER_SET(_freed_entrys_counter, pruned_info.pruned_count); + COUNTER_SET(_freed_memory_counter, pruned_info.pruned_size); + } COUNTER_UPDATE(_prune_all_number_counter, 1); LOG(INFO) << fmt::format( - "[MemoryGC] {} prune all {} entries, 
{} bytes, {} times prune, is force: {}", + "[MemoryGC] {} prune all {} entries, {} bytes, cost {}, {} times prune, is " + "force: {}", type_string(_type), _freed_entrys_counter->value(), - _freed_memory_counter->value(), _prune_all_number_counter->value(), force); + _freed_memory_counter->value(), _cost_timer->value(), + _prune_all_number_counter->value(), force); } else { if (_lru_cache_type == LRUCacheType::SIZE) { LOG(INFO) << fmt::format( "[MemoryGC] {} not need prune all, force is {}, LRUCacheType::SIZE " "consumption {}, " - "CACHE_MIN_FREE_SIZE {}", - type_string(_type), force, mem_consumption(), CACHE_MIN_FREE_SIZE); + "CACHE_MIN_PRUNE_SIZE {}", + type_string(_type), force, mem_consumption(), CACHE_MIN_PRUNE_SIZE); } else if (_lru_cache_type == LRUCacheType::NUMBER) { LOG(INFO) << fmt::format( "[MemoryGC] {} not need prune all, force is {}, LRUCacheType::NUMBER " - "usage {}, CACHE_MIN_FREE_NUMBER {}", - type_string(_type), force, get_usage(), CACHE_MIN_FREE_NUMBER); + "usage {}, CACHE_MIN_PRUNE_NUMBER {}", + type_string(_type), force, get_usage(), CACHE_MIN_PRUNE_NUMBER); } } } + int64_t adjust_capacity_weighted(double adjust_weighted) override { + std::lock_guard l(_lock); + auto capacity = static_cast(_initial_capacity * adjust_weighted); + COUNTER_SET(_freed_entrys_counter, (int64_t)0); + COUNTER_SET(_freed_memory_counter, (int64_t)0); + COUNTER_SET(_cost_timer, (int64_t)0); + if (_cache == ExecEnv::GetInstance()->get_dummy_lru_cache()) { + return 0; + } + + size_t old_capacity = get_capacity(); + int64_t old_mem_consumption = mem_consumption(); + int64_t old_usage = get_usage(); + { + SCOPED_TIMER(_cost_timer); + PrunedInfo pruned_info = _cache->set_capacity(capacity); + COUNTER_SET(_freed_entrys_counter, pruned_info.pruned_count); + COUNTER_SET(_freed_memory_counter, pruned_info.pruned_size); + } + COUNTER_UPDATE(_adjust_capacity_weighted_number_counter, 1); + LOG(INFO) << fmt::format( + "[MemoryGC] {} update capacity, old , " + "adjust_weighted {}, new , prune {} " + "entries, {} bytes, cost {}, {} times prune", + type_string(_type), old_capacity, old_mem_consumption, old_usage, adjust_weighted, + get_capacity(), mem_consumption(), get_usage(), _freed_entrys_counter->value(), + _freed_memory_counter->value(), _cost_timer->value(), + _adjust_capacity_weighted_number_counter->value()); + return _freed_entrys_counter->value(); + } + protected: // if check_capacity failed, will return dummy lru cache, // compatible with ShardedLRUCache usage, but will not actually cache. 
std::shared_ptr _cache; + std::mutex _lock; LRUCacheType _lru_cache_type; }; diff --git a/be/src/runtime/memory/mem_tracker_limiter.cpp b/be/src/runtime/memory/mem_tracker_limiter.cpp index a8aa44414ebf87..59546b11d51a8a 100644 --- a/be/src/runtime/memory/mem_tracker_limiter.cpp +++ b/be/src/runtime/memory/mem_tracker_limiter.cpp @@ -739,10 +739,10 @@ int64_t MemTrackerLimiter::free_top_overcommit_query( LOG(INFO) << log_prefix << "finished, no task need be canceled."; return 0; } - if (query_consumption.size() == 1) { + if (small_num == 0 && canceling_task.empty() && query_consumption.size() == 1) { auto iter = query_consumption.begin(); - LOG(INFO) << log_prefix << "finished, only one task: " << iter->first - << ", memory consumption: " << iter->second << ", no cancel."; + LOG(INFO) << log_prefix << "finished, only one overcommit task: " << iter->first + << ", memory consumption: " << iter->second << ", no other tasks, so no cancel."; return 0; } diff --git a/be/src/runtime/memory/memory_reclamation.cpp b/be/src/runtime/memory/memory_reclamation.cpp index 3adf1d1ac75718..17f5a41f462b50 100644 --- a/be/src/runtime/memory/memory_reclamation.cpp +++ b/be/src/runtime/memory/memory_reclamation.cpp @@ -37,7 +37,6 @@ bool MemoryReclamation::process_minor_gc(std::string mem_info) { std::unique_ptr profile = std::make_unique(""); Defer defer {[&]() { - MemInfo::notify_je_purge_dirty_pages(); std::stringstream ss; profile->pretty_print(&ss); LOG(INFO) << fmt::format( @@ -46,11 +45,6 @@ bool MemoryReclamation::process_minor_gc(std::string mem_info) { ss.str()); }}; - freed_mem += CacheManager::instance()->for_each_cache_prune_stale(profile.get()); - if (freed_mem > MemInfo::process_minor_gc_size()) { - return true; - } - if (config::enable_workload_group_memory_gc) { RuntimeProfile* tg_profile = profile->create_child("WorkloadGroup", true, true); freed_mem += tg_enable_overcommit_group_gc(MemInfo::process_minor_gc_size() - freed_mem, @@ -87,7 +81,6 @@ bool MemoryReclamation::process_full_gc(std::string mem_info) { std::unique_ptr profile = std::make_unique(""); Defer defer {[&]() { - MemInfo::notify_je_purge_dirty_pages(); std::stringstream ss; profile->pretty_print(&ss); LOG(INFO) << fmt::format( @@ -96,11 +89,6 @@ bool MemoryReclamation::process_full_gc(std::string mem_info) { ss.str()); }}; - freed_mem += CacheManager::instance()->for_each_cache_prune_all(profile.get()); - if (freed_mem > MemInfo::process_full_gc_size()) { - return true; - } - if (config::enable_workload_group_memory_gc) { RuntimeProfile* tg_profile = profile->create_child("WorkloadGroup", true, true); freed_mem += tg_enable_overcommit_group_gc(MemInfo::process_full_gc_size() - freed_mem, diff --git a/be/src/service/point_query_executor.cpp b/be/src/service/point_query_executor.cpp index 8058e1f1be6302..0a27c415a48c0a 100644 --- a/be/src/service/point_query_executor.cpp +++ b/be/src/service/point_query_executor.cpp @@ -234,6 +234,12 @@ void RowCache::erase(const RowCacheKey& key) { LRUCachePolicy::erase(encoded_key); } +LookupConnectionCache::CacheValue::~CacheValue() { + SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER( + ExecEnv::GetInstance()->point_query_executor_mem_tracker()); + item.reset(); +} + PointQueryExecutor::~PointQueryExecutor() { SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER( ExecEnv::GetInstance()->point_query_executor_mem_tracker()); diff --git a/be/src/service/point_query_executor.h b/be/src/service/point_query_executor.h index 19954479c97ec7..6c6fb28f95a378 100644 --- a/be/src/service/point_query_executor.h +++ 
b/be/src/service/point_query_executor.h @@ -246,8 +246,8 @@ class LookupConnectionCache : public LRUCachePolicyTrackingManual { auto* value = new CacheValue; value->item = item; LOG(INFO) << "Add item mem" - << ", cache_capacity: " << get_total_capacity() - << ", cache_usage: " << get_usage() << ", mem_consum: " << mem_consumption(); + << ", cache_capacity: " << get_capacity() << ", cache_usage: " << get_usage() + << ", mem_consum: " << mem_consumption(); auto* lru_handle = insert(key, value, 1, sizeof(Reusable), CachePriority::NORMAL); release(lru_handle); } @@ -265,6 +265,7 @@ class LookupConnectionCache : public LRUCachePolicyTrackingManual { class CacheValue : public LRUCacheValueBase { public: + ~CacheValue() override; std::shared_ptr item; }; }; diff --git a/be/src/vec/common/allocator.cpp b/be/src/vec/common/allocator.cpp index dff1330888f82d..2619c0bafffb16 100644 --- a/be/src/vec/common/allocator.cpp +++ b/be/src/vec/common/allocator.cpp @@ -106,9 +106,6 @@ void Allocator::sys_mem return; } - // no significant impact on performance is expected. - doris::MemInfo::notify_je_purge_dirty_pages(); - if (doris::thread_context()->thread_mem_tracker_mgr->is_attach_query() && doris::thread_context()->thread_mem_tracker_mgr->wait_gc()) { int64_t wait_milliseconds = 0; diff --git a/be/test/olap/lru_cache_test.cpp b/be/test/olap/lru_cache_test.cpp index 4fc096380c754b..9adb30b93054f4 100644 --- a/be/test/olap/lru_cache_test.cpp +++ b/be/test/olap/lru_cache_test.cpp @@ -88,25 +88,46 @@ class CacheTest : public testing::Test { void* value; }; - class CacheTestPolicy : public LRUCachePolicyTrackingManual { + class CacheTestSizePolicy : public LRUCachePolicyTrackingManual { public: - CacheTestPolicy(size_t capacity) - : LRUCachePolicyTrackingManual(CachePolicy::CacheType::FOR_UT, capacity, + CacheTestSizePolicy(size_t capacity) + : LRUCachePolicyTrackingManual(CachePolicy::CacheType::FOR_UT_CACHE_SIZE, capacity, LRUCacheType::SIZE, -1) {} }; + class CacheTestNumberPolicy : public LRUCachePolicyTrackingManual { + public: + CacheTestNumberPolicy(size_t capacity, uint32_t num_shards) + : LRUCachePolicyTrackingManual(CachePolicy::CacheType::FOR_UT_CACHE_NUMBER, + capacity, LRUCacheType::NUMBER, -1, num_shards) {} + }; + // there is 16 shards in ShardedLRUCache // And the LRUHandle size is about 100B. So the cache size should big enough // to run the UT. 
static const int kCacheSize = 1000 * 16; std::vector _deleted_keys; std::vector _deleted_values; - CacheTestPolicy* _cache; + LRUCachePolicy* _cache = nullptr; - CacheTest() : _cache(new CacheTestPolicy(kCacheSize)) { _s_current = this; } + CacheTest() { _s_current = this; } ~CacheTest() override { delete _cache; } + void init_size_cache(size_t capacity = kCacheSize) { + if (_cache != nullptr) { + delete _cache; + } + _cache = new CacheTestSizePolicy(capacity); + } + + void init_number_cache(size_t capacity = kCacheSize, uint32_t num_shards = 1) { + if (_cache != nullptr) { + delete _cache; + } + _cache = new CacheTestNumberPolicy(capacity, num_shards); + } + LRUCachePolicy* cache() const { return _cache; } int Lookup(int key) const { @@ -149,7 +170,25 @@ class CacheTest : public testing::Test { }; CacheTest* CacheTest::_s_current; +static void insert_LRUCache(LRUCache& cache, const CacheKey& key, int value, + CachePriority priority) { + uint32_t hash = key.hash(key.data(), key.size(), 0); + auto* cache_value = new CacheTest::CacheValue(EncodeValue(value)); + cache.release(cache.insert(key, hash, cache_value, value, priority)); +} + +static void insert_number_LRUCache(LRUCache& cache, const CacheKey& key, int value, int charge, + CachePriority priority) { + uint32_t hash = key.hash(key.data(), key.size(), 0); + auto* cache_value = new CacheTest::CacheValue(EncodeValue(value)); + cache.release(cache.insert(key, hash, cache_value, charge, priority)); +} + +// https://stackoverflow.com/questions/42756443/undefined-reference-with-gtest +const int CacheTest::kCacheSize; + TEST_F(CacheTest, HitAndMiss) { + init_size_cache(); EXPECT_EQ(-1, Lookup(100)); Insert(100, 101, 1); @@ -173,6 +212,7 @@ TEST_F(CacheTest, HitAndMiss) { } TEST_F(CacheTest, Erase) { + init_size_cache(); Erase(200); EXPECT_EQ(0, _deleted_keys.size()); @@ -192,6 +232,7 @@ TEST_F(CacheTest, Erase) { } TEST_F(CacheTest, EntriesArePinned) { + init_size_cache(); Insert(100, 101, 1); std::string result1; Cache::Handle* h1 = cache()->lookup(EncodeKey(&result1, 100)); @@ -219,6 +260,7 @@ TEST_F(CacheTest, EntriesArePinned) { } TEST_F(CacheTest, EvictionPolicy) { + init_size_cache(); Insert(100, 101, 1); Insert(200, 201, 1); @@ -234,6 +276,7 @@ TEST_F(CacheTest, EvictionPolicy) { } TEST_F(CacheTest, EvictionPolicyWithDurable) { + init_size_cache(); Insert(100, 101, 1); InsertDurable(200, 201, 1); Insert(300, 101, 1); @@ -250,20 +293,6 @@ TEST_F(CacheTest, EvictionPolicyWithDurable) { EXPECT_EQ(201, Lookup(200)); } -static void insert_LRUCache(LRUCache& cache, const CacheKey& key, int value, - CachePriority priority) { - uint32_t hash = key.hash(key.data(), key.size(), 0); - auto* cache_value = new CacheTest::CacheValue(EncodeValue(value)); - cache.release(cache.insert(key, hash, cache_value, value, priority)); -} - -static void insert_number_LRUCache(LRUCache& cache, const CacheKey& key, int value, int charge, - CachePriority priority) { - uint32_t hash = key.hash(key.data(), key.size(), 0); - auto* cache_value = new CacheTest::CacheValue(EncodeValue(value)); - cache.release(cache.insert(key, hash, cache_value, charge, priority)); -} - TEST_F(CacheTest, Usage) { LRUCache cache(LRUCacheType::SIZE); cache.set_capacity(1040); @@ -463,6 +492,7 @@ TEST_F(CacheTest, Number) { } TEST_F(CacheTest, HeavyEntries) { + init_size_cache(); // Add a bunch of light and heavy entries and then count the combined // size of items still in the cache, which must be approximately the // same as the total capacity. 
@@ -494,12 +524,14 @@ TEST_F(CacheTest, HeavyEntries) { } TEST_F(CacheTest, NewId) { + init_size_cache(); uint64_t a = cache()->new_id(); uint64_t b = cache()->new_id(); EXPECT_NE(a, b); } TEST_F(CacheTest, SimpleBenchmark) { + init_size_cache(); for (int i = 0; i < kCacheSize * LOOP_LESS_OR_MORE(10, 10000); i++) { Insert(1000 + i, 2000 + i, 1); EXPECT_EQ(2000 + i, Lookup(1000 + i)); @@ -598,4 +630,78 @@ TEST(CacheHandleTest, HandleTableTest) { } } +TEST_F(CacheTest, SetCapacity) { + init_number_cache(); + for (int i = 0; i < kCacheSize; i++) { + Insert(i, 1000 + i, 1); + EXPECT_EQ(1000 + i, Lookup(i)); + } + ASSERT_EQ(kCacheSize, cache()->get_capacity()); + ASSERT_EQ(kCacheSize, cache()->get_usage()); + + int64_t prune_num = cache()->adjust_capacity_weighted(2); + ASSERT_EQ(prune_num, 0); + ASSERT_EQ(kCacheSize * 2, cache()->get_capacity()); + ASSERT_EQ(kCacheSize, cache()->get_usage()); + + prune_num = cache()->adjust_capacity_weighted(0.5); + ASSERT_EQ(prune_num, kCacheSize / 2); + ASSERT_EQ(kCacheSize / 2, cache()->get_capacity()); + ASSERT_EQ(kCacheSize / 2, cache()->get_usage()); + + std::vector handles(kCacheSize, nullptr); + for (int i = 0; i < kCacheSize; i++) { + std::string result; + CacheKey cache_key = EncodeKey(&result, kCacheSize + i); + auto* cache_value = new CacheValueWithKey(DecodeKey(cache_key), EncodeValue(i)); + handles[i] = cache()->insert(cache_key, cache_value, 1, 1); + } + ASSERT_EQ(kCacheSize / 2, cache()->get_capacity()); + ASSERT_EQ(kCacheSize, + cache()->get_usage()); // Handle not be released, so key cannot be evicted. + + for (int i = 0; i < kCacheSize; i++) { + Insert(i + kCacheSize, 2000 + i, 1); + EXPECT_EQ(-1, Lookup(i + kCacheSize)); // Cache is full, insert failed. + } + ASSERT_EQ(kCacheSize / 2, cache()->get_capacity()); + ASSERT_EQ(kCacheSize, cache()->get_usage()); + + cache()->adjust_capacity_weighted(2); + ASSERT_EQ(kCacheSize * 2, cache()->get_capacity()); + ASSERT_EQ(kCacheSize, cache()->get_usage()); + + for (int i = 0; i < kCacheSize; i++) { + Insert(i, 3000 + i, 1); + EXPECT_EQ(3000 + i, Lookup(i)); + } + ASSERT_EQ(kCacheSize * 2, cache()->get_capacity()); + ASSERT_EQ(kCacheSize * 2, cache()->get_usage()); + + cache()->adjust_capacity_weighted(0); + ASSERT_EQ(0, cache()->get_capacity()); + ASSERT_EQ(kCacheSize, cache()->get_usage()); + + for (auto it : handles) { + cache()->release(it); + } + ASSERT_EQ(0, cache()->get_capacity()); + ASSERT_EQ(0, cache()->get_usage()); + + cache()->adjust_capacity_weighted(1); + ASSERT_EQ(kCacheSize, cache()->get_capacity()); + ASSERT_EQ(0, cache()->get_usage()); + + cache()->adjust_capacity_weighted(0); + ASSERT_EQ(0, cache()->get_capacity()); + ASSERT_EQ(0, cache()->get_usage()); + + for (int i = 0; i < kCacheSize; i++) { + Insert(i, 4000 + i, 1); + EXPECT_EQ(-1, Lookup(i)); + } + ASSERT_EQ(0, cache()->get_capacity()); + ASSERT_EQ(0, cache()->get_usage()); +} + } // namespace doris From ccd654a572f9a8b563500917b87a8879726bf2e8 Mon Sep 17 00:00:00 2001 From: Xinyi Zou Date: Tue, 10 Sep 2024 11:03:40 +0800 Subject: [PATCH 14/44] [fix](arrow-flight-sql) Fix regression-test cloud p0 conf (#40490) --- regression-test/conf/regression-conf.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regression-test/conf/regression-conf.groovy b/regression-test/conf/regression-conf.groovy index ac66e01f90626c..d3d3ee264cfad7 100644 --- a/regression-test/conf/regression-conf.groovy +++ b/regression-test/conf/regression-conf.groovy @@ -207,7 +207,7 @@ txYunSk="***********" //arrow flight sql test config 
extArrowFlightSqlHost = "127.0.0.1" -extArrowFlightSqlPort = 8080 +extArrowFlightSqlPort = 8081 extArrowFlightSqlUser = "root" extArrowFlightSqlPassword= "" From 194b85255fd0fcf1a5069e70ccf40728f8db1277 Mon Sep 17 00:00:00 2001 From: wangbo Date: Tue, 10 Sep 2024 11:06:11 +0800 Subject: [PATCH 15/44] [Fix]only publish topic to alive be (#40535) ## Proposed changes Fix strange core stack when BE not start correctly, and FE send publish topic request. --- .../apache/doris/common/publish/TopicPublisherThread.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/publish/TopicPublisherThread.java b/fe/fe-core/src/main/java/org/apache/doris/common/publish/TopicPublisherThread.java index 74cefeca4d907e..797b0893936513 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/publish/TopicPublisherThread.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/publish/TopicPublisherThread.java @@ -85,10 +85,13 @@ protected void runAfterCatalogReady() { } AckResponseHandler handler = new AckResponseHandler(nodesToPublish); for (Backend be : nodesToPublish) { - executor.submit(new TopicPublishWorker(request, be, handler)); + if (be.isAlive()) { + executor.submit(new TopicPublishWorker(request, be, handler)); + } } try { int timeoutMs = Config.publish_topic_info_interval_ms / 3 * 2; + timeoutMs = timeoutMs <= 0 ? 3000 : timeoutMs; if (!handler.awaitAllInMs(timeoutMs)) { Backend[] backends = handler.pendingNodes(); if (backends.length > 0) { From abf6897a660234fe24e4e2989ec4d4dd74aeab14 Mon Sep 17 00:00:00 2001 From: Sun Chenyang Date: Tue, 10 Sep 2024 11:13:11 +0800 Subject: [PATCH 16/44] [fix] (Nereids) restrict the execution of match on the scan (#40532) revert parts of #38537 ,restrict the execution of match on the scan --- .../rules/analysis/CheckAfterRewrite.java | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/CheckAfterRewrite.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/CheckAfterRewrite.java index e193c5fc4938de..df8ec64fc2e1ff 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/CheckAfterRewrite.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/CheckAfterRewrite.java @@ -24,6 +24,7 @@ import org.apache.doris.nereids.trees.expressions.Alias; import org.apache.doris.nereids.trees.expressions.ExprId; import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.Match; import org.apache.doris.nereids.trees.expressions.NamedExpression; import org.apache.doris.nereids.trees.expressions.Slot; import org.apache.doris.nereids.trees.expressions.SlotNotFromChildren; @@ -38,6 +39,9 @@ import org.apache.doris.nereids.trees.plans.Plan; import org.apache.doris.nereids.trees.plans.algebra.Generate; import org.apache.doris.nereids.trees.plans.logical.LogicalAggregate; +import org.apache.doris.nereids.trees.plans.logical.LogicalDeferMaterializeOlapScan; +import org.apache.doris.nereids.trees.plans.logical.LogicalFilter; +import org.apache.doris.nereids.trees.plans.logical.LogicalOlapScan; import org.apache.doris.nereids.trees.plans.logical.LogicalSort; import org.apache.doris.nereids.trees.plans.logical.LogicalTopN; import org.apache.doris.nereids.trees.plans.logical.LogicalWindow; @@ -60,6 +64,7 @@ public Rule build() { checkAllSlotReferenceFromChildren(plan); checkUnexpectedExpression(plan); 
checkMetricTypeIsUsedCorrectly(plan); + checkMatchIsUsedCorrectly(plan); return null; }).toRule(RuleType.CHECK_ANALYSIS); } @@ -176,4 +181,19 @@ private void checkMetricTypeIsUsedCorrectly(Plan plan) { }); } } + + private void checkMatchIsUsedCorrectly(Plan plan) { + for (Expression expression : plan.getExpressions()) { + if (expression instanceof Match) { + if (plan instanceof LogicalFilter && (plan.child(0) instanceof LogicalOlapScan + || plan.child(0) instanceof LogicalDeferMaterializeOlapScan)) { + return; + } else { + throw new AnalysisException(String.format( + "Not support match in %s in plan: %s, only support in olapScan filter", + plan.child(0), plan)); + } + } + } + } } From b674bd12714048266b8dba9b949c3385f2823628 Mon Sep 17 00:00:00 2001 From: Xinyi Zou Date: Tue, 10 Sep 2024 11:24:04 +0800 Subject: [PATCH 17/44] [fix](memory) Not check process memory limit when thread reserve memory is sufficient (#40548) --- .../runtime/memory/global_memory_arbitrator.cpp | 8 ++++++++ be/src/runtime/memory/global_memory_arbitrator.h | 15 +++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/be/src/runtime/memory/global_memory_arbitrator.cpp b/be/src/runtime/memory/global_memory_arbitrator.cpp index 76a414a6ebdc74..82b69ca02ef9f3 100644 --- a/be/src/runtime/memory/global_memory_arbitrator.cpp +++ b/be/src/runtime/memory/global_memory_arbitrator.cpp @@ -86,4 +86,12 @@ void GlobalMemoryArbitrator::release_process_reserved_memory(int64_t bytes) { } } +int64_t GlobalMemoryArbitrator::sub_thread_reserve_memory(int64_t bytes) { + doris::ThreadContext* thread_context = doris::thread_context(true); + if (thread_context) { + return bytes - doris::thread_context()->thread_mem_tracker_mgr->reserved_mem(); + } + return bytes; +} + } // namespace doris diff --git a/be/src/runtime/memory/global_memory_arbitrator.h b/be/src/runtime/memory/global_memory_arbitrator.h index 5fbcf232ce4d24..f804452956786d 100644 --- a/be/src/runtime/memory/global_memory_arbitrator.h +++ b/be/src/runtime/memory/global_memory_arbitrator.h @@ -124,12 +124,27 @@ class GlobalMemoryArbitrator { return _s_process_reserved_memory.load(std::memory_order_relaxed); } + // `process_memory_usage` includes all reserved memory. if a thread has `reserved_memory`, + // and the memory allocated by thread is less than the thread `reserved_memory`, + // even if `process_memory_usage` is greater than `process_mem_limit`, memory can still be allocated. + // At this time, `process_memory_usage` will not increase, process physical memory will increase, + // and `reserved_memory` will be reduced. + static int64_t sub_thread_reserve_memory(int64_t bytes); + static bool is_exceed_soft_mem_limit(int64_t bytes = 0) { + bytes = sub_thread_reserve_memory(bytes); + if (bytes <= 0) { + return false; + } return process_memory_usage() + bytes >= MemInfo::soft_mem_limit() || sys_mem_available() - bytes < MemInfo::sys_mem_available_warning_water_mark(); } static bool is_exceed_hard_mem_limit(int64_t bytes = 0) { + bytes = sub_thread_reserve_memory(bytes); + if (bytes <= 0) { + return false; + } // Limit process memory usage using the actual physical memory of the process in `/proc/self/status`. // This is independent of the consumption value of the mem tracker, which counts the virtual memory // of the process malloc. 
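The idea behind the new reserved-memory check: only the portion of an allocation that is not already covered by the current thread's reservation is tested against the process limit, so a thread consuming within its own reservation never triggers a process-level rejection. A minimal, self-contained sketch with simplified, assumed names (not the actual GlobalMemoryArbitrator / ThreadContext API):

```cpp
// Sketch of the reserved-memory-aware limit check. ThreadState,
// is_exceed_soft_limit and the numbers below are illustrative assumptions;
// the real logic lives in GlobalMemoryArbitrator and the per-thread mem tracker.
#include <cstdint>
#include <iostream>

struct ThreadState {
    int64_t reserved_mem; // memory pre-reserved by this thread
};

// Only the part of the request not covered by the reservation must pass
// the process-level check.
int64_t sub_thread_reserve_memory(const ThreadState& t, int64_t bytes) {
    return bytes - t.reserved_mem;
}

bool is_exceed_soft_limit(const ThreadState& t, int64_t process_usage,
                          int64_t soft_limit, int64_t bytes) {
    bytes = sub_thread_reserve_memory(t, bytes);
    if (bytes <= 0) {
        return false; // request fits entirely inside the thread's reservation
    }
    return process_usage + bytes >= soft_limit;
}

int main() {
    ThreadState t {4LL << 20};                     // 4 MB reserved by this thread
    int64_t usage = 90LL << 20, limit = 96LL << 20;
    std::cout << std::boolalpha
              << is_exceed_soft_limit(t, usage, limit, 2LL << 20) << "\n"   // false: covered by reservation
              << is_exceed_soft_limit(t, usage, limit, 16LL << 20) << "\n"; // true: 12 MB uncovered exceeds limit
    return 0;
}
```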
From e444100523d0cbbed2b03aecc857430837427daf Mon Sep 17 00:00:00 2001 From: Mryange <59914473+Mryange@users.noreply.github.com> Date: Tue, 10 Sep 2024 11:32:11 +0800 Subject: [PATCH 18/44] [opt](function) Optimize ConvertTz function when using constant parameters. (#40366) mysql [test]>select count(convert_tz(d, 'Asia/Shanghai', 'America/Los_Angeles')), count(convert_tz(dt, 'America/Los_Angeles', '+00:00')) from dates; +-------------------------------------------------------------------------------------+--------------------------------------------------------+ | count(convert_tz(cast(d as DATETIMEV2(6)), 'Asia/Shanghai', 'America/Los_Angeles')) | count(convert_tz(dt, 'America/Los_Angeles', '+00:00')) | +-------------------------------------------------------------------------------------+--------------------------------------------------------+ | 16000000 | 16000000 | +-------------------------------------------------------------------------------------+--------------------------------------------------------+ 1 row in set (13.91 sec) mysql [test]>select count(convert_tz(d, 'Asia/Shanghai', 'America/Los_Angeles')), count(convert_tz(dt, 'America/Los_Angeles', '+00:00')) from dates; +-------------------------------------------------------------------------------------+--------------------------------------------------------+ | count(convert_tz(cast(d as DATETIMEV2(6)), 'Asia/Shanghai', 'America/Los_Angeles')) | count(convert_tz(dt, 'America/Los_Angeles', '+00:00')) | +-------------------------------------------------------------------------------------+--------------------------------------------------------+ | 16000000 | 16000000 | +-------------------------------------------------------------------------------------+--------------------------------------------------------+ 1 row in set (4.59 sec) --- be/src/vec/functions/function_convert_tz.h | 128 ++++++++++++++++++--- 1 file changed, 115 insertions(+), 13 deletions(-) diff --git a/be/src/vec/functions/function_convert_tz.h b/be/src/vec/functions/function_convert_tz.h index af118c80583769..d0a600a9e41a86 100644 --- a/be/src/vec/functions/function_convert_tz.h +++ b/be/src/vec/functions/function_convert_tz.h @@ -53,6 +53,13 @@ #include "vec/runtime/vdatetime_value.h" namespace doris::vectorized { +struct ConvertTzState { + bool use_state = false; + bool is_valid = false; + cctz::time_zone from_tz; + cctz::time_zone to_tz; +}; + template class FunctionConvertTZ : public IFunction { using DateValueType = date_cast::TypeToValueTypeV; @@ -88,8 +95,62 @@ class FunctionConvertTZ : public IFunction { std::make_shared()}; } + Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { + if (scope == FunctionContext::THREAD_LOCAL) { + return Status::OK(); + } + std::shared_ptr state = std::make_shared(); + + context->set_function_state(scope, state); + DCHECK_EQ(context->get_num_args(), 3); + const auto* const_from_tz = context->get_constant_col(1); + const auto* const_to_tz = context->get_constant_col(2); + + // ConvertTzState is used only when both the second and third parameters are constants + if (const_from_tz != nullptr && const_to_tz != nullptr) { + state->use_state = true; + init_convert_tz_state(state, const_from_tz, const_to_tz); + } else { + state->use_state = false; + } + + return IFunction::open(context, scope); + } + + void init_convert_tz_state(std::shared_ptr state, + const ColumnPtrWrapper* const_from_tz, + const ColumnPtrWrapper* const_to_tz) { + auto const_data_from_tz = 
const_from_tz->column_ptr->get_data_at(0); + auto const_data_to_tz = const_to_tz->column_ptr->get_data_at(0); + + // from_tz and to_tz must both be non-null. + if (const_data_from_tz.data == nullptr || const_data_to_tz.data == nullptr) { + state->is_valid = false; + return; + } + + auto from_tz_name = const_data_from_tz.to_string(); + auto to_tz_name = const_data_to_tz.to_string(); + + if (!TimezoneUtils::find_cctz_time_zone(from_tz_name, state->from_tz)) { + state->is_valid = false; + return; + } + if (!TimezoneUtils::find_cctz_time_zone(to_tz_name, state->to_tz)) { + state->is_valid = false; + return; + } + state->is_valid = true; + } + Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, size_t result, size_t input_rows_count) const override { + auto* convert_tz_state = reinterpret_cast( + context->get_function_state(FunctionContext::FRAGMENT_LOCAL)); + if (!convert_tz_state) { + return Status::RuntimeError( + "funciton context for function '{}' must have ConvertTzState;", get_name()); + } auto result_null_map_column = ColumnUInt8::create(input_rows_count, 0); bool col_const[3]; @@ -106,12 +167,16 @@ class FunctionConvertTZ : public IFunction { if (col_const[1] && col_const[2]) { auto result_column = ColumnType::create(); - execute_tz_const(context, assert_cast(argument_columns[0].get()), - assert_cast(argument_columns[1].get()), - assert_cast(argument_columns[2].get()), - assert_cast(result_column.get()), - assert_cast(result_null_map_column.get())->get_data(), - input_rows_count); + if (convert_tz_state->use_state) { + execute_tz_const_with_state( + convert_tz_state, assert_cast(argument_columns[0].get()), + assert_cast(result_column.get()), + assert_cast(result_null_map_column.get())->get_data(), + input_rows_count); + } else { + return Status::RuntimeError("ConvertTzState is not initialized in function {}", + get_name()); + } block.get_by_position(result).column = ColumnNullable::create( std::move(result_column), std::move(result_null_map_column)); } else { @@ -144,18 +209,55 @@ class FunctionConvertTZ : public IFunction { } } - static void execute_tz_const(FunctionContext* context, const ColumnType* date_column, - const ColumnString* from_tz_column, - const ColumnString* to_tz_column, ReturnColumnType* result_column, - NullMap& result_null_map, size_t input_rows_count) { - auto from_tz = from_tz_column->get_data_at(0).to_string(); - auto to_tz = to_tz_column->get_data_at(0).to_string(); + static void execute_tz_const_with_state(ConvertTzState* convert_tz_state, + const ColumnType* date_column, + ReturnColumnType* result_column, + NullMap& result_null_map, size_t input_rows_count) { + cctz::time_zone& from_tz = convert_tz_state->from_tz; + cctz::time_zone& to_tz = convert_tz_state->to_tz; + auto push_null = [&](int row) { + result_null_map[row] = true; + result_column->insert_default(); + }; + if (!convert_tz_state->is_valid) { + // If an invalid timezone is present, return null + for (size_t i = 0; i < input_rows_count; i++) { + push_null(i); + } + return; + } for (size_t i = 0; i < input_rows_count; i++) { if (result_null_map[i]) { result_column->insert_default(); continue; } - execute_inner_loop(date_column, from_tz, to_tz, result_column, result_null_map, i); + + DateValueType ts_value = + binary_cast(date_column->get_element(i)); + ReturnDateValueType ts_value2; + + if constexpr (std::is_same_v) { + std::pair timestamp; + if (!ts_value.unix_timestamp(×tamp, from_tz)) { + push_null(i); + continue; + } + ts_value2.from_unixtime(timestamp, 
to_tz); + } else { + int64_t timestamp; + if (!ts_value.unix_timestamp(×tamp, from_tz)) { + push_null(i); + continue; + } + ts_value2.from_unixtime(timestamp, to_tz); + } + + if (!ts_value2.is_valid_date()) [[unlikely]] { + push_null(i); + continue; + } + + result_column->insert(binary_cast(ts_value2)); } } From ab2c1cca92b8cf68cb76a4c1db3da17ef4d6d325 Mon Sep 17 00:00:00 2001 From: zclllhhjj Date: Tue, 10 Sep 2024 11:41:16 +0800 Subject: [PATCH 19/44] [Chore](GHA) Add code owner for be exec version change (#40577) Add code owner for be exec version change --- .github/CODEOWNERS | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 2757578827c2bb..bac487c65627a9 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -15,5 +15,6 @@ # limitations under the License. # be/src/io/* @platoneko @gavinchou @dataroaring +be/src/agent/be_exec_version_manager.cpp @BiteTheDDDDt fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java @dataroaring @CalvinKirs @morningman **/pom.xml @CalvinKirs @morningman From e9eed642814c635314bbcdd3ad768712539bf88b Mon Sep 17 00:00:00 2001 From: zhangstar333 <87313068+zhangstar333@users.noreply.github.com> Date: Tue, 10 Sep 2024 12:05:55 +0800 Subject: [PATCH 20/44] [test](case) add some test case for encrypt/decrypt functions (#40427) ## Proposed changes add some test case for encrypt/decrypt functions --- .../test_encryption_function.out | 192 ++++++++++++++++++ .../test_encryption_function.groovy | 135 ++++++++++++ 2 files changed, 327 insertions(+) diff --git a/regression-test/data/query_p0/sql_functions/encryption_digest/test_encryption_function.out b/regression-test/data/query_p0/sql_functions/encryption_digest/test_encryption_function.out index 721412dc0364c1..c652b3074558cc 100644 --- a/regression-test/data/query_p0/sql_functions/encryption_digest/test_encryption_function.out +++ b/regression-test/data/query_p0/sql_functions/encryption_digest/test_encryption_function.out @@ -80,3 +80,195 @@ text -- !sql -- aaaaaa +-- !sql1 -- +aaaaaa + +-- !sql2 -- +aaaaaa + +-- !sql3 -- +zhang + +-- !sql4 -- +zhang + +-- !sql5 -- +aaaaaa + +-- !sql6 -- +aaaaaa + +-- !sql7 -- +zhang + +-- !sql8 -- +zhang + +-- !sql9 -- +aaaaaa + +-- !sql10 -- +aaaaaa + +-- !sql11 -- +zhang + +-- !sql12 -- +zhang + +-- !sql9 -- +aaaaaa + +-- !sql10 -- +aaaaaa + +-- !sql11 -- +zhang + +-- !sql12 -- +zhang + +-- !sql13 -- +aaaaaa + +-- !sql14 -- +aaaaaa + +-- !sql15 -- +zhang + +-- !sql16 -- +zhang + +-- !sql17 -- +aaaaaa + +-- !sql18 -- +aaaaaa + +-- !sql19 -- +zhang + +-- !sql20 -- +zhang + +-- !sql21 -- +aaaaaa + +-- !sql22 -- +aaaaaa + +-- !sql23 -- +zhang + +-- !sql24 -- +zhang + +-- !sql25 -- +aaaaaa + +-- !sql26 -- +aaaaaa + +-- !sql27 -- +zhang + +-- !sql28 -- +zhang + +-- !sql29 -- +aaaaaa + +-- !sql30 -- +aaaaaa + +-- !sql31 -- +zhang + +-- !sql32 -- +zhang + +-- !sql29 -- +aaaaaa + +-- !sql30 -- +aaaaaa + +-- !sql31 -- +zhang + +-- !sql32 -- +zhang + +-- !sql33 -- +aaaaaa + +-- !sql34 -- +aaaaaa + +-- !sql35 -- +zhang + +-- !sql36 -- +zhang + +-- !sql37 -- +aaaaaa + +-- !sql38 -- +aaaaaa + +-- !sql39 -- +zhang + +-- !sql40 -- +zhang + +-- !sql41 -- +aaaaaa + +-- !sql42 -- +aaaaaa + +-- !sql43 -- +zhang + +-- !sql44 -- +zhang + +-- !sql45 -- +aaaaaa + +-- !sql46 -- +aaaaaa + +-- !sql47 -- +zhang + +-- !sql48 -- +zhang + +-- !sql49 -- +aaaaaa + +-- !sql50 -- +aaaaaa + +-- !sql51 -- +zhang + +-- !sql52 -- +zhang + +-- !sql53 -- +aaaaaa + +-- !sql54 -- +aaaaaa + +-- !sql55 -- +zhang + +-- !sql56 -- +zhang + diff --git 
a/regression-test/suites/query_p0/sql_functions/encryption_digest/test_encryption_function.groovy b/regression-test/suites/query_p0/sql_functions/encryption_digest/test_encryption_function.groovy index f4a67b052c3750..25095f46917c96 100644 --- a/regression-test/suites/query_p0/sql_functions/encryption_digest/test_encryption_function.groovy +++ b/regression-test/suites/query_p0/sql_functions/encryption_digest/test_encryption_function.groovy @@ -91,4 +91,139 @@ suite("test_encryption_function") { """ sql""" insert into quantile_table values(1,"aaaaaa");""" qt_sql """ select sm4_decrypt(sm4_encrypt(k,"doris","0123456789abcdef"),"doris","0123456789abcdef") from quantile_table; """ + + // sm4_encrypt sm4_decrypt + // aes_encrypt aes_decrypt + //two arg (column/const) + sql "set enable_fold_constant_by_be = false;" + sql """ set block_encryption_mode=""; """ // SM4_128_ECB + qt_sql1 """ select sm4_decrypt(sm4_encrypt(k,"doris"),"doris") from quantile_table; """ + qt_sql2 """ select sm4_decrypt(sm4_encrypt(k,k),k) from quantile_table; """ + qt_sql3 """ select sm4_decrypt(sm4_encrypt("zhang","doris"),"doris") from quantile_table; """ + qt_sql4 """ select sm4_decrypt(sm4_encrypt("zhang",k),k) from quantile_table; """ + + sql """ set block_encryption_mode="SM4_128_CBC"; """ + qt_sql5 """ select sm4_decrypt(sm4_encrypt(k,"doris"),"doris") from quantile_table; """ + qt_sql6 """ select sm4_decrypt(sm4_encrypt(k,k),k) from quantile_table; """ + qt_sql7 """ select sm4_decrypt(sm4_encrypt("zhang","doris"),"doris") from quantile_table; """ + qt_sql8 """ select sm4_decrypt(sm4_encrypt("zhang",k),k) from quantile_table; """ + + sql """ set block_encryption_mode="SM4_128_OFB"; """ + qt_sql9 """ select sm4_decrypt(sm4_encrypt(k,"doris"),"doris") from quantile_table; """ + qt_sql10 """ select sm4_decrypt(sm4_encrypt(k,k),k) from quantile_table; """ + qt_sql11 """ select sm4_decrypt(sm4_encrypt("zhang","doris"),"doris") from quantile_table; """ + qt_sql12 """ select sm4_decrypt(sm4_encrypt("zhang",k),k) from quantile_table; """ + + sql """ set block_encryption_mode="SM4_128_CTR"; """ + qt_sql9 """ select sm4_decrypt(sm4_encrypt(k,"doris"),"doris") from quantile_table; """ + qt_sql10 """ select sm4_decrypt(sm4_encrypt(k,k),k) from quantile_table; """ + qt_sql11 """ select sm4_decrypt(sm4_encrypt("zhang","doris"),"doris") from quantile_table; """ + qt_sql12 """ select sm4_decrypt(sm4_encrypt("zhang",k),k) from quantile_table; """ + + sql """ set block_encryption_mode=""; """ // AES_128_ECB + qt_sql13 """ select aes_decrypt(aes_encrypt(k,"doris"),"doris") from quantile_table; """ + qt_sql14 """ select aes_decrypt(aes_encrypt(k,k),k) from quantile_table; """ + qt_sql15 """ select aes_decrypt(aes_encrypt("zhang","doris"),"doris") from quantile_table; """ + qt_sql16 """ select aes_decrypt(aes_encrypt("zhang",k),k) from quantile_table; """ + + sql """ set block_encryption_mode="AES_256_CBC"; """ + qt_sql17 """ select aes_decrypt(aes_encrypt(k,"doris"),"doris") from quantile_table; """ + qt_sql18 """ select aes_decrypt(aes_encrypt(k,k),k) from quantile_table; """ + qt_sql19 """ select aes_decrypt(aes_encrypt("zhang","doris"),"doris") from quantile_table; """ + qt_sql20 """ select aes_decrypt(aes_encrypt("zhang",k),k) from quantile_table; """ + + + sql """ set block_encryption_mode="AES_128_CTR"; """ + qt_sql21 """ select aes_decrypt(aes_encrypt(k,"doris"),"doris") from quantile_table; """ + qt_sql22 """ select aes_decrypt(aes_encrypt(k,k),k) from quantile_table; """ + qt_sql23 """ select 
aes_decrypt(aes_encrypt("zhang","doris"),"doris") from quantile_table; """ + qt_sql24 """ select aes_decrypt(aes_encrypt("zhang",k),k) from quantile_table; """ + + + sql """ set block_encryption_mode="AES_256_OFB"; """ + qt_sql25 """ select aes_decrypt(aes_encrypt(k,"doris"),"doris") from quantile_table; """ + qt_sql26 """ select aes_decrypt(aes_encrypt(k,k),k) from quantile_table; """ + qt_sql27 """ select aes_decrypt(aes_encrypt("zhang","doris"),"doris") from quantile_table; """ + qt_sql28 """ select aes_decrypt(aes_encrypt("zhang",k),k) from quantile_table; """ + + sql """ set block_encryption_mode=""; """ + + sql """ select to_base64(aes_encrypt(k,"doris")) from quantile_table;""" // 3A7GoWeuMNEBWzJx+YefZw== + qt_sql29 """ select aes_decrypt(FROM_BASE64("3A7GoWeuMNEBWzJx+YefZw=="),"doris") from quantile_table; """ + + sql """ select to_base64(aes_encrypt(k,k)) from quantile_table;""" //ADnRqPtFBjreZu06UTD64g== + qt_sql30 """ select aes_decrypt(FROM_BASE64("ADnRqPtFBjreZu06UTD64g=="),k) from quantile_table; """ + + sql """ select to_base64(aes_encrypt("zhang","doris")) from quantile_table;""" //fLhlYvn/yZhqd2LTRHImrw== + qt_sql31 """ select aes_decrypt(FROM_BASE64("fLhlYvn/yZhqd2LTRHImrw=="),"doris") from quantile_table; """ + + sql """ select to_base64(aes_encrypt("zhang",k)) from quantile_table;""" //2C8acACKfoRwHZS5B4juNw== + qt_sql32 """ select aes_decrypt(FROM_BASE64("2C8acACKfoRwHZS5B4juNw=="),k) from quantile_table; """ + + + + sql """ select to_base64(sm4_encrypt(k,"doris")) from quantile_table;""" // 7vSaqYqMl9no8trrzbdAEw== + qt_sql29 """ select sm4_decrypt(FROM_BASE64("7vSaqYqMl9no8trrzbdAEw=="),"doris") from quantile_table; """ + + sql """ select to_base64(sm4_encrypt(k,k)) from quantile_table;""" // PcPR18T6lhMuFTqQtymb8w== + qt_sql30 """ select sm4_decrypt(FROM_BASE64("PcPR18T6lhMuFTqQtymb8w=="),k) from quantile_table; """ + + sql """ select to_base64(sm4_encrypt("zhang","doris")) from quantile_table;""" // WY+4o1/cZwAFQ0F6dlyEqQ== + qt_sql31 """ select sm4_decrypt(FROM_BASE64("WY+4o1/cZwAFQ0F6dlyEqQ=="),"doris") from quantile_table; """ + + sql """ select to_base64(sm4_encrypt("zhang",k)) from quantile_table;""" // lhDiiEnRn3PvY6v4sHES0A== + qt_sql32 """ select sm4_decrypt(FROM_BASE64("lhDiiEnRn3PvY6v4sHES0A=="),k) from quantile_table; """ + + + sql "DROP TABLE IF EXISTS quantile_table2" + sql""" + CREATE TABLE quantile_table2 + ( + id int, + k string, + k1 string, + k2 string + ) + ENGINE=OLAP + UNIQUE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 4 + PROPERTIES ( + "enable_unique_key_merge_on_write" = "true", + "replication_num" = "1" + ); + """ + sql""" insert into quantile_table2 values(1,"aaaaaa", "key_word", "init_word");""" + + //four arg (column/const) + sql """ set block_encryption_mode=""; """ // SM4_128_ECB + qt_sql33 """ select sm4_decrypt(sm4_encrypt(k,"doris","abcdefghij", "SM4_128_CBC"),"doris","abcdefghij","SM4_128_CBC") from quantile_table2; """ + qt_sql34 """ select sm4_decrypt(sm4_encrypt(k,k,"abcdefghij", "SM4_128_CBC"),k,"abcdefghij", "SM4_128_CBC") from quantile_table2; """ + qt_sql35 """ select sm4_decrypt(sm4_encrypt("zhang","doris","abcdefghij", "SM4_128_CBC"),"doris","abcdefghij", "SM4_128_CBC") from quantile_table2; """ + qt_sql36 """ select sm4_decrypt(sm4_encrypt("zhang",k,"abcdefghij", "SM4_128_CBC"),k,"abcdefghij", "SM4_128_CBC") from quantile_table2; """ + + qt_sql37 """ select sm4_decrypt(sm4_encrypt(k,"doris",k2, "SM4_128_CBC"),"doris",k2,"SM4_128_CBC") from quantile_table2; """ + qt_sql38 """ select sm4_decrypt(sm4_encrypt(k,k,k2, 
"SM4_128_CBC"),k,k2, "SM4_128_CBC") from quantile_table2; """ + qt_sql39 """ select sm4_decrypt(sm4_encrypt("zhang","doris",k2, "SM4_128_CBC"),"doris",k2, "SM4_128_CBC") from quantile_table2; """ + qt_sql40 """ select sm4_decrypt(sm4_encrypt("zhang",k,k2, "SM4_128_CBC"),k,k2, "SM4_128_CBC") from quantile_table2; """ + + qt_sql41 """ select sm4_decrypt(sm4_encrypt(k,k1,k2, "SM4_128_CBC"),k1,k2,"SM4_128_CBC") from quantile_table2; """ + qt_sql42 """ select sm4_decrypt(sm4_encrypt(k,k1,k2, "SM4_128_CBC"),k1,k2, "SM4_128_CBC") from quantile_table2; """ + qt_sql43 """ select sm4_decrypt(sm4_encrypt("zhang",k1,k2, "SM4_128_CBC"),k1,k2, "SM4_128_CBC") from quantile_table2; """ + qt_sql44 """ select sm4_decrypt(sm4_encrypt("zhang",k1,k2, "SM4_128_CBC"),k1,k2, "SM4_128_CBC") from quantile_table2; """ + + + qt_sql45 """ select aes_decrypt(aes_encrypt(k,"doris","abcdefghij", "AES_256_CFB"),"doris","abcdefghij","AES_256_CFB") from quantile_table2; """ + qt_sql46 """ select aes_decrypt(aes_encrypt(k,k,"abcdefghij", "AES_256_CFB"),k,"abcdefghij", "AES_256_CFB") from quantile_table2; """ + qt_sql47 """ select aes_decrypt(aes_encrypt("zhang","doris","abcdefghij", "AES_256_CFB"),"doris","abcdefghij", "AES_256_CFB") from quantile_table2; """ + qt_sql48 """ select aes_decrypt(aes_encrypt("zhang",k,"abcdefghij", "AES_256_CFB"),k,"abcdefghij", "AES_256_CFB") from quantile_table2; """ + + qt_sql49 """ select aes_decrypt(aes_encrypt(k,"doris",k2, "AES_256_CFB"),"doris",k2,"AES_256_CFB") from quantile_table2; """ + qt_sql50 """ select aes_decrypt(aes_encrypt(k,k,k2, "AES_256_CFB"),k,k2, "AES_256_CFB") from quantile_table2; """ + qt_sql51 """ select aes_decrypt(aes_encrypt("zhang","doris",k2, "AES_256_CFB"),"doris",k2, "AES_256_CFB") from quantile_table2; """ + qt_sql52 """ select aes_decrypt(aes_encrypt("zhang",k,k2, "AES_256_CFB"),k,k2, "AES_256_CFB") from quantile_table2; """ + + qt_sql53 """ select aes_decrypt(aes_encrypt(k,k1,k2, "AES_256_CFB"),k1,k2,"AES_256_CFB") from quantile_table2; """ + qt_sql54 """ select aes_decrypt(aes_encrypt(k,k1,k2, "AES_256_CFB"),k1,k2, "AES_256_CFB") from quantile_table2; """ + qt_sql55 """ select aes_decrypt(aes_encrypt("zhang",k1,k2, "AES_256_CFB"),k1,k2, "AES_256_CFB") from quantile_table2; """ + qt_sql56 """ select aes_decrypt(aes_encrypt("zhang",k1,k2, "AES_256_CFB"),k1,k2, "AES_256_CFB") from quantile_table2; """ } From dd8b0ee480bad63de11159575beae8a32f03e387 Mon Sep 17 00:00:00 2001 From: Vallish Pai Date: Tue, 10 Sep 2024 09:55:25 +0530 Subject: [PATCH 21/44] [Enhancement] doris to support function SESSION_USER() (#39575) ## Proposed changes Issue Number: close #39574 Added support for new scalar function session_user() when run sqlmap tool on Doris master it gave error like session_user() function not found. mysql support this function. https://www.w3schools.com/sql/func_mysql_session_user.asp Its similar to Doris user() function. 
mysql> select SESSION_USER(); +--------------------+ | session_user() | +--------------------+ | 'root'@'127.0.0.1' | +--------------------+ 1 row in set (0.01 sec) mysql> select session_user(); +--------------------+ | session_user() | +--------------------+ | 'root'@'127.0.0.1' | +--------------------+ 1 row in set (0.00 sec) mysql> --- .../org/apache/doris/nereids/DorisLexer.g4 | 1 + .../org/apache/doris/nereids/DorisParser.g4 | 3 + .../doris/catalog/BuiltinScalarFunctions.java | 4 +- .../nereids/parser/LogicalPlanBuilder.java | 6 ++ .../rules/FoldConstantRuleOnFE.java | 10 +++- .../functions/scalar/SessionUser.java | 59 +++++++++++++++++++ .../visitor/ScalarFunctionVisitor.java | 5 ++ .../query_p0/system/test_query_sys.groovy | 1 + 8 files changed, 87 insertions(+), 2 deletions(-) create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SessionUser.java diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 index 2343f208642d6e..e3ae9788e6af14 100644 --- a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 +++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 @@ -472,6 +472,7 @@ SEMI: 'SEMI'; SEQUENCE: 'SEQUENCE'; SERIALIZABLE: 'SERIALIZABLE'; SESSION: 'SESSION'; +SESSION_USER: 'SESSION_USER'; SET: 'SET'; SETS: 'SETS'; SET_SESSION_VARIABLE: 'SET_SESSION_VARIABLE'; diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 index 087af9d717de13..008425fb5a197e 100644 --- a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 +++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 @@ -1510,6 +1510,7 @@ primaryExpression | name=LOCALTIME #localTime | name=LOCALTIMESTAMP #localTimestamp | name=CURRENT_USER #currentUser + | name=SESSION_USER #sessionUser | CASE whenClause+ (ELSE elseExpression=expression)? END #searchedCase | CASE value=expression whenClause+ (ELSE elseExpression=expression)? 
END #simpleCase | name=CAST LEFT_PAREN expression AS castDataType RIGHT_PAREN #cast @@ -1577,6 +1578,7 @@ functionNameIdentifier | REGEXP | RIGHT | SCHEMA + | SESSION_USER | TRIM | USER ; @@ -2031,6 +2033,7 @@ nonReserved | SET_SESSION_VARIABLE | SEQUENCE | SESSION + | SESSION_USER | SHAPE | SKEW | SNAPSHOT diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java index bcb0864b64f2e2..86ed6e55ab5c6b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java @@ -360,6 +360,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.SecondsAdd; import org.apache.doris.nereids.trees.expressions.functions.scalar.SecondsDiff; import org.apache.doris.nereids.trees.expressions.functions.scalar.SecondsSub; +import org.apache.doris.nereids.trees.expressions.functions.scalar.SessionUser; import org.apache.doris.nereids.trees.expressions.functions.scalar.Sha1; import org.apache.doris.nereids.trees.expressions.functions.scalar.Sha2; import org.apache.doris.nereids.trees.expressions.functions.scalar.Sign; @@ -934,7 +935,8 @@ public class BuiltinScalarFunctions implements FunctionHelper { scalar(YearsAdd.class, "years_add"), scalar(YearsDiff.class, "years_diff"), scalar(YearsSub.class, "years_sub"), - scalar(MultiMatch.class, "multi_match")); + scalar(MultiMatch.class, "multi_match"), + scalar(SessionUser.class, "session_user")); public static final BuiltinScalarFunctions INSTANCE = new BuiltinScalarFunctions(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java index 67ef1ca48c2f02..732b41acc91f1e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java @@ -327,6 +327,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.SecondsAdd; import org.apache.doris.nereids.trees.expressions.functions.scalar.SecondsDiff; import org.apache.doris.nereids.trees.expressions.functions.scalar.SecondsSub; +import org.apache.doris.nereids.trees.expressions.functions.scalar.SessionUser; import org.apache.doris.nereids.trees.expressions.functions.scalar.WeekCeil; import org.apache.doris.nereids.trees.expressions.functions.scalar.WeekFloor; import org.apache.doris.nereids.trees.expressions.functions.scalar.WeeksAdd; @@ -2049,6 +2050,11 @@ public Expression visitCurrentUser(DorisParser.CurrentUserContext ctx) { return new CurrentUser().alias("CURRENT_USER"); } + @Override + public Expression visitSessionUser(DorisParser.SessionUserContext ctx) { + return new SessionUser().alias("SESSION_USER"); + } + @Override public Expression visitDoublePipes(DorisParser.DoublePipesContext ctx) { return ParserUtils.withOrigin(ctx, () -> { diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/expression/rules/FoldConstantRuleOnFE.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/expression/rules/FoldConstantRuleOnFE.java index fdd3b02e6fd483..1b830c7d11de41 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/expression/rules/FoldConstantRuleOnFE.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/expression/rules/FoldConstantRuleOnFE.java @@ -63,6 +63,7 @@ import 
org.apache.doris.nereids.trees.expressions.functions.scalar.EncryptKeyRef; import org.apache.doris.nereids.trees.expressions.functions.scalar.If; import org.apache.doris.nereids.trees.expressions.functions.scalar.Password; +import org.apache.doris.nereids.trees.expressions.functions.scalar.SessionUser; import org.apache.doris.nereids.trees.expressions.functions.scalar.User; import org.apache.doris.nereids.trees.expressions.functions.scalar.Version; import org.apache.doris.nereids.trees.expressions.literal.ArrayLiteral; @@ -164,7 +165,8 @@ public List> buildRules() { matches(Password.class, this::visitPassword), matches(Array.class, this::visitArray), matches(Date.class, this::visitDate), - matches(Version.class, this::visitVersion) + matches(Version.class, this::visitVersion), + matches(SessionUser.class, this::visitSessionUser) ); } @@ -326,6 +328,12 @@ public Expression visitUser(User user, ExpressionRewriteContext context) { return new VarcharLiteral(res); } + @Override + public Expression visitSessionUser(SessionUser user, ExpressionRewriteContext context) { + String res = context.cascadesContext.getConnectContext().getUserIdentity().toString(); + return new VarcharLiteral(res); + } + @Override public Expression visitConnectionId(ConnectionId connectionId, ExpressionRewriteContext context) { return new BigIntLiteral(context.cascadesContext.getConnectContext().getConnectionId()); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SessionUser.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SessionUser.java new file mode 100644 index 00000000000000..b91e2c30942a43 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SessionUser.java @@ -0,0 +1,59 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.trees.expressions.functions.scalar; + +import org.apache.doris.catalog.FunctionSignature; +import org.apache.doris.nereids.trees.expressions.functions.AlwaysNotNullable; +import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature; +import org.apache.doris.nereids.trees.expressions.shape.LeafExpression; +import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; +import org.apache.doris.nereids.types.VarcharType; + +import com.google.common.collect.ImmutableList; + +import java.util.List; + +/** + * ScalarFunction 'SessionUser'. 
+ */ +public class SessionUser extends ScalarFunction + implements LeafExpression, ExplicitlyCastableSignature, AlwaysNotNullable { + + public static final List SIGNATURES = ImmutableList.of( + FunctionSignature.ret(VarcharType.SYSTEM_DEFAULT).args() + ); + + public SessionUser() { + super("session_user", ImmutableList.of()); + } + + @Override + public List getSignatures() { + return SIGNATURES; + } + + @Override + public R accept(ExpressionVisitor visitor, C context) { + return visitor.visitSessionUser(this, context); + } + + @Override + public boolean isDeterministic() { + return false; + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java index 20772ae716be2f..8741da5c7d63ec 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java @@ -358,6 +358,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.SecondsAdd; import org.apache.doris.nereids.trees.expressions.functions.scalar.SecondsDiff; import org.apache.doris.nereids.trees.expressions.functions.scalar.SecondsSub; +import org.apache.doris.nereids.trees.expressions.functions.scalar.SessionUser; import org.apache.doris.nereids.trees.expressions.functions.scalar.Sha1; import org.apache.doris.nereids.trees.expressions.functions.scalar.Sha2; import org.apache.doris.nereids.trees.expressions.functions.scalar.Sign; @@ -2084,6 +2085,10 @@ default R visitUser(User user, C context) { return visitScalarFunction(user, context); } + default R visitSessionUser(SessionUser user, C context) { + return visitScalarFunction(user, context); + } + default R visitUtcTimestamp(UtcTimestamp utcTimestamp, C context) { return visitScalarFunction(utcTimestamp, context); } diff --git a/regression-test/suites/query_p0/system/test_query_sys.groovy b/regression-test/suites/query_p0/system/test_query_sys.groovy index b17500a0ccba0d..dd7998b010f9c8 100644 --- a/regression-test/suites/query_p0/system/test_query_sys.groovy +++ b/regression-test/suites/query_p0/system/test_query_sys.groovy @@ -35,6 +35,7 @@ suite("test_query_sys", "query,p0") { } sql "SELECT CONNECTION_ID();" sql "SELECT CURRENT_USER();" + sql "SELECT SESSION_USER();" sql "SELECT CURRENT_CATALOG();" // sql "select now();" sql "select localtime();" From 943ac0425a018628dd8a79123a93262965cab339 Mon Sep 17 00:00:00 2001 From: Vallish Pai Date: Tue, 10 Sep 2024 11:08:57 +0530 Subject: [PATCH 22/44] [fix](systable) refactor code to avoid wrong usage (#40568) ## Proposed changes Issue Number: close #xxx refactor code to avoid wrong usage Followup #40153 #40553 Test result: mysql> select * from routines; +---------------------+-----------------+----------------+---------------------+--------------+----------------+-----------------------------------------------------------------------------------------+--------------------+---------------+-------------------+-----------------+------------------+-----------------+----------+---------------+---------------------+---------------------+----------+-----------------+---------+----------------------+----------------------+--------------------+ | SPECIFIC_NAME | ROUTINE_CATALOG | ROUTINE_SCHEMA | ROUTINE_NAME | ROUTINE_TYPE | DTD_IDENTIFIER | ROUTINE_BODY | ROUTINE_DEFINITION | EXTERNAL_NAME | EXTERNAL_LANGUAGE | 
PARAMETER_STYLE | IS_DETERMINISTIC | SQL_DATA_ACCESS | SQL_PATH | SECURITY_TYPE | CREATED | LAST_ALTERED | SQL_MODE | ROUTINE_COMMENT | DEFINER | CHARACTER_SET_CLIENT | COLLATION_CONNECTION | DATABASE_COLLATION | +---------------------+-----------------+----------------+---------------------+--------------+----------------+-----------------------------------------------------------------------------------------+--------------------+---------------+-------------------+-----------------+------------------+-----------------+----------+---------------+---------------------+---------------------+----------+-----------------+---------+----------------------+----------------------+--------------------+ | TEST_PLSQL_ROUTINE2 | 0 | plsql_routine | TEST_PLSQL_ROUTINE2 | PROCEDURE | | CREATE OR REPLACE PROCEDURE test_plsql_routine2() BEGIN DECLARE a int = 1; print a; END | | NULL | | SQL | | | NULL | DEFINER | 2024-09-09 13:54:47 | 2024-09-09 13:54:47 | | | root | | | | | TEST_PLSQL_ROUTINE3 | 0 | plsql_routine | TEST_PLSQL_ROUTINE3 | PROCEDURE | | CREATE OR REPLACE PROCEDURE test_plsql_routine3() BEGIN DECLARE a int = 1; print a; END | | NULL | | SQL | | | NULL | DEFINER | 2024-09-09 13:54:47 | 2024-09-09 13:54:47 | | | root | | | | | TEST_PLSQL_ROUTINE4 | 0 | plsql_routine | TEST_PLSQL_ROUTINE4 | PROCEDURE | | CREATE OR REPLACE PROCEDURE test_plsql_routine4() BEGIN DECLARE a int = 1; print a; END | | NULL | | SQL | | | NULL | DEFINER | 2024-09-09 13:54:47 | 2024-09-09 13:54:47 | | | root | | | | | TEST_PLSQL_ROUTINE5 | 0 | plsql_routine | TEST_PLSQL_ROUTINE5 | PROCEDURE | | CREATE OR REPLACE PROCEDURE test_plsql_routine5() BEGIN DECLARE a int = 1; print a; END | | NULL | | SQL | | | NULL | DEFINER | 2024-09-09 13:54:47 | 2024-09-09 13:54:47 | | | root | | | | | TEST_PLSQL_ROUTINE1 | 0 | plsql_routine | TEST_PLSQL_ROUTINE1 | PROCEDURE | | CREATE OR REPLACE PROCEDURE test_plsql_routine1() BEGIN DECLARE a int = 1; print a; END | | NULL | | SQL | | | NULL | DEFINER | 2024-09-09 13:54:48 | 2024-09-09 13:54:48 | | | root | | | | +---------------------+-----------------+----------------+---------------------+--------------+----------------+-----------------------------------------------------------------------------------------+--------------------+---------------+-------------------+-----------------+------------------+------- mysql> select UPDATE_TIME from partitions; +---------------------+ | UPDATE_TIME | +---------------------+ | 2024-09-05 05:54:25 | | 2024-09-05 05:54:25 | | 2024-09-05 05:54:25 | | 2024-09-09 13:27:46 | | 2024-09-09 13:26:20 | | 2024-09-09 13:26:20 | | 2024-09-09 13:26:20 | | 2024-09-09 13:26:20 | | 2024-09-09 13:26:20 | | 2024-09-09 13:26:20 | | 2024-09-09 13:26:20 | | 2024-09-09 13:27:46 | | 2024-09-09 13:27:46 | mysql> select * from active_queries; +-----------------------------------+---------------------+---------------+-------------------+--------------------+-------------------+------------------+----------------+--------------+------------------------------+ | QUERY_ID | QUERY_START_TIME | QUERY_TIME_MS | WORKLOAD_GROUP_ID | DATABASE | FRONTEND_INSTANCE | QUEUE_START_TIME | QUEUE_END_TIME | QUERY_STATUS | SQL | +-----------------------------------+---------------------+---------------+-------------------+--------------------+-------------------+------------------+----------------+--------------+------------------------------+ | 8c51705601bf4403-929f1b15051af48e | 2024-09-09 14:06:15 | 19 | 1 | information_schema | 172.20.80.1 | | | | select * from active_queries | 
+-----------------------------------+---------------------+---------------+-------------------+--------------------+-------------------+------------------+----------------+--------------+------------------------------+ --- be/src/exec/schema_scanner.cpp | 11 ++++++++ .../schema_active_queries_scanner.cpp | 26 +++---------------- .../schema_partitions_scanner.cpp | 18 ++----------- .../schema_scanner/schema_routine_scanner.cpp | 15 ++--------- 4 files changed, 19 insertions(+), 51 deletions(-) diff --git a/be/src/exec/schema_scanner.cpp b/be/src/exec/schema_scanner.cpp index cce150670353d9..90140e748f5d6b 100644 --- a/be/src/exec/schema_scanner.cpp +++ b/be/src/exec/schema_scanner.cpp @@ -453,6 +453,17 @@ Status SchemaScanner::insert_block_column(TCell cell, int col_index, vectorized: break; } + case TYPE_DATETIME: { + std::vector datas(1); + VecDateTimeValue src[1]; + src[0].from_date_str(cell.stringVal.data(), cell.stringVal.size()); + datas[0] = src; + auto data = datas[0]; + reinterpret_cast*>(col_ptr)->insert_data( + reinterpret_cast(data), 0); + nullable_column->get_null_map_data().emplace_back(0); + break; + } default: { std::stringstream ss; ss << "unsupported column type:" << type; diff --git a/be/src/exec/schema_scanner/schema_active_queries_scanner.cpp b/be/src/exec/schema_scanner/schema_active_queries_scanner.cpp index 2b516fc6fdac2b..6aa6e758999fb0 100644 --- a/be/src/exec/schema_scanner/schema_active_queries_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_active_queries_scanner.cpp @@ -17,7 +17,6 @@ #include "exec/schema_scanner/schema_active_queries_scanner.h" -#include "exec/schema_scanner/schema_scanner_helper.h" #include "runtime/client_cache.h" #include "runtime/exec_env.h" #include "runtime/runtime_state.h" @@ -101,27 +100,10 @@ Status SchemaActiveQueriesScanner::_get_active_queries_block_from_fe() { for (int i = 0; i < result_data.size(); i++) { TRow row = result_data[i]; - - SchemaScannerHelper::insert_string_value(0, row.column_value[0].stringVal, - _active_query_block.get()); - SchemaScannerHelper::insert_string_value(1, row.column_value[1].stringVal, - _active_query_block.get()); - SchemaScannerHelper::insert_int_value(2, row.column_value[2].longVal, - _active_query_block.get()); - SchemaScannerHelper::insert_int_value(3, row.column_value[3].longVal, - _active_query_block.get()); - SchemaScannerHelper::insert_string_value(4, row.column_value[4].stringVal, - _active_query_block.get()); - SchemaScannerHelper::insert_string_value(5, row.column_value[5].stringVal, - _active_query_block.get()); - SchemaScannerHelper::insert_string_value(6, row.column_value[6].stringVal, - _active_query_block.get()); - SchemaScannerHelper::insert_string_value(7, row.column_value[7].stringVal, - _active_query_block.get()); - SchemaScannerHelper::insert_string_value(8, row.column_value[8].stringVal, - _active_query_block.get()); - SchemaScannerHelper::insert_string_value(9, row.column_value[9].stringVal, - _active_query_block.get()); + for (int j = 0; j < _s_tbls_columns.size(); j++) { + RETURN_IF_ERROR(insert_block_column(row.column_value[j], j, _active_query_block.get(), + _s_tbls_columns[j].type)); + } } return Status::OK(); } diff --git a/be/src/exec/schema_scanner/schema_partitions_scanner.cpp b/be/src/exec/schema_scanner/schema_partitions_scanner.cpp index 9f86fe6feb49d9..ebe2bd3b70ec0e 100644 --- a/be/src/exec/schema_scanner/schema_partitions_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_partitions_scanner.cpp @@ -22,7 +22,6 @@ #include #include 
"exec/schema_scanner/schema_helper.h" -#include "exec/schema_scanner/schema_scanner_helper.h" #include "runtime/client_cache.h" #include "runtime/exec_env.h" #include "runtime/runtime_state.h" @@ -147,22 +146,9 @@ Status SchemaPartitionsScanner::get_onedb_info_from_fe(int64_t dbId) { for (int i = 0; i < result_data.size(); i++) { TRow row = result_data[i]; - for (int j = 0; j < _s_tbls_columns.size(); j++) { - if ((_s_tbls_columns[j].type == TYPE_BIGINT) || _s_tbls_columns[j].type == TYPE_INT) { - SchemaScannerHelper::insert_int_value(j, row.column_value[j].longVal, - _partitions_block.get()); - } else if (_s_tbls_columns[j].type == TYPE_DATETIME) { - std::vector datas(1); - VecDateTimeValue src[1]; - src[0].from_date_str(row.column_value[j].stringVal.data(), - row.column_value[j].stringVal.size()); - datas[0] = src; - SchemaScannerHelper::insert_datetime_value(j, datas, _partitions_block.get()); - } else { - SchemaScannerHelper::insert_string_value(j, row.column_value[j].stringVal, - _partitions_block.get()); - } + RETURN_IF_ERROR(insert_block_column(row.column_value[j], j, _partitions_block.get(), + _s_tbls_columns[j].type)); } } return Status::OK(); diff --git a/be/src/exec/schema_scanner/schema_routine_scanner.cpp b/be/src/exec/schema_scanner/schema_routine_scanner.cpp index adb18450f26490..e8d95f0abd6d36 100644 --- a/be/src/exec/schema_scanner/schema_routine_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_routine_scanner.cpp @@ -17,7 +17,6 @@ #include "exec/schema_scanner/schema_routine_scanner.h" -#include "exec/schema_scanner/schema_scanner_helper.h" #include "runtime/client_cache.h" #include "runtime/exec_env.h" #include "runtime/runtime_state.h" @@ -103,19 +102,9 @@ Status SchemaRoutinesScanner::get_block_from_fe() { for (int i = 0; i < result_data.size(); i++) { TRow row = result_data[i]; - for (int j = 0; j < _s_tbls_columns.size(); j++) { - if (_s_tbls_columns[j].type == TYPE_DATETIME) { - std::vector datas(1); - VecDateTimeValue src[1]; - src[0].from_date_str(row.column_value[j].stringVal.data(), - row.column_value[j].stringVal.size()); - datas[0] = src; - SchemaScannerHelper::insert_datetime_value(j, datas, _routines_block.get()); - } else { - SchemaScannerHelper::insert_string_value(j, row.column_value[j].stringVal, - _routines_block.get()); - } + RETURN_IF_ERROR(insert_block_column(row.column_value[j], j, _routines_block.get(), + _s_tbls_columns[j].type)); } } return Status::OK(); From 140857fa7f4ec93065a9db7fc053769069972a19 Mon Sep 17 00:00:00 2001 From: Siyang Tang <82279870+TangSiyang2001@users.noreply.github.com> Date: Tue, 10 Sep 2024 14:22:35 +0800 Subject: [PATCH 23/44] [feature](compaction) Add an http action for visibility of compaction score on each tablet (#38489) ## Proposed changes As title. Usage: 1. `curl http://be_ip:be_host/api/compaction_score?top_n=10` Returns a json object contains compaction score for top n, n=top_n. ``` [ { "compaction_score": "5", "tablet_id": "42595" }, { "compaction_score": "5", "tablet_id": "42587" }, { "compaction_score": "5", "tablet_id": "42593" }, { "compaction_score": "5", "tablet_id": "42597" }, { "compaction_score": "5", "tablet_id": "42589" }, { "compaction_score": "5", "tablet_id": "42599" }, { "compaction_score": "5", "tablet_id": "42601" }, { "compaction_score": "5", "tablet_id": "42591" }, { "compaction_score": "5", "tablet_id": "42585" }, { "compaction_score": "4", "tablet_id": "10034" } ] ``` If top_n is not specified, return all compaction score for all tablets. If top_n is illegal, raise an error. 
``` invalid argument: top_n=wrong ``` 2. `curl http://be_ip:be_host/api/compaction_score?sync_meta=true` `sync_meta` is only available on cloud mode, will sync meta from meta service. It can cooperate with top_n. If add param `sync_meta` on non-cloud mode, will raise an error. ``` sync meta is only available for cloud mode ``` 3. In the future, this endpoint may extend other utility, like fetching tablet compaction score by table id, etc. --- .../http/action/compaction_score_action.cpp | 236 ++++++++++++++++++ be/src/http/action/compaction_score_action.h | 66 +++++ be/src/olap/base_tablet.cpp | 9 + be/src/olap/base_tablet.h | 4 + be/src/olap/tablet.cpp | 3 + be/src/service/http_service.cpp | 11 + .../test_compaction_score_action.groovy | 53 ++++ 7 files changed, 382 insertions(+) create mode 100644 be/src/http/action/compaction_score_action.cpp create mode 100644 be/src/http/action/compaction_score_action.h create mode 100644 regression-test/suites/compaction/test_compaction_score_action.groovy diff --git a/be/src/http/action/compaction_score_action.cpp b/be/src/http/action/compaction_score_action.cpp new file mode 100644 index 00000000000000..10b8cc6bdbab04 --- /dev/null +++ b/be/src/http/action/compaction_score_action.cpp @@ -0,0 +1,236 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#include "http/action/compaction_score_action.h" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cloud/cloud_storage_engine.h" +#include "cloud/cloud_tablet.h" +#include "cloud/cloud_tablet_mgr.h" +#include "cloud/config.h" +#include "common/status.h" +#include "http/http_channel.h" +#include "http/http_handler_with_auth.h" +#include "http/http_headers.h" +#include "http/http_request.h" +#include "http/http_status.h" +#include "olap/tablet_fwd.h" +#include "olap/tablet_manager.h" +#include "util/stopwatch.hpp" + +namespace doris { + +const std::string TOP_N = "top_n"; +const std::string SYNC_META = "sync_meta"; +const std::string COMPACTION_SCORE = "compaction_score"; +constexpr size_t DEFAULT_TOP_N = std::numeric_limits::max(); +constexpr bool DEFAULT_SYNC_META = false; +constexpr std::string_view TABLET_ID = "tablet_id"; + +template +concept CompactionScoreAccessble = requires(T t) { + { t.get_real_compaction_score() } -> std::same_as; +}; + +template +std::vector calculate_compaction_scores( + std::span> tablets) { + std::vector result; + result.reserve(tablets.size()); + std::ranges::transform(tablets, std::back_inserter(result), + [](const std::shared_ptr& tablet) -> CompactionScoreResult { + return {.tablet_id = tablet->tablet_id(), + .compaction_score = tablet->get_real_compaction_score()}; + }); + return result; +} + +struct LocalCompactionScoreAccessor final : CompactionScoresAccessor { + LocalCompactionScoreAccessor(TabletManager* tablet_mgr) : tablet_mgr(tablet_mgr) {} + + std::vector get_all_tablet_compaction_scores() override { + auto tablets = tablet_mgr->get_all_tablet(); + std::span s = {tablets.begin(), tablets.end()}; + return calculate_compaction_scores(s); + } + + TabletManager* tablet_mgr; +}; + +struct CloudCompactionScoresAccessor final : CompactionScoresAccessor { + CloudCompactionScoresAccessor(CloudTabletMgr& tablet_mgr) : tablet_mgr(tablet_mgr) {} + + std::vector get_all_tablet_compaction_scores() override { + auto tablets = get_all_tablets(); + std::span s = {tablets.begin(), tablets.end()}; + return calculate_compaction_scores(s); + } + + Status sync_meta() { + auto tablets = get_all_tablets(); + LOG(INFO) << "start to sync meta from ms"; + + MonotonicStopWatch stopwatch; + stopwatch.start(); + + for (const auto& tablet : tablets) { + RETURN_IF_ERROR(tablet->sync_meta()); + RETURN_IF_ERROR(tablet->sync_rowsets()); + } + + stopwatch.stop(); + LOG(INFO) << "sync meta finish, time=" << stopwatch.elapsed_time() << "ns"; + + return Status::OK(); + } + + std::vector get_all_tablets() { + auto weak_tablets = tablet_mgr.get_weak_tablets(); + std::vector tablets; + tablets.reserve(weak_tablets.size()); + for (auto& weak_tablet : weak_tablets) { + if (auto tablet = weak_tablet.lock(); + tablet != nullptr and tablet->tablet_state() == TABLET_RUNNING) { + tablets.push_back(std::move(tablet)); + } + } + return tablets; + } + + CloudTabletMgr& tablet_mgr; +}; + +static rapidjson::Value jsonfy_tablet_compaction_score( + const CompactionScoreResult& result, rapidjson::MemoryPoolAllocator<>& allocator) { + rapidjson::Value node; + node.SetObject(); + + rapidjson::Value tablet_id_key; + tablet_id_key.SetString(TABLET_ID.data(), TABLET_ID.length(), allocator); + rapidjson::Value tablet_id_val; + auto tablet_id_str = std::to_string(result.tablet_id); + tablet_id_val.SetString(tablet_id_str.c_str(), tablet_id_str.length(), 
allocator); + + rapidjson::Value score_key; + score_key.SetString(COMPACTION_SCORE.data(), COMPACTION_SCORE.size()); + rapidjson::Value score_val; + auto score_str = std::to_string(result.compaction_score); + score_val.SetString(score_str.c_str(), score_str.length(), allocator); + node.AddMember(score_key, score_val, allocator); + + node.AddMember(tablet_id_key, tablet_id_val, allocator); + return node; +} + +CompactionScoreAction::CompactionScoreAction(ExecEnv* exec_env, TPrivilegeHier::type hier, + TPrivilegeType::type type, TabletManager* tablet_mgr) + : HttpHandlerWithAuth(exec_env, hier, type), + _accessor(std::make_unique(tablet_mgr)) {} + +CompactionScoreAction::CompactionScoreAction(ExecEnv* exec_env, TPrivilegeHier::type hier, + TPrivilegeType::type type, CloudTabletMgr& tablet_mgr) + : HttpHandlerWithAuth(exec_env, hier, type), + _accessor(std::make_unique(tablet_mgr)) {} + +void CompactionScoreAction::handle(HttpRequest* req) { + req->add_output_header(HttpHeaders::CONTENT_TYPE, HttpHeaders::JsonType.data()); + auto top_n_param = req->param(TOP_N); + + size_t top_n = DEFAULT_TOP_N; + if (!top_n_param.empty()) { + try { + auto tmp_top_n = std::stoll(top_n_param); + if (tmp_top_n < 0) { + throw std::invalid_argument("`top_n` cannot less than 0"); + } + top_n = tmp_top_n; + } catch (const std::exception& e) { + LOG(WARNING) << "convert failed:" << e.what(); + auto msg = fmt::format("invalid argument: top_n={}", top_n_param); + HttpChannel::send_reply(req, HttpStatus::BAD_REQUEST, msg); + return; + } + } + + auto sync_meta_param = req->param(SYNC_META); + bool sync_meta = DEFAULT_SYNC_META; + if (!sync_meta_param.empty() and !config::is_cloud_mode()) { + HttpChannel::send_reply(req, HttpStatus::BAD_REQUEST, + "param `sync_meta` is only available for cloud mode"); + return; + } + if (sync_meta_param == "true") { + sync_meta = true; + } else if (sync_meta_param == "false") { + sync_meta = false; + } else if (!sync_meta_param.empty()) { + auto msg = fmt::format("invalid argument: sync_meta={}", sync_meta_param); + HttpChannel::send_reply(req, HttpStatus::BAD_REQUEST, msg); + return; + } + + std::string result; + if (auto st = _handle(top_n, sync_meta, &result); !st) { + HttpChannel::send_reply(req, HttpStatus::INTERNAL_SERVER_ERROR, st.to_json()); + return; + } + HttpChannel::send_reply(req, HttpStatus::OK, result); +} + +Status CompactionScoreAction::_handle(size_t top_n, bool sync_meta, std::string* result) { + if (sync_meta) { + DCHECK(config::is_cloud_mode()); + RETURN_IF_ERROR(static_cast(_accessor.get())->sync_meta()); + } + + auto scores = _accessor->get_all_tablet_compaction_scores(); + top_n = std::min(top_n, scores.size()); + std::partial_sort(scores.begin(), scores.begin() + top_n, scores.end(), std::greater<>()); + + rapidjson::Document root; + root.SetArray(); + auto& allocator = root.GetAllocator(); + std::for_each(scores.begin(), scores.begin() + top_n, [&](const auto& score) { + root.PushBack(jsonfy_tablet_compaction_score(score, allocator), allocator); + }); + rapidjson::StringBuffer str_buf; + rapidjson::PrettyWriter writer(str_buf); + root.Accept(writer); + *result = str_buf.GetString(); + return Status::OK(); +} + +} // namespace doris diff --git a/be/src/http/action/compaction_score_action.h b/be/src/http/action/compaction_score_action.h new file mode 100644 index 00000000000000..1c345a4ae24c65 --- /dev/null +++ b/be/src/http/action/compaction_score_action.h @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor 
license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include +#include +#include + +#include "cloud/cloud_tablet_mgr.h" +#include "common/status.h" +#include "http/http_handler_with_auth.h" +#include "http/http_request.h" +#include "olap/storage_engine.h" +#include "runtime/exec_env.h" +namespace doris { + +struct CompactionScoreResult { + int64_t tablet_id; + size_t compaction_score; +}; + +inline bool operator>(const CompactionScoreResult& lhs, const CompactionScoreResult& rhs) { + return lhs.compaction_score > rhs.compaction_score; +} + +struct CompactionScoresAccessor { + virtual ~CompactionScoresAccessor() = default; + + virtual std::vector get_all_tablet_compaction_scores() = 0; +}; + +// topn, sync +class CompactionScoreAction : public HttpHandlerWithAuth { +public: + explicit CompactionScoreAction(ExecEnv* exec_env, TPrivilegeHier::type hier, + TPrivilegeType::type type, TabletManager* tablet_mgr); + + explicit CompactionScoreAction(ExecEnv* exec_env, TPrivilegeHier::type hier, + TPrivilegeType::type type, CloudTabletMgr& tablet_mgr); + + void handle(HttpRequest* req) override; + +private: + Status _handle(size_t top_n, bool sync_meta, std::string* result); + + std::unique_ptr _accessor; +}; + +} // namespace doris diff --git a/be/src/olap/base_tablet.cpp b/be/src/olap/base_tablet.cpp index 143c1ad706bbe7..1fd3b785b9072f 100644 --- a/be/src/olap/base_tablet.cpp +++ b/be/src/olap/base_tablet.cpp @@ -29,6 +29,7 @@ #include "olap/rowid_conversion.h" #include "olap/rowset/beta_rowset.h" #include "olap/rowset/rowset.h" +#include "olap/rowset/rowset_fwd.h" #include "olap/rowset/rowset_reader.h" #include "olap/tablet_fwd.h" #include "olap/txn_manager.h" @@ -182,6 +183,14 @@ Status BaseTablet::update_by_least_common_schema(const TabletSchemaSPtr& update_ return Status::OK(); } +uint32_t BaseTablet::get_real_compaction_score() const { + const auto& rs_metas = _tablet_meta->all_rs_metas(); + return std::accumulate(rs_metas.begin(), rs_metas.end(), 0, + [](uint32_t score, const RowsetMetaSharedPtr& rs_meta) { + return score + rs_meta->get_compaction_score(); + }); +} + Status BaseTablet::capture_rs_readers_unlocked(const Versions& version_path, std::vector* rs_splits) const { DCHECK(rs_splits != nullptr && rs_splits->empty()); diff --git a/be/src/olap/base_tablet.h b/be/src/olap/base_tablet.h index cfaf536902e03e..943f815581809a 100644 --- a/be/src/olap/base_tablet.h +++ b/be/src/olap/base_tablet.h @@ -105,6 +105,10 @@ class BaseTablet { virtual size_t tablet_footprint() = 0; + // this method just return the compaction sum on each rowset + // note(tsy): we should unify the compaction score calculation finally + uint32_t get_real_compaction_score() const; + // MUST hold shared meta lock Status capture_rs_readers_unlocked(const Versions& version_path, std::vector* rs_splits) const; diff --git 
a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp index da7a4ec8a6e260..66278afdb666ee 100644 --- a/be/src/olap/tablet.cpp +++ b/be/src/olap/tablet.cpp @@ -1023,6 +1023,9 @@ uint32_t Tablet::calc_cold_data_compaction_score() const { uint32_t Tablet::_calc_cumulative_compaction_score( std::shared_ptr cumulative_compaction_policy) { + if (cumulative_compaction_policy == nullptr) [[unlikely]] { + return 0; + } #ifndef BE_TEST if (_cumulative_compaction_policy == nullptr || _cumulative_compaction_policy->name() != cumulative_compaction_policy->name()) { diff --git a/be/src/service/http_service.cpp b/be/src/service/http_service.cpp index 9f98a86bda4c98..f2c325bebc7806 100644 --- a/be/src/service/http_service.cpp +++ b/be/src/service/http_service.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -37,6 +38,7 @@ #include "http/action/checksum_action.h" #include "http/action/clear_cache_action.h" #include "http/action/compaction_action.h" +#include "http/action/compaction_score_action.h" #include "http/action/config_action.h" #include "http/action/debug_point_action.h" #include "http/action/download_action.h" @@ -381,6 +383,11 @@ void HttpService::register_local_handler(StorageEngine& engine) { new ShowNestedIndexFileAction(_env, TPrivilegeHier::GLOBAL, TPrivilegeType::ADMIN)); _ev_http_server->register_handler(HttpMethod::GET, "/api/show_nested_index_file", show_nested_index_file_action); + + CompactionScoreAction* compaction_score_action = _pool.add(new CompactionScoreAction( + _env, TPrivilegeHier::GLOBAL, TPrivilegeType::ADMIN, engine.tablet_manager())); + _ev_http_server->register_handler(HttpMethod::GET, "/api/compaction_score", + compaction_score_action); } void HttpService::register_cloud_handler(CloudStorageEngine& engine) { @@ -417,6 +424,10 @@ void HttpService::register_cloud_handler(CloudStorageEngine& engine) { new ShowNestedIndexFileAction(_env, TPrivilegeHier::GLOBAL, TPrivilegeType::ADMIN)); _ev_http_server->register_handler(HttpMethod::GET, "/api/show_nested_index_file", show_nested_index_file_action); + CompactionScoreAction* compaction_score_action = _pool.add(new CompactionScoreAction( + _env, TPrivilegeHier::GLOBAL, TPrivilegeType::ADMIN, engine.tablet_mgr())); + _ev_http_server->register_handler(HttpMethod::GET, "/api/compaction_score", + compaction_score_action); } // NOLINTEND(readability-function-size) diff --git a/regression-test/suites/compaction/test_compaction_score_action.groovy b/regression-test/suites/compaction/test_compaction_score_action.groovy new file mode 100644 index 00000000000000..9ab8743778fb10 --- /dev/null +++ b/regression-test/suites/compaction/test_compaction_score_action.groovy @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_compaction_score_action") { + def tableName = "test_compaction_score_action"; + + sql """ DROP TABLE IF EXISTS ${tableName} """ + sql """ + CREATE TABLE IF NOT EXISTS ${tableName} ( + id INT NOT NULL, + name STRING NOT NULL + ) DUPLICATE KEY (`id`) + PROPERTIES ("replication_num" = "1", "disable_auto_compaction" = "true"); + """ + for (i in 0..<30) { + sql """ INSERT INTO ${tableName} VALUES(1, "Vedal") """ + sql """ INSERT INTO ${tableName} VALUES(2, "Neuro") """ + sql """ INSERT INTO ${tableName} VALUES(3, "Evil") """ + } + + def backendId_to_backendIP = [:] + def backendId_to_backendHttpPort = [:] + getBackendIpHttpPort(backendId_to_backendIP, backendId_to_backendHttpPort); + + for (int i=0;i= 90) + } else { + def (code, text, err) = curl("GET",beHttpAddress+"/api/compaction_score?top_n=1") + def score_str = parseJson(text).get(0).get("compaction_score") + def score = Integer.parseInt(score_str) + assertTrue(score >= 90) + } + } +} From a98a137c8aa0d2f2854be3ac53c839c813df2a82 Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Tue, 10 Sep 2024 14:38:38 +0800 Subject: [PATCH 24/44] [Fix](merge-on-write) AbstractInsertExecutor should throw exception after running out of retry times (#40436) ## Proposed changes For cloud mow table, `AbstractInsertExecutor` will retry for insert stmt. But it forget to throw exception after running out of retry times, resulting in returnning OK status to user with the possibility that the insert stmt is in fact not executed successfully. --- .../insert/AbstractInsertExecutor.java | 11 ++- .../BaseExternalTableInsertExecutor.java | 1 + .../commands/insert/OlapInsertExecutor.java | 1 + .../cloud/test_cloud_mow_insert_timeout.out | 11 +++ .../test_cloud_mow_insert_timeout.groovy | 88 +++++++++++++++++++ 5 files changed, 110 insertions(+), 2 deletions(-) create mode 100644 regression-test/data/fault_injection_p0/cloud/test_cloud_mow_insert_timeout.out create mode 100644 regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_insert_timeout.groovy diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/insert/AbstractInsertExecutor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/insert/AbstractInsertExecutor.java index defcd6c6e997fa..cdf74f5e9aca3a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/insert/AbstractInsertExecutor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/insert/AbstractInsertExecutor.java @@ -94,6 +94,8 @@ public String getLabelName() { return labelName; } + public abstract long getTxnId(); + /** * begin transaction if necessary */ @@ -192,14 +194,19 @@ public void executeSingleInsert(StmtExecutor executor, long jobId) throws Except execImpl(executor, jobId); checkStrictModeAndFilterRatio(); int retryTimes = 0; - while (retryTimes < Config.mow_insert_into_commit_retry_times) { + while (true) { try { onComplete(); break; } catch (UserException e) { - LOG.warn("failed to commit txn", e); + LOG.warn("failed to commit txn, txnId={}, jobId={}, retryTimes={}", + getTxnId(), jobId, retryTimes, e); if (e.getErrorCode() == InternalErrorCode.DELETE_BITMAP_LOCK_ERR) { retryTimes++; + if (retryTimes >= Config.mow_insert_into_commit_retry_times) { + // should throw exception after running out of retry times + throw e; + } } else { throw e; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/insert/BaseExternalTableInsertExecutor.java 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/insert/BaseExternalTableInsertExecutor.java index 1c22b9bf56a846..a3aa33f96ab02c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/insert/BaseExternalTableInsertExecutor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/insert/BaseExternalTableInsertExecutor.java @@ -70,6 +70,7 @@ public BaseExternalTableInsertExecutor(ConnectContext ctx, ExternalTable table, } } + @Override public long getTxnId() { return txnId; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/insert/OlapInsertExecutor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/insert/OlapInsertExecutor.java index 1262829aa481da..b57ac3834958d6 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/insert/OlapInsertExecutor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/insert/OlapInsertExecutor.java @@ -85,6 +85,7 @@ public OlapInsertExecutor(ConnectContext ctx, Table table, super(ctx, table, labelName, planner, insertCtx, emptyInsert); } + @Override public long getTxnId() { return txnId; } diff --git a/regression-test/data/fault_injection_p0/cloud/test_cloud_mow_insert_timeout.out b/regression-test/data/fault_injection_p0/cloud/test_cloud_mow_insert_timeout.out new file mode 100644 index 00000000000000..ee71e1e449d57d --- /dev/null +++ b/regression-test/data/fault_injection_p0/cloud/test_cloud_mow_insert_timeout.out @@ -0,0 +1,11 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql -- +1 1 1 +2 2 2 +3 3 3 + +-- !sql -- +1 1 1 +2 2 2 +3 3 3 + diff --git a/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_insert_timeout.groovy b/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_insert_timeout.groovy new file mode 100644 index 00000000000000..23d92f31e5ad8e --- /dev/null +++ b/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_insert_timeout.groovy @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import org.junit.Assert +import java.util.concurrent.TimeUnit +import org.awaitility.Awaitility + +suite("test_cloud_mow_insert_timeout", "nonConcurrent") { + if (!isCloudMode()) { + return + } + + GetDebugPoint().clearDebugPointsForAllFEs() + GetDebugPoint().clearDebugPointsForAllBEs() + + def table1 = "test_cloud_mow_insert_timeout" + sql "DROP TABLE IF EXISTS ${table1} FORCE;" + sql """ CREATE TABLE IF NOT EXISTS ${table1} ( + `k1` int NOT NULL, + `c1` int, + `c2` int + )UNIQUE KEY(k1) + DISTRIBUTED BY HASH(k1) BUCKETS 1 + PROPERTIES ( + "enable_mow_light_delete" = "false", + "enable_unique_key_merge_on_write" = "true", + "disable_auto_compaction" = "true", + "replication_num" = "1"); """ + + sql "insert into ${table1} values(1,1,1);" + sql "insert into ${table1} values(2,2,2);" + sql "insert into ${table1} values(3,3,3);" + sql "sync;" + order_qt_sql "select * from ${table1};" + + def customFeConfig = [ + delete_bitmap_lock_expiration_seconds : 5, + calculate_delete_bitmap_task_timeout_seconds : 2, + mow_insert_into_commit_retry_times : 2 + ] + + setFeConfigTemporary(customFeConfig) { + try { + explain { + sql "delete from ${table1} where k1=2;" + contains "IS_PARTIAL_UPDATE: true" + } + + // block the calculation of delete bitmap on BE + GetDebugPoint().enableDebugPointForAllBEs("BaseTablet::update_delete_bitmap.enable_spin_wait", [token: "token1"]) + GetDebugPoint().enableDebugPointForAllBEs("BaseTablet::update_delete_bitmap.block", [wait_token: "token1"]) + + // should return error after running out of try times + test { + sql "delete from ${table1} where k1=2;" + exception "Failed to calculate delete bitmap. Timeout." + } + + test { + sql "insert into ${table1} values(4,4,4)" + exception "Failed to calculate delete bitmap. Timeout." + } + + order_qt_sql "select * from ${table1};" + + } catch(Exception e) { + logger.info(e.getMessage()) + throw e + } finally { + GetDebugPoint().clearDebugPointsForAllBEs() + } + sql "DROP TABLE IF EXISTS ${table1};" + } +} From 4949a9e7d005944a3e765e23281fc4f45b427fb9 Mon Sep 17 00:00:00 2001 From: Gavin Chou Date: Tue, 10 Sep 2024 14:44:18 +0800 Subject: [PATCH 25/44] [opt](vault) Do not use latest_fs() in vault mode (#40516) To prevent incorrect storage backends selected by loading data, e.g. empty vault id passed from FE, we should not use latest_fs() in vault mode. 
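For illustration, the selection rule boils down to the sketch below (stand-in types only; the real change is in `CloudStorageEngine::open()` and `sync_storage_vault()` in the diff that follows):

```cpp
// Minimal sketch of the backend-selection guard, with simplified stand-ins
// for CloudStorageEngine members; names mirror the diff below.
#include <memory>
#include <string>
#include <tuple>
#include <vector>

struct FileSystem {
    std::string id;
};
using FileSystemSPtr = std::shared_ptr<FileSystem>;

// Stand-in for looking a registered backend up by vault id.
FileSystemSPtr get_filesystem(const std::string& vault_id) {
    return std::make_shared<FileSystem>(FileSystem {vault_id});
}

struct EngineSketch {
    FileSystemSPtr latest_fs; // legacy default backend; left unset in vault mode

    void pick_default_backend(
            const std::vector<std::tuple<std::string, int>>& vault_infos,
            bool enable_storage_vault) {
        // Legacy (non-vault) mode keeps a "latest" default backend so loads
        // without an explicit vault id still work. In vault mode no default is
        // installed: every load must carry a valid vault id from FE, so an
        // empty or missing id fails fast instead of silently writing to the
        // last registered backend.
        if (!enable_storage_vault && !vault_infos.empty()) {
            latest_fs = get_filesystem(std::get<0>(vault_infos.back()));
        }
    }
};
```

With this guard, a load arriving without a vault id in vault mode can no longer fall back to whichever backend happened to be registered last.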
--- be/src/cloud/cloud_meta_mgr.cpp | 8 ++++++-- be/src/cloud/cloud_meta_mgr.h | 9 ++++++++- be/src/cloud/cloud_storage_engine.cpp | 14 ++++++++++---- cloud/src/meta-service/meta_service_resource.cpp | 2 ++ gensrc/proto/cloud.proto | 1 + 5 files changed, 27 insertions(+), 7 deletions(-) diff --git a/be/src/cloud/cloud_meta_mgr.cpp b/be/src/cloud/cloud_meta_mgr.cpp index 8138ea52421102..816f1108299cb8 100644 --- a/be/src/cloud/cloud_meta_mgr.cpp +++ b/be/src/cloud/cloud_meta_mgr.cpp @@ -906,7 +906,7 @@ Status CloudMetaMgr::precommit_txn(const StreamLoadContext& ctx) { return retry_rpc("precommit txn", req, &res, &MetaService_Stub::precommit_txn); } -Status CloudMetaMgr::get_storage_vault_info(StorageVaultInfos* vault_infos) { +Status CloudMetaMgr::get_storage_vault_info(StorageVaultInfos* vault_infos, bool* is_vault_mode) { GetObjStoreInfoRequest req; GetObjStoreInfoResponse resp; req.set_cloud_unique_id(config::cloud_unique_id); @@ -916,6 +916,8 @@ Status CloudMetaMgr::get_storage_vault_info(StorageVaultInfos* vault_infos) { return s; } + *is_vault_mode = resp.enable_storage_vault(); + auto add_obj_store = [&vault_infos](const auto& obj_store) { vault_infos->emplace_back(obj_store.id(), S3Conf::get_s3_conf(obj_store), StorageVaultPB_PathFormat {}); @@ -931,6 +933,7 @@ Status CloudMetaMgr::get_storage_vault_info(StorageVaultInfos* vault_infos) { } }); + // desensitization, hide secret for (int i = 0; i < resp.obj_info_size(); ++i) { resp.mutable_obj_info(i)->set_sk(resp.obj_info(i).sk().substr(0, 2) + "xxx"); } @@ -940,7 +943,8 @@ Status CloudMetaMgr::get_storage_vault_info(StorageVaultInfos* vault_infos) { j->mutable_obj_info()->set_sk(j->obj_info().sk().substr(0, 2) + "xxx"); } - LOG(INFO) << "get storage vault response: " << resp.ShortDebugString(); + LOG(INFO) << "get storage vault, enable_storage_vault=" << is_vault_mode + << " response=" << resp.ShortDebugString(); return Status::OK(); } diff --git a/be/src/cloud/cloud_meta_mgr.h b/be/src/cloud/cloud_meta_mgr.h index 6f6cc9c26b47b4..2f776b056866aa 100644 --- a/be/src/cloud/cloud_meta_mgr.h +++ b/be/src/cloud/cloud_meta_mgr.h @@ -73,7 +73,14 @@ class CloudMetaMgr { Status precommit_txn(const StreamLoadContext& ctx); - Status get_storage_vault_info(StorageVaultInfos* vault_infos); + /** + * Gets storage vault (storage backends) from meta-service + * + * @param vault_info output param, all storage backends + * @param is_vault_mode output param, true for pure vault mode, false for legacy mode + * @return status + */ + Status get_storage_vault_info(StorageVaultInfos* vault_infos, bool* is_vault_mode); Status prepare_tablet_job(const TabletJobInfoPB& job, StartTabletJobResponse* res); diff --git a/be/src/cloud/cloud_storage_engine.cpp b/be/src/cloud/cloud_storage_engine.cpp index 3e56c23d1d3e79..b98b2e3d0efc45 100644 --- a/be/src/cloud/cloud_storage_engine.cpp +++ b/be/src/cloud/cloud_storage_engine.cpp @@ -161,8 +161,9 @@ struct RefreshFSVaultVisitor { Status CloudStorageEngine::open() { cloud::StorageVaultInfos vault_infos; + bool enable_storage_vault = false; do { - auto st = _meta_mgr->get_storage_vault_info(&vault_infos); + auto st = _meta_mgr->get_storage_vault_info(&vault_infos, &enable_storage_vault); if (st.ok()) { break; } @@ -177,7 +178,11 @@ Status CloudStorageEngine::open() { return vault_process_error(id, vault_info, std::move(st)); } } - set_latest_fs(get_filesystem(std::get<0>(vault_infos.back()))); + + // vault mode should not support latest_fs to get rid of unexpected storage backends choosen + if (!enable_storage_vault) { 
+ set_latest_fs(get_filesystem(std::get<0>(vault_infos.back()))); + } // TODO(plat1ko): DeleteBitmapTxnManager @@ -340,7 +345,8 @@ void CloudStorageEngine::_check_file_cache_ttl_block_valid() { void CloudStorageEngine::sync_storage_vault() { cloud::StorageVaultInfos vault_infos; - auto st = _meta_mgr->get_storage_vault_info(&vault_infos); + bool enable_storage_vault = false; + auto st = _meta_mgr->get_storage_vault_info(&vault_infos, &enable_storage_vault); if (!st.ok()) { LOG(WARNING) << "failed to get storage vault info. err=" << st; return; @@ -363,7 +369,7 @@ void CloudStorageEngine::sync_storage_vault() { } if (auto& id = std::get<0>(vault_infos.back()); - latest_fs() == nullptr || latest_fs()->id() != id) { + (latest_fs() == nullptr || latest_fs()->id() != id) && !enable_storage_vault) { set_latest_fs(get_filesystem(id)); } } diff --git a/cloud/src/meta-service/meta_service_resource.cpp b/cloud/src/meta-service/meta_service_resource.cpp index 90a88f86006643..8a25a73771ccbd 100644 --- a/cloud/src/meta-service/meta_service_resource.cpp +++ b/cloud/src/meta-service/meta_service_resource.cpp @@ -254,6 +254,8 @@ void MetaServiceImpl::get_obj_store_info(google::protobuf::RpcController* contro } } + response->set_enable_storage_vault(instance.enable_storage_vault()); + // Iterate all the resources to return to the rpc caller if (!instance.resource_ids().empty()) { std::string storage_vault_start = storage_vault_key({instance.instance_id(), ""}); diff --git a/gensrc/proto/cloud.proto b/gensrc/proto/cloud.proto index b4c2d0d0968ae9..268744a0088f61 100644 --- a/gensrc/proto/cloud.proto +++ b/gensrc/proto/cloud.proto @@ -895,6 +895,7 @@ message GetObjStoreInfoResponse { repeated StorageVaultPB storage_vault = 3; optional string default_storage_vault_id = 4; optional string default_storage_vault_name = 5; + optional bool enable_storage_vault = 6; }; message CreateTabletsRequest { From 1d0690bf0f3ce9bf0211c9cee97391de8577a3da Mon Sep 17 00:00:00 2001 From: Pxl Date: Tue, 10 Sep 2024 15:20:20 +0800 Subject: [PATCH 26/44] [Enchancement](column) make create_always_true_column return column const (#39086) ## Proposed changes make create_always_true_column return column const --- be/src/vec/utils/util.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/be/src/vec/utils/util.hpp b/be/src/vec/utils/util.hpp index 3c5f4f194aac9b..8d17b2787a53da 100644 --- a/be/src/vec/utils/util.hpp +++ b/be/src/vec/utils/util.hpp @@ -173,12 +173,12 @@ inline std::string remove_suffix(const std::string& name, const std::string& suf }; inline ColumnPtr create_always_true_column(size_t size, bool is_nullable) { - auto res_data_column = ColumnUInt8::create(size, 1); + ColumnPtr res_data_column = ColumnUInt8::create(1, 1); if (is_nullable) { - auto null_map = ColumnVector::create(size, 0); - return ColumnNullable::create(std::move(res_data_column), std::move(null_map)); + auto null_map = ColumnVector::create(1, 0); + res_data_column = ColumnNullable::create(res_data_column, std::move(null_map)); } - return res_data_column; + return ColumnConst::create(std::move(res_data_column), size); } // change null element to true element From 7fc42a5fa4fd9bcc2259fde6e0d1ba9c49efec6f Mon Sep 17 00:00:00 2001 From: morrySnow <101034200+morrySnow@users.noreply.github.com> Date: Tue, 10 Sep 2024 15:46:59 +0800 Subject: [PATCH 27/44] [fix](Nereids) handle continuous filter or project in plan (#40176) if we meet continuous project or filter in translator, we try to generate SelectNode as far as possible to avoid 
generate invalid plan for example ``` Filter(conjuncts 1) +-- Limit (limit 10) +-- Filter(conjuncts 2) +-- Aggregate ``` will be translated to ``` SELECT_NODE (conjuncts 1) +-- AGGREGATE_NODE (conjuncts 2) (limit 10) ``` --- .../translator/PhysicalPlanTranslator.java | 44 ++++++++++++++----- 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java index 2a34bc3ca91dd2..28456041f7d3bb 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java @@ -720,7 +720,6 @@ public PlanFragment visitPhysicalJdbcScan(PhysicalJdbcScan jdbcScan, PlanTransla JdbcScanNode jdbcScanNode = new JdbcScanNode(context.nextPlanNodeId(), tupleDescriptor, table instanceof JdbcExternalTable); jdbcScanNode.setNereidsId(jdbcScan.getId()); - jdbcScanNode.addConjuncts(translateToLegacyConjuncts(jdbcScan.getConjuncts())); Utils.execWithUncheckedException(jdbcScanNode::init); context.addScanNode(jdbcScanNode, jdbcScan); context.getRuntimeTranslator().ifPresent( @@ -744,7 +743,6 @@ public PlanFragment visitPhysicalOdbcScan(PhysicalOdbcScan odbcScan, PlanTransla OdbcScanNode odbcScanNode = new OdbcScanNode(context.nextPlanNodeId(), tupleDescriptor, (OdbcTable) table); odbcScanNode.setNereidsId(odbcScan.getId()); - odbcScanNode.addConjuncts(translateToLegacyConjuncts(odbcScan.getConjuncts())); Utils.execWithUncheckedException(odbcScanNode::init); context.addScanNode(odbcScanNode, odbcScan); context.getRuntimeTranslator().ifPresent( @@ -1258,6 +1256,12 @@ public PlanFragment visitPhysicalFilter(PhysicalFilter filter, P MultiCastDataSink multiCastDataSink = (MultiCastDataSink) inputFragment.getSink(); DataStreamSink dataStreamSink = multiCastDataSink.getDataStreamSinks().get( multiCastDataSink.getDataStreamSinks().size() - 1); + if (CollectionUtils.isNotEmpty(dataStreamSink.getConjuncts()) + || CollectionUtils.isNotEmpty(dataStreamSink.getProjections())) { + String errMsg = "generate invalid plan \n" + filter.treeString(); + LOG.warn(errMsg); + throw new AnalysisException(errMsg); + } filter.getConjuncts().stream() .map(e -> ExpressionTranslator.translate(e, context)) .forEach(dataStreamSink::addConjunct); @@ -1265,24 +1269,28 @@ public PlanFragment visitPhysicalFilter(PhysicalFilter filter, P } PlanNode planNode = inputFragment.getPlanRoot(); - Plan child = filter.child(); - while (child instanceof PhysicalLimit) { - child = ((PhysicalLimit) child).child(); - } - if (planNode instanceof ExchangeNode || planNode instanceof SortNode || planNode instanceof UnionNode - // this means we have filter->limit->project, need a SelectNode - || child instanceof PhysicalProject) { - // the three nodes don't support conjuncts, need create a SelectNode to filter data + // the three nodes don't support conjuncts, need create a SelectNode to filter data + if (planNode instanceof ExchangeNode || planNode instanceof SortNode || planNode instanceof UnionNode) { SelectNode selectNode = new SelectNode(context.nextPlanNodeId(), planNode); selectNode.setNereidsId(filter.getId()); addConjunctsToPlanNode(filter, selectNode, context); addPlanRoot(inputFragment, selectNode, filter); } else { if (!(filter.child(0) instanceof AbstractPhysicalJoin)) { + // already have filter on this node, we should not override it, so 
need a new node + if (!planNode.getConjuncts().isEmpty() + // already have project on this node, filter need execute after project, so need a new node + || CollectionUtils.isNotEmpty(planNode.getProjectList()) + // already have limit on this node, filter need execute after limit, so need a new node + || planNode.hasLimit()) { + planNode = new SelectNode(context.nextPlanNodeId(), planNode); + planNode.setNereidsId(filter.getId()); + addPlanRoot(inputFragment, planNode, filter); + } addConjunctsToPlanNode(filter, planNode, context); - updateLegacyPlanIdToPhysicalPlan(inputFragment.getPlanRoot(), filter); } } + updateLegacyPlanIdToPhysicalPlan(inputFragment.getPlanRoot(), filter); // in ut, filter.stats may be null if (filter.getStats() != null) { inputFragment.getPlanRoot().setCardinalityAfterFilter((long) filter.getStats().getRowCount()); @@ -1866,8 +1874,15 @@ public PlanFragment visitPhysicalProject(PhysicalProject project } PlanFragment inputFragment = project.child(0).accept(this, context); - PlanNode inputPlanNode = inputFragment.getPlanRoot(); + // this means already have project on this node, filter need execute after project, so need a new node + if (CollectionUtils.isNotEmpty(inputPlanNode.getProjectList())) { + SelectNode selectNode = new SelectNode(context.nextPlanNodeId(), inputPlanNode); + selectNode.setNereidsId(project.getId()); + addPlanRoot(inputFragment, selectNode, project); + inputPlanNode = selectNode; + } + List projectionExprs = null; List allProjectionExprs = Lists.newArrayList(); List slots = null; @@ -1905,6 +1920,11 @@ public PlanFragment visitPhysicalProject(PhysicalProject project MultiCastDataSink multiCastDataSink = (MultiCastDataSink) inputFragment.getSink(); DataStreamSink dataStreamSink = multiCastDataSink.getDataStreamSinks().get( multiCastDataSink.getDataStreamSinks().size() - 1); + if (CollectionUtils.isNotEmpty(dataStreamSink.getProjections())) { + String errMsg = "generate invalid plan \n" + project.treeString(); + LOG.warn(errMsg); + throw new AnalysisException(errMsg); + } TupleDescriptor projectionTuple = generateTupleDesc(slots, null, context); dataStreamSink.setProjections(projectionExprs); dataStreamSink.setOutputTupleDesc(projectionTuple); From 24af729933af8afb9a51884bbcc65324700bd3a9 Mon Sep 17 00:00:00 2001 From: caiconghui <55968745+caiconghui@users.noreply.github.com> Date: Tue, 10 Sep 2024 15:59:53 +0800 Subject: [PATCH 28/44] [fix](log) fix fe doesn't print location in log when log mode is async (#40513) Co-authored-by: caiconghui1 --- .../org/apache/doris/common/Log4jConfig.java | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/Log4jConfig.java b/fe/fe-core/src/main/java/org/apache/doris/common/Log4jConfig.java index 206d1cb208959b..39d13b0e989727 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/Log4jConfig.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/Log4jConfig.java @@ -252,13 +252,18 @@ private static void reconfig() throws IOException { } newXmlConfTemplate = newXmlConfTemplate.replaceAll(VERBOSE_MODULE_PLACEHOLDER, sb.toString()); - if (sysLogMode.equalsIgnoreCase("NORMAL")) { + // BRIEF: async, no location + // ASYNC: async, with location + // NORMAL: sync, with location + boolean includeLocation = !sysLogMode.equalsIgnoreCase("BRIEF"); + boolean immediateFlush = sysLogMode.equalsIgnoreCase("NORMAL"); + if (includeLocation) { newXmlConfTemplate = newXmlConfTemplate.replaceAll(RUNTIME_LOG_FORMAT_PLACEHOLDER, " 
[%C{1}.%M():%L] "); } else { newXmlConfTemplate = newXmlConfTemplate.replaceAll(RUNTIME_LOG_FORMAT_PLACEHOLDER, " "); - if (sysLogMode.equalsIgnoreCase("ASYNC")) { - newXmlConfTemplate = newXmlConfTemplate.replaceAll("Root", "AsyncRoot"); - } + } + if (!immediateFlush) { + newXmlConfTemplate = newXmlConfTemplate.replaceAll("Root", "AsyncRoot"); } if (Config.enable_file_logger) { @@ -298,11 +303,6 @@ private static void reconfig() throws IOException { properties.put("warn_sys_accumulated_file_size", String.valueOf(Config.warn_sys_accumulated_file_size)); properties.put("audit_sys_accumulated_file_size", String.valueOf(Config.audit_sys_accumulated_file_size)); - // BRIEF: async, no location - // ASYNC: async, with location - // NORMAL: sync, with location - boolean includeLocation = !sysLogMode.equalsIgnoreCase("BRIEF"); - boolean immediateFlush = sysLogMode.equalsIgnoreCase("NORMAL"); properties.put("include_location_flag", Boolean.toString(includeLocation)); properties.put("immediate_flush_flag", Boolean.toString(immediateFlush)); properties.put("audit_file_postfix", compressAuditLog ? ".gz" : ""); From baefde940f13f63c141bf58d414f0a0ab8717621 Mon Sep 17 00:00:00 2001 From: caiconghui <55968745+caiconghui@users.noreply.github.com> Date: Tue, 10 Sep 2024 16:00:34 +0800 Subject: [PATCH 29/44] [fix](audit_loader) fix that old external audit loader plugin not work because of incompatibility with new audit plugin (#40565) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1、rename org.apache.doris.plugin.audit.AuditLoaderPlugin to org.apache.doris.plugin.audit.AuditLoader to avoid potential conflict with old external audit plugin 2、rename org.apache.doris.plugin.audit.AuditEvent to org.apache.doris.plugin.AuditEvent to keep eventFilter in AuditPlugin be compatible with old external audit plugin Co-authored-by: caiconghui1 --- .../java/org/apache/doris/catalog/InternalSchema.java | 4 ++-- .../doris/catalog/InternalSchemaInitializer.java | 10 +++++----- .../org/apache/doris/load/StreamLoadRecordMgr.java | 4 ++-- .../java/org/apache/doris/load/loadv2/BulkLoadJob.java | 2 +- .../apache/doris/plugin/{audit => }/AuditEvent.java | 2 +- .../main/java/org/apache/doris/plugin/AuditPlugin.java | 2 -- .../main/java/org/apache/doris/plugin/PluginMgr.java | 4 ++-- .../audit/{AuditLoaderPlugin.java => AuditLoader.java} | 9 +++++---- .../org/apache/doris/plugin/audit/AuditLogBuilder.java | 5 +++-- .../apache/doris/plugin/audit/AuditStreamLoader.java | 2 +- .../org/apache/doris/plugin/audit/LoadAuditEvent.java | 2 ++ .../doris/plugin/audit/StreamLoadAuditEvent.java | 2 ++ .../java/org/apache/doris/qe/AuditEventProcessor.java | 2 +- .../main/java/org/apache/doris/qe/AuditLogHelper.java | 4 ++-- .../main/java/org/apache/doris/qe/ConnectContext.java | 2 +- .../workloadschedpolicy/WorkloadRuntimeStatusMgr.java | 2 +- .../apache/doris/alter/InternalSchemaAlterTest.java | 9 +++++---- .../org/apache/doris/qe/AuditEventProcessorTest.java | 4 ++-- 18 files changed, 38 insertions(+), 33 deletions(-) rename fe/fe-core/src/main/java/org/apache/doris/plugin/{audit => }/AuditEvent.java (99%) rename fe/fe-core/src/main/java/org/apache/doris/plugin/audit/{AuditLoaderPlugin.java => AuditLoader.java} (98%) diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/InternalSchema.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/InternalSchema.java index 15bf65c3c73728..768ae22d202dc4 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/InternalSchema.java +++ 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/InternalSchema.java @@ -21,7 +21,7 @@ import org.apache.doris.analysis.ColumnNullableType; import org.apache.doris.analysis.TypeDef; import org.apache.doris.common.UserException; -import org.apache.doris.plugin.audit.AuditLoaderPlugin; +import org.apache.doris.plugin.audit.AuditLoader; import org.apache.doris.statistics.StatisticConstants; import com.google.common.collect.Lists; @@ -168,7 +168,7 @@ public static List getCopiedSchema(String tblName) throws UserExcepti case StatisticConstants.HISTOGRAM_TBL_NAME: schema = HISTO_STATS_SCHEMA; break; - case AuditLoaderPlugin.AUDIT_LOG_TABLE: + case AuditLoader.AUDIT_LOG_TABLE: schema = AUDIT_SCHEMA; break; default: diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/InternalSchemaInitializer.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/InternalSchemaInitializer.java index 87e8a0fc3b0ce8..c038414fc65bc6 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/InternalSchemaInitializer.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/InternalSchemaInitializer.java @@ -38,7 +38,7 @@ import org.apache.doris.common.util.PropertyAnalyzer; import org.apache.doris.datasource.InternalCatalog; import org.apache.doris.ha.FrontendNodeType; -import org.apache.doris.plugin.audit.AuditLoaderPlugin; +import org.apache.doris.plugin.audit.AuditLoader; import org.apache.doris.statistics.StatisticConstants; import org.apache.doris.statistics.util.StatisticsUtil; @@ -98,7 +98,7 @@ public void run() { Database database = op.get(); modifyTblReplicaCount(database, StatisticConstants.TABLE_STATISTIC_TBL_NAME); modifyTblReplicaCount(database, StatisticConstants.PARTITION_STATISTIC_TBL_NAME); - modifyTblReplicaCount(database, AuditLoaderPlugin.AUDIT_LOG_TABLE); + modifyTblReplicaCount(database, AuditLoader.AUDIT_LOG_TABLE); } @VisibleForTesting @@ -215,7 +215,7 @@ private static CreateTableStmt buildStatisticsTblStmt(String statsTableName, Lis private static CreateTableStmt buildAuditTblStmt() throws UserException { TableName tableName = new TableName("", - FeConstants.INTERNAL_DB_NAME, AuditLoaderPlugin.AUDIT_LOG_TABLE); + FeConstants.INTERNAL_DB_NAME, AuditLoader.AUDIT_LOG_TABLE); String engineName = "olap"; ArrayList dupKeys = Lists.newArrayList("query_id", "time", "client_ip"); @@ -244,7 +244,7 @@ private static CreateTableStmt buildAuditTblStmt() throws UserException { PropertyAnalyzer.getInstance().rewriteForceProperties(properties); CreateTableStmt createTableStmt = new CreateTableStmt(true, false, - tableName, InternalSchema.getCopiedSchema(AuditLoaderPlugin.AUDIT_LOG_TABLE), + tableName, InternalSchema.getCopiedSchema(AuditLoader.AUDIT_LOG_TABLE), engineName, keysDesc, partitionDesc, distributionDesc, properties, null, "Doris internal audit table, DO NOT MODIFY IT", null); StatisticsUtil.analyze(createTableStmt); @@ -286,7 +286,7 @@ private boolean created() { } // 3. 
check audit table - optionalStatsTbl = db.getTable(AuditLoaderPlugin.AUDIT_LOG_TABLE); + optionalStatsTbl = db.getTable(AuditLoader.AUDIT_LOG_TABLE); return optionalStatsTbl.isPresent(); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/StreamLoadRecordMgr.java b/fe/fe-core/src/main/java/org/apache/doris/load/StreamLoadRecordMgr.java index 7f1d33bd649736..3a38641036fb5f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/StreamLoadRecordMgr.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/StreamLoadRecordMgr.java @@ -30,8 +30,8 @@ import org.apache.doris.datasource.InternalCatalog; import org.apache.doris.mysql.privilege.PrivPredicate; import org.apache.doris.persist.gson.GsonUtils; -import org.apache.doris.plugin.audit.AuditEvent; -import org.apache.doris.plugin.audit.AuditEvent.EventType; +import org.apache.doris.plugin.AuditEvent; +import org.apache.doris.plugin.AuditEvent.EventType; import org.apache.doris.plugin.audit.StreamLoadAuditEvent; import org.apache.doris.qe.ConnectContext; import org.apache.doris.system.Backend; diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/BulkLoadJob.java b/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/BulkLoadJob.java index 7b78efc7a50cd5..12aa673eabf16f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/BulkLoadJob.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/BulkLoadJob.java @@ -46,7 +46,7 @@ import org.apache.doris.load.EtlJobType; import org.apache.doris.load.FailMsg; import org.apache.doris.persist.gson.GsonPostProcessable; -import org.apache.doris.plugin.audit.AuditEvent; +import org.apache.doris.plugin.AuditEvent; import org.apache.doris.plugin.audit.LoadAuditEvent; import org.apache.doris.qe.ConnectContext; import org.apache.doris.qe.OriginStatement; diff --git a/fe/fe-core/src/main/java/org/apache/doris/plugin/audit/AuditEvent.java b/fe/fe-core/src/main/java/org/apache/doris/plugin/AuditEvent.java similarity index 99% rename from fe/fe-core/src/main/java/org/apache/doris/plugin/audit/AuditEvent.java rename to fe/fe-core/src/main/java/org/apache/doris/plugin/AuditEvent.java index 0b64a748a10471..55a8b00d2e8d13 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/plugin/audit/AuditEvent.java +++ b/fe/fe-core/src/main/java/org/apache/doris/plugin/AuditEvent.java @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -package org.apache.doris.plugin.audit; +package org.apache.doris.plugin; import java.lang.annotation.Retention; diff --git a/fe/fe-core/src/main/java/org/apache/doris/plugin/AuditPlugin.java b/fe/fe-core/src/main/java/org/apache/doris/plugin/AuditPlugin.java index 55962a3dd10c85..d9c9ec84697ca4 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/plugin/AuditPlugin.java +++ b/fe/fe-core/src/main/java/org/apache/doris/plugin/AuditPlugin.java @@ -17,8 +17,6 @@ package org.apache.doris.plugin; -import org.apache.doris.plugin.audit.AuditEvent; - /** * Audit plugin interface describe. 
*/ diff --git a/fe/fe-core/src/main/java/org/apache/doris/plugin/PluginMgr.java b/fe/fe-core/src/main/java/org/apache/doris/plugin/PluginMgr.java index 7fddf54e1ee7d2..ea69b247e66427 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/plugin/PluginMgr.java +++ b/fe/fe-core/src/main/java/org/apache/doris/plugin/PluginMgr.java @@ -27,7 +27,7 @@ import org.apache.doris.nereids.parser.Dialect; import org.apache.doris.plugin.PluginInfo.PluginType; import org.apache.doris.plugin.PluginLoader.PluginStatus; -import org.apache.doris.plugin.audit.AuditLoaderPlugin; +import org.apache.doris.plugin.audit.AuditLoader; import org.apache.doris.plugin.audit.AuditLogBuilder; import org.apache.doris.plugin.dialect.HttpDialectConverterPlugin; @@ -113,7 +113,7 @@ private void initBuiltinPlugins() { } // AuditLoader: log audit log to internal table - AuditLoaderPlugin auditLoaderPlugin = new AuditLoaderPlugin(); + AuditLoader auditLoaderPlugin = new AuditLoader(); if (!registerBuiltinPlugin(auditLoaderPlugin.getPluginInfo(), auditLoaderPlugin)) { LOG.warn("failed to register audit log builder"); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/plugin/audit/AuditLoaderPlugin.java b/fe/fe-core/src/main/java/org/apache/doris/plugin/audit/AuditLoader.java similarity index 98% rename from fe/fe-core/src/main/java/org/apache/doris/plugin/audit/AuditLoaderPlugin.java rename to fe/fe-core/src/main/java/org/apache/doris/plugin/audit/AuditLoader.java index 4503b5b8802790..27193856937d87 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/plugin/audit/AuditLoaderPlugin.java +++ b/fe/fe-core/src/main/java/org/apache/doris/plugin/audit/AuditLoader.java @@ -20,6 +20,7 @@ import org.apache.doris.catalog.Env; import org.apache.doris.common.util.DigitalVersion; import org.apache.doris.common.util.TimeUtils; +import org.apache.doris.plugin.AuditEvent; import org.apache.doris.plugin.AuditPlugin; import org.apache.doris.plugin.Plugin; import org.apache.doris.plugin.PluginContext; @@ -42,8 +43,8 @@ /* * This plugin will load audit log to specified doris table at specified interval */ -public class AuditLoaderPlugin extends Plugin implements AuditPlugin { - private static final Logger LOG = LogManager.getLogger(AuditLoaderPlugin.class); +public class AuditLoader extends Plugin implements AuditPlugin { + private static final Logger LOG = LogManager.getLogger(AuditLoader.class); public static final String AUDIT_LOG_TABLE = "audit_log"; @@ -65,10 +66,10 @@ public class AuditLoaderPlugin extends Plugin implements AuditPlugin { private final PluginInfo pluginInfo; - public AuditLoaderPlugin() { + public AuditLoader() { pluginInfo = new PluginInfo(PluginMgr.BUILTIN_PLUGIN_PREFIX + "AuditLoader", PluginType.AUDIT, "builtin audit loader, to load audit log to internal table", DigitalVersion.fromString("2.1.0"), - DigitalVersion.fromString("1.8.31"), AuditLoaderPlugin.class.getName(), null, null); + DigitalVersion.fromString("1.8.31"), AuditLoader.class.getName(), null, null); } public PluginInfo getPluginInfo() { diff --git a/fe/fe-core/src/main/java/org/apache/doris/plugin/audit/AuditLogBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/plugin/audit/AuditLogBuilder.java index 210081b101cd93..8d9e2c9d96efbc 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/plugin/audit/AuditLogBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/plugin/audit/AuditLogBuilder.java @@ -20,13 +20,14 @@ import org.apache.doris.common.AuditLog; import org.apache.doris.common.Config; import 
org.apache.doris.common.util.DigitalVersion; +import org.apache.doris.plugin.AuditEvent; +import org.apache.doris.plugin.AuditEvent.AuditField; +import org.apache.doris.plugin.AuditEvent.EventType; import org.apache.doris.plugin.AuditPlugin; import org.apache.doris.plugin.Plugin; import org.apache.doris.plugin.PluginInfo; import org.apache.doris.plugin.PluginInfo.PluginType; import org.apache.doris.plugin.PluginMgr; -import org.apache.doris.plugin.audit.AuditEvent.AuditField; -import org.apache.doris.plugin.audit.AuditEvent.EventType; import com.google.common.collect.Maps; import com.google.common.collect.Sets; diff --git a/fe/fe-core/src/main/java/org/apache/doris/plugin/audit/AuditStreamLoader.java b/fe/fe-core/src/main/java/org/apache/doris/plugin/audit/AuditStreamLoader.java index 3765872810d413..0b70e9591d509d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/plugin/audit/AuditStreamLoader.java +++ b/fe/fe-core/src/main/java/org/apache/doris/plugin/audit/AuditStreamLoader.java @@ -46,7 +46,7 @@ public class AuditStreamLoader { public AuditStreamLoader() { this.hostPort = "127.0.0.1:" + Config.http_port; this.db = FeConstants.INTERNAL_DB_NAME; - this.auditLogTbl = AuditLoaderPlugin.AUDIT_LOG_TABLE; + this.auditLogTbl = AuditLoader.AUDIT_LOG_TABLE; this.auditLogLoadUrlStr = String.format(loadUrlPattern, hostPort, db, auditLogTbl); // currently, FE identity is FE's IP, so we replace the "." in IP to make it suitable for label this.feIdentity = hostPort.replaceAll("\\.", "_").replaceAll(":", "_"); diff --git a/fe/fe-core/src/main/java/org/apache/doris/plugin/audit/LoadAuditEvent.java b/fe/fe-core/src/main/java/org/apache/doris/plugin/audit/LoadAuditEvent.java index eb3e098bf416d8..e9e948df43fdcd 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/plugin/audit/LoadAuditEvent.java +++ b/fe/fe-core/src/main/java/org/apache/doris/plugin/audit/LoadAuditEvent.java @@ -17,6 +17,8 @@ package org.apache.doris.plugin.audit; +import org.apache.doris.plugin.AuditEvent; + public class LoadAuditEvent extends AuditEvent { @AuditField(value = "JobId") diff --git a/fe/fe-core/src/main/java/org/apache/doris/plugin/audit/StreamLoadAuditEvent.java b/fe/fe-core/src/main/java/org/apache/doris/plugin/audit/StreamLoadAuditEvent.java index 8733a59656c228..4a20901673a452 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/plugin/audit/StreamLoadAuditEvent.java +++ b/fe/fe-core/src/main/java/org/apache/doris/plugin/audit/StreamLoadAuditEvent.java @@ -17,6 +17,8 @@ package org.apache.doris.plugin.audit; +import org.apache.doris.plugin.AuditEvent; + public class StreamLoadAuditEvent extends AuditEvent { @AuditField(value = "Label") diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/AuditEventProcessor.java b/fe/fe-core/src/main/java/org/apache/doris/qe/AuditEventProcessor.java index 12e174ab5d0f18..5cb826dc86c990 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/AuditEventProcessor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/AuditEventProcessor.java @@ -18,11 +18,11 @@ package org.apache.doris.qe; import org.apache.doris.common.Config; +import org.apache.doris.plugin.AuditEvent; import org.apache.doris.plugin.AuditPlugin; import org.apache.doris.plugin.Plugin; import org.apache.doris.plugin.PluginInfo.PluginType; import org.apache.doris.plugin.PluginMgr; -import org.apache.doris.plugin.audit.AuditEvent; import com.google.common.base.Strings; import com.google.common.collect.Queues; diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/AuditLogHelper.java 
b/fe/fe-core/src/main/java/org/apache/doris/qe/AuditLogHelper.java index 7d14586bbeaf3a..904910822b8a9f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/AuditLogHelper.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/AuditLogHelper.java @@ -39,8 +39,8 @@ import org.apache.doris.nereids.trees.plans.logical.LogicalInlineTable; import org.apache.doris.nereids.trees.plans.logical.LogicalPlan; import org.apache.doris.nereids.trees.plans.logical.LogicalUnion; -import org.apache.doris.plugin.audit.AuditEvent.AuditEventBuilder; -import org.apache.doris.plugin.audit.AuditEvent.EventType; +import org.apache.doris.plugin.AuditEvent.AuditEventBuilder; +import org.apache.doris.plugin.AuditEvent.EventType; import org.apache.doris.qe.QueryState.MysqlStateType; import org.apache.doris.service.FrontendOptions; diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/ConnectContext.java b/fe/fe-core/src/main/java/org/apache/doris/qe/ConnectContext.java index 1b70c5b318bd10..ff960439f7668f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/ConnectContext.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/ConnectContext.java @@ -58,7 +58,7 @@ import org.apache.doris.nereids.trees.expressions.literal.Literal; import org.apache.doris.plsql.Exec; import org.apache.doris.plsql.executor.PlSqlOperation; -import org.apache.doris.plugin.audit.AuditEvent.AuditEventBuilder; +import org.apache.doris.plugin.AuditEvent.AuditEventBuilder; import org.apache.doris.resource.Tag; import org.apache.doris.service.arrowflight.results.FlightSqlChannel; import org.apache.doris.statistics.ColumnStatistic; diff --git a/fe/fe-core/src/main/java/org/apache/doris/resource/workloadschedpolicy/WorkloadRuntimeStatusMgr.java b/fe/fe-core/src/main/java/org/apache/doris/resource/workloadschedpolicy/WorkloadRuntimeStatusMgr.java index b2de010b9e418f..695bf983dc6b2f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/resource/workloadschedpolicy/WorkloadRuntimeStatusMgr.java +++ b/fe/fe-core/src/main/java/org/apache/doris/resource/workloadschedpolicy/WorkloadRuntimeStatusMgr.java @@ -21,7 +21,7 @@ import org.apache.doris.common.Config; import org.apache.doris.common.Pair; import org.apache.doris.common.util.MasterDaemon; -import org.apache.doris.plugin.audit.AuditEvent; +import org.apache.doris.plugin.AuditEvent; import org.apache.doris.thrift.TQueryStatistics; import org.apache.doris.thrift.TReportWorkloadRuntimeStatusParams; diff --git a/fe/fe-core/src/test/java/org/apache/doris/alter/InternalSchemaAlterTest.java b/fe/fe-core/src/test/java/org/apache/doris/alter/InternalSchemaAlterTest.java index cf9d31b1ccaf3c..122014f0e8b2c2 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/alter/InternalSchemaAlterTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/alter/InternalSchemaAlterTest.java @@ -28,7 +28,7 @@ import org.apache.doris.common.AnalysisException; import org.apache.doris.common.Config; import org.apache.doris.common.FeConstants; -import org.apache.doris.plugin.audit.AuditLoaderPlugin; +import org.apache.doris.plugin.audit.AuditLoader; import org.apache.doris.statistics.StatisticConstants; import org.apache.doris.utframe.TestWithFeService; @@ -54,11 +54,12 @@ protected void runBeforeAll() throws Exception { public void testModifyTblReplicaCount() throws AnalysisException { Database db = Env.getCurrentEnv().getCatalogMgr() .getInternalCatalog().getDbNullable(FeConstants.INTERNAL_DB_NAME); + InternalSchemaInitializer.modifyTblReplicaCount(db, StatisticConstants.TABLE_STATISTIC_TBL_NAME); - 
InternalSchemaInitializer.modifyTblReplicaCount(db, AuditLoaderPlugin.AUDIT_LOG_TABLE); + InternalSchemaInitializer.modifyTblReplicaCount(db, AuditLoader.AUDIT_LOG_TABLE); checkReplicationNum(db, StatisticConstants.TABLE_STATISTIC_TBL_NAME); - checkReplicationNum(db, AuditLoaderPlugin.AUDIT_LOG_TABLE); + checkReplicationNum(db, AuditLoader.AUDIT_LOG_TABLE); } private void checkReplicationNum(Database db, String tblName) throws AnalysisException { @@ -77,7 +78,7 @@ public void testCheckAuditLogTable() throws AnalysisException { Database db = Env.getCurrentEnv().getCatalogMgr() .getInternalCatalog().getDbNullable(FeConstants.INTERNAL_DB_NAME); Assertions.assertNotNull(db); - OlapTable table = db.getOlapTableOrAnalysisException(AuditLoaderPlugin.AUDIT_LOG_TABLE); + OlapTable table = db.getOlapTableOrAnalysisException(AuditLoader.AUDIT_LOG_TABLE); Assertions.assertNotNull(table); for (ColumnDef def : InternalSchema.AUDIT_SCHEMA) { Assertions.assertNotNull(table.getColumn(def.getName())); diff --git a/fe/fe-core/src/test/java/org/apache/doris/qe/AuditEventProcessorTest.java b/fe/fe-core/src/test/java/org/apache/doris/qe/AuditEventProcessorTest.java index 6c9f54080a049e..f80b485609c5b9 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/qe/AuditEventProcessorTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/qe/AuditEventProcessorTest.java @@ -19,9 +19,9 @@ import org.apache.doris.catalog.Env; import org.apache.doris.common.util.DigitalVersion; +import org.apache.doris.plugin.AuditEvent; +import org.apache.doris.plugin.AuditEvent.EventType; import org.apache.doris.plugin.PluginInfo; -import org.apache.doris.plugin.audit.AuditEvent; -import org.apache.doris.plugin.audit.AuditEvent.EventType; import org.apache.doris.plugin.audit.AuditLogBuilder; import org.apache.doris.utframe.UtFrameUtils; From ebc460068ae4e2c0fb5c3c4e42ec3575564860d6 Mon Sep 17 00:00:00 2001 From: Vallish Pai Date: Tue, 10 Sep 2024 13:40:13 +0530 Subject: [PATCH 30/44] [fix] try fix code format (#40581) ## Proposed changes Issue Number: close #xxx --- .github/workflows/clang-format.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml index f49f1987b9ed41..adc77450d78c01 100644 --- a/.github/workflows/clang-format.yml +++ b/.github/workflows/clang-format.yml @@ -64,7 +64,7 @@ jobs: git clone https://github.com/DoozyX/clang-format-lint-action .github/actions/clang-format-lint-action pushd .github/actions/clang-format-lint-action &>/dev/null - git checkout 6adbe14579e5b8e19eb3e31e5ff2479f3bd302c7 + git checkout c71d0bf4e21876ebec3e5647491186f8797fde31 # v0.18.2 popd &>/dev/null - name: Install Python dependencies From cd902b61194aa8a5f7d1a704921f753b5f83d79d Mon Sep 17 00:00:00 2001 From: Gabriel Date: Tue, 10 Sep 2024 16:29:03 +0800 Subject: [PATCH 31/44] [refactor](pipeline) Refactor logics (#40576) --- .../exec/aggregation_sink_operator.cpp | 4 +- .../pipeline/exec/aggregation_sink_operator.h | 2 +- .../pipeline/exec/analytic_sink_operator.cpp | 4 +- .../exec/analytic_source_operator.cpp | 4 +- ...istinct_streaming_aggregation_operator.cpp | 4 +- .../pipeline/exec/exchange_sink_operator.cpp | 4 +- be/src/pipeline/exec/hashjoin_build_sink.cpp | 4 +- be/src/pipeline/exec/hashjoin_build_sink.h | 2 +- .../pipeline/exec/hashjoin_probe_operator.cpp | 8 +- be/src/pipeline/exec/join_probe_operator.h | 4 +- .../exec/nested_loop_join_build_operator.cpp | 6 +- .../exec/nested_loop_join_build_operator.h | 4 +- 
.../exec/nested_loop_join_probe_operator.cpp | 2 +- be/src/pipeline/exec/operator.cpp | 139 +++++++++--------- be/src/pipeline/exec/operator.h | 17 +-- .../exec/partition_sort_sink_operator.cpp | 8 +- .../partitioned_aggregation_sink_operator.cpp | 4 +- .../partitioned_hash_join_probe_operator.cpp | 14 +- .../partitioned_hash_join_sink_operator.cpp | 6 +- be/src/pipeline/exec/repeat_operator.cpp | 6 +- .../pipeline/exec/set_probe_sink_operator.cpp | 2 +- .../pipeline/exec/set_probe_sink_operator.h | 2 +- be/src/pipeline/exec/set_sink_operator.cpp | 2 +- be/src/pipeline/exec/set_sink_operator.h | 2 +- be/src/pipeline/exec/sort_sink_operator.cpp | 8 +- be/src/pipeline/exec/sort_source_operator.cpp | 6 +- .../exec/spill_sort_sink_operator.cpp | 2 +- .../exec/streaming_aggregation_operator.cpp | 6 +- .../pipeline/exec/table_function_operator.cpp | 4 +- be/src/pipeline/exec/union_sink_operator.cpp | 2 +- .../local_exchange_sink_operator.cpp | 2 +- .../local_exchange_source_operator.h | 6 +- be/src/pipeline/pipeline_fragment_context.cpp | 30 ++-- be/src/pipeline/pipeline_fragment_context.h | 4 +- be/src/runtime/query_context.cpp | 2 +- 35 files changed, 162 insertions(+), 164 deletions(-) diff --git a/be/src/pipeline/exec/aggregation_sink_operator.cpp b/be/src/pipeline/exec/aggregation_sink_operator.cpp index 8c96b4d744c83d..260a599a947a0d 100644 --- a/be/src/pipeline/exec/aggregation_sink_operator.cpp +++ b/be/src/pipeline/exec/aggregation_sink_operator.cpp @@ -775,7 +775,7 @@ Status AggSinkOperatorX::open(RuntimeState* state) { _output_tuple_desc = state->desc_tbl().get_tuple_descriptor(_output_tuple_id); DCHECK_EQ(_intermediate_tuple_desc->slots().size(), _output_tuple_desc->slots().size()); RETURN_IF_ERROR(vectorized::VExpr::prepare( - _probe_expr_ctxs, state, DataSinkOperatorX::_child_x->row_desc())); + _probe_expr_ctxs, state, DataSinkOperatorX::_child->row_desc())); int j = _probe_expr_ctxs.size(); for (int i = 0; i < j; ++i) { @@ -790,7 +790,7 @@ Status AggSinkOperatorX::open(RuntimeState* state) { SlotDescriptor* intermediate_slot_desc = _intermediate_tuple_desc->slots()[j]; SlotDescriptor* output_slot_desc = _output_tuple_desc->slots()[j]; RETURN_IF_ERROR(_aggregate_evaluators[i]->prepare( - state, DataSinkOperatorX::_child_x->row_desc(), + state, DataSinkOperatorX::_child->row_desc(), intermediate_slot_desc, output_slot_desc)); _aggregate_evaluators[i]->set_version(state->be_exec_version()); } diff --git a/be/src/pipeline/exec/aggregation_sink_operator.h b/be/src/pipeline/exec/aggregation_sink_operator.h index d55b382931d74b..97440de3f09e4c 100644 --- a/be/src/pipeline/exec/aggregation_sink_operator.h +++ b/be/src/pipeline/exec/aggregation_sink_operator.h @@ -143,7 +143,7 @@ class AggSinkOperatorX final : public DataSinkOperatorX { DataDistribution required_data_distribution() const override { if (_probe_expr_ctxs.empty()) { - return _needs_finalize || DataSinkOperatorX::_child_x + return _needs_finalize || DataSinkOperatorX::_child ->ignore_data_distribution() ? 
DataDistribution(ExchangeType::PASSTHROUGH) : DataSinkOperatorX::required_data_distribution(); diff --git a/be/src/pipeline/exec/analytic_sink_operator.cpp b/be/src/pipeline/exec/analytic_sink_operator.cpp index cc219ecbe642f0..85d7773bdbd025 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.cpp +++ b/be/src/pipeline/exec/analytic_sink_operator.cpp @@ -234,11 +234,11 @@ Status AnalyticSinkOperatorX::init(const TPlanNode& tnode, RuntimeState* state) Status AnalyticSinkOperatorX::open(RuntimeState* state) { RETURN_IF_ERROR(DataSinkOperatorX::open(state)); for (const auto& ctx : _agg_expr_ctxs) { - RETURN_IF_ERROR(vectorized::VExpr::prepare(ctx, state, _child_x->row_desc())); + RETURN_IF_ERROR(vectorized::VExpr::prepare(ctx, state, _child->row_desc())); } if (!_partition_by_eq_expr_ctxs.empty() || !_order_by_eq_expr_ctxs.empty()) { vector tuple_ids; - tuple_ids.push_back(_child_x->row_desc().tuple_descriptors()[0]->id()); + tuple_ids.push_back(_child->row_desc().tuple_descriptors()[0]->id()); tuple_ids.push_back(_buffered_tuple_id); RowDescriptor cmp_row_desc(state->desc_tbl(), tuple_ids, vector(2, false)); if (!_partition_by_eq_expr_ctxs.empty()) { diff --git a/be/src/pipeline/exec/analytic_source_operator.cpp b/be/src/pipeline/exec/analytic_source_operator.cpp index 1996b9af58d2c4..b521a9b583fa94 100644 --- a/be/src/pipeline/exec/analytic_source_operator.cpp +++ b/be/src/pipeline/exec/analytic_source_operator.cpp @@ -562,13 +562,13 @@ Status AnalyticLocalState::close(RuntimeState* state) { Status AnalyticSourceOperatorX::open(RuntimeState* state) { RETURN_IF_ERROR(OperatorX::open(state)); - DCHECK(_child_x->row_desc().is_prefix_of(_row_descriptor)); + DCHECK(_child->row_desc().is_prefix_of(_row_descriptor)); _intermediate_tuple_desc = state->desc_tbl().get_tuple_descriptor(_intermediate_tuple_id); _output_tuple_desc = state->desc_tbl().get_tuple_descriptor(_output_tuple_id); for (size_t i = 0; i < _agg_functions.size(); ++i) { SlotDescriptor* intermediate_slot_desc = _intermediate_tuple_desc->slots()[i]; SlotDescriptor* output_slot_desc = _output_tuple_desc->slots()[i]; - RETURN_IF_ERROR(_agg_functions[i]->prepare(state, _child_x->row_desc(), + RETURN_IF_ERROR(_agg_functions[i]->prepare(state, _child->row_desc(), intermediate_slot_desc, output_slot_desc)); _agg_functions[i]->set_version(state->be_exec_version()); _change_to_nullable_flags.push_back(output_slot_desc->is_nullable() && diff --git a/be/src/pipeline/exec/distinct_streaming_aggregation_operator.cpp b/be/src/pipeline/exec/distinct_streaming_aggregation_operator.cpp index 96212f7fd2ff00..5127605097f4c5 100644 --- a/be/src/pipeline/exec/distinct_streaming_aggregation_operator.cpp +++ b/be/src/pipeline/exec/distinct_streaming_aggregation_operator.cpp @@ -374,7 +374,7 @@ Status DistinctStreamingAggOperatorX::open(RuntimeState* state) { _intermediate_tuple_desc = state->desc_tbl().get_tuple_descriptor(_intermediate_tuple_id); _output_tuple_desc = state->desc_tbl().get_tuple_descriptor(_output_tuple_id); DCHECK_EQ(_intermediate_tuple_desc->slots().size(), _output_tuple_desc->slots().size()); - RETURN_IF_ERROR(vectorized::VExpr::prepare(_probe_expr_ctxs, state, _child_x->row_desc())); + RETURN_IF_ERROR(vectorized::VExpr::prepare(_probe_expr_ctxs, state, _child->row_desc())); int j = _probe_expr_ctxs.size(); for (int i = 0; i < j; ++i) { @@ -389,7 +389,7 @@ Status DistinctStreamingAggOperatorX::open(RuntimeState* state) { SlotDescriptor* intermediate_slot_desc = _intermediate_tuple_desc->slots()[j]; SlotDescriptor* output_slot_desc 
= _output_tuple_desc->slots()[j]; RETURN_IF_ERROR(_aggregate_evaluators[i]->prepare( - state, _child_x->row_desc(), intermediate_slot_desc, output_slot_desc)); + state, _child->row_desc(), intermediate_slot_desc, output_slot_desc)); _aggregate_evaluators[i]->set_version(state->be_exec_version()); } diff --git a/be/src/pipeline/exec/exchange_sink_operator.cpp b/be/src/pipeline/exec/exchange_sink_operator.cpp index 71649aa21ec3d4..366b3c682f7dd5 100644 --- a/be/src/pipeline/exec/exchange_sink_operator.cpp +++ b/be/src/pipeline/exec/exchange_sink_operator.cpp @@ -648,10 +648,10 @@ Status ExchangeSinkLocalState::close(RuntimeState* state, Status exec_status) { } DataDistribution ExchangeSinkOperatorX::required_data_distribution() const { - if (_child_x && _enable_local_merge_sort) { + if (_child && _enable_local_merge_sort) { // SORT_OPERATOR -> DATA_STREAM_SINK_OPERATOR // SORT_OPERATOR -> LOCAL_MERGE_SORT -> DATA_STREAM_SINK_OPERATOR - if (auto sort_source = std::dynamic_pointer_cast(_child_x); + if (auto sort_source = std::dynamic_pointer_cast(_child); sort_source && sort_source->use_local_merge()) { // Sort the data local return ExchangeType::LOCAL_MERGE_SORT; diff --git a/be/src/pipeline/exec/hashjoin_build_sink.cpp b/be/src/pipeline/exec/hashjoin_build_sink.cpp index d4ca54da637673..0bee88ed537ea6 100644 --- a/be/src/pipeline/exec/hashjoin_build_sink.cpp +++ b/be/src/pipeline/exec/hashjoin_build_sink.cpp @@ -488,7 +488,7 @@ Status HashJoinBuildSinkOperatorX::open(RuntimeState* state) { _shared_hash_table_context = _shared_hashtable_controller->get_context(node_id()); } } - RETURN_IF_ERROR(vectorized::VExpr::prepare(_build_expr_ctxs, state, _child_x->row_desc())); + RETURN_IF_ERROR(vectorized::VExpr::prepare(_build_expr_ctxs, state, _child->row_desc())); return vectorized::VExpr::open(_build_expr_ctxs, state); } @@ -505,7 +505,7 @@ Status HashJoinBuildSinkOperatorX::sink(RuntimeState* state, vectorized::Block* if (local_state._build_side_mutable_block.empty()) { auto tmp_build_block = vectorized::VectorizedUtils::create_empty_columnswithtypename( - _child_x->row_desc()); + _child->row_desc()); tmp_build_block = *(tmp_build_block.create_same_struct_block(1, false)); local_state._build_col_ids.resize(_build_expr_ctxs.size()); RETURN_IF_ERROR(local_state._do_evaluate(tmp_build_block, local_state._build_expr_ctxs, diff --git a/be/src/pipeline/exec/hashjoin_build_sink.h b/be/src/pipeline/exec/hashjoin_build_sink.h index c373af5d6622ff..b7ae612510fcb4 100644 --- a/be/src/pipeline/exec/hashjoin_build_sink.h +++ b/be/src/pipeline/exec/hashjoin_build_sink.h @@ -132,7 +132,7 @@ class HashJoinBuildSinkOperatorX final if (_join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { return {ExchangeType::NOOP}; } else if (_is_broadcast_join) { - return _child_x->ignore_data_distribution() + return _child->ignore_data_distribution() ? DataDistribution(ExchangeType::PASS_TO_ONE) : DataDistribution(ExchangeType::NOOP); } diff --git a/be/src/pipeline/exec/hashjoin_probe_operator.cpp b/be/src/pipeline/exec/hashjoin_probe_operator.cpp index 7008397db770ae..f91e1eaa2a1b17 100644 --- a/be/src/pipeline/exec/hashjoin_probe_operator.cpp +++ b/be/src/pipeline/exec/hashjoin_probe_operator.cpp @@ -276,7 +276,7 @@ Status HashJoinProbeOperatorX::pull(doris::RuntimeState* state, vectorized::Bloc /// increase the output rows count(just same as `_probe_block`'s rows count). 
RETURN_IF_ERROR(local_state.filter_data_and_build_output(state, output_block, eos, &local_state._probe_block, false)); - local_state._probe_block.clear_column_data(_child_x->row_desc().num_materialized_slots()); + local_state._probe_block.clear_column_data(_child->row_desc().num_materialized_slots()); return Status::OK(); } @@ -597,7 +597,7 @@ Status HashJoinProbeOperatorX::open(RuntimeState* state) { } } }; - init_output_slots_flags(_child_x->row_desc().tuple_descriptors(), _left_output_slot_flags); + init_output_slots_flags(_child->row_desc().tuple_descriptors(), _left_output_slot_flags); init_output_slots_flags(_build_side_child->row_desc().tuple_descriptors(), _right_output_slot_flags); // _other_join_conjuncts are evaluated in the context of the rows produced by this node @@ -609,12 +609,12 @@ Status HashJoinProbeOperatorX::open(RuntimeState* state) { RETURN_IF_ERROR(conjunct->prepare(state, *_intermediate_row_desc)); } - RETURN_IF_ERROR(vectorized::VExpr::prepare(_probe_expr_ctxs, state, _child_x->row_desc())); + RETURN_IF_ERROR(vectorized::VExpr::prepare(_probe_expr_ctxs, state, _child->row_desc())); DCHECK(_build_side_child != nullptr); // right table data types _right_table_data_types = vectorized::VectorizedUtils::get_data_types(_build_side_child->row_desc()); - _left_table_data_types = vectorized::VectorizedUtils::get_data_types(_child_x->row_desc()); + _left_table_data_types = vectorized::VectorizedUtils::get_data_types(_child->row_desc()); _right_table_column_names = vectorized::VectorizedUtils::get_column_names(_build_side_child->row_desc()); diff --git a/be/src/pipeline/exec/join_probe_operator.h b/be/src/pipeline/exec/join_probe_operator.h index 65b7a2694e4b47..3f68c73d04b161 100644 --- a/be/src/pipeline/exec/join_probe_operator.h +++ b/be/src/pipeline/exec/join_probe_operator.h @@ -85,12 +85,12 @@ class JoinProbeOperatorX : public StatefulOperatorX { } Status set_child(OperatorPtr child) override { - if (OperatorX::_child_x && _build_side_child == nullptr) { + if (OperatorX::_child && _build_side_child == nullptr) { // when there already (probe) child, others is build child. 
set_build_side_child(child); } else { // first child which is probe side is in this pipeline - OperatorX::_child_x = std::move(child); + OperatorX::_child = std::move(child); } return Status::OK(); } diff --git a/be/src/pipeline/exec/nested_loop_join_build_operator.cpp b/be/src/pipeline/exec/nested_loop_join_build_operator.cpp index 515c151c3c1fa8..793a37c7396a61 100644 --- a/be/src/pipeline/exec/nested_loop_join_build_operator.cpp +++ b/be/src/pipeline/exec/nested_loop_join_build_operator.cpp @@ -109,14 +109,14 @@ Status NestedLoopJoinBuildSinkOperatorX::init(const TPlanNode& tnode, RuntimeSta Status NestedLoopJoinBuildSinkOperatorX::open(RuntimeState* state) { RETURN_IF_ERROR(JoinBuildSinkOperatorX::open(state)); - int num_build_tuples = _child_x->row_desc().tuple_descriptors().size(); + int num_build_tuples = _child->row_desc().tuple_descriptors().size(); for (int i = 0; i < num_build_tuples; ++i) { - TupleDescriptor* build_tuple_desc = _child_x->row_desc().tuple_descriptors()[i]; + TupleDescriptor* build_tuple_desc = _child->row_desc().tuple_descriptors()[i]; auto tuple_idx = _row_descriptor.get_tuple_idx(build_tuple_desc->id()); RETURN_IF_INVALID_TUPLE_IDX(build_tuple_desc->id(), tuple_idx); } - RETURN_IF_ERROR(vectorized::VExpr::prepare(_filter_src_expr_ctxs, state, _child_x->row_desc())); + RETURN_IF_ERROR(vectorized::VExpr::prepare(_filter_src_expr_ctxs, state, _child->row_desc())); return vectorized::VExpr::open(_filter_src_expr_ctxs, state); } diff --git a/be/src/pipeline/exec/nested_loop_join_build_operator.h b/be/src/pipeline/exec/nested_loop_join_build_operator.h index da7712e3e17685..f2ca259754b661 100644 --- a/be/src/pipeline/exec/nested_loop_join_build_operator.h +++ b/be/src/pipeline/exec/nested_loop_join_build_operator.h @@ -76,8 +76,8 @@ class NestedLoopJoinBuildSinkOperatorX final if (_join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { return {ExchangeType::NOOP}; } - return _child_x->ignore_data_distribution() ? DataDistribution(ExchangeType::BROADCAST) - : DataDistribution(ExchangeType::NOOP); + return _child->ignore_data_distribution() ? DataDistribution(ExchangeType::BROADCAST) + : DataDistribution(ExchangeType::NOOP); } private: diff --git a/be/src/pipeline/exec/nested_loop_join_probe_operator.cpp b/be/src/pipeline/exec/nested_loop_join_probe_operator.cpp index 5a0b6680eee765..9546ed8df56671 100644 --- a/be/src/pipeline/exec/nested_loop_join_probe_operator.cpp +++ b/be/src/pipeline/exec/nested_loop_join_probe_operator.cpp @@ -450,7 +450,7 @@ Status NestedLoopJoinProbeOperatorX::open(RuntimeState* state) { for (auto& conjunct : _join_conjuncts) { RETURN_IF_ERROR(conjunct->prepare(state, *_intermediate_row_desc)); } - _num_probe_side_columns = _child_x->row_desc().num_materialized_slots(); + _num_probe_side_columns = _child->row_desc().num_materialized_slots(); _num_build_side_columns = _build_side_child->row_desc().num_materialized_slots(); return vectorized::VExpr::open(_join_conjuncts, state); } diff --git a/be/src/pipeline/exec/operator.cpp b/be/src/pipeline/exec/operator.cpp index 217c3219d5c36d..d65769254b9dfc 100644 --- a/be/src/pipeline/exec/operator.cpp +++ b/be/src/pipeline/exec/operator.cpp @@ -116,12 +116,12 @@ std::string PipelineXSinkLocalState::name_suffix() { } DataDistribution DataSinkOperatorXBase::required_data_distribution() const { - return _child_x && _child_x->ignore_data_distribution() + return _child && _child->ignore_data_distribution() ? 
DataDistribution(ExchangeType::PASSTHROUGH) : DataDistribution(ExchangeType::NOOP); } const RowDescriptor& OperatorBase::row_desc() const { - return _child_x->row_desc(); + return _child->row_desc(); } template @@ -220,15 +220,15 @@ Status OperatorXBase::open(RuntimeState* state) { for (auto& projections : _intermediate_projections) { RETURN_IF_ERROR(vectorized::VExpr::open(projections, state)); } - if (_child_x && !is_source()) { - RETURN_IF_ERROR(_child_x->open(state)); + if (_child && !is_source()) { + RETURN_IF_ERROR(_child->open(state)); } return Status::OK(); } Status OperatorXBase::close(RuntimeState* state) { - if (_child_x && !is_source()) { - RETURN_IF_ERROR(_child_x->close(state)); + if (_child && !is_source()) { + RETURN_IF_ERROR(_child->close(state)); } auto result = state->get_local_state_result(operator_id()); if (!result) { @@ -572,8 +572,7 @@ Status PipelineXSinkLocalState::close(RuntimeState* state, Status e template Status StreamingOperatorX::get_block(RuntimeState* state, vectorized::Block* block, bool* eos) { - RETURN_IF_ERROR( - OperatorX::_child_x->get_block_after_projects(state, block, eos)); + RETURN_IF_ERROR(OperatorX::_child->get_block_after_projects(state, block, eos)); return pull(state, block, eos); } @@ -583,8 +582,8 @@ Status StatefulOperatorX::get_block(RuntimeState* state, vectori auto& local_state = get_local_state(state); if (need_more_input_data(state)) { local_state._child_block->clear_column_data( - OperatorX::_child_x->row_desc().num_materialized_slots()); - RETURN_IF_ERROR(OperatorX::_child_x->get_block_after_projects( + OperatorX::_child->row_desc().num_materialized_slots()); + RETURN_IF_ERROR(OperatorX::_child->get_block_after_projects( state, local_state._child_block.get(), &local_state._child_eos)); *eos = local_state._child_eos; if (local_state._child_block->rows() == 0 && !local_state._child_eos) { @@ -668,66 +667,66 @@ Status AsyncWriterSink::close(RuntimeState* state, Status exec_s return Base::close(state, exec_status); } -#define DECLARE_OPERATOR_X(LOCAL_STATE) template class DataSinkOperatorX; -DECLARE_OPERATOR_X(HashJoinBuildSinkLocalState) -DECLARE_OPERATOR_X(ResultSinkLocalState) -DECLARE_OPERATOR_X(JdbcTableSinkLocalState) -DECLARE_OPERATOR_X(MemoryScratchSinkLocalState) -DECLARE_OPERATOR_X(ResultFileSinkLocalState) -DECLARE_OPERATOR_X(OlapTableSinkLocalState) -DECLARE_OPERATOR_X(OlapTableSinkV2LocalState) -DECLARE_OPERATOR_X(HiveTableSinkLocalState) -DECLARE_OPERATOR_X(IcebergTableSinkLocalState) -DECLARE_OPERATOR_X(AnalyticSinkLocalState) -DECLARE_OPERATOR_X(SortSinkLocalState) -DECLARE_OPERATOR_X(SpillSortSinkLocalState) -DECLARE_OPERATOR_X(LocalExchangeSinkLocalState) -DECLARE_OPERATOR_X(AggSinkLocalState) -DECLARE_OPERATOR_X(PartitionedAggSinkLocalState) -DECLARE_OPERATOR_X(ExchangeSinkLocalState) -DECLARE_OPERATOR_X(NestedLoopJoinBuildSinkLocalState) -DECLARE_OPERATOR_X(UnionSinkLocalState) -DECLARE_OPERATOR_X(MultiCastDataStreamSinkLocalState) -DECLARE_OPERATOR_X(PartitionSortSinkLocalState) -DECLARE_OPERATOR_X(SetProbeSinkLocalState) -DECLARE_OPERATOR_X(SetProbeSinkLocalState) -DECLARE_OPERATOR_X(SetSinkLocalState) -DECLARE_OPERATOR_X(SetSinkLocalState) -DECLARE_OPERATOR_X(PartitionedHashJoinSinkLocalState) -DECLARE_OPERATOR_X(GroupCommitBlockSinkLocalState) - -#undef DECLARE_OPERATOR_X - -#define DECLARE_OPERATOR_X(LOCAL_STATE) template class OperatorX; -DECLARE_OPERATOR_X(HashJoinProbeLocalState) -DECLARE_OPERATOR_X(OlapScanLocalState) -DECLARE_OPERATOR_X(GroupCommitLocalState) -DECLARE_OPERATOR_X(JDBCScanLocalState) 
-DECLARE_OPERATOR_X(FileScanLocalState) -DECLARE_OPERATOR_X(EsScanLocalState) -DECLARE_OPERATOR_X(AnalyticLocalState) -DECLARE_OPERATOR_X(SortLocalState) -DECLARE_OPERATOR_X(SpillSortLocalState) -DECLARE_OPERATOR_X(AggLocalState) -DECLARE_OPERATOR_X(PartitionedAggLocalState) -DECLARE_OPERATOR_X(TableFunctionLocalState) -DECLARE_OPERATOR_X(ExchangeLocalState) -DECLARE_OPERATOR_X(RepeatLocalState) -DECLARE_OPERATOR_X(NestedLoopJoinProbeLocalState) -DECLARE_OPERATOR_X(AssertNumRowsLocalState) -DECLARE_OPERATOR_X(EmptySetLocalState) -DECLARE_OPERATOR_X(UnionSourceLocalState) -DECLARE_OPERATOR_X(MultiCastDataStreamSourceLocalState) -DECLARE_OPERATOR_X(PartitionSortSourceLocalState) -DECLARE_OPERATOR_X(SetSourceLocalState) -DECLARE_OPERATOR_X(SetSourceLocalState) -DECLARE_OPERATOR_X(DataGenLocalState) -DECLARE_OPERATOR_X(SchemaScanLocalState) -DECLARE_OPERATOR_X(MetaScanLocalState) -DECLARE_OPERATOR_X(LocalExchangeSourceLocalState) -DECLARE_OPERATOR_X(PartitionedHashJoinProbeLocalState) - -#undef DECLARE_OPERATOR_X +#define DECLARE_OPERATOR(LOCAL_STATE) template class DataSinkOperatorX; +DECLARE_OPERATOR(HashJoinBuildSinkLocalState) +DECLARE_OPERATOR(ResultSinkLocalState) +DECLARE_OPERATOR(JdbcTableSinkLocalState) +DECLARE_OPERATOR(MemoryScratchSinkLocalState) +DECLARE_OPERATOR(ResultFileSinkLocalState) +DECLARE_OPERATOR(OlapTableSinkLocalState) +DECLARE_OPERATOR(OlapTableSinkV2LocalState) +DECLARE_OPERATOR(HiveTableSinkLocalState) +DECLARE_OPERATOR(IcebergTableSinkLocalState) +DECLARE_OPERATOR(AnalyticSinkLocalState) +DECLARE_OPERATOR(SortSinkLocalState) +DECLARE_OPERATOR(SpillSortSinkLocalState) +DECLARE_OPERATOR(LocalExchangeSinkLocalState) +DECLARE_OPERATOR(AggSinkLocalState) +DECLARE_OPERATOR(PartitionedAggSinkLocalState) +DECLARE_OPERATOR(ExchangeSinkLocalState) +DECLARE_OPERATOR(NestedLoopJoinBuildSinkLocalState) +DECLARE_OPERATOR(UnionSinkLocalState) +DECLARE_OPERATOR(MultiCastDataStreamSinkLocalState) +DECLARE_OPERATOR(PartitionSortSinkLocalState) +DECLARE_OPERATOR(SetProbeSinkLocalState) +DECLARE_OPERATOR(SetProbeSinkLocalState) +DECLARE_OPERATOR(SetSinkLocalState) +DECLARE_OPERATOR(SetSinkLocalState) +DECLARE_OPERATOR(PartitionedHashJoinSinkLocalState) +DECLARE_OPERATOR(GroupCommitBlockSinkLocalState) + +#undef DECLARE_OPERATOR + +#define DECLARE_OPERATOR(LOCAL_STATE) template class OperatorX; +DECLARE_OPERATOR(HashJoinProbeLocalState) +DECLARE_OPERATOR(OlapScanLocalState) +DECLARE_OPERATOR(GroupCommitLocalState) +DECLARE_OPERATOR(JDBCScanLocalState) +DECLARE_OPERATOR(FileScanLocalState) +DECLARE_OPERATOR(EsScanLocalState) +DECLARE_OPERATOR(AnalyticLocalState) +DECLARE_OPERATOR(SortLocalState) +DECLARE_OPERATOR(SpillSortLocalState) +DECLARE_OPERATOR(AggLocalState) +DECLARE_OPERATOR(PartitionedAggLocalState) +DECLARE_OPERATOR(TableFunctionLocalState) +DECLARE_OPERATOR(ExchangeLocalState) +DECLARE_OPERATOR(RepeatLocalState) +DECLARE_OPERATOR(NestedLoopJoinProbeLocalState) +DECLARE_OPERATOR(AssertNumRowsLocalState) +DECLARE_OPERATOR(EmptySetLocalState) +DECLARE_OPERATOR(UnionSourceLocalState) +DECLARE_OPERATOR(MultiCastDataStreamSourceLocalState) +DECLARE_OPERATOR(PartitionSortSourceLocalState) +DECLARE_OPERATOR(SetSourceLocalState) +DECLARE_OPERATOR(SetSourceLocalState) +DECLARE_OPERATOR(DataGenLocalState) +DECLARE_OPERATOR(SchemaScanLocalState) +DECLARE_OPERATOR(MetaScanLocalState) +DECLARE_OPERATOR(LocalExchangeSourceLocalState) +DECLARE_OPERATOR(PartitionedHashJoinProbeLocalState) + +#undef DECLARE_OPERATOR template class StreamingOperatorX; template class StreamingOperatorX; diff 
--git a/be/src/pipeline/exec/operator.h b/be/src/pipeline/exec/operator.h index 0863550dc192e4..48f8a2d1836574 100644 --- a/be/src/pipeline/exec/operator.h +++ b/be/src/pipeline/exec/operator.h @@ -39,7 +39,6 @@ #include "vec/runtime/vdata_stream_recvr.h" namespace doris { -class DataSink; class RowDescriptor; class RuntimeState; class TDataSink; @@ -82,7 +81,7 @@ struct LocalSinkStateInfo { class OperatorBase { public: - explicit OperatorBase() : _child_x(nullptr), _is_closed(false) {} + explicit OperatorBase() : _child(nullptr), _is_closed(false) {} virtual ~OperatorBase() = default; virtual bool is_sink() const { return false; } @@ -98,7 +97,7 @@ class OperatorBase { [[nodiscard]] virtual Status close(RuntimeState* state); [[nodiscard]] virtual Status set_child(OperatorPtr child) { - _child_x = std::move(child); + _child = std::move(child); return Status::OK(); } @@ -108,7 +107,7 @@ class OperatorBase { virtual Status revoke_memory(RuntimeState* state) { return Status::OK(); } [[nodiscard]] virtual bool require_data_distribution() const { return false; } - OperatorPtr child_x() { return _child_x; } + OperatorPtr child() { return _child; } [[nodiscard]] bool followed_by_shuffled_join() const { return _followed_by_shuffled_join; } void set_followed_by_shuffled_join(bool followed_by_shuffled_join) { _followed_by_shuffled_join = followed_by_shuffled_join; @@ -116,7 +115,7 @@ class OperatorBase { [[nodiscard]] virtual bool require_shuffled_data_distribution() const { return false; } protected: - OperatorPtr _child_x = nullptr; + OperatorPtr _child = nullptr; bool _is_closed; bool _followed_by_shuffled_join = false; @@ -645,15 +644,15 @@ class OperatorXBase : public OperatorBase { } [[nodiscard]] std::string get_name() const override { return _op_name; } [[nodiscard]] virtual DataDistribution required_data_distribution() const { - return _child_x && _child_x->ignore_data_distribution() && !is_source() + return _child && _child->ignore_data_distribution() && !is_source() ? DataDistribution(ExchangeType::PASSTHROUGH) : DataDistribution(ExchangeType::NOOP); } [[nodiscard]] virtual bool ignore_data_distribution() const { - return _child_x ? _child_x->ignore_data_distribution() : _ignore_data_distribution; + return _child ? _child->ignore_data_distribution() : _ignore_data_distribution; } [[nodiscard]] bool ignore_data_hash_distribution() const { - return _child_x ? _child_x->ignore_data_hash_distribution() : _ignore_data_distribution; + return _child ? 
_child->ignore_data_hash_distribution() : _ignore_data_distribution; } [[nodiscard]] virtual bool need_more_input_data(RuntimeState* state) const { return true; } void set_ignore_data_distribution() { _ignore_data_distribution = true; } @@ -708,7 +707,7 @@ class OperatorXBase : public OperatorBase { return reinterpret_cast(*this); } - [[nodiscard]] OperatorPtr get_child() { return _child_x; } + [[nodiscard]] OperatorPtr get_child() { return _child; } [[nodiscard]] vectorized::VExprContextSPtrs& conjuncts() { return _conjuncts; } [[nodiscard]] virtual RowDescriptor& row_descriptor() { return _row_descriptor; } diff --git a/be/src/pipeline/exec/partition_sort_sink_operator.cpp b/be/src/pipeline/exec/partition_sort_sink_operator.cpp index 80808185fa8980..94c51e160da2a2 100644 --- a/be/src/pipeline/exec/partition_sort_sink_operator.cpp +++ b/be/src/pipeline/exec/partition_sort_sink_operator.cpp @@ -117,7 +117,7 @@ Status PartitionSortSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo ADD_COUNTER(_profile, "SortedPartitionInputRows", TUnit::UNIT); _partition_sort_info = std::make_shared( &_vsort_exec_exprs, p._limit, 0, p._pool, p._is_asc_order, p._nulls_first, - p._child_x->row_desc(), state, _profile, p._has_global_limit, p._partition_inner_limit, + p._child->row_desc(), state, _profile, p._has_global_limit, p._partition_inner_limit, p._top_n_algorithm, p._topn_phase); RETURN_IF_ERROR(_init_hash_method()); return Status::OK(); @@ -156,8 +156,8 @@ Status PartitionSortSinkOperatorX::init(const TPlanNode& tnode, RuntimeState* st Status PartitionSortSinkOperatorX::open(RuntimeState* state) { RETURN_IF_ERROR(DataSinkOperatorX::open(state)); - RETURN_IF_ERROR(_vsort_exec_exprs.prepare(state, _child_x->row_desc(), _row_descriptor)); - RETURN_IF_ERROR(vectorized::VExpr::prepare(_partition_expr_ctxs, state, _child_x->row_desc())); + RETURN_IF_ERROR(_vsort_exec_exprs.prepare(state, _child->row_desc(), _row_descriptor)); + RETURN_IF_ERROR(vectorized::VExpr::prepare(_partition_expr_ctxs, state, _child->row_desc())); RETURN_IF_ERROR(_vsort_exec_exprs.open(state)); RETURN_IF_ERROR(vectorized::VExpr::open(_partition_expr_ctxs, state)); return Status::OK(); @@ -175,7 +175,7 @@ Status PartitionSortSinkOperatorX::sink(RuntimeState* state, vectorized::Block* local_state._value_places.push_back(_pool->add(new PartitionBlocks( local_state._partition_sort_info, local_state._value_places.empty()))); } - local_state._value_places[0]->append_whole_block(input_block, _child_x->row_desc()); + local_state._value_places[0]->append_whole_block(input_block, _child->row_desc()); } else { //just simply use partition num to check //if is TWO_PHASE_GLOBAL, must be sort all data thought partition num threshold have been exceeded. 
diff --git a/be/src/pipeline/exec/partitioned_aggregation_sink_operator.cpp b/be/src/pipeline/exec/partitioned_aggregation_sink_operator.cpp index 448d3239949a8a..469716b7a22182 100644 --- a/be/src/pipeline/exec/partitioned_aggregation_sink_operator.cpp +++ b/be/src/pipeline/exec/partitioned_aggregation_sink_operator.cpp @@ -141,8 +141,8 @@ Status PartitionedAggSinkOperatorX::init(const TPlanNode& tnode, RuntimeState* s } _agg_sink_operator->set_dests_id(DataSinkOperatorX::dests_id()); - RETURN_IF_ERROR(_agg_sink_operator->set_child( - DataSinkOperatorX::_child_x)); + RETURN_IF_ERROR( + _agg_sink_operator->set_child(DataSinkOperatorX::_child)); return _agg_sink_operator->init(tnode, state); } diff --git a/be/src/pipeline/exec/partitioned_hash_join_probe_operator.cpp b/be/src/pipeline/exec/partitioned_hash_join_probe_operator.cpp index 6529d1eb6540c5..018d63a6deebb1 100644 --- a/be/src/pipeline/exec/partitioned_hash_join_probe_operator.cpp +++ b/be/src/pipeline/exec/partitioned_hash_join_probe_operator.cpp @@ -525,15 +525,15 @@ Status PartitionedHashJoinProbeOperatorX::init(const TPlanNode& tnode, RuntimeSt } Status PartitionedHashJoinProbeOperatorX::open(RuntimeState* state) { - // to avoid open _child_x twice - auto child_x = std::move(_child_x); + // to avoid open _child twice + auto child = std::move(_child); RETURN_IF_ERROR(JoinProbeOperatorX::open(state)); - RETURN_IF_ERROR(_inner_probe_operator->set_child(child_x)); + RETURN_IF_ERROR(_inner_probe_operator->set_child(child)); DCHECK(_build_side_child != nullptr); _inner_probe_operator->set_build_side_child(_build_side_child); RETURN_IF_ERROR(_inner_probe_operator->open(state)); - _child_x = std::move(child_x); - RETURN_IF_ERROR(_partitioner->prepare(state, _child_x->row_desc())); + _child = std::move(child); + RETURN_IF_ERROR(_partitioner->prepare(state, _child->row_desc())); RETURN_IF_ERROR(_partitioner->open(state)); return Status::OK(); } @@ -820,8 +820,8 @@ Status PartitionedHashJoinProbeOperatorX::get_block(RuntimeState* state, vectori return _revoke_memory(state); } - RETURN_IF_ERROR(_child_x->get_block_after_projects(state, local_state._child_block.get(), - &local_state._child_eos)); + RETURN_IF_ERROR(_child->get_block_after_projects(state, local_state._child_block.get(), + &local_state._child_eos)); if (need_to_spill && local_state._child_eos) { RETURN_IF_ERROR(local_state.finish_spilling(0)); diff --git a/be/src/pipeline/exec/partitioned_hash_join_sink_operator.cpp b/be/src/pipeline/exec/partitioned_hash_join_sink_operator.cpp index 7c29fdc6ed08dd..a7297be493f804 100644 --- a/be/src/pipeline/exec/partitioned_hash_join_sink_operator.cpp +++ b/be/src/pipeline/exec/partitioned_hash_join_sink_operator.cpp @@ -102,7 +102,7 @@ size_t PartitionedHashJoinSinkLocalState::revocable_mem_size(RuntimeState* state Status PartitionedHashJoinSinkLocalState::_revoke_unpartitioned_block(RuntimeState* state) { auto& p = _parent->cast(); _shared_state->inner_shared_state->hash_table_variants.reset(); - auto row_desc = p._child_x->row_desc(); + auto row_desc = p._child->row_desc(); const auto num_slots = row_desc.num_slots(); vectorized::Block build_block; auto inner_sink_state_ = _shared_state->inner_runtime_state->get_sink_local_state(); @@ -426,8 +426,8 @@ Status PartitionedHashJoinSinkOperatorX::init(const TPlanNode& tnode, RuntimeSta Status PartitionedHashJoinSinkOperatorX::open(RuntimeState* state) { RETURN_IF_ERROR(JoinBuildSinkOperatorX::open(state)); - RETURN_IF_ERROR(_inner_sink_operator->set_child(_child_x)); - 
RETURN_IF_ERROR(_partitioner->prepare(state, _child_x->row_desc())); + RETURN_IF_ERROR(_inner_sink_operator->set_child(_child)); + RETURN_IF_ERROR(_partitioner->prepare(state, _child->row_desc())); RETURN_IF_ERROR(_partitioner->open(state)); return _inner_sink_operator->open(state); } diff --git a/be/src/pipeline/exec/repeat_operator.cpp b/be/src/pipeline/exec/repeat_operator.cpp index b6761186c82fb4..d355d99c2e352f 100644 --- a/be/src/pipeline/exec/repeat_operator.cpp +++ b/be/src/pipeline/exec/repeat_operator.cpp @@ -59,7 +59,7 @@ Status RepeatOperatorX::open(RuntimeState* state) { if (_output_tuple_desc == nullptr) { return Status::InternalError("Failed to get tuple descriptor."); } - RETURN_IF_ERROR(vectorized::VExpr::prepare(_expr_ctxs, state, _child_x->row_desc())); + RETURN_IF_ERROR(vectorized::VExpr::prepare(_expr_ctxs, state, _child->row_desc())); for (const auto& slot_desc : _output_tuple_desc->slots()) { _output_slots.push_back(slot_desc); } @@ -211,7 +211,7 @@ Status RepeatOperatorX::pull(doris::RuntimeState* state, vectorized::Block* outp int size = _repeat_id_list.size(); if (_repeat_id_idx >= size) { _intermediate_block->clear(); - _child_block.clear_column_data(_child_x->row_desc().num_materialized_slots()); + _child_block.clear_column_data(_child->row_desc().num_materialized_slots()); _repeat_id_idx = 0; } } else if (local_state._expr_ctxs.empty()) { @@ -225,7 +225,7 @@ Status RepeatOperatorX::pull(doris::RuntimeState* state, vectorized::Block* outp RETURN_IF_ERROR( local_state.add_grouping_id_column(rows, cur_col, columns, repeat_id_idx)); } - _child_block.clear_column_data(_child_x->row_desc().num_materialized_slots()); + _child_block.clear_column_data(_child->row_desc().num_materialized_slots()); } RETURN_IF_ERROR(vectorized::VExprContext::filter_block(local_state._conjuncts, output_block, output_block->columns())); diff --git a/be/src/pipeline/exec/set_probe_sink_operator.cpp b/be/src/pipeline/exec/set_probe_sink_operator.cpp index bd4b7481aac240..955f956f60d6fe 100644 --- a/be/src/pipeline/exec/set_probe_sink_operator.cpp +++ b/be/src/pipeline/exec/set_probe_sink_operator.cpp @@ -57,7 +57,7 @@ Status SetProbeSinkOperatorX::init(const TPlanNode& tnode, Runtime template Status SetProbeSinkOperatorX::open(RuntimeState* state) { RETURN_IF_ERROR(DataSinkOperatorX>::open(state)); - RETURN_IF_ERROR(vectorized::VExpr::prepare(_child_exprs, state, _child_x->row_desc())); + RETURN_IF_ERROR(vectorized::VExpr::prepare(_child_exprs, state, _child->row_desc())); return vectorized::VExpr::open(_child_exprs, state); } diff --git a/be/src/pipeline/exec/set_probe_sink_operator.h b/be/src/pipeline/exec/set_probe_sink_operator.h index 3b3ed2f6a2cabd..ab53f5358c2a91 100644 --- a/be/src/pipeline/exec/set_probe_sink_operator.h +++ b/be/src/pipeline/exec/set_probe_sink_operator.h @@ -111,7 +111,7 @@ class SetProbeSinkOperatorX final : public DataSinkOperatorX _partition_exprs; - using OperatorBase::_child_x; + using OperatorBase::_child; }; } // namespace pipeline diff --git a/be/src/pipeline/exec/set_sink_operator.cpp b/be/src/pipeline/exec/set_sink_operator.cpp index 9cebcf8611edc0..38667293d4854b 100644 --- a/be/src/pipeline/exec/set_sink_operator.cpp +++ b/be/src/pipeline/exec/set_sink_operator.cpp @@ -209,7 +209,7 @@ Status SetSinkOperatorX::init(const TPlanNode& tnode, RuntimeState template Status SetSinkOperatorX::open(RuntimeState* state) { RETURN_IF_ERROR(Base::open(state)); - RETURN_IF_ERROR(vectorized::VExpr::prepare(_child_exprs, state, _child_x->row_desc())); + 
RETURN_IF_ERROR(vectorized::VExpr::prepare(_child_exprs, state, _child->row_desc())); return vectorized::VExpr::open(_child_exprs, state); } diff --git a/be/src/pipeline/exec/set_sink_operator.h b/be/src/pipeline/exec/set_sink_operator.h index 48fd7f400dd5b7..1c08eddc141f2e 100644 --- a/be/src/pipeline/exec/set_sink_operator.h +++ b/be/src/pipeline/exec/set_sink_operator.h @@ -111,7 +111,7 @@ class SetSinkOperatorX final : public DataSinkOperatorX _partition_exprs; - using OperatorBase::_child_x; + using OperatorBase::_child; }; } // namespace pipeline diff --git a/be/src/pipeline/exec/sort_sink_operator.cpp b/be/src/pipeline/exec/sort_sink_operator.cpp index bb7c38d2b709bb..b07942b9ab1c05 100644 --- a/be/src/pipeline/exec/sort_sink_operator.cpp +++ b/be/src/pipeline/exec/sort_sink_operator.cpp @@ -46,19 +46,19 @@ Status SortSinkLocalState::open(RuntimeState* state) { case TSortAlgorithm::HEAP_SORT: { _shared_state->sorter = vectorized::HeapSorter::create_unique( _vsort_exec_exprs, p._limit, p._offset, p._pool, p._is_asc_order, p._nulls_first, - p._child_x->row_desc()); + p._child->row_desc()); break; } case TSortAlgorithm::TOPN_SORT: { _shared_state->sorter = vectorized::TopNSorter::create_unique( _vsort_exec_exprs, p._limit, p._offset, p._pool, p._is_asc_order, p._nulls_first, - p._child_x->row_desc(), state, _profile); + p._child->row_desc(), state, _profile); break; } case TSortAlgorithm::FULL_SORT: { _shared_state->sorter = vectorized::FullSorter::create_unique( _vsort_exec_exprs, p._limit, p._offset, p._pool, p._is_asc_order, p._nulls_first, - p._child_x->row_desc(), state, _profile); + p._child->row_desc(), state, _profile); break; } default: { @@ -108,7 +108,7 @@ Status SortSinkOperatorX::init(const TPlanNode& tnode, RuntimeState* state) { Status SortSinkOperatorX::open(RuntimeState* state) { RETURN_IF_ERROR(DataSinkOperatorX::open(state)); - RETURN_IF_ERROR(_vsort_exec_exprs.prepare(state, _child_x->row_desc(), _row_descriptor)); + RETURN_IF_ERROR(_vsort_exec_exprs.prepare(state, _child->row_desc(), _row_descriptor)); return _vsort_exec_exprs.open(state); } diff --git a/be/src/pipeline/exec/sort_source_operator.cpp b/be/src/pipeline/exec/sort_source_operator.cpp index 17c936846e5c56..02a99e183c852e 100644 --- a/be/src/pipeline/exec/sort_source_operator.cpp +++ b/be/src/pipeline/exec/sort_source_operator.cpp @@ -42,9 +42,9 @@ Status SortSourceOperatorX::init(const TPlanNode& tnode, RuntimeState* state) { Status SortSourceOperatorX::open(RuntimeState* state) { RETURN_IF_ERROR(Base::open(state)); - // spill sort _child_x may be nullptr. - if (_child_x) { - RETURN_IF_ERROR(_vsort_exec_exprs.prepare(state, _child_x->row_desc(), _row_descriptor)); + // spill sort _child may be nullptr. 
+ if (_child) { + RETURN_IF_ERROR(_vsort_exec_exprs.prepare(state, _child->row_desc(), _row_descriptor)); RETURN_IF_ERROR(_vsort_exec_exprs.open(state)); } return Status::OK(); diff --git a/be/src/pipeline/exec/spill_sort_sink_operator.cpp b/be/src/pipeline/exec/spill_sort_sink_operator.cpp index 5f767f2b6e3ab8..4bf1ab04efb628 100644 --- a/be/src/pipeline/exec/spill_sort_sink_operator.cpp +++ b/be/src/pipeline/exec/spill_sort_sink_operator.cpp @@ -120,7 +120,7 @@ Status SpillSortSinkOperatorX::init(const TPlanNode& tnode, RuntimeState* state) _name = "SPILL_SORT_SINK_OPERATOR"; _sort_sink_operator->set_dests_id(DataSinkOperatorX::dests_id()); - RETURN_IF_ERROR(_sort_sink_operator->set_child(DataSinkOperatorX::_child_x)); + RETURN_IF_ERROR(_sort_sink_operator->set_child(DataSinkOperatorX::_child)); return _sort_sink_operator->init(tnode, state); } diff --git a/be/src/pipeline/exec/streaming_aggregation_operator.cpp b/be/src/pipeline/exec/streaming_aggregation_operator.cpp index 59e11583f003c2..dfbe42c637ea56 100644 --- a/be/src/pipeline/exec/streaming_aggregation_operator.cpp +++ b/be/src/pipeline/exec/streaming_aggregation_operator.cpp @@ -1182,7 +1182,7 @@ Status StreamingAggOperatorX::open(RuntimeState* state) { _intermediate_tuple_desc = state->desc_tbl().get_tuple_descriptor(_intermediate_tuple_id); _output_tuple_desc = state->desc_tbl().get_tuple_descriptor(_output_tuple_id); DCHECK_EQ(_intermediate_tuple_desc->slots().size(), _output_tuple_desc->slots().size()); - RETURN_IF_ERROR(vectorized::VExpr::prepare(_probe_expr_ctxs, state, _child_x->row_desc())); + RETURN_IF_ERROR(vectorized::VExpr::prepare(_probe_expr_ctxs, state, _child->row_desc())); int j = _probe_expr_ctxs.size(); for (int i = 0; i < j; ++i) { @@ -1197,7 +1197,7 @@ Status StreamingAggOperatorX::open(RuntimeState* state) { SlotDescriptor* intermediate_slot_desc = _intermediate_tuple_desc->slots()[j]; SlotDescriptor* output_slot_desc = _output_tuple_desc->slots()[j]; RETURN_IF_ERROR(_aggregate_evaluators[i]->prepare( - state, _child_x->row_desc(), intermediate_slot_desc, output_slot_desc)); + state, _child->row_desc(), intermediate_slot_desc, output_slot_desc)); _aggregate_evaluators[i]->set_version(state->be_exec_version()); } @@ -1295,7 +1295,7 @@ Status StreamingAggOperatorX::push(RuntimeState* state, vectorized::Block* in_bl if (in_block->rows() > 0) { RETURN_IF_ERROR(local_state.do_pre_agg(in_block, local_state._pre_aggregated_block.get())); } - in_block->clear_column_data(_child_x->row_desc().num_materialized_slots()); + in_block->clear_column_data(_child->row_desc().num_materialized_slots()); return Status::OK(); } diff --git a/be/src/pipeline/exec/table_function_operator.cpp b/be/src/pipeline/exec/table_function_operator.cpp index 02f61aa8fa94ea..ff9dfe632faec6 100644 --- a/be/src/pipeline/exec/table_function_operator.cpp +++ b/be/src/pipeline/exec/table_function_operator.cpp @@ -215,7 +215,7 @@ void TableFunctionLocalState::process_next_child_row() { } _child_block->clear_column_data(_parent->cast() - ._child_x->row_desc() + ._child->row_desc() .num_materialized_slots()); _cur_child_offset = -1; return; @@ -285,7 +285,7 @@ Status TableFunctionOperatorX::open(doris::RuntimeState* state) { } // get all input slots - for (const auto& child_tuple_desc : _child_x->row_desc().tuple_descriptors()) { + for (const auto& child_tuple_desc : _child->row_desc().tuple_descriptors()) { for (const auto& child_slot_desc : child_tuple_desc->slots()) { _child_slots.push_back(child_slot_desc); } diff --git 
a/be/src/pipeline/exec/union_sink_operator.cpp b/be/src/pipeline/exec/union_sink_operator.cpp index 06f301bc75ba40..288fc131037fab 100644 --- a/be/src/pipeline/exec/union_sink_operator.cpp +++ b/be/src/pipeline/exec/union_sink_operator.cpp @@ -74,7 +74,7 @@ Status UnionSinkOperatorX::init(const TPlanNode& tnode, RuntimeState* state) { Status UnionSinkOperatorX::open(RuntimeState* state) { RETURN_IF_ERROR(DataSinkOperatorX::open(state)); - RETURN_IF_ERROR(vectorized::VExpr::prepare(_child_expr, state, _child_x->row_desc())); + RETURN_IF_ERROR(vectorized::VExpr::prepare(_child_expr, state, _child->row_desc())); RETURN_IF_ERROR(vectorized::VExpr::check_expr_output_type(_child_expr, _row_descriptor)); // open const expr lists. RETURN_IF_ERROR(vectorized::VExpr::open(_const_expr, state)); diff --git a/be/src/pipeline/local_exchange/local_exchange_sink_operator.cpp b/be/src/pipeline/local_exchange/local_exchange_sink_operator.cpp index f5f3155b2d3d4d..19c37f3649bcc7 100644 --- a/be/src/pipeline/local_exchange/local_exchange_sink_operator.cpp +++ b/be/src/pipeline/local_exchange/local_exchange_sink_operator.cpp @@ -71,7 +71,7 @@ Status LocalExchangeSinkOperatorX::init(ExchangeType type, const int num_buckets Status LocalExchangeSinkOperatorX::open(RuntimeState* state) { RETURN_IF_ERROR(DataSinkOperatorX::open(state)); if (_type == ExchangeType::HASH_SHUFFLE || _type == ExchangeType::BUCKET_HASH_SHUFFLE) { - RETURN_IF_ERROR(_partitioner->prepare(state, _child_x->row_desc())); + RETURN_IF_ERROR(_partitioner->prepare(state, _child->row_desc())); RETURN_IF_ERROR(_partitioner->open(state)); } diff --git a/be/src/pipeline/local_exchange/local_exchange_source_operator.h b/be/src/pipeline/local_exchange/local_exchange_source_operator.h index ad23cb96aef6fe..c0da5c8120c1e9 100644 --- a/be/src/pipeline/local_exchange/local_exchange_source_operator.h +++ b/be/src/pipeline/local_exchange/local_exchange_source_operator.h @@ -72,10 +72,10 @@ class LocalExchangeSourceOperatorX final : public OperatorXintermediate_row_desc(); + return _child->intermediate_row_desc(); } - RowDescriptor& row_descriptor() override { return _child_x->row_descriptor(); } - const RowDescriptor& row_desc() const override { return _child_x->row_desc(); } + RowDescriptor& row_descriptor() override { return _child->row_descriptor(); } + const RowDescriptor& row_desc() const override { return _child->row_desc(); } Status get_block(RuntimeState* state, vectorized::Block* block, bool* eos) override; diff --git a/be/src/pipeline/pipeline_fragment_context.cpp b/be/src/pipeline/pipeline_fragment_context.cpp index 38a99dd66d421f..a489273b68d129 100644 --- a/be/src/pipeline/pipeline_fragment_context.cpp +++ b/be/src/pipeline/pipeline_fragment_context.cpp @@ -706,7 +706,7 @@ Status PipelineFragmentContext::_add_local_exchange_impl( const std::map& bucket_seq_to_instance_idx, const std::map& shuffle_idx_to_instance_idx, const bool ignore_data_hash_distribution) { - auto& operator_xs = cur_pipe->operators(); + auto& operators = cur_pipe->operators(); const auto downstream_pipeline_id = cur_pipe->id(); auto local_exchange_id = next_operator_id(); // 1. Create a new pipeline with local exchange sink. @@ -717,8 +717,8 @@ Status PipelineFragmentContext::_add_local_exchange_impl( * `bucket_seq_to_instance_idx` is empty if no scan operator is contained in this fragment. * So co-located operators(e.g. Agg, Analytic) should use `HASH_SHUFFLE` instead of `BUCKET_HASH_SHUFFLE`. */ - const bool followed_by_shuffled_join = operator_xs.size() > idx - ? 
operator_xs[idx]->followed_by_shuffled_join() + const bool followed_by_shuffled_join = operators.size() > idx + ? operators[idx]->followed_by_shuffled_join() : cur_pipe->sink()->followed_by_shuffled_join(); const bool should_disable_bucket_shuffle = bucket_seq_to_instance_idx.empty() && @@ -790,7 +790,7 @@ Status PipelineFragmentContext::_add_local_exchange_impl( } break; case ExchangeType::LOCAL_MERGE_SORT: { - auto child_op = cur_pipe->sink()->child_x(); + auto child_op = cur_pipe->sink()->child(); auto sort_source = std::dynamic_pointer_cast(child_op); if (!sort_source) { return Status::InternalError( @@ -825,21 +825,21 @@ Status PipelineFragmentContext::_add_local_exchange_impl( // pipeline1 [Scan - LocalExchangeSink] and pipeline2 [LocalExchangeSource - AggSink]. // 3.1 Initialize new pipeline's operator list. - std::copy(operator_xs.begin(), operator_xs.begin() + idx, + std::copy(operators.begin(), operators.begin() + idx, std::inserter(new_pip->operators(), new_pip->operators().end())); // 3.2 Erase unused operators in previous pipeline. - operator_xs.erase(operator_xs.begin(), operator_xs.begin() + idx); + operators.erase(operators.begin(), operators.begin() + idx); // 4. Initialize LocalExchangeSource and insert it into this pipeline. OperatorPtr source_op; source_op.reset(new LocalExchangeSourceOperatorX(pool, local_exchange_id)); RETURN_IF_ERROR(source_op->set_child(new_pip->operators().back())); RETURN_IF_ERROR(source_op->init(data_distribution.distribution_type)); - if (!operator_xs.empty()) { - RETURN_IF_ERROR(operator_xs.front()->set_child(source_op)); + if (!operators.empty()) { + RETURN_IF_ERROR(operators.front()->set_child(source_op)); } - operator_xs.insert(operator_xs.begin(), source_op); + operators.insert(operators.begin(), source_op); shared_state->create_dependencies(local_exchange_id); @@ -896,8 +896,8 @@ Status PipelineFragmentContext::_add_local_exchange( } *do_local_exchange = true; - auto& operator_xs = cur_pipe->operators(); - auto total_op_num = operator_xs.size(); + auto& operators = cur_pipe->operators(); + auto total_op_num = operators.size(); auto new_pip = add_pipeline(cur_pipe, pip_idx + 1); RETURN_IF_ERROR(_add_local_exchange_impl( idx, pool, cur_pipe, new_pip, data_distribution, do_local_exchange, num_buckets, @@ -1653,8 +1653,8 @@ void PipelineFragmentContext::_close_fragment_instance() { } if (_query_ctx->enable_profile()) { - _query_ctx->add_fragment_profile(_fragment_id, collect_realtime_profile_x(), - collect_realtime_load_channel_profile_x()); + _query_ctx->add_fragment_profile(_fragment_id, collect_realtime_profile(), + collect_realtime_load_channel_profile()); } // all submitted tasks done @@ -1724,7 +1724,7 @@ std::string PipelineFragmentContext::debug_string() { } std::vector> -PipelineFragmentContext::collect_realtime_profile_x() const { +PipelineFragmentContext::collect_realtime_profile() const { std::vector> res; // we do not have mutex to protect pipeline_id_to_profile @@ -1749,7 +1749,7 @@ PipelineFragmentContext::collect_realtime_profile_x() const { } std::shared_ptr -PipelineFragmentContext::collect_realtime_load_channel_profile_x() const { +PipelineFragmentContext::collect_realtime_load_channel_profile() const { // we do not have mutex to protect pipeline_id_to_profile // so we need to make sure this funciton is invoked after fragment context // has already been prepared. 
diff --git a/be/src/pipeline/pipeline_fragment_context.h b/be/src/pipeline/pipeline_fragment_context.h index 75f3f22c68131c..f46835e95e0647 100644 --- a/be/src/pipeline/pipeline_fragment_context.h +++ b/be/src/pipeline/pipeline_fragment_context.h @@ -69,8 +69,8 @@ class PipelineFragmentContext : public TaskExecutionContext { ~PipelineFragmentContext(); - std::vector> collect_realtime_profile_x() const; - std::shared_ptr collect_realtime_load_channel_profile_x() const; + std::vector> collect_realtime_profile() const; + std::shared_ptr collect_realtime_load_channel_profile() const; bool is_timeout(timespec now) const; diff --git a/be/src/runtime/query_context.cpp b/be/src/runtime/query_context.cpp index 97aba2cae286c8..b9430d3899b8d3 100644 --- a/be/src/runtime/query_context.cpp +++ b/be/src/runtime/query_context.cpp @@ -401,7 +401,7 @@ QueryContext::_collect_realtime_query_profile() const { continue; } - auto profile = fragment_ctx->collect_realtime_profile_x(); + auto profile = fragment_ctx->collect_realtime_profile(); if (profile.empty()) { std::string err_msg = fmt::format( From 948fb2ab57a8f7f9fcc26b6c45f1e13ecde87a51 Mon Sep 17 00:00:00 2001 From: seawinde <149132972+seawinde@users.noreply.github.com> Date: Tue, 10 Sep 2024 17:06:45 +0800 Subject: [PATCH 32/44] [improvement](mtmv) Optimize the materialized view hint info when explain (#39998) Optimize the materialized view info in explain output: add a double horizontal dividing line between `MATERIALIZATIONS` and `STATISTICS`. Previously, one might think the rewrite failed because of `planed with unknown column statistics`, when the rewrite actually succeeded. Before: | ========== MATERIALIZATIONS ========== | | | | MaterializedView | | MaterializedViewRewriteSuccessAndChose: | | internal#regression_test_nereids_rules_p0_mv_agg_with_roll_up#mv13_1 chose, | | | | MaterializedViewRewriteSuccessButNotChose: | | not chose: none, | | | | MaterializedViewRewriteFail: | | | | Statistics | | planed with unknown column statistics After: | ========== MATERIALIZATIONS ========== | | | | MaterializedView | | MaterializedViewRewriteSuccessAndChose: | | internal.regression_test_nereids_rules_p0_mv_agg_with_roll_up.mv13_1 chose, | | | | MaterializedViewRewriteSuccessButNotChose: | | not chose: none, | | | | MaterializedViewRewriteFail: | | | | | | ========== STATISTICS ========== | | planed with unknown column statistics --- .../java/org/apache/doris/nereids/NereidsPlanner.java | 10 ++++++---- .../rules/exploration/mv/MaterializationContext.java | 2 +- .../mv/same_name/sync_async_same_name.groovy | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java index a304fe36062c80..bd74c5835e287f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java @@ -564,9 +564,10 @@ public String getExplainString(ExplainOptions explainOptions) { String plan = ""; String mvSummary = ""; if (this.getPhysicalPlan() != null && cascadesContext != null) { - mvSummary = "\n\n========== MATERIALIZATIONS ==========\n" - + MaterializationContext.toSummaryString(cascadesContext.getMaterializationContexts(), - this.getPhysicalPlan()); + mvSummary = cascadesContext.getMaterializationContexts().isEmpty() ?
"" : + "\n\n========== MATERIALIZATIONS ==========\n" + + MaterializationContext.toSummaryString(cascadesContext.getMaterializationContexts(), + this.getPhysicalPlan()); } switch (explainLevel) { case PARSED_PLAN: @@ -625,9 +626,10 @@ public String getExplainString(ExplainOptions explainOptions) { default: plan = super.getExplainString(explainOptions); plan += mvSummary; + plan += "\n\n\n========== STATISTICS ==========\n"; if (statementContext != null) { if (statementContext.isHasUnknownColStats()) { - plan += "\n\nStatistics\n planed with unknown column statistics\n"; + plan += "planed with unknown column statistics\n"; } } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/exploration/mv/MaterializationContext.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/exploration/mv/MaterializationContext.java index 7913c47b36a8dd..609125280ded4b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/exploration/mv/MaterializationContext.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/exploration/mv/MaterializationContext.java @@ -408,7 +408,7 @@ public Void visitPhysicalRelation(PhysicalRelation physicalRelation, Void contex } private static String generateIdentifierName(List qualifiers) { - return String.join("#", qualifiers); + return String.join(".", qualifiers); } @Override diff --git a/regression-test/suites/nereids_rules_p0/mv/same_name/sync_async_same_name.groovy b/regression-test/suites/nereids_rules_p0/mv/same_name/sync_async_same_name.groovy index e8350d487a88ef..20dbc0a083158f 100644 --- a/regression-test/suites/nereids_rules_p0/mv/same_name/sync_async_same_name.groovy +++ b/regression-test/suites/nereids_rules_p0/mv/same_name/sync_async_same_name.groovy @@ -165,7 +165,7 @@ suite("sync_async_same_name") { check {result -> def splitResult = result.split("MaterializedViewRewriteFail") splitResult.length == 2 ? 
splitResult[0].contains(common_mv_name) - && splitResult[0].contains("orders#${common_mv_name}") : false + && splitResult[0].contains("orders.${common_mv_name}") : false } } From c7f57ae1eabee22f192af4bba4f796b241a24660 Mon Sep 17 00:00:00 2001 From: wuwenchi Date: Tue, 10 Sep 2024 17:44:31 +0800 Subject: [PATCH 33/44] [bugfix](hive)Delete the temporarily created folder (#40424) ## Proposed changes Delete the temporarily created folder, otherwise it will cause too many folders on hdfs: > The directory item limit of xxx is exceeded: limit=xxx items=xxx --- .../doris/datasource/hive/HMSTransaction.java | 2 ++ .../doris/datasource/hive/HmsCommitTest.java | 22 +++++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSTransaction.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSTransaction.java index ac610237cddfcc..6183c277c1bdf5 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSTransaction.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSTransaction.java @@ -1484,6 +1484,8 @@ public void doCommit() { runS3cleanWhenSuccess(); doAddPartitionsTask(); doUpdateStatisticsTasks(); + //delete write path + pruneAndDeleteStagingDirectories(); doNothing(); } diff --git a/fe/fe-core/src/test/java/org/apache/doris/datasource/hive/HmsCommitTest.java b/fe/fe-core/src/test/java/org/apache/doris/datasource/hive/HmsCommitTest.java index 7e99667b73124b..395a063fbc8a24 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/datasource/hive/HmsCommitTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/datasource/hive/HmsCommitTest.java @@ -17,6 +17,7 @@ package org.apache.doris.datasource.hive; +import org.apache.doris.backup.Status; import org.apache.doris.catalog.Column; import org.apache.doris.catalog.PrimitiveType; import org.apache.doris.common.info.SimpleTableInfo; @@ -169,15 +170,21 @@ public void testNewPartitionForUnPartitionedTable() throws IOException { @Test public void testAppendPartitionForUnPartitionedTable() throws IOException { genQueryID(); - System.out.println(DebugUtil.printId(connectContext.queryId())); List pus = new ArrayList<>(); pus.add(createRandomAppend(null)); pus.add(createRandomAppend(null)); pus.add(createRandomAppend(null)); + new MockUp(HMSTransaction.HmsCommitter.class) { + @Mock + private void doNothing() { + Assert.assertEquals(Status.ErrCode.NOT_FOUND, fs.exists(getWritePath()).getErrCode()); + } + }; commit(dbName, tbWithoutPartition, pus); Table table = hmsClient.getTable(dbName, tbWithoutPartition); assertNumRows(3, table); + genQueryID(); List pus2 = new ArrayList<>(); pus2.add(createRandomAppend(null)); @@ -204,6 +211,12 @@ public void testOverwritePartitionForUnPartitionedTable() throws IOException { @Test public void testNewPartitionForPartitionedTable() throws IOException { + new MockUp(HMSTransaction.HmsCommitter.class) { + @Mock + private void doNothing() { + Assert.assertEquals(Status.ErrCode.NOT_FOUND, fs.exists(getWritePath()).getErrCode()); + } + }; genQueryID(); List pus = new ArrayList<>(); pus.add(createRandomNew("a")); @@ -377,6 +390,11 @@ public THivePartitionUpdate createRandomOverwrite(String partition) throws IOExc genOnePartitionUpdate("c3=" + partition, TUpdateMode.OVERWRITE); } + private String getWritePath() { + String queryId = DebugUtil.printId(ConnectContext.get().queryId()); + return writeLocation + queryId + "/"; + } + public void commit(String dbName, String tableName, List hivePUs) { @@ 
-385,7 +403,7 @@ public void commit(String dbName, HiveInsertCommandContext ctx = new HiveInsertCommandContext(); String queryId = DebugUtil.printId(ConnectContext.get().queryId()); ctx.setQueryId(queryId); - ctx.setWritePath(writeLocation + queryId + "/"); + ctx.setWritePath(getWritePath()); hmsTransaction.beginInsertTable(ctx); hmsTransaction.finishInsertTable(new SimpleTableInfo(dbName, tableName)); hmsTransaction.commit(); From 9af79c54b26a2e16b6cae26cf134b63d7be824de Mon Sep 17 00:00:00 2001 From: Mingyu Chen Date: Tue, 10 Sep 2024 18:14:18 +0800 Subject: [PATCH 34/44] [ci](ga) make clang format required (#40626) --- .asf.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.asf.yaml b/.asf.yaml index 6ff16967c2e415..e71e55de23fc83 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -49,6 +49,7 @@ github: strict: false contexts: - License Check + - Clang Formatter - CheckStyle - P0 Regression (Doris Regression) - External Regression (Doris External Regression) @@ -86,6 +87,7 @@ github: strict: false contexts: - License Check + - Clang Formatter - CheckStyle - Build Broker - ShellCheck @@ -107,6 +109,7 @@ github: strict: false contexts: - License Check + - Clang Formatter - CheckStyle - P0 Regression (Doris Regression) - External Regression (Doris External Regression) From c48c1ebc5a2aecdcec87394df0c5d5dac95a0009 Mon Sep 17 00:00:00 2001 From: zclllhhjj Date: Tue, 10 Sep 2024 19:18:29 +0800 Subject: [PATCH 35/44] [Enhancement](auto-partition) Re-add deduplication to auto partition rpc (#40580) ## Proposed changes Issue Number: close #xxx removed in https://github.com/apache/doris/pull/27817. we need it so re add it. --- be/src/pipeline/exec/hashjoin_build_sink.h | 5 ++-- be/src/vec/sink/vrow_distribution.cpp | 6 +++-- be/src/vec/sink/vrow_distribution.h | 28 ++++++++++++++++++---- 3 files changed, 29 insertions(+), 10 deletions(-) diff --git a/be/src/pipeline/exec/hashjoin_build_sink.h b/be/src/pipeline/exec/hashjoin_build_sink.h index b7ae612510fcb4..cf677833fb5b64 100644 --- a/be/src/pipeline/exec/hashjoin_build_sink.h +++ b/be/src/pipeline/exec/hashjoin_build_sink.h @@ -132,9 +132,8 @@ class HashJoinBuildSinkOperatorX final if (_join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { return {ExchangeType::NOOP}; } else if (_is_broadcast_join) { - return _child->ignore_data_distribution() - ? DataDistribution(ExchangeType::PASS_TO_ONE) - : DataDistribution(ExchangeType::NOOP); + return _child->ignore_data_distribution() ? 
DataDistribution(ExchangeType::PASS_TO_ONE) + : DataDistribution(ExchangeType::NOOP); } return _join_distribution == TJoinDistributionType::BUCKET_SHUFFLE || _join_distribution == TJoinDistributionType::COLOCATE diff --git a/be/src/vec/sink/vrow_distribution.cpp b/be/src/vec/sink/vrow_distribution.cpp index d45aa2ea911f2e..3a4c7e911f4c14 100644 --- a/be/src/vec/sink/vrow_distribution.cpp +++ b/be/src/vec/sink/vrow_distribution.cpp @@ -68,8 +68,10 @@ Status VRowDistribution::_save_missing_values( } cur_row_values.push_back(node); } - //For duplicate cur_values, they will be filtered in FE - _partitions_need_create.emplace_back(cur_row_values); + if (!_deduper.contains(cur_row_values)) { + _deduper.insert(cur_row_values); + _partitions_need_create.emplace_back(cur_row_values); + } } // to avoid too large mem use diff --git a/be/src/vec/sink/vrow_distribution.h b/be/src/vec/sink/vrow_distribution.h index 5267b488400b8f..fffe0e3f7f1887 100644 --- a/be/src/vec/sink/vrow_distribution.h +++ b/be/src/vec/sink/vrow_distribution.h @@ -24,7 +24,9 @@ #include #include +#include #include +#include #include #include "common/status.h" @@ -133,6 +135,10 @@ class VRowDistribution { Status automatic_create_partition(); void clear_batching_stats(); + // for auto partition + std::unique_ptr _batching_block; + bool _deal_batched = false; // If true, send batched block before any block's append. + private: std::pair _get_partition_function(); @@ -170,17 +176,29 @@ class VRowDistribution { int64_t rows); void _reset_find_tablets(int64_t rows); + struct NullableStringListHash { + std::size_t _hash(const TNullableStringLiteral& arg) const { + if (arg.is_null) { + return 0; + } + return std::hash()(arg.value); + } + std::size_t operator()(const std::vector& arg) const { + std::size_t result = 0; + for (const auto& v : arg) { + result = (result << 1) ^ _hash(v); + } + return result; + } + }; + RuntimeState* _state = nullptr; int _batch_size = 0; // for auto partitions std::vector> _partitions_need_create; - -public: - std::unique_ptr _batching_block; - bool _deal_batched = false; // If true, send batched block before any block's append. -private: size_t _batching_rows = 0, _batching_bytes = 0; + std::unordered_set, NullableStringListHash> _deduper; OlapTableBlockConvertor* _block_convertor = nullptr; OlapTabletFinder* _tablet_finder = nullptr; From b2eef7652fa4dbeee019798fb0499381932ec8ba Mon Sep 17 00:00:00 2001 From: zhiqiang Date: Tue, 10 Sep 2024 20:20:05 +0800 Subject: [PATCH 36/44] [fix](regression-test) Fix scanner profile test failed occasionally (#40566) Collection of profile is async. So wait 5 seconds before get profile. adaptive_pipeline_task_serial_read_on_limit will fail on multi backends env, since tablets num on each be will not be 10. 
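As an illustration of the async point above (not part of this patch): instead of a fixed `Thread.sleep`, the wait could be bounded and polled. `getProfile` is the suite helper these tests already use; the polling wrapper itself is a hypothetical sketch.

```groovy
// Hypothetical sketch: poll the asynchronously collected profile with a bounded
// wait instead of sleeping a fixed 5 seconds before reading it.
def waitForProfile = { String queryId, long timeoutMs = 5000, long intervalMs = 500 ->
    long waited = 0
    String profile = getProfile(queryId).toString()
    while (!profile.contains("MaxScannerThreadNum") && waited < timeoutMs) {
        Thread.sleep(intervalMs)
        waited += intervalMs
        profile = getProfile(queryId).toString()
    }
    return profile
}
```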
--- ..._pipeline_task_serial_read_on_limit.groovy | 21 ++++++++----------- .../query_profile/scanner_profile.groovy | 4 ++++ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/regression-test/suites/query_profile/adaptive_pipeline_task_serial_read_on_limit.groovy b/regression-test/suites/query_profile/adaptive_pipeline_task_serial_read_on_limit.groovy index 15481fe7c9a8dc..46ff11b7845c91 100644 --- a/regression-test/suites/query_profile/adaptive_pipeline_task_serial_read_on_limit.groovy +++ b/regression-test/suites/query_profile/adaptive_pipeline_task_serial_read_on_limit.groovy @@ -116,6 +116,8 @@ suite('adaptive_pipeline_task_serial_read_on_limit') { sql "set enable_profile=false" + Thread.sleep(5) + def wholeString = getProfileList() List profileData = new JsonSlurper().parseText(wholeString).data.rows String queryIdNoLimit1 = ""; @@ -149,27 +151,22 @@ suite('adaptive_pipeline_task_serial_read_on_limit') { } } - logger.info("queryIdNoLimit1_${uuidString}: {}", queryIdNoLimit1) logger.info("queryIdWithLimit1_${uuidString}: {}", queryIdWithLimit1) - logger.info("queryIdWithLimit2_${uuidString}: {}", queryIdWithLimit2) - logger.info("queryIDNotEnableLimit_${uuidString}: {}", queryIDNotEnableLimit) logger.info("queryIdModifyTo20_${uuidString}: {}", queryIdModifyTo20) - assertTrue(queryIdNoLimit1 != "") assertTrue(queryIdWithLimit1 != "") - assertTrue(queryIdWithLimit2 != "") - assertTrue(queryIDNotEnableLimit != "") assertTrue(queryIdModifyTo20 != "") - def String profileNoLimit1 = getProfile(queryIdNoLimit1).toString() def String profileWithLimit1 = getProfile(queryIdWithLimit1).toString() - def String profileWithLimit2 = getProfile(queryIdWithLimit2).toString() - def String profileNotEnableLimit = getProfile(queryIDNotEnableLimit).toString() def String profileModifyTo20 = getProfile(queryIdModifyTo20).toString() - assertTrue(profileNoLimit1.contains("- MaxScannerThreadNum: 10")) + if (!profileWithLimit1.contains("- MaxScannerThreadNum: 1")) { + logger.info("profileWithLimit1:\n{}", profileWithLimit1) + } assertTrue(profileWithLimit1.contains("- MaxScannerThreadNum: 1")) - assertTrue(profileWithLimit2.contains("- MaxScannerThreadNum: 10")) - assertTrue(profileNotEnableLimit.contains("- MaxScannerThreadNum: 10")) + + if (!profileModifyTo20.contains("- MaxScannerThreadNum: 1")) { + logger.info("profileModifyTo20:\n{}", profileModifyTo20) + } assertTrue(profileModifyTo20.contains("- MaxScannerThreadNum: 1")) } \ No newline at end of file diff --git a/regression-test/suites/query_profile/scanner_profile.groovy b/regression-test/suites/query_profile/scanner_profile.groovy index 38216d211e65ea..75ae6a5ab65a9b 100644 --- a/regression-test/suites/query_profile/scanner_profile.groovy +++ b/regression-test/suites/query_profile/scanner_profile.groovy @@ -98,6 +98,10 @@ suite('scanner_profile') { logger.info("queryIdWithLimit1_${uuidString}: {}", queryIdWithLimit1) assertTrue(queryIdWithLimit1 != "") + + // Sleep 5 seconds to make sure profile collection is done + Thread.sleep(5000) + def String profileWithLimit1 = getProfile(queryIdWithLimit1).toString() logger.info("query profile {}", profileWithLimit1) assertTrue(profileWithLimit1.contains("- PeakRunningScanner: 1")) From b7e8d8238492f7e42ee2a682350d5c3b44ad58e5 Mon Sep 17 00:00:00 2001 From: Calvin Kirs Date: Tue, 10 Sep 2024 21:33:06 +0800 Subject: [PATCH 37/44] [fix](metadata)Add FE metadata-related file checks (#40546) ## Proposed changes Issue Number: close #xxx --- .../java/org/apache/doris/common/Config.java | 4 + 
.../org/apache/doris/master/MetaHelper.java | 93 +++++++++++++++++-- .../apache/doris/master/MetaHelperTest.java | 47 ++++++++++ 3 files changed, 134 insertions(+), 10 deletions(-) diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java index 360ec7ae6035c7..f12fff59c0355f 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java +++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java @@ -3052,4 +3052,8 @@ public static int metaServiceRpcRetryTimes() { @ConfField(mutable = true, description = {"表示最大锁持有时间,超过该时间会打印告警日志,单位秒", "Maximum lock hold time; logs a warning if exceeded"}) public static long max_lock_hold_threshold_seconds = 10; + + @ConfField(mutable = true, description = {"元数据同步是否开启安全模式", + "Is metadata synchronization enabled in safe mode"}) + public static boolean meta_helper_security_mode = false; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/master/MetaHelper.java b/fe/fe-core/src/main/java/org/apache/doris/master/MetaHelper.java index e4fd5cacf71262..cf63a82cd870d8 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/master/MetaHelper.java +++ b/fe/fe-core/src/main/java/org/apache/doris/master/MetaHelper.java @@ -18,6 +18,7 @@ package org.apache.doris.master; import org.apache.doris.catalog.Env; +import org.apache.doris.common.Config; import org.apache.doris.common.io.IOUtils; import org.apache.doris.common.util.HttpURLUtil; import org.apache.doris.httpv2.entity.ResponseBody; @@ -32,7 +33,6 @@ import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; -import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStreamReader; @@ -47,6 +47,8 @@ public class MetaHelper { public static final String X_IMAGE_MD5 = "X-Image-Md5"; private static final int BUFFER_BYTES = 8 * 1024; private static final int CHECKPOINT_LIMIT_BYTES = 30 * 1024 * 1024; + private static final String VALID_FILENAME_REGEX = "^image\\.\\d+(\\.part)?$"; + public static File getMasterImageDir() { String metaDir = Env.getCurrentEnv().getImageDir(); @@ -57,24 +59,89 @@ public static int getLimit() { return CHECKPOINT_LIMIT_BYTES; } + private static void completeCheck(File dir, File file, File newFile) throws IOException { + if (!Config.meta_helper_security_mode) { + return; + } + String dirPath = dir.getCanonicalPath(); // Get the canonical path of the directory + String filePath = file.getCanonicalPath(); // Get the canonical path of the original file + String newFilePath = newFile.getCanonicalPath(); // Get the canonical path of the new file + + // Ensure both file paths are within the specified directory to prevent path traversal attacks + if (!filePath.startsWith(dirPath) || !newFilePath.startsWith(dirPath)) { + throw new SecurityException("File path traversal attempt detected."); + } + + // Ensure the original file exists and is a valid file to avoid renaming a non-existing file + if (!file.exists() || !file.isFile()) { + throw new IOException("Source file does not exist or is not a valid file."); + } + + } + // rename the .PART_SUFFIX file to filename public static File complete(String filename, File dir) throws IOException { - File file = new File(dir, filename + MetaHelper.PART_SUFFIX); - File newFile = new File(dir, filename); + // Validate that the filename does not contain illegal path elements + checkIsValidFileName(filename); + + File file = new File(dir, filename + MetaHelper.PART_SUFFIX); // 
Original file with a specific suffix + File newFile = new File(dir, filename); // Target file without the suffix + + completeCheck(dir, file, newFile); + // Attempt to rename the file. If it fails, throw an exception if (!file.renameTo(newFile)) { - throw new IOException("Complete file" + filename + " failed"); + throw new IOException("Complete file " + filename + " failed"); } - return newFile; + + return newFile; // Return the newly renamed file } - public static OutputStream getOutputStream(String filename, File dir) - throws FileNotFoundException { + public static File getFile(String filename, File dir) throws IOException { + checkIsValidFileName(filename); File file = new File(dir, filename + MetaHelper.PART_SUFFIX); - return new FileOutputStream(file); + checkFile(dir, file); + return file; + } + + private static void checkFile(File dir, File file) throws IOException { + if (!Config.meta_helper_security_mode) { + return; + } + String dirPath = dir.getCanonicalPath(); + String filePath = file.getCanonicalPath(); + + if (!filePath.startsWith(dirPath)) { + throw new SecurityException("File path traversal attempt detected."); + } } - public static File getFile(String filename, File dir) { - return new File(dir, filename + MetaHelper.PART_SUFFIX); + + private static void checkIsValidFileName(String filename) { + if (!Config.meta_helper_security_mode) { + return; + } + if (!filename.matches(VALID_FILENAME_REGEX)) { + throw new IllegalArgumentException("Invalid filename"); + } + } + + private static void checkFile(File file) throws IOException { + if (!Config.meta_helper_security_mode) { + return; + } + if (!file.getAbsolutePath().startsWith(file.getCanonicalFile().getParent())) { + throw new IllegalArgumentException("Invalid file path"); + } + + File parentDir = file.getParentFile(); + if (!parentDir.canWrite()) { + throw new IOException("No write permission in directory: " + parentDir); + } + + if (file.exists() && !file.delete()) { + throw new IOException("Failed to delete existing file: " + file); + } + checkIsValidFileName(file.getName()); } public static ResponseBody doGet(String url, int timeout, Class clazz) throws IOException { @@ -88,6 +155,8 @@ public static ResponseBody doGet(String url, int timeout, Class clazz) th public static void getRemoteFile(String urlStr, int timeout, File file) throws IOException { HttpURLConnection conn = null; + checkFile(file); + boolean md5Matched = true; OutputStream out = new FileOutputStream(file); try { conn = HttpURLUtil.getConnectionWithNodeIdent(urlStr); @@ -117,6 +186,7 @@ public static void getRemoteFile(String urlStr, int timeout, File file) if (remoteMd5 != null) { String localMd5 = DigestUtils.md5Hex(new FileInputStream(file)); if (!remoteMd5.equals(localMd5)) { + md5Matched = false; throw new IOException("Unexpected image md5, expected: " + remoteMd5 + ", actual: " + localMd5); } } @@ -127,6 +197,9 @@ public static void getRemoteFile(String urlStr, int timeout, File file) if (out != null) { out.close(); } + if (!md5Matched && file.exists() & Config.meta_helper_security_mode) { + file.delete(); + } } } diff --git a/fe/fe-core/src/test/java/org/apache/doris/master/MetaHelperTest.java b/fe/fe-core/src/test/java/org/apache/doris/master/MetaHelperTest.java index 070979494bfd6c..40083abf956aa4 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/master/MetaHelperTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/master/MetaHelperTest.java @@ -17,6 +17,7 @@ package org.apache.doris.master; +import org.apache.doris.common.Config; 
import org.apache.doris.httpv2.entity.ResponseBody; import org.apache.doris.httpv2.rest.RestApiStatusCode; import org.apache.doris.persist.StorageInfo; @@ -25,6 +26,11 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.junit.Assert; import org.junit.Test; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; + +import java.io.File; +import java.io.IOException; public class MetaHelperTest { @@ -49,4 +55,45 @@ private ResponseBody buildResponseBody() { bodyBefore.setMsg("msg"); return bodyBefore; } + + File tempDir = new File(System.getProperty("java.io.tmpdir"), "tempDir"); + + @BeforeEach + void setUp() { + + if (tempDir.exists()) { + tempDir.delete(); + } + tempDir.mkdir(); + } + + @Test + public void testFile() throws IOException { + + String errorFilename = "testfile."; + File errorFileWithSuffix = new File(tempDir, errorFilename); + String rightFilename = "image.1"; + File rightFileWithSuffix = new File(tempDir, rightFilename); + + Config.meta_helper_security_mode = true; + + if (errorFileWithSuffix.exists()) { + errorFileWithSuffix.delete(); + } + Assert.assertThrows(IllegalArgumentException.class, () -> MetaHelper.complete(errorFilename, tempDir)); + Assert.assertThrows(IllegalArgumentException.class, () -> MetaHelper.getFile(errorFilename, tempDir)); + if (rightFileWithSuffix.exists()) { + rightFileWithSuffix.delete(); + } + Assert.assertEquals(rightFileWithSuffix.getName() + ".part", MetaHelper.getFile(rightFilename, tempDir).getName()); + + } + + @AfterEach + public void tearDown() { + if (tempDir.exists()) { + tempDir.delete(); + } + } + } From 3f2e98a0b1b50c16d78d573886f3debd6b4ff284 Mon Sep 17 00:00:00 2001 From: Yongqiang YANG <98214048+dataroaring@users.noreply.github.com> Date: Tue, 10 Sep 2024 23:45:34 +0800 Subject: [PATCH 38/44] [improvement](fdb) add a tool which deploys and starts fdb (#39803) --- tools/fdb/fdb_ctl.sh | 418 ++++++++++++++++++++++++++++++++++++++++++ tools/fdb/fdb_vars.sh | 72 ++++++++ 2 files changed, 490 insertions(+) create mode 100755 tools/fdb/fdb_ctl.sh create mode 100644 tools/fdb/fdb_vars.sh diff --git a/tools/fdb/fdb_ctl.sh b/tools/fdb/fdb_ctl.sh new file mode 100755 index 00000000000000..9c809abd5d4a50 --- /dev/null +++ b/tools/fdb/fdb_ctl.sh @@ -0,0 +1,418 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# +# 1. Run fdb_ctrl.sh deploy on each machine to deploy FoundationDB. +# This will create the necessary directories, configuration files. +# +# 2. Run fdb_ctrl.sh start on each machine to start the fdb cluster +# and get the cluster connection string. 
+# + +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")" &>/dev/null && pwd)" + +if [[ -f "${ROOT_DIR}/fdb_vars.sh" ]]; then + source "${ROOT_DIR}/fdb_vars.sh" +else + echo "Please create fdb_vars.sh first" + exit 1 +fi + +if [[ ! -d "${FDB_HOME}" ]]; then + echo "Please set and create FDB_HOME first" + exit 1 +fi + +if [[ ! "${FDB_HOME}" = /* ]]; then + echo "${FDB_HOME} is not an absolute path." + exit 1 +fi + +if [[ -z ${FDB_CLUSTER_ID} ]]; then + echo "Please set FDB_CLUSTER_ID first" + exit 1 +fi + +# TODO verify config + +FDB_CLUSTER_DESC=${FDB_CLUSTER_DESC:-"doris-fdb"} + +# A dir to provide FDB binary pkgs +FDB_PKG_DIR=${ROOT_DIR}/pkgs/${FDB_VERSION} + +FDB_PORT=${FDB_PORT:-4500} + +LOG_DIR=${LOG_DIR:-${FDB_HOME}/log} + +mkdir -p "${LOG_DIR}" +mkdir -p "${FDB_HOME}"/conf +mkdir -p "${FDB_HOME}"/log + +function ensure_port_is_listenable() { + local component="$1" + local port="$2" + + if lsof -nP -iTCP:"${port}" -sTCP:LISTEN >/dev/null; then + echo "The port ${port} of ${component} is occupied" + exit 1 + fi +} + +function download_fdb() { + if [[ -d "${FDB_PKG_DIR}" ]]; then + echo "FDB ${FDB_VERSION} already exists" + return + fi + + local URL="https://github.com/apple/foundationdb/releases/download/${FDB_VERSION}/" + local TMP="${FDB_PKG_DIR}-tmp" + + rm -rf "${TMP}" + mkdir -p "${TMP}" + + wget "${URL}/fdbbackup.x86_64" -O "${TMP}/fdbbackup" + wget "${URL}/fdbserver.x86_64" -O "${TMP}/fdbserver" + wget "${URL}/fdbcli.x86_64" -O "${TMP}/fdbcli" + wget "${URL}/fdbmonitor.x86_64" -O "${TMP}/fdbmonitor" + wget "${URL}/libfdb_c.x86_64.so" -O "${TMP}/libfdb_c.x86_64.so" + chmod +x "${TMP}"/fdb* + + mv "${TMP}" "${FDB_PKG_DIR}" + echo "Download fdb binary pkgs success" +} + +# Function to configure coordinators +get_coordinators() { + local num_nodes + local num_coordinators + + num_nodes=$(echo "${FDB_CLUSTER_IPS}" | tr ',' '\n' | wc -l) + + if [[ ${num_nodes} -le 2 ]]; then + num_coordinators=1 + elif [[ ${num_nodes} -le 4 ]]; then + num_coordinators=3 + else + num_coordinators=5 + fi + + echo "${FDB_CLUSTER_IPS}" | cut -d',' -f1-"${num_coordinators}" | tr ',' '\n' | sed "s/$/:${FDB_PORT}/" | paste -sd ',' +} + +get_fdb_mode() { + # Initialize a new database + local num_nodes + local fdb_mode + + num_nodes=$(echo "${FDB_CLUSTER_IPS}" | tr ',' '\n' | wc -l) + if [[ ${num_nodes} -eq 1 ]]; then + fdb_mode="single" + elif [[ ${num_nodes} -le 4 ]]; then + fdb_mode="double" + else + fdb_mode="triple" + fi + + echo "${fdb_mode}" +} + +# Function to calculate number of processes +calculate_process_numbers() { + # local memory_gb=$1 + local cpu_cores=$2 + + local min_processes=1 + local data_dir_count + + # Convert comma-separated DATA_DIRS into an array + IFS=',' read -r -a DATA_DIR_ARRAY <<<"${DATA_DIRS}" + data_dir_count=${#DATA_DIR_ARRAY[@]} + + # Stateless processes (at least 1, up to 1/4 of CPU cores) + local stateless_processes=$((cpu_cores / 4)) + [[ ${stateless_processes} -lt ${min_processes} ]] && stateless_processes=${min_processes} + + # Storage processes (must be a multiple of the number of data directories) + local storage_processes=$((cpu_cores / 4)) + [[ ${storage_processes} -lt ${data_dir_count} ]] && storage_processes=${data_dir_count} + storage_processes=$(((storage_processes / data_dir_count) * data_dir_count)) + + # Transaction processes (must be a multiple of the number of data directories) + local transaction_processes=$((cpu_cores / 8)) + [[ ${transaction_processes} -lt ${min_processes} ]] && transaction_processes=${min_processes} + 
[[ ${transaction_processes} -lt ${data_dir_count} ]] && transaction_processes=${data_dir_count} + transaction_processes=$(((transaction_processes / data_dir_count) * data_dir_count)) + + # Return the values + echo "${stateless_processes} ${storage_processes} ${transaction_processes}" +} + +function deploy_fdb() { + download_fdb + + ln -sf "${FDB_PKG_DIR}/fdbserver" "${FDB_HOME}/fdbserver" + ln -sf "${FDB_PKG_DIR}/fdbmonitor" "${FDB_HOME}/fdbmonitor" + ln -sf "${FDB_PKG_DIR}/fdbbackup" "${FDB_HOME}/backup_agent" + ln -sf "${FDB_PKG_DIR}/fdbcli" "${FDB_HOME}/fdbcli" + + CLUSTER_DESC="${FDB_CLUSTER_DESC:-${FDB_CLUSTER_ID}}" + + # Convert comma-separated DATA_DIRS into an array + IFS=',' read -r -a DATA_DIR_ARRAY <<<"${DATA_DIRS}" + for DIR in "${DATA_DIR_ARRAY[@]}"; do + mkdir -p "${DIR}" || handle_error "Failed to create data directory ${DIR}" + done + + echo -e "\tCreate fdb.cluster, coordinator: $(get_coordinators)" + echo -e "\tfdb.cluster content is: ${CLUSTER_DESC}:${FDB_CLUSTER_ID}@$(get_coordinators)" + cat >"${FDB_HOME}/conf/fdb.cluster" <"${FDB_HOME}/conf/fdb.conf" <>"${FDB_HOME}/conf/fdb.conf" + done + + FDB_PORT=$((FDB_PORT + stateless_processes)) + + # Add storage processes + STORAGE_DIR_COUNT=${#DATA_DIR_ARRAY[@]} + for ((i = 0; i < storage_processes; i++)); do + PORT=$((FDB_PORT + i)) + DIR_INDEX=$((i % STORAGE_DIR_COUNT)) + echo "[fdbserver.${PORT}] +class = storage +datadir = ${DATA_DIR_ARRAY[${DIR_INDEX}]}/${PORT}" | tee -a "${FDB_HOME}/conf/fdb.conf" >/dev/null + done + + FDB_PORT=$((FDB_PORT + storage_processes)) + + # Add transaction processes + for ((i = 0; i < transaction_processes; i++)); do + PORT=$((FDB_PORT + i)) + DIR_INDEX=$((i % STORAGE_DIR_COUNT)) + echo "[fdbserver.${PORT}] +class = transaction +datadir = ${DATA_DIR_ARRAY[${DIR_INDEX}]}/${PORT}" | tee -a "${FDB_HOME}/conf/fdb.conf" >/dev/null + done + + echo "[backup_agent] +command = ${FDB_HOME}/backup_agent +logdir = ${LOG_DIR}" >>"${FDB_HOME}/conf/fdb.conf" + + echo "Deploy FDB to: ${FDB_HOME}" +} + +function start_fdb() { + if [[ ! -f "${FDB_HOME}/fdbmonitor" ]]; then + echo 'Please run setup before start fdb server' + exit 1 + fi + + ensure_port_is_listenable "fdbserver" "${FDB_PORT}" + + echo "Run FDB monitor ..." + "${FDB_HOME}/fdbmonitor" \ + --conffile "${FDB_HOME}/conf/fdb.conf" \ + --lockfile "${FDB_HOME}/fdbmonitor.pid" \ + --daemonize +} + +function stop_fdb() { + if [[ -f "${FDB_HOME}/fdbmonitor.pid" ]]; then + local fdb_pid + fdb_pid=$(cat "${FDB_HOME}/fdbmonitor.pid") + if ps -p "${fdb_pid}" >/dev/null; then + echo "Stop fdbmonitor with pid ${fdb_pid}" + kill -9 "${fdb_pid}" + fi + fi +} + +function clean_fdb() { + if [[ -f "${FDB_HOME}/fdbmonitor.pid" ]]; then + local fdb_pid + + fdb_pid=$(cat "${FDB_HOME}/fdbmonitor.pid") + if ps -p "${fdb_pid}" >/dev/null; then + echo "fdbmonitor with pid ${fdb_pid} is running, stop it first." + exit 1 + fi + fi + + sleep 1 + + # Check if FDB_HOME is set and not root + if [[ -z "${FDB_HOME}" || "${FDB_HOME}" == "/" ]]; then + echo "Error: FDB_HOME is not set or is set to root directory. Aborting cleanup." + exit 1 + fi + + # Check if FDB_HOME is empty + if [[ -z "$(ls -A "${FDB_HOME}")" ]]; then + echo "Error: FDB_HOME is empty. Nothing to clean." 
+ exit 1 + fi + + # Remove all directories and files under ${FDB_HOME} + echo "Removing all directories and files under ${FDB_HOME}" + rm -rf "${FDB_HOME:?}"/* +} + +function deploy() { + local job="$1" + local skip_pkg="$2" + local skip_config="$3" + + if [[ ${job} =~ ^(all|fdb)$ ]]; then + deploy_fdb + fi +} + +function start() { + local job="$1" + local init="$2" + + if [[ ${job} =~ ^(all|fdb)$ ]]; then + start_fdb + fi + + if [[ ${init} =~ ^(all|fdb)$ ]]; then + echo "Try create database ..." + local fdb_mode + + fdb_mode=$(get_fdb_mode) + "${FDB_HOME}/fdbcli" -C "${FDB_HOME}/conf/fdb.cluster" \ + --exec "configure new ${fdb_mode} ssd" || true + fi + + echo "Start fdb success, and the cluster is:" + cat "${FDB_HOME}/conf/fdb.cluster" +} + +function stop() { + local job="$1" + + if [[ ${job} =~ ^(all|fdb)$ ]]; then + stop_fdb & + fi + wait +} + +function clean() { + local job="$1" + + if [[ ${job} =~ ^(all|fdb)$ ]]; then + clean_fdb & + fi + wait +} + +function status() { + pgrep -f "${FDB_CLUSTER_DESC}" +} + +function usage() { + echo "Usage: $0 [--skip-pkg] [--skip-config]" + echo -e "\t deploy \t setup fdb env (dir, binary, conf ...)" + echo -e "\t clean \t clean fdb data" + echo -e "\t start \t start fdb" + echo -e "\t stop \t stop fdb" + echo -e "" + echo -e "" + echo -e "Args:" + echo -e "\t --skip-pkg \t skip to update binary pkgs during deploy" + echo -e "\t --skip-config \t skip to update config during deploy" + echo -e "" + exit 1 +} + +function unknown_cmd() { + local cmd="$1" + + printf "Unknown cmd: %s \n" "${cmd}" + usage +} + +if [[ $# -lt 1 ]]; then + usage +fi + +cmd="$1" +shift + +job="fdb" + +init="fdb" +skip_pkg="false" +skip_config="false" + +case ${cmd} in +deploy) + deploy "${job}" "${skip_pkg}" "${skip_config}" + ;; +start) + start "${job}" "${init}" + ;; +stop) + stop "${job}" + ;; +clean) + clean "${job}" + ;; +fdbcli) + "${FDB_HOME}/fdbcli" -C "${FDB_HOME}/conf/fdb.cluster" "$@" + ;; +config) + generate_regression_config true + ;; +*) + unknown_cmd "${cmd}" + ;; +esac diff --git a/tools/fdb/fdb_vars.sh b/tools/fdb/fdb_vars.sh new file mode 100644 index 00000000000000..c0bbadabdd6cd1 --- /dev/null +++ b/tools/fdb/fdb_vars.sh @@ -0,0 +1,72 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Description: Variables for FoundationDB + +#======================= MUST CUSTOMIZATION ==================================== +# Data directories for FoundationDB storage +# Make sure to create these directories before running the script, and have to be absolute path. +# For simplicity, you can use one direcotry. For production, you should use SSDs. 
+# shellcheck disable=2034 +DATA_DIRS="/mnt/foundationdb/data1,/mnt/foundationdb/data2,/mnt/foundationdb/data3" + +# Define the cluster IPs (comma-separated list of IP addresses) +# You should have at least 3 IP addresses for a production cluster +# The first IP addresses will be used as the coordinator, +# num of coordinators depends on the number of nodes, see the function get_coordinators. +# For high availability, machines should be in diffrent rack. +# shellcheck disable=2034 +FDB_CLUSTER_IPS="172.200.0.2,172.200.0.3,172.200.0.4" + +# Define the FoundationDB home directory, which contains the fdb binaries and logs. +# default is /fdbhome and have to be absolute path. +# shellcheck disable=2034 +FDB_HOME="/fdbhome" + +# Define the cluster id, shoule be generated random like mktemp -u XXXXXXXX, +# have to be different for each cluster. +# shellcheck disable=2034 +FDB_CLUSTER_ID=$(mktemp -u XXXXXXXX) + +# Define the cluster description, you 'd better to change it. +# shellcheck disable=2034 +FDB_CLUSTER_DESC="mycluster" + +#======================= OPTIONAL CUSTOMIZATION ============================ +# Define resource limits +# Memory limit in gigabytes +# shellcheck disable=2034 +MEMORY_LIMIT_GB=16 + +# CPU cores limit +# shellcheck disable=2034 +CPU_CORES_LIMIT=8 + +#=========================================================================== +# Define starting port for the servers +# This is the base port number for the fdbserver processes, usually does not need to be changed +# shellcheck disable=2034 +FDB_PORT=4500 + +# Define the FoundationDB version +# shellcheck disable=2034 +FDB_VERSION="7.1.38" + +# Users who run the fdb processes, default is the current user +# shellcheck disable=2034 +USER=$(whoami) From a8d8798e960b25e49b67c115a864c02164884e98 Mon Sep 17 00:00:00 2001 From: meiyi Date: Tue, 10 Sep 2024 23:59:13 +0800 Subject: [PATCH 39/44] [fix](regression) speed up some group commit case (#40624) group commit interval is default 10s, cause the case cost too much time --- .../http_stream/test_group_commit_http_stream.out | 2 +- .../stream_load/test_group_commit_stream_load.out | 2 +- ...up_commit_async_wal_msg_fault_injection.groovy | 6 +++--- .../insert_p0/insert_group_commit_into.groovy | 14 +++++++++----- ...sert_group_commit_into_max_filter_ratio.groovy | 15 +-------------- ...sert_group_commit_into_unique_sync_mode.groovy | 6 +++--- .../insert_group_commit_with_exception.groovy | 12 ++++++------ .../insert_group_commit_with_large_data.groovy | 1 + .../insert_group_commit_with_prepare_stmt.groovy | 2 ++ .../test_group_commit_http_stream.groovy | 6 ++++-- ...test_group_commit_and_wal_back_pressure.groovy | 1 + .../test_group_commit_stream_load.groovy | 10 ++++++---- 12 files changed, 38 insertions(+), 39 deletions(-) diff --git a/regression-test/data/load_p0/http_stream/test_group_commit_http_stream.out b/regression-test/data/load_p0/http_stream/test_group_commit_http_stream.out index 57c2525815ad78..e4f297347cc196 100644 --- a/regression-test/data/load_p0/http_stream/test_group_commit_http_stream.out +++ b/regression-test/data/load_p0/http_stream/test_group_commit_http_stream.out @@ -21,5 +21,5 @@ 8 f 80 -- !sql -- -2402288 +1201144 diff --git a/regression-test/data/load_p0/stream_load/test_group_commit_stream_load.out b/regression-test/data/load_p0/stream_load/test_group_commit_stream_load.out index 246be06453bd16..1f1afae813f336 100644 --- a/regression-test/data/load_p0/stream_load/test_group_commit_stream_load.out +++ 
b/regression-test/data/load_p0/stream_load/test_group_commit_stream_load.out @@ -23,5 +23,5 @@ 11 a 11 -- !sql -- -2402288 +1201144 diff --git a/regression-test/suites/fault_injection_p0/test_group_commit_async_wal_msg_fault_injection.groovy b/regression-test/suites/fault_injection_p0/test_group_commit_async_wal_msg_fault_injection.groovy index c9e22504b1b132..2d0b91a01ed5fc 100644 --- a/regression-test/suites/fault_injection_p0/test_group_commit_async_wal_msg_fault_injection.groovy +++ b/regression-test/suites/fault_injection_p0/test_group_commit_async_wal_msg_fault_injection.groovy @@ -42,7 +42,7 @@ suite("test_group_commit_async_wal_msg_fault_injection","nonConcurrent") { ) engine=olap DISTRIBUTED BY HASH(`k`) BUCKETS 5 - properties("replication_num" = "1") + properties("replication_num" = "1", "group_commit_interval_ms" = "10") """ GetDebugPoint().clearDebugPointsForAllBEs() @@ -79,7 +79,7 @@ suite("test_group_commit_async_wal_msg_fault_injection","nonConcurrent") { ) engine=olap DISTRIBUTED BY HASH(`k`) BUCKETS 5 - properties("replication_num" = "1") + properties("replication_num" = "1", "group_commit_interval_ms" = "10") """ GetDebugPoint().clearDebugPointsForAllBEs() @@ -118,7 +118,7 @@ suite("test_group_commit_async_wal_msg_fault_injection","nonConcurrent") { ) engine=olap DISTRIBUTED BY HASH(`k`) BUCKETS 5 - properties("replication_num" = "1") + properties("replication_num" = "1", "group_commit_interval_ms" = "10") """ GetDebugPoint().clearDebugPointsForAllBEs() diff --git a/regression-test/suites/insert_p0/insert_group_commit_into.groovy b/regression-test/suites/insert_p0/insert_group_commit_into.groovy index dbf2bd2e18ef12..7af61dfc25fa6d 100644 --- a/regression-test/suites/insert_p0/insert_group_commit_into.groovy +++ b/regression-test/suites/insert_p0/insert_group_commit_into.groovy @@ -116,7 +116,8 @@ suite("insert_group_commit_into") { ) DISTRIBUTED BY HASH(`id`) BUCKETS 1 PROPERTIES ( - "replication_num" = "1" + "replication_num" = "1", + "group_commit_interval_ms" = "200" ); """ @@ -333,7 +334,7 @@ suite("insert_group_commit_into") { ) UNIQUE key (`teamID`,`service_id`, `start_time`) DISTRIBUTED BY hash(`start_time`) BUCKETS 1 - PROPERTIES ("replication_allocation" = "tag.location.default: 1") + PROPERTIES ("replication_allocation" = "tag.location.default: 1", "group_commit_interval_ms" = "200") """ connect(user = context.config.jdbcUser, password = context.config.jdbcPassword, url = context.config.jdbcUrl) { @@ -384,7 +385,8 @@ suite("insert_group_commit_into") { COMMENT 'OLAP' DISTRIBUTED BY HASH(`ordernum`) BUCKETS 3 PROPERTIES ( - "replication_allocation" = "tag.location.default: 1" + "replication_allocation" = "tag.location.default: 1", + "group_commit_interval_ms" = "200" );""" sql """drop table if exists ${table_tmp};""" sql """CREATE TABLE ${table_tmp} ( @@ -403,7 +405,8 @@ suite("insert_group_commit_into") { COMMENT 'OLAP' DISTRIBUTED BY HASH(`ordernum`) BUCKETS 1 PROPERTIES ( - "replication_allocation" = "tag.location.default: 1" + "replication_allocation" = "tag.location.default: 1", + "group_commit_interval_ms" = "200" ); """ sql """DROP MATERIALIZED VIEW IF EXISTS ods_zn_dnt_max1 ON ${table};""" createMV("""create materialized view ods_zn_dnt_max1 as @@ -508,7 +511,8 @@ suite("insert_group_commit_into") { DUPLICATE KEY(`k1`) DISTRIBUTED BY HASH(`k1`) BUCKETS 1 PROPERTIES ( - "replication_allocation" = "tag.location.default: 1" + "replication_allocation" = "tag.location.default: 1", + "group_commit_interval_ms" = "200" ); """ diff --git 
a/regression-test/suites/insert_p0/insert_group_commit_into_max_filter_ratio.groovy b/regression-test/suites/insert_p0/insert_group_commit_into_max_filter_ratio.groovy index 0624f1bcf37e7d..64ae30f8f8a63f 100644 --- a/regression-test/suites/insert_p0/insert_group_commit_into_max_filter_ratio.groovy +++ b/regression-test/suites/insert_p0/insert_group_commit_into_max_filter_ratio.groovy @@ -41,19 +41,6 @@ suite("insert_group_commit_into_max_filter_ratio") { } } - def normal_insert = { sql, expected_row_count -> - def stmt = prepareStatement """ ${sql} """ - def result = stmt.executeUpdate() - logger.info("insert result: " + result) - def serverInfo = (((StatementImpl) stmt).results).getServerInfo() - logger.info("result server info: " + serverInfo) - if (result != expected_row_count) { - logger.warn("insert result: " + result + ", expected_row_count: " + expected_row_count + ", sql: " + sql) - } - assertTrue(serverInfo.contains("'status':'VISIBLE'")) - assertTrue(serverInfo.contains("'label':'label")) - } - def group_commit_insert = { sql, expected_row_count -> def stmt = prepareStatement """ ${sql} """ def result = stmt.executeUpdate() @@ -172,7 +159,7 @@ suite("insert_group_commit_into_max_filter_ratio") { DISTRIBUTED BY HASH(`id`) BUCKETS 1 PROPERTIES ( "replication_num" = "1", - "group_commit_interval_ms" = "1000" + "group_commit_interval_ms" = "200" ); """ diff --git a/regression-test/suites/insert_p0/insert_group_commit_into_unique_sync_mode.groovy b/regression-test/suites/insert_p0/insert_group_commit_into_unique_sync_mode.groovy index c3a1e79cba517c..f58b306ab4ef66 100644 --- a/regression-test/suites/insert_p0/insert_group_commit_into_unique_sync_mode.groovy +++ b/regression-test/suites/insert_p0/insert_group_commit_into_unique_sync_mode.groovy @@ -124,7 +124,7 @@ suite("insert_group_commit_into_unique_sync_mode") { DISTRIBUTED BY HASH(`id`) BUCKETS 1 PROPERTIES ( "replication_num" = "1", - "group_commit_interval_ms" = "1000" + "group_commit_interval_ms" = "10" ); """ @@ -211,7 +211,7 @@ suite("insert_group_commit_into_unique_sync_mode") { PROPERTIES ( "replication_num" = "1", "function_column.sequence_col" = "score", - "group_commit_interval_ms" = "1000" + "group_commit_interval_ms" = "10" ); """ @@ -301,7 +301,7 @@ suite("insert_group_commit_into_unique_sync_mode") { PROPERTIES ( "replication_num" = "1", "function_column.sequence_type" = "int", - "group_commit_interval_ms" = "1000" + "group_commit_interval_ms" = "10" ); """ diff --git a/regression-test/suites/insert_p0/insert_group_commit_with_exception.groovy b/regression-test/suites/insert_p0/insert_group_commit_with_exception.groovy index 1081064d9fe3a3..f59c9bb8b00c69 100644 --- a/regression-test/suites/insert_p0/insert_group_commit_with_exception.groovy +++ b/regression-test/suites/insert_p0/insert_group_commit_with_exception.groovy @@ -20,7 +20,7 @@ import java.sql.DriverManager import java.sql.Statement import java.sql.PreparedStatement -suite("insert_group_commit_with_exception", "nonConcurrent") { +suite("insert_group_commit_with_exception") { def table = "insert_group_commit_with_exception" def getRowCount = { expectedRowCount -> def retry = 0 @@ -57,6 +57,7 @@ suite("insert_group_commit_with_exception", "nonConcurrent") { DUPLICATE KEY(`id`, `name`) DISTRIBUTED BY HASH(`id`) BUCKETS 1 PROPERTIES ( + "group_commit_interval_ms" = "200", "replication_num" = "1" ); """ @@ -65,10 +66,10 @@ suite("insert_group_commit_with_exception", "nonConcurrent") { if (item == "nereids") { sql """ set enable_nereids_planner=true; """ sql 
""" set enable_fallback_to_original_planner=false; """ - sql "set global enable_server_side_prepared_statement = true" + sql "set enable_server_side_prepared_statement = true" } else { sql """ set enable_nereids_planner = false; """ - sql "set global enable_server_side_prepared_statement = false" + sql "set enable_server_side_prepared_statement = false" } // insert into without column @@ -128,10 +129,10 @@ suite("insert_group_commit_with_exception", "nonConcurrent") { if (item == "nereids") { statement.execute("set enable_nereids_planner=true;"); statement.execute("set enable_fallback_to_original_planner=false;"); - sql "set global enable_server_side_prepared_statement = true" + sql "set enable_server_side_prepared_statement = true" } else { statement.execute("set enable_nereids_planner = false;") - sql "set global enable_server_side_prepared_statement = false" + sql "set enable_server_side_prepared_statement = false" } // without column try (PreparedStatement ps = connection.prepareStatement("insert into ${table} values(?, ?, ?, ?)")) { @@ -291,5 +292,4 @@ suite("insert_group_commit_with_exception", "nonConcurrent") { // try_sql("DROP TABLE ${table}") } } - sql "set global enable_server_side_prepared_statement = true" } diff --git a/regression-test/suites/insert_p0/insert_group_commit_with_large_data.groovy b/regression-test/suites/insert_p0/insert_group_commit_with_large_data.groovy index 2af290ffc188e1..b66130c9e29627 100644 --- a/regression-test/suites/insert_p0/insert_group_commit_with_large_data.groovy +++ b/regression-test/suites/insert_p0/insert_group_commit_with_large_data.groovy @@ -62,6 +62,7 @@ suite("insert_group_commit_with_large_data") { DUPLICATE KEY(`id`, `name`) DISTRIBUTED BY HASH(`id`) BUCKETS 1 PROPERTIES ( + "group_commit_interval_ms" = "40", "replication_num" = "1" ); """ diff --git a/regression-test/suites/insert_p0/insert_group_commit_with_prepare_stmt.groovy b/regression-test/suites/insert_p0/insert_group_commit_with_prepare_stmt.groovy index 6e05513a8d648b..7f2919f8118d10 100644 --- a/regression-test/suites/insert_p0/insert_group_commit_with_prepare_stmt.groovy +++ b/regression-test/suites/insert_p0/insert_group_commit_with_prepare_stmt.groovy @@ -144,6 +144,7 @@ suite("insert_group_commit_with_prepare_stmt") { UNIQUE KEY(`id`, `name`) DISTRIBUTED BY HASH(`id`) BUCKETS 1 PROPERTIES ( + "group_commit_interval_ms" = "40", "replication_num" = "1" ); """ @@ -206,6 +207,7 @@ suite("insert_group_commit_with_prepare_stmt") { DUPLICATE KEY(`id`, `name`) DISTRIBUTED BY HASH(`id`) BUCKETS 1 PROPERTIES ( + "group_commit_interval_ms" = "40", "replication_num" = "1" ); """ diff --git a/regression-test/suites/load_p0/http_stream/test_group_commit_http_stream.groovy b/regression-test/suites/load_p0/http_stream/test_group_commit_http_stream.groovy index 5f4906662d3453..cb17cc82655fa7 100644 --- a/regression-test/suites/load_p0/http_stream/test_group_commit_http_stream.groovy +++ b/regression-test/suites/load_p0/http_stream/test_group_commit_http_stream.groovy @@ -88,6 +88,7 @@ suite("test_group_commit_http_stream") { ) DISTRIBUTED BY HASH(`id`) BUCKETS 1 PROPERTIES ( + "group_commit_interval_ms" = "200", "replication_num" = "1" ); """ @@ -285,6 +286,7 @@ suite("test_group_commit_http_stream") { PARTITION p1998 VALUES [("19980101"), ("19990101"))) DISTRIBUTED BY HASH(`lo_orderkey`) BUCKETS 4 PROPERTIES ( + "group_commit_interval_ms" = "200", "replication_num" = "1" ); """ @@ -307,7 +309,7 @@ suite("test_group_commit_http_stream") { sql """ alter table ${tableName} order by 
(${new_columns}); """ }).start();*/ - for (int i = 0; i < 4; i++) { + for (int i = 0; i < 2; i++) { streamLoad { table tableName @@ -297,7 +299,7 @@ } } - getRowCount(2402288) + getRowCount(600572 * 2) qt_sql """ select count(*) from ${tableName} """ // assertTrue(getAlterTableState()) From 42381995925748e107b895b758fc8fb34b840c71 Mon Sep 17 00:00:00 2001 From: yujun Date: Wed, 11 Sep 2024 00:07:43 +0800 Subject: [PATCH 40/44] [feature](docker suite) Docker suite use independent executor (#40259) If many docker suites run in parallel, their docker containers take a lot of memory, so run them on an independent executor to control the memory usage. Use the option or config dockerSuiteParallel to set the docker suite parallelism; the default is 1.
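As a usage sketch (assuming `dockerSuiteParallel` can be set in `regression-conf-custom.groovy` like other regression config fields; the value shown is only an example):

```groovy
// regression-conf-custom.groovy (hypothetical snippet): cap how many docker
// suites run at the same time so their containers do not exhaust memory.
dockerSuiteParallel = 2
```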
--- docker/runtime/doris-compose/Readme.md | 39 +++++++-- docker/runtime/doris-compose/command.py | 26 ++++++ .../runtime/doris-compose/resource/common.sh | 2 +- .../org/apache/doris/regression/Config.groovy | 7 ++ .../doris/regression/ConfigOptions.groovy | 10 +++ .../doris/regression/RegressionTest.groovy | 82 ++++++++++++------- .../doris/regression/suite/Suite.groovy | 6 ++ .../regression/suite/SuiteCluster.groovy | 1 + .../test_clone_missing_version.groovy | 2 +- .../test_clone_no_missing_version.groovy | 2 +- .../clone_p0/test_decommission_mtmv.groovy | 2 +- .../test_drop_clone_tablet_path_race.groovy | 2 +- .../multi_cluster/test_rebalance.groovy | 2 +- .../cloud_p0/multi_cluster/test_tvf.groovy | 4 +- .../query_retry/test_retry_e-230.groovy | 2 +- ...est_schema_change_with_compaction10.groovy | 4 +- ...est_schema_change_with_compaction11.groovy | 2 +- ...test_schema_change_with_compaction5.groovy | 4 +- ...test_schema_change_with_compaction6.groovy | 4 +- ...test_schema_change_with_compaction9.groovy | 4 +- ...est_compaction_with_visible_version.groovy | 2 +- .../suites/demo_p0/docker_action.groovy | 10 ++- .../group_commit/replay_wal_restart_fe.groovy | 2 +- .../transaction/txn_insert_restart_fe.groovy | 2 +- ...nsert_restart_fe_with_schema_change.groovy | 2 +- ...st_build_index_with_clone_by_docker.groovy | 2 +- ...est_min_load_replica_num_complicate.groovy | 2 +- .../load/insert/test_publish_one_succ.groovy | 2 +- .../insert/test_publish_slow_not_wait.groovy | 2 +- .../test_min_load_replica_num_simple.groovy | 2 +- .../test_routine_load_restart_fe.groovy | 2 +- .../test_coordidator_be_restart.groovy | 2 +- ...t_migrate_disk_with_publish_version.groovy | 2 +- ...amic_partition_mod_distribution_key.groovy | 4 +- .../test_create_table_exception.groovy | 2 +- .../test_partition_create_tablet_rr.groovy | 2 +- .../suites/query_p0/test_forward_qeury.groovy | 4 +- ...t_schema_change_concurrent_with_txn.groovy | 2 +- .../test_abort_txn_by_be_cloud1.groovy | 2 +- .../test_abort_txn_by_be_cloud2.groovy | 2 +- .../test_abort_txn_by_be_local5.groovy | 2 +- .../test_abort_txn_by_be_local6.groovy | 2 +- .../test_abort_txn_by_fe_cloud4.groovy | 2 +- .../test_abort_txn_by_fe_local3.groovy | 2 +- .../test_partition_default_medium.groovy | 2 +- .../test_storage_medium_has_disk.groovy | 2 +- .../suites/trash_p0/clean_trash.groovy | 4 +- .../test_mow_full_clone_exception.groovy | 2 +- ..._partial_update_conflict_be_restart.groovy | 2 +- .../test_mow_full_clone_exception.groovy | 2 +- run-regression-test.sh | 1 + 51 files changed, 195 insertions(+), 87 deletions(-) diff --git a/docker/runtime/doris-compose/Readme.md b/docker/runtime/doris-compose/Readme.md index a83fa81e7615fa..770414f7a2bdf8 100644 --- a/docker/runtime/doris-compose/Readme.md +++ b/docker/runtime/doris-compose/Readme.md @@ -23,7 +23,16 @@ Use doris compose to create doris docker compose clusters. ## Requirements -1. The doris image should contains: +##### 1. Make sure you have docker permissions + + run: +``` +docker run hello-world +``` + +if have problem with permission denied, then [add-docker-permission](https://docs.docker.com/engine/install/linux-postinstall/). + +##### 2. The doris image should contains ``` /opt/apache-doris/{fe, be, cloud} @@ -32,16 +41,14 @@ Use doris compose to create doris docker compose clusters. if don't create cloud cluster, the image no need to contains the cloud pkg. 
-if build doris use `sh build.sh --fe --be --cloud`, then its output satisfy with all above, then run command in doris root +if doris is built with `sh build.sh --fe --be --cloud`, its output satisfies all of the above; then running the following command in the doris root directory + will generate such an image. ``` docker build -f docker/runtime/doris-compose/Dockerfile -t . ``` -will generate a image. - -2. Install the dependent python library in 'docker/runtime/doris-compose/requirements.txt' - +##### 3. Install the dependent python library in 'docker/runtime/doris-compose/requirements.txt' ``` python -m pip install --user -r docker/runtime/doris-compose/requirements.txt ``` ## Usage +### Notice + +Each cluster has a directory under '/tmp/doris/{cluster-name}'; users can set the env var LOCAL_DORIS_PATH to change it. + +For example, if a user exports LOCAL_DORIS_PATH=/mydoris, then the cluster's directory is '/mydoris/{cluster-name}'. + +A cluster's directory contains all of its containers' logs and data, such as fe-1, fe-2, be-1, etc. + +If multiple users run doris-compose on the same machine, they should either keep the default LOCAL_DORIS_PATH or export the same value. + +When creating a new cluster, doris-compose searches the local doris path and chooses a docker network different from the networks of the clusters under that path. + +So if multiple users use different LOCAL_DORIS_PATH values, their clusters may end up with conflicting docker networks! + ### Create a cluster or recreate its containers ``` @@ -65,9 +86,11 @@ add fe/be nodes with the specific image, or update existing nodes with `--fe-id` For create a cloud cluster, steps are as below: + 1. Write cloud s3 store config file, its default path is '/tmp/doris/cloud.ini'. It's defined in environment variable DORIS_CLOUD_CFG_FILE, user can change this env var to change its path. A Example file is locate in 'docker/runtime/doris-compose/resource/cloud.ini.example'. + 2. Use doris compose up command with option '--cloud' to create a new cloud cluster. The simplest way to create a cloud cluster: @@ -127,7 +150,7 @@ Generate regression-conf-custom.groovy to connect to the specific docker cluster steps: -1. Create a new cluster: `python doris-compose.py up my-cluster my-image --add-fe-num 2 --add-be-num 4 --cloud` -2. Generate regression-conf-custom.groovy: `python doris-compose.py config my-cluster --connect-follow-fe` +1. Create a new cluster: `python docker/runtime/doris-compose/doris-compose.py up my-cluster my-image --add-fe-num 2 --add-be-num 4 --cloud` +2. Generate regression-conf-custom.groovy: `python docker/runtime/doris-compose/doris-compose.py config my-cluster --connect-follow-fe` 3.
Run regression test: `bash run-regression-test.sh --run -times 1 -parallel 1 -suiteParallel 1 -d cloud/multi_cluster` diff --git a/docker/runtime/doris-compose/command.py b/docker/runtime/doris-compose/command.py index ed88dd03f4daf8..b6862bdcb000b1 100644 --- a/docker/runtime/doris-compose/command.py +++ b/docker/runtime/doris-compose/command.py @@ -826,7 +826,16 @@ def run(self, args): print("\nNo write regression custom file.") return + annotation_start = "//---------- Start auto generate by doris-compose.py---------" + annotation_end = "//---------- End auto generate by doris-compose.py---------" + + old_contents = [] + if os.path.exists(regression_conf_custom): + with open(regression_conf_custom, "r") as f: + old_contents = f.readlines() with open(regression_conf_custom, "w") as f: + # write auto gen config + f.write(annotation_start) f.write(base_conf.format(fe_ip=fe_ip)) if cluster.is_cloud: multi_cluster_bes = ",".join([ @@ -845,6 +854,23 @@ def run(self, args): multi_cluster_bes=multi_cluster_bes, fe_cloud_unique_id=cluster.get_node( CLUSTER.Node.TYPE_FE, 1).cloud_unique_id())) + f.write(annotation_end + "\n\n") + annotation_end_line_count = -1 + + # write not-auto gen config + in_annotation = False + annotation_end_line_idx = -100 + for line_idx, line in enumerate(old_contents): + line = line.rstrip() + if line == annotation_start: + in_annotation = True + elif line == annotation_end: + in_annotation = False + annotation_end_line_idx = line_idx + elif not in_annotation: + if line or line_idx != annotation_end_line_idx + 1: + f.write(line + "\n") + print("\nWrite succ: " + regression_conf_custom) diff --git a/docker/runtime/doris-compose/resource/common.sh b/docker/runtime/doris-compose/resource/common.sh index de6ba29865a948..a1c1b3ff2a5bdf 100644 --- a/docker/runtime/doris-compose/resource/common.sh +++ b/docker/runtime/doris-compose/resource/common.sh @@ -23,7 +23,7 @@ export LOG_FILE=$DORIS_HOME/log/health.out export LOCK_FILE=$DORIS_HOME/status/token health_log() { - echo "$(date +'%Y-%m-%d %H:%M:%S') $@" >>$LOG_FILE + echo "$(date +'%Y-%m-%d %H:%M:%S') $@" | tee -a $LOG_FILE } # concurrent write meta service server will failed due to fdb txn conflict. 
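The command.py change above regenerates only the block between the two marker comments in regression-conf-custom.groovy and keeps whatever the user wrote outside it. A simplified sketch of that marker-preserving rewrite, with a hypothetical function name rather than the actual implementation:

```python
# Keep user-written lines; regenerate only the block between the marker comments.
ANNOTATION_START = "//---------- Start auto generate by doris-compose.py---------"
ANNOTATION_END = "//---------- End auto generate by doris-compose.py---------"

def rewrite_config(path, generated_block):
    old_lines = []
    try:
        with open(path, "r") as f:
            old_lines = f.read().splitlines()
    except FileNotFoundError:
        pass  # first run: nothing to preserve

    kept = []
    in_auto_block = False
    for line in old_lines:
        if line.strip() == ANNOTATION_START:
            in_auto_block = True
        elif line.strip() == ANNOTATION_END:
            in_auto_block = False
        elif not in_auto_block:
            kept.append(line)  # user config outside the markers survives

    with open(path, "w") as f:
        f.write(ANNOTATION_START + "\n")
        f.write(generated_block.rstrip() + "\n")
        f.write(ANNOTATION_END + "\n\n")
        if kept:
            f.write("\n".join(kept) + "\n")
```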
diff --git a/regression-test/framework/src/main/groovy/org/apache/doris/regression/Config.groovy b/regression-test/framework/src/main/groovy/org/apache/doris/regression/Config.groovy index 53fd6b0415faef..0042aa69a0aded 100644 --- a/regression-test/framework/src/main/groovy/org/apache/doris/regression/Config.groovy +++ b/regression-test/framework/src/main/groovy/org/apache/doris/regression/Config.groovy @@ -117,6 +117,7 @@ class Config { public InetSocketAddress recycleServiceHttpInetSocketAddress public Integer parallel public Integer suiteParallel + public Integer dockerSuiteParallel public Integer actionParallel public Integer times public boolean withOutLoadData @@ -467,6 +468,7 @@ class Config { config.forceGenerateOutputFile = cmd.hasOption(forceGenOutOpt) config.parallel = Integer.parseInt(cmd.getOptionValue(parallelOpt, "10")) config.suiteParallel = Integer.parseInt(cmd.getOptionValue(suiteParallelOpt, "10")) + config.dockerSuiteParallel = Integer.parseInt(cmd.getOptionValue(dockerSuiteParallelOpt, "1")) config.actionParallel = Integer.parseInt(cmd.getOptionValue(actionParallelOpt, "10")) config.times = Integer.parseInt(cmd.getOptionValue(timesOpt, "1")) config.randomOrder = cmd.hasOption(randomOrderOpt) @@ -888,6 +890,11 @@ class Config { log.info("Set suiteParallel to 1 because not specify.".toString()) } + if (config.dockerSuiteParallel == null) { + config.dockerSuiteParallel = 1 + log.info("Set dockerSuiteParallel to 1 because not specify.".toString()) + } + if (config.actionParallel == null) { config.actionParallel = 10 log.info("Set actionParallel to 10 because not specify.".toString()) diff --git a/regression-test/framework/src/main/groovy/org/apache/doris/regression/ConfigOptions.groovy b/regression-test/framework/src/main/groovy/org/apache/doris/regression/ConfigOptions.groovy index b1a782da94b656..67322287d07aa5 100644 --- a/regression-test/framework/src/main/groovy/org/apache/doris/regression/ConfigOptions.groovy +++ b/regression-test/framework/src/main/groovy/org/apache/doris/regression/ConfigOptions.groovy @@ -67,6 +67,7 @@ class ConfigOptions { static Option forceGenOutOpt static Option parallelOpt static Option suiteParallelOpt + static Option dockerSuiteParallelOpt static Option actionParallelOpt static Option randomOrderOpt static Option stopWhenFailOpt @@ -425,6 +426,14 @@ class ConfigOptions { .longOpt("suiteParallel") .desc("the num of threads running for suites") .build() + dockerSuiteParallelOpt = Option.builder("dockerSuiteParallel") + .argName("dockerSuiteParallel") + .required(false) + .hasArg(true) + .type(String.class) + .longOpt("dockerSuiteParallel") + .desc("the num of threads running for docker suites") + .build() actionParallelOpt = Option.builder("actionParallel") .argName("parallel") .required(false) @@ -607,6 +616,7 @@ class ConfigOptions { .addOption(forceGenOutOpt) .addOption(parallelOpt) .addOption(suiteParallelOpt) + .addOption(dockerSuiteParallelOpt) .addOption(actionParallelOpt) .addOption(randomOrderOpt) .addOption(stopWhenFailOpt) diff --git a/regression-test/framework/src/main/groovy/org/apache/doris/regression/RegressionTest.groovy b/regression-test/framework/src/main/groovy/org/apache/doris/regression/RegressionTest.groovy index 92e92a9b736c32..a0cc8ba2ea12c4 100644 --- a/regression-test/framework/src/main/groovy/org/apache/doris/regression/RegressionTest.groovy +++ b/regression-test/framework/src/main/groovy/org/apache/doris/regression/RegressionTest.groovy @@ -52,16 +52,22 @@ import java.util.function.Predicate @CompileStatic class 
RegressionTest { + static enum GroupExecType { + NORMAL, + SINGLE, // group contains nonConcurrent + DOCKER, // group contains docker + } + static ClassLoader classloader static CompilerConfiguration compileConfig static GroovyShell shell static ExecutorService scriptExecutors - static ExecutorService suiteExecutors - static ExecutorService singleSuiteExecutors static ExecutorService actionExecutors + static Map suiteExecutors static ThreadLocal threadLoadedClassNum = new ThreadLocal<>() static final int cleanLoadedClassesThreshold = 20 static String nonConcurrentTestGroup = "nonConcurrent" + static String dockerTestGroup = "docker" static { ch.qos.logback.classic.Logger loggerOfSuite = @@ -113,8 +119,9 @@ class RegressionTest { } } actionExecutors.shutdown() - suiteExecutors.shutdown() - singleSuiteExecutors.shutdown() + for (ExecutorService suiteExecutor : suiteExecutors.values()) { + suiteExecutor.shutdown() + } scriptExecutors.shutdown() log.info("Test finished") if (!success) { @@ -135,17 +142,24 @@ class RegressionTest { .build(); scriptExecutors = Executors.newFixedThreadPool(config.parallel, scriptFactory) + suiteExecutors = [:] BasicThreadFactory suiteFactory = new BasicThreadFactory.Builder() .namingPattern("suite-thread-%d") .priority(Thread.MAX_PRIORITY) .build(); - suiteExecutors = Executors.newFixedThreadPool(config.suiteParallel, suiteFactory) + suiteExecutors[GroupExecType.NORMAL] = Executors.newFixedThreadPool(config.suiteParallel, suiteFactory) BasicThreadFactory singleSuiteFactory = new BasicThreadFactory.Builder() .namingPattern("non-concurrent-thread-%d") .priority(Thread.MAX_PRIORITY) .build(); - singleSuiteExecutors = Executors.newFixedThreadPool(1, singleSuiteFactory) + suiteExecutors[GroupExecType.SINGLE] = Executors.newFixedThreadPool(1, singleSuiteFactory) + + BasicThreadFactory dockerSuiteFactory = new BasicThreadFactory.Builder() + .namingPattern("docker-suite-thread-%d") + .priority(Thread.MAX_PRIORITY) + .build(); + suiteExecutors[GroupExecType.DOCKER] = Executors.newFixedThreadPool(config.dockerSuiteParallel, dockerSuiteFactory) BasicThreadFactory actionFactory = new BasicThreadFactory.Builder() .namingPattern("action-thread-%d") @@ -198,9 +212,9 @@ class RegressionTest { return sources } - static void runScript(Config config, ScriptSource source, Recorder recorder, boolean isSingleThreadScript) { + static void runScript(Config config, ScriptSource source, Recorder recorder, GroupExecType grpExecType) { def suiteFilter = { String suiteName, String groupName -> - canRun(config, suiteName, groupName, isSingleThreadScript) + canRun(config, suiteName, groupName, grpExecType) } def file = source.getFile() int failureLimit = Integer.valueOf(config.otherConfigs.getOrDefault("max_failure_num", "-1").toString()); @@ -211,12 +225,7 @@ class RegressionTest { return; } def eventListeners = getEventListeners(config, recorder) - ExecutorService executors = null - if (isSingleThreadScript) { - executors = singleSuiteExecutors - } else { - executors = suiteExecutors - } + ExecutorService executors = suiteExecutors[grpExecType] new ScriptContext(file, executors, actionExecutors, config, eventListeners, suiteFilter).start { scriptContext -> @@ -242,11 +251,20 @@ class RegressionTest { scriptSources.eachWithIndex { source, i -> // log.info("Prepare scripts [${i + 1}/${totalFile}]".toString()) def future = scriptExecutors.submit { - runScript(config, source, recorder, false) + runScript(config, source, recorder, GroupExecType.NORMAL) } futures.add(future) } + List dockerFutures 
= Lists.newArrayList() + scriptSources.eachWithIndex { source, i -> +// log.info("Prepare scripts [${i + 1}/${totalFile}]".toString()) + def future = scriptExecutors.submit { + runScript(config, source, recorder, GroupExecType.DOCKER) + } + dockerFutures.add(future) + } + // wait all scripts for (Future future : futures) { try { @@ -261,12 +279,20 @@ class RegressionTest { scriptSources.eachWithIndex { source, i -> // log.info("Prepare scripts [${i + 1}/${totalFile}]".toString()) def future = scriptExecutors.submit { - runScript(config, source, recorder, true) + runScript(config, source, recorder, GroupExecType.SINGLE) } futures.add(future) } // wait all scripts + for (Future future : dockerFutures) { + try { + future.get() + } catch (Throwable t) { + // do nothing, because already save to Recorder + } + } + for (Future future : futures) { try { future.get() @@ -323,19 +349,19 @@ class RegressionTest { return true } - static boolean canRun(Config config, String suiteName, String group, boolean isSingleThreadScript) { + static boolean canRun(Config config, String suiteName, String group, GroupExecType grpExecType) { + return getGroupExecType(group) == grpExecType && filterGroups(config, group) && filterSuites(config, suiteName) + } + + static GroupExecType getGroupExecType(String group) { Set suiteGroups = group.split(',').collect { g -> g.trim() }.toSet(); - if (isSingleThreadScript) { - if (!suiteGroups.contains(nonConcurrentTestGroup)) { - return false - } + if (suiteGroups.contains(nonConcurrentTestGroup)) { + return GroupExecType.SINGLE + } else if (suiteGroups.contains(dockerTestGroup)) { + return GroupExecType.DOCKER } else { - if (suiteGroups.contains(nonConcurrentTestGroup)) { - return false - } + return GroupExecType.NORMAL } - - return filterGroups(config, group) && filterSuites(config, suiteName) } static List getEventListeners(Config config, Recorder recorder) { @@ -421,7 +447,7 @@ class RegressionTest { } pluginPath.eachFileRecurse({ it -> if (it.name.endsWith(".groovy")) { - ScriptContext context = new ScriptContext(it, suiteExecutors, actionExecutors, + ScriptContext context = new ScriptContext(it, suiteExecutors[GroupExecType.NORMAL], actionExecutors, config, [], { name -> true }) File pluginFile = it context.start({ @@ -454,7 +480,7 @@ class RegressionTest { + "output: ${sout.toString()}, error: ${serr.toString()}") } - def pipList = 'pip list'.execute().text + def pipList = 'python -m pip list'.execute().text log.info("python library: ${pipList}") } diff --git a/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy b/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy index 713a30e98ca076..eb816ecb73f997 100644 --- a/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy +++ b/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy @@ -29,6 +29,7 @@ import groovy.json.JsonSlurper import com.google.common.collect.ImmutableList import org.apache.commons.lang3.ObjectUtils import org.apache.doris.regression.Config +import org.apache.doris.regression.RegressionTest import org.apache.doris.regression.action.BenchmarkAction import org.apache.doris.regression.action.ProfileAction import org.apache.doris.regression.action.WaitForAction @@ -276,6 +277,11 @@ class Suite implements GroovyInterceptable { return } + if (RegressionTest.getGroupExecType(group) != RegressionTest.GroupExecType.DOCKER) { + throw new Exception("Need to add 'docker' to 
docker suite's belong groups, " + + "see example demo_p0/docker_action.groovy") + } + boolean pipelineIsCloud = isCloudMode() boolean dockerIsCloud = false if (options.cloudMode == null) { diff --git a/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/SuiteCluster.groovy b/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/SuiteCluster.groovy index 44220500d1b5bd..a2f99868bd739b 100644 --- a/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/SuiteCluster.groovy +++ b/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/SuiteCluster.groovy @@ -40,6 +40,7 @@ class ClusterOptions { ] List beConfigs = [ + 'max_sys_mem_available_low_water_mark_bytes=0', //no check mem available memory 'report_disk_state_interval_seconds=2', 'report_random_wait=false', ] diff --git a/regression-test/suites/clone_p0/test_clone_missing_version.groovy b/regression-test/suites/clone_p0/test_clone_missing_version.groovy index 2981cf3c5e3638..aa119158dfa8ac 100644 --- a/regression-test/suites/clone_p0/test_clone_missing_version.groovy +++ b/regression-test/suites/clone_p0/test_clone_missing_version.groovy @@ -18,7 +18,7 @@ import org.apache.doris.regression.suite.ClusterOptions import org.apache.doris.regression.util.NodeType -suite('test_clone_missing_version') { +suite('test_clone_missing_version', 'docker') { def options = new ClusterOptions() options.feConfigs += [ 'disable_tablet_scheduler=true', diff --git a/regression-test/suites/clone_p0/test_clone_no_missing_version.groovy b/regression-test/suites/clone_p0/test_clone_no_missing_version.groovy index 75eb3866ec8302..b19521441dd20e 100644 --- a/regression-test/suites/clone_p0/test_clone_no_missing_version.groovy +++ b/regression-test/suites/clone_p0/test_clone_no_missing_version.groovy @@ -18,7 +18,7 @@ import org.apache.doris.regression.suite.ClusterOptions import org.apache.doris.regression.util.NodeType -suite('test_clone_no_missing_version') { +suite('test_clone_no_missing_version', 'docker') { def tbl = 'tbl_test_clone_no_missing_version' def options = new ClusterOptions() options.feConfigs += [ diff --git a/regression-test/suites/clone_p0/test_decommission_mtmv.groovy b/regression-test/suites/clone_p0/test_decommission_mtmv.groovy index 24853aa718c574..b29d5c13c94447 100644 --- a/regression-test/suites/clone_p0/test_decommission_mtmv.groovy +++ b/regression-test/suites/clone_p0/test_decommission_mtmv.groovy @@ -17,7 +17,7 @@ import org.apache.doris.regression.suite.ClusterOptions -suite('test_decommission_mtmv') { +suite('test_decommission_mtmv', 'docker') { def options = new ClusterOptions() options.feConfigs += [ 'disable_balance=true', diff --git a/regression-test/suites/clone_p0/test_drop_clone_tablet_path_race.groovy b/regression-test/suites/clone_p0/test_drop_clone_tablet_path_race.groovy index ebf1259a72f2c8..f9c72f3b71c23b 100644 --- a/regression-test/suites/clone_p0/test_drop_clone_tablet_path_race.groovy +++ b/regression-test/suites/clone_p0/test_drop_clone_tablet_path_race.groovy @@ -18,7 +18,7 @@ import org.apache.doris.regression.suite.ClusterOptions import org.junit.Assert -suite('test_drop_clone_tablet_path_race') { +suite('test_drop_clone_tablet_path_race', 'docker') { if (isCloudMode()) { return } diff --git a/regression-test/suites/cloud_p0/multi_cluster/test_rebalance.groovy b/regression-test/suites/cloud_p0/multi_cluster/test_rebalance.groovy index 0aa2e83ccc2bfd..542f9a969f5d6a 100644 --- 
a/regression-test/suites/cloud_p0/multi_cluster/test_rebalance.groovy +++ b/regression-test/suites/cloud_p0/multi_cluster/test_rebalance.groovy @@ -20,7 +20,7 @@ import groovy.json.JsonSlurper import org.awaitility.Awaitility; import static java.util.concurrent.TimeUnit.SECONDS; -suite('test_rebalance_in_cloud', 'multi_cluster') { +suite('test_rebalance_in_cloud', 'multi_cluster,docker') { if (!isCloudMode()) { return; } diff --git a/regression-test/suites/cloud_p0/multi_cluster/test_tvf.groovy b/regression-test/suites/cloud_p0/multi_cluster/test_tvf.groovy index 13af1209e99db4..90fd6656b8ffbb 100644 --- a/regression-test/suites/cloud_p0/multi_cluster/test_tvf.groovy +++ b/regression-test/suites/cloud_p0/multi_cluster/test_tvf.groovy @@ -18,7 +18,7 @@ import org.apache.doris.regression.suite.ClusterOptions import groovy.json.JsonSlurper -suite('test_tvf_in_cloud', 'multi_cluster') { +suite('test_tvf_in_cloud', 'multi_cluster,docker') { if (!isCloudMode()) { return; } @@ -83,4 +83,4 @@ suite('test_tvf_in_cloud', 'multi_cluster') { sql """use @${currentCluster.cluster}""" testCase.call() } -} \ No newline at end of file +} diff --git a/regression-test/suites/cloud_p0/query_retry/test_retry_e-230.groovy b/regression-test/suites/cloud_p0/query_retry/test_retry_e-230.groovy index 2d8ca3f529674d..88ec8e8861d6f4 100644 --- a/regression-test/suites/cloud_p0/query_retry/test_retry_e-230.groovy +++ b/regression-test/suites/cloud_p0/query_retry/test_retry_e-230.groovy @@ -18,7 +18,7 @@ import org.apache.doris.regression.suite.ClusterOptions import org.apache.doris.regression.util.NodeType import org.apache.doris.regression.suite.SuiteCluster -suite("test_retry_e-230") { +suite("test_retry_e-230", 'docker') { if (!isCloudMode()) { return } diff --git a/regression-test/suites/cloud_p0/schema_change/compaction10/test_schema_change_with_compaction10.groovy b/regression-test/suites/cloud_p0/schema_change/compaction10/test_schema_change_with_compaction10.groovy index b393979d44218a..ea5e818c2ee06b 100644 --- a/regression-test/suites/cloud_p0/schema_change/compaction10/test_schema_change_with_compaction10.groovy +++ b/regression-test/suites/cloud_p0/schema_change/compaction10/test_schema_change_with_compaction10.groovy @@ -20,7 +20,7 @@ import org.apache.http.NoHttpResponseException import org.apache.doris.regression.util.DebugPoint import org.apache.doris.regression.util.NodeType -suite('test_schema_change_with_compaction10') { +suite('test_schema_change_with_compaction10', 'docker') { def options = new ClusterOptions() options.cloudMode = true options.enableDebugPoints() @@ -260,4 +260,4 @@ suite('test_schema_change_with_compaction10') { assertTrue(out.contains("[8-16]")) } } -} \ No newline at end of file +} diff --git a/regression-test/suites/cloud_p0/schema_change/compaction11/test_schema_change_with_compaction11.groovy b/regression-test/suites/cloud_p0/schema_change/compaction11/test_schema_change_with_compaction11.groovy index fd257fcb7ea950..d49d8646d3fd51 100644 --- a/regression-test/suites/cloud_p0/schema_change/compaction11/test_schema_change_with_compaction11.groovy +++ b/regression-test/suites/cloud_p0/schema_change/compaction11/test_schema_change_with_compaction11.groovy @@ -20,7 +20,7 @@ import org.apache.http.NoHttpResponseException import org.apache.doris.regression.util.DebugPoint import org.apache.doris.regression.util.NodeType -suite('test_schema_change_with_compaction11') { +suite('test_schema_change_with_compaction11', 'docker') { def options = new ClusterOptions() options.cloudMode = 
true options.enableDebugPoints() diff --git a/regression-test/suites/cloud_p0/schema_change/compaction5/test_schema_change_with_compaction5.groovy b/regression-test/suites/cloud_p0/schema_change/compaction5/test_schema_change_with_compaction5.groovy index f5028ff9e818c3..fd6267b85bcbc9 100644 --- a/regression-test/suites/cloud_p0/schema_change/compaction5/test_schema_change_with_compaction5.groovy +++ b/regression-test/suites/cloud_p0/schema_change/compaction5/test_schema_change_with_compaction5.groovy @@ -20,7 +20,7 @@ import org.apache.http.NoHttpResponseException import org.apache.doris.regression.util.DebugPoint import org.apache.doris.regression.util.NodeType -suite('test_schema_change_with_compaction5', 'nonConcurrent') { +suite('test_schema_change_with_compaction5', 'docker') { def options = new ClusterOptions() options.cloudMode = true options.enableDebugPoints() @@ -257,4 +257,4 @@ suite('test_schema_change_with_compaction5', 'nonConcurrent') { assertTrue(out.contains("[8-16]")) } } -} \ No newline at end of file +} diff --git a/regression-test/suites/cloud_p0/schema_change/compaction6/test_schema_change_with_compaction6.groovy b/regression-test/suites/cloud_p0/schema_change/compaction6/test_schema_change_with_compaction6.groovy index 951535433d1362..d77db4eb2df541 100644 --- a/regression-test/suites/cloud_p0/schema_change/compaction6/test_schema_change_with_compaction6.groovy +++ b/regression-test/suites/cloud_p0/schema_change/compaction6/test_schema_change_with_compaction6.groovy @@ -20,7 +20,7 @@ import org.apache.http.NoHttpResponseException import org.apache.doris.regression.util.DebugPoint import org.apache.doris.regression.util.NodeType -suite('test_schema_change_with_compaction6', 'nonConcurrent') { +suite('test_schema_change_with_compaction6', 'docker') { def options = new ClusterOptions() options.cloudMode = true options.enableDebugPoints() @@ -260,4 +260,4 @@ suite('test_schema_change_with_compaction6', 'nonConcurrent') { assertTrue(out.contains("[8-16]")) } } -} \ No newline at end of file +} diff --git a/regression-test/suites/cloud_p0/schema_change/compaction9/test_schema_change_with_compaction9.groovy b/regression-test/suites/cloud_p0/schema_change/compaction9/test_schema_change_with_compaction9.groovy index 83c549eefc5abd..3797a89f565997 100644 --- a/regression-test/suites/cloud_p0/schema_change/compaction9/test_schema_change_with_compaction9.groovy +++ b/regression-test/suites/cloud_p0/schema_change/compaction9/test_schema_change_with_compaction9.groovy @@ -20,7 +20,7 @@ import org.apache.http.NoHttpResponseException import org.apache.doris.regression.util.DebugPoint import org.apache.doris.regression.util.NodeType -suite('test_schema_change_with_compaction9') { +suite('test_schema_change_with_compaction9', 'docker') { def options = new ClusterOptions() options.cloudMode = true options.enableDebugPoints() @@ -257,4 +257,4 @@ suite('test_schema_change_with_compaction9') { assertTrue(out.contains("[8-16]")) } } -} \ No newline at end of file +} diff --git a/regression-test/suites/compaction/test_compaction_with_visible_version.groovy b/regression-test/suites/compaction/test_compaction_with_visible_version.groovy index 4a6ee4c847a5db..e9b60774efa22b 100644 --- a/regression-test/suites/compaction/test_compaction_with_visible_version.groovy +++ b/regression-test/suites/compaction/test_compaction_with_visible_version.groovy @@ -19,7 +19,7 @@ import org.apache.doris.regression.suite.ClusterOptions import org.apache.doris.regression.util.Http import 
org.apache.doris.regression.util.NodeType -suite('test_compaction_with_visible_version') { +suite('test_compaction_with_visible_version', 'docker') { def options = new ClusterOptions() def compaction_keep_invisible_version_min_count = 50L options.feConfigs += [ diff --git a/regression-test/suites/demo_p0/docker_action.groovy b/regression-test/suites/demo_p0/docker_action.groovy index 6d62d6ea7bea8d..bfe9c0039e2761 100644 --- a/regression-test/suites/demo_p0/docker_action.groovy +++ b/regression-test/suites/demo_p0/docker_action.groovy @@ -17,7 +17,15 @@ import org.apache.doris.regression.suite.ClusterOptions -suite('docker_action') { +// run docker suite steps: +// 1. Read 'docker/runtime/doris-compose/Readme.md', make sure you can setup a doris docker cluster; +// 2. update regression-conf-custom.groovy with config: +// image = "xxxx" // your doris docker image +// excludeDockerTest = false // do run docker suite, default is false +// dockerEndDeleteFiles = false // after run docker suite, whether delete contains's log and data in directory '/tmp/doris/' + +// need add 'docker' to suite's group, and don't add 'nonConcurrent' to it +suite('docker_action', 'docker') { // run a new docker docker { sql '''create table tb1 (k int) DISTRIBUTED BY HASH(k) BUCKETS 10''' diff --git a/regression-test/suites/insert_p0/group_commit/replay_wal_restart_fe.groovy b/regression-test/suites/insert_p0/group_commit/replay_wal_restart_fe.groovy index d39bdd9d4a954d..8347950ca6bbe4 100644 --- a/regression-test/suites/insert_p0/group_commit/replay_wal_restart_fe.groovy +++ b/regression-test/suites/insert_p0/group_commit/replay_wal_restart_fe.groovy @@ -21,7 +21,7 @@ import org.apache.doris.regression.suite.ClusterOptions -suite("replay_wal_restart_fe") { +suite("replay_wal_restart_fe", 'docker') { def check_schema_change = { state -> for (int i = 0; i < 30; i++) { def jobs = sql_return_maparray "SHOW ALTER TABLE COLUMN WHERE TableName = 'tbl_2' order by CreateTime desc;" diff --git a/regression-test/suites/insert_p0/transaction/txn_insert_restart_fe.groovy b/regression-test/suites/insert_p0/transaction/txn_insert_restart_fe.groovy index 9d1e372224e80e..bd478bc2359959 100644 --- a/regression-test/suites/insert_p0/transaction/txn_insert_restart_fe.groovy +++ b/regression-test/suites/insert_p0/transaction/txn_insert_restart_fe.groovy @@ -22,7 +22,7 @@ import org.apache.doris.regression.suite.ClusterOptions import org.apache.doris.regression.util.NodeType -suite("txn_insert_restart_fe") { +suite("txn_insert_restart_fe", 'docker') { def get_observer_fe_url = { def fes = sql_return_maparray "show frontends" logger.info("frontends: ${fes}") diff --git a/regression-test/suites/insert_p0/transaction/txn_insert_restart_fe_with_schema_change.groovy b/regression-test/suites/insert_p0/transaction/txn_insert_restart_fe_with_schema_change.groovy index fec2fc4378dff2..d2537bfe8c6a55 100644 --- a/regression-test/suites/insert_p0/transaction/txn_insert_restart_fe_with_schema_change.groovy +++ b/regression-test/suites/insert_p0/transaction/txn_insert_restart_fe_with_schema_change.groovy @@ -22,7 +22,7 @@ import org.apache.doris.regression.suite.ClusterOptions import org.apache.doris.regression.util.NodeType -suite("txn_insert_restart_fe_with_schema_change") { +suite("txn_insert_restart_fe_with_schema_change", 'docker') { def getAlterTableState = { dbName, show_sql -> def retry = 0 sql "use ${dbName};" diff --git a/regression-test/suites/inverted_index_p0/index_change/test_build_index_with_clone_by_docker.groovy 
b/regression-test/suites/inverted_index_p0/index_change/test_build_index_with_clone_by_docker.groovy index f8478c3ea61ea0..999b58350ff1ab 100644 --- a/regression-test/suites/inverted_index_p0/index_change/test_build_index_with_clone_by_docker.groovy +++ b/regression-test/suites/inverted_index_p0/index_change/test_build_index_with_clone_by_docker.groovy @@ -19,7 +19,7 @@ import org.apache.doris.regression.suite.ClusterOptions import org.apache.doris.regression.util.NodeType import org.apache.doris.regression.suite.SuiteCluster -suite("test_build_index_with_clone_by_docker"){ +suite("test_build_index_with_clone_by_docker", 'docker'){ if (isCloudMode()) { return } diff --git a/regression-test/suites/load/insert/test_min_load_replica_num_complicate.groovy b/regression-test/suites/load/insert/test_min_load_replica_num_complicate.groovy index 10869ed9bf2d3b..d6c9c71539b406 100644 --- a/regression-test/suites/load/insert/test_min_load_replica_num_complicate.groovy +++ b/regression-test/suites/load/insert/test_min_load_replica_num_complicate.groovy @@ -34,7 +34,7 @@ class InjectCase { } -suite('test_min_load_replica_num_complicate') { +suite('test_min_load_replica_num_complicate', 'docker') { def beCloneCostMs = 3000 def random = new Random() diff --git a/regression-test/suites/load/insert/test_publish_one_succ.groovy b/regression-test/suites/load/insert/test_publish_one_succ.groovy index 22f78c64300222..f58bca5c4581c9 100644 --- a/regression-test/suites/load/insert/test_publish_one_succ.groovy +++ b/regression-test/suites/load/insert/test_publish_one_succ.groovy @@ -18,7 +18,7 @@ import org.apache.doris.regression.suite.ClusterOptions import org.apache.doris.regression.util.NodeType -suite('test_publish_one_succ') { +suite('test_publish_one_succ', 'docker') { def options = new ClusterOptions() options.enableDebugPoints() docker(options) { diff --git a/regression-test/suites/load/insert/test_publish_slow_not_wait.groovy b/regression-test/suites/load/insert/test_publish_slow_not_wait.groovy index 8d3cddc42187b8..d9cd077eab01c6 100644 --- a/regression-test/suites/load/insert/test_publish_slow_not_wait.groovy +++ b/regression-test/suites/load/insert/test_publish_slow_not_wait.groovy @@ -17,7 +17,7 @@ import org.apache.doris.regression.suite.ClusterOptions -suite('test_publish_slow_not_wait') { +suite('test_publish_slow_not_wait', 'docker') { def options = new ClusterOptions() options.beNum = 3 options.feConfigs.add('disable_tablet_scheduler=true') diff --git a/regression-test/suites/load_p0/insert/test_min_load_replica_num_simple.groovy b/regression-test/suites/load_p0/insert/test_min_load_replica_num_simple.groovy index 75d7155c3d44a6..88bcbfd248e2b7 100644 --- a/regression-test/suites/load_p0/insert/test_min_load_replica_num_simple.groovy +++ b/regression-test/suites/load_p0/insert/test_min_load_replica_num_simple.groovy @@ -17,7 +17,7 @@ import org.apache.doris.regression.suite.ClusterOptions -suite('test_min_load_replica_num_simple') { +suite('test_min_load_replica_num_simple', 'docker') { def options = new ClusterOptions() options.feConfigs.add('tablet_checker_interval_ms=1000') docker(options) { diff --git a/regression-test/suites/load_p0/routine_load/test_routine_load_restart_fe.groovy b/regression-test/suites/load_p0/routine_load/test_routine_load_restart_fe.groovy index d60fbf265fd9e5..d8ea6f911799a2 100644 --- a/regression-test/suites/load_p0/routine_load/test_routine_load_restart_fe.groovy +++ b/regression-test/suites/load_p0/routine_load/test_routine_load_restart_fe.groovy @@ -22,7 +22,7 
@@ import org.apache.kafka.clients.producer.KafkaProducer import org.apache.kafka.clients.producer.ProducerRecord import org.apache.kafka.clients.producer.ProducerConfig -suite("test_routine_load_restart_fe", "p0, nonConcurrent") { +suite("test_routine_load_restart_fe", "docker") { def kafkaCsvTpoics = [ "test_out_of_range", ] diff --git a/regression-test/suites/load_p0/stream_load/test_coordidator_be_restart.groovy b/regression-test/suites/load_p0/stream_load/test_coordidator_be_restart.groovy index bb6b0c18a0daf7..e728335003a4f7 100644 --- a/regression-test/suites/load_p0/stream_load/test_coordidator_be_restart.groovy +++ b/regression-test/suites/load_p0/stream_load/test_coordidator_be_restart.groovy @@ -18,7 +18,7 @@ import org.apache.doris.regression.suite.ClusterOptions import org.apache.http.NoHttpResponseException -suite('test_coordidator_be_restart') { +suite('test_coordidator_be_restart', 'docker') { def options = new ClusterOptions() options.cloudMode = false options.enableDebugPoints() diff --git a/regression-test/suites/migrate_p0/test_migrate_disk_with_publish_version.groovy b/regression-test/suites/migrate_p0/test_migrate_disk_with_publish_version.groovy index fb80d4272ddbaa..61dac3e4141822 100644 --- a/regression-test/suites/migrate_p0/test_migrate_disk_with_publish_version.groovy +++ b/regression-test/suites/migrate_p0/test_migrate_disk_with_publish_version.groovy @@ -18,7 +18,7 @@ import org.apache.doris.regression.suite.ClusterOptions import org.apache.doris.regression.util.NodeType -suite('test_migrate_disk_with_publish_version') { +suite('test_migrate_disk_with_publish_version', 'docker') { if (isCloudMode()) { return } diff --git a/regression-test/suites/partition_p0/dynamic_partition/test_dynamic_partition_mod_distribution_key.groovy b/regression-test/suites/partition_p0/dynamic_partition/test_dynamic_partition_mod_distribution_key.groovy index db44f59216bbdd..c52e5897aa0880 100644 --- a/regression-test/suites/partition_p0/dynamic_partition/test_dynamic_partition_mod_distribution_key.groovy +++ b/regression-test/suites/partition_p0/dynamic_partition/test_dynamic_partition_mod_distribution_key.groovy @@ -18,7 +18,7 @@ import org.apache.doris.regression.suite.ClusterOptions -suite("test_dynamic_partition_mod_distribution_key") { +suite("test_dynamic_partition_mod_distribution_key", "docker") { def options = new ClusterOptions() options.setFeNum(2) docker(options) { @@ -72,4 +72,4 @@ suite("test_dynamic_partition_mod_distribution_key") { assertEquals(9, result.size()) } } -} \ No newline at end of file +} diff --git a/regression-test/suites/partition_p0/test_create_table_exception.groovy b/regression-test/suites/partition_p0/test_create_table_exception.groovy index 96f097c76705f2..7c96e4b59da16e 100644 --- a/regression-test/suites/partition_p0/test_create_table_exception.groovy +++ b/regression-test/suites/partition_p0/test_create_table_exception.groovy @@ -18,7 +18,7 @@ import org.apache.doris.regression.suite.ClusterOptions import org.apache.doris.regression.util.NodeType import org.apache.doris.regression.suite.SuiteCluster -suite("test_create_table_exception") { +suite("test_create_table_exception", "docker") { def options = new ClusterOptions() options.enableDebugPoints() options.setFeNum(3) diff --git a/regression-test/suites/partition_p0/test_partition_create_tablet_rr.groovy b/regression-test/suites/partition_p0/test_partition_create_tablet_rr.groovy index f7e77f06f38c23..836dff938f8c96 100644 --- 
a/regression-test/suites/partition_p0/test_partition_create_tablet_rr.groovy +++ b/regression-test/suites/partition_p0/test_partition_create_tablet_rr.groovy @@ -18,7 +18,7 @@ import org.apache.doris.regression.suite.ClusterOptions import org.apache.doris.regression.util.NodeType import org.apache.doris.regression.suite.SuiteCluster -suite("test_partition_create_tablet_rr") { +suite("test_partition_create_tablet_rr", "docker") { def options = new ClusterOptions() options.beNum = 1 options.feConfigs.add('disable_balance=true') diff --git a/regression-test/suites/query_p0/test_forward_qeury.groovy b/regression-test/suites/query_p0/test_forward_qeury.groovy index 8dbef459d2dd75..28295e4ec895bb 100644 --- a/regression-test/suites/query_p0/test_forward_qeury.groovy +++ b/regression-test/suites/query_p0/test_forward_qeury.groovy @@ -19,7 +19,7 @@ import org.apache.doris.regression.suite.ClusterOptions import org.apache.doris.regression.util.NodeType -suite("test_forward_query") { +suite("test_forward_query", 'docker') { def options = new ClusterOptions() options.enableDebugPoints() options.setFeNum(2) @@ -49,4 +49,4 @@ suite("test_forward_query") { assertTrue(false) } } -} \ No newline at end of file +} diff --git a/regression-test/suites/schema_change/test_schema_change_concurrent_with_txn.groovy b/regression-test/suites/schema_change/test_schema_change_concurrent_with_txn.groovy index d250a000c0123f..3a63e306ae15de 100644 --- a/regression-test/suites/schema_change/test_schema_change_concurrent_with_txn.groovy +++ b/regression-test/suites/schema_change/test_schema_change_concurrent_with_txn.groovy @@ -18,7 +18,7 @@ import org.apache.doris.regression.suite.ClusterOptions import org.apache.doris.regression.util.NodeType -suite('test_schema_change_concurrent_with_txn') { +suite('test_schema_change_concurrent_with_txn', 'docker') { def options = new ClusterOptions() options.enableDebugPoints() options.feConfigs.add('publish_wait_time_second=-1') diff --git a/regression-test/suites/schema_change_p0/test_abort_txn_by_be_cloud1.groovy b/regression-test/suites/schema_change_p0/test_abort_txn_by_be_cloud1.groovy index 03f77917731c64..f2d0b767eb89fe 100644 --- a/regression-test/suites/schema_change_p0/test_abort_txn_by_be_cloud1.groovy +++ b/regression-test/suites/schema_change_p0/test_abort_txn_by_be_cloud1.groovy @@ -18,7 +18,7 @@ import org.apache.doris.regression.suite.ClusterOptions import org.apache.http.NoHttpResponseException -suite('test_abort_txn_by_be_cloud1') { +suite('test_abort_txn_by_be_cloud1', 'docker') { def options = new ClusterOptions() options.cloudMode = true options.enableDebugPoints() diff --git a/regression-test/suites/schema_change_p0/test_abort_txn_by_be_cloud2.groovy b/regression-test/suites/schema_change_p0/test_abort_txn_by_be_cloud2.groovy index 7a2d382f3abfda..7264ac7f90a9f4 100644 --- a/regression-test/suites/schema_change_p0/test_abort_txn_by_be_cloud2.groovy +++ b/regression-test/suites/schema_change_p0/test_abort_txn_by_be_cloud2.groovy @@ -18,7 +18,7 @@ import org.apache.doris.regression.suite.ClusterOptions import org.apache.http.NoHttpResponseException -suite('test_abort_txn_by_be_cloud2') { +suite('test_abort_txn_by_be_cloud2', 'docker') { def options = new ClusterOptions() options.cloudMode = true options.enableDebugPoints() diff --git a/regression-test/suites/schema_change_p0/test_abort_txn_by_be_local5.groovy b/regression-test/suites/schema_change_p0/test_abort_txn_by_be_local5.groovy index df4fb5d637e566..0df8254ff25844 100644 --- 
a/regression-test/suites/schema_change_p0/test_abort_txn_by_be_local5.groovy +++ b/regression-test/suites/schema_change_p0/test_abort_txn_by_be_local5.groovy @@ -18,7 +18,7 @@ import org.apache.doris.regression.suite.ClusterOptions import org.apache.http.NoHttpResponseException -suite('test_abort_txn_by_be_local5') { +suite('test_abort_txn_by_be_local5', 'docker') { def options = new ClusterOptions() options.cloudMode = false options.skipRunWhenPipelineDiff = false diff --git a/regression-test/suites/schema_change_p0/test_abort_txn_by_be_local6.groovy b/regression-test/suites/schema_change_p0/test_abort_txn_by_be_local6.groovy index 1f6e6df4417212..a95d335579b046 100644 --- a/regression-test/suites/schema_change_p0/test_abort_txn_by_be_local6.groovy +++ b/regression-test/suites/schema_change_p0/test_abort_txn_by_be_local6.groovy @@ -18,7 +18,7 @@ import org.apache.doris.regression.suite.ClusterOptions import org.apache.http.NoHttpResponseException -suite('test_abort_txn_by_be_local6') { +suite('test_abort_txn_by_be_local6', 'docker') { def options = new ClusterOptions() options.cloudMode = false options.skipRunWhenPipelineDiff = true diff --git a/regression-test/suites/schema_change_p0/test_abort_txn_by_fe_cloud4.groovy b/regression-test/suites/schema_change_p0/test_abort_txn_by_fe_cloud4.groovy index bd12d57fd34ed1..80b61e16efd3b0 100644 --- a/regression-test/suites/schema_change_p0/test_abort_txn_by_fe_cloud4.groovy +++ b/regression-test/suites/schema_change_p0/test_abort_txn_by_fe_cloud4.groovy @@ -18,7 +18,7 @@ import org.apache.doris.regression.suite.ClusterOptions import org.apache.http.NoHttpResponseException -suite('test_abort_txn_by_fe_cloud4') { +suite('test_abort_txn_by_fe_cloud4', 'docker') { def options = new ClusterOptions() options.cloudMode = true options.enableDebugPoints() diff --git a/regression-test/suites/schema_change_p0/test_abort_txn_by_fe_local3.groovy b/regression-test/suites/schema_change_p0/test_abort_txn_by_fe_local3.groovy index 37667abe9506d7..355dab0587917c 100644 --- a/regression-test/suites/schema_change_p0/test_abort_txn_by_fe_local3.groovy +++ b/regression-test/suites/schema_change_p0/test_abort_txn_by_fe_local3.groovy @@ -18,7 +18,7 @@ import org.apache.doris.regression.suite.ClusterOptions import org.apache.http.NoHttpResponseException -suite('test_abort_txn_by_fe_local3') { +suite('test_abort_txn_by_fe_local3', 'docker') { def options = new ClusterOptions() options.cloudMode = false options.skipRunWhenPipelineDiff = false diff --git a/regression-test/suites/storage_medium_p0/test_partition_default_medium.groovy b/regression-test/suites/storage_medium_p0/test_partition_default_medium.groovy index 3543ce64ab1ae7..163761833c21b7 100644 --- a/regression-test/suites/storage_medium_p0/test_partition_default_medium.groovy +++ b/regression-test/suites/storage_medium_p0/test_partition_default_medium.groovy @@ -18,7 +18,7 @@ import org.apache.doris.regression.suite.ClusterOptions import org.apache.doris.regression.util.NodeType import org.apache.doris.regression.suite.SuiteCluster -suite("test_partition_default_medium") { +suite("test_partition_default_medium", 'docker') { def options = new ClusterOptions() options.feConfigs += [ 'default_storage_medium=HDD', diff --git a/regression-test/suites/storage_medium_p0/test_storage_medium_has_disk.groovy b/regression-test/suites/storage_medium_p0/test_storage_medium_has_disk.groovy index bd06680d2b2aeb..c252e10b130333 100644 --- a/regression-test/suites/storage_medium_p0/test_storage_medium_has_disk.groovy +++ 
b/regression-test/suites/storage_medium_p0/test_storage_medium_has_disk.groovy @@ -17,7 +17,7 @@ import org.apache.doris.regression.suite.ClusterOptions -suite('test_storage_medium_has_disk') { +suite('test_storage_medium_has_disk', 'docker') { if (isCloudMode()) { return } diff --git a/regression-test/suites/trash_p0/clean_trash.groovy b/regression-test/suites/trash_p0/clean_trash.groovy index 71438b5b1b5af7..fdfafc4887fdcd 100644 --- a/regression-test/suites/trash_p0/clean_trash.groovy +++ b/regression-test/suites/trash_p0/clean_trash.groovy @@ -17,7 +17,7 @@ import org.apache.doris.regression.suite.ClusterOptions import org.junit.Assert -suite("test_clean_trash", "p0") { +suite("test_clean_trash", "docker") { if (isCloudMode()) { return } @@ -77,4 +77,4 @@ suite("test_clean_trash", "p0") { sql """admin clean trash""" checkFunc(true) } -} \ No newline at end of file +} diff --git a/regression-test/suites/unique_with_mow_c_p0/test_mow_full_clone_exception.groovy b/regression-test/suites/unique_with_mow_c_p0/test_mow_full_clone_exception.groovy index 516cff9d4f58e3..6ab872239cd945 100644 --- a/regression-test/suites/unique_with_mow_c_p0/test_mow_full_clone_exception.groovy +++ b/regression-test/suites/unique_with_mow_c_p0/test_mow_full_clone_exception.groovy @@ -29,7 +29,7 @@ import org.apache.doris.regression.util.NodeType // // the bug is fixed in #37001 -suite('test_full_clone_exception') { +suite('test_full_clone_exception', 'docker') { def options = new ClusterOptions() options.feConfigs += [ 'disable_tablet_scheduler=true', diff --git a/regression-test/suites/unique_with_mow_p0/partial_update/test_partial_update_conflict_be_restart.groovy b/regression-test/suites/unique_with_mow_p0/partial_update/test_partial_update_conflict_be_restart.groovy index bc2a44425b30c8..642363f909866d 100644 --- a/regression-test/suites/unique_with_mow_p0/partial_update/test_partial_update_conflict_be_restart.groovy +++ b/regression-test/suites/unique_with_mow_p0/partial_update/test_partial_update_conflict_be_restart.groovy @@ -35,7 +35,7 @@ import org.apache.http.client.methods.CloseableHttpResponse import org.apache.http.util.EntityUtils import org.apache.doris.regression.suite.ClusterOptions -suite("test_partial_update_conflict_be_restart") { +suite("test_partial_update_conflict_be_restart", 'docker') { def dbName = context.config.getDbNameByFile(context.file) def options = new ClusterOptions() diff --git a/regression-test/suites/unique_with_mow_p0/test_mow_full_clone_exception.groovy b/regression-test/suites/unique_with_mow_p0/test_mow_full_clone_exception.groovy index c3fb567f258f8d..42befff4e54cce 100644 --- a/regression-test/suites/unique_with_mow_p0/test_mow_full_clone_exception.groovy +++ b/regression-test/suites/unique_with_mow_p0/test_mow_full_clone_exception.groovy @@ -29,7 +29,7 @@ import org.apache.doris.regression.util.NodeType // // the bug is fixed in #37001 -suite('test_full_clone_exception') { +suite('test_full_clone_exception', 'docker') { def options = new ClusterOptions() options.feConfigs += [ 'disable_tablet_scheduler=true', diff --git a/run-regression-test.sh b/run-regression-test.sh index ea7ced8be9498b..6357f4111a7f5d 100755 --- a/run-regression-test.sh +++ b/run-regression-test.sh @@ -43,6 +43,7 @@ Usage: $0 -genOut generate .out file if not exist -forceGenOut delete and generate .out file -parallel run tests using specified threads + -dockerSuiteParallel run docker tests using specified threads -randomOrder run tests in a random order -noKillDocker don't kill container when 
finish docker suites -times rum tests {times} times From fcceceab89fc5c63190d5ab03de78c3f7fdd8ca1 Mon Sep 17 00:00:00 2001 From: yujun Date: Wed, 11 Sep 2024 00:08:45 +0800 Subject: [PATCH 41/44] [improvement](tablet report) tablet report increase report version (#40172) when be report tablet, update its report version, then if fe is handling a stale report, it can quickly discard it. --- be/src/agent/task_worker_pool.cpp | 3 +- .../org/apache/doris/master/MasterImpl.java | 10 +++--- .../apache/doris/master/ReportHandler.java | 2 ++ .../doris/system/SystemInfoService.java | 36 ++++++++++++------- .../doris/cluster/SystemInfoServiceTest.java | 2 +- 5 files changed, 34 insertions(+), 19 deletions(-) diff --git a/be/src/agent/task_worker_pool.cpp b/be/src/agent/task_worker_pool.cpp index 27921888774f9b..d4944711720a59 100644 --- a/be/src/agent/task_worker_pool.cpp +++ b/be/src/agent/task_worker_pool.cpp @@ -102,7 +102,7 @@ namespace { std::mutex s_task_signatures_mtx; std::unordered_map> s_task_signatures; -std::atomic_ulong s_report_version(time(nullptr) * 10000); +std::atomic_ulong s_report_version(time(nullptr) * 100000); void increase_report_version() { s_report_version.fetch_add(1, std::memory_order_relaxed); @@ -1074,6 +1074,7 @@ void report_tablet_callback(StorageEngine& engine, const TMasterInfo& master_inf request.__set_backend(BackendOptions::get_local_backend()); request.__isset.tablets = true; + increase_report_version(); uint64_t report_version; for (int i = 0; i < 5; i++) { request.tablets.clear(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/master/MasterImpl.java b/fe/fe-core/src/main/java/org/apache/doris/master/MasterImpl.java index 4e01f3a5058774..fbb3aab4ebdcd2 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/master/MasterImpl.java +++ b/fe/fe-core/src/main/java/org/apache/doris/master/MasterImpl.java @@ -267,7 +267,7 @@ private void finishCreateReplica(AgentTask task, TFinishTaskRequest request) { // this should be called before 'countDownLatch()' Env.getCurrentSystemInfo().updateBackendReportVersion(task.getBackendId(), - request.getReportVersion(), task.getDbId(), task.getTableId()); + request.getReportVersion(), task.getDbId(), task.getTableId(), true); createReplicaTask.countDownLatch(task.getBackendId(), task.getSignature()); if (LOG.isDebugEnabled()) { @@ -383,7 +383,7 @@ private void finishRealtimePush(AgentTask task, TFinishTaskRequest request) thro // should be done before addReplicaPersistInfos and countDownLatch long reportVersion = request.getReportVersion(); Env.getCurrentSystemInfo().updateBackendReportVersion(task.getBackendId(), reportVersion, - task.getDbId(), task.getTableId()); + task.getDbId(), task.getTableId(), true); List tabletIds = finishTabletInfos.stream().map( tTabletInfo -> tTabletInfo.getTabletId()).collect(Collectors.toList()); @@ -515,7 +515,7 @@ private void finishPublishVersion(AgentTask task, TFinishTaskRequest request) { // report version is required. here we check if set, for compatibility. 
long reportVersion = request.getReportVersion(); Env.getCurrentSystemInfo().updateBackendReportVersion( - task.getBackendId(), reportVersion, task.getDbId(), task.getTableId()); + task.getBackendId(), reportVersion, task.getDbId(), task.getTableId(), true); } PublishVersionTask publishVersionTask = (PublishVersionTask) task; @@ -545,7 +545,7 @@ private void finishClone(AgentTask task, TFinishTaskRequest request) { if (request.isSetReportVersion()) { long reportVersion = request.getReportVersion(); Env.getCurrentSystemInfo().updateBackendReportVersion( - task.getBackendId(), reportVersion, task.getDbId(), task.getTableId()); + task.getBackendId(), reportVersion, task.getDbId(), task.getTableId(), true); } Env.getCurrentEnv().getTabletScheduler().finishCloneTask(cloneTask, request); } else { @@ -628,7 +628,7 @@ private void finishAlterTask(AgentTask task, TFinishTaskRequest request) { if (request.isSetReportVersion()) { long reportVersion = request.getReportVersion(); Env.getCurrentSystemInfo().updateBackendReportVersion( - task.getBackendId(), reportVersion, task.getDbId(), task.getTableId()); + task.getBackendId(), reportVersion, task.getDbId(), task.getTableId(), true); } } catch (MetaNotFoundException e) { LOG.warn("failed to handle finish alter task: {}, {}", task.getSignature(), e.getMessage()); diff --git a/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java b/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java index bce5825c5cace6..f7702a495544d2 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java +++ b/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java @@ -178,11 +178,13 @@ public TMasterResult handleReport(TReportRequest request) throws TException { tablets = request.getTablets(); reportVersion = request.getReportVersion(); reportType = ReportType.TABLET; + Env.getCurrentSystemInfo().updateBackendReportVersion(beId, reportVersion, -1L, -1L, false); } else if (request.isSetTabletList()) { // the 'tablets' member will be deprecated in future. 
tablets = buildTabletMap(request.getTabletList()); reportVersion = request.getReportVersion(); reportType = ReportType.TABLET; + Env.getCurrentSystemInfo().updateBackendReportVersion(beId, reportVersion, -1L, -1L, false); } if (request.isSetPartitionsVersion()) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/system/SystemInfoService.java b/fe/fe-core/src/main/java/org/apache/doris/system/SystemInfoService.java index f81d8b4d7b02b6..76140d2ef26d9e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/system/SystemInfoService.java +++ b/fe/fe-core/src/main/java/org/apache/doris/system/SystemInfoService.java @@ -19,7 +19,6 @@ import org.apache.doris.analysis.ModifyBackendClause; import org.apache.doris.analysis.ModifyBackendHostNameClause; -import org.apache.doris.catalog.Database; import org.apache.doris.catalog.DiskInfo; import org.apache.doris.catalog.Env; import org.apache.doris.catalog.ReplicaAllocation; @@ -642,18 +641,31 @@ public long getBackendReportVersion(long backendId) { } } - public void updateBackendReportVersion(long backendId, long newReportVersion, long dbId, long tableId) { - AtomicLong atomicLong; - if ((atomicLong = idToReportVersionRef.get(backendId)) != null) { - Database db = (Database) Env.getCurrentInternalCatalog().getDbNullable(dbId); - if (db == null) { - LOG.warn("failed to update backend report version, db {} does not exist", dbId); - return; + public void updateBackendReportVersion(long backendId, long newReportVersion, long dbId, long tableId, + boolean checkDbExist) { + AtomicLong atomicLong = idToReportVersionRef.get(backendId); + if (atomicLong == null) { + return; + } + if (checkDbExist && Env.getCurrentInternalCatalog().getDbNullable(dbId) == null) { + LOG.warn("failed to update backend report version, db {} does not exist", dbId); + return; + } + while (true) { + long curReportVersion = atomicLong.get(); + if (curReportVersion >= newReportVersion) { + if (LOG.isDebugEnabled()) { + LOG.debug("skip update backend {} report version: {}, current version: {}, db: {}, table: {}", + backendId, newReportVersion, curReportVersion, dbId, tableId); + } + break; } - atomicLong.set(newReportVersion); - if (LOG.isDebugEnabled()) { - LOG.debug("update backend {} report version: {}, db: {}, table: {}", - backendId, newReportVersion, dbId, tableId); + if (atomicLong.compareAndSet(curReportVersion, newReportVersion)) { + if (LOG.isDebugEnabled()) { + LOG.debug("update backend {} report version: {}, db: {}, table: {}", + backendId, newReportVersion, dbId, tableId); + } + break; } } } diff --git a/fe/fe-core/src/test/java/org/apache/doris/cluster/SystemInfoServiceTest.java b/fe/fe-core/src/test/java/org/apache/doris/cluster/SystemInfoServiceTest.java index c48ba030e77234..8a9216aecc0f6e 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/cluster/SystemInfoServiceTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/cluster/SystemInfoServiceTest.java @@ -233,7 +233,7 @@ public void addBackendTest() throws UserException { Assert.assertTrue(Env.getCurrentSystemInfo().getBackendReportVersion(backendId) == 0L); - Env.getCurrentSystemInfo().updateBackendReportVersion(backendId, 2L, 20000L, 30000L); + Env.getCurrentSystemInfo().updateBackendReportVersion(backendId, 2L, 20000L, 30000L, true); Assert.assertTrue(Env.getCurrentSystemInfo().getBackendReportVersion(backendId) == 2L); } From eb4673fbd52b9d60074648309d986237a4e6eae4 Mon Sep 17 00:00:00 2001 From: deardeng <565620795@qq.com> Date: Wed, 11 Sep 2024 00:52:39 +0800 Subject: [PATCH 42/44] [fix](cloud) Fix 
cloud auto start and add a regression case (#40027) 1. Fix the cluster being suspended and select not waking up the cluster. The reason is that all be nodes in the cluster are inactive, the cluster is skipped, and the cluster that needs to be woken up cannot be found, and the wake-up logic will not be reached. And delete the redundant function getAuthorizedCloudCluster 2. Add check after resume cluster, there must be at least one alive be in the cluster. 3. add auto start regression case --- .../doris/cloud/catalog/CloudReplica.java | 14 +- .../cloud/system/CloudSystemInfoService.java | 16 +- .../org/apache/doris/qe/ConnectContext.java | 37 +--- .../multi_cluster/test_auto_start.groovy | 172 ++++++++++++++++++ 4 files changed, 193 insertions(+), 46 deletions(-) create mode 100644 regression-test/suites/cloud_p0/multi_cluster/test_auto_start.groovy diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudReplica.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudReplica.java index 43f7dcbc6879f3..75ded96f4b6757 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudReplica.java +++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudReplica.java @@ -150,6 +150,13 @@ public long getBackendId() { } private long getBackendIdImpl(String cluster) { + // if cluster is SUSPENDED, wait + try { + cluster = ((CloudSystemInfoService) Env.getCurrentSystemInfo()).waitForAutoStart(cluster); + } catch (DdlException e) { + // this function cant throw exception. so just log it + LOG.warn("cant resume cluster {}, exception", cluster, e); + } // check default cluster valid. if (Strings.isNullOrEmpty(cluster)) { LOG.warn("failed to get available be, clusterName: {}", cluster); @@ -163,13 +170,6 @@ private long getBackendIdImpl(String cluster) { return -1; } - // if cluster is SUSPENDED, wait - try { - ((CloudSystemInfoService) Env.getCurrentSystemInfo()).waitForAutoStart(cluster); - } catch (DdlException e) { - // this function cant throw exception. 
so just log it - LOG.warn("cant resume cluster {}, exception", cluster, e); - } String clusterId = ((CloudSystemInfoService) Env.getCurrentSystemInfo()).getCloudClusterIdByName(cluster); if (isColocated()) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java index 202d576e3bf0d5..03cbbfe814a8b6 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java +++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java @@ -737,20 +737,20 @@ public String getClusterNameAutoStart(final String clusterName) { return cloudClusterTypeAndName.clusterName; } - public void waitForAutoStart(String clusterName) throws DdlException { + public String waitForAutoStart(String clusterName) throws DdlException { if (Config.isNotCloudMode()) { - return; + return null; } clusterName = getClusterNameAutoStart(clusterName); if (Strings.isNullOrEmpty(clusterName)) { LOG.warn("auto start in cloud mode, but clusterName empty {}", clusterName); - return; + return null; } String clusterStatus = getCloudStatusByName(clusterName); if (Strings.isNullOrEmpty(clusterStatus)) { // for cluster rename or cluster dropped LOG.warn("cant find clusterStatus in fe, clusterName {}", clusterName); - return; + return null; } if (Cloud.ClusterStatus.valueOf(clusterStatus) == Cloud.ClusterStatus.MANUAL_SHUTDOWN) { @@ -765,7 +765,7 @@ public void waitForAutoStart(String clusterName) throws DdlException { // root ? see StatisticsUtil.buildConnectContext if (ConnectContext.get() != null && ConnectContext.get().getUserIdentity().isRootUser()) { LOG.warn("auto start daemon thread run in root, not resume cluster {}-{}", clusterName, clusterStatus); - return; + return null; } Cloud.AlterClusterRequest.Builder builder = Cloud.AlterClusterRequest.newBuilder(); builder.setCloudUniqueId(Config.cloud_unique_id); @@ -794,7 +794,8 @@ public void waitForAutoStart(String clusterName) throws DdlException { StopWatch stopWatch = new StopWatch(); stopWatch.start(); boolean hasAutoStart = false; - while (!String.valueOf(Cloud.ClusterStatus.NORMAL).equals(clusterStatus) + boolean existAliveBe = true; + while ((!String.valueOf(Cloud.ClusterStatus.NORMAL).equals(clusterStatus) || !existAliveBe) && retryTime < retryTimes) { hasAutoStart = true; ++retryTime; @@ -812,6 +813,8 @@ public void waitForAutoStart(String clusterName) throws DdlException { LOG.info("change cluster sleep wait InterruptedException: ", e); } clusterStatus = getCloudStatusByName(clusterName); + // Check that the bes node in the cluster have at least one alive + existAliveBe = getBackendsByClusterName(clusterName).stream().anyMatch(Backend::isAlive); } if (retryTime >= retryTimes) { // auto start timeout @@ -824,5 +827,6 @@ public void waitForAutoStart(String clusterName) throws DdlException { if (hasAutoStart) { LOG.info("auto start cluster {}, start cost {} ms", clusterName, stopWatch.getTime()); } + return clusterName; } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/ConnectContext.java b/fe/fe-core/src/main/java/org/apache/doris/qe/ConnectContext.java index ff960439f7668f..fa81825d370bc0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/ConnectContext.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/ConnectContext.java @@ -1249,7 +1249,7 @@ public String getCloudCluster(boolean updateErr) { String choseWay = null; if (!Strings.isNullOrEmpty(this.cloudCluster)) { cluster = 
this.cloudCluster; - choseWay = "use @cluster"; + choseWay = "use context cluster"; LOG.debug("finally set context cluster name {} for user {} with chose way '{}'", cloudCluster, getCurrentUserIdentity(), choseWay); return cluster; @@ -1260,9 +1260,9 @@ public String getCloudCluster(boolean updateErr) { cluster = defaultCluster; choseWay = "default cluster"; } else { - String authorizedCluster = getAuthorizedCloudCluster(); - if (!Strings.isNullOrEmpty(authorizedCluster)) { - cluster = authorizedCluster; + CloudClusterResult cloudClusterTypeAndName = getCloudClusterByPolicy(); + if (cloudClusterTypeAndName != null && !Strings.isNullOrEmpty(cloudClusterTypeAndName.clusterName)) { + cluster = cloudClusterTypeAndName.clusterName; choseWay = "authorized cluster"; } } @@ -1293,35 +1293,6 @@ public String getDefaultCloudCluster() { return null; } - public String getAuthorizedCloudCluster() { - List cloudClusterNames = ((CloudSystemInfoService) Env.getCurrentSystemInfo()).getCloudClusterNames(); - // get all available cluster of the user - for (String cloudClusterName : cloudClusterNames) { - if (!Env.getCurrentEnv().getAuth().checkCloudPriv(getCurrentUserIdentity(), - cloudClusterName, PrivPredicate.USAGE, ResourceTypeEnum.CLUSTER)) { - continue; - } - // find a cluster has more than one alive be - List bes = ((CloudSystemInfoService) Env.getCurrentSystemInfo()) - .getBackendsByClusterName(cloudClusterName); - AtomicBoolean hasAliveBe = new AtomicBoolean(false); - bes.stream().filter(Backend::isAlive).findAny().ifPresent(backend -> { - if (LOG.isDebugEnabled()) { - LOG.debug("get a clusterName {}, it's has more than one alive be {}", cloudClusterName, backend); - } - hasAliveBe.set(true); - }); - if (hasAliveBe.get()) { - if (LOG.isDebugEnabled()) { - LOG.debug("set context cluster name {}", cloudClusterName); - } - return cloudClusterName; - } - } - - return null; - } - public StatsErrorEstimator getStatsErrorEstimator() { return statsErrorEstimator; } diff --git a/regression-test/suites/cloud_p0/multi_cluster/test_auto_start.groovy b/regression-test/suites/cloud_p0/multi_cluster/test_auto_start.groovy new file mode 100644 index 00000000000000..2ce9a9d8f4b531 --- /dev/null +++ b/regression-test/suites/cloud_p0/multi_cluster/test_auto_start.groovy @@ -0,0 +1,172 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
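One note on the `waitForAutoStart` hunk above: the retry loop used to exit as soon as the cluster status flipped to NORMAL, and it now also requires at least one alive BE before work is routed to the cluster. A compact sketch of that exit condition, with hypothetical helper names (`ResumeWait` and the suppliers) standing in for the Doris internals, is:

```
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Supplier;

// Sketch: poll until the cluster is NORMAL *and* has an alive backend,
// or until the retry budget runs out (the caller treats that as an
// auto-start timeout).
public class ResumeWait {
    public static boolean waitUntilUsable(Supplier<String> clusterStatus,
                                          Supplier<List<Boolean>> backendAliveFlags,
                                          int retryTimes,
                                          long sleepMs) throws InterruptedException {
        for (int retry = 0; retry < retryTimes; retry++) {
            boolean normal = "NORMAL".equals(clusterStatus.get());
            boolean hasAliveBe = backendAliveFlags.get().stream().anyMatch(alive -> alive);
            if (normal && hasAliveBe) {
                return true; // safe to pick a backend from this cluster
            }
            Thread.sleep(sleepMs); // give the control plane time to start BEs
        }
        return false;
    }

    public static void main(String[] args) throws InterruptedException {
        AtomicInteger polls = new AtomicInteger();
        boolean ready = waitUntilUsable(
                () -> polls.incrementAndGet() >= 3 ? "NORMAL" : "TO_RESUME",
                () -> Arrays.asList(polls.get() >= 3),
                10, 100L);
        System.out.println(ready); // true after a few simulated polls
    }
}
```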
+ +import org.apache.doris.regression.suite.ClusterOptions +import groovy.json.JsonSlurper +import groovy.json.JsonOutput +import org.awaitility.Awaitility; +import org.apache.doris.regression.util.Http +import static java.util.concurrent.TimeUnit.SECONDS; + +suite('test_auto_start_in_cloud', 'multi_cluster') { + if (!isCloudMode()) { + return; + } + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_pre_heating_time_limit_sec=1', + 'sys_log_verbose_modules=org', + 'heartbeat_interval_second=1' + ] + options.setFeNum(3) + options.setBeNum(3) + options.cloudMode = true + options.connectToFollower = true + + def getClusterFragementStatus = { def fe -> + def (feHost, feHttpPort) = fe.getHttpAddress() + // curl -X GET -u root: '128.1.1.1:8030/rest/v2/manager/cluster/cluster_info/cloud_cluster_status' + def url = 'http://' + feHost + ':' + feHttpPort + '/rest/v2/manager/cluster/cluster_info/cloud_cluster_status' + def result = Http.GET(url, true) + result + } + + + def set_cluster_status = { String unique_id , String cluster_id, String status, def ms -> + def jsonOutput = new JsonOutput() + def reqBody = [ + cloud_unique_id: unique_id, + cluster : [ + cluster_id : cluster_id, + cluster_status : status + ] + ] + def js = jsonOutput.toJson(reqBody) + log.info("drop cluster req: ${js} ".toString()) + + def set_cluster_status_api = { request_body, check_func -> + httpTest { + endpoint ms.host+':'+ms.httpPort + uri "/MetaService/http/set_cluster_status?token=greedisgood9999" + body request_body + check check_func + } + } + + set_cluster_status_api.call(js) { + respCode, body -> + log.info("set cluster status resp: ${body} ${respCode}".toString()) + def json = parseJson(body) + assertTrue(json.code.equalsIgnoreCase("OK")) + } + } + + docker(options) { + sql """ + CREATE TABLE table1 ( + class INT, + id INT, + score INT SUM + ) + AGGREGATE KEY(class, id) + DISTRIBUTED BY HASH(class) BUCKETS 48 + """ + + sql """INSERT INTO table1 VALUES (1, 1, 100)""" + // master + def fe1 = cluster.getFeByIndex(1) + // ms + def ms = cluster.getAllMetaservices().get(0) + + def result = sql_return_maparray """SHOW CLUSTERS""" + String clusterName = result[0].cluster + def tag = getCloudBeTagByName(clusterName) + logger.info("tag = {}", tag) + + def jsonSlurper = new JsonSlurper() + def jsonObject = jsonSlurper.parseText(tag) + String cloudClusterId = jsonObject.cloud_cluster_id + String uniqueId = jsonObject.cloud_unique_id + + sleep(5 * 1000) + + Map fragmentUpdateTimeMap = [:] + + // no read,write,sc, 20s suspend cluster + boolean clusterCanSuspend = true + for (int i = 0; i < 20; i++) { + result = getClusterFragementStatus(fe1) + result.data.compute_cluster_id.each { + if (fragmentUpdateTimeMap[it.host] == null) { + fragmentUpdateTimeMap[it.host] = it.lastFragmentUpdateTime + } else if (fragmentUpdateTimeMap[it.host] != it.lastFragmentUpdateTime) { + log.info("fragment update time changed be: {} old time: {} new time: {}", it.host, fragmentUpdateTimeMap[it.host], it.lastFragmentUpdateTime) + clusterCanSuspend = false + } + } + sleep(1 * 1000) + } + assertTrue(clusterCanSuspend) + + // cloud control set cluster status SUSPENDED + set_cluster_status(uniqueId, cloudClusterId, "SUSPENDED", ms) + + dockerAwaitUntil(5) { + tag = getCloudBeTagByName(clusterName) + logger.info("tag = {}", tag) + jsonObject = jsonSlurper.parseText(tag) + String cluster_status = jsonObject.cloud_cluster_status + cluster_status == "SUSPENDED" + } + + cluster.stopBackends(1,2,3) + + // 
select + future1 = thread { + def begin = System.currentTimeMillis(); + // root cant resume, due to deamon thread use root + def connInfo = context.threadLocalConn.get() + result = connect(user = 'admin', password = '', url = connInfo.conn.getMetaData().getURL()) { + sql 'SELECT * FROM table1' + } + def cost = System.currentTimeMillis() - begin; + log.info("result {} time cost: {}", result, cost) + assertTrue(cost > 5000) + assertEquals(1, result.size()) + } + // insert + + // cloud control + future2 = thread { + // check cluster "TO_RESUME" + dockerAwaitUntil(5) { + tag = getCloudBeTagByName(clusterName) + logger.info("tag = {}", tag) + jsonObject = jsonSlurper.parseText(tag) + String cluster_status = jsonObject.cloud_cluster_status + cluster_status == "TO_RESUME" + } + sleep(5 * 1000) + cluster.startBackends(1,2,3) + set_cluster_status(uniqueId, cloudClusterId, "NORMAL", ms) + } + + future1.get() + future2.get() + } +} From a01c85edb97be973ec5752d791fe4e655781e7cf Mon Sep 17 00:00:00 2001 From: yujun Date: Wed, 11 Sep 2024 10:20:58 +0800 Subject: [PATCH 43/44] [chore](regression test) mv test_report_version_missing to p1 (#40184) --- .../test_report_version_missing.groovy | 82 ------------------ .../test_report_version_missing.groovy | 85 +++++++++++++++++++ 2 files changed, 85 insertions(+), 82 deletions(-) delete mode 100644 regression-test/suites/control_p0/test_report_version_missing.groovy create mode 100644 regression-test/suites/control_p1/test_report_version_missing.groovy diff --git a/regression-test/suites/control_p0/test_report_version_missing.groovy b/regression-test/suites/control_p0/test_report_version_missing.groovy deleted file mode 100644 index 675ac52362e6fb..00000000000000 --- a/regression-test/suites/control_p0/test_report_version_missing.groovy +++ /dev/null @@ -1,82 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -import org.apache.doris.regression.suite.ClusterOptions -import org.apache.doris.regression.util.NodeType - -suite('test_report_version_missing', "nonConcurrent") { - if (isCloudMode()) { - return - } - def tableName = "test_set_replica_status_table_in_docker" - try { - setFeConfig('disable_tablet_scheduler', true) - Thread.sleep(2000) - - sql "DROP TABLE IF EXISTS ${tableName}" - sql """ - CREATE TABLE ${tableName} ( - `id` LARGEINT NOT NULL, - `count` LARGEINT SUM DEFAULT "0") - AGGREGATE KEY(`id`) - DISTRIBUTED BY HASH(`id`) BUCKETS 1 - PROPERTIES - ( - "replication_num" = "1" - ) - """ - List values = [] - for (int i = 1; i <= 10; ++i) { - values.add("(${i}, ${i})") - } - sql """INSERT INTO ${tableName} VALUES ${values.join(",")}""" - - def result = sql_return_maparray """show tablets from ${tableName}""" - assertNotNull(result) - def tabletId = null - for (def res : result) { - tabletId = res.TabletId - break - } - - GetDebugPoint().enableDebugPointForAllBEs("Tablet.build_tablet_report_info.version_miss", [tablet_id:"${tabletId}",version_miss:true]) - boolean succ = false - - for (int i = 0; i < 3; ++i) { - result = sql_return_maparray """show tablets from ${tableName}""" - logger.info("show tablets from ${result}, has after ${i} * 60 s") - assertNotNull(result) - // LstFailedVersion > 0, version missing - for (def res : result) { - if (res.TabletId.toLong() == tabletId.toLong() && res.LstFailedVersion.toLong() > 0) { - succ = true - break - } - } - if (succ) { - break - } - Thread.sleep(60000) - } - assertTrue(succ) - - } finally { - setFeConfig('disable_tablet_scheduler', false) - GetDebugPoint().disableDebugPointForAllBEs("Tablet.build_tablet_report_info.version_miss") - sql "DROP TABLE IF EXISTS ${tableName}" - } -} diff --git a/regression-test/suites/control_p1/test_report_version_missing.groovy b/regression-test/suites/control_p1/test_report_version_missing.groovy new file mode 100644 index 00000000000000..63ccd4ed6dd6ea --- /dev/null +++ b/regression-test/suites/control_p1/test_report_version_missing.groovy @@ -0,0 +1,85 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
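The relocated p1 suite below drops the hand-rolled `Thread.sleep(60000)` retry loop in favor of an explicit `be_report_tablet` trigger plus Awaitility polling. For reference, the same polling pattern in plain Java (the counter condition is only a stand-in for "the tablet now shows LstFailedVersion > 0") looks like:

```
import static java.util.concurrent.TimeUnit.SECONDS;

import java.util.concurrent.atomic.AtomicInteger;
import org.awaitility.Awaitility;

// Sketch: poll a condition once per second and fail after 180 seconds,
// instead of sleeping for fixed 60-second intervals.
public class PollUntilExample {
    public static void main(String[] args) {
        AtomicInteger checks = new AtomicInteger();

        Awaitility.await()
                .atMost(180, SECONDS)
                .pollInterval(1, SECONDS)
                .until(() -> checks.incrementAndGet() >= 3); // stand-in condition

        System.out.println("condition reached after " + checks.get() + " polls");
    }
}
```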
+ +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.NodeType + +import org.awaitility.Awaitility +import static java.util.concurrent.TimeUnit.SECONDS + +suite('test_report_version_missing', 'nonConcurrent,p1') { + if (isCloudMode()) { + return + } + def tableName = 'test_set_replica_status_table_in_docker' + try { + setFeConfig('disable_tablet_scheduler', true) + Thread.sleep(2000) + + sql "DROP TABLE IF EXISTS ${tableName}" + sql """ + CREATE TABLE ${tableName} ( + `id` LARGEINT NOT NULL, + `count` LARGEINT SUM DEFAULT "0") + AGGREGATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES + ( + "replication_num" = "1" + ) + """ + List values = [] + for (int i = 1; i <= 10; ++i) { + values.add("(${i}, ${i})") + } + sql """INSERT INTO ${tableName} VALUES ${values.join(',')}""" + + def result = sql_return_maparray """show tablets from ${tableName}""" + assertNotNull(result) + def tabletId = null + for (def res : result) { + tabletId = res.TabletId + break + } + + GetDebugPoint().enableDebugPointForAllBEs('Tablet.build_tablet_report_info.version_miss', [tablet_id:"${tabletId}", version_miss:true]) + boolean succ = false + + def backendId_to_backendIP = [:] + def backendId_to_backendHttpPort = [:] + getBackendIpHttpPort(backendId_to_backendIP, backendId_to_backendHttpPort) + + backendId_to_backendIP.each { beId, beIp -> + def port = backendId_to_backendHttpPort.get(beId) as int + be_report_tablet(beIp, port) + } + + Awaitility.await().atMost(180, SECONDS).pollInterval(1, SECONDS).await().until({ + def tablets = sql_return_maparray """show tablets from ${tableName}""" + logger.info("show tablets from ${tablets}") + assertNotNull(tablets) + succ = tablets.any { it.TabletId.toLong() == tabletId.toLong() && it.LstFailedVersion.toLong() > 0 } + return succ + }) + + assertTrue(succ) + } finally { + setFeConfig('disable_tablet_scheduler', false) + GetDebugPoint().disableDebugPointForAllBEs('Tablet.build_tablet_report_info.version_miss') + sql "DROP TABLE IF EXISTS ${tableName}" + } +} From ee1e939e25092fbba0f3cf52d7a12eeddefbab66 Mon Sep 17 00:00:00 2001 From: walter Date: Wed, 11 Sep 2024 11:04:27 +0800 Subject: [PATCH 44/44] [fix](restore) Persist the sqlMode field of the View (#40612) During initialization, the View will parse inlineViewRef again according to the sqlMode value. Therefore, sqlMode must be persisted. --- .../main/java/org/apache/doris/catalog/View.java | 1 + .../test_backup_restore_with_view.groovy | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/View.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/View.java index 8285dedc0941b8..62402c7d474f3b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/View.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/View.java @@ -76,6 +76,7 @@ public class View extends Table implements GsonPostProcessable { private String inlineViewDef; // for persist + @SerializedName("sm") private long sqlMode = 0L; // View definition created by parsing inlineViewDef_ into a QueryStmt. 
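The one-line `@SerializedName("sm")` addition above is what actually fixes the bug: the View is persisted through Gson (it implements GsonPostProcessable), and the patch implies that only annotated fields survive the write-out, so `sqlMode` was silently dropped and came back as the default 0 after a restore. A standalone sketch of that behavior with stock Gson (hypothetical class names and the `"ivd"` key; this is not necessarily how Doris configures its own Gson instance) is:

```
import com.google.gson.ExclusionStrategy;
import com.google.gson.FieldAttributes;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.annotations.SerializedName;

// Sketch: a Gson instance that skips every field lacking @SerializedName.
// With such a setup, an unannotated field disappears on write and is reset
// to its default value on read.
public class SerializedNameOnlyGson {
    static final Gson GSON = new GsonBuilder()
            .setExclusionStrategies(new ExclusionStrategy() {
                @Override
                public boolean shouldSkipField(FieldAttributes f) {
                    return f.getAnnotation(SerializedName.class) == null;
                }

                @Override
                public boolean shouldSkipClass(Class<?> clazz) {
                    return false;
                }
            })
            .create();

    static class ViewLike {
        @SerializedName("ivd")
        String inlineViewDef = "SELECT a || b FROM t";

        long sqlMode = 2L; // not annotated: dropped on write, back to 0 on read
    }

    public static void main(String[] args) {
        System.out.println(GSON.toJson(new ViewLike())); // {"ivd":"SELECT a || b FROM t"}
    }
}
```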
diff --git a/regression-test/suites/backup_restore/test_backup_restore_with_view.groovy b/regression-test/suites/backup_restore/test_backup_restore_with_view.groovy index eee4a70c745ed5..be7769953230db 100644 --- a/regression-test/suites/backup_restore/test_backup_restore_with_view.groovy +++ b/regression-test/suites/backup_restore/test_backup_restore_with_view.groovy @@ -92,6 +92,22 @@ suite("test_backup_restore_with_view", "backup_restore") { assertTrue(show_view.contains("${dbName1}")) assertTrue(show_view.contains("${tableName}")) + // restore to db, test the view signature. + sql """ + RESTORE SNAPSHOT ${dbName}.${snapshotName} + FROM `${repoName}` + PROPERTIES + ( + "backup_timestamp" = "${snapshot}", + "reserve_replica" = "true" + ) + """ + + syncer.waitAllRestoreFinish(dbName) + def restore_result = sql_return_maparray """ SHOW RESTORE FROM ${dbName} WHERE Label ="${snapshotName}" """ + restore_result.last() + logger.info("show restore result: ${restore_result}") + assertTrue(restore_result.last().State == "FINISHED") sql "DROP TABLE ${dbName}.${tableName} FORCE" sql "DROP VIEW ${dbName}.${viewName}"
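Why the persisted value matters at all: the commit message notes that the View re-parses `inlineViewRef` according to `sqlMode` during initialization, and sql mode is a bitmask whose individual bits change parser behavior, for instance whether `a || b` means string concatenation or a logical OR. The flag value below is illustrative only; the real constants are defined in the FE rather than here:

```
// Sketch: sqlMode as a bitmask that switches parse behavior. If the field
// is lost during persistence, the restored view is re-parsed under mode 0
// and may no longer mean what its creator wrote.
public class SqlModeBits {
    // illustrative bit position, not taken from the Doris source
    static final long PIPES_AS_CONCAT = 1L << 1;

    static String describe(long sqlMode) {
        return (sqlMode & PIPES_AS_CONCAT) != 0
                ? "'a || b' parses as CONCAT(a, b)"
                : "'a || b' parses as (a OR b)";
    }

    public static void main(String[] args) {
        System.out.println(describe(0L));              // default mode after a lossy restore
        System.out.println(describe(PIPES_AS_CONCAT)); // mode preserved via the new "sm" field
    }
}
```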