Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[fix](memory) Allows to enable memory tracker accuracy detection via Config #40714

Merged
merged 2 commits into from
Sep 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions be/src/common/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,9 @@ DEFINE_mInt64(stacktrace_in_alloc_large_memory_bytes, "2147483648");

DEFINE_mInt64(crash_in_alloc_large_memory_bytes, "-1");

// If the memory tracker value is inaccurate, BE will crash. Usually used in test environments; the default value is false.
DEFINE_mBool(crash_in_memory_tracker_inaccurate, "false");
xinyiZzz marked this conversation as resolved.
Show resolved Hide resolved

// Default is true. If there is any memory tracking in the Orphan mem tracker, an error will be reported.
// !! Do not modify the default value of this conf !! Otherwise memory errors cannot be detected in time.
// allocator free memory not need to check, because when the thread memory tracker label is Orphan,
Expand Down
4 changes: 4 additions & 0 deletions be/src/common/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -194,11 +194,15 @@ DECLARE_mBool(enable_stacktrace);
// if alloc failed using Doris Allocator, will print stacktrace in error log.
// if is -1, disable print stacktrace when alloc large memory.
DECLARE_mInt64(stacktrace_in_alloc_large_memory_bytes);

// BE will crash when allocating memory larger than crash_in_alloc_large_memory_bytes; the default -1 means disabled.
// If you need a core dump to analyze a large memory allocation,
// setting this parameter so that BE crashes when the large allocation occurs will help.
DECLARE_mInt64(crash_in_alloc_large_memory_bytes);

// If the memory tracker value is inaccurate, BE will crash. Usually used in test environments; the default value is false.
DECLARE_mBool(crash_in_memory_tracker_inaccurate);

// Default is true. If there is any memory tracking in the Orphan mem tracker, an error will be reported.
// !! Do not modify the default value of this conf !! Otherwise memory errors cannot be detected in time.
// allocator free memory not need to check, because when the thread memory tracker label is Orphan,
Expand Down
23 changes: 11 additions & 12 deletions be/src/runtime/memory/mem_tracker_limiter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,12 @@ std::shared_ptr<MemTrackerLimiter> MemTrackerLimiter::create_shared(MemTrackerLi
return tracker;
}

// Reports whether inaccurate-memory-tracker detection applies to this tracker.
// Returns true only when config::crash_in_memory_tracker_inaccurate is set and the
// tracker type is COMPACTION, SCHEMA_CHANGE, QUERY, or LOAD that is not a group commit load.
bool MemTrackerLimiter::open_memory_tracker_inaccurate_detect() {
    // The config switch gates everything: when it is off, no tracker type is checked.
    if (!doris::config::crash_in_memory_tracker_inaccurate) {
        return false;
    }
    // LOAD trackers qualify only when they are not flagged as group commit loads.
    if (_type == Type::LOAD) {
        return !is_group_commit_load;
    }
    return _type == Type::COMPACTION || _type == Type::SCHEMA_CHANGE || _type == Type::QUERY;
}

MemTrackerLimiter::~MemTrackerLimiter() {
consume(_untracked_mem);
static std::string mem_tracker_inaccurate_msg =
Expand All @@ -127,35 +133,29 @@ MemTrackerLimiter::~MemTrackerLimiter() {
"4. If you need to "
"transfer memory tracking value between two trackers, can use transfer_to.";
if (_consumption->current_value() != 0) {
// TODO, expect mem tracker equal to 0 at the load/compaction/etc. task end.
#ifndef NDEBUG
if (_type == Type::COMPACTION || _type == Type::SCHEMA_CHANGE || _type == Type::QUERY ||
(_type == Type::LOAD && !is_group_commit_load)) {
if (open_memory_tracker_inaccurate_detect()) {
std::string err_msg =
fmt::format("mem tracker label: {}, consumption: {}, peak consumption: {}, {}.",
label(), _consumption->current_value(), _consumption->peak_value(),
mem_tracker_inaccurate_msg);
LOG(FATAL) << err_msg << print_address_sanitizers();
}
#endif
if (ExecEnv::tracking_memory()) {
ExecEnv::GetInstance()->orphan_mem_tracker()->consume(_consumption->current_value());
}
_consumption->set(0);
#ifndef NDEBUG
} else if (!_address_sanitizers.empty() && !is_group_commit_load) {
} else if (doris::config::crash_in_memory_tracker_inaccurate && !_address_sanitizers.empty() &&
!is_group_commit_load) {
LOG(FATAL) << "[Address Sanitizer] consumption is 0, but address sanitizers not empty. "
<< ", mem tracker label: " << _label
<< ", peak consumption: " << _consumption->peak_value()
<< print_address_sanitizers();
#endif
}
memory_memtrackerlimiter_cnt << -1;
}

#ifndef NDEBUG
void MemTrackerLimiter::add_address_sanitizers(void* buf, size_t size) {
if (_type == Type::QUERY || (_type == Type::LOAD && !is_group_commit_load)) {
if (open_memory_tracker_inaccurate_detect()) {
std::lock_guard<std::mutex> l(_address_sanitizers_mtx);
auto it = _address_sanitizers.find(buf);
if (it != _address_sanitizers.end()) {
Expand All @@ -177,7 +177,7 @@ void MemTrackerLimiter::add_address_sanitizers(void* buf, size_t size) {
}

void MemTrackerLimiter::remove_address_sanitizers(void* buf, size_t size) {
if (_type == Type::QUERY || (_type == Type::LOAD && !is_group_commit_load)) {
if (open_memory_tracker_inaccurate_detect()) {
std::lock_guard<std::mutex> l(_address_sanitizers_mtx);
auto it = _address_sanitizers.find(buf);
if (it != _address_sanitizers.end()) {
Expand Down Expand Up @@ -221,7 +221,6 @@ std::string MemTrackerLimiter::print_address_sanitizers() {
}
return detail;
}
#endif

MemTracker::Snapshot MemTrackerLimiter::make_snapshot() const {
Snapshot snapshot;
Expand Down
7 changes: 2 additions & 5 deletions be/src/runtime/memory/mem_tracker_limiter.h
Original file line number Diff line number Diff line change
Expand Up @@ -205,12 +205,9 @@ class MemTrackerLimiter final : public MemTracker {
// Log the memory usage when memory limit is exceeded.
std::string tracker_limit_exceeded_str();

#ifndef NDEBUG
void add_address_sanitizers(void* buf, size_t size);
void remove_address_sanitizers(void* buf, size_t size);
std::string print_address_sanitizers();
bool is_group_commit_load {false};
#endif

std::string debug_string() override {
std::stringstream msg;
Expand Down Expand Up @@ -253,16 +250,16 @@ class MemTrackerLimiter final : public MemTracker {
bool _enable_print_log_usage = false;
static std::atomic<bool> _enable_print_log_process_usage;

#ifndef NDEBUG
struct AddressSanitizer {
size_t size;
std::string stack_trace;
};

std::string print_address_sanitizers();
bool open_memory_tracker_inaccurate_detect();
std::mutex _address_sanitizers_mtx;
std::unordered_map<void*, AddressSanitizer> _address_sanitizers;
std::vector<std::string> _error_address_sanitizers;
#endif
};

inline int64_t MemTrackerLimiter::add_untracked_mem(int64_t bytes) {
Expand Down
3 changes: 2 additions & 1 deletion regression-test/pipeline/cloud_p0/conf/be_custom.conf
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,5 @@ stream_load_record_batch_size = 500
webserver_num_workers = 128
enable_new_tablet_do_compaction = true
arrow_flight_sql_port = 8181
pipeline_task_leakage_detect_period_sec=1
pipeline_task_leakage_detect_period_sec=1
crash_in_memory_tracker_inaccurate = true
3 changes: 2 additions & 1 deletion regression-test/pipeline/cloud_p1/conf/be_custom.conf
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,5 @@ file_cache_path = [{"path":"/data/doris_cloud/file_cache","total_size":104857600
tmp_file_dirs = [{"path":"/data/doris_cloud/tmp","max_cache_bytes":104857600,"max_upload_bytes":104857600}]
save_load_error_log_to_s3 = true
arrow_flight_sql_port = 8181
pipeline_task_leakage_detect_period_sec=1
pipeline_task_leakage_detect_period_sec=1
crash_in_memory_tracker_inaccurate = true
3 changes: 2 additions & 1 deletion regression-test/pipeline/external/conf/be.conf
Original file line number Diff line number Diff line change
Expand Up @@ -66,4 +66,5 @@ enable_jvm_monitor = true

KRB5_CONFIG=/keytabs/krb5.conf
kerberos_krb5_conf_path=/keytabs/krb5.conf
pipeline_task_leakage_detect_period_sec=1
pipeline_task_leakage_detect_period_sec=1
crash_in_memory_tracker_inaccurate = true
3 changes: 2 additions & 1 deletion regression-test/pipeline/p0/conf/be.conf
Original file line number Diff line number Diff line change
Expand Up @@ -69,4 +69,5 @@ enable_jvm_monitor = true
enable_be_proc_monitor = true
be_proc_monitor_interval_ms = 30000
webserver_num_workers = 128
pipeline_task_leakage_detect_period_sec=1
pipeline_task_leakage_detect_period_sec=1
crash_in_memory_tracker_inaccurate = true
1 change: 1 addition & 0 deletions regression-test/pipeline/p1/conf/be.conf
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,4 @@ enable_missing_rows_correctness_check=true

enable_jvm_monitor = true
pipeline_task_leakage_detect_period_sec=1
crash_in_memory_tracker_inaccurate = true
Loading