diff --git a/cloud/script/start.sh b/cloud/script/start.sh index ac6a07d7ea509e..582c80c2e6fa4c 100644 --- a/cloud/script/start.sh +++ b/cloud/script/start.sh @@ -122,7 +122,10 @@ fi echo "LIBHDFS3_CONF=${LIBHDFS3_CONF}" -export JEMALLOC_CONF="percpu_arena:percpu,background_thread:true,metadata_thp:auto,muzzy_decay_ms:15000,dirty_decay_ms:15000,oversize_threshold:0,prof:true,prof_prefix:jeprof.out" +# to enable dumping jeprof heap stats periodically, change `prof:false` to `prof:true` +# to control the dump interval, change `lg_prof_interval` to a specific value; it is the base-2 exponent of the interval in bytes, default 34 means 2 ** 34 = 16GB +# to control the dump path, change `prof_prefix` to a specific path, e.g. /doris_cloud/log/ms_; by default it dumps at the path where the start command is called +export JEMALLOC_CONF="percpu_arena:percpu,background_thread:true,metadata_thp:auto,muzzy_decay_ms:5000,dirty_decay_ms:5000,oversize_threshold:0,prof_prefix:ms_,prof:false,lg_prof_interval:34" if [[ "${RUN_VERSION}" -eq 1 ]]; then "${bin}" --version @@ -131,14 +134,22 @@ fi mkdir -p "${DORIS_HOME}/log" echo "starts ${process} with args: $*" +out_file=${DORIS_HOME}/log/${process}.out if [[ "${RUN_DAEMON}" -eq 1 ]]; then - date >>"${DORIS_HOME}/log/${process}.out" - nohup "${bin}" "$@" >>"${DORIS_HOME}/log/${process}.out" 2>&1 & - # wait for log flush - sleep 1.5 - tail -n10 "${DORIS_HOME}/log/${process}.out" | grep 'working directory' -B1 -A10 - echo "please check process log for more details" - echo "" + # append 10 blank lines to ensure the following tail -n10 works correctly + printf "\n\n\n\n\n\n\n\n\n\n" >>"${out_file}" + echo "$(date +'%F %T') try to start ${process}" >>"${out_file}" + nohup "${bin}" "$@" >>"${out_file}" 2>&1 & + echo "wait and check ${process} start successfully" + sleep 3 + tail -n10 "${out_file}" | grep 'successfully started brpc' + ret=$?
+ if [[ ${ret} -ne 0 ]]; then + echo "${process} may not start successfully please check process log for more details" + exit 1 + fi + echo "${process} start successfully" + exit 0 elif [[ "${RUN_CONSOLE}" -eq 1 ]]; then export DORIS_LOG_TO_STDERR=1 date diff --git a/cloud/src/common/config.h b/cloud/src/common/config.h index 6c3b22e1bb94a7..78b682641d1a14 100644 --- a/cloud/src/common/config.h +++ b/cloud/src/common/config.h @@ -77,6 +77,7 @@ CONF_mInt32(scan_instances_interval_seconds, "60"); // 1min CONF_mInt32(check_object_interval_seconds, "43200"); // 12hours CONF_mInt64(check_recycle_task_interval_seconds, "600"); // 10min +CONF_mInt64(recycler_sleep_before_scheduling_seconds, "60"); // log a warning if a recycle task takes longer than this duration CONF_mInt64(recycle_task_threshold_seconds, "10800"); // 3h diff --git a/cloud/src/main.cpp b/cloud/src/main.cpp index 9356a3546d03a9..74e6a8daaf161c 100644 --- a/cloud/src/main.cpp +++ b/cloud/src/main.cpp @@ -161,13 +161,13 @@ DECLARE_int64(socket_max_unwritten_bytes); int main(int argc, char** argv) { if (argc > 1) { if (auto ret = args.parse(argc - 1, argv + 1); !ret.empty()) { - std::cerr << ret << std::endl; + std::cerr << "parse arguments error: " << ret << std::endl; help(); return -1; } } - if (argc < 2 || args.get(ARG_HELP)) { + if (args.get(ARG_HELP)) { help(); return 0; } @@ -177,21 +177,16 @@ int main(int argc, char** argv) { return 0; } - // FIXME(gavin): do we need to enable running both MS and recycler within - // single process - if (!(args.get(ARG_META_SERVICE) ^ args.get(ARG_RECYCLER))) { - std::cerr << "only one of --meta-service and --recycler must be specified" << std::endl; - return 1; - } - - // There may be more roles to play + // There may be more roles to play in the future; if multiple roles are specified, + // use meta_service as the process name std::string process_name = args.get(ARG_META_SERVICE) ? "meta_service" : args.get(ARG_RECYCLER) ?
"recycler" - : ""; - if (process_name.empty()) { - std::cerr << "failed to determine prcess name with given args" << std::endl; - return 1; - } + : "meta_service"; + + using namespace std::chrono; + + auto start = steady_clock::now(); + auto end = start; auto pid_file_fd_holder = gen_pidfile("doris_cloud"); if (pid_file_fd_holder == nullptr) { @@ -215,11 +210,19 @@ int main(int argc, char** argv) { } // We can invoke glog from now on - std::string msg; + LOG(INFO) << "try to start doris_cloud"; LOG(INFO) << build_info(); std::cout << build_info() << std::endl; + if (!args.get(ARG_META_SERVICE) && !args.get(ARG_RECYCLER)) { + std::get<0>(args.args()[ARG_META_SERVICE]) = true; + std::get<0>(args.args()[ARG_RECYCLER]) = true; + LOG(INFO) << "meta_service and recycler are both not specified, " + "run doris_cloud as meta_service and recycler by default"; + std::cout << "run doris_cloud as meta_service and recycler by default" << std::endl; + } + brpc::Server server; brpc::FLAGS_max_body_size = config::brpc_max_body_size; brpc::FLAGS_socket_max_unwritten_bytes = config::brpc_socket_max_unwritten_bytes; @@ -238,19 +241,22 @@ int main(int argc, char** argv) { return 1; } LOG(INFO) << "begin to init txn kv"; + auto start_init_kv = steady_clock::now(); int ret = txn_kv->init(); if (ret != 0) { LOG(WARNING) << "failed to init txnkv, ret=" << ret; return 1; } - LOG(INFO) << "successfully init txn kv"; + end = steady_clock::now(); + LOG(INFO) << "successfully init txn kv, elapsed milliseconds: " + << duration_cast(end - start_init_kv).count(); if (init_global_encryption_key_info_map(txn_kv.get()) != 0) { LOG(WARNING) << "failed to init global encryption key map"; return -1; } - std::unique_ptr meta_server; + std::unique_ptr meta_server; // meta-service std::unique_ptr recycler; std::thread periodiccally_log_thread; std::mutex periodiccally_log_thread_lock; @@ -269,7 +275,8 @@ int main(int argc, char** argv) { msg = "meta-service started"; LOG(INFO) << msg; std::cout << msg << 
std::endl; - } else if (args.get(ARG_RECYCLER)) { + } + if (args.get(ARG_RECYCLER)) { recycler = std::make_unique(txn_kv); int ret = recycler->start(&server); if (ret != 0) { @@ -284,15 +291,12 @@ int main(int argc, char** argv) { auto periodiccally_log = [&]() { while (periodiccally_log_thread_run) { std::unique_lock lck {periodiccally_log_thread_lock}; - periodiccally_log_thread_cv.wait_for( - lck, std::chrono::milliseconds(config::periodically_log_ms)); + periodiccally_log_thread_cv.wait_for(lck, + milliseconds(config::periodically_log_ms)); LOG(INFO) << "Periodically log for recycler"; } }; periodiccally_log_thread = std::thread {periodiccally_log}; - } else { - std::cerr << "cloud starts without doing anything and exits" << std::endl; - return -1; } // start service brpc::ServerOptions options; @@ -309,7 +313,11 @@ int main(int argc, char** argv) { << ", errmsg=" << strerror_r(errno, buf, 64) << ", port=" << port; return -1; } - LOG(INFO) << "successfully started brpc listening on port=" << port; + end = steady_clock::now(); + msg = "successfully started brpc listening on port=" + std::to_string(port) + + " time_elapsed_ms=" + std::to_string(duration_cast(end - start).count()); + LOG(INFO) << msg; + std::cout << msg << std::endl; server.RunUntilAskedToQuit(); // Wait for signals server.ClearServices(); @@ -326,7 +334,7 @@ int main(int argc, char** argv) { periodiccally_log_thread_run = false; // immediately notify the log thread to quickly exit in case it block the // whole procedure - periodiccally_log_thread_cv.notify_one(); + periodiccally_log_thread_cv.notify_all(); } periodiccally_log_thread.join(); } diff --git a/cloud/src/recycler/checker.cpp b/cloud/src/recycler/checker.cpp index f8289160269eff..0ed3b1934c4442 100644 --- a/cloud/src/recycler/checker.cpp +++ b/cloud/src/recycler/checker.cpp @@ -79,6 +79,8 @@ int Checker::start() { // launch instance scanner auto scanner_func = [this]() { + std::this_thread::sleep_for( + 
std::chrono::seconds(config::recycler_sleep_before_scheduling_seconds)); while (!stopped()) { std::vector instances; get_all_instances(txn_kv_.get(), instances); diff --git a/cloud/src/recycler/recycler.cpp b/cloud/src/recycler/recycler.cpp index 38c1b7979049d3..486e143f30b670 100644 --- a/cloud/src/recycler/recycler.cpp +++ b/cloud/src/recycler/recycler.cpp @@ -186,6 +186,11 @@ Recycler::~Recycler() { } void Recycler::instance_scanner_callback() { + // sleep (60 seconds by default) before scheduling so that the launch procedure can complete: + // a bad hdfs connection may cause some logs to be written to stdout/stderr, + // which may pollute the .out file and affect the script's check for a successful start + std::this_thread::sleep_for( + std::chrono::seconds(config::recycler_sleep_before_scheduling_seconds)); while (!stopped()) { std::vector instances; get_all_instances(txn_kv_.get(), instances); diff --git a/cloud/src/recycler/recycler_service.cpp b/cloud/src/recycler/recycler_service.cpp index 52c510fb2e7da0..f9ffe750fdfb65 100644 --- a/cloud/src/recycler/recycler_service.cpp +++ b/cloud/src/recycler/recycler_service.cpp @@ -440,7 +440,7 @@ void RecyclerServiceImpl::http(::google::protobuf::RpcController* controller, } status_code = 404; - msg = "not found"; + msg = "http path " + uri.path() + " not found, it may be not implemented"; response_body = msg; } diff --git a/cloud/test/recycler_test.cpp b/cloud/test/recycler_test.cpp index ca8ffbcee6139e..ae4f35a4605e66 100644 --- a/cloud/test/recycler_test.cpp +++ b/cloud/test/recycler_test.cpp @@ -64,6 +64,7 @@ int main(int argc, char** argv) { using namespace std::chrono; current_time = duration_cast(system_clock::now().time_since_epoch()).count(); + config::recycler_sleep_before_scheduling_seconds = 0; // we don't have to wait in UT ::testing::InitGoogleTest(&argc, argv); auto s3_producer_pool = std::make_shared(config::recycle_pool_parallelism);