From 67cf3cbf34f53c561c6d923af0d06e0975a64c0f Mon Sep 17 00:00:00 2001 From: Fan Yong Date: Sat, 9 Nov 2024 00:38:57 +0800 Subject: [PATCH] DAOS-16170 cart: do not release completed RPC reference repeatedly For collective RPC, when handle failure cases during crt_req_send(), its reference may has been released via crt_rpc_complete_and_unlock() that is triggered by crt_corpc_complete(). Under such case, we should check whether the RPC is completed or not before calling RPC_DECREF() to avoid releasing the RPC reference repeatedly. The patch also initializes some local variable for CHK RPC to avoid accessing invalid DRAM when handle failed collective CHK RPC. Test-tag: test_daos_cat_recov_core Allow-unstable-test: true Signed-off-by: Fan Yong --- src/cart/crt_rpc.c | 2 +- src/chk/chk_rpc.c | 12 +++--- src/tests/suite/daos_cr.c | 86 +++++++++++++++++++++++++++++++++++---- 3 files changed, 85 insertions(+), 15 deletions(-) diff --git a/src/cart/crt_rpc.c b/src/cart/crt_rpc.c index 2a7db9a90ad..d535b1c1c29 100644 --- a/src/cart/crt_rpc.c +++ b/src/cart/crt_rpc.c @@ -1532,7 +1532,7 @@ crt_req_send(crt_rpc_t *req, crt_cb_t complete_cb, void *arg) /* failure already reported through complete cb */ if (complete_cb != NULL) rc = 0; - } else { + } else if (!crt_rpc_completed(rpc_priv)) { RPC_DECREF(rpc_priv); } } diff --git a/src/chk/chk_rpc.c b/src/chk/chk_rpc.c index 84e6ae33857..e340d482cec 100644 --- a/src/chk/chk_rpc.c +++ b/src/chk/chk_rpc.c @@ -694,7 +694,7 @@ chk_query_remote(d_rank_list_t *rank_list, uint64_t gen, int pool_nr, uuid_t poo int chk_mark_remote(d_rank_list_t *rank_list, uint64_t gen, d_rank_t rank, uint32_t version) { - crt_rpc_t *req; + crt_rpc_t *req = NULL; struct chk_mark_in *cmi; struct chk_mark_out *cmo; int rc; @@ -730,7 +730,7 @@ int chk_act_remote(d_rank_list_t *rank_list, uint64_t gen, uint64_t seq, uint32_t cla, uint32_t act, d_rank_t rank, bool for_all) { - crt_rpc_t *req; + crt_rpc_t *req = NULL; struct chk_act_in *cai; struct chk_act_out *cao; int rc; @@ -825,7 +825,7 @@ int chk_pool_start_remote(d_rank_list_t *rank_list, uint64_t gen, uuid_t uuid, uint32_t phase, uint32_t flags) { - crt_rpc_t *req; + crt_rpc_t *req = NULL; struct chk_pool_start_in *cpsi; struct chk_pool_start_out *cpso; int rc; @@ -863,7 +863,7 @@ chk_pool_mbs_remote(d_rank_t rank, uint32_t phase, uint64_t gen, uuid_t uuid, ch uint64_t seq, uint32_t flags, uint32_t mbs_nr, struct chk_pool_mbs *mbs_array, int *svc_rc, struct rsvc_hint *svc_hint) { - crt_rpc_t *req; + crt_rpc_t *req = NULL; struct chk_pool_mbs_in *cpmi; struct chk_pool_mbs_out *cpmo; int rc; @@ -908,7 +908,7 @@ int chk_report_remote(d_rank_t leader, uint64_t gen, uint32_t cla, uint32_t act, char *msg, uint32_t option_nr, uint32_t *options, uint32_t detail_nr, d_sg_list_t *details, uint64_t seq) { - crt_rpc_t *req; + crt_rpc_t *req = NULL; struct chk_report_in *cri; struct chk_report_out *cro; int rc; @@ -984,7 +984,7 @@ int chk_rejoin_remote(d_rank_t leader, uint64_t gen, d_rank_t rank, uuid_t iv_uuid, uint32_t *flags, uint32_t *pool_nr, uuid_t **pools) { - crt_rpc_t *req; + crt_rpc_t *req = NULL; struct chk_rejoin_in *cri; struct chk_rejoin_out *cro; uuid_t *tmp; diff --git a/src/tests/suite/daos_cr.c b/src/tests/suite/daos_cr.c index 4c981f29645..85ade4754de 100644 --- a/src/tests/suite/daos_cr.c +++ b/src/tests/suite/daos_cr.c @@ -382,6 +382,78 @@ cr_locate_dcri(struct daos_check_info *dci, struct daos_check_report_info *base, return dcri; } +static int +cr_pool_fail_result(struct daos_check_info *dci, uuid_t uuid, uint32_t cla, uint32_t act, int *ret) +{ + struct daos_check_pool_info *dcpi; + struct daos_check_report_info *dcri; + int i; + + if (dci->dci_pool_nr != 1) { + print_message("CR pool count %d (pool " DF_UUID ") is not 1\n", + dci->dci_pool_nr, DP_UUID(uuid)); + return -DER_INVAL; + } + + dcpi = &dci->dci_pools[0]; + D_ASSERTF(uuid_compare(dcpi->dcpi_uuid, uuid) == 0, + "Unmatched pool UUID (1): " DF_UUID " vs " DF_UUID "\n", + DP_UUID(dcpi->dcpi_uuid), DP_UUID(uuid)); + + if (!cr_pool_status_failed(dcpi->dcpi_status)) { + print_message("CR pool " DF_UUID " status %s is not failed\n", + DP_UUID(uuid), dcpi->dcpi_status); + return -DER_INVAL; + } + + if (cr_pool_phase_is_done(dcpi->dcpi_phase)) { + print_message("CR pool " DF_UUID " phase should not be done\n", + DP_UUID(uuid)); + return -DER_INVAL; + } + +#ifdef CR_ACCURATE_QUERY_RESULT + if (dci->dci_report_nr != 1) { + print_message("CR (failed) pool " DF_UUID " has unexpected reports: %d\n", + DP_UUID(uuid), dci->dci_report_nr); + return -DER_INVAL; + } +#endif + + for (i = 0; i < dci->dci_report_nr; i++) { + dcri = &dci->dci_reports[i]; + if (uuid_compare(dcri->dcri_uuid, uuid) != 0) { +#ifdef CR_ACCURATE_QUERY_RESULT + print_message("Detect unrelated inconsistency report: " + DF_UUID " vs " DF_UUID "\n", + DP_UUID(dcpi->dcpi_uuid), DP_UUID(uuid)); + return -DER_INVAL; +#else + continue; +#endif + } + + if (dcri->dcri_class != cla) { + print_message("CR (failed) pool " DF_UUID " reports unexpected " + "inconsistency at %d: %u vs %u\n", + DP_UUID(uuid), i, dcri->dcri_class, cla); + return -DER_INVAL; + } + + if (dcri->dcri_act != act) { + print_message("CR (failed) pool " DF_UUID " reports unexpected " + "solution at %d: %u vs %u\n", + DP_UUID(uuid), i, dcri->dcri_act, act); + return -DER_INVAL; + } + + *ret = dcri->dcri_result; + return 0; + } + + return -DER_INVAL; +} + static void cr_dci_fini(struct daos_check_info *dci) { @@ -2953,7 +3025,7 @@ cr_engine_rejoin_fail(void **state) uint32_t class = TCC_POOL_NONEXIST_ON_MS; uint32_t action; int rank = -1; - int result; + int result = 0; int rc; int i; @@ -3006,15 +3078,13 @@ cr_engine_rejoin_fail(void **state) rc = cr_ins_verify(&dci, TCIS_COMPLETED); assert_rc_equal(rc, 0); - /* The check on the pool will fail as -DER_HG or -DER_TIMEDOUT. */ - result = -DER_HG; - rc = cr_pool_verify(&dci, pool.pool_uuid, TCPS_FAILED, 1, &class, &action, &result); - if (rc == -DER_INVAL) { - result = -DER_TIMEDOUT; - rc = cr_pool_verify(&dci, pool.pool_uuid, TCPS_FAILED, 1, &class, &action, &result); - } + rc = cr_pool_fail_result(&dci, pool.pool_uuid, class, action, &result); assert_rc_equal(rc, 0); + /* The check on the pool will fail because of -DER_HG or -DER_TIMEDOUT or -DER_OOG. */ + D_ASSERTF(result == -DER_HG || result == -DER_TIMEDOUT || result == -DER_OOG, + "Unexpected pool " DF_UUID " fail result: %d\n", DP_UUID(pool.pool_uuid), result); + /* Reint the rank, rejoin will fail but not affect the rank start. */ rc = cr_rank_reint(rank, true); assert_rc_equal(rc, 0);