@pytest.mark.asyncio
async def test_blocked_bootstrap(manager: ManagerClient) -> None:
    """
    Check that a node can be replaced after an earlier replace attempt was
    aborted right after the replacing node joined group 0.

    Scenario:
    1. Start a cluster with nodes node1, node2, node3
    2. Stop node1 gracefully
    3. Start node4 replacing node1
    4. Stop node4 after it joined group0 but before it advertised itself in gossip
       (done with the `stop_after_joining_group0` error injection)
    5. Start node5 replacing node1

    Test simulates the behavior described in #13543.

    Test passes only if `wait_for_peers_to_enter_synchronize_state` doesn't need to
    resolve all IPs to return early. If not, node5 will hang trying to resolve the
    IP of node4:
    ```
    raft_group0_upgrade - : failed to resolve IP addresses of some of the cluster members ([node4's host ID])
    ```
    """
    servers = [await manager.server_add() for _ in range(3)]

    logger.info(f"Stopping node {servers[0]}")
    await manager.server_stop_gracefully(servers[0].server_id)

    logger.info(f"Replacing node {servers[0]}")
    # The same replace configuration is used for both replace attempts below.
    replace_cfg = ReplaceConfig(replaced_id=servers[0].server_id, reuse_ip_addr=False, use_host_id=False)

    try:
        await manager.server_add(replace_cfg, config={
            'error_injections_at_startup': ['stop_after_joining_group0'],
        })
    except Exception as exc:
        # Node stops before it advertised itself in gossip, so manager.server_add
        # throws an exception. Only `Exception` is caught: cancellation and
        # keyboard interrupts must keep propagating instead of being mistaken
        # for the expected startup failure.
        logger.info("First replace attempt failed as expected: %s", exc)
    else:
        pytest.fail("Node should stop before it advertised itself in gossip")

    logger.info(f"Replacing node {servers[0]}")
    await manager.server_add(replace_cfg)