Merge to stable-23-3: making blockstore cluster survive after destruction of static BS group in local-emergency test (#2055)

* silence 'E_NOT_IMPLEMENTED Disk registry based disks can not handle GetChangedBlocks requests' error (#1972)

* issue-757: refactor the local-emergency load test so that it actually turns on emergency mode; increment the suggested generation upon external boot in fallback mode; minor tweaks (#2019)

* issue-757: don't load configs from CMS in emergency mode (#2042)

* issue-757: pass the path as an empty TMaybe<TString> instead of an empty TString when registering a node in the Node Broker; spoil the BS Controller config before restarting Kikimr after formatting static pdisks in the local-emergency test (otherwise the BS Controller would erase all the data from the other, non-static groups) (#2049)
SvartMetal authored Sep 17, 2024
1 parent d4c0c26 commit 7e6851a
Showing 10 changed files with 104 additions and 41 deletions.
16 changes: 15 additions & 1 deletion cloud/blockstore/libs/daemon/ydb/bootstrap.cpp
@@ -231,17 +231,31 @@ void TBootstrapYdb::InitKikimrService()
.NodeType = Configs->StorageConfig->GetNodeType(),
};

bool loadCmsConfigs = Configs->Options->LoadCmsConfigs;
bool emergencyMode =
Configs->StorageConfig->GetHiveProxyFallbackMode() ||
Configs->StorageConfig->GetSSProxyFallbackMode();

if (loadCmsConfigs && emergencyMode) {
STORAGE_INFO("Disable loading configs from CMS in emergency mode");
loadCmsConfigs = false;
}

NCloud::NStorage::TRegisterDynamicNodeOptions registerOpts {
.Domain = Configs->Options->Domain,
.SchemeShardDir = Configs->StorageConfig->GetSchemeShardDir(),
.NodeBrokerAddress = Configs->Options->NodeBrokerAddress,
.NodeBrokerPort = Configs->Options->NodeBrokerPort,
.UseNodeBrokerSsl = Configs->Options->UseNodeBrokerSsl,
.InterconnectPort = Configs->Options->InterconnectPort,
.LoadCmsConfigs = Configs->Options->LoadCmsConfigs,
.LoadCmsConfigs = loadCmsConfigs,
.Settings = std::move(settings)
};

if (emergencyMode) {
registerOpts.SchemeShardDir = "";
}

if (Configs->Options->LocationFile) {
NProto::TLocation location;
ParseProtoTextFromFile(Configs->Options->LocationFile, location);
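For context, emergency mode in the hunk above is derived entirely from the storage config: either fallback flag enables it, which now also disables loading configs from CMS and clears the SchemeShardDir used for node registration. Below is a minimal sketch of the storage-config patch that the reworked local-emergency test (further down) applies to flip NBS into this mode; the helper name is hypothetical, the field names come from that test.

    # Sketch only: field names mirror storage_config_with_emergency_mode() in
    # the local-emergency test below; the helper itself is illustrative.
    def make_emergency_patch(storage):
        storage.HiveProxyFallbackMode = True   # boot tablets from the local tablet boot info backup
        storage.SSProxyFallbackMode = True     # resolve paths from the local path description backup
        storage.DisableLocalService = True
        return storage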
@@ -427,8 +427,9 @@ void TStartVolumeActor::StartTablet(const TActorContext& ctx)
}

LOG_INFO(ctx, TBlockStoreComponents::SERVICE,
"[%lu] Starting tablet",
VolumeTabletId);
"[%lu] Starting tablet (gen: %u)",
VolumeTabletId,
VolumeGeneration);

const auto* appData = AppData(ctx);

@@ -551,7 +552,7 @@ void TStartVolumeActor::HandleTabletDead(

if (PendingRequest == EPendingRequest::START) {
LOG_ERROR(ctx, TBlockStoreComponents::SERVICE,
"[%lu] Tablet boot failed during actor stopping",
"[%lu] Tablet boot failed during actor starting",
VolumeTabletId);

PendingRequest = EPendingRequest::NONE;
Expand Down Expand Up @@ -579,7 +580,7 @@ void TStartVolumeActor::HandleTabletDead(
0, // cookie
error);

bool delay;
bool delay = true;
switch (msg->Reason) {
case TEvTablet::TEvTabletDead::ReasonBootRace:
// Avoid unnecessary delays
@@ -591,7 +592,6 @@
++VolumeGeneration;
break;
default:
delay = true;
break;
}

@@ -912,8 +912,11 @@ void TVolumeActor::ReplyErrorOnNormalGetChangedBlocksRequestForDiskRegistryBased
TGetChangedBlocksMethod::Name,
errorMsg.c_str());

auto response = std::make_unique<TGetChangedBlocksMethod::TResponse>();
*response->Record.MutableError() = MakeError(E_NOT_IMPLEMENTED, errorMsg);
ui32 flags = 0;
SetProtoFlag(flags, NProto::EF_SILENT);
auto error = MakeError(E_NOT_IMPLEMENTED, errorMsg, flags);
auto response = std::make_unique<TGetChangedBlocksMethod::TResponse>(
std::move(error));

NCloud::Reply(ctx, *ev, std::move(response));
}
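The fix above silences the expected error by setting the EF_SILENT proto flag on it instead of returning a bare E_NOT_IMPLEMENTED. A rough standalone model of the flag mechanics (plain Python, not the NBS helpers; the bit value is made up for illustration):

    # EF_SILENT is modelled as a single bit in the error's Flags field:
    # SetProtoFlag ORs it in, and logging code can test for it so that an
    # expected E_NOT_IMPLEMENTED is not reported as a real failure.
    EF_SILENT = 1 << 0  # illustrative bit value only

    def set_proto_flag(flags: int, flag: int) -> int:
        return flags | flag

    def has_proto_flag(flags: int, flag: int) -> bool:
        return (flags & flag) == flag

    flags = set_proto_flag(0, EF_SILENT)
    assert has_proto_flag(flags, EF_SILENT)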
48 changes: 22 additions & 26 deletions cloud/blockstore/tests/loadtest/local-emergency/test.py
@@ -1,4 +1,3 @@
# import os
import pytest

import yatest.common as common
@@ -12,7 +11,7 @@
from ydb.tests.library.harness.kikimr_runner import get_unique_path_for_current_test, ensure_path_exists


def default_storage_config(cache_folder):
def default_storage_config(backups_folder):
storage = storage_config_with_default_limits()
storage.SSDSystemChannelPoolKind = "ssd"
storage.SSDLogChannelPoolKind = "ssd"
@@ -21,17 +20,18 @@ def default_storage_config(cache_folder):
storage.SSDMergedChannelPoolKind = "ssd"

storage.TabletBootInfoBackupFilePath = \
cache_folder + "/tablet_boot_info_backup.txt"
backups_folder + "/tablet_boot_info_backup.txt"
storage.PathDescriptionBackupFilePath = \
cache_folder + "/path_description_backup.txt"
backups_folder + "/path_description_backup.txt"

return storage


def storage_config_with_emergency_mode(cache_folder):
storage = default_storage_config(cache_folder)
def storage_config_with_emergency_mode(backups_folder):
storage = default_storage_config(backups_folder)
storage.HiveProxyFallbackMode = True
storage.SSProxyFallbackMode = True
storage.DisableLocalService = True
return storage


@@ -51,47 +51,43 @@ def __init__(self, name, config_path):


def __run_test(test_case):
cache_folder = get_unique_path_for_current_test(
backups_folder = get_unique_path_for_current_test(
output_path=common.output_path(),
sub_folder="cache",
sub_folder="backups",
)
ensure_path_exists(cache_folder)

storage_config_patches = [
default_storage_config(cache_folder),
storage_config_with_emergency_mode(cache_folder),
]
ensure_path_exists(backups_folder)

env = LocalLoadTest(
"",
storage_config_patches=storage_config_patches,
storage_config_patches=[default_storage_config(backups_folder)],
dynamic_pdisks=[dict(user_kind=1)],
dynamic_storage_pools=[
dict(name="dynamic_storage_pool:1", kind="system", pdisk_user_kind=0),
dict(name="dynamic_storage_pool:2", kind="ssd", pdisk_user_kind=1)
],
bs_cache_file_path=cache_folder + "/bs_cache.txt",
bs_cache_file_path=backups_folder + "/bs_cache.txt",
load_configs_from_cms=False,
)

client = CreateClient(env.endpoint)
client.create_volume("vol0", 4096, 1000000, 1, protos.EStorageMediaKind.Value("STORAGE_MEDIA_SSD"))

session = Session(client, "vol0", "")
session.mount_volume()
session.write_blocks(0, [b'\1' * 4096])
session.write_blocks(100500, [b'\1' * 4096])
# TODO: should not unmount volume to make emergency unexpected
session.unmount_volume()

static_pdisk_paths = []
for info in env.pdisks_info:
if info["pdisk_user_kind"] == 0:
static_pdisk_paths += [info["pdisk_path"]]
assert len(static_pdisk_paths) == 1

# Destroy static group in order to emulate emergency.
# TODO: survive outage of kikimr static tablets.
# os.remove(static_pdisk_paths[0])
client.execute_action(action="BackupPathDescriptions", input_bytes=str.encode(""))
client.execute_action(action="BackupTabletBootInfos", input_bytes=str.encode(""))

env.kikimr_cluster.format_static_pdisks()
# spoil config to prevent BS Controller from starting otherwise it will
# erase dynamic groups data
env.kikimr_cluster.spoil_bs_controller_config()
env.kikimr_cluster.restart_nodes()

env.nbs.storage_config_patches = [storage_config_with_emergency_mode(backups_folder)]
env.nbs.restart()

try:
6 changes: 5 additions & 1 deletion cloud/blockstore/tests/python/lib/loadtest_env.py
@@ -51,6 +51,7 @@ def __init__(
kikimr_binary_path=None,
with_endpoint_proxy=False,
with_netlink=False,
load_configs_from_cms=True,
):

self.__endpoint = endpoint
@@ -76,6 +77,9 @@ def __init__(
if run_kikimr:
self.kikimr_cluster.start()
kikimr_port = list(self.kikimr_cluster.nodes.values())[0].port
else:
# makes sense only when Kikimr is running
load_configs_from_cms = False

self.__devices = []

@@ -107,7 +111,7 @@ def __init__(
discovery_config=discovery_config,
restart_interval=restart_interval,
dynamic_storage_pools=dynamic_storage_pools,
load_configs_from_cms=run_kikimr,
load_configs_from_cms=load_configs_from_cms,
features_config_patch=features_config_patch,
grpc_trace=grpc_trace,
rack=rack)
8 changes: 4 additions & 4 deletions cloud/blockstore/tests/python/lib/nbs_runner.py
@@ -135,10 +135,6 @@ def __init__(
if kms_config is not None:
self.__proto_configs["kms.txt"] = kms_config

if storage_config_patches is not None and len(storage_config_patches) > 0:
for i in range(len(storage_config_patches)):
self.__proto_configs["storage-%s.txt" % i] = self.__generate_patched_storage_txt(i)

if discovery_config is not None:
self.__proto_configs["discovery.txt"] = discovery_config
self.__use_discovery = True
@@ -165,6 +161,10 @@ def __init__(
self.__init_daemon()

def __init_daemon(self):
if self.storage_config_patches is not None and len(self.storage_config_patches) > 0:
for i in range(len(self.storage_config_patches)):
self.__proto_configs["storage-%s.txt" % i] = self.__generate_patched_storage_txt(i)

cp = None
if self.__binary_path:
cp = core_pattern(self.__binary_path, self.__cwd)
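Moving the rendering of the storage-N.txt patch files from __init__() into __init_daemon() is what lets a test swap storage_config_patches on an already constructed NBS instance and have the new patch picked up on restart, as the local-emergency test above now does. A usage sketch (names taken from test.py above; env is a LocalLoadTest):

    def restart_nbs_in_emergency_mode(env, backups_folder):
        env.nbs.storage_config_patches = [
            storage_config_with_emergency_mode(backups_folder)
        ]
        # restart() re-runs __init_daemon(), which now regenerates
        # storage-0.txt from the freshly assigned patch
        env.nbs.restart()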
@@ -242,6 +242,15 @@ void THiveProxyFallbackActor::HandleBootExternal(
return;
}

// increment suggested generation to ensure that the tablet does not get
// stuck with an outdated generation, no matter what
auto request = std::make_unique<
TEvHiveProxyPrivate::TEvUpdateTabletBootInfoBackupRequest>(
r->StorageInfo,
r->SuggestedGeneration + 1
);
NCloud::Send(ctx, TabletBootInfoBackup, std::move(request));

auto response = std::make_unique<TResponse>(
std::move(r->StorageInfo),
r->SuggestedGeneration,
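In other words, the fallback actor replies with the backed-up suggested generation but immediately persists generation + 1, so repeated external boots of the same tablet keep moving forward even without a Hive handing out generations. A toy model of that behaviour (plain Python, not the actor code; the initial generation is chosen arbitrarily for illustration):

    # backup maps tablet id -> suggested generation, like TabletBootInfoBackup
    backup = {}

    def boot_external(tablet_id, initial_generation=1):
        suggested = backup.get(tablet_id, initial_generation)
        backup[tablet_id] = suggested + 1  # what TEvUpdateTabletBootInfoBackupRequest stores
        return suggested

    assert boot_external(0xdead01) == 1
    assert boot_external(0xdead01) == 2  # roughly matches the new unit-test expectation below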
8 changes: 8 additions & 0 deletions cloud/storage/core/libs/hive_proxy/hive_proxy_ut.cpp
@@ -1152,6 +1152,14 @@ Y_UNIT_TEST_SUITE(THiveProxyTest)
auto result2 = env.SendBootExternalRequest(
sender, 0xdeadbeaf, E_REJECTED);
UNIT_ASSERT(!result2.StorageInfo);

auto result3 = env.SendBootExternalRequest(sender, FakeTablet2, S_OK);
UNIT_ASSERT(result3.StorageInfo);
UNIT_ASSERT_VALUES_EQUAL(
FakeTablet2,
result3.StorageInfo->TabletID);
// suggested generation should be incremented after last boot
UNIT_ASSERT_VALUES_EQUAL(2u, result3.SuggestedGeneration);
}
}

9 changes: 7 additions & 2 deletions cloud/storage/core/libs/kikimr/node.cpp
@@ -224,6 +224,11 @@ struct TLegacyNodeRegistrant
{
NClient::TKikimr kikimr(CreateKikimrConfig(Options, nodeBrokerAddress));

TMaybe<TString> path;
if (Options.SchemeShardDir) {
path = Options.SchemeShardDir;
}

auto registrant = kikimr.GetNodeRegistrant();
auto result = registrant.SyncRegisterNode(
Options.Domain,
@@ -232,8 +237,8 @@
HostAddress,
HostName,
Location,
false, //request fixed node id
Options.SchemeShardDir);
false, // fixedNodeId
path);

if (!result.IsSuccess()) {
return MakeError(
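The point of the TMaybe<TString> change above is to distinguish "no scheme shard path" from "a path that happens to be empty": in emergency mode the daemon registers without any SchemeShardDir, and an empty TString would still be sent to the Node Broker as a path. A rough Python analogue of the conversion:

    from typing import Optional

    # An absent Optional means "no path was supplied", whereas "" would be
    # forwarded to the Node Broker as a bogus path.
    def registration_path(scheme_shard_dir: str) -> Optional[str]:
        return scheme_shard_dir if scheme_shard_dir else None

    assert registration_path("") is None
    assert registration_path("/Root/NBS") == "/Root/NBS"  # example path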
24 changes: 24 additions & 0 deletions ydb/tests/library/harness/kikimr_runner.py
@@ -494,6 +494,30 @@ def __instantiate_udfs_dir(self):
os.symlink(udf_path, link_name)
return self.__common_udfs_dir

# TODO(svartmetal): remove this when YDB learns not to erase dynamic groups
# data after formatting of static pdisks
def spoil_bs_controller_config(self):
flat_bs_controller = [{
"info": {
"channels": [{
"channel": 0,
"channel_erasure_name": str(self.__configurator.static_erasure),
"history": [{
"from_generation": 0,
"group_id": 100500
}]
}]
}
}]
self.__configurator.yaml_config["system_tablets"]["flat_bs_controller"] = flat_bs_controller
self.__write_configs()

def format_static_pdisks(self):
for node_id in self.__configurator.all_node_ids():
for pdisk in self.__configurator.pdisks_info:
if pdisk["pdisk_user_kind"] == 0:
self.nodes[node_id].format_pdisk(**pdisk)

def __format_disks(self, node_id):
for pdisk in self.__configurator.pdisks_info:
if pdisk['node_id'] != node_id:
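The two helpers above are what the reworked local-emergency test uses to emulate the loss of the static group: format every static pdisk, then point the flat BS Controller's channel history at a non-existent group so it cannot boot and wipe the dynamic groups. A usage sketch mirroring test.py above (env is a LocalLoadTest):

    def destroy_static_group(env):
        env.kikimr_cluster.format_static_pdisks()
        # keep the BS Controller from booting, otherwise it would erase the
        # data stored in the dynamic groups
        env.kikimr_cluster.spoil_bs_controller_config()
        env.kikimr_cluster.restart_nodes()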
