Skip to content

Commit

Permalink
issue-757: refactor local-emergency load-test making it actually turn on emergency mode; increment suggested generation upon external boot in fallback mode; minor tweaks (#2019)
Browse files Browse the repository at this point in the history
  • Loading branch information
SvartMetal authored and Mikhail Montsev committed Sep 17, 2024
1 parent c34bb2d commit a31d56a
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 26 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -427,8 +427,9 @@ void TStartVolumeActor::StartTablet(const TActorContext& ctx)
}

LOG_INFO(ctx, TBlockStoreComponents::SERVICE,
"[%lu] Starting tablet",
VolumeTabletId);
"[%lu] Starting tablet (gen: %u)",
VolumeTabletId,
VolumeGeneration);

const auto* appData = AppData(ctx);

Expand Down Expand Up @@ -551,7 +552,7 @@ void TStartVolumeActor::HandleTabletDead(

if (PendingRequest == EPendingRequest::START) {
LOG_ERROR(ctx, TBlockStoreComponents::SERVICE,
"[%lu] Tablet boot failed during actor stopping",
"[%lu] Tablet boot failed during actor starting",
VolumeTabletId);

PendingRequest = EPendingRequest::NONE;
Expand Down Expand Up @@ -579,7 +580,7 @@ void TStartVolumeActor::HandleTabletDead(
0, // cookie
error);

bool delay;
bool delay = true;
switch (msg->Reason) {
case TEvTablet::TEvTabletDead::ReasonBootRace:
// Avoid unnecessary delays
Expand All @@ -591,7 +592,6 @@ void TStartVolumeActor::HandleTabletDead(
++VolumeGeneration;
break;
default:
delay = true;
break;
}

Expand Down
35 changes: 18 additions & 17 deletions cloud/blockstore/tests/loadtest/local-emergency/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from ydb.tests.library.harness.kikimr_runner import get_unique_path_for_current_test, ensure_path_exists


def default_storage_config(cache_folder):
def default_storage_config(backups_folder):
storage = storage_config_with_default_limits()
storage.SSDSystemChannelPoolKind = "ssd"
storage.SSDLogChannelPoolKind = "ssd"
Expand All @@ -21,17 +21,18 @@ def default_storage_config(cache_folder):
storage.SSDMergedChannelPoolKind = "ssd"

storage.TabletBootInfoBackupFilePath = \
cache_folder + "/tablet_boot_info_backup.txt"
backups_folder + "/tablet_boot_info_backup.txt"
storage.PathDescriptionBackupFilePath = \
cache_folder + "/path_description_backup.txt"
backups_folder + "/path_description_backup.txt"

return storage


def storage_config_with_emergency_mode(cache_folder):
storage = default_storage_config(cache_folder)
def storage_config_with_emergency_mode(backups_folder):
storage = default_storage_config(backups_folder)
storage.HiveProxyFallbackMode = True
storage.SSProxyFallbackMode = True
storage.DisableLocalService = True
return storage


Expand All @@ -51,36 +52,35 @@ def __init__(self, name, config_path):


def __run_test(test_case):
cache_folder = get_unique_path_for_current_test(
backups_folder = get_unique_path_for_current_test(
output_path=common.output_path(),
sub_folder="cache",
sub_folder="backups",
)
ensure_path_exists(cache_folder)

storage_config_patches = [
default_storage_config(cache_folder),
storage_config_with_emergency_mode(cache_folder),
]
ensure_path_exists(backups_folder)

env = LocalLoadTest(
"",
storage_config_patches=storage_config_patches,
storage_config_patches=[default_storage_config(backups_folder)],
dynamic_pdisks=[dict(user_kind=1)],
dynamic_storage_pools=[
dict(name="dynamic_storage_pool:1", kind="system", pdisk_user_kind=0),
dict(name="dynamic_storage_pool:2", kind="ssd", pdisk_user_kind=1)
],
bs_cache_file_path=cache_folder + "/bs_cache.txt",
bs_cache_file_path=backups_folder + "/bs_cache.txt",
)

client = CreateClient(env.endpoint)
client.create_volume("vol0", 4096, 1000000, 1, protos.EStorageMediaKind.Value("STORAGE_MEDIA_SSD"))

session = Session(client, "vol0", "")
session.mount_volume()
session.write_blocks(0, [b'\1' * 4096])
session.write_blocks(100500, [b'\1' * 4096])
# TODO: should not unmount volume to make emergency unexpected
session.unmount_volume()

client.execute_action(action="BackupPathDescriptions", input_bytes=str.encode(""))
client.execute_action(action="BackupTabletBootInfos", input_bytes=str.encode(""))

static_pdisk_paths = []
for info in env.pdisks_info:
if info["pdisk_user_kind"] == 0:
Expand All @@ -90,8 +90,9 @@ def __run_test(test_case):
# Destroy static group in order to emulate emergency.
# TODO: survive outage of kikimr static tablets.
# os.remove(static_pdisk_paths[0])

env.kikimr_cluster.restart_nodes()

env.nbs.storage_config_patches = [storage_config_with_emergency_mode(backups_folder)]
env.nbs.restart()

try:
Expand Down
8 changes: 4 additions & 4 deletions cloud/blockstore/tests/python/lib/nbs_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,6 @@ def __init__(
if kms_config is not None:
self.__proto_configs["kms.txt"] = kms_config

if storage_config_patches is not None and len(storage_config_patches) > 0:
for i in range(len(storage_config_patches)):
self.__proto_configs["storage-%s.txt" % i] = self.__generate_patched_storage_txt(i)

if discovery_config is not None:
self.__proto_configs["discovery.txt"] = discovery_config
self.__use_discovery = True
Expand All @@ -165,6 +161,10 @@ def __init__(
self.__init_daemon()

def __init_daemon(self):
if self.storage_config_patches is not None and len(self.storage_config_patches) > 0:
for i in range(len(self.storage_config_patches)):
self.__proto_configs["storage-%s.txt" % i] = self.__generate_patched_storage_txt(i)

cp = None
if self.__binary_path:
cp = core_pattern(self.__binary_path, self.__cwd)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,15 @@ void THiveProxyFallbackActor::HandleBootExternal(
return;
}

// increment suggested generation to ensure that the tablet does not get
// stuck with an outdated generation, no matter what
auto request = std::make_unique<
TEvHiveProxyPrivate::TEvUpdateTabletBootInfoBackupRequest>(
r->StorageInfo,
r->SuggestedGeneration + 1
);
NCloud::Send(ctx, TabletBootInfoBackup, std::move(request));

auto response = std::make_unique<TResponse>(
std::move(r->StorageInfo),
r->SuggestedGeneration,
Expand Down
8 changes: 8 additions & 0 deletions cloud/storage/core/libs/hive_proxy/hive_proxy_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1152,6 +1152,14 @@ Y_UNIT_TEST_SUITE(THiveProxyTest)
auto result2 = env.SendBootExternalRequest(
sender, 0xdeadbeaf, E_REJECTED);
UNIT_ASSERT(!result2.StorageInfo);

auto result3 = env.SendBootExternalRequest(sender, FakeTablet2, S_OK);
UNIT_ASSERT(result3.StorageInfo);
UNIT_ASSERT_VALUES_EQUAL(
FakeTablet2,
result3.StorageInfo->TabletID);
// suggested generation should be incremented after last boot
UNIT_ASSERT_VALUES_EQUAL(2u, result3.SuggestedGeneration);
}
}

Expand Down

0 comments on commit a31d56a

Please sign in to comment.