Skip to content

Commit

Permalink
[bugfix] RemoveDevice request should trigger migration from all logic…
Browse files Browse the repository at this point in the history
…al devices (#1703)

* fix RemoveDevice
  • Loading branch information
sharpeye committed Aug 6, 2024
1 parent 067d486 commit 957d8d7
Show file tree
Hide file tree
Showing 2 changed files with 95 additions and 23 deletions.
63 changes: 40 additions & 23 deletions cloud/blockstore/libs/storage/disk_registry/disk_registry_state.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5636,43 +5636,60 @@ auto TDiskRegistryState::UpdateCmsDeviceState(

// not transactional actually but it's not a big problem
ui32 processed = 0;
for (auto* device: devices) {
for (; processed != devices.size(); ++processed) {
auto& device = *devices[processed];

if (newState == NProto::DEVICE_STATE_ONLINE) {
result.Error = CmsAddDevice(
db,
*agent,
*device,
device,
now,
shouldResume,
dryRun,
result.Timeout);
} else {
TString affectedDisk;
result.Error = CmsRemoveDevice(
db,
*agent,
*device,
now,
dryRun,
affectedDisk,
result.Timeout);
if (!affectedDisk.empty()) {
result.AffectedDisks.push_back(std::move(affectedDisk));

if (HasError(result.Error)) {
break;
}

continue;
}

if (HasError(result.Error)) {
STORAGE_WARN(
"UpdateCmsDeviceState stopped after processing %u devices"
", current deviceId: %s, error: %s",
processed,
device->GetDeviceUUID().c_str(),
FormatError(result.Error).c_str());
TDuration timeout;
TString affectedDisk;
auto error = CmsRemoveDevice(
db,
*agent,
device,
now,
dryRun,
affectedDisk,
timeout);

break;
if (!affectedDisk.empty()) {
result.AffectedDisks.push_back(std::move(affectedDisk));
}

++processed;
result.Timeout = Max(result.Timeout, timeout);

if (HasError(error)) {
result.Error = std::move(error);
// ignoring E_TRY_AGAIN error, we touch each logical device to
// start migrations
if (result.Error.GetCode() != E_TRY_AGAIN) {
break;
}
}
}

if (processed != devices.size()) {
STORAGE_WARN(
"UpdateCmsDeviceState stopped after processing %u devices"
", current deviceId: %s, error: %s",
processed,
devices[processed]->GetDeviceUUID().c_str(),
FormatError(result.Error).c_str());
}

SortUnique(result.AffectedDisks);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1358,6 +1358,61 @@ Y_UNIT_TEST_SUITE(TDiskRegistryTest)
UNIT_ASSERT_VALUES_EQUAL(0, timeout);
}
}

Y_UNIT_TEST_F(ShouldUpdateAllLogicalDevices, TFixture)
{
// two logical devices on the same path
const auto agent = CreateAgentConfig("agent-1", {
Device("path", "uuid-1", "rack-1", 10_GB),
Device("path", "uuid-2", "rack-1", 10_GB)
});

auto config = CreateDefaultStorageConfig();

SetUpRuntime(TTestRuntimeBuilder()
.WithAgents({agent})
.With(config)
.Build());

DiskRegistry->SetWritableState(true);
DiskRegistry->UpdateConfig(CreateRegistryConfig(0, {agent}));
RegisterAndWaitForAgents(*Runtime, {agent});

// allocate vol0 on one of the two logical devices
{
auto response = DiskRegistry->AllocateDisk("vol0", 10_GB);
const auto& msg = response->Record;
UNIT_ASSERT_VALUES_EQUAL(1, msg.DevicesSize());
UNIT_ASSERT_VALUES_EQUAL(0, msg.MigrationsSize());
}

{
auto [error, timeout] = RemoveDevice(agent.GetAgentId(), "path");

UNIT_ASSERT_VALUES_EQUAL(E_TRY_AGAIN, error.GetCode());
UNIT_ASSERT_VALUES_EQUAL(
config.GetNonReplicatedInfraTimeout(),
timeout * 1000);
}

// check that vol0 hasn't started migration to a logical device with the
// same path
{
auto response = DiskRegistry->AllocateDisk("vol0", 10_GB);
const auto& msg = response->Record;
UNIT_ASSERT_VALUES_EQUAL(1, msg.DevicesSize());
UNIT_ASSERT_VALUES_EQUAL(0, msg.MigrationsSize());
}

// check that the second logical device is not available for allocation
{
DiskRegistry->SendAllocateDiskRequest("vol1", 10_GB);
auto response = DiskRegistry->RecvAllocateDiskResponse();
UNIT_ASSERT_VALUES_EQUAL(
E_BS_DISK_ALLOCATION_FAILED,
response->GetStatus());
}
}
}

} // namespace NCloud::NBlockStore::NStorage

0 comments on commit 957d8d7

Please sign in to comment.