Skip to content

Commit

Permalink
issue-2076: fix affected disk tracking (#2092)
Browse files Browse the repository at this point in the history
* issue-2076: fix affected disk tracking
  • Loading branch information
sharpeye authored Oct 2, 2024
1 parent 814e583 commit 010e363
Show file tree
Hide file tree
Showing 3 changed files with 450 additions and 53 deletions.
105 changes: 57 additions & 48 deletions cloud/blockstore/libs/storage/disk_registry/disk_registry_state.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1151,11 +1151,50 @@ NProto::TError TDiskRegistryState::ReplaceDevice(
bool manual,
bool* diskStateUpdated)
{
try {
if (!diskId) {
return MakeError(E_ARGUMENT, "empty disk id");
}
Y_ABORT_UNLESS(diskStateUpdated);
*diskStateUpdated = false;

if (!diskId) {
return MakeError(E_ARGUMENT, "empty disk id");
}

if (!Disks.contains(diskId)) {
return MakeError(E_ARGUMENT, TStringBuilder()
<< "unknown disk: " << diskId.Quote());
}

TDiskState& disk = Disks[diskId];

auto error = ReplaceDeviceWithoutDiskStateUpdate(
db,
disk,
diskId,
deviceId,
deviceReplacementId,
timestamp,
std::move(message),
manual);

if (HasError(error)) {
return error;
}

*diskStateUpdated = TryUpdateDiskState(db, diskId, disk, timestamp);

return {};
}

NProto::TError TDiskRegistryState::ReplaceDeviceWithoutDiskStateUpdate(
TDiskRegistryDatabase& db,
TDiskState& disk,
const TString& diskId,
const TString& deviceId,
const TString& deviceReplacementId,
TInstant timestamp,
TString message,
bool manual)
{
try {
if (!deviceId) {
return MakeError(E_ARGUMENT, "empty device id");
}
Expand All @@ -1165,13 +1204,6 @@ NProto::TError TDiskRegistryState::ReplaceDevice(
<< "device does not belong to disk " << diskId.Quote());
}

if (!Disks.contains(diskId)) {
return MakeError(E_ARGUMENT, TStringBuilder()
<< "unknown disk: " << diskId.Quote());
}

TDiskState& disk = Disks[diskId];

auto it = Find(disk.Devices, deviceId);
if (it == disk.Devices.end()) {
auto message = ReportDiskRegistryDeviceNotFound(
Expand Down Expand Up @@ -1210,7 +1242,6 @@ NProto::TError TDiskRegistryState::ReplaceDevice(
timestamp,
message);
if (HasError(error)) {
TryUpdateDiskState(db, diskId, timestamp);
return error;
}

Expand Down Expand Up @@ -1284,8 +1315,6 @@ NProto::TError TDiskRegistryState::ReplaceDevice(

*it = targetDevice.GetDeviceUUID();

*diskStateUpdated = TryUpdateDiskState(db, diskId, disk, timestamp);

UpdateAgent(db, *agentPtr);

UpdatePlacementGroup(db, diskId, disk, "ReplaceDevice");
Expand Down Expand Up @@ -4793,13 +4822,13 @@ void TDiskRegistryState::ApplyAgentStateChange(

auto& disk = Disks[diskId];

diskIds.emplace(diskId);

// check if deviceId is target for migration
if (RestartDeviceMigration(timestamp, db, diskId, disk, deviceId)) {
continue;
}

bool isAffected = true;

if (agent.GetState() == NProto::AGENT_STATE_WARNING) {
if (disk.MigrationSource2Target.contains(deviceId)) {
// migration already started
Expand Down Expand Up @@ -4832,42 +4861,32 @@ void TDiskRegistryState::ApplyAgentStateChange(
deviceId);

if (canReplaceDevice) {
bool updated = false;

auto error = ReplaceDevice(
auto error = ReplaceDeviceWithoutDiskStateUpdate(
db,
disk,
diskId,
deviceId,
"", // no replacement device
timestamp,
MakeMirroredDiskDeviceReplacementMessage(
disk.MasterDiskId,
"agent unavailable"),
false, // manual
&updated);
false); // manual

if (HasError(error)) {
ReportMirroredDiskDeviceReplacementFailure(
FormatError(error));
}

if (!updated) {
isAffected = false;
}
}
}

CancelDeviceMigration(timestamp, db, diskId, disk, deviceId);
}

if (isAffected) {
diskIds.emplace(std::move(diskId));
}
}

for (auto& id: diskIds) {
if (TryUpdateDiskState(db, id, timestamp)) {
affectedDisks.push_back(std::move(id));
for (const auto& diskId: diskIds) {
if (TryUpdateDiskState(db, diskId, timestamp)) {
affectedDisks.push_back(diskId);
}
}
}
Expand Down Expand Up @@ -5763,39 +5782,29 @@ void TDiskRegistryState::ApplyDeviceStateChange(
return;
}

if (device.GetState() == NProto::DEVICE_STATE_ERROR
&& disk->MasterDiskId)
{
if (device.GetState() == NProto::DEVICE_STATE_ERROR && disk->MasterDiskId) {
const bool canReplaceDevice = CheckIfDeviceReplacementIsAllowed(
now,
disk->MasterDiskId,
device.GetDeviceUUID());

if (canReplaceDevice) {
bool updated = false;
auto error = ReplaceDevice(
auto error = ReplaceDeviceWithoutDiskStateUpdate(
db,
*disk,
diskId,
device.GetDeviceUUID(),
"", // no replacement device
"", // no replacement device
now,
MakeMirroredDiskDeviceReplacementMessage(
disk->MasterDiskId,
"device failure"),
false, // manual
&updated);
false); // manual

if (HasError(error)) {
ReportMirroredDiskDeviceReplacementFailure(
FormatError(error));
}

if (updated) {
affectedDisk = diskId;
ReportMirroredDiskDeviceReplacementFailure(FormatError(error));
}
}

return;
}

if (TryUpdateDiskState(db, diskId, *disk, now)) {
Expand Down
10 changes: 10 additions & 0 deletions cloud/blockstore/libs/storage/disk_registry/disk_registry_state.h
Original file line number Diff line number Diff line change
Expand Up @@ -1264,6 +1264,16 @@ class TDiskRegistryState
const NProto::TDiskRegistryConfig& newConfig) const;

std::optional<ui64> GetDiskBlockCount(const TDiskId& diskId) const;

// Replaces the device `deviceId` of `disk` without recalculating the disk
// state afterwards. The caller is responsible for calling TryUpdateDiskState
// once it has finished modifying the disk: ReplaceDevice does so right after
// a successful call, and ApplyAgentStateChange does it once per collected
// disk id after its device loop.
//
// db                  - database handle forwarded to the update helpers.
// disk                - state of the disk to modify; callers pass the entry
//                       stored in Disks under `diskId`.
// diskId              - id of the disk that `disk` belongs to.
// deviceId            - device to replace; an empty id is rejected with
//                       E_ARGUMENT.
// deviceReplacementId - requested replacement device; callers pass "" to mean
//                       "no replacement device" (presumably the registry then
//                       picks one itself - confirm against the definition).
// timestamp           - time of the operation.
// message             - replacement reason, taken by value.
// manual              - the automatic replacements triggered from
//                       ApplyAgentStateChange / ApplyDeviceStateChange pass
//                       false.
//
// Returns an error on failure. Callers test the result with HasError; on
// error ReplaceDevice returns it without updating the disk state.
NProto::TError ReplaceDeviceWithoutDiskStateUpdate(
TDiskRegistryDatabase& db,
TDiskState& disk,
const TString& diskId,
const TString& deviceId,
const TString& deviceReplacementId,
TInstant timestamp,
TString message,
bool manual);
};

} // namespace NCloud::NBlockStore::NStorage
Loading

0 comments on commit 010e363

Please sign in to comment.