From 32dc4c8f7eaf6d6448243fcdd54a1618ec969e69 Mon Sep 17 00:00:00 2001 From: sharpeye Date: Tue, 5 Nov 2024 20:48:31 +0000 Subject: [PATCH] disable broken devices on Disk Agent registration --- cloud/blockstore/config/disk.proto | 4 + .../libs/rdma_test/rdma_test_environment.cpp | 26 +- .../libs/rdma_test/rdma_test_environment.h | 2 + .../disk_agent/disk_agent_actor_disable.cpp | 1 + .../disk_agent/disk_agent_actor_io.cpp | 11 - .../disk_agent/disk_agent_actor_register.cpp | 4 + .../disk_agent/disk_agent_actor_ut.cpp | 235 +++++++++++++++++- .../storage/disk_agent/disk_agent_private.h | 4 +- .../storage/disk_agent/disk_agent_state.cpp | 113 +++++++-- .../storage/disk_agent/disk_agent_state.h | 6 + .../disk_agent/disk_agent_state_ut.cpp | 164 ++++++++++++ .../libs/storage/disk_agent/model/config.cpp | 15 ++ .../libs/storage/disk_agent/model/config.h | 5 + .../disk_agent/model/device_client.cpp | 34 ++- .../storage/disk_agent/model/device_client.h | 17 +- .../libs/storage/disk_agent/rdma_target.cpp | 42 +++- .../libs/storage/disk_agent/rdma_target.h | 2 +- .../storage/disk_agent/rdma_target_ut.cpp | 34 ++- .../disk_agent/storage_initializer.cpp | 12 +- .../disk_agent/storage_initializer_ut.cpp | 1 + .../storage/disk_agent/testlib/test_env.cpp | 29 ++- .../storage/disk_agent/testlib/test_env.h | 7 + .../disk_registry_actor_register.cpp | 49 ++-- .../disk_registry/disk_registry_state.cpp | 11 +- .../disk_registry/disk_registry_state.h | 1 + .../storage/disk_registry/disk_registry_tx.h | 2 + .../blockstore/libs/storage/protos/disk.proto | 3 + cloud/storage/core/libs/common/error.h | 5 +- 28 files changed, 738 insertions(+), 101 deletions(-) diff --git a/cloud/blockstore/config/disk.proto b/cloud/blockstore/config/disk.proto index 25e0e63ef42..67c9552e60f 100644 --- a/cloud/blockstore/config/disk.proto +++ b/cloud/blockstore/config/disk.proto @@ -261,7 +261,11 @@ message TDiskAgentConfig // List of device UUIDs with suspended I/O. // I/O operations for such a device will result in errors. + // Is used for the config cache file only. repeated string DevicesWithSuspendedIO = 35; + + // Disable devices that have been recognized as broken by the DR + optional bool DisableBrokenDevices = 36; } //////////////////////////////////////////////////////////////////////////////// diff --git a/cloud/blockstore/libs/rdma_test/rdma_test_environment.cpp b/cloud/blockstore/libs/rdma_test/rdma_test_environment.cpp index 36b410fe38a..71e3a154b92 100644 --- a/cloud/blockstore/libs/rdma_test/rdma_test_environment.cpp +++ b/cloud/blockstore/libs/rdma_test/rdma_test_environment.cpp @@ -14,23 +14,27 @@ namespace NCloud::NBlockStore::NStorage { TRdmaTestEnvironment::TRdmaTestEnvironment(size_t deviceSize, ui32 poolSize) : Storage(std::make_shared(deviceSize)) { - THashMap devices; - devices[Device_1] = std::make_shared( - Storage, - 4_KB, // storageBlockSize - true, // normalize, - TDuration::Zero(), // maxRequestDuration - TDuration::Zero() // shutdownTimeout - ); + THashMap devices{ + {Device_1, + std::make_shared( + Storage, + 4_KB, // storageBlockSize + true, // normalize, + TDuration::Zero(), // maxRequestDuration + TDuration::Zero() // shutdownTimeout + )}}; + TVector uuids; for (const auto& [key, value]: devices) { uuids.push_back(key); } - auto deviceClient = std::make_shared( + + DeviceClient = std::make_shared( TDuration::MilliSeconds(100), uuids, Logging->CreateLog("BLOCKSTORE_DISK_AGENT")); - deviceClient->AcquireDevices( + + DeviceClient->AcquireDevices( uuids, ClientId, TInstant::Now(), @@ -60,7 +64,7 @@ TRdmaTestEnvironment::TRdmaTestEnvironment(size_t deviceSize, ui32 poolSize) std::move(oldRequestCounters), Logging, Server, - std::move(deviceClient), + DeviceClient, std::move(devices)); RdmaTarget->Start(); diff --git a/cloud/blockstore/libs/rdma_test/rdma_test_environment.h b/cloud/blockstore/libs/rdma_test/rdma_test_environment.h index 5c029ff4715..c8ad37d4827 100644 --- a/cloud/blockstore/libs/rdma_test/rdma_test_environment.h +++ b/cloud/blockstore/libs/rdma_test/rdma_test_environment.h @@ -36,6 +36,8 @@ struct TRdmaTestEnvironment "console", TLogSettings{TLOG_RESOURCES}); + std::shared_ptr DeviceClient; + TRdmaTestEnvironment(size_t deviceSize = 4_MB, ui32 poolSize = 1); virtual ~TRdmaTestEnvironment(); diff --git a/cloud/blockstore/libs/storage/disk_agent/disk_agent_actor_disable.cpp b/cloud/blockstore/libs/storage/disk_agent/disk_agent_actor_disable.cpp index 74d06b8d0fd..b2278ff7ad0 100644 --- a/cloud/blockstore/libs/storage/disk_agent/disk_agent_actor_disable.cpp +++ b/cloud/blockstore/libs/storage/disk_agent/disk_agent_actor_disable.cpp @@ -26,6 +26,7 @@ void TDiskAgentActor::HandleDisableConcreteAgent( if (record.DeviceUUIDsSize()) { for (const auto& d: record.GetDeviceUUIDs()) { State->DisableDevice(d); + State->ReportDisabledDeviceError(d); } } else { HandlePoisonPill(nullptr, ctx); diff --git a/cloud/blockstore/libs/storage/disk_agent/disk_agent_actor_io.cpp b/cloud/blockstore/libs/storage/disk_agent/disk_agent_actor_io.cpp index 16c5fa5b616..d5f55290426 100644 --- a/cloud/blockstore/libs/storage/disk_agent/disk_agent_actor_io.cpp +++ b/cloud/blockstore/libs/storage/disk_agent/disk_agent_actor_io.cpp @@ -178,17 +178,6 @@ void TDiskAgentActor::PerformIO( started); }; - if (State->IsDeviceDisabled(deviceUUID)) { - LOG_INFO(ctx, TBlockStoreComponents::DISK_AGENT, - "Dropped %s request to device %s, session %s", - TMethod::Name, - deviceUUID.c_str(), - clientId.c_str()); - State->ReportDisabledDeviceError(deviceUUID); - replyError(E_IO, "Device disabled"); - return; - } - LOG_TRACE(ctx, TBlockStoreComponents::DISK_AGENT, "%s [%s / %s]", TMethod::Name, diff --git a/cloud/blockstore/libs/storage/disk_agent/disk_agent_actor_register.cpp b/cloud/blockstore/libs/storage/disk_agent/disk_agent_actor_register.cpp index 7859e47edc8..bd4c69dc121 100644 --- a/cloud/blockstore/libs/storage/disk_agent/disk_agent_actor_register.cpp +++ b/cloud/blockstore/libs/storage/disk_agent/disk_agent_actor_register.cpp @@ -87,6 +87,9 @@ void TRegisterActor::HandleRegisterAgentResponse( auto response = std::make_unique( msg->GetError()); + response->DevicesToSuspendIO.assign( + msg->Record.GetDevicesToSuspendIO().cbegin(), + msg->Record.GetDevicesToSuspendIO().cend()); NCloud::Reply(ctx, *RequestInfo, std::move(response)); } @@ -144,6 +147,7 @@ void TDiskAgentActor::HandleRegisterAgentResponse( if (!HasError(msg->GetError())) { RegistrationState = ERegistrationState::Registered; LOG_INFO(ctx, TBlockStoreComponents::DISK_AGENT, "Register completed"); + State->UpdateDevicesWithSuspendedIO(msg->DevicesToSuspendIO); } else { LOG_WARN(ctx, TBlockStoreComponents::DISK_AGENT, "Register failed: %s. Try later", FormatError(msg->GetError()).c_str()); diff --git a/cloud/blockstore/libs/storage/disk_agent/disk_agent_actor_ut.cpp b/cloud/blockstore/libs/storage/disk_agent/disk_agent_actor_ut.cpp index ab8a9667c77..9a3052a9380 100644 --- a/cloud/blockstore/libs/storage/disk_agent/disk_agent_actor_ut.cpp +++ b/cloud/blockstore/libs/storage/disk_agent/disk_agent_actor_ut.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -60,6 +61,60 @@ TFsPath TryGetRamDrivePath() //////////////////////////////////////////////////////////////////////////////// +struct TTestNvmeManager + : NNvme::INvmeManager +{ + THashMap PathToSerial; + + explicit TTestNvmeManager( + const TVector>& pathToSerial) + : PathToSerial{pathToSerial.cbegin(), pathToSerial.cend()} + { + } + + TFuture Format( + const TString& path, + NNvme::nvme_secure_erase_setting ses) override + { + Y_UNUSED(path); + Y_UNUSED(ses); + + return MakeFuture(); + } + + TFuture Deallocate( + const TString& path, + ui64 offsetBytes, + ui64 sizeBytes) override + { + Y_UNUSED(path); + Y_UNUSED(offsetBytes); + Y_UNUSED(sizeBytes); + + return MakeFuture(); + } + + TResultOrError GetSerialNumber(const TString& path) override + { + const auto filename = TFsPath{path}.Basename(); + auto it = PathToSerial.find(filename); + if (it == PathToSerial.end()) { + return MakeError(MAKE_SYSTEM_ERROR(42), filename); + } + + return it->second; + } + + TResultOrError IsSsd(const TString& path) override + { + Y_UNUSED(path); + + return true; + } +}; + +//////////////////////////////////////////////////////////////////////////////// + struct THttpGetRequest: NMonitoring::IHttpRequest { TCgiParameters CgiParameters; @@ -114,10 +169,12 @@ struct TFixture const TTempDir TempDir; const TString CachedSessionsPath = TempDir.Path() / "nbs-disk-agent-sessions.txt"; + const TString CachedConfigPath = TempDir.Path() / "nbs-disk-agent.txt"; const TDuration ReleaseInactiveSessionsTimeout = 10s; TTestBasicRuntime Runtime; - NMonitoring::TDynamicCountersPtr Counters = MakeIntrusive(); + NMonitoring::TDynamicCountersPtr Counters = + MakeIntrusive(); TVector Files; @@ -142,10 +199,12 @@ struct TFixture pool.SetMaxSize(DefaultFileSize); config.SetCachedSessionsPath(CachedSessionsPath); + config.SetCachedConfigPath(CachedConfigPath); config.SetBackend(NProto::DISK_AGENT_BACKEND_AIO); config.SetAcquireRequired(true); config.SetReleaseInactiveSessionsTimeout( ReleaseInactiveSessionsTimeout.MilliSeconds()); + config.SetDisableBrokenDevices(true); return config; } @@ -1054,7 +1113,7 @@ Y_UNIT_TEST_SUITE(TDiskAgentTest) // Return all stolen request - write/zero request will be completed // after this - for (auto& request : stolenWriteCompletedRequests) { + for (auto& request: stolenWriteCompletedRequests) { runtime.Send(request.release()); } @@ -1257,7 +1316,7 @@ Y_UNIT_TEST_SUITE(TDiskAgentTest) // Return all stolen request - write/zero request will be completed // after this - for (auto& request : stolenWriteCompletedRequests) { + for (auto& request: stolenWriteCompletedRequests) { runtime.Send(request.release()); } { @@ -4560,6 +4619,176 @@ Y_UNIT_TEST_SUITE(TDiskAgentTest) UNIT_ASSERT_VALUES_EQUAL(1, parsedZeroes); UNIT_ASSERT_VALUES_EQUAL(5, actors.size()); } + + Y_UNIT_TEST_F(ShouldDisableDevicesAfterRegistration, TFixture) + { + const TString uuids[] { + "79955ae90189fe8a89ab832a8b0cb57d", // NVMENBS01 + "657dabaf3d224c9177b00c437716dfb1", // NVMENBS02 + "e85cd1d217c3239507fc0cd180a075fd", // NVMENBS03 + "5ea2fcdce0a180a63db2b5f6a5b34221" // NVMENBS04 + }; + + TVector> pathToSerial{ + {"NVMENBS01", "W"}, + {"NVMENBS02", "X"}, + {"NVMENBS03", "Y"}, + {"NVMENBS04", "Z"}, + }; + + // build the config cache + { + NProto::TDiskAgentConfig config; + + size_t i = 0; + for (const auto& [filename, sn]: pathToSerial) { + auto& file = *config.AddFileDevices(); + file.SetPath(TempDir.Path() / filename); + file.SetSerialNumber(sn); + file.SetDeviceId(uuids[i]); + file.SetBlockSize(4_KB); + + ++i; + } + + auto error = SaveDiskAgentConfig(CachedConfigPath, config); + UNIT_ASSERT_C(!HasError(error), FormatError(error)); + } + + // change serial numbers of NVMENBS02 & NVMENBS04 + pathToSerial[1].second = "new-X"; // NVMENBS02 + pathToSerial[3].second = "new-Z"; // NVMENBS04 + + auto diskregistryState = MakeIntrusive(); + + // Disable only NVMENBS02 + diskregistryState->DisabledDevices = {uuids[1]}; + + // Postpone the registration in DR + auto oldEventFilterFn = Runtime.SetEventFilter( + [](auto& runtime, TAutoPtr& event) + { + if (event->GetTypeRewrite() == + TEvDiskRegistry::EvRegisterAgentRequest) + { + auto response = std::make_unique< + TEvDiskRegistry::TEvRegisterAgentResponse>( + MakeError(E_REJECTED)); + + runtime.Send(new NActors::IEventHandle( + event->Sender, + event->GetRecipientRewrite(), + response.release())); + + return true; + } + + return false; + }); + + auto env = TTestEnvBuilder(Runtime) + .With(CreateDiskAgentConfig()) + .With(std::make_shared(pathToSerial)) + .With(diskregistryState) + .Build(); + + Runtime.UpdateCurrentTime(Now()); + + TDiskAgentClient diskAgent(Runtime); + diskAgent.WaitReady(); + + diskAgent.AcquireDevices( + TVector{ + uuids[0], // NVMENBS01 + uuids[1], // NVMENBS02 + uuids[3], // NVMENBS04 + }, + "reader-1", + NProto::VOLUME_ACCESS_READ_ONLY, + -1, // MountSeqNumber + "vol0", + 1000); // VolumeGeneration + + // NVMENBS01 is OK + { + auto error = Read(diskAgent, "reader-1", uuids[0]); + UNIT_ASSERT_EQUAL_C(S_OK, error.GetCode(), error); + } + + // Before the registration in DR NVMENBS02 & NVMENBS04 should be + // suspended + { + auto error = Read(diskAgent, "reader-1", uuids[1]); + UNIT_ASSERT_EQUAL_C(E_REJECTED, error.GetCode(), error); + UNIT_ASSERT_C( + error.GetMessage().Contains("Device disabled"), + error); + } + + { + auto error = Read(diskAgent, "reader-1", uuids[3]); + UNIT_ASSERT_EQUAL_C(E_REJECTED, error.GetCode(), error); + UNIT_ASSERT_C( + error.GetMessage().Contains("Device disabled"), + error); + } + + // check the config cache + { + auto [config, error] = LoadDiskAgentConfig(CachedConfigPath); + UNIT_ASSERT_EQUAL_C(S_OK, error.GetCode(), error); + UNIT_ASSERT_VALUES_EQUAL(4, config.FileDevicesSize()); + UNIT_ASSERT_VALUES_EQUAL(2, config.DevicesWithSuspendedIOSize()); + + Sort(*config.MutableDevicesWithSuspendedIO()); + + UNIT_ASSERT_VALUES_EQUAL( + uuids[3], // NVMENBS04 + config.GetDevicesWithSuspendedIO(0)); + + UNIT_ASSERT_VALUES_EQUAL( + uuids[1], // NVMENBS02 + config.GetDevicesWithSuspendedIO(1)); + } + + Runtime.SetEventFilter(oldEventFilterFn); + Runtime.AdvanceCurrentTime(5s); + Runtime.DispatchEvents(TDispatchOptions{ + .FinalEvents = {TDispatchOptions::TFinalEventCondition( + TEvDiskRegistry::EvRegisterAgentResponse)}}); + + // NVMENBS01 is OK + { + auto error = Read(diskAgent, "reader-1", uuids[0]); + UNIT_ASSERT_EQUAL_C(S_OK, error.GetCode(), error); + } + + // After the registartion in DR NVMENBS02 should be disabled + { + auto error = Read(diskAgent, "reader-1", uuids[1]); + UNIT_ASSERT_EQUAL_C(E_IO, error.GetCode(), error); + UNIT_ASSERT_C( + error.GetMessage().Contains("Device disabled"), + error); + } + + // NVMENBS04 should be OK + { + auto error = Read(diskAgent, "reader-1", uuids[3]); + UNIT_ASSERT_EQUAL_C(S_OK, error.GetCode(), error); + } + + // check the config cache + { + auto [config, error] = LoadDiskAgentConfig(CachedConfigPath); + UNIT_ASSERT_EQUAL_C(S_OK, error.GetCode(), error); + UNIT_ASSERT_VALUES_EQUAL(4, config.FileDevicesSize()); + UNIT_ASSERT_VALUES_EQUAL(1, config.DevicesWithSuspendedIOSize()); + UNIT_ASSERT_VALUES_EQUAL( + uuids[1], // NVMENBS02 + config.GetDevicesWithSuspendedIO(0)); + } + } } } // namespace NCloud::NBlockStore::NStorage diff --git a/cloud/blockstore/libs/storage/disk_agent/disk_agent_private.h b/cloud/blockstore/libs/storage/disk_agent/disk_agent_private.h index 39f5648bbe6..bc87ed7a09b 100644 --- a/cloud/blockstore/libs/storage/disk_agent/disk_agent_private.h +++ b/cloud/blockstore/libs/storage/disk_agent/disk_agent_private.h @@ -61,7 +61,9 @@ struct TEvDiskAgentPrivate {}; struct TRegisterAgentResponse - {}; + { + TVector DevicesToSuspendIO; + }; // // CollectStats diff --git a/cloud/blockstore/libs/storage/disk_agent/disk_agent_state.cpp b/cloud/blockstore/libs/storage/disk_agent/disk_agent_state.cpp index 32f06830225..c5d6d046bc3 100644 --- a/cloud/blockstore/libs/storage/disk_agent/disk_agent_state.cpp +++ b/cloud/blockstore/libs/storage/disk_agent/disk_agent_state.cpp @@ -283,7 +283,7 @@ const TDiskAgentState::TDeviceState& TDiskAgentState::GetDeviceStateImpl( auto error = DeviceClient->AccessDevice(uuid, clientId, accessMode); if (HasError(error)) { - ythrow TServiceError(error); + ythrow TServiceError(error.GetCode()) << error.GetMessage(); } auto d = Devices.FindPtr(uuid); @@ -449,9 +449,11 @@ TFuture TDiskAgentState::Initialize() { auto future = Spdk ? InitSpdkStorage() : InitAioStorage(); - return future.Subscribe( - [this](auto) mutable + return future.Apply( + [this](TFuture future) mutable { + TInitializeResult r = future.ExtractValue(); + TVector uuids(Reserve(Devices.size())); for (const auto& x: Devices) { uuids.push_back(x.first); @@ -464,7 +466,15 @@ TFuture TDiskAgentState::Initialize() InitRdmaTarget(); + if (AgentConfig->GetDisableBrokenDevices()) { + for (const auto& uuid: r.DevicesWithSuspendedIO) { + SuspendDevice(uuid); + } + } + RestoreSessions(*DeviceClient); + + return r; }); } @@ -520,10 +530,38 @@ TFuture TDiskAgentState::CollectStats() return context->Promise; } +void TDiskAgentState::CheckIfDeviceIsDisabled( + const TString& uuid, + const TString& clientId) +{ + auto ec = DeviceClient->GetDeviceIOErrorCode(uuid); + if (!ec) { + return; + } + + if (GetErrorKind(MakeError(*ec)) != EErrorKind::ErrorRetriable) { + STORAGE_ERROR( + "[" << uuid << "/" << clientId + << "] Device disabled. Drop request."); + + ReportDisabledDeviceError(uuid); + } else { + STORAGE_TRACE( + "[" << uuid << "/" << clientId + << "] Device suspended. Reject request."); + } + + ythrow TServiceError(*ec) << "Device disabled"; +} + TFuture TDiskAgentState::Read( TInstant now, NProto::TReadDeviceBlocksRequest request) { + CheckIfDeviceIsDisabled( + request.GetDeviceUUID(), + request.GetHeaders().GetClientId()); + const auto& device = GetDeviceState( request.GetDeviceUUID(), request.GetHeaders().GetClientId(), @@ -581,6 +619,10 @@ TFuture TDiskAgentState::Write( TInstant now, NProto::TWriteDeviceBlocksRequest request) { + CheckIfDeviceIsDisabled( + request.GetDeviceUUID(), + request.GetHeaders().GetClientId()); + const auto& device = GetDeviceState( request.GetDeviceUUID(), request.GetHeaders().GetClientId(), @@ -621,6 +663,10 @@ TFuture TDiskAgentState::WriteZeroes( TInstant now, NProto::TZeroDeviceBlocksRequest request) { + CheckIfDeviceIsDisabled( + request.GetDeviceUUID(), + request.GetHeaders().GetClientId()); + const auto& device = GetDeviceState( request.GetDeviceUUID(), request.GetHeaders().GetClientId(), @@ -803,10 +849,9 @@ void TDiskAgentState::ReleaseDevices( ui32 volumeGeneration) { if (PartiallySuspended) { - ythrow TServiceError(MakeError( - E_REJECTED, - TStringBuilder() << "Disk agent is partially suspended. Can't " - "release any sessions at this state.")); + ythrow TServiceError(E_REJECTED) + << "Disk agent is partially suspended. Can't " + "release any sessions at this state."; } CheckError(DeviceClient->ReleaseDevices( @@ -816,10 +861,48 @@ void TDiskAgentState::ReleaseDevices( volumeGeneration)); } +void TDiskAgentState::UpdateDevicesWithSuspendedIO( + const TVector& devicesToSuspendIO) +{ + if (!AgentConfig->GetDisableBrokenDevices()) { + return; + } + + const THashSet uuids( + devicesToSuspendIO.cbegin(), + devicesToSuspendIO.cend()); + + for (const auto& [uuid, _]: Devices) { + if (uuids.contains(uuid)) { + STORAGE_INFO("Disable device " << uuid); + DeviceClient->DisableDevice(uuid); + } else { + DeviceClient->EnableDevice(uuid); + } + } + + const TString storagePath = StorageConfig->GetCachedDiskAgentConfigPath(); + const TString diskAgentPath = AgentConfig->GetCachedConfigPath(); + + auto error = NStorage::UpdateDevicesWithSuspendedIO( + diskAgentPath.empty() ? storagePath : diskAgentPath, + devicesToSuspendIO); + if (HasError(error)) { + STORAGE_ERROR( + "Can't update DevicesWithSuspendedIO in the config cache file: " + << FormatError(error)); + } +} + void TDiskAgentState::DisableDevice(const TString& uuid) { DeviceClient->DisableDevice(uuid); - ReportDisabledDeviceError(uuid); +} + +void TDiskAgentState::SuspendDevice(const TString& uuid) +{ + STORAGE_INFO("Suspend device " << uuid); + DeviceClient->SuspendDevice(uuid); } void TDiskAgentState::EnableDevice(const TString& uuid) @@ -832,7 +915,8 @@ bool TDiskAgentState::IsDeviceDisabled(const TString& uuid) const return DeviceClient->IsDeviceDisabled(uuid); } -void TDiskAgentState::ReportDisabledDeviceError(const TString& uuid) { +void TDiskAgentState::ReportDisabledDeviceError(const TString& uuid) +{ if (auto* d = Devices.FindPtr(uuid)) { d->Stats->OnError(); } @@ -936,12 +1020,11 @@ void TDiskAgentState::EnsureAccessToDevices( for (const TString& uuid: uuids) { auto error = DeviceClient->AccessDevice(uuid, clientId, accessMode); if (HasError(error)) { - ythrow TServiceError(MakeError( - E_REJECTED, - TStringBuilder() << "Disk agent is partially suspended. " - "Can't acquire previously not acquired " - "devices. Access returned an error: " - << FormatError(error))); + ythrow TServiceError(E_REJECTED) + << "Disk agent is partially suspended. " + "Can't acquire previously not acquired " + "devices. Access returned an error: " + << FormatError(error); } } } diff --git a/cloud/blockstore/libs/storage/disk_agent/disk_agent_state.h b/cloud/blockstore/libs/storage/disk_agent/disk_agent_state.h index 7e4495ebcf3..4ee4ab2329e 100644 --- a/cloud/blockstore/libs/storage/disk_agent/disk_agent_state.h +++ b/cloud/blockstore/libs/storage/disk_agent/disk_agent_state.h @@ -144,7 +144,11 @@ class TDiskAgentState TVector GetSessions() const; + void UpdateDevicesWithSuspendedIO( + const TVector& devicesToSuspendIO); + void DisableDevice(const TString& uuid); + void SuspendDevice(const TString& uuid); void EnableDevice(const TString& uuid); bool IsDeviceDisabled(const TString& uuid) const; void ReportDisabledDeviceError(const TString& uuid); @@ -185,6 +189,8 @@ class TDiskAgentState void InitRdmaTarget(); void RestoreSessions(TDeviceClient& client) const; + + void CheckIfDeviceIsDisabled(const TString& uuid, const TString& clientId); }; } // namespace NCloud::NBlockStore::NStorage diff --git a/cloud/blockstore/libs/storage/disk_agent/disk_agent_state_ut.cpp b/cloud/blockstore/libs/storage/disk_agent/disk_agent_state_ut.cpp index 2486186853b..8c0aa0c4782 100644 --- a/cloud/blockstore/libs/storage/disk_agent/disk_agent_state_ut.cpp +++ b/cloud/blockstore/libs/storage/disk_agent/disk_agent_state_ut.cpp @@ -1855,6 +1855,170 @@ Y_UNIT_TEST_SUITE(TDiskAgentStateTest) UNIT_ASSERT_VALUES_EQUAL_C(S_OK, error.GetCode(), error); } } + + Y_UNIT_TEST_F(ShouldDisableDevice, TFiles) + { + auto state = CreateDiskAgentStateNull( + CreateNullConfig({ .Files = Nvme3s, .AcquireRequired = true }) + ); + + auto read = [&] + { + NProto::TReadDeviceBlocksRequest request; + request.SetDeviceUUID("uuid-1"); + request.SetStartIndex(1); + request.SetBlockSize(4096); + request.SetBlocksCount(10); + request.MutableHeaders()->SetClientId("writer-1"); + + return state->Read(Now(), std::move(request)) + .GetValueSync() + .GetError(); + }; + + auto write = [&] + { + NProto::TWriteDeviceBlocksRequest request; + request.SetDeviceUUID("uuid-1"); + request.SetStartIndex(1); + request.SetBlockSize(4096); + request.MutableHeaders()->SetClientId("writer-1"); + + ResizeIOVector(*request.MutableBlocks(), 10, 4096); + + return state->Write(Now(), std::move(request)) + .GetValueSync() + .GetError(); + }; + + auto zero = [&] + { + NProto::TZeroDeviceBlocksRequest request; + request.SetDeviceUUID("uuid-1"); + request.SetStartIndex(1); + request.SetBlockSize(4096); + request.SetBlocksCount(10); + request.MutableHeaders()->SetClientId("writer-1"); + + return state->WriteZeroes(Now(), std::move(request)) + .GetValueSync() + .GetError(); + }; + + auto future = state->Initialize(); + const auto& r = future.GetValueSync(); + + UNIT_ASSERT(r.Errors.empty()); + UNIT_ASSERT_VALUES_EQUAL(3, r.Configs.size()); + + state->AcquireDevices( + {"uuid-1"}, + "writer-1", + TInstant::FromValue(1), + NProto::VOLUME_ACCESS_READ_WRITE, + 42, // MountSeqNumber + "vol0", + 1000); // VolumeGeneration + + state->DisableDevice("uuid-1"); + + auto stats = state->CollectStats().GetValueSync(); + UNIT_ASSERT_VALUES_EQUAL(3, stats.DeviceStatsSize()); + UNIT_ASSERT_VALUES_EQUAL(0, stats.GetDeviceStats(0).GetErrors()); + UNIT_ASSERT_VALUES_EQUAL(0, stats.GetDeviceStats(1).GetErrors()); + UNIT_ASSERT_VALUES_EQUAL(0, stats.GetDeviceStats(2).GetErrors()); + + UNIT_ASSERT_EXCEPTION_SATISFIES( + read(), + TServiceError, + [](auto& e) { return e.GetCode() == E_IO; }); + + UNIT_ASSERT_EXCEPTION_SATISFIES( + write(), + TServiceError, + [](auto& e) { return e.GetCode() == E_IO; }); + + UNIT_ASSERT_EXCEPTION_SATISFIES( + zero(), + TServiceError, + [](auto& e) { return e.GetCode() == E_IO; }); + + stats = state->CollectStats().GetValueSync(); + SortBy( + *stats.MutableDeviceStats(), + [](const auto& s) { return s.GetDeviceUUID(); }); + + UNIT_ASSERT_VALUES_EQUAL(3, stats.DeviceStatsSize()); + UNIT_ASSERT_VALUES_EQUAL(3, stats.GetDeviceStats(0).GetErrors()); // uuid-1 + UNIT_ASSERT_VALUES_EQUAL(0, stats.GetDeviceStats(1).GetErrors()); + UNIT_ASSERT_VALUES_EQUAL(0, stats.GetDeviceStats(2).GetErrors()); + + state->SuspendDevice("uuid-1"); + + UNIT_ASSERT_EXCEPTION_SATISFIES( + read(), + TServiceError, + [](auto& e) { return e.GetCode() == E_REJECTED; }); + + UNIT_ASSERT_EXCEPTION_SATISFIES( + write(), + TServiceError, + [](auto& e) { return e.GetCode() == E_REJECTED; }); + + UNIT_ASSERT_EXCEPTION_SATISFIES( + zero(), + TServiceError, + [](auto& e) { return e.GetCode() == E_REJECTED; }); + + stats = state->CollectStats().GetValueSync(); + SortBy( + *stats.MutableDeviceStats(), + [](const auto& s) { return s.GetDeviceUUID(); }); + + UNIT_ASSERT_VALUES_EQUAL(3, stats.DeviceStatsSize()); + UNIT_ASSERT_VALUES_EQUAL(3, stats.GetDeviceStats(0).GetErrors()); // uuid-1 + UNIT_ASSERT_VALUES_EQUAL(0, stats.GetDeviceStats(1).GetErrors()); + UNIT_ASSERT_VALUES_EQUAL(0, stats.GetDeviceStats(2).GetErrors()); + + state->EnableDevice("uuid-1"); + + { + auto error = read(); + + UNIT_ASSERT_VALUES_EQUAL_C( + S_OK, + error.GetCode(), + error.GetMessage()); + } + + { + auto error = write(); + + UNIT_ASSERT_VALUES_EQUAL_C( + S_OK, + error.GetCode(), + error.GetMessage()); + } + + { + auto error = zero(); + + UNIT_ASSERT_VALUES_EQUAL_C( + S_OK, + error.GetCode(), + error.GetMessage()); + } + + stats = state->CollectStats().GetValueSync(); + SortBy( + *stats.MutableDeviceStats(), + [](const auto& s) { return s.GetDeviceUUID(); }); + + UNIT_ASSERT_VALUES_EQUAL(3, stats.DeviceStatsSize()); + UNIT_ASSERT_VALUES_EQUAL(3, stats.GetDeviceStats(0).GetErrors()); // uuid-1 + UNIT_ASSERT_VALUES_EQUAL(0, stats.GetDeviceStats(1).GetErrors()); + UNIT_ASSERT_VALUES_EQUAL(0, stats.GetDeviceStats(2).GetErrors()); + } } } // namespace NCloud::NBlockStore::NStorage diff --git a/cloud/blockstore/libs/storage/disk_agent/model/config.cpp b/cloud/blockstore/libs/storage/disk_agent/model/config.cpp index db57b89a4ad..270a9516cca 100644 --- a/cloud/blockstore/libs/storage/disk_agent/model/config.cpp +++ b/cloud/blockstore/libs/storage/disk_agent/model/config.cpp @@ -48,6 +48,7 @@ namespace { xxx(DisableNodeBrokerRegistrationOnDevicelessAgent, bool, false )\ xxx(MaxAIOContextEvents, ui32, 1024 )\ xxx(PathsPerFileIOService, ui32, 0 )\ + xxx(DisableBrokenDevices, bool, 0 )\ // BLOCKSTORE_AGENT_CONFIG #define BLOCKSTORE_DECLARE_CONFIG(name, type, value) \ @@ -223,4 +224,18 @@ NProto::TError SaveDiskAgentConfig( return {}; } +[[nodiscard]] auto UpdateDevicesWithSuspendedIO( + const TString& path, + const TVector& uuids) -> NProto::TError +{ + auto [config, error] = LoadDiskAgentConfig(path); + if (HasError(error)) { + return error; + } + + config.MutableDevicesWithSuspendedIO()->Assign(uuids.begin(), uuids.end()); + + return SaveDiskAgentConfig(path, config); +} + } // namespace NCloud::NBlockStore::NStorage diff --git a/cloud/blockstore/libs/storage/disk_agent/model/config.h b/cloud/blockstore/libs/storage/disk_agent/model/config.h index ba0de9e5126..041ef7fecbb 100644 --- a/cloud/blockstore/libs/storage/disk_agent/model/config.h +++ b/cloud/blockstore/libs/storage/disk_agent/model/config.h @@ -108,6 +108,7 @@ class TDiskAgentConfig bool GetDisableNodeBrokerRegistrationOnDevicelessAgent() const; ui32 GetMaxAIOContextEvents() const; ui32 GetPathsPerFileIOService() const; + bool GetDisableBrokenDevices() const; const auto& GetDevicesWithSuspendedIO() const { @@ -123,6 +124,10 @@ class TDiskAgentConfig [[nodiscard]] auto LoadDiskAgentConfig( const TString& path) -> TResultOrError; +[[nodiscard]] auto UpdateDevicesWithSuspendedIO( + const TString& path, + const TVector& uuids) -> NProto::TError; + [[nodiscard]] auto SaveDiskAgentConfig( const TString& path, const NProto::TDiskAgentConfig& proto) -> NProto::TError; diff --git a/cloud/blockstore/libs/storage/disk_agent/model/device_client.cpp b/cloud/blockstore/libs/storage/disk_agent/model/device_client.cpp index 485dd67bcb3..8240c909828 100644 --- a/cloud/blockstore/libs/storage/disk_agent/model/device_client.cpp +++ b/cloud/blockstore/libs/storage/disk_agent/model/device_client.cpp @@ -348,28 +348,44 @@ TVector TDeviceClient::GetReaderSessions( } void TDeviceClient::DisableDevice(const TString& uuid) const +{ + SetDeviceIOErrorCode(uuid, E_IO); +} + +void TDeviceClient::SuspendDevice(const TString& uuid) const +{ + SetDeviceIOErrorCode(uuid, E_REJECTED); +} + +void TDeviceClient::SetDeviceIOErrorCode( + const TString& uuid, + std::optional errorCode) const { if (auto* deviceState = GetDeviceState(uuid)) { TWriteGuard g(deviceState->Lock); - deviceState->Disabled = true; + deviceState->IOErrorCode = errorCode; } } void TDeviceClient::EnableDevice(const TString& uuid) const { - if (auto* deviceState = GetDeviceState(uuid)) { - TWriteGuard g(deviceState->Lock); - deviceState->Disabled = false; - } + SetDeviceIOErrorCode(uuid, std::nullopt); } -bool TDeviceClient::IsDeviceDisabled(const TString& uuid) const +std::optional TDeviceClient::GetDeviceIOErrorCode( + const TString& uuid) const { if (auto* deviceState = GetDeviceState(uuid)) { TReadGuard g(deviceState->Lock); - return deviceState->Disabled; + return deviceState->IOErrorCode; } - return false; + + return {}; +} + +bool TDeviceClient::IsDeviceDisabled(const TString& uuid) const +{ + return GetDeviceIOErrorCode(uuid).has_value(); } // static @@ -377,7 +393,7 @@ TDeviceClient::TDevicesState TDeviceClient::MakeDevices(TVector uuids) { TDevicesState result; for (auto& uuid: uuids) { - result[std::move(uuid)] = std::make_unique(); + result.emplace(std::move(uuid), std::make_unique()); } return result; } diff --git a/cloud/blockstore/libs/storage/disk_agent/model/device_client.h b/cloud/blockstore/libs/storage/disk_agent/model/device_client.h index e88fb141eda..1237928ebe1 100644 --- a/cloud/blockstore/libs/storage/disk_agent/model/device_client.h +++ b/cloud/blockstore/libs/storage/disk_agent/model/device_client.h @@ -34,7 +34,11 @@ class TDeviceClient final ui32 VolumeGeneration = 0; TSessionInfo WriterSession; TVector ReaderSessions; - bool Disabled = false; + + // If the value is set, it is returned as the result of IO + // operations and the device is considered to be disabled. + std::optional IOErrorCode; + TRWMutex Lock; }; using TDevicesState = THashMap>; @@ -80,15 +84,26 @@ class TDeviceClient final TSessionInfo GetWriterSession(const TString& uuid) const; TVector GetReaderSessions(const TString& uuid) const; + // Return E_IO error on I/O operations. void DisableDevice(const TString& uuid) const; + + // Same as DisableDevice but return E_REJECTED. + void SuspendDevice(const TString& uuid) const; + void EnableDevice(const TString& uuid) const; + bool IsDeviceDisabled(const TString& uuid) const; + std::optional GetDeviceIOErrorCode(const TString& uuid) const; TVector GetSessions() const; private: static TDevicesState MakeDevices(TVector uuids); [[nodiscard]] TDeviceState* GetDeviceState(const TString& uuid) const; + + void SetDeviceIOErrorCode( + const TString& uuid, + std::optional errorCode) const; }; } // namespace NCloud::NBlockStore::NStorage diff --git a/cloud/blockstore/libs/storage/disk_agent/rdma_target.cpp b/cloud/blockstore/libs/storage/disk_agent/rdma_target.cpp index 2be9327d053..9730a314318 100644 --- a/cloud/blockstore/libs/storage/disk_agent/rdma_target.cpp +++ b/cloud/blockstore/libs/storage/disk_agent/rdma_target.cpp @@ -293,31 +293,49 @@ class TRequestHandler final } } - TStorageAdapterPtr GetDevice( + void CheckIfDeviceIsDisabled( const TString& uuid, - const TString& clientId, - NProto::EVolumeAccessMode accessMode) const + const TString& clientId) const { - if (DeviceClient->IsDeviceDisabled(uuid)) { + auto ec = DeviceClient->GetDeviceIOErrorCode(uuid); + if (!ec) { + return; + } + + if (GetErrorKind(MakeError(*ec)) != EErrorKind::ErrorRetriable) { STORAGE_ERROR_T( LogThrottler, - "[" << uuid << "/" << clientId << "] Device disabled. Drop request."); + "[" << uuid << "/" << clientId + << "] Device disabled. Drop request."); - if (auto* deviceData = Devices.FindPtr(uuid)) { + if (const auto* deviceData = Devices.FindPtr(uuid)) { deviceData->Device->ReportIOError(); } - ythrow TServiceError(MakeError(E_IO, "Device disabled")); + } else { + STORAGE_TRACE_T( + LogThrottler, + "[" << uuid << "/" << clientId + << "] Device suspended. Reject request."); } + ythrow TServiceError(*ec) << "Device disabled"; + } + + TStorageAdapterPtr GetDevice( + const TString& uuid, + const TString& clientId, + NProto::EVolumeAccessMode accessMode) const + { + CheckIfDeviceIsDisabled(uuid, clientId); + NProto::TError error = DeviceClient->AccessDevice(uuid, clientId, accessMode); if (HasError(error)) { - ythrow TServiceError(error); + ythrow TServiceError(error.GetCode()) << error.GetMessage(); } auto it = Devices.find(uuid); - if (it == Devices.cend()) { ythrow TServiceError(E_NOT_FOUND); } @@ -333,16 +351,16 @@ class TRequestHandler final ythrow TServiceError(E_NOT_FOUND); } - template + template void SubscribeForResponse( - Future future, + TFuture future, TRequestDetails requestDetails, THandleResponseMethod handleResponseMethod) const { auto handleResponse = [self = shared_from_this(), requestDetails = std::move(requestDetails), - handleResponseMethod = handleResponseMethod](Future future) mutable + handleResponseMethod = handleResponseMethod](TFuture future) mutable { self->TaskQueue->ExecuteSimple( [self = self, diff --git a/cloud/blockstore/libs/storage/disk_agent/rdma_target.h b/cloud/blockstore/libs/storage/disk_agent/rdma_target.h index 9160024fc35..c43c5363cee 100644 --- a/cloud/blockstore/libs/storage/disk_agent/rdma_target.h +++ b/cloud/blockstore/libs/storage/disk_agent/rdma_target.h @@ -65,7 +65,7 @@ using IRdmaTargetPtr = std::shared_ptr; IRdmaTargetPtr CreateRdmaTarget( TRdmaTargetConfigPtr rdmaTargetConfig, - TOldRequestCounters OldRequestCounters, + TOldRequestCounters oldRequestCounters, ILoggingServicePtr logging, NRdma::IServerPtr server, TDeviceClientPtr deviceClient, diff --git a/cloud/blockstore/libs/storage/disk_agent/rdma_target_ut.cpp b/cloud/blockstore/libs/storage/disk_agent/rdma_target_ut.cpp index 965fbfae2fc..d62059ad8f9 100644 --- a/cloud/blockstore/libs/storage/disk_agent/rdma_target_ut.cpp +++ b/cloud/blockstore/libs/storage/disk_agent/rdma_target_ut.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include @@ -324,7 +325,6 @@ Y_UNIT_TEST_SUITE(TRdmaTargetTest) response.GetError().GetCode(), response.GetError().GetMessage()); } - } Y_UNIT_TEST(ShouldRejectSecureEraseDuringIo) @@ -356,6 +356,38 @@ Y_UNIT_TEST_SUITE(TRdmaTargetTest) UNIT_ASSERT_C(!HasError(error), error); } + Y_UNIT_TEST(ShouldDisableDevice) + { + TRdmaTestEnvironment env(8_MB, 2); + + const auto blockRange = TBlockRange64::WithLength(0, 1024); + + env.DeviceClient->DisableDevice(env.Device_1); + + { + auto responseFuture = + env.Run(env.MakeWriteRequest(blockRange, 'A', 100)); + const auto& response = responseFuture.GetValueSync(); + UNIT_ASSERT_VALUES_EQUAL_C( + E_IO, + response.GetError().GetCode(), + response.GetError().GetMessage()); + } + + env.DeviceClient->SuspendDevice(env.Device_1); + + { + auto responseFuture = + env.Run(env.MakeWriteRequest(blockRange, 'A', 100)); + const auto& response = responseFuture.GetValueSync(); + UNIT_ASSERT_VALUES_EQUAL_C( + E_REJECTED, + response.GetError().GetCode(), + response.GetError().GetMessage()); + } + + // TODO: check counters (Error/Fatal) + } } } // namespace NCloud::NBlockStore::NStorage diff --git a/cloud/blockstore/libs/storage/disk_agent/storage_initializer.cpp b/cloud/blockstore/libs/storage/disk_agent/storage_initializer.cpp index 7e77faba08c..b690234fca8 100644 --- a/cloud/blockstore/libs/storage/disk_agent/storage_initializer.cpp +++ b/cloud/blockstore/libs/storage/disk_agent/storage_initializer.cpp @@ -399,9 +399,12 @@ void TInitializer::SaveCurrentConfig() proto.MutableFileDevices()->Assign( FileDevices.cbegin(), FileDevices.cend()); - proto.MutableDevicesWithSuspendedIO()->Assign( - DevicesWithSuspendedIO.cbegin(), - DevicesWithSuspendedIO.cend()); + + if (AgentConfig->GetDisableBrokenDevices()) { + proto.MutableDevicesWithSuspendedIO()->Assign( + DevicesWithSuspendedIO.cbegin(), + DevicesWithSuspendedIO.cend()); + } auto error = SaveDiskAgentConfig(path, proto); if (HasError(error)) { @@ -480,6 +483,9 @@ NProto::TError TInitializer::ProcessConfigCache() TVector devices{ std::make_move_iterator(config.MutableFileDevices()->begin()), std::make_move_iterator(config.MutableFileDevices()->end())}; + SortBy(devices, [] (const auto& d) { + return d.GetDeviceId(); + }); for (const auto& d: devices) { const auto currentSerialNumber = GetSerialNumber(d.GetPath()); diff --git a/cloud/blockstore/libs/storage/disk_agent/storage_initializer_ut.cpp b/cloud/blockstore/libs/storage/disk_agent/storage_initializer_ut.cpp index c84b0bfbf18..2af4576cc00 100644 --- a/cloud/blockstore/libs/storage/disk_agent/storage_initializer_ut.cpp +++ b/cloud/blockstore/libs/storage/disk_agent/storage_initializer_ut.cpp @@ -134,6 +134,7 @@ struct TFixture DefaultConfig.SetCachedConfigPath(CachedConfigPath / "config.txt"); DefaultConfig.SetBackend(NProto::DISK_AGENT_BACKEND_AIO); DefaultConfig.SetAcquireRequired(true); + DefaultConfig.SetDisableBrokenDevices(true); SetUpStorageDiscoveryConfig(); SetUpStorage(); diff --git a/cloud/blockstore/libs/storage/disk_agent/testlib/test_env.cpp b/cloud/blockstore/libs/storage/disk_agent/testlib/test_env.cpp index 5d4b9e08e12..304e71efc8a 100644 --- a/cloud/blockstore/libs/storage/disk_agent/testlib/test_env.cpp +++ b/cloud/blockstore/libs/storage/disk_agent/testlib/test_env.cpp @@ -72,10 +72,14 @@ void TDiskRegistryMock::HandleRegisterAgent( State->Devices[device.GetDeviceName()].CopyFrom(device); } - NCloud::Reply( - ctx, - *ev, - std::make_unique()); + auto response = + std::make_unique(); + + response->Record.MutableDevicesToSuspendIO()->Assign( + State->DisabledDevices.cbegin(), + State->DisabledDevices.cend()); + + NCloud::Reply(ctx, *ev, std::move(response)); } void TDiskRegistryMock::HandleUpdateAgentStats( @@ -293,6 +297,12 @@ TTestEnvBuilder& TTestEnvBuilder::With(NProto::TStorageServiceConfig storageServ return *this; } +TTestEnvBuilder& TTestEnvBuilder::With(TDiskRegistryState::TPtr diskRegistryState) +{ + DiskRegistryState = std::move(diskRegistryState); + return *this; +} + TTestEnv TTestEnvBuilder::Build() { Runtime.AppendToLogSettings( @@ -314,11 +324,16 @@ TTestEnv TTestEnvBuilder::Build() runtime.EnableScheduleForActor(actorId); }); - auto diskRegistryState = MakeIntrusive(); + if (!DiskRegistryState) { + DiskRegistryState = MakeIntrusive(); + } Runtime.AddLocalService( MakeDiskRegistryProxyServiceId(), - TActorSetupCmd(new TDiskRegistryMock(diskRegistryState), TMailboxType::Simple, 0)); + TActorSetupCmd( + new TDiskRegistryMock(DiskRegistryState), + TMailboxType::Simple, + 0)); auto allocator = CreateCachingAllocator( TDefaultAllocator::Instance(), 0, 0, 0); @@ -383,7 +398,7 @@ TTestEnv TTestEnvBuilder::Build() return TTestEnv{ .Runtime = Runtime, - .DiskRegistryState = diskRegistryState, + .DiskRegistryState = std::move(DiskRegistryState), .FileIOService = FileIOService, .NvmeManager = NvmeManager, }; diff --git a/cloud/blockstore/libs/storage/disk_agent/testlib/test_env.h b/cloud/blockstore/libs/storage/disk_agent/testlib/test_env.h index 41fea90687b..f700df0d9b9 100644 --- a/cloud/blockstore/libs/storage/disk_agent/testlib/test_env.h +++ b/cloud/blockstore/libs/storage/disk_agent/testlib/test_env.h @@ -53,6 +53,8 @@ struct TDiskRegistryState THashMap Devices; THashMap Stats; + + TVector DisabledDevices; }; //////////////////////////////////////////////////////////////////////////////// @@ -454,6 +456,10 @@ struct TTestEnvBuilder NNvme::INvmeManagerPtr NvmeManager; NSpdk::ISpdkEnvPtr Spdk; NProto::TStorageServiceConfig StorageServiceConfig; + TDiskRegistryState::TPtr DiskRegistryState; + + std::function + DiskRegistryProxyFactory; explicit TTestEnvBuilder(NActors::TTestActorRuntime& runtime); @@ -463,6 +469,7 @@ struct TTestEnvBuilder TTestEnvBuilder& With(NNvme::INvmeManagerPtr nvmeManager); TTestEnvBuilder& With(NProto::TDiskAgentConfig config); TTestEnvBuilder& With(NProto::TStorageServiceConfig storageServiceConfig); + TTestEnvBuilder& With(TDiskRegistryState::TPtr diskRegistryState); TTestEnv Build(); }; diff --git a/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_register.cpp b/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_register.cpp index 8d0cfba0b7f..4f0557e7608 100644 --- a/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_register.cpp +++ b/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_register.cpp @@ -110,35 +110,38 @@ void TDiskRegistryActor::ExecuteAddAgent( args.Error = std::move(error); args.AffectedDisks = std::move(r.AffectedDisks); args.NotifiedDisks = std::move(r.DisksToReallocate); + args.DevicesToSuspendIO = std::move(r.DevicesToSuspendIO); - if (!HasError(args.Error)) { - for (auto it = ServerToAgentId.begin(); it != ServerToAgentId.end(); ) { - const auto& agentId = it->second; + if (HasError(args.Error)) { + return; + } - if (agentId == args.Config.GetAgentId()) { - const auto& serverId = it->first; - NCloud::Send(ctx, serverId); + for (auto it = ServerToAgentId.begin(); it != ServerToAgentId.end(); ) { + const auto& agentId = it->second; - ServerToAgentId.erase(it++); - } else { - ++it; - } + if (agentId == args.Config.GetAgentId()) { + const auto& serverId = it->first; + NCloud::Send(ctx, serverId); + + ServerToAgentId.erase(it++); + } else { + ++it; } + } - ServerToAgentId[args.RegisterActorId] = args.Config.GetAgentId(); + ServerToAgentId[args.RegisterActorId] = args.Config.GetAgentId(); - auto& info = AgentRegInfo[args.Config.GetAgentId()]; - info.Connected = true; - info.SeqNo += 1; + auto& info = AgentRegInfo[args.Config.GetAgentId()]; + info.Connected = true; + info.SeqNo += 1; - LOG_DEBUG(ctx, TBlockStoreComponents::DISK_REGISTRY, - "[%lu] Execute register agent: NodeId=%u, AgentId=%s" - ", SeqNo=%lu", - TabletID(), - args.Config.GetNodeId(), - args.Config.GetAgentId().c_str(), - info.SeqNo); - } + LOG_DEBUG(ctx, TBlockStoreComponents::DISK_REGISTRY, + "[%lu] Execute register agent: NodeId=%u, AgentId=%s" + ", SeqNo=%lu", + TabletID(), + args.Config.GetNodeId(), + args.Config.GetAgentId().c_str(), + info.SeqNo); } void TDiskRegistryActor::CompleteAddAgent( @@ -177,6 +180,8 @@ void TDiskRegistryActor::CompleteAddAgent( auto response = std::make_unique(); *response->Record.MutableError() = std::move(args.Error); + // + NCloud::Reply(ctx, *args.RequestInfo, std::move(response)); SendCachedAcquireRequestsToAgent(ctx, args.Config); diff --git a/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.cpp b/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.cpp index 07befcdd4c5..7229592da5d 100644 --- a/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.cpp +++ b/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.cpp @@ -905,6 +905,7 @@ auto TDiskRegistryState::RegisterAgent( TVector affectedDisks; TVector disksToReallocate; + TVector devicesToSuspendIO; try { if (auto* buddy = AgentList.FindAgent(config.GetNodeId()); @@ -968,6 +969,11 @@ auto TDiskRegistryState::RegisterAgent( for (const auto& d: agent.GetDevices()) { const auto& uuid = d.GetDeviceUUID(); + + if (d.GetState() == NProto::DEVICE_STATE_ERROR) { + devicesToSuspendIO.push_back(uuid); + } + auto diskId = DeviceList.FindDiskId(uuid); if (diskId.empty()) { @@ -995,7 +1001,7 @@ auto TDiskRegistryState::RegisterAgent( diskIds.emplace(std::move(diskId)); } - for (auto& id: diskIds) { + for (const auto& id: diskIds) { if (TryUpdateDiskState(db, id, timestamp)) { affectedDisks.push_back(id); } @@ -1030,7 +1036,8 @@ auto TDiskRegistryState::RegisterAgent( return TAgentRegistrationResult{ .AffectedDisks = std::move(affectedDisks), - .DisksToReallocate = std::move(disksToReallocate)}; + .DisksToReallocate = std::move(disksToReallocate), + .DevicesToSuspendIO = std::move(devicesToSuspendIO)}; } NProto::TError TDiskRegistryState::CheckDestructiveConfigurationChange( diff --git a/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.h b/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.h index a945e5f0d5a..caa5ed18163 100644 --- a/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.h +++ b/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.h @@ -356,6 +356,7 @@ class TDiskRegistryState { TVector AffectedDisks; TVector DisksToReallocate; + TVector DevicesToSuspendIO; }; auto RegisterAgent( diff --git a/cloud/blockstore/libs/storage/disk_registry/disk_registry_tx.h b/cloud/blockstore/libs/storage/disk_registry/disk_registry_tx.h index 3e3efe9d2a1..ddab4e7b20f 100644 --- a/cloud/blockstore/libs/storage/disk_registry/disk_registry_tx.h +++ b/cloud/blockstore/libs/storage/disk_registry/disk_registry_tx.h @@ -215,6 +215,7 @@ struct TTxDiskRegistry NProto::TError Error; TVector AffectedDisks; TVector NotifiedDisks; + TVector DevicesToSuspendIO; TAddAgent( TRequestInfoPtr requestInfo, @@ -232,6 +233,7 @@ struct TTxDiskRegistry Error.Clear(); AffectedDisks.clear(); NotifiedDisks.clear(); + DevicesToSuspendIO.clear(); } }; diff --git a/cloud/blockstore/libs/storage/protos/disk.proto b/cloud/blockstore/libs/storage/protos/disk.proto index 52d9f8ea435..aaac2a06a3c 100644 --- a/cloud/blockstore/libs/storage/protos/disk.proto +++ b/cloud/blockstore/libs/storage/protos/disk.proto @@ -541,6 +541,9 @@ message TRegisterAgentResponse { // Optional error, set only if error happened. NCloud.NProto.TError Error = 1; + + // List of device IDs to which Disk Agent should disable IO operations. + repeated string DevicesToSuspendIO = 2; } //////////////////////////////////////////////////////////////////////////////// diff --git a/cloud/storage/core/libs/common/error.h b/cloud/storage/core/libs/common/error.h index b27a6d19fe7..c0d7170296e 100644 --- a/cloud/storage/core/libs/common/error.h +++ b/cloud/storage/core/libs/common/error.h @@ -274,7 +274,7 @@ bool HasError(const T& response) inline void CheckError(const NProto::TError& error) { if (HasError(error)) { - ythrow TServiceError(error); + ythrow TServiceError(error.GetCode()) << error.GetMessage() << "[!!!]"; } } @@ -282,7 +282,8 @@ template void CheckError(const T& response) { if (HasError(response)) { - ythrow TServiceError(response.GetError()); + ythrow TServiceError(response.GetError().GetCode()) + << response.GetError().GetMessage() << "[???]";; } }