Skip to content

Commit

Permalink
Cleanup agents from DR on REMOVE_HOST cms action
Browse files Browse the repository at this point in the history
  • Loading branch information
komarevtsev-d committed Jul 15, 2024
1 parent a33fbbf commit b1a0bc3
Show file tree
Hide file tree
Showing 12 changed files with 613 additions and 44 deletions.
4 changes: 4 additions & 0 deletions cloud/blockstore/config/storage.proto
Original file line number Diff line number Diff line change
Expand Up @@ -1000,4 +1000,8 @@ message TStorageServiceConfig

// Allow destruction only for those disks whose id matches one of the given prefixes.
repeated string DestructionAllowedOnlyForDisksWithIdPrefixes = 374;

// CMS actions such as "REMOVE_HOST" and "REMOVE_DEVICE" will attempt to
// remove devices and the host from DR memory (including persistent storage).
optional bool CleanupDRConfigOnCMSActions = 375;
}
1 change: 1 addition & 0 deletions cloud/blockstore/libs/storage/core/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,7 @@ TDuration MSeconds(ui32 value)
xxx(MaxLocalVolumes, ui32, 100 )\
\
xxx(DiskRegistryVolumeConfigUpdatePeriod, TDuration, Minutes(5) )\
xxx(CleanupDRConfigOnCMSActions, bool, false )\
\
xxx(ReassignRequestRetryTimeout, TDuration, Seconds(5) )\
xxx(ReassignChannelsPercentageThreshold, ui32, 10 )\
Expand Down
1 change: 1 addition & 0 deletions cloud/blockstore/libs/storage/core/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -453,6 +453,7 @@ class TStorageConfig
ui32 GetMaxLocalVolumes() const;

TDuration GetDiskRegistryVolumeConfigUpdatePeriod() const;
bool GetCleanupDRConfigOnCMSActions() const;
TDuration GetReassignRequestRetryTimeout() const;
ui32 GetReassignChannelsPercentageThreshold() const;

Expand Down
4 changes: 3 additions & 1 deletion cloud/blockstore/libs/storage/core/pending_request.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
#include <cloud/blockstore/libs/kikimr/public.h>
#include <cloud/blockstore/libs/storage/core/request_info.h>

#include <utility>

namespace NCloud::NBlockStore::NStorage {

////////////////////////////////////////////////////////////////////////////////
Expand All @@ -16,7 +18,7 @@ struct TPendingRequest

TPendingRequest(NActors::IEventHandlePtr event, TRequestInfoPtr requestInfo)
: Event(std::move(event))
, RequestInfo(requestInfo)
, RequestInfo(std::move(requestInfo))
{}
};

Expand Down
27 changes: 20 additions & 7 deletions cloud/blockstore/libs/storage/disk_common/monitoring_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,34 @@ namespace NCloud::NBlockStore::NStorage {

IOutputStream& DumpAgentState(
IOutputStream& out,
NProto::EAgentState state)
NProto::EAgentState state,
bool connected)
{
switch (state) {
case NProto::AGENT_STATE_ONLINE:
return out << "<font color=green>online</font>";
out << "<font color=green>online</font>";
break;
case NProto::AGENT_STATE_WARNING:
return out << "<font color=brown>warning</font>";
out << "<font color=brown>warning</font>";
break;
case NProto::AGENT_STATE_UNAVAILABLE:
return out << "<font color=red>unavailable</font>";
out << "<font color=red>unavailable</font>";
break;
default:
return out
<< "(Unknown EAgentState value "
<< static_cast<int>(state)
out << "(Unknown EAgentState value " << static_cast<int>(state)
<< ")";
break;
}

if (state != NProto::AGENT_STATE_UNAVAILABLE) {
if (!connected) {
out << " <font color=gray>disconnected</font>";
}
} else if (connected) {
out << " <font color=gray>connected</font>";
}

return out;
}

IOutputStream& DumpDiskState(
Expand Down
3 changes: 2 additions & 1 deletion cloud/blockstore/libs/storage/disk_common/monitoring_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ inline EDeviceStateFlags& operator|=(EDeviceStateFlags& a, EDeviceStateFlags b)

IOutputStream& DumpAgentState(
IOutputStream& out,
NProto::EAgentState state);
NProto::EAgentState state,
bool connected);

IOutputStream& DumpDiskState(
IOutputStream& out,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -517,7 +517,14 @@ void TDiskRegistryActor::RenderAgentHtmlInfo(
out << "Rack: " << rack;
}

DIV() { out << "State: "; DumpAgentState(out, agent->GetState()); }
DIV() {
out << "State: ";

auto it = AgentRegInfo.find(agent->GetAgentId());
const bool connected =
it != AgentRegInfo.end() && it->second.Connected;
DumpAgentState(out, agent->GetState(), connected);
}
DIV() {
out << "State Timestamp: "
<< TInstant::MicroSeconds(agent->GetStateTs());
Expand Down Expand Up @@ -695,7 +702,11 @@ void TDiskRegistryActor::RenderDiskHtmlInfo(
!= NProto::AGENT_STATE_ONLINE)
{
out << " ";
DumpAgentState(out, agent->GetState());

auto it = AgentRegInfo.find(agent->GetAgentId());
const bool connected =
it != AgentRegInfo.end() && it->second.Connected;
DumpAgentState(out, agent->GetState(), connected);
}
} else {
out << device.GetNodeId();
Expand Down Expand Up @@ -1793,22 +1804,10 @@ void TDiskRegistryActor::RenderAgentList(
out << config.GetSeqNumber();
}
TABLED() {
DumpAgentState(out, config.GetState());

auto it = AgentRegInfo.find(config.GetAgentId());

const bool connected = it != AgentRegInfo.end()
&& it->second.Connected;

if (config.GetState()
!= NProto::AGENT_STATE_UNAVAILABLE)
{
if (!connected) {
out << " <font color=gray>disconnected</font>";
}
} else if (connected) {
out << " <font color=gray>connected</font>";
}
DumpAgentState(out, config.GetState(), connected);
}
TABLED() {
out << (config.GetDedicatedDiskAgent()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4690,6 +4690,24 @@ NProto::TError TDiskRegistryState::UpdateAgentState(
return error;
}

// Unavailable hosts are a problem when infra tries to request ADD_HOST on
// an idle agent. DR waits for it to become online and blocks host
// deployment.
if (StorageConfig->GetCleanupDRConfigOnCMSActions() &&
newState == NProto::EAgentState::AGENT_STATE_UNAVAILABLE &&
agent->GetDevices().empty())
{
bool success = RemoveAgent(db, agent->GetNodeId());
if (!success) {
return MakeError(
E_FAIL,
TStringBuilder() << "Couldn't remove the agent "
<< agent->GetAgentId().Quote());
}

return {};
}

const auto cmsTs = TInstant::MicroSeconds(agent->GetCmsTs());
const auto oldState = agent->GetState();
const auto cmsDeadline = cmsTs + GetInfraTimeout(*StorageConfig, oldState);
Expand Down Expand Up @@ -5042,6 +5060,18 @@ NProto::TError TDiskRegistryState::UpdateCmsHostState(
ApplyAgentStateChange(db, *agent, now, affectedDisks);

if (newState != NProto::AGENT_STATE_ONLINE && !HasError(result)) {
auto error = TryToRemoveAgentDevices(db, agent->GetAgentId());
if (!HasError(error)) {
return result;
}

// Do not return the error from "TryToRemoveAgentDevices()" since it's
// internal and shouldn't block node removal.
STORAGE_WARN(
"Could not remove device from agent %s: %s",
agent->GetAgentId().Quote().c_str(),
FormatError(error).c_str());

SuspendLocalDevices(db, *agent);
}

Expand Down Expand Up @@ -5224,6 +5254,38 @@ bool TDiskRegistryState::TryUpdateDiskState(
return true;
}

NProto::TError TDiskRegistryState::TryToRemoveAgentDevices(
TDiskRegistryDatabase& db,
const TAgentId& agentId)
{
if (!StorageConfig->GetCleanupDRConfigOnCMSActions()) {
return MakeError(
E_INVALID_STATE,
"CleanupDRConfigOnCMSActions is disabled.");
}

auto newConfig = GetConfig();
auto* agents = newConfig.MutableKnownAgents();
const auto agentIt = FindIf(
*agents,
[&](const auto& x) { return x.GetAgentId() == agentId; });
if (agentIt == agents->end()) {
return MakeError(
E_NOT_FOUND,
TStringBuilder() << "Couldn't find agent " << agentId.Quote()
<< " in the DR config.");
}

agents->erase(agentIt);
TVector<TString> affectedDisks;
auto error = UpdateConfig(
db,
std::move(newConfig),
false, // ignoreVersion
affectedDisks);
return error;
}

void TDiskRegistryState::DeleteDiskStateUpdate(
TDiskRegistryDatabase& db,
ui64 maxSeqNo)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -585,8 +585,8 @@ class TDiskRegistryState
NProto::TError UpdateAgentState(
TDiskRegistryDatabase& db,
const TString& agentId,
NProto::EAgentState state,
TInstant now,
NProto::EAgentState newState,
TInstant timestamp,
TString reason,
TVector<TDiskId>& affectedDisks);

Expand Down Expand Up @@ -933,6 +933,10 @@ class TDiskRegistryState
TDiskState& disk,
TInstant timestamp);

NProto::TError TryToRemoveAgentDevices(
TDiskRegistryDatabase& db,
const TAgentId& agentId);

NProto::TPlacementGroupConfig::TDiskInfo* CollectRacks(
const TString& diskId,
ui32 placementPartitionIndex,
Expand Down
Loading

0 comments on commit b1a0bc3

Please sign in to comment.