Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cleanup agents from DR on REMOVE_HOST cms action #1583

Merged
merged 2 commits into from
Jul 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions cloud/blockstore/config/storage.proto
Original file line number Diff line number Diff line change
Expand Up @@ -1000,4 +1000,8 @@ message TStorageServiceConfig

// Allow destruction only for those disks whose id matches one of the given prefixes.
repeated string DestructionAllowedOnlyForDisksWithIdPrefixes = 374;

// CMS actions such as "REMOVE_HOST" and "REMOVE_DEVICE" will attempt to
// remove devices and the host from DR memory (including persistent storage).
optional bool CleanupDRConfigOnCMSActions = 375;
}
1 change: 1 addition & 0 deletions cloud/blockstore/libs/storage/core/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,7 @@ TDuration MSeconds(ui32 value)
xxx(MaxLocalVolumes, ui32, 100 )\
\
xxx(DiskRegistryVolumeConfigUpdatePeriod, TDuration, Minutes(5) )\
xxx(CleanupDRConfigOnCMSActions, bool, false )\
\
xxx(ReassignRequestRetryTimeout, TDuration, Seconds(5) )\
xxx(ReassignChannelsPercentageThreshold, ui32, 10 )\
Expand Down
1 change: 1 addition & 0 deletions cloud/blockstore/libs/storage/core/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -453,6 +453,7 @@ class TStorageConfig
ui32 GetMaxLocalVolumes() const;

TDuration GetDiskRegistryVolumeConfigUpdatePeriod() const;
bool GetCleanupDRConfigOnCMSActions() const;
TDuration GetReassignRequestRetryTimeout() const;
ui32 GetReassignChannelsPercentageThreshold() const;

Expand Down
4 changes: 3 additions & 1 deletion cloud/blockstore/libs/storage/core/pending_request.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
#include <cloud/blockstore/libs/kikimr/public.h>
#include <cloud/blockstore/libs/storage/core/request_info.h>

#include <utility>

namespace NCloud::NBlockStore::NStorage {

////////////////////////////////////////////////////////////////////////////////
Expand All @@ -16,7 +18,7 @@ struct TPendingRequest

TPendingRequest(NActors::IEventHandlePtr event, TRequestInfoPtr requestInfo)
: Event(std::move(event))
, RequestInfo(requestInfo)
, RequestInfo(std::move(requestInfo))
{}
};

Expand Down
27 changes: 20 additions & 7 deletions cloud/blockstore/libs/storage/disk_common/monitoring_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,34 @@ namespace NCloud::NBlockStore::NStorage {

IOutputStream& DumpAgentState(
IOutputStream& out,
NProto::EAgentState state)
NProto::EAgentState state,
bool connected)
{
switch (state) {
case NProto::AGENT_STATE_ONLINE:
return out << "<font color=green>online</font>";
out << "<font color=green>online</font>";
break;
case NProto::AGENT_STATE_WARNING:
return out << "<font color=brown>warning</font>";
out << "<font color=brown>warning</font>";
break;
case NProto::AGENT_STATE_UNAVAILABLE:
return out << "<font color=red>unavailable</font>";
out << "<font color=red>unavailable</font>";
break;
default:
return out
<< "(Unknown EAgentState value "
<< static_cast<int>(state)
out << "(Unknown EAgentState value " << static_cast<int>(state)
<< ")";
break;
}

if (state != NProto::AGENT_STATE_UNAVAILABLE) {
if (!connected) {
out << " <font color=gray>disconnected</font>";
}
} else if (connected) {
out << " <font color=gray>connected</font>";
}

return out;
}

IOutputStream& DumpDiskState(
Expand Down
3 changes: 2 additions & 1 deletion cloud/blockstore/libs/storage/disk_common/monitoring_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ inline EDeviceStateFlags& operator|=(EDeviceStateFlags& a, EDeviceStateFlags b)

IOutputStream& DumpAgentState(
IOutputStream& out,
NProto::EAgentState state);
NProto::EAgentState state,
bool connected);

IOutputStream& DumpDiskState(
IOutputStream& out,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -517,7 +517,14 @@ void TDiskRegistryActor::RenderAgentHtmlInfo(
out << "Rack: " << rack;
}

DIV() { out << "State: "; DumpAgentState(out, agent->GetState()); }
DIV() {
out << "State: ";

auto it = AgentRegInfo.find(agent->GetAgentId());
const bool connected =
it != AgentRegInfo.end() && it->second.Connected;
DumpAgentState(out, agent->GetState(), connected);
}
DIV() {
out << "State Timestamp: "
<< TInstant::MicroSeconds(agent->GetStateTs());
Expand Down Expand Up @@ -695,7 +702,11 @@ void TDiskRegistryActor::RenderDiskHtmlInfo(
!= NProto::AGENT_STATE_ONLINE)
{
out << " ";
DumpAgentState(out, agent->GetState());

auto it = AgentRegInfo.find(agent->GetAgentId());
const bool connected =
it != AgentRegInfo.end() && it->second.Connected;
DumpAgentState(out, agent->GetState(), connected);
}
} else {
out << device.GetNodeId();
Expand Down Expand Up @@ -1793,22 +1804,10 @@ void TDiskRegistryActor::RenderAgentList(
out << config.GetSeqNumber();
}
TABLED() {
DumpAgentState(out, config.GetState());

auto it = AgentRegInfo.find(config.GetAgentId());

const bool connected = it != AgentRegInfo.end()
&& it->second.Connected;

if (config.GetState()
!= NProto::AGENT_STATE_UNAVAILABLE)
{
if (!connected) {
out << " <font color=gray>disconnected</font>";
}
} else if (connected) {
out << " <font color=gray>connected</font>";
}
DumpAgentState(out, config.GetState(), connected);
}
TABLED() {
out << (config.GetDedicatedDiskAgent()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4690,6 +4690,17 @@ NProto::TError TDiskRegistryState::UpdateAgentState(
return error;
}

// Unavailable hosts are a problem when infra tries to request ADD_HOST on
// an idle agent. DR waits for it to become online and blocks host
// deployment.
if (StorageConfig->GetCleanupDRConfigOnCMSActions() &&
newState == NProto::AGENT_STATE_UNAVAILABLE &&
agent->GetDevices().empty())
{
RemoveAgent(db, *agent);
return {};
}

const auto cmsTs = TInstant::MicroSeconds(agent->GetCmsTs());
const auto oldState = agent->GetState();
const auto cmsDeadline = cmsTs + GetInfraTimeout(*StorageConfig, oldState);
Expand Down Expand Up @@ -5042,6 +5053,20 @@ NProto::TError TDiskRegistryState::UpdateCmsHostState(
ApplyAgentStateChange(db, *agent, now, affectedDisks);

if (newState != NProto::AGENT_STATE_ONLINE && !HasError(result)) {
if (StorageConfig->GetCleanupDRConfigOnCMSActions()) {
auto error = TryToRemoveAgentDevices(db, agent->GetAgentId());
if (!HasError(error)) {
return result;
}

// Do not return the error from "TryToRemoveAgentDevices()" since
// it's internal and shouldn't block node removal.
STORAGE_WARN(
"Could not remove device from agent %s: %s",
agent->GetAgentId().Quote().c_str(),
FormatError(error).c_str());
}

SuspendLocalDevices(db, *agent);
}

Expand Down Expand Up @@ -5224,6 +5249,38 @@ bool TDiskRegistryState::TryUpdateDiskState(
return true;
}

NProto::TError TDiskRegistryState::TryToRemoveAgentDevices(
TDiskRegistryDatabase& db,
const TAgentId& agentId)
{
if (!StorageConfig->GetCleanupDRConfigOnCMSActions()) {
return MakeError(
E_INVALID_STATE,
"CleanupDRConfigOnCMSActions is disabled.");
}

auto newConfig = GetConfig();
auto* agents = newConfig.MutableKnownAgents();
const auto agentIt = FindIf(
*agents,
[&](const auto& x) { return x.GetAgentId() == agentId; });
if (agentIt == agents->end()) {
return MakeError(
E_NOT_FOUND,
TStringBuilder() << "Couldn't find agent " << agentId.Quote()
<< " in the DR config.");
}

agents->erase(agentIt);
TVector<TString> affectedDisks;
auto error = UpdateConfig(
db,
std::move(newConfig),
false, // ignoreVersion
affectedDisks);
return error;
}

void TDiskRegistryState::DeleteDiskStateUpdate(
TDiskRegistryDatabase& db,
ui64 maxSeqNo)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -585,8 +585,8 @@ class TDiskRegistryState
NProto::TError UpdateAgentState(
TDiskRegistryDatabase& db,
const TString& agentId,
NProto::EAgentState state,
TInstant now,
NProto::EAgentState newState,
TInstant timestamp,
TString reason,
TVector<TDiskId>& affectedDisks);

Expand Down Expand Up @@ -933,6 +933,10 @@ class TDiskRegistryState
TDiskState& disk,
TInstant timestamp);

NProto::TError TryToRemoveAgentDevices(
TDiskRegistryDatabase& db,
const TAgentId& agentId);

NProto::TPlacementGroupConfig::TDiskInfo* CollectRacks(
const TString& diskId,
ui32 placementPartitionIndex,
Expand Down
Loading