Skip to content

Commit

Permalink
issue-1395: run cleaning process per device pool (#1513)
Browse files Browse the repository at this point in the history
Issue: #1395

Problem:
Erasing HDD disks is significantly slower than erasing NVMe disks. Disk registry blocks the next erase by setting the SecureEraseInProgress flag while the current erase is in progress. As a result, the erase of the next batch of NVMe disks can be delayed even though no NVMe disk erase is in progress.

SecureErase is a background process that is regularly triggered by the disk registry actor.

Solution:

Split SecureErase by pool name
Use different SecureEraseInProgress flags depending on the pool name.

---------

Co-authored-by: Pavel Misko <[email protected]>
  • Loading branch information
antonmyagkov and sharpeye committed Jul 22, 2024
1 parent a90d95e commit a478d50
Show file tree
Hide file tree
Showing 6 changed files with 326 additions and 34 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ class TDiskRegistryActor final
bool UsersNotificationInProgress = false;
bool DiskStatesPublicationInProgress = false;
bool AutomaticallyReplacedDevicesDeletionInProgress = false;
bool SecureEraseInProgress = false;
THashSet<TString> SecureEraseInProgressPerPool;
bool StartMigrationInProgress = false;

TVector<TString> DisksBeingDestroyed;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class TSecureEraseActor final
const TRequestInfoPtr Request;
const TDuration RequestTimeout;

const TString PoolName;
TVector<NProto::TDeviceConfig> Devices;
TVector<TString> CleanDevices;

Expand All @@ -34,6 +35,7 @@ class TSecureEraseActor final
const TActorId& owner,
TRequestInfoPtr request,
TDuration requestTimeout,
TString poolName,
TVector<NProto::TDeviceConfig> devicesToClean);

void Bootstrap(const TActorContext& ctx);
Expand Down Expand Up @@ -73,10 +75,12 @@ TSecureEraseActor::TSecureEraseActor(
const TActorId& owner,
TRequestInfoPtr request,
TDuration requestTimeout,
TString poolName,
TVector<NProto::TDeviceConfig> devicesToClean)
: Owner(owner)
, Request(std::move(request))
, RequestTimeout(requestTimeout)
, PoolName(std::move(poolName))
, Devices(std::move(devicesToClean))
{}

Expand Down Expand Up @@ -113,6 +117,7 @@ void TSecureEraseActor::ReplyAndDie(const TActorContext& ctx, NProto::TError err
{
auto response = std::make_unique<TEvDiskRegistryPrivate::TEvSecureEraseResponse>(
std::move(error),
PoolName,
CleanDevices.size());
NCloud::Reply(ctx, *Request, std::move(response));

Expand Down Expand Up @@ -301,10 +306,6 @@ void TDiskRegistryActor::CompleteCleanupDevices(

void TDiskRegistryActor::SecureErase(const TActorContext& ctx)
{
if (SecureEraseInProgress) {
return;
}

auto dirtyDevices = State->GetDirtyDevices();
EraseIf(dirtyDevices, [&] (auto& d) {
if (d.GetState() == NProto::DEVICE_STATE_ERROR) {
Expand Down Expand Up @@ -374,30 +375,55 @@ void TDiskRegistryActor::SecureErase(const TActorContext& ctx)
countBeforeFiltration,
dirtyDevices.size());

SecureEraseInProgress = true;

auto request = std::make_unique<TEvDiskRegistryPrivate::TEvSecureEraseRequest>(
std::move(dirtyDevices),
Config->GetNonReplicatedSecureEraseTimeout());

auto deadline = Min(SecureEraseStartTs, ctx.Now()) + TDuration::Seconds(5);
if (deadline > ctx.Now()) {
LOG_INFO(ctx, TBlockStoreComponents::DISK_REGISTRY,
"[%lu] Scheduled secure erase, now: %lu, deadline: %lu",
TabletID(),
ctx.Now().MicroSeconds(),
deadline.MicroSeconds());

ctx.ExecutorThread.Schedule(
deadline,
new IEventHandle(ctx.SelfID, ctx.SelfID, request.get()));
request.release();
} else {
LOG_INFO(ctx, TBlockStoreComponents::DISK_REGISTRY,
"[%lu] Sending secure erase request",
TabletID());
// Walk dirtyDevices one pool at a time: each pass gathers the devices that
// share the pool of the first remaining device into [first, it) and issues at
// most one secure erase request for that pool.
auto it = dirtyDevices.begin();
while (it != dirtyDevices.end()) {
auto first = it;
// Held by value, so the name stays usable while std::partition below
// rearranges the elements of dirtyDevices.
const auto poolName = first->GetPoolName();
// Bring every remaining device of this pool to the front of the range;
// `it` becomes the start of the devices that still have to be grouped.
it = std::partition(
first,
dirtyDevices.end(),
[&poolName](const auto& device)
{ return poolName == device.GetPoolName(); });

// NOTE(review): despite its name, the second binding is presumably the
// "newly inserted" flag of a std-style insert() (pair<iterator, bool>) —
// confirm for THashSet. With that reading, `false` means the pool is already
// in the set, i.e. an erase for it is in progress, and the pool is skipped.
auto [_, alreadyInProgress] =
SecureEraseInProgressPerPool.insert(poolName);
if (!alreadyInProgress) {
continue;
}

// NOTE(review): this Send uses `request` before the per-pool `request` is
// declared just below; it looks like a removed (pre-change) line of the diff
// hunk rather than part of the new loop — confirm.
NCloud::Send(ctx, ctx.SelfID, std::move(request));
// Build the request for this pool only; the devices in [first, it) are moved
// out of dirtyDevices into the request's own vector.
auto request =
std::make_unique<TEvDiskRegistryPrivate::TEvSecureEraseRequest>(
poolName,
TVector<NProto::TDeviceConfig>(
std::make_move_iterator(first),
std::make_move_iterator(it)),
Config->GetNonReplicatedSecureEraseTimeout());

// The request is scheduled when the 5-second mark counted from
// Min(SecureEraseStartTs, now) is still ahead; otherwise it is sent at once.
auto deadline =
Min(SecureEraseStartTs, ctx.Now()) + TDuration::Seconds(5);
if (deadline > ctx.Now()) {
LOG_INFO(
ctx,
TBlockStoreComponents::DISK_REGISTRY,
"[%lu] Scheduled secure erase for pool: %s, now: %lu, "
"deadline: %lu",
TabletID(),
poolName.c_str(),
ctx.Now().MicroSeconds(),
deadline.MicroSeconds());

// release() gives up the unique_ptr's ownership; the raw event pointer is
// passed on to the IEventHandle addressed to this actor itself.
ctx.ExecutorThread.Schedule(
deadline,
new IEventHandle(ctx.SelfID, ctx.SelfID, request.release()));
} else {
LOG_INFO(
ctx,
TBlockStoreComponents::DISK_REGISTRY,
"[%lu] Sending secure erase request",
TabletID());

NCloud::Send(ctx, ctx.SelfID, std::move(request));
}
}
}

Expand Down Expand Up @@ -425,6 +451,7 @@ void TDiskRegistryActor::HandleSecureErase(
msg->CallContext
),
msg->RequestTimeout,
msg->PoolName,
std::move(msg->DirtyDevices));
Actors.insert(actor);
}
Expand All @@ -440,7 +467,7 @@ void TDiskRegistryActor::HandleSecureEraseResponse(
TabletID(),
msg->CleanDevices);

SecureEraseInProgress = false;
SecureEraseInProgressPerPool.erase(msg->PoolName);
SecureErase(ctx);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ void TDiskRegistryActor::CompleteWritableState(
DisksNotificationInProgress = false;
UsersNotificationInProgress = false;
DiskStatesPublicationInProgress = false;
SecureEraseInProgress = false;
SecureEraseInProgressPerPool.clear();
StartMigrationInProgress = false;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -267,25 +267,32 @@ struct TEvDiskRegistryPrivate

// Payload of a secure erase request: the devices to erase, the name of the
// pool the request is issued for, and a timeout.
struct TSecureEraseRequest
{
// Name of the device pool this request is issued for.
TString PoolName;
// Devices that have to be erased.
TVector<NProto::TDeviceConfig> DirtyDevices;
// Timeout carried with the request.
TDuration RequestTimeout;

// All arguments are taken by value; the string and the vector are moved
// into the corresponding members.
explicit TSecureEraseRequest(
TSecureEraseRequest(
TString poolName,
TVector<NProto::TDeviceConfig> dirtyDevices,
TDuration requestTimeout)
: DirtyDevices(std::move(dirtyDevices))
: PoolName(std::move(poolName))
, DirtyDevices(std::move(dirtyDevices))
, RequestTimeout(requestTimeout)
{}
};

// Payload of a secure erase response: which pool the erase was run for and
// how many devices were cleaned.
struct TSecureEraseResponse
{
// Name of the device pool the finished erase was issued for.
TString PoolName;
// Number of cleaned devices; 0 unless set by the constructor below.
size_t CleanDevices = 0;

TSecureEraseResponse() = default;

// Stores the pool name (moved in) together with the cleaned-device count.
explicit TSecureEraseResponse(size_t cleanDevices)
: CleanDevices(cleanDevices)
TSecureEraseResponse(
TString poolName,
size_t cleanDevices)
: PoolName(std::move(poolName))
, CleanDevices(cleanDevices)
{}
};

Expand Down
Loading

0 comments on commit a478d50

Please sign in to comment.