Skip to content

Commit

Permalink
NBSNEBIUS-127: add data scrubbing for mirrored disks (#954)
Browse files Browse the repository at this point in the history
* add data scrubbing for mirrored disks

* fix issues

* add processing of write requests intersecting with scrubbing

* fix issues

* fix issues

* fix issues
  • Loading branch information
WilyTiger authored May 7, 2024
1 parent e2fe8fd commit 277c3ec
Show file tree
Hide file tree
Showing 21 changed files with 632 additions and 47 deletions.
5 changes: 5 additions & 0 deletions cloud/blockstore/config/storage.proto
Original file line number Diff line number Diff line change
Expand Up @@ -965,4 +965,9 @@ message TStorageServiceConfig

// Max shadow disk fill io depth.
optional uint32 MaxShadowDiskFillIoDepth = 363;

// Enable data scrubbing for mirrored disks.
optional bool DataScrubbingEnabled = 364;
// Interval between scrubbing of consecutive ranges, in milliseconds.
optional uint32 ScrubbingInterval = 365;
}
1 change: 1 addition & 0 deletions cloud/blockstore/libs/diagnostics/critical_events.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ namespace NCloud::NBlockStore {
xxx(MirroredDiskDeviceReplacementForbidden) \
xxx(MirroredDiskDeviceReplacementFailure) \
xxx(MirroredDiskDeviceReplacementRateLimitExceeded) \
xxx(MirroredDiskChecksumMismatch) \
xxx(CounterUpdateRace) \
xxx(EndpointStartingError) \
xxx(ResyncFailed) \
Expand Down
3 changes: 3 additions & 0 deletions cloud/blockstore/libs/storage/core/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -482,6 +482,9 @@ TDuration MSeconds(ui32 value)
xxx(MaxAcquireShadowDiskRetryDelayWhenNonBlocked, TDuration, Seconds(10) )\
xxx(MaxAcquireShadowDiskTotalTimeoutWhenBlocked, TDuration, Seconds(5) )\
xxx(MaxAcquireShadowDiskTotalTimeoutWhenNonBlocked, TDuration, Seconds(600) )\
\
xxx(DataScrubbingEnabled, bool, false )\
xxx(ScrubbingInterval, TDuration, MSeconds(50) )\

// BLOCKSTORE_STORAGE_CONFIG_RW

Expand Down
3 changes: 3 additions & 0 deletions cloud/blockstore/libs/storage/core/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -569,6 +569,9 @@ class TStorageConfig
TDuration GetVolumeProxyCacheRetryDuration() const;

TDuration GetServiceSelfPingInterval() const;

bool GetDataScrubbingEnabled() const;
TDuration GetScrubbingInterval() const;
};

ui64 GetAllocationUnit(
Expand Down
7 changes: 7 additions & 0 deletions cloud/blockstore/libs/storage/core/disk_counters.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,12 @@ enum class EPublishingPolicy

////////////////////////////////////////////////////////////////////////////////

// X-macro list of cumulative counters: invokes `xxx` once per counter,
// passing the counter name, the `Generic` and `Permanent` tags, and any extra
// arguments forwarded through __VA_ARGS__.
// NOTE(review): ScrubbingThroughput presumably tracks the volume of data
// processed by mirrored-disk scrubbing -- confirm units at the update site.
#define BLOCKSTORE_DRBASED_PART_CUMULATIVE_COUNTERS(xxx, ...) \
xxx(ScrubbingThroughput, Generic, Permanent, __VA_ARGS__)\
// BLOCKSTORE_DRBASED_PART_CUMULATIVE_COUNTERS

////////////////////////////////////////////////////////////////////////////////

#define BLOCKSTORE_REPL_PART_CUMULATIVE_COUNTERS(xxx, ...) \
xxx(BytesWritten, Generic, Permanent, __VA_ARGS__)\
xxx(BytesRead, Generic, Permanent, __VA_ARGS__)\
Expand Down Expand Up @@ -191,6 +197,7 @@ struct TPartitionDiskCounters
// BLOCKSTORE_CUMULATIVE_COUNTER

BLOCKSTORE_REPL_PART_CUMULATIVE_COUNTERS(BLOCKSTORE_CUMULATIVE_COUNTER)
BLOCKSTORE_DRBASED_PART_CUMULATIVE_COUNTERS(BLOCKSTORE_CUMULATIVE_COUNTER)
#undef BLOCKSTORE_CUMULATIVE_COUNTER
} Cumulative;

Expand Down
34 changes: 17 additions & 17 deletions cloud/blockstore/libs/storage/partition_nonrepl/checksum_range.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,15 @@ using namespace NActors;
////////////////////////////////////////////////////////////////////////////////

TChecksumRangeActorCompanion::TChecksumRangeActorCompanion(
TBlockRange64 range,
TVector<TReplicaDescriptor> replicas)
: Range(range)
, Replicas(std::move(replicas))
: Replicas(std::move(replicas))
{
Checksums.resize(Replicas.size());
}

bool TChecksumRangeActorCompanion::IsFinished() const
{
return Finished;
return CalculatedChecksumsCount == Replicas.size();
}

const TVector<ui64>& TChecksumRangeActorCompanion::GetChecksums() const
Expand All @@ -45,19 +43,24 @@ TDuration TChecksumRangeActorCompanion::GetChecksumDuration() const
return ChecksumDuration;
}

void TChecksumRangeActorCompanion::CalculateChecksums(const TActorContext& ctx)
void TChecksumRangeActorCompanion::CalculateChecksums(
const TActorContext& ctx,
TBlockRange64 range)
{
for (size_t i = 0; i < Replicas.size(); ++i) {
CalculateReplicaChecksum(ctx, i);
CalculateReplicaChecksum(ctx, range, i);
}
ChecksumStartTs = ctx.Now();
}

void TChecksumRangeActorCompanion::CalculateReplicaChecksum(const TActorContext& ctx, int idx)
void TChecksumRangeActorCompanion::CalculateReplicaChecksum(
const TActorContext& ctx,
TBlockRange64 range,
int idx)
{
auto request = std::make_unique<TEvNonreplPartitionPrivate::TEvChecksumBlocksRequest>();
request->Record.SetStartIndex(Range.Start);
request->Record.SetBlocksCount(Range.Size());
request->Record.SetStartIndex(range.Start);
request->Record.SetBlocksCount(range.Size());

auto* headers = request->Record.MutableHeaders();
headers->SetIsBackgroundRequest(true);
Expand All @@ -81,33 +84,30 @@ void TChecksumRangeActorCompanion::HandleChecksumResponse(
const TEvNonreplPartitionPrivate::TEvChecksumBlocksResponse::TPtr& ev,
const TActorContext& ctx)
{
++CalculatedChecksumsCount;
auto* msg = ev->Get();

Error = msg->Record.GetError();

if (HasError(Error)) {
if (HasError(msg->Record.GetError())) {
LOG_WARN(ctx, TBlockStoreComponents::PARTITION,
"[%s] Checksum error %s",
Replicas[0].Name.c_str(),
FormatError(Error).c_str());

Error = msg->Record.GetError();
ChecksumDuration = ctx.Now() - ChecksumStartTs;
Finished = true;
return;
}

Checksums[ev->Cookie] = msg->Record.GetChecksum();
if (++CalculatedChecksumsCount == Replicas.size()) {
if (CalculatedChecksumsCount == Replicas.size()) {
ChecksumDuration = ctx.Now() - ChecksumStartTs;
Finished = true;
}
}

// Handles an undelivered ChecksumBlocks request: stores an E_REJECTED error,
// records the time elapsed since ChecksumStartTs and counts the request as
// processed so that the completion counter still advances.
void TChecksumRangeActorCompanion::HandleChecksumUndelivery(
    const NActors::TActorContext& ctx)
{
    Error = MakeError(E_REJECTED, "ChecksumBlocks request undelivered");

    ChecksumDuration = ctx.Now() - ChecksumStartTs;
    CalculatedChecksumsCount += 1;
}

Expand Down
19 changes: 11 additions & 8 deletions cloud/blockstore/libs/storage/partition_nonrepl/checksum_range.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,22 +14,22 @@ namespace NCloud::NBlockStore::NStorage {
class TChecksumRangeActorCompanion
{
private:
const TBlockRange64 Range;
const TVector<TReplicaDescriptor> Replicas;
TVector<TReplicaDescriptor> Replicas;

TInstant ChecksumStartTs;
TDuration ChecksumDuration;
ui32 CalculatedChecksumsCount = 0;
bool Finished = false;
TVector<ui64> Checksums;
NProto::TError Error;

public:
TChecksumRangeActorCompanion(
TBlockRange64 range,
TVector<TReplicaDescriptor> replicas);
TChecksumRangeActorCompanion(TVector<TReplicaDescriptor> replicas);

TChecksumRangeActorCompanion() = default;

void CalculateChecksums(const NActors::TActorContext& ctx);
void CalculateChecksums(
const NActors::TActorContext& ctx,
TBlockRange64 range);

void HandleChecksumResponse(
const TEvNonreplPartitionPrivate::TEvChecksumBlocksResponse::TPtr& ev,
Expand All @@ -43,7 +43,10 @@ class TChecksumRangeActorCompanion
TDuration GetChecksumDuration() const;

private:
void CalculateReplicaChecksum(const NActors::TActorContext& ctx, int idx);
void CalculateReplicaChecksum(
const NActors::TActorContext& ctx,
TBlockRange64 range,
int idx);
};

} // namespace NCloud::NBlockStore::NStorage
Loading

0 comments on commit 277c3ec

Please sign in to comment.