Skip to content

Commit

Permalink
[NBS] issues-1553 add rdma/interconnect counters (#1977)
Browse files Browse the repository at this point in the history
add transport counters
  • Loading branch information
Sazonov99 committed Sep 19, 2024
1 parent 6bc826c commit 1fcd222
Show file tree
Hide file tree
Showing 12 changed files with 399 additions and 0 deletions.
136 changes: 136 additions & 0 deletions cloud/blockstore/libs/storage/core/disk_counters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,16 @@ void TPartitionDiskCounters::Add(const TPartitionDiskCounters& source)
auto& counter = meta.GetValue(Histogram);
counter.Add(meta.GetValue(source.Histogram));
}

for (auto meta: TTransportRequestCounters::AllCounters) {
auto& counter = meta.GetValue(Rdma);
counter.Add(meta.GetValue(source.Rdma));
}

for (auto meta: TTransportRequestCounters::AllCounters) {
auto& counter = meta.GetValue(Interconnect);
counter.Add(meta.GetValue(source.Interconnect));
}
}

void TPartitionDiskCounters::AggregateWith(const TPartitionDiskCounters& source)
Expand Down Expand Up @@ -62,6 +72,16 @@ void TPartitionDiskCounters::AggregateWith(const TPartitionDiskCounters& source)
auto& counter = meta.GetValue(Histogram);
counter.AggregateWith(meta.GetValue(source.Histogram));
}

for (auto meta: TTransportRequestCounters::AllCounters) {
auto& counter = meta.GetValue(Rdma);
counter.AggregateWith(meta.GetValue(source.Rdma));
}

for (auto meta: TTransportRequestCounters::AllCounters) {
auto& counter = meta.GetValue(Interconnect);
counter.AggregateWith(meta.GetValue(source.Interconnect));
}
}

void TPartitionDiskCounters::Publish(TInstant now)
Expand Down Expand Up @@ -116,6 +136,25 @@ void TPartitionDiskCounters::Publish(TInstant now)
}
}

for (auto meta: TTransportRequestCounters::AllCounters) {
auto& counter = meta.GetValue(Rdma);
if (Policy == EPublishingPolicy::All ||
counter.PublishingPolicy == EPublishingPolicy::All ||
Policy == counter.PublishingPolicy)
{
counter.Publish();
}
}

for (auto meta: TTransportRequestCounters::AllCounters) {
auto& counter = meta.GetValue(Interconnect);
if (Policy == EPublishingPolicy::All ||
counter.PublishingPolicy == EPublishingPolicy::All ||
Policy == counter.PublishingPolicy)
{
counter.Publish();
}
}
Reset();
}

Expand Down Expand Up @@ -184,6 +223,30 @@ void TPartitionDiskCounters::Register(
aggregate);
}
}

for (auto meta: TTransportRequestCounters::AllCounters) {
auto& counter = meta.GetValue(Rdma);
if (Policy == EPublishingPolicy::All ||
counter.PublishingPolicy == EPublishingPolicy::All ||
Policy == counter.PublishingPolicy)
{
counter.Register(
counters->GetSubgroup("request", TString(meta.Name)),
requestCounterOptions | counter.CounterOption);
}
}

for (auto meta: TTransportRequestCounters::AllCounters) {
auto& counter = meta.GetValue(Interconnect);
if (Policy == EPublishingPolicy::All ||
counter.PublishingPolicy == EPublishingPolicy::All ||
Policy == counter.PublishingPolicy)
{
counter.Register(
counters->GetSubgroup("request", TString(meta.Name)),
requestCounterOptions | counter.CounterOption);
}
}
}

void TPartitionDiskCounters::Reset()
Expand Down Expand Up @@ -212,6 +275,16 @@ void TPartitionDiskCounters::Reset()
auto& counter = meta.GetValue(Histogram);
counter.Reset();
}

for (auto meta: TTransportRequestCounters::AllCounters) {
auto& counter = meta.GetValue(Rdma);
counter.Reset();
}

for (auto meta: TTransportRequestCounters::AllCounters) {
auto& counter = meta.GetValue(Interconnect);
counter.Reset();
}
}

////////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -345,6 +418,69 @@ void TVolumeSelfCounters::Publish(TInstant now)

////////////////////////////////////////////////////////////////////////////////

// Folds the transport request counters of `source` into this object.
// For every entry of TTransportRequestCounters::AllCounters the matching
// counter of `source` is passed to Add() of the local counter.
void TTransportDiskCounters::Add(const TTransportDiskCounters& source)
{
    for (auto meta: TTransportRequestCounters::AllCounters) {
        const auto& incoming = meta.GetValue(source.RequestCounters);
        meta.GetValue(RequestCounters).Add(incoming);
    }
}

// Aggregates the transport request counters of `source` into this object.
// For every entry of TTransportRequestCounters::AllCounters the matching
// counter of `source` is passed to AggregateWith() of the local counter.
void TTransportDiskCounters::AggregateWith(const TTransportDiskCounters& source)
{
    for (auto meta: TTransportRequestCounters::AllCounters) {
        const auto& incoming = meta.GetValue(source.RequestCounters);
        meta.GetValue(RequestCounters).AggregateWith(incoming);
    }
}

void TTransportDiskCounters::Publish()
{
for (auto meta: TTransportRequestCounters::AllCounters) {
auto& counter = meta.GetValue(RequestCounters);
if (Policy == EPublishingPolicy::All ||
counter.PublishingPolicy == EPublishingPolicy::All ||
Policy == counter.PublishingPolicy)
{
counter.Publish();
}
}
Reset();
}

// Registers the transport request counters in the monitoring tree.
//
// counters  - parent group; each counter is registered in its
//             ("request", <counter name>) subgroup.
// aggregate - when true, ERequestCounterOption::ReportHistogram is added to
//             the options passed to every counter.
//
// Only counters that pass the publishing-policy filter are registered.
void TTransportDiskCounters::Register(
    NMonitoring::TDynamicCountersPtr counters,
    bool aggregate)
{
    ERequestCounterOptions baseOptions;
    if (aggregate) {
        baseOptions = baseOptions | ERequestCounterOption::ReportHistogram;
    }

    // A counter is registered when either side's policy is All or when both
    // policies are equal.
    const auto passesPolicy = [this](const auto& item) {
        return Policy == EPublishingPolicy::All ||
               item.PublishingPolicy == EPublishingPolicy::All ||
               Policy == item.PublishingPolicy;
    };

    for (auto meta: TTransportRequestCounters::AllCounters) {
        auto& current = meta.GetValue(RequestCounters);
        if (!passesPolicy(current)) {
            continue;
        }

        // Per-counter options are combined with the common ones.
        current.Register(
            counters->GetSubgroup("request", TString(meta.Name)),
            baseOptions | current.CounterOption);
    }
}

// Calls Reset() on every counter listed in
// TTransportRequestCounters::AllCounters.
void TTransportDiskCounters::Reset()
{
    for (auto meta: TTransportRequestCounters::AllCounters) {
        meta.GetValue(RequestCounters).Reset();
    }
}

////////////////////////////////////////////////////////////////////////////////

TVolumeSelfCountersPtr CreateVolumeSelfCounters(EPublishingPolicy policy)
{
return std::make_unique<TVolumeSelfCounters>(policy);
Expand Down
42 changes: 42 additions & 0 deletions cloud/blockstore/libs/storage/core/disk_counters.h
Original file line number Diff line number Diff line change
Expand Up @@ -636,6 +636,27 @@ static_assert(
sizeof(TVolumeSelfRequestCounters::TCounter) *
std::size(TVolumeSelfRequestCounters::AllCounters));

// Set of request counters kept per transport: one counter for block reads and
// one for block writes. TPartitionDiskCounters holds one instance of this
// struct for Rdma and one for Interconnect.
struct TTransportRequestCounters
{
// Counter type: request counters over a histogram with TRequestUsTimeBuckets,
// wrapped so that it can be described by a TMemberMeta entry.
using TCounter =
TMemberWithMeta<TRequestCounters<THistogram<TRequestUsTimeBuckets>>>;
// Metadata for a pointer-to-member of this struct.
using TMeta = TMemberMeta<TCounter TTransportRequestCounters::*>;

// Both counters are constructed with EPublishingPolicy::All.
TCounter TransportReadBlocks{EPublishingPolicy::All};
TCounter TransportWriteBlocks{
EPublishingPolicy::All
};

// Metadata for every counter member; the Add/AggregateWith/Publish/Register/
// Reset loops iterate over this array.
static constexpr TMeta AllCounters[] = {
MakeMeta<&TTransportRequestCounters::TransportReadBlocks>(),
MakeMeta<&TTransportRequestCounters::TransportWriteBlocks>(),
};
};

// Compile-time check that the size of TTransportRequestCounters equals the
// size of one TCounter multiplied by the number of AllCounters entries, i.e.
// the struct's data members and the AllCounters list stay in sync.
static_assert(
sizeof(TTransportRequestCounters) ==
(sizeof(TTransportRequestCounters::TCounter) *
std::size(TTransportRequestCounters::AllCounters)));
////////////////////////////////////////////////////////////////////////////////

struct TPartitionDiskCounters
Expand All @@ -644,6 +665,8 @@ struct TPartitionDiskCounters
TCumulativeDiskCounters Cumulative;
THistogramRequestCounters RequestCounters;
THistogramCounters Histogram;
TTransportRequestCounters Rdma;
TTransportRequestCounters Interconnect;

EPublishingPolicy Policy;

Expand Down Expand Up @@ -681,6 +704,25 @@ struct TVolumeSelfCounters

////////////////////////////////////////////////////////////////////////////////

struct TTransportDiskCounters
{
TTransportRequestCounters RequestCounters;

EPublishingPolicy Policy;

explicit TTransportDiskCounters(EPublishingPolicy policy)
: Policy(policy)
{}

void Add(const TTransportDiskCounters& source);
void AggregateWith(const TTransportDiskCounters& source);
void Register(NMonitoring::TDynamicCountersPtr counters, bool aggregate);
void Publish();
void Reset();
};

////////////////////////////////////////////////////////////////////////////////

// std::unique_ptr aliases for the counter holders declared in this header.
using TPartitionDiskCountersPtr = std::unique_ptr<TPartitionDiskCounters>;
using TVolumeSelfCountersPtr = std::unique_ptr<TVolumeSelfCounters>;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,7 @@ void TNonreplicatedPartitionActor::HandleReadBlocksCompleted(
const auto time = CyclesToDurationSafe(msg->TotalCycles).MicroSeconds();
PartCounters->RequestCounters.ReadBlocks.AddRequest(time, requestBytes);

PartCounters->Interconnect.TransportReadBlocks.AddRequest(time, requestBytes);
PartCounters->RequestCounters.ReadBlocks.RequestNonVoidBytes +=
static_cast<ui64>(msg->NonVoidBlockCount) * PartConfig->GetBlockSize();
PartCounters->RequestCounters.ReadBlocks.RequestVoidBytes +=
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,7 @@ void TNonreplicatedPartitionActor::HandleWriteBlocksCompleted(
* PartConfig->GetBlockSize();
const auto time = CyclesToDurationSafe(msg->TotalCycles).MicroSeconds();
PartCounters->RequestCounters.WriteBlocks.AddRequest(time, requestBytes);
PartCounters->Interconnect.TransportWriteBlocks.AddRequest(time, requestBytes);
NetworkBytes += requestBytes;
CpuUsage += CyclesToDurationSafe(msg->ExecCycles);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,8 @@ void TNonreplicatedPartitionRdmaActor::HandleReadBlocksCompleted(
* PartConfig->GetBlockSize();
const auto time = CyclesToDurationSafe(msg->TotalCycles).MicroSeconds();
PartCounters->RequestCounters.ReadBlocks.AddRequest(time, requestBytes);
PartCounters->Rdma.TransportReadBlocks.AddRequest(time, requestBytes);

PartCounters->RequestCounters.ReadBlocks.RequestNonVoidBytes +=
static_cast<ui64>(msg->NonVoidBlockCount) * PartConfig->GetBlockSize();
PartCounters->RequestCounters.ReadBlocks.RequestVoidBytes +=
Expand Down Expand Up @@ -371,6 +373,7 @@ void TNonreplicatedPartitionRdmaActor::HandleWriteBlocksCompleted(
* PartConfig->GetBlockSize();
const auto time = CyclesToDurationSafe(msg->TotalCycles).MicroSeconds();
PartCounters->RequestCounters.WriteBlocks.AddRequest(time, requestBytes);
PartCounters->Rdma.TransportWriteBlocks.AddRequest(time, requestBytes);
NetworkBytes += requestBytes;
CpuUsage += CyclesToDurationSafe(msg->ExecCycles);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,8 @@ Y_UNIT_TEST_SUITE(TNonreplicatedPartitionRdmaTest)
runtime.DispatchEvents({}, TDuration::Seconds(1));

auto& counters = env.StorageStatsServiceState->Counters.RequestCounters;
auto& rdmaCounters = env.StorageStatsServiceState->Counters.Rdma;

UNIT_ASSERT_VALUES_EQUAL(3, counters.ReadBlocks.Count);
UNIT_ASSERT_VALUES_EQUAL(
DefaultBlockSize * (
Expand All @@ -324,6 +326,19 @@ Y_UNIT_TEST_SUITE(TNonreplicatedPartitionRdmaTest)
counters.WriteBlocks.RequestBytes
);

UNIT_ASSERT_VALUES_EQUAL(
rdmaCounters.TransportReadBlocks.Count,
counters.ReadBlocks.Count);
UNIT_ASSERT_VALUES_EQUAL(
rdmaCounters.TransportWriteBlocks.Count,
counters.WriteBlocks.Count);
UNIT_ASSERT_VALUES_EQUAL(
rdmaCounters.TransportReadBlocks.RequestBytes,
counters.ReadBlocks.RequestBytes);
UNIT_ASSERT_VALUES_EQUAL(
rdmaCounters.TransportWriteBlocks.RequestBytes,
counters.WriteBlocks.RequestBytes);

UNIT_ASSERT_VALUES_EQUAL(
0,
env.StorageStatsServiceState->Counters.Simple.IORequestsInFlight.Value
Expand Down Expand Up @@ -386,6 +401,7 @@ Y_UNIT_TEST_SUITE(TNonreplicatedPartitionRdmaTest)
runtime.DispatchEvents({}, TDuration::Seconds(1));

auto& counters = env.StorageStatsServiceState->Counters.RequestCounters;
auto& rdmaCounters = env.StorageStatsServiceState->Counters.Rdma;
UNIT_ASSERT_VALUES_EQUAL(2, counters.ReadBlocks.Count);
UNIT_ASSERT_VALUES_EQUAL(
DefaultBlockSize * (
Expand All @@ -401,6 +417,19 @@ Y_UNIT_TEST_SUITE(TNonreplicatedPartitionRdmaTest)
counters.WriteBlocks.RequestBytes
);

UNIT_ASSERT_VALUES_EQUAL(
rdmaCounters.TransportReadBlocks.Count,
counters.ReadBlocks.Count);
UNIT_ASSERT_VALUES_EQUAL(
rdmaCounters.TransportWriteBlocks.Count,
counters.WriteBlocks.Count);
UNIT_ASSERT_VALUES_EQUAL(
rdmaCounters.TransportReadBlocks.RequestBytes,
counters.ReadBlocks.RequestBytes);
UNIT_ASSERT_VALUES_EQUAL(
rdmaCounters.TransportWriteBlocks.RequestBytes,
counters.WriteBlocks.RequestBytes);

UNIT_ASSERT_VALUES_EQUAL(
0,
env.StorageStatsServiceState->Counters.Simple.IORequestsInFlight.Value
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -427,6 +427,11 @@ Y_UNIT_TEST_SUITE(TNonreplicatedPartitionTest)
runtime.DispatchEvents({}, TDuration::Seconds(1));

auto& counters = env.StorageStatsServiceState->Counters.RequestCounters;
auto& transportCounters =
env.StorageStatsServiceState->Counters.Interconnect;
UNIT_ASSERT_VALUES_EQUAL(
transportCounters.TransportReadBlocks.Count,
counters.ReadBlocks.Count);
UNIT_ASSERT_VALUES_EQUAL(2, counters.ReadBlocks.Count);
UNIT_ASSERT_VALUES_EQUAL(
DefaultBlockSize * (
Expand Down Expand Up @@ -493,6 +498,11 @@ Y_UNIT_TEST_SUITE(TNonreplicatedPartitionTest)
runtime.DispatchEvents({}, TDuration::Seconds(1));

auto& counters = env.StorageStatsServiceState->Counters.RequestCounters;
auto& transportCounters =
env.StorageStatsServiceState->Counters.Interconnect;
UNIT_ASSERT_VALUES_EQUAL(
transportCounters.TransportWriteBlocks.Count,
counters.WriteBlocks.Count);
UNIT_ASSERT_VALUES_EQUAL(1, counters.WriteBlocks.Count);
UNIT_ASSERT_VALUES_EQUAL(
DefaultBlockSize * 3072,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,23 @@ void TStatsServiceActor::RegisterCounters(const TActorContext& ctx)
State.GetSsdBlobCounters().Register(ssdCounters);
State.GetHddBlobCounters().Register(hddCounters);

State.GetRdmaSsdNonreplCounters().Register(
ssdNonreplCounters->GetSubgroup("transport", "RDMA"));
State.GetInterconnectSsdNonreplCounters().Register(
ssdNonreplCounters->GetSubgroup("transport", "Interconnect"));
State.GetRdmaHddNonreplCounters().Register(
hddNonreplCounters->GetSubgroup("transport", "RDMA"));
State.GetInterconnectHddNonreplCounters().Register(
hddNonreplCounters->GetSubgroup("transport", "Interconnect"));
State.GetRdmaSsdMirror2Counters().Register(
ssdMirror2Counters->GetSubgroup("transport", "RDMA"));
State.GetInterconnectSsdMirror2Counters().Register(
ssdMirror2Counters->GetSubgroup("transport", "Interconnect"));
State.GetRdmaSsdMirror3Counters().Register(
ssdMirror3Counters->GetSubgroup("transport", "RDMA"));
State.GetInterconnectSsdMirror3Counters().Register(
ssdMirror3Counters->GetSubgroup("transport", "Interconnect"));

YDbFailedRequests = totalCounters->GetCounter("Ydb/FailedRequests", true);
FailedPartitionBoots = totalCounters->GetCounter("FailedBoots", true);

Expand Down
Loading

0 comments on commit 1fcd222

Please sign in to comment.