Skip to content

Commit

Permalink
NBS-4748: migrate from solomon to monitoring
Browse files Browse the repository at this point in the history
  • Loading branch information
komarevtsev-d committed Jan 19, 2024
1 parent fc1dd1f commit e0e4fce
Show file tree
Hide file tree
Showing 19 changed files with 271 additions and 99 deletions.
31 changes: 28 additions & 3 deletions cloud/blockstore/config/diagnostics.proto
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,25 @@ message TVolumePerfSettings
optional uint32 CriticalFactor = 3;
};

////////////////////////////////////////////////////////////////////////////////
// Monitoring URL components

// Components from which links to monitoring dashboards are assembled.
// Stored in TDiagnosticsConfig and used for link generation on monpages.
message TMonitoringUrlData
{
// Monitoring cluster name (e.g. yandexcloud_prod_vla).
// Passed to dashboards as the "p.cluster" query parameter.
optional string MonitoringClusterName = 1;
// Monitoring host url. Every generated link starts with this value.
optional string MonitoringUrl = 2;
// Project name in monitoring. Becomes the "/projects/<name>" path segment
// of dashboard links.
optional string MonitoringProject = 3;

// Dashboard ids for monitoring URL formation. Each id is placed after the
// "/dashboards/" path segment of the corresponding link.
// Dashboard opened for a single volume (link carries "p.volume").
optional string MonitoringVolumeDashboard = 4;
// Dashboard opened for partition tablets of the current host.
optional string MonitoringPartitionDashboard = 5;
// Dashboard behind the "NBS Alerts dashboard" link.
optional string MonitoringNBSAlertsDashboard = 6;
// Dashboard behind the "NBS overview To TV" link.
optional string MonitoringNBSTVDashboard = 7;
};

////////////////////////////////////////////////////////////////////////////////

message TDiagnosticsConfig
Expand All @@ -72,8 +91,9 @@ message TDiagnosticsConfig
// Kikimr monitoring port.
optional string KikimrMonitoringPort = 4;

// [obsolete]
// Name of cluster in Solomon.
optional string SolomonClusterName = 5;
// optional string SolomonClusterName = 5;

// Kikimr monitoring port.
optional uint32 KikimrMonPort = 6;
Expand All @@ -94,17 +114,19 @@ message TDiagnosticsConfig
// HDD disk performance threshold
// optional TVolumePerfThreshold HddPerfThreshold = 15;

// [obsolete]
// Solomon host url.
optional string SolomonUrl = 16;
// optional string SolomonUrl = 16;

// Allow destructive LWTrace actions.
optional bool UnsafeLWTrace = 17;

// Path to LWTrace query file.
optional string LWTraceDebugInitializationQuery = 18;

// [obsolete]
// Project name in solomon.
optional string SolomonProject = 20;
// optional string SolomonProject = 20;

// Sampling rate for request tracking
optional uint32 SamplingRate = 21;
Expand Down Expand Up @@ -178,4 +200,7 @@ message TDiagnosticsConfig

// Performance measurements coefficients for HDD NRD.
optional TVolumePerfSettings HddNonreplPerfSettings = 46;

// Monitoring data necessary for link generation on monpages.
optional TMonitoringUrlData MonitoringUrlData = 47;
}
33 changes: 30 additions & 3 deletions cloud/blockstore/libs/diagnostics/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ namespace {
xxx(HostNameScheme, NProto::EHostNameScheme, NProto::EHostNameScheme::HOSTNAME_RAW )\
xxx(BastionNameSuffix, TString, "" )\
xxx(ViewerHostName, TString, "" )\
xxx(SolomonClusterName, TString, "" )\
xxx(KikimrMonPort, ui32, 8765 )\
xxx(NbsMonPort, ui32, 8766 )\
\
Expand All @@ -26,8 +25,6 @@ namespace {
\
xxx(ProfileLogTimeThreshold, TDuration, TDuration::Seconds(15) )\
xxx(UseAsyncLogger, bool, false )\
xxx(SolomonUrl, TString, "" )\
xxx(SolomonProject, TString, "nbs" )\
xxx(UnsafeLWTrace, bool, false )\
xxx(LWTraceDebugInitializationQuery, TString, "" )\
xxx(SsdPerfSettings, TVolumePerfSettings, {} )\
Expand All @@ -40,6 +37,7 @@ namespace {
xxx(ExpectedIoParallelism, ui32, 32 )\
xxx(CloudIdsWithStrictSLA, TVector<TString>, {} )\
xxx(LWTraceShuttleCount, ui32, 2000 )\
xxx(MonitoringUrlData, TMonitoringUrlData, {} )\
\
xxx(CpuWaitFilename, TString, "/sys/fs/cgroup/cpu/system.slice/nbs.service/cpuacct.wait" )\
\
Expand Down Expand Up @@ -85,6 +83,14 @@ ConvertValue<TVolumePerfSettings, NProto::TVolumePerfSettings>(
return TVolumePerfSettings(value);
}

// Builds the in-memory TMonitoringUrlData from its protobuf counterpart by
// delegating to the explicit converting constructor.
template <>
TMonitoringUrlData
ConvertValue<TMonitoringUrlData, NProto::TMonitoringUrlData>(
    const NProto::TMonitoringUrlData& value)
{
    return TMonitoringUrlData{value};
}

template <>
TRequestThresholds
ConvertValue<TRequestThresholds, TProtoRequestThresholds>(
Expand Down Expand Up @@ -115,6 +121,11 @@ bool IsEmpty(const NProto::TVolumePerfSettings& t)
return t.ByteSizeLong() == 0;
}

// A monitoring URL message is treated as empty when its serialized size is
// zero bytes.
bool IsEmpty(const NProto::TMonitoringUrlData& t)
{
    const auto serializedSize = t.ByteSizeLong();
    return serializedSize == 0;
}

template <typename T>
bool IsEmpty(const google::protobuf::RepeatedPtrField<T>& value)
{
Expand Down Expand Up @@ -253,6 +264,22 @@ void Out<NCloud::NBlockStore::TVolumePerfSettings>(
SerializeToTextFormat(v, out);
}

// Stream output for TMonitoringUrlData: every field is copied into the
// matching protobuf message, which is then written with
// SerializeToTextFormat.
template <>
void Out<NCloud::NBlockStore::TMonitoringUrlData>(
    IOutputStream& out,
    const NCloud::NBlockStore::TMonitoringUrlData& value)
{
    using TProtoUrlData = NCloud::NBlockStore::NProto::TMonitoringUrlData;

    TProtoUrlData proto;

    // Where the monitoring installation lives.
    proto.SetMonitoringClusterName(value.MonitoringClusterName);
    proto.SetMonitoringUrl(value.MonitoringUrl);
    proto.SetMonitoringProject(value.MonitoringProject);

    // Dashboard identifiers.
    proto.SetMonitoringVolumeDashboard(value.MonitoringVolumeDashboard);
    proto.SetMonitoringPartitionDashboard(value.MonitoringPartitionDashboard);
    proto.SetMonitoringNBSAlertsDashboard(value.MonitoringNBSAlertsDashboard);
    proto.SetMonitoringNBSTVDashboard(value.MonitoringNBSTVDashboard);

    SerializeToTextFormat(proto, out);
}

template <>
void Out<NCloud::TRequestThresholds>(
IOutputStream& out,
Expand Down
32 changes: 29 additions & 3 deletions cloud/blockstore/libs/diagnostics/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,34 @@ struct TVolumePerfSettings:

////////////////////////////////////////////////////////////////////////////////

// In-memory copy of NProto::TMonitoringUrlData: the pieces from which links
// to monitoring dashboards are assembled (host url, project, cluster and
// dashboard ids).
struct TMonitoringUrlData: public TAtomicRefCount<TMonitoringUrlData>
{
    // Project used whenever the configuration does not name one.
    static constexpr const char* DefaultMonitoringProject = "nbs";

    TString MonitoringClusterName;
    TString MonitoringUrl;
    TString MonitoringProject;
    TString MonitoringVolumeDashboard;
    TString MonitoringPartitionDashboard;
    TString MonitoringNBSAlertsDashboard;
    TString MonitoringNBSTVDashboard;

    // All fields are empty except the project, which gets the default.
    TMonitoringUrlData()
        : MonitoringProject(DefaultMonitoringProject)
    {}
    TMonitoringUrlData(const TMonitoringUrlData& rhs) = default;

    // Copies every field from the protobuf message.
    explicit TMonitoringUrlData(const NProto::TMonitoringUrlData& data)
        : MonitoringClusterName(data.GetMonitoringClusterName())
        , MonitoringUrl(data.GetMonitoringUrl())
        , MonitoringProject(data.GetMonitoringProject())
        , MonitoringVolumeDashboard(data.GetMonitoringVolumeDashboard())
        , MonitoringPartitionDashboard(data.GetMonitoringPartitionDashboard())
        , MonitoringNBSAlertsDashboard(data.GetMonitoringNBSAlertsDashboard())
        , MonitoringNBSTVDashboard(data.GetMonitoringNBSTVDashboard())
    {
        // A message that fills in some fields but leaves the project out
        // must behave like the default-constructed value; otherwise links
        // would contain an empty "/projects//" path segment.
        if (MonitoringProject.empty()) {
            MonitoringProject = DefaultMonitoringProject;
        }
    }
};

////////////////////////////////////////////////////////////////////////////////

class TDiagnosticsConfig
{
private:
Expand All @@ -81,9 +109,6 @@ class TDiagnosticsConfig
NProto::EHostNameScheme GetHostNameScheme() const;
TString GetBastionNameSuffix() const;
TString GetViewerHostName() const;
TString GetSolomonClusterName() const;
TString GetSolomonUrl() const;
TString GetSolomonProject() const;
ui32 GetKikimrMonPort() const;
ui32 GetNbsMonPort() const;
ui32 GetSamplingRate() const;
Expand All @@ -106,6 +131,7 @@ class TDiagnosticsConfig
TVolumePerfSettings GetLocalSSDPerfSettings() const;
ui32 GetExpectedIoParallelism() const;
TVector<TString> GetCloudIdsWithStrictSLA() const;
TMonitoringUrlData GetMonitoringUrlData() const;

TString GetCpuWaitFilename() const;

Expand Down
83 changes: 37 additions & 46 deletions cloud/blockstore/libs/diagnostics/hostname.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,74 +72,65 @@ TString GetExternalHostUrl(
return out;
}

TString GetSolomonVolumeUrl(
TString GetMonitoringVolumeUrl(
const TDiagnosticsConfig& config,
const TString& diskId,
const TString& dashboard)
const TString& diskId)
{
TMonitoringUrlData data = config.GetMonitoringUrlData();
return TStringBuilder()
<< config.GetSolomonUrl()
<< "/?project=" << config.GetSolomonProject()
<< "&service=service_volume"
<< "&cluster="<< config.GetSolomonClusterName()
<< "&volume=" << diskId
<< "&dashboard=" << dashboard;
<< data.MonitoringUrl << "/projects/" << data.MonitoringProject
<< "/dashboards/" << data.MonitoringVolumeDashboard
<< "?from=now-1d&to=now&refresh=60000&p.cluster="
<< data.MonitoringClusterName << "&p.volume=" << diskId;
}

TString GetSolomonPartitionUrl(
const TDiagnosticsConfig& config,
const TString& dashboard)
TString GetMonitoringPartitionUrl(const TDiagnosticsConfig& config)
{
TMonitoringUrlData data = config.GetMonitoringUrlData();
return TStringBuilder()
<< config.GetSolomonUrl()
<< "/?project=" << config.GetSolomonProject()
<< "&service=tablets"
<< "&cluster=" << config.GetSolomonClusterName()
<< "&host=" << GetShortHostName()
<< "&dashboard=" << dashboard;
<< data.MonitoringUrl << "/projects/" << data.MonitoringProject
<< "/dashboards/" << data.MonitoringPartitionDashboard
<< "?from=now-1d&to=now&"
"refresh=60000&p.service=tablets&p.cluster="
<< data.MonitoringClusterName << "&p.host=" << GetShortHostName();
}

TString GetSolomonServerUrl(
const TDiagnosticsConfig& config,
const TString& dashboard)
TString GetMonitoringNBSAlertsUrl(const TDiagnosticsConfig& config)
{
TMonitoringUrlData data = config.GetMonitoringUrlData();
return TStringBuilder()
<< config.GetSolomonUrl()
<< "/?project" << config.GetSolomonProject()
<< "&service=server"
<< "&cluster=" << config.GetSolomonClusterName()
<< "&host=" << GetShortHostName()
<< "&type=-"
<< "&dashboard="<< dashboard;
<< data.MonitoringUrl << "/projects/" << data.MonitoringProject
<< "/dashboards/" << data.MonitoringNBSAlertsDashboard
<< "?from=now-1d&to=now&refresh=60000&p.cluster="
<< data.MonitoringClusterName << "&p.host=" << GetShortHostName();
}

TString GetSolomonClientUrl(
const TDiagnosticsConfig& config,
const TString& dashboard)
TString GetMonitoringNBSOverviewToTVUrl(const TDiagnosticsConfig& config)
{
TMonitoringUrlData data = config.GetMonitoringUrlData();
return TStringBuilder()
<< config.GetSolomonUrl()
<< "/?project=" << config.GetSolomonProject()
<< "&service=client"
<< "&cluster="<< config.GetSolomonClusterName()
<< "&host=" << GetShortHostName()
<< "&type=-"
<< "&dashboard=" << dashboard;
<< data.MonitoringUrl << "/projects/" << data.MonitoringProject
<< "/dashboards/" << data.MonitoringNBSTVDashboard
<< "?from=now-1d&to=now&refresh=60000&p.cluster="
<< data.MonitoringClusterName << "&p.host=cluster";
}

TString GetSolomonBsProxyUrl(
const TDiagnosticsConfig& config,
ui32 groupId,
const TString& dashboard)
const TString& storagePool)
{
TMonitoringUrlData data = config.GetMonitoringUrlData();
return TStringBuilder()
<< config.GetSolomonUrl()
<< "/?project" << config.GetSolomonProject()
<< "&service=dsproxy_percentile"
<< "&cluster=" << config.GetSolomonClusterName()
<< "&host=" << GetShortHostName()
<< "&blobstorageproxy=" << groupId
<< "&dashboard=" << dashboard;
<< data.MonitoringUrl
<< "/projects/kikimr/explorer/"
"queries?q.0.s=histogram_percentile(99, {project=\"kikimr"
<< "\", cluster=\"" << data.MonitoringClusterName
<< "\", storagePool=\"" << storagePool << "\", group=\"" << groupId
<< "\", host=\"*\", service=\"vdisks\", "
"subsystem=\"latency_histo\", "
"handleclass=\"GetFast\"})&q.0.name=A&from=now-1d&to=now&refresh="
"60000";
}

} // namespace NCloud::NBlockStore
19 changes: 6 additions & 13 deletions cloud/blockstore/libs/diagnostics/hostname.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,26 +24,19 @@ TString GetExternalHostUrl(
EHostService serviceType,
const TDiagnosticsConfig& config);

TString GetSolomonServerUrl(
const TDiagnosticsConfig& config,
const TString& dashboard);
TString GetMonitoringNBSAlertsUrl(const TDiagnosticsConfig& config);

TString GetSolomonClientUrl(
const TDiagnosticsConfig& config,
const TString& dashboard);
TString GetMonitoringNBSOverviewToTVUrl(const TDiagnosticsConfig& config);

TString GetSolomonVolumeUrl(
TString GetMonitoringVolumeUrl(
const TDiagnosticsConfig& config,
const TString& diskId,
const TString& dashboard);
const TString& diskId);

TString GetSolomonPartitionUrl(
const TDiagnosticsConfig& config,
const TString& dashboard);
TString GetMonitoringPartitionUrl(const TDiagnosticsConfig& config);

TString GetSolomonBsProxyUrl(
const TDiagnosticsConfig& config,
ui32 groupId,
const TString& dashboard);
const TString& storagePool);

} // namespace NCloud::NBlockStore
10 changes: 5 additions & 5 deletions cloud/blockstore/libs/diagnostics/server_stats.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -601,15 +601,15 @@ void TServerStats::OutputHtml(IOutputStream& out, const IMonHttpRequest& request
if (DiagnosticsConfig) {
TAG(TH3) {
out << "<a href='"
<< GetSolomonServerUrl(*DiagnosticsConfig, "nbs-server-monitoring")
<< "'>Server dashboards</a>";
<< GetMonitoringNBSAlertsUrl(*DiagnosticsConfig)
<< "'>NBS Alerts dashboard</a>";
};

TAG(TH3) {
out << "<a href='"
<< GetSolomonClientUrl(*DiagnosticsConfig, "nbs-compute-client-monitoring")
<< "'>Client dashboards</a>";
};
<< GetMonitoringNBSOverviewToTVUrl(*DiagnosticsConfig)
<< "'>NBS overview To TV</a>";
}
}

TAG(TH3) { out << "Config"; }
Expand Down
8 changes: 3 additions & 5 deletions cloud/blockstore/libs/storage/core/monitoring_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -968,15 +968,14 @@ void DumpCompactionMap(
}
}

void DumpSolomonVolumeLink(
void DumpMonitoringVolumeLink(
IOutputStream& out,
const TDiagnosticsConfig& config,
const TString& diskId)
{
HTML(out) {
TAG(TH3) {
out << "<a href='"
<< GetSolomonVolumeUrl(config, diskId, "nbs-volume-overview")
out << "<a href='" << GetMonitoringVolumeUrl(config, diskId)
<< "'>Volume dashboards</a>";
}
}
Expand All @@ -988,8 +987,7 @@ void DumpSolomonPartitionLink(
{
HTML(out) {
TAG(TH3) {
out << "<a href='"
<< GetSolomonPartitionUrl(config, "nbs-tablets-transactions")
out << "<a href='" << GetMonitoringPartitionUrl(config)
<< "'>Partition dashboards</a>";
}
}
Expand Down
2 changes: 1 addition & 1 deletion cloud/blockstore/libs/storage/core/monitoring_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ void DumpCompactionMap(
const TVector<TCompactionCounter>& items,
const ui32 rangeSize);

void DumpSolomonVolumeLink(
void DumpMonitoringVolumeLink(
IOutputStream& out,
const TDiagnosticsConfig& config,
const TString& diskId);
Expand Down
Loading

0 comments on commit e0e4fce

Please sign in to comment.