Skip to content

Commit

Permalink
Merge pull request #102 from Manda-supraja26/etcd-cluster-info
Browse files Browse the repository at this point in the history
Adding etcd-on-cluster Dashboard to grafonnet
  • Loading branch information
vishnuchalla authored Feb 7, 2024
2 parents dbbd2ec + 92c01bb commit 2141887
Show file tree
Hide file tree
Showing 4 changed files with 345 additions and 0 deletions.
116 changes: 116 additions & 0 deletions assets/etcd-on-cluster-dashboard/panels.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';

{
timeSeries: {
  local timeSeries = g.panel.timeSeries,
  local custom = timeSeries.fieldConfig.defaults.custom,
  local options = timeSeries.options,

  // Legend mixin shared by every variant below: sets the list of legend
  // calculations and the legend display mode ('table' or 'list').
  local legend(displayMode, calcs) =
    options.legend.withCalcs(calcs)
    + options.legend.withDisplayMode(displayMode),

  // Common time-series panel.
  //   title   - panel title passed to timeSeries.new
  //   unit    - value passed to standardOptions.withUnit
  //   targets - query targets passed to queryOptions.withTargets
  //   gridPos - object providing x, y, h and w fields
  // The panel uses the prometheus data source selected by the '$Datasource'
  // dashboard variable, draws thin lines without points or stacking, and
  // shows a multi-series tooltip (sorted descending) with the legend at the
  // bottom.
  base(title, unit, targets, gridPos):
    timeSeries.new(title)
    + timeSeries.queryOptions.withTargets(targets)
    + timeSeries.datasource.withType('prometheus')
    + timeSeries.datasource.withUid('$Datasource')
    + timeSeries.standardOptions.withUnit(unit)
    + timeSeries.gridPos.withX(gridPos.x)
    + timeSeries.gridPos.withY(gridPos.y)
    + timeSeries.gridPos.withH(gridPos.h)
    + timeSeries.gridPos.withW(gridPos.w)
    + custom.withDrawStyle('line')
    + custom.withLineInterpolation('linear')
    + custom.withBarAlignment(0)
    + custom.withLineWidth(1)
    + custom.withFillOpacity(10)
    + custom.withGradientMode('none')
    // Applied once; the original applied this identical mixin twice.
    + custom.withSpanNulls(false)
    + custom.withPointSize(5)
    + custom.stacking.withMode('none')
    + custom.withShowPoints('never')
    + options.tooltip.withMode('multi')
    + options.tooltip.withSort('desc')
    + options.legend.withShowLegend(true)
    + options.legend.withPlacement('bottom'),

  // Base panel with a table legend listing 'mean' and 'max'.
  generalUsageAgg(title, unit, targets, gridPos):
    self.base(title, unit, targets, gridPos)
    + legend('table', ['mean', 'max']),

  // Base panel with a table legend and no calculations.
  withoutCalcsAgg(title, unit, targets, gridPos):
    self.base(title, unit, targets, gridPos)
    + legend('table', []),

  // Base panel with a list legend listing 'mean' and 'max'.
  GeneralInfoAgg(title, unit, targets, gridPos):
    self.base(title, unit, targets, gridPos)
    + legend('list', ['mean', 'max']),

  // Base panel with a list legend and no calculations.
  GeneralInfo(title, unit, targets, gridPos):
    self.base(title, unit, targets, gridPos)
    + legend('list', []),
},

stat: {
  local stat = g.panel.stat,
  local options = stat.options,

  // Both stat variants reduce their series with the 'mean' calculation.
  local meanReducer = options.reduceOptions.withCalcs(['mean']),

  // Common stat panel.
  //   title   - panel title passed to stat.new
  //   unit    - value passed to standardOptions.withUnit
  //   targets - query targets passed to queryOptions.withTargets
  //   gridPos - object providing x, y, h and w fields
  // The panel reads from the prometheus data source selected by the
  // '$Datasource' dashboard variable and renders the value without a graph
  // or value colouring.
  base(title, unit, targets, gridPos):
    stat.new(title)
    + stat.datasource.withType('prometheus')
    + stat.datasource.withUid('$Datasource')
    + stat.standardOptions.withUnit(unit)
    + stat.queryOptions.withTargets(targets)
    + stat.gridPos.withX(gridPos.x)
    + stat.gridPos.withY(gridPos.y)
    + stat.gridPos.withH(gridPos.h)
    + stat.gridPos.withW(gridPos.w)
    + options.withJustifyMode('auto')
    + options.withGraphMode('none')
    + options.text.withTitleSize(12)
    + stat.standardOptions.color.withMode('thresholds')
    + options.withColorMode('none'),

  // Stat panel whose value mapping shows 0 as "NO" and 1 as "YES".
  etcdLeader(title, unit, target, gridPos):
    self.base(title, unit, target, gridPos)
    + meanReducer
    + stat.standardOptions.withMappings({
      type: 'value',
      options: {
        '0': { text: 'NO' },
        '1': { text: 'YES' },
      },
    }),

  // Stat panel whose special-value mapping shows "N/A" for a null match.
  failedProposalsSeen(title, unit, target, gridPos):
    self.base(title, unit, target, gridPos)
    + meanReducer
    + stat.standardOptions.withMappings({
      type: 'special',
      options: {
        match: 'null',
        result: { text: 'N/A' },
      },
    }),
}
}
148 changes: 148 additions & 0 deletions assets/etcd-on-cluster-dashboard/queries.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
local variables = import './variables.libsonnet';

// Builds a single-element array holding one prometheus target.
//   query  - expression string passed to prometheus.new
//   legend - legend format string for the resulting series
// The data source argument is '$' followed by the name of the Datasource
// dashboard variable. Returning an array lets callers join several targets
// with `+`.
local generateTimeSeriesQuery(query, legend) =
  local prometheusQuery = g.query.prometheus;
  local datasourceRef = '$' + variables.Datasource.name;
  [
    prometheusQuery.new(datasourceRef, query)
    + prometheusQuery.withFormat('time_series')
    + prometheusQuery.withIntervalFactor(2)
    + prometheusQuery.withLegendFormat(legend),
  ];

// Query catalogue for the etcd dashboard. Every entry exposes `query()`,
// which returns an array of prometheus targets built by
// generateTimeSeriesQuery; entries with several series concatenate arrays.
{
  // Per-pod CPU usage of the etcd container, scaled by 100.
  CPUUsage: {
    query():
      generateTimeSeriesQuery(
        'sum(irate(container_cpu_usage_seconds_total{namespace="openshift-etcd", container="etcd"}[2m])) by (pod) * 100',
        '{{ pod }}'
      ),
  },

  // Working-set memory averaged over 2m, summed by pod and namespace.
  memoryUsage: {
    query():
      generateTimeSeriesQuery(
        'sum(avg_over_time(container_memory_working_set_bytes{container="",pod!="", namespace=~"openshift-etcd.*"}[2m])) BY (pod, namespace)',
        '{{ pod }}'
      ),
  },

  // 0.99 quantile of the WAL fsync duration histogram, per pod.
  diskWalSyncDuration: {
    query():
      generateTimeSeriesQuery(
        'histogram_quantile(0.99, sum(irate(etcd_disk_wal_fsync_duration_seconds_bucket{namespace="openshift-etcd"}[2m])) by (pod, le))',
        '{{pod}} WAL fsync'
      ),
  },

  // 0.99 quantile of the backend commit duration histogram, per pod.
  diskBackendSyncDuration: {
    query():
      generateTimeSeriesQuery(
        'histogram_quantile(0.99, sum(irate(etcd_disk_backend_commit_duration_seconds_bucket{namespace="openshift-etcd"}[2m])) by (pod, le))',
        '{{pod}} DB fsync'
      ),
  },

  // Filesystem write rate of the etcd container, excluding devices matching ".+dm.+".
  etcdContainerDiskWrites: {
    query():
      generateTimeSeriesQuery(
        'rate(container_fs_writes_bytes_total{namespace="openshift-etcd",container="etcd",device!~".+dm.+"}[2m])',
        '{{ pod }}: {{ device }}'
      ),
  },

  // Total DB size and in-use DB size.
  dbSize: {
    query():
      generateTimeSeriesQuery(
        'etcd_mvcc_db_total_size_in_bytes{namespace="openshift-etcd"}',
        '{{pod}} DB physical size'
      )
      + generateTimeSeriesQuery(
        'etcd_mvcc_db_total_size_in_use_in_bytes{namespace="openshift-etcd"}',
        '{{pod}} DB logical size'
      ),
  },

  // Container network receive and transmit rates.
  containerNetworkTraffic: {
    query():
      generateTimeSeriesQuery(
        'sum(rate(container_network_receive_bytes_total{ container="etcd", namespace=~"openshift-etcd.*"}[2m])) BY (namespace, pod)',
        'rx {{ pod }}'
      )
      + generateTimeSeriesQuery(
        'sum(rate(container_network_transmit_bytes_total{ container="etcd", namespace=~"openshift-etcd.*"}[2m])) BY (namespace, pod)',
        'tx {{ pod }}'
      ),
  },

  // 0.99 quantile of the peer round-trip-time histogram.
  p99PeerToPeerLatency: {
    query():
      generateTimeSeriesQuery(
        'histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{namespace="openshift-etcd"}[2m]))',
        '{{pod}}'
      ),
  },

  // Peer received and sent byte rates.
  peerNetworkTraffic: {
    query():
      generateTimeSeriesQuery(
        'rate(etcd_network_peer_received_bytes_total{namespace="openshift-etcd"}[2m])',
        'rx {{pod}} Peer Traffic'
      )
      + generateTimeSeriesQuery(
        'rate(etcd_network_peer_sent_bytes_total{namespace="openshift-etcd"}[2m])',
        'tx {{pod}} Peer Traffic'
      ),
  },

  // Client gRPC received and sent byte rates.
  gRPCNetworkTraffic: {
    query():
      generateTimeSeriesQuery(
        'rate(etcd_network_client_grpc_received_bytes_total{namespace="openshift-etcd"}[2m])',
        'rx {{pod}}'
      )
      + generateTimeSeriesQuery(
        'rate(etcd_network_client_grpc_sent_bytes_total{namespace="openshift-etcd"}[2m])',
        'tx {{pod}}'
      ),
  },

  // Started minus handled bidi streams for the Watch and Lease services.
  activeStreams: {
    query():
      generateTimeSeriesQuery(
        'sum(grpc_server_started_total{namespace="openshift-etcd",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{namespace="openshift-etcd",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"})',
        'Watch Streams'
      )
      + generateTimeSeriesQuery(
        'sum(grpc_server_started_total{namespace="openshift-etcd",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{namespace="openshift-etcd",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"})',
        'Lease Streams'
      ),
  },

  // Summed rate of the snapshot save total duration counter.
  snapshotDuration: {
    query():
      generateTimeSeriesQuery(
        'sum(rate(etcd_debugging_snap_save_total_duration_seconds_sum{namespace="openshift-etcd"}[2m]))',
        'the total latency distributions of save called by snapshot'
      ),
  },

  // Total DB size as a percentage of the backend quota.
  dbSpaceUsed: {
    query():
      generateTimeSeriesQuery(
        '(etcd_mvcc_db_total_size_in_bytes{namespace="openshift-etcd"} / etcd_server_quota_backend_bytes{namespace="openshift-etcd"})*100',
        '{{pod}}'
      ),
  },

  // Backend quota minus total DB size.
  dbLeftCapacity: {
    query():
      generateTimeSeriesQuery(
        'etcd_server_quota_backend_bytes{namespace="openshift-etcd"} - etcd_mvcc_db_total_size_in_bytes{namespace="openshift-etcd"}',
        '{{pod}}'
      ),
  },

  // Backend quota in bytes.
  dbSizeLimit: {
    query():
      generateTimeSeriesQuery(
        'etcd_server_quota_backend_bytes{namespace="openshift-etcd"}',
        '{{ pod }} Quota Bytes'
      ),
  },

  // Failed, pending, committed and applied proposal series.
  raftProposals: {
    query():
      generateTimeSeriesQuery(
        'sum(rate(etcd_server_proposals_failed_total{namespace="openshift-etcd"}[2m]))',
        'Proposal Failure Rate'
      )
      + generateTimeSeriesQuery(
        'sum(etcd_server_proposals_pending{namespace="openshift-etcd"})',
        'Proposal Pending Total'
      )
      + generateTimeSeriesQuery(
        'sum(rate(etcd_server_proposals_committed_total{namespace="openshift-etcd"}[2m]))',
        'Proposal Commit Rate'
      )
      + generateTimeSeriesQuery(
        'sum(rate(etcd_server_proposals_applied_total{namespace="openshift-etcd"}[2m]))',
        'Proposal Apply Rate'
      ),
  },

  // Summed rate of leader changes seen; empty legend format.
  numberOfLeaderChangesSeen: {
    query():
      generateTimeSeriesQuery(
        'sum(rate(etcd_server_leader_changes_seen_total{namespace="openshift-etcd"}[2m]))',
        ''
      ),
  },

  // Maximum of the has-leader gauge; empty legend format.
  etcdHasALeader: {
    query():
      generateTimeSeriesQuery(
        'max(etcd_server_has_leader{namespace="openshift-etcd"})',
        ''
      ),
  },

  // Maximum of the committed-proposals counter; empty legend format.
  totalNumberOfProposalsSeen: {
    query():
      generateTimeSeriesQuery(
        'max(etcd_server_proposals_committed_total{namespace="openshift-etcd"})',
        ''
      ),
  },

  // Total number of keys.
  keys: {
    query():
      generateTimeSeriesQuery(
        'etcd_debugging_mvcc_keys_total{namespace="openshift-etcd"}',
        '{{ pod }} Num keys'
      ),
  },

  // Number of changes of the leader-changes counter over a 1d window.
  leaderElectionsPerDay: {
    query():
      generateTimeSeriesQuery(
        'changes(etcd_server_leader_changes_seen_total{namespace="openshift-etcd"}[1d])',
        '{{instance}} Total Leader Elections Per Day'
      ),
  },

  // 2m delta of the slow-apply and slow-read-index counters.
  slowOperations: {
    query():
      generateTimeSeriesQuery(
        'delta(etcd_server_slow_apply_total{namespace="openshift-etcd"}[2m])',
        '{{ pod }} slow applies'
      )
      + generateTimeSeriesQuery(
        'delta(etcd_server_slow_read_indexes_total{namespace="openshift-etcd"}[2m])',
        '{{ pod }} slow read indexes'
      ),
  },

  // Put and delete rates.
  keyOperations: {
    query():
      generateTimeSeriesQuery(
        'rate(etcd_mvcc_put_total{namespace="openshift-etcd"}[2m])',
        '{{ pod }} puts/s'
      )
      + generateTimeSeriesQuery(
        'rate(etcd_mvcc_delete_total{namespace="openshift-etcd"}[2m])',
        '{{ pod }} deletes/s'
      ),
  },

  // Heartbeat send failures and health failures (raw series, no rate applied).
  heartBeatFailure: {
    query():
      generateTimeSeriesQuery(
        'etcd_server_heartbeat_send_failures_total{namespace="openshift-etcd"}',
        '{{ pod }} heartbeat failures'
      )
      + generateTimeSeriesQuery(
        'etcd_server_health_failures{namespace="openshift-etcd"}',
        '{{ pod }} health failures'
      ),
  },

  // Compacted keys counter (raw series, no rate applied).
  compactedKeys: {
    query():
      generateTimeSeriesQuery(
        'etcd_debugging_mvcc_db_compaction_keys_total{namespace="openshift-etcd"}',
        '{{ pod }} keys compacted'
      ),
  },
}
12 changes: 12 additions & 0 deletions assets/etcd-on-cluster-dashboard/variables.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
local var = g.dashboard.variable;

// Dashboard template variables.
{
  // Short aliases for the option namespaces used below. The original code
  // takes these from `var.query` even though the variable is created with
  // `var.datasource`; that choice is preserved here.
  local general = var.query.generalOptions,
  local selection = var.query.selectionOptions,

  // Data source variable created with the arguments 'Datasource' and
  // 'prometheus', an empty regex, the label 'Datasource', refresh value 1,
  // single selection only and no "include all" option.
  Datasource:
    var.datasource.new('Datasource', 'prometheus')
    + var.datasource.withRegex('')
    + general.withLabel('Datasource')
    + var.query.withRefresh(1)
    + selection.withMulti(false)
    + selection.withIncludeAll(false),
}
69 changes: 69 additions & 0 deletions templates/General/etcd-on-cluster-dashboard-v2.jsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
local panels = import '../../assets/etcd-on-cluster-dashboard/panels.libsonnet';
local queries = import '../../assets/etcd-on-cluster-dashboard/queries.libsonnet';
local variables = import '../../assets/etcd-on-cluster-dashboard/variables.libsonnet';
local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';

// Builds a collapsed dashboard row with the given title and child panels.
// Every row uses the same grid position, exactly as the original inline
// definitions did.
local collapsedRow(title, children) =
  g.panel.row.new(title)
  + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 })
  + g.panel.row.withCollapsed(true)
  + g.panel.row.withPanels(children);

// etcd dashboard: four collapsed rows of panels reading from the data source
// chosen through the Datasource variable. Default time range is the last
// hour, shown in UTC; the dashboard is not editable.
// Title typo fixed: 'dashoard' -> 'dashboard'.
g.dashboard.new('etcd-cluster-info dashboard')
+ g.dashboard.time.withFrom('now-1h')
+ g.dashboard.time.withTo('now')
+ g.dashboard.withTimezone('utc')
+ g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'])
+ g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'])
+ g.dashboard.withRefresh('')
+ g.dashboard.withEditable(false)
+ g.dashboard.graphTooltip.withSharedCrosshair()
+ g.dashboard.withVariables([
  variables.Datasource,
])
+ g.dashboard.withPanels([
  collapsedRow('General Resource Usage', [
    panels.timeSeries.generalUsageAgg('CPU usage', 'percent', queries.CPUUsage.query(), { x: 0, y: 1, w: 12, h: 8 }),
    panels.timeSeries.generalUsageAgg('Memory usage', 'bytes', queries.memoryUsage.query(), { x: 12, y: 1, w: 12, h: 8 }),
    panels.timeSeries.generalUsageAgg('Disk WAL Sync Duration', 's', queries.diskWalSyncDuration.query(), { x: 0, y: 8, w: 12, h: 8 }),
    panels.timeSeries.generalUsageAgg('Disk Backend Sync Duration', 's', queries.diskBackendSyncDuration.query(), { x: 12, y: 8, w: 12, h: 8 }),
    panels.timeSeries.generalUsageAgg('Etcd container disk writes', 'Bps', queries.etcdContainerDiskWrites.query(), { x: 0, y: 16, w: 12, h: 8 }),
    panels.timeSeries.generalUsageAgg('DB Size', 'bytes', queries.dbSize.query(), { x: 12, y: 16, w: 12, h: 8 }),
  ]),

  collapsedRow('Network Usage', [
    panels.timeSeries.generalUsageAgg('Container network traffic', 'Bps', queries.containerNetworkTraffic.query(), { x: 0, y: 1, w: 12, h: 8 }),
    panels.timeSeries.generalUsageAgg('p99 peer to peer latency', 's', queries.p99PeerToPeerLatency.query(), { x: 12, y: 1, w: 12, h: 8 }),
    panels.timeSeries.generalUsageAgg('Peer network traffic', 'Bps', queries.peerNetworkTraffic.query(), { x: 0, y: 8, w: 12, h: 8 }),
    panels.timeSeries.generalUsageAgg('gRPC network traffic', 'Bps', queries.gRPCNetworkTraffic.query(), { x: 12, y: 8, w: 12, h: 8 }),
    panels.timeSeries.withoutCalcsAgg('Active Streams', '', queries.activeStreams.query(), { x: 0, y: 16, w: 12, h: 8 }),
    panels.timeSeries.withoutCalcsAgg('Snapshot duration', 's', queries.snapshotDuration.query(), { x: 12, y: 16, w: 12, h: 8 }),
  ]),

  collapsedRow('DB Info per Member', [
    panels.timeSeries.withoutCalcsAgg('% DB Space Used', 'percent', queries.dbSpaceUsed.query(), { x: 0, y: 8, w: 8, h: 8 }),
    panels.timeSeries.withoutCalcsAgg('DB Left capacity (with fragmented space)', 'bytes', queries.dbLeftCapacity.query(), { x: 8, y: 8, w: 8, h: 8 }),
    panels.timeSeries.withoutCalcsAgg('DB Size Limit (Backend-bytes)', 'bytes', queries.dbSizeLimit.query(), { x: 16, y: 8, w: 8, h: 8 }),
  ]),

  collapsedRow('General Info', [
    panels.timeSeries.GeneralInfo('Raft Proposals', '', queries.raftProposals.query(), { x: 0, y: 1, w: 12, h: 8 }),
    panels.timeSeries.GeneralInfo('Number of leader changes seen', '', queries.numberOfLeaderChangesSeen.query(), { x: 12, y: 1, w: 12, h: 8 }),
    panels.stat.etcdLeader('Etcd has a leader?', 'none', queries.etcdHasALeader.query(), { x: 0, y: 8, w: 6, h: 2 }),
    // NOTE(review): the title says "failed proposals" but the query passed
    // here is named totalNumberOfProposalsSeen - confirm which metric is
    // intended.
    panels.stat.failedProposalsSeen('Total number of failed proposals seen', 'none', queries.totalNumberOfProposalsSeen.query(), { x: 6, y: 8, w: 6, h: 2 }),
    panels.timeSeries.GeneralInfo('Keys', 'short', queries.keys.query(), { x: 12, y: 12, w: 12, h: 8 }),
    panels.timeSeries.GeneralInfo('Leader Elections Per Day', 'short', queries.leaderElectionsPerDay.query(), { x: 0, y: 12, w: 12, h: 6 }),
    panels.timeSeries.GeneralInfo('Slow Operations', 'ops', queries.slowOperations.query(), { x: 0, y: 20, w: 12, h: 8 }),
    panels.timeSeries.GeneralInfo('Key Operations', 'ops', queries.keyOperations.query(), { x: 12, y: 20, w: 12, h: 8 }),
    panels.timeSeries.GeneralInfo('Heartbeat Failures', 'short', queries.heartBeatFailure.query(), { x: 0, y: 28, w: 12, h: 8 }),
    panels.timeSeries.GeneralInfo('Compacted Keys', 'short', queries.compactedKeys.query(), { x: 12, y: 28, w: 12, h: 8 }),
  ]),
])

0 comments on commit 2141887

Please sign in to comment.