diff --git a/assets/etcd-on-cluster-dashboard/panels.libsonnet b/assets/etcd-on-cluster-dashboard/panels.libsonnet
new file mode 100644
index 0000000..483c98e
--- /dev/null
+++ b/assets/etcd-on-cluster-dashboard/panels.libsonnet
@@ -0,0 +1,126 @@
+// Panel builders shared by the etcd-on-cluster dashboard.
+local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
+
+{
+  // Time-series panel variants; they differ only in legend calcs and display mode.
+  timeSeries: {
+    local timeSeries = g.panel.timeSeries,
+    local custom = timeSeries.fieldConfig.defaults.custom,
+    local options = timeSeries.options,
+
+    // Common time-series panel: Prometheus datasource with uid '$Datasource',
+    // positioned from gridPos (an object with x, y, w and h fields).
+    base(title, unit, targets, gridPos):
+      timeSeries.new(title)
+      + timeSeries.queryOptions.withTargets(targets)
+      + timeSeries.datasource.withType('prometheus')
+      + timeSeries.datasource.withUid('$Datasource')
+      + timeSeries.standardOptions.withUnit(unit)
+      + timeSeries.gridPos.withX(gridPos.x)
+      + timeSeries.gridPos.withY(gridPos.y)
+      + timeSeries.gridPos.withH(gridPos.h)
+      + timeSeries.gridPos.withW(gridPos.w)
+      + custom.withDrawStyle("line")
+      + custom.withLineInterpolation("linear")
+      + custom.withBarAlignment(0)
+      + custom.withLineWidth(1)
+      + custom.withFillOpacity(10)
+      + custom.withGradientMode("none")
+      + custom.withSpanNulls(false)
+      + custom.withPointSize(5)
+      + custom.stacking.withMode("none")
+      + custom.withShowPoints('never')
+      + options.tooltip.withMode('multi')
+      + options.tooltip.withSort('desc')
+      + options.legend.withShowLegend(true)
+      + options.legend.withPlacement('bottom'),
+
+    // base() with a 'table' legend listing the 'mean' and 'max' calcs.
+    generalUsageAgg(title, unit, targets, gridPos):
+      self.base(title, unit, targets, gridPos)
+      + options.legend.withCalcs([
+        'mean',
+        'max'
+      ])
+      + options.legend.withDisplayMode('table'),
+
+    // base() with a 'table' legend and no calcs.
+    withoutCalcsAgg(title, unit, targets, gridPos):
+      self.base(title, unit, targets, gridPos)
+      + options.legend.withCalcs([])
+      + options.legend.withDisplayMode('table'),
+
+    // base() with a 'list' legend listing the 'mean' and 'max' calcs.
+    GeneralInfoAgg(title, unit, targets, gridPos):
+      self.base(title, unit, targets, gridPos)
+      + options.legend.withCalcs([
+        'mean',
+        'max'
+      ])
+      + options.legend.withDisplayMode('list'),
+
+    // base() with a 'list' legend and no calcs.
+    GeneralInfo(title, unit, targets, gridPos):
+      self.base(title, unit, targets, gridPos)
+      + options.legend.withCalcs([])
+      + options.legend.withDisplayMode('list'),
+  },
+
+  // Stat panel variants.
+  stat: {
+    local stat = g.panel.stat,
+    local options = stat.options,
+
+    // Common stat panel: graph mode and color mode are both 'none'.
+    base(title, unit, targets, gridPos):
+      stat.new(title)
+      + stat.datasource.withType('prometheus')
+      + stat.datasource.withUid('$Datasource')
+      + stat.standardOptions.withUnit(unit)
+      + stat.queryOptions.withTargets(targets)
+      + stat.gridPos.withX(gridPos.x)
+      + stat.gridPos.withY(gridPos.y)
+      + stat.gridPos.withH(gridPos.h)
+      + stat.gridPos.withW(gridPos.w)
+      + options.withJustifyMode("auto")
+      + options.withGraphMode("none")
+      + options.text.withTitleSize(12)
+      + stat.standardOptions.color.withMode('thresholds')
+      + options.withColorMode('none'),
+
+    // base() reduced with 'mean'; value 0 is displayed as NO and 1 as YES.
+    etcdLeader(title, unit, target, gridPos):
+      self.base(title, unit, target, gridPos)
+      + stat.options.reduceOptions.withCalcs([
+        'mean'
+      ])
+      + stat.standardOptions.withMappings({
+        "type": "value",
+        "options": {
+          "0": {
+            "text": "NO"
+          },
+          "1": {
+            "text": "YES"
+          }
+        }
+      }),
+
+    // base() reduced with 'mean'; a null value is displayed as N/A.
+    failedProposalsSeen(title, unit, target, gridPos):
+      self.base(title, unit, target, gridPos)
+      + stat.options.reduceOptions.withCalcs([
+        'mean'
+      ])
+      + stat.standardOptions.withMappings(
+        {
+          "type": "special",
+          "options": {
+            "match": "null",
+            "result": {
+              "text": "N/A"
+            }
+          }
+        }),
+  }
+}
\ No newline at end of file
diff --git a/assets/etcd-on-cluster-dashboard/queries.libsonnet b/assets/etcd-on-cluster-dashboard/queries.libsonnet
new file mode 100644
index 0000000..483e4f1
--- /dev/null
+++ b/assets/etcd-on-cluster-dashboard/queries.libsonnet
@@ -0,0 +1,154 @@
+// PromQL targets used by the etcd-on-cluster dashboard panels.
+local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
+local variables = import './variables.libsonnet';
+
+// Wraps one PromQL expression in a single-element array holding a Prometheus
+// target, so that several queries can be joined with `+` into one target list.
+local generateTimeSeriesQuery(query, legend) = [
+  local prometheusQuery = g.query.prometheus;
+  prometheusQuery.new('$'+variables.Datasource.name, query)
+  + prometheusQuery.withFormat('time_series')
+  + prometheusQuery.withIntervalFactor(2)
+  + prometheusQuery.withLegendFormat(legend),
+];
+
+// Each entry exposes query(), which returns the array of targets for one panel.
+{
+  CPUUsage: {
+    query():
+      generateTimeSeriesQuery('sum(irate(container_cpu_usage_seconds_total{namespace="openshift-etcd", container="etcd"}[2m])) by (pod) * 100','{{ pod }}')
+  },
+
+  memoryUsage: {
+    query():
+      generateTimeSeriesQuery('sum(avg_over_time(container_memory_working_set_bytes{container="",pod!="", namespace=~"openshift-etcd.*"}[2m])) BY (pod, namespace)','{{ pod }}')
+  },
+
+  diskWalSyncDuration: {
+    query():
+      generateTimeSeriesQuery('histogram_quantile(0.99, sum(irate(etcd_disk_wal_fsync_duration_seconds_bucket{namespace="openshift-etcd"}[2m])) by (pod, le))','{{pod}} WAL fsync')
+  },
+
+  diskBackendSyncDuration: {
+    query():
+      generateTimeSeriesQuery('histogram_quantile(0.99, sum(irate(etcd_disk_backend_commit_duration_seconds_bucket{namespace="openshift-etcd"}[2m])) by (pod, le))','{{pod}} DB fsync')
+  },
+
+  etcdContainerDiskWrites: {
+    query():
+      generateTimeSeriesQuery('rate(container_fs_writes_bytes_total{namespace="openshift-etcd",container="etcd",device!~".+dm.+"}[2m])','{{ pod }}: {{ device }}')
+  },
+
+  dbSize: {
+    query():
+      generateTimeSeriesQuery('etcd_mvcc_db_total_size_in_bytes{namespace="openshift-etcd"}','{{pod}} DB physical size')
+      + generateTimeSeriesQuery('etcd_mvcc_db_total_size_in_use_in_bytes{namespace="openshift-etcd"}','{{pod}} DB logical size')
+  },
+
+  containerNetworkTraffic: {
+    query():
+      generateTimeSeriesQuery('sum(rate(container_network_receive_bytes_total{ container="etcd", namespace=~"openshift-etcd.*"}[2m])) BY (namespace, pod)','rx {{ pod }}')
+      + generateTimeSeriesQuery('sum(rate(container_network_transmit_bytes_total{ container="etcd", namespace=~"openshift-etcd.*"}[2m])) BY (namespace, pod)','tx {{ pod }}')
+  },
+
+  p99PeerToPeerLatency: {
+    query():
+      generateTimeSeriesQuery('histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{namespace="openshift-etcd"}[2m]))','{{pod}}')
+  },
+
+  peerNetworkTraffic: {
+    query():
+      generateTimeSeriesQuery('rate(etcd_network_peer_received_bytes_total{namespace="openshift-etcd"}[2m])','rx {{pod}} Peer Traffic')
+      + generateTimeSeriesQuery('rate(etcd_network_peer_sent_bytes_total{namespace="openshift-etcd"}[2m])','tx {{pod}} Peer Traffic')
+  },
+
+  gRPCNetworkTraffic: {
+    query():
+      generateTimeSeriesQuery('rate(etcd_network_client_grpc_received_bytes_total{namespace="openshift-etcd"}[2m])','rx {{pod}}')
+      + generateTimeSeriesQuery('rate(etcd_network_client_grpc_sent_bytes_total{namespace="openshift-etcd"}[2m])','tx {{pod}}')
+  },
+
+  activeStreams: {
+    query():
+      generateTimeSeriesQuery('sum(grpc_server_started_total{namespace="openshift-etcd",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{namespace="openshift-etcd",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"})','Watch Streams')
+      + generateTimeSeriesQuery('sum(grpc_server_started_total{namespace="openshift-etcd",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{namespace="openshift-etcd",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"})','Lease Streams')
+  },
+
+  snapshotDuration: {
+    query():
+      generateTimeSeriesQuery('sum(rate(etcd_debugging_snap_save_total_duration_seconds_sum{namespace="openshift-etcd"}[2m]))','the total latency distributions of save called by snapshot')
+  },
+
+  dbSpaceUsed: {
+    query():
+      generateTimeSeriesQuery('(etcd_mvcc_db_total_size_in_bytes{namespace="openshift-etcd"} / etcd_server_quota_backend_bytes{namespace="openshift-etcd"})*100','{{pod}}')
+  },
+
+  dbLeftCapacity: {
+    query():
+      generateTimeSeriesQuery('etcd_server_quota_backend_bytes{namespace="openshift-etcd"} - etcd_mvcc_db_total_size_in_bytes{namespace="openshift-etcd"}','{{pod}}')
+  },
+
+  dbSizeLimit: {
+    query():
+      generateTimeSeriesQuery('etcd_server_quota_backend_bytes{namespace="openshift-etcd"}','{{ pod }} Quota Bytes')
+  },
+
+  raftProposals: {
+    query():
+      generateTimeSeriesQuery('sum(rate(etcd_server_proposals_failed_total{namespace="openshift-etcd"}[2m]))','Proposal Failure Rate')
+      + generateTimeSeriesQuery('sum(etcd_server_proposals_pending{namespace="openshift-etcd"})','Proposal Pending Total')
+      + generateTimeSeriesQuery('sum(rate(etcd_server_proposals_committed_total{namespace="openshift-etcd"}[2m]))','Proposal Commit Rate')
+      + generateTimeSeriesQuery('sum(rate(etcd_server_proposals_applied_total{namespace="openshift-etcd"}[2m]))','Proposal Apply Rate')
+  },
+
+  numberOfLeaderChangesSeen: {
+    query():
+      generateTimeSeriesQuery('sum(rate(etcd_server_leader_changes_seen_total{namespace="openshift-etcd"}[2m]))','')
+  },
+
+  etcdHasALeader: {
+    query():
+      generateTimeSeriesQuery('max(etcd_server_has_leader{namespace="openshift-etcd"})','')
+  },
+
+  // NOTE(review): the dashboard feeds this into the panel titled "Total number of
+  // failed proposals seen", yet the metric is proposals *committed* - confirm intent.
+  totalNumberOfProposalsSeen: {
+    query():
+      generateTimeSeriesQuery('max(etcd_server_proposals_committed_total{namespace="openshift-etcd"})','')
+  },
+
+  keys: {
+    query():
+      generateTimeSeriesQuery('etcd_debugging_mvcc_keys_total{namespace="openshift-etcd"}','{{ pod }} Num keys')
+  },
+
+  leaderElectionsPerDay: {
+    query():
+      generateTimeSeriesQuery('changes(etcd_server_leader_changes_seen_total{namespace="openshift-etcd"}[1d])','{{instance}} Total Leader Elections Per Day')
+  },
+
+  slowOperations: {
+    query():
+      generateTimeSeriesQuery('delta(etcd_server_slow_apply_total{namespace="openshift-etcd"}[2m])','{{ pod }} slow applies')
+      + generateTimeSeriesQuery('delta(etcd_server_slow_read_indexes_total{namespace="openshift-etcd"}[2m])','{{ pod }} slow read indexes')
+  },
+
+  keyOperations: {
+    query():
+      generateTimeSeriesQuery('rate(etcd_mvcc_put_total{namespace="openshift-etcd"}[2m])','{{ pod }} puts/s')
+      + generateTimeSeriesQuery('rate(etcd_mvcc_delete_total{namespace="openshift-etcd"}[2m])','{{ pod }} deletes/s')
+  },
+
+  heartBeatFailure: {
+    query():
+      generateTimeSeriesQuery('etcd_server_heartbeat_send_failures_total{namespace="openshift-etcd"}','{{ pod }} heartbeat failures')
+      + generateTimeSeriesQuery('etcd_server_health_failures{namespace="openshift-etcd"}','{{ pod }} health failures')
+  },
+
+  compactedKeys: {
+    query():
+      generateTimeSeriesQuery('etcd_debugging_mvcc_db_compaction_keys_total{namespace="openshift-etcd"}','{{ pod }} keys compacted')
+  }
+}
\ No newline at end of file
diff --git a/assets/etcd-on-cluster-dashboard/variables.libsonnet b/assets/etcd-on-cluster-dashboard/variables.libsonnet
new file mode 100644
index 0000000..ded8516
--- /dev/null
+++ b/assets/etcd-on-cluster-dashboard/variables.libsonnet
@@ -0,0 +1,14 @@
+// Template variables for the etcd-on-cluster dashboard.
+local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
+local var = g.dashboard.variable;
+
+{
+  // Datasource variable of type 'prometheus', single-select with no "All" option.
+  Datasource:
+    var.datasource.new('Datasource','prometheus')
+    + var.datasource.withRegex("")
+    + var.query.generalOptions.withLabel('Datasource')
+    + var.query.withRefresh(1)
+    + var.query.selectionOptions.withMulti(false)
+    + var.query.selectionOptions.withIncludeAll(false),
+}
\ No newline at end of file
diff --git a/templates/General/etcd-on-cluster-dashboard-v2.jsonnet b/templates/General/etcd-on-cluster-dashboard-v2.jsonnet
new file mode 100644
index 0000000..f2d08b3
--- /dev/null
+++ b/templates/General/etcd-on-cluster-dashboard-v2.jsonnet
@@ -0,0 +1,72 @@
+// etcd-on-cluster dashboard (v2), assembled from the shared panels, queries and variables.
+local panels = import '../../assets/etcd-on-cluster-dashboard/panels.libsonnet';
+local queries = import '../../assets/etcd-on-cluster-dashboard/queries.libsonnet';
+local variables = import '../../assets/etcd-on-cluster-dashboard/variables.libsonnet';
+local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
+
+g.dashboard.new('etcd-cluster-info dashboard')
++ g.dashboard.time.withFrom('now-1h')
++ g.dashboard.time.withTo('now')
++ g.dashboard.withTimezone('utc')
++ g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'])
++ g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'])
++ g.dashboard.withRefresh('')
++ g.dashboard.withEditable(false)
++ g.dashboard.graphTooltip.withSharedCrosshair()
++ g.dashboard.withVariables([
+  variables.Datasource,
+])
+
+// NOTE(review): every row below is given the same gridPos (y: 14); presumably
+// Grafana re-flows the collapsed rows when the dashboard loads - confirm.
++ g.dashboard.withPanels([
+  g.panel.row.new('General Resource Usage')
+  + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 })
+  + g.panel.row.withCollapsed(true)
+  + g.panel.row.withPanels([
+    panels.timeSeries.generalUsageAgg('CPU usage', 'percent', queries.CPUUsage.query(), { x: 0, y: 1, w: 12, h: 8 }),
+    panels.timeSeries.generalUsageAgg('Memory usage', 'bytes', queries.memoryUsage.query(), { x: 12, y: 1, w: 12, h: 8 }),
+    panels.timeSeries.generalUsageAgg('Disk WAL Sync Duration', 's', queries.diskWalSyncDuration.query(), { x: 0, y: 8, w: 12, h: 8 }),
+    panels.timeSeries.generalUsageAgg('Disk Backend Sync Duration', 's', queries.diskBackendSyncDuration.query(), { x: 12, y: 8, w: 12, h: 8 }),
+    panels.timeSeries.generalUsageAgg('Etcd container disk writes', 'Bps', queries.etcdContainerDiskWrites.query(), { x: 0, y: 16, w: 12, h: 8 }),
+    panels.timeSeries.generalUsageAgg('DB Size', 'bytes', queries.dbSize.query(), { x: 12, y: 16, w: 12, h: 8 }),
+  ]),
+
+  g.panel.row.new('Network Usage')
+  + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 })
+  + g.panel.row.withCollapsed(true)
+  + g.panel.row.withPanels([
+    panels.timeSeries.generalUsageAgg('Container network traffic', 'Bps', queries.containerNetworkTraffic.query(), { x: 0, y: 1, w: 12, h: 8 }),
+    panels.timeSeries.generalUsageAgg('p99 peer to peer latency', 's', queries.p99PeerToPeerLatency.query(), { x: 12, y: 1, w: 12, h: 8 }),
+    panels.timeSeries.generalUsageAgg('Peer network traffic', 'Bps', queries.peerNetworkTraffic.query(), { x: 0, y: 8, w: 12, h: 8 }),
+    panels.timeSeries.generalUsageAgg('gRPC network traffic', 'Bps', queries.gRPCNetworkTraffic.query(), { x: 12, y: 8, w: 12, h: 8 }),
+    panels.timeSeries.withoutCalcsAgg('Active Streams', '', queries.activeStreams.query(), { x: 0, y: 16, w: 12, h: 8 }),
+    panels.timeSeries.withoutCalcsAgg('Snapshot duration', 's', queries.snapshotDuration.query(), { x: 12, y: 16, w: 12, h: 8 }),
+  ]),
+
+  g.panel.row.new('DB Info per Member')
+  + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 })
+  + g.panel.row.withCollapsed(true)
+  + g.panel.row.withPanels([
+    panels.timeSeries.withoutCalcsAgg('% DB Space Used', 'percent', queries.dbSpaceUsed.query(), { x: 0, y: 8, w: 8, h: 8 }),
+    panels.timeSeries.withoutCalcsAgg('DB Left capacity (with fragmented space)', 'bytes', queries.dbLeftCapacity.query(), { x: 8, y: 8, w: 8, h: 8 }),
+    panels.timeSeries.withoutCalcsAgg('DB Size Limit (Backend-bytes)', 'bytes', queries.dbSizeLimit.query(), { x: 16, y: 8, w: 8, h: 8 }),
+  ]),
+
+  g.panel.row.new('General Info')
+  + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 })
+  + g.panel.row.withCollapsed(true)
+  + g.panel.row.withPanels([
+    panels.timeSeries.GeneralInfo('Raft Proposals', '', queries.raftProposals.query(), { x: 0, y: 1, w: 12, h: 8 }),
+    panels.timeSeries.GeneralInfo('Number of leader changes seen', '', queries.numberOfLeaderChangesSeen.query(), { x: 12, y: 1, w: 12, h: 8 }),
+    panels.stat.etcdLeader('Etcd has a leader?', 'none', queries.etcdHasALeader.query(), { x: 0, y: 8, w: 6, h: 2 }),
+    panels.stat.failedProposalsSeen('Total number of failed proposals seen', 'none', queries.totalNumberOfProposalsSeen.query(), { x: 6, y: 8, w: 6, h: 2 }),
+    panels.timeSeries.GeneralInfo('Keys', 'short', queries.keys.query(), { x: 12, y: 12, w: 12, h: 8 }),
+    panels.timeSeries.GeneralInfo('Leader Elections Per Day', 'short', queries.leaderElectionsPerDay.query(), { x: 0, y: 12, w: 12, h: 6 }),
+    panels.timeSeries.GeneralInfo('Slow Operations', 'ops', queries.slowOperations.query(), { x: 0, y: 20, w: 12, h: 8 }),
+    panels.timeSeries.GeneralInfo('Key Operations', 'ops', queries.keyOperations.query(), { x: 12, y: 20, w: 12, h: 8 }),
+    panels.timeSeries.GeneralInfo('Heartbeat Failures', 'short', queries.heartBeatFailure.query(), { x: 0, y: 28, w: 12, h: 8 }),
+    panels.timeSeries.GeneralInfo('Compacted Keys', 'short', queries.compactedKeys.query(), { x: 12, y: 28, w: 12, h: 8 }),
+  ]),
+
+])