diff --git a/pkg/manager/router/backend_observer.go b/pkg/manager/router/backend_observer.go index d814dda1..1b9c2527 100644 --- a/pkg/manager/router/backend_observer.go +++ b/pkg/manager/router/backend_observer.go @@ -15,6 +15,7 @@ import ( "github.com/pingcap/tiproxy/lib/config" "github.com/pingcap/tiproxy/lib/util/errors" "github.com/pingcap/tiproxy/lib/util/waitgroup" + "github.com/pingcap/tiproxy/pkg/metrics" pnet "github.com/pingcap/tiproxy/pkg/proxy/net" "go.uber.org/zap" ) @@ -156,6 +157,7 @@ func (bo *BackendObserver) Refresh() { func (bo *BackendObserver) observe(ctx context.Context) { for ctx.Err() == nil { + startTime := time.Now() backendInfo, err := bo.fetcher.GetBackendList(ctx) if err != nil { bo.logger.Error("fetching backends encounters error", zap.Error(err)) @@ -167,11 +169,17 @@ func (bo *BackendObserver) observe(ctx context.Context) { } bo.notifyIfChanged(bhMap) } - select { - case <-time.After(bo.healthCheckConfig.Interval): - case <-bo.refreshChan: - case <-ctx.Done(): - return + + cost := time.Since(startTime) + metrics.HealthCheckCycleGauge.Set(cost.Seconds()) + wait := bo.healthCheckConfig.Interval - cost + if wait > 0 { + select { + case <-time.After(wait): + case <-bo.refreshChan: + case <-ctx.Done(): + return + } } } } diff --git a/pkg/manager/router/backend_observer_test.go b/pkg/manager/router/backend_observer_test.go index e57187e8..64c4b82d 100644 --- a/pkg/manager/router/backend_observer_test.go +++ b/pkg/manager/router/backend_observer_test.go @@ -172,6 +172,10 @@ func (ts *observerTestSuite) checkStatus(backend *backendServer, expectedStatus require.True(ts.t, ok) require.Equal(ts.t, expectedStatus, health.status) require.True(ts.t, checkBackendStatusMetrics(backend.sqlAddr, health.status)) + cycle, err := readHealthCheckCycle() + require.NoError(ts.t, err) + require.Greater(ts.t, cycle.Nanoseconds(), int64(0)) + require.Less(ts.t, cycle.Nanoseconds(), (3 * time.Second).Nanoseconds()) } func (ts *observerTestSuite) getBackendsFromCh() 
map[string]*backendHealth { diff --git a/pkg/manager/router/metrics.go b/pkg/manager/router/metrics.go index 958e3150..909bebd3 100644 --- a/pkg/manager/router/metrics.go +++ b/pkg/manager/router/metrics.go @@ -19,7 +19,7 @@ func checkBackendStatusMetrics(addr string, status BackendStatus) bool { if err != nil { return false } - return val == 1 + return int(val) == 1 } func setBackendConnMetrics(addr string, conns int) { @@ -27,7 +27,8 @@ func setBackendConnMetrics(addr string, conns int) { } func readBackendConnMetrics(addr string) (int, error) { - return metrics.ReadGauge(metrics.BackendConnGauge.WithLabelValues(addr)) + val, err := metrics.ReadGauge(metrics.BackendConnGauge.WithLabelValues(addr)) + return int(val), err } func succeedToLabel(succeed bool) string { @@ -53,3 +54,8 @@ func setPingBackendMetrics(addr string, succeed bool, startTime time.Time) { cost := time.Since(startTime) metrics.PingBackendGauge.WithLabelValues(addr).Set(cost.Seconds()) } + +func readHealthCheckCycle() (time.Duration, error) { + seconds, err := metrics.ReadGauge(metrics.HealthCheckCycleGauge) + return time.Duration(seconds * float64(time.Second)), err +} diff --git a/pkg/metrics/backend.go b/pkg/metrics/backend.go index 3389727c..150ee530 100644 --- a/pkg/metrics/backend.go +++ b/pkg/metrics/backend.go @@ -43,4 +43,12 @@ var ( Name: "ping_duration_seconds", Help: "Time (s) of pinging the SQL port of each backend.", }, []string{LblBackend}) + + HealthCheckCycleGauge = prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: ModuleProxy, + Subsystem: LabelBackend, + Name: "health_check_seconds", + Help: "Time (s) of each health check cycle.", + }) ) diff --git a/pkg/metrics/grafana/tiproxy_summary.json b/pkg/metrics/grafana/tiproxy_summary.json index cfceb5ac..f66eabda 100644 --- a/pkg/metrics/grafana/tiproxy_summary.json +++ b/pkg/metrics/grafana/tiproxy_summary.json @@ -316,7 +316,7 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": 
"TiProxy current goroutine counts.", + "description": "TiProxy disconnection count per minute.", "fill": 1, "fillGradient": 0, "gridPos": { @@ -351,6 +351,92 @@ "spaceLength": 10, "stack": false, "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(tiproxy_server_disconnection_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (type)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{type}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Disconnection OPM", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "TiProxy current goroutine counts.", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 7, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, "targets": [ { "expr": "go_goroutines{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", job=\"tiproxy\"}", @@ -415,10 +501,10 @@ "gridPos": { "h": 6, "w": 12, - "x": 0, + 
"x": 12, "y": 0 }, - "id": 7, + "id": 8, "legend": { "alignAsTable": false, "avg": false, @@ -507,7 +593,7 @@ "x": 0, "y": 0 }, - "id": 8, + "id": 9, "panels": [ { "aliasColors": { }, @@ -524,7 +610,7 @@ "x": 0, "y": 0 }, - "id": 9, + "id": 10, "legend": { "alignAsTable": false, "avg": false, @@ -609,6 +695,178 @@ } ] }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "TiProxy P99 query durations by TiProxy instances.", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 11, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(tiproxy_session_query_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "P99 Duration By Instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": 
false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "TiProxy P99 query durations by backends.", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 12, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "label_replace(histogram_quantile(0.99, sum(rate(tiproxy_session_query_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, backend)), \"backend\", \"$1\", \"backend\", \"(.+-tidb-[0-9]+).*peer.*.svc.*\")", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{backend}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "P99 Duration By Backend", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, { "aliasColors": { }, "bars": false, @@ -624,7 +882,7 @@ "x": 12, "y": 0 }, - "id": 10, + "id": 13, "legend": { "alignAsTable": false, "avg": false, @@ -710,7 +968,7 @@ "x": 0, "y": 0 }, - "id": 11, + "id": 14, "legend": { "alignAsTable": false, "avg": false, @@ -796,7 +1054,7 @@ "x": 12, "y": 0 }, - "id": 12, + "id": 15, 
"legend": { "alignAsTable": false, "avg": false, @@ -885,7 +1143,7 @@ "x": 0, "y": 0 }, - "id": 13, + "id": 16, "panels": [ { "aliasColors": { }, @@ -902,7 +1160,7 @@ "x": 0, "y": 0 }, - "id": 14, + "id": 17, "legend": { "alignAsTable": false, "avg": false, @@ -979,7 +1237,7 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "Number of session migrations on all backends.", + "description": "OPM of session migrations on all backends.", "fill": 1, "fillGradient": 0, "gridPos": { @@ -988,7 +1246,7 @@ "x": 12, "y": 0 }, - "id": 15, + "id": 18, "legend": { "alignAsTable": false, "avg": false, @@ -1016,7 +1274,7 @@ "steppedLine": false, "targets": [ { - "expr": "label_replace(label_replace(tiproxy_balance_migrate_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}, \"from\", \"$1\", \"from\", \"(.+-tidb-[0-9]+).*peer.*.svc.*\"), \"to\", \"$1\", \"to\", \"(.+-tidb-[0-9]+).*peer.*.svc.*\")", + "expr": "label_replace(label_replace(sum(increase(tiproxy_balance_migrate_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (migrate_res, from, to), \"from\", \"$1\", \"from\", \"(.+-tidb-[0-9]+).*peer.*.svc.*\"), \"to\", \"$1\", \"to\", \"(.+-tidb-[0-9]+).*peer.*.svc.*\")", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{migrate_res}}: {{from}} => {{to}}", @@ -1026,7 +1284,7 @@ "thresholds": [ ], "timeFrom": null, "timeShift": null, - "title": "Session Migrations", + "title": "Session Migration OPM", "tooltip": { "shared": true, "sort": 0, @@ -1074,7 +1332,7 @@ "x": 0, "y": 0 }, - "id": 16, + "id": 19, "legend": { "alignAsTable": false, "avg": false, @@ -1177,7 +1435,7 @@ "x": 0, "y": 0 }, - "id": 17, + "id": 20, "panels": [ { "aliasColors": { }, @@ -1185,7 +1443,7 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "Number of getting an available backend.", + "description": "Duration of getting 
an available backend.", "fill": 1, "fillGradient": 0, "gridPos": { @@ -1194,7 +1452,7 @@ "x": 0, "y": 0 }, - "id": 18, + "id": 21, "legend": { "alignAsTable": false, "avg": false, @@ -1222,17 +1480,31 @@ "steppedLine": false, "targets": [ { - "expr": "tiproxy_backend_get_backend{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}", + "expr": "histogram_quantile(0.99, sum(rate(tiproxy_backend_get_backend_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} : {{res}}", + "legendFormat": "99", "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(tiproxy_backend_get_backend_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "95", + "refId": "B" + }, + { + "expr": "sum(rate(tiproxy_backend_get_backend_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[30s])) / sum(rate(tiproxy_backend_get_backend_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "avg", + "refId": "C" } ], "thresholds": [ ], "timeFrom": null, "timeShift": null, - "title": "Get Backend Count", + "title": "Get Backend Duration", "tooltip": { "shared": true, "sort": 0, @@ -1248,7 +1520,7 @@ }, "yaxes": [ { - "format": "short", + "format": "s", "label": null, "logBase": 1, "max": null, @@ -1256,7 +1528,7 @@ "show": true }, { - "format": "short", + "format": "s", "label": null, "logBase": 1, "max": null, @@ -1271,7 +1543,7 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "Duration of getting an available backend.", + "description": "Duration of Pinging backends.", "fill": 1, 
"fillGradient": 0, "gridPos": { @@ -1280,7 +1552,7 @@ "x": 12, "y": 0 }, - "id": 19, + "id": 22, "legend": { "alignAsTable": false, "avg": false, @@ -1308,31 +1580,17 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tiproxy_backend_get_backend_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", + "expr": "label_replace(tiproxy_backend_ping_duration_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}, \"backend\", \"$1\", \"backend\", \"(.+-tidb-[0-9]+).*peer.*.svc.*\")", "format": "time_series", "intervalFactor": 2, - "legendFormat": "99", + "legendFormat": "{{instance}} | {{backend}}", "refId": "A" - }, - { - "expr": "histogram_quantile(0.95, sum(rate(tiproxy_backend_get_backend_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "95", - "refId": "B" - }, - { - "expr": "sum(rate(tiproxy_backend_get_backend_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[30s])) / sum(rate(tiproxy_backend_get_backend_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "avg", - "refId": "C" } ], "thresholds": [ ], "timeFrom": null, "timeShift": null, - "title": "Get Backend Duration", + "title": "Ping Backend Duration", "tooltip": { "shared": true, "sort": 0, @@ -1371,7 +1629,7 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "Duration of Pinging backends.", + "description": "Duration of each health check cycle.", "fill": 1, "fillGradient": 0, "gridPos": { @@ -1380,7 +1638,7 @@ "x": 0, "y": 0 }, - "id": 20, + "id": 23, "legend": { "alignAsTable": false, "avg": false, @@ -1408,17 
+1666,17 @@ "steppedLine": false, "targets": [ { - "expr": "label_replace(tiproxy_backend_ping_duration_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}, \"backend\", \"$1\", \"backend\", \"(.+-tidb-[0-9]+).*peer.*.svc.*\")", + "expr": "tiproxy_backend_health_check_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} | {{backend}}", + "legendFormat": "{{instance}}", "refId": "A" } ], "thresholds": [ ], "timeFrom": null, "timeShift": null, - "title": "Ping Backend Duration", + "title": "Health Check Cycle", "tooltip": { "shared": true, "sort": 0, diff --git a/pkg/metrics/grafana/tiproxy_summary.jsonnet b/pkg/metrics/grafana/tiproxy_summary.jsonnet index 22c65863..887b0c90 100644 --- a/pkg/metrics/grafana/tiproxy_summary.jsonnet +++ b/pkg/metrics/grafana/tiproxy_summary.jsonnet @@ -106,6 +106,20 @@ local connectionP = graphPanel.new( ) ); +local disconnP = graphPanel.new( + title='Disconnection OPM', + datasource=myDS, + legend_rightSide=true, + description='TiProxy disconnection count per minute.', + format='short', +) +.addTarget( + prometheus.target( + 'sum(increase(tiproxy_server_disconnection_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}[1m])) by (type)', + legendFormat='{{type}}', + ) +); + local goroutineP = graphPanel.new( title='Goroutine Count', datasource=myDS, @@ -208,17 +222,31 @@ local durationP = graphPanel.new( ) ); -local durationByBackP = graphPanel.new( - title='Duration By Backend', +local durByInstP = graphPanel.new( + title='P99 Duration By Instance', datasource=myDS, legend_rightSide=true, - description='TiProxy P99 query durations by instances and backends.', + description='TiProxy P99 query durations by TiProxy instances.', format='s', ) .addTarget( prometheus.target( - 'label_replace(histogram_quantile(0.99, 
sum(rate(tiproxy_session_query_duration_seconds_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}[1m])) by (le, instance, backend)), "backend", "$1", "backend", "(.+-tidb-[0-9]+).*peer.*.svc.*")', - legendFormat='{{instance}} | {{backend}}', + 'histogram_quantile(0.99, sum(rate(tiproxy_session_query_duration_seconds_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}[1m])) by (le, instance))', + legendFormat='{{instance}}', + ) +); + +local durByBackP = graphPanel.new( + title='P99 Duration By Backend', + datasource=myDS, + legend_rightSide=true, + description='TiProxy P99 query durations by backends.', + format='s', +) +.addTarget( + prometheus.target( + 'label_replace(histogram_quantile(0.99, sum(rate(tiproxy_session_query_duration_seconds_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}[1m])) by (le, backend)), "backend", "$1", "backend", "(.+-tidb-[0-9]+).*peer.*.svc.*")', + legendFormat='{{backend}}', ) ); @@ -281,15 +309,15 @@ local bConnP = graphPanel.new( ); local bMigCounterP = graphPanel.new( - title='Session Migrations', + title='Session Migration OPM', datasource=myDS, legend_rightSide=true, - description='Number of session migrations on all backends.', + description='OPM of session migrations on all backends.', format='short', ) .addTarget( prometheus.target( - 'label_replace(label_replace(tiproxy_balance_migrate_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}, "from", "$1", "from", "(.+-tidb-[0-9]+).*peer.*.svc.*"), "to", "$1", "to", "(.+-tidb-[0-9]+).*peer.*.svc.*")', + 'label_replace(label_replace(sum(increase(tiproxy_balance_migrate_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}[1m])) by (migrate_res, from, to), "from", "$1", "from", "(.+-tidb-[0-9]+).*peer.*.svc.*"), "to", "$1", "to", "(.+-tidb-[0-9]+).*peer.*.svc.*")', legendFormat='{{migrate_res}}: {{from}} 
=> {{to}}', ) ); @@ -348,31 +376,32 @@ local bGetDurP = graphPanel.new( ) ); -local bGetBeP = graphPanel.new( - title='Get Backend Count', +local bPingBeP = graphPanel.new( + title='Ping Backend Duration', datasource=myDS, legend_rightSide=true, - description='Number of getting an available backend.', - format='short', + description='Duration of Pinging backends.', + format='s', ) .addTarget( prometheus.target( - 'tiproxy_backend_get_backend{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}', - legendFormat='{{instance}} : {{res}}', + 'label_replace(tiproxy_backend_ping_duration_seconds{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}, "backend", "$1", "backend", "(.+-tidb-[0-9]+).*peer.*.svc.*")', + legendFormat='{{instance}} | {{backend}}', ) ); -local bPingBeP = graphPanel.new( - title='Ping Backend Duration', +local bHealthCycleP = +graphPanel.new( + title='Health Check Cycle', datasource=myDS, legend_rightSide=true, - description='Duration of Pinging backends.', + description='Duration of each health check cycle.', format='s', ) .addTarget( prometheus.target( - 'label_replace(tiproxy_backend_ping_duration_seconds{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}, "backend", "$1", "backend", "(.+-tidb-[0-9]+).*peer.*.svc.*")', - legendFormat='{{instance}} | {{backend}}', + 'tiproxy_backend_health_check_seconds{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}', + legendFormat='{{instance}}', ) ); @@ -392,14 +421,17 @@ newDash .addPanel(cpuP, gridPos=leftPanelPos) .addPanel(memP, gridPos=rightPanelPos) .addPanel(connectionP, gridPos=leftPanelPos) - .addPanel(goroutineP, gridPos=rightPanelPos) - .addPanel(uptimeP, gridPos=leftPanelPos) + .addPanel(disconnP, gridPos=rightPanelPos) + .addPanel(goroutineP, gridPos=leftPanelPos) + .addPanel(uptimeP, gridPos=rightPanelPos) , gridPos=rowPos ) .addPanel( queryRow .addPanel(durationP, gridPos=leftPanelPos) + 
.addPanel(durByInstP, gridPos=rightPanelPos) + .addPanel(durByBackP, gridPos=leftPanelPos) .addPanel(cpsByInstP, gridPos=rightPanelPos) .addPanel(cpsByBackP, gridPos=leftPanelPos) .addPanel(cpsByCMDP, gridPos=rightPanelPos) @@ -416,9 +448,9 @@ newDash ) .addPanel( backendRow - .addPanel(bGetBeP, gridPos=leftPanelPos) - .addPanel(bGetDurP, gridPos=rightPanelPos) - .addPanel(bPingBeP, gridPos=leftPanelPos) + .addPanel(bGetDurP, gridPos=leftPanelPos) + .addPanel(bPingBeP, gridPos=rightPanelPos) + .addPanel(bHealthCycleP, gridPos=leftPanelPos) , gridPos=rowPos ) diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index fc283196..e5cbf7b8 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -154,6 +154,7 @@ func registerProxyMetrics() { prometheus.MustRegister(collectors.NewGoCollector(collectors.WithGoCollections(collectors.GoRuntimeMetricsCollection | collectors.GoRuntimeMemStatsCollection))) prometheus.MustRegister(ConnGauge) + prometheus.MustRegister(DisConnCounter) prometheus.MustRegister(MaxProcsGauge) prometheus.MustRegister(ServerEventCounter) prometheus.MustRegister(ServerErrCounter) @@ -166,6 +167,7 @@ func registerProxyMetrics() { prometheus.MustRegister(GetBackendCounter) prometheus.MustRegister(PingBackendGauge) prometheus.MustRegister(BackendConnGauge) + prometheus.MustRegister(HealthCheckCycleGauge) prometheus.MustRegister(MigrateCounter) prometheus.MustRegister(MigrateDurationHistogram) } @@ -192,10 +194,10 @@ func ReadCounter(counter prometheus.Counter) (int, error) { } // ReadGauge reads the value from the gauge. It is only used for testing. 
-func ReadGauge(gauge prometheus.Gauge) (int, error) { +func ReadGauge(gauge prometheus.Gauge) (float64, error) { var metric dto.Metric if err := gauge.Write(&metric); err != nil { return 0, err } - return int(metric.Gauge.GetValue()), nil + return metric.Gauge.GetValue(), nil } diff --git a/pkg/metrics/server.go b/pkg/metrics/server.go index 683baa95..d93b8c4c 100644 --- a/pkg/metrics/server.go +++ b/pkg/metrics/server.go @@ -26,6 +26,14 @@ var ( Help: "Number of connections.", }) + DisConnCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: ModuleProxy, + Subsystem: LabelServer, + Name: "disconnection_total", + Help: "Number of disconnections.", + }, []string{LblType}) + MaxProcsGauge = prometheus.NewGauge( prometheus.GaugeOpts{ Namespace: ModuleProxy, diff --git a/pkg/proxy/backend/backend_conn_mgr.go b/pkg/proxy/backend/backend_conn_mgr.go index 3faee4ae..8ea4d256 100644 --- a/pkg/proxy/backend/backend_conn_mgr.go +++ b/pkg/proxy/backend/backend_conn_mgr.go @@ -244,7 +244,7 @@ func (mgr *BackendConnManager) getBackendIO(cctx ConnContext, auth *Authenticato addGetBackendMetrics(duration, err == nil) if err != nil { mgr.logger.Error("get backend failed", zap.Duration("duration", duration), zap.NamedError("last_err", origErr)) - } else if duration >= 3*time.Second { + } else if duration >= time.Second { mgr.logger.Warn("get backend slow", zap.Duration("duration", duration), zap.NamedError("last_err", origErr), zap.String("backend_addr", mgr.ServerAddr())) } diff --git a/pkg/proxy/client/client_conn.go b/pkg/proxy/client/client_conn.go index b66afe16..c43e2ef8 100644 --- a/pkg/proxy/client/client_conn.go +++ b/pkg/proxy/client/client_conn.go @@ -9,6 +9,7 @@ import ( "net" "github.com/pingcap/tiproxy/lib/util/errors" + "github.com/pingcap/tiproxy/pkg/metrics" "github.com/pingcap/tiproxy/pkg/proxy/backend" pnet "github.com/pingcap/tiproxy/pkg/proxy/net" "go.uber.org/zap" @@ -62,6 +63,7 @@ clean: fields = append(fields, 
zap.Stringer("quit_source", src), zap.Error(err)) cc.logger.Warn(msg, fields...) } + metrics.DisConnCounter.WithLabelValues(src.String()).Inc() } func (cc *ClientConnection) processMsg(ctx context.Context) error {