diff --git a/CHANGELOG.md b/CHANGELOG.md index 23546b149a7d..b2fcaa2089ab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,9 @@ Main (unreleased) - Grafana Agent on Windows now automatically restarts on failure. (@hainenber) +- Added metrics, alerts and dashboard visualisations to help diagnose issues + with unhealthy components and components that take too long to evaluate. (@thampiotr) + ### Bugfixes - Permit `X-Faro-Session-ID` header in CORS requests for the `faro.receiver` diff --git a/operations/agent-flow-mixin/alerts.libsonnet b/operations/agent-flow-mixin/alerts.libsonnet index 92555dc89d5e..d1473e605ed4 100644 --- a/operations/agent-flow-mixin/alerts.libsonnet +++ b/operations/agent-flow-mixin/alerts.libsonnet @@ -2,6 +2,7 @@ prometheusAlerts+: { groups+: [ (import './alerts/clustering.libsonnet'), + (import './alerts/controller.libsonnet'), ], }, } diff --git a/operations/agent-flow-mixin/alerts/controller.libsonnet b/operations/agent-flow-mixin/alerts/controller.libsonnet new file mode 100644 index 000000000000..b6451db76292 --- /dev/null +++ b/operations/agent-flow-mixin/alerts/controller.libsonnet @@ -0,0 +1,22 @@ +local alert = import './utils/alert.jsonnet'; + +alert.newGroup( + 'agent_controller', + [ + // Component evaluations are taking too long, which can lead to e.g. stale targets. + alert.newRule( + 'SlowComponentEvaluations', + 'sum by (cluster, namespace, component_id) (rate(agent_component_evaluation_slow_seconds[10m])) > 0', + 'Flow component evaluations are taking too long.', + '15m', + ), + + // Unhealthy components detected. + alert.newRule( + 'UnhealthyComponents', + 'sum(agent_component_controller_running_components{health_type!="healthy"}) > 0', + 'Unhealthy Flow components detected.', + '15m', + ), + ] +) diff --git a/operations/agent-flow-mixin/dashboards/controller.libsonnet b/operations/agent-flow-mixin/dashboards/controller.libsonnet index 1dcfcd3c9ab3..3876d612a77b 100644 --- a/operations/agent-flow-mixin/dashboards/controller.libsonnet +++ b/operations/agent-flow-mixin/dashboards/controller.libsonnet @@ -233,6 +233,28 @@ local filename = 'agent-flow-controller.json'; ]) ), + // Slow components evaluation time % + ( + panel.new(title='Slow components evaluation times', type='timeseries') + + panel.withUnit('percentunit') + + panel.withDescription(||| + The percentage of time spent evaluating 'slow' components - components that took longer than 1 minute to evaluate. + + Ideally, no component should take more than 1 minute to evaluate. The components displayed in this chart + may be a sign of a problem with the pipeline. + |||) + + panel.withPosition({ x: 16, y: 12, w: 8, h: 10 }) + + panel.withQueries([ + panel.newQuery( + expr=||| + sum by (component_id) (rate(agent_component_evaluation_slow_seconds{cluster="$cluster", namespace="$namespace"}[$__rate_interval])) + / scalar(sum(rate(agent_component_evaluation_seconds_sum{cluster="$cluster", namespace="$namespace"}[$__rate_interval]))) + |||, + legendFormat='{{component_id}}', + ), + ]) + ), + // Component evaluation histogram ( panel.newHeatmap('Component evaluation histogram') + @@ -242,7 +264,7 @@ local filename = 'agent-flow-controller.json'; The goal is to design your config so that evaluations take as little time as possible; under 100ms is a good goal. 
|||) + - panel.withPosition({ x: 16, y: 12, w: 8, h: 10 }) + + panel.withPosition({ x: 0, y: 22, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( expr='sum by (le) (increase(agent_component_evaluation_seconds_bucket{cluster="$cluster", namespace="$namespace"}[$__rate_interval]))', @@ -261,7 +283,7 @@ local filename = 'agent-flow-controller.json'; The goal is to design your config so that most of the time components do not queue for long; under 10ms is a good goal. |||) + - panel.withPosition({ x: 0, y: 22, w: 8, h: 10 }) + + panel.withPosition({ x: 8, y: 22, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( expr='sum by (le) (increase(agent_component_dependencies_wait_seconds_bucket{cluster="$cluster", namespace="$namespace"}[$__rate_interval]))', diff --git a/pkg/flow/internal/controller/loader.go b/pkg/flow/internal/controller/loader.go index a0954f82333f..d44a2a5062eb 100644 --- a/pkg/flow/internal/controller/loader.go +++ b/pkg/flow/internal/controller/loader.go @@ -149,9 +149,7 @@ func (l *Loader) Apply(args map[string]any, componentBlocks []*ast.BlockStmt, co defer func() { span.SetStatus(codes.Ok, "") - duration := time.Since(start) - level.Info(logger).Log("msg", "finished complete graph evaluation", "duration", duration) - l.cm.componentEvaluationTime.Observe(duration.Seconds()) + level.Info(logger).Log("msg", "finished complete graph evaluation", "duration", time.Since(start)) }() l.cache.ClearModuleExports() @@ -234,7 +232,6 @@ func (l *Loader) Apply(args map[string]any, componentBlocks []*ast.BlockStmt, co l.graph = &newGraph l.cache.SyncIDs(componentIDs) l.blocks = componentBlocks - l.cm.componentEvaluationTime.Observe(time.Since(start).Seconds()) if l.globals.OnExportsChange != nil && l.cache.ExportChangeIndex() != l.moduleExportIndex { l.moduleExportIndex = l.cache.ExportChangeIndex() l.globals.OnExportsChange(l.cache.CreateModuleExports()) @@ -652,8 +649,8 @@ func (l *Loader) concurrentEvalFn(n dag.Node, spanCtx context.Context, tracer tr defer func() { duration := time.Since(start) + l.cm.onComponentEvaluationDone(n.NodeID(), duration) level.Info(l.log).Log("msg", "finished node evaluation", "node_id", n.NodeID(), "duration", duration) - l.cm.componentEvaluationTime.Observe(duration.Seconds()) }() var err error diff --git a/pkg/flow/internal/controller/metrics.go b/pkg/flow/internal/controller/metrics.go index ff6c427d7e96..1c5a558ccc1b 100644 --- a/pkg/flow/internal/controller/metrics.go +++ b/pkg/flow/internal/controller/metrics.go @@ -1,20 +1,32 @@ package controller import ( + "fmt" + "time" + "github.com/prometheus/client_golang/prometheus" ) // controllerMetrics contains the metrics for components controller type controllerMetrics struct { - controllerEvaluation prometheus.Gauge - componentEvaluationTime prometheus.Histogram - dependenciesWaitTime prometheus.Histogram - evaluationQueueSize prometheus.Gauge + controllerEvaluation prometheus.Gauge + componentEvaluationTime prometheus.Histogram + dependenciesWaitTime prometheus.Histogram + evaluationQueueSize prometheus.Gauge + slowComponentThreshold time.Duration + slowComponentEvaluationTime *prometheus.CounterVec } // newControllerMetrics inits the metrics for the components controller func newControllerMetrics(id string) *controllerMetrics { - cm := &controllerMetrics{} + cm := &controllerMetrics{ + slowComponentThreshold: 1 * time.Minute, + } + + // The evaluation time becomes particularly problematic in the range of 30s+, so add more buckets + // that can help spot issues in that range. 
+ // Use the following buckets: 5ms, 25ms, 100ms, 500ms, 1s, 5s, 10s, 30s, 1m, 2m, 5m, 10m + evaluationTimesBuckets := []float64{.005, .025, .1, .5, 1, 5, 10, 30, 60, 120, 300, 600} cm.controllerEvaluation = prometheus.NewGauge(prometheus.GaugeOpts{ Name: "agent_component_controller_evaluating", @@ -27,6 +39,7 @@ func newControllerMetrics(id string) *controllerMetrics { Name: "agent_component_evaluation_seconds", Help: "Time spent performing component evaluation", ConstLabels: map[string]string{"controller_id": id}, + Buckets: evaluationTimesBuckets, }, ) cm.dependenciesWaitTime = prometheus.NewHistogram( @@ -34,6 +47,7 @@ func newControllerMetrics(id string) *controllerMetrics { Name: "agent_component_dependencies_wait_seconds", Help: "Time spent by components waiting to be evaluated after their dependency is updated.", ConstLabels: map[string]string{"controller_id": id}, + Buckets: evaluationTimesBuckets, }, ) @@ -43,14 +57,28 @@ func newControllerMetrics(id string) *controllerMetrics { ConstLabels: map[string]string{"controller_id": id}, }) + cm.slowComponentEvaluationTime = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "agent_component_evaluation_slow_seconds", + Help: fmt.Sprintf("Number of seconds spent evaluating components that take longer than %v to evaluate", cm.slowComponentThreshold), + ConstLabels: map[string]string{"controller_id": id}, + }, []string{"component_id"}) + return cm } +func (cm *controllerMetrics) onComponentEvaluationDone(name string, duration time.Duration) { + cm.componentEvaluationTime.Observe(duration.Seconds()) + if duration >= cm.slowComponentThreshold { + cm.slowComponentEvaluationTime.WithLabelValues(name).Add(duration.Seconds()) + } +} + func (cm *controllerMetrics) Collect(ch chan<- prometheus.Metric) { cm.componentEvaluationTime.Collect(ch) cm.controllerEvaluation.Collect(ch) cm.dependenciesWaitTime.Collect(ch) cm.evaluationQueueSize.Collect(ch) + cm.slowComponentEvaluationTime.Collect(ch) } func (cm *controllerMetrics) Describe(ch chan<- *prometheus.Desc) { @@ -58,6 +86,7 @@ func (cm *controllerMetrics) Describe(ch chan<- *prometheus.Desc) { cm.controllerEvaluation.Describe(ch) cm.dependenciesWaitTime.Describe(ch) cm.evaluationQueueSize.Describe(ch) + cm.slowComponentEvaluationTime.Describe(ch) } type controllerCollector struct { diff --git a/production/grafana-agent-mixin/debugging.libsonnet b/production/grafana-agent-mixin/debugging.libsonnet index be2ab983c8ef..711184d333f7 100644 --- a/production/grafana-agent-mixin/debugging.libsonnet +++ b/production/grafana-agent-mixin/debugging.libsonnet @@ -125,3 +125,4 @@ local g = import 'grafana-builder/grafana.libsonnet'; ), }, } +
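
For reference, a minimal standalone sketch (not part of this change) of the slow-evaluation accounting that onComponentEvaluationDone introduces above: every evaluation duration is observed by the agent_component_evaluation_seconds histogram, and only durations at or above the 1-minute slowComponentThreshold are added to the per-component agent_component_evaluation_slow_seconds counter that the new alert and dashboard panel query. The component ID, help string, and the omitted controller_id const label are illustrative only.

package main

import (
	"fmt"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

// Mirrors controllerMetrics.slowComponentThreshold from the diff above.
const slowComponentThreshold = 1 * time.Minute

func main() {
	// Histogram of all evaluation durations, using the extended buckets from the diff.
	evalTime := prometheus.NewHistogram(prometheus.HistogramOpts{
		Name:    "agent_component_evaluation_seconds",
		Help:    "Time spent performing component evaluation",
		Buckets: []float64{.005, .025, .1, .5, 1, 5, 10, 30, 60, 120, 300, 600},
	})

	// Per-component counter that only accumulates evaluation time above the threshold.
	slowEvalTime := prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: "agent_component_evaluation_slow_seconds",
		Help: "Number of seconds spent evaluating components that take longer than 1m to evaluate",
	}, []string{"component_id"})

	reg := prometheus.NewRegistry()
	reg.MustRegister(evalTime, slowEvalTime)

	// Equivalent of onComponentEvaluationDone: always observe the histogram, and
	// only add to the slow counter when the duration crosses the threshold.
	record := func(componentID string, duration time.Duration) {
		evalTime.Observe(duration.Seconds())
		if duration >= slowComponentThreshold {
			slowEvalTime.WithLabelValues(componentID).Add(duration.Seconds())
		}
	}

	// "prometheus.scrape.default" is a hypothetical component ID for illustration.
	record("prometheus.scrape.default", 2*time.Second)  // fast evaluation: histogram only
	record("prometheus.scrape.default", 90*time.Second) // slow evaluation: histogram and counter

	families, err := reg.Gather()
	if err != nil {
		panic(err)
	}
	for _, fam := range families {
		fmt.Printf("%s: %d series\n", fam.GetName(), len(fam.GetMetric()))
	}
}

Using a plain counter keyed by component_id, rather than another histogram, keeps per-component cardinality low while still letting the SlowComponentEvaluations alert and the new dashboard panel take a rate() over it.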