From 07ac35627c86a181a4d86c7f79ddab55b432831c Mon Sep 17 00:00:00 2001 From: QuentinBisson Date: Fri, 3 May 2024 10:47:24 +0200 Subject: [PATCH] enhance: add missing cluster label to mixins Signed-off-by: QuentinBisson --- production/loki-mixin-compiled-ssd/alerts.yaml | 6 +++--- production/loki-mixin-compiled/alerts.yaml | 6 +++--- production/loki-mixin/alerts.libsonnet | 18 +++++++++--------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/production/loki-mixin-compiled-ssd/alerts.yaml b/production/loki-mixin-compiled-ssd/alerts.yaml index 7c0825d8580d..114cc6faa1c8 100644 --- a/production/loki-mixin-compiled-ssd/alerts.yaml +++ b/production/loki-mixin-compiled-ssd/alerts.yaml @@ -4,7 +4,7 @@ groups: - alert: LokiRequestErrors annotations: description: | - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. + {{ $labels.cluster }} {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. summary: Loki request error rate is high. expr: | 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route) @@ -17,7 +17,7 @@ groups: - alert: LokiRequestPanics annotations: description: | - {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics. + {{ $labels.cluster }} {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics. summary: Loki requests are causing code panics. expr: | sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 @@ -26,7 +26,7 @@ groups: - alert: LokiRequestLatency annotations: description: | - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. + {{ $labels.cluster }} {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. summary: Loki request error latency is high. expr: | cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 diff --git a/production/loki-mixin-compiled/alerts.yaml b/production/loki-mixin-compiled/alerts.yaml index 7c0825d8580d..114cc6faa1c8 100644 --- a/production/loki-mixin-compiled/alerts.yaml +++ b/production/loki-mixin-compiled/alerts.yaml @@ -4,7 +4,7 @@ groups: - alert: LokiRequestErrors annotations: description: | - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. + {{ $labels.cluster }} {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. summary: Loki request error rate is high. expr: | 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route) @@ -17,7 +17,7 @@ groups: - alert: LokiRequestPanics annotations: description: | - {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics. + {{ $labels.cluster }} {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics. summary: Loki requests are causing code panics. expr: | sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 @@ -26,7 +26,7 @@ groups: - alert: LokiRequestLatency annotations: description: | - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. + {{ $labels.cluster }} {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. summary: Loki request error latency is high. expr: | cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 diff --git a/production/loki-mixin/alerts.libsonnet b/production/loki-mixin/alerts.libsonnet index 5bff18e72c6e..65dfc0b6a057 100644 --- a/production/loki-mixin/alerts.libsonnet +++ b/production/loki-mixin/alerts.libsonnet @@ -18,9 +18,9 @@ }, annotations: { summary: 'Loki request error rate is high.', - description: ||| - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. - |||, + description: std.strReplace(||| + {{ $labels.cluster }} {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. + |||, 'cluster', $._config.per_cluster_label), }, }, { @@ -33,9 +33,9 @@ }, annotations: { summary: 'Loki requests are causing code panics.', - description: ||| - {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics. - |||, + description: std.strReplace(||| + {{ $labels.cluster }} {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics. + |||, 'cluster', $._config.per_cluster_label), }, }, { @@ -49,9 +49,9 @@ }, annotations: { summary: 'Loki request error latency is high.', - description: ||| - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. - |||, + description: std.strReplace(||| + {{ $labels.cluster }} {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. + |||, 'cluster', $._config.per_cluster_label), }, }, {