Skip to content

Commit

Permalink
feat: add missing cluster label to mixins (#12870)
Browse files Browse the repository at this point in the history
Signed-off-by: QuentinBisson <[email protected]>
Co-authored-by: Vladyslav Diachenko <[email protected]>
Co-authored-by: Callum Styan <[email protected]>
  • Loading branch information
3 people authored Sep 19, 2024
1 parent fc90a63 commit 547ca70
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 31 deletions.
2 changes: 1 addition & 1 deletion production/helm/loki/src/alerts.yaml.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ groups:
message: |
{{`{{`}} $labels.cluster {{`}}`}} {{`{{`}} $labels.namespace {{`}}`}} has had {{`{{`}} printf "%.0f" $value {{`}}`}} compactors running for more than 5m. Only one compactor should run at a time.
expr: |
sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1
sum(loki_boltdb_shipper_compactor_running) by (cluster, namespace) > 1
for: "5m"
labels:
severity: "warning"
Expand Down
14 changes: 7 additions & 7 deletions production/loki-mixin-compiled-ssd/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,29 +4,29 @@ groups:
- alert: LokiRequestErrors
annotations:
description: |
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
{{ $labels.cluster }} {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
summary: Loki request error rate is high.
expr: |
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route)
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (cluster, namespace, job, route)
/
sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route)
sum(rate(loki_request_duration_seconds_count[2m])) by (cluster, namespace, job, route)
> 10
for: 15m
labels:
severity: critical
- alert: LokiRequestPanics
annotations:
description: |
{{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
{{ $labels.cluster }} {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
summary: Loki requests are causing code panics.
expr: |
sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
sum(increase(loki_panic_total[10m])) by (cluster, namespace, job) > 0
labels:
severity: critical
- alert: LokiRequestLatency
annotations:
description: |
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
{{ $labels.cluster }} {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
summary: Loki request error latency is high.
expr: |
cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1
Expand All @@ -39,7 +39,7 @@ groups:
{{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time.
summary: Loki deployment is running more than one compactor.
expr: |
sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1
sum(loki_boltdb_shipper_compactor_running) by (cluster, namespace) > 1
for: 5m
labels:
severity: warning
14 changes: 7 additions & 7 deletions production/loki-mixin-compiled/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,29 +4,29 @@ groups:
- alert: LokiRequestErrors
annotations:
description: |
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
{{ $labels.cluster }} {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
summary: Loki request error rate is high.
expr: |
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route)
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (cluster, namespace, job, route)
/
sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route)
sum(rate(loki_request_duration_seconds_count[2m])) by (cluster, namespace, job, route)
> 10
for: 15m
labels:
severity: critical
- alert: LokiRequestPanics
annotations:
description: |
{{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
{{ $labels.cluster }} {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
summary: Loki requests are causing code panics.
expr: |
sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
sum(increase(loki_panic_total[10m])) by (cluster, namespace, job) > 0
labels:
severity: critical
- alert: LokiRequestLatency
annotations:
description: |
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
{{ $labels.cluster }} {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
summary: Loki request error latency is high.
expr: |
cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1
Expand All @@ -39,7 +39,7 @@ groups:
{{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time.
summary: Loki deployment is running more than one compactor.
expr: |
sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1
sum(loki_boltdb_shipper_compactor_running) by (cluster, namespace) > 1
for: 5m
labels:
severity: warning
32 changes: 16 additions & 16 deletions production/loki-mixin/alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -6,36 +6,36 @@
rules: [
{
alert: 'LokiRequestErrors',
expr: |||
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route)
expr: std.strReplace(|||
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (cluster, namespace, job, route)
/
sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route)
sum(rate(loki_request_duration_seconds_count[2m])) by (cluster, namespace, job, route)
> 10
|||,
|||, 'cluster', $._config.per_cluster_label),
'for': '15m',
labels: {
severity: 'critical',
},
annotations: {
summary: 'Loki request error rate is high.',
description: |||
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
|||,
description: std.strReplace(|||
{{ $labels.cluster }} {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
|||, 'cluster', $._config.per_cluster_label),
},
},
{
alert: 'LokiRequestPanics',
expr: |||
sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
|||,
sum(increase(loki_panic_total[10m])) by (%s, namespace, job) > 0
||| % $._config.per_cluster_label,
labels: {
severity: 'critical',
},
annotations: {
summary: 'Loki requests are causing code panics.',
description: |||
{{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
|||,
description: std.strReplace(|||
{{ $labels.cluster }} {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
|||, 'cluster', $._config.per_cluster_label),
},
},
{
Expand All @@ -49,15 +49,15 @@
},
annotations: {
summary: 'Loki request error latency is high.',
description: |||
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
|||,
description: std.strReplace(|||
{{ $labels.cluster }} {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
|||, 'cluster', $._config.per_cluster_label),
},
},
{
alert: 'LokiTooManyCompactorsRunning',
expr: |||
sum(loki_boltdb_shipper_compactor_running) by (namespace, %s) > 1
sum(loki_boltdb_shipper_compactor_running) by (%s, namespace) > 1
||| % $._config.per_cluster_label,
'for': '5m',
labels: {
Expand Down

0 comments on commit 547ca70

Please sign in to comment.