feat: add missing cluster label to mixins #12870

Merged (11 commits, Sep 19, 2024)
production/helm/loki/src/alerts.yaml.tpl (2 changes: 1 addition & 1 deletion)
@@ -52,7 +52,7 @@ groups:
message: |
{{`{{`}} $labels.cluster {{`}}`}} {{`{{`}} $labels.namespace {{`}}`}} has had {{`{{`}} printf "%.0f" $value {{`}}`}} compactors running for more than 5m. Only one compactor should run at a time.
expr: |
- sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1
+ sum(loki_boltdb_shipper_compactor_running) by (cluster, namespace) > 1

Comment from the PR author on this change: "Making sure they are all in the same order."
for: "5m"
labels:
severity: "warning"
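Side note on the change above: PromQL's `sum(...) by (...)` is insensitive to the order of the grouping labels, so swapping `(namespace, cluster)` for `(cluster, namespace)` does not alter the alert's behaviour; as the author's comment says, it only keeps the label order consistent across the rules touched in this PR. For reference, a minimal Jsonnet sketch of how the same alert is expressed in the mixin source (see production/loki-mixin/alerts.libsonnet further down), assuming the mixin's `per_cluster_label` config defaults to `cluster`:

```jsonnet
// Minimal sketch (not part of the PR): the compactor alert as the jsonnet
// mixin expresses it, with the grouping label driven by a config field.
// Assumes per_cluster_label defaults to 'cluster' in the mixin's _config.
{
  _config:: { per_cluster_label: 'cluster' },

  compactor_alert: {
    alert: 'LokiTooManyCompactorsRunning',
    expr: |||
      sum(loki_boltdb_shipper_compactor_running) by (%s, namespace) > 1
    ||| % $._config.per_cluster_label,
    'for': '5m',
    labels: { severity: 'warning' },
  },
}
```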
production/loki-mixin-compiled-ssd/alerts.yaml (14 changes: 7 additions & 7 deletions)
@@ -4,29 +4,29 @@ groups:
- alert: LokiRequestErrors
annotations:
description: |
- {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
+ {{ $labels.cluster }} {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
summary: Loki request error rate is high.
expr: |
- 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route)
+ 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (cluster, namespace, job, route)
/
- sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route)
+ sum(rate(loki_request_duration_seconds_count[2m])) by (cluster, namespace, job, route)
> 10
for: 15m
labels:
severity: critical
- alert: LokiRequestPanics
annotations:
description: |
- {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
+ {{ $labels.cluster }} {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
summary: Loki requests are causing code panics.
expr: |
- sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
+ sum(increase(loki_panic_total[10m])) by (cluster, namespace, job) > 0
labels:
severity: critical
- alert: LokiRequestLatency
annotations:
description: |
- {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
+ {{ $labels.cluster }} {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
summary: Loki request error latency is high.
expr: |
cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1
@@ -39,7 +39,7 @@ groups:
{{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time.
summary: Loki deployment is running more than one compactor.
expr: |
- sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1
+ sum(loki_boltdb_shipper_compactor_running) by (cluster, namespace) > 1
for: 5m
labels:
severity: warning
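A detail worth calling out in the LokiRequestErrors expression above: the numerator and denominator of the ratio must be aggregated by the same label set, because PromQL's `/` operator matches series on their full label sets by default, so `cluster` has to be added to both `by (...)` clauses at once; adding it to only one side would leave the two vectors unmatched. A rough Jsonnet sketch of that idea, defining the grouping labels once and splicing them into both sides (illustration only — the actual mixin, shown further down, rewrites the label with `std.strReplace`, and `group_labels` is a name invented here):

```jsonnet
// Illustration only: keep the grouping labels in one place so the two sides
// of the ratio always aggregate by the same label set ('cluster' is the
// label this PR adds).
local group_labels = 'cluster, namespace, job, route';

{
  request_errors_expr: |||
    100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (%(labels)s)
      /
    sum(rate(loki_request_duration_seconds_count[2m])) by (%(labels)s)
      > 10
  ||| % { labels: group_labels },
}
```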
production/loki-mixin-compiled/alerts.yaml (14 changes: 7 additions & 7 deletions)
@@ -4,29 +4,29 @@ groups:
- alert: LokiRequestErrors
annotations:
description: |
- {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
+ {{ $labels.cluster }} {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
summary: Loki request error rate is high.
expr: |
- 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route)
+ 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (cluster, namespace, job, route)
/
- sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route)
+ sum(rate(loki_request_duration_seconds_count[2m])) by (cluster, namespace, job, route)
> 10
for: 15m
labels:
severity: critical
- alert: LokiRequestPanics
annotations:
description: |
- {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
+ {{ $labels.cluster }} {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
summary: Loki requests are causing code panics.
expr: |
- sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
+ sum(increase(loki_panic_total[10m])) by (cluster, namespace, job) > 0
labels:
severity: critical
- alert: LokiRequestLatency
annotations:
description: |
- {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
+ {{ $labels.cluster }} {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
summary: Loki request error latency is high.
expr: |
cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1
@@ -39,7 +39,7 @@ groups:
{{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time.
summary: Loki deployment is running more than one compactor.
expr: |
- sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1
+ sum(loki_boltdb_shipper_compactor_running) by (cluster, namespace) > 1
for: 5m
labels:
severity: warning
production/loki-mixin/alerts.libsonnet (32 changes: 16 additions & 16 deletions)
@@ -6,36 +6,36 @@
rules: [
{
alert: 'LokiRequestErrors',
- expr: |||
- 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route)
+ expr: std.strReplace(|||
+ 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (cluster, namespace, job, route)
/
- sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route)
+ sum(rate(loki_request_duration_seconds_count[2m])) by (cluster, namespace, job, route)
> 10
- |||,
+ |||, 'cluster', $._config.per_cluster_label),
'for': '15m',
labels: {
severity: 'critical',
},
annotations: {
summary: 'Loki request error rate is high.',
- description: |||
- {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
- |||,
+ description: std.strReplace(|||
+ {{ $labels.cluster }} {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
+ |||, 'cluster', $._config.per_cluster_label),
},
},
{
alert: 'LokiRequestPanics',
expr: |||
- sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
- |||,
+ sum(increase(loki_panic_total[10m])) by (%s, namespace, job) > 0
+ ||| % $._config.per_cluster_label,
labels: {
severity: 'critical',
},
annotations: {
summary: 'Loki requests are causing code panics.',
- description: |||
- {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
- |||,
+ description: std.strReplace(|||
+ {{ $labels.cluster }} {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
+ |||, 'cluster', $._config.per_cluster_label),
},
},
{
@@ -49,15 +49,15 @@
},
annotations: {
summary: 'Loki request error latency is high.',
- description: |||
- {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
- |||,
+ description: std.strReplace(|||
+ {{ $labels.cluster }} {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
+ |||, 'cluster', $._config.per_cluster_label),
},
},
{
alert: 'LokiTooManyCompactorsRunning',
expr: |||
- sum(loki_boltdb_shipper_compactor_running) by (namespace, %s) > 1
+ sum(loki_boltdb_shipper_compactor_running) by (%s, namespace) > 1
||| % $._config.per_cluster_label,
'for': '5m',
labels: {
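Stepping back, the point of threading the cluster label through these expressions: when a single Prometheus or Mimir instance scrapes several clusters, series that share a namespace/job/route but come from different clusters would otherwise be summed together, skewing the error-rate, panic, and compactor alerts. The mixin keeps the label name configurable through `$._config.per_cluster_label` and, as the diff above shows, substitutes it in two ways: `%`-formatting for expressions and `std.strReplace` for annotation templates (whose literal `%` characters would break `%`-formatting). A small sketch of how the two styles behave when the label is overridden — the `k8s_cluster` value is purely illustrative, and the default is presumably `cluster`:

```jsonnet
// Illustrative only: the two substitution styles used in alerts.libsonnet,
// evaluated with a hypothetical per-cluster label override.
local per_cluster_label = 'k8s_cluster';  // assumed override; not defined by this PR

{
  // %-formatting splices the label into the PromQL grouping clause, yielding:
  //   sum(increase(loki_panic_total[10m])) by (k8s_cluster, namespace, job) > 0
  panics_expr: |||
    sum(increase(loki_panic_total[10m])) by (%s, namespace, job) > 0
  ||| % per_cluster_label,

  // std.strReplace rewrites every literal occurrence of 'cluster', which also
  // covers the {{ $labels.cluster }} reference in the annotation text, yielding:
  //   {{ $labels.k8s_cluster }} {{ $labels.job }} ... errors.
  errors_description: std.strReplace(|||
    {{ $labels.cluster }} {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
  |||, 'cluster', per_cluster_label),
}
```

Note that `std.strReplace` rewrites every occurrence of the substring, which is safe here because "cluster" only appears as the label name; annotation text that used the word in prose would be rewritten too.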