diff --git a/monitoring-as-code/src/alerts/burn-rate-alerts.libsonnet b/monitoring-as-code/src/alerts/burn-rate-alerts.libsonnet index 6bd2ba17..69d312eb 100644 --- a/monitoring-as-code/src/alerts/burn-rate-alerts.libsonnet +++ b/monitoring-as-code/src/alerts/burn-rate-alerts.libsonnet @@ -136,6 +136,8 @@ local createBurnRateAlerts(config, sliSpec, sliKey, journeyKey) = }, labels: { ci_type: 'CMDB_CI_Service_Auto', + configuration_item: config.configurationItem, + primary_impact_service: config.applicationServiceName, title: alertTitle, wait_for: '%(for)s' % errorBudgetBurnWindow, factor: std.toString(errorBudgetBurnWindow.factor), diff --git a/monitoring-as-code/src/metric-types.libsonnet b/monitoring-as-code/src/metric-types.libsonnet index 3785e25a..29bc9f0f 100644 --- a/monitoring-as-code/src/metric-types.libsonnet +++ b/monitoring-as-code/src/metric-types.libsonnet @@ -283,7 +283,7 @@ }, sliTypesConfig: { availability: { - library: (import 'sli-value-libraries/availability-counter-using-2-failure-metrics-for-num-and-1-total-metric-for-dem.libsonnet'), + library: (import 'sli-value-libraries/availability-gauge-using-2-failure-metrics-for-num-and-1-total-metric-for-dem.libsonnet'), description: 'The error rate for %(sliDescription)s should be below %(metric_target_percent)0.1f%%', targetMetrics: { code4xx: 'count4xx', @@ -530,7 +530,7 @@ }, sliTypesConfig: { availability: { - library: (import 'sli-value-libraries/availability-counter-using-2-failure-metrics-for-num-and-1-total-metric-for-dem.libsonnet'), + library: (import 'sli-value-libraries/availability-gauge-using-2-failure-metrics-for-num-and-1-total-metric-for-dem.libsonnet'), description: 'The error rate for %(sliDescription)s should be below %(metric_target_percent)0.1f%%', targetMetrics: { code4xx: 'sum4xx', @@ -567,7 +567,7 @@ }, sliTypesConfig: { availability: { - library: (import 'sli-value-libraries/availability-counter-using-2-failure-metrics-for-num-and-1-total-metric-for-dem.libsonnet'), + library: (import 'sli-value-libraries/availability-gauge-using-2-failure-metrics-for-num-and-1-total-metric-for-dem.libsonnet'), description: 'The error rate for %(sliDescription)s should be below %(metric_target_percent)0.1f%%', targetMetrics: { code4xx: 'sum4xx', diff --git a/monitoring-as-code/src/sli-value-libraries/availability-counter-using-2-failure-metrics-for-num-and-1-total-metric-for-dem.libsonnet b/monitoring-as-code/src/sli-value-libraries/availability-gauge-using-2-failure-metrics-for-num-and-1-total-metric-for-dem.libsonnet similarity index 78% rename from monitoring-as-code/src/sli-value-libraries/availability-counter-using-2-failure-metrics-for-num-and-1-total-metric-for-dem.libsonnet rename to monitoring-as-code/src/sli-value-libraries/availability-gauge-using-2-failure-metrics-for-num-and-1-total-metric-for-dem.libsonnet index 6715289a..28f93198 100644 --- a/monitoring-as-code/src/sli-value-libraries/availability-counter-using-2-failure-metrics-for-num-and-1-total-metric-for-dem.libsonnet +++ b/monitoring-as-code/src/sli-value-libraries/availability-gauge-using-2-failure-metrics-for-num-and-1-total-metric-for-dem.libsonnet @@ -32,20 +32,16 @@ local createSliValueRule(sliSpec, sliMetadata, config) = sum without (%(selectorLabels)s) (label_replace(label_replace( ( ( - sum by(%(selectorLabels)s) ( - rate(%(code4xxMetric)s{%(selectors)s}[%(evalInterval)s]) - or - 0 * %(codeAllMetric)s{%(selectors)s} - ) - + - sum by(%(selectorLabels)s) ( - rate(%(code5xxMetric)s{%(selectors)s}[%(evalInterval)s]) - or - 0 * %(codeAllMetric)s{%(selectors)s} - ) - ) + sum by(%(selectorLabels)s) (avg_over_time(%(code4xxMetric)s{%(selectors)s}[%(evalInterval)s]) + or 0 * %(codeAllMetric)s{%(selectors)s}) + + + sum by(%(selectorLabels)s) (avg_over_time(%(code5xxMetric)s{%(selectors)s}[%(evalInterval)s]) + or 0 * %(codeAllMetric)s{%(selectors)s}) + >=0) / - sum by(%(selectorLabels)s) (rate(%(codeAllMetric)s{%(selectors)s}[%(evalInterval)s])) + ( + sum by(%(selectorLabels)s) (avg_over_time(%(codeAllMetric)s{%(selectors)s}[%(evalInterval)s])) + >=0) ), "sli_environment", "$1", "%(environmentSelectorLabel)s", "(.*)"), "sli_product", "$1", "%(productSelectorLabel)s", "(.*)")) ||| % { @@ -97,38 +93,40 @@ local createGraphPanel(sliSpec) = ).addTarget( prometheus.target( ||| - sum(rate(%(codeAllMetric)s{%(selectors)s}[%(evalInterval)s]) or vector(0)) + sum(avg_over_time(%(codeAllMetric)s{%(selectors)s}[%(evalInterval)s]) >=0 or vector(0)) ||| % { codeAllMetric: targetMetrics.codeAll, selectors: std.join(',', dashboardSelectors), evalInterval: sliSpec.evalInterval, }, - legendFormat='requests per second', + legendFormat='average requests', ), ).addTarget( prometheus.target( ||| - sum(rate(%(code4xxMetric)s{%(selectors)s}[%(evalInterval)s]) or vector(0)) - + - sum(rate(%(code5xxMetric)s{%(selectors)s}[%(evalInterval)s]) or vector(0)) + sum(avg_over_time(%(code4xxMetric)s{%(selectors)s}[%(evalInterval)s]) >=0 or vector(0)) + + + sum(avg_over_time(%(code5xxMetric)s{%(selectors)s}[%(evalInterval)s]) >=0 or vector(0)) ||| % { code4xxMetric: targetMetrics.code4xx, code5xxMetric: targetMetrics.code5xx, selectors: std.join(',', dashboardSelectors), evalInterval: sliSpec.evalInterval, }, - legendFormat='errors per second', + legendFormat='average errors', ) ).addTarget( prometheus.target( ||| ( - sum(rate(%(code4xxMetric)s{%(selectors)s}[%(evalInterval)s]) or vector(0)) - + - sum(rate(%(code5xxMetric)s{%(selectors)s}[%(evalInterval)s]) or vector(0)) + sum(avg_over_time(%(code4xxMetric)s{%(selectors)s}[%(evalInterval)s]) >=0 or vector(0)) + + + sum(avg_over_time(%(code5xxMetric)s{%(selectors)s}[%(evalInterval)s]) >=0 or vector(0)) ) / - sum(rate(%(codeAllMetric)s{%(selectors)s}[%(evalInterval)s])) + ( + sum(avg_over_time(%(codeAllMetric)s{%(selectors)s}[%(evalInterval)s]) >=0 or vector(0)) + ) ||| % { code4xxMetric: targetMetrics.code4xx, code5xxMetric: targetMetrics.code5xx, diff --git a/monitoring-as-code/src/sli-value-libraries/saturation-gauge-with-useage-metric-and-percent-target.libsonnet b/monitoring-as-code/src/sli-value-libraries/saturation-gauge-with-useage-metric-and-percent-target.libsonnet index 2ea9a07a..981e7592 100644 --- a/monitoring-as-code/src/sli-value-libraries/saturation-gauge-with-useage-metric-and-percent-target.libsonnet +++ b/monitoring-as-code/src/sli-value-libraries/saturation-gauge-with-useage-metric-and-percent-target.libsonnet @@ -70,6 +70,17 @@ local createGraphPanel(sliSpec) = }, min=0, fill=4, + formatY2='percentunit', + thresholds=[ + { + value: sliSpec.metricTarget, + colorMode: 'critical', + op: 'gt', + line: true, + fill: false, + yaxis: 'right', + }, + ], ).addTarget( prometheus.target( ||| @@ -94,12 +105,14 @@ local createGraphPanel(sliSpec) = selectors: std.join(',', dashboardSelectors), evalInterval: sliSpec.evalInterval, }, - legendFormat='avg period where maximum saturation > %s percentage' % sliSpec.counterPercentTarget, + legendFormat='avg period where saturation > %s percent' % sliSpec.counterPercentTarget, ) ).addSeriesOverride( { - alias: '/avg period where maximum saturation > %s percentage/' % sliSpec.counterPercentTarget, + alias: '/avg period where saturation > %s percent/' % sliSpec.counterPercentTarget, + yaxis: 2, color: 'red', + }, ).addSeriesOverride( {