From af5b69e504f7fa22f55c8a87686e7e604b28468b Mon Sep 17 00:00:00 2001 From: Mike Pearson <95076970+michaelpearsonHO@users.noreply.github.com> Date: Wed, 17 Aug 2022 17:19:22 +0100 Subject: [PATCH 01/12] fix: correct sql metric naming and labels (#168) * fix: correct sql metric naming and labels * fix: updated panel that was not using dynamic label Co-authored-by: finlaymccormickHO --- .../cloudwatch-sqs.libsonnet | 48 +++++++++---------- monitoring-as-code/src/metric-types.libsonnet | 17 ++++--- .../sli-freshness-cloudwatch-sqs.libsonnet | 14 +++--- 3 files changed, 39 insertions(+), 40 deletions(-) diff --git a/monitoring-as-code/src/dashboards/detail-dashboard-elements/cloudwatch-sqs.libsonnet b/monitoring-as-code/src/dashboards/detail-dashboard-elements/cloudwatch-sqs.libsonnet index 9c36b44c..73c7e869 100644 --- a/monitoring-as-code/src/dashboards/detail-dashboard-elements/cloudwatch-sqs.libsonnet +++ b/monitoring-as-code/src/dashboards/detail-dashboard-elements/cloudwatch-sqs.libsonnet @@ -35,7 +35,7 @@ local createCustomTemplates(direction, metrics, customSelectorLabels, customSele multi = true, refresh = 'time', ) - for selectorLabel in customSelectorLabels.targetQueue + for selectorLabel in customSelectorLabels.deadletterQueueName ] + [ template.new( name = '%s_deadletter_%s' % [direction, selectorLabel], @@ -53,7 +53,7 @@ local createCustomTemplates(direction, metrics, customSelectorLabels, customSele multi = true, refresh = 'time', ) - for selectorLabel in customSelectorLabels.targetQueue + for selectorLabel in customSelectorLabels.deadletterQueueName ] ]); @@ -69,14 +69,14 @@ local createCustomSelectors(direction, customSelectorLabels, customSelectorValue selectorLabel: selectorLabel, direction: direction, }, - customSelectorLabels.targetQueue + customSelectorLabels.deadletterQueueName )), deadletterQueueTemplate: std.join(', ', std.map( function(selectorLabel) '%(selectorLabel)s=~"$%(direction)s_deadletter_%(selectorLabel)s|"' % { selectorLabel: 
selectorLabel, direction: direction, }, - customSelectorLabels.targetQueue + customSelectorLabels.deadletterQueueName )), standardQueue: std.join(', ', std.map( function(selectorLabel) '%s!~"%s|"' % [selectorLabel, std.join('|', customSelectorValues.deadletterQueueType)], @@ -109,17 +109,17 @@ local createPanels(direction, metrics, selectorLabels, customSelectorLabels, cus justifyMode = 'center', ).addTarget( prometheus.target(||| - sum by (%(targetQueueSelectorLabels)s) ({__name__=~"%(oldestMessageMetrics)s", %(queueSelectors)s, + sum by (%(deadletterQueueNameSelectorLabels)s) ({__name__=~"%(oldestMessageMetrics)s", %(queueSelectors)s, %(queueTemplateSelectors)s, %(environmentSelectors)s, %(productSelectors)s}) ||| % { - targetQueueSelectorLabels: std.join(', ', customSelectorLabels.targetQueue), + deadletterQueueNameSelectorLabels: std.join(', ', customSelectorLabels.deadletterQueueName), oldestMessageMetrics: std.join('|', metrics.oldestMessage), queueSelectors: selectors.deadletterQueue, queueTemplateSelectors: selectors.deadletterQueueTemplate, environmentSelectors: selectors.environment, productSelectors: selectors.product, - }, - legendFormat = '{{%s}}' % std.join(', ', customSelectorLabels.targetQueue)) + }, + legendFormat = '{{%s}}' % std.join(', ', customSelectorLabels.deadletterQueueName)) ) + { options+: { textMode: 'value_and_name' } } + { gridPos: { w: 12, h: 10 } }], [graphPanel.new( // SQS messages visible in DLQs @@ -128,17 +128,17 @@ local createPanels(direction, metrics, selectorLabels, customSelectorLabels, cus min = 0, ).addTarget( prometheus.target(||| - sum by (%(targetQueueSelectorLabels)s) ({__name__=~"%(messagesVisibleMetrics)s", %(queueSelectors)s, + sum by (%(deadletterQueueNameSelectorLabels)s) ({__name__=~"%(messagesVisibleMetrics)s", %(queueSelectors)s, %(queueTemplateSelectors)s, %(environmentSelectors)s, %(productSelectors)s}) ||| % { - targetQueueSelectorLabels: std.join(', ', customSelectorLabels.targetQueue), + 
deadletterQueueNameSelectorLabels: std.join(', ', customSelectorLabels.deadletterQueueName), messagesVisibleMetrics: std.join('|', metrics.messagesVisible), queueSelectors: selectors.deadletterQueue, queueTemplateSelectors: selectors.deadletterQueueTemplate, environmentSelectors: selectors.environment, productSelectors: selectors.product, }, - legendFormat = '{{%s}}' % std.join(', ', customSelectorLabels.targetQueue)) + legendFormat = '{{%s}}' % std.join(', ', customSelectorLabels.deadletterQueueName)) ) + { gridPos: { w: 12, h: 10, x: 12 } }], [statPanel.new( // SQS messages age of oldest message in standard queues @@ -148,17 +148,17 @@ local createPanels(direction, metrics, selectorLabels, customSelectorLabels, cus justifyMode = 'center', ).addTarget( prometheus.target(||| - sum by (dimension_QueueName) ({__name__=~"%(oldestMessageMetrics)s", %(queueSelectors)s, + sum by (%(deadletterQueueNameSelectorLabels)s) ({__name__=~"%(oldestMessageMetrics)s", %(queueSelectors)s, %(queueTemplateSelectors)s, %(environmentSelectors)s, %(productSelectors)s}) ||| % { - targetQueueSelectorLabels: std.join(', ', customSelectorLabels.targetQueue), + deadletterQueueNameSelectorLabels: std.join(', ', customSelectorLabels.deadletterQueueName), oldestMessageMetrics: std.join('|', metrics.oldestMessage), queueSelectors: selectors.standardQueue, queueTemplateSelectors: selectors.standardQueueTemplate, environmentSelectors: selectors.environment, productSelectors: selectors.product, - }, - legendFormat = '{{%s}}' % std.join(', ', customSelectorLabels.targetQueue)) + }, + legendFormat = '{{%s}}' % std.join(', ', customSelectorLabels.deadletterQueueName)) ) + { options+: { textMode: 'value_and_name' } } + { gridPos: { w: 12, h: 10 } }], [graphPanel.new( // SQS messages visible in standard queues @@ -167,17 +167,17 @@ local createPanels(direction, metrics, selectorLabels, customSelectorLabels, cus min = 0, ).addTarget( prometheus.target(||| - sum by (%(targetQueueSelectorLabels)s) 
({__name__=~"%(messagesVisibleMetrics)s", %(queueSelectors)s, + sum by (%(deadletterQueueNameSelectorLabels)s) ({__name__=~"%(messagesVisibleMetrics)s", %(queueSelectors)s, %(queueTemplateSelectors)s, %(environmentSelectors)s, %(productSelectors)s}) ||| % { - targetQueueSelectorLabels: std.join(', ', customSelectorLabels.targetQueue), + deadletterQueueNameSelectorLabels: std.join(', ', customSelectorLabels.deadletterQueueName), messagesVisibleMetrics: std.join('|', metrics.messagesVisible), queueSelectors: selectors.standardQueue, queueTemplateSelectors: selectors.standardQueueTemplate, environmentSelectors: selectors.environment, productSelectors: selectors.product, }, - legendFormat = '{{%s}}' % std.join(', ', customSelectorLabels.targetQueue)) + legendFormat = '{{%s}}' % std.join(', ', customSelectorLabels.deadletterQueueName)) ) + { gridPos: { w: 12, h: 10, x: 12 } }], [graphPanel.new( // SQS messages sent - standard queues @@ -186,17 +186,17 @@ local createPanels(direction, metrics, selectorLabels, customSelectorLabels, cus min = 0, ).addTarget( prometheus.target(||| - sum by (%(targetQueueSelectorLabels)s) ({__name__=~"%(messagesSentMetrics)s", %(queueSelectors)s, + sum by (%(deadletterQueueNameSelectorLabels)s) ({__name__=~"%(messagesSentMetrics)s", %(queueSelectors)s, %(queueTemplateSelectors)s, %(environmentSelectors)s, %(productSelectors)s}) ||| % { - targetQueueSelectorLabels: std.join(', ', customSelectorLabels.targetQueue), + deadletterQueueNameSelectorLabels: std.join(', ', customSelectorLabels.deadletterQueueName), messagesSentMetrics: std.join('|', metrics.messagesSent), queueSelectors: selectors.standardQueue, queueTemplateSelectors: selectors.standardQueueTemplate, environmentSelectors: selectors.environment, productSelectors: selectors.product, }, - legendFormat = '{{%s}}' % std.join(', ', customSelectorLabels.targetQueue)) + legendFormat = '{{%s}}' % std.join(', ', customSelectorLabels.deadletterQueueName)) ) + { gridPos: { w: 12, h: 10 } }], 
[graphPanel.new( // SQS messages deleted - in standard queues @@ -205,17 +205,17 @@ local createPanels(direction, metrics, selectorLabels, customSelectorLabels, cus min = 0, ).addTarget( prometheus.target(||| - sum by (%(targetQueueSelectorLabels)s) ({__name__=~"%(messagesDeletedMetrics)s", %(queueSelectors)s, + sum by (%(deadletterQueueNameSelectorLabels)s) ({__name__=~"%(messagesDeletedMetrics)s", %(queueSelectors)s, %(queueTemplateSelectors)s, %(environmentSelectors)s, %(productSelectors)s}) ||| % { - targetQueueSelectorLabels: std.join(', ', customSelectorLabels.targetQueue), + deadletterQueueNameSelectorLabels: std.join(', ', customSelectorLabels.deadletterQueueName), messagesDeletedMetrics: std.join('|', metrics.messagesDeleted), queueSelectors: selectors.standardQueue, queueTemplateSelectors: selectors.standardQueueTemplate, environmentSelectors: selectors.environment, productSelectors: selectors.product, }, - legendFormat = '{{%s}}' % std.join(', ', customSelectorLabels.targetQueue)) + legendFormat = '{{%s}}' % std.join(', ', customSelectorLabels.deadletterQueueName)) ) + { gridPos: { w: 12, h: 10, x: 12 } }], ]); diff --git a/monitoring-as-code/src/metric-types.libsonnet b/monitoring-as-code/src/metric-types.libsonnet index 539338d2..c56d19b2 100644 --- a/monitoring-as-code/src/metric-types.libsonnet +++ b/monitoring-as-code/src/metric-types.libsonnet @@ -215,7 +215,7 @@ 'aws_alb': { metricTypeConfig: { selectorLabels: { - environment: 'namespace', + environment: 'Environment', product: 'job', }, metrics: { @@ -252,19 +252,18 @@ 'aws_sqs': { metricTypeConfig: { selectorLabels: { - environment: 'namespace', + environment: 'Environment', product: 'job', }, metrics: { - messagesVisible: 'aws_sqs_approximate_number_of_messages_visible_sum', - oldestMessage: 'aws_sqs_approximate_age_of_oldest_message_sum', + messagesVisible: 'aws_sqs_approximate_number_of_messages_visible_average', + oldestMessage: 'aws_sqs_approximate_age_of_oldest_message_maximum', 
messagesSent: 'aws_sqs_number_of_messages_sent_sum', messagesDeleted: 'aws_sqs_number_of_messages_deleted_sum', }, customSelectorLabels: { - deadletterQueueName: 'queue_name', + deadletterQueueName: 'dimension_QueueName', deadletterQueueType: 'queue_type', - targetQueue: 'dimension_QueueName', }, customSelectors: { deadletterQueueName: '.+dlq.+', @@ -379,7 +378,7 @@ 'aws_rds_read': { metricTypeConfig: { selectorLabels: { - environment: 'namespace', + environment: 'Environment', product: 'job', }, metrics: { @@ -420,7 +419,7 @@ 'aws_rds_write': { metricTypeConfig: { selectorLabels: { - environment: 'namespace', + environment: 'Environment', product: 'job', }, metrics: { @@ -461,7 +460,7 @@ 'aws_es': { metricTypeConfig: { selectorLabels: { - environment: 'namespace', + environment: 'Environment', product: 'job', }, metrics: { diff --git a/monitoring-as-code/src/sli-metric-libraries/sli-freshness-cloudwatch-sqs.libsonnet b/monitoring-as-code/src/sli-metric-libraries/sli-freshness-cloudwatch-sqs.libsonnet index dbb586cd..6341a79e 100644 --- a/monitoring-as-code/src/sli-metric-libraries/sli-freshness-cloudwatch-sqs.libsonnet +++ b/monitoring-as-code/src/sli-metric-libraries/sli-freshness-cloudwatch-sqs.libsonnet @@ -21,7 +21,7 @@ local createGraphPanel(sliSpec) = graphPanel.new( title = '%s' % sliSpec.sliDescription, description = ||| - * Sample interval is %(evalInterval)s + * Sample interval is %(evalInterval)s * Resource selectors are %(selectors)s * Only queues where type is not deadletter ||| % { @@ -37,19 +37,19 @@ local createGraphPanel(sliSpec) = [%(evalInterval)s]) or vector(0))' % { messagesDeletedMetric: targetMetrics.messagesDeleted, selectors: std.join(',', dashboardSelectors), - queueSelector: '%s!~"%s"' % [metricConfig.customSelectorLabels.deadletterQueueType, metricConfig.customSelectors.deadletterQueueType], + queueSelector: '%s!~"%s"' % [metricConfig.customSelectorLabels.deadletterQueueName, metricConfig.customSelectors.deadletterQueueName], 
evalInterval: sliSpec.evalInterval, }, legendFormat='avg number of msgs delivered', ) ).addTarget( prometheus.target( - 'sum(avg_over_time(sqs_high_latency_in_queue_avg{%(dashboardSliLabelSelectors)s}[%(evalInterval)s]) + 'sum(avg_over_time(sqs_high_latency_in_queue_avg{%(dashboardSliLabelSelectors)s}[%(evalInterval)s]) or vector(0))/ sum(count_over_time(%(oldestMessageMetric)s{%(selectors)s, %(queueSelector)s} [%(evalInterval)s]) or vector(0))' % { oldestMessageMetric: targetMetrics.oldestMessage, selectors: std.join(',', dashboardSelectors), - queueSelector: '%s!~"%s"' % [metricConfig.customSelectorLabels.deadletterQueueType, metricConfig.customSelectors.deadletterQueueType], + queueSelector: '%s!~"%s"' % [metricConfig.customSelectorLabels.deadletterQueueName, metricConfig.customSelectors.deadletterQueueName], dashboardSliLabelSelectors: sliSpec.dashboardSliLabelSelectors, evalInterval: sliSpec.evalInterval, }, @@ -60,7 +60,7 @@ local createGraphPanel(sliSpec) = 'sum(avg_over_time(%(oldestMessageMetric)s{%(selectors)s, %(queueSelector)s}[%(evalInterval)s]) or vector(0))' % { oldestMessageMetric: targetMetrics.oldestMessage, selectors: std.join(',', dashboardSelectors), - queueSelector: '%s!~"%s"' % [metricConfig.customSelectorLabels.deadletterQueueType, metricConfig.customSelectors.deadletterQueueType], + queueSelector: '%s!~"%s"' % [metricConfig.customSelectorLabels.deadletterQueueName, metricConfig.customSelectors.deadletterQueueName], evalInterval: sliSpec.evalInterval, }, legendFormat='avg age of oldest msg in standard queue (secs)', @@ -103,7 +103,7 @@ local createCustomRecordingRules(sliSpec, sliMetadata, config) = ||| % { oldestMessageMetric: targetMetrics.oldestMessage, selectors: std.join(',', ruleSelectors), - queueSelector: '%s!~"%s"' % [metricConfig.customSelectorLabels.deadletterQueueType, metricConfig.customSelectors.deadletterQueueType], + queueSelector: '%s!~"%s"' % [metricConfig.customSelectorLabels.deadletterQueueName, 
metricConfig.customSelectors.deadletterQueueName], ruleSliLabelSelectors: sliSpec.ruleSliLabelSelectors, evalInterval: sliSpec.evalInterval, }, @@ -117,7 +117,7 @@ local createCustomRecordingRules(sliSpec, sliMetadata, config) = ||| % { oldestMessageMetric: targetMetrics.oldestMessage, selectors: std.join(',', ruleSelectors), - queueSelector: '%s!~"%s"' % [metricConfig.customSelectorLabels.deadletterQueueType, metricConfig.customSelectors.deadletterQueueType], + queueSelector: '%s!~"%s"' % [metricConfig.customSelectorLabels.deadletterQueueName, metricConfig.customSelectors.deadletterQueueName], metricTarget: sliSpec.metricTarget, }, labels: sliSpec.sliLabels, From f23d217396c26387b1e966ae7365e9d7161ca9b0 Mon Sep 17 00:00:00 2001 From: finlaymccormickHO <102794431+finlaymccormickHO@users.noreply.github.com> Date: Wed, 17 Aug 2022 18:01:39 +0100 Subject: [PATCH 02/12] fix: http requests availability detail dashboard elements file now also works for metric types with just resource or just errorstatus selector labels (#160) Co-authored-by: fim17 --- .../http-requests-availability.libsonnet | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/monitoring-as-code/src/dashboards/detail-dashboard-elements/http-requests-availability.libsonnet b/monitoring-as-code/src/dashboards/detail-dashboard-elements/http-requests-availability.libsonnet index 53ee8dd9..40538890 100644 --- a/monitoring-as-code/src/dashboards/detail-dashboard-elements/http-requests-availability.libsonnet +++ b/monitoring-as-code/src/dashboards/detail-dashboard-elements/http-requests-availability.libsonnet @@ -39,7 +39,8 @@ local createPanels(direction, metrics, selectorLabels, customSelectorLabels, cus [row.new( title = stringFormattingFunctions.capitaliseFirstLetters('%s HTTP Requests Availability' % direction), ) + { gridPos: { w: 24, h: 1 } }], - [graphPanel.new( + [if std.objectHas(selectorLabels, 'errorStatus') then + graphPanel.new( title = 'Availability - requests per 
second by response code', datasource = 'prometheus', min = 0, @@ -57,8 +58,9 @@ local createPanels(direction, metrics, selectorLabels, customSelectorLabels, cus selectors: std.join(', ', std.objectValues(selectors)), }, legendFormat = '{{%s}}' % std.join(', ', selectorLabels.errorStatus)) - ) + { gridPos: { w: 12, h: 10 } }], - [graphPanel.new( + ) + { gridPos: { w: if std.objectHas(selectorLabels, 'resource') then 12 else 24, h: 10 } }], + [if std.objectHas(selectorLabels, 'resource') then + graphPanel.new( title = 'Availability - requests per second by path', datasource = 'prometheus', min = 0, @@ -76,7 +78,11 @@ local createPanels(direction, metrics, selectorLabels, customSelectorLabels, cus selectors: std.join(', ', std.objectValues(selectors)), }, legendFormat = '{{%s}}' % std.join(', ', selectorLabels.resource)) - ) + { gridPos: { w: 12, h: 10, x: 12 } }], + ) + { gridPos: { + w: if std.objectHas(selectorLabels, 'errorStatus') then 12 else 24, + h: 10, + x: if std.objectHas(selectorLabels, 'errorStatus') then 12 else 0, + } }], ]); // File exports From f3857a63ce84e9b4d5fec118519d4547845c895f Mon Sep 17 00:00:00 2001 From: Humayun Alam <108126376+humayunalamHO@users.noreply.github.com> Date: Thu, 18 Aug 2022 09:23:20 +0100 Subject: [PATCH 03/12] refactor: add dependencies label to release drafter (#172) --- .github/release-drafter.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml index 582132a7..9599b25f 100644 --- a/.github/release-drafter.yml +++ b/.github/release-drafter.yml @@ -19,6 +19,7 @@ categories: - 'refactor' - 'style' - 'docs' + - 'dependencies' change-template: '- $TITLE @$AUTHOR (#$NUMBER)' change-title-escapes: '\<*_&' # You can add # and @ to disable mentions, and add ` to disable code blocks. 
version-resolver: From c8b4780af780b334c3e30548a39618ce4bb348e8 Mon Sep 17 00:00:00 2001 From: Mahruf Iqbal <102766665+mahrufiqbalHO@users.noreply.github.com> Date: Thu, 18 Aug 2022 11:43:34 +0100 Subject: [PATCH 04/12] refactor: Update Docker Build Tag Workflow (#174) --- .github/workflows/docker-build-branch.yml | 2 +- .github/workflows/docker-build-tag.yml | 17 ++++++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/.github/workflows/docker-build-branch.yml b/.github/workflows/docker-build-branch.yml index 77ac29c1..d34af691 100644 --- a/.github/workflows/docker-build-branch.yml +++ b/.github/workflows/docker-build-branch.yml @@ -20,7 +20,7 @@ jobs: steps: #Checks-out our repository under $GITHUB_WORKSPACE, so our workflow can access it. - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 #Logs into Github registry - name: Login to GitHub Container Registry diff --git a/.github/workflows/docker-build-tag.yml b/.github/workflows/docker-build-tag.yml index 987ab82c..6251189e 100644 --- a/.github/workflows/docker-build-tag.yml +++ b/.github/workflows/docker-build-tag.yml @@ -24,7 +24,7 @@ jobs: #Checks-out our repository under $GITHUB_WORKSPACE, so our workflow can access it. 
- name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 #Logs into Github registry - name: Login to GitHub Container Registry @@ -34,14 +34,13 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Calculate SemVer increment - id: increment - uses: UKHomeOffice/semver-tag-action@v3 + - name: Calculate SemVer value + id: calculate + uses: UKHomeOffice/semver-calculate-action@v1 with: increment: ${{ steps.label.outputs.matchedLabels }} github_token: ${{ secrets.GITHUB_TOKEN }} - default_use_head_tag: ${{ github.base_ref == 'main' }} - dry_run: true + default_to_highest: ${{ github.base_ref == 'main' }} - name: Calculate metadata id: meta @@ -49,7 +48,7 @@ jobs: with: images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} tags: | - type=raw,value=${{steps.increment.outputs.version}} + type=raw,value=${{steps.calculate.outputs.version}} type=raw,value=latest,enable=${{ github.base_ref == 'main' }} - name: Build container @@ -62,12 +61,12 @@ jobs: labels: ${{ steps.meta.outputs.labels }} build-args: | PACKAGE_TOKEN=${{secrets.GITHUB_TOKEN}} - MAC_VERSION=${{steps.increment.outputs.version}} + MAC_VERSION=${{steps.calculate.outputs.version}} - name: Tag repository with SemVer uses: UKHomeOffice/semver-tag-action@v3 with: - tag: ${{steps.increment.outputs.version}} + tag: ${{steps.calculate.outputs.version}} github_token: ${{ secrets.GITHUB_TOKEN }} default_use_head_tag: ${{ github.base_ref == 'main' }} From b9137973311550afe3666c3ed8867341124ce7cc Mon Sep 17 00:00:00 2001 From: Mahruf Iqbal <102766665+mahrufiqbalHO@users.noreply.github.com> Date: Thu, 18 Aug 2022 12:29:38 +0100 Subject: [PATCH 05/12] fix: remove 'v' version from the config (#176) --- .github/release-drafter.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml index 9599b25f..ed8ba091 100644 --- a/.github/release-drafter.yml +++ b/.github/release-drafter.yml @@ -1,5 +1,5 @@ 
-name-template: 'v$RESOLVED_VERSION 🌈' -tag-template: 'v$RESOLVED_VERSION' +name-template: '$RESOLVED_VERSION 🌈' +tag-template: '$RESOLVED_VERSION' categories: - title: '🚀 Features' labels: From d09cfff4130bd4e938df8d23fa768d89124223d6 Mon Sep 17 00:00:00 2001 From: finlaymccormickHO <102794431+finlaymccormickHO@users.noreply.github.com> Date: Fri, 19 Aug 2022 15:26:48 +0100 Subject: [PATCH 06/12] 169 update sli metric libraries (#179) * refactor: renamed and refactored sli metric library files * fix: adding product selector label to up metric type and expanding some graph panel descriptions to match previous versions * fix: removing old sli metric library files * feat: added template metric type --- monitoring-as-code/src/metric-types.libsonnet | 130 +++++++++++------ .../sli-elements/recording-rules.libsonnet | 2 +- .../sli-availability-cloudwatch-alb.libsonnet | 128 ---------------- .../sli-availability-generic.libsonnet | 97 ------------- .../sli-availability-promclient.libsonnet | 114 --------------- .../sli-average-metric.libsonnet | 78 ---------- .../sli-avgovertime-generic.libsonnet | 79 ---------- ...i-correctness-cloudwatch-sqs-dlq.libsonnet | 132 ----------------- .../sli-freshness-cloudwatch-sqs.libsonnet | 131 ----------------- .../sli-latency-cloudwatch-alb.libsonnet | 112 -------------- .../sli-latency-promclient.libsonnet | 84 ----------- .../sli-metric-library-template.libsonnet | 49 ------- ...e-correctness-using-queue-metric.libsonnet | 127 ++++++++++++++++ ...age-freshness-using-queue-metric.libsonnet | 130 +++++++++++++++++ .../average-using-single-metric.libsonnet | 89 ++++++++++++ .../histogram-quantile-latency.libsonnet | 91 ++++++++++++ ...ing-cloudwatch-percentile-metric.libsonnet | 124 ++++++++++++++++ ...errors-using-bad-request-metrics.libsonnet | 137 ++++++++++++++++++ ...n-of-errors-using-failure-metric.libsonnet | 112 ++++++++++++++ ...proportion-of-errors-using-label.libsonnet | 128 ++++++++++++++++ 
.../sli-value-library-template.libsonnet | 74 ++++++++++ ... => sli-value-library-functions.libsonnet} | 0 22 files changed, 1095 insertions(+), 1053 deletions(-) delete mode 100644 monitoring-as-code/src/sli-metric-libraries/sli-availability-cloudwatch-alb.libsonnet delete mode 100644 monitoring-as-code/src/sli-metric-libraries/sli-availability-generic.libsonnet delete mode 100644 monitoring-as-code/src/sli-metric-libraries/sli-availability-promclient.libsonnet delete mode 100644 monitoring-as-code/src/sli-metric-libraries/sli-average-metric.libsonnet delete mode 100644 monitoring-as-code/src/sli-metric-libraries/sli-avgovertime-generic.libsonnet delete mode 100644 monitoring-as-code/src/sli-metric-libraries/sli-correctness-cloudwatch-sqs-dlq.libsonnet delete mode 100644 monitoring-as-code/src/sli-metric-libraries/sli-freshness-cloudwatch-sqs.libsonnet delete mode 100644 monitoring-as-code/src/sli-metric-libraries/sli-latency-cloudwatch-alb.libsonnet delete mode 100644 monitoring-as-code/src/sli-metric-libraries/sli-latency-promclient.libsonnet delete mode 100644 monitoring-as-code/src/sli-metric-libraries/sli-metric-library-template.libsonnet create mode 100644 monitoring-as-code/src/sli-value-libraries/average-correctness-using-queue-metric.libsonnet create mode 100644 monitoring-as-code/src/sli-value-libraries/average-freshness-using-queue-metric.libsonnet create mode 100644 monitoring-as-code/src/sli-value-libraries/average-using-single-metric.libsonnet create mode 100644 monitoring-as-code/src/sli-value-libraries/histogram-quantile-latency.libsonnet create mode 100644 monitoring-as-code/src/sli-value-libraries/max-latency-using-cloudwatch-percentile-metric.libsonnet create mode 100644 monitoring-as-code/src/sli-value-libraries/proportion-of-errors-using-bad-request-metrics.libsonnet create mode 100644 monitoring-as-code/src/sli-value-libraries/proportion-of-errors-using-failure-metric.libsonnet create mode 100644 
monitoring-as-code/src/sli-value-libraries/proportion-of-errors-using-label.libsonnet create mode 100644 monitoring-as-code/src/sli-value-libraries/sli-value-library-template.libsonnet rename monitoring-as-code/src/util/{sli-metric-library-functions.libsonnet => sli-value-library-functions.libsonnet} (100%) diff --git a/monitoring-as-code/src/metric-types.libsonnet b/monitoring-as-code/src/metric-types.libsonnet index c56d19b2..c82093d9 100644 --- a/monitoring-as-code/src/metric-types.libsonnet +++ b/monitoring-as-code/src/metric-types.libsonnet @@ -29,19 +29,19 @@ }, sliTypesConfig: { availability: { - library: (import 'sli-metric-libraries/sli-availability-promclient.libsonnet'), + library: (import 'sli-value-libraries/proportion-of-errors-using-label.libsonnet'), description: 'Error rate for %(sliDescription)s should be below %(metric_target_percent)0.1f%%', targetMetrics: { - count: 'count', + target: 'count', }, }, latency: { - library: (import 'sli-metric-libraries/sli-latency-promclient.libsonnet'), + library: (import 'sli-value-libraries/histogram-quantile-latency.libsonnet'), description: 'Request latency for %(sliDescription)s should be below %(metricTarget)0.1fs for the %(latencyPercentile)0.0fth percentile', targetMetrics: { + bucket: 'bucket', sum: 'sum', count: 'count', - bucket: 'bucket', }, }, }, @@ -68,10 +68,10 @@ }, sliTypesConfig: { availability: { - library: (import 'sli-metric-libraries/sli-availability-promclient.libsonnet'), + library: (import 'sli-value-libraries/proportion-of-errors-using-label.libsonnet'), description: 'Error rate for %(sliDescription)s should be below %(metric_target_percent)0.1f%%', targetMetrics: { - count: 'count', + target: 'count', }, }, }, @@ -97,10 +97,10 @@ }, sliTypesConfig: { availability: { - library: (import 'sli-metric-libraries/sli-availability-promclient.libsonnet'), + library: (import 'sli-value-libraries/proportion-of-errors-using-label.libsonnet'), description: 'Error rate for %(sliDescription)s should 
be below %(metric_target_percent)0.1f%%', targetMetrics: { - count: 'count', + target: 'count', }, }, }, @@ -127,12 +127,12 @@ }, sliTypesConfig: { latency: { - library: (import 'sli-metric-libraries/sli-latency-promclient.libsonnet'), + library: (import 'sli-value-libraries/histogram-quantile-latency.libsonnet'), description: 'Request latency for %(sliDescription)s should be below %(metricTarget)0.1fs for the %(latencyPercentile)0.0fth percentile', targetMetrics: { + bucket: 'bucket', sum: 'sum', count: 'count', - bucket: 'bucket', }, }, }, @@ -157,10 +157,10 @@ }, sliTypesConfig: { availability: { - library: (import 'sli-metric-libraries/sli-availability-promclient.libsonnet'), + library: (import 'sli-value-libraries/proportion-of-errors-using-label.libsonnet'), description: 'Error rate for %(sliDescription)s should be below %(metric_target_percent)0.1f%%', targetMetrics: { - count: 'count', + target: 'count', }, }, }, @@ -187,19 +187,19 @@ }, sliTypesConfig: { availability: { - library: (import 'sli-metric-libraries/sli-availability-promclient.libsonnet'), + library: (import 'sli-value-libraries/proportion-of-errors-using-label.libsonnet'), description: 'Error rate for %(sliDescription)s should be below %(metric_target_percent)0.1f%%', targetMetrics: { - count: 'count', + target: 'count', }, }, latency: { - library: (import 'sli-metric-libraries/sli-latency-promclient.libsonnet'), + library: (import 'sli-value-libraries/histogram-quantile-latency.libsonnet'), description: 'Request latency for %(sliDescription)s should be below %(metricTarget)0.1fs for the %(latencyPercentile)0.0fth percentile', targetMetrics: { + bucket: 'bucket', sum: 'sum', count: 'count', - bucket: 'bucket', }, }, }, @@ -222,24 +222,30 @@ count4xx: 'aws_alb_httpcode_target_4_xx_count_sum', count5xx: 'aws_alb_httpcode_target_5_xx_count_sum', requestCount: 'aws_alb_request_count_sum', - responseTime: 'aws_alb_target_response_time', + responseTimeAverage: 'aws_alb_target_response_time_average', 
+ responseTimeP90: 'aws_alb_target_response_time_p90', + responseTimeP95: 'aws_alb_target_response_time_p95', + responseTimeP99: 'aws_alb_target_response_time_p99', }, }, sliTypesConfig: { availability: { - library: (import 'sli-metric-libraries/sli-availability-cloudwatch-alb.libsonnet'), + library: (import 'sli-value-libraries/proportion-of-errors-using-bad-request-metrics.libsonnet'), description: 'The error rate for %(sliDescription)s should be below %(metric_target_percent)0.1f%%', targetMetrics: { - count4xx: 'count4xx', - count5xx: 'count5xx', - requestCount: 'requestCount', + code4xx: 'count4xx', + code5xx: 'count5xx', + codeAll: 'requestCount', }, }, latency: { - library: (import 'sli-metric-libraries/sli-latency-cloudwatch-alb.libsonnet'), + library: (import 'sli-value-libraries/max-latency-using-cloudwatch-percentile-metric.libsonnet'), description: 'Target latency for %(sliDescription)s should be below %(metricTarget)0.1fs for the %(latencyPercentile)0.0fth percentile', targetMetrics: { - responseTime: 'responseTime', + p90: 'responseTimeP90', + p95: 'responseTimeP95', + p99: 'responseTimeP99', + average: 'responseTimeAverage', }, }, }, @@ -272,18 +278,18 @@ }, sliTypesConfig: { freshness: { - library: (import 'sli-metric-libraries/sli-freshness-cloudwatch-sqs.libsonnet'), + library: (import 'sli-value-libraries/average-freshness-using-queue-metric.libsonnet'), description: 'Age of oldest message in SQS queue should be less than %(metricTarget)s seconds for %(sliDescription)s', targetMetrics: { oldestMessage: 'oldestMessage', - messagesDeleted: 'messagesDeleted', + deletedMessages: 'messagesDeleted', }, }, correctness: { - library: (import 'sli-metric-libraries/sli-correctness-cloudwatch-sqs-dlq.libsonnet'), + library: (import 'sli-value-libraries/average-correctness-using-queue-metric.libsonnet'), description: 'There should be no messages in the DLQ for %(sliDescription)s', targetMetrics: { - messagesVisible: 'messagesVisible', + visibleMessages: 
'messagesVisible', oldestMessage: 'oldestMessage', }, }, @@ -312,11 +318,11 @@ }, sliTypesConfig: { availability: { - library: (import 'sli-metric-libraries/sli-availability-generic.libsonnet'), + library: (import 'sli-value-libraries/proportion-of-errors-using-failure-metric.libsonnet'), description: 'The rate of %(sliDescription)s should be below %(metric_target_percent)0.1f%%', targetMetrics: { - totalFailures: 'totalFailures', - total: 'total', + failure: 'totalFailures', + successAndFailure: 'total', }, }, }, @@ -330,6 +336,7 @@ metricTypeConfig: { selectorLabels: { environment: 'namespace', + product: 'job', }, metrics: { duration: 'up', @@ -337,10 +344,10 @@ }, sliTypesConfig: { availability: { - library: (import 'sli-metric-libraries/sli-avgovertime-generic.libsonnet'), + library: (import 'sli-value-libraries/average-using-single-metric.libsonnet'), description: 'The average of %(sliDescription)s should be %(comparison)s %(metricTarget)0.1f', targetMetrics: { - duration: 'duration', + target: 'duration', }, }, }, @@ -362,10 +369,10 @@ }, sliTypesConfig: { availability: { - library: (import 'sli-metric-libraries/sli-avgovertime-generic.libsonnet'), + library: (import 'sli-value-libraries/average-using-single-metric.libsonnet'), description: 'The average of %(sliDescription)s should be %(comparison)s %(metricTarget)0.1f', targetMetrics: { - duration: 'duration', + target: 'duration', }, }, }, @@ -389,24 +396,24 @@ }, sliTypesConfig: { latency: { - library: (import 'sli-metric-libraries/sli-average-metric.libsonnet'), + library: (import 'sli-value-libraries/average-using-single-metric.libsonnet'), description: 'The average latency of %(sliDescription)s should be %(comparison)s %(metricTarget)0.1f', targetMetrics: { - average: 'averageLatency', + target: 'averageLatency', }, }, iops: { - library: (import 'sli-metric-libraries/sli-average-metric.libsonnet'), + library: (import 'sli-value-libraries/average-using-single-metric.libsonnet'), description: 'The 
average IOPS of %(sliDescription)s should be %(comparison)s %(metricTarget)0.1f', targetMetrics: { - average: 'averageIops', + target: 'averageIops', }, }, throughput: { - library: (import 'sli-metric-libraries/sli-average-metric.libsonnet'), + library: (import 'sli-value-libraries/average-using-single-metric.libsonnet'), description: 'The average throughput of %(sliDescription)s should be %(comparison)s %(metricTarget)0.1f', targetMetrics: { - average: 'averageThroughput', + target: 'averageThroughput', }, }, }, @@ -430,24 +437,24 @@ }, sliTypesConfig: { latency: { - library: (import 'sli-metric-libraries/sli-average-metric.libsonnet'), + library: (import 'sli-value-libraries/average-using-single-metric.libsonnet'), description: 'The average latency of %(sliDescription)s should be %(comparison)s %(metricTarget)0.1f', targetMetrics: { - average: 'averageLatency', + target: 'averageLatency', }, }, iops: { - library: (import 'sli-metric-libraries/sli-average-metric.libsonnet'), + library: (import 'sli-value-libraries/average-using-single-metric.libsonnet'), description: 'The average IOPS of %(sliDescription)s should be %(comparison)s %(metricTarget)0.1f', targetMetrics: { - average: 'averageIops', + target: 'averageIops', }, }, throughput: { - library: (import 'sli-metric-libraries/sli-average-metric.libsonnet'), + library: (import 'sli-value-libraries/average-using-single-metric.libsonnet'), description: 'The average throughput of %(sliDescription)s should be %(comparison)s %(metricTarget)0.1f', targetMetrics: { - average: 'averageThroughput', + target: 'averageThroughput', }, }, }, @@ -469,10 +476,10 @@ }, sliTypesConfig: { latency: { - library: (import 'sli-metric-libraries/sli-average-metric.libsonnet'), + library: (import 'sli-value-libraries/average-using-single-metric.libsonnet'), description: 'The average latency of %(sliDescription)s should be %(comparison)s %(metricTarget)0.1f', targetMetrics: { - average: 'averageLatency', + target: 'averageLatency', }, }, 
}, @@ -482,4 +489,31 @@ targetMetrics: {}, }, }, + template: { + metricTypeConfig: { + selectorLabels: { + environment: '', + product: '', + }, + metrics: { + + }, + }, + sliTypesConfig: { + sliType: { + library: (import ''), + description: '', + targetMetrics: { + + }, + }, + }, + detailDashboardConfig: { + standardTemplates: [], + elements: [], + targetMetrics: { + + }, + }, + }, } diff --git a/monitoring-as-code/src/sli-elements/recording-rules.libsonnet b/monitoring-as-code/src/sli-elements/recording-rules.libsonnet index 8cae856a..51ff13a1 100644 --- a/monitoring-as-code/src/sli-elements/recording-rules.libsonnet +++ b/monitoring-as-code/src/sli-elements/recording-rules.libsonnet @@ -42,7 +42,7 @@ local createRecordingRules(sliSpec, config) = { recording_rules+: std.flattenArrays([createStandardRecordingRules(sliSpec, sliMetadata), - macConfig.metricTypes[sliSpec.metricType].sliTypesConfig[sliSpec.sliType].library.createCustomRecordingRules(sliSpec, sliMetadata, config)]), + macConfig.metricTypes[sliSpec.metricType].sliTypesConfig[sliSpec.sliType].library.createSliValueRule(sliSpec, sliMetadata, config)]), }; // File exports diff --git a/monitoring-as-code/src/sli-metric-libraries/sli-availability-cloudwatch-alb.libsonnet b/monitoring-as-code/src/sli-metric-libraries/sli-availability-cloudwatch-alb.libsonnet deleted file mode 100644 index 2027ae0e..00000000 --- a/monitoring-as-code/src/sli-metric-libraries/sli-availability-cloudwatch-alb.libsonnet +++ /dev/null @@ -1,128 +0,0 @@ -// Library to generate Grafana and Prometheus config for Cloudwatch ALB availability - -// MaC imports -local sliMetricLibraryFunctions = import '../util/sli-metric-library-functions.libsonnet'; - -// Grafana imports -local grafana = import 'grafonnet/grafana.libsonnet'; -local prometheus = grafana.prometheus; -local graphPanel = grafana.graphPanel; - -// Creates Grafana dashboard graph panel for an SLI type -// @param sliSpec The spec for the SLI having its dashboard created -// 
@returns Grafana graph panel object -local createGraphPanel(sliSpec) = - local metricConfig = sliMetricLibraryFunctions.getMetricConfig(sliSpec); - local dashboardSelectors = sliMetricLibraryFunctions.createDashboardSelectors(metricConfig, sliSpec); - local targetMetrics = sliMetricLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); - - graphPanel.new( - title = 'Requests, errors, and error rate - %s' % sliSpec.sliDescription, - description = ||| - * Sample interval is %(evalInterval)s - * Resource selectors are %(selectors)s - * Errors are 4_xx and 5_xx count - ||| % { - evalInterval: sliSpec.evalInterval, - selectors: std.strReplace(std.join(', ', sliMetricLibraryFunctions.getSelectors(metricConfig, sliSpec)), '~', '\\~'), - }, - datasource = 'prometheus', - min = 0, - fill = 0, - formatY2 = 'percentunit', - thresholds = [ - { - value: sliSpec.metricTarget, - colorMode: 'critical', - op: 'gt', - line: true, - fill: false, - yaxis: 'right', - }, - ], - ).addTarget( - prometheus.target( - 'sum(rate(%(requestCountMetric)s{%(selectors)s}[%(evalInterval)s]) or vector(0))' % { - requestCountMetric: targetMetrics.requestCount, - selectors: std.join(',', dashboardSelectors), - evalInterval: sliSpec.evalInterval, - }, - legendFormat = 'Requests per second', - ), - ).addTarget( - prometheus.target( - '(sum(rate(%(count4xxMetric)s{%(selectors)s}[%(evalInterval)s]) or vector(0)) + - sum(rate(%(count5xxMetric)s{%(selectors)s}[%(evalInterval)s]) or vector(0)))' % { - count4xxMetric: targetMetrics.count4xx, - count5xxMetric: targetMetrics.count5xx, - selectors: std.join(',', dashboardSelectors), - evalInterval: sliSpec.evalInterval, - }, - legendFormat = 'Errors per second', - ) - ).addTarget( - prometheus.target( - '(sum(rate(%(count4xxMetric)s{%(selectors)s}[%(evalInterval)s]) or vector(0)) + - sum(rate(%(count5xxMetric)s{%(selectors)s}[%(evalInterval)s]) or vector(0))) / - sum(rate(%(requestCountMetric)s{%(selectors)s}[%(evalInterval)s]) or vector(0))' % { - 
count4xxMetric: targetMetrics.count4xx, - count5xxMetric: targetMetrics.count5xx, - requestCountMetric: targetMetrics.requestCount, - selectors: std.join(',', dashboardSelectors), - evalInterval: sliSpec.evalInterval, - }, - legendFormat = 'Error rate', - ) - ).addSeriesOverride( - { - alias: '/Error rate/', - yaxis: 2, - color: 'red', - }, - ); - -// Creates custom recording rules for an SLI type -// @param sliSpec The spec for the SLI having its recording rules created -// @param sliMetadata Metadata about the type and category of the SLI -// @param config The config for the service defined in the mixin file -// @returns JSON defining the recording rules -local createCustomRecordingRules(sliSpec, sliMetadata, config) = - local metricConfig = sliMetricLibraryFunctions.getMetricConfig(sliSpec); - local ruleSelectors = sliMetricLibraryFunctions.createRuleSelectors(metricConfig, sliSpec, config); - local targetMetrics = sliMetricLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); - - [ - { - record: 'sli_value', - expr: ||| - (sum(rate(%(count4xxMetric)s{%(selectors)s}[%(evalInterval)s]) or vector(0)) + - sum(rate(%(count5xxMetric)s{%(selectors)s}[%(evalInterval)s]) or vector(0))) / - sum(rate(%(requestCountMetric)s{%(selectors)s}[%(evalInterval)s]) or vector(0)) - ||| % { - count4xxMetric: targetMetrics.count4xx, - count5xxMetric: targetMetrics.count5xx, - requestCountMetric: targetMetrics.requestCount, - selectors: std.join(',', ruleSelectors), - evalInterval: sliSpec.evalInterval, - }, - labels: sliSpec.sliLabels + sliMetadata, - }, - { - record: 'aggregated_alb_request_count_sum', - expr: ||| - (sum(sum_over_time(%(requestCountMetric)s{%(selectors)s}[%(evalInterval)s]) or vector(0))) - >= 0 - ||| % { - requestCountMetric: targetMetrics.requestCount, - selectors: std.join(',', ruleSelectors), - evalInterval: sliSpec.evalInterval, - }, - labels: sliSpec.sliLabels + sliMetadata, - }, - ]; - -// File exports -{ - createGraphPanel(sliSpec): 
createGraphPanel(sliSpec), - createCustomRecordingRules(sliSpec, sliMetadata, config): createCustomRecordingRules(sliSpec, sliMetadata, config), -} diff --git a/monitoring-as-code/src/sli-metric-libraries/sli-availability-generic.libsonnet b/monitoring-as-code/src/sli-metric-libraries/sli-availability-generic.libsonnet deleted file mode 100644 index 4c7dd280..00000000 --- a/monitoring-as-code/src/sli-metric-libraries/sli-availability-generic.libsonnet +++ /dev/null @@ -1,97 +0,0 @@ -// Library to generate Grafana and Prometheus config for generic availability - -// MaC imports -local sliMetricLibraryFunctions = import '../util/sli-metric-library-functions.libsonnet'; - -// Grafana imports -local grafana = import 'grafonnet/grafana.libsonnet'; -local prometheus = grafana.prometheus; -local graphPanel = grafana.graphPanel; - -// Creates Grafana dashboard graph panel for an SLI type -// @param sliSpec The spec for the SLI having its dashboard created -// @returns Grafana graph panel object -local createGraphPanel(sliSpec) = - local metricConfig = sliMetricLibraryFunctions.getMetricConfig(sliSpec); - local dashboardSelectors = sliMetricLibraryFunctions.createDashboardSelectors(metricConfig, sliSpec); - local targetMetrics = sliMetricLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); - - graphPanel.new( - title = '%s' % sliSpec.sliDescription, - description = ||| - * Sample interval is %(evalInterval)s - ||| % { - evalInterval: sliSpec.evalInterval, - }, - datasource = 'prometheus', - min = 0, - fill = 0, - formatY2 = 'percentunit', - ).addTarget( - prometheus.target( - 'sum(rate(%(totalMetric)s{%(selectors)s}[%(evalInterval)s]) or vector(0))' % { - totalMetric: targetMetrics.total, - selectors: std.join(',', dashboardSelectors), - evalInterval: sliSpec.evalInterval, - }, - legendFormat = 'Total per second', - ), - ).addTarget( - prometheus.target( - 'sum(rate(%(totalFailuresMetric)s{%(selectors)s}[%(evalInterval)s]) or vector(0))' % { - totalFailuresMetric: 
targetMetrics.totalFailures, - selectors: std.join(',', dashboardSelectors), - evalInterval: sliSpec.evalInterval, - }, - legendFormat = 'Errors per second', - ) - ).addTarget( - prometheus.target( - 'sum(rate(%(totalFailuresMetric)s{%(selectors)s}[%(evalInterval)s]) or vector(0)) / - sum(rate(%(totalMetric)s{%(selectors)s}[%(evalInterval)s]) or vector(0))' % { - totalFailuresMetric: targetMetrics.totalFailures, - totalMetric: targetMetrics.total, - selectors: std.join(',', dashboardSelectors), - evalInterval: sliSpec.evalInterval, - }, - legendFormat = 'Error rate', - ) - ).addSeriesOverride( - { - alias: '/Error rate/', - yaxis: 2, - color: 'red', - } - ); - -// Creates custom recording rules for an SLI type -// @param sliSpec The spec for the SLI having its recording rules created -// @param sliMetadata Metadata about the type and category of the SLI -// @param config The config for the service defined in the mixin file -// @returns JSON defining the recording rules -local createCustomRecordingRules(sliSpec, sliMetadata, config) = - local metricConfig = sliMetricLibraryFunctions.getMetricConfig(sliSpec); - local ruleSelectors = sliMetricLibraryFunctions.createRuleSelectors(metricConfig, sliSpec, config); - local targetMetrics = sliMetricLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); - - [ - { - record: 'sli_value', - expr: ||| - sum(rate(%(totalFailuresMetric)s{%(selectors)s}[%(evalInterval)s])) / - sum(rate(%(totalMetric)s{%(selectors)s}[%(evalInterval)s])) - ||| % { - selectors: std.join(',', ruleSelectors), - totalFailuresMetric: targetMetrics.totalFailures, - totalMetric: targetMetrics.total, - evalInterval: sliSpec.evalInterval, - }, - labels: sliSpec.sliLabels + sliMetadata, - }, - ]; - -// File exports -{ - createGraphPanel(sliSpec): createGraphPanel(sliSpec), - createCustomRecordingRules(sliSpec, sliMetadata, config): createCustomRecordingRules(sliSpec, sliMetadata, config), -} diff --git 
a/monitoring-as-code/src/sli-metric-libraries/sli-availability-promclient.libsonnet b/monitoring-as-code/src/sli-metric-libraries/sli-availability-promclient.libsonnet deleted file mode 100644 index 661aedd6..00000000 --- a/monitoring-as-code/src/sli-metric-libraries/sli-availability-promclient.libsonnet +++ /dev/null @@ -1,114 +0,0 @@ -// Library to generate Grafana and Prometheus config for service error rate metrics - -// MaC imports -local sliMetricLibraryFunctions = import '../util/sli-metric-library-functions.libsonnet'; -local stringFormattingFunctions = import '../util/string-formatting-functions.libsonnet'; - -// Grafana imports -local grafana = import 'grafonnet/grafana.libsonnet'; -local prometheus = grafana.prometheus; -local graphPanel = grafana.graphPanel; -local row = grafana.row; - -// Creates Grafana dashboard graph panel for an SLI type -// @param sliSpec The spec for the SLI having its dashboard created -// @returns Grafana graph panel object -local createGraphPanel(sliSpec) = - local metricConfig = sliMetricLibraryFunctions.getMetricConfig(sliSpec); - local dashboardSelectors = sliMetricLibraryFunctions.createDashboardSelectors(metricConfig, sliSpec); - local targetMetrics = sliMetricLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); - - graphPanel.new( - title = 'Requests, errors, and error rate - %s' % sliSpec.sliDescription, - description = ||| - * Sample interval is %(evalInterval)s - * Request selectors are %(selectors)s - * Error selectors are %(errorSelector)s - ||| % { - evalInterval: sliSpec.evalInterval, - selectors: std.strReplace(std.join(', ', sliMetricLibraryFunctions.getSelectors(metricConfig, sliSpec)), '~', '\\~'), - errorSelector: sliMetricLibraryFunctions.getSelector('errorStatus', metricConfig, sliSpec), - }, - datasource = 'prometheus', - min = 0, - fill = 0, - formatY2 = 'percentunit', - thresholds = [ - { - value: sliSpec.metricTarget, - colorMode: 'critical', - op: 'gt', - line: true, - fill: false, - yaxis: 
'right', - }, - ], - ).addTarget( - prometheus.target( - 'sum(rate(%(countMetric)s{%(selectors)s}[%(evalInterval)s]) or vector(0))' % { - countMetric: targetMetrics.count, - selectors: std.join(',', dashboardSelectors), - evalInterval: sliSpec.evalInterval, - }, - legendFormat = 'Requests per second', - ), - ).addTarget( - prometheus.target( - 'sum(rate(%(countMetric)s{%(selectors)s}[%(evalInterval)s]) or vector(0))' % - { - countMetric: targetMetrics.count, - selectors: std.join(',', dashboardSelectors + [sliMetricLibraryFunctions.getSelector('errorStatus', metricConfig, sliSpec)]), - evalInterval: sliSpec.evalInterval, - }, - legendFormat = 'Errors per second', - ) - ).addTarget( - prometheus.target( - 'sum(rate(%(countMetric)s{%(errorSelectors)s}[%(evalInterval)s]) or vector(0)) / - sum(rate(%(countMetric)s{%(selectors)s}[%(evalInterval)s]) or vector(0))' % { - countMetric: targetMetrics.count, - selectors: std.join(',', dashboardSelectors), - errorSelectors: std.join(',', dashboardSelectors + [sliMetricLibraryFunctions.getSelector('errorStatus', metricConfig, sliSpec)]), - evalInterval: sliSpec.evalInterval, - }, - legendFormat = 'Error rate', - ) - ).addSeriesOverride( - { - alias: '/Error rate/', - yaxis: 2, - color: 'red', - } - ); - -// Creates custom recording rules for an SLI type -// @param sliSpec The spec for the SLI having its recording rules created -// @param sliMetadata Metadata about the type and category of the SLI -// @param config The config for the service defined in the mixin file -// @returns JSON defining the recording rules -local createCustomRecordingRules(sliSpec, sliMetadata, config) = - local metricConfig = sliMetricLibraryFunctions.getMetricConfig(sliSpec); - local ruleSelectors = sliMetricLibraryFunctions.createRuleSelectors(metricConfig, sliSpec, config); - local targetMetrics = sliMetricLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); - - [ - { - record: 'sli_value', - expr: ||| - 
sum(rate(%(countMetric)s{%(errorSelectors)s}[%(evalInterval)s]) or vector(0)) / - sum(rate(%(countMetric)s{%(selectors)s}[%(evalInterval)s])) - ||| % { - countMetric: targetMetrics.count, - selectors: std.join(',', ruleSelectors), - errorSelectors: std.join(',', ruleSelectors + [sliMetricLibraryFunctions.getSelector('errorStatus', metricConfig, sliSpec)]), - evalInterval: sliSpec.evalInterval, - }, - labels: sliSpec.sliLabels + sliMetadata, - }, - ]; - -// File exports -{ - createGraphPanel(sliSpec): createGraphPanel(sliSpec), - createCustomRecordingRules(sliSpec, sliMetadata, config): createCustomRecordingRules(sliSpec, sliMetadata, config), -} diff --git a/monitoring-as-code/src/sli-metric-libraries/sli-average-metric.libsonnet b/monitoring-as-code/src/sli-metric-libraries/sli-average-metric.libsonnet deleted file mode 100644 index 0b0f4662..00000000 --- a/monitoring-as-code/src/sli-metric-libraries/sli-average-metric.libsonnet +++ /dev/null @@ -1,78 +0,0 @@ -// Library for average value metrics - -// MaC imports -local sliMetricLibraryFunctions = import '../util/sli-metric-library-functions.libsonnet'; - -// Grafana imports -local grafana = import 'grafonnet/grafana.libsonnet'; -local prometheus = grafana.prometheus; -local graphPanel = grafana.graphPanel; - -// Creates Grafana dashboard graph panel for an SLI type -// @param sliSpec The spec for the SLI having its dashboard created -// @returns Grafana graph panel object -local createGraphPanel(sliSpec) = - local metricConfig = sliMetricLibraryFunctions.getMetricConfig(sliSpec); - local dashboardSelectors = sliMetricLibraryFunctions.createDashboardSelectors(metricConfig, sliSpec); - local targetMetrics = sliMetricLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); - - graphPanel.new( - title = 'Latency - %s' % sliSpec.sliDescription, - description = ||| - * Sample interval is %(evalInterval)s - * Request selectors are %(selectors)s - ||| % { - evalInterval: sliSpec.evalInterval, - selectors: 
std.strReplace(std.join(', ', sliMetricLibraryFunctions.getSelectors(metricConfig, sliSpec)), '~', '\\~'), - }, - datasource = 'prometheus', - fill = 0, - thresholds = [ - { - value: sliSpec.metricTarget, - colorMode: 'critical', - op: 'gt', - line: true, - fill: false, - }, - ], - ).addTarget( - prometheus.target( - 'avg_over_time(%(metric)s{%(selectors)s}[%(evalInterval)s])' % { - metric: targetMetrics.average, - selectors: std.join(',', dashboardSelectors), - evalInterval: sliSpec.evalInterval, - }, - legendFormat = 'Average %s' % sliSpec.sliDescription, - ) - ); - -// Creates custom recording rules for an SLI type -// @param sliSpec The spec for the SLI having its recording rules created -// @param sliMetadata Metadata about the type and category of the SLI -// @param config The config for the service defined in the mixin file -// @returns JSON defining the recording rules -local createCustomRecordingRules(sliSpec, sliMetadata, config) = - local metricConfig = sliMetricLibraryFunctions.getMetricConfig(sliSpec); - local ruleSelectors = sliMetricLibraryFunctions.createRuleSelectors(metricConfig, sliSpec, config); - local targetMetrics = sliMetricLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); - - [ - { - record: 'sli_value', - expr: ||| - avg_over_time(%(metric)s{%(selectors)s}[%(evalInterval)s]) - ||| % { - metric: targetMetrics.average, - selectors: std.join(',', ruleSelectors), - evalInterval: sliSpec.evalInterval, - }, - labels: sliSpec.sliLabels + sliMetadata, - }, - ]; - -// File exports -{ - createGraphPanel(sliSpec): createGraphPanel(sliSpec), - createCustomRecordingRules(sliSpec, sliMetadata, config): createCustomRecordingRules(sliSpec, sliMetadata, config), -} diff --git a/monitoring-as-code/src/sli-metric-libraries/sli-avgovertime-generic.libsonnet b/monitoring-as-code/src/sli-metric-libraries/sli-avgovertime-generic.libsonnet deleted file mode 100644 index 6811f2f1..00000000 --- 
a/monitoring-as-code/src/sli-metric-libraries/sli-avgovertime-generic.libsonnet +++ /dev/null @@ -1,79 +0,0 @@ -// Library to generate Grafana and Prometheus config for generic availability - -// MaC imports -local sliMetricLibraryFunctions = import '../util/sli-metric-library-functions.libsonnet'; - -// Grafana imports -local grafana = import 'grafonnet/grafana.libsonnet'; -local prometheus = grafana.prometheus; -local graphPanel = grafana.graphPanel; - -// Creates Grafana dashboard graph panel for an SLI type -// @param sliSpec The spec for the SLI having its dashboard created -// @returns Grafana graph panel object -local createGraphPanel(sliSpec) = - local metricConfig = sliMetricLibraryFunctions.getMetricConfig(sliSpec); - local dashboardSelectors = sliMetricLibraryFunctions.createDashboardSelectors(metricConfig, sliSpec); - local targetMetrics = sliMetricLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); - - graphPanel.new( - title = '%s' % sliSpec.sliDescription, - description = ||| - * Sample interval is %(evalInterval)s - ||| % { - evalInterval: sliSpec.evalInterval, - }, - datasource = 'prometheus', - min = 0, - fill = 0, - thresholds = [ - { - value: sliSpec.metricTarget, - colorMode: 'critical', - op: 'gt', - line: 'true', - fill: 'true', - }, - ] - ).addTarget( - prometheus.target( - 'sum(sum_over_time(%(durationMetric)s{%(selectors)s}[%(evalInterval)s])) / - sum(count_over_time(%(durationMetric)s{%(selectors)s}[%(evalInterval)s]))' % { - durationMetric: targetMetrics.duration, - selectors: std.join(',', dashboardSelectors), - evalInterval: sliSpec.evalInterval, - }, - legendFormat = 'Average %s' % sliSpec.sliDescription, - ), - ); - -// Creates custom recording rules for an SLI type -// @param sliSpec The spec for the SLI having its recording rules created -// @param sliMetadata Metadata about the type and category of the SLI -// @param config The config for the service defined in the mixin file -// @returns JSON defining the recording rules 
-local createCustomRecordingRules(sliSpec, sliMetadata, config) = - local metricConfig = sliMetricLibraryFunctions.getMetricConfig(sliSpec); - local ruleSelectors = sliMetricLibraryFunctions.createRuleSelectors(metricConfig, sliSpec, config); - local targetMetrics = sliMetricLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); - - [ - { - record: 'sli_value', - expr: ||| - sum(sum_over_time(%(durationMetric)s{%(selectors)s}[%(evalInterval)s])) / - sum(count_over_time(%(durationMetric)s{%(selectors)s}[%(evalInterval)s])) - ||| % { - durationMetric: targetMetrics.duration, - selectors: std.join(',', ruleSelectors), - evalInterval: sliSpec.evalInterval, - }, - labels: sliSpec.sliLabels + sliMetadata, - }, - ]; - -// File exports -{ - createGraphPanel(sliSpec): createGraphPanel(sliSpec), - createCustomRecordingRules(sliSpec, sliMetadata, config): createCustomRecordingRules(sliSpec, sliMetadata, config), -} diff --git a/monitoring-as-code/src/sli-metric-libraries/sli-correctness-cloudwatch-sqs-dlq.libsonnet b/monitoring-as-code/src/sli-metric-libraries/sli-correctness-cloudwatch-sqs-dlq.libsonnet deleted file mode 100644 index e5a442bc..00000000 --- a/monitoring-as-code/src/sli-metric-libraries/sli-correctness-cloudwatch-sqs-dlq.libsonnet +++ /dev/null @@ -1,132 +0,0 @@ -// Library to generate Grafana and Prometheus config for Cloudwatch SQS correctness - -// MaC imports -local sliMetricLibraryFunctions = import '../util/sli-metric-library-functions.libsonnet'; -local stringFormattingFunctions = import '../util/string-formatting-functions.libsonnet'; - -// Grafana imports -local grafana = import 'grafonnet/grafana.libsonnet'; -local prometheus = grafana.prometheus; -local graphPanel = grafana.graphPanel; -local row = grafana.row; -local template = grafana.template; -local statPanel = grafana.statPanel; - -// Creates Grafana dashboard graph panel for an SLI type -// @param sliSpec The spec for the SLI having its dashboard created -// @returns Grafana graph panel 
object -local createGraphPanel(sliSpec) = - local metricConfig = sliMetricLibraryFunctions.getMetricConfig(sliSpec); - local dashboardSelectors = sliMetricLibraryFunctions.createDashboardSelectors(metricConfig, sliSpec); - local targetMetrics = sliMetricLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); - - graphPanel.new( - title = '%s' % sliSpec.sliDescription, - description = ||| - * Sample interval is %(evalInterval)s - * Resource selectors are %(selectors)s - * Only queues with type deadletter - ||| % { - evalInterval: sliSpec.evalInterval, - selectors: std.strReplace(std.join(', ', sliMetricLibraryFunctions.getSelectors(metricConfig, sliSpec)), '~', '\\~'), - }, - datasource = 'prometheus', - min = 0, - fill = 0, - ).addTarget( - prometheus.target( - 'sum(avg_over_time(%(messagesVisibleMetric)s{%(selectors)s, %(queueSelector)s}[%(evalInterval)s]) or vector(0))' % { - messagesVisibleMetric: targetMetrics.messagesVisible, - selectors: std.join(',', dashboardSelectors), - queueSelector: sliMetricLibraryFunctions.getCustomSelector('deadletterQueueName', metricConfig), - evalInterval: sliSpec.evalInterval, - }, - legendFormat = 'avg number of msgs visible in dlq', - ) - ).addTarget( - prometheus.target( - 'sum(avg_over_time(sqs_message_received_in_dlq_avg{%(dashboardSliLabelSelectors)s}[%(evalInterval)s]) - or vector(0))/ sum(count_over_time(%(messagesVisibleMetric)s{%(selectors)s, %(queueSelector)s}[%(evalInterval)s]) - or vector(0))' % { - messagesVisibleMetric: targetMetrics.messagesVisible, - selectors: std.join(',', dashboardSelectors), - queueSelector: sliMetricLibraryFunctions.getCustomSelector('deadletterQueueName', metricConfig), - dashboardSliLabelSelectors: sliSpec.dashboardSliLabelSelectors, - evalInterval: sliSpec.evalInterval, - }, - legendFormat = 'avg period where msgs in dlq >= 1', - ) - ).addTarget( - prometheus.target( - 'sum(avg_over_time(%(oldestMessageMetric)s{%(selectors)s, %(queueSelector)s} - [%(evalInterval)s]) or vector(0))' % { - 
oldestMessageMetric: targetMetrics.oldestMessage, - selectors: std.join(',', dashboardSelectors), - queueSelector: sliMetricLibraryFunctions.getCustomSelector('deadletterQueueName', metricConfig), - evalInterval: sliSpec.evalInterval, - }, - legendFormat = 'avg age of oldest msg in dlq (secs)', - ) - ).addSeriesOverride( - { - alias: '/avg age of oldest msg in dlq/', - yaxis: 2, - color: '#8AB8FF', - }, - ).addSeriesOverride( - { - alias: '/avg period where msgs in dlq >= 1/', - color: 'red', - }, - ).addSeriesOverride( - { - alias: '/avg number of msgs visible in dlq/', - color: 'orange', - }, - ); - -// Creates custom recording rules for an SLI type -// @param sliSpec The spec for the SLI having its recording rules created -// @param sliMetadata Metadata about the type and category of the SLI -// @param config The config for the service defined in the mixin file -// @returns JSON defining the recording rules -local createCustomRecordingRules(sliSpec, sliMetadata, config) = - local metricConfig = sliMetricLibraryFunctions.getMetricConfig(sliSpec); - local ruleSelectors = sliMetricLibraryFunctions.createRuleSelectors(metricConfig, sliSpec, config); - local targetMetrics = sliMetricLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); - - [ - { - record: 'sli_value', - expr: ||| - (sum(avg_over_time(sqs_message_received_in_dlq_avg{%(ruleSliLabelSelectors)s}[%(evalInterval)s])) or vector(0)) - / - (sum(count_over_time(%(messagesVisibleMetric)s{%(selectors)s, %(queueSelector)s}[%(evalInterval)s])) or vector(0)) - ||| % { - messagesVisibleMetric: targetMetrics.messagesVisible, - selectors: std.join(',', ruleSelectors), - queueSelector: sliMetricLibraryFunctions.getCustomSelector('deadletterQueueName', metricConfig), - ruleSliLabelSelectors: sliSpec.ruleSliLabelSelectors, - evalInterval: sliSpec.evalInterval, - }, - labels: sliSpec.sliLabels + sliMetadata, - }, - { - // unsure how to add to this expression - record: 'sqs_message_received_in_dlq_avg', - expr: ||| - 
(%(messagesVisibleMetric)s{%(selectors)s, %(queueSelector)s} >= bool 1) or vector(0) - ||| % { - messagesVisibleMetric: targetMetrics.messagesVisible, - selectors: std.join(',', ruleSelectors), - queueSelector: sliMetricLibraryFunctions.getCustomSelector('deadletterQueueName', metricConfig), - }, - labels: sliSpec.sliLabels, - }, - ]; - -// File exports -{ - createGraphPanel(sliSpec): createGraphPanel(sliSpec), - createCustomRecordingRules(sliSpec, sliMetadata, config): createCustomRecordingRules(sliSpec, sliMetadata, config), -} diff --git a/monitoring-as-code/src/sli-metric-libraries/sli-freshness-cloudwatch-sqs.libsonnet b/monitoring-as-code/src/sli-metric-libraries/sli-freshness-cloudwatch-sqs.libsonnet deleted file mode 100644 index 6341a79e..00000000 --- a/monitoring-as-code/src/sli-metric-libraries/sli-freshness-cloudwatch-sqs.libsonnet +++ /dev/null @@ -1,131 +0,0 @@ -// Library to generate Grafana and Prometheus config for Cloudwatch SQS latency - -// MaC imports -local sliMetricLibraryFunctions = import '../util/sli-metric-library-functions.libsonnet'; - -// Grafana imports -local grafana = import 'grafonnet/grafana.libsonnet'; -local prometheus = grafana.prometheus; -local graphPanel = grafana.graphPanel; -local row = grafana.row; -local template = grafana.template; - -// Creates Grafana dashboard graph panel for an SLI type -// @param sliSpec The spec for the SLI having its dashboard created -// @returns Grafana graph panel object -local createGraphPanel(sliSpec) = - local metricConfig = sliMetricLibraryFunctions.getMetricConfig(sliSpec); - local dashboardSelectors = sliMetricLibraryFunctions.createDashboardSelectors(metricConfig, sliSpec); - local targetMetrics = sliMetricLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); - - graphPanel.new( - title = '%s' % sliSpec.sliDescription, - description = ||| - * Sample interval is %(evalInterval)s - * Resource selectors are %(selectors)s - * Only queues where type is not deadletter - ||| % { - 
evalInterval: sliSpec.evalInterval, - selectors: std.strReplace(std.join(', ', sliMetricLibraryFunctions.getSelectors(metricConfig, sliSpec)), '~', '\\~'), - }, - datasource = 'prometheus', - min = 0, - fill = 0, - ).addTarget( - prometheus.target( - 'sum(avg_over_time(%(messagesDeletedMetric)s{%(selectors)s, %(queueSelector)s} - [%(evalInterval)s]) or vector(0))' % { - messagesDeletedMetric: targetMetrics.messagesDeleted, - selectors: std.join(',', dashboardSelectors), - queueSelector: '%s!~"%s"' % [metricConfig.customSelectorLabels.deadletterQueueName, metricConfig.customSelectors.deadletterQueueName], - evalInterval: sliSpec.evalInterval, - }, - legendFormat='avg number of msgs delivered', - ) - ).addTarget( - prometheus.target( - 'sum(avg_over_time(sqs_high_latency_in_queue_avg{%(dashboardSliLabelSelectors)s}[%(evalInterval)s]) - or vector(0))/ sum(count_over_time(%(oldestMessageMetric)s{%(selectors)s, %(queueSelector)s} - [%(evalInterval)s]) or vector(0))' % { - oldestMessageMetric: targetMetrics.oldestMessage, - selectors: std.join(',', dashboardSelectors), - queueSelector: '%s!~"%s"' % [metricConfig.customSelectorLabels.deadletterQueueName, metricConfig.customSelectors.deadletterQueueName], - dashboardSliLabelSelectors: sliSpec.dashboardSliLabelSelectors, - evalInterval: sliSpec.evalInterval, - }, - legendFormat='avg period where msg in standard queue > %s seconds' % sliSpec.metricTarget, - ) - ).addTarget( - prometheus.target( - 'sum(avg_over_time(%(oldestMessageMetric)s{%(selectors)s, %(queueSelector)s}[%(evalInterval)s]) or vector(0))' % { - oldestMessageMetric: targetMetrics.oldestMessage, - selectors: std.join(',', dashboardSelectors), - queueSelector: '%s!~"%s"' % [metricConfig.customSelectorLabels.deadletterQueueName, metricConfig.customSelectors.deadletterQueueName], - evalInterval: sliSpec.evalInterval, - }, - legendFormat='avg age of oldest msg in standard queue (secs)', - ) - ).addSeriesOverride( - { - alias: '/avg age of oldest msg in standard 
queue/', - yaxis: 2, - color: 'orange', - }, - ).addSeriesOverride( - { - alias: '/avg period where msg in standard queue > %s seconds/' % sliSpec.metricTarget, - color: 'red', - }, - ).addSeriesOverride( - { - alias: '/avg number of msgs delivered/', - color: 'green', - }, - ); - -// Creates custom recording rules for an SLI type -// @param sliSpec The spec for the SLI having its recording rules created -// @param sliMetadata Metadata about the type and category of the SLI -// @param config The config for the service defined in the mixin file -// @returns JSON defining the recording rules -local createCustomRecordingRules(sliSpec, sliMetadata, config) = - local metricConfig = sliMetricLibraryFunctions.getMetricConfig(sliSpec); - local ruleSelectors = sliMetricLibraryFunctions.createRuleSelectors(metricConfig, sliSpec, config); - local targetMetrics = sliMetricLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); - - [ - { - record: 'sli_value', - expr: ||| - (sum(avg_over_time(sqs_high_latency_in_queue_avg{%(ruleSliLabelSelectors)s}[%(evalInterval)s])) or vector(0)) - / - (sum(count_over_time(%(oldestMessageMetric)s{%(selectors)s, %(queueSelector)s}[%(evalInterval)s])) or vector(0)) - ||| % { - oldestMessageMetric: targetMetrics.oldestMessage, - selectors: std.join(',', ruleSelectors), - queueSelector: '%s!~"%s"' % [metricConfig.customSelectorLabels.deadletterQueueName, metricConfig.customSelectors.deadletterQueueName], - ruleSliLabelSelectors: sliSpec.ruleSliLabelSelectors, - evalInterval: sliSpec.evalInterval, - }, - labels: sliSpec.sliLabels + sliMetadata, - }, - { - // unsure how to add to this expression - record: 'sqs_high_latency_in_queue_avg', - expr: ||| - (%(oldestMessageMetric)s{%(selectors)s, %(queueSelector)s} > bool %(metricTarget)s) or vector(0) - ||| % { - oldestMessageMetric: targetMetrics.oldestMessage, - selectors: std.join(',', ruleSelectors), - queueSelector: '%s!~"%s"' % [metricConfig.customSelectorLabels.deadletterQueueName, 
metricConfig.customSelectors.deadletterQueueName], - metricTarget: sliSpec.metricTarget, - }, - labels: sliSpec.sliLabels, - }, - ]; - -// File exports -{ - createGraphPanel(sliSpec): createGraphPanel(sliSpec), - createCustomRecordingRules(sliSpec, sliMetadata, config): createCustomRecordingRules(sliSpec, sliMetadata, config), -} diff --git a/monitoring-as-code/src/sli-metric-libraries/sli-latency-cloudwatch-alb.libsonnet b/monitoring-as-code/src/sli-metric-libraries/sli-latency-cloudwatch-alb.libsonnet deleted file mode 100644 index b86d5c2d..00000000 --- a/monitoring-as-code/src/sli-metric-libraries/sli-latency-cloudwatch-alb.libsonnet +++ /dev/null @@ -1,112 +0,0 @@ -// Library to generate Grafana and Prometheus config for AWS Cloudwatch LoadBalancer latency metrics - -// MaC imports -local sliMetricLibraryFunctions = import '../util/sli-metric-library-functions.libsonnet'; - -// Grafana imports -local grafana = import 'grafonnet/grafana.libsonnet'; -local prometheus = grafana.prometheus; -local graphPanel = grafana.graphPanel; - -// Gets valid formatted Cloudwatch percentile -// @param sliSpec The spec for the SLI being processed -// @returns Formatted Cloudwatch percentile -local getCloudwatchPercentile(sliSpec) = - if sliSpec.latencyPercentile == 0.9 then 'p90' - else if sliSpec.latencyPercentile == 0.95 then 'p95' - else if sliSpec.latencyPercentile == 0.99 then 'p99' - else error 'Invalid latency percentile for Cloudwatch ALB latency'; - -// Creates Grafana dashboard graph panel for an SLI type -// @param sliSpec The spec for the SLI having its dashboard created -// @returns Grafana graph panel object -local createGraphPanel(sliSpec) = - local metricConfig = sliMetricLibraryFunctions.getMetricConfig(sliSpec); - local dashboardSelectors = sliMetricLibraryFunctions.createDashboardSelectors(metricConfig, sliSpec); - local targetMetrics = sliMetricLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); - - graphPanel.new( - title = 'Latency - %s' % 
sliSpec.sliDescription, - description = ||| - * Sample interval is %(evalInterval)s - * Resource selectors are %(selectors)s - ||| % { - evalInterval: sliSpec.evalInterval, - selectors: std.strReplace(std.join(', ', sliMetricLibraryFunctions.getSelectors(metricConfig, sliSpec)), '~', '\\~'), - }, - datasource = 'prometheus', - min = 0, - format = 's', - thresholds = [ - { - value: sliSpec.metricTarget, - colorMode: 'critical', - op: 'gt', - line: true, - fill: false, - }, - ], - ).addTarget( - prometheus.target( - 'avg (%(responseTimeMetric)s_average{%(selectors)s})' % { - responseTimeMetric: targetMetrics.responseTime, - selectors: std.join(',', dashboardSelectors) - }, - legendFormat = 'Average Latency', - ), - ).addTarget( - prometheus.target( - 'max (%(responseTimeMetric)s_p90{%(selectors)s})' % { - responseTimeMetric: targetMetrics.responseTime, - selectors: std.join(',', dashboardSelectors) - }, - legendFormat = 'Max p90 Latency', - ), - ).addTarget( - prometheus.target( - 'max (%(responseTimeMetric)s_p95{%(selectors)s})' % { - responseTimeMetric: targetMetrics.responseTime, - selectors: std.join(',', dashboardSelectors) - }, - legendFormat = 'Max p95 Latency', - ), - ).addTarget( - prometheus.target( - 'max (%(responseTimeMetric)s_p99{%(selectors)s})' % { - responseTimeMetric: targetMetrics.responseTime, - selectors: std.join(',', dashboardSelectors) - }, - legendFormat = 'Max p99 Latency', - ), - ); - -// Creates custom recording rules for an SLI type -// @param sliSpec The spec for the SLI having its recording rules created -// @param sliMetadata Metadata about the type and category of the SLI -// @param config The config for the service defined in the mixin file -// @returns JSON defining the recording rules -local createCustomRecordingRules(sliSpec, sliMetadata, config) = - local metricConfig = sliMetricLibraryFunctions.getMetricConfig(sliSpec); - local ruleSelectors = sliMetricLibraryFunctions.createRuleSelectors(metricConfig, sliSpec, config); - local 
targetMetrics = sliMetricLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); - local cloudwatchPercentile = getCloudwatchPercentile(sliSpec); - - [ - { - record: 'sli_value', - expr: ||| - max(%(responseTimeMetric)s_%(cloudwatchPercentile)s{%(selectors)s}) - ||| % { - responseTimeMetric: targetMetrics.responseTime, - selectors: std.join(',', ruleSelectors), - cloudwatchPercentile: cloudwatchPercentile, - }, - labels: sliSpec.sliLabels + sliMetadata, - }, - ]; - -// File exports -{ - createGraphPanel(sliSpec): createGraphPanel(sliSpec), - createCustomRecordingRules(sliSpec, sliMetadata, config): createCustomRecordingRules(sliSpec, sliMetadata, config), -} diff --git a/monitoring-as-code/src/sli-metric-libraries/sli-latency-promclient.libsonnet b/monitoring-as-code/src/sli-metric-libraries/sli-latency-promclient.libsonnet deleted file mode 100644 index 8bb9c238..00000000 --- a/monitoring-as-code/src/sli-metric-libraries/sli-latency-promclient.libsonnet +++ /dev/null @@ -1,84 +0,0 @@ -// Library to generate Grafana and Prometheus config for service latency metrics - -// MaC imports -local sliMetricLibraryFunctions = import '../util/sli-metric-library-functions.libsonnet'; -local stringFormattingFunctions = import '../util/string-formatting-functions.libsonnet'; - -// Grafana imports -local grafana = import 'grafonnet/grafana.libsonnet'; -local prometheus = grafana.prometheus; -local graphPanel = grafana.graphPanel; -local row = grafana.row; -local template = grafana.template; - -// Creates Grafana dashboard graph panel for an SLI type -// @param sliSpec The spec for the SLI having its dashboard created -// @returns Grafana graph panel object -local createGraphPanel(sliSpec) = - local metricConfig = sliMetricLibraryFunctions.getMetricConfig(sliSpec); - local dashboardSelectors = sliMetricLibraryFunctions.createDashboardSelectors(metricConfig, sliSpec); - local targetMetrics = sliMetricLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); - - graphPanel.new( - 
title = 'Latency - %s' % sliSpec.sliDescription, - description = ||| - * Sample interval is %(evalInterval)s - * Request selectors are %(selectors)s - ||| % { - evalInterval: sliSpec.evalInterval, - selectors: std.strReplace(std.join(', ', sliMetricLibraryFunctions.getSelectors(metricConfig, sliSpec)), '~', '\\~'), - }, - datasource = 'prometheus', - fill = 0, - thresholds = [ - { - value: sliSpec.metricTarget, - colorMode: 'critical', - op: 'gt', - line: true, - fill: false, - }, - ], - ).addTarget( - prometheus.target( - 'sum(rate(%(sumMetric)s{%(selectors)s}[%(evalInterval)s]) or vector(0)) / - sum(rate(%(countMetric)s{%(selectors)s}[%(evalInterval)s]) or vector(0))' % { - sumMetric: targetMetrics.sum, - countMetric: targetMetrics.count, - selectors: std.join(',', dashboardSelectors), - evalInterval: sliSpec.evalInterval, - }, - legendFormat = 'Average Latency', - ) - ); - -// Creates custom recording rules for an SLI type -// @param sliSpec The spec for the SLI having its recording rules created -// @param sliMetadata Metadata about the type and category of the SLI -// @param config The config for the service defined in the mixin file -// @returns JSON defining the recording rules -local createCustomRecordingRules(sliSpec, sliMetadata, config) = - local metricConfig = sliMetricLibraryFunctions.getMetricConfig(sliSpec); - local ruleSelectors = sliMetricLibraryFunctions.createRuleSelectors(metricConfig, sliSpec, config); - local targetMetrics = sliMetricLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); - - [ - { - record: 'sli_value', - expr: ||| - histogram_quantile(%(latencyPercentile)0.2f, (sum by (le) (rate(%(bucketMetric)s{%(selectors)s}[%(evalInterval)s])))) - ||| % { - bucketMetric: targetMetrics.bucket, - latencyPercentile: sliSpec.latencyPercentile, - selectors: std.join(',', ruleSelectors), - evalInterval: sliSpec.evalInterval, - }, - labels: sliSpec.sliLabels + sliMetadata, - }, - ]; - -// File exports -{ - createGraphPanel(sliSpec): 
createGraphPanel(sliSpec), - createCustomRecordingRules(sliSpec, sliMetadata, config): createCustomRecordingRules(sliSpec, sliMetadata, config), -} diff --git a/monitoring-as-code/src/sli-metric-libraries/sli-metric-library-template.libsonnet b/monitoring-as-code/src/sli-metric-libraries/sli-metric-library-template.libsonnet deleted file mode 100644 index c1f5c476..00000000 --- a/monitoring-as-code/src/sli-metric-libraries/sli-metric-library-template.libsonnet +++ /dev/null @@ -1,49 +0,0 @@ -// Template for SLI metric libraries - -// MaC imports -local sliMetricLibraryFunctions = import '../util/sli-metric-library-functions.libsonnet'; - -// Grafana imports -local grafana = import 'grafonnet/grafana.libsonnet'; -local prometheus = grafana.prometheus; -local graphPanel = grafana.graphPanel; - -// Creates Grafana dashboard graph panel for an SLI type -// @param sliSpec The spec for the SLI having its dashboard created -// @returns Grafana graph panel object -local createGraphPanel(sliSpec) = - local metricConfig = sliMetricLibraryFunctions.getMetricConfig(sliSpec); - local dashboardSelectors = sliMetricLibraryFunctions.createDashboardSelectors(metricConfig, sliSpec); - local targetMetrics = sliMetricLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); - - graphPanel.new( - title = '%s' % sliSpec.sliDescription, - datasource = 'prometheus', - ); - -// Creates custom recording rules for an SLI type -// @param sliSpec The spec for the SLI having its recording rules created -// @param sliMetadata Metadata about the type and category of the SLI -// @param config The config for the service defined in the mixin file -// @returns JSON defining the recording rules -local createCustomRecordingRules(sliSpec, sliMetadata, config) = - local metricConfig = sliMetricLibraryFunctions.getMetricConfig(sliSpec); - local ruleSelectors = sliMetricLibraryFunctions.createRuleSelectors(metricConfig, sliSpec, config); - local targetMetrics = 
sliMetricLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); - - [ - { - record: 'sli_value', - expr: ||| - ||| % { - evalInterval: sliSpec.evalInterval, - }, - labels: sliSpec.sliLabels + sliMetadata, - }, - ]; - -// File exports -{ - createGraphPanel(sliSpec): createGraphPanel(sliSpec), - createCustomRecordingRules(sliSpec, sliMetadata, config): createCustomRecordingRules(sliSpec, sliMetadata, config), -} diff --git a/monitoring-as-code/src/sli-value-libraries/average-correctness-using-queue-metric.libsonnet b/monitoring-as-code/src/sli-value-libraries/average-correctness-using-queue-metric.libsonnet new file mode 100644 index 00000000..87907f8d --- /dev/null +++ b/monitoring-as-code/src/sli-value-libraries/average-correctness-using-queue-metric.libsonnet @@ -0,0 +1,127 @@ +// Divides the sum of if visible message metric samples above one by the sum of count over time of +// target metric samples + +// Target metrics: +// visibleMessages - Metric representing the number of visible messages +// oldestMessage - Metric representing the age of oldest message + +// Additional config: +// deadletterQueueName custom selector label in metric type config +// deadletterQueueName custom selector in metric type config + +// MaC imports +local sliValueLibraryFunctions = import '../util/sli-value-library-functions.libsonnet'; + +// Grafana imports +local grafana = import 'grafonnet/grafana.libsonnet'; +local prometheus = grafana.prometheus; +local graphPanel = grafana.graphPanel; + +// Creates the custom SLI value rule +// @param sliSpec The spec for the SLI having its recording rules created +// @param sliMetadata Metadata about the type and category of the SLI +// @param config The config for the service defined in the mixin file +// @returns JSON defining the recording rule +local createSliValueRule(sliSpec, sliMetadata, config) = + local metricConfig = sliValueLibraryFunctions.getMetricConfig(sliSpec); + local ruleSelectors = 
sliValueLibraryFunctions.createRuleSelectors(metricConfig, sliSpec, config); + local targetMetrics = sliValueLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); + + [ + { + record: 'sli_value', + expr: ||| + sum(avg_over_time((%(visibleMessagesMetric)s{%(selectors)s, %(queueSelector)s} >= bool 1)[%(evalInterval)s:%(evalInterval)s]) or vector(0)) + / + sum(count_over_time(%(visibleMessagesMetric)s{%(selectors)s, %(queueSelector)s}[%(evalInterval)s])) + ||| % { + visibleMessagesMetric: targetMetrics.visibleMessages, + queueSelector: '%s!~"%s"' % [metricConfig.customSelectorLabels.deadletterQueueName, metricConfig.customSelectors.deadletterQueueName], + selectors: std.join(',', ruleSelectors), + evalInterval: sliSpec.evalInterval, + }, + labels: sliSpec.sliLabels + sliMetadata, + }, + ]; + +// Creates Grafana dashboard graph panel +// @param sliSpec The spec for the SLI having its dashboard created +// @returns Grafana graph panel object +local createGraphPanel(sliSpec) = + local metricConfig = sliValueLibraryFunctions.getMetricConfig(sliSpec); + local dashboardSelectors = sliValueLibraryFunctions.createDashboardSelectors(metricConfig, sliSpec); + local targetMetrics = sliValueLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); + + graphPanel.new( + title = '%s' % sliSpec.sliDescription, + datasource = 'prometheus', + description = ||| + * Sample interval is %(evalInterval)s + * Selectors are %(selectors)s + ||| % { + selectors: std.strReplace(std.join(', ', sliValueLibraryFunctions.getSelectors(metricConfig, sliSpec)), '~', '\\~'), + evalInterval: sliSpec.evalInterval, + }, + min = 0, + fill = 0, + ).addTarget( + prometheus.target( + ||| + sum(avg_over_time(%(visibleMessagesMetric)s{%(selectors)s, %(queueSelector)s}[%(evalInterval)s]) or vector(0)) + ||| % { + visibleMessagesMetric: targetMetrics.visibleMessages, + queueSelector: '%s!~"%s"' % [metricConfig.customSelectorLabels.deadletterQueueName, metricConfig.customSelectors.deadletterQueueName], + 
selectors: std.join(',', dashboardSelectors), + evalInterval: sliSpec.evalInterval, + }, + legendFormat = 'avg number of msgs visible in dlq', + ) + ).addTarget( + prometheus.target( + ||| + sum(avg_over_time((%(visibleMessagesMetric)s{%(selectors)s, %(queueSelector)s} >= bool 1)[%(evalInterval)s:%(evalInterval)s]) or vector(0)) + / + sum(count_over_time(%(visibleMessagesMetric)s{%(selectors)s, %(queueSelector)s}[%(evalInterval)s])) + ||| % { + visibleMessagesMetric: targetMetrics.visibleMessages, + queueSelector: '%s!~"%s"' % [metricConfig.customSelectorLabels.deadletterQueueName, metricConfig.customSelectors.deadletterQueueName], + selectors: std.join(',', dashboardSelectors), + evalInterval: sliSpec.evalInterval, + }, + legendFormat = 'avg period where msgs in dlq >= 1', + ) + ).addTarget( + prometheus.target( + ||| + sum(avg_over_time(%(oldestMessageMetric)s{%(selectors)s, %(queueSelector)s}[%(evalInterval)s]) or vector(0)) + ||| % { + oldestMessageMetric: targetMetrics.oldestMessage, + queueSelector: '%s!~"%s"' % [metricConfig.customSelectorLabels.deadletterQueueName, metricConfig.customSelectors.deadletterQueueName], + selectors: std.join(',', dashboardSelectors), + evalInterval: sliSpec.evalInterval, + }, + legendFormat = 'avg age of oldest msg in dlq (secs)', + ) + ).addSeriesOverride( + { + alias: '/avg age of oldest msg in dlq/', + yaxis: 2, + color: '#8AB8FF', + }, + ).addSeriesOverride( + { + alias: '/avg period where msgs in dlq >= 1/', + color: 'red', + }, + ).addSeriesOverride( + { + alias: '/avg number of msgs visible in dlq/', + color: 'orange', + }, + ); + +// File exports +{ + createSliValueRule(sliSpec, sliMetadata, config): createSliValueRule(sliSpec, sliMetadata, config), + createGraphPanel(sliSpec): createGraphPanel(sliSpec), +} diff --git a/monitoring-as-code/src/sli-value-libraries/average-freshness-using-queue-metric.libsonnet b/monitoring-as-code/src/sli-value-libraries/average-freshness-using-queue-metric.libsonnet new file mode 100644 
index 00000000..d3b48793 --- /dev/null +++ b/monitoring-as-code/src/sli-value-libraries/average-freshness-using-queue-metric.libsonnet @@ -0,0 +1,130 @@ +// Divides the sum of if target metric samples above latency target by the sum of count over time of +// target metric samples + +// Target metrics: +// oldestMessage - Metric representing the age of oldest message +// deletedMessages - Metric representing the number of deleted messages + +// Additional config: +// latencyTarget in SLI spec +// deadletterQueueName custom selector label in metric type config +// deadletterQueueName custom selector in metric type config + +// MaC imports +local sliValueLibraryFunctions = import '../util/sli-value-library-functions.libsonnet'; + +// Grafana imports +local grafana = import 'grafonnet/grafana.libsonnet'; +local prometheus = grafana.prometheus; +local graphPanel = grafana.graphPanel; + +// Creates the custom SLI value rule +// @param sliSpec The spec for the SLI having its recording rules created +// @param sliMetadata Metadata about the type and category of the SLI +// @param config The config for the service defined in the mixin file +// @returns JSON defining the recording rule +local createSliValueRule(sliSpec, sliMetadata, config) = + local metricConfig = sliValueLibraryFunctions.getMetricConfig(sliSpec); + local ruleSelectors = sliValueLibraryFunctions.createRuleSelectors(metricConfig, sliSpec, config); + local targetMetrics = sliValueLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); + + [ + { + record: 'sli_value', + expr: ||| + sum(avg_over_time((%(oldestMessageMetric)s{%(selectors)s, %(queueSelector)s} > bool %(latencyTarget)s)[%(evalInterval)s:%(evalInterval)s]) or vector(0)) + / + sum(count_over_time(%(oldestMessageMetric)s{%(selectors)s, %(queueSelector)s}[%(evalInterval)s])) + ||| % { + oldestMessageMetric: targetMetrics.oldestMessage, + queueSelector: '%s!~"%s"' % [metricConfig.customSelectorLabels.deadletterQueueName, 
metricConfig.customSelectors.deadletterQueueName], + latencyTarget: sliSpec.latencyTarget, + selectors: std.join(',', ruleSelectors), + evalInterval: sliSpec.evalInterval, + }, + labels: sliSpec.sliLabels + sliMetadata, + }, + ]; + +// Creates Grafana dashboard graph panel +// @param sliSpec The spec for the SLI having its dashboard created +// @returns Grafana graph panel object +local createGraphPanel(sliSpec) = + local metricConfig = sliValueLibraryFunctions.getMetricConfig(sliSpec); + local dashboardSelectors = sliValueLibraryFunctions.createDashboardSelectors(metricConfig, sliSpec); + local targetMetrics = sliValueLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); + + graphPanel.new( + title = '%s' % sliSpec.sliDescription, + datasource = 'prometheus', + description = ||| + * Sample interval is %(evalInterval)s + * Selectors are %(selectors)s + ||| % { + selectors: std.strReplace(std.join(', ', sliValueLibraryFunctions.getSelectors(metricConfig, sliSpec)), '~', '\\~'), + evalInterval: sliSpec.evalInterval, + }, + min = 0, + fill = 0, + ).addTarget( + prometheus.target( + ||| + sum(avg_over_time(%(deletedMessagesMetric)s{%(selectors)s, %(queueSelector)s}[%(evalInterval)s]) or vector(0)) + ||| % { + deletedMessagesMetric: targetMetrics.deletedMessages, + queueSelector: '%s!~"%s"' % [metricConfig.customSelectorLabels.deadletterQueueName, metricConfig.customSelectors.deadletterQueueName], + selectors: std.join(',', dashboardSelectors), + evalInterval: sliSpec.evalInterval, + }, + legendFormat = 'avg number of msgs delivered', + ), + ).addTarget( + prometheus.target( + ||| + sum(avg_over_time((%(oldestMessageMetric)s{%(selectors)s, %(queueSelector)s} > bool %(latencyTarget)s)[%(evalInterval)s:%(evalInterval)s]) or vector(0)) + / + sum(count_over_time(%(oldestMessageMetric)s{%(selectors)s, %(queueSelector)s}[%(evalInterval)s])) + ||| % { + oldestMessageMetric: targetMetrics.oldestMessage, + queueSelector: '%s!~"%s"' % 
[metricConfig.customSelectorLabels.deadletterQueueName, metricConfig.customSelectors.deadletterQueueName], + latencyTarget: sliSpec.latencyTarget, + selectors: std.join(',', dashboardSelectors), + evalInterval: sliSpec.evalInterval, + }, + legendFormat='avg period where msg in standard queue > %s seconds' % sliSpec.latencyTarget, + ) + ).addTarget( + prometheus.target( + ||| + sum(avg_over_time(%(oldestMessageMetric)s{%(selectors)s, %(queueSelector)s}[%(evalInterval)s]) or vector(0)) + ||| % { + oldestMessageMetric: targetMetrics.oldestMessage, + queueSelector: '%s!~"%s"' % [metricConfig.customSelectorLabels.deadletterQueueName, metricConfig.customSelectors.deadletterQueueName], + selectors: std.join(',', dashboardSelectors), + evalInterval: sliSpec.evalInterval, + }, + legendFormat='avg age of oldest msg in standard queue (secs)', + ) + ).addSeriesOverride( + { + alias: '/avg age of oldest msg in standard queue/', + yaxis: 2, + color: 'orange', + }, + ).addSeriesOverride( + { + alias: '/avg period where msg in standard queue > %s seconds/' % sliSpec.latencyTarget, + color: 'red', + }, + ).addSeriesOverride( + { + alias: '/avg number of msgs delivered/', + color: 'green', + }, + ); + +// File exports +{ + createSliValueRule(sliSpec, sliMetadata, config): createSliValueRule(sliSpec, sliMetadata, config), + createGraphPanel(sliSpec): createGraphPanel(sliSpec), +} diff --git a/monitoring-as-code/src/sli-value-libraries/average-using-single-metric.libsonnet b/monitoring-as-code/src/sli-value-libraries/average-using-single-metric.libsonnet new file mode 100644 index 00000000..8d6a3af7 --- /dev/null +++ b/monitoring-as-code/src/sli-value-libraries/average-using-single-metric.libsonnet @@ -0,0 +1,89 @@ +// Divides the sum of sum over time of target metric samples by the sum of count over time of +// target metric samples + +// Target metrics: +// target - Metric to get the average value of over evaluation interval + +// MaC imports +local sliValueLibraryFunctions = import
'../util/sli-value-library-functions.libsonnet'; + +// Grafana imports +local grafana = import 'grafonnet/grafana.libsonnet'; +local prometheus = grafana.prometheus; +local graphPanel = grafana.graphPanel; + +// Creates the custom SLI value rule +// @param sliSpec The spec for the SLI having its recording rules created +// @param sliMetadata Metadata about the type and category of the SLI +// @param config The config for the service defined in the mixin file +// @returns JSON defining the recording rule +local createSliValueRule(sliSpec, sliMetadata, config) = + local metricConfig = sliValueLibraryFunctions.getMetricConfig(sliSpec); + local ruleSelectors = sliValueLibraryFunctions.createRuleSelectors(metricConfig, sliSpec, config); + local targetMetrics = sliValueLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); + + [ + { + record: 'sli_value', + expr: ||| + sum(sum_over_time(%(targetMetric)s{%(selectors)s}[%(evalInterval)s])) + / + sum(count_over_time(%(targetMetric)s{%(selectors)s}[%(evalInterval)s])) + ||| % { + targetMetric: targetMetrics.target, + selectors: std.join(',', ruleSelectors), + evalInterval: sliSpec.evalInterval, + }, + labels: sliSpec.sliLabels + sliMetadata, + }, + ]; + +// Creates Grafana dashboard graph panel +// @param sliSpec The spec for the SLI having its dashboard created +// @returns Grafana graph panel object +local createGraphPanel(sliSpec) = + local metricConfig = sliValueLibraryFunctions.getMetricConfig(sliSpec); + local dashboardSelectors = sliValueLibraryFunctions.createDashboardSelectors(metricConfig, sliSpec); + local targetMetrics = sliValueLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); + + graphPanel.new( + title = '%s' % sliSpec.sliDescription, + datasource = 'prometheus', + description = ||| + * Sample interval is %(evalInterval)s + * Selectors are %(selectors)s + ||| % { + selectors: std.strReplace(std.join(', ', sliValueLibraryFunctions.getSelectors(metricConfig, sliSpec)), '~', '\\~'), + evalInterval: 
sliSpec.evalInterval, + }, + min = 0, + fill = 0, + thresholds = [ + { + value: sliSpec.metricTarget, + colorMode: 'critical', + op: 'gt', + line: true, + fill: true, + }, + ], + ).addTarget( + prometheus.target( + ||| + sum(sum_over_time(%(targetMetric)s{%(selectors)s}[%(evalInterval)s])) + / + sum(count_over_time(%(targetMetric)s{%(selectors)s}[%(evalInterval)s])) + ||| % { + targetMetric: targetMetrics.target, + selectors: std.join(',', dashboardSelectors), + evalInterval: sliSpec.evalInterval, + }, + legendFormat = 'Average %s' % sliSpec.sliDescription, + ), + ); + +// File exports +{ + createSliValueRule(sliSpec, sliMetadata, config): createSliValueRule(sliSpec, sliMetadata, config), + createGraphPanel(sliSpec): createGraphPanel(sliSpec), +} diff --git a/monitoring-as-code/src/sli-value-libraries/histogram-quantile-latency.libsonnet b/monitoring-as-code/src/sli-value-libraries/histogram-quantile-latency.libsonnet new file mode 100644 index 00000000..356d9ba0 --- /dev/null +++ b/monitoring-as-code/src/sli-value-libraries/histogram-quantile-latency.libsonnet @@ -0,0 +1,91 @@ +// Calculates the histogram quantile of bucket metric samples + +// Target metrics: +// bucket - Metric representing the buckets histogram values fall into +// sum - Metric representing the sum of values +// count - Metric representing the count of values + +// Additional config: +// latencyPercentile in SLI spec + +// MaC imports +local sliValueLibraryFunctions = import '../util/sli-value-library-functions.libsonnet'; + +// Grafana imports +local grafana = import 'grafonnet/grafana.libsonnet'; +local prometheus = grafana.prometheus; +local graphPanel = grafana.graphPanel; + +// Creates the custom SLI value rule +// @param sliSpec The spec for the SLI having its recording rules created +// @param sliMetadata Metadata about the type and category of the SLI +// @param config The config for the service defined in the mixin file +// @returns JSON defining the recording rule +local
createSliValueRule(sliSpec, sliMetadata, config) = + local metricConfig = sliValueLibraryFunctions.getMetricConfig(sliSpec); + local ruleSelectors = sliValueLibraryFunctions.createRuleSelectors(metricConfig, sliSpec, config); + local targetMetrics = sliValueLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); + + [ + { + record: 'sli_value', + expr: ||| + histogram_quantile(%(latencyPercentile)0.2f, (sum by (le) (rate(%(bucketMetric)s{%(selectors)s}[%(evalInterval)s])))) + ||| % { + bucketMetric: targetMetrics.bucket, + latencyPercentile: sliSpec.latencyPercentile, + selectors: std.join(',', ruleSelectors), + evalInterval: sliSpec.evalInterval, + }, + labels: sliSpec.sliLabels + sliMetadata, + }, + ]; + +// Creates Grafana dashboard graph panel +// @param sliSpec The spec for the SLI having its dashboard created +// @returns Grafana graph panel object +local createGraphPanel(sliSpec) = + local metricConfig = sliValueLibraryFunctions.getMetricConfig(sliSpec); + local dashboardSelectors = sliValueLibraryFunctions.createDashboardSelectors(metricConfig, sliSpec); + local targetMetrics = sliValueLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); + + graphPanel.new( + title = '%s' % sliSpec.sliDescription, + datasource = 'prometheus', + description = ||| + * Sample interval is %(evalInterval)s + * Selectors are %(selectors)s + ||| % { + selectors: std.strReplace(std.join(', ', sliValueLibraryFunctions.getSelectors(metricConfig, sliSpec)), '~', '\\~'), + evalInterval: sliSpec.evalInterval, + }, + fill = 0, + thresholds = [ + { + value: sliSpec.metricTarget, + colorMode: 'critical', + op: 'gt', + line: true, + fill: false, + }, + ], + ).addTarget( + prometheus.target( + ||| + sum(rate(%(sumMetric)s{%(selectors)s}[%(evalInterval)s]) or vector(0)) / + sum(rate(%(countMetric)s{%(selectors)s}[%(evalInterval)s]) or vector(0)) + ||| % { + sumMetric: targetMetrics.sum, + countMetric: targetMetrics.count, + selectors: std.join(',', dashboardSelectors), + evalInterval: 
sliSpec.evalInterval, + }, + legendFormat = 'average latency', + ), + ); + +// File exports +{ + createSliValueRule(sliSpec, sliMetadata, config): createSliValueRule(sliSpec, sliMetadata, config), + createGraphPanel(sliSpec): createGraphPanel(sliSpec), +} diff --git a/monitoring-as-code/src/sli-value-libraries/max-latency-using-cloudwatch-percentile-metric.libsonnet b/monitoring-as-code/src/sli-value-libraries/max-latency-using-cloudwatch-percentile-metric.libsonnet new file mode 100644 index 00000000..4a7207a9 --- /dev/null +++ b/monitoring-as-code/src/sli-value-libraries/max-latency-using-cloudwatch-percentile-metric.libsonnet @@ -0,0 +1,124 @@ +// Calculates the maximum value of CloudWatch percentile metric samples + +// Target metrics: +// p90 - Metric with p90 CloudWatch percentile suffix +// p95 - Metric with p95 CloudWatch percentile suffix +// p99 - Metric with p99 CloudWatch percentile suffix +// average - Metric with average Cloudwatch percentile suffix + +// Additional config: +// latencyPercentile in SLI spec + +// MaC imports +local sliValueLibraryFunctions = import '../util/sli-value-library-functions.libsonnet'; + +// Grafana imports +local grafana = import 'grafonnet/grafana.libsonnet'; +local prometheus = grafana.prometheus; +local graphPanel = grafana.graphPanel; + +// Creates the custom SLI value rule +// @param sliSpec The spec for the SLI having its recording rules created +// @param sliMetadata Metadata about the type and category of the SLI +// @param config The config for the service defined in the mixin file +// @returns JSON defining the recording rule +local createSliValueRule(sliSpec, sliMetadata, config) = + local metricConfig = sliValueLibraryFunctions.getMetricConfig(sliSpec); + local ruleSelectors = sliValueLibraryFunctions.createRuleSelectors(metricConfig, sliSpec, config); + local targetMetrics = sliValueLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); + + local cloudwatchPercentile = if sliSpec.latencyPercentile == 0.9 
then 'p90' + else if sliSpec.latencyPercentile == 0.95 then 'p95' + else if sliSpec.latencyPercentile == 0.99 then 'p99' + else error 'Invalid latency percentile for Cloudwatch conversion'; + + [ + { + record: 'sli_value', + expr: ||| + max(%(cloudwatchPercentileMetric)s{%(selectors)s}) + ||| % { + cloudwatchPercentileMetric: targetMetrics[cloudwatchPercentile], + selectors: std.join(',', ruleSelectors), + evalInterval: sliSpec.evalInterval, + }, + labels: sliSpec.sliLabels + sliMetadata, + }, + ]; + +// Creates Grafana dashboard graph panel +// @param sliSpec The spec for the SLI having its dashboard created +// @returns Grafana graph panel object +local createGraphPanel(sliSpec) = + local metricConfig = sliValueLibraryFunctions.getMetricConfig(sliSpec); + local dashboardSelectors = sliValueLibraryFunctions.createDashboardSelectors(metricConfig, sliSpec); + local targetMetrics = sliValueLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); + + graphPanel.new( + title = '%s' % sliSpec.sliDescription, + datasource = 'prometheus', + description = ||| + * Sample interval is %(evalInterval)s + * Selectors are %(selectors)s + ||| % { + selectors: std.strReplace(std.join(', ', sliValueLibraryFunctions.getSelectors(metricConfig, sliSpec)), '~', '\\~'), + evalInterval: sliSpec.evalInterval, + }, + min = 0, + format = 's', + thresholds = [ + { + value: sliSpec.metricTarget, + colorMode: 'critical', + op: 'gt', + line: true, + fill: false, + }, + ], + ).addTarget( + prometheus.target( + ||| + avg(%(averageMetric)s{%(selectors)s}) + ||| % { + averageMetric: targetMetrics.average, + selectors: std.join(',', dashboardSelectors), + evalInterval: sliSpec.evalInterval, + }, + legendFormat = 'average latency', + ), + ).addTarget( + prometheus.target( + ||| + max(%(p90Metric)s{%(selectors)s}) + ||| % { + p90Metric: targetMetrics.p90, + selectors: std.join(',', dashboardSelectors), + evalInterval: sliSpec.evalInterval, + }, + legendFormat = 'max p90 latency', + ), + ).addTarget( 
+ prometheus.target( + 'max(%(p95Metric)s{%(selectors)s})' % { + p95Metric: targetMetrics.p95, + selectors: std.join(',', dashboardSelectors), + evalInterval: sliSpec.evalInterval, + }, + legendFormat = 'max p95 latency', + ), + ).addTarget( + prometheus.target( + 'max(%(p99Metric)s{%(selectors)s})' % { + p99Metric: targetMetrics.p99, + selectors: std.join(',', dashboardSelectors), + evalInterval: sliSpec.evalInterval, + }, + legendFormat = 'max p99 latency', + ), + ); + +// File exports +{ + createSliValueRule(sliSpec, sliMetadata, config): createSliValueRule(sliSpec, sliMetadata, config), + createGraphPanel(sliSpec): createGraphPanel(sliSpec), +} diff --git a/monitoring-as-code/src/sli-value-libraries/proportion-of-errors-using-bad-request-metrics.libsonnet b/monitoring-as-code/src/sli-value-libraries/proportion-of-errors-using-bad-request-metrics.libsonnet new file mode 100644 index 00000000..5b5b2660 --- /dev/null +++ b/monitoring-as-code/src/sli-value-libraries/proportion-of-errors-using-bad-request-metrics.libsonnet @@ -0,0 +1,137 @@ +// Divides the sum of rate of change of 4xx and 5xx status code metric samples by the sum of +// rate of change of all status code metric samples + +// Target metrics: +// code4xx - Metric representing requests with 4xx status codes +// code5xx - Metric representing requests with 5xx status codes +// codeAll - Metric representing requests with all status codes + +// MaC imports +local sliValueLibraryFunctions = import '../util/sli-value-library-functions.libsonnet'; + +// Grafana imports +local grafana = import 'grafonnet/grafana.libsonnet'; +local prometheus = grafana.prometheus; +local graphPanel = grafana.graphPanel; + +// Creates the custom SLI value rule +// @param sliSpec The spec for the SLI having its recording rules created +// @param sliMetadata Metadata about the type and category of the SLI +// @param config The config for the service defined in the mixin file +// @returns JSON defining the recording rule +local 
createSliValueRule(sliSpec, sliMetadata, config) = + local metricConfig = sliValueLibraryFunctions.getMetricConfig(sliSpec); + local ruleSelectors = sliValueLibraryFunctions.createRuleSelectors(metricConfig, sliSpec, config); + local targetMetrics = sliValueLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); + + [ + { + record: 'sli_value', + expr: ||| + ( + sum(rate(%(code4xxMetric)s{%(selectors)s}[%(evalInterval)s]) or vector(0)) + + + sum(rate(%(code5xxMetric)s{%(selectors)s}[%(evalInterval)s]) or vector(0)) + ) + / + sum(rate(%(codeAllMetric)s{%(selectors)s}[%(evalInterval)s])) + ||| % { + code4xxMetric: targetMetrics.code4xx, + code5xxMetric: targetMetrics.code5xx, + codeAllMetric: targetMetrics.codeAll, + selectors: std.join(',', ruleSelectors), + evalInterval: sliSpec.evalInterval, + }, + labels: sliSpec.sliLabels + sliMetadata, + }, + ]; + +// Creates Grafana dashboard graph panel +// @param sliSpec The spec for the SLI having its dashboard created +// @returns Grafana graph panel object +local createGraphPanel(sliSpec) = + local metricConfig = sliValueLibraryFunctions.getMetricConfig(sliSpec); + local dashboardSelectors = sliValueLibraryFunctions.createDashboardSelectors(metricConfig, sliSpec); + local targetMetrics = sliValueLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); + + graphPanel.new( + title = '%s' % sliSpec.sliDescription, + datasource = 'prometheus', + description = ||| + * Sample interval is %(evalInterval)s + * Selectors are %(selectors)s + * Errors are 4xx and 5xx requests + ||| % { + selectors: std.strReplace(std.join(', ', sliValueLibraryFunctions.getSelectors(metricConfig, sliSpec)), '~', '\\~'), + evalInterval: sliSpec.evalInterval, + }, + min = 0, + fill = 0, + formatY2 = 'percentunit', + thresholds = [ + { + value: sliSpec.metricTarget, + colorMode: 'critical', + op: 'gt', + line: true, + fill: false, + yaxis: 'right', + }, + ], + ).addTarget( + prometheus.target( + ||| + 
sum(rate(%(codeAllMetric)s{%(selectors)s}[%(evalInterval)s]) or vector(0)) + ||| % { + codeAllMetric: targetMetrics.codeAll, + selectors: std.join(',', dashboardSelectors), + evalInterval: sliSpec.evalInterval, + }, + legendFormat = 'requests per second', + ), + ).addTarget( + prometheus.target( + ||| + sum(rate(%(code4xxMetric)s{%(selectors)s}[%(evalInterval)s]) or vector(0)) + + + sum(rate(%(code5xxMetric)s{%(selectors)s}[%(evalInterval)s]) or vector(0)) + ||| % { + code4xxMetric: targetMetrics.code4xx, + code5xxMetric: targetMetrics.code5xx, + selectors: std.join(',', dashboardSelectors), + evalInterval: sliSpec.evalInterval, + }, + legendFormat = 'errors per second', + ) + ).addTarget( + prometheus.target( + ||| + ( + sum(rate(%(code4xxMetric)s{%(selectors)s}[%(evalInterval)s]) or vector(0)) + + + sum(rate(%(code5xxMetric)s{%(selectors)s}[%(evalInterval)s]) or vector(0)) + ) + / + sum(rate(%(codeAllMetric)s{%(selectors)s}[%(evalInterval)s])) + ||| % { + code4xxMetric: targetMetrics.code4xx, + code5xxMetric: targetMetrics.code5xx, + codeAllMetric: targetMetrics.codeAll, + selectors: std.join(',', dashboardSelectors), + evalInterval: sliSpec.evalInterval, + }, + legendFormat = 'error rate', + ) + ).addSeriesOverride( + { + alias: '/error rate/', + yaxis: 2, + color: 'red', + }, + ); + +// File exports +{ + createSliValueRule(sliSpec, sliMetadata, config): createSliValueRule(sliSpec, sliMetadata, config), + createGraphPanel(sliSpec): createGraphPanel(sliSpec), +} diff --git a/monitoring-as-code/src/sli-value-libraries/proportion-of-errors-using-failure-metric.libsonnet b/monitoring-as-code/src/sli-value-libraries/proportion-of-errors-using-failure-metric.libsonnet new file mode 100644 index 00000000..03a29517 --- /dev/null +++ b/monitoring-as-code/src/sli-value-libraries/proportion-of-errors-using-failure-metric.libsonnet @@ -0,0 +1,112 @@ +// Divides the sum of rate of change of failure metric samples by the sum of rate of change of +// success and failure metric 
samples + +// Target metrics: +// failure - Metric representing the total number of failures +// successAndFailure - Metric representing the total number of successes and failures + +// MaC imports +local sliValueLibraryFunctions = import '../util/sli-value-library-functions.libsonnet'; + +// Grafana imports +local grafana = import 'grafonnet/grafana.libsonnet'; +local prometheus = grafana.prometheus; +local graphPanel = grafana.graphPanel; + +// Creates the custom SLI value rule +// @param sliSpec The spec for the SLI having its recording rules created +// @param sliMetadata Metadata about the type and category of the SLI +// @param config The config for the service defined in the mixin file +// @returns JSON defining the recording rule +local createSliValueRule(sliSpec, sliMetadata, config) = + local metricConfig = sliValueLibraryFunctions.getMetricConfig(sliSpec); + local ruleSelectors = sliValueLibraryFunctions.createRuleSelectors(metricConfig, sliSpec, config); + local targetMetrics = sliValueLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); + + [ + { + record: 'sli_value', + expr: ||| + sum(rate(%(failureMetric)s{%(selectors)s}[%(evalInterval)s])) + / + sum(rate(%(successAndFailureMetric)s{%(selectors)s}[%(evalInterval)s])) + ||| % { + failureMetric: targetMetrics.failure, + successAndFailureMetric: targetMetrics.successAndFailure, + selectors: std.join(',', ruleSelectors), + evalInterval: sliSpec.evalInterval, + }, + labels: sliSpec.sliLabels + sliMetadata, + }, + ]; + +// Creates Grafana dashboard graph panel +// @param sliSpec The spec for the SLI having its dashboard created +// @returns Grafana graph panel object +local createGraphPanel(sliSpec) = + local metricConfig = sliValueLibraryFunctions.getMetricConfig(sliSpec); + local dashboardSelectors = sliValueLibraryFunctions.createDashboardSelectors(metricConfig, sliSpec); + local targetMetrics = sliValueLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); + + graphPanel.new( + title = '%s' % 
sliSpec.sliDescription, + datasource = 'prometheus', + description = ||| + * Sample interval is %(evalInterval)s + * Selectors are %(selectors)s + ||| % { + selectors: std.strReplace(std.join(', ', sliValueLibraryFunctions.getSelectors(metricConfig, sliSpec)), '~', '\\~'), + evalInterval: sliSpec.evalInterval, + }, + min = 0, + fill = 0, + formatY2 = 'percentunit', + ).addTarget( + prometheus.target( + ||| + sum(rate(%(successAndFailureMetric)s{%(selectors)s}[%(evalInterval)s]) or vector(0)) + ||| % { + successAndFailureMetric: targetMetrics.successAndFailure, + selectors: std.join(',', dashboardSelectors), + evalInterval: sliSpec.evalInterval, + }, + legendFormat = 'total per second', + ), + ).addTarget( + prometheus.target( + ||| + sum(rate(%(failureMetric)s{%(selectors)s}[%(evalInterval)s]) or vector(0)) + ||| % { + failureMetric: targetMetrics.failure, + selectors: std.join(',', dashboardSelectors), + evalInterval: sliSpec.evalInterval, + }, + legendFormat = 'errors per second', + ), + ).addTarget( + prometheus.target( + ||| + sum(rate(%(failureMetric)s{%(selectors)s}[%(evalInterval)s])) + / + sum(rate(%(successAndFailureMetric)s{%(selectors)s}[%(evalInterval)s])) + ||| % { + failureMetric: targetMetrics.failure, + successAndFailureMetric: targetMetrics.successAndFailure, + selectors: std.join(',', dashboardSelectors), + evalInterval: sliSpec.evalInterval, + }, + legendFormat = 'errors rate', + ), + ).addSeriesOverride( + { + alias: '/error rate/', + yaxis: 2, + color: 'red', + } + ); + +// File exports +{ + createSliValueRule(sliSpec, sliMetadata, config): createSliValueRule(sliSpec, sliMetadata, config), + createGraphPanel(sliSpec): createGraphPanel(sliSpec), +} diff --git a/monitoring-as-code/src/sli-value-libraries/proportion-of-errors-using-label.libsonnet b/monitoring-as-code/src/sli-value-libraries/proportion-of-errors-using-label.libsonnet new file mode 100644 index 00000000..a80042b6 --- /dev/null +++ 
b/monitoring-as-code/src/sli-value-libraries/proportion-of-errors-using-label.libsonnet @@ -0,0 +1,128 @@ +// Divides the sum of rate of change of metric samples with bad error selectors by the sum of rate of change +// of metric samples with all error selectors + +// Target metrics: +// target - Metric with a selector label that can be used to differentiate between good and bad + +// Additional config: +// errorStatus selector in SLI spec +// errorStatus selector label in metric type config + +// MaC imports +local sliValueLibraryFunctions = import '../util/sli-value-library-functions.libsonnet'; + +// Grafana imports +local grafana = import 'grafonnet/grafana.libsonnet'; +local prometheus = grafana.prometheus; +local graphPanel = grafana.graphPanel; + +// Creates the custom SLI value rule +// @param sliSpec The spec for the SLI having its recording rules created +// @param sliMetadata Metadata about the type and category of the SLI +// @param config The config for the service defined in the mixin file +// @returns JSON defining the recording rule +local createSliValueRule(sliSpec, sliMetadata, config) = + local metricConfig = sliValueLibraryFunctions.getMetricConfig(sliSpec); + local ruleSelectors = sliValueLibraryFunctions.createRuleSelectors(metricConfig, sliSpec, config); + local targetMetrics = sliValueLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); + + [ + { + record: 'sli_value', + expr: ||| + sum(rate(%(targetMetric)s{%(selectors)s, %(errorStatusSelector)s}[%(evalInterval)s]) or vector(0)) + / + sum(rate(%(targetMetric)s{%(selectors)s}[%(evalInterval)s])) + ||| % { + targetMetric: targetMetrics.target, + errorStatusSelector: sliValueLibraryFunctions.getSelector('errorStatus', metricConfig, sliSpec), + selectors: std.join(',', ruleSelectors), + evalInterval: sliSpec.evalInterval, + }, + labels: sliSpec.sliLabels + sliMetadata, + }, + ]; + +// Creates Grafana dashboard graph panel +// @param sliSpec The spec for the SLI having its dashboard created 
+// @returns Grafana graph panel object +local createGraphPanel(sliSpec) = + local metricConfig = sliValueLibraryFunctions.getMetricConfig(sliSpec); + local dashboardSelectors = sliValueLibraryFunctions.createDashboardSelectors(metricConfig, sliSpec); + local targetMetrics = sliValueLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); + + graphPanel.new( + title = '%s' % sliSpec.sliDescription, + datasource = 'prometheus', + description = ||| + * Sample interval is %(evalInterval)s + * Selectors are %(selectors)s + * Error selectors are %(errorStatusSelector)s + ||| % { + errorStatusSelector: sliValueLibraryFunctions.getSelector('errorStatus', metricConfig, sliSpec), + selectors: std.strReplace(std.join(', ', sliValueLibraryFunctions.getSelectors(metricConfig, sliSpec)), '~', '\\~'), + evalInterval: sliSpec.evalInterval, + }, + min = 0, + fill = 0, + formatY2 = 'percentunit', + thresholds = [ + { + value: sliSpec.metricTarget, + colorMode: 'critical', + op: 'gt', + line: true, + fill: false, + yaxis: 'right', + }, + ], + ).addTarget( + prometheus.target( + ||| + sum(rate(%(targetMetric)s{%(selectors)s}[%(evalInterval)s])) + ||| % { + targetMetric: targetMetrics.target, + selectors: std.join(',', dashboardSelectors), + evalInterval: sliSpec.evalInterval, + }, + legendFormat = 'requests per second', + ), + ).addTarget( + prometheus.target( + ||| + sum(rate(%(targetMetric)s{%(selectors)s, %(errorStatusSelector)s}[%(evalInterval)s]) or vector(0)) + ||| % { + targetMetric: targetMetrics.target, + errorStatusSelector: sliValueLibraryFunctions.getSelector('errorStatus', metricConfig, sliSpec), + selectors: std.join(',', dashboardSelectors), + evalInterval: sliSpec.evalInterval, + }, + legendFormat = 'errors per second', + ), + ).addTarget( + prometheus.target( + ||| + sum(rate(%(targetMetric)s{%(selectors)s, %(errorStatusSelector)s}[%(evalInterval)s]) or vector(0)) + / + sum(rate(%(targetMetric)s{%(selectors)s}[%(evalInterval)s])) + ||| % { + targetMetric: 
targetMetrics.target, + errorStatusSelector: sliValueLibraryFunctions.getSelector('errorStatus', metricConfig, sliSpec), + selectors: std.join(',', dashboardSelectors), + evalInterval: sliSpec.evalInterval, + }, + legendFormat = 'error rate', + ), + ).addSeriesOverride( + { + alias: '/error rate/', + yaxis: 2, + color: 'red', + } + ); + +// File exports +{ + createSliValueRule(sliSpec, sliMetadata, config): createSliValueRule(sliSpec, sliMetadata, config), + createGraphPanel(sliSpec): createGraphPanel(sliSpec), +} diff --git a/monitoring-as-code/src/sli-value-libraries/sli-value-library-template.libsonnet b/monitoring-as-code/src/sli-value-libraries/sli-value-library-template.libsonnet new file mode 100644 index 00000000..58bc7f3d --- /dev/null +++ b/monitoring-as-code/src/sli-value-libraries/sli-value-library-template.libsonnet @@ -0,0 +1,74 @@ +// Describe the calculation being performed for the SLI value + +// Target metrics: +// List the target metrics needed for this SLI value in format: keyword - Description of metric + +// Additional config: +// List any additional required config either from metric type config or SLI spec + +// MaC imports +local sliValueLibraryFunctions = import '../util/sli-value-library-functions.libsonnet'; + +// Grafana imports +local grafana = import 'grafonnet/grafana.libsonnet'; +local prometheus = grafana.prometheus; +local graphPanel = grafana.graphPanel; + +// Creates the custom SLI value rule +// @param sliSpec The spec for the SLI having its recording rules created +// @param sliMetadata Metadata about the type and category of the SLI +// @param config The config for the service defined in the mixin file +// @returns JSON defining the recording rule +local createSliValueRule(sliSpec, sliMetadata, config) = + local metricConfig = sliValueLibraryFunctions.getMetricConfig(sliSpec); + local ruleSelectors = sliValueLibraryFunctions.createRuleSelectors(metricConfig, sliSpec, config); + local targetMetrics = 
sliValueLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); + + [ + { + record: 'sli_value', + expr: ||| + + ||| % { + selectors: std.join(',', ruleSelectors), + evalInterval: sliSpec.evalInterval, + }, + labels: sliSpec.sliLabels + sliMetadata, + }, + ]; + +// Creates Grafana dashboard graph panel +// @param sliSpec The spec for the SLI having its dashboard created +// @returns Grafana graph panel object +local createGraphPanel(sliSpec) = + local metricConfig = sliValueLibraryFunctions.getMetricConfig(sliSpec); + local dashboardSelectors = sliValueLibraryFunctions.createDashboardSelectors(metricConfig, sliSpec); + local targetMetrics = sliValueLibraryFunctions.getTargetMetrics(metricConfig, sliSpec); + + graphPanel.new( + title = '%s' % sliSpec.sliDescription, + datasource = 'prometheus', + description = ||| + * Sample interval is %(evalInterval)s + * Selectors are %(selectors)s + ||| % { + selectors: std.strReplace(std.join(', ', sliValueLibraryFunctions.getSelectors(metricConfig, sliSpec)), '~', '\\~'), + evalInterval: sliSpec.evalInterval, + }, + ).addTarget( + prometheus.target( + ||| + + ||| % { + selectors: std.join(',', dashboardSelectors), + evalInterval: sliSpec.evalInterval, + }, + legendFormat = '', + ), + ); + +// File exports +{ + createSliValueRule(sliSpec, sliMetadata, config): createSliValueRule(sliSpec, sliMetadata, config), + createGraphPanel(sliSpec): createGraphPanel(sliSpec), +} diff --git a/monitoring-as-code/src/util/sli-metric-library-functions.libsonnet b/monitoring-as-code/src/util/sli-value-library-functions.libsonnet similarity index 100% rename from monitoring-as-code/src/util/sli-metric-library-functions.libsonnet rename to monitoring-as-code/src/util/sli-value-library-functions.libsonnet From e0d48506aa26388d448a7e686c776f2e632406c2 Mon Sep 17 00:00:00 2001 From: finlaymccormickHO <102794431+finlaymccormickHO@users.noreply.github.com> Date: Mon, 22 Aug 2022 10:06:40 +0100 Subject: [PATCH 07/12] feat: added new opensearch 
metrics and availablity sli type (#181) --- monitoring-as-code/src/metric-types.libsonnet | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/monitoring-as-code/src/metric-types.libsonnet b/monitoring-as-code/src/metric-types.libsonnet index c82093d9..47854545 100644 --- a/monitoring-as-code/src/metric-types.libsonnet +++ b/monitoring-as-code/src/metric-types.libsonnet @@ -472,9 +472,21 @@ }, metrics: { averageLatency: 'aws_es_search_latency_average', + sum4xx: 'aws_es_4xx_sum', + sum5xx: 'aws_es_5xx_sum', + requestsSum: 'aws_es_open_search_requests_sum', }, }, sliTypesConfig: { + availability: { + library: (import 'sli-value-libraries/proportion-of-errors-using-bad-request-metrics.libsonnet'), + description: 'The error rate for %(sliDescription)s should be below %(metric_target_percent)0.1f%%', + targetMetrics: { + code4xx: 'sum4xx', + code5xx: 'sum5xx', + codeAll: 'requestsSum', + }, + }, latency: { library: (import 'sli-value-libraries/average-using-single-metric.libsonnet'), description: 'The average latency of %(sliDescription)s should be %(comparison)s %(metricTarget)0.1f', From d215b8740ab83a528d35435accf5eb6e7063dddf Mon Sep 17 00:00:00 2001 From: finlaymccormickHO <102794431+finlaymccormickHO@users.noreply.github.com> Date: Mon, 22 Aug 2022 10:38:48 +0100 Subject: [PATCH 08/12] refactor: giving target metrics for detail dashboard elements better names and defining them in comment at top of files (#183) --- .../cloudwatch-sqs.libsonnet | 28 +++++++++++++------ ...tail-dashboard-elements-template.libsonnet | 6 +++- .../http-requests-availability.libsonnet | 11 +++++--- .../http-requests-latency.libsonnet | 11 +++++--- monitoring-as-code/src/metric-types.libsonnet | 22 +++++++-------- 5 files changed, 50 insertions(+), 28 deletions(-) diff --git a/monitoring-as-code/src/dashboards/detail-dashboard-elements/cloudwatch-sqs.libsonnet b/monitoring-as-code/src/dashboards/detail-dashboard-elements/cloudwatch-sqs.libsonnet index 73c7e869..8772e4d3 100644 
--- a/monitoring-as-code/src/dashboards/detail-dashboard-elements/cloudwatch-sqs.libsonnet +++ b/monitoring-as-code/src/dashboards/detail-dashboard-elements/cloudwatch-sqs.libsonnet @@ -1,3 +1,15 @@ +// Target metrics: +// sentMessages - Metric representing the number of messages sent +// visibleMessages - Metric representing the number of visible messages +// deletedMessages - Metric representing the number of deleted messages +// oldestMessage - Metric representing the age of oldest message + +// Additional config: +// deadletterQueueType custom selector label in metric type config +// deadletterQueueName custom selector label in metric type config +// deadletterQueueType custom selector in metric type config +// deadletterQueueName custom selector in metric type config + // MaC imports local stringFormattingFunctions = import '../../util/string-formatting-functions.libsonnet'; @@ -128,11 +140,11 @@ local createPanels(direction, metrics, selectorLabels, customSelectorLabels, cus min = 0, ).addTarget( prometheus.target(||| - sum by (%(deadletterQueueNameSelectorLabels)s) ({__name__=~"%(messagesVisibleMetrics)s", %(queueSelectors)s, + sum by (%(deadletterQueueNameSelectorLabels)s) ({__name__=~"%(visibleMessagesMetrics)s", %(queueSelectors)s, %(queueTemplateSelectors)s, %(environmentSelectors)s, %(productSelectors)s}) ||| % { deadletterQueueNameSelectorLabels: std.join(', ', customSelectorLabels.deadletterQueueName), - messagesVisibleMetrics: std.join('|', metrics.messagesVisible), + visibleMessagesMetrics: std.join('|', metrics.visibleMessages), queueSelectors: selectors.deadletterQueue, queueTemplateSelectors: selectors.deadletterQueueTemplate, environmentSelectors: selectors.environment, @@ -167,11 +179,11 @@ local createPanels(direction, metrics, selectorLabels, customSelectorLabels, cus min = 0, ).addTarget( prometheus.target(||| - sum by (%(deadletterQueueNameSelectorLabels)s) ({__name__=~"%(messagesVisibleMetrics)s", %(queueSelectors)s, + sum by 
(%(deadletterQueueNameSelectorLabels)s) ({__name__=~"%(visibleMessagesMetrics)s", %(queueSelectors)s, %(queueTemplateSelectors)s, %(environmentSelectors)s, %(productSelectors)s}) ||| % { deadletterQueueNameSelectorLabels: std.join(', ', customSelectorLabels.deadletterQueueName), - messagesVisibleMetrics: std.join('|', metrics.messagesVisible), + visibleMessagesMetrics: std.join('|', metrics.visibleMessages), queueSelectors: selectors.standardQueue, queueTemplateSelectors: selectors.standardQueueTemplate, environmentSelectors: selectors.environment, @@ -186,11 +198,11 @@ local createPanels(direction, metrics, selectorLabels, customSelectorLabels, cus min = 0, ).addTarget( prometheus.target(||| - sum by (%(deadletterQueueNameSelectorLabels)s) ({__name__=~"%(messagesSentMetrics)s", %(queueSelectors)s, + sum by (%(deadletterQueueNameSelectorLabels)s) ({__name__=~"%(sentMessagesMetrics)s", %(queueSelectors)s, %(queueTemplateSelectors)s, %(environmentSelectors)s, %(productSelectors)s}) ||| % { deadletterQueueNameSelectorLabels: std.join(', ', customSelectorLabels.deadletterQueueName), - messagesSentMetrics: std.join('|', metrics.messagesSent), + sentMessagesMetrics: std.join('|', metrics.sentMessages), queueSelectors: selectors.standardQueue, queueTemplateSelectors: selectors.standardQueueTemplate, environmentSelectors: selectors.environment, @@ -205,11 +217,11 @@ local createPanels(direction, metrics, selectorLabels, customSelectorLabels, cus min = 0, ).addTarget( prometheus.target(||| - sum by (%(deadletterQueueNameSelectorLabels)s) ({__name__=~"%(messagesDeletedMetrics)s", %(queueSelectors)s, + sum by (%(deadletterQueueNameSelectorLabels)s) ({__name__=~"%(deletedMessagesMetrics)s", %(queueSelectors)s, %(queueTemplateSelectors)s, %(environmentSelectors)s, %(productSelectors)s}) ||| % { deadletterQueueNameSelectorLabels: std.join(', ', customSelectorLabels.deadletterQueueName), - messagesDeletedMetrics: std.join('|', metrics.messagesDeleted), + deletedMessagesMetrics: 
std.join('|', metrics.deletedMessages), queueSelectors: selectors.standardQueue, queueTemplateSelectors: selectors.standardQueueTemplate, environmentSelectors: selectors.environment, diff --git a/monitoring-as-code/src/dashboards/detail-dashboard-elements/detail-dashboard-elements-template.libsonnet b/monitoring-as-code/src/dashboards/detail-dashboard-elements/detail-dashboard-elements-template.libsonnet index 84618072..cf191076 100644 --- a/monitoring-as-code/src/dashboards/detail-dashboard-elements/detail-dashboard-elements-template.libsonnet +++ b/monitoring-as-code/src/dashboards/detail-dashboard-elements/detail-dashboard-elements-template.libsonnet @@ -1,4 +1,8 @@ -// Template for detail dashboard element files +// Target metrics: +// List the target metrics needed for this SLI value in format: keyword - Description of metric + +// Additional config: +// List any additional required config either from metric type config or SLI spec // MaC imports local stringFormattingFunctions = import '../../util/string-formatting-functions.libsonnet'; diff --git a/monitoring-as-code/src/dashboards/detail-dashboard-elements/http-requests-availability.libsonnet b/monitoring-as-code/src/dashboards/detail-dashboard-elements/http-requests-availability.libsonnet index 40538890..dd1aaacf 100644 --- a/monitoring-as-code/src/dashboards/detail-dashboard-elements/http-requests-availability.libsonnet +++ b/monitoring-as-code/src/dashboards/detail-dashboard-elements/http-requests-availability.libsonnet @@ -1,3 +1,6 @@ +// Target metrics: +// requestCount - Metric representing the count of requests + // MaC imports local stringFormattingFunctions = import '../../util/string-formatting-functions.libsonnet'; @@ -51,10 +54,10 @@ local createPanels(direction, metrics, selectorLabels, customSelectorLabels, cus legend_hideZero = true, ).addTarget( prometheus.target(||| - sum by (%(errorSelectorLabels)s) (rate({__name__=~"%(countMetrics)s", %(selectors)s}[$__rate_interval])) + sum by 
(%(errorSelectorLabels)s) (rate({__name__=~"%(requestCountMetrics)s", %(selectors)s}[$__rate_interval])) ||| % { errorSelectorLabels: std.join(', ', selectorLabels.errorStatus), - countMetrics: std.join('|', metrics.count), + requestCountMetrics: std.join('|', metrics.requestCount), selectors: std.join(', ', std.objectValues(selectors)), }, legendFormat = '{{%s}}' % std.join(', ', selectorLabels.errorStatus)) @@ -71,10 +74,10 @@ local createPanels(direction, metrics, selectorLabels, customSelectorLabels, cus legend_hideZero = true, ).addTarget( prometheus.target(||| - sum by (%(resourceSelectorLabels)s) (rate({__name__=~"%(countMetrics)s", %(selectors)s}[$__rate_interval])) + sum by (%(resourceSelectorLabels)s) (rate({__name__=~"%(requestCountMetrics)s", %(selectors)s}[$__rate_interval])) ||| % { resourceSelectorLabels: std.join(', ', selectorLabels.resource), - countMetrics: std.join('|', metrics.count), + requestCountMetrics: std.join('|', metrics.requestCount), selectors: std.join(', ', std.objectValues(selectors)), }, legendFormat = '{{%s}}' % std.join(', ', selectorLabels.resource)) diff --git a/monitoring-as-code/src/dashboards/detail-dashboard-elements/http-requests-latency.libsonnet b/monitoring-as-code/src/dashboards/detail-dashboard-elements/http-requests-latency.libsonnet index cfbd0201..3eb991b2 100644 --- a/monitoring-as-code/src/dashboards/detail-dashboard-elements/http-requests-latency.libsonnet +++ b/monitoring-as-code/src/dashboards/detail-dashboard-elements/http-requests-latency.libsonnet @@ -1,3 +1,6 @@ +// Target metrics: +// requestBucket - Metric representing the buckets histogram request latency values fall into + // MaC imports local stringFormattingFunctions = import '../../util/string-formatting-functions.libsonnet'; @@ -56,9 +59,9 @@ local createPanels(direction, metrics, selectorLabels, customSelectorLabels, cus legend_show = true, ).addTarget( prometheus.target(||| - sum by (le) (increase({__name__=~"%(bucketMetrics)s", 
%(selectors)s}[$__rate_interval])) + sum by (le) (increase({__name__=~"%(requestBucketMetrics)s", %(selectors)s}[$__rate_interval])) ||| % { - bucketMetrics: std.join('|', metrics.bucket), + requestBucketMetrics: std.join('|', metrics.requestBucket), selectors: std.join(', ', std.objectValues(selectors)), }, legendFormat = '{{le}}', format = 'heatmap') @@ -70,11 +73,11 @@ local createPanels(direction, metrics, selectorLabels, customSelectorLabels, cus format = 's', ).addTarget( prometheus.target(||| - histogram_quantile($%(direction)s_latency_percentile/100, (sum by (le) (rate({__name__=~"%(bucketMetrics)s", + histogram_quantile($%(direction)s_latency_percentile/100, (sum by (le) (rate({__name__=~"%(requestBucketMetrics)s", %(selectors)s}[$__rate_interval])))) ||| % { direction: direction, - bucketMetrics: std.join('|', metrics.bucket), + requestBucketMetrics: std.join('|', metrics.requestBucket), selectors: std.join(', ', std.objectValues(selectors)), }, legendFormat = 'Selected Percentile Latency') diff --git a/monitoring-as-code/src/metric-types.libsonnet b/monitoring-as-code/src/metric-types.libsonnet index 47854545..cbb66fd2 100644 --- a/monitoring-as-code/src/metric-types.libsonnet +++ b/monitoring-as-code/src/metric-types.libsonnet @@ -49,8 +49,8 @@ standardTemplates: ['resource', 'errorStatus'], elements: ['httpRequestsAvailability', 'httpRequestsLatency'], targetMetrics: { - count: 'count', - bucket: 'bucket', + requestCount: 'count', + requestBucket: 'bucket', }, }, }, @@ -79,7 +79,7 @@ standardTemplates: ['resource', 'errorStatus'], elements: ['httpRequestsAvailability'], targetMetrics: { - count: 'count', + requestCount: 'count', }, }, }, @@ -108,7 +108,7 @@ standardTemplates: ['resource', 'errorStatus'], elements: ['httpRequestsAvailability'], targetMetrics: { - count: 'count', + requestCount: 'count', }, }, }, @@ -140,7 +140,7 @@ standardTemplates: ['resource'], elements: ['httpRequestsLatency'], targetMetrics: { - bucket: 'bucket', + requestBucket: 
'bucket', }, }, }, @@ -168,7 +168,7 @@ standardTemplates: ['errorStatus'], elements: ['httpRequestsAvailability'], targetMetrics: { - count: 'count', + requestCount: 'count', }, }, }, @@ -207,8 +207,8 @@ standardTemplates: ['errorStatus'], elements: ['httpRequestsAvailability', 'httpRequestsLatency'], targetMetrics: { - count: 'count', - bucket: 'bucket', + requestCount: 'count', + requestBucket: 'bucket', }, }, }, @@ -299,9 +299,9 @@ elements: ['cloudwatchSqs'], targetMetrics: { oldestMessage: 'oldestMessage', - messagesDeleted: 'messagesDeleted', - messagesVisible: 'messagesVisible', - messagesSent: 'messagesSent', + sentMessages: 'messagesSent', + visibleMessages: 'messagesVisisble', + deletedMessages: 'messagesDeleted', }, }, }, From 8b082da65bf6d83027c5868037c0b31723f3d899 Mon Sep 17 00:00:00 2001 From: Humayun Alam <108126376+humayunalamHO@users.noreply.github.com> Date: Mon, 22 Aug 2022 15:44:25 +0100 Subject: [PATCH 09/12] refactor: add docker image scan to the workflow (#178) * refactor: add docker image scan to the workflow * refactor: update docker image scan * fix: switch snyk worklow sequence * fix: correct snyk output file names * fix: correct remove snyk output from image scan * fix: update artifacts in dockerfile and removed promtool * fix: hard code git version * fix: move grafonnet lib to stage 1 of the docker build * fix: tag version to git and bash library * fix: remove bash from runner stage Co-authored-by: Michael Pearson Co-authored-by: Mahruf Iqbal <102766665+mahrufiqbalHO@users.noreply.github.com> --- .github/workflows/docker-build-branch.yml | 24 ++++++++++++++++++++--- monitoring-as-code/Dockerfile | 24 ++++++++++------------- monitoring-as-code/run-mixin.sh | 3 --- 3 files changed, 31 insertions(+), 20 deletions(-) diff --git a/.github/workflows/docker-build-branch.yml b/.github/workflows/docker-build-branch.yml index d34af691..437cd9bb 100644 --- a/.github/workflows/docker-build-branch.yml +++ b/.github/workflows/docker-build-branch.yml 
@@ -50,8 +50,9 @@ jobs: PACKAGE_TOKEN=${{secrets.GITHUB_TOKEN}} MAC_VERSION='#${{ github.event.number }}' - - name: Run Snyk to check Docker image for vulnerabilities - id: snyk + # Snyk security scan of Docker file only + - name: Run Snyk to check Docker file for vulnerabilities + id: snyk-dockerfile continue-on-error: true uses: snyk/actions/docker@0.3.0 env: @@ -69,7 +70,24 @@ jobs: sarif_file: snyk.sarif - name: Check on failures - if: steps.snyk.outcome != 'success' + if: steps.snyk-dockerfile.outcome != 'success' + run: exit 1 + + # Snyk security scan of Built Docker Image and unmanaged dependencies + - name: Run Snyk to check Docker image for vulnerabilities + id: snyk-image + continue-on-error: true + uses: snyk/actions/docker@0.3.0 + env: + SNYK_TOKEN: ${{ secrets.SNYK_TOKEN }} + with: + image: ${{ steps.meta.outputs.tags }} + args: | + --app-vulns + --severity-threshold=medium + + - name: Check on failures + if: steps.snyk-image.outcome != 'success' run: exit 1 - name: Push image to GitHub Container Registry diff --git a/monitoring-as-code/Dockerfile b/monitoring-as-code/Dockerfile index e97be4e5..425fd93f 100644 --- a/monitoring-as-code/Dockerfile +++ b/monitoring-as-code/Dockerfile @@ -1,32 +1,28 @@ # Builder image to download binaries -FROM golang:1.19.0-alpine3.15 AS builder +FROM golang:1.19.0-alpine3.16 AS builder # Download git, jsonnet and jsonnet-bundler -RUN apk add --no-cache git=2.34.4-r0 curl jq wget && \ - go install github.com/google/go-jsonnet/cmd/jsonnet@v0.18.0 && \ +RUN apk add --no-cache git=2.36.2-r0 && \ + go install github.com/google/go-jsonnet/cmd/jsonnet@v0.17.0 && \ go install github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@v0.5.1 -# Download promtool -RUN VERSION=$(curl -Ls https://api.github.com/repos/prometheus/prometheus/releases/latest | jq ".tag_name" | xargs | cut -c2-) && \ - wget -qO- "https://github.com/prometheus/prometheus/releases/download/v${VERSION}/prometheus-$VERSION.linux-amd64.tar.gz" \ - | tar xvzf - 
"prometheus-$VERSION.linux-amd64"/promtool --strip-components=1 && cp promtool /go/bin/promtool +WORKDIR /go/grafonnet +# Download grafonnet and grafana-builder +RUN jb init && \ + jb install https://github.com/grafana/grafonnet-lib/grafonnet && \ + jb install https://github.com/grafana/jsonnet-libs/grafana-builder FROM alpine:3.16.2 AS runner -RUN apk --no-cache add git bash # Pass semver or PR from GitHub workflow ARG MAC_VERSION ENV MAC_VERSION $MAC_VERSION -# Download grafonnet and grafana-builder COPY --from=builder /go/bin/* /usr/local/bin/ -RUN jb init && \ - jb install https://github.com/grafana/grafonnet-lib/grafonnet && \ - jb install https://github.com/grafana/jsonnet-libs/grafana-builder - +COPY --from=builder /go/grafonnet / COPY src /src COPY mixin-defs /mixin-defs COPY run-mixin.sh / RUN chmod a+x /usr/local/bin/jb /usr/local/bin/jsonnet /run-mixin.sh -ENTRYPOINT ["/run-mixin.sh"] +ENTRYPOINT ["/run-mixin.sh"] \ No newline at end of file diff --git a/monitoring-as-code/run-mixin.sh b/monitoring-as-code/run-mixin.sh index a7b75d16..669a6748 100644 --- a/monitoring-as-code/run-mixin.sh +++ b/monitoring-as-code/run-mixin.sh @@ -93,9 +93,6 @@ if [ "$generate_rules" = "true" ]; then jsonnet -J vendor --ext-str ENV="$environment" --ext-str ACCOUNT="$account" --ext-str MAC_VERSION="$MAC_VERSION" -S -e "std.manifestYamlDoc((import \"${PWD}/_input/mixin.jsonnet\").prometheusAlerts)" > "$PWD"/_output/prometheus-rules/"$mixin"-"$environment"-alert-rules.yaml if [ $? -ne 0 ]; then echo "Failed to run alert rules for ${mixin} (environment ${environment}) - exiting"; exit; fi - # Test Prometheus rules with promtool - promtool check rules "$PWD"/_output/prometheus-rules/"$mixin"-"$environment"-alert-rules.yaml "$PWD"/_output/prometheus-rules/"$mixin"-"$environment"-recording-rules.yaml - if [ $? 
-ne 0 ]; then echo "Validation of rules files failed for ${mixin} (environment ${environment}) - exiting"; exit 1; fi done fi From c495c8b195eb9853e0be37342605d5923fb21dc3 Mon Sep 17 00:00:00 2001 From: Mike Pearson <95076970+michaelpearsonHO@users.noreply.github.com> Date: Mon, 22 Aug 2022 16:47:51 +0100 Subject: [PATCH 10/12] 187 update readme installation and useage instructions (#188) * feat: add installation and useage instructions to readme * fix: correct syntax errors in readme * fix: correct typo with docker run example --- monitoring-as-code/README.md | 48 +++++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/monitoring-as-code/README.md b/monitoring-as-code/README.md index 3f426e02..fb5e9b4d 100644 --- a/monitoring-as-code/README.md +++ b/monitoring-as-code/README.md @@ -19,7 +19,15 @@ Monitoring Mixins bundle up SLI configuration, Alerting, Grafana dashboards, and - [docker](https://docs.docker.com) - [git](https://git-scm.com) -**Now in a directory of your choosing run the following setup commands.** +## Docker installation + +See GitHub Releases page for most recent tagged version and pull the Docker image: - + +`docker pull ghcr.io/ho-cto/sre-monitoring-as-code:{tag}` + +## GitHub clone installation + +**In a directory of your choosing run the following setup commands.** ``` # Clone the repository to your local machine @@ -34,17 +42,45 @@ docker build -t sre-monitoring-as-code:latest . 
## Useage -``` -# Add mixin file -mixin.jsonnet to /montoring-config -touch grapi-mixin.jsonnet +### Default mixin config -# Add Global and SLI configuration as per sre-monitoring-as-code docs (see Resources) +**To run the default monitoring and summary mixins bundles into the built container run the following command** +``` # Execute makefile script sh deploy.sh +``` + +### Custom mixin + +**To run a custom mixin file** -# Add artefacts (dashboard, alerts rules and recording rules) to Grafana and Prometheus package management tooling (Prometheus Operator) ``` +# Add mixin file -mixin.jsonnet to a directory +touch grapi-mixin.jsonnet + +# Execute docker run command based on mounted directory where the mixin file has been added. +docker run --mount type=bind,source="$PWD"/{user input directory},target=/input --mount type=bind,source="$PWD"/{user output directory},target=/output -it sre-monitoring-as-code:{tag} -m {service} -rd -i input -o output +``` + +### Configuration Arguments + +**Arguments to be passed to container at runtime** + +| Argument | Description | +|----------|------------------------------------------------------------------------------------------------------------------------| +| -m | The name of the mixin to target, must be included | +| -o | The path to the directory where you want the output to be copied to, must be included | +| -i | The path to the directory containing the mixin file, if not included defaults to mixin-defs directory inside container | +| -a | The type of account (np, pr or localhost), if not included defaults to localhost | +| -r | Include if you only want to generate Prometheus rules, both generated if neither included | +| -d | Include if you only want to generate Grafana dashboards, both generated if neither included | + +## Distribution + +### Add artefacts (dashboard, alerts rules and recording rules) to Grafana and Prometheus package management tooling (Prometheus Operator) + +TBC ## Resources From 
10b0b3698734ccf156094076a90ecc8fc2585643 Mon Sep 17 00:00:00 2001 From: finlaymccormickHO <102794431+finlaymccormickHO@users.noreply.github.com> Date: Wed, 24 Aug 2022 15:01:51 +0100 Subject: [PATCH 11/12] 190 add multi sli type (#191) * feat: added multi sli types for recording rules, dashboards not yet working * feat: multi sli types now work for dashboards * feat: updated how sli types and metric targets are set in mixin --- .../mixin-defs/monitoring-mixin.jsonnet | 50 +++++++++++-------- .../src/alerts/burn-rate-alerts.libsonnet | 7 +-- .../src/alerts/burn-rate-rules.libsonnet | 7 +-- .../dashboard-standard-elements.libsonnet | 26 ++++++---- .../dashboards/journey-dashboard.libsonnet | 3 +- .../dashboards/product-dashboard.libsonnet | 19 ++++++- .../src/mixin-builder.libsonnet | 38 ++++++++++---- .../sli-elements/recording-rules.libsonnet | 5 +- 8 files changed, 104 insertions(+), 51 deletions(-) diff --git a/monitoring-as-code/mixin-defs/monitoring-mixin.jsonnet b/monitoring-as-code/mixin-defs/monitoring-mixin.jsonnet index c005e66b..10c8858b 100644 --- a/monitoring-as-code/mixin-defs/monitoring-mixin.jsonnet +++ b/monitoring-as-code/mixin-defs/monitoring-mixin.jsonnet @@ -24,7 +24,6 @@ local sliSpecList = { sliDescription: 'Grafana landing page requests', period: '7d', metricType: 'grafana_http_request_duration_seconds', - metricTarget: 0.1, evalInterval: '1m', selectors: { product: 'grafana', @@ -32,14 +31,15 @@ local sliSpecList = { errorStatus: '4..|5..', }, sloTarget: 90, - sliType: 'availability', + sliTypes: { + availability: 0.1, + }, }, SLI02: { title: 'requests to the Grafana login are successful', sliDescription: 'Grafana login page requests', period: '7d', metricType: 'grafana_http_request_duration_seconds', - metricTarget: 0.1, evalInterval: '1m', selectors: { product: 'grafana', @@ -47,14 +47,15 @@ local sliSpecList = { errorStatus: '4..|5..', }, sloTarget: 90, - sliType: 'availability', + sliTypes: { + availability: 0.1, + }, }, SLI03: { title: 
'requests to the Grafana datasources are successful', sliDescription: 'Grafana datasource API requests', period: '7d', metricType: 'grafana_http_request_duration_seconds', - metricTarget: 0.5, evalInterval: '1m', selectors: { product: 'grafana', @@ -62,7 +63,9 @@ local sliSpecList = { errorStatus: '4..|5..', }, sloTarget: 90, - sliType: 'availability', + sliTypes: { + availability: 0.5, + }, }, }, prometheus: { @@ -71,25 +74,27 @@ local sliSpecList = { sliDescription: 'Average of prometheus scrape target status', period: '7d', metricType: 'up', - metricTarget: 1, comparison: '==', evalInterval: '1m', selectors: {}, sloTarget: 90, - sliType: 'availability', + sliTypes: { + availability: 1, + }, }, SLI02: { title: 'prometheus scraping of Yace is fast enough', sliDescription: 'Average duration of Prometheus scrape of Yace', period: '7d', metricType: 'scrape_duration_seconds', - metricTarget: 15, evalInterval: '1m', selectors: { product: 'yace' }, sloTarget: 90, - sliType: 'availability', + sliTypes: { + availability: 15, + }, }, }, thanos: { @@ -98,7 +103,6 @@ local sliSpecList = { sliDescription: 'Instant query requests to thanos-query', period: '7d', metricType: 'http_requests_total', - metricTarget: 0.1, evalInterval: '1m', selectors: { product: 'thanos-query', @@ -106,21 +110,24 @@ local sliSpecList = { errorStatus: '4..|5..', }, sloTarget: 90, - sliType: 'availability', + sliTypes: { + availability: 0.1, + }, }, SLI02: { title: 'instant query requests to Thanos are fast enough', sliDescription: 'Instant query requests to thanos-query', period: '7d', metricType: 'http_request_duration_seconds', - metricTarget: 15, latencyPercentile: 0.8, evalInterval: '1m', selectors: { product: 'thanos-query', resource: 'query', }, - sliType: 'latency', + sliTypes: { + latency: 15, + }, sloTarget: 90, }, SLI03: { @@ -128,7 +135,6 @@ local sliSpecList = { sliDescription: 'Range query requests to thanos-query', period: '7d', metricType: 'http_requests_total', - metricTarget: 0.1, 
evalInterval: '1m', selectors: { product: 'thanos-query', @@ -136,21 +142,24 @@ local sliSpecList = { errorStatus: '4..|5..', }, sloTarget: 90, - sliType: 'availability', + sliTypes: { + availability: 0.1, + }, }, SLI04: { title: 'range query requests to Thanos are fast enough', sliDescription: 'Range query requests to thanos-query', period: '7d', metricType: 'http_request_duration_seconds', - metricTarget: 10, latencyPercentile: 0.8, evalInterval: '1m', selectors: { product: 'thanos-query', resource: 'query_range', }, - sliType: 'latency', + sliTypes: { + latency: 10, + }, sloTarget: 90, }, SLI05: { @@ -158,13 +167,14 @@ local sliSpecList = { sliDescription: 'Thanos-compact operations and failures', period: '7d', metricType: 'thanos_compact_group_compactions', - metricTarget: 0.01, evalInterval: '1m', selectors: { product: 'monitoring-thanos-compact.*' }, sloTarget: 99, - sliType: 'availability', + sliTypes: { + availability: 0.01, + }, }, }, }; diff --git a/monitoring-as-code/src/alerts/burn-rate-alerts.libsonnet b/monitoring-as-code/src/alerts/burn-rate-alerts.libsonnet index feccb6be..c22d8963 100644 --- a/monitoring-as-code/src/alerts/burn-rate-alerts.libsonnet +++ b/monitoring-as-code/src/alerts/burn-rate-alerts.libsonnet @@ -106,7 +106,7 @@ local createBurnRateAlerts(config, sliSpec, sliKey, journeyKey) = { alerts+: [ { - local alertName = std.join('_', [config.product, journeyKey, sliKey, 'ErrorBudgetBurn']), + local alertName = std.join('_', [config.product, journeyKey, sliKey, sliSpec.sliType, 'ErrorBudgetBurn']), local severity = getSeverity(errorBudgetBurnWindow, config, sliSpec), local alertTitle = createAlertTitle(errorBudgetBurnWindow, config, sliSpec, sliKey, journeyKey), @@ -115,13 +115,14 @@ local createBurnRateAlerts(config, sliSpec, sliKey, journeyKey) = alert: alertName, expr: ||| - %(recordingRuleShort)s{%(sliLabelSelectors)s} > %(factor).5f + %(recordingRuleShort)s{%(sliLabelSelectors)s, type="%(sliType)s"} > %(factor).5f and - 
%(recordingRuleLong)s{%(sliLabelSelectors)s} > %(factor).5f + %(recordingRuleLong)s{%(sliLabelSelectors)s, type="%(sliType)s"} > %(factor).5f ||| % { recordingRuleShort: macConfig.burnRateRuleNameTemplate % errorBudgetBurnWindow.short, recordingRuleLong: macConfig.burnRateRuleNameTemplate % errorBudgetBurnWindow.long, sliLabelSelectors: sliSpec.ruleSliLabelSelectors, + sliType: sliSpec.sliType, factor: errorBudgetBurnWindow.factor, }, labels: { diff --git a/monitoring-as-code/src/alerts/burn-rate-rules.libsonnet b/monitoring-as-code/src/alerts/burn-rate-rules.libsonnet index dddca4ff..5b595d61 100644 --- a/monitoring-as-code/src/alerts/burn-rate-rules.libsonnet +++ b/monitoring-as-code/src/alerts/burn-rate-rules.libsonnet @@ -25,22 +25,23 @@ local createBurnRateRules(sliSpec) = ( 1 - ( - sum(sum_over_time((sli_value{%(sliLabelSelectors)s} %(comparison)s bool %(target)s)[%(burnRateWindow)s:%(evalInterval)s])) + sum(sum_over_time((sli_value{%(sliLabelSelectors)s, type="%(sliType)s"} %(comparison)s bool %(target)s)[%(burnRateWindow)s:%(evalInterval)s])) / - (sum(sum_over_time((sli_value{%(sliLabelSelectors)s} < bool Inf)[%(burnRateWindow)s:%(evalInterval)s])) > 0) + (sum(sum_over_time((sli_value{%(sliLabelSelectors)s, type="%(sliType)s"} < bool Inf)[%(burnRateWindow)s:%(evalInterval)s])) > 0) ) ) / %(sloTarget).5f or vector(0) ||| % { sliLabelSelectors: sliSpec.ruleSliLabelSelectors, + sliType: sliSpec.sliType, burnRateWindow: burnRateWindow, sloTarget: (100 - sliSpec.sloTarget) / 100, evalInterval: sliSpec.evalInterval, target: sliSpec.metricTarget, comparison: if std.objectHas(sliSpec, 'comparison') then sliSpec.comparison else '<', }, - labels: sliSpec.sliLabels, + labels: sliSpec.sliLabels + { type: sliSpec.sliType }, record: macConfig.burnRateRuleNameTemplate % burnRateWindow, } for burnRateWindow in getBurnRateWindowArray() diff --git a/monitoring-as-code/src/dashboards/dashboard-standard-elements.libsonnet 
b/monitoring-as-code/src/dashboards/dashboard-standard-elements.libsonnet index 5387c50c..885217c8 100644 --- a/monitoring-as-code/src/dashboards/dashboard-standard-elements.libsonnet +++ b/monitoring-as-code/src/dashboards/dashboard-standard-elements.libsonnet @@ -28,12 +28,14 @@ local createSliDescription(sliSpec) = local createRowTitles(sliKey, sliSpec) = // Row title to describe SLI/SLO { - rowTitle: '%(slo)s: %(title)s' % { + rowTitle: '%(slo)s %(sliType)s: %(title)s' % { slo: sliKey, + sliType: sliSpec.sliType, title: sliSpec.title, }, - rowTitleShort: '%(slo)s (%(period)s)' % { + rowTitleShort: '%(slo)s %(sliType)s (%(period)s)' % { slo: sliKey, + sliType: sliSpec.sliType, period: sliSpec.period, }, }; @@ -55,13 +57,14 @@ local createAvailabilityPanel(sloTargetLegend, sliSpec) = ).addTarget( prometheus.target( ||| - sum(sum_over_time((sli_value{%(sliLabelSelectors)s} + sum(sum_over_time((sli_value{%(sliLabelSelectors)s, type="%(sliType)s"} %(comparison)s bool %(target)s)[%(period)s:%(evalInterval)s])) / - sum(sum_over_time((sli_value{%(sliLabelSelectors)s} + sum(sum_over_time((sli_value{%(sliLabelSelectors)s, type="%(sliType)s"} < bool Inf)[%(period)s:%(evalInterval)s]) > 0) ||| % { sliLabelSelectors: sliSpec.dashboardSliLabelSelectors, + sliType: sliSpec.sliType, period: sliSpec.period, target: sliSpec.metricTarget, evalInterval: sliSpec.evalInterval, @@ -99,15 +102,16 @@ local createErrorBudgetPanel(sliSpec) = ).addTarget( prometheus.target( ||| - (%(target)s - (1 - (sum(sum_over_time((sli_value{%(sliLabelSelectors)s} + (%(target)s - (1 - (sum(sum_over_time((sli_value{%(sliLabelSelectors)s, type="%(sliType)s"} %(comparison)s bool %(metricTarget)s)[%(period)s:%(evalInterval)s])) / - sum(sum_over_time((sli_value{%(sliLabelSelectors)s} + sum(sum_over_time((sli_value{%(sliLabelSelectors)s, type="%(sliType)s"} < bool Inf)[%(period)s:%(evalInterval)s]))))) / %(target)s ||| % { sliLabelSelectors: sliSpec.dashboardSliLabelSelectors, + sliType: sliSpec.sliType, 
period: sliSpec.period, evalInterval: sliSpec.evalInterval, target: (100 - sliSpec.sloTarget) / 100, @@ -143,11 +147,12 @@ local createSloStatusPanel(sliDescription, sliSpec) = // Proportion of intervals SLO has pased prometheus.target( ||| - sum_over_time((sli_value{%(sliLabelSelectors)s} %(comparison)s bool %(target)s)[$__interval:%(evalInterval)s]) + sum_over_time((sli_value{%(sliLabelSelectors)s, type="%(sliType)s"} %(comparison)s bool %(target)s)[$__interval:%(evalInterval)s]) / - sum_over_time((sli_value{%(sliLabelSelectors)s} < bool Inf)[$__interval:%(evalInterval)s]) + sum_over_time((sli_value{%(sliLabelSelectors)s, type="%(sliType)s"} < bool Inf)[$__interval:%(evalInterval)s]) ||| % { sliLabelSelectors: sliSpec.dashboardSliLabelSelectors, + sliType: sliSpec.sliType, evalInterval: sliSpec.evalInterval, target: sliSpec.metricTarget, comparison: if std.objectHas(sliSpec, 'comparison') then sliSpec.comparison else '<', @@ -159,12 +164,13 @@ local createSloStatusPanel(sliDescription, sliSpec) = prometheus.target( ||| 1 - ( - sum_over_time((sli_value{%(sliLabelSelectors)s} %(comparison)s bool %(target)s)[$__interval:%(evalInterval)s]) + sum_over_time((sli_value{%(sliLabelSelectors)s, type="%(sliType)s"} %(comparison)s bool %(target)s)[$__interval:%(evalInterval)s]) / - sum_over_time((sli_value{%(sliLabelSelectors)s} < bool Inf)[$__interval:%(evalInterval)s]) + sum_over_time((sli_value{%(sliLabelSelectors)s, type="%(sliType)s"} < bool Inf)[$__interval:%(evalInterval)s]) ) ||| % { sliLabelSelectors: sliSpec.dashboardSliLabelSelectors, + sliType: sliSpec.sliType, evalInterval: sliSpec.evalInterval, target: sliSpec.metricTarget, comparison: if std.objectHas(sliSpec, 'comparison') then sliSpec.comparison else '<', diff --git a/monitoring-as-code/src/dashboards/journey-dashboard.libsonnet b/monitoring-as-code/src/dashboards/journey-dashboard.libsonnet index 9c785458..9209f573 100644 --- a/monitoring-as-code/src/dashboards/journey-dashboard.libsonnet +++ 
b/monitoring-as-code/src/dashboards/journey-dashboard.libsonnet @@ -63,7 +63,8 @@ local createJourneyDashboards(config, sliList, links) = + // Detail graph for this SLI, generated by metric specific library [sli.graph { gridPos: { x: 14, y: 0, w: 10, h: 5 } }] - for sli in std.objectValues(sliList[journeyKey],) + for sliKey in std.objectFields(sliList[journeyKey]) + for sli in std.objectValues(sliList[journeyKey][sliKey]) ]) ) for journeyKey in std.objectFields(sliList) diff --git a/monitoring-as-code/src/dashboards/product-dashboard.libsonnet b/monitoring-as-code/src/dashboards/product-dashboard.libsonnet index 8f3aa281..1fe5f31b 100644 --- a/monitoring-as-code/src/dashboards/product-dashboard.libsonnet +++ b/monitoring-as-code/src/dashboards/product-dashboard.libsonnet @@ -18,6 +18,20 @@ local viewPanelSize = { y: 4, }; +// Combines all of the SLI types of SLIs in a journey +// @param sliList The list of SLIs for a service +// @returns Object containing journeys and SLI types +local createCombinedSliList(sliList) = + { + [journeyKey]: { + // This attribute should be a unique identifier for SLI types + [sli.row_title_short]: sli + for sliKey in std.objectFields(sliList[journeyKey]) + for sli in std.objectValues(sliList[journeyKey][sliKey]) + } + for journeyKey in std.objectFields(sliList) + }; + // Creates a row panel which is used to contain all of the SLIs in each journey // @param journeyIndex The index of the current journey having its panels created // @param noOfPanelRows The number of rows of view panels that have been created @@ -108,9 +122,10 @@ local createPanels(journeyIndex, sliIndex, noOfPanelRows, config, sliList) = // @param links The links to other dashboards // @returns The JSON defining the product dashboard local createProductDashboard(config, sliList, links) = - { - local panels = createPanels(0, 0, 0, config, sliList), + local combinedSliList = createCombinedSliList(sliList); + local panels = createPanels(0, 0, 0, config, combinedSliList); + { 
[std.join('-', [config.product, 'product-view.json'])]: dashboard.new( title = '%(product)s-product-view' % { product: config.product }, diff --git a/monitoring-as-code/src/mixin-builder.libsonnet b/monitoring-as-code/src/mixin-builder.libsonnet index fdf7d8c9..40555df4 100644 --- a/monitoring-as-code/src/mixin-builder.libsonnet +++ b/monitoring-as-code/src/mixin-builder.libsonnet @@ -68,25 +68,38 @@ local updateSliSpecList(config, passedSliSpecList) = for journeyKey in std.objectFields(passedSliSpecList) }; +// Adds the current SLI type and metric target to the SLI spec +// @param sliType The current SLI type +// @param sliSpec The spec for the SLI having its elements created +// @returns The SLI spec object but with updated SLI type +local updateSliSpec(sliType, sliSpec) = + sliSpec + + + { + metricTarget: sliSpec.sliTypes[sliType], + sliType: sliType, + }; + // Creates an SLI with its standard dashboard elements, unique dashboard elements, recording // rules, alerting rules and alerts +// @param sliType The current SLI type // @param config The config for the service defined in the mixin file -// @param sliSpecList The list of SLI specs defined in the mixin file +// @param passedSliSpec The spec for the SLI having its elements created // @param sliKey The key of the current SLI having rules generated // @param journeyKey The key of the journey containing the SLI having rules generated // @returns The SLI with standard elements -local createSli(config, sliSpecList, sliKey, journeyKey) = - local sliSpec = sliSpecList[journeyKey][sliKey]; +local createSli(sliType, config, passedSliSpec, sliKey, journeyKey) = + local sliSpec = updateSliSpec(sliType, passedSliSpec); if std.objectHas(macConfig.metricTypes, sliSpec.metricType) then - if std.objectHas(macConfig.metricTypes[sliSpec.metricType].sliTypesConfig, sliSpec.sliType) then + if std.objectHas(macConfig.metricTypes[sliSpec.metricType].sliTypesConfig, sliType) then sliElementFunctions.createRecordingRules(sliSpec, 
config) + sliElementFunctions.createSliStandardElements(sliKey, sliSpec) + dashboardFunctions.createDashboardStandardElements(sliKey, journeyKey, sliSpec, config) + alertFunctions.createBurnRateRules(sliSpec) + alertFunctions.createBurnRateAlerts(config, sliSpec, sliKey, journeyKey) - else error 'Metric type does not have SLI type' - else error 'Undefined metric type'; + else error 'Metric type %s does not have SLI type %s' % [sliSpec.metricType, sliType] + else error 'Undefined metric type %s' % sliSpec.metricType; // Creates a list of all the SLIs in a service with their standard dashboard elements, unique // dashboard elements, recording rules, alerting rules and alerts @@ -96,8 +109,11 @@ local createSli(config, sliSpecList, sliKey, journeyKey) = local createSliList(config, sliSpecList) = { [journeyKey]+: { - [sliKey]+: - createSli(config, sliSpecList, sliKey, journeyKey) + [sliKey]+: { + [sliType]+: + createSli(sliType, config, sliSpecList[journeyKey][sliKey], sliKey, journeyKey) + for sliType in std.objectFields(sliSpecList[journeyKey][sliKey].sliTypes) + } for sliKey in std.objectFields(sliSpecList[journeyKey]) } for journeyKey in std.objectFields(sliSpecList) @@ -153,7 +169,8 @@ local createPrometheusRules(config, sliList) = rules: std.flattenArrays([ sli.recording_rules for journeyKey in std.objectFields(sliList) - for sli in std.objectValues(sliList[journeyKey]) + for sliKey in std.objectFields(sliList[journeyKey]) + for sli in std.objectValues(sliList[journeyKey][sliKey]) ]), }], }; @@ -169,7 +186,8 @@ local createPrometheusAlerts(config, sliList) = rules: std.flattenArrays([ sli.alerts for journeyKey in std.objectFields(sliList) - for sli in std.objectValues(sliList[journeyKey]) + for sliKey in std.objectFields(sliList[journeyKey]) + for sli in std.objectValues(sliList[journeyKey][sliKey]) ]), }], }; diff --git a/monitoring-as-code/src/sli-elements/recording-rules.libsonnet b/monitoring-as-code/src/sli-elements/recording-rules.libsonnet index 
51ff13a1..7ca383cc 100644 --- a/monitoring-as-code/src/sli-elements/recording-rules.libsonnet +++ b/monitoring-as-code/src/sli-elements/recording-rules.libsonnet @@ -21,11 +21,12 @@ local createStandardRecordingRules(sliSpec, sliMetadata) = { record: 'sli_percentage', expr: ||| - (sum(sum_over_time((sli_value{%(ruleSliLabelSelectors)s} %(comparison)s bool %(metricTarget)s)[30d:%(evalInterval)s]) - or vector(0)) / sum(sum_over_time((sli_value{%(ruleSliLabelSelectors)s} < bool Inf)[30d:%(evalInterval)s]) + (sum(sum_over_time((sli_value{%(ruleSliLabelSelectors)s, type="%(sliType)s"} %(comparison)s bool %(metricTarget)s)[30d:%(evalInterval)s]) + or vector(0)) / sum(sum_over_time((sli_value{%(ruleSliLabelSelectors)s, type="%(sliType)s"} < bool Inf)[30d:%(evalInterval)s]) or vector(0))) >= 0 ||| % { ruleSliLabelSelectors: sliSpec.ruleSliLabelSelectors, + sliType: sliSpec.sliType, evalInterval: sliSpec.evalInterval, metricTarget: sliSpec.metricTarget, comparison: if std.objectHas(sliSpec, 'comparison') then sliSpec.comparison else '<', From b775d5a008128a57d95a2fce7346be11e033c176 Mon Sep 17 00:00:00 2001 From: finlaymccormickHO <102794431+finlaymccormickHO@users.noreply.github.com> Date: Wed, 24 Aug 2022 16:23:06 +0100 Subject: [PATCH 12/12] feat: added testing mixin (#194) * feat: added testing mixin * Update testing-mixin.jsonnet --- .../mixin-defs/testing-mixin.jsonnet | 234 ++++++++++++++++++ 1 file changed, 234 insertions(+) create mode 100644 monitoring-as-code/mixin-defs/testing-mixin.jsonnet diff --git a/monitoring-as-code/mixin-defs/testing-mixin.jsonnet b/monitoring-as-code/mixin-defs/testing-mixin.jsonnet new file mode 100644 index 00000000..621813ce --- /dev/null +++ b/monitoring-as-code/mixin-defs/testing-mixin.jsonnet @@ -0,0 +1,234 @@ +local mixinFunctions = import '../src/lib/mixin-functions.libsonnet'; + +local config = { + product: 'testing', + applicationServiceName: 'test', + servicenowAssignmentGroup: 'test', + maxAlertSeverity: 'test', + 
configurationItem: 'test', + alertingSlackChannel: 'test', + grafanaUrl: 'test', + alertmanagerUrl: 'test', +}; + +local sliSpecList = { + testing: { + SLI01: { + title: 'test', + sliDescription: 'test', + period: '7d', + metricType: 'aws_sqs', + evalInterval: '5m', + latencyTarget: 100, + selectors: { + product: 'test', + }, + sloTarget: 90, + sliTypes: { + correctness: 0.1, + freshness: 0.1, + }, + }, + SLI02: { + title: 'test', + sliDescription: 'test', + period: '7d', + metricType: 'http_server_requests_seconds', + evalInterval: '5m', + latencyPercentile: 0.1, + selectors: { + product: 'test', + errorStatus: '4..|5..', + }, + sloTarget: 90, + sliTypes: { + availability: 0.1, + latency: 0.1, + }, + }, + SLI03: { + title: 'test', + sliDescription: 'test', + period: '7d', + metricType: 'grafana_http_request_duration_seconds', + evalInterval: '5m', + selectors: { + product: 'test', + errorStatus: '4..|5..', + }, + sloTarget: 90, + sliTypes: { + availability: 0.1, + }, + }, + SLI04: { + title: 'test', + sliDescription: 'test', + period: '7d', + metricType: 'http_requests_total', + evalInterval: '5m', + selectors: { + product: 'test', + errorStatus: '4..|5..', + }, + sloTarget: 90, + sliTypes: { + availability: 0.1, + }, + }, + SLI05: { + title: 'test', + sliDescription: 'test', + period: '7d', + metricType: 'http_request_duration_seconds', + evalInterval: '5m', + latencyPercentile: 0.1, + selectors: { + product: 'test', + }, + sloTarget: 90, + sliTypes: { + latency: 0.1, + }, + }, + SLI06: { + title: 'test', + sliDescription: 'test', + period: '7d', + metricType: 'nginx_ingress_controller_requests', + evalInterval: '5m', + selectors: { + product: 'test', + errorStatus: '4..|5..', + }, + sloTarget: 90, + sliTypes: { + availability: 0.1, + }, + }, + SLI07: { + title: 'test', + sliDescription: 'test', + period: '7d', + metricType: 'nginx_ingress_controller_request_duration_seconds', + evalInterval: '5m', + latencyPercentile: 0.1, + selectors: { + product: 'test', + 
errorStatus: '4..|5..', + }, + sloTarget: 90, + sliTypes: { + availability: 0.1, + latency: 0.1, + }, + }, + SLI08: { + title: 'test', + sliDescription: 'test', + period: '7d', + metricType: 'aws_alb', + latencyPercentile: 0.9, + evalInterval: '5m', + selectors: { + product: 'test', + }, + sloTarget: 90, + sliTypes: { + availability: 0.1, + latency: 0.1, + }, + }, + SLI09: { + title: 'test', + sliDescription: 'test', + period: '7d', + metricType: 'thanos_compact_group_compactions', + evalInterval: '5m', + selectors: { + product: 'test', + }, + sloTarget: 90, + sliTypes: { + availability: 0.1, + }, + }, + SLI10: { + title: 'test', + sliDescription: 'test', + period: '7d', + metricType: 'up', + evalInterval: '5m', + selectors: { + product: 'test', + }, + sloTarget: 90, + sliTypes: { + availability: 0.1, + }, + }, + SLI11: { + title: 'test', + sliDescription: 'test', + period: '7d', + metricType: 'scrape_duration_seconds', + evalInterval: '5m', + selectors: { + product: 'test', + }, + sloTarget: 90, + sliTypes: { + availability: 0.1, + }, + }, + SLI12: { + title: 'test', + sliDescription: 'test', + period: '7d', + metricType: 'aws_rds_read', + evalInterval: '5m', + selectors: { + product: 'test', + }, + sloTarget: 90, + sliTypes: { + latency: 0.1, + iops: 0.1, + throughput: 0.1, + }, + }, + SLI13: { + title: 'test', + sliDescription: 'test', + period: '7d', + metricType: 'aws_rds_write', + evalInterval: '5m', + selectors: { + product: 'test', + }, + sloTarget: 90, + sliTypes: { + latency: 0.1, + iops: 0.1, + throughput: 0.1, + }, + }, + SLI14: { + title: 'test', + sliDescription: 'test', + period: '7d', + metricType: 'aws_es', + evalInterval: '5m', + selectors: { + product: 'test', + }, + sloTarget: 90, + sliTypes: { + availability: 0.1, + latency: 0.1, + }, + }, + }, +}; + +mixinFunctions.buildMixin(config, sliSpecList)