Skip to content

Commit

Permalink
feat: add saturation gauge sli library (#594)
Browse files Browse the repository at this point in the history
* feat: add saturation gauge sli library

* feat: adding new avail slis for status check and inservice instances

* fix: correction to legends and formula for calculating colour thresholds

* refactor: correction status metric with integer to support avg success prom target, also update fill on all graph panels to 4

* fix: correction to saturation SLI promql expression to use sum and remove detailed dashboard gen

* feat: update output folder permissions
  • Loading branch information
michaelpearsonHO authored Sep 5, 2023
1 parent b823360 commit 12c834d
Show file tree
Hide file tree
Showing 19 changed files with 503 additions and 22 deletions.
50 changes: 50 additions & 0 deletions monitoring-as-code/mixin-defs/testing-mixin.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,56 @@ local sliSpecList = {
},
},
},
SLI20: {
title: 'cwa saturation',
sliDescription: 'cwagent',
period: '30d',
metricType: 'aws_cwagent',
evalInterval: '5m',
selectors: {
product: 'test',
},
sloTarget: 90,
sliTypes: {
saturation: {
counterPercentTarget: 90,
intervalTarget: 90,
},
},
},
SLI21: {
title: 'ec2 status check',
sliDescription: 'ec2 status check',
period: '30d',
metricType: 'aws_ec2_status_check',
evalInterval: '5m',
selectors: {
product: 'test',
},
sloTarget: 90,
sliTypes: {
availability: {
counterIntegerTarget: 1,
intervalTarget: 90,
},
},
},
SLI22: {
title: 'asg inservice',
sliDescription: 'asg inservice instances',
period: '30d',
metricType: 'aws_autoscaling_group_in_service_instance',
evalInterval: '5m',
selectors: {
product: 'test',
},
sloTarget: 90,
sliTypes: {
availability: {
intervalTarget: 90,
},
},
},
},
};

Expand Down
1 change: 1 addition & 0 deletions monitoring-as-code/run-mixin.sh
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ fi

# Transfer Prometheus rules and Grafana dashboards to output path
cp -a "$PWD"/_output/. "$output_path"
chmod u+x "$output_path"

# Remove temporary directories
rm -rf "$PWD"/_input
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ local createAvailabilityPanel(sloTargetLegend, sliSpec) =
{ color: 'grey', value: null },
{ color: 'red', value: 0 },
{ color: 'orange', value: sliSpec.sloTarget / 100 },
{ color: 'green', value: sliSpec.sloTarget / 98 },
{ color: 'green', value: sliSpec.sloTarget / 99 },
],
) + { options+: { textMode: 'Value and name' } };

Expand Down Expand Up @@ -189,7 +189,7 @@ local createAveragedSliTypesPanel(sloTargetLegend, sliSpec, avgSloStatusExpr, av
{ color: 'red', value: -99 }, // minus numbers will now be red instead of grey
{ color: 'red', value: 0 },
{ color: 'orange', value: sloTargetLegend / 100 },
{ color: 'green', value: sloTargetLegend / 98 },
{ color: 'green', value: sloTargetLegend / 99 },
],
) + { options+: { textMode: 'Value and name' } } + {
Expand Down Expand Up @@ -223,7 +223,7 @@ local createAveragedSliTypesPanel(sloTargetLegend, sliSpec, avgSloStatusExpr, av
{ color: 'green', value: 0.5 },
// { color: 'orange', value: sloTargetLegend / 100 },
// { color: 'green', value: sloTargetLegend / 98 },
// { color: 'green', value: sloTargetLegend / 99 },
// { color: 'orange', value: debug.debug(sloTargetLegend) },
],
Expand Down
81 changes: 78 additions & 3 deletions monitoring-as-code/src/metric-types.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -674,6 +674,83 @@
targetMetrics: {},
},
},
aws_cwagent: {
metricTypeConfig: {
selectorLabels: {
environment: 'environment',
product: 'job',
},
metrics: {
averageSaturation: 'aws_cw_agent_mem_used_percent_average',
},
},
sliTypesConfig: {
saturation: {
library: (import 'sli-value-libraries/saturation-gauge-with-useage-metric-and-percent-target.libsonnet'),
description: 'The average saturation of %(sliDescription)s should be %(comparison)s %(metricTarget)0.1f',
targetMetrics: {
target: 'averageSaturation',
},
},
},
detailDashboardConfig: {
standardTemplates: [],
elements: [],
targetMetrics: {},
},
},
aws_ec2_status_check: {
metricTypeConfig: {
selectorLabels: {
environment: 'environment',
product: 'job',
},
metrics: {
averageStatus: 'aws_ec2_status_check_failed_sum',
},
},
sliTypesConfig: {
availability: {
library: (import 'sli-value-libraries/availability-gauge-with-status-metric-and-integer-target.libsonnet'),
description: 'Error rate for %(sliDescription)s should be below %(metric_target_percent)0.1f%%',
targetMetrics: {
target: 'averageStatus',
},
},
},
detailDashboardConfig: {
standardTemplates: [],
elements: [],
targetMetrics: {},
},
},
aws_autoscaling_group_in_service_instance: {
metricTypeConfig: {
selectorLabels: {
environment: 'environment',
product: 'job',
},
metrics: {
desired: 'aws_autoscaling_group_desired_capacity_average',
inservice: 'aws_autoscaling_group_in_service_instance_average',
},
},
sliTypesConfig: {
availability: {
library: (import 'sli-value-libraries/availability-gauge-using-inservice-and-desired-instance-metrics.libsonnet'),
description: 'Error rate for %(sliDescription)s should be below %(metric_target_percent)0.1f%%',
targetMetrics: {
desired: 'desired',
inservice: 'inservice',
},
},
},
detailDashboardConfig: {
standardTemplates: [],
elements: [],
targetMetrics: {},
},
},
template: {
metricTypeConfig: {
selectorLabels: {
Expand All @@ -696,9 +773,7 @@
detailDashboardConfig: {
standardTemplates: [],
elements: [],
targetMetrics: {

},
targetMetrics: {},
},
},
}
12 changes: 7 additions & 5 deletions monitoring-as-code/src/mixin-builder.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ local updateSliSpecList(config, passedSliSpecList) =
for journeyKey in std.objectFields(passedSliSpecList)
};

// Adds the current SLI type, metric target, counter seconds target and latency percentile to the SLI spec.
// Adds the current SLI type, metric target, counter seconds target, counter percent target, counter integer target and latency percentile to the SLI spec.
// @param sliType The current SLI type
// @param sliSpec The spec for the SLI having its elements created
// @returns The SLI spec object but with updated SLI type and supplementary target and percentile metadata.
Expand All @@ -96,8 +96,10 @@ local updateSliSpec(sliType, sliSpec) =
then (100 - sliSpec.sliTypes[sliType].intervalTarget) / 100
else (100 - sliSpec.sloTarget) / 100,

// CounterSecondsTarget is applied within sli_value expressions and as such does not used for standard elements
// CounterSecondsTarget, CounterPercentTarget and CounterIntegerTarget are applied within sli_value expressions and as such are not used for standard elements
counterSecondsTarget: sliSpec.sliTypes[sliType].counterSecondsTarget,
counterPercentTarget: sliSpec.sliTypes[sliType].counterPercentTarget,
counterIntegerTarget: sliSpec.sliTypes[sliType].counterIntegerTarget,
latencyPercentile: (sliSpec.sliTypes[sliType].percentile / 100),
sliType: sliType,
};
Expand Down Expand Up @@ -242,9 +244,9 @@ local buildMixin(passedConfig, passedSliSpecList) =
{
grafanaDashboardFolder: config.product,
grafanaDashboards+: dashboardFunctions.createJourneyDashboards(config, sliList, links) +
dashboardFunctions.createProductDashboard(config, sliList, links) +
dashboardFunctions.createDetailDashboards(config, links, sliSpecList),

dashboardFunctions.createProductDashboard(config, sliList, links),
// temporarily removed detailed dashboards pending further consideration
// + dashboardFunctions.createDetailDashboards(config, links, sliSpecList)
prometheusRules+: createPrometheusRules(config, sliList),
prometheusAlerts+: createPrometheusAlerts(config, sliList),
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ local createGraphPanel(sliSpec) =
evalInterval: sliSpec.evalInterval,
},
min=0,
fill=0,
fill=4,
formatY2='percentunit',
).addTarget(
prometheus.target(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ local createGraphPanel(sliSpec) =
evalInterval: sliSpec.evalInterval,
},
min=0,
fill=0,
fill=4,
formatY2='percentunit',
).addTarget(
prometheus.target(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ local createGraphPanel(sliSpec) =
evalInterval: sliSpec.evalInterval,
},
min=0,
fill=0,
fill=4,
formatY2='percentunit',
thresholds=[
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ local createGraphPanel(sliSpec) =
evalInterval: sliSpec.evalInterval,
},
min=0,
fill=0,
fill=4,
formatY2='percentunit',
thresholds=[
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ local createGraphPanel(sliSpec) =
evalInterval: sliSpec.evalInterval,
},
min=0,
fill=0,
fill=4,
formatY2='percentunit',
).addTarget(
prometheus.target(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ local createGraphPanel(sliSpec) =
evalInterval: sliSpec.evalInterval,
},
min=0,
fill=0,
fill=4,
formatY2='percentunit',
).addTarget(
prometheus.target(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
// Divides the count where inservice instances do not match desired instances by the overall count of samples
// target metric samples taken from average-using-single-metric

// Target metrics:
// desired - Metric for the desired number of instances, averaged over evaluation interval
// inservice - Metric for the number of inservice instances, averaged over evaluation interval

// Additional config:
// None

// MaC imports
local sliValueLibraryFunctions = import '../util/sli-value-library-functions.libsonnet';

// Grafana imports
local grafana = import 'grafonnet/grafana.libsonnet';
local prometheus = grafana.prometheus;
local graphPanel = grafana.graphPanel;

// Builds the recording rule that produces the sli_value series for this SLI
// @param sliSpec The spec for the SLI having its recording rules created
// @param sliMetadata Metadata about the type and category of the SLI
// @param config The config for the service defined in the mixin file
// @returns Single-element array holding the JSON definition of the recording rule
local createSliValueRule(sliSpec, sliMetadata, config) =
  local metricTypeConfig = sliValueLibraryFunctions.getMetricConfig(sliSpec);
  local labelsByRole = sliValueLibraryFunctions.getSelectorLabels(metricTypeConfig);
  local metricNames = sliValueLibraryFunctions.getTargetMetrics(metricTypeConfig, sliSpec);

  // Values substituted into the PromQL template below
  local exprParams = {
    desiredMetric: metricNames.desired,
    inserviceMetric: metricNames.inservice,
    selectorLabels: std.join(', ', std.objectValues(labelsByRole)),
    environmentSelectorLabel: labelsByRole.environment,
    productSelectorLabel: labelsByRole.product,
    selectors: std.join(', ', sliValueLibraryFunctions.createRuleSelectors(metricTypeConfig, sliSpec, config)),
    evalInterval: sliSpec.evalInterval,
  };

  // Shortfall of inservice against desired instances, relabelled with sli_environment and sli_product.
  // NOTE(review): PromQL gives `/` higher precedence than `>=`, so the filter below is evaluated as
  // `shortfall >= (1 / count)` rather than `(shortfall >= 1) / count` - confirm this is intended.
  local sliValueExpr = |||
    sum without (%(selectorLabels)s) (label_replace(label_replace(
      (
        (
          sum by(%(selectorLabels)s) (avg_over_time(%(desiredMetric)s{%(selectors)s}[%(evalInterval)s]))
          -
          sum by(%(selectorLabels)s) (avg_over_time(%(inserviceMetric)s{%(selectors)s}[%(evalInterval)s]))
        ) >= 1
        /
        count by(%(selectorLabels)s) (count_over_time(%(desiredMetric)s{%(selectors)s}[%(evalInterval)s]))
      ),
    "sli_environment", "$1", "%(environmentSelectorLabel)s", "(.*)"), "sli_product", "$1", "%(productSelectorLabel)s", "(.*)"))
  ||| % exprParams;

  [
    {
      record: 'sli_value',
      expr: sliValueExpr,
      labels: sliSpec.sliLabels + sliMetadata,
    },
  ];

// Creates Grafana dashboard graph panel showing the average number of inservice instances
// alongside the shortfall of inservice instances against desired instances
// @param sliSpec The spec for the SLI having its dashboard created
// @returns Grafana graph panel object
local createGraphPanel(sliSpec) =
  local metricConfig = sliValueLibraryFunctions.getMetricConfig(sliSpec);
  local dashboardSelectors = sliValueLibraryFunctions.createDashboardSelectors(metricConfig, sliSpec);
  local targetMetrics = sliValueLibraryFunctions.getTargetMetrics(metricConfig, sliSpec);

  graphPanel.new(
    title='%s' % sliSpec.sliDescription,
    datasource='prometheus',
    description=|||
      * Sample interval is %(evalInterval)s
      * Selectors are %(selectors)s
    ||| % {
      // Tildes are escaped so regex selectors are not rendered as markdown strikethrough
      selectors: std.strReplace(std.join(', ', sliValueLibraryFunctions.getSelectors(metricConfig, sliSpec)), '~', '\\~'),
      evalInterval: sliSpec.evalInterval,
    },
    min=0,
    fill=4,
  ).addTarget(
    prometheus.target(
      // `or vector(0)` keeps the series present (as 0) when the metric returns no samples
      |||
        sum(avg_over_time(%(inserviceMetric)s{%(selectors)s}[%(evalInterval)s]) >= 0 or vector(0))
      ||| % {
        inserviceMetric: targetMetrics.inservice,
        selectors: std.join(',', dashboardSelectors),
        evalInterval: sliSpec.evalInterval,
      },
      legendFormat='avg inservice instances',
    ),
  ).addTarget(
    prometheus.target(
      // Shortfall is desired minus inservice, matching both the legend (inservice < desired)
      // and the operand order used by the sli_value recording rule.
      // NOTE(review): PromQL gives `/` higher precedence than `>=`, so the filter below is evaluated
      // as `shortfall >= (1 / count)` rather than `(shortfall >= 1) / count` - confirm this is intended.
      |||
        (
          sum(avg_over_time(%(desiredMetric)s{%(selectors)s}[%(evalInterval)s]) >= 0 or vector(0))
          -
          sum(avg_over_time(%(inserviceMetric)s{%(selectors)s}[%(evalInterval)s]) >= 0 or vector(0))
        ) >= 1
        /
        count(count_over_time(%(desiredMetric)s{%(selectors)s}[%(evalInterval)s]) >= 0 or vector(0))
      ||| % {
        desiredMetric: targetMetrics.desired,
        inserviceMetric: targetMetrics.inservice,
        selectors: std.join(',', dashboardSelectors),
        evalInterval: sliSpec.evalInterval,
      },
      legendFormat='avg period where inservice instances < desired instances'
    )
  ).addSeriesOverride(
    {
      alias: '/avg period where inservice instances < desired instances/',
      color: 'red',
    },
  ).addSeriesOverride(
    {
      alias: '/avg inservice instances/',
      color: 'green',
    },
  );

// Public interface of this SLI value library
{
  createSliValueRule: createSliValueRule,
  createGraphPanel: createGraphPanel,
}
Loading

0 comments on commit 12c834d

Please sign in to comment.