From e7c5d468e4833dd4f5f4eeba46850c55463d72cf Mon Sep 17 00:00:00 2001 From: Sjuul Janssen Date: Fri, 19 Aug 2022 14:43:01 +0200 Subject: [PATCH 1/2] upgrading k8s monitoring to kubernetes_state_core --- README.md | 115 ++++++++++++++++-------------- cpu-limits-low-perc-state.tf | 6 +- cpu-limits-low-perc.tf | 6 +- cpu-limits-low.tf | 7 +- cpu-on-dns-pods-high.tf | 2 +- cpu-requests-low-perc-state.tf | 2 +- cpu-requests-low-perc.tf | 2 +- cpu-requests-low.tf | 2 +- daemonset-incomplete-variables.tf | 4 +- daemonset-incomplete.tf | 2 +- datadog-agent.tf | 6 +- deploy-desired-vs-status.tf | 2 +- examples/example.tf | 2 +- memory-limits-low-perc-state.tf | 2 +- memory-limits-low-perc.tf | 2 +- memory-limits-low.tf | 2 +- memory-requests-low.tf | 2 +- module_description.md | 7 +- network-unavailable.tf | 2 +- node-diskpressure.tf | 2 +- node-memory-used-percent.tf | 2 +- node-memorypressure.tf | 2 +- node-ready.tf | 2 +- node-status.tf | 2 +- persistent-volumes.tf | 2 +- pid-pressure.tf | 2 +- pod-ready.tf | 6 +- pods-failed.tf | 2 +- pods-pending.tf | 2 +- replicaset-incomplete.tf | 2 +- replicaset-unavailable.tf | 4 +- 31 files changed, 109 insertions(+), 96 deletions(-) diff --git a/README.md b/README.md index 9468f0c..10ef1c3 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,11 @@ This module mainly check on Kubernetes resource level and cluster health. System level monitoring can best be implemented with the [system module](https://github.com/kabisa/terraform-datadog-system). Docker/Container level monitoring can best be implemented with the [docker module](https://github.com/kabisa/terraform-datadog-docker-container). +# Recent changes: + +- switch from kubernetes_state to kubernetes_state_core as a default https://docs.datadoghq.com/integrations/kubernetes_state_core/?tab=helm +- upgrade provider to ~> 3.12 + This module is part of a larger suite of modules that provide alerts in Datadog. 
Other modules can be found on the [Terraform Registry](https://registry.terraform.io/search/modules?namespace=kabisa&provider=datadog) @@ -27,7 +32,7 @@ module "kubernetes" { notification_channel = "mail@example.com" service = "Kubernetes" env = "prd" - filter_str = "cluster_name:production" + filter_str = "kube_cluster_name:production" } ``` @@ -39,39 +44,39 @@ Monitors: | Monitor name | Default enabled | Priority | Query | |-----------------|------|----|------------------------| -| [CPU Limits Low Perc State](#cpu-limits-low-perc-state) | False | 3 | `max(last_5m):( sum:kubernetes_state.container.cpu_limit{tag:xxx} by {host,cluster_name} / sum:kubernetes_state.node.cpu_capacity{tag:xxx} by {host,cluster_name}) * 100 > 100` | -| [CPU Limits Low Perc](#cpu-limits-low-perc) | True | 3 | `max(last_5m):(sum:kubernetes.cpu.limits{tag:xxx} by {host,cluster_name} / max:system.cpu.num_cores{tag:xxx} by {host,cluster_name}) * 100 > 100` | -| [CPU Limits Low](#cpu-limits-low) | False | 3 | `min(last_5m):max:system.cpu.num_cores{tag:xxx} by {cluster_name,host} - sum:kubernetes.cpu.limits{tag:xxx} by {cluster_name,host} < ${-30}` | -| [CPU On Dns Pods High](#cpu-on-dns-pods-high) | True | 2 | `avg(last_30m):avg:docker.cpu.usage{tag:xxx} by {cluster_name,host,container_name} > 85` | -| [CPU Requests Low Perc State](#cpu-requests-low-perc-state) | False | 3 | `max(last_5m):( sum:kubernetes_state.container.cpu_requested{tag:xxx} by {host,cluster_name} / sum:kubernetes_state.node.cpu_capacity{tag:xxx} by {host,cluster_name} ) * 100 > 95` | -| [CPU Requests Low Perc](#cpu-requests-low-perc) | True | 3 | `max(last_5m):100 * sum:kubernetes.cpu.requests{tag:xxx} by {cluster_name,host} / max:system.cpu.num_cores{tag:xxx} by {cluster_name,host} > 95` | -| [CPU Requests Low](#cpu-requests-low) | False | 3 | `max(last_5m):max:system.cpu.num_cores{tag:xxx} by {cluster_name,host} - sum:kubernetes.cpu.requests{tag:xxx} by {cluster_name,host} < 0.5` | -| [Daemonset Incomplete](#daemonset-incomplete) | True | 3 | `min(last_30m):max:kubernetes_state.daemonset.scheduled{tag:xxx} by {daemonset,cluster_name} - min:kubernetes_state.daemonset.ready{tag:xxx} by {daemonset,cluster_name} > 0` | +| [CPU Limits Low Perc State](#cpu-limits-low-perc-state) | False | 3 | `max(last_5m):( sum:kubernetes_state.container.cpu_limit{tag:xxx} by {host,kube_cluster_name} / sum:kubernetes_state.node.cpu_capacity{tag:xxx} by {host,kube_cluster_name}) * 100 > 100` | +| [CPU Limits Low Perc](#cpu-limits-low-perc) | True | 3 | `max(last_5m):(sum:kubernetes.cpu.limits{tag:xxx} by {host,kube_cluster_name} / max:system.cpu.num_cores{tag:xxx} by {host,kube_cluster_name}) * 100 > 100` | +| [CPU Limits Low](#cpu-limits-low) | False | 3 | `min(last_5m):max:system.cpu.num_cores{tag:xxx} by {kube_cluster_name,host} - sum:kubernetes.cpu.limits{tag:xxx} by {kube_cluster_name,host} < ${-30}` | +| [CPU On Dns Pods High](#cpu-on-dns-pods-high) | True | 2 | `avg(last_30m):avg:docker.cpu.usage{tag:xxx} by {kube_cluster_name,host,container_name} > 85` | +| [CPU Requests Low Perc State](#cpu-requests-low-perc-state) | False | 3 | `max(last_5m):( sum:kubernetes_state.container.cpu_requested{tag:xxx} by {host,kube_cluster_name} / sum:kubernetes_state.node.cpu_capacity{tag:xxx} by {host,kube_cluster_name} ) * 100 > 95` | +| [CPU Requests Low Perc](#cpu-requests-low-perc) | True | 3 | `max(last_5m):100 * sum:kubernetes.cpu.requests{tag:xxx} by {kube_cluster_name,host} / max:system.cpu.num_cores{tag:xxx} by {kube_cluster_name,host} > 95` | +| [CPU Requests 
Low](#cpu-requests-low) | False | 3 | `max(last_5m):max:system.cpu.num_cores{tag:xxx} by {kube_cluster_name,host} - sum:kubernetes.cpu.requests{tag:xxx} by {kube_cluster_name,host} < 0.5` | +| [Daemonset Incomplete](#daemonset-incomplete) | True | 2 | `min(last_15m):max:kubernetes_state.daemonset.scheduled{tag:xxx} by {kube_daemon_set,kube_cluster_name} - min:kubernetes_state.daemonset.ready{tag:xxx} by {kube_daemon_set,kube_cluster_name} > 0` | | [Daemonset Multiple Restarts](#daemonset-multiple-restarts) | True | 3 | `max(last_15m):clamp_min(max:kubernetes.containers.restarts{tag:xxx} by {kube_daemon_set} - hour_before(max:kubernetes.containers.restarts{tag:xxx} by {kube_daemon_set}), 0) > 5.0` | -| [Datadog Agent](#datadog-agent) | True | 2 | `avg(last_5m):avg:datadog.agent.running{tag:xxx} by {host,cluster_name} < 1` | -| [Deploy Desired Vs Status](#deploy-desired-vs-status) | True | 3 | `avg(last_15m):max:kubernetes_state.deployment.replicas_desired{tag:xxx} by {cluster_name,host} - max:kubernetes_state.deployment.replicas{tag:xxx} by {cluster_name,host} > 10` | +| [Datadog Agent](#datadog-agent) | True | 2 | `avg(last_5m):avg:datadog.agent.running{tag:xxx} by {host,kube_cluster_name} < 1` | +| [Deploy Desired Vs Status](#deploy-desired-vs-status) | True | 3 | `avg(last_15m):max:kubernetes_state.deployment.replicas_desired{tag:xxx} by {kube_cluster_name} - max:kubernetes_state.deployment.replicas_available{tag:xxx} by {kube_cluster_name} > 10` | | [Deployment Multiple Restarts](#deployment-multiple-restarts) | True | 3 | `max(last_15m):clamp_min(max:kubernetes.containers.restarts{tag:xxx} by {kube_deployment} - hour_before(max:kubernetes.containers.restarts{tag:xxx} by {kube_deployment}), 0) > 5.0` | | [Hpa Status](#hpa-status) | True | 3 | `avg(last_15m):avg:kubernetes_state.hpa.condition{tag:xxx} by {hpa,kube_namespace,status,condition} < 1` | -| [Memory Limits Low Perc State](#memory-limits-low-perc-state) | False | 3 | `max(last_5m):( sum:kubernetes_state.container.memory_limit{tag:xxx} by {host,cluster_name} / sum:kubernetes_state.node.memory_allocatable{tag:xxx} by {host,cluster_name}) * 100 > 100` | -| [Memory Limits Low Perc](#memory-limits-low-perc) | True | 3 | `max(last_5m):( max:kubernetes.memory.limits{tag:xxx} by {host,cluster_name}/ max:system.mem.total{tag:xxx} by {host,cluster_name}) * 100 > 100` | -| [Memory Limits Low](#memory-limits-low) | False | 3 | `avg(last_5m):max:system.mem.total{tag:xxx} by {host,cluster_name} - max:kubernetes.memory.limits{tag:xxx} by {host,cluster_name} < 3000000000` | +| [Memory Limits Low Perc State](#memory-limits-low-perc-state) | False | 3 | `max(last_5m):( sum:kubernetes_state.container.memory_limit{tag:xxx} by {host,kube_cluster_name} / sum:kubernetes_state.node.memory_allocatable{tag:xxx} by {host,kube_cluster_name}) * 100 > 100` | +| [Memory Limits Low Perc](#memory-limits-low-perc) | True | 3 | `max(last_5m):( max:kubernetes.memory.limits{tag:xxx} by {host,kube_cluster_name}/ max:system.mem.total{tag:xxx} by {host,kube_cluster_name}) * 100 > 100` | +| [Memory Limits Low](#memory-limits-low) | False | 3 | `avg(last_5m):max:system.mem.total{tag:xxx} by {host,kube_cluster_name} - max:kubernetes.memory.limits{tag:xxx} by {host,kube_cluster_name} < 3000000000` | | [Memory Requests Low Perc State](#memory-requests-low-perc-state) | False | 3 | `max(last_5m):( max:kubernetes_state.container.memory_requested{tag:xxx} / max:kubernetes_state.node.memory_allocatable{tag:xxx} ) * 100 > 95` | | [Memory Requests Low 
Perc](#memory-requests-low-perc) | True | 3 | `max(${var.cpu_requests_low_perc_evaluation_period}):( max:kubernetes.memory.requests{${local.cpu_requests_low_perc_filter}} / max:system.mem.total{${local.cpu_requests_low_perc_filter}} ) * 100 > ${var.cpu_requests_low_perc_critical}` | -| [Memory Requests Low](#memory-requests-low) | False | 3 | `avg(last_5m):max:system.mem.total{tag:xxx} by {host,cluster_name} - max:kubernetes.memory.requests{tag:xxx} by {host,cluster_name} < 3000000000` | -| [Network Unavailable](#network-unavailable) | True | 3 | `avg(last_5m):max:kubernetes_state.nodes.by_condition{tag:xxx AND condition:networkunavailable AND (status:true OR status:unknown)} by {cluster_name,host} > ` | -| [Node Diskpressure](#node-diskpressure) | True | 3 | `avg(last_5m):max:kubernetes_state.nodes.by_condition{tag:xxx AND condition:diskpressure AND (status:true OR status:unknown)} by {cluster_name,host} > ` | -| [Node Memory Used Percent](#node-memory-used-percent) | True | 2 | `avg(last_5m):( 100 * max:kubernetes.memory.usage{tag:xxx} by {host,cluster_name} ) / max:system.mem.total{tag:xxx} by {host,cluster_name} > 90` | -| [Node Memorypressure](#node-memorypressure) | True | 3 | `avg(last_5m):max:kubernetes_state.nodes.by_condition{tag:xxx AND condition:memorypressure AND (status:true OR status:unknown)} by {cluster_name,host} > ` | -| [Node Ready](#node-ready) | True | 2 | `avg(last_5m):count_nonzero(sum:kubernetes_state.nodes.by_condition{tag:xxx AND (NOT condition:ready) AND (status:true OR status:unknown)} by {cluster_name,host}) > 1` | -| [Node Status](#node-status) | True | 2 | `avg(last_5m):avg:kubernetes_state.node.status{tag:xxx} by {cluster_name,node} < 1` | -| [Persistent Volumes](#persistent-volumes) | True | 3 | `avg(last_5m):max:kubernetes_state.persistentvolumes.by_phase{tag:xxx AND phase:failed} > 1` | -| [Pid Pressure](#pid-pressure) | True | 3 | `avg(last_5m):max:kubernetes_state.nodes.by_condition{tag:xxx AND condition:pidpressure AND (status:true OR status:unknown)} by {cluster_name,host} > ` | +| [Memory Requests Low](#memory-requests-low) | False | 3 | `avg(last_5m):max:system.mem.total{tag:xxx} by {host,kube_cluster_name} - max:kubernetes.memory.requests{tag:xxx} by {host,kube_cluster_name} < 3000000000` | +| [Network Unavailable](#network-unavailable) | True | 3 | `avg(last_5m):max:kubernetes_state.node.by_condition{tag:xxx AND condition:networkunavailable AND (status:true OR status:unknown)} by {kube_cluster_name,host} > ` | +| [Node Diskpressure](#node-diskpressure) | True | 3 | `avg(last_5m):max:kubernetes_state.node.by_condition{tag:xxx AND condition:diskpressure AND (status:true OR status:unknown)} by {kube_cluster_name,host} > ` | +| [Node Memory Used Percent](#node-memory-used-percent) | True | 2 | `avg(last_5m):( 100 * max:kubernetes.memory.usage{tag:xxx} by {host,kube_cluster_name} ) / max:system.mem.total{tag:xxx} by {host,kube_cluster_name} > 90` | +| [Node Memorypressure](#node-memorypressure) | True | 3 | `avg(last_5m):max:kubernetes_state.node.by_condition{tag:xxx AND condition:memorypressure AND (status:true OR status:unknown)} by {kube_cluster_name,host} > ` | +| [Node Ready](#node-ready) | True | 2 | `avg(last_5m):count_nonzero(sum:kubernetes_state.node.by_condition{tag:xxx AND (NOT condition:ready) AND (status:true OR status:unknown)} by {kube_cluster_name,host}) > 1` | +| [Node Status](#node-status) | True | 2 | `avg(last_5m):avg:kubernetes_state.node.status{tag:xxx} by {kube_cluster_name,node} < 1` | +| [Persistent 
Volumes](#persistent-volumes) | True | 3 | `avg(last_5m):max:kubernetes_state.persistentvolume.by_phase{tag:xxx AND phase:failed} > 1` | +| [Pid Pressure](#pid-pressure) | True | 3 | `avg(last_5m):max:kubernetes_state.node.by_condition{tag:xxx AND condition:pidpressure AND (status:true OR status:unknown)} by {kube_cluster_name,host} > ` | | [Pod Count Per Node High](#pod-count-per-node-high) | True | 2 | `min(last_10m):sum:kubernetes.pods.running{tag:xxx} by {host} > 100.0` | -| [Pod Ready](#pod-ready) | True | 3 | `min(last_30m):sum:kubernetes_state.pod.count{tag:xxx} by {cluster_name,namespace} - sum:kubernetes_state.pod.ready{tag:xxx} by {cluster_name,namespace} > 0` | +| [Pod Ready](#pod-ready) | True | 3 | `min(last_30m):sum:kubernetes_state.pod.count{tag:xxx} by {kube_cluster_name,kube_namespace} - sum:kubernetes_state.pod.ready{tag:xxx} by {kube_cluster_name,kube_namespace} > 0` | | [Pod Restarts](#pod-restarts) | False | 2 | `change(avg(last_10m),last_10m):exclude_null(avg:kubernetes.containers.restarts{tag:xxx} by {pod_name}) > 5` | -| [Pods Failed](#pods-failed) | True | 3 | `min(last_10m):default_zero(max:kubernetes_state.pod.status_phase{phase:failed${var.filter_str_concatenation}tag:xxx} by {namespace}) > ` | -| [Pods Pending](#pods-pending) | True | 3 | `min(last_10m):default_zero(max:kubernetes_state.pod.status_phase{phase:pending${var.filter_str_concatenation}tag:xxx} by {namespace}) > ` | -| [Replicaset Incomplete](#replicaset-incomplete) | True | 3 | `min(last_15m):max:kubernetes_state.replicaset.replicas_desired{tag:xxx} by {kube_replica_set,cluster_name} - min:kubernetes_state.replicaset.replicas_ready{tag:xxx} by {kube_replica_set,cluster_name} > ` | +| [Pods Failed](#pods-failed) | True | 3 | `min(last_10m):default_zero(max:kubernetes_state.pod.status_phase{phase:failed${var.filter_str_concatenation}tag:xxx} by {kube_namespace}) > ` | +| [Pods Pending](#pods-pending) | True | 3 | `min(last_10m):default_zero(max:kubernetes_state.pod.status_phase{phase:pending${var.filter_str_concatenation}tag:xxx} by {kube_namespace}) > ` | +| [Replicaset Incomplete](#replicaset-incomplete) | True | 3 | `min(last_15m):max:kubernetes_state.replicaset.replicas_desired{tag:xxx} by {kube_replica_set,kube_cluster_name} - min:kubernetes_state.replicaset.replicas_ready{tag:xxx} by {kube_replica_set,kube_cluster_name} > ` | | [Replicaset Unavailable](#replicaset-unavailable) | True | 2 | `max(last_5m):( ${local.rs_pods_ready} ) / ${local.rs_pods_desired} / ( ${local.rs_pods_desired} - 1 ) <= 0` | # Getting started developing @@ -88,7 +93,7 @@ If the node where a Pod is running has enough of a resource available, it's poss Query: ```terraform -max(last_5m):( sum:kubernetes_state.container.cpu_limit{tag:xxx} by {host,cluster_name} / sum:kubernetes_state.node.cpu_capacity{tag:xxx} by {host,cluster_name}) * 100 > 100 +max(last_5m):( sum:kubernetes_state.container.cpu_limit{tag:xxx} by {host,kube_cluster_name} / sum:kubernetes_state.node.cpu_capacity{tag:xxx} by {host,kube_cluster_name}) * 100 > 100 ``` | variable | default | required | description | @@ -113,7 +118,7 @@ If the node where a Pod is running has enough of a resource available, it's poss Query: ```terraform -max(last_5m):(sum:kubernetes.cpu.limits{tag:xxx} by {host,cluster_name} / max:system.cpu.num_cores{tag:xxx} by {host,cluster_name}) * 100 > 100 +max(last_5m):(sum:kubernetes.cpu.limits{tag:xxx} by {host,kube_cluster_name} / max:system.cpu.num_cores{tag:xxx} by {host,kube_cluster_name}) * 100 > 100 ``` | variable | default | required 
| description | @@ -138,7 +143,7 @@ If the node where a Pod is running has enough of a resource available, it's poss Query: ```terraform -min(last_5m):max:system.cpu.num_cores{tag:xxx} by {cluster_name,host} - sum:kubernetes.cpu.limits{tag:xxx} by {cluster_name,host} < ${-30} +min(last_5m):max:system.cpu.num_cores{tag:xxx} by {kube_cluster_name,host} - sum:kubernetes.cpu.limits{tag:xxx} by {kube_cluster_name,host} < ${-30} ``` | variable | default | required | description | @@ -161,7 +166,7 @@ min(last_5m):max:system.cpu.num_cores{tag:xxx} by {cluster_name,host} - sum:kube Query: ```terraform -avg(last_30m):avg:docker.cpu.usage{tag:xxx} by {cluster_name,host,container_name} > 85 +avg(last_30m):avg:docker.cpu.usage{tag:xxx} by {kube_cluster_name,host,container_name} > 85 ``` | variable | default | required | description | @@ -190,7 +195,7 @@ If the node where a Pod is running has enough of a resource available, it's poss Query: ```terraform -max(last_5m):( sum:kubernetes_state.container.cpu_requested{tag:xxx} by {host,cluster_name} / sum:kubernetes_state.node.cpu_capacity{tag:xxx} by {host,cluster_name} ) * 100 > 95 +max(last_5m):( sum:kubernetes_state.container.cpu_requested{tag:xxx} by {host,kube_cluster_name} / sum:kubernetes_state.node.cpu_capacity{tag:xxx} by {host,kube_cluster_name} ) * 100 > 95 ``` | variable | default | required | description | @@ -215,7 +220,7 @@ If the node where a Pod is running has enough of a resource available, it's poss Query: ```terraform -max(last_5m):100 * sum:kubernetes.cpu.requests{tag:xxx} by {cluster_name,host} / max:system.cpu.num_cores{tag:xxx} by {cluster_name,host} > 95 +max(last_5m):100 * sum:kubernetes.cpu.requests{tag:xxx} by {kube_cluster_name,host} / max:system.cpu.num_cores{tag:xxx} by {kube_cluster_name,host} > 95 ``` | variable | default | required | description | @@ -240,7 +245,7 @@ If the node where a Pod is running has enough of a resource available, it's poss Query: ```terraform -max(last_5m):max:system.cpu.num_cores{tag:xxx} by {cluster_name,host} - sum:kubernetes.cpu.requests{tag:xxx} by {cluster_name,host} < 0.5 +max(last_5m):max:system.cpu.num_cores{tag:xxx} by {kube_cluster_name,host} - sum:kubernetes.cpu.requests{tag:xxx} by {kube_cluster_name,host} < 0.5 ``` | variable | default | required | description | @@ -265,14 +270,14 @@ In kubernetes a daemonset is responsible for running the same pod across all Nod Query: ```terraform -min(last_30m):max:kubernetes_state.daemonset.scheduled{tag:xxx} by {daemonset,cluster_name} - min:kubernetes_state.daemonset.ready{tag:xxx} by {daemonset,cluster_name} > 0 +min(last_15m):max:kubernetes_state.daemonset.scheduled{tag:xxx} by {kube_daemon_set,kube_cluster_name} - min:kubernetes_state.daemonset.ready{tag:xxx} by {kube_daemon_set,kube_cluster_name} > 0 ``` | variable | default | required | description | |----------------------------------------|------------------------------------------|----------|--------------------------------------------------------------------------| | daemonset_incomplete_enabled | True | No | | | daemonset_incomplete_critical | 0 | No | alert is raised when (desired - running) > daemonset_incomplete_critical | -| daemonset_incomplete_evaluation_period | last_30m | No | | +| daemonset_incomplete_evaluation_period | last_15m | No | | | daemonset_incomplete_note | "" | No | | | daemonset_incomplete_docs | In kubernetes a daemonset is responsible for running the same pod across all Nodes. 
An example for when this fails, is when the image cannot be pulled, the pod fails to initialize or no resources are available on the cluster\nThis alert is raised when (desired - running) > 0 | No | | | daemonset_incomplete_filter_override | "" | No | | @@ -280,7 +285,7 @@ min(last_30m):max:kubernetes_state.daemonset.scheduled{tag:xxx} by {daemonset,cl | daemonset_incomplete_no_data_timeframe | None | No | | | daemonset_incomplete_notify_no_data | False | No | | | daemonset_incomplete_ok_threshold | None | No | | -| daemonset_incomplete_priority | 3 | No | Number from 1 (high) to 5 (low). | +| daemonset_incomplete_priority | 2 | No | Number from 1 (high) to 5 (low). | ## Daemonset Multiple Restarts @@ -315,7 +320,7 @@ max(last_15m):clamp_min(max:kubernetes.containers.restarts{tag:xxx} by {kube_dae Query: ```terraform -avg(last_5m):avg:datadog.agent.running{tag:xxx} by {host,cluster_name} < 1 +avg(last_5m):avg:datadog.agent.running{tag:xxx} by {host,kube_cluster_name} < 1 ``` | variable | default | required | description | @@ -338,7 +343,7 @@ The amount of expected pods to run minus the actual number Query: ```terraform -avg(last_15m):max:kubernetes_state.deployment.replicas_desired{tag:xxx} by {cluster_name,host} - max:kubernetes_state.deployment.replicas{tag:xxx} by {cluster_name,host} > 10 +avg(last_15m):max:kubernetes_state.deployment.replicas_desired{tag:xxx} by {kube_cluster_name} - max:kubernetes_state.deployment.replicas_available{tag:xxx} by {kube_cluster_name} > 10 ``` | variable | default | required | description | @@ -414,7 +419,7 @@ If the node where a Pod is running has enough of a resource available, it's poss Query: ```terraform -max(last_5m):( sum:kubernetes_state.container.memory_limit{tag:xxx} by {host,cluster_name} / sum:kubernetes_state.node.memory_allocatable{tag:xxx} by {host,cluster_name}) * 100 > 100 +max(last_5m):( sum:kubernetes_state.container.memory_limit{tag:xxx} by {host,kube_cluster_name} / sum:kubernetes_state.node.memory_allocatable{tag:xxx} by {host,kube_cluster_name}) * 100 > 100 ``` | variable | default | required | description | @@ -439,7 +444,7 @@ If the node where a Pod is running has enough of a resource available, it's poss Query: ```terraform -max(last_5m):( max:kubernetes.memory.limits{tag:xxx} by {host,cluster_name}/ max:system.mem.total{tag:xxx} by {host,cluster_name}) * 100 > 100 +max(last_5m):( max:kubernetes.memory.limits{tag:xxx} by {host,kube_cluster_name}/ max:system.mem.total{tag:xxx} by {host,kube_cluster_name}) * 100 > 100 ``` | variable | default | required | description | @@ -464,7 +469,7 @@ If the node where a Pod is running has enough of a resource available, it's poss Query: ```terraform -avg(last_5m):max:system.mem.total{tag:xxx} by {host,cluster_name} - max:kubernetes.memory.limits{tag:xxx} by {host,cluster_name} < 3000000000 +avg(last_5m):max:system.mem.total{tag:xxx} by {host,kube_cluster_name} - max:kubernetes.memory.limits{tag:xxx} by {host,kube_cluster_name} < 3000000000 ``` | variable | default | required | description | @@ -539,7 +544,7 @@ If the node where a Pod is running has enough of a resource available, it's poss Query: ```terraform -avg(last_5m):max:system.mem.total{tag:xxx} by {host,cluster_name} - max:kubernetes.memory.requests{tag:xxx} by {host,cluster_name} < 3000000000 +avg(last_5m):max:system.mem.total{tag:xxx} by {host,kube_cluster_name} - max:kubernetes.memory.requests{tag:xxx} by {host,kube_cluster_name} < 3000000000 ``` | variable | default | required | description | @@ -564,7 +569,7 @@ All your nodes need 
network connections, and this status indicates that there Query: ```terraform -avg(last_5m):max:kubernetes_state.nodes.by_condition{tag:xxx AND condition:networkunavailable AND (status:true OR status:unknown)} by {cluster_name,host} > +avg(last_5m):max:kubernetes_state.node.by_condition{tag:xxx AND condition:networkunavailable AND (status:true OR status:unknown)} by {kube_cluster_name,host} > ``` | variable | default | required | description | @@ -588,7 +593,7 @@ Disk pressure is a condition indicating that a node is using too much disk space Query: ```terraform -avg(last_5m):max:kubernetes_state.nodes.by_condition{tag:xxx AND condition:diskpressure AND (status:true OR status:unknown)} by {cluster_name,host} > +avg(last_5m):max:kubernetes_state.node.by_condition{tag:xxx AND condition:diskpressure AND (status:true OR status:unknown)} by {kube_cluster_name,host} > ``` | variable | default | required | description | @@ -610,7 +615,7 @@ avg(last_5m):max:kubernetes_state.nodes.by_condition{tag:xxx AND condition:diskp Query: ```terraform -avg(last_5m):( 100 * max:kubernetes.memory.usage{tag:xxx} by {host,cluster_name} ) / max:system.mem.total{tag:xxx} by {host,cluster_name} > 90 +avg(last_5m):( 100 * max:kubernetes.memory.usage{tag:xxx} by {host,kube_cluster_name} ) / max:system.mem.total{tag:xxx} by {host,kube_cluster_name} > 90 ``` | variable | default | required | description | @@ -635,7 +640,7 @@ Memory pressure is a resourcing condition indicating that your node is running o Query: ```terraform -avg(last_5m):max:kubernetes_state.nodes.by_condition{tag:xxx AND condition:memorypressure AND (status:true OR status:unknown)} by {cluster_name,host} > +avg(last_5m):max:kubernetes_state.node.by_condition{tag:xxx AND condition:memorypressure AND (status:true OR status:unknown)} by {kube_cluster_name,host} > ``` | variable | default | required | description | @@ -659,7 +664,7 @@ Checks to see if the node is in ready status or not Query: ```terraform -avg(last_5m):count_nonzero(sum:kubernetes_state.nodes.by_condition{tag:xxx AND (NOT condition:ready) AND (status:true OR status:unknown)} by {cluster_name,host}) > 1 +avg(last_5m):count_nonzero(sum:kubernetes_state.node.by_condition{tag:xxx AND (NOT condition:ready) AND (status:true OR status:unknown)} by {kube_cluster_name,host}) > 1 ``` | variable | default | required | description | @@ -683,7 +688,7 @@ This cluster state metric provides a high-level overview of a node’s health an Query: ```terraform -avg(last_5m):avg:kubernetes_state.node.status{tag:xxx} by {cluster_name,node} < 1 +avg(last_5m):avg:kubernetes_state.node.status{tag:xxx} by {kube_cluster_name,node} < 1 ``` | variable | default | required | description | @@ -704,7 +709,7 @@ avg(last_5m):avg:kubernetes_state.node.status{tag:xxx} by {cluster_name,node} < Query: ```terraform -avg(last_5m):max:kubernetes_state.persistentvolumes.by_phase{tag:xxx AND phase:failed} > 1 +avg(last_5m):max:kubernetes_state.persistentvolume.by_phase{tag:xxx AND phase:failed} > 1 ``` | variable | default | required | description | @@ -729,7 +734,7 @@ PID pressure is a rare condition where a pod or container spawns too many proces Query: ```terraform -avg(last_5m):max:kubernetes_state.nodes.by_condition{tag:xxx AND condition:pidpressure AND (status:true OR status:unknown)} by {cluster_name,host} > +avg(last_5m):max:kubernetes_state.node.by_condition{tag:xxx AND condition:pidpressure AND (status:true OR status:unknown)} by {kube_cluster_name,host} > ``` | variable | default | required | description | @@ -780,7 +785,7 
@@ A pod may be running but not available, meaning it is not ready and able to acce Query: ```terraform -min(last_30m):sum:kubernetes_state.pod.count{tag:xxx} by {cluster_name,namespace} - sum:kubernetes_state.pod.ready{tag:xxx} by {cluster_name,namespace} > 0 +min(last_30m):sum:kubernetes_state.pod.count{tag:xxx} by {kube_cluster_name,kube_namespace} - sum:kubernetes_state.pod.ready{tag:xxx} by {kube_cluster_name,kube_namespace} > 0 ``` | variable | default | required | description | @@ -826,7 +831,7 @@ https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/ Query: ```terraform -min(last_10m):default_zero(max:kubernetes_state.pod.status_phase{phase:failed${var.filter_str_concatenation}tag:xxx} by {namespace}) > +min(last_10m):default_zero(max:kubernetes_state.pod.status_phase{phase:failed${var.filter_str_concatenation}tag:xxx} by {kube_namespace}) > ``` | variable | default | required | description | @@ -853,7 +858,7 @@ https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/ Query: ```terraform -min(last_10m):default_zero(max:kubernetes_state.pod.status_phase{phase:pending${var.filter_str_concatenation}tag:xxx} by {namespace}) > +min(last_10m):default_zero(max:kubernetes_state.pod.status_phase{phase:pending${var.filter_str_concatenation}tag:xxx} by {kube_namespace}) > ``` | variable | default | required | description | @@ -880,7 +885,7 @@ In kubernetes a Replicaset is responsible for making sure a specific number of p Query: ```terraform -min(last_15m):max:kubernetes_state.replicaset.replicas_desired{tag:xxx} by {kube_replica_set,cluster_name} - min:kubernetes_state.replicaset.replicas_ready{tag:xxx} by {kube_replica_set,cluster_name} > +min(last_15m):max:kubernetes_state.replicaset.replicas_desired{tag:xxx} by {kube_replica_set,kube_cluster_name} - min:kubernetes_state.replicaset.replicas_ready{tag:xxx} by {kube_replica_set,kube_cluster_name} > ``` | variable | default | required | description | diff --git a/cpu-limits-low-perc-state.tf b/cpu-limits-low-perc-state.tf index d3dc302..99f6b31 100644 --- a/cpu-limits-low-perc-state.tf +++ b/cpu-limits-low-perc-state.tf @@ -10,9 +10,9 @@ module "cpu_limits_low_perc_state" { version = "1.0.0" name = "Available CPU for Limits in percentages Low" - query = "max(${var.cpu_limits_low_perc_state_evaluation_period}):( sum:kubernetes_state.container.cpu_limit{${local.cpu_limits_low_perc_state_filter}} by {host,cluster_name} / sum:kubernetes_state.node.cpu_capacity{${local.cpu_limits_low_perc_state_filter}} by {host,cluster_name}) * 100 > ${var.cpu_limits_low_perc_state_critical}" - alert_message = "Kubernetes cluster cpu room for limits / percentage is too low" - recovery_message = "Kubernetes cluster cpu limits / percentage has recovered" + query = "max(${var.cpu_limits_low_perc_state_evaluation_period}):( sum:kubernetes_state.container.cpu_limit{${local.cpu_limits_low_perc_state_filter}} by {host,kube_cluster_name} / sum:kubernetes_state.node.cpu_capacity{${local.cpu_limits_low_perc_state_filter}} by {host,kube_cluster_name}) * 100 > ${var.cpu_limits_low_perc_state_critical}" + alert_message = "Kubernetes cluster {{kube_cluster_name.name}} cpu room for limits / percentage is too low" + recovery_message = "Kubernetes cluster {{kube_cluster_name.name}} cpu limits / percentage has recovered" # monitor level vars enabled = var.cpu_limits_low_perc_state_enabled diff --git a/cpu-limits-low-perc.tf b/cpu-limits-low-perc.tf index c36099e..d2912fa 100644 --- a/cpu-limits-low-perc.tf +++ b/cpu-limits-low-perc.tf @@ -10,9 +10,9 @@ module 
"cpu_limits_low_perc" { version = "1.0.0" name = "Available CPU for Limits in percentages Low" - query = "max(${var.cpu_limits_low_perc_evaluation_period}):(sum:kubernetes.cpu.limits{${local.cpu_limits_low_perc_filter}} by {host,cluster_name} / max:system.cpu.num_cores{${local.cpu_limits_low_perc_filter}} by {host,cluster_name}) * 100 > ${var.cpu_limits_low_perc_critical}" - alert_message = "Kubernetes cluster cpu room for limits / percentage is too low" - recovery_message = "Kubernetes cluster cpu limits / percentage has recovered" + query = "max(${var.cpu_limits_low_perc_evaluation_period}):(sum:kubernetes.cpu.limits{${local.cpu_limits_low_perc_filter}} by {host,kube_cluster_name} / max:system.cpu.num_cores{${local.cpu_limits_low_perc_filter}} by {host,kube_cluster_name}) * 100 > ${var.cpu_limits_low_perc_critical}" + alert_message = "Kubernetes cluster {{kube_cluster_name.name}} cpu room for limits / percentage is too low" + recovery_message = "Kubernetes cluster {{kube_cluster_name.name}} cpu limits / percentage has recovered" # monitor level vars enabled = var.cpu_limits_low_perc_enabled diff --git a/cpu-limits-low.tf b/cpu-limits-low.tf index 1d991ac..9712660 100644 --- a/cpu-limits-low.tf +++ b/cpu-limits-low.tf @@ -10,9 +10,10 @@ module "cpu_limits_low" { version = "1.0.0" name = "Available CPU for Limits Low" - query = "min(${var.cpu_limits_low_evaluation_period}):max:system.cpu.num_cores{${local.cpu_limits_low_filter}} by {cluster_name,host} - sum:kubernetes.cpu.limits{${local.cpu_limits_low_filter}} by {cluster_name,host} < ${var.cpu_limits_low_critical}" - alert_message = "Kubernetes cluster cpu room for limits is too low" - recovery_message = "Kubernetes cluster cpu limits has recovered" + query = "min(${var.cpu_limits_low_evaluation_period}):max:system.cpu.num_cores{${local.cpu_limits_low_filter}} by {kube_cluster_name,host} - sum:kubernetes.cpu.limits{${local.cpu_limits_low_filter}} by {kube_cluster_name,host} < ${var.cpu_limits_low_critical}" + alert_message = "Kubernetes cluster {{kube_cluster_name.name}} cpu room for limits is too low " + recovery_message = "Kubernetes cluster {{kube_cluster_name.name}} cpu limits has recovered" + # monitor level vars enabled = var.cpu_limits_low_enabled diff --git a/cpu-on-dns-pods-high.tf b/cpu-on-dns-pods-high.tf index 7a6c6bc..c245f3d 100644 --- a/cpu-on-dns-pods-high.tf +++ b/cpu-on-dns-pods-high.tf @@ -12,7 +12,7 @@ module "cpu_on_dns_pods_high" { version = "1.0.0" name = "CPU Usage on DNS pods is high" - query = "avg(${var.cpu_on_dns_pods_high_evaluation_period}):avg:docker.cpu.usage{${local.cpu_on_dns_pods_high_filter}} by {cluster_name,host,container_name} > ${var.cpu_on_dns_pods_high_critical}" + query = "avg(${var.cpu_on_dns_pods_high_evaluation_period}):avg:docker.cpu.usage{${local.cpu_on_dns_pods_high_filter}} by {kube_cluster_name,host,container_name} > ${var.cpu_on_dns_pods_high_critical}" alert_message = "Kubernetes CPU usage on DNS pods is too high" recovery_message = "Kubernetes CPU usage on DNS pods has recovered" diff --git a/cpu-requests-low-perc-state.tf b/cpu-requests-low-perc-state.tf index 7a17090..b24d672 100644 --- a/cpu-requests-low-perc-state.tf +++ b/cpu-requests-low-perc-state.tf @@ -10,7 +10,7 @@ module "cpu_requests_low_perc_state" { version = "1.0.0" name = "Available CPU for requests in percentages Low" - query = "max(${var.cpu_requests_low_perc_state_evaluation_period}):( sum:kubernetes_state.container.cpu_requested{${local.cpu_requests_low_perc_state_filter}} by {host,cluster_name} / 
sum:kubernetes_state.node.cpu_capacity{${local.cpu_requests_low_perc_state_filter}} by {host,cluster_name} ) * 100 > ${var.cpu_requests_low_perc_state_critical}" + query = "max(${var.cpu_requests_low_perc_state_evaluation_period}):( sum:kubernetes_state.container.cpu_requested{${local.cpu_requests_low_perc_state_filter}} by {host,kube_cluster_name} / sum:kubernetes_state.node.cpu_capacity{${local.cpu_requests_low_perc_state_filter}} by {host,kube_cluster_name} ) * 100 > ${var.cpu_requests_low_perc_state_critical}" alert_message = "Kubernetes cluster cpu room for requests / percentage is too low" recovery_message = "Kubernetes cluster cpu requests / percentage has recovered" diff --git a/cpu-requests-low-perc.tf b/cpu-requests-low-perc.tf index 9ef02d8..9191d75 100644 --- a/cpu-requests-low-perc.tf +++ b/cpu-requests-low-perc.tf @@ -10,7 +10,7 @@ module "cpu_requests_low_perc" { version = "1.0.0" name = "Available CPU for requests in percentages Low" - query = "max(${var.cpu_requests_low_perc_evaluation_period}):100 * sum:kubernetes.cpu.requests{${local.cpu_requests_low_perc_filter}} by {cluster_name,host} / max:system.cpu.num_cores{${local.cpu_requests_low_perc_filter}} by {cluster_name,host} > ${var.cpu_requests_low_perc_critical}" + query = "max(${var.cpu_requests_low_perc_evaluation_period}):100 * sum:kubernetes.cpu.requests{${local.cpu_requests_low_perc_filter}} by {kube_cluster_name,host} / max:system.cpu.num_cores{${local.cpu_requests_low_perc_filter}} by {kube_cluster_name,host} > ${var.cpu_requests_low_perc_critical}" alert_message = "Kubernetes cluster cpu room for requests / percentage is too low" recovery_message = "Kubernetes cluster cpu requests / percentage has recovered" diff --git a/cpu-requests-low.tf b/cpu-requests-low.tf index 1e2df6e..1079657 100644 --- a/cpu-requests-low.tf +++ b/cpu-requests-low.tf @@ -10,7 +10,7 @@ module "cpu_requests_low" { version = "1.0.0" name = "Available CPU for Requests Low" - query = "max(${var.cpu_requests_low_evaluation_period}):max:system.cpu.num_cores{${local.cpu_requests_low_filter}} by {cluster_name,host} - sum:kubernetes.cpu.requests{${local.cpu_requests_low_filter}} by {cluster_name,host} < ${var.cpu_requests_low_critical}" + query = "max(${var.cpu_requests_low_evaluation_period}):max:system.cpu.num_cores{${local.cpu_requests_low_filter}} by {kube_cluster_name,host} - sum:kubernetes.cpu.requests{${local.cpu_requests_low_filter}} by {kube_cluster_name,host} < ${var.cpu_requests_low_critical}" alert_message = "Kubernetes cluster cpu room for requests is too low" recovery_message = "Kubernetes cluster cpu requests has recovered" diff --git a/daemonset-incomplete-variables.tf b/daemonset-incomplete-variables.tf index 97824bd..7b38349 100644 --- a/daemonset-incomplete-variables.tf +++ b/daemonset-incomplete-variables.tf @@ -11,7 +11,7 @@ variable "daemonset_incomplete_critical" { variable "daemonset_incomplete_evaluation_period" { type = string - default = "last_30m" + default = "last_15m" } variable "daemonset_incomplete_note" { @@ -53,5 +53,5 @@ variable "daemonset_incomplete_priority" { description = "Number from 1 (high) to 5 (low)." 
type = number - default = 3 + default = 2 } diff --git a/daemonset-incomplete.tf b/daemonset-incomplete.tf index f09a8e9..ebe1748 100644 --- a/daemonset-incomplete.tf +++ b/daemonset-incomplete.tf @@ -10,7 +10,7 @@ module "daemonset_incomplete" { version = "1.0.0" name = "Daemonset Incomplete" - query = "min(${var.daemonset_incomplete_evaluation_period}):max:kubernetes_state.daemonset.scheduled{${local.daemonset_incomplete_filter}} by {daemonset,cluster_name} - min:kubernetes_state.daemonset.ready{${local.daemonset_incomplete_filter}} by {daemonset,cluster_name} > 0" + query = "min(${var.daemonset_incomplete_evaluation_period}):max:kubernetes_state.daemonset.scheduled{${local.daemonset_incomplete_filter}} by {kube_daemon_set,kube_cluster_name} - min:kubernetes_state.daemonset.ready{${local.daemonset_incomplete_filter}} by {kube_daemon_set,kube_cluster_name} > 0" alert_message = "Kubernetes Daemonset {{daemonset}} is incomplete. Missing pod count:{{value}}" recovery_message = "Kubernetes Daemonset {{daemonset}} has recovered" diff --git a/datadog-agent.tf b/datadog-agent.tf index dd9756f..fc9b4ee 100644 --- a/datadog-agent.tf +++ b/datadog-agent.tf @@ -10,9 +10,11 @@ module "datadog_agent" { version = "1.0.0" name = "Datadog agent not running" - query = "avg(${var.datadog_agent_evaluation_period}):avg:datadog.agent.running{${local.datadog_agent_filter}} by {host,cluster_name} < 1" - alert_message = "Datadog Agent not running on {{host.name}} in Cluster: {{cluster_name.name}}" + query = "avg(${var.datadog_agent_evaluation_period}):avg:datadog.agent.running{${local.datadog_agent_filter}} by {host,kube_cluster_name} < 1" + alert_message = "Datadog Agent not running on {{host.name}} in Cluster: {{kube_cluster_name.name}}" recovery_message = "Agent running again" + notify_no_data = true + no_data_message = "Datadog agent is not running on {{host.name}} in Cluster: {{kube_cluster_name.name}}" # monitor level vars enabled = var.datadog_agent_enabled diff --git a/deploy-desired-vs-status.tf b/deploy-desired-vs-status.tf index 3469d3d..b5d4f6f 100644 --- a/deploy-desired-vs-status.tf +++ b/deploy-desired-vs-status.tf @@ -10,7 +10,7 @@ module "deploy_desired_vs_status" { version = "1.0.0" name = "Desired pods vs current pods (Deployments)" - query = "avg(${var.deploy_desired_vs_status_evaluation_period}):max:kubernetes_state.deployment.replicas_desired{${local.deploy_desired_vs_status_filter}} by {cluster_name,host} - max:kubernetes_state.deployment.replicas{${local.deploy_desired_vs_status_filter}} by {cluster_name,host} > ${var.deploy_desired_vs_status_critical}" + query = "avg(${var.deploy_desired_vs_status_evaluation_period}):max:kubernetes_state.deployment.replicas_desired{${local.deploy_desired_vs_status_filter}} by {kube_cluster_name} - max:kubernetes_state.deployment.replicas_available{${local.deploy_desired_vs_status_filter}} by {kube_cluster_name} > ${var.deploy_desired_vs_status_critical}" alert_message = "Kubernetes is having trouble getting all the pods to start. 
(Based on replicas number in all the deployments)" recovery_message = "All pods described in deployments have started" diff --git a/examples/example.tf b/examples/example.tf index 6e2bb84..6ed20c1 100644 --- a/examples/example.tf +++ b/examples/example.tf @@ -5,5 +5,5 @@ module "kubernetes" { notification_channel = "mail@example.com" service = "Kubernetes" env = "prd" - filter_str = "cluster_name:production" + filter_str = "kube_cluster_name:production" } diff --git a/memory-limits-low-perc-state.tf b/memory-limits-low-perc-state.tf index b7b5c26..86f2621 100644 --- a/memory-limits-low-perc-state.tf +++ b/memory-limits-low-perc-state.tf @@ -10,7 +10,7 @@ module "memory_limits_low_perc_state" { version = "1.0.0" name = "Available Memory for Limits in percentage Low" - query = "max(${var.memory_limits_low_perc_state_evaluation_period}):( sum:kubernetes_state.container.memory_limit{${local.memory_limits_low_perc_state_filter}} by {host,cluster_name} / sum:kubernetes_state.node.memory_allocatable{${local.memory_limits_low_perc_state_filter}} by {host,cluster_name}) * 100 > ${var.memory_limits_low_perc_state_critical}" + query = "max(${var.memory_limits_low_perc_state_evaluation_period}):( sum:kubernetes_state.container.memory_limit{${local.memory_limits_low_perc_state_filter}} by {host,kube_cluster_name} / sum:kubernetes_state.node.memory_allocatable{${local.memory_limits_low_perc_state_filter}} by {host,kube_cluster_name}) * 100 > ${var.memory_limits_low_perc_state_critical}" alert_message = "Kubernetes cluster memory room for limits in percentage is too low" recovery_message = "Kubernetes cluster memory limits in percentage has recovered" diff --git a/memory-limits-low-perc.tf b/memory-limits-low-perc.tf index 75be320..22cab78 100644 --- a/memory-limits-low-perc.tf +++ b/memory-limits-low-perc.tf @@ -10,7 +10,7 @@ module "memory_limits_low_perc" { version = "1.0.0" name = "Available Memory for Limits in percentage Low" - query = "max(${var.memory_limits_low_perc_evaluation_period}):( max:kubernetes.memory.limits{${local.memory_limits_low_perc_filter}} by {host,cluster_name}/ max:system.mem.total{${local.memory_limits_low_perc_filter}} by {host,cluster_name}) * 100 > ${var.memory_limits_low_perc_critical}" + query = "max(${var.memory_limits_low_perc_evaluation_period}):( max:kubernetes.memory.limits{${local.memory_limits_low_perc_filter}} by {host,kube_cluster_name}/ max:system.mem.total{${local.memory_limits_low_perc_filter}} by {host,kube_cluster_name}) * 100 > ${var.memory_limits_low_perc_critical}" alert_message = "Kubernetes cluster memory room for limits in percentage is too low" recovery_message = "Kubernetes cluster memory limits in percentage has recovered" diff --git a/memory-limits-low.tf b/memory-limits-low.tf index 3e75956..c33856f 100644 --- a/memory-limits-low.tf +++ b/memory-limits-low.tf @@ -10,7 +10,7 @@ module "memory_limits_low" { version = "1.0.0" name = "Available Memory for Limits Low" - query = "avg(${var.memory_limits_low_evaluation_period}):max:system.mem.total{${local.memory_limits_low_filter}} by {host,cluster_name} - max:kubernetes.memory.limits{${local.memory_limits_low_filter}} by {host,cluster_name} < ${var.memory_limits_low_critical}" + query = "avg(${var.memory_limits_low_evaluation_period}):max:system.mem.total{${local.memory_limits_low_filter}} by {host,kube_cluster_name} - max:kubernetes.memory.limits{${local.memory_limits_low_filter}} by {host,kube_cluster_name} < ${var.memory_limits_low_critical}" alert_message = "Kubernetes cluster memory room for 
limits is too low" recovery_message = "Kubernetes cluster memory limits has recovered" diff --git a/memory-requests-low.tf b/memory-requests-low.tf index 2223c36..2ad99aa 100644 --- a/memory-requests-low.tf +++ b/memory-requests-low.tf @@ -10,7 +10,7 @@ module "memory_requests_low" { version = "1.0.0" name = "Available Memory for Requests Low" - query = "avg(${var.memory_requests_low_evaluation_period}):max:system.mem.total{${local.memory_requests_low_filter}} by {host,cluster_name} - max:kubernetes.memory.requests{${local.memory_requests_low_filter}} by {host,cluster_name} < ${var.memory_requests_low_critical}" + query = "avg(${var.memory_requests_low_evaluation_period}):max:system.mem.total{${local.memory_requests_low_filter}} by {host,kube_cluster_name} - max:kubernetes.memory.requests{${local.memory_requests_low_filter}} by {host,kube_cluster_name} < ${var.memory_requests_low_critical}" alert_message = "Total memory available for requests on {{ host }} is low ({{value}})" recovery_message = "Total memory available for requests on {{ host }} has recovered ({{value}})" diff --git a/module_description.md b/module_description.md index aa2dbc4..d26ce0f 100644 --- a/module_description.md +++ b/module_description.md @@ -1,3 +1,8 @@ This module mainly check on Kubernetes resource level and cluster health. System level monitoring can best be implemented with the [system module](https://github.com/kabisa/terraform-datadog-system). -Docker/Container level monitoring can best be implemented with the [docker module](https://github.com/kabisa/terraform-datadog-docker-container). \ No newline at end of file +Docker/Container level monitoring can best be implemented with the [docker module](https://github.com/kabisa/terraform-datadog-docker-container). + +# Recent changes: + +- switch from kubernetes_state to kubernetes_state_core as a default https://docs.datadoghq.com/integrations/kubernetes_state_core/?tab=helm +- upgrade provider to ~> 3.12 diff --git a/network-unavailable.tf b/network-unavailable.tf index 3ad716c..0a5ab9b 100644 --- a/network-unavailable.tf +++ b/network-unavailable.tf @@ -10,7 +10,7 @@ module "network_unavailable" { version = "1.0.0" name = "Nodes with Network Unavailable" - query = "avg(${var.network_unavailable_evaluation_period}):max:kubernetes_state.nodes.by_condition{${local.network_unavailable_filter} AND condition:networkunavailable AND (status:true OR status:unknown)} by {cluster_name,host} > ${var.network_unavailable_critical}" + query = "avg(${var.network_unavailable_evaluation_period}):max:kubernetes_state.node.by_condition{${local.network_unavailable_filter} AND condition:networkunavailable AND (status:true OR status:unknown)} by {kube_cluster_name,host} > ${var.network_unavailable_critical}" alert_message = "Kubernetes cluster node {{node}} has no network. 
Meaning it is not accessible" recovery_message = "Kubernetes cluster node {{node}} has come back on the network" diff --git a/node-diskpressure.tf b/node-diskpressure.tf index 8c5470f..4f4faef 100644 --- a/node-diskpressure.tf +++ b/node-diskpressure.tf @@ -10,7 +10,7 @@ module "node_diskpressure" { version = "1.0.0" name = "Nodes with Diskpressure" - query = "avg(${var.node_diskpressure_evaluation_period}):max:kubernetes_state.nodes.by_condition{${local.node_diskpressure_filter} AND condition:diskpressure AND (status:true OR status:unknown)} by {cluster_name,host} > ${var.node_diskpressure_critical}" + query = "avg(${var.node_diskpressure_evaluation_period}):max:kubernetes_state.node.by_condition{${local.node_diskpressure_filter} AND condition:diskpressure AND (status:true OR status:unknown)} by {kube_cluster_name,host} > ${var.node_diskpressure_critical}" alert_message = "Kubernetes cluster node {{node}} has diskpressure. Meaning it is low on disk space (Logging, emptydir volumes, caching, etc)" recovery_message = "Kubernetes cluster node {{node}} no longer has problems with DiskPressure." diff --git a/node-memory-used-percent.tf b/node-memory-used-percent.tf index b0154fa..4bcbb0c 100644 --- a/node-memory-used-percent.tf +++ b/node-memory-used-percent.tf @@ -10,7 +10,7 @@ module "node_memory_used_percent" { version = "1.0.0" name = "Memory Used Percent" - query = "avg(${var.node_memory_used_percent_evaluation_period}):( 100 * max:kubernetes.memory.usage{${local.node_memory_used_percent_filter}} by {host,cluster_name} ) / max:system.mem.total{${local.node_memory_used_percent_filter}} by {host,cluster_name} > ${var.node_memory_used_percent_critical}" + query = "avg(${var.node_memory_used_percent_evaluation_period}):( 100 * max:kubernetes.memory.usage{${local.node_memory_used_percent_filter}} by {host,kube_cluster_name} ) / max:system.mem.total{${local.node_memory_used_percent_filter}} by {host,kube_cluster_name} > ${var.node_memory_used_percent_critical}" alert_message = "Available memory on ${var.service} Node {{host.name}} has dropped below {{threshold}} and has {{value}}% available" recovery_message = "Available memory on ${var.service} Node {{host.name}} has recovered {{value}}%" diff --git a/node-memorypressure.tf b/node-memorypressure.tf index bbfb130..7f2728d 100644 --- a/node-memorypressure.tf +++ b/node-memorypressure.tf @@ -10,7 +10,7 @@ module "node_memorypressure" { version = "1.0.0" name = "Nodes with Memorypressure" - query = "avg(${var.node_memorypressure_evaluation_period}):max:kubernetes_state.nodes.by_condition{${local.node_memorypressure_filter} AND condition:memorypressure AND (status:true OR status:unknown)} by {cluster_name,host} > ${var.node_memorypressure_critical}" + query = "avg(${var.node_memorypressure_evaluation_period}):max:kubernetes_state.node.by_condition{${local.node_memorypressure_filter} AND condition:memorypressure AND (status:true OR status:unknown)} by {kube_cluster_name,host} > ${var.node_memorypressure_critical}" alert_message = "Kubernetes cluster node {{node}} has memorypressure. Meaning it is low on memory" recovery_message = "Kubernetes cluster node {{node}} no longer has Memory Pressure." 
diff --git a/node-ready.tf b/node-ready.tf index c80bf51..9588c69 100644 --- a/node-ready.tf +++ b/node-ready.tf @@ -10,7 +10,7 @@ module "node_ready" { version = "1.0.0" name = "Node Not Ready" - query = "avg(${var.node_ready_evaluation_period}):count_nonzero(sum:kubernetes_state.nodes.by_condition{${local.node_ready_filter} AND (NOT condition:ready) AND (status:true OR status:unknown)} by {cluster_name,host}) > ${var.node_ready_critical}" + query = "avg(${var.node_ready_evaluation_period}):count_nonzero(sum:kubernetes_state.node.by_condition{${local.node_ready_filter} AND (NOT condition:ready) AND (status:true OR status:unknown)} by {kube_cluster_name,host}) > ${var.node_ready_critical}" alert_message = "Kubernetes cluster node {{host}} is not ready." recovery_message = "Kubernetes cluster node {{host}} is ready again." diff --git a/node-status.tf b/node-status.tf index 404add3..a08e6c7 100644 --- a/node-status.tf +++ b/node-status.tf @@ -10,7 +10,7 @@ module "node_status" { version = "1.0.0" name = "Node Status not OK" - query = "avg(${var.node_status_evaluation_period}):avg:kubernetes_state.node.status{${local.node_status_filter}} by {cluster_name,node} < 1" + query = "avg(${var.node_status_evaluation_period}):avg:kubernetes_state.node.status{${local.node_status_filter}} by {kube_cluster_name,node} < 1" alert_message = "Kubernetes Node Status for Node {{node}} is not ok" recovery_message = "Kubernetes Node Status for Node {{node}} has recovered" require_full_window = false diff --git a/persistent-volumes.tf b/persistent-volumes.tf index a4473e7..e74db78 100644 --- a/persistent-volumes.tf +++ b/persistent-volumes.tf @@ -10,7 +10,7 @@ module "persistent_volumes_low" { version = "1.0.0" name = "Failed Persistent Volume Claims" - query = "avg(${var.persistent_volumes_evaluation_period}):max:kubernetes_state.persistentvolumes.by_phase{${local.persistent_volumes_filter} AND phase:failed} > ${var.persistent_volumes_critical}" + query = "avg(${var.persistent_volumes_evaluation_period}):max:kubernetes_state.persistentvolume.by_phase{${local.persistent_volumes_filter} AND phase:failed} > ${var.persistent_volumes_critical}" alert_message = "There are failed Physical Volume Claims, storage has problems" recovery_message = "There are no failed Physical Volume Claims" diff --git a/pid-pressure.tf b/pid-pressure.tf index d78a6a5..ef03391 100644 --- a/pid-pressure.tf +++ b/pid-pressure.tf @@ -10,7 +10,7 @@ module "pid_pressure" { version = "1.0.0" name = "Nodes with PID Pressure" - query = "avg(${var.pid_pressure_evaluation_period}):max:kubernetes_state.nodes.by_condition{${local.pid_pressure_filter} AND condition:pidpressure AND (status:true OR status:unknown)} by {cluster_name,host} > ${var.pid_pressure_critical}" + query = "avg(${var.pid_pressure_evaluation_period}):max:kubernetes_state.node.by_condition{${local.pid_pressure_filter} AND condition:pidpressure AND (status:true OR status:unknown)} by {kube_cluster_name,host} > ${var.pid_pressure_critical}" alert_message = "Kubernetes cluster node {{node}} has PID Pressure, meaning it may not be able to start more containers" recovery_message = "Kubernetes cluster node {{node}} n olonger has pid pressure." 
diff --git a/pod-ready.tf b/pod-ready.tf index 815c37e..352e638 100644 --- a/pod-ready.tf +++ b/pod-ready.tf @@ -10,9 +10,9 @@ module "pod_ready" { version = "1.0.0" name = "Pod status not ready" - query = "min(${var.pod_ready_evaluation_period}):sum:kubernetes_state.pod.count{${local.pod_ready_filter}} by {cluster_name,namespace} - sum:kubernetes_state.pod.ready{${local.pod_ready_filter}} by {cluster_name,namespace} > 0" - alert_message = "Kubernetes Pod {{value}} status not ready in namespace {{namespace}} " - recovery_message = "Kubernetes Pod status recovered in namespace {{namespace}}" + query = "min(${var.pod_ready_evaluation_period}):sum:kubernetes_state.pod.count{${local.pod_ready_filter}} by {kube_cluster_name,kube_namespace} - sum:kubernetes_state.pod.ready{${local.pod_ready_filter}} by {kube_cluster_name,kube_namespace} > 0" + alert_message = "Kubernetes Pod {{value}} status not ready in namespace {{kube_namespace}} " + recovery_message = "Kubernetes Pod status recovered in namespace {{kube_namespace}}" # monitor level vars enabled = var.state_metrics_monitoring && var.pod_ready_enabled diff --git a/pods-failed.tf b/pods-failed.tf index e20ca88..e33c0a7 100644 --- a/pods-failed.tf +++ b/pods-failed.tf @@ -10,7 +10,7 @@ module "pods_failed" { version = "1.0.0" name = "Pods Failed" - query = "min(${var.pods_failed_evaluation_period}):default_zero(max:kubernetes_state.pod.status_phase{phase:failed${var.filter_str_concatenation}${local.pods_failed_filter}} by {namespace}) > ${var.pods_failed_critical}" + query = "min(${var.pods_failed_evaluation_period}):default_zero(max:kubernetes_state.pod.status_phase{phase:failed${var.filter_str_concatenation}${local.pods_failed_filter}} by {kube_namespace}) > ${var.pods_failed_critical}" # alert specific configuration require_full_window = true diff --git a/pods-pending.tf b/pods-pending.tf index 2342fa4..2945be1 100644 --- a/pods-pending.tf +++ b/pods-pending.tf @@ -10,7 +10,7 @@ module "pods_pending" { version = "1.0.0" name = "Pods Pending" - query = "min(${var.pods_pending_evaluation_period}):default_zero(max:kubernetes_state.pod.status_phase{phase:pending${var.filter_str_concatenation}${local.pods_pending_filter}} by {namespace}) > ${var.pods_pending_critical}" + query = "min(${var.pods_pending_evaluation_period}):default_zero(max:kubernetes_state.pod.status_phase{phase:pending${var.filter_str_concatenation}${local.pods_pending_filter}} by {kube_namespace}) > ${var.pods_pending_critical}" # alert specific configuration require_full_window = true diff --git a/replicaset-incomplete.tf b/replicaset-incomplete.tf index 8c6d9c8..0a3e27f 100644 --- a/replicaset-incomplete.tf +++ b/replicaset-incomplete.tf @@ -10,7 +10,7 @@ module "replicaset_incomplete" { version = "1.0.0" name = "Replicaset Incomplete" - query = "min(${var.replicaset_incomplete_evaluation_period}):max:kubernetes_state.replicaset.replicas_desired{${local.replicaset_incomplete_filter}} by {kube_replica_set,cluster_name} - min:kubernetes_state.replicaset.replicas_ready{${local.replicaset_incomplete_filter}} by {kube_replica_set,cluster_name} > ${var.replicaset_incomplete_critical}" + query = "min(${var.replicaset_incomplete_evaluation_period}):max:kubernetes_state.replicaset.replicas_desired{${local.replicaset_incomplete_filter}} by {kube_replica_set,kube_cluster_name} - min:kubernetes_state.replicaset.replicas_ready{${local.replicaset_incomplete_filter}} by {kube_replica_set,kube_cluster_name} > ${var.replicaset_incomplete_critical}" alert_message = "Kubernetes Replicaset 
{{kube_replica_set}} is incomplete. Missing pod count:{{value}}" recovery_message = "Kubernetes Replicaset {{kube_replica_set}} has recovered" diff --git a/replicaset-unavailable.tf b/replicaset-unavailable.tf index 972b635..e32de0d 100644 --- a/replicaset-unavailable.tf +++ b/replicaset-unavailable.tf @@ -3,8 +3,8 @@ locals { var.replicaset_unavailable_filter_override, var.filter_str ) - rs_pods_ready = "min:kubernetes_state.replicaset.replicas_ready{${local.replicaset_unavailable_filter}} by {kube_replica_set,cluster_name}" - rs_pods_desired = "min:kubernetes_state.replicaset.replicas_desired{${local.replicaset_unavailable_filter}} by {kube_replica_set,cluster_name}" + rs_pods_ready = "min:kubernetes_state.replicaset.replicas_ready{${local.replicaset_unavailable_filter}} by {kube_replica_set,kube_cluster_name}" + rs_pods_desired = "min:kubernetes_state.replicaset.replicas_desired{${local.replicaset_unavailable_filter}} by {kube_replica_set,kube_cluster_name}" } module "replicaset_unavailable" { From c6bffbe0a56822fb62c90a9b7dddadee60eb5e9b Mon Sep 17 00:00:00 2001 From: Sjuul Janssen Date: Fri, 19 Aug 2022 14:49:12 +0200 Subject: [PATCH 2/2] . --- daemonset-incomplete.tf | 4 ++-- deploy-desired-vs-status.tf | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/daemonset-incomplete.tf b/daemonset-incomplete.tf index ebe1748..27010fa 100644 --- a/daemonset-incomplete.tf +++ b/daemonset-incomplete.tf @@ -11,8 +11,8 @@ module "daemonset_incomplete" { name = "Daemonset Incomplete" query = "min(${var.daemonset_incomplete_evaluation_period}):max:kubernetes_state.daemonset.scheduled{${local.daemonset_incomplete_filter}} by {kube_daemon_set,kube_cluster_name} - min:kubernetes_state.daemonset.ready{${local.daemonset_incomplete_filter}} by {kube_daemon_set,kube_cluster_name} > 0" - alert_message = "Kubernetes Daemonset {{daemonset}} is incomplete. Missing pod count:{{value}}" - recovery_message = "Kubernetes Daemonset {{daemonset}} has recovered" + alert_message = "Kubernetes Daemonset {{kube_daemon_set}} is incomplete. Missing pod count:{{value}}" + recovery_message = "Kubernetes Daemonset {{kube_daemon_set}} has recovered" # monitor level vars enabled = var.state_metrics_monitoring && var.daemonset_incomplete_enabled diff --git a/deploy-desired-vs-status.tf b/deploy-desired-vs-status.tf index b5d4f6f..c766893 100644 --- a/deploy-desired-vs-status.tf +++ b/deploy-desired-vs-status.tf @@ -13,6 +13,8 @@ module "deploy_desired_vs_status" { query = "avg(${var.deploy_desired_vs_status_evaluation_period}):max:kubernetes_state.deployment.replicas_desired{${local.deploy_desired_vs_status_filter}} by {kube_cluster_name} - max:kubernetes_state.deployment.replicas_available{${local.deploy_desired_vs_status_filter}} by {kube_cluster_name} > ${var.deploy_desired_vs_status_critical}" alert_message = "Kubernetes is having trouble getting all the pods to start. (Based on replicas number in all the deployments)" recovery_message = "All pods described in deployments have started" + notify_no_data = true + no_data_message = "Kubernetes State data missing for {{kube_cluster_name.name}}" # monitor level vars enabled = var.state_metrics_monitoring && var.deploy_desired_vs_status_enabled
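
Upgrade note: with kubernetes_state_core the cluster tag becomes `kube_cluster_name`, so any existing `filter_str` keyed on `cluster_name` needs updating, and the kubernetes_state_core check has to be enabled on the Agent side (with the official Helm chart this is typically `datadog.kubeStateMetricsCore.enabled: true`; see the docs linked in the README). Below is a minimal sketch of a module call against this version; the registry source and the module version pin are assumptions — adjust them to how you actually consume the module.

```terraform
terraform {
  required_providers {
    datadog = {
      source  = "DataDog/datadog"
      version = "~> 3.12" # matches the provider constraint introduced by this patch
    }
  }
}

module "kubernetes" {
  # Assumed registry address for github.com/kabisa/terraform-datadog-kubernetes;
  # change this if you reference the module by git URL or a local path.
  source  = "kabisa/kubernetes/datadog"
  version = "~> 2.0" # assumed: pin to the release that contains this change

  notification_channel = "mail@example.com"
  service              = "Kubernetes"
  env                  = "prd"

  # kubernetes_state_core tags clusters with kube_cluster_name instead of cluster_name
  filter_str = "kube_cluster_name:production"
}
```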