Merge pull request #27 from kabisa/kubernetes_state_core
upgrading k8s monitoring to kubernetes_state_core
obeleh authored Aug 23, 2022
2 parents ae17c0c + c6bffbe commit 36572ad
Showing 31 changed files with 113 additions and 98 deletions.
115 changes: 60 additions & 55 deletions README.md

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions cpu-limits-low-perc-state.tf
@@ -10,9 +10,9 @@ module "cpu_limits_low_perc_state" {
version = "1.0.0"

name = "Available CPU for Limits in percentages Low"
query = "max(${var.cpu_limits_low_perc_state_evaluation_period}):( sum:kubernetes_state.container.cpu_limit{${local.cpu_limits_low_perc_state_filter}} by {host,cluster_name} / sum:kubernetes_state.node.cpu_capacity{${local.cpu_limits_low_perc_state_filter}} by {host,cluster_name}) * 100 > ${var.cpu_limits_low_perc_state_critical}"
alert_message = "Kubernetes cluster cpu room for limits / percentage is too low"
recovery_message = "Kubernetes cluster cpu limits / percentage has recovered"
query = "max(${var.cpu_limits_low_perc_state_evaluation_period}):( sum:kubernetes_state.container.cpu_limit{${local.cpu_limits_low_perc_state_filter}} by {host,kube_cluster_name} / sum:kubernetes_state.node.cpu_capacity{${local.cpu_limits_low_perc_state_filter}} by {host,kube_cluster_name}) * 100 > ${var.cpu_limits_low_perc_state_critical}"
alert_message = "Kubernetes cluster {{kube_cluster_name.name}} cpu room for limits / percentage is too low"
recovery_message = "Kubernetes cluster {{kube_cluster_name.name}} cpu limits / percentage has recovered"

# monitor level vars
enabled = var.cpu_limits_low_perc_state_enabled
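To make the tag change above concrete, this is roughly how the new query renders once Terraform interpolates the variables; the evaluation period (last_5m), filter (kube_cluster_name:production) and threshold (95) are placeholder values for illustration, not defaults taken from this module:

max(last_5m):( sum:kubernetes_state.container.cpu_limit{kube_cluster_name:production} by {host,kube_cluster_name} / sum:kubernetes_state.node.cpu_capacity{kube_cluster_name:production} by {host,kube_cluster_name}) * 100 > 95

The same cluster_name to kube_cluster_name rename recurs in the queries and message templates of the monitors below, matching the tags emitted by the kubernetes_state_core check.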
6 changes: 3 additions & 3 deletions cpu-limits-low-perc.tf
@@ -10,9 +10,9 @@ module "cpu_limits_low_perc" {
version = "1.0.0"

name = "Available CPU for Limits in percentages Low"
query = "max(${var.cpu_limits_low_perc_evaluation_period}):(sum:kubernetes.cpu.limits{${local.cpu_limits_low_perc_filter}} by {host,cluster_name} / max:system.cpu.num_cores{${local.cpu_limits_low_perc_filter}} by {host,cluster_name}) * 100 > ${var.cpu_limits_low_perc_critical}"
alert_message = "Kubernetes cluster cpu room for limits / percentage is too low"
recovery_message = "Kubernetes cluster cpu limits / percentage has recovered"
query = "max(${var.cpu_limits_low_perc_evaluation_period}):(sum:kubernetes.cpu.limits{${local.cpu_limits_low_perc_filter}} by {host,kube_cluster_name} / max:system.cpu.num_cores{${local.cpu_limits_low_perc_filter}} by {host,kube_cluster_name}) * 100 > ${var.cpu_limits_low_perc_critical}"
alert_message = "Kubernetes cluster {{kube_cluster_name.name}} cpu room for limits / percentage is too low"
recovery_message = "Kubernetes cluster {{kube_cluster_name.name}} cpu limits / percentage has recovered"

# monitor level vars
enabled = var.cpu_limits_low_perc_enabled
7 changes: 4 additions & 3 deletions cpu-limits-low.tf
@@ -10,9 +10,10 @@ module "cpu_limits_low" {
version = "1.0.0"

name = "Available CPU for Limits Low"
query = "min(${var.cpu_limits_low_evaluation_period}):max:system.cpu.num_cores{${local.cpu_limits_low_filter}} by {cluster_name,host} - sum:kubernetes.cpu.limits{${local.cpu_limits_low_filter}} by {cluster_name,host} < ${var.cpu_limits_low_critical}"
alert_message = "Kubernetes cluster cpu room for limits is too low"
recovery_message = "Kubernetes cluster cpu limits has recovered"
query = "min(${var.cpu_limits_low_evaluation_period}):max:system.cpu.num_cores{${local.cpu_limits_low_filter}} by {kube_cluster_name,host} - sum:kubernetes.cpu.limits{${local.cpu_limits_low_filter}} by {kube_cluster_name,host} < ${var.cpu_limits_low_critical}"
alert_message = "Kubernetes cluster {{kube_cluster_name.name}} cpu room for limits is too low "
recovery_message = "Kubernetes cluster {{kube_cluster_name.name}} cpu limits has recovered"


# monitor level vars
enabled = var.cpu_limits_low_enabled
2 changes: 1 addition & 1 deletion cpu-on-dns-pods-high.tf
@@ -12,7 +12,7 @@ module "cpu_on_dns_pods_high" {
version = "1.0.0"

name = "CPU Usage on DNS pods is high"
query = "avg(${var.cpu_on_dns_pods_high_evaluation_period}):avg:docker.cpu.usage{${local.cpu_on_dns_pods_high_filter}} by {cluster_name,host,container_name} > ${var.cpu_on_dns_pods_high_critical}"
query = "avg(${var.cpu_on_dns_pods_high_evaluation_period}):avg:docker.cpu.usage{${local.cpu_on_dns_pods_high_filter}} by {kube_cluster_name,host,container_name} > ${var.cpu_on_dns_pods_high_critical}"
alert_message = "Kubernetes CPU usage on DNS pods is too high"
recovery_message = "Kubernetes CPU usage on DNS pods has recovered"

2 changes: 1 addition & 1 deletion cpu-requests-low-perc-state.tf
@@ -10,7 +10,7 @@ module "cpu_requests_low_perc_state" {
version = "1.0.0"

name = "Available CPU for requests in percentages Low"
query = "max(${var.cpu_requests_low_perc_state_evaluation_period}):( sum:kubernetes_state.container.cpu_requested{${local.cpu_requests_low_perc_state_filter}} by {host,cluster_name} / sum:kubernetes_state.node.cpu_capacity{${local.cpu_requests_low_perc_state_filter}} by {host,cluster_name} ) * 100 > ${var.cpu_requests_low_perc_state_critical}"
query = "max(${var.cpu_requests_low_perc_state_evaluation_period}):( sum:kubernetes_state.container.cpu_requested{${local.cpu_requests_low_perc_state_filter}} by {host,kube_cluster_name} / sum:kubernetes_state.node.cpu_capacity{${local.cpu_requests_low_perc_state_filter}} by {host,kube_cluster_name} ) * 100 > ${var.cpu_requests_low_perc_state_critical}"
alert_message = "Kubernetes cluster cpu room for requests / percentage is too low"
recovery_message = "Kubernetes cluster cpu requests / percentage has recovered"

2 changes: 1 addition & 1 deletion cpu-requests-low-perc.tf
@@ -10,7 +10,7 @@ module "cpu_requests_low_perc" {
version = "1.0.0"

name = "Available CPU for requests in percentages Low"
query = "max(${var.cpu_requests_low_perc_evaluation_period}):100 * sum:kubernetes.cpu.requests{${local.cpu_requests_low_perc_filter}} by {cluster_name,host} / max:system.cpu.num_cores{${local.cpu_requests_low_perc_filter}} by {cluster_name,host} > ${var.cpu_requests_low_perc_critical}"
query = "max(${var.cpu_requests_low_perc_evaluation_period}):100 * sum:kubernetes.cpu.requests{${local.cpu_requests_low_perc_filter}} by {kube_cluster_name,host} / max:system.cpu.num_cores{${local.cpu_requests_low_perc_filter}} by {kube_cluster_name,host} > ${var.cpu_requests_low_perc_critical}"
alert_message = "Kubernetes cluster cpu room for requests / percentage is too low"
recovery_message = "Kubernetes cluster cpu requests / percentage has recovered"

2 changes: 1 addition & 1 deletion cpu-requests-low.tf
@@ -10,7 +10,7 @@ module "cpu_requests_low" {
version = "1.0.0"

name = "Available CPU for Requests Low"
query = "max(${var.cpu_requests_low_evaluation_period}):max:system.cpu.num_cores{${local.cpu_requests_low_filter}} by {cluster_name,host} - sum:kubernetes.cpu.requests{${local.cpu_requests_low_filter}} by {cluster_name,host} < ${var.cpu_requests_low_critical}"
query = "max(${var.cpu_requests_low_evaluation_period}):max:system.cpu.num_cores{${local.cpu_requests_low_filter}} by {kube_cluster_name,host} - sum:kubernetes.cpu.requests{${local.cpu_requests_low_filter}} by {kube_cluster_name,host} < ${var.cpu_requests_low_critical}"
alert_message = "Kubernetes cluster cpu room for requests is too low"
recovery_message = "Kubernetes cluster cpu requests has recovered"

4 changes: 2 additions & 2 deletions daemonset-incomplete-variables.tf
@@ -11,7 +11,7 @@ variable "daemonset_incomplete_critical" {

variable "daemonset_incomplete_evaluation_period" {
type = string
default = "last_30m"
default = "last_15m"
}

variable "daemonset_incomplete_note" {
@@ -53,5 +53,5 @@ variable "daemonset_incomplete_priority" {
description = "Number from 1 (high) to 5 (low)."

type = number
default = 3
default = 2
}
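This commit tightens both defaults (a shorter last_15m evaluation window and priority 2). A minimal sketch of pinning the previous behaviour when instantiating the module, using only variable names that appear in this diff; the other required inputs are omitted:

module "kubernetes" {
  # ... source and the usual inputs (notification_channel, service, env, filter_str) ...

  daemonset_incomplete_evaluation_period = "last_30m" # default before this commit
  daemonset_incomplete_priority          = 3          # default before this commit
}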
6 changes: 3 additions & 3 deletions daemonset-incomplete.tf
@@ -10,9 +10,9 @@ module "daemonset_incomplete" {
version = "1.0.0"

name = "Daemonset Incomplete"
query = "min(${var.daemonset_incomplete_evaluation_period}):max:kubernetes_state.daemonset.scheduled{${local.daemonset_incomplete_filter}} by {daemonset,cluster_name} - min:kubernetes_state.daemonset.ready{${local.daemonset_incomplete_filter}} by {daemonset,cluster_name} > 0"
alert_message = "Kubernetes Daemonset {{daemonset}} is incomplete. Missing pod count:{{value}}"
recovery_message = "Kubernetes Daemonset {{daemonset}} has recovered"
query = "min(${var.daemonset_incomplete_evaluation_period}):max:kubernetes_state.daemonset.scheduled{${local.daemonset_incomplete_filter}} by {kube_daemon_set,kube_cluster_name} - min:kubernetes_state.daemonset.ready{${local.daemonset_incomplete_filter}} by {kube_daemon_set,kube_cluster_name} > 0"
alert_message = "Kubernetes Daemonset {{kube_daemon_set}} is incomplete. Missing pod count:{{value}}"
recovery_message = "Kubernetes Daemonset {{kube_daemon_set}} has recovered"

# monitor level vars
enabled = var.state_metrics_monitoring && var.daemonset_incomplete_enabled
6 changes: 4 additions & 2 deletions datadog-agent.tf
@@ -10,9 +10,11 @@ module "datadog_agent" {
version = "1.0.0"

name = "Datadog agent not running"
query = "avg(${var.datadog_agent_evaluation_period}):avg:datadog.agent.running{${local.datadog_agent_filter}} by {host,cluster_name} < 1"
alert_message = "Datadog Agent not running on {{host.name}} in Cluster: {{cluster_name.name}}"
query = "avg(${var.datadog_agent_evaluation_period}):avg:datadog.agent.running{${local.datadog_agent_filter}} by {host,kube_cluster_name} < 1"
alert_message = "Datadog Agent not running on {{host.name}} in Cluster: {{kube_cluster_name.name}}"
recovery_message = "Agent running again"
notify_no_data = true
no_data_message = "Datadog agent is not running on {{host.name}} in Cluster: {{kube_cluster_name.name}}"

# monitor level vars
enabled = var.datadog_agent_enabled
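The two new arguments enable no-data alerting for the agent check. As a rough sketch of the effect (not the module's actual implementation), the resulting monitor resembles a datadog_monitor resource along these lines, with placeholder values for the interpolated variables:

resource "datadog_monitor" "datadog_agent" {
  name           = "Datadog agent not running"
  type           = "query alert"
  query          = "avg(last_5m):avg:datadog.agent.running{kube_cluster_name:production} by {host,kube_cluster_name} < 1"
  message        = "Datadog Agent not running on {{host.name}} in Cluster: {{kube_cluster_name.name}}"
  notify_no_data = true

  # How the separate no_data_message is folded into the monitor message (for example via
  # {{#is_no_data}} blocks) is assumed to be handled by the underlying monitor module;
  # that detail is not shown in this diff.
}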
4 changes: 3 additions & 1 deletion deploy-desired-vs-status.tf
@@ -10,9 +10,11 @@ module "deploy_desired_vs_status" {
version = "1.0.0"

name = "Desired pods vs current pods (Deployments)"
query = "avg(${var.deploy_desired_vs_status_evaluation_period}):max:kubernetes_state.deployment.replicas_desired{${local.deploy_desired_vs_status_filter}} by {cluster_name,host} - max:kubernetes_state.deployment.replicas{${local.deploy_desired_vs_status_filter}} by {cluster_name,host} > ${var.deploy_desired_vs_status_critical}"
query = "avg(${var.deploy_desired_vs_status_evaluation_period}):max:kubernetes_state.deployment.replicas_desired{${local.deploy_desired_vs_status_filter}} by {kube_cluster_name} - max:kubernetes_state.deployment.replicas_available{${local.deploy_desired_vs_status_filter}} by {kube_cluster_name} > ${var.deploy_desired_vs_status_critical}"
alert_message = "Kubernetes is having trouble getting all the pods to start. (Based on replicas number in all the deployments)"
recovery_message = "All pods described in deployments have started"
notify_no_data = true
no_data_message = "Kubernetes State data missing for {{kube_cluster_name.name}}"

# monitor level vars
enabled = var.state_metrics_monitoring && var.deploy_desired_vs_status_enabled
2 changes: 1 addition & 1 deletion examples/example.tf
@@ -5,5 +5,5 @@ module "kubernetes" {
notification_channel = "[email protected]"
service = "Kubernetes"
env = "prd"
filter_str = "cluster_name:production"
filter_str = "kube_cluster_name:production"
}
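For completeness, a fuller version of this example might look as follows; the source address and notification channel are assumptions for illustration (the hunk above does not show the module's source line):

module "kubernetes" {
  # Assumed source; adjust to however the module is actually referenced.
  source = "git::https://github.com/kabisa/terraform-datadog-kubernetes.git"

  notification_channel = "@slack-kubernetes-alerts" # placeholder channel
  service              = "Kubernetes"
  env                  = "prd"
  filter_str           = "kube_cluster_name:production"
}

The functional change in this file is filter_str switching from the cluster_name tag to kube_cluster_name, matching the updated monitor queries.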
2 changes: 1 addition & 1 deletion memory-limits-low-perc-state.tf
@@ -10,7 +10,7 @@ module "memory_limits_low_perc_state" {
version = "1.0.0"

name = "Available Memory for Limits in percentage Low"
query = "max(${var.memory_limits_low_perc_state_evaluation_period}):( sum:kubernetes_state.container.memory_limit{${local.memory_limits_low_perc_state_filter}} by {host,cluster_name} / sum:kubernetes_state.node.memory_allocatable{${local.memory_limits_low_perc_state_filter}} by {host,cluster_name}) * 100 > ${var.memory_limits_low_perc_state_critical}"
query = "max(${var.memory_limits_low_perc_state_evaluation_period}):( sum:kubernetes_state.container.memory_limit{${local.memory_limits_low_perc_state_filter}} by {host,kube_cluster_name} / sum:kubernetes_state.node.memory_allocatable{${local.memory_limits_low_perc_state_filter}} by {host,kube_cluster_name}) * 100 > ${var.memory_limits_low_perc_state_critical}"
alert_message = "Kubernetes cluster memory room for limits in percentage is too low"
recovery_message = "Kubernetes cluster memory limits in percentage has recovered"

2 changes: 1 addition & 1 deletion memory-limits-low-perc.tf
@@ -10,7 +10,7 @@ module "memory_limits_low_perc" {
version = "1.0.0"

name = "Available Memory for Limits in percentage Low"
query = "max(${var.memory_limits_low_perc_evaluation_period}):( max:kubernetes.memory.limits{${local.memory_limits_low_perc_filter}} by {host,cluster_name}/ max:system.mem.total{${local.memory_limits_low_perc_filter}} by {host,cluster_name}) * 100 > ${var.memory_limits_low_perc_critical}"
query = "max(${var.memory_limits_low_perc_evaluation_period}):( max:kubernetes.memory.limits{${local.memory_limits_low_perc_filter}} by {host,kube_cluster_name}/ max:system.mem.total{${local.memory_limits_low_perc_filter}} by {host,kube_cluster_name}) * 100 > ${var.memory_limits_low_perc_critical}"
alert_message = "Kubernetes cluster memory room for limits in percentage is too low"
recovery_message = "Kubernetes cluster memory limits in percentage has recovered"

2 changes: 1 addition & 1 deletion memory-limits-low.tf
@@ -10,7 +10,7 @@ module "memory_limits_low" {
version = "1.0.0"

name = "Available Memory for Limits Low"
query = "avg(${var.memory_limits_low_evaluation_period}):max:system.mem.total{${local.memory_limits_low_filter}} by {host,cluster_name} - max:kubernetes.memory.limits{${local.memory_limits_low_filter}} by {host,cluster_name} < ${var.memory_limits_low_critical}"
query = "avg(${var.memory_limits_low_evaluation_period}):max:system.mem.total{${local.memory_limits_low_filter}} by {host,kube_cluster_name} - max:kubernetes.memory.limits{${local.memory_limits_low_filter}} by {host,kube_cluster_name} < ${var.memory_limits_low_critical}"
alert_message = "Kubernetes cluster memory room for limits is too low"
recovery_message = "Kubernetes cluster memory limits has recovered"

2 changes: 1 addition & 1 deletion memory-requests-low.tf
@@ -10,7 +10,7 @@ module "memory_requests_low" {
version = "1.0.0"

name = "Available Memory for Requests Low"
query = "avg(${var.memory_requests_low_evaluation_period}):max:system.mem.total{${local.memory_requests_low_filter}} by {host,cluster_name} - max:kubernetes.memory.requests{${local.memory_requests_low_filter}} by {host,cluster_name} < ${var.memory_requests_low_critical}"
query = "avg(${var.memory_requests_low_evaluation_period}):max:system.mem.total{${local.memory_requests_low_filter}} by {host,kube_cluster_name} - max:kubernetes.memory.requests{${local.memory_requests_low_filter}} by {host,kube_cluster_name} < ${var.memory_requests_low_critical}"
alert_message = "Total memory available for requests on {{ host }} is low ({{value}})"
recovery_message = "Total memory available for requests on {{ host }} has recovered ({{value}})"

7 changes: 6 additions & 1 deletion module_description.md
@@ -1,3 +1,8 @@
This module mainly checks on Kubernetes resource levels and cluster health.
System level monitoring can best be implemented with the [system module](https://github.com/kabisa/terraform-datadog-system).
Docker/Container level monitoring can best be implemented with the [docker module](https://github.com/kabisa/terraform-datadog-docker-container).

# Recent changes:

- switch from kubernetes_state to kubernetes_state_core as a default https://docs.datadoghq.com/integrations/kubernetes_state_core/?tab=helm
- upgrade provider to ~> 3.12
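Because the monitors now expect kubernetes_state.* metrics from the kubernetes_state_core check (with kube_-prefixed tags such as kube_cluster_name), the cluster's Datadog Agent deployment must have that check enabled. A sketch of doing so through Terraform's helm provider, assuming the official Datadog Helm chart; verify the value names against the chart version in use:

resource "helm_release" "datadog" {
  name       = "datadog"
  repository = "https://helm.datadoghq.com"
  chart      = "datadog"
  namespace  = "datadog"

  set {
    name  = "datadog.kubeStateMetricsCore.enabled" # enable the kubernetes_state_core check
    value = "true"
  }

  set {
    name  = "datadog.kubeStateMetricsEnabled" # turn off the legacy kubernetes_state check
    value = "false"
  }

  # API/app keys and the rest of the chart configuration are omitted from this sketch.
}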
2 changes: 1 addition & 1 deletion network-unavailable.tf
@@ -10,7 +10,7 @@ module "network_unavailable" {
version = "1.0.0"

name = "Nodes with Network Unavailable"
query = "avg(${var.network_unavailable_evaluation_period}):max:kubernetes_state.nodes.by_condition{${local.network_unavailable_filter} AND condition:networkunavailable AND (status:true OR status:unknown)} by {cluster_name,host} > ${var.network_unavailable_critical}"
query = "avg(${var.network_unavailable_evaluation_period}):max:kubernetes_state.node.by_condition{${local.network_unavailable_filter} AND condition:networkunavailable AND (status:true OR status:unknown)} by {kube_cluster_name,host} > ${var.network_unavailable_critical}"
alert_message = "Kubernetes cluster node {{node}} has no network. Meaning it is not accessible"
recovery_message = "Kubernetes cluster node {{node}} has come back on the network"

2 changes: 1 addition & 1 deletion node-diskpressure.tf
@@ -10,7 +10,7 @@ module "node_diskpressure" {
version = "1.0.0"

name = "Nodes with Diskpressure"
query = "avg(${var.node_diskpressure_evaluation_period}):max:kubernetes_state.nodes.by_condition{${local.node_diskpressure_filter} AND condition:diskpressure AND (status:true OR status:unknown)} by {cluster_name,host} > ${var.node_diskpressure_critical}"
query = "avg(${var.node_diskpressure_evaluation_period}):max:kubernetes_state.node.by_condition{${local.node_diskpressure_filter} AND condition:diskpressure AND (status:true OR status:unknown)} by {kube_cluster_name,host} > ${var.node_diskpressure_critical}"
alert_message = "Kubernetes cluster node {{node}} has diskpressure. Meaning it is low on disk space (Logging, emptydir volumes, caching, etc)"
recovery_message = "Kubernetes cluster node {{node}} no longer has problems with DiskPressure."

2 changes: 1 addition & 1 deletion node-memory-used-percent.tf
@@ -10,7 +10,7 @@ module "node_memory_used_percent" {
version = "1.0.0"

name = "Memory Used Percent"
query = "avg(${var.node_memory_used_percent_evaluation_period}):( 100 * max:kubernetes.memory.usage{${local.node_memory_used_percent_filter}} by {host,cluster_name} ) / max:system.mem.total{${local.node_memory_used_percent_filter}} by {host,cluster_name} > ${var.node_memory_used_percent_critical}"
query = "avg(${var.node_memory_used_percent_evaluation_period}):( 100 * max:kubernetes.memory.usage{${local.node_memory_used_percent_filter}} by {host,kube_cluster_name} ) / max:system.mem.total{${local.node_memory_used_percent_filter}} by {host,kube_cluster_name} > ${var.node_memory_used_percent_critical}"
alert_message = "Available memory on ${var.service} Node {{host.name}} has dropped below {{threshold}} and has {{value}}% available"
recovery_message = "Available memory on ${var.service} Node {{host.name}} has recovered {{value}}%"

2 changes: 1 addition & 1 deletion node-memorypressure.tf
@@ -10,7 +10,7 @@ module "node_memorypressure" {
version = "1.0.0"

name = "Nodes with Memorypressure"
query = "avg(${var.node_memorypressure_evaluation_period}):max:kubernetes_state.nodes.by_condition{${local.node_memorypressure_filter} AND condition:memorypressure AND (status:true OR status:unknown)} by {cluster_name,host} > ${var.node_memorypressure_critical}"
query = "avg(${var.node_memorypressure_evaluation_period}):max:kubernetes_state.node.by_condition{${local.node_memorypressure_filter} AND condition:memorypressure AND (status:true OR status:unknown)} by {kube_cluster_name,host} > ${var.node_memorypressure_critical}"
alert_message = "Kubernetes cluster node {{node}} has memorypressure. Meaning it is low on memory"
recovery_message = "Kubernetes cluster node {{node}} no longer has Memory Pressure."

2 changes: 1 addition & 1 deletion node-ready.tf
@@ -10,7 +10,7 @@ module "node_ready" {
version = "1.0.0"

name = "Node Not Ready"
query = "avg(${var.node_ready_evaluation_period}):count_nonzero(sum:kubernetes_state.nodes.by_condition{${local.node_ready_filter} AND (NOT condition:ready) AND (status:true OR status:unknown)} by {cluster_name,host}) > ${var.node_ready_critical}"
query = "avg(${var.node_ready_evaluation_period}):count_nonzero(sum:kubernetes_state.node.by_condition{${local.node_ready_filter} AND (NOT condition:ready) AND (status:true OR status:unknown)} by {kube_cluster_name,host}) > ${var.node_ready_critical}"
alert_message = "Kubernetes cluster node {{host}} is not ready."
recovery_message = "Kubernetes cluster node {{host}} is ready again."
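As with the other node-condition monitors, the metric moves from kubernetes_state.nodes.by_condition to kubernetes_state.node.by_condition. Rendered with placeholder values (last_5m, kube_cluster_name:production, threshold 0), the new query reads roughly:

avg(last_5m):count_nonzero(sum:kubernetes_state.node.by_condition{kube_cluster_name:production AND (NOT condition:ready) AND (status:true OR status:unknown)} by {kube_cluster_name,host}) > 0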

Diffs for the remaining changed files are not shown.