From 0f5c871208ce449e03860a116e8771cb6ff23661 Mon Sep 17 00:00:00 2001 From: Chris Werner Rau Date: Thu, 30 Nov 2023 17:47:32 +0100 Subject: [PATCH] feat(base-cluster/monitoring): add pagerduty alertmanager receiver feat(base-cluster/monitoring): add deadMansSwitch integration --- charts/base-cluster/README.md.gotmpl | 6 +- .../base-cluster/ci/artifacthub-values.yaml | 10 +++ .../ci/deadmansswitch-values.yaml | 5 ++ charts/base-cluster/ci/pagerduty-values.yaml | 7 ++ charts/base-cluster/templates/_images.tpl | 4 ++ .../deadMansSwitch/ciliumNetworkPolicy.yaml | 37 ++++++++++ .../monitoring/deadMansSwitch/cronjob.yaml | 69 +++++++++++++++++++ .../deadMansSwitch/hook-secret.yaml | 15 ++++ .../deadMansSwitch/registration.yaml | 69 +++++++++++++++++++ .../monitoring/deadMansSwitch/secret.yaml | 13 ++++ .../monitoring/deadMansSwitch/unregister.yaml | 68 ++++++++++++++++++ .../_alertmanager-config.yaml | 43 ++++++++++-- .../kube-prometheus-stack/oauth-proxy.yaml | 2 +- charts/base-cluster/values.schema.json | 42 ++++++++++- charts/base-cluster/values.yaml | 19 ++++- 15 files changed, 395 insertions(+), 14 deletions(-) create mode 100644 charts/base-cluster/ci/deadmansswitch-values.yaml create mode 100644 charts/base-cluster/ci/pagerduty-values.yaml create mode 100644 charts/base-cluster/templates/monitoring/deadMansSwitch/ciliumNetworkPolicy.yaml create mode 100644 charts/base-cluster/templates/monitoring/deadMansSwitch/cronjob.yaml create mode 100644 charts/base-cluster/templates/monitoring/deadMansSwitch/hook-secret.yaml create mode 100644 charts/base-cluster/templates/monitoring/deadMansSwitch/registration.yaml create mode 100644 charts/base-cluster/templates/monitoring/deadMansSwitch/secret.yaml create mode 100644 charts/base-cluster/templates/monitoring/deadMansSwitch/unregister.yaml diff --git a/charts/base-cluster/README.md.gotmpl b/charts/base-cluster/README.md.gotmpl index 70d8731c0..893d4dd3f 100644 --- a/charts/base-cluster/README.md.gotmpl +++ 
b/charts/base-cluster/README.md.gotmpl @@ -172,7 +172,7 @@ to present the results. #### Sub-Component [tracing](#monitoring_tracing) The included [OpenTelemetry Collector](https://opentelemetry.io/docs/collector/) -collects traces via otlp-grpc on every node on the host IP. +collects traces via otlp-grpc on every node via the `open-telemetry-collector-opentelemetry-collector.monitoring` service. These traces are then sent to [Grafana Tempo](https://grafana.com/oss/tempo/), which is included as a datasource in Grafana by default. @@ -187,9 +187,7 @@ spec: containers: - env: - name: OTEL_HOST <- change this to your framework's environment variable - valueFrom: - fieldRef: - fieldPath: status.hostIP + value: open-telemetry-collector-opentelemetry-collector.monitoring - name: OTEL_PORT value: "4317" ``` diff --git a/charts/base-cluster/ci/artifacthub-values.yaml b/charts/base-cluster/ci/artifacthub-values.yaml index 66304c3a5..9025befe2 100644 --- a/charts/base-cluster/ci/artifacthub-values.yaml +++ b/charts/base-cluster/ci/artifacthub-values.yaml @@ -17,3 +17,13 @@ monitoring: adminPassword: test tracing: enabled: true + deadMansSwitch: + enabled: true + pingKey: PING_KEY + apiKey: API_KEY + prometheus: + alertmanager: + receivers: + pagerduty: + enabled: true + integrationKey: INTEGRATION_KEY diff --git a/charts/base-cluster/ci/deadmansswitch-values.yaml b/charts/base-cluster/ci/deadmansswitch-values.yaml new file mode 100644 index 000000000..5865bfdbc --- /dev/null +++ b/charts/base-cluster/ci/deadmansswitch-values.yaml @@ -0,0 +1,5 @@ +monitoring: + deadMansSwitch: + enabled: true + pingKey: PING_KEY + apiKey: API_KEY diff --git a/charts/base-cluster/ci/pagerduty-values.yaml b/charts/base-cluster/ci/pagerduty-values.yaml new file mode 100644 index 000000000..1eecc882d --- /dev/null +++ b/charts/base-cluster/ci/pagerduty-values.yaml @@ -0,0 +1,7 @@ +monitoring: + prometheus: + alertmanager: + receivers: + pagerduty: + enabled: true + integrationKey: INTEGRATION_KEY 
diff --git a/charts/base-cluster/templates/_images.tpl b/charts/base-cluster/templates/_images.tpl index 8841b91fc..c1266a7d0 100644 --- a/charts/base-cluster/templates/_images.tpl +++ b/charts/base-cluster/templates/_images.tpl @@ -2,6 +2,10 @@ {{- include "common.images.image" (dict "imageRoot" .Values.global.kubectl.image "global" .Values.global) -}} {{- end -}} +{{- define "base-cluster.curl.image" -}} +{{- include "common.images.image" (dict "imageRoot" .Values.global.curl.image "global" .Values.global) -}} +{{- end -}} + {{- define "base-cluster.flux.image" -}} {{- include "common.images.image" (dict "imageRoot" .Values.global.flux.image "global" .Values.global) -}} {{- end -}} diff --git a/charts/base-cluster/templates/monitoring/deadMansSwitch/ciliumNetworkPolicy.yaml b/charts/base-cluster/templates/monitoring/deadMansSwitch/ciliumNetworkPolicy.yaml new file mode 100644 index 000000000..405401f8c --- /dev/null +++ b/charts/base-cluster/templates/monitoring/deadMansSwitch/ciliumNetworkPolicy.yaml @@ -0,0 +1,37 @@ +{{- if eq (include "common.networkPolicy.type" .) 
"cilium" }} +apiVersion: cilium.io/v2 +kind: CiliumNetworkPolicy +metadata: + name: dead-mans-switch + namespace: monitoring + labels: {{- include "common.labels.standard" $ | nindent 4 }} + app.kubernetes.io/component: dead-mans-switch +spec: + endpointSelector: + matchLabels: {{- include "common.labels.matchLabels" $ | nindent 6 }} + app.kubernetes.io/component: dead-mans-switch + ingress: + - { } + egress: + - toFQDNs: + - matchName: hc-ping.com + - matchName: healthchecks.io + toPorts: + - ports: + - port: "443" + protocol: TCP + - toServices: + - k8sServiceSelector: + selector: + matchLabels: + k8s-app: kube-dns + namespace: kube-system + toPorts: + - ports: + - port: "53" + protocol: UDP + rules: + dns: + - matchName: hc-ping.com + - matchName: healthchecks.io +{{- end }} diff --git a/charts/base-cluster/templates/monitoring/deadMansSwitch/cronjob.yaml b/charts/base-cluster/templates/monitoring/deadMansSwitch/cronjob.yaml new file mode 100644 index 000000000..9b9adcddb --- /dev/null +++ b/charts/base-cluster/templates/monitoring/deadMansSwitch/cronjob.yaml @@ -0,0 +1,69 @@ +{{- if .Values.monitoring.deadMansSwitch.enabled }} +{{- if false }} +apiVersion: batch/v1 +{{- else }} +apiVersion: {{ include "common.capabilities.cronjob.apiVersion" . }} +{{- end }} +kind: CronJob +metadata: + name: dead-mans-switch + namespace: monitoring + labels: {{- include "common.labels.standard" $ | nindent 4 }} + app.kubernetes.io/component: dead-mans-switch +spec: + concurrencyPolicy: Forbid + startingDeadlineSeconds: 50 + schedule: "* * * * *" # Every minute + jobTemplate: + spec: + template: + metadata: + labels: {{- include "common.labels.standard" $ | nindent 12 }} + app.kubernetes.io/component: dead-mans-switch + spec: + securityContext: + runAsGroup: 1000 + runAsUser: 1000 + runAsNonRoot: true + fsGroup: 1000 + automountServiceAccountToken: false + restartPolicy: OnFailure + containers: + - name: watchdog + image: {{ template "base-cluster.curl.image" . 
}} + {{- if .Values.global.curl.image.repository | contains "@" }} + imagePullPolicy: IfNotPresent + {{- else }} + imagePullPolicy: Always + {{- end }} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + privileged: false + readOnlyRootFilesystem: true + env: + - name: PING_KEY + valueFrom: + secretKeyRef: + name: dead-mans-switch + key: pingKey + command: + - curl + - --silent + - --show-error + - --fail + - --retry + - '5' + - --max-time + - '30' + - {{ printf "https://hc-ping.com/$(PING_KEY)/k8s-cluster-%s-%s-scheduling" (.Values.global.baseDomain | replace "." "-") .Values.global.clusterName }} + resources: + requests: + cpu: 100m + memory: 16Mi + limits: + cpu: 100m + memory: 16Mi +{{- end }} diff --git a/charts/base-cluster/templates/monitoring/deadMansSwitch/hook-secret.yaml b/charts/base-cluster/templates/monitoring/deadMansSwitch/hook-secret.yaml new file mode 100644 index 000000000..2dc07a0c7 --- /dev/null +++ b/charts/base-cluster/templates/monitoring/deadMansSwitch/hook-secret.yaml @@ -0,0 +1,15 @@ +{{- if .Values.monitoring.deadMansSwitch.enabled -}} + {{- $secret := include (print .Template.BasePath "/monitoring/deadMansSwitch/secret.yaml") . 
| fromYaml -}} + {{- $secret = mustMerge ( + dict "metadata" ( + dict "annotations" (dict + "helm.sh/hook" "pre-install,pre-upgrade,pre-delete" + "helm.sh/hook-delete-policy" "before-hook-creation,hook-succeeded,hook-failed" + ) + "namespace" .Release.Namespace + ) + ) + $secret + -}} + {{- $secret | toYaml -}} +{{- end -}} diff --git a/charts/base-cluster/templates/monitoring/deadMansSwitch/registration.yaml b/charts/base-cluster/templates/monitoring/deadMansSwitch/registration.yaml new file mode 100644 index 000000000..b2c1540a5 --- /dev/null +++ b/charts/base-cluster/templates/monitoring/deadMansSwitch/registration.yaml @@ -0,0 +1,69 @@ +{{- if .Values.monitoring.deadMansSwitch.enabled }} +apiVersion: batch/v1 +kind: Job +metadata: + name: dead-mans-switch-registration + namespace: {{ $.Release.Namespace }} + labels: {{- include "common.labels.standard" $ | nindent 4 }} + app.kubernetes.io/component: dead-mans-switch + annotations: + helm.sh/hook: pre-install,pre-upgrade + helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded,hook-failed +spec: + template: + spec: + securityContext: + runAsGroup: 1000 + runAsUser: 1000 + runAsNonRoot: true + fsGroup: 1000 + automountServiceAccountToken: false + restartPolicy: OnFailure + containers: + - name: register + image: {{ include "base-cluster.curl.image" . 
}} + {{- if .Values.global.curl.image.repository | contains "@" }} + imagePullPolicy: IfNotPresent + {{- else }} + imagePullPolicy: Always + {{- end }} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + privileged: false + readOnlyRootFilesystem: true + env: + - name: PING_KEY + valueFrom: + secretKeyRef: + name: dead-mans-switch + key: pingKey + - name: API_KEY + valueFrom: + secretKeyRef: + name: dead-mans-switch + key: apiKey + command: + - ash + - -e + - -c + - | + set -o pipefail + + set -x + function createCheck() { + local checkName="$1" + local data='{"name": "'"$checkName"'", "slug": "'"$checkName"'", "tags": "k8s {{ .Values.global.clusterName -}}", "timeout": 120, "grace": 60, "channels": "{{- .Values.global.clusterName -}}", "unique": ["name"]}' + + curl --silent --show-error --fail --retry 5 --max-time 30 --header "X-Api-Key: $API_KEY" https://healthchecks.io/api/v3/checks/ --data "$data" + curl --silent --show-error --fail --retry 5 --max-time 30 "https://hc-ping.com/$PING_KEY/$checkName" + } + + checkName={{- printf "k8s-cluster-%s-%s" (.Values.global.baseDomain | replace "." 
"-") .Values.global.clusterName | quote }} + {{- if .Values.monitoring.prometheus.enabled }} + createCheck "$checkName-monitoring" + {{- end }} + createCheck "$checkName-scheduling" +{{- end }} diff --git a/charts/base-cluster/templates/monitoring/deadMansSwitch/secret.yaml b/charts/base-cluster/templates/monitoring/deadMansSwitch/secret.yaml new file mode 100644 index 000000000..a574b6e97 --- /dev/null +++ b/charts/base-cluster/templates/monitoring/deadMansSwitch/secret.yaml @@ -0,0 +1,13 @@ +{{- if .Values.monitoring.deadMansSwitch.enabled -}} +apiVersion: v1 +kind: Secret +metadata: + name: dead-mans-switch + namespace: monitoring + labels: {{- include "common.labels.standard" $ | nindent 4 }} + app.kubernetes.io/component: dead-mans-switch +type: Opaque +stringData: + apiKey: {{ .Values.monitoring.deadMansSwitch.apiKey | required "You need to provide the `.Values.monitoring.deadMansSwitch.apiKey`" | quote }} + pingKey: {{ .Values.monitoring.deadMansSwitch.pingKey | required "You need to provide the `.Values.monitoring.deadMansSwitch.pingKey`" | quote }} +{{- end -}} diff --git a/charts/base-cluster/templates/monitoring/deadMansSwitch/unregister.yaml b/charts/base-cluster/templates/monitoring/deadMansSwitch/unregister.yaml new file mode 100644 index 000000000..acf8b5460 --- /dev/null +++ b/charts/base-cluster/templates/monitoring/deadMansSwitch/unregister.yaml @@ -0,0 +1,68 @@ +{{- if lookup "v1" "Secret" "monitoring" "dead-mans-switch" }} +apiVersion: batch/v1 +kind: Job +metadata: + name: dead-mans-switch-unregister + namespace: {{ $.Release.Namespace }} + labels: {{- include "common.labels.standard" $ | nindent 4 }} + app.kubernetes.io/component: dead-mans-switch + annotations: + helm.sh/hook: pre-delete + helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded,hook-failed +spec: + template: + spec: + securityContext: + runAsGroup: 1000 + runAsUser: 1000 + runAsNonRoot: true + fsGroup: 1000 + automountServiceAccountToken: false + restartPolicy: 
OnFailure + containers: + - name: unregister + image: {{ include "base-cluster.curl.image" . }} + {{- if .Values.global.curl.image.repository | contains "@" }} + imagePullPolicy: IfNotPresent + {{- else }} + imagePullPolicy: Always + {{- end }} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + privileged: false + readOnlyRootFilesystem: true + env: + - name: API_KEY + valueFrom: + secretKeyRef: + name: dead-mans-switch + key: apiKey + command: + - ash + - -e + - -c + - | + set -o pipefail + + set -x + function deleteCheck() { + local checkName="$1" + local existingChecks + local existingCheckUUID + existingChecks="$(curl --silent --fail --retry 5 --max-time 30 --header "X-Api-Key: $API_KEY" https://healthchecks.io/api/v3/checks/)" + existingCheckUUID="$(jq -r ".checks[] | select(.name == \"$checkName\") | .ping_url | split(\"/\") | last" <<< "$existingChecks")" + + if curl --silent --show-error --fail --retry 5 --max-time 30 --header "X-Api-Key: $API_KEY" "https://healthchecks.io/api/v3/checks/$existingCheckUUID" > /dev/null; then + curl --silent --show-error --fail --retry 5 --max-time 30 --header "X-Api-Key: $API_KEY" "https://healthchecks.io/api/v3/checks/$existingCheckUUID" --request DELETE + fi + } + + checkName={{- printf "k8s-cluster-%s-%s" (.Values.global.baseDomain | replace "." 
"-") .Values.global.clusterName | quote }} + {{- if .Values.monitoring.prometheus.enabled }} + deleteCheck "$checkName-monitoring" + {{- end }} + deleteCheck "$checkName-scheduling" +{{- end }} diff --git a/charts/base-cluster/templates/monitoring/kube-prometheus-stack/_alertmanager-config.yaml b/charts/base-cluster/templates/monitoring/kube-prometheus-stack/_alertmanager-config.yaml index 73a657801..bd0e7a4a9 100644 --- a/charts/base-cluster/templates/monitoring/kube-prometheus-stack/_alertmanager-config.yaml +++ b/charts/base-cluster/templates/monitoring/kube-prometheus-stack/_alertmanager-config.yaml @@ -1,8 +1,10 @@ {{- define "base-cluster.prometheus-stack.alertmanager.config" -}} -enabled: false # TODO dependent on routes/receivers -# TODO routes -# TODO receivers -{{- if false }} +{{- $enabled := false -}} +{{- range $_, $receiver := .Values.monitoring.prometheus.alertmanager.receivers -}} + {{- $enabled = or $enabled $receiver.enabled -}} +{{- end -}} +enabled: {{ $enabled }} +{{- if $enabled }} podDisruptionBudget: enabled: true {{- if include "base-cluster.monitoring.unauthenticated-ingress.enabled" (dict "name" "alertmanager" "context" .) 
}} @@ -26,5 +28,38 @@ alertmanagerSpec: storage: {{ .Values.monitoring.prometheus.alertmanager.persistence.size }} alertmanagerConfigSelector: matchLabels: {{- .Values.monitoring.labels | toYaml | nindent 6 }} +config: + {{- if .Values.monitoring.prometheus.alertmanager.receivers.pagerduty.enabled }} + global: + pagerduty_url: {{ .Values.monitoring.prometheus.alertmanager.receivers.pagerduty.url | required "You need to provide the `.Values.monitoring.prometheus.alertmanager.receivers.pagerduty.url`" | quote }} + {{- end }} + receivers: + {{- if .Values.monitoring.prometheus.alertmanager.receivers.pagerduty.enabled }} + - name: pagerduty + pagerduty_configs: + - routing_key: {{ .Values.monitoring.prometheus.alertmanager.receivers.pagerduty.integrationKey | required "You need to provide the `.Values.monitoring.prometheus.alertmanager.receivers.pagerduty.integrationKey`" }} + send_resolved: true + http_config: + follow_redirects: true + {{- end }} + {{- if and .Values.monitoring.deadMansSwitch.enabled .Values.global.baseDomain .Values.global.clusterName }} + - name: healthchecks.io + webhook_configs: + - url: {{ printf "https://hc-ping.com/%s/k8s-cluster-%s-%s-monitoring" .Values.monitoring.deadMansSwitch.pingKey (.Values.global.baseDomain | replace "." 
"-") .Values.global.clusterName }} + send_resolved: false + {{- end }} + - name: "null" + route: + {{- if .Values.monitoring.prometheus.alertmanager.receivers.pagerduty.enabled }} + receiver: pagerduty + {{- end }} + routes: + {{- if .Values.monitoring.deadMansSwitch.enabled }} + - match: + alertname: Watchdog + receiver: healthchecks.io + group_interval: 1m + repeat_interval: 1m + {{- end }} {{- end }} {{- end -}} diff --git a/charts/base-cluster/templates/monitoring/kube-prometheus-stack/oauth-proxy.yaml b/charts/base-cluster/templates/monitoring/kube-prometheus-stack/oauth-proxy.yaml index 2dca7d36f..bb2042c9e 100644 --- a/charts/base-cluster/templates/monitoring/kube-prometheus-stack/oauth-proxy.yaml +++ b/charts/base-cluster/templates/monitoring/kube-prometheus-stack/oauth-proxy.yaml @@ -3,7 +3,7 @@ {{- if include "base-cluster.monitoring.authenticated-ingress.enabled" (dict "name" "prometheus" "context" .) -}} {{- $backends = append $backends (dict "host" "prometheus" "port" 9090) -}} {{- end -}} -{{- if and (include "base-cluster.monitoring.authenticated-ingress.enabled" (dict "name" "alertmanager" "context" .)) false -}} +{{- if include "base-cluster.monitoring.authenticated-ingress.enabled" (dict "name" "alertmanager" "context" .) 
-}} {{- $backends = append $backends (dict "host" "alertmanager" "port" 9093) -}} {{- end -}} {{- range $backend := $backends }} diff --git a/charts/base-cluster/values.schema.json b/charts/base-cluster/values.schema.json index 88b64461f..287ed2971 100644 --- a/charts/base-cluster/values.schema.json +++ b/charts/base-cluster/values.schema.json @@ -77,9 +77,9 @@ }, "additionalProperties": false }, - "pause": { + "curl": { "type": "object", - "description": "Image to be used for pause containers", + "description": "Image with `curl` binary", "properties": { "image": { "$ref": "#/$defs/image" @@ -381,6 +381,23 @@ "type": "string" } }, + "deadMansSwitch": { + "type": "object", + "description": "This needs `.global.clusterName` to be set up as an integration in healthchecks.io. Also, `.global.baseDomain` has to be set.", + "properties": { + "enabled": { + "type": "boolean" + }, + "apiKey": { + "type": "string", + "description": "Used for registration and unregistration" + }, + "pingKey": { + "type": "string" + } + }, + "additionalProperties": false + }, "prometheus": { "type": "object", "properties": { @@ -458,6 +475,27 @@ "alertmanager": { "type": "object", "properties": { + "receivers": { + "type": "object", + "properties": { + "pagerduty": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + }, + "url": { + "type": "string" + }, + "integrationKey": { + "type": "string" + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false + }, "ingress": { "$ref": "#/$defs/toolIngress" }, diff --git a/charts/base-cluster/values.yaml b/charts/base-cluster/values.yaml index c75f38d48..9a1bcca2a 100644 --- a/charts/base-cluster/values.yaml +++ b/charts/base-cluster/values.yaml @@ -68,7 +68,11 @@ global: registry: docker.io repository: vladgh/gpg digest: sha256:8514acc9c94607895e3dea724bd85d885252666212567f6632d2654580539ed3 - + curl: + image: + registry: docker.io + repository: curlimages/curl + tag: 8.4.0 networkPolicy: type: none 
dnsLabels: @@ -149,7 +153,7 @@ global: open-telemetry: url: https://open-telemetry.github.io/opentelemetry-helm-charts charts: - opentelemetry-collector: 0.x.x + opentelemetry-collector: 0.x.x condition: "{{ and .Values.monitoring.tracing.enabled .Values.monitoring.prometheus.enabled }}" authentication: config: @@ -212,6 +216,10 @@ monitoring: storageClassMapping: teutostack-hdd: 0.002 teutostack-ssd: 0.0067 + deadMansSwitch: + enabled: false + apiKey: "" + pingKey: "" prometheus: enabled: true replicas: 2 @@ -269,10 +277,15 @@ monitoring: host: prometheus customDomain: "" alertmanager: + receivers: + pagerduty: + enabled: false + integrationKey: "" + url: https://events.pagerduty.com/v2/enqueue ingress: host: alertmanager customDomain: "" - replicas: 1 + replicas: 3 retentionDuration: 120h persistence: storageClass: ""