From df26b2f1f879e5c586d36afec8dab62c07cd703d Mon Sep 17 00:00:00 2001 From: Nikodemas Tuckus Date: Tue, 28 Nov 2023 11:14:07 +0100 Subject: [PATCH 1/5] Add scripts, update manifests --- kubernetes/monitoring/deploy-mon.sh | 314 ++++++++++++++++++ kubernetes/monitoring/ingress/ingress.yaml | 2 +- ...nder.yaml => cmsmonit-cluster-cinder.yaml} | 16 +- 3 files changed, 323 insertions(+), 9 deletions(-) create mode 100755 kubernetes/monitoring/deploy-mon.sh rename kubernetes/monitoring/storages/{cmsmonitnew-cluster-cinder.yaml => cmsmonit-cluster-cinder.yaml} (83%) diff --git a/kubernetes/monitoring/deploy-mon.sh b/kubernetes/monitoring/deploy-mon.sh new file mode 100755 index 000000000..65fd032de --- /dev/null +++ b/kubernetes/monitoring/deploy-mon.sh @@ -0,0 +1,314 @@ +#!/bin/bash +# shellcheck disable=SC2181 +set -e +##H Usage: deploy-ha.sh ACTION +##H +##H Examples: +##H --- If CMSKubernetes, cmsmon-configs and secrets repos are in same directory --- +##H deploy-ha.sh status +##H deploy-ha.sh deploy-secrets +##H deploy-ha.sh deploy-all +##H deploy-ha.sh clean-services +##H --- Else --- +##H export SECRETS_D=$SOMEDIR/secrets; export CONFIGS_D=$SOMEDIR/cmsmon-configs; deploy-ha.sh ha1 status +##H +##H Arguments: ACTION should be one of the defined actions +##H Attention: this script depends on deploy-secrets.sh +##H +##H Actions: +##H help show this help +##H clean-all cleanup all services secrets storages cronjobs accounts +##H clean-services cleanup services +##H clean-secrets cleanup secrets +##H clean-storages cleanup storages +##H status check status of all cluster +##H test perform integration tests using VictoriaMetrics +##H deploy-all deploy everything +##H deploy-secrets deploy secrets +##H deploy-services deploy services +##H +##H Environments: +##H SECRETS_D defines secrets repository local path. (default CMSKubernetes parent dir) +##H CONFIGS_D defines cmsmon-configs repository local path. 
(default CMSKubernetes parent dir) +##H +##H READ the DOC: https://cmsmonit-docs.web.cern.ch/k8s/cluster_upgrades/#ha1 +##H + +unset script_dir action cluster sdir cdir deploy_secrets_sh +script_dir="$(cd "$(dirname "$0")" && pwd)" + +# help definition +if [ "$1" == "-h" ] || [ "$1" == "-help" ] || [ "$1" == "--help" ] || [ "$1" == "help" ] || [ "$1" == "" ]; then + grep "^##H" <"$0" | sed -e "s,##H,,g" + exit 1 +fi + +action=$1 + +sdir=${SECRETS_D:-"${script_dir}/../../../secrets"} +cdir=${CONFIGS_D:-"${script_dir}/../../../cmsmon-configs"} + +# deploy-secrets.sh temporary file +deploy_secrets_sh="$script_dir"/__temp-deploy-secrets__.sh + +if [[ -z $ha || -z $action || $ha != ha*[0-9] ]]; then + echo "ha or action is not defined. ha:${ha}, action:${action}. Exiting with help message..." + grep "^##H" <"$0" | sed -e "s,##H,,g" + exit 1 +fi + +echo "will continue with following values:" +echo "OS_PROJECT_NAME:${OS_PROJECT_NAME}, action: ${action}, secrets:${sdir}, cmsmon-configs:${cdir}" + +# ------------------------------------------ CONFIG CHECKS ---------------------------------------- +# Check prometheus configs +function check_configs_prometheus() { + if [ ! -f "$cdir"/prometheus/ha/prometheus.yaml ]; then + echo "Please provide ${cdir}/prometheus/prometheus.yaml file" + exit 1 + fi + # Prometheus conf should be in same directory with rules to check correctly + cp "$cdir"/prometheus/ha/prometheus.yaml "$cdir"/prometheus/__prometheus__.yaml + /cvmfs/cms.cern.ch/cmsmon/promtool check config "$cdir"/prometheus/__prometheus__.yaml + if [ $? -ne 0 ]; then + echo "Fail to validate prometheus config file" + exit 1 + fi + /cvmfs/cms.cern.ch/cmsmon/promtool check rules "$cdir"/prometheus/*.rules + if [ $? -ne 0 ]; then + echo "Fail to validate prometheus rules" + exit 1 + fi + # Delete temp file + rm "$cdir"/prometheus/__prometheus__.yaml +} + +# Check alertmanager configs +function check_configs_am() { + if [ ! 
-f "$cdir"/alertmanager/alertmanager.yaml ]; then + echo "Please provide ${cdir}/alertmanager/alertmanager.yaml file" + exit 1 + fi + /cvmfs/cms.cern.ch/cmsmon/amtool check-config "$cdir"/alertmanager/alertmanager.yaml + if [ $? -ne 0 ]; then + echo "Fail to validate alertmanager config file" + exit 1 + fi + /cvmfs/cms.cern.ch/cmsmon/amtool config routes show --config.file="${cdir}"/alertmanager/alertmanager.yaml +} + +# Check status of the cluster +function cluster_check() { + echo -e "\n*** check secrets" + kubectl get secrets -A | grep -E "default *|http *|alerts *" | grep Opaque + echo -e "\n*** check svc" + kubectl get svc -A | grep -E "default *|http *|alerts *" + echo -e "\n*** node status" + kubectl top node + echo -e "\n*** pods status" + kubectl top pods --sort-by=memory -A | grep -E "default *|http *|alerts *" +} + +# Test VictoriaMetrics +function test_vm() { + local url="http://cms-monitoring.cern.ch" + local purl=${url}:30422/api/put + local rurl=${url}:30428/api/v1/export + echo "put data into $purl" + curl -H 'Content-Type: application/json' -d '{"metric":"cms.test.exitCode", "value":1, "tags":{"exitCode": "8021", "site":"T2_US", "task":"test", "log":"/path/file.log"}}' "$purl" + echo "get data from $rurl" + curl -G "$rurl" -d 'match[]=cms.test.exitCode' +} +# ================================================================================================= + +# -------------------------------------- PREPARE deploy-secrets.sh ------------------------------- +# Create temporary deploy-secrets.sh with correct sdir and cdir +function create_temp_deploy_secrets_sh() { + echo "secrets dir: ${sdir}, cmsmon-configs dir: ${cdir}" + # + if [ ! -e "$script_dir"/deploy-secrets.sh ] || [ ! -d "$sdir" ] || [ ! 
-d "$cdir" ]; then + echo "Please check if [deploy-secrets.sh:${script_dir}], [secrets:${sdir}], [cmsmon-configs:${cdir}] exist" + exit 1 + fi + # + sed -e "s,cdir=cmsmon-configs.*,cdir=${cdir},g" \ + -e "s,sdir=secrets.*,sdir=${sdir},g" \ + "$script_dir"/deploy-secrets.sh >"$deploy_secrets_sh" + chmod +x "$deploy_secrets_sh" +} + +# Delete temporary deploy-secrets.sh +function rm_temp_deploy_secrets_sh() { + rm "$deploy_secrets_sh" +} +# ================================================================================================= + +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +function deploy_secrets() { + create_temp_deploy_secrets_sh + # auth + "$deploy_secrets_sh" auth alertmanager-secrets + "$deploy_secrets_sh" auth cern-certificates + "$deploy_secrets_sh" auth proxy-secrets + "$deploy_secrets_sh" auth robot-secrets + # cpueff + "$deploy_secrets_sh" cpueff cpueff-mongo-secrets + # default + "$deploy_secrets_sh" default alertmanager-secrets + "$deploy_secrets_sh" default karma-secrets + "$deploy_secrets_sh" default prometheus-secrets + "$deploy_secrets_sh" default promxy-secrets + "$deploy_secrets_sh" default proxy-secrets + "$deploy_secrets_sh" default robot-secrets + # http + "$deploy_secrets_sh" http certcheck-secrets + "$deploy_secrets_sh" http es-wma-secrets + "$deploy_secrets_sh" http keytab-secrets + "$deploy_secrets_sh" http krb5cc-secrets + "$deploy_secrets_sh" http proxy-secrets + "$deploy_secrets_sh" http robot-secrets + # + rm_temp_deploy_secrets_sh +} +function clean_secrets() { + # auth + kubectl -n auth --ignore-not-found=true delete secret auth-secrets + kubectl -n auth --ignore-not-found=true delete secret cern-certificates + kubectl -n auth --ignore-not-found=true delete secret proxy-secrets + kubectl -n auth --ignore-not-found=true delete secret robot-secrets + # cpueff + kubectl -n cpueff --ignore-not-found=true delete secret cpueff-mongo-secrets + # default + kubectl -n default 
--ignore-not-found=true delete secret alertmanager-secrets + kubectl -n default --ignore-not-found=true delete secret karma-secrets + kubectl -n default --ignore-not-found=true delete secret prometheus-secrets + kubectl -n default --ignore-not-found=true delete secret promxy-secrets + kubectl -n default --ignore-not-found=true delete secret proxy-secrets + kubectl -n default --ignore-not-found=true delete secret robot-secrets + # http + kubectl -n http --ignore-not-found=true delete secret certcheck-secrets + kubectl -n http --ignore-not-found=true delete secret es-wma-secrets + kubectl -n http --ignore-not-found=true delete secret keytab-secrets + kubectl -n http --ignore-not-found=true delete secret krb5cc-secrets + kubectl -n http --ignore-not-found=true delete secret proxy-secrets + kubectl -n http --ignore-not-found=true delete secret robot-secrets +} +function deploy_services() { + # Fails because of /etc/proxy/proxy tls conf + # auth + kubectl -n auth apply -f services/auth-proxy-server.yaml + # cpueff + kubectl -n cpueff apply -f services/cpueff/cpueff-goweb.yaml + kubectl -n cpueff apply -f services/cpueff/mongo-cpueff.yaml + # default + kubectl -n default apply -f services/httpgo.yaml + kubectl -n default apply -f services/karma.yaml + kubectl -n default apply -f kmon/kube-eagle.yaml + kubectl -n default apply -f services/promxy.yaml + kubectl -n default apply -f services/pushgateway.yaml + # http + find "${script_dir}"/services/ -name "*-exp*.yaml" | awk '{print "kubectl apply -f "$1""}' | /bin/sh +} +function clean_all_services() { + # auth + kubectl -n auth --ignore-not-found=true delete -f services/auth-proxy-server.yaml + # cpueff + kubectl -n cpueff --ignore-not-found=true delete -f services/cpueff/cpueff-goweb.yaml + kubectl -n cpueff --ignore-not-found=true delete -f services/cpueff/mongo-cpueff.yaml + # default + kubectl -n default --ignore-not-found=true delete -f services/alertmanager.yaml + kubectl -n default --ignore-not-found=true delete -f 
services/httpgo.yaml + kubectl -n default --ignore-not-found=true delete -f services/karma.yaml + kubectl -n default --ignore-not-found=true delete -f kmon/kube-eagle.yaml + kubectl -n default --ignore-not-found=true delete -f services/prometheus.yaml + kubectl -n default --ignore-not-found=true delete -f services/promxy.yaml + kubectl -n default --ignore-not-found=true delete -f services/pushgateway.yaml + kubectl -n default --ignore-not-found=true delete -f services/victoria-metrics.yaml + # http + find "${script_dir}"/services/ -name "*-exp*.yaml" | awk '{print "kubectl --ignore-not-found=true delete -f "$1""}' | /bin/sh +} + +function deploy_storage_services() { + # Fails because of /etc/proxy/proxy tls conf + check_configs_prometheus + check_configs_am + # default + kubectl -n default apply -f services/alertmanager.yaml + kubectl -n default apply -f services/prometheus.yaml + kubectl -n default apply -f services/victoria-metrics.yaml +} + +# cluster cronjob deployment +function deploy_cronjobs() { + kubectl -n auth apply -f crons/cron-proxy.yaml + kubectl -n cpueff apply -f cpueff/cpueff-spark.yaml + kubectl -n default apply -f crons/cron-proxy.yaml + kubectl -n http apply -f crons/cron-kerberos.yaml + kubectl -n http apply -f crons/cron-proxy.yaml +} + +function clean_cronjobs() { + kubectl -n auth --ignore-not-found=true delete -f crons/cron-proxy.yaml + kubectl -n cpueff --ignore-not-found=true delete -f cpueff/cpueff-spark.yaml + kubectl -n default --ignore-not-found=true delete -f crons/cron-proxy.yaml + kubectl -n http --ignore-not-found=true delete -f crons/cron-kerberos.yaml + kubectl -n http --ignore-not-found=true delete -f crons/cron-proxy.yaml +} + +# Deploy cinder volumes for default namespace +function deploy_storages() { + kubectl apply -f storages/cmsmonit-cluster-cinder.yaml -n default +} +function clean_storages() { + kubectl delete -f storages/cmsmonit-cluster-cinder.yaml -n default +} +# cluster ingress deployment +deploy_ingress() +{ + # 
add labels for ingress + kubectl get node | grep node | \ + awk '{print "kubectl label node "$1" role=ingress --overwrite"}' | /bin/sh + # deploy ingress controller + kubectl apply -f ingress/ingress.yaml +} +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MAIN ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +namespaces="auth cpueff http " +deploy_all() { + for _ns in $namespaces; do + if ! kubectl get namespace "$_ns" >/dev/null 2>&1; then + kubectl create namespace "$_ns" + fi + done + deploy_secrets + deploy_services +} +clean_all() { + clean_all_services + clean_cronjobs + sleep 10 + clean_secrets + clean_storages + for _ns in $namespaces; do + if kubectl get namespace "$_ns" >/dev/null 2>&1; then + kubectl --ignore-not-found=true delete namespace "$_ns" + fi + done +} +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +# Main routine, perform action requested on command line. +case ${action:-help} in +"deploy-all") deploy_all ;; +"deploy-secrets") deploy_secrets ;; +"deploy-services") deploy_services ;; +"deploy-storages") deploy_storages ;; +"deploy-storage-services") deploy_storage_services ;; +"status") cluster_check ;; +"clean-all") clean_all ;; +"clean-services") clean_all_services ;; +"clean-secrets") clean_secrets ;; +"clean-storages") clean_storages ;; +"test") test_vm ;; +"help") grep "^##H" <"$0" | sed -e "s,##H,,g" ;; +*) grep "^##H" <"$0" | sed -e "s,##H,,g" ;; +esac diff --git a/kubernetes/monitoring/ingress/ingress.yaml b/kubernetes/monitoring/ingress/ingress.yaml index e6880b71c..05b6d7b4b 100644 --- a/kubernetes/monitoring/ingress/ingress.yaml +++ b/kubernetes/monitoring/ingress/ingress.yaml @@ -1,4 +1,4 @@ -apiVersion: extensions/v1beta1 +apiVersion: networking.k8s.io/v1 # apiVersion: networking.k8s.io/v1 kind: Ingress metadata: diff --git a/kubernetes/monitoring/storages/cmsmonitnew-cluster-cinder.yaml b/kubernetes/monitoring/storages/cmsmonit-cluster-cinder.yaml similarity index 83% rename from 
kubernetes/monitoring/storages/cmsmonitnew-cluster-cinder.yaml rename to kubernetes/monitoring/storages/cmsmonit-cluster-cinder.yaml index 648c767dd..d610f8d25 100644 --- a/kubernetes/monitoring/storages/cmsmonitnew-cluster-cinder.yaml +++ b/kubernetes/monitoring/storages/cmsmonit-cluster-cinder.yaml @@ -1,9 +1,9 @@ apiVersion: v1 kind: PersistentVolume metadata: - name: pv-am-volume-claim + name: pv-alertmanager-volume-claim spec: - storageClassName: sc-am-volume-claim + storageClassName: sc-alertmanager-volume-claim accessModes: - ReadWriteOnce capacity: @@ -17,10 +17,10 @@ spec: apiVersion: v1 kind: PersistentVolumeClaim metadata: - name: am-volume-claim + name: alertmanager-volume-claim spec: # The class must be the same as in the PV - storageClassName: sc-am-volume-claim + storageClassName: sc-alertmanager-volume-claim accessModes: - ReadWriteOnce resources: @@ -31,9 +31,9 @@ spec: apiVersion: v1 kind: PersistentVolume metadata: - name: pv-prom-volume-claim + name: pv-prometheus-volume-claim spec: - storageClassName: sc-prom-volume-claim + storageClassName: sc-prometheus-volume-claim accessModes: - ReadWriteOnce capacity: @@ -47,10 +47,10 @@ spec: apiVersion: v1 kind: PersistentVolumeClaim metadata: - name: prom-volume-claim + name: prometheus-volume-claim spec: # The class must be the same as in the PV - storageClassName: sc-prom-volume-claim + storageClassName: sc-prometheus-volume-claim accessModes: - ReadWriteOnce resources: From 8f8703e20b0b5e64bacb83bd39d02cbf5d46b348 Mon Sep 17 00:00:00 2001 From: Nikodemas Tuckus Date: Wed, 29 Nov 2023 10:28:11 +0100 Subject: [PATCH 2/5] Update image versions --- docker/promxy/Dockerfile | 2 +- kubernetes/monitoring/kmon/kube-eagle.yaml | 2 +- kubernetes/monitoring/services/alertmanager.yaml | 3 +-- kubernetes/monitoring/services/auth-proxy-server.yaml | 2 +- kubernetes/monitoring/services/prometheus.yaml | 3 +-- kubernetes/monitoring/services/promxy.yaml | 2 +- kubernetes/monitoring/services/victoria-metrics.yaml | 2 
+- 7 files changed, 7 insertions(+), 9 deletions(-) diff --git a/docker/promxy/Dockerfile b/docker/promxy/Dockerfile index ae43e611d..bf41768fa 100644 --- a/docker/promxy/Dockerfile +++ b/docker/promxy/Dockerfile @@ -4,7 +4,7 @@ ENV WDIR=/data ENV USER=promxy EXPOSE 8082 WORKDIR $WDIR -ENV VER=v0.0.77 +ENV VER=v0.0.84 RUN curl -ksLO https://github.com/jacksontj/promxy/releases/download/${VER}/promxy-${VER}-linux-amd64 && mv promxy-${VER}-linux-amd64 promxy && chmod +x /data/promxy #FROM alpine:3.17 diff --git a/kubernetes/monitoring/kmon/kube-eagle.yaml b/kubernetes/monitoring/kmon/kube-eagle.yaml index 6b67b0daa..73952b7c4 100644 --- a/kubernetes/monitoring/kmon/kube-eagle.yaml +++ b/kubernetes/monitoring/kmon/kube-eagle.yaml @@ -93,7 +93,7 @@ spec: serviceAccountName: sa-kube-eagle containers: - name: kube-eagle - image: "quay.io/google-cloud-tools/kube-eagle:1.1.0" + image: "quay.io/google-cloud-tools/kube-eagle:1.1.4" imagePullPolicy: IfNotPresent env: - name: TELEMETRY_HOST diff --git a/kubernetes/monitoring/services/alertmanager.yaml b/kubernetes/monitoring/services/alertmanager.yaml index 7918eb86f..e6a37ecd0 100644 --- a/kubernetes/monitoring/services/alertmanager.yaml +++ b/kubernetes/monitoring/services/alertmanager.yaml @@ -57,5 +57,4 @@ spec: secretName: alertmanager-secrets - name: cinder-volume persistentVolumeClaim: - claimName: am-volume-claim -# claimName: alertmanager-volume-claim + claimName: alertmanager-volume-claim diff --git a/kubernetes/monitoring/services/auth-proxy-server.yaml b/kubernetes/monitoring/services/auth-proxy-server.yaml index d7f99b3ad..e3dbc6dd6 100644 --- a/kubernetes/monitoring/services/auth-proxy-server.yaml +++ b/kubernetes/monitoring/services/auth-proxy-server.yaml @@ -35,7 +35,7 @@ spec: prometheus.io/port: "9091" spec: containers: - - image: registry.cern.ch/cmsweb/auth-proxy-server:0.2.46 + - image: registry.cern.ch/cmsweb/auth-proxy-server:0.2.48 name: auth-proxy-server args: - /data/auth-proxy-server diff --git 
a/kubernetes/monitoring/services/prometheus.yaml b/kubernetes/monitoring/services/prometheus.yaml index a6da79df3..66774f8fd 100644 --- a/kubernetes/monitoring/services/prometheus.yaml +++ b/kubernetes/monitoring/services/prometheus.yaml @@ -61,8 +61,7 @@ spec: secretName: prometheus-secrets - name: cinder-volume persistentVolumeClaim: - claimName: prom-volume-claim -# claimName: prometheus-volume-claim + claimName: prometheus-volume-claim - name: proxy-secrets secret: secretName: proxy-secrets diff --git a/kubernetes/monitoring/services/promxy.yaml b/kubernetes/monitoring/services/promxy.yaml index cf6633f70..a4a7e5f6a 100644 --- a/kubernetes/monitoring/services/promxy.yaml +++ b/kubernetes/monitoring/services/promxy.yaml @@ -36,7 +36,7 @@ spec: - /data/promxy - --config=/etc/promxy/config.yaml name: promxy - image: registry.cern.ch/cmsmonitoring/promxy:v0.0.77 + image: registry.cern.ch/cmsmonitoring/promxy:v0.0.84 ports: - containerPort: 8082 protocol: TCP diff --git a/kubernetes/monitoring/services/victoria-metrics.yaml b/kubernetes/monitoring/services/victoria-metrics.yaml index 189717a42..84dfa1813 100644 --- a/kubernetes/monitoring/services/victoria-metrics.yaml +++ b/kubernetes/monitoring/services/victoria-metrics.yaml @@ -40,7 +40,7 @@ spec: - -storageDataPath=/tsdb - -search.maxConcurrentRequests=32 name: victoria-metrics - image: victoriametrics/victoria-metrics:v1.89.1 + image: victoriametrics/victoria-metrics:v1.95.1 ports: - containerPort: 4242 protocol: TCP From 43b58e9874b126ec674517563d56d4529246ee06 Mon Sep 17 00:00:00 2001 From: Nikodemas Tuckus Date: Thu, 30 Nov 2023 10:20:28 +0100 Subject: [PATCH 3/5] Update script, versions --- kubernetes/monitoring/deploy-mon.sh | 56 ++++++++++--------- kubernetes/monitoring/ingress/ingress.yaml | 11 ++-- .../services/cpueff/cpueff-spark.yaml | 3 +- 3 files changed, 37 insertions(+), 33 deletions(-) diff --git a/kubernetes/monitoring/deploy-mon.sh b/kubernetes/monitoring/deploy-mon.sh index 
65fd032de..0b07995ea 100755 --- a/kubernetes/monitoring/deploy-mon.sh +++ b/kubernetes/monitoring/deploy-mon.sh @@ -1,31 +1,34 @@ #!/bin/bash # shellcheck disable=SC2181 set -e -##H Usage: deploy-ha.sh ACTION +##H Usage: deploy-mon.sh ACTION ##H ##H Examples: ##H --- If CMSKubernetes, cmsmon-configs and secrets repos are in same directory --- -##H deploy-ha.sh status -##H deploy-ha.sh deploy-secrets -##H deploy-ha.sh deploy-all -##H deploy-ha.sh clean-services +##H deploy-mon.sh status +##H deploy-mon.sh deploy-secrets +##H deploy-mon.sh deploy-all +##H deploy-mon.sh clean-services ##H --- Else --- -##H export SECRETS_D=$SOMEDIR/secrets; export CONFIGS_D=$SOMEDIR/cmsmon-configs; deploy-ha.sh ha1 status +##H export SECRETS_D=$SOMEDIR/secrets; export CONFIGS_D=$SOMEDIR/cmsmon-configs; deploy-mon.sh status ##H ##H Arguments: ACTION should be one of the defined actions ##H Attention: this script depends on deploy-secrets.sh ##H ##H Actions: -##H help show this help -##H clean-all cleanup all services secrets storages cronjobs accounts -##H clean-services cleanup services -##H clean-secrets cleanup secrets -##H clean-storages cleanup storages -##H status check status of all cluster -##H test perform integration tests using VictoriaMetrics -##H deploy-all deploy everything -##H deploy-secrets deploy secrets -##H deploy-services deploy services +##H help show this help +##H clean-all cleanup all services secrets storages cronjobs accounts +##H clean-services cleanup services +##H clean-secrets cleanup secrets +##H clean-storages cleanup storages +##H status check status of all cluster +##H test perform integration tests using VictoriaMetrics +##H deploy-all deploy everything except for storages +##H and services using storages +##H deploy-secrets deploy secrets +##H deploy-services deploy services +##H deploy-storages deploy storages +##H deploy-storage-services deploy services that use storages (Prometheus and etc.) 
##H ##H Environments: ##H SECRETS_D defines secrets repository local path. (default CMSKubernetes parent dir) @@ -51,8 +54,8 @@ cdir=${CONFIGS_D:-"${script_dir}/../../../cmsmon-configs"} # deploy-secrets.sh temporary file deploy_secrets_sh="$script_dir"/__temp-deploy-secrets__.sh -if [[ -z $ha || -z $action || $ha != ha*[0-9] ]]; then - echo "ha or action is not defined. ha:${ha}, action:${action}. Exiting with help message..." +if [[ -z $action ]]; then + echo "action is not defined. action:${action}. Exiting with help message..." grep "^##H" <"$0" | sed -e "s,##H,,g" exit 1 fi @@ -147,7 +150,7 @@ function rm_temp_deploy_secrets_sh() { function deploy_secrets() { create_temp_deploy_secrets_sh # auth - "$deploy_secrets_sh" auth alertmanager-secrets + "$deploy_secrets_sh" auth auth-secrets "$deploy_secrets_sh" auth cern-certificates "$deploy_secrets_sh" auth proxy-secrets "$deploy_secrets_sh" auth robot-secrets @@ -194,7 +197,6 @@ function clean_secrets() { kubectl -n http --ignore-not-found=true delete secret robot-secrets } function deploy_services() { - # Fails because of /etc/proxy/proxy tls conf # auth kubectl -n auth apply -f services/auth-proxy-server.yaml # cpueff @@ -202,10 +204,7 @@ function deploy_services() { kubectl -n cpueff apply -f services/cpueff/mongo-cpueff.yaml # default kubectl -n default apply -f services/httpgo.yaml - kubectl -n default apply -f services/karma.yaml kubectl -n default apply -f kmon/kube-eagle.yaml - kubectl -n default apply -f services/promxy.yaml - kubectl -n default apply -f services/pushgateway.yaml # http find "${script_dir}"/services/ -name "*-exp*.yaml" | awk '{print "kubectl apply -f "$1""}' | /bin/sh } @@ -230,18 +229,21 @@ function clean_all_services() { function deploy_storage_services() { # Fails because of /etc/proxy/proxy tls conf - check_configs_prometheus + # check_configs_prometheus check_configs_am # default kubectl -n default apply -f services/alertmanager.yaml + kubectl -n default apply -f services/karma.yaml 
kubectl -n default apply -f services/prometheus.yaml + kubectl -n default apply -f services/promxy.yaml + kubectl -n default apply -f services/pushgateway.yaml kubectl -n default apply -f services/victoria-metrics.yaml } # cluster cronjob deployment function deploy_cronjobs() { kubectl -n auth apply -f crons/cron-proxy.yaml - kubectl -n cpueff apply -f cpueff/cpueff-spark.yaml + kubectl -n cpueff apply -f services/cpueff/cpueff-spark.yaml kubectl -n default apply -f crons/cron-proxy.yaml kubectl -n http apply -f crons/cron-kerberos.yaml kubectl -n http apply -f crons/cron-proxy.yaml @@ -249,7 +251,7 @@ function deploy_cronjobs() { function clean_cronjobs() { kubectl -n auth --ignore-not-found=true delete -f crons/cron-proxy.yaml - kubectl -n cpueff --ignore-not-found=true delete -f cpueff/cpueff-spark.yaml + kubectl -n cpueff --ignore-not-found=true delete -f services/cpueff/cpueff-spark.yaml kubectl -n default --ignore-not-found=true delete -f crons/cron-proxy.yaml kubectl -n http --ignore-not-found=true delete -f crons/cron-kerberos.yaml kubectl -n http --ignore-not-found=true delete -f crons/cron-proxy.yaml @@ -281,6 +283,8 @@ deploy_all() { done deploy_secrets deploy_services + deploy_cronjobs + deploy_ingress } clean_all() { clean_all_services diff --git a/kubernetes/monitoring/ingress/ingress.yaml b/kubernetes/monitoring/ingress/ingress.yaml index 05b6d7b4b..05a1efc20 100644 --- a/kubernetes/monitoring/ingress/ingress.yaml +++ b/kubernetes/monitoring/ingress/ingress.yaml @@ -1,5 +1,4 @@ apiVersion: networking.k8s.io/v1 -# apiVersion: networking.k8s.io/v1 kind: Ingress metadata: name: ingress @@ -18,11 +17,13 @@ spec: http: paths: - path: / -# pathType: Prefix + pathType: Prefix backend: - serviceName: auth-proxy-server - servicePort: 443 + service: + name: auth-proxy-server + port: + number: 443 tls: - hosts: - cms-monitoring.cern.ch - secretName: auth-secrets + secretName: auth-secrets diff --git a/kubernetes/monitoring/services/cpueff/cpueff-spark.yaml 
b/kubernetes/monitoring/services/cpueff/cpueff-spark.yaml index ae6a23472..e347f843d 100644 --- a/kubernetes/monitoring/services/cpueff/cpueff-spark.yaml +++ b/kubernetes/monitoring/services/cpueff/cpueff-spark.yaml @@ -38,8 +38,7 @@ data: /data/CMSMonitoring/cpueff-goweb/spark/cron4cpueff_goweb.sh \ --keytab /etc/secrets/keytab --p1 31205 --p2 31206 --host $MY_NODE_NAME --wdir $WDIR --- -#apiVersion: batch/v1 -apiVersion: batch/v1beta1 +apiVersion: batch/v1 kind: CronJob metadata: name: cpueff-spark From 2a3567519fa9b455b1d263786d616e9304ed6d15 Mon Sep 17 00:00:00 2001 From: Nikodemas Tuckus Date: Mon, 4 Dec 2023 15:31:58 +0100 Subject: [PATCH 4/5] Update dmmon cluster scripts and manifests --- kubernetes/monitoring/create_cluster_dmmon.sh | 2 -- kubernetes/monitoring/services/auth-proxy-server-dm.yaml | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/kubernetes/monitoring/create_cluster_dmmon.sh b/kubernetes/monitoring/create_cluster_dmmon.sh index 175cda01d..f253305be 100755 --- a/kubernetes/monitoring/create_cluster_dmmon.sh +++ b/kubernetes/monitoring/create_cluster_dmmon.sh @@ -35,7 +35,6 @@ openstack --os-project-name "$namespace" coe cluster create "$name" \ --node-count 2 \ --flavor m2.large \ --merge-labels \ - --labels availability_zone="" \ --labels cinder_csi_enabled="true" \ --labels logging_include_internal="true" \ --labels logging_http_destination="http://monit-logs.cern.ch:10012/" \ @@ -45,7 +44,6 @@ openstack --os-project-name "$namespace" coe cluster create "$name" \ --labels logging_producer="cmswebk8s" \ --labels ingress_controller="nginx" \ --labels cern_enabled="true" \ - --labels keystone_auth_enabled="true" \ --labels logging_type="http" # Ref: https://cms-http-group.docs.cern.ch/k8s_cluster/cmsweb-deployment/ diff --git a/kubernetes/monitoring/services/auth-proxy-server-dm.yaml b/kubernetes/monitoring/services/auth-proxy-server-dm.yaml index 81a25ca29..2f8a5d8f3 100644 --- 
a/kubernetes/monitoring/services/auth-proxy-server-dm.yaml +++ b/kubernetes/monitoring/services/auth-proxy-server-dm.yaml @@ -35,7 +35,7 @@ spec: prometheus.io/port: "9091" spec: containers: - - image: registry.cern.ch/cmsweb/auth-proxy-server:0.2.46 + - image: registry.cern.ch/cmsweb/auth-proxy-server:0.2.48 name: auth-proxy-server args: - /data/auth-proxy-server From 951c3ff705816f7e186427d16fd92889560364d3 Mon Sep 17 00:00:00 2001 From: Nikodemas Tuckus Date: Mon, 4 Dec 2023 15:37:30 +0100 Subject: [PATCH 5/5] Update kenv script --- kubernetes/monitoring/scripts/kenv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kubernetes/monitoring/scripts/kenv b/kubernetes/monitoring/scripts/kenv index 522c3d55f..19775e18c 100644 --- a/kubernetes/monitoring/scripts/kenv +++ b/kubernetes/monitoring/scripts/kenv @@ -21,7 +21,7 @@ else # CMS Monitoring clusters export OS_PROJECT_NAME="CMS Web" if [ "$1" == "mon" ]; then - export KUBECONFIG=$adir/config.monit/config.cmsmonitnew + export KUBECONFIG=$adir/config.monit/config.cmsmonit elif [ "$1" == "ha1" ]; then export KUBECONFIG=$adir/config.monit/config.monitoring-vm-ha1 elif [ "$1" == "ha2" ]; then