Skip to content

Commit

Permalink
Merge pull request #1442 from nikodemas/update_mon_cluster
Browse files Browse the repository at this point in the history
Upgrade monitoring clusters
  • Loading branch information
nikodemas authored Dec 4, 2023
2 parents 97cd8ec + 951c3ff commit bff1269
Show file tree
Hide file tree
Showing 14 changed files with 343 additions and 29 deletions.
2 changes: 1 addition & 1 deletion docker/promxy/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ ENV WDIR=/data
ENV USER=promxy
EXPOSE 8082
WORKDIR $WDIR
ENV VER=v0.0.77
ENV VER=v0.0.84
RUN curl -ksLO https://github.com/jacksontj/promxy/releases/download/${VER}/promxy-${VER}-linux-amd64 && mv promxy-${VER}-linux-amd64 promxy && chmod +x /data/promxy

#FROM alpine:3.17
Expand Down
2 changes: 0 additions & 2 deletions kubernetes/monitoring/create_cluster_dmmon.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ openstack --os-project-name "$namespace" coe cluster create "$name" \
--node-count 2 \
--flavor m2.large \
--merge-labels \
--labels availability_zone="" \
--labels cinder_csi_enabled="true" \
--labels logging_include_internal="true" \
--labels logging_http_destination="http://monit-logs.cern.ch:10012/" \
Expand All @@ -45,7 +44,6 @@ openstack --os-project-name "$namespace" coe cluster create "$name" \
--labels logging_producer="cmswebk8s" \
--labels ingress_controller="nginx" \
--labels cern_enabled="true" \
--labels keystone_auth_enabled="true" \
--labels logging_type="http"

# Ref: https://cms-http-group.docs.cern.ch/k8s_cluster/cmsweb-deployment/
Expand Down
318 changes: 318 additions & 0 deletions kubernetes/monitoring/deploy-mon.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,318 @@
#!/bin/bash
# shellcheck disable=SC2181
set -e
##H Usage: deploy-mon.sh ACTION
##H
##H Examples:
##H --- If CMSKubernetes, cmsmon-configs and secrets repos are in same directory ---
##H deploy-mon.sh status
##H deploy-mon.sh deploy-secrets
##H deploy-mon.sh deploy-all
##H deploy-mon.sh clean-services
##H --- Else ---
##H export SECRETS_D=$SOMEDIR/secrets; export CONFIGS_D=$SOMEDIR/cmsmon-configs; deploy-mon.sh status
##H
##H Arguments: ACTION should be one of the defined actions
##H Attention: this script depends on deploy-secrets.sh
##H
##H Actions:
##H help show this help
##H clean-all cleanup all services secrets storages cronjobs accounts
##H clean-services cleanup services
##H clean-secrets cleanup secrets
##H clean-storages cleanup storages
##H status check status of all cluster
##H test perform integration tests using VictoriaMetrics
##H deploy-all deploy everything except for storages
##H and services using storages
##H deploy-secrets deploy secrets
##H deploy-services deploy services
##H deploy-storages deploy storages
##H deploy-storage-services deploy services that use storages (Prometheus and etc.)
##H
##H Environments:
##H SECRETS_D defines secrets repository local path. (default CMSKubernetes parent dir)
##H CONFIGS_D defines cmsmon-configs repository local path. (default CMSKubernetes parent dir)
##H
##H READ the DOC: https://cmsmonit-docs.web.cern.ch/k8s/cluster_upgrades/#ha1
##H

# Drop any values inherited from the caller's shell for the names this script uses.
unset script_dir action cluster sdir cdir deploy_secrets_sh
# Absolute path of the directory that contains this script.
script_dir="$(cd "$(dirname "$0")" && pwd)"

# Help: print every "##H" line of this file (marker stripped) and stop with status 1
# when help is requested explicitly or when no argument is given.
case "$1" in
-h | -help | --help | help | "")
    grep "^##H" <"$0" | sed -e "s,##H,,g"
    exit 1
    ;;
esac

action=$1

# Repository locations: SECRETS_D / CONFIGS_D win when set and non-empty,
# otherwise the paths are resolved relative to the script directory.
sdir="${SECRETS_D:-${script_dir}/../../../secrets}"
cdir="${CONFIGS_D:-${script_dir}/../../../cmsmon-configs}"

# Location of the temporary, patched copy of deploy-secrets.sh
deploy_secrets_sh="${script_dir}/__temp-deploy-secrets__.sh"

# Second guard on an empty action (an empty first argument is already handled above).
if [ -z "$action" ]; then
    echo "action is not defined. action:${action}. Exiting with help message..."
    grep "^##H" <"$0" | sed -e "s,##H,,g"
    exit 1
fi

echo "will continue with following values:"
echo "OS_PROJECT_NAME:${OS_PROJECT_NAME}, action: ${action}, secrets:${sdir}, cmsmon-configs:${cdir}"

# ------------------------------------------ CONFIG CHECKS ----------------------------------------
# Check prometheus configs
# Validate the Prometheus HA configuration and the alert rules with promtool.
# Globals:
#   cdir (read) - local path of the cmsmon-configs repository
# Outputs:
#   promtool output; on failure a one-line error message on stdout
# Exits:
#   with status 1 when the config file is missing or a promtool check fails
function check_configs_prometheus() {
    local promtool="/cvmfs/cms.cern.ch/cmsmon/promtool"
    local conf="${cdir}/prometheus/ha/prometheus.yaml"
    local tmp_conf="${cdir}/prometheus/__prometheus__.yaml"

    if [ ! -f "$conf" ]; then
        # Name the file that is actually checked (the HA config).
        echo "Please provide ${conf} file"
        exit 1
    fi
    # Prometheus conf should be in same directory with rules to check correctly
    cp "$conf" "$tmp_conf"
    # Test the commands directly: with `set -e` active a separate `$?` check
    # after a failing command is never reached, so no message would be printed
    # and the temporary copy would be left behind.
    if ! "$promtool" check config "$tmp_conf"; then
        rm -f "$tmp_conf"
        echo "Fail to validate prometheus config file"
        exit 1
    fi
    if ! "$promtool" check rules "$cdir"/prometheus/*.rules; then
        rm -f "$tmp_conf"
        echo "Fail to validate prometheus rules"
        exit 1
    fi
    # Delete temp file
    rm "$tmp_conf"
}

# Check alertmanager configs
# Validate the Alertmanager configuration with amtool and print its routing tree.
# Globals:
#   cdir (read) - local path of the cmsmon-configs repository
# Outputs:
#   amtool output; on failure a one-line error message on stdout
# Exits:
#   with status 1 when the config file is missing or amtool rejects it
function check_configs_am() {
    local amtool="/cvmfs/cms.cern.ch/cmsmon/amtool"
    local conf="${cdir}/alertmanager/alertmanager.yaml"

    if [ ! -f "$conf" ]; then
        echo "Please provide ${conf} file"
        exit 1
    fi
    # Test the command directly: with `set -e` active a separate `$?` check
    # after a failing command is never reached and the message is lost.
    if ! "$amtool" check-config "$conf"; then
        echo "Fail to validate alertmanager config file"
        exit 1
    fi
    "$amtool" config routes show --config.file="$conf"
}

# Check status of the cluster
# Print a status report of the cluster: secrets, services, node usage and pod usage.
# Listings are filtered with a pattern built from the names default, http and alerts.
# Outputs:
#   four "*** ..." sections on stdout
function cluster_check() {
    local ns_filter="default *|http *|alerts *"

    # A grep that matches nothing returns 1; with `set -e` active that would
    # abort the report after the first empty section, hence the `|| true`.
    echo -e "\n*** check secrets"
    kubectl get secrets -A | grep -E "$ns_filter" | grep Opaque || true
    echo -e "\n*** check svc"
    kubectl get svc -A | grep -E "$ns_filter" || true
    echo -e "\n*** node status"
    kubectl top node
    echo -e "\n*** pods status"
    kubectl top pods --sort-by=memory -A | grep -E "$ns_filter" || true
}

# Test VictoriaMetrics
# Integration test: send one sample metric to the put endpoint (port 30422)
# and read it back from the export endpoint (port 30428) with curl.
test_vm() {
    local base="http://cms-monitoring.cern.ch"
    local put_endpoint="${base}:30422/api/put"
    local export_endpoint="${base}:30428/api/v1/export"
    local sample='{"metric":"cms.test.exitCode", "value":1, "tags":{"exitCode": "8021", "site":"T2_US", "task":"test", "log":"/path/file.log"}}'
    local -a put_args=(-H 'Content-Type: application/json' -d "$sample" "$put_endpoint")
    local -a get_args=(-G "$export_endpoint" -d 'match[]=cms.test.exitCode')

    echo "put data into $put_endpoint"
    curl "${put_args[@]}"
    echo "get data from $export_endpoint"
    curl "${get_args[@]}"
}
# =================================================================================================

# -------------------------------------- PREPARE deploy-secrets.sh -------------------------------
# Create temporary deploy-secrets.sh with correct sdir and cdir
# Write a patched, executable copy of deploy-secrets.sh to $deploy_secrets_sh in
# which the "cdir=cmsmon-configs..." and "sdir=secrets..." assignments are
# replaced by the resolved $cdir and $sdir paths.
# Globals:
#   script_dir, sdir, cdir, deploy_secrets_sh (all read)
# Exits:
#   with status 1 when deploy-secrets.sh or one of the two directories is missing
create_temp_deploy_secrets_sh() {
    local source_sh="${script_dir}/deploy-secrets.sh"

    echo "secrets dir: ${sdir}, cmsmon-configs dir: ${cdir}"

    # All three inputs have to exist before anything is generated.
    if ! [[ -e "$source_sh" && -d "$sdir" && -d "$cdir" ]]; then
        echo "Please check if [deploy-secrets.sh:${script_dir}], [secrets:${sdir}], [cmsmon-configs:${cdir}] exist"
        exit 1
    fi

    sed \
        -e "s,cdir=cmsmon-configs.*,cdir=${cdir},g" \
        -e "s,sdir=secrets.*,sdir=${sdir},g" \
        "$source_sh" >"$deploy_secrets_sh"
    chmod +x "$deploy_secrets_sh"
}

# Delete temporary deploy-secrets.sh
# Remove the temporary deploy-secrets copy stored at $deploy_secrets_sh.
rm_temp_deploy_secrets_sh() {
    local generated="${deploy_secrets_sh}"
    rm "${generated}"
}
# =================================================================================================

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Deploy every secret of the auth, cpueff, default and http namespaces through
# the temporary, patched copy of deploy-secrets.sh.
# Globals:
#   deploy_secrets_sh (read) - path of the temporary deploy script
# Returns:
#   0 when all secrets were deployed; otherwise the status of the first failing
#   deployment (the remaining ones are skipped)
function deploy_secrets() {
    # "<namespace> <secret>" pairs, deployed in this order
    local -a targets=(
        # auth
        "auth auth-secrets"
        "auth cern-certificates"
        "auth proxy-secrets"
        "auth robot-secrets"
        # cpueff
        "cpueff cpueff-mongo-secrets"
        # default
        "default alertmanager-secrets"
        "default karma-secrets"
        "default prometheus-secrets"
        "default promxy-secrets"
        "default proxy-secrets"
        "default robot-secrets"
        # http
        "http certcheck-secrets"
        "http es-wma-secrets"
        "http keytab-secrets"
        "http krb5cc-secrets"
        "http proxy-secrets"
        "http robot-secrets"
    )
    local target rc=0

    create_temp_deploy_secrets_sh
    for target in "${targets[@]}"; do
        # Capture the failure instead of letting `set -e` abort right here:
        # otherwise the temporary script would be left behind.
        "$deploy_secrets_sh" "${target%% *}" "${target#* }" || {
            rc=$?
            break
        }
    done
    rm_temp_deploy_secrets_sh
    return "$rc"
}
# Delete every secret of the auth, cpueff, default and http namespaces.
# Secrets that do not exist are skipped (--ignore-not-found=true).
clean_secrets() {
    # "<namespace> <secret>" pairs, deleted in this order
    local -a secrets=(
        # auth
        "auth auth-secrets"
        "auth cern-certificates"
        "auth proxy-secrets"
        "auth robot-secrets"
        # cpueff
        "cpueff cpueff-mongo-secrets"
        # default
        "default alertmanager-secrets"
        "default karma-secrets"
        "default prometheus-secrets"
        "default promxy-secrets"
        "default proxy-secrets"
        "default robot-secrets"
        # http
        "http certcheck-secrets"
        "http es-wma-secrets"
        "http keytab-secrets"
        "http krb5cc-secrets"
        "http proxy-secrets"
        "http robot-secrets"
    )
    local entry

    for entry in "${secrets[@]}"; do
        kubectl -n "${entry%% *}" --ignore-not-found=true delete secret "${entry#* }"
    done
}
# Apply the service manifests that do not depend on storages.
# Manifest paths in the list are relative to the current working directory;
# the "*-exp*.yaml" manifests are looked up under ${script_dir}/services/.
deploy_services() {
    local entry

    # "<namespace> <manifest>" pairs: auth, cpueff, default
    for entry in \
        "auth services/auth-proxy-server.yaml" \
        "cpueff services/cpueff/cpueff-goweb.yaml" \
        "cpueff services/cpueff/mongo-cpueff.yaml" \
        "default services/httpgo.yaml" \
        "default kmon/kube-eagle.yaml"; do
        kubectl -n "${entry%% *}" apply -f "${entry#* }"
    done

    # http: one "kubectl apply" per manifest found; no -n is passed here
    find "${script_dir}/services/" -name '*-exp*.yaml' \
        | awk '{ print "kubectl apply -f " $1 }' \
        | /bin/sh
}
# Delete all service manifests, including the ones that use storages.
# Resources that do not exist are skipped (--ignore-not-found=true).
# Manifest paths in the list are relative to the current working directory;
# the "*-exp*.yaml" manifests are looked up under ${script_dir}/services/.
clean_all_services() {
    local entry

    # "<namespace> <manifest>" pairs: auth, cpueff, default
    for entry in \
        "auth services/auth-proxy-server.yaml" \
        "cpueff services/cpueff/cpueff-goweb.yaml" \
        "cpueff services/cpueff/mongo-cpueff.yaml" \
        "default services/alertmanager.yaml" \
        "default services/httpgo.yaml" \
        "default services/karma.yaml" \
        "default kmon/kube-eagle.yaml" \
        "default services/prometheus.yaml" \
        "default services/promxy.yaml" \
        "default services/pushgateway.yaml" \
        "default services/victoria-metrics.yaml"; do
        kubectl -n "${entry%% *}" --ignore-not-found=true delete -f "${entry#* }"
    done

    # http: one "kubectl delete" per manifest found; no -n is passed here
    find "${script_dir}/services/" -name '*-exp*.yaml' \
        | awk '{ print "kubectl --ignore-not-found=true delete -f " $1 }' \
        | /bin/sh
}

# Validate the Alertmanager config, then apply the services of the default
# namespace that use storages. Manifest paths are relative to the current
# working directory.
deploy_storage_services() {
    local svc

    # check_configs_prometheus is left disabled:
    # it fails because of /etc/proxy/proxy tls conf
    check_configs_am

    # default
    for svc in alertmanager karma prometheus promxy pushgateway victoria-metrics; do
        kubectl -n default apply -f "services/${svc}.yaml"
    done
}

# cluster cronjob deployment
# Apply the cron manifests of the auth, cpueff, default and http namespaces.
# Manifest paths are relative to the current working directory.
deploy_cronjobs() {
    local job

    # "<namespace> <manifest>" pairs
    for job in \
        "auth crons/cron-proxy.yaml" \
        "cpueff services/cpueff/cpueff-spark.yaml" \
        "default crons/cron-proxy.yaml" \
        "http crons/cron-kerberos.yaml" \
        "http crons/cron-proxy.yaml"; do
        kubectl -n "${job%% *}" apply -f "${job#* }"
    done
}

# Delete the cron manifests of the auth, cpueff, default and http namespaces.
# Resources that do not exist are skipped (--ignore-not-found=true).
clean_cronjobs() {
    local job

    # "<namespace> <manifest>" pairs
    for job in \
        "auth crons/cron-proxy.yaml" \
        "cpueff services/cpueff/cpueff-spark.yaml" \
        "default crons/cron-proxy.yaml" \
        "http crons/cron-kerberos.yaml" \
        "http crons/cron-proxy.yaml"; do
        kubectl -n "${job%% *}" --ignore-not-found=true delete -f "${job#* }"
    done
}

# Deploy cinder volumes for default namespace
# Apply the cinder storage manifest in the default namespace.
# The manifest path is relative to the current working directory.
deploy_storages() {
    local manifest="storages/cmsmonit-cluster-cinder.yaml"
    kubectl apply -f "${manifest}" -n default
}
# Delete the cinder storage manifest from the default namespace.
# Like the other clean_* functions, resources that do not exist are skipped
# (--ignore-not-found=true), so a cleanup of a cluster where the storages were
# never deployed does not stop the script under `set -e`.
function clean_storages() {
    kubectl -n default --ignore-not-found=true delete -f storages/cmsmonit-cluster-cinder.yaml
}
# cluster ingress deployment
# Label the nodes for ingress and apply the ingress manifest.
deploy_ingress() {
    # Every line of "kubectl get node" that contains "node" yields one
    # "kubectl label node <first column> role=ingress --overwrite" command,
    # which /bin/sh then executes.
    kubectl get node \
        | awk '/node/ { print "kubectl label node " $1 " role=ingress --overwrite" }' \
        | /bin/sh

    # deploy ingress controller
    kubectl apply -f ingress/ingress.yaml
}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MAIN ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Namespaces that deploy_all creates when missing (space separated list).
namespaces="auth cpueff http "

# Create the missing namespaces, then deploy secrets, services, cronjobs and
# ingress. Storages and the services that use them are not part of this action.
# Globals:
#   namespaces (read)
deploy_all() {
    local _ns

    # Word splitting of $namespaces is intended: it is a space separated list.
    for _ns in $namespaces; do
        # Ask for the namespace by its exact name. Grepping the full listing
        # for the name also matched substrings (e.g. "http" in "http-staging"),
        # which skipped the creation of a namespace that did not exist.
        if ! kubectl get ns "$_ns" >/dev/null 2>&1; then
            kubectl create namespace "$_ns"
        fi
    done
    deploy_secrets
    deploy_services
    deploy_cronjobs
    deploy_ingress
}
# Remove services, cronjobs, secrets and storages, then delete the namespaces
# listed in $namespaces.
# Globals:
#   namespaces (read) - space separated list of namespaces to delete
clean_all() {
    local _ns

    clean_all_services
    clean_cronjobs
    # NOTE(review): presumably gives the deleted resources time to terminate
    # before the secrets are removed - confirm.
    sleep 10
    clean_secrets
    clean_storages
    # Word splitting of $namespaces is intended: it is a space separated list.
    for _ns in $namespaces; do
        # Ask for the namespace by its exact name instead of grepping the full
        # listing, which also matched substrings of other namespace names.
        if kubectl get ns "$_ns" >/dev/null 2>&1; then
            kubectl --ignore-not-found=true delete namespace "$_ns"
        fi
    done
}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Main routine, perform action requested on command line.
# An empty or unknown action prints the ##H help text.
case ${action:-help} in
"deploy-all") deploy_all ;;
"deploy-secrets") deploy_secrets ;;
"deploy-services") deploy_services ;;
"deploy-storages") deploy_storages ;;
"deploy-storage-services") deploy_storage_services ;;
"status") cluster_check ;;
"clean-all") clean_all ;;
# The cleanup function for services is named clean_all_services; no function
# called clean_services exists in this script.
"clean-services") clean_all_services ;;
"clean-secrets") clean_secrets ;;
"clean-storages") clean_storages ;;
"test") test_vm ;;
"help") grep "^##H" <"$0" | sed -e "s,##H,,g" ;;
*) grep "^##H" <"$0" | sed -e "s,##H,,g" ;;
esac
13 changes: 7 additions & 6 deletions kubernetes/monitoring/ingress/ingress.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
apiVersion: extensions/v1beta1
# apiVersion: networking.k8s.io/v1
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: ingress
Expand All @@ -18,11 +17,13 @@ spec:
http:
paths:
- path: /
# pathType: Prefix
pathType: Prefix
backend:
serviceName: auth-proxy-server
servicePort: 443
service:
name: auth-proxy-server
port:
number: 443
tls:
- hosts:
- cms-monitoring.cern.ch
secretName: auth-secrets
secretName: auth-secrets
2 changes: 1 addition & 1 deletion kubernetes/monitoring/kmon/kube-eagle.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ spec:
serviceAccountName: sa-kube-eagle
containers:
- name: kube-eagle
image: "quay.io/google-cloud-tools/kube-eagle:1.1.0"
image: "quay.io/google-cloud-tools/kube-eagle:1.1.4"
imagePullPolicy: IfNotPresent
env:
- name: TELEMETRY_HOST
Expand Down
2 changes: 1 addition & 1 deletion kubernetes/monitoring/scripts/kenv
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ else
# CMS Monitoring clusters
export OS_PROJECT_NAME="CMS Web"
if [ "$1" == "mon" ]; then
export KUBECONFIG=$adir/config.monit/config.cmsmonitnew
export KUBECONFIG=$adir/config.monit/config.cmsmonit
elif [ "$1" == "ha1" ]; then
export KUBECONFIG=$adir/config.monit/config.monitoring-vm-ha1
elif [ "$1" == "ha2" ]; then
Expand Down
3 changes: 1 addition & 2 deletions kubernetes/monitoring/services/alertmanager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,5 +57,4 @@ spec:
secretName: alertmanager-secrets
- name: cinder-volume
persistentVolumeClaim:
claimName: am-volume-claim
# claimName: alertmanager-volume-claim
claimName: alertmanager-volume-claim
Loading

0 comments on commit bff1269

Please sign in to comment.