Patch summary for anax-in-k8s/cronjobs/auto-upgrade-cronjob.sh:
  * POD_ID lookup: the label selector gains type!=auto-upgrade-cronjob.
  * getopts: adds -l <value>, which assigns AGENT_VERBOSITY. -h stays
    argument-less (optstring "c:hl:") so a bare -h still reaches "usage 0".
  * Fixes the "namesapce" typo in the first log line of Main.
  * pod_status: the query is now namespaced (-n) and its output has newlines
    and spaces stripped, so several pod phases concatenate (e.g.
    "RunningRunning").
  * New early exits (write_logs; exit 0) when pod_status is "RunningRunning",
    when it contains "Pending", and when $STATUS_PATH is not a regular file.
  * The last four hunks are whitespace-only changes.

NOTE(review): the Pending branch logs "$CURRENT_STATUS" before the assignment
CURRENT_STATUS=$json_status that appears later in the same hunk -- confirm it
is initialised earlier in the script.
NOTE(review): leading indentation and the whitespace-only differences in the
last four hunks are not recoverable from the reviewed text -- regenerate
against the target file before applying.

diff --git a/anax-in-k8s/cronjobs/auto-upgrade-cronjob.sh b/anax-in-k8s/cronjobs/auto-upgrade-cronjob.sh
index 23db77da5..3350b9846 100644
--- a/anax-in-k8s/cronjobs/auto-upgrade-cronjob.sh
+++ b/anax-in-k8s/cronjobs/auto-upgrade-cronjob.sh
@@ -2,7 +2,7 @@
 
 # Variables for interfacing with agent pod
 KUBECTL="kubectl"
-POD_ID=$($KUBECTL get pod -l app=agent -n ${AGENT_NAMESPACE} 2>/dev/null | grep "agent-" | cut -d " " -f1 2>/dev/null)
+POD_ID=$($KUBECTL get pod -l app=agent,type!=auto-upgrade-cronjob -n ${AGENT_NAMESPACE} 2>/dev/null | grep "agent-" | cut -d " " -f1 2>/dev/null)
 
 # Timeout value for agent deployment
 AGENT_DEPLOYMENT_STATUS_TIMEOUT_SECONDS='75'
@@ -47,10 +47,12 @@ AGENT_VERBOSITY=4
 
 # Get script flags (should never really run unless testing script manually)
 if [[ $AGENT_VERBOSITY -ge $VERB_DEBUG ]]; then echo $(now) "getopts begin"; fi
-while getopts "c:h" opt; do
+while getopts "c:hl:" opt; do
     case $opt in
         h) usage 0
         ;;
+        l) AGENT_VERBOSITY="$OPTARG"
+        ;;
         \?) echo "Invalid option: -$OPTARG"
             usage 1
         ;;
@@ -416,22 +418,43 @@ function restart_agent_pod() {
 
 #====================== Main ======================
 
-log_info "cronjob under namesapce: $AGENT_NAMESPACE"
+log_info "cronjob under namespace: $AGENT_NAMESPACE"
 
 # Sets STATUS_PATH for rest of script
 get_status_path
 
-# Check agent deployment/pod status and status.json
-pod_status=$($KUBECTL get pods ${POD_ID} --no-headers -o custom-columns=":status.phase")
+pod_status=$($KUBECTL get pods ${POD_ID} -n ${AGENT_NAMESPACE} --no-headers -o custom-columns=":status.phase" | sed -z 's/\n/ /g;s/ //g' )
 log_debug "Pod status: $pod_status"
+
+# Check deployment/pod status
+# Instantaneous state where both could be running....
+if [[ "${pod_status}" == "RunningRunning" ]]; then
+    log_debug "Agent pod status is Running/Running; Exiting"
+    write_logs
+    exit 0
+fi
+
+log_info "Checking if there is any pending agent pod..."
+if [[ "$pod_status" == *Pending* ]]; then
+    log_info "Agent pod is still in pending. Keeping status as \"$CURRENT_STATUS\" and exiting."
+    write_logs
+    exit 0
+fi
+
 dep_status=$($KUBECTL rollout status deployment/agent -n ${AGENT_NAMESPACE} | awk '{ print $3 }' | sed 's/successfully/Running/g')
 log_debug "Deployment status: $dep_status"
-json_status=$(cat $STATUS_PATH | jq '.agentUpgradePolicyStatus.status' | sed 's/\"//g')
+
+if [[ ! -f $STATUS_PATH ]]; then
+    log_debug "status file $STATUS_PATH does not exist, exiting."
+    write_logs
+    exit 0
+fi
+
+json_status=$(cat $STATUS_PATH | jq '.agentUpgradePolicyStatus.status' | sed 's/\"//g') # directory will be deleted by NMP worker if the upgrade is successful
 log_debug "Cron Job status: $json_status"
 CURRENT_STATUS=$json_status
 panic_rollback=false
 
-# Check deployment/pod status
 log_info "Checking if agent is running and deployment is successful..."
 if [[ "$pod_status" != "Running" || "$dep_status" != "Running" ]]; then
 
@@ -445,7 +468,7 @@ if [[ "$pod_status" != "Running" || "$dep_status" != "Running" ]]; then
         if [[ ! -z "$dep_status" ]]; then
             log_info "Agent pod is running successfully"
             log_verbose "Setting the status to \"$STATUS_ROLLBACK_SUCCESSFUL\"..."
-            echo $(jq --arg updated_status "$STATUS_ROLLBACK_SUCCESSFUL" '.agentUpgradePolicyStatus.status = $updated_status' $STATUS_PATH) > $STATUS_PATH
+            echo $(jq --arg updated_status "$STATUS_ROLLBACK_SUCCESSFUL" '.agentUpgradePolicyStatus.status = $updated_status' $STATUS_PATH) > $STATUS_PATH
             write_logs
             exit 0
         else
@@ -478,7 +501,7 @@ elif [[ "$json_status" == "$STATUS_ROLLBACK_STARTED" ]]; then
     if [[ $rc -eq 0 && "$cmd_output" == *"nodeType"*"cluster"* ]]; then
         log_info "Agent pod is running successfully."
         log_verbose "Setting the status to \"$STATUS_ROLLBACK_SUCCESSFUL\"..."
-        echo $(jq --arg updated_status "$STATUS_ROLLBACK_SUCCESSFUL" '.agentUpgradePolicyStatus.status = $updated_status' $STATUS_PATH) > $STATUS_PATH
+        echo $(jq --arg updated_status "$STATUS_ROLLBACK_SUCCESSFUL" '.agentUpgradePolicyStatus.status = $updated_status' $STATUS_PATH) > $STATUS_PATH
         write_logs
         exit 0
     fi
@@ -507,7 +530,7 @@ elif [[ "$json_status" == "$STATUS_INITIATED" ]]; then
     elif [[ "$old_image_version" == "null" || "$current_version" == "$old_image_version" ]]; then
         # set status to "failed"
         log_info "Agent pod is in panic state and the image version was not updated. Setting status to \"$ROLLBACK_FAILED\" and exiting."
-        echo $(jq --arg updated_status "$ROLLBACK_FAILED" '.agentUpgradePolicyStatus.status = $updated_status' $STATUS_PATH) > $STATUS_PATH
+        echo $(jq --arg updated_status "$ROLLBACK_FAILED" '.agentUpgradePolicyStatus.status = $updated_status' $STATUS_PATH) > $STATUS_PATH
         log_debug "Output of \"hzn node list\": $cmd_output"
         write_logs
         log_fatal 1 "Agent pod is in panic state and the image version was not updated."
@@ -517,7 +540,7 @@ elif [[ "$json_status" == "$STATUS_INITIATED" ]]; then
         cmd_output=$(agent_cmd "hzn node list")
         log_info "Agent pod is in panic state. Rollback will be performed."
         log_debug "Output of \"hzn node list\": $cmd_output"
-        update_error_message "Agent pod is in panic state"
+        update_error_message "Agent pod is in panic state"
     fi
 
     panic_rollback=true