Skip to content

Commit

Permalink
Issue open-horizon#4132 - Bug: deploy/agent deleted on k8s auto upgrade on k3s
Browse files Browse the repository at this point in the history

Signed-off-by: Le Zhang <[email protected]>
  • Loading branch information
LiilyZhang committed Aug 15, 2024
1 parent ba9411a commit c01b34e
Showing 1 changed file with 34 additions and 11 deletions.
45 changes: 34 additions & 11 deletions anax-in-k8s/cronjobs/auto-upgrade-cronjob.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

# Variables for interfacing with agent pod
KUBECTL="kubectl"
POD_ID=$($KUBECTL get pod -l app=agent -n ${AGENT_NAMESPACE} 2>/dev/null | grep "agent-" | cut -d " " -f1 2>/dev/null)
POD_ID=$($KUBECTL get pod -l app=agent,type!=auto-upgrade-cronjob -n ${AGENT_NAMESPACE} 2>/dev/null | grep "agent-" | cut -d " " -f1 2>/dev/null)

# Timeout value for agent deployment
AGENT_DEPLOYMENT_STATUS_TIMEOUT_SECONDS='75'
Expand Down Expand Up @@ -47,10 +47,12 @@ AGENT_VERBOSITY=4

# Get script flags (should never really run unless testing script manually)
if [[ $AGENT_VERBOSITY -ge $VERB_DEBUG ]]; then echo $(now) "getopts begin"; fi
while getopts "c:h" opt; do
while getopts "c:h:l:" opt; do
case $opt in
h) usage 0
;;
l) AGENT_VERBOSITY="$OPTARG"
;;
\?) echo "Invalid option: -$OPTARG"
usage 1
;;
Expand Down Expand Up @@ -416,22 +418,43 @@ function restart_agent_pod() {

#====================== Main ======================

log_info "cronjob under namesapce: $AGENT_NAMESPACE"
log_info "cronjob under namespace: $AGENT_NAMESPACE"

# Sets STATUS_PATH for rest of script
get_status_path

# Check agent deployment/pod status and status.json
pod_status=$($KUBECTL get pods ${POD_ID} --no-headers -o custom-columns=":status.phase")
pod_status=$($KUBECTL get pods ${POD_ID} -n ${AGENT_NAMESPACE} --no-headers -o custom-columns=":status.phase" | sed -z 's/\n/ /g;s/ //g' )
log_debug "Pod status: $pod_status"

# Check deployment/pod status
# Transient state: two agent pods can both report Running at the same moment, so the concatenated phases read "RunningRunning".
if [[ "${pod_status}" == "RunningRunning" ]]; then
log_debug "Agent pod status is Running/Running; Exiting"
write_logs
exit 0
fi

log_info "Checking if there is any pending agent pod..."
if [[ "$pod_status" == *Pending* ]]; then
log_info "Agent pod is still in pending. Keeping status as \"$CURRENT_STATUS\" and exiting."
write_logs
exit 0
fi

dep_status=$($KUBECTL rollout status deployment/agent -n ${AGENT_NAMESPACE} | awk '{ print $3 }' | sed 's/successfully/Running/g')
log_debug "Deployment status: $dep_status"
json_status=$(cat $STATUS_PATH | jq '.agentUpgradePolicyStatus.status' | sed 's/\"//g')

if [[ ! -f $STATUS_PATH ]]; then
log_debug "status file $STATUS_PATH not exist, existing."
write_logs
exit 0
fi

json_status=$(cat $STATUS_PATH | jq '.agentUpgradePolicyStatus.status' | sed 's/\"//g') # directory will be deleted by NMP worker if the upgrade is successful
log_debug "Cron Job status: $json_status"
CURRENT_STATUS=$json_status
panic_rollback=false

# Check deployment/pod status
log_info "Checking if agent is running and deployment is successful..."
if [[ "$pod_status" != "Running" || "$dep_status" != "Running" ]]; then

Expand All @@ -445,7 +468,7 @@ if [[ "$pod_status" != "Running" || "$dep_status" != "Running" ]]; then
if [[ ! -z "$dep_status" ]]; then
log_info "Agent pod is running successfully"
log_verbose "Setting the status to \"$STATUS_ROLLBACK_SUCCESSFUL\"..."
echo $(jq --arg updated_status "$STATUS_ROLLBACK_SUCCESSFUL" '.agentUpgradePolicyStatus.status = $updated_status' $STATUS_PATH) > $STATUS_PATH
echo $(jq --arg updated_status "$STATUS_ROLLBACK_SUCCESSFUL" '.agentUpgradePolicyStatus.status = $updated_status' $STATUS_PATH) > $STATUS_PATH
write_logs
exit 0
else
Expand Down Expand Up @@ -478,7 +501,7 @@ elif [[ "$json_status" == "$STATUS_ROLLBACK_STARTED" ]]; then
if [[ $rc -eq 0 && "$cmd_output" == *"nodeType"*"cluster"* ]]; then
log_info "Agent pod is running successfully."
log_verbose "Setting the status to \"$STATUS_ROLLBACK_SUCCESSFUL\"..."
echo $(jq --arg updated_status "$STATUS_ROLLBACK_SUCCESSFUL" '.agentUpgradePolicyStatus.status = $updated_status' $STATUS_PATH) > $STATUS_PATH
echo $(jq --arg updated_status "$STATUS_ROLLBACK_SUCCESSFUL" '.agentUpgradePolicyStatus.status = $updated_status' $STATUS_PATH) > $STATUS_PATH
write_logs
exit 0
fi
Expand Down Expand Up @@ -507,7 +530,7 @@ elif [[ "$json_status" == "$STATUS_INITIATED" ]]; then
elif [[ "$old_image_version" == "null" || "$current_version" == "$old_image_version" ]]; then
# set status to "failed"
log_info "Agent pod is in panic state and the image version was not updated. Setting status to \"$ROLLBACK_FAILED\" and exiting."
echo $(jq --arg updated_status "$ROLLBACK_FAILED" '.agentUpgradePolicyStatus.status = $updated_status' $STATUS_PATH) > $STATUS_PATH
echo $(jq --arg updated_status "$ROLLBACK_FAILED" '.agentUpgradePolicyStatus.status = $updated_status' $STATUS_PATH) > $STATUS_PATH
log_debug "Output of \"hzn node list\": $cmd_output"
write_logs
log_fatal 1 "Agent pod is in panic state and the image version was not updated."
Expand All @@ -517,7 +540,7 @@ elif [[ "$json_status" == "$STATUS_INITIATED" ]]; then
cmd_output=$(agent_cmd "hzn node list")
log_info "Agent pod is in panic state. Rollback will be performed."
log_debug "Output of \"hzn node list\": $cmd_output"
update_error_message "Agent pod is in panic state"
update_error_message "Agent pod is in panic state"
fi

panic_rollback=true
Expand Down

0 comments on commit c01b34e

Please sign in to comment.