Skip to content

Commit

Permalink
Merge pull request #12082 from amaltaro/fix-12079-wma
Browse the repository at this point in the history
Update restartComponent.sh script to notify everyone in the WMCore team
  • Loading branch information
amaltaro authored Aug 26, 2024
2 parents 9e91b41 + 01b6760 commit 49e5450
Showing 1 changed file with 22 additions and 15 deletions.
37 changes: 22 additions & 15 deletions deploy/restartComponent.sh
Original file line number Diff line number Diff line change
@@ -1,35 +1,42 @@
#!/bin/sh
## Pass the component names as command line arguments, e.g.:
## ./restartComponent.sh ErrorHandler JobSubmitter AgentStatusWatcher
### Script to check the tail of each WMAgent component's log and evaluate
# whether the component is running or not, based on file meta-data (stat).
# A component is automatically restarted if deemed down.
# NOTE that this script may not detect a multi-threaded component as down
# when only one of its threads is down.
###

HOST=`hostname`
DATENOW=`date +%s`

# Get a few environment variables in, like $install and $manage
source /data/admin/wmagent/env.sh
HOST=$(hostname)
DATENOW=$(date +%s)
DEST_NAME=cms-wmcore-team

# Figure out whether it's a python2 or python3 agent
if [ ! -d "$install" ]; then
install="/data/srv/wmagent/current/install/wmagentpy3"
install="/data/srv/wmagent/current/install/"
fi

echo "List of components to be monitored: $@"
for comp in $@; do
echo -e "\n###Checking agent logs at: $(date)"
comps=$(ls $install)
for comp in $comps; do
COMPLOG=$install/$comp/ComponentLog
if [ ! -f $COMPLOG ]; then
echo "Not a component or $COMPLOG does not exist"
continue
fi
echo "Checking logs from: $COMPLOG"
LASTCHANGE=`stat -c %Y $COMPLOG`
INTERVAL=`expr $DATENOW - $LASTCHANGE`
LASTCHANGE=$(stat -c %Y $COMPLOG)
INTERVAL=$(expr $DATENOW - $LASTCHANGE)
if (("$INTERVAL" >= 1800)); then
OTHERS=`ps aux | grep wmcore | grep -v grep`
OTHERS=$(ps aux | grep wmcore | grep -v grep)
if [[ -z "$OTHERS" ]]; then
echo "Since the agent is not running, don't do anything ..."
exit 1
fi

TAIL_LOG=`tail -n100 $COMPLOG`
TAIL_LOG=$(tail -n100 $COMPLOG)
$manage execute-agent wmcoreD --restart --components=$comp
echo -e "ComponentLog quiet for $INTERVAL secs\n\nTail of the log is:\n$TAIL_LOG" |
mail -s "$HOST : $comp restarted" [email protected],todor.trendafilov.ivanov@cern.ch
mail -s "$HOST : $comp restarted" $DEST_NAME@cern.ch
fi
done

0 comments on commit 49e5450

Please sign in to comment.