-
Notifications
You must be signed in to change notification settings - Fork 107
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #12082 from amaltaro/fix-12079-wma
Update restartComponent.sh script to notify everyone in the WMCore team
- Loading branch information
Showing
1 changed file
with
22 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,35 +1,42 @@ | ||
#!/bin/sh | ||
## Pass the component names as command line arguments, e.g.: | ||
## ./restartComponent.sh ErrorHandler JobSubmitter AgentStatusWatcher | ||
### Script to check the tail of each WMAgent component and evaluate | ||
# whether they are running or not, based on file meta-data (stat). | ||
# Component is automatically restarted if deemed down. | ||
# NOTE that this script may not detect a multi-threaded component as down | |
# when only one of its threads is down. | |
### | ||
|
||
HOST=`hostname` | ||
DATENOW=`date +%s` | ||
|
||
# Get a few environment variables in, like $install and $manage | ||
source /data/admin/wmagent/env.sh | ||
HOST=$(hostname) | ||
DATENOW=$(date +%s) | ||
DEST_NAME=cms-wmcore-team | ||
|
||
# Figure out whether it's a python2 or python3 agent | |
if [ ! -d "$install" ]; then | ||
install="/data/srv/wmagent/current/install/wmagentpy3" | ||
install="/data/srv/wmagent/current/install/" | ||
fi | ||
|
||
echo "List of components to be monitored: $@" | ||
for comp in $@; do | ||
echo -e "\n###Checking agent logs at: $(date)" | ||
comps=$(ls $install) | ||
for comp in $comps; do | ||
COMPLOG=$install/$comp/ComponentLog | ||
if [ ! -f $COMPLOG ]; then | ||
echo "Not a component or $COMPLOG does not exist" | ||
continue | ||
fi | ||
echo "Checking logs from: $COMPLOG" | ||
LASTCHANGE=`stat -c %Y $COMPLOG` | ||
INTERVAL=`expr $DATENOW - $LASTCHANGE` | ||
LASTCHANGE=$(stat -c %Y $COMPLOG) | ||
INTERVAL=$(expr $DATENOW - $LASTCHANGE) | ||
if (("$INTERVAL" >= 1800)); then | ||
OTHERS=`ps aux | grep wmcore | grep -v grep` | ||
OTHERS=$(ps aux | grep wmcore | grep -v grep) | ||
if [[ -z "$OTHERS" ]]; then | ||
echo "Since the agent is not running, don't do anything ..." | ||
exit 1 | ||
fi | ||
|
||
TAIL_LOG=`tail -n100 $COMPLOG` | ||
TAIL_LOG=$(tail -n100 $COMPLOG) | ||
$manage execute-agent wmcoreD --restart --components=$comp | ||
echo -e "ComponentLog quiet for $INTERVAL secs\n\nTail of the log is:\n$TAIL_LOG" | | ||
mail -s "$HOST : $comp restarted" [email protected],todor.trendafilov.ivanov@cern.ch | ||
mail -s "$HOST : $comp restarted" $DEST_NAME@cern.ch | ||
fi | ||
done | ||
|