#!/bin/bash
# inspired from: https://gist.github.com/vitobotta/2783513
# Log that receives one line per killed worker; the path is relative to the
# directory the script is started from.
logfile='log/dead_workers_killed.log'
# Age limit in seconds (300 s = 5 min): a job that has been running longer
# than this is considered stuck and gets killed.
threshold='300'
# Convert a ps "etime" value into a whole number of seconds.
# Credit: user000001 - http://stackoverflow.com/questions/14652445/parse-ps-etime-output-into-seconds#14653443
#
# Arguments: $1 - elapsed time formatted as mm:ss, hh:mm:ss or dd-hh:mm:ss
# Outputs:   the number of seconds on stdout; nothing at all when $1 does not
#            have two or three ":"-separated fields
function ps_etime_to_seconds()
{
  awk -F ':' '
    {
      days = 0
      hours = 0
      if (NF == 2) {
        minutes = $1
        seconds = $2
      } else if (NF == 3) {
        # A day count, when present, is glued to the hours as "dd-hh".
        # Detect it from the number of pieces rather than from the hour
        # value, so that "1-00:05:03" is not mistaken for a plain hour count.
        if (split($1, parts, "-") == 2) {
          days = parts[1]
          hours = parts[2]
        } else {
          hours = $1
        }
        minutes = $2
        seconds = $3
      } else {
        # Unknown layout: print nothing and let the caller decide.
        next
      }
      print ((days * 24 + hours) * 60 + minutes) * 60 + seconds
    }
  ' <<<"$1"
}
# Scan the process table for processes whose command line contains both
# "resque" and "Processing", and SIGKILL every one that has been running for
# more than $threshold seconds. Each successful kill is appended to $logfile.
# The "[r]esque" pattern keeps grep from matching its own command line.
ps -eo pid,etime,command \
  | grep "[r]esque" \
  | grep "Processing" \
  | while read -r pid uptime command; do
      # Deliberately not named SECONDS: that is a special bash variable which
      # keeps counting up after it has been assigned.
      elapsed=$(ps_etime_to_seconds "$uptime")

      # Skip rows whose elapsed time could not be converted, instead of
      # letting the numeric comparison below fail on an empty operand.
      [[ "$elapsed" =~ ^[0-9]+$ ]] || continue

      # kill -0 sends no signal; it only reports whether the process can
      # still be signalled (it may have exited since ps listed it).
      kill -0 "$pid" || continue

      [ "$elapsed" -gt "$threshold" ] || continue

      # Only record the kill when the signal was actually delivered.
      if kill -9 "$pid"; then
        # Third space-separated word of the command line, logged as the queue.
        queue=$(cut -d ' ' -f 3 <<<"$command")
        echo " The forked child with pid #$pid (queue: $queue) was found stuck for longer than $threshold seconds. RIP" >> "$logfile"
      fi
    done