jobwat
6/5/2013 - 8:27 AM

dead_worker_killer.sh

#!/bin/bash

# inspired by: https://gist.github.com/vitobotta/2783513

# File that receives one line per killed worker; the path has no leading
# slash, so it is resolved against the current working directory.
logfile='log/dead_workers_killed.log'
# Maximum age, in seconds (5 minutes), before a job is considered 'stuck'
# and gets killed.
threshold='300'

# Convert an elapsed-time string as printed by `ps -o etime` into seconds.
# cheers user000001 - http://stackoverflow.com/questions/14652445/parse-ps-etime-output-into-seconds#14653443
#
# Arguments: $1 - elapsed time in the form MM:SS, HH:MM:SS or DD-HH:MM:SS
# Outputs:   the number of seconds on stdout; nothing if $1 does not have
#            two or three colon-separated fields
function ps_etime_to_seconds()
{
  printf '%s\n' "$1" | awk -F ':' '
  {
    if (NF == 2) {
      # MM:SS
      printf "%d\n", $1 * 60 + $2
    } else if (NF == 3) {
      # [DD-]HH:MM:SS. Decide on the number of pieces split() returns, not
      # on the hour value: "1-00:05:03" has a day part even though its
      # hour field is zero.
      if (split($1, a, "-") == 2) {
        days = a[1]
        hours = a[2]
      } else {
        days = 0
        hours = $1
      }
      printf "%d\n", ((days * 24 + hours) * 60 + $2) * 60 + $3
    }
  }'
}


# List every process whose ps line contains both "resque" and "Processing",
# and SIGKILL those that have been running for more than $threshold seconds,
# appending one line per kill to $logfile.
# The "[r]esque" bracket form keeps the grep process itself from matching.
ps -eo pid,etime,command | grep "[r]esque" | grep "Processing" | while read -r PID UPTIME COMMAND; do
  # Deliberately not named SECONDS: that is a special bash variable which
  # keeps counting up after it has been assigned.
  elapsed=$(ps_etime_to_seconds "$UPTIME")
  #echo "$PID, $COMMAND, $UPTIME (${elapsed}s)"

  # An elapsed time that could not be parsed is never treated as stuck.
  [[ "$elapsed" =~ ^[0-9]+$ ]] || continue

  # kill -0 sends no signal; it only checks that the process still exists
  # and that we are allowed to signal it.
  if kill -0 "$PID"; then
    if [ "$elapsed" -gt "$threshold" ]; then
      kill -9 "$PID"
      # The third space-separated word of the command line is logged as the queue.
      QUEUE=$(printf '%s\n' "$COMMAND" | cut -d ' ' -f 3)

      echo " The forked child with pid #$PID (queue: $QUEUE) was found stuck for longer than $threshold seconds. RIP" >> "$logfile"
    fi
  fi
done