myorama
1/3/2019 - 4:53 PM

check_cluster

Perform health check on clusterware and associated databases

#! /usr/bin/env bash
#
# comment : perform health check on clusterware and associated databases
# deploy  : chmod +x /usr/local/bin/check_cluster

# Release stamp of this script, exported to the environment of child processes.
version="20170426.0930"
export version

# Informational line: "[INFO]", a tab, then the message given as $1.
cif () {
  local text=$1
  printf "[INFO]\t%s\n" "$text"
}
# Section title: "----", a tab, then the message ($1), all in bold blue.
ttl () {
  local text=$1
  printf "\033[1;34m----\t%s\033[0m\n" "$text"
}
# Success line: a bold green "[OK]" tag, a tab, then the message ($1)
# in the terminal's default colour.
cok () {
  local text=$1
  printf "\033[1;32m[OK]\033[0m\t%s\n" "$text"
}
# Failure line: "[KO]", a tab, then the message ($1), all in bold red.
cko () {
  local text=$1
  printf "\033[1;31m[KO]\t%s\033[0m\n" "$text"
}
# Warning line: "[WARN]", a tab, then the message ($1), all in bold yellow.
cwa () {
  local text=$1
  printf "\033[1;33m[WARN]\t%s\033[0m\n" "$text"
}
# Print the last five lines of the file named by $1, each wrapped in bold
# magenta. Every line is emitted verbatim: the text is read line by line
# instead of being word-split and glob-expanded on the printf command line.
log () {
  local line
  tail -n 5 -- "$1" | while IFS= read -r line || [[ -n "$line" ]] ; do
    printf "\033[1;35m%s\033[0m\n" "$line"
  done
}

# Queue of remediation hints. Each entry is stored as "[target]\tcommand";
# the backslash-t stays literal here and is expanded when the hints are printed.
declare -a tips

# Append one hint to the queue: $1 is where to run it, $2 is what to run.
add_tips () {
  tips+=("[$1]\t$2")
}
# Print every queued remediation hint between two title lines, then
# terminate the script. Returns without printing anything (and without
# exiting) when the queue is empty.
# Globals: tips (read)
show_tips() {
  local tip

  [[ ${#tips[@]} -eq 0 ]] && return

  echo
  ttl "Please apply these commands on following nodes one by one:"
  for tip in "${tips[@]}" ; do
    # %b expands the "\t" stored by add_tips, while a '%' inside the hint
    # text is printed as-is instead of being parsed as a printf conversion.
    printf '%b\n' "$tip"
  done
  ttl "End of script"
  echo
  exit
}

# List the log files of one component on every cluster node, then terminate
# the script.
# usage: check_cluster -l [cluster,asm,listener,scan,db,all]
# Arguments: $1 - component to list; "all" when missing or empty
# Globals:   HOME, BASE, SRVCTL, NODES, SSH (read)
# Exits:     1 with the usage line on stderr for an unknown component
show_logs() {
  # ${1:-all} applies the default; the former ${1:all} was a substring
  # expansion at offset 0 and left the value empty when no type was given.
  local kind=${1:-all}
  local patterns=""
  local db node

  # Reject unknown types up front: an empty pattern list would otherwise
  # turn into a bare "ls -1" on the remote side.
  case "$kind" in
    cluster|asm|listener|scan|db|all) ;;
    *)
      echo "usage: check_cluster -l [cluster,asm,listener,scan,db,all]" >&2
      exit 1
      ;;
  esac

  if [[ "$kind" == "cluster" || "$kind" == "all" ]] ; then
    patterns="$HOME/log/*/alert*.log $HOME/log/*/crsd/crsd.log $HOME/log/*/ohasd/ohasd.log"
  fi
  if [[ "$kind" == "scan" || "$kind" == "all" ]] ; then
    patterns="$patterns $HOME/log/diag/tnslsnr/*/*/trace/*.log"
  fi
  if [[ "$kind" == "asm" || "$kind" == "all" ]] ; then
    patterns="$patterns $BASE/diag/asm/*/*/trace/alert_*.log"
  fi
  if [[ "$kind" == "listener" || "$kind" == "all" ]] ; then
    patterns="$patterns $BASE/diag/tnslsnr/*/*/trace/listener.log"
  fi
  if [[ "$kind" == "db" || "$kind" == "all" ]] ; then
    for db in $($SRVCTL config) ; do
      # oraenv is sourced with the database name on stdin; presumably this
      # sets ORACLE_BASE for that database - confirm against oraenv.
      . oraenv -s <<< "$db" > /dev/null
      patterns="$patterns $ORACLE_BASE/diag/rdbms/*/$db*/trace/alert_$db*.log"
    done
  fi

  for node in $NODES ; do
    ttl "Log files on $node"
    # The patterns stay quoted so they are not glob-expanded locally;
    # they are handed to the remote command line unexpanded.
    $SSH $node ls -1 "$patterns"
  done
  exit
}

# Print the command-line synopsis on stdout and terminate the script.
show_help () {
  printf '%s\n' "usage: check_cluster -l [cluster,asm,listener,scan,db,all]"
  exit
}

# set environment variables
# NOTE: HOME is overwritten with field 2 of the /etc/oratab line(s) matching
# "+ASM" (presumably the grid home - confirm); the rest of the script builds
# its tool and log paths from it.
# The grep patterns are quoted so the shell can never glob-expand them
# against files in the current directory.
HOME=$(grep '[+]ASM' /etc/oratab | cut -d: -f2)
BASE=$(grep '^ORACLE_BASE' "$HOME/crs/install/crsconfig_params" | cut -d= -f2 | tr ',' ' ')
# Comma-separated node list turned into a space-separated one for `for` loops.
NODES=$(grep '^NODE_NAME_LIST' "$HOME/crs/install/crsconfig_params" | cut -d= -f2 | tr ',' ' ')
# Last node whose clusterware answered as ready; filled in by the cluster check.
lastup=""

# set global commands
CRSCTL="$HOME/bin/crsctl"
SRVCTL="$HOME/bin/srvctl"
AVAGENT="/usr/local/avamar/ora_rac/etc/avagent.d"
# Kept as one string on purpose: it is expanded unquoted wherever a remote
# command is run, so that it splits into the command and its options.
SSH="sudo -u grid ssh -o ConnectTimeout=1 -T"

# -l [type] : list log files and exit
if [[ "$1" == "-l" ]] ; then
  show_logs "$2"
fi

# -h : print usage and exit
if [[ "$1" == "-h" ]] ; then
  show_help
fi

# - Cluster
ttl "1. Check cluster availibility"

# Run "crsctl check crs" on every node over ssh. $lastup keeps the last node
# reported as ready; the resource checks further down query through it.
for node in $NODES ; do
  if result=$($SSH $node $CRSCTL check crs) ; then
    # The command ran: classify the node from the message codes it printed.
    if [[ "$result" =~ CRS-4535 ]] ; then
      cko "Cluster is not ready on $node"
      add_tips "root@anyserver" "$CRSCTL start cluster -n $node"
    elif [[ "$result" =~ CRS-4639 ]] ; then
      cko "Cluster is down on $node"
      add_tips "root@$node" "$CRSCTL start crs # (and wait few minutes)"
    else
      cok "Cluster is ready on $node"
      lastup=$node
    fi
  else
    # Non-zero status from the ssh/crsctl command: treat the node as down.
    cko "Cluster is down on $node"
    add_tips "$node" "Boot the server"
  fi
done

# No node is ready: print the hints and stop with a failure status.
if [[ "$lastup" == "" ]] ; then
  # show_tips calls `exit` itself; run it in a subshell so that the script
  # still terminates with status 1 below instead of show_tips' status 0.
  ( show_tips )
  exit 1
fi

#######################################
# Report the state of every cluster resource of one type, as listed by
# "crsctl status resource" run over ssh on node $lastup.
#   - no ONLINE instance at all     -> KO line
#   - ONLINE plus at least one OFFLINE -> warning line
#   - otherwise                     -> OK line
# For each node whose name does not appear in the state text, the Nth
# "OFFLINE" is labelled with that node and a start command is queued.
# Globals:   SSH, CRSCTL, lastup, NODES (read)
# Arguments: $1 - resource type, e.g. ora.asm.type
#            $2 - optional advice printed under a warning/KO line
# Outputs:   one cok/cwa/cko line per resource; hints through add_tips
#######################################
function check_multiple_resources () {
  local restype=$1 advice=$2
  local raw entry name state status node nodeno
  local -a entries=() words=()

  # The command line is sent on ssh's stdin. The remote sed/tr turn the
  # blank-line-separated records into a single '|'-separated line.
  raw=$($SSH $lastup <<DATA
$CRSCTL status resource -w "((TYPE = $restype) AND (TARGET_SERVER != ''))" | sed -e 's/^$/|/g' | tr '\n' ' '
DATA
)
  IFS='|' read -ra entries <<< "$raw"

  for entry in "${entries[@]}" ; do
    # Trim and collapse blanks without exposing the text to glob expansion.
    read -r -a words <<< "$entry"
    entry="${words[*]}"
    [[ -z "$entry" ]] && continue

    # name  = first word after the first '='
    # state = everything after the last '='
    name=${entry#*=}
    name=${name%%=*}
    name=${name%% *}
    state=${entry##*=}
    status=0

    if [[ ! "$state" =~ ONLINE ]] ; then
      status=2
    elif [[ "$state" =~ OFFLINE ]] ; then
      status=1
    else
      cok "$name is $state"
      continue
    fi

    # Node names are matched as plain substrings of the state text.
    nodeno=1
    for node in $NODES ; do
      if [[ "$state" != *"$node"* ]] ; then
        # Label the nodeno-th still-anonymous "OFFLINE" with this node.
        state=$(sed -e "s/OFFLINE/OFFLINE on $node/$nodeno" <<< "$state")
        add_tips "root@anyserver" "$CRSCTL start resource $name -n $node"
        nodeno=$((nodeno + 1))
      fi
    done

    if [[ $status -eq 1 ]] ; then
      cwa "$name is $state"
      [[ -n "$advice" ]] && cwa "=> $advice"
    elif [[ $status -eq 2 ]] ; then
      cko "$name is $state"
      [[ -n "$advice" ]] && cko "=> $advice"
    fi
  done
}

#######################################
# Check the avagent.d service on every node. Does nothing when the script
# named by $AVAGENT is not present on the local host.
# The result is OK only when exactly one node reports the agent running:
#   - none running        -> KO, hint to start it on the first node
#   - two or more running -> KO, hint to stop it on the others
# Globals: AVAGENT, NODES, SSH (read)
# Outputs: one cok/cko line; hints through add_tips
#######################################
function check_avagent () {
  local node rc first_node rest
  local online=0
  local rmess="avagent.d"

  test -f "$AVAGENT" || return

  for node in $NODES ; do
    # Only the exit status matters; the agent's own stdout is discarded.
    $SSH $node $AVAGENT status > /dev/null
    rc=$?
    if [[ $rc -eq 0 ]] ; then
      rmess="$rmess is ONLINE on $node"
      online=$((online + 1))
    else
      rmess="$rmess is OFFLINE on $node"
    fi
  done

  if [[ $online -eq 1 ]] ; then
    cok "$rmess"
  elif [[ $online -eq 0 ]] ; then
    cko "$rmess"
    read -r first_node rest <<< "$NODES"
    add_tips "root@$first_node" "$AVAGENT start"
  else
    cko "$rmess"
    add_tips "root@others" "$AVAGENT stop"
  fi
}

#######################################
# Check the oracleasm (asmlib) service on every node. Does nothing when
# /etc/init.d/oracleasm is not present on the local host.
# Per node:
#   - remote command fails           -> "ABSENT", counted as acceptable
#   - output contains the text "no"  -> "OFFLINE", start hints are queued
#   - otherwise                      -> "ONLINE"
# The result is OK when every node is acceptable, whatever the cluster
# size (it used to require exactly two acceptable nodes).
# Globals: NODES, SSH, PATH (read)
# Outputs: one cok/cko line; hints through add_tips
#######################################
function check_asmlib () {
  local node result rc
  local total=0 healthy=0
  local rmess="asmlib"

  test -f /etc/init.d/oracleasm || return

  for node in $NODES ; do
    total=$((total + 1))
    # The local PATH plus /usr/sbin is passed along on the remote command line.
    result=$($SSH $node PATH=$PATH:/usr/sbin oracleasm status)
    rc=$?
    if [[ $rc -ne 0 ]] ; then
      rmess="$rmess is ABSENT on $node"
      healthy=$((healthy + 1))
    elif [[ "$result" =~ "no" ]] ; then
      rmess="$rmess is OFFLINE on $node"
      add_tips "root@$node" "oracleasm configure -i # (with autostart)"
      add_tips "root@$node" "/etc/init.d/oracleasm start"
    else
      rmess="$rmess is ONLINE on $node"
      healthy=$((healthy + 1))
    fi
  done

  # An empty node list is still reported as KO.
  if [[ $total -gt 0 && $healthy -eq $total ]] ; then
    cok "$rmess"
  else
    cko "$rmess"
  fi
}

# - ASM and Diskgroups
printf '\n'
ttl "2. Check ASM and diskgroups availibility"

check_asmlib
check_multiple_resources "ora.asm.type"
check_multiple_resources "ora.diskgroup.type" "Please Contact dba team."

# - VIP and SCAN
printf '\n'
ttl "3. Check VIP and SCAN addresses"

for rtype in "ora.cluster_vip_net1.type" "ora.scan_vip.type" "ora.scan_listener.type" ; do
  check_multiple_resources "$rtype"
done

# - Instances
printf '\n'
ttl "4. Check instance availibility"
ttl "Everything must be OK - Warning means Degraded Mode"

for rtype in "ora.listener.type" "ora.database.type" ; do
  check_multiple_resources "$rtype"
done
check_avagent

# Print the queued hints, if any; show_tips exits the script when it prints.
show_tips