Perform health check on clusterware and associated databases
#! /usr/bin/env bash
#
# comment : perform health check on clusterware and associated databases
# deploy : chmod +x /usr/local/bin/check_cluster
# Release stamp of this script, exported to the environment of child processes.
version="20170426.0930"
export version
# Print an informational line: the "[INFO]" tag, a tab, then the message.
# Arguments: $1 - message text
cif() {
  local message=$1
  printf '[INFO]\t%s\n' "${message}"
}
# Print a section title: "----", a tab and the text, all in bold blue.
# Arguments: $1 - title text
ttl() {
  local title=$1
  printf '\033[1;34m----\t%s\033[0m\n' "${title}"
}
# Print a success line: a bold green "[OK]" tag, a tab, then the plain message.
# Arguments: $1 - message text
cok() {
  local message=$1
  printf '\033[1;32m[OK]\033[0m\t%s\n' "${message}"
}
# Print a failure line: the "[KO]" tag, a tab and the message, all in bold red.
# Arguments: $1 - message text
cko() {
  local message=$1
  printf '\033[1;31m[KO]\t%s\033[0m\n' "${message}"
}
# Print a warning line: the "[WARN]" tag, a tab and the message, all in bold
# yellow.
# Arguments: $1 - message text
cwa() {
  local message=$1
  printf '\033[1;33m[WARN]\t%s\033[0m\n' "${message}"
}
# Print the last five lines of a file, each one in bold magenta.
# Arguments: $1 - path of the file to read
# Outputs:   up to five colored lines on stdout
log() {
  local line
  # Read line by line: an unquoted $(tail ...) would be split on every blank
  # and glob-expanded, printing one word (or file name) per output line.
  tail -n 5 -- "$1" | while IFS= read -r line || [[ -n "$line" ]] ; do
    printf '\033[1;35m%s\033[0m\n' "$line"
  done
}
# Queue of remediation hints collected while the checks run.
declare -a tips

# Append one hint to the queue, stored as "[target]\tcommand".
# The "\t" is kept as two literal characters here.
# Arguments: $1 - where to run the command (e.g. user@node)
#            $2 - the command or action to perform
add_tips() {
  tips+=("[$1]\t$2")
}
# Print every queued remediation tip, then terminate the script.
# Globals:   tips (read)
# Outputs:   a blank line, a title, one line per tip, a closing title and a
#            blank line on stdout
# Returns:   immediately and silently when no tip is queued; otherwise it
#            does not return (it calls exit).
show_tips() {
  [[ ${#tips[@]} -eq 0 ]] && return
  local tip
  echo
  ttl "Please apply these commands on following nodes one by one:"
  for tip in "${tips[@]}" ; do
    # %b expands the "\t" stored by add_tips while keeping the tip itself out
    # of the format string, so a "%" inside a command is printed literally.
    printf '%b\n' "$tip"
  done
  ttl "End of script"
  echo
  exit
}
# List the log files of the selected component on every cluster node.
# usage: check_cluster -l [cluster,asm,listener,scan,db,all]
# Globals:   HOME, BASE, NODES, SRVCTL, SSH (read)
# Arguments: $1 - component to list; "all" when omitted or empty
# Outputs:   for each node, a title followed by the output of `ls -1`
# Returns:   never; the script exits once every node has been listed.
show_logs() {
  # ":-" supplies the default. The former "${1:all}" was a substring
  # expansion, so calling without a type selected nothing at all.
  local selection=${1:-all}
  local patterns="" db node

  if [[ "$selection" == "cluster" || "$selection" == "all" ]] ; then
    patterns="$HOME/log/*/alert*.log $HOME/log/*/crsd/crsd.log $HOME/log/*/ohasd/ohasd.log"
  fi
  if [[ "$selection" == "scan" || "$selection" == "all" ]] ; then
    patterns+=" $HOME/log/diag/tnslsnr/*/*/trace/*.log"
  fi
  if [[ "$selection" == "asm" || "$selection" == "all" ]] ; then
    patterns+=" $BASE/diag/asm/*/*/trace/alert_*.log"
  fi
  if [[ "$selection" == "listener" || "$selection" == "all" ]] ; then
    patterns+=" $BASE/diag/tnslsnr/*/*/trace/listener.log"
  fi
  if [[ "$selection" == "db" || "$selection" == "all" ]] ; then
    # One pattern per word printed by "srvctl config".
    for db in $($SRVCTL config) ; do
      # NOTE(review): oraenv is fed the name on stdin, presumably to set
      # ORACLE_BASE for that database - confirm.
      . oraenv -s <<< "$db" > /dev/null
      patterns+=" $ORACLE_BASE/diag/rdbms/*/$db*/trace/alert_$db*.log"
    done
  fi

  for node in $NODES ; do
    ttl "Log files on $node"
    # SSH holds a command plus options and must be word-split; the patterns
    # stay quoted so the wildcards are not expanded on the local host.
    $SSH "$node" ls -1 "$patterns"
  done
  exit
}
# Print the one-line usage message on stdout, then end the script.
show_help() {
  printf '%s\n' "usage: check_cluster -l [cluster,asm,listener,scan,db,all]"
  exit
}
# set environment variables
# NOTE(review): this replaces the shell's HOME with field 2 of the /etc/oratab
# line(s) containing "+ASM" - presumably the Grid Infrastructure home; confirm.
# The pattern is quoted so the shell cannot treat "[+]ASM" as a file glob.
HOME=$(grep '[+]ASM' /etc/oratab | cut -d: -f2)
# Both values are read from crsconfig_params under that home; commas in the
# value are turned into spaces so NODES can be iterated with a plain for loop.
BASE=$(grep '^ORACLE_BASE' "$HOME/crs/install/crsconfig_params" | cut -d= -f2 | tr ',' ' ')
NODES=$(grep '^NODE_NAME_LIST' "$HOME/crs/install/crsconfig_params" | cut -d= -f2 | tr ',' ' ')
# Name of the last node on which the cluster check succeeded (none yet).
lastup=""
# set global commands
CRSCTL="$HOME/bin/crsctl"
SRVCTL="$HOME/bin/srvctl"
AVAGENT="/usr/local/avamar/ora_rac/etc/avagent.d"
# Used unquoted by callers so that it splits into the command and its options.
SSH="sudo -u grid ssh -o ConnectTimeout=1 -T"
# Command-line dispatch: "-l [type]" lists log files, "-h" prints the usage.
# Both handlers end the script themselves; any other first argument falls
# through to the health checks below.
case "$1" in
  -l)
    show_logs $2
    ;;
  -h)
    show_help
    ;;
esac
# - Cluster
# Run "crsctl check crs" on every node over SSH, report the outcome, queue a
# remediation tip for each failing node and remember the last healthy node in
# lastup (later checks are executed from that node).
ttl "1. Check cluster availibility"
for node in $NODES ; do
  # SSH holds a command plus options and must be word-split.
  if result=$($SSH "$node" "$CRSCTL" check crs) ; then
    if [[ "$result" =~ CRS-4535 ]] ; then
      cko "Cluster is not ready on $node"
      add_tips "root@anyserver" "$CRSCTL start cluster -n $node"
    elif [[ "$result" =~ CRS-4639 ]] ; then
      cko "Cluster is down on $node"
      add_tips "root@$node" "$CRSCTL start crs # (and wait few minutes)"
    else
      cok "Cluster is ready on $node"
      lastup=$node
    fi
  else
    # The remote command failed altogether: treated as an unreachable server.
    cko "Cluster is down on $node"
    add_tips "$node" "Boot the server"
  fi
done
if [[ -z "$lastup" ]] ; then
  # show_tips calls exit (status 0) as soon as a tip is queued, which used to
  # hide the failure status below; the subshell confines that exit.
  ( show_tips )
  exit 1
fi
#######################################
# Report the state of every cluster resource of one type and queue a start
# command for each node on which the resource is not running.
# Globals:   SSH, CRSCTL, lastup, NODES (read); tips (appended via add_tips)
# Arguments: $1 - resource type, e.g. "ora.asm.type"
#            $2 - optional advice printed under a warning or failure line
# Outputs:   one [OK], [WARN] or [KO] line per resource on stdout
#######################################
check_multiple_resources() {
  # Everything is local so callers' variables (node, result, ...) survive.
  local result record name state node
  local -a records fields
  local -i status nodeno

  # The remote pipeline turns each empty output line into "|" and joins all
  # lines with spaces, so every "|"-separated field is one resource record.
  result=$($SSH $lastup <<DATA
$CRSCTL status resource -w "((TYPE = $1) AND (TARGET_SERVER != ''))" | sed -e 's/^$/|/g' | tr '\n' ' '
DATA
)
  IFS='|' read -ra records <<< "$result"
  for record in "${records[@]}" ; do
    [[ "$record" == " " ]] && continue
    # Collapse whitespace runs without exposing the record to globbing.
    read -ra fields <<< "$record"
    record="${fields[*]}"
    # name: first word after the first "="; state: text after the last "=".
    name=${record#*=}
    name=${name%% *}
    state=${record##*=}

    if [[ "$state" != *ONLINE* ]] ; then
      status=2            # running nowhere
    elif [[ "$state" == *OFFLINE* ]] ; then
      status=1            # running on some nodes only
    else
      cok "$name is $state"
      continue
    fi

    # Label the n-th bare "OFFLINE" with the n-th node missing from the state
    # and queue the matching start command.
    nodeno=1
    for node in $NODES ; do
      if [[ "$state" != *"$node"* ]] ; then
        state=$(sed -e "s/OFFLINE/OFFLINE on $node/$nodeno" <<< "$state")
        add_tips "root@anyserver" "$CRSCTL start resource $name -n $node"
        nodeno=$((nodeno + 1))
      fi
    done

    if [[ $status -eq 1 ]] ; then
      cwa "$name is $state"
      [[ "$2" != "" ]] && cwa "=> $2"
    elif [[ $status -eq 2 ]] ; then
      cko "$name is $state"
      [[ "$2" != "" ]] && cko "=> $2"
    fi
  done
}
# Check the avagent.d service on every node and report the overall result.
# Does nothing when the AVAGENT file is not present on the local host.
# The result is OK only when exactly one node reports the agent as running:
# with none, a "start" tip is queued for the first node; with two or more, a
# "stop" tip is queued for the others.
# Globals: AVAGENT, NODES, SSH (read); rcode, rmess, result, rc, node (written)
check_avagent() {
  [ -f $AVAGENT ] || return

  rcode=0
  rmess="avagent.d"
  for node in $NODES ; do
    result=$($SSH $node $AVAGENT status)
    rc=$?
    case $rc in
      0) rmess+=" is ONLINE on $node" ;;
      *) rmess+=" is OFFLINE on $node" ;;
    esac
    # "! rc" is 1 for a zero exit status, so rcode counts the running agents.
    rcode=$(( rcode + ! rc ))
  done

  case $rcode in
    1)
      cok "$rmess"
      ;;
    0)
      cko "$rmess"
      add_tips "root@$(echo $NODES | cut -d' ' -f1)" "$AVAGENT start"
      ;;
    *)
      cko "$rmess"
      add_tips "root@others" "$AVAGENT stop"
      ;;
  esac
}
#######################################
# Check "oracleasm status" on every node and report the overall result.
# A node where the command fails is reported ABSENT and is not counted as a
# failure; a node whose output contains "no" is reported OFFLINE and gets two
# remediation tips. The result is OK when no node is OFFLINE, whatever the
# number of nodes (the former test only succeeded on two-node clusters).
# Globals:   NODES, SSH, PATH (read); tips (appended via add_tips)
# Arguments: $1 - optional path of the local file whose presence enables the
#                 check (default: /etc/init.d/oracleasm)
# Outputs:   one [OK] or [KO] line on stdout; nothing when the file is absent
#######################################
check_asmlib() {
  local init_script=${1:-/etc/init.d/oracleasm}
  local node result rmess="asmlib"
  local -i rc healthy=0 total=0

  [[ -f "$init_script" ]] || return

  for node in $NODES ; do
    total=$((total + 1))
    # SSH holds a command plus options and must be word-split.
    result=$($SSH "$node" "PATH=$PATH:/usr/sbin" oracleasm status)
    rc=$?
    if (( rc != 0 )) ; then
      rmess+=" is ABSENT on $node"
      healthy=$((healthy + 1))
    elif [[ "$result" == *no* ]] ; then
      rmess+=" is OFFLINE on $node"
      add_tips "root@$node" "oracleasm configure -i # (with autostart)"
      add_tips "root@$node" "/etc/init.d/oracleasm start"
    else
      rmess+=" is ONLINE on $node"
      healthy=$((healthy + 1))
    fi
  done

  # Compare against the real node count instead of a hard-coded 2.
  if (( total > 0 && healthy == total )) ; then
    cok "$rmess"
  else
    cko "$rmess"
  fi
}
# Sections 2 to 4 of the report. Each section starts with a blank line and a
# title, then runs its checks; the queued tips are printed at the very end.

# - ASM and Diskgroups
printf '\n'
ttl "2. Check ASM and diskgroups availibility"
check_asmlib
check_multiple_resources "ora.asm.type"
check_multiple_resources "ora.diskgroup.type" "Please Contact dba team."

# - VIP and SCAN
printf '\n'
ttl "3. Check VIP and SCAN addresses"
for rtype in "ora.cluster_vip_net1.type" "ora.scan_vip.type" "ora.scan_listener.type" ; do
  check_multiple_resources "$rtype"
done

# - Instances
printf '\n'
ttl "4. Check instance availibility"
ttl "Everything must be OK - Warning means Degraded Mode"
for rtype in "ora.listener.type" "ora.database.type" ; do
  check_multiple_resources "$rtype"
done
check_avagent

show_tips