szaydel
5/30/2014 - 1:33 PM

RSF-1 cluster startup script, modified to work with SMF and customized for BrickstorOS.

RSF-1 cluster startup script, modified to work with SMF and customized for BrickstorOS.

#!/bin/sh
# $Id: rsfrc,v 2.106 2014/04/16 10:40:12 pg Exp $
#
# Script:   rsfrc
#
# Description:  RSF-1 startup/shutdown script
#       Use the 'restart' option if services are already up
#       Use the 'kill' option to stop RSF-1 but leave
#       services running. WARNING: this option may cause an
#       incorrect failover if the remote end is in automatic mode!
#
# Platform: Unix
#
# Author:   High-Availability Ltd / Paul Griffiths-Todd
#
# Rewritten to support SMF, with changes specific to RackTop Systems,
# by Sam Zaydel
#
# Copyright (c) 2014 RackTop Systems.
# Trace every command as it is executed; tracing stays enabled for the
# whole script.
set -o xtrace

# NOTE(review): rsf.sh is not visible here; presumably it supplies the
# PRODUCT_*/COMPANY_* variables and the helper commands (rotatelogs, getpid,
# copyright, echorev, ...) that this script uses without defining -- confirm.
. /opt/HAC/bin/rsf.sh
# NOTE(review): presumably the source of the SMF_EXIT_* exit codes used
# throughout this script -- confirm.
. /lib/svc/share/smf_include.sh
#
# Paths and tunables used by the start/stop methods in this script.

# Marker file: while it exists a plain 'start' exits without starting the
# cluster. 'blockstart' creates it and 'forcestart' removes it.
DONOTSTARTHA="/do-not-start-ha"
export DONOTSTARTHA

SVCADM="/usr/sbin/svcadm"
RESOURCE_MONITOR_CONFIG="/opt/HAC/RSF-1/agents/etc/resource_agent.cfg"

# Marker files that select which '-b' flag is handed to rsfmon
# (cluster node dependent).
ALWAYS_BLOCK_STATE="${PRODUCT_ETC}/.force_blocked"
PRESERVE_BLOCK_STATE="${PRODUCT_ETC}/.preserve_blocked"
ALWAYS_UNBLOCK_STATE="${PRODUCT_ETC}/.force_unblocked"

# STMF RPC daemon: binary, log file and the lock file given to rsfpmon.
RPC_CLUSTER_STMF="/opt/HAC/RSF-1/bin/rpcstmfha"
RPC_CLUSTER_STMF_LOG="/opt/HAC/RSF-1/log/rpcstmf.log"
RPC_CLUSTER_STMF_LOCK="/var/run/rsfpmon_rpcstmfha"

# Cluster services RPC daemon: binary, log file and rsfpmon lock file.
RPC_CLUSTER_SERVICES="/opt/HAC/RSF-1/bin/rpchasvc"
RPC_CLUSTER_SERVICES_LOG="/opt/HAC/RSF-1/log/rpchasvc.log"
RPC_CLUSTER_SERVICES_LOCK="/var/run/rsfpmon_rpchasvc"

# FMRI for Brickstor HA Cluster
CLUSTER_FMRI=svc:/racktop/system/cluster:default

# Seconds to wait (after announcing it) before a stop/kill proceeds.
DELAY=2

# Contract runner prefix for the RPC daemons; left empty when ctrun is not
# installed so that "${CTRUN} cmd" degrades to plain "cmd".
if [ ! -x "/usr/bin/ctrun" ] ; then
    CTRUN=""
else
    CTRUN="/usr/bin/ctrun -l none"
fi

#
# Insert generic rsfmon command line flags here
PRODUCT_MONOPTS=""

# Append at most one '-b' flag, chosen by the first marker file that exists:
# force-blocked, then preserve-blocked, then force-unblocked.
# The paths are quoted on purpose: an unquoted empty variable would reduce
# the test to '[ -f ]', which is true, and silently select '-b b'.
if [ -f "${ALWAYS_BLOCK_STATE}" ] ; then
    PRODUCT_MONOPTS="${PRODUCT_MONOPTS} -b b"
elif [ -f "${PRESERVE_BLOCK_STATE}" ] ; then
    PRODUCT_MONOPTS="${PRODUCT_MONOPTS} -b p"
elif [ -f "${ALWAYS_UNBLOCK_STATE}" ] ; then
    PRODUCT_MONOPTS="${PRODUCT_MONOPTS} -b u"
fi

###################################################################
# No user modifiable parts from here on...
###################################################################
#
# The following entries are for Fedora/RedHat chkconfig tool.
#
# chkconfig: - 85 15
# description: RSF-1 from High-Availability.Com for managed application failover

#
# This must NOT contain '/' characters
DISC_HB="rsf1_hb_"
export DISC_HB

#
# cluster_service_msg MESSAGE...
#
# Print each MESSAGE on its own line on stdout, prefixed with
# "Brickstor HA Cluster [<state>]: ", where <state> is the global ${state}
# (the action this script was invoked with). With no arguments a single
# line with an empty message is printed.
#
# ${state} is passed to printf as data instead of being embedded in the
# format string, so a '%' or '\' in it cannot corrupt the output.

cluster_service_msg ()
{
    if [ $# -eq 0 ] ; then
        set -- ""
    fi
    while [ $# -gt 0 ] ; do
        printf 'Brickstor HA Cluster [%s]: %s\n' "${state}" "$1"
        shift
    done
}

#
# run_sysevent_watcher
#
# (Re)register ${ZFS_EVENT_HANDLER} as the handler for vendor SUNW, class
# EC_zfs events through ${SYS_EVENT_ADM}: the existing registration is
# removed, the handler is added again with the event attributes passed as
# name=$macro pairs, and ${SYS_EVENT_ADM} restart is run after each change.
# Does nothing unless both programs are executable.
#
# Globals: ZFS_EVENT_HANDLER, SYS_EVENT_ADM (read; not defined in this file).
# Returns: status of the final restart, or 0 when nothing was done.
run_sysevent_watcher()
{
    # Each path is quoted and tested separately: with both variables empty
    # the unquoted form '[ -x -a -x ]' is a true binary test and the
    # commands below would be run with an empty program name.
    if [ -x "${ZFS_EVENT_HANDLER}" ] && [ -x "${SYS_EVENT_ADM}" ] ; then
        cluster_service_msg "Registering ZFS sysevent watcher: ${ZFS_EVENT_HANDLER}"
        "${SYS_EVENT_ADM}" remove -v SUNW -c EC_zfs "${ZFS_EVENT_HANDLER}"
        "${SYS_EVENT_ADM}" restart
        # The $names are passed literally; they are not shell variables.
        "${SYS_EVENT_ADM}" add -v SUNW -c EC_zfs "${ZFS_EVENT_HANDLER}" \
            'pool=$pool_name' 'guid=$pool_guid' 'vendor=$vendor' \
            'class=$class' 'subclass=$subclass' 'timestamp=$timestamp' \
            'publisher=$publisher' 'sequence=$sequence'
        "${SYS_EVENT_ADM}" restart
    fi
}

#
# killrsf SIGNAL [TYPE]
#
# Send SIGNAL (a name such as TERM or KILL, without the leading '-') to the
# process whose PID is recorded in ${PRODUCT_PID}. When TYPE is "pgrp" the
# signal is sent to the process group of that PID instead; any other TYPE
# targets the single process.
#
# Globals: PRODUCT_PID (read); signal, type, rsf_pid (written).
# Returns: 1 if the PID file is missing or does not hold a usable PID,
#          otherwise the exit status of kill.
killrsf()
{
    signal=$1
    type=$2

    if [ ! -f "${PRODUCT_PID}" ]; then
        cluster_service_msg "rsfmon not running"
        return 1
    fi

    # Take the first whitespace-delimited field of the PID file. read may
    # return non-zero on a file without a trailing newline; the variable is
    # still filled in, so its status is deliberately ignored.
    rsf_pid=""
    if [ -r "${PRODUCT_PID}" ] ; then
        read -r rsf_pid killrsf_rest < "${PRODUCT_PID}"
    fi

    # Refuse anything that is not a plain PID greater than 1: an empty value
    # would run kill without a target, and 0 or 1 in "pgrp" mode would turn
    # into 'kill -SIG -0' / 'kill -SIG -1' and hit unrelated processes.
    case "${rsf_pid}" in
        ''|*[!0-9]*)
            cluster_service_msg "no valid PID in ${PRODUCT_PID}"
            return 1
            ;;
    esac
    if [ "${rsf_pid}" -le 1 ] ; then
        cluster_service_msg "refusing to signal PID ${rsf_pid} from ${PRODUCT_PID}"
        return 1
    fi

    if [ "${type}" = "pgrp" ] ; then
        kill -"${signal}" -"${rsf_pid}"
    else
        kill -"${signal}" "${rsf_pid}"
    fi
    return $?
}

#
# smf_start_service
#
# Start method for the Brickstor High-Availability service. In order:
#   1. honour the ${DONOTSTARTHA} marker (overridden by 'forcestart');
#   2. exit successfully if 'rsfcli isrunning' reports the cluster is up;
#   3. prepare the environment (PID dir cleanup, ZFS sysevent watcher,
#      temporary network interfaces) and rotate the log files;
#   4. run rsfmon, then the optional resource agent, the STMF and cluster
#      services RPC daemons, the stmfproxy monitor and the cache sync job.
#
# Globals: state, script, and the PRODUCT_*/RPC_*/PROP_* settings (read);
#          RPCPID, logfile (written).
# Exits:   ${SMF_EXIT_OK} when start-up is blocked or the cluster already
#          runs, ${SMF_EXIT_ERR_FATAL} when rsfmon fails.
# Returns: 0 once everything has been started.
smf_start_service () {
    # A plain start is suppressed while the marker file exists.
    if [ -f "${DONOTSTARTHA}" ] && [ "${state}" != "forcestart" ] ; then
        cluster_service_msg "file '${DONOTSTARTHA}' exists not performing normal start-up - exit"
        exit $SMF_EXIT_OK
    fi

    # '=' rather than '==': the latter is not valid in a POSIX '[' test.
    if [ "${state}" = "forcestart" ] ; then
        if [ -x "${SVCADM}" ] ; then
            cluster_service_msg "Reloading RPC/bind"
            "${SVCADM}" restart rpc/bind
        fi
        sleep 1
        rm -f "${DONOTSTARTHA}"
    fi

    if rsfcli isrunning ; then
        cluster_service_msg "Cluster Service is already running!"
        exit $SMF_EXIT_OK
    fi

    cleanup_rsf_pid_dir

    run_sysevent_watcher

    if [ -f "${PRODUCT_ETC}/.disable_disc_heartbeats" ] ; then
        rm -f "${PRODUCT_ETC}/.disable_disc_heartbeats" >/dev/null 2>&1
    fi

    # Creating Temporary network interfaces, based on devices configured
    # for each service.
    create_ipadm_temporary_interface

    cluster_service_msg "Starting Cluster Monitoring"
    rotatelogs "${PRODUCT_LOGDIR}" "${RSF1_LOG}"

    #
    # Rotate STMFHA logs. Files whose last extension is purely numeric
    # (name.1, name.10, ...) are already-rotated copies and are skipped;
    # every other regular file matching the prefix is rotated.
    for logfile in "${STMFHA_LOGDIR}/${STMFHA_LOG}"* ; do
        # Also skips the literal pattern left behind when nothing matches.
        [ -f "${logfile}" ] || continue
        case "${logfile##*.}" in
            ''|*[!0-9]*)
                rotatelogs "${STMFHA_LOGDIR}" "${logfile##*/}"
                ;;
        esac
    done

    #
    # Rotate SYSEVENT logs.
    rotatelogs "${PRODUCT_LOGDIR}" "${ZFS_SYSEVENT_LOG}"
    #
    # Rotate RPC logs for STMF and Cluster Services.
    rotatelogs "${PRODUCT_LOGDIR}" "${RPC_STMFHA_LOG}"
    rotatelogs "${PRODUCT_LOGDIR}" "${RPC_CLUSTERSVC_LOG}"

    #
    # Rotate FEN logs.
    rotatelogs "${PRODUCT_LOGDIR}" "${FEN_LOG}"

    # Need to do more testing to make sure contract handling is correct,
    # because in some of the earlier testing it appeared as though contract
    # was being orphaned almost immediately, which is not what should happen
    # as far as I know. For now, we won't use it here.
    # ${PRODUCT_MONOPTS} is intentionally unquoted: it holds several flags.
    "${PRODUCT_BIN}/rsfmon" ${PRODUCT_MONOPTS} -i \
        > "${COMPANY_DIR}/RSF-1/log/${RSF1_LOG}" 2>&1 \
        || exit ${SMF_EXIT_ERR_FATAL}

    # Start resource agent if configured.
    if [ -f "${RESOURCE_MONITOR_CONFIG}" ] && [ -x "${PRODUCT_BIN}/rsfagent" ] ; then
        cluster_service_msg "Starting Cluster resource agent"
        "${PRODUCT_BIN}/rsfagent" --resource-agent start
    fi

    #
    # Start the RSF-1 RPC process.
    if [ -f "${RPC_CLUSTER_STMF}" ] && [ "${PROP_COMSTAR_SUPPORT}" = "${BOOL_STR_TRUE}" ] ; then
        RPCPID=$(ps -ef | grep "${RPC_CLUSTER_STMF}" | grep -v grep | awk '{print $2}')
        if [ -z "${RPCPID}" ] ; then
            cluster_service_msg "Starting Cluster STMF RPC process."
            echo "${script} $(date): starting stmf rpc process" >> "${RPC_CLUSTER_STMF_LOG}"
            # Run under the rsfpmon process monitor when it is installed,
            # directly otherwise. ${CTRUN} is unquoted: it is either empty
            # or a command plus its options.
            if [ ! -f "${RSFPMON}" ] ; then
                ${CTRUN} "${RPC_CLUSTER_STMF}"
            else
                ${CTRUN} "${RSFPMON}" -v -w 1 -l "${RPC_CLUSTER_STMF_LOCK}" "${RPC_CLUSTER_STMF}" >> "${RPC_CLUSTER_STMF_LOG}" 2>&1 &
            fi
        else
            echo "Not starting RSF-1 RPC process, pid indicates it is already running: ${RPCPID}"
            echo "${script} $(date): not starting stmf rpc process, pid indicates it is already running: ${RPCPID}" >> "${RPC_CLUSTER_STMF_LOG}"
            ps -ef | grep "${RPCPID}" >> "${RPC_CLUSTER_STMF_LOG}"
        fi
    fi

    if [ -x "${RPC_CLUSTER_SERVICES}" ] ; then
        RPCPID=$(getpid "${RPC_CLUSTER_SERVICES}")
        if [ -z "${RPCPID}" ] ; then
            cluster_service_msg "Starting Cluster RPC services."
            echo "${script} $(date): starting cluster rpc process" >> "${RPC_CLUSTER_SERVICES_LOG}"
            if [ ! -f "${RSFPMON}" ] ; then
                ${CTRUN} "${RPC_CLUSTER_SERVICES}"
            else
                ${CTRUN} "${RSFPMON}" -v -w 1 -l "${RPC_CLUSTER_SERVICES_LOCK}" "${RPC_CLUSTER_SERVICES}" >> "${RPC_CLUSTER_SERVICES_LOG}" 2>&1 &
            fi
        else
            cluster_service_msg "Not starting Cluster RPC process, pid indicates it is already running: ${RPCPID}"
            echo "${script} $(date): not starting cluster rpc process, pid indicates it is already running: ${RPCPID}" >> "${RPC_CLUSTER_SERVICES_LOG}"
            if is_freebsd; then
                ps -x | grep "${RPCPID}" >> "${RPC_CLUSTER_SERVICES_LOG}"
            else
                ps -ef | grep "${RPCPID}" >> "${RPC_CLUSTER_SERVICES_LOG}"
            fi
        fi
    fi

    if [ -f "${STMFPROXY_CONFIG}" ] && [ -x "${PRODUCT_BIN}/stmfproxy.sh" ] ; then
        cluster_service_msg "Starting stmfproxy monitor"
        "${PRODUCT_BIN}/stmfproxy.sh" start
    fi

    if [ "${PROP_ZPOOL_SYNC_CACHE}" = "${BOOL_STR_TRUE}" ]; then
        cluster_service_msg "Starting cache file sync process..."
        "${PRODUCT_BIN}/rsf-zfs-event" subclass=RSF_START >/dev/null 2>&1 &
    fi

    cluster_service_msg "Cluster Service Started"
    return 0
}

#
# _kill_lockfile_pid LOCKFILE DESCRIPTION
#
# Helper for smf_stop_service: if LOCKFILE exists, SIGKILL the process whose
# PID it records, report it using DESCRIPTION, and remove the file.
# A lock file without a usable PID (empty, non-numeric, 0 or 1) is only
# removed; nothing is signalled and no "stopped" message is printed.
# The file is removed in every case so that a later stop cannot signal an
# unrelated process that has since been given the same PID.
#
# Globals: script (read); PID (written).
_kill_lockfile_pid () {
    if [ ! -f "$1" ] ; then
        return 0
    fi

    # First whitespace-delimited field; read's status is ignored because it
    # is non-zero for a file lacking a trailing newline.
    PID=""
    read -r PID _klp_rest < "$1"
    case "${PID}" in
        ''|*[!0-9]*) PID="" ;;
    esac

    if [ -n "${PID}" ] && [ "${PID}" -gt 1 ] ; then
        kill -9 "${PID}"
        cluster_service_msg "${script} $(date): stopped rsfpmon $2 pid ${PID}"
    fi
    rm -f "$1"
}

#
# smf_stop_service
#
# Stop method for the Brickstor High-Availability service. In order:
#   1. stop the resource agent if configured;
#   2. announce the stop, wait ${DELAY} seconds and send TERM to rsfmon;
#   3. remove the temporary network interfaces;
#   4. wait (5 s polls, 120 tries) for ${PRODUCT_PID} to disappear, sending
#      KILL and giving up if it never does;
#   5. stop the stmfproxy monitor, the legacy rpcmapmgr, and the STMF and
#      cluster services RPC daemons together with their rsfpmon monitors.
#
# Globals: script, DELAY and the PRODUCT_*/RPC_* settings (read);
#          i, RPCPID, PID (written).
# Exits:   ${SMF_EXIT_ERR_FATAL} if rsfmon could not be signalled or did
#          not terminate in time.
# Returns: 0 once everything has been stopped.
smf_stop_service () {
    # Stop resource agent if configured.
    if [ -f "${RESOURCE_MONITOR_CONFIG}" ] && [ -x "${PRODUCT_BIN}/rsfagent" ] ; then
        cluster_service_msg "Stopping Cluster resource agent"
        "${PRODUCT_BIN}/rsfagent" --resource-agent stop
    fi

    cluster_service_msg "Stopping Cluster monitoring and services in ${DELAY} seconds..."
    sleep ${DELAY}

    killrsf TERM noprg
    if [ $? -ne 0 ]; then
        cluster_service_msg "Couldn't stop Cluster (not running?)"
        if [ -f "${PRODUCT_PID}" ]; then
            rm -f "${PRODUCT_PID}"
        fi
        # At this point we remove interfaces even if stop failed
        remove_ipadm_temporary_interface
        exit $SMF_EXIT_ERR_FATAL
    fi

    remove_ipadm_temporary_interface # At this point we remove interfaces

    i=0
    while [ -f "${PRODUCT_PID}" ]   # PID file removed when rsfmon exits
    do
        sleep 5
        i=$((i + 1))
        if [ $i -ge 120 ]; then
            cluster_service_msg "Service(s) not dying, aborting"
            killrsf KILL noprg
            rm -f "${PRODUCT_PID}"
            exit $SMF_EXIT_ERR_FATAL
        fi
    done

    if [ -f "${STMFPROXY_CONFIG}" ] && [ -x "${PRODUCT_BIN}/stmfproxy.sh" ] ; then
        cluster_service_msg "Stopping stmfproxy monitor"
        "${PRODUCT_BIN}/stmfproxy.sh" stop
    fi

    #
    # For old times sake: stop the RPC daemon of older versions.
    # ${RPCPID} is left unquoted in the kill calls below because it may
    # hold more than one PID.
    RPCPID=$(ps -ef | grep rpcmapmgr | grep -v grep | awk '{print $2}')
    if [ -n "${RPCPID}" ] ; then
        kill ${RPCPID}
    fi

    #
    # Stop the STMF RPC process and process runner. The runner goes first.
    _kill_lockfile_pid "${RPC_CLUSTER_STMF_LOCK}" "stmfha monitor"

    RPCPID=$(ps -ef | grep "${RPC_CLUSTER_STMF}" | grep -v grep | awk '{print $2}')
    if [ -n "${RPCPID}" ] ; then
        kill ${RPCPID}
        cluster_service_msg "${script} $(date): stopped ${RPC_CLUSTER_STMF} pid ${RPCPID}"
    fi

    #
    # Stop the cluster services RPC process and process runner.
    _kill_lockfile_pid "${RPC_CLUSTER_SERVICES_LOCK}" "cluster services monitor"

    RPCPID=$(getpid "${RPC_CLUSTER_SERVICES}")
    if [ -n "${RPCPID}" ] ; then
        kill ${RPCPID}
        cluster_service_msg "${script} $(date): stopped ${RPC_CLUSTER_SERVICES} pid ${RPCPID}"
    fi

    cluster_service_msg "Cluster Service Stopped"

    return 0
}

# Script name (used in messages), RCS revision string and requested action.
# NOTE(review): 'rev' is not read anywhere in this file; presumably it is
# consumed by echorev, which is defined elsewhere -- confirm.
script="$(basename "$0")"
rev='$Revision: 2.106 $'
state=$1

# Dispatch on the requested action. Every branch that falls through to the
# end of the script exits with ${SMF_EXIT_OK}.
case "${state}" in

'start_msg')
    echo "Starting ${COMPANY} RSF-1"
    ;;

'stop_msg')
    echo "Stopping ${COMPANY} RSF-1"
    ;;

'blockstart')
    # Create the marker that makes a plain 'start' exit without starting.
    touch "${DONOTSTARTHA}"
    ;;

'forcestart' | 'start')
    smf_start_service
    ;;

'restart')
    # Stop only when the cluster is actually running: smf_stop_service
    # exits with a fatal status when there is nothing to stop, which would
    # otherwise prevent the start below from ever being reached.
    cluster_service_msg "Restarting Cluster Service"
    if rsfcli isrunning ; then
        smf_stop_service
    fi
    smf_start_service
    ;;

'kill')
    copyright

    cat <<"EOF"
                 _       _                  _
                | |     / /___ __________  (_)___  ____ _
                | | /| / / __ `/ ___/ __ \/ / __ \/ __ `/
                | |/ |/ / /_/ / /  / / / / / / / / /_/ /
                |__/|__/\__,_/_/  /_/ /_/_/_/ /_/\__, /
                                                /____/

        Killing rsfmon can cause a split-brain scenario to occur,
        resulting in application data corruption. Only kill rsfmon if
        you are aware of the risks and are certain you want to proceed

EOF

    echo "Do you really want to kill rsfmon (yes/no) ?"
    # NOTE(review): xor is defined outside this file; presumably it reads
    # the answer and succeeds when it is not "yes" -- confirm.
    if xor yes no yes; then
        exit 1
    fi
    cluster_service_msg "Stopping RSF-1 monitoring in ${DELAY} seconds..."
    sleep ${DELAY}
    # KILL the whole process group recorded in the PID file.
    killrsf KILL pgrp
    if [ $? -ne 0 ]; then
        if [ -f "${PRODUCT_PID}" ]; then
            rm -f "${PRODUCT_PID}"
        fi
        cluster_service_msg "Couldn't kill RSF-1 (not running?)"
        exit 1
    fi
    rm -f "${PRODUCT_PID}"

    cluster_service_msg "RSF-1 stopped"
    ;;

'stop')
    smf_stop_service
    ;;

'-v')
    echorev
    ;;

*)
    # An unknown or missing action is a usage error: report it on stderr
    # and exit non-zero instead of claiming success.
    echo "Usage: ${script} <start|forcestart|blockstart|restart|kill|stop|start_msg|stop_msg|-v>" >&2
    exit ${SMF_EXIT_ERR_CONFIG:-96}
    ;;

esac

exit $SMF_EXIT_OK