#!/bin/sh
#
# Copyright (c) 2013-2017 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#

#
# Support: www.windriver.com
#
# Purpose: This resource agent manages the Titanium Cloud Controller
#          Maintenance Daemon (mtcAgent).
#
# RA Spec:
#
# http://www.opencf.org/cgi-bin/viewcvs.cgi/specs/ra/resource-agent-api.txt?rev=HEAD
#
#######################################################################
# Initialization:

# Locate and source the OCF shell function library.  The helpers and return
# codes used below that this file does not define itself (ocf_log, ocf_run,
# check_binary, OCF_SUCCESS, ...) are expected to come from it.
# Both expansions are quoted so a directory containing whitespace still works.
: "${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}"
. "${OCF_FUNCTIONS_DIR}/ocf-shellfuncs"


#######################################################################

# Resource parameter defaults.
#
# Each OCF_RESKEY_<name> keeps whatever value the caller exported; only when
# the variable is unset does it take the matching OCF_RESKEY_<name>_default.

# name of the daemon executable
OCF_RESKEY_binary_default="mtcAgent"
: "${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}}"

# daemon ini file checked by mtcAgent_validate
OCF_RESKEY_config_default="/etc/mtc.ini"
: "${OCF_RESKEY_config=${OCF_RESKEY_config_default}}"

# "true" sends the daemon's log stream to its log file
OCF_RESKEY_logging_default="true"
: "${OCF_RESKEY_logging=${OCF_RESKEY_logging_default}}"

# "true" enables extra debug logging in this agent and in the daemon
OCF_RESKEY_dbg_default="false"
: "${OCF_RESKEY_dbg=${OCF_RESKEY_dbg_default}}"

# "normal" or "passive"
OCF_RESKEY_mode_default="normal"
: "${OCF_RESKEY_mode=${OCF_RESKEY_mode_default}}"

OCF_RESKEY_user_default="admin"
: "${OCF_RESKEY_user=${OCF_RESKEY_user_default}}"

# pid file used to find the running daemon
OCF_RESKEY_pid_default="/var/run/mtcAgent.pid"
: "${OCF_RESKEY_pid=${OCF_RESKEY_pid_default}}"

# "standby" or "active"
OCF_RESKEY_state_default="standby"
: "${OCF_RESKEY_state=${OCF_RESKEY_state_default}}"

# Paths derived from the binary name: the executable itself and the file
# whose presence the status and start checks test for.
mydaemon="/usr/local/bin/${OCF_RESKEY_binary}"
statusfile="/var/run/${OCF_RESKEY_binary}.info"

#######################################################################

# Print the list of supported actions, one output line per argument, framed
# by blank lines.  $0 is the name this script was invoked as.
usage() {
    printf '%s\n' \
        "" \
        "usage: $0 (start|stop|reload|status|monitor|validate-all|meta-data)" \
        "" \
        "$0 manages the Platform's Controller Maintenance (mtcAgent) process as an HA resource" \
        "" \
        "   The 'start' .....  operation starts the maintenance service in the active state." \
        "   The 'stop' ......  operation stops the maintenance service." \
        "   The 'reload' ....  operation stops and then starts the maintenance service." \
        "   The 'status' ....  operation checks the status of the maintenance service." \
        "   The 'monitor' ...  operation indicates the in-service status of the maintenance service." \
        "   The 'validate-all' operation reports whether the parameters are valid." \
        "   The 'meta-data' .  operation reports the mtcAgent's meta-data information." \
        ""
}

#######################################################################

# Emit the resource agent's OCF meta-data XML on stdout.
#
# The parameter defaults in the XML are expanded from the
# OCF_RESKEY_*_default variables.  Returns OCF_SUCCESS.
meta_data() {
    # quoted so an empty dbg value is a plain "false" rather than a test error
    if [ "${OCF_RESKEY_dbg}" = "true" ] ; then
        ocf_log info "mtcAgent:meta_data"
    fi

    cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="mtcAgent">
<version>1.0</version>

<longdesc lang="en">
This 'mtcAgent' is an OCF Compliant Resource Agent that manages start, stop
and in-service monitoring of the Host Maintenance Process on Wind River's
Titanium Cloud in the active mode.
</longdesc>

<shortdesc lang="en">
Manages the Titanium Cloud's Maintenance (mtcAgent) Daemon.
</shortdesc>


<parameters>

<parameter name="state" unique="0" required="0">
<longdesc lang="en">
state = standby ... run maintenance daemon in 'standby' mode (default)
state = active  ... run maintenance daemon in 'active' mode
</longdesc>
<shortdesc lang="en">Maintenance Activity State Option</shortdesc>
<content type="string" default="${OCF_RESKEY_state_default}"/>
</parameter>

<parameter name="mode" unique="0" required="0">
<longdesc lang="en">
mode = normal  ... run maintenance daemon in 'normal' mode (default)
mode = passive ... run maintenance daemon in 'passive' mode
</longdesc>
<shortdesc lang="en">Maintenance Mode Option</shortdesc>
<content type="string" default="${OCF_RESKEY_mode_default}"/>
</parameter>


<parameter name="logging" unique="0" required="0">
<longdesc lang="en">
This option is used to direct the mtcAgent daemon log stream.

logging = true  ... /var/log/mtcAgent.log  (default)
logging = false ... /dev/null

See also debug option which sets the verbosity of logging.
</longdesc>
<shortdesc lang="en">Service Logging Control Option</shortdesc>
<content type="boolean" default="${OCF_RESKEY_logging_default}"/>
</parameter>


<parameter name="dbg" unique="0" required="0">
<longdesc lang="en">
dbg = false  ... info, warn and err logs sent to output stream (default) 
dbg = true   ... Additional dbg logs are also sent to the output stream
</longdesc>
<shortdesc lang="en">Service Debug Control Option</shortdesc>
<content type="boolean" default="${OCF_RESKEY_dbg_default}"/>
</parameter>

</parameters>


<actions>
<action name="start"        timeout="10s" />
<action name="stop"         timeout="10s" />
<action name="monitor"      timeout="10s" interval="300s" />
<action name="meta-data"    timeout="10s" />
<action name="validate-all" timeout="10s" />
</actions>
</resource-agent>
END
    return ${OCF_SUCCESS}
}

# Validate the agent's environment.
#
# Runs check_binary on every executable the service depends on, then verifies
# that the daemon's ini file (${OCF_RESKEY_config}) exists.
#
# Returns: OCF_SUCCESS when the ini file is present,
#          OCF_ERR_CONFIGURED when it is missing or the path is empty.
# NOTE(review): check_binary is not defined in this file; presumably it
# aborts the agent itself when a binary is missing - confirm against the
# OCF shell function library.
mtcAgent_validate() {

    if [ "${OCF_RESKEY_dbg}" = "true" ] ; then
        ocf_log info "mtcAgent:validate"
    fi

    check_binary "/usr/local/bin/${OCF_RESKEY_binary}"
    check_binary "/usr/local/bin/hbsAgent"
    check_binary "/usr/local/bin/mtcClient"
    check_binary "/usr/local/bin/hbsClient"
    check_binary sysinv-api
    check_binary pidof

    # The path must be quoted: unquoted and empty, '[ ! -f ]' evaluates to
    # false and a missing configuration would pass validation.
    if [ ! -f "${OCF_RESKEY_config}" ] ; then
        ocf_log err "${OCF_RESKEY_binary} ini file missing ${OCF_RESKEY_config}"
        return ${OCF_ERR_CONFIGURED}
    fi

    return ${OCF_SUCCESS}
}

# Log scheduler and stack details of the daemon named by the pid file.
#
# Reads the pid from ${OCF_RESKEY_pid} and, for each of /proc/<pid>/sched and
# /proc/<pid>/stack that is readable, logs every line through ocf_log at
# 'info' level.  The stack file path is also echoed to stdout.
# Does nothing when the pid file is missing or empty.
#
# Declared without the 'function' keyword because this script runs under
# #!/bin/sh, where 'function name()' is not portable.
log_procfs()
{
    local pid sched_file stack_file dump line

    pid=$(cat "${OCF_RESKEY_pid}" 2> /dev/null)
    if [ -z "${pid}" ] ; then
        return 0
    fi

    sched_file="/proc/${pid}/sched"
    if [ -r "${sched_file}" ] ; then
        # stderr is captured as well so a failed read is logged, not lost
        dump="$(cat "${sched_file}" 2>&1)"
        # -r keeps any backslashes in the data intact
        printf '%s\n' "${dump}" | while read -r line ; do
            ocf_log info "sched: ${line}"
        done
    fi

    stack_file="/proc/${pid}/stack"
    echo "stack file: ${stack_file}"
    if [ -r "${stack_file}" ] ; then
        dump="$(cat "${stack_file}" 2>&1)"
        printf '%s\n' "${dump}" | while read -r line ; do
            ocf_log info "stack: ${line}"
        done
    fi
}

# In-service status check of the daemon.
#
# Removes the status file, signals the daemon with USR1 and waits for the
# status file to reappear.
#
# Total worst case duration is 13 seconds: up to 3 one-second waits for the
# pid file plus 10 one-second waits for the response.
# This is 2 seconds under SM's default 15 second timeout.
#
# Returns: OCF_SUCCESS     status file appeared within the wait window
#          OCF_ERR_GENERIC process is alive but never produced the file
#          OCF_NOT_RUNNING no pid file, empty pid file or dead process
mtcAgent_status () {

    local proc="mtcAgent:status"
    local pid pid_stat loop log_sig

    if [ "${OCF_RESKEY_dbg}" = "true" ] ; then
        ocf_log info "${proc}"
    fi

    # remove the status file before we request a new
    rm -f "${statusfile}"

    # Give the pid file up to 3 seconds to show up.
    # (plain 'while' loops: 'for ((..))' is not available in POSIX sh)
    loop=0
    while [ ${loop} -lt 3 ] ; do
        if [ -f "${OCF_RESKEY_pid}" ] ; then
            break
        fi
        sleep 1
        loop=$((loop + 1))
    done

    # See if the daemon is running; a missing or empty pid file means no
    pid=""
    if [ -f "${OCF_RESKEY_pid}" ] ; then
        pid=$(cat "${OCF_RESKEY_pid}" 2> /dev/null)
    fi
    # ${pid} is left unquoted for kill so stray whitespace in the pid file
    # is tolerated, as before.
    if [ -z "${pid}" ] || ! kill -0 ${pid} 2> /dev/null ; then
        return ${OCF_NOT_RUNNING}
    fi

    log_sig="${OCF_RESKEY_binary} In-Service Active Monitor Test"

    # Ask the daemon to produce status
    ocf_run kill -s USR1 ${pid}

    # Wait for the response
    loop=0
    while [ ${loop} -lt 10 ] ; do
        sleep 1
        if [ -f "${statusfile}" ] ; then

            ocf_log info "${log_sig} Passed ($loop)"
            return ${OCF_SUCCESS}

        elif [ ${loop} -eq 5 ] ; then

            # send the signal again
            ocf_run kill -s USR1 ${pid}

            pid_stat=$(cat /proc/${pid}/stat)
            ocf_log notice "${log_sig} is slow to respond"
            ocf_log notice "$pid_stat"

        elif [ ${loop} -eq 8 ] ; then

            pid_stat=$(cat /proc/${pid}/stat)
            ocf_log warn "${log_sig} is very slow to respond"
            ocf_log warn "$pid_stat"

        fi
        loop=$((loop + 1))
    done

    # no response at all: record what the process is doing, then fail
    log_procfs
    ocf_log err "${log_sig} Failed"
    return ${OCF_ERR_GENERIC}
}

# Monitor action.
#
# Reports OCF_NOT_RUNNING when the pid read from ${OCF_RESKEY_pid} does not
# answer 'kill -0'; otherwise hands over to mtcAgent_status and returns
# whatever that check returns.
mtcAgent_monitor () {

    proc="mtcAgent:monitor"
    [ ${OCF_RESKEY_dbg} = "true" ] && ocf_log info "${proc}"

    # To force-pass the monitor, uncomment the next line
    # return ${OCF_SUCCESS}

    pid=$(cat ${OCF_RESKEY_pid})
    if kill -0 $pid 2> /dev/null ; then
        # process exists; run the full in-service check
        mtcAgent_status
        return $?
    fi

    [ ${OCF_RESKEY_dbg} = "true" ] && \
        ocf_log info "${proc} called while ${OCF_RESKEY_binary} not running."
    return ${OCF_NOT_RUNNING}
}


# Start action.
#
# If the daemon is already running and passes mtcAgent_status, nothing is
# started and OCF_SUCCESS is returned.  If it is running but fails that
# check it is stopped first.  The daemon is then launched with options
# derived from the state, logging, mode and dbg parameters, and verified:
# the pid file must name a live process and the status file must exist.
#
# Returns: OCF_SUCCESS        daemon running (already, or freshly started)
#          OCF_RUNNING_MASTER a previous instance could not be stopped
#          OCF_NOT_RUNNING    launch or any post-start verification failed
mtcAgent_start () {

    local rc pid loop
    local start_proc="mtcAgent:start"
    local run_opt_state run_opt_debug run_opt_mode run_opt_log

    if [ "${OCF_RESKEY_dbg}" = "true" ] ; then
        ocf_log info "${start_proc}"
    fi

    # Uncomment if you want the start function to force-pass without starting
    # return ${OCF_SUCCESS}

    # If running then issue a ping test.
    # ${pid} stays unquoted for kill so stray whitespace in the pid file is
    # tolerated; an empty pid simply counts as "not running".
    pid=$(cat "${OCF_RESKEY_pid}" 2> /dev/null)
    if [ -n "${pid}" ] && kill -0 ${pid} 2> /dev/null ; then
        mtcAgent_status
        rc=$?
        if [ ${rc} -ne ${OCF_SUCCESS} ] ; then
            ocf_log err "${start_proc} ping test failed rc=${rc}"
            mtcAgent_stop
        else
            # Spec says to return success if process is already running for start
            pid=$(cat "${OCF_RESKEY_pid}" 2> /dev/null)
            if [ -n "${pid}" ] && kill -0 ${pid} 2> /dev/null ; then
                ocf_log info "${start_proc} called while ${OCF_RESKEY_binary} is already running"
                return ${OCF_SUCCESS}
            fi
        fi
    fi

    # should not be running now or error
    pid=$(cat "${OCF_RESKEY_pid}" 2> /dev/null)
    if [ -n "${pid}" ] && kill -0 ${pid} 2> /dev/null ; then
        ocf_log err "${start_proc} cannot kill off existing instance of ${OCF_RESKEY_binary}"
        return ${OCF_RUNNING_MASTER}
    fi

    rm -f "${statusfile}"

    if [ "${OCF_RESKEY_state}" = "active" ] ; then
        run_opt_state="-a"
    else
        run_opt_state=""
    fi

    if [ "${OCF_RESKEY_dbg}" = "true" ] ; then
        run_opt_debug="-d debug"
    else
        run_opt_debug=""
    fi

    if [ "${OCF_RESKEY_mode}" = "passive" ] ; then
        run_opt_mode="-p"
    else
        run_opt_mode=""
    fi

    if [ "${OCF_RESKEY_logging}" = "true" ] ; then
        run_opt_log="-l"
    else
        run_opt_log=""
    fi

    # default PID to null
    pid=""

    # Try to Start the daemon.  The option variables are deliberately left
    # unquoted: empty ones must vanish and "-d debug" must split in two.
    "${mydaemon}" ${run_opt_state} ${run_opt_log} ${run_opt_mode} ${run_opt_debug}
    rc=$?

    # verify it was started and set return code appropriately
    if [ ${rc} -eq ${OCF_SUCCESS} ] ; then
        # Give the pid file up to 3 seconds to show up
        loop=0
        while [ ${loop} -lt 3 ] ; do
            if [ -f "${OCF_RESKEY_pid}" ] ; then
                break
            fi
            ocf_log info "${start_proc} waiting ... loop=${loop}"
            sleep 1
            loop=$((loop + 1))
        done

        pid=$(cat "${OCF_RESKEY_pid}" 2> /dev/null)
        if [ -z "${pid}" ] || ! kill -0 ${pid} 2> /dev/null ; then
            rc=${OCF_FAILED_MASTER}
        elif [ ! -f "${statusfile}" ] ; then
            # NOTE(review): this used to read 'rc = ...', which ran a command
            # named rc and left the failure unreported.  Now that it is a
            # real assignment, confirm the daemon writes the status file
            # during startup.
            ocf_log info "mtcAgent: Startup Health Test Failed - missing info"
            rc=${OCF_ERR_GENERIC}
        fi
    else
        ocf_log info "${start_proc} failed ${mydaemon} daemon rc=${rc}"
        rc=${OCF_ERR_GENERIC}
    fi

    # Record success or failure and return status
    if [ ${rc} -eq ${OCF_SUCCESS} ] ; then
        ocf_log info "${start_proc}ed pid=${pid}"
    else
        ocf_log err "${start_proc} failed rc=${rc}"
        rc=${OCF_NOT_RUNNING}
    fi
    return ${rc}
}

# Last-resort cleanup used by the stop path.
#
# Looks the daemon up by name with pidof; if anything answers 'kill -0' it
# is sent SIGKILL and one second is allowed to pass.  The pid file is
# removed in every case.
mtcAgent_confirm_stop () {

    proc="mtcAgent:confirm_stop"
    ocf_log info "${proc}"

    pid=$(pidof ${OCF_RESKEY_binary})
    if kill -0 ${pid} 2> /dev/null ; then
        # still alive: no more grace, kill it outright
        ocf_log info "${proc} 'kill -9 ${pid}'"
        kill -9 ${pid}
        ocf_log info "${proc}ed (by emergency kill -9 ${pid})"
        sleep 1
    fi

    rm -f ${OCF_RESKEY_pid}
}

# Stop action.
#
# Looks the daemon up by name (pidof).  If no process answers, only the
# cleanup in mtcAgent_confirm_stop runs.  Otherwise the pid recorded in the
# pid file is sent SIGINT up to 3 times, one second apart, until the pid
# file is empty or the recorded process is gone; mtcAgent_confirm_stop then
# finishes the job.
#
# Always returns OCF_SUCCESS.
mtcAgent_stop () {

    local proc="mtcAgent:stop"
    local pid loop
    local max=3

    # See if the process is running by name.
    # ${pid} is left unquoted for kill: pidof may report several pids.
    pid=$(pidof "${OCF_RESKEY_binary}")
    ocf_log info "${proc} PID:${pid}"
    if [ -z "${pid}" ] || ! kill -0 ${pid} 2> /dev/null ; then
        ocf_log info "${proc} called while already stopped (no process)"
        mtcAgent_confirm_stop
        return ${OCF_SUCCESS}
    fi

    # (plain 'while' loop: 'for ((..))' is not available in POSIX sh)
    loop=0
    while [ ${loop} -lt ${max} ] ; do

        # without a pid file there is nothing to signal here;
        # mtcAgent_confirm_stop below deals with the process
        if [ ! -f "${OCF_RESKEY_pid}" ] ; then
            break
        fi

        pid=$(cat "${OCF_RESKEY_pid}" 2> /dev/null)

        # an empty pid file means we are done
        if [ -z "${pid}" ] ; then
            ocf_log info "${proc}ped (by -int)"
            break
        fi

        # the recorded process is gone
        if ! kill -0 ${pid} 2> /dev/null ; then
            ocf_log info "${proc}ped (by pid)"
            break
        fi

        # still alive: ask it to exit and give it a second
        ocf_log info "${proc}ping (by -int - loop:${loop})"
        kill -s INT ${pid}
        sleep 1

        loop=$((loop + 1))
    done

    mtcAgent_confirm_stop
    return ${OCF_SUCCESS}
}

# Reload action: stop the daemon, then start it again.
#
# mtcAgent_start is only attempted when mtcAgent_stop returned OCF_SUCCESS.
# Returns the status of the last step that ran.
mtcAgent_reload () {

    local rc
    # A dedicated local name: the stop path assigns the shared 'proc'
    # variable, which would otherwise change the text logged below.
    local reload_proc="mtcAgent:reload"

    if [ "${OCF_RESKEY_dbg}" = "true" ] ; then
        ocf_log info "${reload_proc}"
    fi

    mtcAgent_stop
    rc=$?
    if [ ${rc} -eq ${OCF_SUCCESS} ] ; then
        mtcAgent_start
        rc=$?
    fi

    if [ ${rc} -eq ${OCF_SUCCESS} ] ; then
        ocf_log info "${reload_proc}ed"
    else
        ocf_log err "${OCF_RESKEY_binary}: failed to restart rc=${rc}"
    fi

    return ${rc}
}

# Action dispatch.
#
# The informational actions are answered first and exit immediately, so
# they work even when validation would fail.
if [ "${__OCF_ACTION}" = "meta-data" ] ; then
    meta_data
    exit ${OCF_SUCCESS}
elif [ "${__OCF_ACTION}" = "usage" ] || [ "${__OCF_ACTION}" = "help" ] ; then
    usage
    exit ${OCF_SUCCESS}
fi

ocf_log info "mtcAgent:${__OCF_ACTION} action"

# Every other action must pass validation; on failure the agent exits with
# the validation status.
mtcAgent_validate || exit $?

# The script's exit status is the status of the selected action.
case "${__OCF_ACTION}" in
    start)
        mtcAgent_start
        ;;
    stop)
        mtcAgent_stop
        ;;
    status)
        mtcAgent_status
        ;;
    reload)
        mtcAgent_reload
        ;;
    monitor)
        mtcAgent_monitor
        ;;
    validate-all)
        mtcAgent_validate
        ;;
    *)
        usage
        exit ${OCF_ERR_UNIMPLEMENTED}
        ;;
esac
