#!/bin/sh
#
#
#	LNet OCF RA
#


# License:      GNU General Public License (GPL)v2
# Description:  Monitors LNet connectivity (lctl ping or ICMP ping) and
#               records the resulting score as a node attribute
# Written by:   Gabriele Paciucci
# Release Date: 01 November 2016
# Release Version: 0.99.4

# Copyright (c) 2009 Andrew Beekhof
# Copyright (c) 2016, Intel Corporation


#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like.  Any license provided herein, whether implied or
# otherwise, applies only to this software file.  Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

#######################################################################
# Initialization:

# Locate the OCF shell helper library unless the caller has already set
# OCF_FUNCTIONS (even to an empty value), then load it.
if [ -z "${OCF_FUNCTIONS+set}" ]; then
	OCF_FUNCTIONS=${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs
fi
. ${OCF_FUNCTIONS}

# The requested action is the first command-line argument unless
# __OCF_ACTION was already provided by the environment.
if [ -z "${__OCF_ACTION+set}" ]; then
	__OCF_ACTION=$1
fi

#######################################################################

# Print the OCF resource-agent metadata (XML) on stdout.
#
# The here-document is unquoted, so $HA_VARRUN and ${OCF_RESOURCE_INSTANCE}
# are expanded when the function runs.  The advertised defaults for
# attempts, multiplier and failure_score mirror the defaults this script
# applies to OCF_RESKEY_* when the parameters are not set.
meta_data() {
	cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="healthLNET">
<version>0.99.4</version>

<longdesc lang="en">
Every time the monitor action is run, this resource agent records (in the CIB)
the current number of lctl ping nodes the host can connect to.
</longdesc>
<shortdesc lang="en">LNet connectivity</shortdesc>

<parameters>

<parameter name="pidfile" unique="0">
<longdesc lang="en">PID file</longdesc>
<shortdesc lang="en">PID file</shortdesc>
<content type="string" default="$HA_VARRUN/ping-${OCF_RESOURCE_INSTANCE}" />
</parameter>

<parameter name="dampen" unique="0">
<longdesc lang="en">
The time to wait (dampening) for further changes to occur
</longdesc>
<shortdesc lang="en">Dampening interval</shortdesc>
<content type="integer" default="5s"/>
</parameter>

<parameter name="name" unique="0">
<longdesc lang="en">
The name of the attributes to set.  This is the name to be used in the constraints.
</longdesc>
<shortdesc lang="en">Attribute name</shortdesc>
<content type="string" default="pingd"/>
</parameter>

<parameter name="multiplier" unique="0">
<longdesc lang="en">
The number by which to multiply the number of connected ping nodes
</longdesc>
<shortdesc lang="en">Value multiplier</shortdesc>
<content type="integer" default="1"/>
</parameter>

<parameter name="host_list" unique="0" required="1">
<longdesc lang="en">
The list of ping nodes to count.
</longdesc>
<shortdesc lang="en">Host list</shortdesc>
<content type="string" default=""/>
</parameter>

<parameter name="attempts" unique="0">
<longdesc lang="en">
Number of ping attempts, per host, before declaring it dead
</longdesc>
<shortdesc lang="en">no. of ping attempts</shortdesc>
<content type="integer" default="3"/>
</parameter>

<parameter name="timeout" unique="0">
<longdesc lang="en">
How long, in seconds, to wait before declaring a ping lost
</longdesc>
<shortdesc lang="en">ping timeout in seconds</shortdesc>
<content type="integer" default="2"/>
</parameter>

<parameter name="lctl" unique="0">
<longdesc lang="en">
Option to enable lctl ping. The default is true
</longdesc>
<shortdesc lang="en">Use lctl ping</shortdesc>
<content type="string" default="true"/>
</parameter>

<parameter name="device" unique="0">
<longdesc lang="en">
Device used for the LNET network. We assume the same device across the cluster
</longdesc>
<shortdesc lang="en">LNET device</shortdesc>
<content type="string" default=""/>
</parameter>


<parameter name="options" unique="0">
<longdesc lang="en">
A catch all for any other options that need to be passed to ping.
</longdesc>
<shortdesc lang="en">Extra Options</shortdesc>
<content type="string" default=""/>
</parameter>

<parameter name="failure_score" unique="0">
<longdesc lang="en">
Resource is failed if the score is less than failure_score.
Default never fails.
</longdesc>
<shortdesc lang="en">failure_score</shortdesc>
<content type="integer" default="0"/>
</parameter>

<parameter name="debug" unique="0">
<longdesc lang="en">
Enables verbose attrd_updater logging on every call.
</longdesc>
<shortdesc lang="en">Verbose logging</shortdesc>
<content type="string" default="false"/>
</parameter>

</parameters>

<actions>
<action name="start"   timeout="300s" />
<action name="stop"    timeout="300s" />
<action name="reload"  timeout="300s" />
<action name="monitor" depth="0"  timeout="300s" interval="20s"/>
<action name="meta-data"  timeout="5" />
<action name="validate-all"  timeout="30" />
</actions>
</resource-agent>
END
}

#######################################################################

# Log a message through ocf_log only when debug logging is enabled.
#
# Arguments: $1 - log level handed to ocf_log
#            $2.. - message words, joined into a single string
# Globals:   OCF_RESKEY_debug (read); level (written)
#
# The operands of the test are quoted so that an empty or unset
# OCF_RESKEY_debug simply means "not enabled" instead of making '['
# print a syntax error.
ping_conditional_log() {
	level=$1; shift
	if [ "${OCF_RESKEY_debug}" = "true" ]; then
		ocf_log "$level" "$*"
	fi
}

# Print a short usage message on stdout.
#
# The action list matches what this script's dispatcher accepts
# (reload is handled; migrate_to/migrate_from are not).
ping_usage() {
	cat <<END
	usage: $0 {start|stop|reload|monitor|validate-all|meta-data}

	Expects to have a fully populated OCF RA-compliant environment set.
END
}

# Start action: load the lustre module, create the state file and push a
# first connectivity score.
#
# Globals: OCF_RESKEY_pidfile (read); rc (written)
# Returns: OCF_ERR_INSTALLED if the lustre module cannot be loaded,
#          OCF_SUCCESS if the resource is already running and healthy,
#          OCF_ERR_GENERIC if the state file cannot be created,
#          otherwise the status of ping_update.
ping_start() {
	modprobe lustre
	rc=$?
	if [ $rc -ne 0 ]; then
		ocf_log err "Unable to load the lustre kernel module: rc=$rc"
		return $OCF_ERR_INSTALLED
	fi

	# Already started and the last update succeeded: nothing to do.
	ping_monitor
	if [ $? -eq $OCF_SUCCESS ]; then
		return $OCF_SUCCESS
	fi

	# Quoted so a path containing whitespace is not split into several
	# files; without the state file monitor would report "not running".
	if ! touch "${OCF_RESKEY_pidfile}"; then
		ocf_log err "Unable to create state file ${OCF_RESKEY_pidfile}"
		return $OCF_ERR_GENERIC
	fi
	ping_update
}

# Stop action: remove the state file and delete the node attribute.
#
# Globals: OCF_RESKEY_pidfile, OCF_RESKEY_name, OCF_RESKEY_dampen,
#          attrd_options (all read)
# Returns: always OCF_SUCCESS
ping_stop() {
	# Quoted so a pidfile path containing whitespace is removed as one file.
	rm -f "${OCF_RESKEY_pidfile}"
	# attrd_options is deliberately left unquoted: it is either '-q' or
	# empty, and an empty value must not become an empty argument.
	attrd_updater -D -n "$OCF_RESKEY_name" -d "$OCF_RESKEY_dampen" $attrd_options
	return $OCF_SUCCESS
}

# Monitor action: refresh the connectivity score if the resource is started.
#
# Globals: OCF_RESKEY_pidfile (read)
# Returns: OCF_NOT_RUNNING when the state file does not exist,
#          OCF_SUCCESS when ping_update succeeds,
#          OCF_ERR_GENERIC when ping_update fails.
ping_monitor() {
    # The path is quoted: an unquoted empty value turns the test into
    # '[ -f ]', which is true, and a path with spaces is a syntax error.
    if [ ! -f "${OCF_RESKEY_pidfile}" ]; then
	return $OCF_NOT_RUNNING
    fi

    if ping_update; then
	return $OCF_SUCCESS
    fi
    return $OCF_ERR_GENERIC
}

# Validate-all action: sanity-check the configuration.
#
# Globals: OCF_RESKEY_pidfile, OCF_RESKEY_host_list, OCF_RESKEY_device,
#          OCF_RESKEY_lctl (read); state_dir (written)
# Returns: OCF_ERR_ARGS when the pidfile directory is not writable,
#          OCF_SUCCESS otherwise.
# Exits:   with OCF_ERR_CONFIGURED when host_list is empty.
ping_validate() {
	# Is the state directory writable?
	state_dir=$(dirname "$OCF_RESKEY_pidfile")
	if ! touch "$state_dir/$$"; then
		ocf_log err "Invalid location for 'pidfile': $state_dir is not writable"
		return $OCF_ERR_ARGS
	fi
	rm -f "$state_dir/$$"

	# Pidfile better be an absolute path
	case $OCF_RESKEY_pidfile in
	/*) ;;
	*) ocf_log warn "You should use an absolute path for pidfile not: $OCF_RESKEY_pidfile" ;;
	esac

	# Check the host list
	if [ -z "$OCF_RESKEY_host_list" ]; then
		ocf_log err "Empty host_list.  Please specify some nodes to ping"
		exit $OCF_ERR_CONFIGURED
	fi

	# ping_update reads /sys/class/net/<device>/{carrier,operstate}; with
	# no device those reads fail and the score is always 0.
	if [ -z "$OCF_RESKEY_device" ]; then
		ocf_log warn "No 'device' configured: the link state cannot be checked and the score will always be 0"
	fi

	# Check for the probe that will actually be used: 'lctl ping' when the
	# lctl parameter is true, plain ping otherwise.
	if [ "$OCF_RESKEY_lctl" = "true" ]; then
		check_binary lctl
	else
		check_binary ping
	fi

	return $OCF_SUCCESS
}

# Count how many hosts in OCF_RESKEY_host_list answer 'lctl ping'.
#
# Globals: OCF_RESKEY_host_list, OCF_RESKEY_timeout (read);
#          active, host, lctl_exe, lctl_out, rc (written)
# Returns: the number of reachable hosts as the exit status.  An exit
#          status is only 8 bits wide, so the value is capped at 255
#          rather than being allowed to wrap around.
lctl_check() {
	active=0
	lctl_exe="lctl ping"
	for host in $OCF_RESKEY_host_list; do
	    lctl_out=$($lctl_exe $host $OCF_RESKEY_timeout 2>&1); rc=$?

	    case $rc in
		0) active=$(expr $active + 1);;
		1) ping_conditional_log warn "$host is inactive: $lctl_out";;
		# Report the output of this command (lctl_out), not the
		# variable used by ping_check.
		*) ocf_log err "Unexpected result for '$lctl_exe $host $OCF_RESKEY_timeout' $rc: $lctl_out";;
	    esac
	done
	if [ "$active" -gt 255 ]; then
		active=255
	fi
	return $active
}



# Count how many hosts in OCF_RESKEY_host_list answer an ICMP ping.
#
# Globals: OCF_RESKEY_host_list, OCF_RESKEY_timeout, OCF_RESKEY_attempts,
#          OCF_RESKEY_options (read);
#          active, host, p_exe, p_args, p_out, rc (written)
# Returns: the number of reachable hosts as the exit status.
# Exits:   with OCF_ERR_INSTALLED when uname reports an unsupported OS.
ping_check() {
	active=0
	for host in $OCF_RESKEY_host_list; do
		# Select the ping flags that match the local platform.
		case $(uname) in
		    Linux)
			p_args="-n -q -W $OCF_RESKEY_timeout -c $OCF_RESKEY_attempts"
			;;
		    Darwin)
			p_args="-n -q -t $OCF_RESKEY_timeout -c $OCF_RESKEY_attempts -o"
			;;
		    *)
			ocf_log err "Unknown host type: $(uname)"
			exit $OCF_ERR_INSTALLED
			;;
		esac

		# A host containing a colon is probed with ping6.
		case $host in
		    *:*) p_exe=ping6 ;;
		    *)   p_exe=ping ;;
		esac

		p_out=$($p_exe $p_args $OCF_RESKEY_options $host 2>&1)
		rc=$?

		if [ $rc -eq 0 ]; then
			active=$((active + 1))
		elif [ $rc -eq 1 ]; then
			ping_conditional_log warn "$host is inactive: $p_out"
		else
			ocf_log err "Unexpected result for '$p_exe $p_args $OCF_RESKEY_options $host' $rc: $p_out"
		fi
	done
	return $active
}

# Probe the configured hosts and publish the resulting score with
# attrd_updater.
#
# The physical link is checked first through
# /sys/class/net/$OCF_RESKEY_device/{carrier,operstate}; unless carrier is
# "1" and operstate is "up" no host is probed and the score is 0.
#
# Globals: OCF_RESKEY_device, OCF_RESKEY_lctl, OCF_RESKEY_multiplier,
#          OCF_RESKEY_name, OCF_RESKEY_dampen, OCF_RESKEY_failure_score,
#          attrd_options (read);
#          CARRIER, OPERSTATE, CAR_STAT, OPER_STAT, active, score, rc (written)
# Returns: the attrd_updater status if it fails,
#          1 if the score is below failure_score,
#          0 otherwise.
ping_update() {
	CARRIER=/sys/class/net/$OCF_RESKEY_device/carrier
	OPERSTATE=/sys/class/net/$OCF_RESKEY_device/operstate

	CAR_STAT=$(cat "$CARRIER")
	OPER_STAT=$(cat "$OPERSTATE")

	# debug
	# ocf_log info "$CAR_STAT - $OPER_STAT"

	# '=' is the POSIX string comparison; '==' is not understood by every
	# /bin/sh (dash rejects it), which would make this branch always fail.
	if [ "$CAR_STAT" = "1" ] && [ "$OPER_STAT" = "up" ]; then
		if [ "${OCF_RESKEY_lctl}" = "true" ]; then
			lctl_check
		else
			ping_check
		fi
		# Both helpers report the number of reachable hosts as exit status.
		active=$?
	else
		active=0
	fi

	# debug
	# ocf_log info "$active"

	score=$(expr "$active" \* "$OCF_RESKEY_multiplier")
	# attrd_options is deliberately unquoted: when empty it must vanish.
	attrd_updater -n "$OCF_RESKEY_name" -v "$score" -d "$OCF_RESKEY_dampen" $attrd_options
	rc=$?
	if [ $rc -ne 0 ]; then
		ocf_log warn "Could not update $OCF_RESKEY_name = $score: rc=$rc"
		return $rc
	fi
	ping_conditional_log debug "Updated $OCF_RESKEY_name = $score"

	if [ "$score" -eq 0 ]; then
		 ocf_log err "LNet connection failed please check"
	fi
	# Two separate tests instead of the obsolescent, ambiguous '-a'.
	if [ -n "$OCF_RESKEY_failure_score" ] && [ "$score" -lt "$OCF_RESKEY_failure_score" ]; then
	    ocf_log warn "$OCF_RESKEY_name is less than failure_score($OCF_RESKEY_failure_score)"
	    return 1
	fi
	return 0
}

# Defaults for resource parameters that are unset or empty.
OCF_RESKEY_name=${OCF_RESKEY_name:-pingd}
OCF_RESKEY_dampen=${OCF_RESKEY_dampen:-5s}
OCF_RESKEY_attempts=${OCF_RESKEY_attempts:-3}
OCF_RESKEY_multiplier=${OCF_RESKEY_multiplier:-1}
OCF_RESKEY_debug=${OCF_RESKEY_debug:-false}
OCF_RESKEY_lctl=${OCF_RESKEY_lctl:-true}
# No default is applied to the device:
#: ${OCF_RESKEY_device:="eth1"}
OCF_RESKEY_failure_score=${OCF_RESKEY_failure_score:-0}

# Defaults for the meta attributes read further down in this script.
OCF_RESKEY_CRM_meta_timeout=${OCF_RESKEY_CRM_meta_timeout:-20000}
OCF_RESKEY_CRM_meta_globally_unique=${OCF_RESKEY_CRM_meta_globally_unique:-true}

# Normalise OCF_RESKEY_timeout to a whole number of seconds.
#
# The value may carry a unit suffix (ms/msec, m/min, h/hr); anything else
# is taken as seconds.  Only the first run of digits is used, so a value
# holding several numbers cannot hand expr a multi-line operand.
# 'grep -E' replaces the obsolescent egrep, which newer GNU grep releases
# warn about on every call.
integer=$(printf '%s\n' "${OCF_RESKEY_timeout}" | grep -E -o '[0-9]+' | head -n 1)
case ${OCF_RESKEY_timeout} in
    *[0-9]ms|*[0-9]msec) OCF_RESKEY_timeout=$(expr "$integer" / 1000);;
    *[0-9]m|*[0-9]min) OCF_RESKEY_timeout=$(expr "$integer" \* 60);;
    *[0-9]h|*[0-9]hr)  OCF_RESKEY_timeout=$(expr "$integer" \* 60 \* 60);;
    *) OCF_RESKEY_timeout=$integer;;
esac

# No usable timeout given: derive one from the operation timeout (in ms)
# shared between all hosts and attempts, or fall back to 5 seconds.
if [ -z "${OCF_RESKEY_timeout}" ]; then
	if [ -n "$OCF_RESKEY_host_list" ]; then
		host_count=$(echo $OCF_RESKEY_host_list | awk '{print NF}')
		OCF_RESKEY_timeout=$(expr "$OCF_RESKEY_CRM_meta_timeout" / "$host_count" / "$OCF_RESKEY_attempts")
		OCF_RESKEY_timeout=$(expr "$OCF_RESKEY_timeout" / 1100) # Convert to seconds and finish 10% early
	else
		OCF_RESKEY_timeout=5
	fi
fi

# Clamp the result: below 1 second becomes 5, above 1000 becomes 300.
if [ "${OCF_RESKEY_timeout}" -lt 1 ]; then
	OCF_RESKEY_timeout=5
elif [ "${OCF_RESKEY_timeout}" -gt 1000 ]; then
	# ping actually complains if this value is too high, 5 minutes is plenty
	OCF_RESKEY_timeout=300
fi

# Default state-file location: keyed on the attribute name when the resource
# is not globally unique, on the resource instance otherwise.  The test
# operand is quoted so an empty value selects the 'else' branch without
# '[' printing a syntax error.
if [ "${OCF_RESKEY_CRM_meta_globally_unique}" = "false" ]; then
	: ${OCF_RESKEY_pidfile:="$HA_VARRUN/ping-${OCF_RESKEY_name}"}
else
	: ${OCF_RESKEY_pidfile:="$HA_VARRUN/ping-${OCF_RESOURCE_INSTANCE}"}
fi

# attrd_updater is run with -q unless debug logging is requested.
attrd_options='-q'
if ocf_is_true "${OCF_RESKEY_debug}" ; then
	attrd_options=''
fi

# Check the debug option and reduce it to exactly "true" or "false",
# which is what ping_conditional_log compares against.
case "${OCF_RESKEY_debug}" in
	true|True|TRUE|1)    OCF_RESKEY_debug=true;;
	false|False|FALSE|0) OCF_RESKEY_debug=false;;
	*)
	ocf_log warn "Value for 'debug' is incorrect. Please specify 'true' or 'false' not: ${OCF_RESKEY_debug}"
	OCF_RESKEY_debug=false
	;;
esac

# Dispatch the requested OCF action.  For start, stop, monitor, reload and
# validate-all the status of the handler is the status of this statement;
# meta-data, usage/help and unknown actions exit explicitly.
case $__OCF_ACTION in
meta-data)
	meta_data
	exit $OCF_SUCCESS
	;;
start|reload)
	# reload is served by the start handler.
	ping_start
	;;
stop)
	ping_stop
	;;
monitor)
	ping_monitor
	;;
validate-all)
	ping_validate
	;;
usage|help)
	ping_usage
	exit $OCF_SUCCESS
	;;
*)
	ping_usage
	exit $OCF_ERR_UNIMPLEMENTED
	;;
esac
