#!/bin/bash

#
# very short example:
#
# to start collection:
#   VMSTAT_INTERVAL=0 SERVICE_INTERVAL=2 SDIO_INTERVAL=0 iokit-lstats start
#
# where value of interval means:
#   0 - gather stats at start and stop only
#   N - gather stats every N seconds
# if some XXX_INTERVAL isn't specified, the related stats won't be collected
# XXX can be: VMSTAT, SERVICE, BRW, SDIO, MBALLOC, IO, JBD, CLIENT
#
# to stop collection:
#   iokit-lstats stop
#
# to fetch collected stats:
#   iokit-lstats fetch >file
# the file will contain a tarball holding a directory with the stats;
# the directory's name consists of the hostname and the date,
# like: stats-bzzz-2007-05-13-22.52.31
#

#
# TODO
#  - close all file descriptors, otherwise sshd can't finish session
#  - for sd_iostats convert partition to whole device
#

# Configuration variables.
# TMP, PREFIX, OUTPREFIX and STIMEPREFIX keep a non-empty value inherited
# from the environment; PIDFILE and STATPIDS are always derived from PREFIX.
: "${TMP:=/tmp}"
: "${PREFIX:=${TMP}/lstats.}"
PIDFILE="${PREFIX}pid"
STATPIDS="${PREFIX}pids"
: "${OUTPREFIX:=${PREFIX}out.}"
: "${STIMEPREFIX:=${PREFIX}time.}"


# Try to become the controlling ("master") instance.
#
# A master counts as running when $PIDFILE names a pid whose command name
# (ps comm) equals that of the current shell.
#
# Globals written: OCOMM, PID, COMM, a, HAS_CONTROL (set to "yes" on success).
# Returns: 0 when control was taken, 1 when another instance holds it.
# Exits the script if the current process name cannot be determined.
ls_grab_control() {
	OCOMM=$(ps -p $$ -o comm=)
	if [ -z "$OCOMM" ]; then
		echo "Can't fetch process name"
		exit
	fi

	# check for running master first
	PID=$(cat $PIDFILE 2>/dev/null)
	[ -z "$PID" ] || COMM=$(ps -p $PID -o comm=)
	if [ -n "$PID" ] && [ "$COMM" == "$OCOMM" ]; then
		echo "Master is already running by $PID"
		return 1
	fi

	# XXX: race -- two process can do this at same time, use rename instead
	# Write our pid to a private file, rename it over the pid file, then read
	# the pid file back to see which pid ended up in it.
	echo $$ >${PIDFILE}.$$
	mv ${PIDFILE}.$$ ${PIDFILE}
	a=$(cat ${PIDFILE})
	if [ "$$" == "$a" ]; then
		HAS_CONTROL="yes"
		return 0
	fi

	echo "Some one $a won the race"
	return 1
}

# Give up the master role by deleting the pid file ($PIDFILE).
# rm -f is used, so a missing pid file is not an error.
ls_release_control() {
	rm -f ${PIDFILE}
}

# EXIT hook: release the master pid file, but only if this process actually
# took control (HAS_CONTROL is set by ls_grab_control).
ls_atexit() {
	if [ -z "$HAS_CONTROL" ]; then
		return 0
	fi
	ls_release_control
}
trap ls_atexit EXIT


# SIGUSR1 handler (installed by run_collector): raise the stop flag that the
# collector loops and idle_collector poll.
usr1signal() {
	stop_collector=1
}

# Block until stop_collector becomes "1", re-checking the flag after each
# 100-second sleep.
# NOTE(review): presumably the USR1 that ls_stop sends to the whole process
# group also ends the running sleep, so the flag is re-checked promptly -- confirm.
idle_collector() {
	until [ "$stop_collector" == "1" ]; do
		sleep 100
	done
}

#
# Run one collector function in the current shell (callers start it with &).
#
# args:
# - type: tag used in the output file name
# - collector function
# - collector arguments (handed to the collector unchanged, one word each)
#
# Side effects:
# - appends "<pid> " to $STATPIDS
# - appends the process start time (ps bsdstart) to ${STIMEPREFIX}<pid>
# - installs usr1signal as the SIGUSR1 handler
# - collector stdout and stderr both go to ${OUTPREFIX}<type>.<pid>
run_collector() {
	local pid
	local stime
	local ctype=$1
	local cfunc=$2
	shift
	shift

	# read is a builtin, so /proc/self refers to this very shell; the first
	# field of its stat file is the pid.
	read -r pid _ </proc/self/stat
	stime=$(ps -p "$pid" -o bsdstart=)
	printf '%s ' "$pid" >>"$STATPIDS"
	printf '%s' "$stime" >>"${STIMEPREFIX}${pid}"

	trap "usr1signal" SIGUSR1

	# "$@" keeps every collector argument intact (no re-splitting or globbing)
	"$cfunc" "$@" </dev/null >"${OUTPREFIX}${ctype}.${pid}" 2>&1
}

#
# vmstat collector
#
# VMSTAT_INTERVAL:
# - 0       - collect at start and stop only
# - N       - collect each N seconds
#
# Output goes to stdout; run_collector redirects it into the stats file.
vmstat_collector() {
	echo "vmstat " $(date)

	if (( VMSTAT_INTERVAL == 0 )); then
		# one timestamped snapshot now, another once idle_collector returns
		date
		vmstat
		idle_collector
		date
		vmstat
		return
	fi

	if (( VMSTAT_INTERVAL > 0 )); then
		# vmstat itself repeats the report every VMSTAT_INTERVAL seconds
		vmstat $VMSTAT_INTERVAL
		return
	fi

	echo "Invalid VMSTAT_INTERVAL=$VMSTAT_INTERVAL"
	idle_collector
}

# Start the vmstat collector in the background; does nothing when
# VMSTAT_INTERVAL is empty or unset.
vmstat_start() {
	[ -n "$VMSTAT_INTERVAL" ] || return 0

	run_collector "vmstat" vmstat_collector &
}

#
# brw_stats collector
#
# BRW_INTERVAL:
# - 0 - collect at start and stop only
# - N - collect each N seconds
#
# args:
# - name of the obdfilter device whose brw_* parameters are dumped
brw_collector() {
	local ost=$1

	echo "brw_* for $ost " $(date)

	# clear old stats
	lctl set_param -n obdfilter.${ost}.brw_*=0

	if (( BRW_INTERVAL == 0 )); then
		lctl get_param -n obdfilter.${ost}.brw_*
		idle_collector
		lctl get_param -n obdfilter.${ost}.brw_*
	elif (( BRW_INTERVAL > 0 )); then
		# periodic dump until usr1signal raises stop_collector
		until [ "$stop_collector" == "1" ]; do
			lctl get_param -n obdfilter.${ost}.brw_*
			sleep $BRW_INTERVAL
		done
	else
		echo "Invalid BRW_INTERVAL=$BRW_INTERVAL"
		idle_collector
	fi
}

# Start one background brw collector per obdfilter device reported by
# "lctl list_param obdfilter.*"; does nothing when BRW_INTERVAL is empty.
brw_start() {
	[ -n "$BRW_INTERVAL" ] || return 0

	# find all obdfilters
	for i in $(lctl list_param obdfilter.*); do
		# second dot-separated field is the device name
		filter=$(awk -F"." '{print $2}' <<<"$i")
		[ "$filter" != "num_refs" ] || continue
		run_collector "brw" brw_collector $filter &
	done
}

#
# service_stats collector
#
# SERVICE_INTERVAL:
# - 0 - collect at start and stop only
# - N - collect each N seconds
#
# args:
# - lctl parameter holding the stats
# - target name (used in the header line only)
# - service name (used in the header line only)
service_collector() {
	local param=$1
	local tgt=$2
	local svc=$3
	# rows matching this pattern ("0 samples" counters) are dropped from every dump
	local idle_rows="^[^ ]*[^0-9]*0 samples"

	echo "service stats for ${tgt}/${svc} " $(date)

	# clear old stats
	lctl set_param -n $param=0

	if (( SERVICE_INTERVAL == 0 )); then
		lctl get_param -n $param | grep -v "$idle_rows"
		idle_collector
		lctl get_param -n $param | grep -v "$idle_rows"
	elif (( SERVICE_INTERVAL > 0 )); then
		until [ "$stop_collector" == "1" ]; do
			lctl get_param -n $param | grep -v "$idle_rows"
			sleep $SERVICE_INTERVAL
		done
	else
		echo "Invalid SERVICE_INTERVAL=$SERVICE_INTERVAL"
		idle_collector
	fi
}

# Start background service-stats collectors; does nothing when
# SERVICE_INTERVAL is empty.
# - one collector per service listed under every ost.* / mdt.* target
#   (entries named "num_refs" and "uuid" are skipped), tagged "service-<srv>"
# - one collector per ldlm.services.* entry, tagged "service"
service_start() {
	[ -n "$SERVICE_INTERVAL" ] || return 0

	# find all OSTs and MDTs
	for i in $(lctl list_param ost.* mdt.*); do
		target=$(awk -F"." '{print $2}' <<<"$i")
		[ "$target" != "num_refs" ] || continue
		for j in $(lctl list_param ${i}.*); do
			srv=$(awk -F"." '{print $3}' <<<"$j")
			[ "$srv" != "uuid" ] || continue
			run_collector "service-${srv}" service_collector \
				${j}.stats $target $srv &
		done
	done

	# find all LDLM services
	for i in $(lctl list_param ldlm.services.*); do
		srv=$(awk -F"." '{print $3}' <<<"$i")
		run_collector "service" service_collector ${i}.stats "ldlm" $srv &
	done
}

#
# client_stats collector
#
# CLIENT_INTERVAL:
# - 0 - collect at start and stop only
# - N - collect each N seconds
#
# args:
# - path of the stats file
# - target name (used in the header line only)
# - stats name (used in the header line only)
client_collector() {
	local path=$1
	local tgt=$2
	local kind=$3
	# rows matching this pattern ("0 samples" counters) are dropped from every dump
	local idle_rows="^[^ ]*[^0-9]*0 samples"

	echo "client stats for ${tgt}/${kind} " $(date)

	# clear old stats
	echo 0 >$path

	if (( CLIENT_INTERVAL == 0 )); then
		grep -v "$idle_rows" $path
		idle_collector
		grep -v "$idle_rows" $path
	elif (( CLIENT_INTERVAL > 0 )); then
		until [ "$stop_collector" == "1" ]; do
			grep -v "$idle_rows" $path
			sleep $CLIENT_INTERVAL
		done
	else
		echo "Invalid CLIENT_INTERVAL=$CLIENT_INTERVAL"
		idle_collector
	fi
}

# Start background client-stats collectors; does nothing when
# CLIENT_INTERVAL is empty.
# - for every osc.* entry (except num_refs): files named stats / rpc_stats
# - for every llite.* entry: files named stats / vfs_ops_stats
#
# NOTE(review): the parameter name is turned into a path by replacing dots
# with slashes and is then globbed relative to the current directory --
# presumably the caller is expected to run from the directory that holds
# these trees; confirm.
client_start() {
	local stats

	[ -n "$CLIENT_INTERVAL" ] || return 0

	# find all osc
	for i in $(lctl list_param osc.*); do
		target=$(awk -F"." '{print $2}' <<<"$i")
		[ "$target" != "num_refs" ] || continue
		i=${i//.//}
		for j in ${i}/*; do
			stats=$(basename $j)
			case "$stats" in
			stats | rpc_stats)
				run_collector "osc-${stats}" client_collector \
					${j} $target $stats &
				;;
			esac
		done
	done

	# find all llite stats
	for i in $(lctl list_param llite.*); do
		target=$(awk -F"." '{print $2}' <<<"$i")
		i=${i//.//}
		for j in ${i}/*; do
			stats=$(basename $j)
			case "$stats" in
			stats | vfs_ops_stats)
				run_collector "llite-${stats}" client_collector \
					${j} $target ${stats} &
				;;
			esac
		done
	done
}

#
# sdio_stats collector
#
# SDIO_INTERVAL:
# - 0 - collect at start and stop only
# - N - collect each N seconds
#
# args:
# - obd: lctl parameter prefix of the device (sdio_start passes the
#        obdfilter.* / mds.* entry it found)
#
# Dumps /proc/scsi/sd_iostats/<disk>, where <disk> is the basename of the
# device's mntdev parameter.
sdio_collector() {
	local obd=$1
	local uuid tmp disk file

	# query the parameters of the device given in $1
	uuid=$(lctl get_param -n "${obd}.uuid" 2>&1)
	tmp=$(lctl get_param -n "${obd}.mntdev" 2>&1)
	disk=$(basename "$tmp")
	file="/proc/scsi/sd_iostats/${disk}"

	echo "sd_iostats for ${uuid}/${disk} " $(date)

	# clear old stats
	echo 0 >"$file"

	if (( SDIO_INTERVAL == 0 )); then
		cat "$file"
		idle_collector
		cat "$file"
	elif (( SDIO_INTERVAL > 0 )); then
		while [ "$stop_collector" != "1" ]; do
			cat "$file"
			sleep "$SDIO_INTERVAL"
		done
	else
		echo "Invalid SDIO_INTERVAL=$SDIO_INTERVAL"
		idle_collector
	fi
}

# Start one background sd_iostats collector per obdfilter.* / mds.* device
# whose mntdev parameter can be read and whose disk has an entry under
# /proc/scsi/sd_iostats; does nothing when SDIO_INTERVAL is empty.
sdio_start() {
	local disk

	[ -n "$SDIO_INTERVAL" ] || return 0

	# find all obdfilters and MDSs
	for i in $(lctl list_param obdfilter.* mds.*); do
		obd=$(awk -F"." '{print $2}' <<<"$i")
		[ "$obd" != "num_refs" ] || continue
		# skip devices whose mount device cannot be queried
		tmp=$(lctl get_param -n ${i}.mntdev 2>&1) || continue
		disk=$(basename $tmp)
		[ ! -f /proc/scsi/sd_iostats/${disk} ] && continue
		run_collector "sdio" sdio_collector ${i} &
	done
}

#
# mballoc_stats collector
#
# MBALLOC_INTERVAL:
# - 0 - collect at start and stop only
# - N - isn't implemented yet, works as with 0
#
# args:
# - obd: lctl parameter prefix of the device (mballoc_start passes the
#        obdfilter.* / mds.* entry it found)
#
# Dumps /proc/fs/ldiskfs*/<disk>/mb_history once idle_collector returns,
# where <disk> is the basename of the device's mntdev parameter.
mballoc_collector() {
	local obd=$1
	local uuid tmp disk file i

	# query the parameters of the device given in $1
	uuid=$(lctl get_param -n "${obd}.uuid" 2>&1)
	tmp=$(lctl get_param -n "${obd}.mntdev" 2>&1)
	disk=$(basename "$tmp")
	# holds a glob (ldiskfs*), so it is deliberately expanded unquoted below
	file="/proc/fs/ldiskfs*/${disk}/mb_history"

	echo "mballoc history for ${uuid}/${disk} " $(date)

	# log allocations only
	for i in $file; do
		echo 3 >"$i"
	done

	# any non-negative interval behaves like 0: wait for stop, then dump once
	if (( MBALLOC_INTERVAL >= 0 )); then
		idle_collector
		cat $file
	else
		echo "Invalid MBALLOC_INTERVAL=$MBALLOC_INTERVAL"
		idle_collector
	fi
}

# Start one background mballoc collector per obdfilter.* / mds.* device
# whose mntdev parameter can be read and whose disk has an mb_history file
# under /proc/fs/ldiskfs*; does nothing when MBALLOC_INTERVAL is empty.
mballoc_start() {
	[ -n "$MBALLOC_INTERVAL" ] || return 0

	# find all obdfilters and MDSs
	for i in $(lctl list_param obdfilter.* mds.*); do
		obd=$(awk -F"." '{print $2}' <<<"$i")
		[ "$obd" != "num_refs" ] || continue
		# skip devices whose mount device cannot be queried
		tmp=$(lctl get_param -n ${i}.mntdev 2>&1) || continue
		disk=$(basename $tmp)
		[ ! -f /proc/fs/ldiskfs*/${disk}/mb_history ] && continue
		run_collector "mballoc" mballoc_collector ${i} &
	done
}

#
# io_stats collector
#
# IO_INTERVAL:
# - 0 - collect at start and stop only
# - N - collect each N seconds
#
# args:
# - obd: lctl parameter prefix of the device (io_start passes the
#        obdfilter.* / mds.* entry it found)
#
# Dumps /sys/block/<disk>/stat, where <disk> is the basename of the
# device's mntdev parameter.
io_collector() {
	local obd=$1
	local uuid tmp disk file

	# query the parameters of the device given in $1
	uuid=$(lctl get_param -n "${obd}.uuid" 2>&1)
	tmp=$(lctl get_param -n "${obd}.mntdev" 2>&1)
	disk=$(basename "$tmp")
	file="/sys/block/${disk}/stat"

	echo "iostats for ${uuid}/${disk} " $(date)

	if (( IO_INTERVAL == 0 )); then
		cat "$file"
		idle_collector
		cat "$file"
	elif (( IO_INTERVAL > 0 )); then
		while [ "$stop_collector" != "1" ]; do
			cat "$file"
			sleep "$IO_INTERVAL"
		done
	else
		echo "Invalid IO_INTERVAL=$IO_INTERVAL"
		idle_collector
	fi
}

# Start one background io collector per obdfilter.* / mds.* device whose
# mntdev parameter can be read and whose disk has /sys/block/<disk>/stat;
# does nothing when IO_INTERVAL is empty.
io_start() {
	local i obd tmp disk

	if [ -z "$IO_INTERVAL" ]; then
		return
	fi

	# find all obdfilters and MDSs
	# (patterns are quoted so that lctl, not the shell, expands them)
	for i in $(lctl list_param 'obdfilter.*' 'mds.*'); do
		obd=$(echo "$i" | awk -F"." '{print $2}')
		if [ "$obd" == "num_refs" ]; then
			continue
		fi
		# The assignment is kept apart from "local": "local v=$(cmd)" reports
		# the status of local itself, which would hide an lctl failure.
		tmp=$(lctl get_param -n "${i}.mntdev" 2>&1) || continue
		disk=$(basename "$tmp")
		if [ ! -f "/sys/block/${disk}/stat" ]; then
			continue
		fi
		run_collector "io" io_collector "${i}" &
	done
}

#
# jbd_stats collector
#
# JBD_INTERVAL:
# - 0 - collect at start and stop only
# - N - isn't implemented yet, works as with 0
#
# args:
# - obd: lctl parameter prefix of the device (jbd_start passes the
#        obdfilter.* / mds.* entry it found)
#
# Dumps /proc/fs/jbd/<disk>/history once idle_collector returns, where
# <disk> is the basename of the device's mntdev parameter.
jbd_collector() {
	local obd=$1
	local uuid tmp disk file

	# query the parameters of the device given in $1
	uuid=$(lctl get_param -n "${obd}.uuid" 2>&1)
	tmp=$(lctl get_param -n "${obd}.mntdev" 2>&1)
	disk=$(basename "$tmp")
	file="/proc/fs/jbd/${disk}/history"

	echo "jbd history for ${uuid}/${disk} " $(date)

	# any non-negative interval behaves like 0: wait for stop, then dump once
	if (( JBD_INTERVAL >= 0 )); then
		idle_collector
		cat "$file"
	else
		echo "Invalid JBD_INTERVAL=$JBD_INTERVAL"
		idle_collector
	fi
}

# Start one background jbd collector per obdfilter.* / mds.* device whose
# mntdev parameter can be read and whose disk has /proc/fs/jbd/<disk>/history;
# does nothing when JBD_INTERVAL is empty.
jbd_start() {
	local i obd tmp disk

	if [ -z "$JBD_INTERVAL" ]; then
		return
	fi

	# find all obdfilters and MDSs
	# (patterns are quoted so that lctl, not the shell, expands them)
	for i in $(lctl list_param 'obdfilter.*' 'mds.*'); do
		obd=$(echo "$i" | awk -F"." '{print $2}')
		if [ "$obd" == "num_refs" ]; then
			continue
		fi
		# The assignment is kept apart from "local": "local v=$(cmd)" reports
		# the status of local itself, which would hide an lctl failure.
		tmp=$(lctl get_param -n "${i}.mntdev" 2>&1) || continue
		disk=$(basename "$tmp")
		if [ ! -f "/proc/fs/jbd/${disk}/history" ]; then
			continue
		fi
		run_collector "jbd" jbd_collector "${i}" &
	done
}

#
# start entry point
#
# Takes control, refuses to start while a collector recorded in $STATPIDS is
# still alive, removes the leftovers of earlier runs and launches every
# collector family.
#
# A recorded collector counts as alive when the start time saved in
# ${STIMEPREFIX}<pid> equals what ps reports for that pid now.
# Exits the script when control cannot be taken or a collector is alive.
ls_start() {
	local pids pid t_saved t_live

	if ! ls_grab_control; then
		exit
	fi

	pids=$(cat "$STATPIDS" 2>/dev/null)
	for pid in $pids; do
		# drop an optional "<prefix>:" in front of the pid
		pid=${pid#*:}
		t_saved=$(cat "${STIMEPREFIX}${pid}" 2>/dev/null)
		t_live=$(ps -p "$pid" -o bsdstart=)
		if [ -n "$t_saved" ] && [ "$t_saved" == "$t_live" ]; then
			echo "Some slave is already running by $pid"
			exit
		fi
	done

	# Clean all old stuff. The start-time files need the trailing glob too:
	# run_collector appends to ${STIMEPREFIX}<pid>, so a stale file would
	# corrupt the record of a new collector that reuses the pid.
	# ":?" aborts instead of expanding to a bare "*" if a prefix is empty.
	rm -rf "${STATPIDS:?}"* "${OUTPREFIX:?}"* "${STIMEPREFIX:?}"*

	vmstat_start
	brw_start
	service_start
	sdio_start
	mballoc_start
	io_start
	jbd_start
	client_start
}

#
# stop entry point
#
# Takes control, sends SIGUSR1 to the process group of every collector
# recorded in $STATPIDS that is still alive, waits until each of them is
# gone, then removes the pid list and the start-time files.
#
# A collector counts as alive when the start time saved in
# ${STIMEPREFIX}<pid> equals what ps reports for that pid now.
ls_stop() {
	local recorded cpid t_saved t_live
	local signalled=""

	if ! ls_grab_control; then
		exit
	fi

	recorded=$(cat $STATPIDS 2>/dev/null)

	for cpid in $recorded; do
		# drop an optional "<prefix>:" in front of the pid
		cpid=${cpid#*:}
		t_saved=$(cat ${STIMEPREFIX}$cpid 2>/dev/null)
		t_live=$(ps -p $cpid -o bsdstart=)
		if [[ -z "$t_saved" || "$t_saved" != "$t_live" ]]; then
			echo "No collector with $cpid found"
			continue
		fi
		# negative pid: signal the collector's whole process group
		/bin/kill -s USR1 -- -${cpid}
		signalled="$signalled $cpid"
	done

	# poll once a second until every signalled collector has gone
	for cpid in $signalled; do
		t_saved=$(cat ${STIMEPREFIX}$cpid 2>/dev/null)
		t_live=$(ps -p $cpid -o bsdstart=)
		while [[ -n "$t_saved" && "$t_saved" == "$t_live" ]]; do
			sleep 1
			t_live=$(ps -p $cpid -o bsdstart=)
		done
	done

	rm -f $STATPIDS ${STIMEPREFIX}*
}

#
# fetch entry point
#
# Moves every ${OUTPREFIX}* file into ${TMP}/stats-<hostname>-<date>, packs
# that directory into a gzipped tarball and writes the tarball to stdout.
# <date> is $GLOBAL_TIMESTAMP when set, otherwise the current time.
# Prints "No stats found" (and no tarball) when there is nothing to fetch.
# The staging directory and the tarball are removed afterwards.
# Messages go to stdout as well, so they end up in a redirected output file.
ls_fetch() {
	local date hostname name i
	local found=0

	if [ -z "${GLOBAL_TIMESTAMP}" ]; then
		date=$(date +%F-%H.%M.%S)
	else
		date=${GLOBAL_TIMESTAMP}
	fi

	hostname=$(hostname -s)
	name="stats-${hostname}-${date}"

	if ! mkdir "${TMP}/${name}"; then
		echo "Can't create ${TMP}/${name}"
		exit
	fi

	for i in "${OUTPREFIX}"*; do
		# an unmatched glob is left as the literal pattern: skip it
		[ -e "$i" ] || continue
		# count only files that were really moved
		mv "$i" "${TMP}/${name}/" && found=$((found + 1))
	done

	if (( found > 0 )); then
		(cd "${TMP}" && tar -zcf "./${name}.tar.gz" "./${name}")
		cat "${TMP}/${name}.tar.gz"
	else
		echo "No stats found"
	fi
	rm -rf "${TMP}/${name}"*
}

#
# abort entry point
#
# Meant to kill all running collections; for now it only reports that the
# command is not implemented.
#
ls_abort() {
	printf '%s\n' "Abort isn't implemented yet"
}

#########
#  main
#########

# Job control (monitor mode) is required to put all background processes
# into different process groups, so that a whole group can be managed by
# sending it a single signal.
set -o monitor

# dispatch on the first command-line argument
case "$1" in
start)
	ls_start
	;;
stop)
	ls_stop
	;;
fetch)
	ls_fetch
	;;
abort)
	ls_abort
	;;
*)
	echo "Unknown command"
	;;
esac

