#!/bin/bash

######################################################################
# customize per survey

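# Set EITHER scsidevs OR rawdevs - exactly one of them, never both.
# An entry may be written as "hostname:device" to name a device on another
# host.
#
# The raw devices to use, e.g.:
# rawdevs=${rawdevs:-"/dev/raw/raw1"}
# The SCSI devices to measure - WARNING: will be erased.
# E.g. all devices, if you use udev:
# scsidevs=`ls /dev/sd[a-z] /dev/sd[a-z][a-z]`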

# Prefix of the result files.
# NB if it includes subdirectories, make sure the path exists on all servers
: "${rslt_loc:=/tmp}"
: "${rslt:=$rslt_loc/sgpdd_survey_$(date +%F@%R)}"

# Space-separated list of tests to run (read and/or write)
: "${actions:=write read}"

# Amount of data per device (MBytes)
# NB bigger than device cache is good
: "${size:=8192}"

# Record size range (KBytes), lowest and highest value surveyed
: "${rszlo:=1024}"
: "${rszhi:=1024}"

# Range of concurrent regions per device
: "${crglo:=1}"
: "${crghi:=256}"

# Number of blocks skipped in front of the concurrent regions of a device
: "${boundary:=1024}"

# Range of threads shared between the concurrent regions of a device;
# several threads per region simulate a deeper request queue
# NB survey skips over #thr < #regions and #thr/#regions > SG_MAX_QUEUE
: "${thrlo:=1}"
: "${thrhi:=4096}"

# NUMA support
# Optional user-provided script that returns the cpu list of the device it
# is given as argument. How to implement it depends on the type of device
# (scsi/raw, with/without multipath, technology fc/sas/ib)
# For example:
#   $ cat bin/dev2cpus
#   #!/bin/bash
#   dev=$(basename $1)
#   pci=$(readlink -f /sys/class/block/$dev | cut -d/ -f1-5)
#   cat ${pci}/local_cpulist
: "${dev2cpus:=}"

#####################################################################
# leave the rest of this alone unless you know what you're doing...

# highest number of threads a single sgp_dd instance is asked to spawn
SG_MAX_QUEUE=16

# path of the numactl command
: "${NUMACTL:=/usr/bin/numactl}"

# Print the distinct whitespace-separated words found in the arguments,
# one per line, sorted.
# Arguments: any number of words; one argument may hold several words
# Outputs:   the sorted unique words on stdout, nothing for empty input
unique () {
    # printf prints every word verbatim (echo would swallow words such as
    # "-n" as options) and awk splits on whitespace without interpreting
    # quotes or backslashes the way xargs does
    printf '%s\n' "$@" |
	awk '{ for (i = 1; i <= NF; i++) print $i }' |
	sort -u
}

# Split a "[host:]device" specification into its host and device parts.
# Arguments: $1 - device specification, optionally prefixed with "host:"
# Outputs:   "<host> <device>" on stdout; the host is "localhost" when the
#            specification contains no colon
split_hostname () {
    local name=$1
    # host is local so that a direct call cannot clobber the caller's $host
    local host=localhost

    case $name in
    *:*) host=${name%%:*}	# everything before the first colon
	 name=${name#*:}	# everything after the first colon
	 ;;
    esac
    echo "$host $name"
}

# remote shell flavour used to reach other nodes: "ssh" or "rsh"
DSH=${DSH:-"ssh"}

# Run a command on another node through $DSH.
# Globals:   DSH (read)
# Arguments: $1   - node name
#            $2   - remote user name, or "" to let the remote shell decide
#            $3.. - command to run; the words are joined with spaces
# Returns:   status of the remote shell command, or 1 when $DSH is neither
#            "ssh" nor "rsh"
dsh () {
    local node="$1"
    local user="$2"
    shift 2
    local command="$*"

    # \$PATH is escaped so that it is expanded on the remote node
    command="export PATH=/sbin:/usr/sbin:\$PATH; $command"

    case $DSH in
	ssh)
	    if [ -n "$user" ]; then
		user="$user@"
	    fi
	    "$DSH" "$user$node" "$command"
	    ;;
	rsh)
	    if [ -n "$user" ]; then
		user="-l $user"
	    fi
	    # $user stays unquoted on purpose: it must split into "-l" and
	    # the user name, or vanish when empty
	    "$DSH" $user "$node" "$command"
	    ;;
	*)
	    # previously an unknown value ran nothing and reported success
	    echo "dsh: unsupported remote shell '$DSH' (expected ssh or rsh)" >&2
	    return 1
	    ;;
    esac
}

# Run a command on a host: in the current shell when the host is this
# machine, through dsh otherwise.
# Arguments: $1   - "localhost", the local node name, or "[user@]host"
#            $2.. - command to run; the words are joined with spaces
remote_shell () {
    local host=$1
    shift
    local cmdline="$*"

    # local target: evaluate the command right here
    if [[ $host == localhost || $host == "$(uname -n)" ]]; then
	eval "$cmdline"
	return
    fi

    # remote target: separate an optional "user@" prefix from the host
    local user=""
    case $host in
    *@*) user=${host%@*}
	 host=${host#*@}
	 ;;
    esac
    dsh $host "$user" "$cmdline"
}


# exactly one of scsidevs / rawdevs has to be given:
# reject the call when both are set and when neither is
if [[ -z "$scsidevs$rawdevs" || ( -n "$scsidevs" && -n "$rawdevs" ) ]]; then
    echo "Must either specify scsidevs or rawdevs"
    exit 1
fi

# split every "hostname:device" entry into the parallel arrays
# hosts[] and devs[]; ndevs counts the entries
ndevs=0
devs=()
for d in $scsidevs $rawdevs; do
    str=($(split_hostname $d))
    hosts[ndevs]=${str[0]}
    devs[ndevs]=${str[1]}
    ((ndevs += 1))
done
unique_hosts=($(unique ${hosts[@]}))

# ask the dev2cpus script, when one is configured, for the cpu list of
# each device; the script runs on the host owning the device
devcpus=()
if [[ -n $dev2cpus ]]; then
    for ((i = 0; i < ndevs; i++)); do
	devcpus[i]=$(remote_shell ${hosts[i]} $dev2cpus ${devs[i]})
    done
fi

# map given device names into SG device names
if [ "$scsidevs" ]; then
    # make sure sg kernel module is loaded
    for host in ${unique_hosts[@]}; do
	# The grep command is handed over as ONE string: remote_shell joins
	# its arguments with spaces and has them parsed again, which would
	# strip the quotes around the pattern and with them the trailing
	# space, so that any module whose name starts with "sg" would match.
	sg_is_loaded=$(remote_shell $host "grep -q '^sg ' /proc/modules" \
		       && echo true || echo false)
	if ! $sg_is_loaded; then
	    echo "loading the sg kernel module on $host"
	    remote_shell $host modprobe sg
	    # remember the hosts on which the module was loaded by this run
	    sg_was_loaded_on="$sg_was_loaded_on $host"
	fi
    done

    for ((i=0; i < $ndevs; i++)); do
	# resolve symbolic link if any
	devs[$i]=$(remote_shell ${hosts[$i]} readlink -f ${devs[$i]})

	# retrieve associated sg device
	# we will test for a LUN, then test for a partition
	# if the partition number is > 9 this will fail
	tmp=$(remote_shell ${hosts[$i]} sg_map | \
	      awk -v dev="${devs[$i]}" '{if ($2 == dev) print $1}')
	if [ -z "$tmp" ]; then
	    echo "Can't find SG device for ${hosts[$i]}:${devs[$i]}, " \
		 "testing for partition"
	    # drop the trailing digits to get the name of the whole device
	    pt=`echo ${devs[$i]} | sed 's/[0-9]*$//'`
	    # Try again
	    tmp=$(remote_shell ${hosts[$i]} sg_map | \
		  awk -v dev="$pt" '{if ($2 == dev) print $1}')
	    if [ -z "$tmp" ]; then
		echo -e "Can't find SG device ${hosts[$i]}:$pt.\n" \
			"Do you have the sg module configured for your kernel?"
		exit 1
	    fi
	fi
	# from here on the device is addressed through its sg name
	devs[$i]=$tmp
    done
elif [ "$rawdevs" ]; then
    for ((i=0; i < $ndevs; i++)); do
	# only the exit status matters; the output is captured to keep it
	# off the terminal
	RES=$(remote_shell ${hosts[$i]} raw -q ${devs[$i]})
	if [ $? -ne 0 ];then
	    echo "Raw device ${hosts[$i]}:${devs[$i]} not set up"
	    exit 1
	fi
    done
fi

# Work out the block size of every device (this should also work for raw
# devices), falling back to 512 when it cannot be read, then check the
# survey parameters against the device.
for ((i = 0; i < ndevs; i++)); do
    # the words printed by sg_readcap are used as: tmp[0] number of blocks,
    # tmp[1] block size (in bytes)
    tmp=($(remote_shell ${hosts[i]} sg_readcap -lb ${devs[i]}))
    bs[i]=$((tmp[1]))
    if ((bs[i] == 0)); then
	echo "sg_readcap on device ${hosts[$i]}:${devs[$i]} failed, " \
	     "setting block size to 512"
	bs[i]=512
    fi
    # device size (in kbytes)
    devsize=$((tmp[0] * bs[i] / 1024))

    # the smallest record size has to be a whole number of blocks
    if ((rszlo * 1024 % bs[i] != 0)); then
	echo "Record size is not a multiple of block size (${bs[$i]} bytes) " \
	     "for device ${hosts[$i]}:${devs[$i]}"
	exit 1
    fi

    # the device has to hold at least $size MBytes
    if ((devsize < size * 1024)); then
	echo -e "device ${hosts[$i]}:${devs[$i]} not big enough: " \
		"$devsize < $((size*1024)).\nConsider reducing \$size"
	exit 1
    fi
done

# files derived from the result prefix
rsltf="${rslt}.summary"
workf="${rslt}.detail"
cmdsf="${rslt}.script"
# create the summary and detail files, or empty them if they already exist
: > $rsltf
: > $workf

# Append a piece of text to the summary file and print it on stdout.
# Globals:   rsltf (read) - path of the summary file
# Arguments: -n (optional, first) - do not end the text with a newline
#            remaining arguments  - the text, joined with single spaces
print_summary () {
    # the format is local, so no helper variable leaks into the caller
    local fmt='%s\n'
    if [ "$1" = "-n" ]; then
	fmt='%s'
	shift
    fi
    # printf prints the text verbatim even when it looks like an echo
    # option such as "-e"
    printf "$fmt" "$*" >> "$rsltf"
    printf "$fmt" "$*"
}

# Main survey: start the summary with a header line, then run every
# requested action for each combination of record size, concurrent regions
# and threads. Each of the three parameters is doubled from its low to its
# high bound.
print_summary "$(date) sgpdd-survey on $rawdevs$scsidevs from $(hostname)"

for ((rsz=$rszlo;rsz<=$rszhi;rsz*=2)); do
    for ((crg=$crglo;crg<=$crghi;crg*=2)); do 
	for ((thr=$thrlo;thr<=$thrhi;thr*=2)); do
	    # skip combinations with fewer threads than regions, or with
	    # more than SG_MAX_QUEUE threads per region
	    if ((thr < crg || thr/crg > SG_MAX_QUEUE)); then
		continue
	    fi
	    # compute total size (in kbytes)
	    # per device, the block count is rounded down to a multiple of
	    # the number of regions by the integer division /crg*crg
	    total_size=0
	    for ((i=0; i < $ndevs; i++)); do
		tsize=$((size*1024*1024/bs[$i]/crg*crg*bs[$i]/1024))
		total_size=$((total_size+tsize))
	    done
	    # show test parameters
	    # the crg and thr figures printed are totals over all devices
	    str=`printf 'dev %2d sz %8dK rsz %4dK crg %5d thr %5d ' \
			 $ndevs $total_size $rsz $((crg*ndevs)) $((thr*ndevs))`
	    echo "==============> $str" >> $workf
	    print_summary -n "$str"

	    # check memory for each host
	    for host in ${unique_hosts[@]}; do
		# number of surveyed devices that live on this host
		numdevs=0
		for ((i=0; i < $ndevs; i++)); do
		    if [ ${hosts[$i]} == $host ]; then
			numdevs=$((numdevs+1))
		    fi
		done
		# NOTE(review): despite its name, freemem holds the MemTotal
		# value of /proc/meminfo (presumably kbytes), i.e. total
		# rather than free memory
		freemem=$(remote_shell $host cat /proc/meminfo | \
			  awk '/^MemTotal:/ {printf "%d\n", $2}')
		# estimated need: record size times threads per region, plus
		# 64 (presumably kbytes of per-region overhead - TODO
		# confirm), for every region of every device on the host
		if (((rsz*thr/crg + 64)*crg*numdevs > freemem)); then
		    echo "ENOMEM on $host" >> $workf
		    print_summary "ENOMEM"
		    # give up on this combination, go on with the next
		    # thread count
		    continue 2
		fi
	    done

	    # run tests
	    for action in $actions; do
		declare -a pidarray
		print_summary -n "$action "
		echo "=====> $action" >> $workf
		# prefix of the per-region sgp_dd output files; the bare
		# name is the local file gathering all of them
		tmpf=${workf}_tmp

		# create per-host script files
		for host in ${unique_hosts[@]}; do
		    echo -n > ${cmdsf}_${host}
		done
		for ((i=0; i < $ndevs; i++)); do
		    # record size expressed in blocks, passed as bpt=
		    bpt=$((rsz*1024/bs[$i]))
		    # number of blocks in each region of this device
		    blocks=$((size*((1024*1024)/bs[$i])/crg))
		    count=$blocks
		    host=${hosts[$i]}
		    dev=${devs[$i]}
		    # reads take the device as input and position with
		    # skip=, writes take it as output and position with seek=
		    if [ $action = read ]; then
			inf="if=$dev"
			outf="of=/dev/null"
			skip=skip
		    else
			inf="if=/dev/zero"
			outf="of=$dev"
			skip=seek
		    fi
		    # prefix the command with numactl only when a cpu list
		    # is known for the device and numactl is executable here
		    if [ -n "${devcpus[$i]}" -a -x "$NUMACTL" ]; then
			numacmd="$NUMACTL --physcpubind=${devcpus[$i]} --localalloc"
		    else
			numacmd=""
		    fi
		    # one background sgp_dd per region; region j starts
		    # boundary+j*blocks blocks into the device and its
		    # stderr goes to a file of its own
		    for ((j=0;j<crg;j++)); do 
			echo >> ${cmdsf}_${host} \
				"$numacmd " \
				"sgp_dd 2> ${tmpf}_${i}_${j} $inf $outf " \
				"${skip}=$((boundary+j*blocks)) " \
				"thr=$((thr/crg)) count=$count bs=${bs[$i]} " \
				"bpt=$bpt time=1&"
		    done
		done
		# end every script with "wait" so that it only finishes
		# once all of its background sgp_dd commands have
		for host in ${unique_hosts[@]}; do
		    echo "wait" >> ${cmdsf}_${host}
		done

		# run all the per-host script files
		# each script is fed to a bash on its host in the
		# background; t0 and t1 bracket the whole parallel run
		t0=`date +%s.%N`
		pidcount=0
		for host in ${unique_hosts[@]}; do
		    remote_shell $host bash < ${cmdsf}_${host} &
		    pidarray[$pidcount]=$!
		    pidcount=$((pidcount+1))
		done
		pidcount=0
		for host in ${unique_hosts[@]}; do
		    wait ${pidarray[$pidcount]}
		    pidcount=$((pidcount+1))
		done
		t1=`date +%s.%N`

		# clean up per-host script files
		for host in ${unique_hosts[@]}; do
		    rm ${cmdsf}_${host}
		done

		# collect/check individual stats
		# ok counts the regions whose output has the "time to
		# transfer data" line and no "error"
		echo > $tmpf
		ok=0
		for ((i=0;i<ndevs;i++)); do
		    for ((j=0;j<crg;j++)); do
			# fetch the region's output into a local copy
			rtmp=${tmpf}_${i}_${j}_local
			remote_shell ${hosts[$i]} cat ${tmpf}_${i}_${j} > $rtmp
			if grep 'error' $rtmp > /dev/null 2>&1; then
			    echo "Error found in $rtmp"
			elif grep 'time to transfer data' $rtmp > /dev/null 2>&1; then
			    ok=$((ok + 1))
			fi
			cat ${rtmp} >> $tmpf
			cat ${rtmp} >> $workf
			rm  ${rtmp}
			remote_shell ${hosts[$i]} rm ${tmpf}_${i}_${j}
		    done
		done
		# report statistics only when every region succeeded,
		# otherwise report how many did not
		if ((ok != ndevs*crg)); then
		    print_summary -n "$((ndevs*crg - ok)) failed "
		else
		    # compute bandwidth in MiB/s from total data / elapsed time
		    # (total_size is in kbytes, hence the division by 1024)
		    bw=`awk "BEGIN {printf \"%7.2f \", \
				    $total_size / (( $t1 - $t0 ) * 1024); exit}"`
		    # compute global min/max stats
		    # NOTE(review): field 8 of the "time to transfer data"
		    # line is presumably sgp_dd's MB/sec figure, converted
		    # to MiB/s by the division - confirm against sgp_dd
		    minmax=`awk < $tmpf \
			'/time to transfer data/ {mb=$8/1.048576; \
						  if (n == 0 || mb < min) min = mb; \
						  if (n == 0 || mb > max) max = mb; \
						  n++} \
			END {printf "[ %7.2f, %7.2f] ",min,max;}'`
		    print_summary -n "$bw $minmax "
		fi
		rm $tmpf
	    done
	    # end the summary line of this parameter combination
	    print_summary ""
	done
    done
done

# remove the sg module again from every host listed in sg_was_loaded_on
for loaded_host in $sg_was_loaded_on; do
    printf '%s\n' "unloading sg module on $loaded_host"
    remote_shell $loaded_host rmmod sg
done
