#!/bin/bash

# die MESSAGE...
#
# Print the arguments, joined by single spaces, as one line on stderr and
# terminate the current shell with exit status 1.  When called from a
# background job or a subshell only that (sub)shell exits.
die() {
	# Join "$*" with a space even if a sourced file changed IFS.
	local IFS=' '
	# printf instead of echo: echo would swallow a message that starts with
	# -n, -e or -E as an option instead of printing it.
	printf '%s\n' "$*" 1>&2
	exit 1
}

# Usage: <this script> CONFIG_FILE
#
# $1 is a shell fragment that is sourced into this script.  The variables read
# further down (MGTDEV, MDTDEV, FSNAME, OSS, ...) are never assigned here, so
# they are presumably expected to come from that file.
[[ -n $1 ]] || die "Need configuration file"
# Helper library; hostlist_expand, used below, is not defined in this file and
# is presumably provided by it.
. /chexport/lustre/hostlist.sh || die "Helper functions missing"
# Quoted so a path containing whitespace or glob characters is sourced as-is.
. "$1" || die "Problem sourcing configuration file"

# XXX 1) validate that we have everything we need
# XXX 2) Check that all hosts are up
# XXX 3) modprobe lustre everywhere
# XXX 4) ensure connectivity to MGS

# Commands used throughout the script.  PDSH is a multi-word command prefix
# and is deliberately expanded unquoted at its call sites; it is followed by
# the target host(s) and then the remote command line.
PDSH='pdsh -N -w'
# mkfs tool used for external journal devices.
JMKFS='/sbin/mke2fs'
# mkfs tool used for the Lustre targets themselves.
LMKFS='/usr/sbin/mkfs.lustre'

# Dry-run aid: uncomment to print the commands instead of running them.
#PDSH="echo $PDSH"
#JMKFS="echo $JMKFS"
#LMKFS="echo $LMKFS"

# remote_format LABEL HOST DEV[:JOURNALDEV] [MKFS.LUSTRE-OPTION...]
#
# Format DEV on HOST by running $LMKFS there through $PDSH.
#
# Arguments:
#   $1  label, used as a prefix for progress/error messages and (first 11
#       characters + "-jrnl") as the label of an external journal
#   $2  host passed to $PDSH
#   $3  device to format; "DEV:JOURNALDEV" requests an external journal,
#       which is created on JOURNALDEV with $JMKFS first and then handed to
#       mkfs via "-J device=JOURNALDEV"
#   $4+ passed through verbatim to $LMKFS
# Globals:  PDSH, JMKFS, LMKFS (read)
# Outputs:  progress messages on stdout
# Returns:  0 on success; calls die (exit 1) on bad arguments or when a
#           remote command fails
remote_format() {
	local label=$1
	local host=$2
	local spec=$3
	local dev=$spec
	local mkfsopts="-q"
	#local mkfsopts="-O uninit_bg,dir_nlink,huge_file,flex_bg,large_xattr"
	local jdev=

	# An empty host or device would simply vanish from the command lines
	# below and shift every following word into its place, so refuse early.
	[[ -n $host && -n $spec ]] || die "${label}: missing host or device"
	shift 3

	# If we have an external journal, then format it first
	#
	if [[ $spec == *:* ]]; then
		jdev=${spec#*:}
		dev=${spec%:*}
		[[ -n $dev && -n $jdev ]] || die "${label}: malformed device spec '$spec'"

		echo "${label}: formatting journal dev $jdev"

		# XXX journal options? limit size?
		# The label is cut to 11 characters before "-jrnl" is appended
		# (presumably to stay within the ext volume-label length limit).
		$PDSH "$host" "$JMKFS -O journal_dev -L${label:0:11}-jrnl -F -b 4096 $jdev" ||
			die "${label}: making journal $jdev for $dev"
		mkfsopts="$mkfsopts -J device=$jdev"
	fi

	echo "$label: formatting dev $dev with $LMKFS -vvv --mkfsoptions=$mkfsopts $*"
	# $PDSH and $LMKFS stay unquoted on purpose: they may hold several words
	# (e.g. when prefixed with "echo" for a dry run).  The inner single
	# quotes keep the mkfs option string together as one word when the
	# command line is parsed again on the remote side.
	$PDSH "$host" $LMKFS -vvv --mkfsoptions="'$mkfsopts'" "$@" "$dev" || die "${label}: formatting $dev"
	return 0
}

# Get the real number of disks
#
# disk_cnt, max and i are the starting values for the discovery loop below.
# That loop is currently commented out, so nothing is probed or carved up with
# LVM; instead max is set to a fixed value right after it.  max is later used
# as the upper bound of the per-OSS device index in the OST format loop
# ("for i in $(seq $max)"), which also reuses i as its loop variable.
disk_cnt=0
max=0
i=1
#for oss in $(hostlist_expand "$OSS"); do
	# Okay find what we really have
#	luns=$($PDSH $oss "ls --sort=version /dev/mpath/*")
#	let "disk_cnt=$(get_node_count $luns)/2" # Handle fail over

	# Deal with what we want
#	var="OST${i}DEVBASE"
#	osts=$(hostlist_expand "${!var}" | sort -n)
#	let "count=$(get_node_count "$osts")"

#	[[ $count -gt $disk_cnt ]] && {
#		let "ratio=$count/$disk_cnt"

#		volslist="${!var%%]*}]"
#		rootname=$(echo ${volslist//\/dev\/} | awk ' { printf $1 }')
#		vols=$(hostlist_expand "${volslist//\/dev\/}")
#		for vol in $vols; do
#			$PDSH $oss "pvcreate -ff -y /dev/mpath/$vol"
#			$PDSH $oss "vgcreate $vol /dev/mpath/$vol"
#			vg=$($PDSH $oss "vgdisplay $vol | grep \"Free  PE\"")
#			size=$(echo $vg | awk ' { print $5 }')
#			let "size/=$ratio"
#			for j in $(seq 0 $((--ratio))); do
#				$PDSH $oss "lvcreate -l$size $vol /dev/mpath/$vol -n ${rootname%%[*}$j"
#			done;
#		done
#	}
#	((i++))
#	[[ $max -lt $count ]] && max=$count;
#done
# Hard-coded replacement for the value the disabled loop would have computed.
max=28

# Step 1: format the MGS, synchronously, before anything else is started.
# MGTDEV is split on ':' -- the first field is the host, the second field
# is the device; any further fields are dropped.
# XXX Don't reformat it if it is up and mounted
dev=${MGTDEV#*:}
dev=${dev%%:*}
host=${MGTDEV%%:*}
remote_format "$MGSNAME" "$host" "$dev" \
	--reformat \
	--mgs \
	--param sys.timeout=$OBD_TIMEOUT \
	--device-size=$((10 * 1024 * 1024))

# Format everything else in parallel
# XXX limit parallelism per host
#
# Every remote_format below runs as a background job.  The PID of each job is
# recorded in format_pids so its exit status can be collected individually at
# the end of this section.
format_pids=()

# MDTDEV is split on ':' -- first field is the host, second field the device.
host=${MDTDEV%%:*}
dev=${MDTDEV#*:}; dev=${dev%%:*}
remote_format "$FSNAME-MDT" "$host" "$dev" --reformat --mdt --mgsnode=$MGS --fsname=$FSNAME \
	--param lov.stripesize=$DEFAULT_STRIPE_SIZE --param lov.stripecount=$DEFAULT_STRIPE_COUNT \
	--param sys.timeout=$OBD_TIMEOUT &
format_pids+=($!)

# Set the scheduler to deadline to speed up file system creation
$PDSH $OSS 'for file in $(find /sys/block/sd* -name scheduler); do echo "deadline" > $file; done'

# index is the running OST number: it is used for --index and for the
# "<FSNAME>-OSTxxxx" label, and ends up in OSTCOUNT.
index=0
# MKFSOPTS is kept as a plain string and expanded unquoted below so that it
# splits into separate mkfs.lustre options.
MKFSOPTS="--reformat --ost --param sys.timeout=$OBD_TIMEOUT --mgsnode=$MGS --fsname=$FSNAME"
[ "x$OSTSIZE" != "x" ] && MKFSOPTS="$MKFSOPTS --device-size=$((OSTSIZE * 1024 * 1024))"
# Outer loop: device position i within each OSS's device list.
# Inner loop: every OSS, so OST indices are handed out round-robin over hosts.
for i in $(seq $max); do
	j=1
	for oss in $(hostlist_expand "$OSS"); do
		# j is the 1-based position of $oss in the OSS list; it selects that
		# server's OST<j>DEVBASE variable and its OSSFAILOVER entry.
		var="OST${j}DEVBASE"
		ost=$(hostlist_expand "${!var}" $i)
		failover="$(hostlist_expand "$OSSFAILOVER" $j)"
		if [ -n "$ost" ]; then
			[ -n "$failover" ] && failover="--failnode=${failover}"
			#echo "format OST$index $ost labeled as $(printf "%s-OST%04x" $FSNAME $index) on $oss"	
			remote_format "$(printf "%s-OST%04x" $FSNAME $index)" "$oss" "$ost" $MKFSOPTS $failover --index=$index &
			format_pids+=($!)
			((index++))
		fi
		# Advance for every OSS, including one that has no device at position
		# i; otherwise the next OSS would be looked up under this OSS's
		# OST<j>DEVBASE / failover slot.
		((j++))
	done
done
export OSTCOUNT=$index

# A bare "wait" always returns 0 in bash, so failed formats would go
# unnoticed.  Wait for each recorded job instead and remember any failure;
# all jobs are still waited for before giving up.
format_failed=0
for format_pid in "${format_pids[@]}"; do
	wait "$format_pid" || format_failed=1
done
[ "$format_failed" -eq 0 ] || die "A format failed"

# Hand over to the start script (judging by its name it brings the new file
# system up -- confirm).  It is sourced inside a subshell, so it can read this
# script's variables but cannot change this shell's state.  The configuration
# file path is passed on as its $1, quoted so it arrives as a single argument.
(. /chexport/lustre/new-lustre_start.sh "$1")
