#!/usr/bin/env bash
# reproduction.sh — reproducer for ll_statfs_project divide-by-zero
# Prerequisites: ldev.conf configured, SSH key auth set up, client mounted
set -euo pipefail

# ── Configuration (every value below is a blank placeholder — fill in before running) ──
FS_NAME=' '         # Lustre fsname, passed as `-F` to the `lctl snapshot_*` commands
SNAP_NAME=' '       # snapshot name, passed as `-n` to the `lctl snapshot_*` commands
MGS_NID=' '         # MGS NID; the snapshot client mounts "<MGS_NID>:/<snapshot fsname>"
MNT_CLI_PROD=' '    # production client mount point (target of `lfs setquota`)
MNT_CLI_SNAP=' '    # mount point for the read-only snapshot client
PROJ_ID=' '         # project ID for `lfs project -sp` and `lfs setquota -p`
PROJ_DIR=' '        # directory tagged with PROJ_ID; expected to live under MNT_CLI_PROD
PROJ_BLKLIMIT=' '   # value for `lfs setquota --block-hardlimit` (must be > 0)

# ── [1] Set up project quota (block-hardlimit must be > 0 to enter ll_statfs_project) ──
# Preflight: the configuration values ship as blank placeholders. Refuse to run
# while any of them is still blank, before anything is created or modified —
# otherwise e.g. `mkdir -p " "` would succeed and create a directory named " ".
for cfg_var in FS_NAME SNAP_NAME MGS_NID MNT_CLI_PROD MNT_CLI_SNAP \
               PROJ_ID PROJ_DIR PROJ_BLKLIMIT; do
    cfg_val=${!cfg_var:-}
    if [[ -z "${cfg_val//[[:space:]]/}" ]]; then
        echo "ERROR: ${cfg_var} is blank; set it in the environment variables section of this script" >&2
        exit 1
    fi
done

# Create the project directory, tag it with the project ID, then set the
# block hard limit for that project on the production mount.
mkdir -p "$PROJ_DIR"
lfs project -sp "$PROJ_ID" "$PROJ_DIR"
lfs setquota -p "$PROJ_ID" --block-hardlimit "$PROJ_BLKLIMIT" "$MNT_CLI_PROD"

# Create a dummy file so statfs has meaningful usage to report.
# dd's normal transfer statistics go to stderr, so stderr is captured and only
# shown on failure; previously a failing dd aborted the script with no output.
if [[ ! -f "${PROJ_DIR}/dummy" ]]; then
    if ! dd_err=$(dd if=/dev/zero of="${PROJ_DIR}/dummy" bs=1M count=1 2>&1 >/dev/null); then
        printf '%s\n' "$dd_err" >&2
        echo "ERROR: failed to create ${PROJ_DIR}/dummy" >&2
        exit 1
    fi
fi

# ── [2] Take the Lustre snapshot (write barrier ON → consistent across all MDT/OST) ──
# Arguments are collected in an array first so each one stays a separate,
# correctly quoted word when handed to lctl.
snap_create_args=(-F "$FS_NAME" -n "$SNAP_NAME" -b on)
lctl snapshot_create "${snap_create_args[@]}"

# ── [3] Mount snapshot targets and capture fsname from output ────────────────
# snapshot_mount prints "mounted the snapshot <name> with fsname <fsname>".
# Parse fsname directly from its output — avoids lctl snapshot_list -d which
# returns non-zero in some environments, killing the script via set -euo pipefail.

# The command runs inside an `if` so that, on failure, the captured output
# (stdout + stderr) is printed before exiting. As a bare assignment, set -e
# would abort right here and the captured error text would never be shown.
if ! SNAP_MOUNT_OUT=$(lctl snapshot_mount -F "$FS_NAME" -n "$SNAP_NAME" 2>&1); then
    printf '%s\n' "$SNAP_MOUNT_OUT" >&2
    echo "ERROR: lctl snapshot_mount failed for snapshot ${SNAP_NAME}" >&2
    exit 1
fi
echo "$SNAP_MOUNT_OUT"

# Last field of every line mentioning "fsname".
SNAP_FSNAME=$(awk '/fsname/{print $NF}' <<<"$SNAP_MOUNT_OUT")
[[ -n "$SNAP_FSNAME" ]] || { echo "ERROR: failed to extract snapshot_fsname from mount output" >&2; exit 1; }

# More than one matching line would leave an embedded newline in SNAP_FSNAME
# and silently corrupt every parameter/mount name built from it — fail instead.
if [[ "$SNAP_FSNAME" == *[[:space:]]* ]]; then
    echo "ERROR: ambiguous snapshot fsname in mount output (multiple lines mention 'fsname')" >&2
    exit 1
fi

# ── [4] Wait for MDT recovery COMPLETE ───────────────────────────────────────
# MUST wait for COMPLETE before mounting the client.
# The Lustre client can connect during MDT recovery — if du runs while the MDT
# is still in recovery, lod_foreach_mdt does not yet have live OSP connections
# to sub-MDTs, so the os_bsize=0 bug path is never triggered.
# The bug window opens right AFTER recovery COMPLETE: sub-MDT OSPs just started,
# opd_statfs.os_bsize=0, first MDS_STATFS RPC reply has not yet arrived.
#
# Polls recovery_status up to 30 times, 2 s apart (~60 s). A status of
# COMPLETE or INACTIVE ends the wait; anything else is retried. If neither is
# seen within the budget the script aborts instead of mounting the client.

SNAP_MDT_PARAM="mdt.${SNAP_FSNAME}-MDT0000"
recovery_ok=0
status=""
for i in {1..30}; do
    # `|| true`: with set -e and pipefail, a failing get_param (e.g. parameter
    # not available yet) would otherwise abort the script on this assignment
    # and the retry below would never happen. awk replaces grep|awk so that a
    # missing "status:" line is simply an empty result, not a pipeline failure.
    status=$(lctl get_param -n "${SNAP_MDT_PARAM}.recovery_status" 2>/dev/null \
             | awk '/^status:/ {print $2; exit}') || true
    if [[ "$status" == "COMPLETE" || "$status" == "INACTIVE" ]]; then
        recovery_ok=1
        break
    fi
    echo "  recovery status=${status:-unknown} (${i}/30)..."
    sleep 2
done

if (( ! recovery_ok )); then
    echo "ERROR: ${SNAP_MDT_PARAM} recovery did not reach COMPLETE/INACTIVE (last status=${status:-unknown})" >&2
    echo "       snapshot targets are still mounted; clean up with lctl snapshot_umount / snapshot_destroy" >&2
    exit 1
fi

# ── [5] Mount the snapshot client immediately after recovery COMPLETE ─────────
# Names are computed up front so nothing but the commands themselves sits
# between the mount and the blocksize probe.
snap_mount_src="${MGS_NID}:/${SNAP_FSNAME}"
osp_blocksize_param="osp.${SNAP_FSNAME}-MDT*-osp-MDT0000.blocksize"

mkdir -p "$MNT_CLI_SNAP"
mount -t lustre -o ro "$snap_mount_src" "$MNT_CLI_SNAP"

# Bug-condition check: the sub-MDT OSP blocksize has to read 0 here.
# A non-zero value means the MDS_STATFS reply already arrived — window missed.
# The probe is informational only: its errors are discarded and a failure
# does not stop the script.
printf '%s\n' "=== sub-MDT OSP blocksize (must be 0 to trigger crash) ==="
lctl get_param "$osp_blocksize_param" 2>/dev/null || :

# ── [6] Run du → trigger the crash (#DE causes kernel panic and automatic reboot) ──

#######################################
# Map a path on the production mount to the same path on the snapshot mount.
# The production mount point is matched as a literal string (glob characters
# such as * ? [ in it are not treated as a pattern), and one trailing slash on
# it is ignored so "/mnt/prod/" and "/mnt/prod" behave the same.
# Arguments: $1 - production mount point
#            $2 - snapshot mount point
#            $3 - path located at or below the production mount point
# Outputs:   the corresponding snapshot path on stdout
# Returns:   0 on success, 1 if $3 is not at or below $1
#######################################
snap_path_for() {
    local prod=${1%/} snap=$2 dir=$3
    local rel
    if [[ "$dir" == "$prod" ]]; then
        rel=""
    elif [[ "$dir" == "$prod"/* ]]; then
        # Quoted pattern → literal prefix removal.
        rel=${dir#"$prod"}
    else
        return 1
    fi
    printf '%s\n' "${snap}${rel}"
}

if ! SNAP_PROJ_DIR=$(snap_path_for "$MNT_CLI_PROD" "$MNT_CLI_SNAP" "$PROJ_DIR"); then
    echo "ERROR: PROJ_DIR (${PROJ_DIR}) is not located under MNT_CLI_PROD (${MNT_CLI_PROD})" >&2
    exit 1
fi
echo ">>> du -sh ${SNAP_PROJ_DIR}"
du -sh "$SNAP_PROJ_DIR"

# ── [7] Cleanup (only reached if no crash occurred) ──────────────────────────
# NOTE: the script runs under `set -e`, so a failure in any earlier command
# also skips this section; in that case run these three commands by hand.
# Order: client unmount first, then snapshot targets, then destroy the snapshot.
snap_ident=(-F "$FS_NAME" -n "$SNAP_NAME")
umount "$MNT_CLI_SNAP"
lctl snapshot_umount "${snap_ident[@]}"
lctl snapshot_destroy "${snap_ident[@]}" -f
