#!/bin/bash
# Deadlock Reproducer v7
#
# Changes from v6:
#
# 1. Phase 4: After deadlock detected, trigger `sync` in background
#    to cause osc_extent_wait() 600s timeout and observe the
#    extent dump with flags (wiumY).
#
#    Deadlocked writer holds extent in ACTIVE state. sync triggers:
#      ll_writepages(WB_SYNC_ALL) -> osc_cache_writeback_range
#        -> sets oe_fsync_wait=1 (Y flag)
#      osc_io_fsync_end -> osc_cache_wait_range -> osc_extent_wait(OES_INV)
#        -> 600s timeout -> OSC_EXTENT_DUMP(D_ERROR)
#        -> "wait ext to 0 timedout, recovery in progress?"
#
#    Expected flags: |active|wiumY|
#      w = write extent (oe_rw=0)
#      i = in RB tree
#      u = urgent (set by kswapd's osc_flush_async_page)
#      m = memalloc (set by kswapd, PF_MEMALLOC)
#      Y = fsync_wait (set by sync's osc_cache_writeback_range)
#
# 2. After deadlock, kill other writers and memeater, remove netem,
#    then wait up to 660s for osc_extent_wait timeout in dmesg.
#
# Deadlock sequence (same as v6):
#   1. memeater allocation burst -> kswapd activates
#   2. kswapd: prep page N (N>=16, in inactive LRU) -> PG_writeback
#   3. Writer: write() -> ll_write_begin(page 0..15) -> all in active LRU,
#      NOT prepped -> proceeds -> osc_extent_hold -> CACHE->ACTIVE, list_del_init
#   4. Writer: ll_write_begin(page N) -> PG_writeback set -> BLOCKED
#   5. Extent is ACTIVE, not on urgent list -> ptlrpcd can't flush
#   6. max_rpcs_in_flight=1 + 10ms delay -> ptlrpcd stuck on slow RPC
#   7. DEADLOCK

set -u

# ------------------------------------------------------------
# Tunables. Each one may be overridden from the environment
# (e.g. SERVER_IP=10.0.0.5 NUM_WRITERS=4); an unset or empty
# variable falls back to the default shown here.
# ------------------------------------------------------------
# Address passed to `ip route get` to find the interface that receives the netem delay
SERVER_IP="${SERVER_IP:-10.63.117.77}"
# Lustre client mount point (checked with mountpoint(1) before anything else runs)
LUSTRE_MNT="${LUSTRE_MNT:-/mnt/lustre}"
# Directory holding the writers' files; removed and recreated in Phase 2
WRITE_DIR="${LUSTRE_MNT}/deadlock"
# Size in MB of each writer's file and write buffer
DIRTY_MB="${DIRTY_MB:-3}"
# Number of concurrent writer processes
NUM_WRITERS="${NUM_WRITERS:-8}"
# Leading pages each writer pread()s before every write()
PREAD_PAGES="${PREAD_PAGES:-16}"
# Upper bound, in seconds, on the Phase 3 monitoring loop
DURATION="${DURATION:-3600}"
# Seconds slept between two monitoring iterations
CHECK_INTERVAL="${CHECK_INTERVAL:-2}"
# Seconds a writer must stay in D-state on a matching stack to count as deadlocked
DEADLOCK_THRESHOLD="${DEADLOCK_THRESHOLD:-30}"
# A holding memeater is killed MEMEATER_HOLD_SECS + 30 seconds after it was started
MEMEATER_HOLD_SECS="${MEMEATER_HOLD_SECS:-3}"
# Idle seconds between the end of one memeater and the start of the next
MEMEATER_GAP_SECS="${MEMEATER_GAP_SECS:-2}"
# netem delay, in milliseconds, added on the interface towards SERVER_IP
NETEM_DELAY_MS="${NETEM_DELAY_MS:-10}"
# Seconds Phase 4 keeps polling dmesg for the osc_extent_wait timeout message
EXTENT_WAIT_TIMEOUT="${EXTENT_WAIT_TIMEOUT:-660}"

# Banner: identify the reproducer version and what v7 adds over v6.
printf '%s\n' \
    "=== Deadlock Reproducer v7 ===" \
    "  Key: v6 + Phase 4 (sync -> osc_extent_wait 600s timeout observation)" \
    ""

# Nothing below makes sense without the Lustre client mount, so bail out
# early if LUSTRE_MNT is not a mount point.
mountpoint -q "$LUSTRE_MNT" 2>/dev/null || {
    echo "ERROR: $LUSTRE_MNT not mounted"
    exit 1
}

# ============================================================
# Cleanup
# ============================================================
# State read by cleanup(). Everything starts out empty so that cleanup()
# only undoes the steps that were actually reached (and so that the
# variables are defined under `set -u`).

# Child processes started by this script
WRITER_PIDS=()
MEMEATER_PID="" SYNC_PID=""

# Interface carrying the netem qdisc (empty = none installed)
NETEM_IFACE=""

# Original values of the vm.* sysctls changed in Phase 1
ORIG_MIN_FREE="" ORIG_VFS_CACHE_PRESSURE="" ORIG_SWAPPINESS="" ORIG_OVERCOMMIT=""
ORIG_DIRTY_WB="" ORIG_DIRTY_EXP="" ORIG_DIRTY_RATIO="" ORIG_DIRTY_BG_RATIO=""

# Original osc max_rpcs_in_flight
ORIG_MAX_RPCS=""

# Reap a child that has just been sent SIGKILL without risking an
# unbounded block.
#
# A process in uninterruptible sleep (state D) does not act on SIGKILL
# until it leaves that state, so a plain `wait` on a deadlocked writer or
# on the blocked `sync` would hang cleanup forever and the settings
# restored further down would never be put back.
#
# Arguments: $1 - PID of a child of this shell
# Returns:   0 if the child was reaped or is already gone,
#            1 if it is still alive after about 5 seconds (left behind).
cleanup_reap() {
    local pid=$1 state="" tries=0

    while [ "$tries" -lt 10 ]; do
        state=$(awk '{print $3}' "/proc/$pid/stat" 2>/dev/null)
        case "$state" in
            ""|Z|X)
                # Gone or zombie: wait returns immediately
                wait "$pid" 2>/dev/null
                return 0
                ;;
        esac
        sleep 0.5
        tries=$((tries + 1))
    done

    echo "  WARNING: PID=$pid still alive after SIGKILL (state=$state); not waiting for it"
    return 1
}

# Undo everything the script changed: netem qdisc, vm.* sysctls, osc
# max_rpcs_in_flight, helper processes and the temporary build products.
# Only steps whose state variable is non-empty are undone, so it is safe
# to run at any point of the script. Installed as the EXIT trap below.
cleanup() {
    local pid

    echo ""
    echo "--- Cleaning up ---"

    [ -n "$NETEM_IFACE" ] && tc qdisc del dev "$NETEM_IFACE" root 2>/dev/null
    [ -n "$ORIG_MIN_FREE" ] && echo "$ORIG_MIN_FREE" > /proc/sys/vm/min_free_kbytes

    if [ -n "$MEMEATER_PID" ] && kill -9 "$MEMEATER_PID" 2>/dev/null; then
        cleanup_reap "$MEMEATER_PID"
    fi

    if [ -n "$SYNC_PID" ] && kill -9 "$SYNC_PID" 2>/dev/null; then
        cleanup_reap "$SYNC_PID"
    fi

    sleep 2
    # ${arr[@]+...} keeps an empty array from tripping `set -u` on bash < 4.4
    for pid in ${WRITER_PIDS[@]+"${WRITER_PIDS[@]}"}; do
        kill "$pid" 2>/dev/null
    done
    sleep 2
    for pid in ${WRITER_PIDS[@]+"${WRITER_PIDS[@]}"}; do
        kill -9 "$pid" 2>/dev/null && cleanup_reap "$pid"
    done

    [ -n "$ORIG_VFS_CACHE_PRESSURE" ] && echo "$ORIG_VFS_CACHE_PRESSURE" > /proc/sys/vm/vfs_cache_pressure
    [ -n "$ORIG_SWAPPINESS" ] && echo "$ORIG_SWAPPINESS" > /proc/sys/vm/swappiness
    [ -n "$ORIG_OVERCOMMIT" ] && echo "$ORIG_OVERCOMMIT" > /proc/sys/vm/overcommit_memory
    [ -n "$ORIG_DIRTY_WB" ] && echo "$ORIG_DIRTY_WB" > /proc/sys/vm/dirty_writeback_centisecs
    [ -n "$ORIG_DIRTY_EXP" ] && echo "$ORIG_DIRTY_EXP" > /proc/sys/vm/dirty_expire_centisecs
    # Restore dirty ratio (dirty_bytes=0 is invalid, so switch back to ratio mode)
    [ -n "$ORIG_DIRTY_RATIO" ] && echo "$ORIG_DIRTY_RATIO" > /proc/sys/vm/dirty_ratio
    [ -n "$ORIG_DIRTY_BG_RATIO" ] && echo "$ORIG_DIRTY_BG_RATIO" > /proc/sys/vm/dirty_background_ratio

    if [ -n "$ORIG_MAX_RPCS" ]; then
        lctl set_param osc.*.max_rpcs_in_flight="$ORIG_MAX_RPCS" 2>/dev/null
    fi

    lctl set_param debug=- 2>/dev/null
    rm -f /tmp/dl_writer /tmp/dl_memeater /tmp/dl_writer.c /tmp/dl_memeater.c
    echo "Done"
}
trap cleanup EXIT

# ============================================================
# Compile writer (v6: pread PREAD_PAGES pages before each write)
# ============================================================
# Emit the writer's C source to /tmp/dl_writer.c (it is built by the gcc
# step further down). The heredoc delimiter is quoted ('CEOF'), so the
# shell performs no expansion on the C text.
#
# The program takes <file> <dirty_mb> <id> <pread_pages> and:
#   - creates/truncates <file>, writes dirty_mb MB of 'A' and fsync()s once;
#   - then loops forever: pread()s one byte from each of the first
#     pread_pages 4096-byte pages, refills the buffer with a byte derived
#     from the iteration count, seeks to offset 0 and rewrites the whole
#     buffer with a single write();
#   - leaves the loop only when write() returns an error;
#   - reports progress on stderr every 1000 iterations.
# The return values of pread() and lseek() are not checked, and a short
# write() is not retried.
cat > /tmp/dl_writer.c << 'CEOF'
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>

#define MB (1024UL * 1024)
#define PAGE_SIZE 4096UL

int main(int argc, char **argv)
{
	if (argc < 5) {
		fprintf(stderr, "Usage: %s <file> <dirty_mb> <id> <pread_pages>\n",
			argv[0]);
		return 1;
	}

	const char *path = argv[1];
	size_t dirty_mb = atol(argv[2]);
	int id = atoi(argv[3]);
	int pread_pages = atoi(argv[4]);
	size_t size = dirty_mb * MB;
	int iteration = 0;
	char tmp;

	char *buf = malloc(size);
	if (!buf) {
		perror("malloc");
		return 1;
	}

	int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(buf, 'A', size);
	if (write(fd, buf, size) < 0) {
		perror("initial write");
		return 1;
	}
	if (fsync(fd) < 0) {
		perror("initial fsync");
		return 1;
	}

	fprintf(stderr, "[writer-%d] PID=%d, %luMB, pread_pages=%d, "
		"starting overwrite loop\n", id, getpid(), dirty_mb,
		pread_pages);

	while (1) {
		/*
		 * Promote pages 0..pread_pages-1 to active LRU.
		 * pread() triggers mark_page_accessed() which moves
		 * each page from inactive to active LRU list.
		 * kswapd only scans inactive LRU, so these pages are
		 * skipped. The writer's write() starts at page 0 and
		 * proceeds through all pread'd pages cleanly, then
		 * holds the extent (CACHE->ACTIVE) before hitting a
		 * prepped page (page N >= pread_pages).
		 */
		for (int p = 0; p < pread_pages; p++)
			pread(fd, &tmp, 1, (off_t)p * PAGE_SIZE);

		memset(buf, (char)(iteration & 0xff), size);
		lseek(fd, 0, SEEK_SET);

		ssize_t n = write(fd, buf, size);
		if (n < 0) {
			fprintf(stderr, "[writer-%d] write error: %s\n",
				id, strerror(errno));
			break;
		}

		iteration++;
		if (iteration % 1000 == 0) {
			fprintf(stderr, "[writer-%d] %d iterations\n",
				id, iteration);
		}
	}

	close(fd);
	free(buf);
	return 0;
}
CEOF

# ============================================================
# Compile memeater
# ============================================================
# Emit the memory hog's C source to /tmp/dl_memeater.c (it is built by
# the gcc step further down). The heredoc delimiter is quoted ('CEOF'),
# so the shell performs no expansion on the C text.
#
# The program takes <eat_mb> and:
#   - malloc()s up to eat_mb chunks of 1MB, memset()ing every byte of
#     each chunk right after allocating it;
#   - stops allocating early if malloc() fails or SIGTERM/SIGINT has
#     cleared the `running` flag;
#   - logs to stderr every 1024MB and once when allocation is finished;
#   - then pause()s until `running` is cleared, frees all chunks and exits.
cat > /tmp/dl_memeater.c << 'CEOF'
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>

#define MB (1024UL * 1024)

static volatile int running = 1;

static void handle_signal(int sig)
{
	(void)sig;
	running = 0;
}

int main(int argc, char **argv)
{
	if (argc < 2) {
		fprintf(stderr, "Usage: %s <eat_mb>\n", argv[0]);
		return 1;
	}

	size_t eat_mb = atol(argv[1]);

	signal(SIGTERM, handle_signal);
	signal(SIGINT, handle_signal);

	fprintf(stderr, "[memeater] PID=%d, eating %luMB\n", getpid(), eat_mb);

	char **chunks = calloc(eat_mb, sizeof(char *));
	if (!chunks) {
		perror("calloc");
		return 1;
	}

	size_t allocated = 0;
	for (size_t i = 0; i < eat_mb && running; i++) {
		chunks[i] = malloc(MB);
		if (!chunks[i])
			break;
		memset(chunks[i], (char)(i & 0xff), MB);
		allocated++;

		if (allocated % 1024 == 0) {
			fprintf(stderr, "[memeater] %luMB allocated\n",
				allocated);
		}
	}

	fprintf(stderr, "[memeater] done: %luMB, holding...\n", allocated);

	while (running)
		pause();

	for (size_t i = 0; i < allocated; i++)
		free(chunks[i]);
	free(chunks);

	return 0;
}
CEOF

echo "Compiling..."
# Build both helpers from the sources written above. Stop here if either
# build fails: continuing would retune the system in Phase 1 only to find
# out later that no writer can be started.
for prog in dl_writer dl_memeater; do
    if ! gcc -O2 -o "/tmp/$prog" "/tmp/$prog.c"; then
        echo "ERROR: failed to compile /tmp/$prog.c (is gcc installed?)" >&2
        exit 1
    fi
done
echo "Done"

# ============================================================
# Phase 1: Tuning
# ============================================================
echo ""
echo "--- Phase 1: Tuning ---"

# Network delay: add a netem qdisc on the interface that routes to SERVER_IP.
# NETEM_IFACE stays set only if a netem qdisc is really present there,
# because a non-empty NETEM_IFACE makes later steps delete that
# interface's root qdisc.
NETEM_IFACE=$(ip route get "$SERVER_IP" | awk '/dev/ {for(i=1;i<=NF;i++) if($i=="dev") print $(i+1)}' | head -1)
if [ -n "$NETEM_IFACE" ]; then
    if tc qdisc add dev "$NETEM_IFACE" root netem delay "${NETEM_DELAY_MS}ms" 2>/dev/null; then
        echo "  netem: ${NETEM_DELAY_MS}ms on $NETEM_IFACE"
    elif tc qdisc show dev "$NETEM_IFACE" 2>/dev/null | grep -q 'netem'; then
        # Most likely left over from an earlier, interrupted run
        echo "  netem: already present on $NETEM_IFACE (not re-added; will be removed on exit)"
    else
        echo "  WARNING: could not add netem on $NETEM_IFACE; continuing without network delay"
        NETEM_IFACE=""
    fi
fi

# max_rpcs_in_flight=1
# The patterns are quoted so that lctl, not the shell, expands the '*'.
ORIG_MAX_RPCS=$(lctl get_param -n 'osc.*.max_rpcs_in_flight' 2>/dev/null | head -1)
lctl set_param 'osc.*.max_rpcs_in_flight=1' 2>/dev/null
echo "  max_rpcs_in_flight=1 (was $ORIG_MAX_RPCS)"

# OOM protection for the processes that keep the machine reachable
for pid in $(pgrep -f 'incus-agent|lxd-agent' 2>/dev/null); do
    { echo -1000 > "/proc/$pid/oom_score_adj"; } 2>/dev/null
done
# pgrep patterns are extended regular expressions, so alternation is a
# bare '|' ('\|' would match a literal pipe character). Without -f the
# match is against the process name, which the kernel truncates to 15
# characters ("systemd-journal"), hence the optional trailing 'd'.
for pid in $(pgrep -x 'sshd|systemd-journald?' 2>/dev/null); do
    { echo -900 > "/proc/$pid/oom_score_adj"; } 2>/dev/null
done

# Remember the current vm.* values; cleanup() writes the non-empty ones back.
ORIG_MIN_FREE=$(< /proc/sys/vm/min_free_kbytes)
ORIG_VFS_CACHE_PRESSURE=$(< /proc/sys/vm/vfs_cache_pressure)
ORIG_SWAPPINESS=$(< /proc/sys/vm/swappiness)
ORIG_OVERCOMMIT=$(< /proc/sys/vm/overcommit_memory)
ORIG_DIRTY_WB=$(< /proc/sys/vm/dirty_writeback_centisecs)
ORIG_DIRTY_EXP=$(< /proc/sys/vm/dirty_expire_centisecs)
ORIG_DIRTY_RATIO=$(< /proc/sys/vm/dirty_ratio)
ORIG_DIRTY_BG_RATIO=$(< /proc/sys/vm/dirty_background_ratio)

# Apply the reproducer's settings (same order as the summary printed below).
echo 0 > /proc/sys/vm/vfs_cache_pressure
echo 0 > /proc/sys/vm/swappiness
echo 1 > /proc/sys/vm/overcommit_memory
echo 600000 > /proc/sys/vm/dirty_writeback_centisecs
echo 600000 > /proc/sys/vm/dirty_expire_centisecs
# Keep balance_dirty_pages from throttling the writers while the memeater
# is allocating: with MemAvailable near zero a ratio-based dirty threshold
# is near zero too, which throttles instantly. A fixed 512MB limit is far
# above the ~24MB the writers dirty in total.
echo 536870912 > /proc/sys/vm/dirty_bytes
echo 268435456 > /proc/sys/vm/dirty_background_bytes
# watermark_boost_factor does not exist on every kernel
if [ -f /proc/sys/vm/watermark_boost_factor ]; then
    echo 0 > /proc/sys/vm/watermark_boost_factor
fi
echo 131072 > /proc/sys/vm/min_free_kbytes

echo "  vm: vfs_cache_pressure=0 swappiness=0 overcommit=1"
echo "  vm: dirty_writeback=600000 min_free_kbytes=131072"
echo "  vm: dirty_bytes=512MB (prevent balance_dirty_pages)"

# Enable the Lustre debug flags of interest and start from an empty debug log
lctl set_param debug=+cache+error+warning 2>/dev/null
lctl clear 2>/dev/null

# Drop the page cache and slab caches before measuring memory
echo 3 > /proc/sys/vm/drop_caches
sleep 1

# Summary of the starting conditions
mem_total_kb=$(awk '/MemTotal/ {print $2}' /proc/meminfo)
TOTAL_MEM_MB=$(($mem_total_kb / 1024))
mem_avail_kb=$(awk '/MemAvailable/ {print $2}' /proc/meminfo)
FREE_MEM_MB=$(($mem_avail_kb / 1024))
CUR_MPPR=$(lctl get_param -n osc.*.max_pages_per_rpc 2>/dev/null | head -1)
echo ""
echo "  System: ${TOTAL_MEM_MB}MB total, ${FREE_MEM_MB}MB available"
echo "  max_pages_per_rpc=$CUR_MPPR, max_rpcs_in_flight=1"
echo "  DIRTY_MB=$DIRTY_MB -> $((DIRTY_MB * 256)) pages"
echo "  PREAD_PAGES=$PREAD_PAGES (pages 0-$((PREAD_PAGES-1)) promoted to active LRU)"

# ============================================================
# Phase 2: Start writers
# ============================================================
echo ""
echo "--- Phase 2: Starting $NUM_WRITERS writers ---"

# Start from an empty directory with a stripe count of 1
rm -rf "$WRITE_DIR"
mkdir -p "$WRITE_DIR"
lfs setstripe -c 1 "$WRITE_DIR" 2>/dev/null

# Launch the writers in the background, one file each. Their PIDs are
# collected in WRITER_PIDS, and each gets oom_score_adj=-1000.
for ((i = 1; i <= NUM_WRITERS; i++)); do
    /tmp/dl_writer "${WRITE_DIR}/file${i}" "$DIRTY_MB" "$i" "$PREAD_PAGES" &
    pid=$!
    WRITER_PIDS+=("$pid")
    echo -1000 > "/proc/${pid}/oom_score_adj" 2>/dev/null
    echo "  Writer-$i: PID=$pid"
done

# Give the writers time to finish their initial write+fsync and start looping
sleep 3

# Total dirty bytes summed over all OSCs
CUR_DIRTY=$(lctl get_param -n osc.*.cur_dirty_bytes 2>/dev/null | awk '{s+=$1} END{print s}')
echo "  cur_dirty_bytes=$CUR_DIRTY"

# ============================================================
# Phase 3: Memeater cycles + monitoring
# ============================================================
printf '\n%s\n\n' "--- Phase 3: Monitoring (up to ${DURATION}s) ---"

# Memeater state machine: no cycle has run yet and nothing is running
MEMEATER_STATE="idle"
MEMEATER_CYCLE=0
MEMEATER_START_TIME=0

# Outcome of the monitoring loop
DEADLOCK_PID=""
DEADLOCK_DETECTED=0

# Writer PID -> epoch second at which it was first seen stuck in D-state
declare -A DSTATE_SINCE

# Baselines: wall-clock start and the sum of the pgscan_kswapd* counters
START_TIME=$(date +%s)
VMSTAT_BASE=$(awk '/pgscan_kswapd/ {s+=$2} END{print s}' /proc/vmstat)

# Print how many MB the next memeater should allocate: MemAvailable
# (converted to MB) minus 50, but never less than 50.
calc_eat_mb() {
    local avail_kb target_mb
    avail_kb=$(awk '/MemAvailable/ {print $2}' /proc/meminfo)
    target_mb=$((avail_kb / 1024 - 50))
    if ((target_mb < 50)); then
        target_mb=50
    fi
    echo "$target_mb"
}

# Launch one memeater in the background, sized by calc_eat_mb.
# Updates MEMEATER_PID, MEMEATER_CYCLE (incremented), MEMEATER_STATE
# ("allocating") and MEMEATER_START_TIME (now). The new process gets
# oom_score_adj=1000.
start_memeater() {
    local budget_mb
    budget_mb=$(calc_eat_mb)

    /tmp/dl_memeater "$budget_mb" &
    MEMEATER_PID=$!
    echo 1000 > "/proc/${MEMEATER_PID}/oom_score_adj" 2>/dev/null

    MEMEATER_CYCLE=$((MEMEATER_CYCLE + 1))
    MEMEATER_STATE="allocating"
    MEMEATER_START_TIME=$(date +%s)
}

# Stop the current memeater, if there is one and it is still alive
# (SIGKILL, then reap), and put the state machine back to "idle" with
# MEMEATER_PID cleared. The state is reset even when nothing was running.
kill_memeater() {
    local victim=$MEMEATER_PID

    MEMEATER_PID=""
    MEMEATER_STATE="idle"

    [ -n "$victim" ] || return 0
    kill -0 "$victim" 2>/dev/null || return 0

    kill -9 "$victim" 2>/dev/null
    wait "$victim" 2>/dev/null
    return 0
}

start_memeater

# ELAPSED value at which the last status line was printed. Comparing
# against it yields one status line per 10s; testing ELAPSED modulo 10
# matched on two consecutive iterations of each window.
LAST_STATUS_AT=0

# Main monitoring loop. Every CHECK_INTERVAL seconds it
#   1. advances the memeater state machine (allocating -> holding -> idle),
#   2. looks for writers stuck in D-state on a page-wait stack,
#   3. prints a status line every 10 seconds,
#   4. stops if every writer has exited.
# It ends on DURATION, on a detected deadlock (DEADLOCK_DETECTED=1,
# DEADLOCK_PID set), or when no writer is left.
while true; do
    NOW=$(date +%s)
    ELAPSED=$((NOW - START_TIME))
    [ $ELAPSED -ge $DURATION ] && { echo "  Duration limit reached"; break; }

    sleep $CHECK_INTERVAL

    NOW=$(date +%s)
    ELAPSED=$((NOW - START_TIME))
    MEMEATER_ELAPSED=$((NOW - MEMEATER_START_TIME))

    # Memeater cycle management
    case "$MEMEATER_STATE" in
        allocating)
            if ! kill -0 "$MEMEATER_PID" 2>/dev/null; then
                # Memeater is gone without us killing it; start the idle gap now
                MEMEATER_STATE="idle"
                MEMEATER_START_TIME=$NOW
            elif [ $MEMEATER_ELAPSED -ge 120 ]; then
                # Still allocating after 120s: give up on this cycle
                kill_memeater
                MEMEATER_START_TIME=$NOW
            fi
            if [ "$MEMEATER_STATE" = "allocating" ]; then
                # Under 200MB available (and at least 2s in): treat as "holding"
                FREE_CUR=$(awk '/MemAvailable/ {print $2}' /proc/meminfo)
                if [ $((FREE_CUR / 1024)) -lt 200 ] && [ $MEMEATER_ELAPSED -ge 2 ]; then
                    MEMEATER_STATE="holding"
                fi
            fi
            ;;
        holding)
            if ! kill -0 "$MEMEATER_PID" 2>/dev/null; then
                MEMEATER_STATE="idle"
                MEMEATER_START_TIME=$NOW
            elif [ $MEMEATER_ELAPSED -ge $((MEMEATER_HOLD_SECS + 30)) ]; then
                # Measured from the memeater's start, not from entering "holding"
                kill_memeater
                MEMEATER_START_TIME=$NOW
            fi
            ;;
        idle)
            # In idle, MEMEATER_START_TIME holds the time the last cycle ended
            if [ $((NOW - MEMEATER_START_TIME)) -ge $MEMEATER_GAP_SECS ]; then
                start_memeater
            fi
            ;;
    esac

    # Deadlock detection
    for pid in "${WRITER_PIDS[@]}"; do
        kill -0 "$pid" 2>/dev/null || continue

        STATE=$(awk '{print $3}' "/proc/$pid/stat" 2>/dev/null)
        if [ "$STATE" = "D" ]; then
            STACK=$(cat /proc/$pid/stack 2>/dev/null)

            if echo "$STACK" | grep -q "wait_on_page_bit\|wait_on_page_writeback\|page_wait_bit\|balance_dirty_pages"; then
                # First sighting: remember when, and show the top of the stack
                if [ -z "${DSTATE_SINCE[$pid]:-}" ]; then
                    DSTATE_SINCE[$pid]=$NOW
                    echo "  [${ELAPSED}s] Writer PID=$pid D-state [cycle=$MEMEATER_CYCLE $MEMEATER_STATE]"
                    echo "$STACK" | head -3 | sed 's/^/    /'
                fi

                DSTATE_DURATION=$((NOW - ${DSTATE_SINCE[$pid]}))
                if [ $DSTATE_DURATION -ge $DEADLOCK_THRESHOLD ]; then
                    DEADLOCK_DETECTED=1
                    DEADLOCK_PID=$pid
                    echo ""
                    echo "  ============================================"
                    echo "  >>> DEADLOCK DETECTED <<<"
                    echo "  ============================================"
                    echo "  Writer PID=$pid stuck for ${DSTATE_DURATION}s"
                    echo "  Memeater cycle=$MEMEATER_CYCLE"
                    echo ""
                    echo "  Stack:"
                    echo "$STACK" | head -15 | sed 's/^/    /'
                    # Leave both the writer loop and the monitoring loop
                    break 2
                fi
            else
                # D-state, but on an unrelated stack: restart the clock
                unset "DSTATE_SINCE[$pid]"
            fi
        else
            unset "DSTATE_SINCE[$pid]"
        fi
    done

    # Status every 10s
    if [ $((ELAPSED - LAST_STATUS_AT)) -ge 10 ]; then
        LAST_STATUS_AT=$ELAPSED
        FREE_NOW=$(awk '/MemAvailable/ {print $2}' /proc/meminfo)
        WB=$(awk '/^Writeback:/ {print $2}' /proc/meminfo)
        VMSTAT_NOW=$(awk '/pgscan_kswapd/ {s+=$2} END{print s}' /proc/vmstat)
        KSWAPD_SCANNED=$((VMSTAT_NOW - VMSTAT_BASE))
        CUR_D=$(lctl get_param -n osc.*.cur_dirty_bytes 2>/dev/null | awk '{s+=$1} END{print s}')
        RPCS=$(lctl get_param -n osc.*.cur_write_rpcs_in_flight 2>/dev/null | awk '{s+=$1} END{print s}')

        # Count D-state writers
        DCOUNT=0
        for pid in "${WRITER_PIDS[@]}"; do
            S=$(awk '{print $3}' "/proc/$pid/stat" 2>/dev/null)
            [ "$S" = "D" ] && DCOUNT=$((DCOUNT + 1))
        done

        echo "  [${ELAPSED}s] free=$((FREE_NOW/1024))MB WB=${WB}kB kswapd=$KSWAPD_SCANNED dirty=$CUR_D rpcs=$RPCS cycle=$MEMEATER_CYCLE $MEMEATER_STATE D=$DCOUNT"
    fi

    # Check writers alive
    ALIVE=0
    for pid in "${WRITER_PIDS[@]}"; do
        kill -0 "$pid" 2>/dev/null && ALIVE=$((ALIVE + 1))
    done
    [ $ALIVE -eq 0 ] && { echo "  All writers died!"; break; }
done

kill_memeater

# ============================================================
# Phase 3.5: Diagnostics (before Phase 4)
# ============================================================
printf '\n%s\n' "--- Diagnostics ---"

# Save the Lustre debug log collected during Phase 3
lctl dk /tmp/lustre_debug_deadlock.log 2>/dev/null
echo "  Debug log: /tmp/lustre_debug_deadlock.log"

# kswapd scan activity since the Phase 3 baseline
VMSTAT_FINAL=$(awk '/pgscan_kswapd/ {s+=$2} END{print s}' /proc/vmstat)
echo "  kswapd scanned: $((VMSTAT_FINAL - VMSTAT_BASE))"
echo "  memeater cycles: $MEMEATER_CYCLE"

# The remaining detail is only meaningful when a deadlock was found
if [ $DEADLOCK_DETECTED -eq 1 ]; then
    echo ""
    echo "  === Deadlocked writer ==="
    echo "  /proc/$DEADLOCK_PID/stack:"
    sed 's/^/    /' "/proc/$DEADLOCK_PID/stack" 2>/dev/null
    echo "  wchan: $(cat /proc/$DEADLOCK_PID/wchan 2>/dev/null)"

    echo ""
    echo "  Memory:"
    grep -E 'MemFree|MemAvailable|Writeback|Dirty' /proc/meminfo | sed 's/^/    /'

    echo ""
    echo "  Other D-state writers:"
    for pid in "${WRITER_PIDS[@]}"; do
        if [ "$pid" = "$DEADLOCK_PID" ]; then
            continue
        fi
        S=$(awk '{print $3}' "/proc/$pid/stat" 2>/dev/null)
        if [ "$S" = "D" ]; then
            echo "    PID=$pid:"
            head -n 5 "/proc/$pid/stack" 2>/dev/null | sed 's/^/      /'
        fi
    done
fi

# ============================================================
# Phase 4: Trigger osc_extent_wait via sync
# ============================================================

# Reap a child that was just sent SIGKILL without blocking forever.
# A writer that is itself stuck in uninterruptible sleep (state D) does
# not die on SIGKILL, so a plain `wait` on it would stall Phase 4.
#
# Arguments: $1 - PID of a child of this shell
# Returns:   0 if the child was reaped or is already gone,
#            1 if it is still alive after about 5 seconds.
phase4_reap() {
    local pid=$1 state="" tries=0

    while [ "$tries" -lt 10 ]; do
        state=$(awk '{print $3}' "/proc/$pid/stat" 2>/dev/null)
        case "$state" in
            ""|Z|X)
                wait "$pid" 2>/dev/null
                return 0
                ;;
        esac
        sleep 0.5
        tries=$((tries + 1))
    done

    echo "    PID=$pid still alive after SIGKILL (state=$state); not waiting for it"
    return 1
}

# Print the kernel log lines emitted since Phase 4 started.
#
# With a marker (DMESG_MARK) in the log, everything after its last
# occurrence is printed; if the marker has already been pushed out of the
# ring buffer, every line still in the buffer is newer than it, so all of
# them are printed. This keeps working when the buffer is full, where a
# line-count offset does not (old lines drop off as new ones arrive).
# Without a marker the line count taken before `sync` is used instead.
#
# Globals: DMESG_MARK (read), DMESG_LINES_BEFORE (read)
phase4_dmesg() {
    if [ -n "${DMESG_MARK:-}" ]; then
        dmesg 2>/dev/null | awk -v mark="$DMESG_MARK" '
            index($0, mark) { n = 0; next }
            { line[n++] = $0 }
            END { for (i = 0; i < n; i++) print line[i] }'
    else
        dmesg 2>/dev/null | tail -n +$((${DMESG_LINES_BEFORE:-0} + 1))
    fi
}

# Print one explanatory line per known flag letter found in an extent
# flag string.
#
# Arguments: $1 - string as extracted from the dump, e.g. "|active|wiumY|"
# Only the last '|'-delimited field (the flag letters) is examined; the
# state name in front of it contains letters of its own ("active" has an
# 'i') and must not be mistaken for flags.
decode_extent_flags() {
    local letters=$1

    letters=${letters%|}     # drop the closing '|'
    letters=${letters##*|}   # keep what follows the last remaining '|'

    case "$letters" in *w*) echo "    w = write extent (oe_rw=0)" ;; esac
    case "$letters" in *i*) echo "    i = in RB tree" ;; esac
    case "$letters" in *u*) echo "    u = urgent (kswapd osc_flush_async_page)" ;; esac
    case "$letters" in *m*) echo "    m = memalloc (kswapd PF_MEMALLOC)" ;; esac
    case "$letters" in *Y*) echo "    Y = fsync_wait (sync osc_cache_writeback_range)" ;; esac
    return 0
}

if [ $DEADLOCK_DETECTED -eq 1 ]; then
    echo ""
    echo "============================================================"
    echo "--- Phase 4: Trigger osc_extent_wait (${EXTENT_WAIT_TIMEOUT}s) ---"
    echo "============================================================"
    echo ""
    echo "  Deadlocked writer PID=$DEADLOCK_PID holds extent in ACTIVE state."
    echo "  Triggering 'sync' to invoke osc_cache_writeback_range -> osc_extent_wait."
    echo "  osc_extent_wait will timeout after 600s and dump extent flags."
    echo "  Expected: |active|wiumY| in dmesg"
    echo ""

    # Kill other writers to reduce noise (keep deadlocked writer alive)
    echo "  Killing other writers..."
    for pid in "${WRITER_PIDS[@]}"; do
        [ "$pid" = "$DEADLOCK_PID" ] && continue
        kill "$pid" 2>/dev/null
    done
    sleep 2
    for pid in "${WRITER_PIDS[@]}"; do
        [ "$pid" = "$DEADLOCK_PID" ] && continue
        # Bounded reap: another writer may be deadlocked as well
        kill -9 "$pid" 2>/dev/null && phase4_reap "$pid"
    done
    echo "  Done (deadlocked writer PID=$DEADLOCK_PID kept alive)"

    # Remove netem to let other RPCs complete normally
    if [ -n "$NETEM_IFACE" ]; then
        tc qdisc del dev "$NETEM_IFACE" root 2>/dev/null
        echo "  netem removed"
    fi

    # Restore max_rpcs_in_flight (stuck extent is ACTIVE anyway, won't help)
    if [ -n "$ORIG_MAX_RPCS" ]; then
        lctl set_param osc.*.max_rpcs_in_flight="$ORIG_MAX_RPCS" 2>/dev/null
        echo "  max_rpcs_in_flight restored to $ORIG_MAX_RPCS"
    fi

    # Clear debug log to capture fresh osc_extent_wait timeout
    lctl set_param debug=+cache+error+warning 2>/dev/null
    lctl clear 2>/dev/null
    echo ""

    # Record dmesg position before sync: a marker line written through
    # /dev/kmsg, with the plain line count as fallback. The marker is used
    # only if it is really visible in dmesg afterwards (the write can be
    # refused, or accepted but dropped, depending on kernel settings).
    DMESG_LINES_BEFORE=$(dmesg 2>/dev/null | wc -l)
    DMESG_MARK="deadlock-repro-v7 phase4 start pid=$$ t=$(date +%s)"
    if ! { echo "$DMESG_MARK" > /dev/kmsg; } 2>/dev/null ||
       ! dmesg 2>/dev/null | grep -qF -- "$DMESG_MARK"; then
        DMESG_MARK=""
    fi

    # Trigger sync in background
    echo "  Starting 'sync' in background..."
    sync &
    SYNC_PID=$!
    echo "  sync PID=$SYNC_PID"
    echo ""

    # Verify sync enters D-state (ll_writepages -> osc_extent_wait)
    sleep 5
    SYNC_STATE=$(awk '{print $3}' "/proc/$SYNC_PID/stat" 2>/dev/null)
    if [ "$SYNC_STATE" = "D" ]; then
        echo "  sync process is in D-state (expected: blocking on osc_extent_wait)"
        SYNC_STACK=$(cat /proc/$SYNC_PID/stack 2>/dev/null)
        echo "$SYNC_STACK" | head -10 | sed 's/^/    /'
    else
        echo "  sync process state: $SYNC_STATE (expected D)"
    fi

    echo ""
    echo "  Waiting for osc_extent_wait 600s timeout..."
    echo "  Monitoring dmesg for 'wait ext to.*timedout' (up to ${EXTENT_WAIT_TIMEOUT}s)"
    echo ""

    PHASE4_START=$(date +%s)
    TIMEOUT_FOUND=0
    # P4_ELAPSED value at which the last progress line was printed
    LAST_PROGRESS_AT=0

    while true; do
        NOW=$(date +%s)
        P4_ELAPSED=$((NOW - PHASE4_START))

        if [ $P4_ELAPSED -ge $EXTENT_WAIT_TIMEOUT ]; then
            echo ""
            echo "  Phase 4 timeout (${EXTENT_WAIT_TIMEOUT}s) without osc_extent_wait dump"
            break
        fi

        # Check dmesg for the timeout message
        TIMEOUT_MSG=$(phase4_dmesg | grep -m1 "wait ext to.*timedout")

        if [ -n "$TIMEOUT_MSG" ]; then
            TIMEOUT_FOUND=1
            echo ""
            echo "  ============================================"
            echo "  >>> osc_extent_wait TIMEOUT DETECTED <<<"
            echo "  ============================================"
            echo ""

            # Dump all extent-related dmesg lines since sync
            echo "  dmesg extent dump:"
            phase4_dmesg | \
                grep -E "osc_extent|wait ext to|active.*wi|extent.*\[" | \
                sed 's/^/    /'
            echo ""

            # Extract flags
            FLAGS=$(phase4_dmesg | grep -oP '\|active\|[^]|]+\|' | head -1)
            if [ -n "$FLAGS" ]; then
                echo "  Extent flags: $FLAGS"
                echo ""
                echo "  Flag decode:"
                decode_extent_flags "$FLAGS"
            fi
            break
        fi

        # Progress every 60s
        if [ $((P4_ELAPSED - LAST_PROGRESS_AT)) -ge 60 ]; then
            LAST_PROGRESS_AT=$P4_ELAPSED
            # Check deadlocked writer still stuck
            DL_STATE=$(awk '{print $3}' "/proc/$DEADLOCK_PID/stat" 2>/dev/null)
            # Check sync still alive
            SYNC_ALIVE="no"
            kill -0 "$SYNC_PID" 2>/dev/null && SYNC_ALIVE="yes"
            WB=$(awk '/^Writeback:/ {print $2}' /proc/meminfo)
            echo "  [Phase4 ${P4_ELAPSED}s] writer=$DL_STATE sync_alive=$SYNC_ALIVE WB=${WB}kB"
        fi

        sleep 10
    done

    # Capture debug log after timeout
    echo ""
    echo "  Saving post-timeout debug log..."
    lctl dk /tmp/lustre_debug_phase4.log 2>/dev/null
    echo "  Debug log: /tmp/lustre_debug_phase4.log"

    # Final sync process status
    if kill -0 "$SYNC_PID" 2>/dev/null; then
        SYNC_STATE=$(awk '{print $3}' "/proc/$SYNC_PID/stat" 2>/dev/null)
        echo ""
        echo "  sync PID=$SYNC_PID still alive (state=$SYNC_STATE)"
        if [ "$SYNC_STATE" = "D" ]; then
            echo "  sync stack:"
            head -10 "/proc/$SYNC_PID/stack" 2>/dev/null | sed 's/^/    /'
        fi
    fi

    echo ""
    echo "  ============================================"
    if [ $TIMEOUT_FOUND -eq 1 ]; then
        echo "  FULL REPRODUCTION: deadlock + osc_extent_wait timeout"
    else
        echo "  PARTIAL: deadlock reproduced but osc_extent_wait timeout not observed"
    fi
    echo "  ============================================"
else
    echo ""
    echo "  NOT REPRODUCED within ${DURATION}s"
fi

# Closing summary: the last 15 kernel log lines that mention Lustre
# errors, extents, timeouts or killed processes, indented by four spaces.
printf '\n%s\n' "  dmesg (last errors):"
dmesg 2>/dev/null \
    | grep -E "LustreError|osc_extent|timedout|Killed" \
    | tail -n 15 \
    | sed 's/^/    /'
printf '\n'
