#!/bin/bash

# test parameters
pool=ost04

# depending on where the last test finished, uncomment one of these two blocks
bad_drive=d8000_sep500C0FF03C1AC73E_bay101-0
spare_drive=d8000_sep500C0FF03C1AC73E_bay050-0
#bad_drive=d8000_sep500C0FF03C1AC73E_bay050-0
#spare_drive=d8000_sep500C0FF03C1AC73E_bay101-0

# reset the degraded state (assumes the pool and dataset have the same name)
service=$(zfs list -H -o lustre:svname "$pool/$pool")
lctl set_param "obdfilter.$service.degraded=0"

while true; do
    echo "Testing pool: $pool, bad_drive: $bad_drive, spare_drive: $spare_drive"
    echo "Press enter to continue"
    read -r

    # prepare the replacement drive
    wipefs --all --force "/dev/mapper/$spare_drive"

    # fail the drive
    zpool offline "$pool" "$bad_drive"

    # wait for the drive to be faulted (give up after 45 seconds)
    echo "Waiting for drive $bad_drive to be faulted"
    ruby -r timeout <<'EOF'
STDOUT.sync = true
Timeout::timeout(45) do
  loop do
    print '.'
    break if `zpool status` =~ /DEGRADED/
    sleep 1
  end
end
EOF
    echo

    # clear old events so a resilver_finish left over from a previous
    # iteration (or an earlier run) does not satisfy the check below
    zpool events -c

    # replace the failed drive
    zpool replace "$pool" "$bad_drive" "$spare_drive"

    # wait for resilvering to finish (give up after 45 seconds)
    echo "Waiting for drive $spare_drive to be resilvered"
    ruby -r timeout <<'EOF'
STDOUT.sync = true
Timeout::timeout(45) do
  loop do
    print '.'
    break if `zpool events` =~ /sysevent\.fs\.zfs\.resilver_finish/
    sleep 1
  end
end
EOF
    echo

    # show the current state of the degraded flag
    lctl get_param "obdfilter.$service.degraded"

    # swap bad_drive and spare_drive for the next iteration
    tmp_drive=$bad_drive
    bad_drive=$spare_drive
    spare_drive=$tmp_drive
done