Details
-
Bug
-
Resolution: Fixed
-
Major
-
Lustre 2.8.0, Lustre 2.9.0
-
None
-
3
-
9223372036854775807
Description
osp_statfs_interpret can clear the error in opd_pre_status despite the
fact that osp_precreate_cleanup_orphans got an error and does not know
the exact OST object last_id. Example:
- mdt sends req "create objects x..y"
- objects created. mdt gets OK
- MDT->OST reconnection
- MDT sends cleanup_orphans last_used_fid=x
- OST removes x..y and sends reply OK and last_id=x
- MDT->OST connection aborted. cleanup_orphans exits with EIO
- osp_statfs_interpret changes opd_pre_status from EIO to 0
- osp_precreate_reserve reserves object and changes last_used_id from x to x+1
- connection restored. MDT sends cleanup_orphans last_id=x+1
As a result, the OST has a gap - object x was removed by cleanup_orphans.
Below is a reproducer that works only on a single-node setup:
diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh
index c64ebab..f5026dc 100755
--- a/lustre/tests/conf-sanity.sh
+++ b/lustre/tests/conf-sanity.sh
@@ -6796,6 +6796,32 @@ test_97() {
}
run_test 97 "ldev returns correct ouput when querying based on role"
+test_98() {
+ local_mode || { skip "Need single node setup"; return; }
+ local cmp=0
+ local dev=$FSNAME-OST0000-osc-MDT0000
+ setupall
+
+ createmany -o $DIR1/$tfile-%d 50000&
+ cmp=$!
+ # MDT->OST reconnection causes MDT<->OST last_id synchronisation
+ # via osp_precreate_cleanup_orphans.
+ for i in $(seq 0 100); do
+ for k in $(seq 0 10); do
+ $LCTL --device $dev deactivate
+ $LCTL --device $dev activate
+ done
+ ls -asl $MOUNT | grep '???' && \
+ (kill -9 $cmp &>/dev/null; \
+ error "File hasn't object on OST")
+ ps -A -o pid | grep $cmp 1>/dev/null || break
+ done
+ wait $cmp
+ stopall
+}
+run_test 98 "Race MDT->OST reconnection with create"
+
+