Details
-
Bug
-
Resolution: Unresolved
-
Minor
-
None
-
None
-
None
-
3
-
9223372036854775807
Description
Dmesg:
[59151.186832] Lustre: lustre-MDT0000: in recovery but waiting for the first client to connect [59152.988779] Lustre: lustre-MDT0000: Will be in recovery for at least 1:00, or until 3 clients reconnect [59156.126197] LustreError: 14032:0:(fld_handler.c:225:fld_local_lookup()) srv-lustre-MDT0000: FLD cache range [0x00000002c0000400-0x0000000300000400]:1:ost does not match requested flag 0: rc = -5 [59156.131159] LustreError: 14032:0:(fld_handler.c:264:fld_server_lookup()) srv-lustre-MDT0000: Cannot find sequence 0x2c0000404: rc = -2
lctl dk:
00000020:00000001:0.0:1720639625.466361:0:25501:0:(update_recovery.c:1316:distribute_txn_replay_handle()) Process entered 00000020:00000010:0.0:1720639625.466365:0:25501:0:(lu_object.c:1696:keys_init()) kmalloced '(ctx->lc_value)': 320 at ffff93a06cab5a00. 00000020:00000010:0.0:1720639625.466369:0:25501:0:(lu_ucred.c:48:lu_ucred_key_init()) kmalloced '(value)': 120 at ffff93a01383cd00. 00000020:00000010:0.0:1720639625.466373:0:25501:0:(tgt_main.c:778:tgt_ses_key_init()) slab-alloced 'session': 200 at ffff93a01fb21bb8. 00000020:00080000:0.0:1720639625.466379:0:25501:0:(update_records.c:74:update_records_dump()) master transno = 68719476736 batchid = 17180134090 flags = 0 ops = 2 params = 4 00000020:00080000:0.0:1720639625.466385:0:25501:0:(update_records.c:93:update_records_dump()) update 0th [0x2c0000404:0x9e0:0x0] xattr_set params_count = 3 00000020:00080000:0.0:1720639625.466391:0:25501:0:(update_records.c:108:update_records_dump()) param = ffff93a06cb94064 0th off = 0 size = 12 00000020:00080000:0.0:1720639625.466395:0:25501:0:(update_records.c:108:update_records_dump()) param = ffff93a06cb9407c 1th off = 1 size = 52 00000020:00080000:0.0:1720639625.466399:0:25501:0:(update_records.c:108:update_records_dump()) param = ffff93a06cb940bc 2th off = 2 size = 4 00000020:00080000:0.0:1720639625.466404:0:25501:0:(update_records.c:93:update_records_dump()) update 1th [0x280000401:0x54cd:0x0] xattr_set params_count = 3 00000020:00080000:0.0:1720639625.466409:0:25501:0:(update_records.c:108:update_records_dump()) param = ffff93a06cb94064 0th off = 0 size = 12 00000020:00080000:0.0:1720639625.466414:0:25501:0:(update_records.c:108:update_records_dump()) param = ffff93a06cb940cc 1th off = 3 size = 52 00000020:00080000:0.0:1720639625.466418:0:25501:0:(update_records.c:108:update_records_dump()) param = ffff93a06cb940bc 2th off = 2 size = 4 00000020:00000010:0.0:1720639625.466423:0:25501:0:(update_trans.c:593:top_trans_create()) kmalloced '(top_th)': 88 at ffff93a013ee1cc0. 00000020:00000010:0.0:1720639625.466428:0:25501:0:(update_trans.c:1118:top_trans_create_tmt()) kmalloced '(tmt)': 120 at ffff93a01383ca00. 00000020:00000001:0.0:1720639625.466432:0:25501:0:(update_recovery.c:1160:update_recovery_exec()) Process entered .... 00000020:00000001:0.0:1720639625.466551:0:25501:0:(update_recovery.c:674:update_is_committed()) Process entered 80000000:00000001:0.0:1720639625.466554:0:25501:0:(fld_handler.c:248:fld_server_lookup()) Process entered 80000000:00000001:0.0:1720639625.466556:0:25501:0:(fld_handler.c:212:fld_local_lookup()) Process entered 80000000:00000001:0.0:1720639625.466558:0:25501:0:(fld_cache.c:468:fld_cache_lookup()) Process entered 80000000:00000001:0.0:1720639625.466560:0:25501:0:(fld_cache.c:487:fld_cache_lookup()) Process leaving (rc=0 : 0 : 0) 80000000:00020000:0.0:1720639625.466564:0:25501:0:(fld_handler.c:225:fld_local_lookup()) srv-lustre-MDT0000: FLD cache range [0x00000002c0000400-0x0000000300000400]:1:ost does not match requested flag 0: rc = -5 80000000:00000001:0.0:1720639625.471425:0:25501:0:(fld_handler.c:226:fld_local_lookup()) Process leaving (rc=18446744073709551611 : -5 : fffffffffffffffb) 80000000:00000001:0.0:1720639625.471426:0:25501:0:(fld_handler.c:111:fld_name_to_index()) Process entered 80000000:00000040:0.0:1720639625.471426:0:25501:0:(fld_handler.c:113:fld_name_to_index()) get index from srv-lustre-MDT0000 80000000:00000001:0.0:1720639625.471428:0:25501:0:(fld_handler.c:119:fld_name_to_index()) Process leaving (rc=1 : 1 : 1) 80000000:00020000:0.0:1720639625.471428:0:25501:0:(fld_handler.c:264:fld_server_lookup()) srv-lustre-MDT0000: Cannot find sequence 0x2c0000404: rc = -2 80000000:00000001:0.0:1720639625.472980:0:25501:0:(fld_handler.c:265:fld_server_lookup()) Process leaving (rc=18446744073709551614 : -2 : fffffffffffffffe) 00000020:00000001:0.0:1720639625.472983:0:25501:0:(update_recovery.c:698:update_is_committed()) Process leaving (rc=18446744073709551614 : -2 : fffffffffffffffe) 00000020:00000001:0.0:1720639625.472984:0:25501:0:(update_recovery.c:1208:update_recovery_exec()) Process leaving via next (rc=18446744073709551614 : -2 : 0xfffffffffffffffe) 00000004:00000001:0.0:1720639625.472987:0:25501:0:(osp_object.c:2390:osp_object_release()) Process entered 00000004:00000001:0.0:1720639625.472987:0:25501:0:(osp_object.c:2422:osp_object_release()) Process leaving 00000004:00000010:0.0:1720639625.472994:0:25501:0:(osp_object.c:2370:osp_object_free()) slab-freed 'obj': 384 at ffff93a013e07800. 00000004:00000010:0.0:1720639625.473228:0:25501:0:(lod_object.c:9666:lod_object_free()) slab-freed '(lo)': 168 at ffff93a026f87930. 00000004:00000010:0.0:1720639625.473230:0:25501:0:(mdd_object.c:341:mdd_object_free()) slab-freed '(mdd)': 120 at ffff93a0119869d8. 00000004:00000001:0.0:1720639625.473231:0:25501:0:(mdt_handler.c:6709:mdt_object_free()) Process entered 00000004:00000040:0.0:1720639625.473231:0:25501:0:(mdt_handler.c:6713:mdt_object_free()) object free, fid = [0x2c0000404:0x9e0:0x0] 00000004:00000010:0.0:1720639625.473232:0:25501:0:(mdt_handler.c:6720:mdt_object_free()) slab-freed 'mo': 336 at ffff93a027b632a0. 00000004:00000001:0.0:1720639625.473233:0:25501:0:(mdt_handler.c:6723:mdt_object_free()) Process leaving 00000020:00000001:0.0:1720639625.473234:0:25501:0:(update_recovery.c:1281:update_recovery_exec()) Process leaving (rc=18446744073709551614 : -2 : fffffffffffffffe) 00000020:00000001:0.0:1720639625.473236:0:25501:0:(update_recovery.c:1357:distribute_txn_replay_handle()) Process leaving via stop_trans (rc=18446744073709551614 : -2 : 0xfffffffffffffffe) 00000020:00000001:0.0:1720639625.473237:0:25501:0:(update_trans.c:930:top_trans_stop()) Process entered 00000020:00000001:0.0:1720639625.473239:0:25501:0:(update_trans.c:903:top_trans_wait_result()) Process leaving (rc=18446744073709551614 : -2 : fffffffffffffffe) 00000020:00000010:0.0:1720639625.473240:0:25501:0:(update_trans.c:1263:top_multiple_thandle_destroy()) kfreed 'st': 208 at ffff93a0427bd100. 00000020:00000010:0.0:1720639625.473241:0:25501:0:(update_trans.c:1265:top_multiple_thandle_destroy()) kfreed 'tmt': 120 at ffff93a01383ca00. 00000020:00000010:0.0:1720639625.473242:0:25501:0:(update_trans.c:1095:top_trans_stop()) kfreed 'top_th': 88 at ffff93a013ee1cc0. 00000020:00000001:0.0:1720639625.473243:0:25501:0:(update_trans.c:1096:top_trans_stop()) Process leaving (rc=18446744073709551614 : -2 : fffffffffffffffe) 00000020:00000010:0.0:1720639625.473244:0:25501:0:(lu_ucred.c:48:lu_ucred_key_fini()) kfreed 'info': 120 at ffff93a01383cd00. 00000020:00000010:0.0:1720639625.473245:0:25501:0:(tgt_main.c:790:tgt_ses_key_fini()) slab-freed '(session)': 200 at ffff93a01fb21bb8. 00000020:00000010:0.0:1720639625.473246:0:25501:0:(lu_object.c:1628:keys_fini()) kfreed 'ctx->lc_value': 320 at ffff93a06cab5a00. 00000020:00000001:0.0:1720639625.473247:0:25501:0:(update_recovery.c:1452:distribute_txn_replay_handle()) Process leaving (rc=18446744073709551614 : -2 : fffffffffffffffe)
The type of update log is OUT_XATTR_SET to set "trusted.fid on the OST objects:
#2061 (280)updatelog record master_transno:0 batchid:17180134095 flags:0x0 u_index:0 u_count:2 p_count:4 [0x2c0000404:0x9ea:0x0] type:xattr_set/7 params:3 p_0:0 p_1:1 p_2:2 [0x280000401:0x54d7:0x0] type:xattr_set/7 params:3 p_0:0 p_1:3 p_2:2 p_0 - 12/trusted.fid p_1 - 52/A#\x0000020000008\x110100000000000000@\x0002000000000000000000000000000000000000000000000000000000000000 p_2 - 4/\x010000 p_3 - 52/A#\x0000020000008\x110100010000000000@\x0002000000000000000000000000000000000000000000000000000000000000
Reproducer (on master branch):
lfs mkdir -i0 mdt0_dir lfs mkdir -i1 mdt1_dir touch mdt1_dir/test_file{1..1000} #Make the mdt objects remote mv mdt1_dir/test_file* mdt0_dir/ # migrate the files to create distributed transactions while true; do printf "%s\n" mdt0_dir/test_file{1..1000} | xargs -n1 -P20 lfs migrate -c4 -n; done #On the server mount/umount MDT0001 to force a recovery umount /dev/mapper/mds1_flakey; mount -tlustre /dev/mapper/mds1_flakey /mnt/lustre-mds1/
This is caused by update_is_committed() only looking for MDT objects:
static int update_is_committed(const struct lu_env *env, struct distribute_txn_replay_req *dtrq, struct dt_object *dt_obj, struct top_thandle *top_th, struct sub_thandle *st) ..... } else { struct lu_server_fld *fld; struct lu_seq_range range = {0}; int rc; fld = seq_site->ss_server_fld; fld_range_set_type(&range, LU_SEQ_RANGE_MDT); <---------- LASSERT(fld->lsf_seq_lookup != NULL); rc = fld->lsf_seq_lookup(env, fld, fid_seq(fid), &range); if (rc < 0) RETURN(rc); mdt_index = range.lsr_index;
if we add OSTs into distributed transaction then I guess transaction recovery would need to block until related OST(s) are ready? I believe currently we don't wait for OSTs.