Description
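When I last checked, there were more than 10 different assertions I could trip by having a client send an RPC to the wrong MDT. I'm pretty lazy, so I just hacked lmv_find_target() with two fail_loc checks that force the target MDT index (0x6000 forces MDT 0, 0x6001 forces MDT 1), something like the following: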
diff --git a/lustre/lmv/lmv_internal.h b/lustre/lmv/lmv_internal.h
index 15692c5..b261dda 100644
--- a/lustre/lmv/lmv_internal.h
+++ b/lustre/lmv/lmv_internal.h
@@ -133,7 +133,11 @@ lmv_find_target(struct lmv_obd *lmv, const struct lu_fid *fid)
 	mdsno_t mds = 0;
 	int rc;
 
-	if (lmv->desc.ld_tgt_count > 1) {
+	if (OBD_FAIL_CHECK(0x6000)) {
+		mds = 0;
+	} else if (OBD_FAIL_CHECK(0x6001)) {
+		mds = 1;
+	} else if (lmv->desc.ld_tgt_count > 1) {
 		rc = lmv_fld_lookup(lmv, fid, &mds);
 		if (rc)
 			return ERR_PTR(rc);
Then I ran racer with MDSCOUNT=2 and I set fail_loc to 0x6000 and 0x6001. Here are some stack traces you can use to frighten small children:
LustreError: 13525:0:(mdt_handler.c:1284:mdt_getattr_name_lock()) ASSERTION( !mdt_object_remote(parent) ) failed: Parent [0x2c0000400:0x1:0x0] is on remote server LustreError: 13525:0:(mdt_handler.c:1284:mdt_getattr_name_lock()) LBUG Pid: 13525, comm: mdt01_004 Call Trace: [<ffffffffa02a9895>] libcfs_debug_dumpstack+0x55/0x80 [libcfs] [<ffffffffa02a9e97>] lbug_with_loc+0x47/0xb0 [libcfs] [<ffffffffa0bacd82>] mdt_getattr_name_lock+0x10f2/0x1a80 [mdt] [<ffffffffa069fb46>] ? __req_capsule_get+0x166/0x710 [ptlrpc] [<ffffffffa067a024>] ? lustre_msg_get_flags+0x34/0xb0 [ptlrpc] [<ffffffffa0bad9a3>] mdt_intent_getattr+0x293/0x470 [mdt] [<ffffffffa0b9ca99>] mdt_intent_policy+0x499/0xca0 [mdt] [<ffffffffa062f509>] ldlm_lock_enqueue+0x359/0x920 [ptlrpc] [<ffffffffa0658c4f>] ldlm_handle_enqueue0+0x4ef/0x10a0 [ptlrpc] [<ffffffffa06d3002>] tgt_enqueue+0x62/0x1d0 [ptlrpc] [<ffffffffa06d59fa>] tgt_handle_request0+0x2ea/0x1490 [ptlrpc] [<ffffffffa067792c>] ? lustre_msg_get_opc+0x9c/0x110 [ptlrpc] [<ffffffffa06d6fda>] tgt_request_handle+0x43a/0x980 [ptlrpc] [<ffffffffa068a295>] ptlrpc_main+0xd25/0x1970 [ptlrpc] [<ffffffffa0689570>] ? ptlrpc_main+0x0/0x1970 [ptlrpc] [<ffffffff81096a36>] kthread+0x96/0xa0 [<ffffffff8100c0ca>] child_rip+0xa/0x20 [<ffffffff810969a0>] ? kthread+0x0/0xa0 [<ffffffff8100c0c0>] ? 
child_rip+0x0/0x20 LustreError: 15553:0:(mdt_handler.c:2293:mdt_object_lock0()) ASSERTION( !(ibits & (MDS_INODELOCK_UPDATE | MDS_INODELOCK_PERM | MDS_INODELOCK_LAYOUT)) ) failed: lustre-MDT0000: wrong bit 0x2 for remote obj [0x2c0000400:0x1:0x0] LustreError: 15553:0:(mdt_handler.c:2293:mdt_object_lock0()) LBUG Pid: 15553, comm: mdt01_008 LustreError: 21830:0:(mdt_reint.c:401:mdt_attr_set()) ASSERTION( !mdt_object_remote(mo) ) failed: LustreError: 21830:0:(mdt_reint.c:401:mdt_attr_set()) LBUG Pid: 21830, comm: mdt00_001 Call Trace: [<ffffffffa0e49895>] libcfs_debug_dumpstack+0x55/0x80 [libcfs] [<ffffffffa0e49e97>] lbug_with_loc+0x47/0xb0 [libcfs] [<ffffffffa0939a59>] mdt_attr_set+0x3a9/0x570 [mdt] [<ffffffffa093a1dd>] mdt_reint_setattr+0x5bd/0xcf0 [mdt] [<ffffffffa039edae>] ? lustre_pack_reply_flags+0xae/0x1f0 [ptlrpc] [<ffffffffa0933a31>] mdt_reint_rec+0x41/0xe0 [mdt] [<ffffffffa0919ec3>] mdt_reint_internal+0x4c3/0x780 [mdt] [<ffffffffa091a70b>] mdt_reint+0x6b/0x120 [mdt] [<ffffffffa03fb9fa>] tgt_handle_request0+0x2ea/0x1490 [ptlrpc] [<ffffffffa039d92c>] ? lustre_msg_get_opc+0x9c/0x110 [ptlrpc] [<ffffffffa03fcfda>] tgt_request_handle+0x43a/0x980 [ptlrpc] [<ffffffffa03b0295>] ptlrpc_main+0xd25/0x1970 [ptlrpc] [<ffffffffa03af570>] ? ptlrpc_main+0x0/0x1970 [ptlrpc] [<ffffffff81096a36>] kthread+0x96/0xa0 [<ffffffff8100c0ca>] child_rip+0xa/0x20 [<ffffffff810969a0>] ? kthread+0x0/0xa0 [<ffffffff8100c0c0>] ? 
child_rip+0x0/0x20 LustreError: 5896:0:(mdt_handler.c:3201:mdt_intent_reint()) ASSERTION( lustre_handle_is_used(&lhc->mlh_reg_lh) ) failed: LustreError: 5896:0:(mdt_handler.c:3201:mdt_intent_reint()) LBUG Pid: 5896, comm: mdt00_002 Call Trace: [<ffffffffa02a9895>] libcfs_debug_dumpstack+0x55/0x80 [libcfs] [<ffffffffa02a9e97>] lbug_with_loc+0x47/0xb0 [libcfs] [<ffffffffa0b9e667>] mdt_intent_reint+0x4e7/0x520 [mdt] [<ffffffffa0b9ca99>] mdt_intent_policy+0x499/0xca0 [mdt] [<ffffffffa062f509>] ldlm_lock_enqueue+0x359/0x920 [ptlrpc] [<ffffffffa0658c4f>] ldlm_handle_enqueue0+0x4ef/0x10a0 [ptlrpc] [<ffffffffa06d3002>] tgt_enqueue+0x62/0x1d0 [ptlrpc] [<ffffffffa06d59fa>] tgt_handle_request0+0x2ea/0x1490 [ptlrpc] [<ffffffffa067792c>] ? lustre_msg_get_opc+0x9c/0x110 [ptlrpc] [<ffffffffa06d6fda>] tgt_request_handle+0x43a/0x980 [ptlrpc] [<ffffffffa068a295>] ptlrpc_main+0xd25/0x1970 [ptlrpc] [<ffffffffa0689570>] ? ptlrpc_main+0x0/0x1970 [ptlrpc] [<ffffffff81096a36>] kthread+0x96/0xa0 [<ffffffff8100c0ca>] child_rip+0xa/0x20 [<ffffffff810969a0>] ? kthread+0x0/0xa0 [<ffffffff8100c0c0>] ? child_rip+0x0/0x20 LustreError: 8529:0:(mdt_handler.c:2294:mdt_object_lock0()) ASSERTION( !(ibits & (MDS_INODELOCK_UPDATE | MDS_INODELOCK_PERM | MDS_INODELOCK_LAYOUT)) ) failed: lustre-MDT0000: wrong bit 0x12 for remote obj [0x2c0000400:0x1:0x0] LustreError: 8529:0:(mdt_handler.c:2294:mdt_object_lock0()) LBUG Pid: 8529, comm: mdt01_006 Call Trace: [<ffffffffa02a9895>] libcfs_debug_dumpstack+0x55/0x80 [libcfs] [<ffffffffa02a9e97>] lbug_with_loc+0x47/0xb0 [libcfs] [<ffffffffa0ba286e>] mdt_object_lock0+0xaee/0xaf0 [mdt] [<ffffffffa069fb46>] ? __req_capsule_get+0x166/0x710 [ptlrpc] [<ffffffffa0ba2934>] mdt_object_lock+0x14/0x20 [mdt] [<ffffffffa0bac935>] mdt_getattr_name_lock+0xc75/0x1af0 [mdt] [<ffffffffa069fb46>] ? __req_capsule_get+0x166/0x710 [ptlrpc] [<ffffffffa067a024>] ? 
lustre_msg_get_flags+0x34/0xb0 [ptlrpc] [<ffffffffa0bada43>] mdt_intent_getattr+0x293/0x470 [mdt] [<ffffffffa0b9ca99>] mdt_intent_policy+0x499/0xca0 [mdt] [<ffffffffa062f509>] ldlm_lock_enqueue+0x359/0x920 [ptlrpc] [<ffffffffa0658c4f>] ldlm_handle_enqueue0+0x4ef/0x10a0 [ptlrpc] [<ffffffffa06d3002>] tgt_enqueue+0x62/0x1d0 [ptlrpc] [<ffffffffa06d59fa>] tgt_handle_request0+0x2ea/0x1490 [ptlrpc] [<ffffffffa067792c>] ? lustre_msg_get_opc+0x9c/0x110 [ptlrpc] [<ffffffffa06d6fda>] tgt_request_handle+0x43a/0x980 [ptlrpc] [<ffffffffa068a295>] ptlrpc_main+0xd25/0x1970 [ptlrpc] [<ffffffffa0689570>] ? ptlrpc_main+0x0/0x1970 [ptlrpc] [<ffffffff81096a36>] kthread+0x96/0xa0 [<ffffffff8100c0ca>] child_rip+0xa/0x20 [<ffffffff810969a0>] ? kthread+0x0/0xa0 [<ffffffff8100c0c0>] ? child_rip+0x0/0x20 Lustre: 8370:0:(mdt_open.c:1509:mdt_cross_open()) Object isn't on this server! FLD error? LustreError: 5849:0:(osp_md_object.c:965:osp_it_load()) LBUG Pid: 5849, comm: mdt_rdpg00_000 Call Trace: [<ffffffffa02a4895>] libcfs_debug_dumpstack+0x55/0x80 [libcfs] [<ffffffffa02a4e97>] lbug_with_loc+0x47/0xb0 [libcfs] [<ffffffffa0ccf8ef>] osp_it_load+0x1f/0x20 [osp] [<ffffffffa0c774d1>] lod_it_load+0x21/0x90 [lod] [<ffffffffa0448f6d>] dt_index_walk+0xad/0x3d0 [obdclass] [<ffffffffa0b4e0e0>] ? mdd_dir_page_build+0x0/0x210 [mdd] [<ffffffffa0cd2cc7>] ? osp_md_object_read_lock+0x87/0x110 [osp] [<ffffffffa0b4fd7b>] mdd_readpage+0x38b/0x5a0 [mdd] [<ffffffffa0bae4cd>] mdt_readpage+0x47d/0x980 [mdt] [<ffffffffa06d29fa>] tgt_handle_request0+0x2ea/0x1490 [ptlrpc] [<ffffffffa067492c>] ? lustre_msg_get_opc+0x9c/0x110 [ptlrpc] [<ffffffffa06d3fda>] tgt_request_handle+0x43a/0x980 [ptlrpc] [<ffffffffa0687295>] ptlrpc_main+0xd25/0x1970 [ptlrpc] [<ffffffffa0686570>] ? ptlrpc_main+0x0/0x1970 [ptlrpc] [<ffffffff81096a36>] kthread+0x96/0xa0 [<ffffffff8100c0ca>] child_rip+0xa/0x20 [<ffffffff810969a0>] ? kthread+0x0/0xa0 [<ffffffff8100c0c0>] ? 
child_rip+0x0/0x20 LustreError: 5879:0:(mdt_internal.h:565:mdt_object_child()) ASSERTION( o ) failed: LustreError: 5879:0:(mdt_internal.h:565:mdt_object_child()) LBUG Pid: 5879, comm: mdt00_000 Call Trace: [<ffffffffa02a9895>] libcfs_debug_dumpstack+0x55/0x80 [libcfs] [<ffffffffa02a9e97>] lbug_with_loc+0x47/0xb0 [libcfs] [<ffffffffa0bad287>] mdt_getattr_name_lock+0x1517/0x1b30 [mdt] [<ffffffffa069fb46>] ? __req_capsule_get+0x166/0x710 [ptlrpc] [<ffffffffa067a024>] ? lustre_msg_get_flags+0x34/0xb0 [ptlrpc] [<ffffffffa0badb33>] mdt_intent_getattr+0x293/0x470 [mdt] [<ffffffffa0b9ca99>] mdt_intent_policy+0x499/0xd50 [mdt] [<ffffffffa062f509>] ldlm_lock_enqueue+0x359/0x920 [ptlrpc] [<ffffffffa0658c4f>] ldlm_handle_enqueue0+0x4ef/0x10a0 [ptlrpc] [<ffffffffa06d3002>] tgt_enqueue+0x62/0x1d0 [ptlrpc] [<ffffffffa06d59fa>] tgt_handle_request0+0x2ea/0x1490 [ptlrpc] [<ffffffffa067792c>] ? lustre_msg_get_opc+0x9c/0x110 [ptlrpc] [<ffffffffa06d6fda>] tgt_request_handle+0x43a/0x980 [ptlrpc] [<ffffffffa068a295>] ptlrpc_main+0xd25/0x1970 [ptlrpc] [<ffffffffa0689570>] ? ptlrpc_main+0x0/0x1970 [ptlrpc] [<ffffffff81096a36>] kthread+0x96/0xa0 [<ffffffff8100c0ca>] child_rip+0xa/0x20 [<ffffffff810969a0>] ? kthread+0x0/0xa0 [<ffffffff8100c0c0>] ? child_rip+0x0/0x20 LustreError: 8715:0:(mdt_handler.c:1283:mdt_getattr_name_lock()) parent [0x2c0000401:0x1632:0x0] is on remote server LustreError: 8715:0:(mdt_handler.c:2298:mdt_object_lock0()) ASSERTION( ibits & 0x000001 ) failed: LustreError: 8715:0:(mdt_handler.c:2298:mdt_object_lock0()) LBUG Pid: 8715, comm: mdt01_005 Call Trace: [<ffffffffa02a9895>] libcfs_debug_dumpstack+0x55/0x80 [libcfs] [<ffffffffa02a9e97>] lbug_with_loc+0x47/0xb0 [libcfs] [<ffffffffa0ba28c3>] mdt_object_lock0+0xa93/0xaf0 [mdt] [<ffffffffa0449e42>] ? lu_object_find_at+0x2d2/0x360 [obdclass] [<ffffffffa067a024>] ? 
lustre_msg_get_flags+0x34/0xb0 [ptlrpc] [<ffffffffa0ba29e4>] mdt_object_lock+0x14/0x20 [mdt] [<ffffffffa0ba2a80>] mdt_intent_getxattr+0x90/0x160 [mdt] [<ffffffffa0449ee6>] ? lu_object_find+0x16/0x20 [obdclass] [<ffffffffa0b9ca99>] mdt_intent_policy+0x499/0xd50 [mdt] [<ffffffffa062f509>] ldlm_lock_enqueue+0x359/0x920 [ptlrpc] [<ffffffffa0658c4f>] ldlm_handle_enqueue0+0x4ef/0x10a0 [ptlrpc] [<ffffffffa06d3002>] tgt_enqueue+0x62/0x1d0 [ptlrpc] [<ffffffffa06d59fa>] tgt_handle_request0+0x2ea/0x1490 [ptlrpc] [<ffffffffa067792c>] ? lustre_msg_get_opc+0x9c/0x110 [ptlrpc] [<ffffffffa06d6fda>] tgt_request_handle+0x43a/0x980 [ptlrpc] [<ffffffffa068a295>] ptlrpc_main+0xd25/0x1970 [ptlrpc] [<ffffffffa0689570>] ? ptlrpc_main+0x0/0x1970 [ptlrpc] [<ffffffff81096a36>] kthread+0x96/0xa0 [<ffffffff8100c0ca>] child_rip+0xa/0x20 [<ffffffff810969a0>] ? kthread+0x0/0xa0 [<ffffffff8100c0c0>] ? child_rip+0x0/0x20 Lustre: 585:0:(mdt_open.c:1509:mdt_cross_open()) ....lod@ffff8801a746ad68lod-object@ffff8801a746ad68 Kernel panic - not syncing: LBUG Pid: 30558, comm: mdt_rdpg01_001 Not tainted 2.6.32-358.18.1.el6.lustre.x86_64 #1 Call Trace: [<ffffffff8150f018>] ? panic+0xa7/0x16f [<ffffffffa0e49eeb>] ? lbug_with_loc+0x9b/0xb0 [libcfs] [<ffffffffa08308ef>] ? osp_it_load+0x1f/0x20 [osp] [<ffffffffa07d84d1>] ? lod_it_load+0x21/0x90 [lod] [<ffffffffa0f99f6d>] ? dt_index_walk+0xad/0x3d0 [obdclass] [<ffffffffa069e0e0>] ? mdd_dir_page_build+0x0/0x210 [mdd] [<ffffffffa069fd7b>] ? mdd_readpage+0x38b/0x5a0 [mdd] [<ffffffffa070f5dd>] ? mdt_readpage+0x47d/0x980 [mdt] [<ffffffffa12239fa>] ? tgt_handle_request0+0x2ea/0x1490 [ptlrpc] [<ffffffffa11c592c>] ? lustre_msg_get_opc+0x9c/0x110 [ptlrpc] [<ffffffffa1224fda>] ? tgt_request_handle+0x43a/0x980 [ptlrpc] [<ffffffffa11d8295>] ? ptlrpc_main+0xd25/0x1970 [ptlrpc] [<ffffffffa11d7570>] ? ptlrpc_main+0x0/0x1970 [ptlrpc] [<ffffffff81096a36>] ? kthread+0x96/0xa0 [<ffffffff8100c0ca>] ? child_rip+0xa/0x20 [<ffffffff810969a0>] ? kthread+0x0/0xa0 [<ffffffff8100c0c0>] ? 
child_rip+0x0/0x20