Details
-
Bug
-
Resolution: Fixed
-
Minor
-
None
-
None
-
3
-
9223372036854775807
Description
This problem has been discovered when trying to run with recent master configured with USE_LU_REF defined ("configure --enable-lu_ref").
After re-starting/mounting this specific version with an existing FileSystem, I have tried to clean-up its content using a "rm -rf /mnt/lustre/*" command and have experienced the following LBUG :
crash> dmesg | less ....................................... [84168.041219] Lustre: lustre-OST0001: Connection restored to 73f99590-7799-4 (at 10.8.1.5@tcp) [84168.053607] Lustre: Skipped 4 previous similar messages [84247.829852] LustreError: 94510:0:(lu_ref.c:96:lu_ref_print()) lu_ref: ffff8dc74be5c5a0 1 0 ldlm_lock_new:495 [84247.843091] LustreError: 94510:0:(lu_ref.c:98:lu_ref_print()) link: handle ffff8dc7abb78000 [84247.881978] LustreError: 94510:0:(lu_ref.c:257:lu_ref_del()) ASSERTION( 0 ) failed: [84247.892251] LustreError: 94510:0:(lu_ref.c:257:lu_ref_del()) LBUG [84247.900585] Pid: 94510, comm: jbd2/dm-0-8 3.10.0-862.14.4.el7_lustre_ClientSymlink_279c264.x86_64 #1 SMP Thu Oct 17 10:54:24 UTC 2019 [84247.917060] Call Trace: [84247.921267] [<ffffffffc0d4f0ec>] libcfs_call_trace+0x8c/0xc0 [libcfs] [84247.930052] [<ffffffffc0d4f19c>] lbug_with_loc+0x4c/0xa0 [libcfs] [84247.938357] [<ffffffffc0df9ab0>] lu_ref_set_at+0x0/0x160 [obdclass] [84247.946908] [<ffffffffc10d5373>] tgt_cancel_slc_locks+0x183/0x1f0 [ptlrpc] [84247.956058] [<ffffffffc10d733f>] tgt_cb_last_committed+0x2cf/0x3d0 [ptlrpc] [84247.965195] [<ffffffffc1438a43>] osd_trans_commit_cb+0xd3/0x3c0 [osd_ldiskfs] [84247.974493] [<ffffffffc13cbf23>] ldiskfs_journal_commit_callback+0x93/0xd0 [ldiskfs] [84247.984413] [<ffffffffc03a36a5>] jbd2_journal_commit_transaction+0x1635/0x19b0 [jbd2] [84247.994415] [<ffffffffc03a8ab9>] kjournald2+0xc9/0x260 [jbd2] [84248.002069] [<ffffffffb02bdf21>] kthread+0xd1/0xe0 [84248.008622] [<ffffffffb09255f7>] ret_from_fork_nospec_end+0x0/0x39 [84248.016717] [<ffffffffffffffff>] 0xffffffffffffffff [84248.023336] Kernel panic - not syncing: LBUG [84248.029116] CPU: 23 PID: 94510 Comm: jbd2/dm-0-8 Kdump: loaded Tainted: G IOE ------------ 3.10.0-862.14.4.el7_lustre_ClientSymlink_279c264.x86_64 #1 [84248.047299] Hardware name: Intel Corporation S2600WTT/S2600WTT, BIOS SE5C610.86B.01.01.0008.021120151325 02/11/2015 [84248.060028] Call Trace: [84248.063811] [<ffffffffb0913754>] dump_stack+0x19/0x1b [84248.070599] [<ffffffffb090d29f>] panic+0xe8/0x21f [84248.076984] [<ffffffffc0d4f1eb>] lbug_with_loc+0x9b/0xa0 [libcfs] [84248.084924] [<ffffffffc0df9ab0>] lu_ref_del+0x230/0x230 [obdclass] [84248.092967] [<ffffffffc10d5373>] tgt_cancel_slc_locks+0x183/0x1f0 [ptlrpc] [84248.101777] [<ffffffffc10d733f>] tgt_cb_last_committed+0x2cf/0x3d0 [ptlrpc] [84248.110644] [<ffffffffc1438a43>] osd_trans_commit_cb+0xd3/0x3c0 [osd_ldiskfs] [84248.119399] LustreError: 94525:0:(lu_ref.c:257:lu_ref_del()) ASSERTION( 0 ) failed: [84248.119402] LustreError: 94525:0:(lu_ref.c:257:lu_ref_del()) LBUG [84248.119405] Pid: 94525, comm: ldlm_bl_02 3.10.0-862.14.4.el7_lustre_ClientSymlink_279c264.x86_64 #1 SMP Thu Oct 17 10:54:24 UTC 2019 [84248.119406] Call Trace: [84248.119442] [<ffffffffc0d4f0ec>] libcfs_call_trace+0x8c/0xc0 [libcfs] [84248.119453] [<ffffffffc0d4f19c>] lbug_with_loc+0x4c/0xa0 [libcfs] [84248.119513] [<ffffffffc0df9ab0>] lu_ref_set_at+0x0/0x160 [obdclass] [84248.119614] [<ffffffffc10d308c>] tgt_discard_slc_lock+0x8c/0x120 [ptlrpc] [84248.119642] [<ffffffffc15c4ac7>] mdt_remote_blocking_ast+0x97/0x5a0 [mdt] [84248.119695] [<ffffffffc102efaa>] ldlm_cancel_callback+0x8a/0x2e0 [ptlrpc] [84248.119754] [<ffffffffc1046891>] ldlm_cli_cancel_local+0xd1/0x420 [ptlrpc] [84248.119811] [<ffffffffc104c76c>] ldlm_cli_cancel+0x12c/0x560 [ptlrpc] [84248.119838] [<ffffffffc15c4bb7>] mdt_remote_blocking_ast+0x187/0x5a0 [mdt] [84248.119896] [<ffffffffc1058668>] ldlm_handle_bl_callback+0xc8/0x3e0 [ptlrpc] [84248.119953] [<ffffffffc105911d>] ldlm_bl_thread_main+0x79d/0x9c0 [ptlrpc] [84248.119959] [<ffffffffb02bdf21>] kthread+0xd1/0xe0 [84248.119965] [<ffffffffb09255f7>] ret_from_fork_nospec_end+0x0/0x39 [84248.119997] [<ffffffffffffffff>] 0xffffffffffffffff [84248.269937] [<ffffffffc13cbf23>] ldiskfs_journal_commit_callback+0x93/0xd0 [ldiskfs] [84248.279610] [<ffffffffc03a36a5>] jbd2_journal_commit_transaction+0x1635/0x19b0 [jbd2] [84248.289367] [<ffffffffc03a8ab9>] kjournald2+0xc9/0x260 [jbd2] [84248.296764] [<ffffffffb02bef10>] ? wake_up_atomic_t+0x30/0x30 [84248.304160] [<ffffffffc03a89f0>] ? commit_timeout+0x10/0x10 [jbd2] [84248.312030] [<ffffffffb02bdf21>] kthread+0xd1/0xe0 [84248.318355] [<ffffffffb02bde50>] ? insert_kthread_work+0x40/0x40 [84248.326046] [<ffffffffb09255f7>] ret_from_fork_nospec_begin+0x21/0x21 [84248.334218] [<ffffffffb02bde50>] ? insert_kthread_work+0x40/0x40 (END) crash> crash> bt PID: 94510 TASK: ffff8dc8d9761fa0 CPU: 23 COMMAND: "jbd2/dm-0-8" #0 [ffff8dc743b17998] machine_kexec at ffffffffb0262a0a #1 [ffff8dc743b179f8] __crash_kexec at ffffffffb03166c2 #2 [ffff8dc743b17ac8] panic at ffffffffb090d2aa #3 [ffff8dc743b17b48] lbug_with_loc at ffffffffc0d4f1eb [libcfs] #4 [ffff8dc743b17ba0] tgt_cancel_slc_locks at ffffffffc10d5373 [ptlrpc] #5 [ffff8dc743b17c08] tgt_cb_last_committed at ffffffffc10d733f [ptlrpc] #6 [ffff8dc743b17c20] osd_trans_commit_cb at ffffffffc1438a43 [osd_ldiskfs] #7 [ffff8dc743b17c68] ldiskfs_journal_commit_callback at ffffffffc13cbf23 [ldiskfs] #8 [ffff8dc743b17ca8] jbd2_journal_commit_transaction at ffffffffc03a36a5 [jbd2] #9 [ffff8dc743b17e48] kjournald2 at ffffffffc03a8ab9 [jbd2] #10 [ffff8dc743b17ec8] kthread at ffffffffb02bdf21 crash> crash> bt 94525 PID: 94525 TASK: ffff8dc8862e6eb0 CPU: 11 COMMAND: "ldlm_bl_02" #0 [ffff8dc885f1bae0] panic at ffffffffb090d223 #1 [ffff8dc885f1bb60] lbug_with_loc at ffffffffc0d4f1eb [libcfs] #2 [ffff8dc885f1bbb8] tgt_discard_slc_lock at ffffffffc10d308c [ptlrpc] #3 [ffff8dc885f1bbe0] mdt_remote_blocking_ast at ffffffffc15c4ac7 [mdt] #4 [ffff8dc885f1bc60] ldlm_cancel_callback at ffffffffc102efaa [ptlrpc] #5 [ffff8dc885f1bcd0] ldlm_cli_cancel_local at ffffffffc1046891 [ptlrpc] #6 [ffff8dc885f1bcf8] ldlm_cli_cancel at ffffffffc104c76c [ptlrpc] #7 [ffff8dc885f1bd78] mdt_remote_blocking_ast at ffffffffc15c4bb7 [mdt] #8 [ffff8dc885f1bdf8] ldlm_handle_bl_callback at ffffffffc1058668 [ptlrpc] #9 [ffff8dc885f1be28] ldlm_bl_thread_main at ffffffffc105911d [ptlrpc] #10 [ffff8dc885f1bec8] kthread at ffffffffb02bdf21 crash> crash> lu_ref_link 0xffff8dc74be64398 struct lu_ref_link { ll_ref = 0xffff8dc74be5c5a0, ll_linkage = { next = 0xffff8dc74be5c5a8, prev = 0xffff8dc74be5c5a8 }, ll_scope = 0xffffffffc1134dc0 "handle", ll_source = 0xffff8dc7abb78000 } crash> crash> bt ffff8dc7abb78000 PID: 95382 TASK: ffff8dc7abb78000 CPU: 26 COMMAND: "mdt01_008" #0 [ffff8dc72568bcb8] __schedule at ffffffffb09188c4 #1 [ffff8dc72568bd40] schedule at ffffffffb0918f39 #2 [ffff8dc72568bd50] ptlrpc_wait_event at ffffffffc1086c45 [ptlrpc] #3 [ffff8dc72568bdf0] ptlrpc_main at ffffffffc108ed6d [ptlrpc] #4 [ffff8dc72568bec8] kthread at ffffffffb02bdf21 crash>
Further crash-dump debugging and associated source code browsing points to the fact that lu_ref_del()/REFASSERT() triggers the !(ref->lf_failed > 0) error/LBUG condition because lu_ref_find() has not been able to find the original lu_ref_link because it has been tagged using the "current" task_struct pointer but it is being deleted within an other jbd2/ldlm task context.
So a possible fix could be to not use "current" as the tag, in both concerned __ldlm_handle2lock() function and LDLM_LOCK_REF_DEL()macro, but the ldlm_lock address instead... Will push a patch soon to implement this fix and see how it works.
Landed for 2.14