Details
-
Bug
-
Resolution: Unresolved
-
Major
-
None
-
None
-
None
-
3
-
9223372036854775807
Description
LBUG:
console-20220406:2022-04-06T05:19:20.214620-05:00 c0-0c0s11n1 LustreError: 6221:0:(osc_request.c:1685:osc_brw_prep_request()) ASSERTION( page_count == 1 || (ergo(i == 0, poff + pg->count == PAGE_SIZE) && ergo(i > 0 && i < page_count - 1, poff == 0 && pg->count == PAGE_SIZE) && ergo(i == page_count - 1, poff == 0)) ) failed: i: 0/2 pg: 00000000295a3943 off: 0, count: 3928 console-20220406:2022-04-06T05:19:20.214684-05:00 c0-0c0s11n1 LustreError: 6221:0:(osc_request.c:1685:osc_brw_prep_request()) LBUG console-20220406:2022-04-06T05:19:20.214702-05:00 c0-0c0s11n1 Pid: 6221, comm: ptlrpcd_00_13 5.3.18-59.34_7.0.4.6-cray_ari_c #1 SMP Wed Mar 2 05:45:36 UTC 2022 (d0a6bb5) console-20220406:2022-04-06T05:19:20.214726-05:00 c0-0c0s11n1 Call Trace TBD: console-20220406:2022-04-06T05:19:20.214733-05:00 c0-0c0s11n1 [<0>] libcfs_call_trace+0x74/0xc0 [libcfs] console-20220406:2022-04-06T05:19:20.214745-05:00 c0-0c0s11n1 [<0>] lbug_with_loc+0x43/0x90 [libcfs] console-20220406:2022-04-06T05:19:20.214751-05:00 c0-0c0s11n1 [<0>] osc_brw_prep_request+0x163e/0x1ab0 [osc] console-20220406:2022-04-06T05:19:20.214756-05:00 c0-0c0s11n1 [<0>] osc_build_rpc+0xa11/0x1130 [osc] console-20220406:2022-04-06T05:19:20.214761-05:00 c0-0c0s11n1 [<0>] osc_io_unplug0+0x1549/0x17e0 [osc] console-20220406:2022-04-06T05:19:20.214772-05:00 c0-0c0s11n1 [<0>] brw_queue_work+0x33/0xd0 [osc] console-20220406:2022-04-06T05:19:20.214777-05:00 c0-0c0s11n1 [<0>] work_interpreter+0x33/0x100 [ptlrpc] console-20220406:2022-04-06T05:19:20.214788-05:00 c0-0c0s11n1 [<0>] ptlrpc_check_set+0x54b/0x2060 [ptlrpc] console-20220406:2022-04-06T05:19:20.214810-05:00 c0-0c0s11n1 [<0>] ptlrpcd+0x820/0xa10 [ptlrpc] console-20220406:2022-04-06T05:19:20.214820-05:00 c0-0c0s11n1 [<0>] kthread+0x120/0x140 console-20220406:2022-04-06T05:19:20.214841-05:00 c0-0c0s11n1 [<0>] ret_from_fork+0x3a/0x50 console-20220406:2022-04-06T05:19:20.214852-05:00 c0-0c0s11n1 Kernel panic - not syncing: LBUG console-20220406:2022-04-06T05:19:20.214887-05:00 c0-0c0s11n1 CPU: 39 PID: 6221 Comm: ptlrpcd_00_13 Tainted: P O 5.3.18-59.34_7.0.4.6-cray_ari_c #1 SLE15-SP3 (unreleased) console-20220406:2022-04-06T05:19:20.214918-05:00 c0-0c0s11n1 Hardware name: Cray Inc. Cascade/Cascade, BIOS 4.6.5 09/05/2019 console-20220406:2022-04-06T05:19:20.214925-05:00 c0-0c0s11n1 Call Trace: console-20220406:2022-04-06T05:19:20.214931-05:00 c0-0c0s11n1 dump_stack+0x7a/0xa5 console-20220406:2022-04-06T05:19:20.214936-05:00 c0-0c0s11n1 panic+0xfd/0x2c9 console-20220406:2022-04-06T05:19:20.214941-05:00 c0-0c0s11n1 lbug_with_loc+0x89/0x90 [libcfs] console-20220406:2022-04-06T05:19:20.214946-05:00 c0-0c0s11n1 osc_brw_prep_request+0x163e/0x1ab0 [osc] console-20220406:2022-04-06T05:19:20.214951-05:00 c0-0c0s11n1 ? osc_req_attr_set+0x1ed/0x680 [osc] console-20220406:2022-04-06T05:19:20.214956-05:00 c0-0c0s11n1 osc_build_rpc+0xa11/0x1130 [osc] console-20220406:2022-04-06T05:19:20.214963-05:00 c0-0c0s11n1 osc_io_unplug0+0x1549/0x17e0 [osc] console-20220406:2022-04-06T05:19:20.214987-05:00 c0-0c0s11n1 ? trace_hardirqs_on+0x38/0xe0 console-20220406:2022-04-06T05:19:20.214993-05:00 c0-0c0s11n1 ? __schedule+0x2f7/0x7c0 console-20220406:2022-04-06T05:19:20.214998-05:00 c0-0c0s11n1 brw_queue_work+0x33/0xd0 [osc] console-20220406:2022-04-06T05:19:20.215003-05:00 c0-0c0s11n1 work_interpreter+0x33/0x100 [ptlrpc] console-20220406:2022-04-06T05:19:20.215008-05:00 c0-0c0s11n1 ptlrpc_check_set+0x54b/0x2060 [ptlrpc] console-20220406:2022-04-06T05:19:20.215012-05:00 c0-0c0s11n1 ptlrpcd+0x820/0xa10 [ptlrpc] console-20220406:2022-04-06T05:19:20.215031-05:00 c0-0c0s11n1 ? do_wait_intr_irq+0x90/0x90 console-20220406:2022-04-06T05:19:20.215037-05:00 c0-0c0s11n1 kthread+0x120/0x140 console-20220406:2022-04-06T05:19:20.215043-05:00 c0-0c0s11n1 ? ptlrpcd_ctl_init+0x180/0x180 [ptlrpc] console-20220406:2022-04-06T05:19:20.215048-05:00 c0-0c0s11n1 ? kthread_create_worker_on_cpu+0x70/0x70 console-20220406:2022-04-06T05:19:20.215054-05:00 c0-0c0s11n1 ret_from_fork+0x3a/0x50 console-20220406:2022-04-06T05:19:20.215059-05:00 c0-0c0s11n1 Shutting down cpus with NMI console-20220406:2022-04-06T05:19:20.215064-05:00 c0-0c0s11n1 Kernel Offset: disabled console-20220406:2022-04-06T05:19:20.215070-05:00 c0-0c0s11n1 ---[ end Kernel panic - not syncing: LBUG ]---
The backtrace:
crash_x86_64> bt -ll PID: 8139 TASK: ffff888f88b15000 CPU: 24 COMMAND: "ptlrpcd_00_08" #0 [ffffc90007d779c8] panic at ffffffff8107aab7 /home/abuild/rpmbuild/BUILD/kernel-cray_ari_c-5.3.18/linux-5.3.18/linux-obj/../kernel/panic.c: 342 #1 [ffffc90007d77a50] lbug_with_loc at ffffffffa0246529 [libcfs] /home/abuild/rpmbuild/BUILD/cray-lustre-2.15.0.3_rc2_cray_5_gad33231/libcfs/libcfs/debug.c: 476 #2 [ffffc90007d77a70] osc_brw_prep_request at ffffffffa07ae4ee [osc] /home/abuild/rpmbuild/BUILD/cray-lustre-2.15.0.3_rc2_cray_5_gad33231/lustre/osc/osc_request.c: 1679 #3 [ffffc90007d77ba0] osc_build_rpc at ffffffffa07b3631 [osc] /home/abuild/rpmbuild/BUILD/cray-lustre-2.15.0.3_rc2_cray_5_gad33231/lustre/osc/osc_request.c: 2618 #4 [ffffc90007d77c60] osc_extent_finish at ffffffffa07cce99 [osc] /home/abuild/rpmbuild/BUILD/cray-lustre-2.15.0.3_rc2_cray_5_gad33231/lustre/osc/osc_cache.c: 2060 #5 [ffffc90007d77d40] brw_queue_work at ffffffffa07a6433 [osc] /home/abuild/rpmbuild/BUILD/cray-lustre-2.15.0.3_rc2_cray_5_gad33231/libcfs/include/libcfs/libcfs_debug.h: 155 #6 [ffffc90007d77d60] work_interpreter at ffffffffa0671df3 [ptlrpc] /usr/src/linux-5.3.18-59.34_7.0.4.6/include/linux/list.h: 135 #7 [ffffc90007d77d80] ptlrpc_check_set at ffffffffa067a88b [ptlrpc] /home/abuild/rpmbuild/BUILD/cray-lustre-2.15.0.3_rc2_cray_5_gad33231/lustre/include/lustre_net.h: 1151 #8 [ffffc90007d77e10] ptlrpcd at ffffffffa06a7ce0 [ptlrpc] /home/abuild/rpmbuild/BUILD/cray-lustre-2.15.0.3_rc2_cray_5_gad33231/lustre/ptlrpc/ptlrpcd.c: 361 #9 [ffffc90007d77f08] kthread at ffffffff810a2400 /home/abuild/rpmbuild/BUILD/kernel-cray_ari_c-5.3.18/linux-5.3.18/linux-obj/../kernel/kthread.c: 274 #10 [ffffc90007d77f50] ret_from_fork at ffffffff8180021a /home/abuild/rpmbuild/BUILD/kernel-cray_ari_c-5.3.18/linux-5.3.18/linux-obj/../arch/x86/entry/entry_64.S: 360 crash_x86_64>
The array pga which consists of brw_page entries is starting with a truncated page:
crash_x86_64> bt -f PID: 8139 TASK: ffff888f88b15000 CPU: 24 COMMAND: "ptlrpcd_00_08" #0 [ffffc90007d779c8] panic at ffffffff8107aab7 ffffc90007d779d0: ffffc90000000008 ffffc90007d77a58 ... ffffc90007d77b88: ffff888788dd28c0 ffff888788dd28d0 ffffc90007d77b98: ffffc90007d77c58 ffffffffa07b3631 #3 [ffffc90007d77ba0] osc_build_rpc at ffffffffa07b3631 [osc] ffffc90007d77ba8: 0000000000000000 0000000007d77bc8 ffffc90007d77bb8: 0000000000000000 ffff888789586af8 ...
Examine the entries (there are 2):
crash_x86_64> rd ffff888788dd28c0 8 ffff888788dd28c0: ffff8887814ae390 ffff8887814ae240 ..J.....@.J..... ffff888788dd28d0: 0000000000000000 0000000000000000 ................ ... crash_x86_64> brw_page ffff8887814ae390 struct brw_page { off = 0, pg = 0xffffea00205343c0, count = 3839, <------ A short page flag = 682, bp_off_diff = 0, bp_count_diff = 0, bp_padding = 0 } crash_x86_64> brw_page ffff8887814ae240 struct brw_page { off = 4096, pg = 0xffffea000008a7c0, count = 4096, flag = 682, bp_off_diff = 0, bp_count_diff = 0, bp_padding = 0 }
Where did this come from? Maybe the async_page will give us more information:
crash_x86_64> whatis osc_async_page.oap_brw_page struct osc_async_page { [72] struct brw_page oap_brw_page; } crash_x86_64> eval ffff8887814ae390 -72 hexadecimal: ffff8887814ae348 crash_x86_64> eval ffff8887814ae240 -72 hexadecimal: ffff8887814ae1f8 crash_x86_64> struct osc_async_page ffff8887814ae348 struct osc_async_page { oap_magic = 8675309, oap_cmd = 2, ... oap_obj_off = 0, oap_page_off = 0, oap_async_flags = (ASYNC_READY | ASYNC_URGENT | ASYNC_COUNT_STABLE), oap_brw_page = { off = 0, <----- Start of object pg = 0xffffea00205343c0, count = 3839, <----- Short page flag = 682, bp_off_diff = 0, bp_count_diff = 0, bp_padding = 0 }, crash_x86_64> struct osc_async_page ffff8887814ae1f8 struct osc_async_page { oap_magic = 8675309, oap_cmd = 2, ... oap_obj_off = 4096, <--- 1 page into the object oap_page_off = 0, oap_async_flags = (ASYNC_READY | ASYNC_URGENT | ASYNC_COUNT_STABLE), oap_brw_page = { off = 4096, pg = 0xffffea000008a7c0, count = 4096, <-------- full page at end of object flag = 682, bp_off_diff = 0, bp_count_diff = 0, bp_padding = 0 }, ...
It looks like non-contiguous aio/dio pages, checking the cl_page of each
crash_x86_64> whatis osc_page.ops_oap struct osc_page { [24] struct osc_async_page ops_oap; } crash_x86_64> eval 0xffff8887814ae348 - 24 hexadecimal: ffff8887814ae330 crash_x86_64> struct osc_page ffff8887814ae330 struct osc_page { ops_cl = { cpl_page = 0xffff8887814ae2a0, cpl_obj = 0xffff88878b4f4140, cpl_ops = 0xffffffffa07d9f00 <osc_page_ops> }, ops_oap = { crash_x86_64> struct cl_page 0xffff8887814ae2a0 struct cl_page { cp_ref = { counter = 2 }, cp_lov_index = 0, cp_osc_index = 0, cp_obj = 0xffff888781d2b958, cp_vmpage = 0xffffea00205343c0, cp_inode = 0x0, cp_batch = { next = 0xffff8887814ae178, prev = 0xffff888787dc28d8 }, cp_layer_offset = "\000(@", cp_layer_count = 3 '\003', cp_state = CPS_PAGEOUT, cp_type = CPT_TRANSIENT, <-- DIO/AIO cp_kmem_index = 0, cp_unused1 = 0, cp_owner = 0x0, cp_reference = {<No data fields>}, cp_obj_ref = {<No data fields>}, cp_queue_ref = {<No data fields>}, cp_sync_io = 0xffff888787dc2890 }
And the second page is also:
crash_x86_64> struct osc_page ffff8887814ae1e0 struct osc_page { ops_cl = { cpl_page = 0xffff8887814ae150, cpl_obj = 0xffff88878b4f4140, cpl_ops = 0xffffffffa07d9f00 <osc_page_ops> }, ops_oap = { ... crash_x86_64> struct cl_page 0xffff8887814ae150 struct cl_page { cp_ref = { counter = 2 }, cp_lov_index = 0, cp_osc_index = 1, cp_obj = 0xffff888781d2b958, cp_vmpage = 0xffffea000008a7c0, cp_inode = 0x0, cp_batch = { next = 0xffff888787dc28d8, prev = 0xffff8887814ae2c8 }, cp_layer_offset = "\000(@", cp_layer_count = 3 '\003', cp_state = CPS_PAGEOUT, cp_type = CPT_TRANSIENT, cp_kmem_index = 0, cp_unused1 = 0, cp_owner = 0x0, cp_reference = {<No data fields>}, cp_obj_ref = {<No data fields>}, cp_queue_ref = {<No data fields>}, cp_sync_io = 0xffff888787dc2890 } crash_x86_64>