Details
-
Bug
-
Resolution: Unresolved
-
Major
-
None
-
None
-
None
-
3
-
9223372036854775807
Description
LBUG:
console-20220406:2022-04-06T05:19:20.214620-05:00 c0-0c0s11n1 LustreError: 6221:0:(osc_request.c:1685:osc_brw_prep_request()) ASSERTION( page_count == 1 || (ergo(i == 0, poff + pg->count == PAGE_SIZE) && ergo(i > 0 && i < page_count - 1, poff == 0 && pg->count == PAGE_SIZE) && ergo(i == page_count - 1, poff == 0)) ) failed: i: 0/2 pg: 00000000295a3943 off: 0, count: 3928 console-20220406:2022-04-06T05:19:20.214684-05:00 c0-0c0s11n1 LustreError: 6221:0:(osc_request.c:1685:osc_brw_prep_request()) LBUG console-20220406:2022-04-06T05:19:20.214702-05:00 c0-0c0s11n1 Pid: 6221, comm: ptlrpcd_00_13 5.3.18-59.34_7.0.4.6-cray_ari_c #1 SMP Wed Mar 2 05:45:36 UTC 2022 (d0a6bb5) console-20220406:2022-04-06T05:19:20.214726-05:00 c0-0c0s11n1 Call Trace TBD: console-20220406:2022-04-06T05:19:20.214733-05:00 c0-0c0s11n1 [<0>] libcfs_call_trace+0x74/0xc0 [libcfs] console-20220406:2022-04-06T05:19:20.214745-05:00 c0-0c0s11n1 [<0>] lbug_with_loc+0x43/0x90 [libcfs] console-20220406:2022-04-06T05:19:20.214751-05:00 c0-0c0s11n1 [<0>] osc_brw_prep_request+0x163e/0x1ab0 [osc] console-20220406:2022-04-06T05:19:20.214756-05:00 c0-0c0s11n1 [<0>] osc_build_rpc+0xa11/0x1130 [osc] console-20220406:2022-04-06T05:19:20.214761-05:00 c0-0c0s11n1 [<0>] osc_io_unplug0+0x1549/0x17e0 [osc] console-20220406:2022-04-06T05:19:20.214772-05:00 c0-0c0s11n1 [<0>] brw_queue_work+0x33/0xd0 [osc] console-20220406:2022-04-06T05:19:20.214777-05:00 c0-0c0s11n1 [<0>] work_interpreter+0x33/0x100 [ptlrpc] console-20220406:2022-04-06T05:19:20.214788-05:00 c0-0c0s11n1 [<0>] ptlrpc_check_set+0x54b/0x2060 [ptlrpc] console-20220406:2022-04-06T05:19:20.214810-05:00 c0-0c0s11n1 [<0>] ptlrpcd+0x820/0xa10 [ptlrpc] console-20220406:2022-04-06T05:19:20.214820-05:00 c0-0c0s11n1 [<0>] kthread+0x120/0x140 console-20220406:2022-04-06T05:19:20.214841-05:00 c0-0c0s11n1 [<0>] ret_from_fork+0x3a/0x50 console-20220406:2022-04-06T05:19:20.214852-05:00 c0-0c0s11n1 Kernel panic - not syncing: LBUG 
console-20220406:2022-04-06T05:19:20.214887-05:00 c0-0c0s11n1 CPU: 39 PID: 6221 Comm: ptlrpcd_00_13 Tainted: P O 5.3.18-59.34_7.0.4.6-cray_ari_c #1 SLE15-SP3 (unreleased) console-20220406:2022-04-06T05:19:20.214918-05:00 c0-0c0s11n1 Hardware name: Cray Inc. Cascade/Cascade, BIOS 4.6.5 09/05/2019 console-20220406:2022-04-06T05:19:20.214925-05:00 c0-0c0s11n1 Call Trace: console-20220406:2022-04-06T05:19:20.214931-05:00 c0-0c0s11n1 dump_stack+0x7a/0xa5 console-20220406:2022-04-06T05:19:20.214936-05:00 c0-0c0s11n1 panic+0xfd/0x2c9 console-20220406:2022-04-06T05:19:20.214941-05:00 c0-0c0s11n1 lbug_with_loc+0x89/0x90 [libcfs] console-20220406:2022-04-06T05:19:20.214946-05:00 c0-0c0s11n1 osc_brw_prep_request+0x163e/0x1ab0 [osc] console-20220406:2022-04-06T05:19:20.214951-05:00 c0-0c0s11n1 ? osc_req_attr_set+0x1ed/0x680 [osc] console-20220406:2022-04-06T05:19:20.214956-05:00 c0-0c0s11n1 osc_build_rpc+0xa11/0x1130 [osc] console-20220406:2022-04-06T05:19:20.214963-05:00 c0-0c0s11n1 osc_io_unplug0+0x1549/0x17e0 [osc] console-20220406:2022-04-06T05:19:20.214987-05:00 c0-0c0s11n1 ? trace_hardirqs_on+0x38/0xe0 console-20220406:2022-04-06T05:19:20.214993-05:00 c0-0c0s11n1 ? __schedule+0x2f7/0x7c0 console-20220406:2022-04-06T05:19:20.214998-05:00 c0-0c0s11n1 brw_queue_work+0x33/0xd0 [osc] console-20220406:2022-04-06T05:19:20.215003-05:00 c0-0c0s11n1 work_interpreter+0x33/0x100 [ptlrpc] console-20220406:2022-04-06T05:19:20.215008-05:00 c0-0c0s11n1 ptlrpc_check_set+0x54b/0x2060 [ptlrpc] console-20220406:2022-04-06T05:19:20.215012-05:00 c0-0c0s11n1 ptlrpcd+0x820/0xa10 [ptlrpc] console-20220406:2022-04-06T05:19:20.215031-05:00 c0-0c0s11n1 ? do_wait_intr_irq+0x90/0x90 console-20220406:2022-04-06T05:19:20.215037-05:00 c0-0c0s11n1 kthread+0x120/0x140 console-20220406:2022-04-06T05:19:20.215043-05:00 c0-0c0s11n1 ? ptlrpcd_ctl_init+0x180/0x180 [ptlrpc] console-20220406:2022-04-06T05:19:20.215048-05:00 c0-0c0s11n1 ? 
kthread_create_worker_on_cpu+0x70/0x70 console-20220406:2022-04-06T05:19:20.215054-05:00 c0-0c0s11n1 ret_from_fork+0x3a/0x50 console-20220406:2022-04-06T05:19:20.215059-05:00 c0-0c0s11n1 Shutting down cpus with NMI console-20220406:2022-04-06T05:19:20.215064-05:00 c0-0c0s11n1 Kernel Offset: disabled console-20220406:2022-04-06T05:19:20.215070-05:00 c0-0c0s11n1 ---[ end Kernel panic - not syncing: LBUG ]---
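The backtrace from the crash dump (note: the PID, CPU and short-page count in this dump differ from the console log above, so it appears to be from a separate occurrence of the same LBUG):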
crash_x86_64> bt -ll
PID: 8139 TASK: ffff888f88b15000 CPU: 24 COMMAND: "ptlrpcd_00_08"
#0 [ffffc90007d779c8] panic at ffffffff8107aab7
/home/abuild/rpmbuild/BUILD/kernel-cray_ari_c-5.3.18/linux-5.3.18/linux-obj/../kernel/panic.c: 342
#1 [ffffc90007d77a50] lbug_with_loc at ffffffffa0246529 [libcfs]
/home/abuild/rpmbuild/BUILD/cray-lustre-2.15.0.3_rc2_cray_5_gad33231/libcfs/libcfs/debug.c: 476
#2 [ffffc90007d77a70] osc_brw_prep_request at ffffffffa07ae4ee [osc]
/home/abuild/rpmbuild/BUILD/cray-lustre-2.15.0.3_rc2_cray_5_gad33231/lustre/osc/osc_request.c: 1679
#3 [ffffc90007d77ba0] osc_build_rpc at ffffffffa07b3631 [osc]
/home/abuild/rpmbuild/BUILD/cray-lustre-2.15.0.3_rc2_cray_5_gad33231/lustre/osc/osc_request.c: 2618
#4 [ffffc90007d77c60] osc_extent_finish at ffffffffa07cce99 [osc]
/home/abuild/rpmbuild/BUILD/cray-lustre-2.15.0.3_rc2_cray_5_gad33231/lustre/osc/osc_cache.c: 2060
#5 [ffffc90007d77d40] brw_queue_work at ffffffffa07a6433 [osc]
/home/abuild/rpmbuild/BUILD/cray-lustre-2.15.0.3_rc2_cray_5_gad33231/libcfs/include/libcfs/libcfs_debug.h: 155
#6 [ffffc90007d77d60] work_interpreter at ffffffffa0671df3 [ptlrpc]
/usr/src/linux-5.3.18-59.34_7.0.4.6/include/linux/list.h: 135
#7 [ffffc90007d77d80] ptlrpc_check_set at ffffffffa067a88b [ptlrpc]
/home/abuild/rpmbuild/BUILD/cray-lustre-2.15.0.3_rc2_cray_5_gad33231/lustre/include/lustre_net.h: 1151
#8 [ffffc90007d77e10] ptlrpcd at ffffffffa06a7ce0 [ptlrpc]
/home/abuild/rpmbuild/BUILD/cray-lustre-2.15.0.3_rc2_cray_5_gad33231/lustre/ptlrpc/ptlrpcd.c: 361
#9 [ffffc90007d77f08] kthread at ffffffff810a2400
/home/abuild/rpmbuild/BUILD/kernel-cray_ari_c-5.3.18/linux-5.3.18/linux-obj/../kernel/kthread.c: 274
#10 [ffffc90007d77f50] ret_from_fork at ffffffff8180021a
/home/abuild/rpmbuild/BUILD/kernel-cray_ari_c-5.3.18/linux-5.3.18/linux-obj/../arch/x86/entry/entry_64.S: 360
crash_x86_64>
The pga array, which consists of brw_page entries, starts with a truncated (short) page:
crash_x86_64> bt -f
PID: 8139 TASK: ffff888f88b15000 CPU: 24 COMMAND: "ptlrpcd_00_08"
#0 [ffffc90007d779c8] panic at ffffffff8107aab7
ffffc90007d779d0: ffffc90000000008 ffffc90007d77a58
...
ffffc90007d77b88: ffff888788dd28c0 ffff888788dd28d0
ffffc90007d77b98: ffffc90007d77c58 ffffffffa07b3631
#3 [ffffc90007d77ba0] osc_build_rpc at ffffffffa07b3631 [osc]
ffffc90007d77ba8: 0000000000000000 0000000007d77bc8
ffffc90007d77bb8: 0000000000000000 ffff888789586af8
...
Examine the entries (there are 2):
crash_x86_64> rd ffff888788dd28c0 8
ffff888788dd28c0: ffff8887814ae390 ffff8887814ae240 ..J.....@.J.....
ffff888788dd28d0: 0000000000000000 0000000000000000 ................
...
crash_x86_64> brw_page ffff8887814ae390
struct brw_page {
off = 0,
pg = 0xffffea00205343c0,
count = 3839, <------ A short page
flag = 682,
bp_off_diff = 0,
bp_count_diff = 0,
bp_padding = 0
}
crash_x86_64> brw_page ffff8887814ae240
struct brw_page {
off = 4096,
pg = 0xffffea000008a7c0,
count = 4096,
flag = 682,
bp_off_diff = 0,
bp_count_diff = 0,
bp_padding = 0
}
Where did this come from? Maybe the async_page will give us more information:
crash_x86_64> whatis osc_async_page.oap_brw_page
struct osc_async_page {
[72] struct brw_page oap_brw_page;
}
crash_x86_64> eval ffff8887814ae390 -72
hexadecimal: ffff8887814ae348
crash_x86_64> eval ffff8887814ae240 -72
hexadecimal: ffff8887814ae1f8
crash_x86_64> struct osc_async_page ffff8887814ae348
struct osc_async_page {
oap_magic = 8675309,
oap_cmd = 2,
...
oap_obj_off = 0,
oap_page_off = 0,
oap_async_flags = (ASYNC_READY | ASYNC_URGENT | ASYNC_COUNT_STABLE),
oap_brw_page = {
off = 0, <----- Start of object
pg = 0xffffea00205343c0,
count = 3839, <----- Short page
flag = 682,
bp_off_diff = 0,
bp_count_diff = 0,
bp_padding = 0
},
crash_x86_64> struct osc_async_page ffff8887814ae1f8
struct osc_async_page {
oap_magic = 8675309,
oap_cmd = 2,
...
oap_obj_off = 4096, <--- 1 page into the object
oap_page_off = 0,
oap_async_flags = (ASYNC_READY | ASYNC_URGENT | ASYNC_COUNT_STABLE),
oap_brw_page = {
off = 4096,
pg = 0xffffea000008a7c0,
count = 4096, <-------- full page at end of object
flag = 682,
bp_off_diff = 0,
bp_count_diff = 0,
bp_padding = 0
},
...
These look like non-contiguous AIO/DIO pages; checking the cl_page of each:
crash_x86_64> whatis osc_page.ops_oap
struct osc_page {
[24] struct osc_async_page ops_oap;
}
crash_x86_64> eval 0xffff8887814ae348 - 24
hexadecimal: ffff8887814ae330
crash_x86_64> struct osc_page ffff8887814ae330
struct osc_page {
ops_cl = {
cpl_page = 0xffff8887814ae2a0,
cpl_obj = 0xffff88878b4f4140,
cpl_ops = 0xffffffffa07d9f00 <osc_page_ops>
},
ops_oap = {
crash_x86_64> struct cl_page 0xffff8887814ae2a0
struct cl_page {
cp_ref = {
counter = 2
},
cp_lov_index = 0,
cp_osc_index = 0,
cp_obj = 0xffff888781d2b958,
cp_vmpage = 0xffffea00205343c0,
cp_inode = 0x0,
cp_batch = {
next = 0xffff8887814ae178,
prev = 0xffff888787dc28d8
},
cp_layer_offset = "\000(@",
cp_layer_count = 3 '\003',
cp_state = CPS_PAGEOUT,
cp_type = CPT_TRANSIENT, <-- DIO/AIO
cp_kmem_index = 0,
cp_unused1 = 0,
cp_owner = 0x0,
cp_reference = {<No data fields>},
cp_obj_ref = {<No data fields>},
cp_queue_ref = {<No data fields>},
cp_sync_io = 0xffff888787dc2890
}
And the second page is also CPT_TRANSIENT:
crash_x86_64> struct osc_page ffff8887814ae1e0
struct osc_page {
ops_cl = {
cpl_page = 0xffff8887814ae150,
cpl_obj = 0xffff88878b4f4140,
cpl_ops = 0xffffffffa07d9f00 <osc_page_ops>
},
ops_oap = {
...
crash_x86_64> struct cl_page 0xffff8887814ae150
struct cl_page {
cp_ref = {
counter = 2
},
cp_lov_index = 0,
cp_osc_index = 1,
cp_obj = 0xffff888781d2b958,
cp_vmpage = 0xffffea000008a7c0,
cp_inode = 0x0,
cp_batch = {
next = 0xffff888787dc28d8,
prev = 0xffff8887814ae2c8
},
cp_layer_offset = "\000(@",
cp_layer_count = 3 '\003',
cp_state = CPS_PAGEOUT,
cp_type = CPT_TRANSIENT,
cp_kmem_index = 0,
cp_unused1 = 0,
cp_owner = 0x0,
cp_reference = {<No data fields>},
cp_obj_ref = {<No data fields>},
cp_queue_ref = {<No data fields>},
cp_sync_io = 0xffff888787dc2890
}
crash_x86_64>