|
Customer reported two system panics while trying to gain system acceptance. They provided the attached tracebacks.
In addition, they provided the following information:
crash> bt
PID: 20417 TASK: ffff8803fe75a040 CPU: 0 COMMAND: "flush-lustre-1"
#0 [ffff880400ad1270] machine_kexec at ffffffff810310db
#1 [ffff880400ad12d0] crash_kexec at ffffffff810b63b2
#2 [ffff880400ad13a0] oops_end at ffffffff814dec50
#3 [ffff880400ad13d0] no_context at ffffffff81040cdb
#4 [ffff880400ad1420] __bad_area_nosemaphore at ffffffff81040f65
#5 [ffff880400ad1470] bad_area_nosemaphore at ffffffff81041033
#6 [ffff880400ad1480] __do_page_fault at ffffffff8104170d
#7 [ffff880400ad15a0] do_page_fault at ffffffff814e0c3e
#8 [ffff880400ad15d0] page_fault at ffffffff814ddfe5
[exception RIP: lov_ap_refresh_count+22]
RIP: ffffffffa07c3cc6 RSP: ffff880400ad1680 RFLAGS: 00010282
RAX: 0000000000000000 RBX: ffff8803ebe15230 RCX: ffff8803e9668360
RDX: ffff8803e9668360 RSI: 0000000000000002 RDI: 0000000000000000
RBP: ffff880400ad1690 R8: 0000000000000002 R9: 0000000000000000
R10: ffffffffa0806ee0 R11: 0000000000000003 R12: ffff8803e9668348
R13: 0000000000000000 R14: ffff8803e9668350 R15: ffff8803fa7c6ad0
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#9 [ffff880400ad1698] osc_send_oap_rpc at ffffffffa074feb9 [osc]
#10 [ffff880400ad17e8] osc_check_rpcs at ffffffffa0751121 [osc]
#11 [ffff880400ad1878] osc_set_async_flags at ffffffffa0751545 [osc]
#12 [ffff880400ad1938] lov_set_async_flags at ffffffffa07c6273 [lov]
#13 [ffff880400ad19b8] ll_writepage at ffffffffa085b2b4 [lustre]
#14 [ffff880400ad1a58] ll_writepage_26 at ffffffffa0876d6e [lustre]
#15 [ffff880400ad1a68] __writepage at ffffffff81120e67
#16 [ffff880400ad1a88] write_cache_pages at ffffffff811221f9
#17 [ffff880400ad1bb8] generic_writepages at ffffffff811224f4
#18 [ffff880400ad1bc8] do_writepages at ffffffff81122521
#19 [ffff880400ad1bd8] writeback_single_inode at ffffffff8119bbbd
#20 [ffff880400ad1c18] writeback_sb_inodes at ffffffff8119bfbe
#21 [ffff880400ad1c78] writeback_inodes_wb at ffffffff8119c11b
#22 [ffff880400ad1cd8] wb_writeback at ffffffff8119c4bb
#23 [ffff880400ad1dd8] wb_do_writeback at ffffffff8119c7a9
#24 [ffff880400ad1e68] bdi_writeback_task at ffffffff8119c8b3
#25 [ffff880400ad1eb8] bdi_start_fn at ffffffff81130c16
#26 [ffff880400ad1ee8] kthread at ffffffff8108ddf6
#27 [ffff880400ad1f48] kernel_thread at ffffffff8100c1ca
We crash in lov_ap_refresh_count which was called by osc_send_oap_rpc.
For whatever crazy reason osc/osc_request.c" line 2564 :
/* ask the caller for the size of the io as the rpc
leaves. */
if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE))
oap->oap_count =
ops->ap_refresh_count(oap->oap_caller_data,cmd); <--------------
if (oap->oap_count <= 0) {
CDEBUG(D_CACHE, "oap %p count %d, completing\n",
oap,
oap->oap_count);
where the <-------------- is above we send ap_refresh_count
oap->oap_caller_data
from the dump:
crash> print *(struct osc_async_page *)0xffff8803ebe15230
$89 = {
oap_magic = 8675309,
oap_cmd = 2,
oap_interrupted = 0,
oap_pending_item =
{
next = 0xffff8803ebe15238,
prev = 0xffff8803ebe15238
}
,
oap_urgent_item =
{
next = 0xffff8803ebe15248,
prev = 0xffff8803ebe15248
}
,
oap_rpc_item =
{
next = 0xffff8803ebe15258,
prev = 0xffff8803ebe15258
}
,
oap_obj_off = 0,
oap_page_off = 0,
oap_async_flags = 3,
oap_brw_page =
{
off = 0,
pg = 0xffffea00297d5fb0,
count = 0,
flag = 1056
}
,
oap_occ = {
occ_oig_item =
{
next = 0x0,
prev = 0x0
}
,
occ_interrupted = 0xffffffffa074b490 <osc_occ_interrupted>,
interrupted = 0
},
oap_oig = 0x0,
oap_request = 0x0,
oap_cli = 0xffff8803fa7c6ad0,
oap_loi = 0xffff8803e9668300,
oap_caller_ops = 0xffffffffa0806ee0,
oap_caller_data = 0x0,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
oap_page_list =
{
next = 0xffff8803eb850d70,
prev = 0xffff8803eb850d70
}
,
oap_ldlm_lock = 0xffff8803eb850c00,
oap_lock = {
raw_lock =
{
slock = 131074
}
}
}
ops->ap_refresh_count(oap->oap_caller_data,cmd); is the below:
static int lov_ap_refresh_count(void *data, int cmd)
{
struct lov_async_page *lap = LAP_FROM_COOKIE(data);
return
lap->lap_caller_ops->ap_refresh_count(lap->lap_caller_data,
cmd);
}
#define LAP_FROM_COOKIE(c)
\
(LASSERT(((struct lov_async_page *)(c))->lap_magic ==
LOV_AP_MAGIC), \
(struct lov_async_page *)(c))
sending this a NULL is probably not a great idea......
As to how oap_caller_data = 0x0 came to be I have no idea.
I was able to pull the lustre debug ring out of the dump but it doesn't
seem to have anything useful, the last entry appears to be a couple of
days before the dump....
|