  Lustre / LU-14425

BUG: unable to handle kernel paging request at ffffffffffffffff

Details

    • Type: Bug
    • Resolution: Not a Bug
    • Priority: Major
    • Affects Version/s: Lustre 2.14.0

    Description

      [10444.024266] BUG: unable to handle kernel paging request at ffffffffffffffff
      [10444.034709] IP: 0xffffffffffffffff
      [10444.038729] PGD 1e0c067 P4D 1e0c067 PUD 1e0e067 PMD 0 
      [10444.046081] Oops: 0010 [#1] SMP PTI
      [10444.050413] CPU: 1 PID: 2493 Comm: kgnilnd_sd_00 Tainted: P           O     4.12.14-197.7_5.0.96-cray_ari_c #1 SLE15 (unreleased)
      [10444.069060] Hardware name: Cray Inc. Cascade/Cascade, BIOS 4.6.5 09/05/2019
      [10444.078578] task: ffff880f898c31c0 task.stack: ffffc90007610000
      [10444.087691] RIP: 0010:0xffffffffffffffff
      [10444.092405] RSP: 0018:ffffc90007613ae0 EFLAGS: 00010286
      [10444.099635] RAX: ffff880f87d11bc0 RBX: ffff88078e307000 RCX: 00000000ffffffff
      [10444.109846] RDX: 0000000000000001 RSI: 0000000000000001 RDI: ffff88078e5e8f18
      [10444.119712] RBP: ffffc90007613b90 R08: 0000000000000000 R09: 0000000000000000
      [10444.130956] R10: 0000000000000000 R11: ffffea001e0c1a60 R12: ffff88078e5e8e58
      [10444.142241] R13: 0000000000000001 R14: 0000000000000000 R15: ffffffffffffffff
      [10444.152325] FS:  0000000000000000(0000) GS:ffff88085f840000(0000) knlGS:0000000000000000
      [10444.164237] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
      [10444.172192] CR2: ffffffffffffffff CR3: 0000000001e0a001 CR4: 00000000001606e0
      [10444.182560] Call Trace:
      [10444.184378]  ? lnet_finalize+0x750/0x1130 [lnet]
      [10444.190241]  ? _kgnilnd_schedule_conn+0x128/0x3a0 [kgnilnd]
      [10444.197527]  kgnilnd_recv+0x5b6/0xcb0 [kgnilnd]
      [10444.203490]  lnet_ni_recv+0x5a/0x2d0 [lnet]
      [10444.209107]  lnet_recv_put+0x7f/0xb0 [lnet]
      [10444.214445]  lnet_parse_local+0x642/0xdc0 [lnet]
      [10444.220245]  ? gni_cq_get_event+0x47/0xb10 [kgni_ari]
      [10444.227225]  lnet_parse+0xaf3/0x1120 [lnet]
      [10444.231471]  ? kmem_cache_alloc+0x1cb/0x5e0
      [10444.236043]  ? kgnilnd_check_fma_send_cq+0xdf5/0x1090 [kgnilnd]
      [10444.244100]  kgnilnd_check_fma_rx+0x166d/0x1e90 [kgnilnd]
      [10444.251562]  ? lock_timer_base+0x6b/0x90
      [10444.256140]  kgnilnd_process_conns+0x527/0xe50 [kgnilnd]
      [10444.263252]  ? kgnilnd_process_mapped_tx+0x574/0x810 [kgnilnd]
      [10444.271873]  kgnilnd_scheduler+0x199/0x5b0 [kgnilnd]
      [10444.278550]  ? wait_woken+0x80/0x80
      [10444.282602]  kthread+0x121/0x140
      [10444.285913]  ? kgnilnd_process_conns+0xe50/0xe50 [kgnilnd]
      [10444.293607]  ? kthread_create_worker_on_cpu+0x50/0x50
      [10444.300612]  ret_from_fork+0x3a/0x50
      [10444.304848] Code:  Bad RIP value.
      [10444.308187] Modules linked in: mgc(O) lustre(O) lmv(O) mdc(O) fid(O) lov(O) fld(O) osc(O) ptlrpc(O) obdclass(O) pm_api(O) xpmem(O) cmsr(O) bpmcdmod(O) x86_pkg_temp_thermal freemem(O) pcie_link_bw_monitor(O) ib_core(O) kdreg(O) kgnilnd(O) dvs(O) dvsipc(O) dvsipc_lnet(O) lnet(O) libcfs(O) dvsproc(O) gpcd_ari(O) ipogif_ari(O) kgni_ari(O) hwerr(PO) rca(O) heartbeat(O) simplex(PO) hss_os(PO) ghal_ari(O) craytrace(O)
      [10444.369702] CR2: ffffffffffffffff
      [10444.374673] ---[ end trace bd9de6e45567314b ]---
      [10444.379464] RIP: 0010:0xffffffffffffffff
      [10444.383581] RSP: 0018:ffffc90007613ae0 EFLAGS: 00010286
      [10444.389141] RAX: ffff880f87d11bc0 RBX: ffff88078e307000 RCX: 00000000ffffffff
      [10444.397849] RDX: 0000000000000001 RSI: 0000000000000001 RDI: ffff88078e5e8f18
      [10444.406839] RBP: ffffc90007613b90 R08: 0000000000000000 R09: 0000000000000000
      [10444.415731] R10: 0000000000000000 R11: ffffea001e0c1a60 R12: ffff88078e5e8e58
      [10444.425544] R13: 0000000000000001 R14: 0000000000000000 R15: ffffffffffffffff
      [10444.435234] FS:  0000000000000000(0000) GS:ffff88085f840000(0000) knlGS:0000000000000000
      [10444.447830] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
      [10444.455364] CR2: ffffffffffffffff CR3: 0000000001e0a001 CR4: 00000000001606e0
      [10444.465273] Kernel panic - not syncing: Fatal exception
      [10445.505720] Shutting down cpus with NMI
      [10445.520518] Kernel Offset: disabled
      [10445.524528] ---[ end Kernel panic - not syncing: Fatal exception
      
      

      The crash appears to be here:

      static void
      lnet_msg_detach_md(struct lnet_msg *msg, int status)
      {
      	struct lnet_libmd *md = msg->msg_md;
      	lnet_handler_t handler = NULL;
      	int cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie);
      	int unlink;
       
      	lnet_res_lock(cpt);
      	while (md->md_flags & LNET_MD_FLAG_HANDLING)
      		/* An event handler is running - wait for it to
      		 * complete to avoid races.
      		 */
      		lnet_md_wait_handling(md, cpt);
       
      	/* Now it's safe to drop my caller's ref */
      	md->md_refcount--;
      	LASSERT(md->md_refcount >= 0);
       
      	unlink = lnet_md_unlinkable(md);
      	if (md->md_handler) {
      		if ((md->md_flags & LNET_MD_FLAG_ABORTED) && !status) {
      			msg->msg_ev.status   = -ETIMEDOUT;
      			CDEBUG(D_NET, "md 0x%p already unlinked\n", md);
      		} else {
      			msg->msg_ev.status   = status;
      		}
      		msg->msg_ev.unlinked = unlink;
      		handler = md->md_handler;
      		if (!unlink)
      			md->md_flags |= LNET_MD_FLAG_HANDLING;
      	}
       
      	if (unlink || (md->md_refcount == 0 &&
      		       md->md_threshold == LNET_MD_THRESH_INF))
      		lnet_detach_rsp_tracker(md, cpt);
       
      	msg->msg_md = NULL;
      	if (unlink)
      		lnet_md_unlink(md);
       
      	lnet_res_unlock(cpt);
       
      	if (handler) {
      		handler(&msg->msg_ev);
      		if (!unlink) {
      			lnet_res_lock(cpt); /* # <<-------------------- crash is here. */
      			md->md_flags &= ~LNET_MD_FLAG_HANDLING;
      			wake_up_var(md);
      			lnet_res_unlock(cpt);
      		}
      	}
      }
      

      With "LU-10428 lnet: call event handlers without res_lock" reverted, the crash seems to be gone.
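
      For context, here is a minimal userspace sketch of the handler-invocation pattern shown above, as I understand the LU-10428 change. A pthread mutex and a plain struct are stand-ins for the per-CPT res_lock and struct lnet_libmd; none of these names are the real LNet symbols.

      /* Illustration only: the lock is dropped around the handler call and
       * then re-taken to clear the HANDLING flag, mirroring the flow of
       * lnet_msg_detach_md() above. */
      #include <pthread.h>
      #include <stdio.h>

      #define MD_FLAG_HANDLING 0x1	/* stand-in for LNET_MD_FLAG_HANDLING */

      struct fake_md {			/* stand-in for struct lnet_libmd */
      	pthread_mutex_t lock;		/* stand-in for the per-CPT res_lock */
      	unsigned int flags;
      };

      typedef void (*handler_t)(void *ev);	/* stand-in for lnet_handler_t */

      static void detach_md(struct fake_md *md, handler_t handler, void *ev)
      {
      	pthread_mutex_lock(&md->lock);
      	md->flags |= MD_FLAG_HANDLING;
      	pthread_mutex_unlock(&md->lock);

      	/* The handler runs with the lock dropped (the LU-10428 behaviour). */
      	handler(ev);

      	/* "md" must still be valid here; this re-lock corresponds to the
      	 * lnet_res_lock(cpt) call marked as the crash point above. */
      	pthread_mutex_lock(&md->lock);
      	md->flags &= ~MD_FLAG_HANDLING;
      	pthread_mutex_unlock(&md->lock);
      }

      static void demo_handler(void *ev)
      {
      	printf("event %p handled without the lock held\n", ev);
      }

      int main(void)
      {
      	struct fake_md md = {
      		.lock = PTHREAD_MUTEX_INITIALIZER,
      		.flags = 0,
      	};

      	detach_md(&md, demo_handler, NULL);
      	return 0;
      }

      The sketch only illustrates that the MD has to remain valid across the unlocked handler call so the flag can be cleared afterwards; it makes no claim about the actual root cause, which is discussed in the comments below.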

      Attachments

        Issue Links

          Activity

            [LU-14425] BUG: unable to handle kernel paging request at ffffffffffffffff
            hornc Chris Horn added a comment -

            This turned out to be a bug with DVS. We're porting DVS to work with some of these recent changes to the LNet API (namely LU-13005/LU-10428) and we made a mistake. We're still working on that effort, but for now there's nothing to do here.

            neilb Neil Brown added a comment -

            I'm not convinced that you've identified the crash location correctly.

            I assume you found the location of the top address of the stack: lnet_finalize+0x750/0x1130 

            This looks to be a return address that was pushed onto the stack when handler() was called.

            The "?" at the start of the line indicates that this address is not part of the current call stack, it is just an old return address that happens to be on the stack.

            The place we should be looking is kgnilnd_recv+0x5b6/0xcb0 

            If you can identify the line of code which this address corresponds to, that should help identify the root problem.


            adilger Andreas Dilger added a comment -

            This is totally outside my area of expertise, but is there anything unusual in kgnilnd_recv() that is not happening in other LNDs? Also, it seems a bit strange that it is crashing when accessing "cpt", since CPTs shouldn't really be changing except at startup, but I don't know this part of the code very well.

            spitzcor Cory Spitz added a comment -

            adilger, while we have yet to experience this bug in an environment without gnilnd clients, it doesn't seem like a defect in the patch from LU-10428 would be LND-specific. It seems like there is a risk that the bug would affect other architectures. In any case, as Shaun says, we'll revert the patch on our gnilnd systems and continue testing.


            stancheff Shaun Tancheff added a comment -

            The load was an I/O stress test with 40 client nodes.
            The bug was hit early and often; the system could not function and quickly brought down all the connected OSTs.

            I will be bringing the system up to 2.14.0 RC2 with the required Cray additions (gnilnd fixes) and LU-10428 reverted later today.


            adilger Andreas Dilger added a comment -

            Oh, I didn't notice before - is this only affecting kgnilnd? That would put it into the realm of "not many people will use vanilla 2.14.0 in this config", since they will almost certainly be running a Cray-supplied client, so that pretty much takes it off the 2.14.0 candidate list, and a fix can be included later.


            adilger Andreas Dilger added a comment -

            Shaun, what kind of load was this under, and how often have you hit it? Since 2.14.0 is almost out the door, unless this is a "falls over continuously and is unusable" kind of bug, it may just miss 2.14.0 and be landed afterward.

            That said, I'm not aware of any particular requirement for the original LU-10428 patch, and if it is trading a theoretical bug for a very real crash, then I don't mind reverting it and landing the fix post-2.14. Could you please push your revert patch to Gerrit so that it can start testing; it will at least have a chance of being landed for 2.14.0.


            stancheff Shaun Tancheff added a comment -

            master from commit 9cd651aead327ae4589b58dde5818b068c89b3e5, about 3 commits before 2.14.0-RC1.


            adilger Andreas Dilger added a comment -

            What version is this?


            People

              Assignee: wc-triage WC Triage
              Reporter: stancheff Shaun Tancheff
              Votes: 0
              Watchers: 6
