Details
-
Bug
-
Resolution: Fixed
-
Critical
-
None
-
None
-
3
-
9223372036854775807
Description
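Oops in lnet_health_check() (reported at lnet_finalize+0x1d6, presumably because the static function is inlined into lnet_finalize()):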
2021-04-21 21:34:39 [722035.308494] BUG: unable to handle kernel NULL pointer dereference at 0000000000000058 2021-04-21 21:34:39 [722035.317682] IP: [<ffffffffc0ac4f26>] lnet_finalize+0x1d6/0xf30 [lnet] 2021-04-21 21:34:39 [722035.325348] PGD 0 2021-04-21 21:34:39 [722035.328535] Oops: 0000 [#1] SMP ... 2021-04-21 21:34:40 [722035.467706] CPU: 12 PID: 88725 Comm: socknal_sd01_02 Kdump: loaded Tainted: P W OE ------------ 3.10.0-957.1.3957.1.3.x4.3.20.x86_64 #1 2021-04-21 21:34:40 [722035.483099] Hardware name: Viking Enterprise Solutions VSSEP1EA/VSSEP1EA, BIOS 10.06 05/26/2020 2021-04-21 21:34:40 [722035.492985] task: ffff89a20c35c100 ti: ffff89a21da40000 task.ti: ffff89a21da40000 2021-04-21 21:34:40 [722035.501649] RIP: 0010:[<ffffffffc0ac4f26>] [<ffffffffc0ac4f26>] lnet_finalize+0x1d6/0xf30 [lnet] 2021-04-21 21:34:40 [722035.511741] RSP: 0018:ffff89a21da43d60 EFLAGS: 00010286 2021-04-21 21:34:40 [722035.518219] RAX: 0000000000000000 RBX: ffff89a8b4961658 RCX: 0000000000000001 2021-04-21 21:34:40 [722035.526516] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 2021-04-21 21:34:40 [722035.534797] RBP: ffff89a21da43dc8 R08: 000000000001f120 R09: ffffffffc0b4f727 2021-04-21 21:34:40 [722035.543070] R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000 2021-04-21 21:34:40 [722035.551335] R13: 0000000000000000 R14: ffff89b1c6be9a00 R15: ffff89d1fafbfe00 2021-04-21 21:34:40 [722035.559580] FS: 0000000000000000(0000) GS:ffff89b22ef00000(0000) knlGS:0000000000000000 2021-04-21 21:34:40 [722035.568779] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 2021-04-21 21:34:40 [722035.575620] CR2: 0000000000000058 CR3: 0000003016f88000 CR4: 0000000000340fe0 2021-04-21 21:34:40 [722035.583846] Call Trace: 2021-04-21 21:34:40 [722035.587383] [<ffffffffc0b4f8fe>] ksocknal_tx_done+0x9e/0x1f0 [ksocklnd] 2021-04-21 21:34:40 [722035.595157] [<ffffffffc0b54930>] ksocknal_scheduler+0x350/0xd50 [ksocklnd] 2021-04-21 21:34:40 [722035.603180] 
[<ffffffffb1ac3050>] ? wake_up_atomic_t+0x30/0x30 2021-04-21 21:34:40 [722035.610054] [<ffffffffc0b545e0>] ? ksocknal_recv+0x2a0/0x2a0 [ksocklnd] 2021-04-21 21:34:40 [722035.617784] [<ffffffffb1ac1f81>] kthread+0xd1/0xe0
Analysis from c17819
0xffffffffc0ac4eca <lnet_finalize+378>: mov 0x4426f(%rip),%rcx # 0xffffffffc0b09140 <the_lnet+352> 0xffffffffc0ac4ed1 <lnet_finalize+385>: mov 0x80(%rbx),%r14 0xffffffffc0ac4ed8 <lnet_finalize+392>: mov 0xa0(%rbx),%r15 0xffffffffc0ac4edf <lnet_finalize+399>: cmovne 0x78(%rbx),%r14 0xffffffffc0ac4ee4 <lnet_finalize+404>: cmovne 0x98(%rbx),%r15 0xffffffffc0ac4eec <lnet_finalize+412>: cmp $0x1,%al 0xffffffffc0ac4eee <lnet_finalize+414>: sbb %eax,%eax 0xffffffffc0ac4ef0 <lnet_finalize+416>: mov %eax,0x30(%rsp) 0xffffffffc0ac4ef4 <lnet_finalize+420>: addb $0x1,0x30(%rsp) 0xffffffffc0ac4ef9 <lnet_finalize+425>: xor %eax,%eax 0xffffffffc0ac4efb <lnet_finalize+427>: xor $0x1,%r11d 0xffffffffc0ac4eff <lnet_finalize+431>: cmpl $0x3,(%rcx) 0xffffffffc0ac4f02 <lnet_finalize+434>: movzbl 0x30(%rsp),%ecx 0xffffffffc0ac4f07 <lnet_finalize+439>: mov %eax,%edi 0xffffffffc0ac4f09 <lnet_finalize+441>: cmovge %r11d,%edi 0xffffffffc0ac4f0d <lnet_finalize+445>: cmovge %ecx,%eax 0xffffffffc0ac4f10 <lnet_finalize+448>: test %r14,%r14 0xffffffffc0ac4f13 <lnet_finalize+451>: mov %dil,0x2c(%rsp) 0xffffffffc0ac4f18 <lnet_finalize+456>: mov %al,0x2b(%rsp) 0xffffffffc0ac4f1c <lnet_finalize+460>: je 0xffffffffc0ac4f30 <lnet_finalize+480> 0xffffffffc0ac4f1e <lnet_finalize+462>: mov 0x50(%r14),%rax 0xffffffffc0ac4f22 <lnet_finalize+466>: mov 0x20(%rax),%rax 0xffffffffc0ac4f26 <lnet_finalize+470>: cmpl $0x1,0x58(%rax) <==== crash 0xffffffffc0ac4f2a <lnet_finalize+474>: jle 0xffffffffc0ac5879 <lnet_finalize+2857> 0xffffffffc0ac4f30 <lnet_finalize+480>: test %dl,%dl 0xffffffffc0ac4f32 <lnet_finalize+482>: jne 0xffffffffc0ac5675 <lnet_finalize+2341> 0xffffffffc0ac4f38 <lnet_finalize+488>: test %r14,%r14 0xffffffffc0ac4f3b <lnet_finalize+491>: je 0xffffffffc0ac5b4d <lnet_finalize+3581> 0xffffffffc0ac4f41 <lnet_finalize+497>: test %r15,%r15 0xffffffffc0ac4f44 <lnet_finalize+500>: je 0xffffffffc0ac5b4d <lnet_finalize+3581> 0xffffffffc0ac4f4a <lnet_finalize+506>: testb $0x2,-0x4f7f8(%rip) # 
0xffffffffc0a75759 <libcfs_debug+1> 0xffffffffc0ac4f51 <lnet_finalize+513>: je 0xffffffffc0ac4f60 <lnet_finalize+528> 0xffffffffc0ac4f53 <lnet_finalize+515>: testb $0x4,-0x4f7fd(%rip) # 0xffffffffc0a7575d <libcfs_subsystem_debug+1> 0xffffffffc0ac4f5a <lnet_finalize+522>: jne 0xffffffffc0ac5891 <lnet_finalize+2881> 0xffffffffc0ac4f60 <lnet_finalize+528>: mov 0x38(%rsp),%r8d 0xffffffffc0ac4f65 <lnet_finalize+533>: test %r8d,%r8d 0xffffffffc0ac4f68 <lnet_finalize+536>: jne 0xffffffffc0ac56b0 <lnet_finalize+2400> 0xffffffffc0ac4f6e <lnet_finalize+542>: mov 0xf8(%r15),%ecx 0xffffffffc0ac4f75 <lnet_finalize+549>: mov 0x4402d(%rip),%edi # 0xffffffffc0b08fa8 <lnet_health_sensitivity> 0xffffffffc0ac4f7b <lnet_finalize+555>: lea 0xf8(%r15),%r10 0xffffffffc0ac4f82 <lnet_finalize+562>: mov $0x3e8,%esi 0xffffffffc0ac4f87 <lnet_finalize+567>: cmp $0x3e8,%ecx
crash> lnet_peer_ni.lpni_peer_net -x struct lnet_peer_ni { [0x50] struct lnet_peer_net *lpni_peer_net; } crash> struct lnet_peer.lp_nnis -x struct lnet_peer { [0x58] int lp_nnis; }
static int lnet_health_check(struct lnet_msg *msg) { ... if (the_lnet.ln_ping_target->pb_nnis <= 2) { handle_local_health = false; attempt_local_resend = false; } /* For remote failures, health/recovery/resends are not needed if the * peer only has a single interface. Special case for routers where we * rely on health feature to manage route aliveness. NB: unlike pb_nnis * above, lp_nnis does _not_ include the lolnd, so a single-rail node * would have lp_nnis == 1. */ if (lpni && lpni->lpni_peer_net->lpn_peer->lp_nnis <= 1) { <==== crash attempt_remote_resend = false; if (!lnet_isrouter(lpni)) handle_remote_health = false; } if (!lo) LASSERT(ni && lpni); else LASSERT(ni); CDEBUG(D_NET, "health check: %s->%s: %s: %s\n", libcfs_nid2str(ni->ni_nid), (lo) ? "self" : libcfs_nid2str(lpni->lpni_nid), lnet_msgtyp2str(msg->msg_type), lnet_health_error2str(hstatus));
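However, lpni->lpni_peer_net->lpn_peer is not NULL in the crash dump, even though the fault (RAX = 0, CR2 = 0x58) indicates that it was read as NULL when lp_nnis was dereferenced: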
crash> lnet_peer_ni.lpni_peer_net ffff89b1c6be9a00 lpni_peer_net = 0xffff89a22d2f4180 crash> lnet_peer_net.lpn_peer 0xffff89a22d2f4180 lpn_peer = 0xffff89cbbed23c00
The buggy code was introduced by LU-13501.