Uploaded image for project: 'Lustre'
  1. Lustre
  2. LU-14655

BUG: unable to handle kernel NULL pointer dereference at 0000000000000058

    XML | Word | Printable

Details

    • Bug
    • Resolution: Fixed
    • Critical
    • Lustre 2.15.0
    • None
    • None
    • 3
    • 9223372036854775807

    Description

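      Oops in lnet_health_check() (reported as lnet_finalize() in the trace below, presumably because the static function is inlined there):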

      2021-04-21 21:34:39 [722035.308494] BUG: unable to handle kernel NULL pointer dereference at 0000000000000058
      2021-04-21 21:34:39 [722035.317682] IP: [<ffffffffc0ac4f26>] lnet_finalize+0x1d6/0xf30 [lnet]
      2021-04-21 21:34:39 [722035.325348] PGD 0 
      2021-04-21 21:34:39 [722035.328535] Oops: 0000 [#1] SMP 
      ...
      2021-04-21 21:34:40 [722035.467706] CPU: 12 PID: 88725 Comm: socknal_sd01_02 Kdump: loaded Tainted: P        W  OE  ------------   3.10.0-957.1.3957.1.3.x4.3.20.x86_64 #1
      2021-04-21 21:34:40 [722035.483099] Hardware name: Viking Enterprise Solutions VSSEP1EA/VSSEP1EA, BIOS 10.06 05/26/2020
      2021-04-21 21:34:40 [722035.492985] task: ffff89a20c35c100 ti: ffff89a21da40000 task.ti: ffff89a21da40000
      2021-04-21 21:34:40 [722035.501649] RIP: 0010:[<ffffffffc0ac4f26>]  [<ffffffffc0ac4f26>] lnet_finalize+0x1d6/0xf30 [lnet]
      2021-04-21 21:34:40 [722035.511741] RSP: 0018:ffff89a21da43d60  EFLAGS: 00010286
      2021-04-21 21:34:40 [722035.518219] RAX: 0000000000000000 RBX: ffff89a8b4961658 RCX: 0000000000000001
      2021-04-21 21:34:40 [722035.526516] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
      2021-04-21 21:34:40 [722035.534797] RBP: ffff89a21da43dc8 R08: 000000000001f120 R09: ffffffffc0b4f727
      2021-04-21 21:34:40 [722035.543070] R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000
      2021-04-21 21:34:40 [722035.551335] R13: 0000000000000000 R14: ffff89b1c6be9a00 R15: ffff89d1fafbfe00
      2021-04-21 21:34:40 [722035.559580] FS:  0000000000000000(0000) GS:ffff89b22ef00000(0000) knlGS:0000000000000000
      2021-04-21 21:34:40 [722035.568779] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
      2021-04-21 21:34:40 [722035.575620] CR2: 0000000000000058 CR3: 0000003016f88000 CR4: 0000000000340fe0
      2021-04-21 21:34:40 [722035.583846] Call Trace:
      2021-04-21 21:34:40 [722035.587383]  [<ffffffffc0b4f8fe>] ksocknal_tx_done+0x9e/0x1f0 [ksocklnd]
      2021-04-21 21:34:40 [722035.595157]  [<ffffffffc0b54930>] ksocknal_scheduler+0x350/0xd50 [ksocklnd]
      2021-04-21 21:34:40 [722035.603180]  [<ffffffffb1ac3050>] ? wake_up_atomic_t+0x30/0x30
      2021-04-21 21:34:40 [722035.610054]  [<ffffffffc0b545e0>] ? ksocknal_recv+0x2a0/0x2a0 [ksocklnd]
      2021-04-21 21:34:40 [722035.617784]  [<ffffffffb1ac1f81>] kthread+0xd1/0xe0
      

      Analysis from c17819

      0xffffffffc0ac4eca <lnet_finalize+378>: mov    0x4426f(%rip),%rcx        # 0xffffffffc0b09140 <the_lnet+352> 
      0xffffffffc0ac4ed1 <lnet_finalize+385>: mov    0x80(%rbx),%r14 
      0xffffffffc0ac4ed8 <lnet_finalize+392>: mov    0xa0(%rbx),%r15 
      0xffffffffc0ac4edf <lnet_finalize+399>: cmovne 0x78(%rbx),%r14 
      0xffffffffc0ac4ee4 <lnet_finalize+404>: cmovne 0x98(%rbx),%r15 
      0xffffffffc0ac4eec <lnet_finalize+412>: cmp    $0x1,%al 
      0xffffffffc0ac4eee <lnet_finalize+414>: sbb    %eax,%eax 
      0xffffffffc0ac4ef0 <lnet_finalize+416>: mov    %eax,0x30(%rsp) 
      0xffffffffc0ac4ef4 <lnet_finalize+420>: addb   $0x1,0x30(%rsp) 
      0xffffffffc0ac4ef9 <lnet_finalize+425>: xor    %eax,%eax 
      0xffffffffc0ac4efb <lnet_finalize+427>: xor    $0x1,%r11d 
      0xffffffffc0ac4eff <lnet_finalize+431>: cmpl   $0x3,(%rcx) 
      0xffffffffc0ac4f02 <lnet_finalize+434>: movzbl 0x30(%rsp),%ecx 
      0xffffffffc0ac4f07 <lnet_finalize+439>: mov    %eax,%edi 
      0xffffffffc0ac4f09 <lnet_finalize+441>: cmovge %r11d,%edi 
      0xffffffffc0ac4f0d <lnet_finalize+445>: cmovge %ecx,%eax 
      0xffffffffc0ac4f10 <lnet_finalize+448>: test   %r14,%r14 
      0xffffffffc0ac4f13 <lnet_finalize+451>: mov    %dil,0x2c(%rsp) 
      0xffffffffc0ac4f18 <lnet_finalize+456>: mov    %al,0x2b(%rsp) 
      0xffffffffc0ac4f1c <lnet_finalize+460>: je     0xffffffffc0ac4f30 <lnet_finalize+480> 
      0xffffffffc0ac4f1e <lnet_finalize+462>: mov    0x50(%r14),%rax 
      0xffffffffc0ac4f22 <lnet_finalize+466>: mov    0x20(%rax),%rax 
      0xffffffffc0ac4f26 <lnet_finalize+470>: cmpl   $0x1,0x58(%rax)                                  <==== crash
      0xffffffffc0ac4f2a <lnet_finalize+474>: jle    0xffffffffc0ac5879 <lnet_finalize+2857> 
      0xffffffffc0ac4f30 <lnet_finalize+480>: test   %dl,%dl 
      0xffffffffc0ac4f32 <lnet_finalize+482>: jne    0xffffffffc0ac5675 <lnet_finalize+2341> 
      0xffffffffc0ac4f38 <lnet_finalize+488>: test   %r14,%r14 
      0xffffffffc0ac4f3b <lnet_finalize+491>: je     0xffffffffc0ac5b4d <lnet_finalize+3581> 
      0xffffffffc0ac4f41 <lnet_finalize+497>: test   %r15,%r15 
      0xffffffffc0ac4f44 <lnet_finalize+500>: je     0xffffffffc0ac5b4d <lnet_finalize+3581> 
      0xffffffffc0ac4f4a <lnet_finalize+506>: testb  $0x2,-0x4f7f8(%rip)        # 0xffffffffc0a75759 <libcfs_debug+1> 
      0xffffffffc0ac4f51 <lnet_finalize+513>: je     0xffffffffc0ac4f60 <lnet_finalize+528> 
      0xffffffffc0ac4f53 <lnet_finalize+515>: testb  $0x4,-0x4f7fd(%rip)        # 0xffffffffc0a7575d <libcfs_subsystem_debug+1> 
      0xffffffffc0ac4f5a <lnet_finalize+522>: jne    0xffffffffc0ac5891 <lnet_finalize+2881> 
      0xffffffffc0ac4f60 <lnet_finalize+528>: mov    0x38(%rsp),%r8d 
      0xffffffffc0ac4f65 <lnet_finalize+533>: test   %r8d,%r8d 
      0xffffffffc0ac4f68 <lnet_finalize+536>: jne    0xffffffffc0ac56b0 <lnet_finalize+2400> 
      0xffffffffc0ac4f6e <lnet_finalize+542>: mov    0xf8(%r15),%ecx 
      0xffffffffc0ac4f75 <lnet_finalize+549>: mov    0x4402d(%rip),%edi        # 0xffffffffc0b08fa8 <lnet_health_sensitivity> 
      0xffffffffc0ac4f7b <lnet_finalize+555>: lea    0xf8(%r15),%r10 
      0xffffffffc0ac4f82 <lnet_finalize+562>: mov    $0x3e8,%esi 
      0xffffffffc0ac4f87 <lnet_finalize+567>: cmp    $0x3e8,%ecx
      
      crash> lnet_peer_ni.lpni_peer_net -x
      struct lnet_peer_ni {
         [0x50] struct lnet_peer_net *lpni_peer_net;
      }
      crash> struct lnet_peer.lp_nnis -x
      struct lnet_peer {
         [0x58] int lp_nnis;
      }
      
      static int
      lnet_health_check(struct lnet_msg *msg)
      {
      ...
      	if (the_lnet.ln_ping_target->pb_nnis <= 2) {
      		handle_local_health = false;
      		attempt_local_resend = false;
      	}
      
      	/* For remote failures, health/recovery/resends are not needed if the
      	 * peer only has a single interface. Special case for routers where we
      	 * rely on health feature to manage route aliveness. NB: unlike pb_nnis
      	 * above, lp_nnis does _not_ include the lolnd, so a single-rail node
      	 * would have lp_nnis == 1.
      	 */
      	if (lpni && lpni->lpni_peer_net->lpn_peer->lp_nnis <= 1) {                     <==== crash
      		attempt_remote_resend = false;
      		if (!lnet_isrouter(lpni))
      			handle_remote_health = false;
      	}
      
      	if (!lo)
      		LASSERT(ni && lpni);
      	else
      		LASSERT(ni);
      
      	CDEBUG(D_NET, "health check: %s->%s: %s: %s\n",
      	       libcfs_nid2str(ni->ni_nid),
      	       (lo) ? "self" : libcfs_nid2str(lpni->lpni_nid),
      	       lnet_msgtyp2str(msg->msg_type),
      	       lnet_health_error2str(hstatus));
      

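      However, lpni->lpni_peer_net->lpn_peer is not NULL in the crash dump, even though the faulting instruction dereferenced a NULL value for it (RAX = 0, CR2 = 0x58). This suggests the pointer was modified concurrently between the fault and the dump: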

      crash> lnet_peer_ni.lpni_peer_net ffff89b1c6be9a00
        lpni_peer_net = 0xffff89a22d2f4180
      crash> lnet_peer_net.lpn_peer 0xffff89a22d2f4180
        lpn_peer = 0xffff89cbbed23c00
      

      The buggy code was introduced by LU-13501.

      Attachments

        Activity

          People

            hornc Chris Horn
            hornc Chris Horn
            Votes:
            0 Vote for this issue
            Watchers:
            3 Start watching this issue

            Dates

              Created:
              Updated:
              Resolved: