Details

    • Type: Bug
    • Resolution: Fixed
    • Priority: Major
    • Fix Version/s: Lustre 2.1.6
    • Affects Version/s: Lustre 2.1.3
    • Labels: None
    • Environment: Lustre 2.1.3 Bull.2.308
    • Severity: 3
    • 6773

    Description

      While a tar was running on a login node, the node hit a general protection fault in osc_send_oap_rpc().

      It looks like a race in which the loi_oap_pages structure passed to osc_send_oap_rpc() gets freed and poisoned:

      general protection fault: 0000 [#1] SMP 
      last sysfs file: /sys/devices/pci0000:80/0000:80:02.0/0000:83:00.0/host7/rport-7:0-0/target7:0:0/7:0:0:18/state
      CPU 3 
      Modules linked in: tcp_diag inet_diag iptable_filter ip_tables nfs fscache lmv(U) mgc(U) lustre(U) lov(U) osc(U) mdc(U)
      lquota(U) fid(U) fld(U) ko2iblnd(U) ptlrpc(U) obdclass(U) lnet(U) lvfs(U) libcfs(U) nfsd lockd nfs_acl auth_rpcgss exportfs
      ipmi_devintf ipmi_si ipmi_msghandler sunrpc rdma_ucm(U) ib_sdp(U) rdma_cm(U) iw_cm(U) ib_addr(U) ib_ipoib(U) ib_cm(U)
      ib_sa(U) ipv6 ib_uverbs(U) ib_umad(U) mlx4_ib(U) mlx4_core(U) ib_mthca(U) ib_mad(U) ib_core(U) dm_mirror dm_region_hash
      dm_log dm_round_robin scsi_dh_rdac dm_multipath dm_mod uinput usbhid hid sg lpfc scsi_transport_fc scsi_tgt sb_edac
      edac_core i2c_i801 i2c_core iTCO_wdt iTCO_vendor_support igb ioatdma dca ext4 mbcache jbd2 ehci_hcd sd_mod crc_t10dif
      ahci megaraid_sas [last unloaded: scsi_wait_scan]
      
      Pid: 30811, comm: tar Not tainted 2.6.32-220.23.1.bl6.Bull.28.8.x86_64 #1 Bull SAS bullx R/X9DRH-7TF/7F/iTF/iF
      RIP: 0010:[<ffffffffa09f8cb1>]  [<ffffffffa09f8cb1>] osc_send_oap_rpc+0x61/0x1b40 [osc]
      RSP: 0018:ffff8805da6f3588  EFLAGS: 00010296
      RAX: 5a5a5a5a5a5a5a42 RBX: ffff8804710c6a80 RCX: 5a5a5a5a5a5a5a5a
      RDX: ffff8804710c6a80 RSI: ffff881067b22648 RDI: ffff8804710c6aa8
      RBP: ffff8805da6f36b8 R08: ffff8804710c6a98 R09: 00000000000057bf
      R10: 0000000000000d9d R11: b000000000000000 R12: ffff881067b22648
      R13: ffff8804710c6a98 R14: ffff8805da6f3668 R15: ffff8804710c6a98
      FS:  00007f77e18f17a0(0000) GS:ffff880028260000(0000) knlGS:0000000000000000
      CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
      CR2: 00000032bcd9b4f8 CR3: 000000083eaac000 CR4: 00000000000406e0
      DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
      DR3: 0000000000000000 DR6: 00000000ffff4ff0 DR7: 0000000000000400
      
      0xffffffffa09f8c50 <osc_send_oap_rpc>:  push   %rbp
      0xffffffffa09f8c51 <osc_send_oap_rpc+1>:        mov    %rsp,%rbp
      0xffffffffa09f8c54 <osc_send_oap_rpc+4>:        push   %r15
      0xffffffffa09f8c56 <osc_send_oap_rpc+6>:        push   %r14
      0xffffffffa09f8c58 <osc_send_oap_rpc+8>:        push   %r13
      0xffffffffa09f8c5a <osc_send_oap_rpc+10>:       push   %r12
      0xffffffffa09f8c5c <osc_send_oap_rpc+12>:       push   %rbx
      0xffffffffa09f8c5d <osc_send_oap_rpc+13>:       sub    $0x108,%rsp
      0xffffffffa09f8c64 <osc_send_oap_rpc+20>:       nopl   0x0(%rax,%rax,1)
      0xffffffffa09f8c69 <osc_send_oap_rpc+25>:       testb  $0x1,-0x4f27ec(%rip)        # 0xffffffffa0506484
      0xffffffffa09f8c70 <osc_send_oap_rpc+32>:       lea    -0x50(%rbp),%r14
      0xffffffffa09f8c74 <osc_send_oap_rpc+36>:       mov    %rdi,-0x78(%rbp)
      0xffffffffa09f8c78 <osc_send_oap_rpc+40>:       mov    %rsi,%r12
      0xffffffffa09f8c7b <osc_send_oap_rpc+43>:       mov    %rdx,-0xc0(%rbp)
      0xffffffffa09f8c82 <osc_send_oap_rpc+50>:       mov    %ecx,-0xac(%rbp)
      0xffffffffa09f8c88 <osc_send_oap_rpc+56>:       mov    %r8,%r15
      0xffffffffa09f8c8b <osc_send_oap_rpc+59>:       mov    %r14,-0x50(%rbp)
      0xffffffffa09f8c8f <osc_send_oap_rpc+63>:       mov    %r14,-0x48(%rbp)
      0xffffffffa09f8c93 <osc_send_oap_rpc+67>:       je     0xffffffffa09f8ca2
      0xffffffffa09f8c95 <osc_send_oap_rpc+69>:       testb  $0x8,-0x4f281c(%rip)        # 0xffffffffa0506480
      0xffffffffa09f8c9c <osc_send_oap_rpc+76>:       jne    0xffffffffa09f9500
      0xffffffffa09f8ca2 <osc_send_oap_rpc+82>:       mov    0x10(%r15),%rcx         <=  %rcx comes from %r15, which
      itself comes from %r8 = 5th argument of osc_send_oap_rpc
      0xffffffffa09f8ca6 <osc_send_oap_rpc+86>:       lea    0x10(%r15),%rdi
      0xffffffffa09f8caa <osc_send_oap_rpc+90>:       lea    -0x18(%rcx),%rax        <= %rax now equals
      0x5a5a5a5a5a5a5a42, i.e. %rcx - 0x18
      0xffffffffa09f8cae <osc_send_oap_rpc+94>:       cmp    %rcx,%rdi
      
      0xffffffffa09f8cb1 <osc_send_oap_rpc+97>:       mov    0x18(%rax),%rdx           <= Crashed here
      
      0xffffffffa09f8cb5 <osc_send_oap_rpc+101>:      je     0xffffffffa09f8d50
      0xffffffffa09f8cbb <osc_send_oap_rpc+107>:      sub    $0x18,%rdx
      0xffffffffa09f8cbf <osc_send_oap_rpc+111>:      xor    %ecx,%ecx
      0xffffffffa09f8cc1 <osc_send_oap_rpc+113>:      jmp    0xffffffffa09f8d10
      0xffffffffa09f8cc3 <osc_send_oap_rpc+115>:      nopl   0x0(%rax,%rax,1)
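
      These instructions are the cfs_list_for_each_entry_safe() walk over lop->lop_urgent quoted further below: list_entry() subtracts the offset of the embedded list head (0x18 for oap_urgent_item, per the disassembly) from the poisoned next pointer, which is exactly where the 0x5a5a5a5a5a5a5a42 in %rax comes from. A minimal user-space sketch of that arithmetic (the struct layout is hypothetical; only the 0x18 offset matters):

      #include <stddef.h>
      #include <stdio.h>

      struct list_head { struct list_head *next, *prev; };

      /* Hypothetical container layout: the only assumption taken from the
       * disassembly is that the embedded list head sits at offset 0x18. */
      struct oap_sketch {
              char             pad[0x18];
              struct list_head oap_urgent_item;
      };

      #define list_entry(ptr, type, member) \
              ((type *)((char *)(ptr) - offsetof(type, member)))

      int main(void)
      {
              /* the poisoned 'next' pointer read from lop->lop_urgent */
              struct list_head *next = (struct list_head *)0x5a5a5a5a5a5a5a5aULL;
              struct oap_sketch *oap =
                      list_entry(next, struct oap_sketch, oap_urgent_item);

              /* prints 0x5a5a5a5a5a5a5a42, the faulting value seen in %rax */
              printf("%p\n", (void *)oap);
              return 0;
      }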
      
      static int
      osc_send_oap_rpc(const struct lu_env *env, struct client_obd *cli,
                       struct lov_oinfo *loi,
                       int cmd, struct loi_oap_pages *lop)
      
      %r15 = %r8 = lop
      
      crash> struct loi_oap_pages ffff8804710c6a98
      struct loi_oap_pages {
        lop_pending = {
          next = 0x5a5a5a5a5a5a5a5a, 
          prev = 0x5a5a5a5a5a5a5a5a
        }, 
        lop_urgent = {
          next = 0x5a5a5a5a5a5a5a5a, 
          prev = 0x5a5a5a5a5a5a5a5a
        }, 
        lop_pending_group = {
          next = 0x5a5a5a5a5a5a5a5a, 
          prev = 0x5a5a5a5a5a5a5a5a
        }, 
        lop_num_pending = 1515870810
      }
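
      The 0x5a5a... pattern is the libcfs memory-poisoning value written into freed allocations (the "LI_POISON" annotation in the code excerpt below refers to it), which is what indicates this loi_oap_pages/lov_oinfo had already been freed. Roughly, the relevant definitions in libcfs_private.h look like the following; exact guards and values may differ by branch:

      /* libcfs poison patterns, roughly as defined in libcfs_private.h for
       * this era of the code base (shown for reference only) */
      #define LI_POISON 0x5a5a5a5a
      #if BITS_PER_LONG > 32
      # define LL_POISON 0x5a5a5a5a5a5a5a5aL
      #else
      # define LL_POISON 0x5a5a5a5aL
      #endif
      #define LP_POISON ((void *)LL_POISON)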
      
      static int
      osc_send_oap_rpc(const struct lu_env *env, struct client_obd *cli,
                       struct lov_oinfo *loi,
                       int cmd, struct loi_oap_pages *lop)
      {
              struct ptlrpc_request *req;
              obd_count page_count = 0;
              struct osc_async_page *oap = NULL, *tmp;
              struct osc_brw_async_args *aa;
              const struct obd_async_page_ops *ops;
              CFS_LIST_HEAD(rpc_list);
              int srvlock = 0, mem_tight = 0;
              struct cl_object *clob = NULL;
              obd_off starting_offset = OBD_OBJECT_EOF;
              unsigned int ending_offset;
              int starting_page_off = 0;
              ENTRY;
      
              /* ASYNC_HP pages first. At present, when the lock the pages is
               * to be canceled, the pages covered by the lock will be sent out
               * with ASYNC_HP. We have to send out them as soon as possible. */
              cfs_list_for_each_entry_safe(oap, tmp, &lop->lop_urgent, oap_urgent_item) {
                                                      ^^^^^^^^^^^^^^^
                                                      == LI_POISON
      
                      if (oap->oap_async_flags & ASYNC_HP)
                              cfs_list_move(&oap->oap_pending_item, &rpc_list);
                      else if (!(oap->oap_brw_flags & OBD_BRW_SYNC))
                              /* only do this for writeback pages. */
                              cfs_list_move_tail(&oap->oap_pending_item, &rpc_list);
                      if (++page_count >= cli->cl_max_pages_per_rpc)
                              break;
              }
      

      I am attaching a file with the output of dmesg, ps, bt, bt -a and foreach bt, in case you need more details.

      Attachments

        Issue Links

          Activity

            [LU-2797] GPF in osc_send_oap_rpc

            bfaccini Bruno Faccini (Inactive) added a comment:

            Hello Alex,

            Sorry about that, but do you have any more information that could help me understand what is going wrong? Is there a crash-dump available, or any other data on this deadlock? Have you or the on-site people already done some troubleshooting?

            Also, which patch/change did you integrate, #6089 or #6200?

            louveta Alexandre Louvet (Inactive) added a comment:

            Quick feedback from the site: with the proposed patch, the system deadlocks almost immediately.

            spiechurski Sebastien Piechurski added a comment:

            We might be able to reproduce the crashing workload on another machine which is currently being installed.
            I am currently on vacation, but will try that next week when I am back.

            sebastien.buisson Sebastien Buisson (Inactive) added a comment:

            Hi Bruno,

            The bug only occurs on the login node, and this node is unique. So the people on site would like to wait for the inspections and approvals of the two patches before testing them.

            Thanks,
            Sebastien.

            bfaccini Bruno Faccini (Inactive) added a comment:

            Seb(s), Benjamin,

            Would it be possible for you to install and run builds with these two patches (on a few nodes/clients where the problem has already occurred) and expose them to the site workload that is likely to reproduce the crash?

            BTW, is the Meteo site already running in production?

            bfaccini Bruno Faccini (Inactive) added a comment:

            Re-choosing/refreshing the lov_oinfo between the write and read steps (i.e. between the calls to osc_send_oap_rpc() in osc_check_rpcs()) seems to work, but it may break the balancing algorithm, and it does not cover all cases either: the same problem is likely to occur at the end of the read step, where the lov_oinfo may also have been freed (again during the window where cl_loi_list_lock is released) and is then unlinked from the client_obd cl_loi_[hp_ready,ready,write,read]_list's before loi_list_maint() is called and the next while() loop iteration starts.

            Thus, since the only routine used to maintain the cl_loi_[hp_ready,ready,write,read]_list's is loi_list_maint(), it may be possible to implement the first fix option I listed in my prior comment, i.e. unlink the lov_oinfo from the client_obd cl_loi_[hp_ready,ready,write,read]_list's when it is chosen in osc_next_loi(), and keep it that way during all of the page-list processing. This only requires removing the calls to loi_list_maint() (which would re-link the lov_oinfo on the cl_loi_[hp_ready,ready,write,read]_list's!) in both osc_send_oap_rpc() and osc_check_rpcs().

            The new/2nd patch is at http://review.whamcloud.com/6200 .
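
            For illustration, a rough sketch of the unlink-in-osc_next_loi() approach described above. This is not the actual change in http://review.whamcloud.com/6200; it only assumes the lov_oinfo list members of the 2.1 tree (loi_hp_ready_item, loi_ready_item, loi_write_item, loi_read_item) and the client_obd ready lists:

            static struct lov_oinfo *osc_next_loi(struct client_obd *cli)
            {
                    struct lov_oinfo *loi = NULL;

                    /* caller holds cli->cl_loi_list_lock */
                    if (!cfs_list_empty(&cli->cl_loi_hp_ready_list))
                            loi = cfs_list_entry(cli->cl_loi_hp_ready_list.next,
                                                 struct lov_oinfo, loi_hp_ready_item);
                    else if (!cfs_list_empty(&cli->cl_loi_ready_list))
                            loi = cfs_list_entry(cli->cl_loi_ready_list.next,
                                                 struct lov_oinfo, loi_ready_item);

                    if (loi != NULL) {
                            /* take the loi off every cl_loi_*_list so that no other
                             * thread can find it (and race on it) while its page
                             * lists are processed; in this scheme the calls to
                             * loi_list_maint() in osc_send_oap_rpc() and
                             * osc_check_rpcs(), which would re-link it, are removed */
                            cfs_list_del_init(&loi->loi_hp_ready_item);
                            cfs_list_del_init(&loi->loi_ready_item);
                            cfs_list_del_init(&loi->loi_write_item);
                            cfs_list_del_init(&loi->loi_read_item);
                    }
                    return loi;
            }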

            bfaccini Bruno Faccini (Inactive) added a comment:

            The patch is at http://review.whamcloud.com/6089.
            Patch-set #1 passed auto-tests; the submitted patch-set #2 only adds comments in the code.
            This may be incomplete, since the same issue is likely to occur after the 2nd call to osc_send_oap_rpc(), when the same unsafe window occurs and osc_check_rpcs() unlinks the lov_oinfo from all the client_obd cl_loi_[hp_ready,ready,write,read]_list's before calling loi_list_maint() and moving to the next while() loop iteration ...

            bfaccini Bruno Faccini (Inactive) added a comment:

            In the new crash-dump the scenario is a bit different: the panicking thread is parsing the page-lists/loi_oap_pages (the loi_read_lop, which is important, see below) of a lov_oinfo and, according to the register and memory content, this struct had been freed+poisoned and had even been re-allocated by the time of the crash-dump!

            Thus, and I confirmed it by re-reading my debugging notes for the 1st crash-dump, my earlier comment "Problem looks like a race in parsing one of the client_obd's lists (cl_loi_hp_ready_list/cl_loi_ready_list/cl_loi_write_list/cl_loi_read_list) of lov_oinfo" is incomplete. A better description is: when osc_check_rpcs() chooses/finds a lov_oinfo (via osc_next_loi()), it may first call osc_send_oap_rpc() for the lov_oinfo's loi_write_lop page-lists. During this call, cl_loi_list_lock can be released and possibly granted to somebody else, who can then free+poison the lov_oinfo while it is still linked on one of the client_obd cl_loi_[hp_ready,ready,write,read]_list's and can still be found via osc_next_loi(). Then, when cl_loi_list_lock is back, osc_check_rpcs() calls osc_send_oap_rpc() for the lov_oinfo's loi_read_lop page-lists and triggers the problem.

            Thus, possible fixes could be: unlink the lov_oinfo from the client_obd cl_loi_[hp_ready,ready,write,read]_list's when it is chosen in osc_next_loi(), and keep it unlinked during all of the page-list processing (but then all further calls to loi_list_maint() become useless ...). Or revert the lock optimizations against a thread holding a critical lock for too long, which cause cl_loi_list_lock to be released in osc_build_req() (but that looks tricky, and a big performance impact is strongly suspected ...). Or only allow one thread at a time in osc_check_rpcs() (this needs a flag plus test-and-set/swap). Or re-choose the lov_oinfo (via osc_next_loi()) between the osc_send_oap_rpc() calls for write and read ...

            I will give the latest (simplest!) option a try and let you know how it goes.
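
            For illustration, a stubbed sketch (not the Lustre source) of the osc_check_rpcs() flow described above, with the unsafe window marked:

            /* Stubbed sketch of the race described above: only the control flow
             * matters here, the types and helpers are stand-ins. */
            struct loi_oap_pages { int lop_num_pending; };

            struct lov_oinfo {
                    struct loi_oap_pages loi_write_lop;
                    struct loi_oap_pages loi_read_lop;
            };

            struct client_obd;                       /* opaque for this sketch */

            struct lov_oinfo *osc_next_loi_stub(struct client_obd *cli);
            int  osc_send_oap_rpc_stub(struct client_obd *cli, struct lov_oinfo *loi,
                                       struct loi_oap_pages *lop);
            void loi_list_maint_stub(struct client_obd *cli, struct lov_oinfo *loi);

            void osc_check_rpcs_sketch(struct client_obd *cli)
            {
                    struct lov_oinfo *loi;

                    /* caller holds cli->cl_loi_list_lock */
                    while ((loi = osc_next_loi_stub(cli)) != NULL) {
                            /* write step: cl_loi_list_lock can be released inside
                             * (down in osc_build_req()); another thread may free and
                             * poison 'loi' here while it is still linked on one of
                             * the cl_loi_*_list's */
                            osc_send_oap_rpc_stub(cli, loi, &loi->loi_write_lop);

                            /* read step: 'loi' is reused without re-validation; if it
                             * was freed above, this walks poisoned lists and GPFs,
                             * which is the crash reported in this ticket */
                            osc_send_oap_rpc_stub(cli, loi, &loi->loi_read_lop);

                            /* re-links 'loi' on the cl_loi_*_list's */
                            loi_list_maint_stub(cli, loi);
                    }
            }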

            sebastien.buisson Sebastien Buisson (Inactive) added a comment:

            Thanks Bruno, we are looking forward to hearing from you!

            Cheers,
            Sebastien.

            bfaccini Bruno Faccini (Inactive) added a comment:

            Analysis of the new crash-dump (including the additional traces requested) is ongoing; it is not so easy due to the size of the osc_send_oap_rpc() routine and its tricky/optimized assembly code. Will update ASAP with more details.

            bfaccini Bruno Faccini (Inactive) added a comment:

            Agreed, this ticket is my top priority, and I have upgraded its priority to Major.

            People

              Assignee: bfaccini Bruno Faccini (Inactive)
              Reporter: spiechurski Sebastien Piechurski
              Votes: 0
              Watchers: 11

              Dates

                Created:
                Updated:
                Resolved: