[LU-2797] GPF in osc_send_oap_rpc Created: 12/Feb/13  Updated: 17/Jul/13  Resolved: 26/Jun/13

Status: Resolved
Project: Lustre
Component/s: None
Affects Version/s: Lustre 2.1.3
Fix Version/s: Lustre 2.1.6

Type: Bug Priority: Major
Reporter: Sebastien Piechurski Assignee: Bruno Faccini (Inactive)
Resolution: Fixed Votes: 0
Labels: None
Environment:

Lustre 2.1.3 Bull.2.308


Attachments: File crash-dump-properties    
Issue Links:
Duplicate
is duplicated by LU-2853 osc send list corruption Closed
Severity: 3
Epic: client
Rank (Obsolete): 6773

 Description   

While running a tar on a login node, the node encountered a general protection fault in osc_send_oap_rpc.

It looks like a race where the loi_oap_pages structure passed to osc_send_oap_rpc gets poisoned:

general protection fault: 0000 [#1] SMP 
last sysfs file: /sys/devices/pci0000:80/0000:80:02.0/0000:83:00.0/host7/rport-7:0-0/target7:0:0/7:0:0:18/state
CPU 3 
Modules linked in: tcp_diag inet_diag iptable_filter ip_tables nfs fscache lmv(U) mgc(U) lustre(U) lov(U) osc(U) mdc(U)
lquota(U) fid(U) fld(U) ko2iblnd(U) ptlrpc(U) obdclass(U) lnet(U) lvfs(U) libcfs(U) nfsd lockd nfs_acl auth_rpcgss exportfs
ipmi_devintf ipmi_si ipmi_msghandler sunrpc rdma_ucm(U) ib_sdp(U) rdma_cm(U) iw_cm(U) ib_addr(U) ib_ipoib(U) ib_cm(U)
ib_sa(U) ipv6 ib_uverbs(U) ib_umad(U) mlx4_ib(U) mlx4_core(U) ib_mthca(U) ib_mad(U) ib_core(U) dm_mirror dm_region_hash
dm_log dm_round_robin scsi_dh_rdac dm_multipath dm_mod uinput usbhid hid sg lpfc scsi_transport_fc scsi_tgt sb_edac
edac_core i2c_i801 i2c_core iTCO_wdt iTCO_vendor_support igb ioatdma dca ext4 mbcache jbd2 ehci_hcd sd_mod crc_t10dif
ahci megaraid_sas [last unloaded: scsi_wait_scan]

Pid: 30811, comm: tar Not tainted 2.6.32-220.23.1.bl6.Bull.28.8.x86_64 #1 Bull SAS bullx R/X9DRH-7TF/7F/iTF/iF
RIP: 0010:[<ffffffffa09f8cb1>]  [<ffffffffa09f8cb1>] osc_send_oap_rpc+0x61/0x1b40 [osc]
RSP: 0018:ffff8805da6f3588  EFLAGS: 00010296
RAX: 5a5a5a5a5a5a5a42 RBX: ffff8804710c6a80 RCX: 5a5a5a5a5a5a5a5a
RDX: ffff8804710c6a80 RSI: ffff881067b22648 RDI: ffff8804710c6aa8
RBP: ffff8805da6f36b8 R08: ffff8804710c6a98 R09: 00000000000057bf
R10: 0000000000000d9d R11: b000000000000000 R12: ffff881067b22648
R13: ffff8804710c6a98 R14: ffff8805da6f3668 R15: ffff8804710c6a98
FS:  00007f77e18f17a0(0000) GS:ffff880028260000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 00000032bcd9b4f8 CR3: 000000083eaac000 CR4: 00000000000406e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff4ff0 DR7: 0000000000000400





0xffffffffa09f8c50 <osc_send_oap_rpc>:  push   %rbp
0xffffffffa09f8c51 <osc_send_oap_rpc+1>:        mov    %rsp,%rbp
0xffffffffa09f8c54 <osc_send_oap_rpc+4>:        push   %r15
0xffffffffa09f8c56 <osc_send_oap_rpc+6>:        push   %r14
0xffffffffa09f8c58 <osc_send_oap_rpc+8>:        push   %r13
0xffffffffa09f8c5a <osc_send_oap_rpc+10>:       push   %r12
0xffffffffa09f8c5c <osc_send_oap_rpc+12>:       push   %rbx
0xffffffffa09f8c5d <osc_send_oap_rpc+13>:       sub    $0x108,%rsp
0xffffffffa09f8c64 <osc_send_oap_rpc+20>:       nopl   0x0(%rax,%rax,1)
0xffffffffa09f8c69 <osc_send_oap_rpc+25>:       testb  $0x1,-0x4f27ec(%rip)        # 0xffffffffa0506484
0xffffffffa09f8c70 <osc_send_oap_rpc+32>:       lea    -0x50(%rbp),%r14
0xffffffffa09f8c74 <osc_send_oap_rpc+36>:       mov    %rdi,-0x78(%rbp)
0xffffffffa09f8c78 <osc_send_oap_rpc+40>:       mov    %rsi,%r12
0xffffffffa09f8c7b <osc_send_oap_rpc+43>:       mov    %rdx,-0xc0(%rbp)
0xffffffffa09f8c82 <osc_send_oap_rpc+50>:       mov    %ecx,-0xac(%rbp)
0xffffffffa09f8c88 <osc_send_oap_rpc+56>:       mov    %r8,%r15
0xffffffffa09f8c8b <osc_send_oap_rpc+59>:       mov    %r14,-0x50(%rbp)
0xffffffffa09f8c8f <osc_send_oap_rpc+63>:       mov    %r14,-0x48(%rbp)
0xffffffffa09f8c93 <osc_send_oap_rpc+67>:       je     0xffffffffa09f8ca2
0xffffffffa09f8c95 <osc_send_oap_rpc+69>:       testb  $0x8,-0x4f281c(%rip)        # 0xffffffffa0506480
0xffffffffa09f8c9c <osc_send_oap_rpc+76>:       jne    0xffffffffa09f9500
0xffffffffa09f8ca2 <osc_send_oap_rpc+82>:       mov    0x10(%r15),%rcx         <=  %rcx comes from %r15, which
itself comes from %r8 = 5th argument of osc_send_oap_rpc
0xffffffffa09f8ca6 <osc_send_oap_rpc+86>:       lea    0x10(%r15),%rdi
0xffffffffa09f8caa <osc_send_oap_rpc+90>:       lea    -0x18(%rcx),%rax        <= %rax now equals
5a5a5a5a5a5a5a42, coming from %rcx - 0x18
0xffffffffa09f8cae <osc_send_oap_rpc+94>:       cmp    %rcx,%rdi

0xffffffffa09f8cb1 <osc_send_oap_rpc+97>:       mov    0x18(%rax),%rdx           <= Crashed here

0xffffffffa09f8cb5 <osc_send_oap_rpc+101>:      je     0xffffffffa09f8d50
0xffffffffa09f8cbb <osc_send_oap_rpc+107>:      sub    $0x18,%rdx
0xffffffffa09f8cbf <osc_send_oap_rpc+111>:      xor    %ecx,%ecx
0xffffffffa09f8cc1 <osc_send_oap_rpc+113>:      jmp    0xffffffffa09f8d10
0xffffffffa09f8cc3 <osc_send_oap_rpc+115>:      nopl   0x0(%rax,%rax,1)



static int
osc_send_oap_rpc(const struct lu_env *env, struct client_obd *cli,
                 struct lov_oinfo *loi,
                 int cmd, struct loi_oap_pages *lop)

%r15 = %r8 = lop

crash> struct loi_oap_pages ffff8804710c6a98
struct loi_oap_pages {
  lop_pending = {
    next = 0x5a5a5a5a5a5a5a5a, 
    prev = 0x5a5a5a5a5a5a5a5a
  }, 
  lop_urgent = {
    next = 0x5a5a5a5a5a5a5a5a, 
    prev = 0x5a5a5a5a5a5a5a5a
  }, 
  lop_pending_group = {
    next = 0x5a5a5a5a5a5a5a5a, 
    prev = 0x5a5a5a5a5a5a5a5a
  }, 
  lop_num_pending = 1515870810
}
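
(For reference: 1515870810 in decimal is 0x5A5A5A5A, so lop_num_pending carries the same 0x5a poison byte pattern (LI_POISON) as the list pointers; the whole loi_oap_pages structure has been poisoned after a free.)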

static int
osc_send_oap_rpc(const struct lu_env *env, struct client_obd *cli,
                 struct lov_oinfo *loi,
                 int cmd, struct loi_oap_pages *lop)
{
        struct ptlrpc_request *req;
        obd_count page_count = 0;
        struct osc_async_page *oap = NULL, *tmp;
        struct osc_brw_async_args *aa;
        const struct obd_async_page_ops *ops;
        CFS_LIST_HEAD(rpc_list);
        int srvlock = 0, mem_tight = 0;
        struct cl_object *clob = NULL;
        obd_off starting_offset = OBD_OBJECT_EOF;
        unsigned int ending_offset;
        int starting_page_off = 0;
        ENTRY;

        /* ASYNC_HP pages first. At present, when the lock the pages is
         * to be canceled, the pages covered by the lock will be sent out
         * with ASYNC_HP. We have to send out them as soon as possible. */
        cfs_list_for_each_entry_safe(oap, tmp, &lop->lop_urgent, oap_urgent_item) {
                                                ^^^^^^^^^^^^^^^
                                                == LI_POISON

                if (oap->oap_async_flags & ASYNC_HP)
                        cfs_list_move(&oap->oap_pending_item, &rpc_list);
                else if (!(oap->oap_brw_flags & OBD_BRW_SYNC))
                        /* only do this for writeback pages. */
                        cfs_list_move_tail(&oap->oap_pending_item, &rpc_list);
                if (++page_count >= cli->cl_max_pages_per_rpc)
                        break;
        }

I attach a file with the output of dmesg, ps, bt, bt -a and foreach bt, in case you need more details.



 Comments   
Comment by Peter Jones [ 12/Feb/13 ]

Bruno

Could you please help with this one?

Thanks

Peter

Comment by Bruno Faccini (Inactive) [ 12/Feb/13 ]

Hello Seb !!
Can you also tell me if there are any additional patches on top of the Lustre 2.1.3 version you use ??
Thanks already for your report and info, and thanks in advance.
Bruno.

Comment by Sebastien Piechurski [ 13/Feb/13 ]

Hi Bruno !
We finally "meet" again ...

Sorry for the bad formatting of the initial description ...

Here is the list of Jiras corresponding to the included patches.
ORNL-22 general ptlrpcd threads pool support
LU-1144 implement a NUMA aware ptlrpcd binding policy,
LU-1110 MDS Oops in osd_xattr_get() during file open by FID,
LU-1363 llite: Not held lock when calling security_d_instantiate,
LU-645/BZ23978 getcwd failure,
LU-1299 (patch set 11) loading large enough binary from lustre trigger OOM killer
LU-549 Improve statfs performance if selinux is disabled (complement)
LU-1650 client crash in osc_req_attr_set() routine

Tell me if you need more information from the dump.

Seb.

Comment by Bruno Faccini (Inactive) [ 18/Feb/13 ]

Hello Seb,
Usually, when you want to ensure no [re-]formatting will occur, you have to embed your inputs as pieces of code, etc., as explained in the wiki and as I modified for you. To read these editing hints, just click on the question-mark link just under the Comment sub-window (not so easy to explain!).

Concerning the crash-dump, we need to extract more info from it, and I would like you to retrieve and dump the 2nd parameter of osc_send_oap_rpc() (a struct client_obd *), which was saved in %r12 (R12: ffff881067b22648) at the time of the GPF.

Comment by Bruno Faccini (Inactive) [ 19/Feb/13 ]

The problem looks like a race in parsing one of the client_obd's lists of lov_oinfo (cl_loi_hp_ready_list/cl_loi_ready_list/cl_loi_write_list/cl_loi_read_list). This may come from the fact that there are multiple "tar" processes running and competing to access these lists/structs for this same client_obd ... But this should not happen if list accesses/updates are safely protected by cl_loi_list_lock !!
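
For reference, this is the locking discipline those lists are expected to follow. This is only a minimal illustrative sketch: the walk_ready_list() helper is hypothetical, and client_obd_list_lock()/client_obd_list_unlock(), the cfs_list_* wrappers and the loi_ready_item member are assumed from the 2.1 code base.

/* Hypothetical helper: any walk of the cl_loi_* lists is supposed to happen
 * with cl_loi_list_lock held for the whole traversal. */
static void walk_ready_list(struct client_obd *cli)
{
        struct lov_oinfo *loi;

        client_obd_list_lock(&cli->cl_loi_list_lock);
        cfs_list_for_each_entry(loi, &cli->cl_loi_ready_list, loi_ready_item) {
                /* loi is only guaranteed to stay valid while the lock is
                 * held; dropping the lock mid-walk (as the RPC-build path
                 * does) is exactly what opens a use-after-free window. */
        }
        client_obd_list_unlock(&cli->cl_loi_list_lock);
}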

Comment by Sebastien Piechurski [ 22/Feb/13 ]

Here is the requested client_obd struct:

struct client_obd {
  cl_sem = {
    count = 0, 
    wait_lock = {
      raw_lock = {
        slock = 0
      }
    }, 
    wait_list = {
      next = 0xffff881067b22658, 
      prev = 0xffff881067b22658
    }
  }, 
  cl_target_uuid = {
    uuid = "scratch-OST0000_UUID\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000"
  }, 
  cl_import = 0xffff88105d718800, 
  cl_conn_count = 1, 
  cl_default_mds_easize = 0, 
  cl_max_mds_easize = 48, 
  cl_max_mds_cookiesize = 32, 
  cl_sp_me = LUSTRE_SP_CLI, 
  cl_sp_to = LUSTRE_SP_OST, 
  cl_flvr_mgc = {
    sf_rpc = 0, 
    sf_flags = 0, 
    u_rpc = {<No data fields>}, 
    u_bulk = {
      hash = {
        hash_alg = 0 '\000'
      }
    }
  }, 
  cl_llcd_offset = 0x0, 
  cl_dirty = 33554432, 
  cl_dirty_max = 33554432, 
  cl_dirty_transit = 0, 
  cl_avail_grant = 30281728, 
  cl_lost_grant = 4096, 
  cl_cache_waiters = {
    next = 0xffff880ddcc97738, 
    prev = 0xffff880ea95c7738
  }, 
  cl_next_shrink_grant = 4875484627, 
  cl_grant_shrink_list = {
    next = 0xffff881067b22708, 
    prev = 0xffff881067b22708
  }, 
  cl_grant_sem = {
    lock = {
      raw_lock = {
        slock = 0
      }
    }, 
    count = 1, 
    wait_list = {
      next = 0xffff881067b22720, 
      prev = 0xffff881067b22720
    }
  }, 
  cl_grant_shrink_interval = 1200, 
  cl_loi_list_lock = {
    lock = {
      raw_lock = {
        slock = 2091613343
      }
    }
  }, 
  cl_loi_ready_list = {
    next = 0xffff880be329f648, 
    prev = 0xffff880907ea58c8
  }, 
  cl_loi_hp_ready_list = {
    next = 0xffff881067b22748, 
    prev = 0xffff881067b22748
  }, 
  cl_loi_write_list = {
    next = 0xffff8808b83aec68, 
    prev = 0xffff880d34b198e8
  }, 
  cl_loi_read_list = {
    next = 0xffff881067b22768, 
    prev = 0xffff881067b22768
  }, 
  cl_r_in_flight = 0, 
  cl_w_in_flight = 4, 
  cl_pending_w_pages = 7334, 
  cl_pending_r_pages = 0, 
  cl_max_pages_per_rpc = 256, 
  cl_max_rpcs_in_flight = 8, 
  cl_read_rpc_hist = {
    oh_lock = {
      raw_lock = {
        slock = 2887691294
      }
    }, 
    oh_buckets = {191654, 46436, 23673, 14630, 6356, 5136, 4296, 8853, 3946, 968, 191, 52, 12, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
  }, 
  cl_write_rpc_hist = {
    oh_lock = {
      raw_lock = {
        slock = 2215216137
      }
    }, 
    oh_buckets = {82913, 96191, 49436, 39547, 21783, 15763, 16527, 19420, 10408, 5652, 2263, 1038, 347, 144, 34, 14, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
  }, 
  cl_read_page_hist = {
    oh_lock = {
      raw_lock = {
        slock = 2887691294
      }
    }, 
    oh_buckets = {72977, 13449, 9899, 2851, 5192, 2466, 3230, 4538, 191604, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
  }, 
  cl_write_page_hist = {
    oh_lock = {
      raw_lock = {
        slock = 2215216137
      }
    }, 
    oh_buckets = {8912, 4639, 9193, 6562, 3486, 3212, 4527, 4185, 316765, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
  }, 
  cl_read_offset_hist = {
    oh_lock = {
      raw_lock = {
        slock = 2887691294
      }
    }, 
    oh_buckets = {275181, 8501, 11733, 855, 676, 648, 1024, 2688, 4900, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
  }, 
  cl_write_offset_hist = {
    oh_lock = {
      raw_lock = {
        slock = 2215216137
      }
    }, 
    oh_buckets = {330039, 148, 272, 488, 1019, 1656, 3181, 4273, 20405, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
  }, 
  cl_destroy_in_flight = {
    counter = 0
  }, 
  cl_destroy_waitq = {
    lock = {
      raw_lock = {
        slock = 3409365814
      }
    }, 
    task_list = {
      next = 0xffff881067b22dd0, 
      prev = 0xffff881067b22dd0
    }
  }, 
  cl_rpc_lock = 0x0, 
  cl_close_lock = 0x0, 
  cl_oscc = {
    oscc_lock = {
      raw_lock = {
        slock = 1166034304
      }
    }, 
    oscc_wait_create_list = {
      next = 0xffff881067b22df8, 
      prev = 0xffff881067b22df8
    }, 
    oscc_obd = 0xffff881067b22138, 
    oscc_last_id = 1, 
    oscc_next_id = 2, 
    oscc_grow_count = 32, 
    oscc_max_grow_count = 20000, 
    oscc_oa = {
      o_valid = 0, 
      o_oi = {
        oi_id = 0, 
        oi_seq = 0
      }, 
      o_parent_seq = 0, 
      o_size = 0, 
      o_mtime = 0, 
      o_atime = 0, 
      o_ctime = 0, 
      o_blocks = 0, 
      o_grant = 0, 
      o_blksize = 0, 
      o_mode = 0, 
      o_uid = 0, 
      o_gid = 0, 
      o_flags = 0, 
      o_nlink = 0, 
      o_parent_oid = 0, 
      o_misc = 0, 
      o_ioepoch = 0, 
      o_stripe_idx = 0, 
      o_parent_ver = 0, 
      o_handle = {
        cookie = 0
      }, 
      o_lcookie = {
        lgc_lgl = {
          lgl_oid = 0, 
          lgl_oseq = 0, 
          lgl_ogen = 0
        }, 
        lgc_subsys = 0, 
        lgc_index = 0, 
        lgc_padding = 0
      }, 
      o_uid_h = 0, 
      o_gid_h = 0, 
      o_padding_3 = 0, 
      o_padding_4 = 0, 
      o_padding_5 = 0, 
      o_padding_6 = 0
    }, 
    oscc_flags = 1, 
    oscc_waitq = {
      lock = {
        raw_lock = {
          slock = 0
        }
      }, 
      task_list = {
        next = 0xffff881067b22f08, 
        prev = 0xffff881067b22f08
      }
    }
  }, 
  cl_mgc_sem = {
    lock = {
      raw_lock = {
        slock = 0
      }
    }, 
    count = 1, 
    wait_list = {
      next = 0xffff881067b22f20, 
      prev = 0xffff881067b22f20
    }
  }, 
  cl_mgc_vfsmnt = 0x0, 
  cl_mgc_configs_dir = 0x0, 
  cl_mgc_refcount = {
    counter = 0
  }, 
  cl_mgc_mgsexp = 0x0, 
  cl_checksum = 1, 
  cl_supp_cksum_types = 3, 
  cl_cksum_type = OBD_CKSUM_ADLER, 
  cl_ar = {
    ar_rc = 0, 
    ar_force_sync = 0, 
    ar_min_xid = 0
  }, 
  cl_qchk_stat = 1, 
  cl_seq = 0x0, 
  cl_resends = {
    counter = 10
  }
}

If needed, I can upload the whole dump.

Comment by Oleg Drokin [ 22/Feb/13 ]

Shadow noticed there is a long-standing bug in osc_send_oap_rpc() that can be traced back to the clio introduction and was further made worse by the bug 18881 commit:

        req = osc_build_req(env, cli, &rpc_list, page_count,
                            mem_tight ? (cmd | OBD_BRW_MEMALLOC) : cmd);
        if (IS_ERR(req)) {
                                     <==   Here we must regain the obd list lock or we will work on an unprotected list; what's worse, we'll return to the caller without the lock, and the caller expects the lock to be held.
                LASSERT(cfs_list_empty(&rpc_list));
                loi_list_maint(cli, loi);
                RETURN(PTR_ERR(req));
        }
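
For clarity, here is the same snippet again with the locking expectation spelled out in comments. This is illustrative only, not the actual patch; as Oleg notes in his later comment (05/Mar), the lock is in fact re-taken inside osc_build_req() on its error exit path.

        /* osc_send_oap_rpc() does not hold cl_loi_list_lock while the RPC is
         * assembled; the lock is dropped/re-taken around the RPC build. */
        req = osc_build_req(env, cli, &rpc_list, page_count,
                            mem_tight ? (cmd | OBD_BRW_MEMALLOC) : cmd);
        if (IS_ERR(req)) {
                /* The list check and loi_list_maint() below, and the caller
                 * (osc_check_rpcs()), all expect cl_loi_list_lock to be held
                 * again at this point; osc_build_req() is responsible for
                 * re-taking it on this error path. */
                LASSERT(cfs_list_empty(&rpc_list));
                loi_list_maint(cli, loi);
                RETURN(PTR_ERR(req));
        }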

Comment by Bruno Faccini (Inactive) [ 22/Feb/13 ]

Hello Seb,
Yes, if possible, I would like to get the whole dump, just to be able to double-check whether we may have fallen into the hole Alexey found and just reported in LU-2853 !
Do you know how you can use our upload area ?

Comment by Bruno Faccini (Inactive) [ 22/Feb/13 ]

Pushed patch for b2_1 at http://review.whamcloud.com/5516.
No master patch required.

Comment by Alexey Lyashkov [ 23/Feb/13 ]

Bruno,

That fix is needed for all branches, including master.
But that isn't the only bug in that area.

Xyratex has a dump with a different oap list corruption:
>>
LustreError: 3818:0:(osc_request.c:1308:osc_brw_prep_request()) ASSERTION( i == 0 || pg->off > pg_prev->off ) failed: i 1 p_c 151 pg ffffea0004129ce8 [pri 18446612138517887360 ind 30208] off 123731968 prev_pg ffffea000dbc0e18 [pri 0 ind 263316] off 123731968
>>
As you can see, the oap list has pages with a zero private field (as if truncated from memory).

The bug exists from 2.1 (first report) until the 2.3 release.

Comment by Bruno Faccini (Inactive) [ 25/Feb/13 ]

Humm, it seems that due to the patch and the spin-lock/LBUG ordering, sanity/test_118f fails with a time-out, with the client's thread stack-trace dumps showing hangs due to the new spin-lock (which means the test expects the LBUG to occur ??) !!

So I am trying to revert the spin-lock/LBUG order, as commented by Alexey; this is patch-set #3.

Comment by Bruno Faccini (Inactive) [ 01/Mar/13 ]

Humm, too fast ... the patch still causes sanity/test_118f (which simulates/injects EINVAL via OBD_FAIL_OSC_BRW_PREP_REQ2) to fail with a timeout, with stacks like the following being printed to the console :

17:08:15:BUG: soft lockup - CPU#0 stuck for 60s! [multiop:18829]
17:08:16:CPU 0:
17:08:16:Modules linked in: loop llite_lloop(U) lustre(U) mgc(U) lov(U) osc(U) mdc(U) lmv(U) fid(U) fld(U) lquota(U) ptlrpc(U) obdclass(U) lvfs(U) ksocklnd(U) lnet(U) libcfs(U) nfs nfsd exportfs nfs_acl auth_rpcgss autofs4 hidp rfcomm l2cap bluetooth lockd sunrpc be2iscsi iscsi_tcp bnx2i cnic uio cxgb3i libcxgbi iw_cxgb3 cxgb3 8021q libiscsi_tcp ib_iser libiscsi2 scsi_transport_iscsi2 scsi_transport_iscsi ib_srp rds ib_sdp ib_ipoib ipoib_helper ipv6 xfrm_nalgo crypto_api rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm ib_addr ib_sa ib_mad ib_core dm_mirror dm_multipath scsi_dh video backlight sbs power_meter hwmon i2c_ec dell_wmi wmi button battery asus_acpi acpi_memhotplug ac parport_pc lp parport floppy i2c_piix4 i2c_core 8139too 8139cp tpm_tis pcspkr tpm tpm_bios serio_raw virtio_balloon mii dm_raid45 dm_message dm_region_hash dm_log dm_mod dm_mem_cache ata_piix libata sd_mod scsi_mod virtio_blk virtio_pci virtio_ring virtio ext3 jbd uhci_hcd ohci_hcd ehci_hcd
17:08:17:Pid: 18829, comm: multiop Tainted: G     ---- 2.6.18-308.20.1.el5 #1
17:08:17:RIP: 0010:[<ffffffff80064bdc>]  [<ffffffff80064bdc>] .text.lock.spinlock+0x2/0x30
17:08:17:RSP: 0018:ffff81005f9e5820  EFLAGS: 00000286
17:08:17:RAX: 0000000000000044 RBX: ffff81005f9e58c0 RCX: 0000000000000000
17:08:17:RDX: ffff81005f9e58e8 RSI: 0000000001762650 RDI: ffff81006ad22814
17:08:17:RBP: ffff81006249e7d0 R08: 5a5a5a5a5a5a5a5a R09: 5a5a5a5a5a5a5a5a
17:08:17:R10: 5a5a5a5a5a5a5a5a R11: 5a5a5a5a5a5a5a5a R12: ffffffff8893d9f3
17:08:17:R13: ffff81005d1b7280 R14: ffff81006a878380 R15: ffff81005d1b7280
17:08:17:FS:  00002af8a98ba0c0(0000) GS:ffffffff80432000(0000) knlGS:0000000000000000
17:08:17:CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
17:08:17:CR2: 00002af8a93519e0 CR3: 0000000064354000 CR4: 00000000000006e0
17:08:17:
17:08:17:Call Trace:
17:08:17: [<ffffffff88cac73c>] :osc:osc_send_oap_rpc+0x174c/0x1a80
17:08:17: [<ffffffff88cacc4e>] :osc:osc_check_rpcs+0x1de/0x450
17:08:17: [<ffffffff88caad11>] :osc:loi_list_maint+0xe1/0x110
17:08:17: [<ffffffff88cc4da0>] :osc:osc_io_submit+0x470/0x520
17:08:17: [<ffffffff88951263>] :obdclass:cl_io_submit_rw+0xd3/0x180
17:08:17: [<ffffffff88d3c5f7>] :lov:lov_io_submit+0x2a7/0xc60
17:08:17: [<ffffffff80063af9>] mutex_lock+0xd/0x1d
17:08:18: [<ffffffff88951263>] :obdclass:cl_io_submit_rw+0xd3/0x180
17:08:18: [<ffffffff88dfc6dd>] :lustre:ll_writepage+0x34d/0x460
17:08:18: [<ffffffff8001d0fb>] mpage_writepages+0x1bf/0x37d
17:08:18: [<ffffffff88dfc390>] :lustre:ll_writepage+0x0/0x460
17:08:18: [<ffffffff887f2edd>] :libcfs:libcfs_debug_msg+0x4d/0x60
17:08:18: [<ffffffff8005a9fb>] do_writepages+0x20/0x2f
17:08:18: [<ffffffff8004f8c0>] __filemap_fdatawrite_range+0x50/0x5b
17:08:18: [<ffffffff800c9888>] sync_page_range+0x3d/0xa0
17:08:18: [<ffffffff800c9b86>] generic_file_writev+0x8a/0xa3
17:08:18: [<ffffffff88e2438e>] :lustre:vvp_io_write_start+0x15e/0x210
17:08:18: [<ffffffff889513dc>] :obdclass:cl_io_start+0xcc/0x130
17:08:18: [<ffffffff88954adc>] :obdclass:cl_io_loop+0xbc/0x160
17:08:18: [<ffffffff88dd162a>] :lustre:ll_file_io_generic+0x4ba/0x500
17:08:18: [<ffffffff88dd182f>] :lustre:ll_file_writev+0x1bf/0x230
17:08:19: [<ffffffff88dddc9d>] :lustre:ll_file_write+0x15d/0x1d0
17:08:19: [<ffffffff80016b99>] vfs_write+0xce/0x174
17:08:19: [<ffffffff80017462>] sys_write+0x45/0x6e
17:08:19: [<ffffffff8005d28d>] tracesys+0xd5/0xe0
17:08:19:

Having a look at the source code with this new information, it looks like taking cl_loi_list_lock is not needed after all, since it is already done upon errors in osc_build_req() !!
The whole code path comes from a very old commit (git log shows commit fbf5870b9848929d352460f1f005b79c0b5ccc5a and b=14166), so it has obviously been in for a long time.
So it seems we are back to finding another way cl_loi_list_lock could have been dropped before the cl_loi_* lists were accessed/updated.
BTW, Seb, you did not upload the crash-dump, which may be of greater help now ?
And Alexey, do you have crash-dumps available for the other cases you commented on ?

Comment by Sebastien Piechurski [ 05/Mar/13 ]

Sorry Bruno, I just woke up from a long period without reading my open tickets.
Could you tell me how to upload the dump, and I'll be happy to do it.

Thanks.

Comment by Oleg Drokin [ 05/Mar/13 ]

Shadow, your analysis here is incorrect. The "missing lock" is actually there: it's taken in osc_build_req() on the error exit path... Sigh, that's bad style for sure and deserves a comment.

Comment by Sebastien Piechurski [ 08/Mar/13 ]

A tarball with all required files is currently uploading.
Should be available in around 8 hours.

Comment by Bruno Faccini (Inactive) [ 11/Mar/13 ]

Got it ! Thanks Seb.

Comment by Sebastien Piechurski [ 02/Apr/13 ]

Hi Bruno,

Do you have any news on this bug ?
We faced the 3rd occurrence on the customer's side this weekend.

It seems to happen when multiple tars are run with output to the same file.

Cheers,

Seb.

Comment by Bruno Faccini (Inactive) [ 03/Apr/13 ]

Yes, and sorry, you are right: I know that I am late on this ticket, and I will try to give you an update asap now.

Comment by Bruno Faccini (Inactive) [ 05/Apr/13 ]

Crash-dump analysis is quite painful since there are a lot of tar threads competing on the same OSC and all of them can be suspected, so I am in the process of reviewing them all.

I also thought I could get some help from the Lustre log/traces I extracted from the crash-dump, but no luck: it contains only D_HA traces of no interest.

Since you indicate you have already had 3 crashes, is it possible for you to enable at least D_CACHE+D_INODE+D_PAGE tracing on the (known ?) clients ??

Comment by Sebastien Piechurski [ 05/Apr/13 ]

Ok Bruno,

I have set lnet.debug to "ioctl neterror warning error emerg ha config console inode cache page".

We'll be waiting for the next crash to have the traces.
Meanwhile, the user is trying to create a reproducer, which might help in understanding the problem.

Seb.

Comment by Benjamin Barthe (Inactive) [ 08/Apr/13 ]

Hi guys,

a new occurrence happened this weekend; I'm currently uploading the vmcore to the FTP

Benj.

[EDIT] Upload done.

Comment by Sebastien Buisson (Inactive) [ 16/Apr/13 ]

Hi,

Did you have time to work on the crash dump supplied by Benjamin?

There were 5 more occurrences this weekend at the customer's site. This is becoming more and more visible to the customer.

Clearly, this ticket's severity now has to be increased.

Thanks,
Sebastien.

Comment by Bruno Faccini (Inactive) [ 16/Apr/13 ]

Agreed, this ticket is my top priority. And I have upgraded its priority to Major.

Comment by Bruno Faccini (Inactive) [ 17/Apr/13 ]

Analysis of the new crash-dump (including the additional traces requested) is on-going; it is not so easy due to the size of the osc_send_oap_rpc() routine and its tricky/optimized assembly code. Will update asap with more details.

Comment by Sebastien Buisson (Inactive) [ 17/Apr/13 ]

Thanks Bruno, we are looking forward to hearing from you!

Cheers,
Sebastien.

Comment by Bruno Faccini (Inactive) [ 19/Apr/13 ]

In the new crash-dump, the scenario is a bit different: the panicking thread is parsing the page-lists/loi_oap_pages (the loi_read_lop, which is important, see later) of a lov_oinfo and, according to the registers+memory content, this struct has been freed+poisoned and was even found re-allocated at the time of the crash-dump !!

Thus (and I confirmed it by re-reading my debugging notes for the 1st crash-dump) my earlier comment, "The problem looks like a race in parsing one of the client_obd's lists of lov_oinfo (cl_loi_hp_ready_list/cl_loi_ready_list/cl_loi_write_list/cl_loi_read_list)", is incomplete. A better description is: when osc_check_rpcs() chooses/finds a lov_oinfo (via osc_next_loi()), it may first call osc_send_oap_rpc() for the lov_oinfo's loi_write_lop page-lists. During this, cl_loi_list_lock can be released and possibly acquired by somebody else, who can then free+poison the lov_oinfo, which is still linked on one of the client_obd cl_loi_[hp_ready,ready,write,read]_list's and can still be found via osc_next_loi(). Then, when cl_loi_list_lock is available again, osc_check_rpcs() can call osc_send_oap_rpc() for the lov_oinfo's loi_read_lop page-lists and trigger the problem.

Thus, possible fixes for the problem could be: unlink the lov_oinfo from the client_obd cl_loi_[hp_ready,ready,write,read]_list's when it is chosen in osc_next_loi(), and keep it unlinked during all of the page-lists processing (but then all further calls to loi_list_maint() become useless ...). Or revert the lock optimizations (against a thread owning a critical lock for too long) that cause cl_loi_list_lock to be released in osc_build_req() (but this looks tricky and is highly suspected to have a big performance impact ...). Or only allow one thread at a time to be in osc_check_rpcs() (this needs a flag and a test-and-set/swap on it). Or re-choose the lov_oinfo (via osc_next_loi()) between the osc_send_oap_rpc() calls for write/read ...

I will give the last (simplest!) option a try and let you know how it goes.
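
A condensed, hypothetical sketch of that window (simplified, not the actual source; the names osc_next_loi(), lop_makes_rpc(), loi_write_lop/loi_read_lop and loi_list_maint() are taken from the code and backtraces quoted in this ticket, and return values are ignored for brevity):

        /* Simplified view of the osc_check_rpcs() loop. */
        while ((loi = osc_next_loi(cli)) != NULL) {
                /* Step 1: write page-list. osc_send_oap_rpc() can release
                 * cl_loi_list_lock while the RPC is built; another thread may
                 * then free+poison this lov_oinfo while it is still linked on
                 * the cl_loi_[hp_ready,ready,write,read]_list's. */
                if (lop_makes_rpc(cli, &loi->loi_write_lop, OBD_BRW_WRITE))
                        osc_send_oap_rpc(env, cli, loi, OBD_BRW_WRITE,
                                         &loi->loi_write_lop);

                /* Step 2: read page-list. If the free happened above, this
                 * call walks a poisoned loi_read_lop (0x5a5a... pointers),
                 * which matches the GPF seen in the new crash-dump. */
                if (lop_makes_rpc(cli, &loi->loi_read_lop, OBD_BRW_READ))
                        osc_send_oap_rpc(env, cli, loi, OBD_BRW_READ,
                                         &loi->loi_read_lop);

                loi_list_maint(cli, loi);
        }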

Comment by Bruno Faccini (Inactive) [ 24/Apr/13 ]

Patch is at http://review.whamcloud.com/6089.
Patch-set #1 passed auto-tests; I submitted patch-set #2, which only adds comments inside the code.
This may be incomplete, since the same issue is likely to occur after the 2nd call to osc_send_oap_rpc(), when the same unsafe window occurs and osc_check_rpcs() unlinks the lov_oinfo from all the client_obd cl_loi_[hp_ready,ready,write,read]_list's before calling loi_list_maint() and moving to the next while() loop ...

Comment by Bruno Faccini (Inactive) [ 29/Apr/13 ]

Re-choosing/refreshing the lov_oinfo between the write/read steps and calls to osc_send_oap_rpc() in osc_check_rpcs() seems to work, but it may break the balancing algorithm, and it also does not cover all cases: the same problem is likely to occur at the end of the read step, where the lov_oinfo may also have been freed (again during the window where cl_loi_list_lock is released) and is unlinked from any client_obd cl_loi_[hp_ready,ready,write,read]_list before calling loi_list_maint() and entering the next while() loop.

Thus, since the only routine used to maintain the cl_loi_[hp_ready,ready,write,read]_lists is loi_list_maint(), it may be possible to implement the 1st fix option I listed in my prior comment, i.e. unlink the lov_oinfo from the client_obd cl_loi_[hp_ready,ready,write,read]_lists when it is chosen in osc_next_loi(), and keep it unlinked during all of the page-lists processing. This only requires removing the calls to loi_list_maint() (which would re-link the lov_oinfo on the cl_loi_[hp_ready,ready,write,read]_lists !!) in both osc_send_oap_rpc() and osc_check_rpcs().

New/2nd patch/way is at http://review.whamcloud.com/6200 .
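
A rough sketch of the idea behind this second approach (illustrative only, not the actual change in #6200; the cfs_list_* helpers and the loi_*_item member names are assumed from the 2.1 lov_oinfo definition):

/* Idea: osc_next_loi() hands back a lov_oinfo that is already unlinked from
 * every cl_loi_* list, so no other thread can pick it up while
 * cl_loi_list_lock is dropped during the RPC build. */
static struct lov_oinfo *osc_next_loi(struct client_obd *cli)
{
        struct lov_oinfo *loi = NULL;

        if (!cfs_list_empty(&cli->cl_loi_hp_ready_list))
                loi = cfs_list_entry(cli->cl_loi_hp_ready_list.next,
                                     struct lov_oinfo, loi_hp_ready_item);
        else if (!cfs_list_empty(&cli->cl_loi_ready_list))
                loi = cfs_list_entry(cli->cl_loi_ready_list.next,
                                     struct lov_oinfo, loi_ready_item);
        /* ... the write/read lists would be handled likewise ... */

        if (loi != NULL) {
                /* Unlink from all four lists while still under
                 * cl_loi_list_lock; the loi_list_maint() calls in
                 * osc_send_oap_rpc()/osc_check_rpcs() that would re-link it
                 * are removed in this approach. */
                cfs_list_del_init(&loi->loi_hp_ready_item);
                cfs_list_del_init(&loi->loi_ready_item);
                cfs_list_del_init(&loi->loi_write_item);
                cfs_list_del_init(&loi->loi_read_item);
        }
        return loi;
}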

Comment by Bruno Faccini (Inactive) [ 29/Apr/13 ]

Seb(s), Benjamin,

Would it be possible for you to install and run builds with these 2 patches (on a few nodes/clients where the problem has already occurred) and expose them to the site work-load likely to reproduce the crash ??

BTW, is the Meteo site already running in production ??

Comment by Sebastien Buisson (Inactive) [ 30/Apr/13 ]

Hi Bruno,

The bug only occurs on the login node, and this node is unique. So people on site would like to wait for inspections and approvals of the 2 patches before testing them.

Thanks,
Sebastien.

Comment by Sebastien Piechurski [ 30/Apr/13 ]

We might be able to reproduce the crashing workload on another machine which is currently under installation.
I am currently on vacation, but will try that next week when I'm back.

Comment by Alexandre Louvet [ 15/May/13 ]

Quick feedback from the site: with the proposed patch, the system deadlocked almost immediately.

Comment by Bruno Faccini (Inactive) [ 16/May/13 ]

Hello Alex,

Sorry about that, but do you have more info that may help me understand what is going wrong ?? Is there any crash-dump available, or any other info on this dead-lock ?? Did you or the on-site people already do some troubleshooting ??

Also, which patch/change did you integrate, #6089 or #6200 ??

Comment by Sebastien Piechurski [ 17/May/13 ]

Hi Bruno,

I think the package we tried included both #6089 and #6200.
Looking into the details, and given that #6200 was rejected, I understand that these were not supposed to be applied together anyway, am I right ?

There is a comment on #6089 saying that there is another window for the same kind of race. Will there be an update to this patch to close this window ?

Comment by Bruno Faccini (Inactive) [ 17/May/13 ]

Hello Seb,

And thank you for the update.

Yes, you are right, and maybe I did not mention it explicitly enough, but #6089 and #6200 are 2 separate fixes and are not intended to be used together.

About the other possible window for the race, it seems to exist, but in any case it appears less important than the 1st one, which you trigger and which is fixed by #6089. So can you give #6089 alone a try ??

BTW, how did you manage to integrate both patches, since #6200 avoids any use/declaration of lov_oinfo/loi, while #6089 uses it extensively ??!!

And again, many thanks for your help on this.

Comment by Sebastien Piechurski [ 21/May/13 ]

I am currently uploading the crash-dump taken after the deadlock on the version with both patches applied.
This is on the ftp server in uploads/LU-2797/deadlock-login0-2013-05-13-patches-6089+6200.tar.xz
You will see the hung processes' backtraces in the dmesg, but as the dump was taken after a failed reboot attempt, the processes had already been killed and had disappeared.

Comment by Bruno Faccini (Inactive) [ 24/May/13 ]

Hello Seb,
I spent some time debugging this last crash to understand what kind of deadlock you encountered.
As you stated, there were hung-process back-traces in the log, and they all had the same stack, like the following :

INFO: task get_bench.bull:18887 blocked for more than 120 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
get_bench.bul D 0000000000000016     0 18887  18828 0x00000080
 ffff88084eaafcc8 0000000000000082 0000000000000001 5a5a5a5a5a5a5a5a
 5a5a5a5a5a5a5a5a ffff88084eaafd48 ffff88084eaafc48 ffffffffa04df92e
 ffff88085ceb43e0 ffff88084eaaffd8 000000000000db00 ffff88085ceb43e0
Call Trace:
 [<ffffffffa04df92e>] ? cfs_free+0xe/0x10 [libcfs]
 [<ffffffff8117b997>] ? dput+0xc7/0x190
 [<ffffffff81487205>] rwsem_down_failed_common+0x95/0x1f0
 [<ffffffff81487383>] rwsem_down_write_failed+0x23/0x30
 [<ffffffff81262023>] call_rwsem_down_write_failed+0x13/0x20
 [<ffffffff81486832>] ? down_write+0x32/0x40
 [<ffffffffa0b2ee3b>] ll_setattr_raw+0x18b/0xf20 [lustre]
 [<ffffffffa0b2fc2f>] ll_setattr+0x5f/0x100 [lustre]
 [<ffffffff8117f4e8>] notify_change+0x168/0x340
 [<ffffffff81161343>] sys_fchmodat+0xc3/0x100
 [<ffffffff81161398>] sys_chmod+0x18/0x20
 [<ffffffff810030f2>] system_call_fastpath+0x16/0x1b

and they were no longer present at the time of the crash-dump. This seems to indicate that, since they should be un-interruptible at this point, the inode/ll_inode_info semaphore they were hung on became available; thus it was not really a dead-lock, and in any case the Lustre code involved there is far from being impacted by either patch.

BTW, in the crash-dump I also found that 11 "cat" processes had been hung for about 118s, all with the same stack-trace, like the following :

PID: 17234  TASK: ffff880867c87080  CPU: 2   COMMAND: "cat"
 #0 [ffff88086473f138] schedule at ffffffff81484c05
 #1 [ffff88086473f200] io_schedule at ffffffff814853e3
 #2 [ffff88086473f220] sync_page at ffffffff810fd4ad
 #3 [ffff88086473f230] __wait_on_bit_lock at ffffffff81485c2a
 #4 [ffff88086473f280] __lock_page at ffffffff810fd447
 #5 [ffff88086473f2e0] vvp_page_own at ffffffffa0b6220a [lustre]
 #6 [ffff88086473f300] cl_page_own0 at ffffffffa05f80bb [obdclass]
 #7 [ffff88086473f350] cl_page_own at ffffffffa05f8330 [obdclass]
 #8 [ffff88086473f360] check_and_discard_cb at ffffffffa05fe9ff [obdclass]
 #9 [ffff88086473f3b0] cl_page_gang_lookup at ffffffffa05f9994 [obdclass]
#10 [ffff88086473f460] cl_lock_page_out at ffffffffa05fb53b [obdclass]
#11 [ffff88086473f4d0] osc_lock_flush at ffffffffa0a0177f [osc]
#12 [ffff88086473f520] osc_lock_cancel at ffffffffa0a01819 [osc]
#13 [ffff88086473f570] cl_lock_cancel0 at ffffffffa05fa1c5 [obdclass]
#14 [ffff88086473f5a0] cl_lock_cancel at ffffffffa05fafeb [obdclass]
#15 [ffff88086473f5c0] osc_ldlm_blocking_ast at ffffffffa0a02f7a [osc]
#16 [ffff88086473f630] ldlm_cancel_callback at ffffffffa07031f0 [ptlrpc]
#17 [ffff88086473f650] ldlm_cli_cancel_local at ffffffffa071d0cb [ptlrpc]
#18 [ffff88086473f680] ldlm_cli_cancel_list_local at ffffffffa071f9bd [ptlrpc]
#19 [ffff88086473f6e0] ldlm_cancel_resource_local at ffffffffa071fc62 [ptlrpc]
#20 [ffff88086473f750] osc_destroy at ffffffffa09ec44f [osc]
#21 [ffff88086473f800] lov_destroy at ffffffffa0a52cad [lov]
#22 [ffff88086473f8f0] ll_objects_destroy at ffffffffa0b447b7 [lustre]
#23 [ffff88086473fa20] ll_close_inode_openhandle at ffffffffa0b142a1 [lustre]
#24 [ffff88086473faa0] ll_md_real_close at ffffffffa0b1514a [lustre]
#25 [ffff88086473fad0] ll_md_close at ffffffffa0b15425 [lustre]
#26 [ffff88086473fb80] ll_file_release at ffffffffa0b15a35 [lustre]
#27 [ffff88086473fbd0] __fput at ffffffff81164a85
#28 [ffff88086473fc20] fput at ffffffff81164bc5
#29 [ffff88086473fc30] filp_close at ffffffff8116061d
#30 [ffff88086473fc60] put_files_struct at ffffffff8105732f
#31 [ffff88086473fcb0] exit_files at ffffffff810573f3
#32 [ffff88086473fce0] do_exit at ffffffff81059475
#33 [ffff88086473fd60] do_group_exit at ffffffff81059bc8
#34 [ffff88086473fd90] get_signal_to_deliver at ffffffff8106eec6
#35 [ffff88086473fe30] do_signal at ffffffff810022d5
#36 [ffff88086473ff30] do_notify_resume at ffffffff81002af0
#37 [ffff88086473ff50] int_signal at ffffffff810033c1
    RIP: 00000032bcad83f0  RSP: 00007fff228120a8  RFLAGS: 00000246
    RAX: fffffffffffffffc  RBX: 0000000000200000  RCX: ffffffffffffffff
    RDX: 0000000000200000  RSI: 00007fbf6c365000  RDI: 0000000000000003
    RBP: 00007fbf6c365000   R8: 00000000ffffffff   R9: 0000000000000000
    R10: 0000000000200fff  R11: 0000000000000246  R12: 0000000000000003
    R13: ffffffffffff8000  R14: 0000000000200000  R15: 00007fbf6c365000
    ORIG_RAX: 0000000000000000  CS: 0033  SS: 002b

and also, hung for about 112s, the Lustre umount process that was launched as part of the shutdown :

PID: 24287  TASK: ffff8810414317d0  CPU: 17  COMMAND: "umount"
 #0 [ffff88105cd07418] schedule at ffffffff81484c05
 #1 [ffff88105cd074e0] io_schedule at ffffffff814853e3
 #2 [ffff88105cd07500] sync_page at ffffffff810fd4ad
 #3 [ffff88105cd07510] __wait_on_bit_lock at ffffffff81485c2a
 #4 [ffff88105cd07560] __lock_page at ffffffff810fd447
 #5 [ffff88105cd075c0] vvp_page_own at ffffffffa0b6220a [lustre]
 #6 [ffff88105cd075e0] cl_page_own0 at ffffffffa05f80bb [obdclass]
 #7 [ffff88105cd07630] cl_page_own at ffffffffa05f8330 [obdclass]
 #8 [ffff88105cd07640] check_and_discard_cb at ffffffffa05fe9ff [obdclass]
 #9 [ffff88105cd07690] cl_page_gang_lookup at ffffffffa05f9994 [obdclass]
#10 [ffff88105cd07740] cl_lock_page_out at ffffffffa05fb53b [obdclass]
#11 [ffff88105cd077b0] osc_lock_flush at ffffffffa0a0177f [osc]
#12 [ffff88105cd07800] osc_lock_cancel at ffffffffa0a01819 [osc]
#13 [ffff88105cd07850] cl_lock_cancel0 at ffffffffa05fa1c5 [obdclass]
#14 [ffff88105cd07880] cl_lock_cancel at ffffffffa05fafeb [obdclass]
#15 [ffff88105cd078a0] osc_ldlm_blocking_ast at ffffffffa0a02f7a [osc]
#16 [ffff88105cd07910] ldlm_cancel_callback at ffffffffa07031f0 [ptlrpc]
#17 [ffff88105cd07930] ldlm_cli_cancel_local at ffffffffa071d0cb [ptlrpc]
#18 [ffff88105cd07960] ldlm_cli_cancel at ffffffffa0720f78 [ptlrpc]
#19 [ffff88105cd079a0] cleanup_resource at ffffffffa070ae18 [ptlrpc]
#20 [ffff88105cd07a00] ldlm_resource_clean at ffffffffa070afda [ptlrpc]
#21 [ffff88105cd07a20] cfs_hash_for_each_relax at ffffffffa04ef28f [libcfs]
#22 [ffff88105cd07ab0] cfs_hash_for_each_nolock at ffffffffa04f0caf [libcfs]
#23 [ffff88105cd07ae0] ldlm_namespace_cleanup at ffffffffa0707a69 [ptlrpc]
#24 [ffff88105cd07b00] osc_import_event at ffffffffa09f8f02 [osc]
#25 [ffff88105cd07b60] ptlrpc_invalidate_import at ffffffffa076c0f0 [ptlrpc]
#26 [ffff88105cd07c20] ptlrpc_set_import_active at ffffffffa074175d [ptlrpc]
#27 [ffff88105cd07c50] osc_iocontrol at ffffffffa09f45e0 [osc]
#28 [ffff88105cd07d00] lov_iocontrol at ffffffffa0a5a82f [lov]
#29 [ffff88105cd07ec0] ll_umount_begin at ffffffffa0b29520 [lustre]
#30 [ffff88105cd07ef0] sys_umount at ffffffff8118322a
#31 [ffff88105cd07f80] system_call_fastpath at ffffffff810030f2
    RIP: 00007f52e77837a7  RSP: 00007fffc0f25188  RFLAGS: 00010206
    RAX: 00000000000000a6  RBX: ffffffff810030f2  RCX: 0000000000000010
    RDX: 0000000000000000  RSI: 0000000000000001  RDI: 00007f52e8c75a00
    RBP: 00007f52e8c759e0   R8: 00007f52e8c75a20   R9: 0000000000000001
    R10: 00007fffc0f24fb0  R11: 0000000000000206  R12: 0000000000000000
    R13: 0000000000000000  R14: 0000000000000000  R15: 00007f52e8c75a60
    ORIG_RAX: 00000000000000a6  CS: 0033  SS: 002b

These last stacks seem to indicate that the Lustre page-flushing mechanism was slow (hung ?), but again the Lustre code involved there is far from being impacted by either patch.

On the other hand, do you have any news about the exposure of change #6089 alone to the production work-load ??

Comment by Sebastien Piechurski [ 24/May/13 ]

Hi Bruno,

The workload was exercised for a whole night without a crash.
We can consider the bug fixed.

Thanks for the work !
