Details

    • Bug
    • Resolution: Fixed
    • Blocker
    • Lustre 2.4.0
    • Lustre 2.4.0
    • 3
    • 5435

    Description

      It appears that we are having some sort of mdc request leak problem on a client.

      The easiest way I have found to trigger it is to run racer in a loop; eventually it fails to clean up because the mdc is still being referenced.

      I finally traced it to some requests emanating from rm. I think they are open requests, because rq_replay_cb is set to mdc_replay_open, so I guess those are actually opendir calls.

      (gdb) p *(struct ptlrpc_request *)0xffff880099b38bf0
      $1 = {rq_type = 4711, rq_status = 301, rq_list = {next = 0xffff880099b38bf8, 
          prev = 0xffff880099b38bf8}, rq_timed_list = {next = 0xffff880099b38c08, 
          prev = 0xffff880099b38c08}, rq_history_list = {next = 0xffff880099b38c18, 
          prev = 0xffff880099b38c18}, rq_exp_list = {next = 0xffff880099b38c28, 
          prev = 0xffff880099b38c28}, rq_ops = 0x0, rq_svc_thread = 0x0, 
        rq_history_seq = 0, rq_at_index = 0, rq_lock = {raw_lock = {slock = 0}, 
          magic = 3735899821, owner_cpu = 4294967295, owner = 0xffffffffffffffff}, 
        rq_intr = 0, rq_replied = 1, rq_err = 0, rq_timedout = 0, rq_resend = 0, 
        rq_restart = 0, rq_replay = 1, rq_no_resend = 0, rq_waiting = 0, 
        rq_receiving_reply = 0, rq_no_delay = 0, rq_net_err = 0, rq_wait_ctx = 0, 
        rq_early = 0, rq_must_unlink = 0, rq_fake = 0, rq_memalloc = 0, 
        rq_packed_final = 0, rq_hp = 0, rq_at_linked = 0, rq_reply_truncate = 0, 
        rq_committed = 0, rq_invalid_rqset = 0, rq_generation_set = 0, 
        rq_no_retry_einprogress = 0, rq_nr_resend = 0, rq_phase = 3955285508, 
        rq_next_phase = 3955285510, rq_refcount = {counter = 1}, 
        rq_request_portal = 12, rq_reply_portal = 10, rq_nob_received = 544, 
        rq_reqlen = 576, rq_replen = 544, rq_reqmsg = 0xffff8800dd315bf0, 
        rq_repmsg = 0xffff88009a7d58b0, rq_transno = 0, rq_xid = 1417687094104231, 
        rq_replay_list = {next = 0xffff880099b38cb8, prev = 0xffff880099b38cb8}, 
        rq_cli_ctx = 0xffffffffa12b5fc0, rq_svc_ctx = 0x0, rq_ctx_chain = {
          next = 0xffff880099b38cd8, prev = 0xffff880099b38cd8}, rq_flvr = {
          sf_rpc = 0, sf_flags = 0, u_rpc = {<No data fields>}, u_bulk = {hash = {
              hash_alg = 0 '\000'}}}, rq_sp_from = LUSTRE_SP_CLI, rq_ctx_init = 0, 
      ---Type <return> to continue, or q <return> to quit---
        rq_ctx_fini = 0, rq_bulk_read = 0, rq_bulk_write = 0, rq_auth_gss = 0, 
        rq_auth_remote = 0, rq_auth_usr_root = 0, rq_auth_usr_mdt = 0, 
        rq_auth_usr_ost = 0, rq_pack_udesc = 0, rq_pack_bulk = 0, rq_no_reply = 0, 
        rq_pill_init = 1, rq_auth_uid = 0, rq_auth_mapped_uid = 0, 
        rq_user_desc = 0x0, rq_reqbuf = 0xffff8800dd315bf0, 
        rq_repbuf = 0xffff88009a7d57f0 "", rq_repdata = 0xffff88009a7d58b0, 
        rq_clrbuf = 0x0, rq_reqbuf_len = 1024, rq_reqdata_len = 576, 
        rq_repbuf_len = 2048, rq_repdata_len = 544, rq_clrbuf_len = 0, 
        rq_clrdata_len = 0, rq_reply_off = 192, rq_req_swab_mask = 0, 
        rq_rep_swab_mask = 0, rq_import_generation = 1, 
        rq_send_state = LUSTRE_IMP_FULL, rq_early_count = 0, rq_req_md_h = {
          cookie = 91348489}, rq_req_cbid = {
          cbid_fn = 0xffffffffa11f07f0 <request_out_callback>, 
          cbid_arg = 0xffff880099b38bf0}, rq_delay_limit = 0, 
        rq_queued_time = 4296236347, rq_arrival_time = {tv_sec = 1352013601, 
          tv_usec = 266507}, rq_reply_state = 0x0, rq_rqbd = 0x0, rq_reply_md_h = {
          cookie = 91348481}, rq_reply_waitq = {lock = {raw_lock = {slock = 0}, 
            magic = 3735899821, owner_cpu = 4294967295, owner = 0xffffffffffffffff}, 
          task_list = {next = 0xffff880099b38dc8, prev = 0xffff880099b38dc8}}, 
        rq_reply_cbid = {cbid_fn = 0xffffffffa11f1390 <reply_in_callback>, 
          cbid_arg = 0xffff880099b38bf0}, rq_self = 0, rq_peer = {nid = 0, pid = 0}, 
        rq_export = 0x0, rq_import = 0xffff8801e78dd7f0, 
        rq_replay_cb = 0xffffffffa13cfc20 <mdc_replay_open>, rq_commit_cb = 0, 
      ---Type <return> to continue, or q <return> to quit---
        rq_cb_data = 0x0, rq_bulk = 0x0, rq_sent = 1352013601, 
        rq_real_sent = 1352013601, rq_deadline = 1352013645, rq_reply_deadline = 0, 
        rq_bulk_deadline = 0, rq_timeout = 43, rq_set_waitq = {lock = {raw_lock = {
              slock = 0}, magic = 3735899821, owner_cpu = 4294967295, 
            owner = 0xffffffffffffffff}, task_list = {next = 0xffff880099b38e78, 
            prev = 0xffff880099b38e78}}, rq_set_chain = {next = 0xffff880099b38e88, 
          prev = 0xffff880099b38e88}, rq_set = 0x0, rq_interpret_reply = 0, 
        rq_async_args = {pointer_arg = {0x0 <repeats 11 times>}, space = {0, 0, 0, 
            0, 0, 0, 0}}, rq_pool = 0x0, rq_session = {lc_tags = 0, lc_state = 0, 
          lc_thread = 0x0, lc_value = 0x0, lc_remember = {next = 0x0, prev = 0x0}, 
          lc_version = 0, lc_cookie = 0}, rq_recov_session = {lc_tags = 0, 
          lc_state = 0, lc_thread = 0x0, lc_value = 0x0, lc_remember = {next = 0x0, 
            prev = 0x0}, lc_version = 0, lc_cookie = 0}, rq_pill = {
          rc_req = 0xffff880099b38bf0, rc_fmt = 0xffffffffa128e5e0, 
          rc_loc = RCL_CLIENT, rc_area = {{184, 104, 8, 136, 0, 0, 2, 72, 
              4294967295}, {184, 112, 216, 56, 260, 120, 120, 4294967295, 
              4294967295}}}}
      

      This is already post-cleanup:

      [root@rhel6 tests]# ../utils/lctl dl
       17 ST mdc lustre-MDT0000-mdc-ffff8801fbd8abf0 0e1d968f-d661-3c7e-0982-22c28b7e3db2 2
       24 ST mdc lustre-MDT0000-mdc-ffff8802349e9bf0 d165ded5-d083-219f-81f9-a68115593bdc 2
      [root@rhel6 tests]# ls /proc/fs/lustre/mdc/
      num_refs
      [root@rhel6 tests]# 
      

      Attachments

        Issue Links

          Activity

            People

              green Oleg Drokin
              green Oleg Drokin
              Votes:
              0 Vote for this issue
              Watchers:
              4 Start watching this issue

              Dates

                Created:
                Updated:
                Resolved: