Details
-
Bug
-
Resolution: Fixed
-
Blocker
-
Lustre 2.4.0
-
3
-
5435
Description
It appears that we are having some sort of mdc request leak problem on a client.
The easiest way I see to trigger it is to run racer in a loop; eventually it fails to clean up due to the mdc still being referenced.
Finally I traced it to some open requests (I think, since replay_cb is set to mdc_replay_open) emanating from rm (so I guess those are actually opendir requests).
(gdb) p *(struct ptlrpc_request *)0xffff880099b38bf0 $1 = {rq_type = 4711, rq_status = 301, rq_list = {next = 0xffff880099b38bf8, prev = 0xffff880099b38bf8}, rq_timed_list = {next = 0xffff880099b38c08, prev = 0xffff880099b38c08}, rq_history_list = {next = 0xffff880099b38c18, prev = 0xffff880099b38c18}, rq_exp_list = {next = 0xffff880099b38c28, prev = 0xffff880099b38c28}, rq_ops = 0x0, rq_svc_thread = 0x0, rq_history_seq = 0, rq_at_index = 0, rq_lock = {raw_lock = {slock = 0}, magic = 3735899821, owner_cpu = 4294967295, owner = 0xffffffffffffffff}, rq_intr = 0, rq_replied = 1, rq_err = 0, rq_timedout = 0, rq_resend = 0, rq_restart = 0, rq_replay = 1, rq_no_resend = 0, rq_waiting = 0, rq_receiving_reply = 0, rq_no_delay = 0, rq_net_err = 0, rq_wait_ctx = 0, rq_early = 0, rq_must_unlink = 0, rq_fake = 0, rq_memalloc = 0, rq_packed_final = 0, rq_hp = 0, rq_at_linked = 0, rq_reply_truncate = 0, rq_committed = 0, rq_invalid_rqset = 0, rq_generation_set = 0, rq_no_retry_einprogress = 0, rq_nr_resend = 0, rq_phase = 3955285508, rq_next_phase = 3955285510, rq_refcount = {counter = 1}, rq_request_portal = 12, rq_reply_portal = 10, rq_nob_received = 544, rq_reqlen = 576, rq_replen = 544, rq_reqmsg = 0xffff8800dd315bf0, rq_repmsg = 0xffff88009a7d58b0, rq_transno = 0, rq_xid = 1417687094104231, rq_replay_list = {next = 0xffff880099b38cb8, prev = 0xffff880099b38cb8}, rq_cli_ctx = 0xffffffffa12b5fc0, rq_svc_ctx = 0x0, rq_ctx_chain = { next = 0xffff880099b38cd8, prev = 0xffff880099b38cd8}, rq_flvr = { sf_rpc = 0, sf_flags = 0, u_rpc = {<No data fields>}, u_bulk = {hash = { hash_alg = 0 '\000'}}}, rq_sp_from = LUSTRE_SP_CLI, rq_ctx_init = 0, ---Type <return> to continue, or q <return> to quit--- rq_ctx_fini = 0, rq_bulk_read = 0, rq_bulk_write = 0, rq_auth_gss = 0, rq_auth_remote = 0, rq_auth_usr_root = 0, rq_auth_usr_mdt = 0, rq_auth_usr_ost = 0, rq_pack_udesc = 0, rq_pack_bulk = 0, rq_no_reply = 0, rq_pill_init = 1, rq_auth_uid = 0, rq_auth_mapped_uid = 0, rq_user_desc = 0x0, 
rq_reqbuf = 0xffff8800dd315bf0, rq_repbuf = 0xffff88009a7d57f0 "", rq_repdata = 0xffff88009a7d58b0, rq_clrbuf = 0x0, rq_reqbuf_len = 1024, rq_reqdata_len = 576, rq_repbuf_len = 2048, rq_repdata_len = 544, rq_clrbuf_len = 0, rq_clrdata_len = 0, rq_reply_off = 192, rq_req_swab_mask = 0, rq_rep_swab_mask = 0, rq_import_generation = 1, rq_send_state = LUSTRE_IMP_FULL, rq_early_count = 0, rq_req_md_h = { cookie = 91348489}, rq_req_cbid = { cbid_fn = 0xffffffffa11f07f0 <request_out_callback>, cbid_arg = 0xffff880099b38bf0}, rq_delay_limit = 0, rq_queued_time = 4296236347, rq_arrival_time = {tv_sec = 1352013601, tv_usec = 266507}, rq_reply_state = 0x0, rq_rqbd = 0x0, rq_reply_md_h = { cookie = 91348481}, rq_reply_waitq = {lock = {raw_lock = {slock = 0}, magic = 3735899821, owner_cpu = 4294967295, owner = 0xffffffffffffffff}, task_list = {next = 0xffff880099b38dc8, prev = 0xffff880099b38dc8}}, rq_reply_cbid = {cbid_fn = 0xffffffffa11f1390 <reply_in_callback>, cbid_arg = 0xffff880099b38bf0}, rq_self = 0, rq_peer = {nid = 0, pid = 0}, rq_export = 0x0, rq_import = 0xffff8801e78dd7f0, rq_replay_cb = 0xffffffffa13cfc20 <mdc_replay_open>, rq_commit_cb = 0, ---Type <return> to continue, or q <return> to quit--- rq_cb_data = 0x0, rq_bulk = 0x0, rq_sent = 1352013601, rq_real_sent = 1352013601, rq_deadline = 1352013645, rq_reply_deadline = 0, rq_bulk_deadline = 0, rq_timeout = 43, rq_set_waitq = {lock = {raw_lock = { slock = 0}, magic = 3735899821, owner_cpu = 4294967295, owner = 0xffffffffffffffff}, task_list = {next = 0xffff880099b38e78, prev = 0xffff880099b38e78}}, rq_set_chain = {next = 0xffff880099b38e88, prev = 0xffff880099b38e88}, rq_set = 0x0, rq_interpret_reply = 0, rq_async_args = {pointer_arg = {0x0 <repeats 11 times>}, space = {0, 0, 0, 0, 0, 0, 0}}, rq_pool = 0x0, rq_session = {lc_tags = 0, lc_state = 0, lc_thread = 0x0, lc_value = 0x0, lc_remember = {next = 0x0, prev = 0x0}, lc_version = 0, lc_cookie = 0}, rq_recov_session = {lc_tags = 0, lc_state = 0, lc_thread = 0x0, 
lc_value = 0x0, lc_remember = {next = 0x0, prev = 0x0}, lc_version = 0, lc_cookie = 0}, rq_pill = { rc_req = 0xffff880099b38bf0, rc_fmt = 0xffffffffa128e5e0, rc_loc = RCL_CLIENT, rc_area = {{184, 104, 8, 136, 0, 0, 2, 72, 4294967295}, {184, 112, 216, 56, 260, 120, 120, 4294967295, 4294967295}}}}
This is already post-cleanup:
[root@rhel6 tests]# ../utils/lctl dl 17 ST mdc lustre-MDT0000-mdc-ffff8801fbd8abf0 0e1d968f-d661-3c7e-0982-22c28b7e3db2 2 24 ST mdc lustre-MDT0000-mdc-ffff8802349e9bf0 d165ded5-d083-219f-81f9-a68115593bdc 2 [root@rhel6 tests]# ls /proc/fs/lustre/mdc/ num_refs [root@rhel6 tests]#