Details
-
Bug
-
Resolution: Fixed
-
Blocker
-
Lustre 2.4.0
-
3
-
5435
Description
It appears that we are having some sort of mdc request leak problem on a client.
The easiest way I see to trigger it is to run racer in a loop; eventually it fails to clean up because the mdc is still being referenced.
Finally I traced it to some open requests emanating from rm (I think so because replay_cb is set to mdc_replay_open — so I guess those are actually opendir requests).
(gdb) p *(struct ptlrpc_request *)0xffff880099b38bf0
$1 = {rq_type = 4711, rq_status = 301, rq_list = {next = 0xffff880099b38bf8,
prev = 0xffff880099b38bf8}, rq_timed_list = {next = 0xffff880099b38c08,
prev = 0xffff880099b38c08}, rq_history_list = {next = 0xffff880099b38c18,
prev = 0xffff880099b38c18}, rq_exp_list = {next = 0xffff880099b38c28,
prev = 0xffff880099b38c28}, rq_ops = 0x0, rq_svc_thread = 0x0,
rq_history_seq = 0, rq_at_index = 0, rq_lock = {raw_lock = {slock = 0},
magic = 3735899821, owner_cpu = 4294967295, owner = 0xffffffffffffffff},
rq_intr = 0, rq_replied = 1, rq_err = 0, rq_timedout = 0, rq_resend = 0,
rq_restart = 0, rq_replay = 1, rq_no_resend = 0, rq_waiting = 0,
rq_receiving_reply = 0, rq_no_delay = 0, rq_net_err = 0, rq_wait_ctx = 0,
rq_early = 0, rq_must_unlink = 0, rq_fake = 0, rq_memalloc = 0,
rq_packed_final = 0, rq_hp = 0, rq_at_linked = 0, rq_reply_truncate = 0,
rq_committed = 0, rq_invalid_rqset = 0, rq_generation_set = 0,
rq_no_retry_einprogress = 0, rq_nr_resend = 0, rq_phase = 3955285508,
rq_next_phase = 3955285510, rq_refcount = {counter = 1},
rq_request_portal = 12, rq_reply_portal = 10, rq_nob_received = 544,
rq_reqlen = 576, rq_replen = 544, rq_reqmsg = 0xffff8800dd315bf0,
rq_repmsg = 0xffff88009a7d58b0, rq_transno = 0, rq_xid = 1417687094104231,
rq_replay_list = {next = 0xffff880099b38cb8, prev = 0xffff880099b38cb8},
rq_cli_ctx = 0xffffffffa12b5fc0, rq_svc_ctx = 0x0, rq_ctx_chain = {
next = 0xffff880099b38cd8, prev = 0xffff880099b38cd8}, rq_flvr = {
sf_rpc = 0, sf_flags = 0, u_rpc = {<No data fields>}, u_bulk = {hash = {
hash_alg = 0 '\000'}}}, rq_sp_from = LUSTRE_SP_CLI, rq_ctx_init = 0,
---Type <return> to continue, or q <return> to quit---
rq_ctx_fini = 0, rq_bulk_read = 0, rq_bulk_write = 0, rq_auth_gss = 0,
rq_auth_remote = 0, rq_auth_usr_root = 0, rq_auth_usr_mdt = 0,
rq_auth_usr_ost = 0, rq_pack_udesc = 0, rq_pack_bulk = 0, rq_no_reply = 0,
rq_pill_init = 1, rq_auth_uid = 0, rq_auth_mapped_uid = 0,
rq_user_desc = 0x0, rq_reqbuf = 0xffff8800dd315bf0,
rq_repbuf = 0xffff88009a7d57f0 "", rq_repdata = 0xffff88009a7d58b0,
rq_clrbuf = 0x0, rq_reqbuf_len = 1024, rq_reqdata_len = 576,
rq_repbuf_len = 2048, rq_repdata_len = 544, rq_clrbuf_len = 0,
rq_clrdata_len = 0, rq_reply_off = 192, rq_req_swab_mask = 0,
rq_rep_swab_mask = 0, rq_import_generation = 1,
rq_send_state = LUSTRE_IMP_FULL, rq_early_count = 0, rq_req_md_h = {
cookie = 91348489}, rq_req_cbid = {
cbid_fn = 0xffffffffa11f07f0 <request_out_callback>,
cbid_arg = 0xffff880099b38bf0}, rq_delay_limit = 0,
rq_queued_time = 4296236347, rq_arrival_time = {tv_sec = 1352013601,
tv_usec = 266507}, rq_reply_state = 0x0, rq_rqbd = 0x0, rq_reply_md_h = {
cookie = 91348481}, rq_reply_waitq = {lock = {raw_lock = {slock = 0},
magic = 3735899821, owner_cpu = 4294967295, owner = 0xffffffffffffffff},
task_list = {next = 0xffff880099b38dc8, prev = 0xffff880099b38dc8}},
rq_reply_cbid = {cbid_fn = 0xffffffffa11f1390 <reply_in_callback>,
cbid_arg = 0xffff880099b38bf0}, rq_self = 0, rq_peer = {nid = 0, pid = 0},
rq_export = 0x0, rq_import = 0xffff8801e78dd7f0,
rq_replay_cb = 0xffffffffa13cfc20 <mdc_replay_open>, rq_commit_cb = 0,
---Type <return> to continue, or q <return> to quit---
rq_cb_data = 0x0, rq_bulk = 0x0, rq_sent = 1352013601,
rq_real_sent = 1352013601, rq_deadline = 1352013645, rq_reply_deadline = 0,
rq_bulk_deadline = 0, rq_timeout = 43, rq_set_waitq = {lock = {raw_lock = {
slock = 0}, magic = 3735899821, owner_cpu = 4294967295,
owner = 0xffffffffffffffff}, task_list = {next = 0xffff880099b38e78,
prev = 0xffff880099b38e78}}, rq_set_chain = {next = 0xffff880099b38e88,
prev = 0xffff880099b38e88}, rq_set = 0x0, rq_interpret_reply = 0,
rq_async_args = {pointer_arg = {0x0 <repeats 11 times>}, space = {0, 0, 0,
0, 0, 0, 0}}, rq_pool = 0x0, rq_session = {lc_tags = 0, lc_state = 0,
lc_thread = 0x0, lc_value = 0x0, lc_remember = {next = 0x0, prev = 0x0},
lc_version = 0, lc_cookie = 0}, rq_recov_session = {lc_tags = 0,
lc_state = 0, lc_thread = 0x0, lc_value = 0x0, lc_remember = {next = 0x0,
prev = 0x0}, lc_version = 0, lc_cookie = 0}, rq_pill = {
rc_req = 0xffff880099b38bf0, rc_fmt = 0xffffffffa128e5e0,
rc_loc = RCL_CLIENT, rc_area = {{184, 104, 8, 136, 0, 0, 2, 72,
4294967295}, {184, 112, 216, 56, 260, 120, 120, 4294967295,
4294967295}}}}
This is already post-cleanup:
[root@rhel6 tests]# ../utils/lctl dl 17 ST mdc lustre-MDT0000-mdc-ffff8801fbd8abf0 0e1d968f-d661-3c7e-0982-22c28b7e3db2 2 24 ST mdc lustre-MDT0000-mdc-ffff8802349e9bf0 d165ded5-d083-219f-81f9-a68115593bdc 2 [root@rhel6 tests]# ls /proc/fs/lustre/mdc/ num_refs [root@rhel6 tests]#