<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:23:50 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-2275] Open request leak</title>
                <link>https://jira.whamcloud.com/browse/LU-2275</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;It appears that we are having some sort of mdc request leak problem on a client.&lt;/p&gt;

&lt;p&gt;The most easy way I see to trigger it is to run racer in a loop, eventually it fails to cleanup due to mdc still being referenced.&lt;/p&gt;

&lt;p&gt;Finally I traced it to some (I think, due to replay_cb set at mdc_replay_open) open requests emanating from rm (so I guess those are actually opendir).&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;(gdb) p *(struct ptlrpc_request *)0xffff880099b38bf0
$1 = {rq_type = 4711, rq_status = 301, rq_list = {next = 0xffff880099b38bf8, 
    prev = 0xffff880099b38bf8}, rq_timed_list = {next = 0xffff880099b38c08, 
    prev = 0xffff880099b38c08}, rq_history_list = {next = 0xffff880099b38c18, 
    prev = 0xffff880099b38c18}, rq_exp_list = {next = 0xffff880099b38c28, 
    prev = 0xffff880099b38c28}, rq_ops = 0x0, rq_svc_thread = 0x0, 
  rq_history_seq = 0, rq_at_index = 0, rq_lock = {raw_lock = {slock = 0}, 
    magic = 3735899821, owner_cpu = 4294967295, owner = 0xffffffffffffffff}, 
  rq_intr = 0, rq_replied = 1, rq_err = 0, rq_timedout = 0, rq_resend = 0, 
  rq_restart = 0, rq_replay = 1, rq_no_resend = 0, rq_waiting = 0, 
  rq_receiving_reply = 0, rq_no_delay = 0, rq_net_err = 0, rq_wait_ctx = 0, 
  rq_early = 0, rq_must_unlink = 0, rq_fake = 0, rq_memalloc = 0, 
  rq_packed_final = 0, rq_hp = 0, rq_at_linked = 0, rq_reply_truncate = 0, 
  rq_committed = 0, rq_invalid_rqset = 0, rq_generation_set = 0, 
  rq_no_retry_einprogress = 0, rq_nr_resend = 0, rq_phase = 3955285508, 
  rq_next_phase = 3955285510, rq_refcount = {counter = 1}, 
  rq_request_portal = 12, rq_reply_portal = 10, rq_nob_received = 544, 
  rq_reqlen = 576, rq_replen = 544, rq_reqmsg = 0xffff8800dd315bf0, 
  rq_repmsg = 0xffff88009a7d58b0, rq_transno = 0, rq_xid = 1417687094104231, 
  rq_replay_list = {next = 0xffff880099b38cb8, prev = 0xffff880099b38cb8}, 
  rq_cli_ctx = 0xffffffffa12b5fc0, rq_svc_ctx = 0x0, rq_ctx_chain = {
    next = 0xffff880099b38cd8, prev = 0xffff880099b38cd8}, rq_flvr = {
    sf_rpc = 0, sf_flags = 0, u_rpc = {&amp;lt;No data fields&amp;gt;}, u_bulk = {hash = {
        hash_alg = 0 &apos;\000&apos;}}}, rq_sp_from = LUSTRE_SP_CLI, rq_ctx_init = 0, 
---Type &amp;lt;return&amp;gt; to continue, or q &amp;lt;return&amp;gt; to quit---
  rq_ctx_fini = 0, rq_bulk_read = 0, rq_bulk_write = 0, rq_auth_gss = 0, 
  rq_auth_remote = 0, rq_auth_usr_root = 0, rq_auth_usr_mdt = 0, 
  rq_auth_usr_ost = 0, rq_pack_udesc = 0, rq_pack_bulk = 0, rq_no_reply = 0, 
  rq_pill_init = 1, rq_auth_uid = 0, rq_auth_mapped_uid = 0, 
  rq_user_desc = 0x0, rq_reqbuf = 0xffff8800dd315bf0, 
  rq_repbuf = 0xffff88009a7d57f0 &quot;&quot;, rq_repdata = 0xffff88009a7d58b0, 
  rq_clrbuf = 0x0, rq_reqbuf_len = 1024, rq_reqdata_len = 576, 
  rq_repbuf_len = 2048, rq_repdata_len = 544, rq_clrbuf_len = 0, 
  rq_clrdata_len = 0, rq_reply_off = 192, rq_req_swab_mask = 0, 
  rq_rep_swab_mask = 0, rq_import_generation = 1, 
  rq_send_state = LUSTRE_IMP_FULL, rq_early_count = 0, rq_req_md_h = {
    cookie = 91348489}, rq_req_cbid = {
    cbid_fn = 0xffffffffa11f07f0 &amp;lt;request_out_callback&amp;gt;, 
    cbid_arg = 0xffff880099b38bf0}, rq_delay_limit = 0, 
  rq_queued_time = 4296236347, rq_arrival_time = {tv_sec = 1352013601, 
    tv_usec = 266507}, rq_reply_state = 0x0, rq_rqbd = 0x0, rq_reply_md_h = {
    cookie = 91348481}, rq_reply_waitq = {lock = {raw_lock = {slock = 0}, 
      magic = 3735899821, owner_cpu = 4294967295, owner = 0xffffffffffffffff}, 
    task_list = {next = 0xffff880099b38dc8, prev = 0xffff880099b38dc8}}, 
  rq_reply_cbid = {cbid_fn = 0xffffffffa11f1390 &amp;lt;reply_in_callback&amp;gt;, 
    cbid_arg = 0xffff880099b38bf0}, rq_self = 0, rq_peer = {nid = 0, pid = 0}, 
  rq_export = 0x0, rq_import = 0xffff8801e78dd7f0, 
  rq_replay_cb = 0xffffffffa13cfc20 &amp;lt;mdc_replay_open&amp;gt;, rq_commit_cb = 0, 
---Type &amp;lt;return&amp;gt; to continue, or q &amp;lt;return&amp;gt; to quit---
  rq_cb_data = 0x0, rq_bulk = 0x0, rq_sent = 1352013601, 
  rq_real_sent = 1352013601, rq_deadline = 1352013645, rq_reply_deadline = 0, 
  rq_bulk_deadline = 0, rq_timeout = 43, rq_set_waitq = {lock = {raw_lock = {
        slock = 0}, magic = 3735899821, owner_cpu = 4294967295, 
      owner = 0xffffffffffffffff}, task_list = {next = 0xffff880099b38e78, 
      prev = 0xffff880099b38e78}}, rq_set_chain = {next = 0xffff880099b38e88, 
    prev = 0xffff880099b38e88}, rq_set = 0x0, rq_interpret_reply = 0, 
  rq_async_args = {pointer_arg = {0x0 &amp;lt;repeats 11 times&amp;gt;}, space = {0, 0, 0, 
      0, 0, 0, 0}}, rq_pool = 0x0, rq_session = {lc_tags = 0, lc_state = 0, 
    lc_thread = 0x0, lc_value = 0x0, lc_remember = {next = 0x0, prev = 0x0}, 
    lc_version = 0, lc_cookie = 0}, rq_recov_session = {lc_tags = 0, 
    lc_state = 0, lc_thread = 0x0, lc_value = 0x0, lc_remember = {next = 0x0, 
      prev = 0x0}, lc_version = 0, lc_cookie = 0}, rq_pill = {
    rc_req = 0xffff880099b38bf0, rc_fmt = 0xffffffffa128e5e0, 
    rc_loc = RCL_CLIENT, rc_area = {{184, 104, 8, 136, 0, 0, 2, 72, 
        4294967295}, {184, 112, 216, 56, 260, 120, 120, 4294967295, 
        4294967295}}}}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This is already post-cleanup:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@rhel6 tests]# ../utils/lctl dl
 17 ST mdc lustre-MDT0000-mdc-ffff8801fbd8abf0 0e1d968f-d661-3c7e-0982-22c28b7e3db2 2
 24 ST mdc lustre-MDT0000-mdc-ffff8802349e9bf0 d165ded5-d083-219f-81f9-a68115593bdc 2
[root@rhel6 tests]# ls /proc/fs/lustre/mdc/
num_refs
[root@rhel6 tests]# 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="16565">LU-2275</key>
            <summary>Open request leak</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="green">Oleg Drokin</assignee>
                                    <reporter username="green">Oleg Drokin</reporter>
                        <labels>
                            <label>LB</label>
                    </labels>
                <created>Sun, 4 Nov 2012 03:02:18 +0000</created>
                <updated>Mon, 18 Feb 2013 13:04:47 +0000</updated>
                            <resolved>Mon, 18 Feb 2013 13:04:47 +0000</resolved>
                                    <version>Lustre 2.4.0</version>
                                    <fixVersion>Lustre 2.4.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>4</watches>
                                                                            <comments>
                            <comment id="47374" author="green" created="Sun, 4 Nov 2012 03:07:22 +0000"  >&lt;p&gt;The leaked requests were allocated on the intent_file_open path:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;    [&amp;lt;ffffffffa11dc0e7&amp;gt;] ptlrpc_request_alloc_internal+0xd7/0x330 [ptlrpc]
    [&amp;lt;ffffffffa11dc363&amp;gt;] ptlrpc_request_alloc+0x13/0x20 [ptlrpc]
    [&amp;lt;ffffffffa13e34ba&amp;gt;] mdc_enqueue+0x2fa/0x15b0 [mdc]
    [&amp;lt;ffffffffa13e4993&amp;gt;] mdc_intent_lock+0x223/0x5a8 [mdc]
    [&amp;lt;ffffffffa139f8f0&amp;gt;] lmv_intent_open+0x2d0/0x10d0 [lmv]
    [&amp;lt;ffffffffa13a09a3&amp;gt;] lmv_intent_lock+0x2b3/0x380 [lmv]
    [&amp;lt;ffffffffa0858fa6&amp;gt;] ll_intent_file_open+0x196/0x8d0 [lustre]
    [&amp;lt;ffffffffa085a140&amp;gt;] ll_file_open+0x230/0xc50 [lustre]
    [&amp;lt;ffffffffa08421db&amp;gt;] ll_dir_open+0xdb/0xf0 [lustre]
    [&amp;lt;ffffffff8117ae9a&amp;gt;] __dentry_open+0x10a/0x3e0
    [&amp;lt;ffffffff8117b284&amp;gt;] nameidata_to_filp+0x54/0x70
    [&amp;lt;ffffffff8118e5b0&amp;gt;] do_filp_open+0x6c0/0xd90
    [&amp;lt;ffffffff8117ac49&amp;gt;] do_sys_open+0x69/0x140
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="47377" author="green" created="Sun, 4 Nov 2012 21:18:12 +0000"  >&lt;p&gt;The problem seems to stem from the fact that the server returns a request that is both lookup-positive and lookup-negative. It&apos;s also enqueue aborted, but open lock returned which is doubly contradictory too.&lt;/p&gt;

&lt;p&gt;Client is not prepared for such a turn of events and leaks the request, as it does not expect to have an open reference if the lookup was negative.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000002:00100000:3.0:1352078051.109700:0:2913:0:(mdc_locks.c:523:mdc_finish_enqueue()) @@@ op: 1 disposition: 200002f, status: 0  req@ffff8801688bebf0 x1417754181011236/t0(0) o101-&amp;gt;lustre-MDT0000-mdc-ffff8801d5fa9bf0@0@lo:12/10 lens 576/544 e 0 to 0 dl 1352078082 ref 1 fl Complete:RP/0/0 rc 301/301
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Now why is this happening I don&apos;t have any idea yet, but still digging&lt;/p&gt;</comment>
                            <comment id="47379" author="green" created="Mon, 5 Nov 2012 00:35:09 +0000"  >&lt;p&gt;I think I got to the root of it.&lt;/p&gt;

&lt;p&gt;A bunch of races in mdt_finish_open and mdt_open_by_fid_lock could lead to a situation where DISP_LOOKUP_POS and DISP_OPEN_OPEN could be set, and then later code decides that it lost some sort of a race and still returns a negative lookup result. In the end the client will get the above-mentioned request and leak it.&lt;/p&gt;

&lt;p&gt;Patch to fix the leak on client when a nonsensical request is encountered: &lt;a href=&quot;http://review.whamcloud.com/4458&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/4458&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Patch to try and eliminate races leading to nonsensical request sending: &lt;a href=&quot;http://review.whamcloud.com/4459&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/4459&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Misc patch to cleanup error exit path in class_newdev() that I noticed early on doing mdc refcount code review: &lt;a href=&quot;http://review.whamcloud.com/4460&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/4460&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="52644" author="pjones" created="Mon, 18 Feb 2013 13:04:47 +0000"  >&lt;p&gt;All three patches have landed for 2.4&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="16559">LU-2270</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="15605">LU-1793</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvbof:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>5435</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>