<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:53:44 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-12568] LNetError: 28086:0:(lib-move.c:2862:lnet_detach_rsp_tracker()) ASSERTION( rspt-&gt;rspt_cpt == cpt ) failed</title>
                <link>https://jira.whamcloud.com/browse/LU-12568</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;There is a use after free in the LNet response tracking code.&lt;/p&gt;

&lt;p&gt;If an MD is unlinked while it still has a non-zero refcount, its lnet_libhandle is invalidated so that all future handle lookups of the MD fail.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;/* must be called with lnet_res_lock held */
void
lnet_md_unlink(struct lnet_libmd *md)
{
        if ((md-&amp;gt;md_flags &amp;amp; LNET_MD_FLAG_ZOMBIE) == 0) {
                /* first unlink attempt... */
                struct lnet_me *me = md-&amp;gt;md_me;

                md-&amp;gt;md_flags |= LNET_MD_FLAG_ZOMBIE;

                /* Disassociate from ME (if any), and unlink it if it was created
                 * with LNET_UNLINK */
                if (me != NULL) {
                        /* detach MD from portal */
                        lnet_ptl_detach_md(me, md);
                        if (me-&amp;gt;me_unlink == LNET_UNLINK)
                                lnet_me_unlink(me);
                }

                /* ensure all future handle lookups fail */
                lnet_res_lh_invalidate(&amp;amp;md-&amp;gt;md_lh);
        }

        if (md-&amp;gt;md_refcount != 0) {
                CDEBUG(D_NET, &quot;Queueing unlink of md %p\n&quot;, md);
                return;
        }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

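&lt;p&gt;If a response tracker is attached to such an MD, then it is possible for the lnet_finalize_expired_responses loop to free the rspt before it has been detached from the MD: lnet_handle2md() returns NULL for the invalidated handle, so the loop takes the !md branch and frees the rspt while md-&amp;gt;md_rspt_ptr still points to it.&lt;/p&gt;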

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;static void
lnet_finalize_expired_responses(bool force)
{
&amp;lt;snip&amp;gt;
                       if (ktime_compare(ktime_get(), rspt-&amp;gt;rspt_deadline) &amp;gt;= 0 ||
                            force) {
                                struct lnet_peer_ni *lpni;
                                lnet_nid_t nid;

                                md = lnet_handle2md(&amp;amp;rspt-&amp;gt;rspt_mdh);
                                if (!md) {
                                        LNetInvalidateMDHandle(&amp;amp;rspt-&amp;gt;rspt_mdh);
                                        lnet_res_unlock(i);
                                        list_del_init(&amp;amp;rspt-&amp;gt;rspt_on_list);
                                        lnet_rspt_free(rspt, i);
                                        continue;
                                }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;When the final operation on the MD completes, the MD is detached from the lnet_msg and the response tracker (which has already been freed) is detached from the MD, at which point the assert can be tripped:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lnet_finalize()-&amp;gt;lnet_msg_detach_md()-&amp;gt;lnet_detach_rsp_tracker()
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;void
lnet_detach_rsp_tracker(struct lnet_libmd *md, int cpt)
{
        struct lnet_rsp_tracker *rspt;

        /*
         * msg has a refcount on the MD so the MD is not going away.
         * The rspt queue for the cpt is protected by
         * the lnet_net_lock(cpt). cpt is the cpt of the MD cookie.
         */
        if (!md-&amp;gt;md_rspt_ptr)
                return;

        rspt = md-&amp;gt;md_rspt_ptr;
        md-&amp;gt;md_rspt_ptr = NULL;

        /* debug code */
        LASSERT(rspt-&amp;gt;rspt_cpt == cpt);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="56453">LU-12568</key>
            <summary>LNetError: 28086:0:(lib-move.c:2862:lnet_detach_rsp_tracker()) ASSERTION( rspt-&gt;rspt_cpt == cpt ) failed</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="hornc">Chris Horn</assignee>
                                    <reporter username="hornc">Chris Horn</reporter>
                        <labels>
                    </labels>
                <created>Fri, 19 Jul 2019 19:11:47 +0000</created>
                <updated>Thu, 6 Feb 2020 15:56:55 +0000</updated>
                            <resolved>Thu, 15 Aug 2019 13:18:36 +0000</resolved>
                                                    <fixVersion>Lustre 2.13.0</fixVersion>
                    <fixVersion>Lustre 2.12.4</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>4</watches>
                                                                            <comments>
                            <comment id="251727" author="hornc" created="Fri, 19 Jul 2019 19:23:52 +0000"  >&lt;p&gt;These trace messages demonstrate the bug being hit:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000400:00000200:10.0:1563533406.667520:000000:10121:0:(lib-move.c:4788:LNetPut()) LNetPut msg ffff881e8f3f9800 -&amp;gt; 12345-60@gni4
00000400:00000200:10.0:1563533406.667521:000000:10121:0:(lib-move.c:4817:LNetPut()) attach rsp tracker to msg ffff881e8f3f9800 md ffff881ce9276b28
00000400:00000200:10.0:1563533406.667523:000000:10121:0:(lib-move.c:4670:lnet_attach_rsp_tracker()) attach new rspt ffff881ff06b5c40 to md ffff881ce9276b28 cpt 0
00000400:00000200:10.0:1563533406.667623:000000:10121:0:(lib-move.c:1891:lnet_handle_send()) msg ffff881e8f3f9800 rspt_next_hop_nid = 10.12.0.4@o2ib40
00000400:00000200:0.0:1563533408.740457:000000:28061:0:(lib-md.c:65:lnet_md_unlink()) Queueing unlink of md ffff881ce9276b28

00000400:00000200:4.0:1563533420.155422:000000:28096:0:(lib-move.c:2941:lnet_finalize_expired_responses()) Response timeout: md = ffff881d5caa6c38: nid = 10.12.0.4@o2ib40 &amp;lt;&amp;lt;&amp;lt;&amp;lt; lnet_finalize_expired_responses is executing after the MD was queued for unlink

00000400:00000100:14.0:1563533429.266380:000000:28086:0:(lib-move.c:908:lnet_post_send_locked()) Aborting msg ffff881e8f3f9800 for 12345-10.12.0.4@o2ib40: LNetM[DE]Unlink() already called on the MD/ME.
00000400:00000200:14.0:1563533429.299202:000000:28086:0:(lib-move.c:912:lnet_post_send_locked()) msg ffff881e8f3f9800 to 12345-10.12.0.4@o2ib40 canceled and will not be resent
00000400:00000200:14.0:1563533429.299204:000000:28086:0:(lib-msg.c:968:lnet_is_health_check()) Msg ffff881e8f3f9800 is in inconsistent state, don&apos;t perform health checking (-125, 0)
00000400:00000200:14.0:1563533429.299205:000000:28086:0:(lib-msg.c:973:lnet_is_health_check()) msg ffff881e8f3f9800 health check = 0, status = -125, hstatus = 0
00000400:00000200:14.0:1563533429.299209:000000:28086:0:(lib-msg.c:928:lnet_msg_detach_md()) Detach rsp tracker from msg ffff881e8f3f9800 md ffff881ce9276b28
00000400:00000200:14.0:1563533429.299210:000000:28086:0:(lib-move.c:2860:lnet_detach_rsp_tracker()) Detach rspt ffff881ff06b5c40 from md ffff881ce9276b28 cpt 0
00000400:00040000:14.0:1563533429.299212:000000:28086:0:(lib-move.c:2862:lnet_detach_rsp_tracker()) ASSERTION( rspt-&amp;gt;rspt_cpt == cpt ) failed:
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Note the response tracker was attached at 1563533406.667523. I&apos;m using the default lnet_transaction_timeout of 10 seconds, so when we see the lnet_finalize_expired_responses loop running at time 1563533420.155422 we can be confident that we&apos;re hitting the code I outlined in the description of this ticket.&lt;/p&gt;</comment>
                            <comment id="251751" author="gerrit" created="Sat, 20 Jul 2019 17:59:53 +0000"  >&lt;p&gt;Chris Horn (hornc@cray.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/35576&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/35576&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12568&quot; title=&quot;LNetError: 28086:0:(lib-move.c:2862:lnet_detach_rsp_tracker()) ASSERTION( rspt-&amp;gt;rspt_cpt == cpt ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12568&quot;&gt;&lt;del&gt;LU-12568&lt;/del&gt;&lt;/a&gt; lnet: Defer rspt cleanup when MD queued for unlink&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 2b069b22e1e22d5460ce266af09df9238dd4f031&lt;/p&gt;</comment>
                            <comment id="251984" author="hornc" created="Wed, 24 Jul 2019 20:57:46 +0000"  >&lt;p&gt;This issue was uncovered while testing the patch for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12441&quot; title=&quot;Response tracker is not detached on router ping reply&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12441&quot;&gt;&lt;del&gt;LU-12441&lt;/del&gt;&lt;/a&gt;; however, we haven&apos;t been able to identify a way in which that patch &lt;em&gt;causes&lt;/em&gt; this issue.&lt;/p&gt;</comment>
                            <comment id="253045" author="gerrit" created="Thu, 15 Aug 2019 07:51:09 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/35576/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/35576/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12568&quot; title=&quot;LNetError: 28086:0:(lib-move.c:2862:lnet_detach_rsp_tracker()) ASSERTION( rspt-&amp;gt;rspt_cpt == cpt ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12568&quot;&gt;&lt;del&gt;LU-12568&lt;/del&gt;&lt;/a&gt; lnet: Defer rspt cleanup when MD queued for unlink&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 4a4ac34de42c57cf6963c95aee8da634a767b38a&lt;/p&gt;</comment>
                            <comment id="257458" author="gerrit" created="Thu, 31 Oct 2019 23:11:08 +0000"  >&lt;p&gt;Amir Shehata (ashehata@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/36635&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/36635&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12568&quot; title=&quot;LNetError: 28086:0:(lib-move.c:2862:lnet_detach_rsp_tracker()) ASSERTION( rspt-&amp;gt;rspt_cpt == cpt ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12568&quot;&gt;&lt;del&gt;LU-12568&lt;/del&gt;&lt;/a&gt; lnet: Defer rspt cleanup when MD queued for unlink&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 0c3f4854dea3e88bd216d04ca95d33cc7dd20d44&lt;/p&gt;</comment>
                            <comment id="259206" author="gerrit" created="Thu, 5 Dec 2019 14:57:10 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/36635/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/36635/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12568&quot; title=&quot;LNetError: 28086:0:(lib-move.c:2862:lnet_detach_rsp_tracker()) ASSERTION( rspt-&amp;gt;rspt_cpt == cpt ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12568&quot;&gt;&lt;del&gt;LU-12568&lt;/del&gt;&lt;/a&gt; lnet: Defer rspt cleanup when MD queued for unlink&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 3df41bb8515d5012d7e2f19b2d7019e3e1b64a71&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                                        </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="55965">LU-12441</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="57248">LU-12906</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="57249">LU-12907</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00jxz:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>