<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:00:38 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-13362] Race between message finalize and discovery reply processing</title>
                <link>https://jira.whamcloud.com/browse/LU-13362</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Two threads trying to acquire lnet_net_lock(1):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;crash_x86_64&amp;gt; p ((struct cfs_percpt_lock *)0xffff8808299ed880)-&amp;gt;pcl_locks[1]
$11 = (spinlock_t *) 0xffff88076447c640
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;*hornc@cflosbld09 200312170801 $ grep -B 4 -A 8 ffff88076447c640 bt.all
PID: 8971   TASK: ffff88076b3c4f00  CPU: 1   COMMAND: &quot;kiblnd_sd_00_00&quot;
    [exception RIP: queued_spin_lock_slowpath+283]
    RIP: ffffffff810a573b  RSP: ffffc90000ea7a80  RFLAGS: 00000202
    RAX: 0000000000300101  RBX: ffff8808299ed880  RCX: 0000000000000001
    RDX: 0000000000000101  RSI: 0000000000000001  RDI: ffff88076447c640
    RBP: ffffc90000ea7a80   R8: 0000000000000101   R9: ffffffffa05d1729
    R10: 0000000000000000  R11: ffffffffa05ebb48  R12: 0000000000000008
    R13: ffff88082be96600  R14: ffff88082be96600  R15: ffff88071da3e568
    CS: 0010  SS: 0018
 #0 [ffffc90000ea7a88] _raw_spin_lock at ffffffff816b5411
 #1 [ffffc90000ea7a98] cfs_percpt_lock at ffffffffa053c546 [libcfs]
 #2 [ffffc90000ea7ac0] lnet_handle_send at ffffffffa05a3bd2 [lnet]
 #3 [ffffc90000ea7b28] lnet_select_pathway at ffffffffa05a6e18 [lnet]
--
PID: 8983   TASK: ffff8807167a3080  CPU: 11  COMMAND: &quot;lnet_discovery&quot;
    [exception RIP: queued_spin_lock_slowpath+184]
    RIP: ffffffff810a56d8  RSP: ffffc900286e3e48  RFLAGS: 00000202
    RAX: 0000000000000000  RBX: ffff8808299ed880  RCX: ffff88085fae2a80
    RDX: 0000000000300101  RSI: 0000000000000101  RDI: ffff88076447c640
    RBP: ffffc900286e3e48   R8: 0000000000300000   R9: 0000000000000000
    R10: ffffc900286e3e90  R11: 0000000000000f2f  R12: 0000000000000008
    R13: ffff880828f021d0  R14: 0000000000000240  R15: 000500280a0c0035
    CS: 0010  SS: 0018
 #0 [ffffc900286e3e50] _raw_spin_lock at ffffffff816b5411
 #1 [ffffc900286e3e60] cfs_percpt_lock at ffffffffa053c546 [libcfs]
 #2 [ffffc900286e3e88] lnet_peer_discovery at ffffffffa05bc031 [lnet]
 #3 [ffffc900286e3f08] kthread at ffffffff8107e807
*hornc@cflosbld09 200312170801 $
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;


&lt;p&gt;A debug patch tells us which thread is holding this lock:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;crash_x86_64&amp;gt; p *((struct cfs_percpt_lock *)0xffff8808299ed880)-&amp;gt;pcl_locks_info[1]
$4 = {
  pcl_fn_ptr = 0xffffffffa05a69b7 &amp;lt;lnet_select_pathway+103&amp;gt;,
  pcl_thr_ptr = 0xffff88082cd6f040
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;crash_x86_64&amp;gt; task 0xffff88082cd6f040 | grep PID
PID: 8965   TASK: ffff88082cd6f040  CPU: 10  COMMAND: &quot;kgnilnd_sd_06&quot;
crash_x86_64&amp;gt; bt 8965
PID: 8965   TASK: ffff88082cd6f040  CPU: 10  COMMAND: &quot;kgnilnd_sd_06&quot;
    [exception RIP: queued_spin_lock_slowpath+180]
    RIP: ffffffff810a56d4  RSP: ffffc90000d73ad8  RFLAGS: 00000202
    RAX: 0000000000000000  RBX: ffff880828f02100  RCX: ffff88085faa2a80
    RDX: 00000000002c0101  RSI: 0000000000000101  RDI: ffff880828f02184
    RBP: ffffc90000d73ad8   R8: 00000000002c0000   R9: 0000000000000000
    R10: 0000000000000000  R11: 0000000000000000  R12: ffff88071e12d568
    R13: ffff880828f02100  R14: 0000000000000001  R15: ffffffffffffffff
    CS: 0010  SS: 0018
 #0 [ffffc90000d73ae0] _raw_spin_lock at ffffffff816b5411
 #1 [ffffc90000d73af0] lnet_peer_gw_discovery at ffffffffa05bd329 [lnet]
 #2 [ffffc90000d73b08] lnet_initiate_peer_discovery at ffffffffa05a31c3 [lnet]
 #3 [ffffc90000d73b40] lnet_select_pathway at ffffffffa05a6a4c [lnet]
 #4 [ffffc90000d73c40] lnet_send at ffffffffa05a7af5 [lnet]
 #5 [ffffc90000d73c60] lnet_finalize at ffffffffa05999fa [lnet]
 #6 [ffffc90000d73cc0] kgnilnd_tx_done at ffffffffa064cba4 [kgnilnd]
 #7 [ffffc90000d73d20] kgnilnd_check_fma_send_cq at ffffffffa0655884 [kgnilnd]
 #8 [ffffc90000d73e78] kgnilnd_scheduler at ffffffffa065a6ca [kgnilnd]
 #9 [ffffc90000d73f08] kthread at ffffffff8107e807
#10 [ffffc90000d73f50] ret_from_fork at ffffffff8180024a
crash_x86_64&amp;gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This thread is trying to get lp_lock:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;bool
lnet_peer_gw_discovery(struct lnet_peer *lp)
{
        bool rc = false;

        spin_lock(&amp;amp;lp-&amp;gt;lp_lock); &amp;lt;&amp;lt;&amp;lt;&amp;lt; Thread here
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The peer in question:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;crash_x86_64&amp;gt; struct -o lnet_peer | grep lp_lock
  [132] spinlock_t lp_lock;
crash_x86_64&amp;gt; eval ffff880828f02184 - 132
hexadecimal: ffff880828f02100
    decimal: 18446612167360782592  (-131906348769024)
      octal: 1777774200405074020400
     binary: 1111111111111111100010000000100000101000111100000010000100000000
crash_x86_64&amp;gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;From backtrace info in the dump we can see a few threads with this address in their stack.&lt;/p&gt;

&lt;p&gt;One is a kgnilnd_sd thread:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;PID: 8960   TASK: ffff8807644aa0c0  CPU: 13  COMMAND: &quot;kgnilnd_sd_01&quot;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This thread is finalizing a message. It is stuck acquiring an lp_lock in lnet_return_rx_credits_locked().&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lnet_return_rx_credits_locked(struct lnet_msg *msg)
{
...
                lp = rxpeerni-&amp;gt;lpni_peer_net-&amp;gt;lpn_peer;

                /* give back peer router credits */
                msg-&amp;gt;msg_peerrtrcredit = 0;

                spin_lock(&amp;amp;rxpeerni-&amp;gt;lpni_lock);
                spin_lock(&amp;amp;lp-&amp;gt;lp_lock);  &amp;lt;&amp;lt;&amp;lt;&amp;lt;&amp;lt;&amp;lt;&amp;lt;&amp;lt;&amp;lt;&amp;lt; Thread is here
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Meanwhile, a kiblnd_sd thread is processing a discovery reply:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;crash_x86_64&amp;gt; bt 8977
PID: 8977   TASK: ffff88076550afc0  CPU: 15  COMMAND: &quot;kiblnd_sd_03_00&quot;
    [exception RIP: queued_spin_lock_slowpath+281]
    RIP: ffffffff810a5739  RSP: ffffc90000f13ae8  RFLAGS: 00000202
    RAX: 0000000000000101  RBX: ffff88082cfe3200  RCX: 0000000000000001
    RDX: 0000000000000101  RSI: 0000000000000001  RDI: ffff88082cfe32a4
    RBP: ffffc90000f13ae8   R8: 0000000000000101   R9: 000000000000000f
    R10: 0000000000000001  R11: ffff8806e12b7eea  R12: ffff880828f02100
    R13: ffff880828cb2209  R14: ffff880828f02184  R15: ffff880828cb2200
    CS: 0010  SS: 0018
 #0 [ffffc90000f13af0] _raw_spin_lock at ffffffff816b5411
 #1 [ffffc90000f13b00] lnet_peer_ni_clr_non_mr_pref_nid at ffffffffa05b81eb [lnet]
 #2 [ffffc90000f13b20] lnet_peer_clr_non_mr_pref_nids at ffffffffa05b82db [lnet]
 #3 [ffffc90000f13b40] lnet_discovery_event_handler at ffffffffa05be2cf [lnet]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;It is holding lp_lock.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lnet_discovery_event_reply(struct lnet_peer *lp, struct lnet_event *ev)
{
...
        spin_lock(&amp;amp;lp-&amp;gt;lp_lock);
...
                        CDEBUG(D_NET, &quot;peer %s(%p) is MR capable\n&quot;,
                               libcfs_nid2str(lp-&amp;gt;lp_primary_nid), lp);
                        lp-&amp;gt;lp_state |= LNET_PEER_MULTI_RAIL;
                        lnet_peer_clr_non_mr_pref_nids(lp);

void
lnet_peer_clr_non_mr_pref_nids(struct lnet_peer *lp)
{
        struct lnet_peer_ni *lpni = NULL;

        while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL)
                lnet_peer_ni_clr_non_mr_pref_nid(lpni);
}

lnet_peer_ni_clr_non_mr_pref_nid(struct lnet_peer_ni *lpni)
{
...
        spin_lock(&amp;amp;lpni-&amp;gt;lpni_lock); &amp;lt;&amp;lt;&amp;lt;&amp;lt; Thread is here
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Thread A: spin_lock(lp_lock)&lt;br/&gt;
Thread B: spin_lock(lpni_lock)&lt;br/&gt;
Thread A: spin_lock(lpni_lock)&lt;br/&gt;
Thread B: spin_lock(lp_lock)&lt;br/&gt;
Deadlock.&lt;/p&gt;

&lt;p&gt;Since these two threads are deadlocked, this is preventing PID 8965 (which is holding lnet_net_lock(1)) from making progress. Thus causing further deadlocks with threads trying to acquire lnet_net_lock(1).&lt;/p&gt;

&lt;p&gt;Same issue exists with lnet_post_routed_recv_locked()&lt;/p&gt;</description>
                <environment></environment>
        <key id="58384">LU-13362</key>
            <summary>Race between message finalize and discovery reply processing</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="hornc">Chris Horn</assignee>
                                    <reporter username="hornc">Chris Horn</reporter>
                        <labels>
                    </labels>
                <created>Mon, 16 Mar 2020 15:55:37 +0000</created>
                <updated>Fri, 21 May 2021 13:10:01 +0000</updated>
                            <resolved>Tue, 24 Mar 2020 11:36:46 +0000</resolved>
                                    <version>Lustre 2.14.0</version>
                    <version>Lustre 2.12.5</version>
                                    <fixVersion>Lustre 2.14.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>2</watches>
                                                                            <comments>
                            <comment id="265349" author="gerrit" created="Mon, 16 Mar 2020 15:57:10 +0000"  >&lt;p&gt;Chris Horn (chris.horn@hpe.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/37937&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/37937&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13362&quot; title=&quot;Race between message finalize and discovery reply processing&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13362&quot;&gt;&lt;del&gt;LU-13362&lt;/del&gt;&lt;/a&gt; lnet: Disc reply race with finalize and routed recv&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 1453f3fdf7c3d6bba99a3b464b0a5b19997db1f5&lt;/p&gt;</comment>
                            <comment id="265927" author="gerrit" created="Tue, 24 Mar 2020 05:16:27 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/37937/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/37937/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13362&quot; title=&quot;Race between message finalize and discovery reply processing&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13362&quot;&gt;&lt;del&gt;LU-13362&lt;/del&gt;&lt;/a&gt; lnet: Disc reply race with finalize and routed recv&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: c700b4a410ab5542391d006ce541023ecf9b7a5d&lt;/p&gt;</comment>
                            <comment id="265975" author="pjones" created="Tue, 24 Mar 2020 11:36:46 +0000"  >&lt;p&gt;Landed for 2.14&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                                        </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00vkf:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>