<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:10:01 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-7569] IB leaf switch caused LNet routers to crash</title>
                <link>https://jira.whamcloud.com/browse/LU-7569</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;During testing we lost one of the IB leaf switches which caused all of our lustre router to crash with the following error:&lt;/p&gt;

&lt;p&gt;2015-12-11T10:53:29.539273-05:00 c0-0c0s2n3 LNetError: 4675:0:(o2iblnd.c:399:kiblnd_find_peer_locked()) ASSERTION( peer-&amp;gt;ibp_connecting &amp;gt; 0 || peer-&amp;gt;ibp_accepting &amp;gt; 0 || !list_empty(&amp;amp;peer-&amp;gt;ibp_conns) ) failed:&lt;br/&gt;
2015-12-11T10:53:29.539305-05:00 c0-0c0s2n3 LNetError: 4675:0:(o2iblnd.c:399:kiblnd_find_peer_locked()) LBUG&lt;br/&gt;
2015-12-11T10:53:29.539313-05:00 c0-0c0s2n3 Pid: 4675, comm: kgnilnd_sd_02&lt;br/&gt;
2015-12-11T10:53:29.539319-05:00 c0-0c0s2n3 Call Trace:&lt;br/&gt;
2015-12-11T10:53:29.539324-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81006651&amp;gt;&amp;#93;&lt;/span&gt; try_stack_unwind+0x161/0x1a0&lt;br/&gt;
2015-12-11T10:53:29.539332-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81004eb9&amp;gt;&amp;#93;&lt;/span&gt; dump_trace+0x89/0x430&lt;br/&gt;
2015-12-11T10:53:29.539339-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa025bac0&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0x90/0x1d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
2015-12-11T10:53:29.539348-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa036f32b&amp;gt;&amp;#93;&lt;/span&gt; kiblnd_find_peer_locked+0x14b/0x150 &lt;span class=&quot;error&quot;&gt;&amp;#91;ko2iblnd&amp;#93;&lt;/span&gt;&lt;br/&gt;
2015-12-11T10:53:29.539358-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa036f379&amp;gt;&amp;#93;&lt;/span&gt; kiblnd_query+0x49/0x1c0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ko2iblnd&amp;#93;&lt;/span&gt;&lt;br/&gt;
2015-12-11T10:53:29.539364-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa02d5aee&amp;gt;&amp;#93;&lt;/span&gt; lnet_post_send_locked+0x2ee/0x740 &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;br/&gt;
2015-12-11T10:53:29.539369-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa02d84f0&amp;gt;&amp;#93;&lt;/span&gt; lnet_send+0x6a0/0xcf0 &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;br/&gt;
2015-12-11T10:53:29.539375-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa02cbe94&amp;gt;&amp;#93;&lt;/span&gt; lnet_finalize+0x424/0x800 &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;br/&gt;
2015-12-11T10:53:29.539380-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa03d256b&amp;gt;&amp;#93;&lt;/span&gt; kgnilnd_recv+0x73b/0xdf0 &lt;span class=&quot;error&quot;&gt;&amp;#91;kgnilnd&amp;#93;&lt;/span&gt;&lt;br/&gt;
2015-12-11T10:53:29.539385-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa02d432f&amp;gt;&amp;#93;&lt;/span&gt; lnet_ni_recv+0xcf/0x330 &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;br/&gt;
2015-12-11T10:53:29.539389-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa02dac26&amp;gt;&amp;#93;&lt;/span&gt; lnet_parse+0x3c6/0xe40 &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;br/&gt;
2015-12-11T10:53:29.539394-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa03d8111&amp;gt;&amp;#93;&lt;/span&gt; kgnilnd_check_fma_rx+0x1af1/0x1f50 &lt;span class=&quot;error&quot;&gt;&amp;#91;kgnilnd&amp;#93;&lt;/span&gt;&lt;br/&gt;
2015-12-11T10:53:29.539406-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa03dbbc4&amp;gt;&amp;#93;&lt;/span&gt; kgnilnd_process_conns+0x554/0x15d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;kgnilnd&amp;#93;&lt;/span&gt;&lt;br/&gt;
2015-12-11T10:53:29.539411-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa03dcf1e&amp;gt;&amp;#93;&lt;/span&gt; kgnilnd_scheduler+0x2de/0x5f0 &lt;span class=&quot;error&quot;&gt;&amp;#91;kgnilnd&amp;#93;&lt;/span&gt;&lt;br/&gt;
2015-12-11T10:53:29.539416-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81067ace&amp;gt;&amp;#93;&lt;/span&gt; kthread+0x9e/0xb0&lt;br/&gt;
2015-12-11T10:53:29.539421-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81490074&amp;gt;&amp;#93;&lt;/span&gt; kernel_thread_helper+0x4/0x10&lt;br/&gt;
2015-12-11T10:53:29.539427-05:00 c0-0c0s2n3 Kernel panic - not syncing: LBUG&lt;br/&gt;
2015-12-11T10:53:29.539432-05:00 c0-0c0s2n3 Pid: 4675, comm: kgnilnd_sd_02 Tainted: P             3.0.101-0.46.1_1.0502.8871-cray_gem_s #1&lt;br/&gt;
2015-12-11T10:53:29.539437-05:00 c0-0c0s2n3 Call Trace:&lt;br/&gt;
2015-12-11T10:53:29.539442-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81006651&amp;gt;&amp;#93;&lt;/span&gt; try_stack_unwind+0x161/0x1a0&lt;br/&gt;
2015-12-11T10:53:29.539447-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81004eb9&amp;gt;&amp;#93;&lt;/span&gt; dump_trace+0x89/0x430&lt;br/&gt;
2015-12-11T10:53:29.539452-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff810060bc&amp;gt;&amp;#93;&lt;/span&gt; show_trace_log_lvl+0x5c/0x80&lt;br/&gt;
2015-12-11T10:53:29.539457-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff810060f5&amp;gt;&amp;#93;&lt;/span&gt; show_trace+0x15/0x20&lt;br/&gt;
2015-12-11T10:53:29.539462-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8148b31c&amp;gt;&amp;#93;&lt;/span&gt; dump_stack+0x79/0x84&lt;br/&gt;
2015-12-11T10:53:29.539467-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8148b3bb&amp;gt;&amp;#93;&lt;/span&gt; panic+0x94/0x1da&lt;br/&gt;
2015-12-11T10:53:29.539473-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa025bbf1&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0x1c1/0x1d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
2015-12-11T10:53:29.539479-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa036f32b&amp;gt;&amp;#93;&lt;/span&gt; kiblnd_find_peer_locked+0x14b/0x150 &lt;span class=&quot;error&quot;&gt;&amp;#91;ko2iblnd&amp;#93;&lt;/span&gt;&lt;br/&gt;
2015-12-11T10:53:29.539484-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa036f379&amp;gt;&amp;#93;&lt;/span&gt; kiblnd_query+0x49/0x1c0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ko2iblnd&amp;#93;&lt;/span&gt;&lt;br/&gt;
2015-12-11T10:53:29.539489-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa02d5aee&amp;gt;&amp;#93;&lt;/span&gt; lnet_post_send_locked+0x2ee/0x740 &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;br/&gt;
2015-12-11T10:53:29.539494-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa02d84f0&amp;gt;&amp;#93;&lt;/span&gt; lnet_send+0x6a0/0xcf0 &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;br/&gt;
2015-12-11T10:53:29.539500-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa02cbe94&amp;gt;&amp;#93;&lt;/span&gt; lnet_finalize+0x424/0x800 &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;br/&gt;
2015-12-11T10:53:29.539505-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa03d256b&amp;gt;&amp;#93;&lt;/span&gt; kgnilnd_recv+0x73b/0xdf0 &lt;span class=&quot;error&quot;&gt;&amp;#91;kgnilnd&amp;#93;&lt;/span&gt;&lt;br/&gt;
2015-12-11T10:53:29.539511-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa02d432f&amp;gt;&amp;#93;&lt;/span&gt; lnet_ni_recv+0xcf/0x330 &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;br/&gt;
2015-12-11T10:53:29.539519-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa02dac26&amp;gt;&amp;#93;&lt;/span&gt; lnet_parse+0x3c6/0xe40 &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;br/&gt;
2015-12-11T10:53:29.539526-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa03d8111&amp;gt;&amp;#93;&lt;/span&gt; kgnilnd_check_fma_rx+0x1af1/0x1f50 &lt;span class=&quot;error&quot;&gt;&amp;#91;kgnilnd&amp;#93;&lt;/span&gt;&lt;br/&gt;
2015-12-11T10:53:29.539532-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa03dbbc4&amp;gt;&amp;#93;&lt;/span&gt; kgnilnd_process_conns+0x554/0x15d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;kgnilnd&amp;#93;&lt;/span&gt;&lt;br/&gt;
2015-12-11T10:53:29.539537-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa03dcf1e&amp;gt;&amp;#93;&lt;/span&gt; kgnilnd_scheduler+0x2de/0x5f0 &lt;span class=&quot;error&quot;&gt;&amp;#91;kgnilnd&amp;#93;&lt;/span&gt;&lt;br/&gt;
2015-12-11T10:53:29.539544-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81067ace&amp;gt;&amp;#93;&lt;/span&gt; kthread+0x9e/0xb0&lt;br/&gt;
2015-12-11T10:53:29.539550-05:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81490074&amp;gt;&amp;#93;&lt;/span&gt; kernel_thread_helper+0x4/0x10&lt;/p&gt;</description>
                <environment></environment>
        <key id="33736">LU-7569</key>
            <summary>IB leaf switch caused LNet routers to crash</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="doug">Doug Oucharek</assignee>
                                    <reporter username="simmonsja">James A Simmons</reporter>
                        <labels>
                    </labels>
                <created>Wed, 16 Dec 2015 23:43:44 +0000</created>
                <updated>Tue, 24 Oct 2017 13:02:34 +0000</updated>
                            <resolved>Mon, 18 Jan 2016 18:59:45 +0000</resolved>
                                    <version>Lustre 2.8.0</version>
                                    <fixVersion>Lustre 2.8.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>21</watches>
                                                                            <comments>
                            <comment id="136643" author="yujian" created="Thu, 17 Dec 2015 00:56:46 +0000"  >&lt;p&gt;Hi Amir,&lt;/p&gt;

&lt;p&gt;Could you please advise? Thank you.&lt;/p&gt;</comment>
                            <comment id="136691" author="jfilizetti" created="Thu, 17 Dec 2015 13:21:27 +0000"  >&lt;p&gt;I&apos;ve seen at least a handful of suspect ways these conditional checks for ibp_connecting, ibp_accepting and ibp_conns can be incorrect if things are slow to respond and connections get rejected.  With the inclusion of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3322&quot; title=&quot;ko2iblnd support for different map_on_demand and peer_credits between systems&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3322&quot;&gt;&lt;del&gt;LU-3322&lt;/del&gt;&lt;/a&gt;, rejections can be common, and I think this is exposing some of these problems with the reconnect logic, and more so with the conn race patch &lt;a href=&quot;http://review.whamcloud.com/14600/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/14600/&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="136714" author="gerrit" created="Thu, 17 Dec 2015 16:15:18 +0000"  >&lt;p&gt;Liang Zhen (liang.zhen@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/17661&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/17661&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7569&quot; title=&quot;IB leaf switch caused LNet routers to crash&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7569&quot;&gt;&lt;del&gt;LU-7569&lt;/del&gt;&lt;/a&gt; o2iblnd: multiple fixes for reconnection&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: e4777765fc6e4bdc9a9331e139a5884261245eb0&lt;/p&gt;</comment>
                            <comment id="136715" author="liang" created="Thu, 17 Dec 2015 16:18:25 +0000"  >&lt;p&gt;I&apos;ve submitted a patch which could be helpful, &lt;a href=&quot;http://review.whamcloud.com/17661&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/17661&lt;/a&gt; , but I have no environment to test it, so it is only for review for the time being.&lt;/p&gt;</comment>
                            <comment id="136761" author="simmonsja" created="Thu, 17 Dec 2015 20:22:03 +0000"  >&lt;p&gt;Liang I just rebooted a system with 17661 and we rebooted the leaf switch. It completely worked. No more routers OOPs on us. Thank you.&lt;/p&gt;</comment>
                            <comment id="136777" author="doug" created="Thu, 17 Dec 2015 21:05:58 +0000"  >&lt;p&gt;James, did all the clients re-connect ok?  None of them got stuck on reconnecting?&lt;/p&gt;</comment>
                            <comment id="137802" author="simmonsja" created="Mon, 4 Jan 2016 16:13:41 +0000"  >&lt;p&gt;Yes Doug they did all reconnect okay. I did find a problem with this patch tho. I found if I place the following in my modprobe configuration file I can crash my client nodes.&lt;/p&gt;

&lt;p&gt;options ko2iblnd timeout=100 credits=2560 ntx=5120 peer_credits=63 concurrent_sends=63 fmr_pool_size=1280 fmr_flush_trigger=1024 map_on_demand=64&lt;/p&gt;

&lt;p&gt;and then modprobe lnet;lctl net up&lt;/p&gt;

&lt;p&gt;You will then see the following back trace:&lt;br/&gt;
Dec 30 15:02:05 spoon17.ccs.ornl.gov kernel: [ 337.292250] LNetError: 20000:0:(o2iblnd_cb.c:1309:kiblnd_reconnect_peer()) ASSERTION( peer-&amp;gt;ibp_connecting == 1 ) failed: &lt;br/&gt;
Dec 30 15:02:05 spoon17.ccs.ornl.gov kernel: [ 337.303363] LNetError: 20000:0:(o2iblnd_cb.c:1309:kiblnd_reconnect_peer()) LBUG&lt;br/&gt;
Dec 30 15:02:05 spoon17.ccs.ornl.gov kernel: [ 337.310739] Pid: 20000, comm: kiblnd_connd &lt;br/&gt;
Dec 30 15:02:05 spoon17.ccs.ornl.gov kernel: [ 337.314891] &lt;br/&gt;
Dec 30 15:02:05 spoon17.ccs.ornl.gov kernel: [ 337.314892] Call Trace:&lt;br/&gt;
Dec 30 15:02:05 spoon17.ccs.ornl.gov kernel: [ 337.318939] &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0740875&amp;gt;&amp;#93;&lt;/span&gt; libcfs_debug_dumpstack+0x55/0x80 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Dec 30 15:02:05 spoon17.ccs.ornl.gov kernel: [ 337.325962] &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0740e77&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0x47/0xb0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt; &lt;br/&gt;
Dec 30 15:02:05 spoon17.ccs.ornl.gov kernel: [ 337.332212] &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa08431c8&amp;gt;&amp;#93;&lt;/span&gt; kiblnd_reconnect_peer+0x118/0x150 &lt;span class=&quot;error&quot;&gt;&amp;#91;ko2iblnd&amp;#93;&lt;/span&gt; &lt;br/&gt;
Dec 30 15:02:05 spoon17.ccs.ornl.gov kernel: [ 337.339608] &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa083aee0&amp;gt;&amp;#93;&lt;/span&gt; kiblnd_destroy_conn+0x4c0/0x810 &lt;span class=&quot;error&quot;&gt;&amp;#91;ko2iblnd&amp;#93;&lt;/span&gt; &lt;br/&gt;
Dec 30 15:02:05 spoon17.ccs.ornl.gov kernel: [ 337.346784] &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa08486b1&amp;gt;&amp;#93;&lt;/span&gt; kiblnd_connd+0xc1/0xbc0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ko2iblnd&amp;#93;&lt;/span&gt;&lt;br/&gt;
Dec 30 15:02:05 spoon17.ccs.ornl.gov kernel: [ 337.353248] &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81064d00&amp;gt;&amp;#93;&lt;/span&gt; ? default_wake_function+0x0/0x20 &lt;/p&gt;</comment>
                            <comment id="138090" author="simmonsja" created="Wed, 6 Jan 2016 17:07:11 +0000"  >&lt;p&gt;Liang, Doug have you been able to duplicate my crash?&lt;/p&gt;</comment>
                            <comment id="138134" author="doug" created="Wed, 6 Jan 2016 21:00:22 +0000"  >&lt;p&gt;Just a status update on this patch:  There are multiple problems with the reconnection code on o2iblnd which we are trying to address here (see list of related patches).  As you can see from Liang&apos;s patch, significant changes are being made to the reconnection strategy to address them.&lt;/p&gt;

&lt;p&gt;At the moment, I know of two issues with the current version of this patch:&lt;/p&gt;

&lt;p&gt;1- Reconnections due to different negotiated parameters can cause an LBUG (what you are finding James)&lt;br/&gt;
2- When an LNet router reboots, an infinite loop of CONN RACE reconnects can ensue if the LNet router has the larger NID value.&lt;/p&gt;

&lt;p&gt;I&apos;m working on number 2 with a customer who has run into this.  My current theory is that the client tries to reconnect to the router while the router has not completely come up yet.  If that connection attempt gets stuck (i.e. client never hears back from it), it can trigger never-ending reconnects.&lt;/p&gt;

&lt;p&gt;I&apos;d like the patch for this ticket to address the above 2 issues so we can kill off many problems at once here.&lt;/p&gt;</comment>
                            <comment id="138136" author="simmonsja" created="Wed, 6 Jan 2016 21:19:05 +0000"  >&lt;p&gt;I think I know why number 1 happens. The function kiblnd_check_reconnect() returns right away if peer-&amp;gt;ibp_connecting != 1. So for the checks to actually happen we need the condition peer-&amp;gt;ibp_connecting == 1. But for some of the checks we end up incrementing ibp_connecting again. I think the logic is reversed from what it should be. Since we know peer-&amp;gt;ibp_connecting == 1 on critical failure, it should be decremented. I&apos;m testing this change now.&lt;/p&gt;</comment>
                            <comment id="138146" author="simmonsja" created="Wed, 6 Jan 2016 22:44:30 +0000"  >&lt;p&gt;My theory was wrong. Still crashes.&lt;/p&gt;</comment>
                            <comment id="138205" author="simmonsja" created="Thu, 7 Jan 2016 15:07:23 +0000"  >&lt;p&gt;Thanks to Jeremy, who pointed out I was using an old patch. With my fix, the latest version of the patch resolves problem 1 that Doug listed. I haven&apos;t run into case 2, so no fix for that.&lt;/p&gt;</comment>
                            <comment id="138212" author="doug" created="Thu, 7 Jan 2016 17:00:17 +0000"  >&lt;p&gt;That is great news!  For issue 2, I have a theoretical fix which will be tested today.  From the logs, I am seeing this pattern which causes issue 2:&lt;/p&gt;

&lt;p&gt;1- LNet router reboots (for any reason).&lt;br/&gt;
2- A client fails to transmit to the router and fails the connection.  This cleans up the connection and peer structure.&lt;br/&gt;
3- Attempts to create active connections from the client to the router are continuously made until the router is back up.&lt;br/&gt;
4- I then see the router trying to create an active connection to the client.&lt;br/&gt;
5- For some unknown reason, the client seems to have a connection to the router stuck in a connecting state.  &lt;br/&gt;
6- In this scenario, the client has the larger NID value, so the client rejects the router&apos;s connection attempt as CONN RACE.&lt;br/&gt;
7- The router does a reconnect, goto 5.&lt;/p&gt;

&lt;p&gt;Thus, we have an infinite loop caused by what appears to be a stuck connecting connection.  The code assumes we will always hear back from connection attempts so there is no cleanup of the connection by connd.  Why the connection is stuck is unknown to me, but the code should be robust enough to detect this and avoid this infinite loop (i.e. self-healing code).&lt;/p&gt;

&lt;p&gt;Note: I am only seeing this with mlx5-based cards.&lt;/p&gt;</comment>
                            <comment id="138345" author="liang" created="Fri, 8 Jan 2016 17:25:02 +0000"  >&lt;p&gt;as the original patch has defects and been reverted from master, so the above patch is invalid now, I reimplemented the patch, which includes the original feature and improvements : &lt;a href=&quot;http://review.whamcloud.com/#/c/17892/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/17892/&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;For the stuck issue described by Doug, I&apos;m not sure if it is a general issue, or just a bug in a particular version of mlx5 driver. So this patch didn&apos;t include the self-healing code mentioned by Doug. If you don&apos;t have the stuck issue, then probably you don&apos;t need the self-healing code at all.&lt;/p&gt;</comment>
                            <comment id="138352" author="simmonsja" created="Fri, 8 Jan 2016 18:06:10 +0000"  >&lt;p&gt;Doug since you don&apos;t have a solution just yet for the connection issue could you create a new patch on top of the new one posted by Liang.&lt;/p&gt;</comment>
                            <comment id="138354" author="doug" created="Fri, 8 Jan 2016 18:08:50 +0000"  >&lt;p&gt;Ok, will do. I will create a new Jira ticket for that solution so this ticket is free to land Liang&apos;s latest patch and close.&lt;/p&gt;</comment>
                            <comment id="138356" author="simmonsja" created="Fri, 8 Jan 2016 18:20:03 +0000"  >&lt;p&gt;Perhaps this problem will not exist with Liang latest patch? Its worth a try.&lt;/p&gt;</comment>
                            <comment id="138395" author="doug" created="Fri, 8 Jan 2016 22:40:56 +0000"  >&lt;p&gt;Jay: once this has landed to master (inspected, tested, etc) then it can be ported.&lt;/p&gt;</comment>
                            <comment id="138511" author="simmonsja" created="Mon, 11 Jan 2016 16:31:58 +0000"  >&lt;p&gt;I&apos;m of the opinion that this should be a blocker. Currently without this work it is possible if a IB leaf switch reboots to take down all the LNet routers. Because of this it needs to be slated for 2.8 landing.&lt;/p&gt;</comment>
                            <comment id="138544" author="doug" created="Mon, 11 Jan 2016 19:25:49 +0000"  >&lt;p&gt;Is this still true with 14600 reverted?&lt;/p&gt;</comment>
                            <comment id="138845" author="simmonsja" created="Wed, 13 Jan 2016 20:28:09 +0000"  >&lt;p&gt;I just tested this patch with 14600 reverted on our Cray system and the routers crashed again. So this is still a serious problem. If we lose a IB leaf switch we lose the entire file system. IMHO this should be a blocker.&lt;/p&gt;</comment>
                            <comment id="138848" author="liang" created="Wed, 13 Jan 2016 20:34:01 +0000"  >&lt;p&gt;What does the crash look like after reverting 14600? Is it an OOM or another assertion?&lt;/p&gt;</comment>
                            <comment id="138850" author="simmonsja" created="Wed, 13 Jan 2016 21:01:05 +0000"  >&lt;p&gt;Hmmm. Doesn&apos;t appear to be LNet related.&lt;/p&gt;

&lt;p&gt;2016-01-13T15:56:22.464787-05:00 c0-0c0s7n1 LustreError: 167-0: sultan-OST000e-osc-ffff880405221400: This client was evicted by sultan-OST000e; in progress operations using this service will fail.&lt;br/&gt;
2016-01-13T15:56:22.464804-05:00 c0-0c0s7n1 LustreError: Skipped 11 previous similar messages&lt;br/&gt;
2016-01-13T15:56:22.514072-05:00 c0-0c0s7n1 Lustre: 2541:0:(llite_lib.c:2628:ll_dirty_page_discard_warn()) sultan: dirty page discard: 10.37.248.67@o2ib1:/sultan/fid: &lt;span class=&quot;error&quot;&gt;&amp;#91;0x20000a898:0x21:0x0&amp;#93;&lt;/span&gt;//stf008/scratch/jsimmons/test_ior/testfile.out may get corrupted (rc -108)&lt;br/&gt;
2016-01-13T15:56:22.756634-05:00 c0-0c0s7n1 Lustre: sultan-OST0025-osc-ffff880405221400: Connection restored to 10.37.248.70@o2ib1 (at 10.37.248.70@o2ib1)&lt;/p&gt;</comment>
                            <comment id="139158" author="gerrit" created="Mon, 18 Jan 2016 05:58:34 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;http://review.whamcloud.com/17892/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/17892/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7569&quot; title=&quot;IB leaf switch caused LNet routers to crash&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7569&quot;&gt;&lt;del&gt;LU-7569&lt;/del&gt;&lt;/a&gt; o2iblnd: avoid intensive reconnecting&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 9ab698e4d99103b2fecf19b0fd3f90d28723e9d1&lt;/p&gt;</comment>
                            <comment id="139196" author="pjones" created="Mon, 18 Jan 2016 18:59:45 +0000"  >&lt;p&gt;Landed for 2.8&lt;/p&gt;</comment>
                            <comment id="148099" author="dmiter" created="Thu, 7 Apr 2016 14:40:36 +0000"  >&lt;p&gt;Jay, This patch is under review now. So, soon it will be landed to b2_7_fe also.&lt;/p&gt;</comment>
                            <comment id="148198" author="doug" created="Thu, 7 Apr 2016 23:58:51 +0000"  >&lt;p&gt;Jay, it has been ported to 2.7 FE as: &lt;a href=&quot;http://review.whamcloud.com/#/c/18051/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/18051/&lt;/a&gt;. &lt;/p&gt;</comment>
                            <comment id="148203" author="jaylan" created="Fri, 8 Apr 2016 00:38:28 +0000"  >&lt;p&gt;Doug, although the patch at &lt;a href=&quot;http://review.whamcloud.com/#/c/18051/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/18051/&lt;/a&gt;. &lt;br/&gt;
says &quot;b2_7_fe&quot; under &quot;branch&quot;, there is a red remark saying &quot;Cannot Merge&quot; under &quot;Strategy.&quot;&lt;/p&gt;

&lt;p&gt;Probably the patch was not generated against b2_7_fe? I encountered non-trivial conflicts at&lt;br/&gt;
	both modified:   lnet/klnds/o2iblnd/o2iblnd.h&lt;br/&gt;
	both modified:   lnet/klnds/o2iblnd/o2iblnd_cb.c&lt;/p&gt;
</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                                        </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="33008">LU-7390</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="34030">LU-7646</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="26914">LU-5718</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="32327">LU-7210</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="18907">LU-3322</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="32697">LU-7314</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="34143">LU-7676</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzxw3r:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>