<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:50:51 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-5364] Lustre Router connection hangs one side of fabric</title>
                <link>https://jira.whamcloud.com/browse/LU-5364</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;We have 2 IB fabrics connected with 2 Lustre routers. One fabric is connected via Obsidian Longbows and the other fabric is directly connected to the routers via a QDR switch.&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Fabric1_o2ib233 &amp;lt;---&amp;gt;LONGBOW1&amp;lt;----&amp;lt;ROUTER1&amp;gt;----&amp;gt;QDR&amp;lt;--Fabric2_o2ib
Fabric1_o2ib233 &amp;lt;---&amp;gt;LONGBOW2&amp;lt;----&amp;lt;ROUTER2&amp;gt;-----&amp;gt;QDR&amp;lt;--Fabric2_o2ib
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;We get Router disconnects on the fabric2_o2ib side with errors like this on the routers&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;LNet: 1310:0:(o2iblnd_cb.c:2360:kiblnd_passive_connect()) Conn race 10.151.27.74@o2ib
LNet: 1308:0:(o2iblnd_cb.c:2360:kiblnd_passive_connect()) Conn race 10.151.27.86@o2ib
LNet: 1312:0:(o2iblnd_cb.c:2360:kiblnd_passive_connect()) Conn race 10.151.25.242@o2ib
LNet: 1312:0:(o2iblnd_cb.c:2360:kiblnd_passive_connect()) Conn race 10.151.25.156@o2ib
LNet: 1314:0:(o2iblnd_cb.c:2360:kiblnd_passive_connect()) Conn race 10.151.27.80@o2ib
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;ROUTER MODULE SETTINGS&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;options lnet networks=&lt;span class=&quot;code-quote&quot;&gt;&quot;o2ib(ib1),o2ib233(ib0)&quot;&lt;/span&gt; forwarding=enabled
options ko2iblnd require_privileged_port=0
options ko2iblnd use_privileged_port=0
options ko2iblnd timeout=150
options ko2iblnd retry_count=7
options ko2iblnd peer_timeout=0
options ptlrpc at_min=100
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;SERVERS SETTINGS&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;options ko2iblnd require_privileged_port=0
options ko2iblnd use_privileged_port=0
options lnet networks=o2ib(ib1),o2ib100(ib1) routes=&lt;span class=&quot;code-quote&quot;&gt;&quot;o2ib233 10.151.27.[58,93]@o2ib&quot;&lt;/span&gt; dead_router_check_interval=60 live_router_check_interval=60
# Get rid of messages &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; missing, special-purpose hardware (LU-1599)
blacklist padlock-sha
options ko2iblnd timeout=150
options ko2iblnd retry_count=7
options ko2iblnd peer_timeout=0
options ptlrpc at_min=100
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;CLIENTS&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;options ko2iblnd require_privileged_port=0
options ko2iblnd use_privileged_port=0
options lnet networks=o2ib233(ib1) routes=&lt;span class=&quot;code-quote&quot;&gt;&quot;o2ib 10.153.27.[58,93]@o2ib233&quot;&lt;/span&gt; dead_router_check_interval=60 live_router_check_interval=60
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="25636">LU-5364</key>
            <summary>Lustre Router connection hangs one side of fabric</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="6">Not a Bug</resolution>
                                        <assignee username="ashehata">Amir Shehata</assignee>
                                    <reporter username="mhanafi">Mahmoud Hanafi</reporter>
                        <labels>
                    </labels>
                <created>Thu, 17 Jul 2014 20:52:13 +0000</created>
                <updated>Tue, 12 Aug 2014 13:54:16 +0000</updated>
                            <resolved>Tue, 22 Jul 2014 18:52:26 +0000</resolved>
                                    <version>Lustre 2.4.3</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>7</watches>
                                                                            <comments>
                            <comment id="89417" author="pjones" created="Thu, 17 Jul 2014 21:03:35 +0000"  >&lt;p&gt;Amir&lt;/p&gt;

&lt;p&gt;Could you please assist with this one?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="89436" author="ashehata" created="Fri, 18 Jul 2014 00:14:58 +0000"  >&lt;p&gt;currently investigating.  Will update when I have more information.&lt;/p&gt;</comment>
                            <comment id="89560" author="ashehata" created="Fri, 18 Jul 2014 23:06:36 +0000"  >&lt;p&gt;The race the error message is referring to, is when the router receives an ib connect but there already exists a peer with the same nid (of the destination) in connecting state.  This happens if the router is already in the process of establishing a connection with that nid.  In this case the incoming connection gets dropped.  There are multiple scenarios where that occurs:&lt;br/&gt;
1. if the router is in the process of transmitting a message to the destination nid, and is currently connecting&lt;br/&gt;
2. if the router receives 2 consecutive connects from the same nid (although, I&apos;m not sure if this is a possible case).&lt;br/&gt;
3. if the router is reconnecting to the peer when it gets another connection request.&lt;/p&gt;

&lt;p&gt;I&apos;m still investigating the code more thoroughly to try and understand which scenario is more likely.&lt;/p&gt;

&lt;p&gt;Would it be possible to grab syslog messages from the router to see the errors in context?&lt;/p&gt;

&lt;p&gt;Also do you hit this issue right away or does the system work for a while before the problem is encountered?&lt;/p&gt;</comment>
                            <comment id="89635" author="ashehata" created="Mon, 21 Jul 2014 16:33:06 +0000"  >&lt;p&gt;After examining the code some more, these &quot;Conn race&quot; should not result in hanging.  The side that the router disconnected due to a race, only did so, because there is already another connection to that side in progress.&lt;/p&gt;

&lt;p&gt;I just want to clarify if the symptoms being experienced are temporary disconnects or permanent hangs with no recovery?&lt;/p&gt;

&lt;p&gt;Also, as indicated in my previous comments, if we could get the logs from both sides of the router, as well as logs from the router, when this problem occurs, that&apos;ll help in giving context to the problem.&lt;/p&gt;</comment>
                            <comment id="89764" author="mhanafi" created="Tue, 22 Jul 2014 17:56:50 +0000"  >&lt;p&gt;Further testing showed that this may have been due to the IB fabric. &lt;/p&gt;


&lt;p&gt;You may close this for now. I will reopen it when we have more data.&lt;/p&gt;
</comment>
                            <comment id="89776" author="pjones" created="Tue, 22 Jul 2014 18:52:26 +0000"  >&lt;p&gt;ok thanks Mahmoud!&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzwrpj:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>14961</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>