<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:11:17 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-14615] can&apos;t add tcp nid</title>
                <link>https://jira.whamcloud.com/browse/LU-14615</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;On a few hosts, when adding a tcp nid using &lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
lnetctl lnet configure
lnetctl net add --net tcp --&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; ib1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;We get this error&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
[1137072.940179] LNet: 39800:0:(config.c:1641:lnet_inet_enumerate()) lnet: Ignoring &lt;span class=&quot;code-keyword&quot;&gt;interface&lt;/span&gt; eth2: it&apos;s down
[1137072.950118] LNet: 39800:0:(config.c:1641:lnet_inet_enumerate()) Skipped 2 previous similar messages
[1137072.959931] LNet: Added LNI 10.151.27.21@tcp [8/256/0/180]
[1137072.959988] LNetError: 39814:0:(lib-socket.c:315:lnet_sock_listen()) Can&apos;t create socket: port 988 already in use
[1137072.970687] LNetError: 122-1: Can&apos;t start acceptor on port 988: port already in use
[1137072.970724] LNetError: 39800:0:(api-ni.c:3123:lnet_add_net_common()) Failed to start up acceptor thread
[1137073.977512] LNet: Removed LNI 10.151.27.21@tcp
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Nothing is using that port&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
# lsof -i tcp@localhost:988
# lsof -i udp@localhost:988
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="63800">LU-14615</key>
            <summary>can&apos;t add tcp nid</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="ashehata">Amir Shehata</assignee>
                                    <reporter username="mhanafi">Mahmoud Hanafi</reporter>
                        <labels>
                    </labels>
                <created>Wed, 14 Apr 2021 20:26:48 +0000</created>
                <updated>Fri, 16 Sep 2022 16:14:00 +0000</updated>
                            <resolved>Fri, 16 Sep 2022 16:14:00 +0000</resolved>
                                    <version>Lustre 2.12.5</version>
                    <version>Lustre 2.12.6</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>4</watches>
                                                                            <comments>
                            <comment id="298845" author="ashehata" created="Thu, 15 Apr 2021 07:04:43 +0000"  >&lt;p&gt;Are you using ib1 for an o2iblnd network as well?&lt;/p&gt;</comment>
                            <comment id="298865" author="pjones" created="Thu, 15 Apr 2021 13:07:44 +0000"  >&lt;p&gt;What version is this Mahmoud?&lt;/p&gt;</comment>
                            <comment id="298882" author="mhanafi" created="Thu, 15 Apr 2021 14:56:33 +0000"  >&lt;p&gt;This is 2.12.5 and yes we are using o2ib on ib1 also. It worked on most of the nodes. &#160;&lt;/p&gt;</comment>
                            <comment id="299068" author="ashehata" created="Fri, 16 Apr 2021 22:08:42 +0000"  >&lt;p&gt;Can you check the service port for the o2iblnd? What is that set to? I&apos;m thinking if it&apos;s 988 instead of 987, then you could run into this problem.&lt;/p&gt;</comment>
                            <comment id="300141" author="mhanafi" created="Thu, 29 Apr 2021 17:42:51 +0000"  >&lt;p&gt;We don&apos;t have o2ib listed in /etc/services.&lt;/p&gt;

&lt;p&gt;How do I check the port. This also occurs randomly when nodes are rebooted.&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="300368" author="ashehata" created="Mon, 3 May 2021 20:00:02 +0000"  >&lt;p&gt;Does this show any useful results&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 netstat -lnp&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;the error in the description is printed if kernel_bind() returns EADDRINUSE.&lt;/p&gt;

&lt;p&gt;I looked at the kernel_bind() code and it seems that a port can be shared in the following circumstances&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
&lt;span class=&quot;code-comment&quot;&gt;// from  include/net/inet_hashtables.h
&lt;/span&gt; 45  *&#187;&#183;&#183;&#183;&#183;&#183;1) Sockets bound to different interfaces may share a local port.                               
 46  *&#187;&#183;&#183;&#183;&#183;&#183;   Failing that, &lt;span class=&quot;code-keyword&quot;&gt;goto&lt;/span&gt; test 2.                                                                  
 47  *&#187;&#183;&#183;&#183;&#183;&#183;2) If all sockets have sk-&amp;gt;sk_reuse set, and none of them are in                               
 48  *&#187;&#183;&#183;&#183;&#183;&#183;   TCP_LISTEN state, the port may be shared.                                                   
 49  *&#187;&#183;&#183;&#183;&#183;&#183;   Failing that, &lt;span class=&quot;code-keyword&quot;&gt;goto&lt;/span&gt; test 3.                                                                  
 50  *&#187;&#183;&#183;&#183;&#183;&#183;3) If all sockets are bound to a specific inet_sk(sk)-&amp;gt;rcv_saddr local                         
 51  *&#187;&#183;&#183;&#183;&#183;&#183;   address, and none of them are the same, the port may be                                     
 52  *&#187;&#183;&#183;&#183;&#183;&#183;   shared.                                                                                     
 53  *&#187;&#183;&#183;&#183;&#183;&#183;   Failing &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt;, the port cannot be shared. &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;When we create the port we do set the SO_REUSEADDR and we bind to any address on the system&lt;/p&gt;

&lt;p&gt;Some debugging steps I would take&lt;/p&gt;
&lt;ol&gt;
	&lt;li&gt;Try a different listening port instead of 988&lt;/li&gt;
	&lt;li&gt;Does it happen on all privileged ports? Is the behaviour different if we use a non-privileged port?&lt;/li&gt;
	&lt;li&gt;What&apos;s the ib status of the interface at the time of the bind? Could it be possible that the IB HCA hasn&apos;t fully initialized yet? We&apos;ve seen cases when the IB stack might not have been initialized by the time we bring up the LND. I know you&apos;re using the interface for ethernet, but it&apos;s worth looking at the status of the card.&lt;/li&gt;
	&lt;li&gt;Dump the results of the netstat -lnp and ibstatus at the beginning when a node is rebooted.&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;Are you able to consistently reproduce this problem?&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="300396" author="mhanafi" created="Tue, 4 May 2021 04:12:00 +0000"  >&lt;p&gt;netstat shows nothing. Nothing else is using that port.&lt;/p&gt;
&lt;ol&gt;
	&lt;li&gt;How do I try a different port.&lt;/li&gt;
	&lt;li&gt;We don&apos;t have a privileged port configured. (why is it using a privileged port?)&lt;br/&gt;
     I don&apos;t see the same options for tcp
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 options ko2iblnd require_privileged_port=0 use_privileged_port=0
 &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;3. We have seen this issue, but that gives a different error and we can bring up the interface later.&lt;/p&gt;

&lt;p&gt;When the node is in this state, if we remove the tcp option from lustre.conf and try to load the module we get the same error.&lt;/p&gt;</comment>
                            <comment id="300453" author="ashehata" created="Tue, 4 May 2021 15:31:27 +0000"  >&lt;p&gt;you can set:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
options lnet accept_port=XXX &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;By non-privileged I was thinking anything above 1024.&lt;/p&gt;

&lt;p&gt;The accept_port would need to be set consistently in order for the nodes to connect.&lt;/p&gt;

&lt;p&gt;Another thing to look out for is the actual network interface not coming up or maybe the IPoIB is not finished configuring (not loaded) before LNet tries to bind to the port? (Maybe something like: &lt;a href=&quot;https://unix.stackexchange.com/questions/126009/cause-a-script-to-execute-after-networking-has-started&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://unix.stackexchange.com/questions/126009/cause-a-script-to-execute-after-networking-has-started&lt;/a&gt;)&lt;/p&gt;

&lt;p&gt;In Syslog you should see&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 IPv6: ADDRCONF(NETDEV_CHANGE): ib0: link becomes ready&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Or something to that effect. Does that happen before or after LNet throws the error?&lt;/p&gt;

&lt;p&gt;Maybe if we can grab the entire syslog when this problem happens, we can look at the context for other clues.&lt;/p&gt;</comment>
                            <comment id="346935" author="mhanafi" created="Fri, 16 Sep 2022 16:11:27 +0000"  >&lt;p&gt;Please close this case&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i01s93:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>