<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:30:25 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-9914] Dynamic Discovery - discovery hangs if max_interfaces is changed from 200-&gt;16</title>
                <link>https://jira.whamcloud.com/browse/LU-9914</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;NOTE: I see that after this patch:&lt;br/&gt;
&lt;a href=&quot;https://review.whamcloud.com/#/c/28702/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/28702/&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Without this patch, the problem was hidden by an immediate failure.&lt;/p&gt;

&lt;p&gt;Steps:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Peer 2:
net:
    - net type: lo
      local NI(s):
        - nid: 0@lo
          status: up
    - net type: tcp
      local NI(s):
        - nid: 192.168.122.30@tcp
          status: up
          interfaces:
              0: eth0
        - nid: 192.168.122.31@tcp
          status: up
          interfaces:
              0: eth1
        - nid: 192.168.122.32@tcp
          status: up
          interfaces:
              0: eth2
        - nid: 192.168.122.33@tcp
          status: up
          interfaces:
              0: eth3
        - nid: 192.168.122.34@tcp
          status: up
          interfaces:
              0: eth4
        - nid: 192.168.122.35@tcp
          status: up
          interfaces:
              0: eth5
        - nid: 192.168.122.36@tcp
          status: up
          interfaces:
              0: eth6
        - nid: 192.168.122.37@tcp
          status: up
          interfaces:
              0: eth7
        - nid: 192.168.122.38@tcp
          status: up
          interfaces:
              0: eth8
        - nid: 192.168.122.39@tcp
          status: up
          interfaces:
              0: eth9
        - nid: 192.168.122.40@tcp
          status: up
          interfaces:
              0: eth10
        - nid: 192.168.122.41@tcp
          status: up
          interfaces:
              0: eth11
        - nid: 192.168.122.42@tcp
          status: up
          interfaces:
              0: eth12
        - nid: 192.168.122.43@tcp
          status: up
          interfaces:
              0: eth13
        - nid: 192.168.122.44@tcp
          status: up
          interfaces:
              0: eth14
        - nid: 192.168.122.45@tcp
          status: up
          interfaces:
              0: eth15
        - nid: 192.168.122.46@tcp
          status: up
          interfaces:
              0: eth16

#peer 1
modprobe lnet
lnetctl lnet configure
lnetctl net add --net tcp --&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; eth0,eth1
# max_interfaces &lt;span class=&quot;code-keyword&quot;&gt;default&lt;/span&gt; to 200
lnetctl discover 192.168.122.30@tcp
lnetctl set max_interfaces 16
# discover hangs (I killed it, so it might come back after a &lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt;, but I haven&apos;t waited)
lnetctl discover 192.168.122.30@tcp
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="47960">LU-9914</key>
            <summary>Dynamic Discovery - discovery hangs if max_interfaces is changed from 200-&gt;16</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="ashehata">Amir Shehata</assignee>
                                    <reporter username="ashehata">Amir Shehata</reporter>
                        <labels>
                            <label>patch</label>
                    </labels>
                <created>Fri, 25 Aug 2017 01:10:11 +0000</created>
                <updated>Sat, 29 Jan 2022 10:48:06 +0000</updated>
                                                                                <due></due>
                            <votes>0</votes>
                                    <watches>4</watches>
                                                                            <comments>
                            <comment id="206375" author="ashehata" created="Fri, 25 Aug 2017 01:24:30 +0000"  >&lt;p&gt;problem is here:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;1154 &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;
1155 lnet_ping_info_validate(struct lnet_ping_info *pinfo)
1156 {
1157 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (!pinfo)
1158 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; -EINVAL;
1159 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (pinfo-&amp;gt;pi_magic != LNET_PROTO_PING_MAGIC)
1160 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; -EPROTO;
1161 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (!(pinfo-&amp;gt;pi_features &amp;amp; LNET_PING_FEAT_NI_STATUS))
1162 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; -EPROTO;
1163 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&lt;span class=&quot;code-comment&quot;&gt;/* Loopback is guaranteed to be present */&lt;/span&gt;
1164 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (pinfo-&amp;gt;pi_nnis &amp;lt; 1 || pinfo-&amp;gt;pi_nnis &amp;gt; lnet_interfaces_max)
1165 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; -ERANGE;
1166 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (LNET_NETTYP(LNET_NIDNET(LNET_PING_INFO_LONI(pinfo))) != LOLND)
1167 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; -EPROTO;
1168 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; 0; 
1169 }


2103 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;/*
2104 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183; * A reply with invalid or corrupted info. Set PING_FAILED to
2105 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183; * trigger a retry.
2106 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183; */
2107 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;rc = lnet_ping_info_validate(&amp;amp;pbuf-&amp;gt;pb_info);
2108 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (rc) {
2109 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;lp-&amp;gt;lp_state |= LNET_PEER_PING_FAILED;
2110 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;lp-&amp;gt;lp_ping_error = 0;
2111 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;CDEBUG(D_NET, &lt;span class=&quot;code-quote&quot;&gt;&quot;Corrupted Ping Reply from %s: %d\n&quot;&lt;/span&gt;,
2112 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;       libcfs_nid2str(lp-&amp;gt;lp_primary_nid), rc);
2113 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&lt;span class=&quot;code-keyword&quot;&gt;goto&lt;/span&gt; out;
2114 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;It doesn&apos;t look like the state machine is handling the ping failure properly. Basically, the local lnet_interfaces_max is less than the number of interfaces on the far end (16 &amp;lt; 18), so we should get an -ERANGE.&lt;/p&gt;

&lt;p&gt;It looks like we&apos;re stuck in a loop retrying the discovery ping, and it keeps failing with the same error:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;(peer.c:2112:lnet_discovery_event_reply()) Corrupted Ping Reply from 192.168.122.30@tcp: -34
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="206425" author="olaf" created="Fri, 25 Aug 2017 15:32:37 +0000"  >&lt;p&gt;To be honest, &lt;tt&gt;lnet_interfaces_max&lt;/tt&gt; exists only to avoid hard-coding a limit, and you ought to run with compatible values across the cluster. Meaning that &lt;tt&gt;lnet_interfaces_max&lt;/tt&gt; on each node should be at least the number of interfaces of its peers.&lt;/p&gt;

&lt;p&gt;Still, what happens here isn&apos;t exactly graceful handling of the problematic configuration. My proposal would be to fail discovery of nodes that have more interfaces than &lt;tt&gt;lnet_interfaces_max&lt;/tt&gt;, add some checks to prevent discovery from retrying, and emit an error message indicating that this problem has been encountered.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="206448" author="gerrit" created="Fri, 25 Aug 2017 17:49:50 +0000"  >&lt;p&gt;Olaf Weber (olaf.weber@hpe.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/28714&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/28714&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9914&quot; title=&quot;Dynamic Discovery - discovery hangs if max_interfaces is changed from 200-&amp;gt;16&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9914&quot;&gt;LU-9914&lt;/a&gt; lnet: gracefully handle peers with too many NIs&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: b973f67c227f5b988afb052171cf74cc7a097157&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzzizz:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>