<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:41:19 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-11143] Multi-Rail/Dynamic Discovery break LNet router checker and asymmetric route failure detection</title>
                <link>https://jira.whamcloud.com/browse/LU-11143</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;The LNet router checker needs to ping the interface defined in the route table, but MR can choose a different interface for those pings.&lt;br/&gt;
As a result the router can end up marking some interfaces down because they aren&apos;t seeing any traffic in the check_interval + router ping timeout window.&lt;br/&gt;
This causes routes to be considered down because of asymmetric route failure detection.&lt;/p&gt;

&lt;p&gt;Reproduced on a three-node VM setup (router, client, and server).&lt;br/&gt;
Router:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;sles15build01:/tmp # lctl list_nids
192.168.2.20@tcp99
192.168.2.20@tcp1
192.168.2.20@tcp
sles15build01:~ # lnetctl route show -v
sles15build01:~ # lnetctl peer show -v
peer:
    - primary nid: 192.168.2.22@tcp99
      Multi-Rail: True
      peer ni:
        - nid: 192.168.2.22@tcp1
          state: up
          max_ni_tx_credits: 8
          available_tx_credits: 8
          min_tx_credits: 7
          tx_q_num_of_buf: 0
          available_rtr_credits: 8
          min_rtr_credits: 7
          refcount: 1
          statistics:
              send_count: 22
              recv_count: 22
              drop_count: 0
        - nid: 192.168.2.22@tcp99
          state: up
          max_ni_tx_credits: 8
          available_tx_credits: 8
          min_tx_credits: 7
          tx_q_num_of_buf: 0
          available_rtr_credits: 8
          min_rtr_credits: 7
          refcount: 1
          statistics:
              send_count: 18
              recv_count: 18
              drop_count: 0
    - primary nid: 192.168.2.21@tcp
      Multi-Rail: True
      peer ni:
        - nid: 192.168.2.21@tcp
          state: up
          max_ni_tx_credits: 8
          available_tx_credits: 8
          min_tx_credits: 7
          tx_q_num_of_buf: 0
          available_rtr_credits: 8
          min_rtr_credits: 7
          refcount: 1
          statistics:
              send_count: 41
              recv_count: 41
              drop_count: 0
sles15build01:~ #
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Client:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;sles15c01:/tmp # lctl list_nids
192.168.2.22@tcp99
192.168.2.22@tcp1
sles15c01:/tmp # lctl show_route
net                tcp hops 4294967295 gw                192.168.2.20@tcp1 up pri 0
sles15c01:/tmp #
sles15c01:~ # lnetctl route show -v
route:
    - net: tcp
      gateway: 192.168.2.20@tcp1
      hop: -1
      priority: 0
      state: up
sles15c01:~ # lnetctl peer show -v
peer:
    - primary nid: 192.168.2.20@tcp99
      Multi-Rail: True
      peer ni:
        - nid: 192.168.2.20@tcp1
          state: up
          max_ni_tx_credits: 8
          available_tx_credits: 8
          min_tx_credits: 7
          tx_q_num_of_buf: 0
          available_rtr_credits: 8
          min_rtr_credits: 8
          refcount: 4
          statistics:
              send_count: 23
              recv_count: 23
              drop_count: 0
        - nid: 192.168.2.20@tcp99
          state: NA
          max_ni_tx_credits: 8
          available_tx_credits: 8
          min_tx_credits: 7
          tx_q_num_of_buf: 0
          available_rtr_credits: 8
          min_rtr_credits: 8
          refcount: 1
          statistics:
              send_count: 18
              recv_count: 18
              drop_count: 0
        - nid: 192.168.2.20@tcp
          state: NA
          max_ni_tx_credits: 0
          available_tx_credits: 0
          min_tx_credits: 0
          tx_q_num_of_buf: 0
          available_rtr_credits: 0
          min_rtr_credits: 0
          refcount: 2
          statistics:
              send_count: 0
              recv_count: 0
              drop_count: 0
    - primary nid: 192.168.2.21@tcp
      Multi-Rail: True
      peer ni:
        - nid: 192.168.2.21@tcp
          state: NA
          max_ni_tx_credits: 0
          available_tx_credits: 0
          min_tx_credits: 0
          tx_q_num_of_buf: 0
          available_rtr_credits: 0
          min_rtr_credits: 0
          refcount: 2
          statistics:
              send_count: 0
              recv_count: 0
              drop_count: 0
sles15c01:~ #
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Server:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;sles15s01:/tmp # lctl list_nids
192.168.2.21@tcp
sles15s01:/tmp # lctl show_route
net               tcp1 hops 4294967295 gw                 192.168.2.20@tcp up pri 0
net              tcp99 hops 4294967295 gw                 192.168.2.20@tcp down pri 0
sles15s01:~ # lnetctl route show -v
route:
    - net: tcp1
      gateway: 192.168.2.20@tcp
      hop: -1
      priority: 0
      state: up
    - net: tcp99
      gateway: 192.168.2.20@tcp
      hop: -1
      priority: 0
      state: up
sles15s01:~ # lnetctl peer show -v
peer:
    - primary nid: 192.168.2.20@tcp99
      Multi-Rail: True
      peer ni:
        - nid: 192.168.2.20@tcp
          state: up
          max_ni_tx_credits: 8
          available_tx_credits: 8
          min_tx_credits: 7
          tx_q_num_of_buf: 0
          available_rtr_credits: 8
          min_rtr_credits: 8
          refcount: 5
          statistics:
              send_count: 42
              recv_count: 42
              drop_count: 0
        - nid: 192.168.2.20@tcp99
          state: NA
          max_ni_tx_credits: 0
          available_tx_credits: 0
          min_tx_credits: 0
          tx_q_num_of_buf: 0
          available_rtr_credits: 0
          min_rtr_credits: 0
          refcount: 2
          statistics:
              send_count: 0
              recv_count: 0
              drop_count: 0
        - nid: 192.168.2.20@tcp1
          state: NA
          max_ni_tx_credits: 0
          available_tx_credits: 0
          min_tx_credits: 0
          tx_q_num_of_buf: 0
          available_rtr_credits: 0
          min_rtr_credits: 0
          refcount: 2
          statistics:
              send_count: 0
              recv_count: 0
              drop_count: 0
    - primary nid: 192.168.2.22@tcp99
      Multi-Rail: True
      peer ni:
        - nid: 192.168.2.22@tcp99
          state: NA
          max_ni_tx_credits: 0
          available_tx_credits: 0
          min_tx_credits: 0
          tx_q_num_of_buf: 0
          available_rtr_credits: 0
          min_rtr_credits: 0
          refcount: 2
          statistics:
              send_count: 0
              recv_count: 0
              drop_count: 0
        - nid: 192.168.2.22@tcp1
          state: NA
          max_ni_tx_credits: 0
          available_tx_credits: 0
          min_tx_credits: 0
          tx_q_num_of_buf: 0
          available_rtr_credits: 0
          min_rtr_credits: 0
          refcount: 2
          statistics:
              send_count: 0
              recv_count: 0
              drop_count: 0
sles15s01:~ #
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Here we see that the router checker thread on the client needs to ping the router&apos;s &amp;#64;tcp1 NID, but lnet_select_pathway() chooses a different NID (&amp;#64;tcp99) for the router.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000400:00000200:2.0:1531326577.044094:0:29632:0:(router.c:1099:lnet_ping_router_locked()) Check: 12345-192.168.2.20@tcp1
00000400:00000200:2.0:1531326577.044100:0:29632:0:(lib-move.c:3251:LNetGet()) LNetGet -&amp;gt; 12345-192.168.2.20@tcp1
00000400:00000200:2.0:1531326577.044213:0:29632:0:(lib-move.c:2172:lnet_select_pathway()) TRACE: 192.168.2.22@tcp99(192.168.2.22@tcp99:&amp;lt;?&amp;gt;) -&amp;gt; 192.168.2.20@tcp99(192.168.2.20@tcp1:192.168.2.20@tcp99) : GET
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="52691">LU-11143</key>
            <summary>Multi-Rail/Dynamic Discovery break LNet router checker and asymmetric route failure detection</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="6" iconUrl="https://jira.whamcloud.com/images/icons/statuses/closed.png" description="The issue is considered finished, the resolution is correct. Issues which are closed can be reopened.">Closed</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="sharmaso">Sonia Sharma</assignee>
                                    <reporter username="hornc">Chris Horn</reporter>
                        <labels>
                    </labels>
                <created>Wed, 11 Jul 2018 17:01:57 +0000</created>
                <updated>Fri, 27 Jan 2023 17:04:53 +0000</updated>
                            <resolved>Fri, 27 Jan 2023 17:04:53 +0000</resolved>
                                    <version>Lustre 2.11.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>8</watches>
                                                                            <comments>
                            <comment id="231660" author="pjones" created="Wed, 8 Aug 2018 18:27:28 +0000"  >&lt;p&gt;Sonia&lt;/p&gt;

&lt;p&gt;Any comment?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="231681" author="ashehata" created="Wed, 8 Aug 2018 22:35:54 +0000"  >&lt;p&gt;both &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11143&quot; title=&quot;Multi-Rail/Dynamic Discovery break LNet router checker and asymmetric route failure detection&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11143&quot;&gt;&lt;del&gt;LU-11143&lt;/del&gt;&lt;/a&gt; and &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11144&quot; title=&quot;Dynamic Discovery is not triggered for router peers&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11144&quot;&gt;&lt;del&gt;LU-11144&lt;/del&gt;&lt;/a&gt; are related.&lt;/p&gt;

&lt;p&gt;I address them here:&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://wiki.whamcloud.com/display/LNet/Routing+and+MR+integration&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://wiki.whamcloud.com/display/LNet/Routing+and+MR+integration&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Might be a good idea to use that link for feedback on the proposals&lt;/p&gt;</comment>
                            <comment id="252054" author="spitzcor" created="Fri, 26 Jul 2019 03:55:02 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=ashehata&quot; class=&quot;user-hover&quot; rel=&quot;ashehata&quot;&gt;ashehata&lt;/a&gt;, ready to close this and &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11144&quot; title=&quot;Dynamic Discovery is not triggered for router peers&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11144&quot;&gt;&lt;del&gt;LU-11144&lt;/del&gt;&lt;/a&gt;?&lt;/p&gt;</comment>
                            <comment id="252075" author="ashehata" created="Fri, 26 Jul 2019 15:23:56 +0000"  >&lt;p&gt;I believe this issue has been resolved in the new routing code.&lt;/p&gt;</comment>
                            <comment id="253596" author="spitzcor" created="Mon, 26 Aug 2019 16:06:50 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=ashehata&quot; class=&quot;user-hover&quot; rel=&quot;ashehata&quot;&gt;ashehata&lt;/a&gt;, will you be resolving this issue then?  Can you point at a specific commit or LU that resolved it?&lt;/p&gt;</comment>
                            <comment id="360685" author="hornc" created="Fri, 27 Jan 2023 17:04:53 +0000"  >&lt;p&gt;Resolved with the MR routing feature&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="52692">LU-11144</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzzz27:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>