<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:47:30 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-11853] Automated update peer NID state if client changed from multi-rail to non multi-rail</title>
                <link>https://jira.whamcloud.com/browse/LU-11853</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Currently, if a client changed from a multi-rail to a non multi-rail setting, the client can&apos;t mount the filesystem unless the current client&apos;s peer NID state on the servers is removed.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;options lnet networks=&quot;o2ib10(ib0,ib2)&quot;

[root@s184 ~]# mount -t lustre 10.0.11.90@o2ib10:/cache1 /cache1
[root@s184 ~]# lnetctl net show
net:
    - net type: lo
      local NI(s):
        - nid: 0@lo
          status: up
    - net type: o2ib10
      local NI(s):
        - nid: 10.0.10.184@o2ib10
          status: up
          interfaces:
              0: ib0
        - nid: 10.2.10.184@o2ib10
          status: up
          interfaces:
              0: ib2
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;If the NID state changed, remounting Lustre on the client fails unless all of that client&apos;s state is cleared on all servers.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;options lnet networks=&quot;o2ib10(ib0)&quot;

[root@s184 ~]# umount -t lustre -a
[root@s184 ~]# lustre_rmmod 
[root@s184 ~]# mount -t lustre 10.0.11.90@o2ib10:/cache1 /cache1
mount.lustre: mount 10.0.11.90@o2ib10:/cache1 at /cache1 failed: Input/output error
Is the MGS running?
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Server side, client peer state is still multi-rail.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@es14k-vm1 ~]# lnetctl peer show
peer:
    - primary nid: 0@lo
      Multi-Rail: False
      peer ni:
        - nid: 0@lo
          state: NA
    - primary nid: 10.0.11.92@o2ib10
      Multi-Rail: True
      peer ni:
        - nid: 10.0.11.92@o2ib10
          state: NA
        - nid: 10.1.11.92@o2ib10
          state: NA
    - primary nid: 10.0.11.91@o2ib10
      Multi-Rail: True
      peer ni:
        - nid: 10.0.11.91@o2ib10
          state: NA
        - nid: 10.1.11.91@o2ib10
          state: NA
    - primary nid: 10.0.11.93@o2ib10
      Multi-Rail: True
      peer ni:
        - nid: 10.0.11.93@o2ib10
          state: NA
        - nid: 10.1.11.93@o2ib10
          state: NA
    - primary nid: 10.0.10.184@o2ib10
      Multi-Rail: True &amp;lt;------ Still Multi-rail
      peer ni:
        - nid: 10.0.10.184@o2ib10
          state: NA
        - nid: 10.2.10.184@o2ib10
          state: NA
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;A workaround is removing the NID state on all servers, then mounting again. That works, but an automated peer state update would be preferred.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@es14k-vm1 ~]# clush -g oss lnetctl peer del --prim_nid 10.0.10.184@o2ib10 --nid 10.0.10.184@o2ib10
[root@es14k-vm1 ~]# clush -g oss lnetctl peer del --prim_nid 10.0.10.184@o2ib10 --nid 10.2.10.184@o2ib10

[root@s184 ~]# mount -t lustre 10.0.11.90@o2ib10:/cache1 /cache1
[root@s184 ~]# 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment>2.12 and master</environment>
        <key id="54503">LU-11853</key>
            <summary>Automated update peer NID state if client changed from multi-rail to non multi-rail</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="ashehata">Amir Shehata</assignee>
                                    <reporter username="sihara">Shuichi Ihara</reporter>
                        <labels>
                    </labels>
                <created>Fri, 11 Jan 2019 20:48:35 +0000</created>
                <updated>Tue, 15 Jan 2019 00:58:26 +0000</updated>
                                                                                <due></due>
                            <votes>0</votes>
                                    <watches>4</watches>
                                                                            <comments>
                            <comment id="239845" author="ashehata" created="Fri, 11 Jan 2019 21:48:29 +0000"  >&lt;p&gt;when you bring down the client and remount it, the client should be retriggering a discovery round, which would update the local peer on the servers.&lt;/p&gt;

&lt;p&gt;are you able to get net/neterror logging from when the client fails to mount on both client and server to see the reason for the failure?&lt;/p&gt;</comment>
                            <comment id="239850" author="ashehata" created="Fri, 11 Jan 2019 23:40:35 +0000"  >&lt;p&gt;Another question, has the config on the client changed? Was the first (primary) NID removed?&lt;/p&gt;

&lt;p&gt;IE the config went from:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
    - primary nid: 10.0.10.184@o2ib10
      Multi-Rail: True &amp;lt;------ Still Multi-rail
      peer ni:
        - nid: 10.0.10.184@o2ib10
          state: NA
        - nid: 10.2.10.184@o2ib10
          state: NA &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;to&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
    - primary nid: 10.0.10.184@o2ib10
      Multi-Rail: True &amp;lt;------ Still Multi-rail
      peer ni:
        - nid: 10.2.10.184@o2ib10
          state: NA &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;?&lt;/p&gt;</comment>
                            <comment id="239852" author="sihara" created="Sat, 12 Jan 2019 00:17:30 +0000"  >&lt;blockquote&gt;&lt;p&gt;Another question, has the config on the client changed? Was the first (primary) NID removed?&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;Yes, the client NID state was updated on the client side after changing to non multi-rail. Please see below.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@s184 ~]# lustre_rmmod 
[root@s184 ~]# mount -t lustre 10.0.11.90@o2ib10:/cache1 /cache1
mount.lustre: mount 10.0.11.90@o2ib10:/cache1 at /cache1 failed: Input/output error
Is the MGS running?
[root@s184 ~]# lnetctl net show
net:
    - net type: lo
      local NI(s):
        - nid: 0@lo
          status: up
    - net type: o2ib10
      local NI(s):
        - nid: 10.0.10.184@o2ib10
          status: up
          interfaces:
              0: ib0
[root@s184 ~]# ssh 10.0.11.90 lnetctl peer show 
peer:
    - primary nid: 0@lo
      Multi-Rail: False
      peer ni:
        - nid: 0@lo
          state: NA
    - primary nid: 10.0.11.92@o2ib10
      Multi-Rail: True
      peer ni:
        - nid: 10.0.11.92@o2ib10
          state: NA
        - nid: 10.1.11.92@o2ib10
          state: NA
    - primary nid: 10.0.11.91@o2ib10
      Multi-Rail: True
      peer ni:
        - nid: 10.0.11.91@o2ib10
          state: NA
        - nid: 10.1.11.91@o2ib10
          state: NA
    - primary nid: 10.0.11.93@o2ib10
      Multi-Rail: True
      peer ni:
        - nid: 10.0.11.93@o2ib10
          state: NA
        - nid: 10.1.11.93@o2ib10
          state: NA
    - primary nid: 10.0.10.184@o2ib10
      Multi-Rail: True
      peer ni:
        - nid: 10.0.10.184@o2ib10
          state: NA
        - nid: 10.2.10.184@o2ib10
          state: NA
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;I&apos;m collecting the debug log and will upload it shortly.&lt;/p&gt;</comment>
                            <comment id="239854" author="sihara" created="Sat, 12 Jan 2019 00:35:54 +0000"  >&lt;p&gt;Attached debug logs (net/neterr) lctl-dk-es14k-vm1.txt (one of the servers) and lctl-dk-s184-vm1.txt (client) from when the problem was reproduced. I tested several times and it seems the NI update sometimes worked properly, but sometimes doesn&apos;t work.&lt;/p&gt;</comment>
                            <comment id="239944" author="ashehata" created="Tue, 15 Jan 2019 00:58:26 +0000"  >&lt;p&gt;There is an issue with the discovery mechanism. If you bring a peer down and then up with a changed NID list, the discovery mechanism will not pick the change up. This will result in the communication errors you&apos;re seeing. This has already been fixed as part of the Multi-Rail Router feature.&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://review.whamcloud.com/#/c/33304/10&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/33304/10&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Is this an urgent issue that would require back porting this change?&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="31769" name="lctl-dk-es14k-vm1.txt" size="4292608" author="sihara" created="Sat, 12 Jan 2019 00:33:45 +0000"/>
                            <attachment id="31770" name="lctl-dk-s184.txt" size="93516" author="sihara" created="Sat, 12 Jan 2019 00:33:39 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i009b3:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>