<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:51:09 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-12274] Clients aren&apos;t connecting to OST defined failover.node</title>
                <link>https://jira.whamcloud.com/browse/LU-12274</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;I tried running tunefs.lustre and successfully changed the failover NIDs to what they should be. This problem is happening on several OSTs, but fixing one should fix them all. &lt;/p&gt;

&lt;p&gt;I&apos;m assuming I forgot a step when I ran tunefs.lustre. &lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;tunefs.lustre --erase-param failover.node --param failover.node=172.17.1.103@o2ib,172.16.1.103@tcp1 /dev/mapper/mpathg
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
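&lt;p&gt;(For completeness, a minimal sketch of the surrounding steps; the mount point /mnt/ost0017 is hypothetical, and tunefs.lustre has to be run while the target is stopped, with the target remounted afterwards:)&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;oss# umount /mnt/ost0017                  # target must be unmounted first
oss# tunefs.lustre --erase-param failover.node \
         --param failover.node=172.17.1.103@o2ib,172.16.1.103@tcp1 /dev/mapper/mpathg
oss# mount -t lustre /dev/mapper/mpathg /mnt/ost0017   # remount the target
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;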

&lt;p&gt;The OST OST0017 is mounted on 172.17.1.103 with the following parameters: &lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@apslstr03 ~]# tunefs.lustre --dryrun /dev/mapper/mpathg
checking for existing Lustre data: found
Reading CONFIGS/mountdata

   Read previous values:
Target:     lustrefc-OST0017
Index:      23
Lustre FS:  lustrefc
Mount type: ldiskfs
Flags:      0x2
              (OST )
Persistent mount opts: ,errors=remount-ro
Parameters:  failover.node=172.17.1.103@o2ib,172.16.1.103@tcp1 mgsnode=172.17.1.112@o2ib,172.16.1.112@tcp1 mgsnode=172.17.1.113@o2ib,172.16.1.113@tcp1

   Permanent disk data:
Target:     lustrefc-OST0017
Index:      23
Lustre FS:  lustrefc
Mount type: ldiskfs
Flags:      0x2
              (OST )
Persistent mount opts: ,errors=remount-ro
Parameters:  failover.node=172.17.1.103@o2ib,172.16.1.103@tcp1 mgsnode=172.17.1.112@o2ib,172.16.1.112@tcp1 mgsnode=172.17.1.113@o2ib,172.16.1.113@tcp1

exiting before disk write.
[root@apslstr03 ~]#
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;However, the clients are still displaying errors like this: &lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;May  8 11:43:33 localhost kernel: Lustre: 2028:0:(client.c:2114:ptlrpc_expire_one_request()) @@@ Request sent has timed out for sent delay: [sent 1557333772/real 0]  req@ffff880bd9296f00 x1632920191594624/t0(0) o8-&amp;gt;lustrefc-OST0017-osc-ffff8817ef372000@172.17.1.106@o2ib:28/4 lens 520/544 e 0 to 1 dl 1557333813 ref 2 fl Rpc:XN/0/ffffffff rc 0/-1
May  8 11:43:33 localhost kernel: Lustre: 2028:0:(client.c:2114:ptlrpc_expire_one_request()) Skipped 65 previous similar messages
May  8 11:45:26 localhost kernel: LNet: 1994:0:(o2iblnd_cb.c:3192:kiblnd_check_conns()) Timed out tx for 172.17.1.106@o2ib: 3 seconds
May  8 11:45:26 localhost kernel: LNet: 1994:0:(o2iblnd_cb.c:3192:kiblnd_check_conns()) Skipped 39 previous similar messages
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment>Servers: Lustre-2.10.2, Kernel: 3.10.0-693.5.2.el7_lustre.x86_64&lt;br/&gt;
Clients: Lustre-2.10.3, Kernel: 3.10.0-693.21.1.el7.x86_64&lt;br/&gt;
Client/Server OS: CentOS Linux release 7.4.1708</environment>
        <key id="55591">LU-12274</key>
            <summary>Clients aren&apos;t connecting to OST defined failover.node</summary>
                <type id="9" iconUrl="https://jira.whamcloud.com/images/icons/issuetypes/undefined.png">Question/Request</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="sebastien">Sebastien Buisson</assignee>
                                    <reporter username="rs1">Roger Sersted</reporter>
                        <labels>
                    </labels>
                <created>Wed, 8 May 2019 19:47:12 +0000</created>
                <updated>Sun, 12 May 2019 14:38:37 +0000</updated>
                                            <version>Lustre 2.10.2</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>4</watches>
                                                                            <comments>
                            <comment id="246888" author="pjones" created="Thu, 9 May 2019 14:29:36 +0000"  >&lt;p&gt;Sebastien&lt;/p&gt;

&lt;p&gt;Could you please advise here?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="246896" author="sebastien" created="Thu, 9 May 2019 16:23:31 +0000"  >&lt;p&gt;Hi,&lt;/p&gt;

&lt;p&gt;Did you run the tunefs.lustre commands while the targets were stopped (i.e. unmounted)?&lt;/p&gt;</comment>
                            <comment id="246899" author="rs1" created="Thu, 9 May 2019 16:48:00 +0000"  >&lt;p&gt;I unmounted the OSTs inquestion.&#160; OSTs not being modified were mounted.&#160; The MDT and MGT were both mounted.&lt;/p&gt;</comment>
                            <comment id="246954" author="sebastien" created="Fri, 10 May 2019 07:37:45 +0000"  >&lt;p&gt;I think we will need to have a look at the Lustre Logs on the MGS.&lt;br/&gt;
Could you please run the following commands on the MGS and attach the lustrefc-client.txt file to this ticket (output of llog_reader)?&lt;/p&gt;

&lt;p&gt;Assuming your Lustre file system name is lustrefc, that would be:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;mgs# debugfs -c -R &apos;dump CONFIGS/lustrefc-client /tmp/lustrefc-client&apos; &amp;lt;mgt device&amp;gt;
mgs# llog_reader /tmp/lustrefc-client &amp;gt; /tmp/lustrefc-client.txt
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
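
&lt;p&gt;(For reference: debugfs -c opens the ldiskfs-backed MGT read-only, so the dump can be taken without mounting the device, and llog_reader converts the binary configuration llog into readable text.)&lt;/p&gt;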

&lt;p&gt;Thanks.&lt;/p&gt;
</comment>
                            <comment id="246972" author="rs1" created="Fri, 10 May 2019 14:35:43 +0000"  >&lt;p&gt;I have attached the requested output. I should add, my cluster is down due to this problem.&lt;/p&gt;</comment>
                            <comment id="246977" author="sebastien" created="Fri, 10 May 2019 15:14:33 +0000"  >&lt;p&gt;As the name suggests, failover node parameter serves the purpose of specifying failover NIDs for targets. It does not reflect the primary NID of a target.&lt;/p&gt;

&lt;p&gt;Now that you mention your cluster is down, I wonder whether your targets have been moved so that their primary NID is now different. If the targets did not move and they all run on their primary node, then this failover.node change should not lead to any downtime.&lt;/p&gt;</comment>
                            <comment id="246979" author="rs1" created="Fri, 10 May 2019 15:58:20 +0000"  >&lt;p&gt;I&apos;m in the process of updating the Lustre servers.&#160; I unmounted the OSTs on one set of servers and mounted them on to their HA partners.&#160; I have done this in the past with one node and it worked fine.&#160; I have 6 OSSes configured in HA pairs.&#160; I am not running any HA software.&#160; If a server fails, I manually unmount and then mount to the HA partner.&#160;&lt;/p&gt;</comment>
                            <comment id="246980" author="sebastien" created="Fri, 10 May 2019 16:14:42 +0000"  >&lt;p&gt;I can see in the llog_reader output that target lustrefc-OST0017 for instance is still registered with NIDs 172.17.1.105 and 172.17.1.106. It explains the error messages on the clients.&lt;/p&gt;

&lt;p&gt;I just noticed this important message in the Lustre Operations Manual:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;If a --failnode option is added to a target to designate a failover server for the target, the
target must be re-mounted on the primary node before the --failnode option takes effect
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;So the problem you are facing could be that, after tunefs.lustre, the target was mounted directly on the failover node, e.g. on 172.17.1.103 for target lustrefc-OST0017. Does that make sense?&lt;/p&gt;
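
&lt;p&gt;(A minimal sketch of the recovery sequence; the host prompts and the mount point /mnt/ost0017 are hypothetical, and the primary node is whichever node the target was originally registered from:)&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# on the failover node currently serving the target
oss-failover# umount /mnt/ost0017

# on the primary node: mount once so the new failover.node takes effect
oss-primary# mount -t lustre /dev/mapper/mpathg /mnt/ost0017

# only afterwards, fail over again if needed
oss-primary# umount /mnt/ost0017
oss-failover# mount -t lustre /dev/mapper/mpathg /mnt/ost0017
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;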

&lt;p&gt;Targets should be mounted on the primary node right after a tunefs.lustre that changes the failnodes, and only after that failed over to a secondary node.&lt;/p&gt;</comment>
                            <comment id="246984" author="rs1" created="Fri, 10 May 2019 18:29:58 +0000"  >&lt;p&gt;Great catch on that.&#160; I went through all of my OSTs and remounted and checked the failover setting.&#160; I then remounted to the HA partner and the filesystem is working.&#160; Quick question, &quot;How would I change primary NID of an OST?&quot;&#160; Would I specify the &quot;servicenode&quot; option?&lt;/p&gt;</comment>
                            <comment id="247037" author="sebastien" created="Sun, 12 May 2019 14:38:37 +0000"  >&lt;p&gt;Good to hear!&lt;/p&gt;

&lt;p&gt;If you need to change a primary NID, I would advise following the dedicated instructions in the Lustre Operations Manual:&lt;br/&gt;
&lt;a href=&quot;http://doc.lustre.org/lustre_manual.xhtml#dbdoclet.changingservernid&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://doc.lustre.org/lustre_manual.xhtml#dbdoclet.changingservernid&lt;/a&gt;&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="32558" name="lustrefc-client.txt" size="42408" author="rs1" created="Fri, 10 May 2019 14:32:27 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                    <customfield id="customfield_10030" key="com.atlassian.jira.plugin.system.customfieldtypes:labels">
                        <customfieldname>Epic/Theme</customfieldname>
                        <customfieldvalues>
                                        <label>Lustre-2.10.2</label>
    
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00fzj:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                </customfields>
    </item>
</channel>
</rss>