<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:38:40 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-10842] Recovery stalls when target is failed over to failover partner</title>
                <link>https://jira.whamcloud.com/browse/LU-10842</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;On our testbed filesystem lquake, I caused an OST to failover by unmounting OST0000 from jet17 and mounting it on jet18. The target successfully mounted on the failover node, but it appears the node is stuck recovering the newly acquired OST. Below is some information I collected. The system is stuck in the perpetual recovery state if anyone needs more information.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@jet18:~]# cat /proc/fs/lustre/obdfilter/lquake-OST0000/recovery_status 
status: RECOVERING
recovery_start: 0
time_remaining: 0
connected_clients: 0/91
req_replay_clients: 0
lock_repay_clients: 0
completed_clients: 0
evicted_clients: 0
replayed_requests: 0
queued_requests: 0
next_transno: 352189672717
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Clients are repeatedly logging the following:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[Thu Mar 22 10:13:16 2018] Lustre: 18447:0:(client.c:2109:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1521738740/real 1521738740]  req@ffff881ff260dd00 x1595482252012848/t0(0) o8-&amp;gt;lquake-OST0000-osc-ffff8801688d7800@172.19.1.127@o2ib100:28/4 lens 520/544 e 0 to 1 dl 1521738795 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Some MDSs are seeing the following message:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[Thu Mar 22 09:38:26 2018] Lustre: lquake-OST0000-osc-MDT0001: Connection to lquake-OST0000 (at 172.19.1.127@o2ib100) was lost; in progress operations using this service will wait for recovery to complete
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;All MDSs appear to be logging the following message repeatedly:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[Thu Mar 22 09:41:27 2018] Lustre: 16423:0:(client.c:2114:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1521736826/real 1521736826]  req@ffff883f416cbc00 x1595592082316800/t0(0) o8-&amp;gt;lquake-OST0000-osc-MDT0001@172.19.1.127@o2ib100:28/4 lens 520/544 e 0 to 1 dl 1521736881 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Is there any other info that you need? Will this filesystem ever recover? Will these connections ever timeout?&lt;/p&gt;</description>
                <environment></environment>
        <key id="51480">LU-10842</key>
            <summary>Recovery stalls when target is failed over to failover partner</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="6">Not a Bug</resolution>
                                        <assignee username="wc-triage">WC Triage</assignee>
                                    <reporter username="dinatale2">Giuseppe Di Natale</reporter>
                        <labels>
                            <label>llnl</label>
                    </labels>
                <created>Thu, 22 Mar 2018 17:45:47 +0000</created>
                <updated>Fri, 23 Mar 2018 22:01:42 +0000</updated>
                            <resolved>Fri, 23 Mar 2018 22:01:42 +0000</resolved>
                                    <version>Lustre 2.10.3</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>4</watches>
                                                                            <comments>
                            <comment id="224294" author="ofaaland" created="Thu, 22 Mar 2018 17:55:13 +0000"  >&lt;p&gt;Joe,&lt;/p&gt;
&lt;ol&gt;
	&lt;li&gt;What date and time was it when you umounted on jet17, and when you mounted on jet18?&lt;/li&gt;
	&lt;li&gt;What does &quot;status: &quot; say in the recovery_status file for all the MDTs?&lt;/li&gt;
&lt;/ol&gt;
</comment>
                            <comment id="224296" author="dinatale2" created="Thu, 22 Mar 2018 18:07:10 +0000"  >&lt;p&gt;Unmounted on jet17 and mounted on jet18 at Mar 22 09:36:33.&lt;/p&gt;

&lt;p&gt;Recovery status for all MDTs:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@jeti:~]# pdsh -g mds &apos;cat /proc/fs/lustre/mdt/*/recovery_status&apos; | dshbak -c
----------------
ejet1
----------------
status: COMPLETE
recovery_start: 1521675012
recovery_duration: 502
completed_clients: 90/91
replayed_requests: 3003
last_transno: 339312468139
VBR: ENABLED
IR: DISABLED
----------------
ejet2
----------------
status: COMPLETE
recovery_start: 1521675193
recovery_duration: 319
completed_clients: 90/90
replayed_requests: 0
last_transno: 365072220160
VBR: DISABLED
IR: DISABLED
----------------
ejet3
----------------
status: COMPLETE
recovery_start: 1521736312
recovery_duration: 71
completed_clients: 90/90
replayed_requests: 0
last_transno: 347892350976
VBR: DISABLED
IR: ENABLED
----------------
ejet4
----------------
status: COMPLETE
recovery_start: 1521675875
recovery_duration: 70
completed_clients: 90/90
replayed_requests: 0
last_transno: 352187318272
VBR: DISABLED
IR: ENABLED
----------------
ejet5
----------------
status: COMPLETE
recovery_start: 1521675438
recovery_duration: 75
completed_clients: 90/90
replayed_requests: 0
last_transno: 335007449088
VBR: DISABLED
IR: ENABLED
----------------
ejet6
----------------
status: COMPLETE
recovery_start: 1521675438
recovery_duration: 75
completed_clients: 90/90
replayed_requests: 0
last_transno: 343597383680
VBR: DISABLED
IR: ENABLED
----------------
ejet7
----------------
status: COMPLETE
recovery_start: 1521675438
recovery_duration: 75
completed_clients: 90/90
replayed_requests: 0
last_transno: 330712481792
VBR: DISABLED
IR: ENABLED
----------------
ejet8
----------------
status: COMPLETE
recovery_start: 1521675438
recovery_duration: 75
completed_clients: 90/90
replayed_requests: 0
last_transno: 339302416384
VBR: DISABLED
IR: ENABLED
----------------
ejet9
----------------
status: COMPLETE
recovery_start: 1521675439
recovery_duration: 75
completed_clients: 90/90
replayed_requests: 0
last_transno: 390842023936
VBR: DISABLED
IR: ENABLED
----------------
ejet10
----------------
status: COMPLETE
recovery_start: 1521675439
recovery_duration: 75
completed_clients: 90/90
replayed_requests: 0
last_transno: 373662154752
VBR: DISABLED
IR: ENABLED
----------------
ejet[11,14]
----------------
status: COMPLETE
recovery_start: 1521675439
recovery_duration: 75
completed_clients: 90/90
replayed_requests: 0
last_transno: 339302416384
VBR: DISABLED
IR: ENABLED
----------------
ejet12
----------------
status: COMPLETE
recovery_start: 1521675439
recovery_duration: 75
completed_clients: 90/90
replayed_requests: 0
last_transno: 343597383680
VBR: DISABLED
IR: ENABLED
----------------
ejet13
----------------
status: COMPLETE
recovery_start: 1521675462
recovery_duration: 59
completed_clients: 90/90
replayed_requests: 0
last_transno: 335007449088
VBR: DISABLED
IR: ENABLED
----------------
ejet15
----------------
status: COMPLETE
recovery_start: 1521675439
recovery_duration: 75
completed_clients: 90/90
replayed_requests: 0
last_transno: 468151435264
VBR: DISABLED
IR: ENABLED
----------------
ejet16
----------------
status: COMPLETE
recovery_start: 1521675439
recovery_duration: 75
completed_clients: 90/90
replayed_requests: 0
last_transno: 356482285568
VBR: DISABLED
IR: ENABLED
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;For completeness, OSS recovery status:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;----------------
ejet18
----------------
status: RECOVERING
recovery_start: 0
time_remaining: 0
connected_clients: 0/91
req_replay_clients: 0
lock_repay_clients: 0
completed_clients: 0
evicted_clients: 0
replayed_requests: 0
queued_requests: 0
next_transno: 352189672717
status: COMPLETE
recovery_start: 1521666573
recovery_duration: 71
completed_clients: 90/90
replayed_requests: 0
last_transno: 343597406813
VBR: DISABLED
IR: DISABLED
----------------
ejet19
----------------
status: COMPLETE
recovery_start: 1521666577
recovery_duration: 71
completed_clients: 90/90
replayed_requests: 0
last_transno: 343597408098
VBR: DISABLED
IR: DISABLED
----------------
ejet20
----------------
status: COMPLETE
recovery_start: 1521666584
recovery_duration: 64
completed_clients: 90/90
replayed_requests: 0
last_transno: 352187344607
VBR: DISABLED
IR: DISABLED
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="224297" author="ofaaland" created="Thu, 22 Mar 2018 18:09:53 +0000"  >&lt;p&gt;What are the NIDs assigned to jet17 and jet18?&lt;/p&gt;</comment>
                            <comment id="224298" author="dinatale2" created="Thu, 22 Mar 2018 18:13:21 +0000"  >&lt;p&gt;NIDs are as follows:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;jet17            172.19.1.127@o2ib100
jet18            172.19.1.128@o2ib100 &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="224428" author="green" created="Fri, 23 Mar 2018 17:13:30 +0000"  >&lt;p&gt;So it looks like OST_CONNECT messages destined to the failed over node don&apos;t get through? And so recovery does not start on OST either. So can you make sure networking and targets are all doing the expected thing? Can you do lnet pings to the target addresses from clients?&lt;/p&gt;</comment>
                            <comment id="224431" author="adilger" created="Fri, 23 Mar 2018 17:26:07 +0000"  >&lt;p&gt;From the messages that are posted here, it appears that the clients and MDS are only trying to connect to jet17 (NID 172.19.1.127) and not to jet18 at all. The recovery status on jet18 also shows that no clients have tried to connect, and recovery hasn&apos;t even started for that OST.  That could be an incorrect conclusion based on the three lines of console messages available here...&lt;/p&gt;


&lt;p&gt;My guess is that the OST0000 is not configured properly for failover. You should check the failover config for OST0000 on the MGS like: &lt;tt&gt;lctl --device MGS llog_print lquake-client&lt;/tt&gt;&lt;/p&gt;</comment>
                            <comment id="224435" author="dinatale2" created="Fri, 23 Mar 2018 18:14:42 +0000"  >&lt;p&gt;Andreas, looking at the output from lctl, I think you are right. None of my MDTs are listed in the catalog file twice with different nids. I could&apos;ve sworn we set it up... let me try and configure failover again. I&apos;ll post back shortly.&lt;/p&gt;</comment>
                            <comment id="224454" author="dinatale2" created="Fri, 23 Mar 2018 21:30:10 +0000"  >&lt;p&gt;Andreas, looks like that was it. I set the failover nodes with lctl and all is well again. Sorry for the noise!&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="224455" author="dinatale2" created="Fri, 23 Mar 2018 21:32:19 +0000"  >&lt;p&gt;Looks like I can&apos;t edit the tags once an issue is closed. Peter, do you mind removing the topllnl tag? Thanks again for the help!&lt;/p&gt;</comment>
                            <comment id="224456" author="pjones" created="Fri, 23 Mar 2018 22:01:42 +0000"  >&lt;p&gt;Yup. Things are totally locked down once tickets are moved to Closed state&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzzupr:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>