<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:53:58 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-5724] IR recovery doesn&apos;t behave properly with Lustre 2.5</title>
                <link>https://jira.whamcloud.com/browse/LU-5724</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Today we experienced a hardware failure with our MDS. The MDS rebooted and then came back. We restarted the MDS but IR behaved strangely. Four clients got evicted but when the timer to completion got down to zero IR restarted all over again. Then once it got to the 700 second range the timer started to go up. It did this a few times before letting the timer run out. Once the timer did finally get to zero the recovery state was reported as still being in recovery. It remained this way for several more minutes before finally being in a recovered state. In all it took 54 minutes to recover.&lt;/p&gt;</description>
                <environment>MDS server running RHEL6.5 running ORNL 2.5.3 branch with about 12 patches.</environment>
        <key id="26957">LU-5724</key>
            <summary>IR recovery doesn&apos;t behave properly with Lustre 2.5</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="6" iconUrl="https://jira.whamcloud.com/images/icons/statuses/closed.png" description="The issue is considered finished, the resolution is correct. Issues which are closed can be reopened.">Closed</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="3">Duplicate</resolution>
                                        <assignee username="hongchao.zhang">Hongchao Zhang</assignee>
                                    <reporter username="simmonsja">James A Simmons</reporter>
                        <labels>
                            <label>llnl</label>
                            <label>ornl</label>
                            <label>p4o</label>
                    </labels>
                <created>Fri, 10 Oct 2014 19:51:27 +0000</created>
                <updated>Fri, 20 Feb 2015 14:59:30 +0000</updated>
                            <resolved>Fri, 20 Feb 2015 14:59:30 +0000</resolved>
                                    <version>Lustre 2.5.3</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>16</watches>
                                                                            <comments>
                            <comment id="96147" author="simmonsja" created="Fri, 10 Oct 2014 19:56:07 +0000"  >&lt;p&gt;I attached the dmesg but sorry we didn&apos;t collect any data this time.&lt;/p&gt;</comment>
                            <comment id="96148" author="simmonsja" created="Fri, 10 Oct 2014 20:00:59 +0000"  >&lt;p&gt;It was noticed that when a client reconnects the replay timer goes up a bit. Is this proper behavior.&lt;/p&gt;</comment>
                            <comment id="96149" author="pjones" created="Fri, 10 Oct 2014 20:08:42 +0000"  >&lt;p&gt;Jinshan&lt;/p&gt;

&lt;p&gt;Could you please comment?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="96156" author="jay" created="Fri, 10 Oct 2014 21:03:14 +0000"  >&lt;p&gt;From the dmesg, IR is enabled on this server:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Oct 10 12:00:56 atlas-mds1.ccs.ornl.gov kernel: [  280.655680] Lustre: atlas1-MDT0000: Imperative Recovery enabled, recovery window shrunk from 1800-5400 down to 900-2700
Oct 10 12:00:56 atlas-mds1.ccs.ornl.gov kernel: [  280.661720] Lustre: atlas1-MDT0000: Will be in recovery for at least 15:00, or until 20178 clients reconnect
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;After 15 minutes, the recovery timed out and 4 clients were evicted.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Oct 10 12:15:56 atlas-mds1.ccs.ornl.gov kernel: [ 1181.536843] Lustre: atlas1-MDT0000: recovery is timed out, evict stale exports
Oct 10 12:15:56 atlas-mds1.ccs.ornl.gov kernel: [ 1181.547322] Lustre: atlas1-MDT0000: disconnecting 4 stale clients
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;There were few clients couldn&apos;t reconnect to the server because one active RPC still existed:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Oct 10 12:17:53 atlas-mds1.ccs.ornl.gov kernel: [ 1297.846751] Lustre: atlas1-MDT0000: Client 31fe9dcf-fbb8-41ea-72d2-cdae9b6dd941 (at 15449@gni100) refused reconnection, still busy with 1 active RPCs
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I suspect this is the exact reason why it couldn&apos;t be recovered. There are few bugs about this `active RPCs&apos;, we will have to investigate if this is issue has already been fixed.&lt;/p&gt;

&lt;p&gt;Finally, the MDT lost its temper and those clients were evicted finally.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Oct 10 12:51:28 atlas-mds1.ccs.ornl.gov kernel: [ 3315.271837] Lustre: 19764:0:(ldlm_lib.c:2088:target_recovery_thread()) too long recovery - read logs
Oct 10 12:51:28 atlas-mds1.ccs.ornl.gov kernel: [ 3315.282307] LustreError: dumping log to /tmp/lustre-log.1412959888.19764
Oct 10 12:51:28 atlas-mds1.ccs.ornl.gov kernel: [ 3315.282310] Lustre: atlas1-MDT0000: Recovery over after 50:32, of 20178 clients 20171 recovered and 7 were evicted.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;From the log, there are 20178 clients in total, and 4 clients couldn&apos;t reconnect in the first place so they were evicted after IR recovery window timed out. The other 3 got into `active RPCs&apos; trouble and the MDT has to wait for them to finish recovery. I feel that IR was working well in this example. As you know, IR can only help notify the clients the events of server restart, obviously the recovery took way too long time to finish.&lt;/p&gt;

&lt;p&gt;Another message popped out many times in the log&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Lustre: mdt: This server is not able to keep up with request traffic (cpu-bound).
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;It seems like the MDT can&apos;t handle the RPCs in time so the RPC expired. Can you please verify that the CPU was super busy at that time?&lt;/p&gt;</comment>
                            <comment id="96171" author="ezell" created="Sat, 11 Oct 2014 02:40:46 +0000"  >&lt;p&gt;Hi Jinshan- Thanks for the analysis.&lt;/p&gt;

&lt;p&gt;After we recently upgraded clients and servers to 2.5.x, we hit &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5651&quot; title=&quot;ASSERTION( req-&amp;gt;rq_export-&amp;gt;exp_lock_replay_needed ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5651&quot;&gt;&lt;del&gt;LU-5651&lt;/del&gt;&lt;/a&gt; (issue covered in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5719&quot; title=&quot;target_queue_recovery_request() ASSERTION failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5719&quot;&gt;&lt;del&gt;LU-5719&lt;/del&gt;&lt;/a&gt;).  To resolve, we backed out the patch from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-793&quot; title=&quot;Reconnections should not be refused when there is a request in progress from this client.&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-793&quot;&gt;&lt;del&gt;LU-793&lt;/del&gt;&lt;/a&gt;.  This could have caused new instances of the &quot;still busy with 1 active RPC&quot; messages.&lt;/p&gt;

&lt;p&gt;While the MDS was trying to recover, I was logged in looking at &apos;top&apos; and &apos;perf top&apos; to see the current load.  Neither became excessive, so I wonder if there&apos;s a situation where the &quot;server is not able to keep up with request traffic&quot; message may be emitted for reasons other than excessive CPU usage.  It is emitted when the server tries to send early replies but it&apos;s already past the request deadline.  When in recovery, does that metric even make sense?  I fear that it&apos;s not CPU usage but some other issue (deadlock?) causing the problem.&lt;/p&gt;

&lt;p&gt;Maybe I don&apos;t understand the normal recovery sequence, but at 12:15:56 when recovery timed out and 4 clients were evicted, why did recovery continue and run into the &quot;still busy&quot; clients?  Then it had to wait until 12:51:28 for the 3 additional ones?  Why wasn&apos;t every non-compliant client evicted at that time so the MDS could move on with its life?&lt;/p&gt;

&lt;p&gt;Anyway, I think we need to get a stable fix for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5651&quot; title=&quot;ASSERTION( req-&amp;gt;rq_export-&amp;gt;exp_lock_replay_needed ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5651&quot;&gt;&lt;del&gt;LU-5651&lt;/del&gt;&lt;/a&gt; and re-enable the patch from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-793&quot; title=&quot;Reconnections should not be refused when there is a request in progress from this client.&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-793&quot;&gt;&lt;del&gt;LU-793&lt;/del&gt;&lt;/a&gt; to see if it improves the recovery process.  Other issues may persist, and that&apos;s what we need to track in this ticket.&lt;/p&gt;</comment>
                            <comment id="96205" author="hongchao.zhang" created="Mon, 13 Oct 2014 12:14:15 +0000"  >&lt;p&gt;Hi Matt,&lt;/p&gt;

&lt;p&gt;the extra recovery period is caused by VBR (version based recovery).  &quot;check_and_start_recovery_timer&quot; will start the first period of&lt;br/&gt;
recovery when some client is connecting, and the period is &quot;3*obd_timeout = 1800s = 30m&quot;, which is reduced by IR to 900s = 15m,&lt;br/&gt;
as per the log (&quot;check_for_clients&quot; in stack), the MDT has been waiting the connection of all clients in this period (4 clients didn&apos;t connect).&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;tgt_recov     D 000000000000000f     0 19764      2 0x00000000
Oct 10 12:14:20 atlas-mds1.ccs.ornl.gov kernel: [ 1084.914484]  ffff881fcaed1da0 0000000000000046 0000000000000000 ffff881fcaed1d64
Oct 10 12:14:20 atlas-mds1.ccs.ornl.gov kernel: [ 1084.923068]  0000009100000000 ffff88207fc28800 ffff88011c456880 0000000000000400
Oct 10 12:14:20 atlas-mds1.ccs.ornl.gov kernel: [ 1084.931650]  ffff881fcaecb058 ffff881fcaed1fd8 000000000000fbc8 ffff881fcaecb058
Oct 10 12:14:20 atlas-mds1.ccs.ornl.gov kernel: [ 1084.940231] Call Trace:
Oct 10 12:14:20 atlas-mds1.ccs.ornl.gov kernel: [ 1084.943131]  [&amp;lt;ffffffffa07ca620&amp;gt;] ? check_for_clients+0x0/0x70 [ptlrpc]
Oct 10 12:14:20 atlas-mds1.ccs.ornl.gov kernel: [ 1084.950680]  [&amp;lt;ffffffffa07cbc8d&amp;gt;] target_recovery_overseer+0x9d/0x230 [ptlrpc]
Oct 10 12:14:20 atlas-mds1.ccs.ornl.gov kernel: [ 1084.959026]  [&amp;lt;ffffffffa07ca310&amp;gt;] ? exp_connect_healthy+0x0/0x20 [ptlrpc]
Oct 10 12:14:20 atlas-mds1.ccs.ornl.gov kernel: [ 1084.966778]  [&amp;lt;ffffffff8109af00&amp;gt;] ? autoremove_wake_function+0x0/0x40
Oct 10 12:14:20 atlas-mds1.ccs.ornl.gov kernel: [ 1084.974140]  [&amp;lt;ffffffffa07d2550&amp;gt;] ? target_recovery_thread+0x0/0x1920 [ptlrpc]
Oct 10 12:14:20 atlas-mds1.ccs.ornl.gov kernel: [ 1084.982456]  [&amp;lt;ffffffffa07d2a90&amp;gt;] target_recovery_thread+0x540/0x1920 [ptlrpc]
Oct 10 12:14:20 atlas-mds1.ccs.ornl.gov kernel: [ 1084.990824]  [&amp;lt;ffffffff81061d12&amp;gt;] ? default_wake_function+0x12/0x20
Oct 10 12:14:20 atlas-mds1.ccs.ornl.gov kernel: [ 1084.997983]  [&amp;lt;ffffffffa07d2550&amp;gt;] ? target_recovery_thread+0x0/0x1920 [ptlrpc]
Oct 10 12:14:20 atlas-mds1.ccs.ornl.gov kernel: [ 1085.006277]  [&amp;lt;ffffffff8109ab56&amp;gt;] kthread+0x96/0xa0
Oct 10 12:14:20 atlas-mds1.ccs.ornl.gov kernel: [ 1085.011867]  [&amp;lt;ffffffff8100c20a&amp;gt;] child_rip+0xa/0x20
Oct 10 12:14:20 atlas-mds1.ccs.ornl.gov kernel: [ 1085.017531]  [&amp;lt;ffffffff8109aac0&amp;gt;] ? kthread+0x0/0xa0
Oct 10 12:14:20 atlas-mds1.ccs.ornl.gov kernel: [ 1085.023222]  [&amp;lt;ffffffff8100c200&amp;gt;] ? child_rip+0x0/0x20
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Oct 10 12:00:56 atlas-mds1.ccs.ornl.gov kernel: [  280.655680] Lustre: atlas1-MDT0000: Imperative Recovery enabled, recovery window shrunk from 1800-5400 down to 900-2700
Oct 10 12:00:56 atlas-mds1.ccs.ornl.gov kernel: [  280.661720] Lustre: atlas1-MDT0000: Will be in recovery for at least 15:00, or until 20178 clients reconnect
...
Oct 10 12:15:56 atlas-mds1.ccs.ornl.gov kernel: [ 1181.536843] Lustre: atlas1-MDT0000: recovery is timed out, evict stale exports
Oct 10 12:15:56 atlas-mds1.ccs.ornl.gov kernel: [ 1181.547322] Lustre: atlas1-MDT0000: disconnecting 4 stale clients
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;after the recovery timer expired, there will be another recovery period to startup (VBR &amp;#8211; version based recovery), and the  &lt;br/&gt;
is expired, obd_device-&amp;gt;obd_version_recov will be set and the extra recovery period (the deadline is 2700s = 45m) is started&lt;br/&gt;
by calling &quot;extend_recovery_timer&quot; in &quot;check_and_start_recovery_timer&quot; and &quot;handle_recovery_req&quot;. and the recovery timer&lt;br/&gt;
expired while waiting the request with the next transno to replay (&quot;check_for_next_transno&quot; in stack), and 1 client failed.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Oct 10 12:44:21 atlas-mds1.ccs.ornl.gov kernel: [ 2887.578366] tgt_recov     D 0000000000000015     0 19764      2 0x00000000
Oct 10 12:44:21 atlas-mds1.ccs.ornl.gov kernel: [ 2887.586215]  ffff881fcaed1da0 0000000000000046 ffff881fcaed1d00 ffff881fcaed1d64
Oct 10 12:44:21 atlas-mds1.ccs.ornl.gov kernel: [ 2887.594772]  ffffc9007f72b470 ffff88402a8af430 0000000000004ed2 0000000000004ece
Oct 10 12:44:21 atlas-mds1.ccs.ornl.gov kernel: [ 2887.603336]  ffff881fcaecb058 ffff881fcaed1fd8 000000000000fbc8 ffff881fcaecb058
Oct 10 12:44:21 atlas-mds1.ccs.ornl.gov kernel: [ 2887.611889] Call Trace:
Oct 10 12:44:21 atlas-mds1.ccs.ornl.gov kernel: [ 2887.614787]  [&amp;lt;ffffffffa07cea60&amp;gt;] ? check_for_next_transno+0x0/0x590 [ptlrpc]
Oct 10 12:44:21 atlas-mds1.ccs.ornl.gov kernel: [ 2887.622914]  [&amp;lt;ffffffffa07cbc8d&amp;gt;] target_recovery_overseer+0x9d/0x230 [ptlrpc]
Oct 10 12:44:21 atlas-mds1.ccs.ornl.gov kernel: [ 2887.631242]  [&amp;lt;ffffffffa07ca330&amp;gt;] ? exp_req_replay_healthy+0x0/0x30 [ptlrpc]
Oct 10 12:44:21 atlas-mds1.ccs.ornl.gov kernel: [ 2887.639246]  [&amp;lt;ffffffff8109af00&amp;gt;] ? autoremove_wake_function+0x0/0x40
Oct 10 12:44:21 atlas-mds1.ccs.ornl.gov kernel: [ 2887.646589]  [&amp;lt;ffffffffa07d2cba&amp;gt;] target_recovery_thread+0x76a/0x1920 [ptlrpc]
Oct 10 12:44:21 atlas-mds1.ccs.ornl.gov kernel: [ 2887.654880]  [&amp;lt;ffffffff81061d12&amp;gt;] ? default_wake_function+0x12/0x20
Oct 10 12:44:21 atlas-mds1.ccs.ornl.gov kernel: [ 2887.662034]  [&amp;lt;ffffffffa07d2550&amp;gt;] ? target_recovery_thread+0x0/0x1920 [ptlrpc]
Oct 10 12:44:21 atlas-mds1.ccs.ornl.gov kernel: [ 2887.670330]  [&amp;lt;ffffffff8109ab56&amp;gt;] kthread+0x96/0xa0
Oct 10 12:44:21 atlas-mds1.ccs.ornl.gov kernel: [ 2887.675928]  [&amp;lt;ffffffff8100c20a&amp;gt;] child_rip+0xa/0x20
Oct 10 12:44:21 atlas-mds1.ccs.ornl.gov kernel: [ 2887.681593]  [&amp;lt;ffffffff8109aac0&amp;gt;] ? kthread+0x0/0xa0
Oct 10 12:44:21 atlas-mds1.ccs.ornl.gov kernel: [ 2887.687255]  [&amp;lt;ffffffff8100c200&amp;gt;] ? child_rip+0x0/0x20
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Oct 10 12:45:20 atlas-mds1.ccs.ornl.gov kernel: [ 2946.578608] Lustre: atlas1-MDT0000: recovery is timed out, evict stale exports
Oct 10 12:45:20 atlas-mds1.ccs.ornl.gov kernel: [ 2946.588579] Lustre: atlas1-MDT0000: disconnecting 1 stale clients
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;the load of the system should be not high, for there are only one thread to process these replay request and the count&lt;br/&gt;
of clients is relative high (20178), then some request could be spent more time to trigger the log,&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Oct 10 12:16:27 atlas-mds1.ccs.ornl.gov kernel: [ 1212.683035] Lustre: 16227:0:(service.c:1304:ptlrpc_at_send_early_reply()) @@@ Already past deadline (-62s), not sending early reply. Consider increasing at_early_margin (5)?  req@ffff88404991d000 x1481333400177244/t0(0) o400-&amp;gt;dbcda88f-598d-9915-c1a2-3272067b8e42@18919@gni100:0/0 lens 224/0 e 1 to 0 dl 1412957725 ref 2 fl Complete:H/c0/ffffffff rc 0/-1
Oct 10 12:16:27 atlas-mds1.ccs.ornl.gov kernel: [ 1212.718387] Lustre: 16227:0:(service.c:1304:ptlrpc_at_send_early_reply()) Skipped 6595 previous similar messages
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="96498" author="hilljjornl" created="Thu, 16 Oct 2014 17:24:25 +0000"  >&lt;p&gt;After talking with James Nunez on the call today some more background information might be useful.&lt;/p&gt;

&lt;p&gt;There were 2 outages on Friday 10/10/. The first was the MDS rebooted spuriously and there was no known cause. After we returned to service more hardware validation and debugging occurred and a hardware fault was found. A second outage was taken to swap the MDS hardware (not MDT). The first outage had ~1.5 hours between when the mds was unresponsive and became responsive again. The second outage was no more than 20 minutes. The first outage had the issue described in this ticket; the second outage recovered successfully and cleanly in a much shorter time (less than 30 minutes).&lt;/p&gt;</comment>
                            <comment id="96499" author="jay" created="Thu, 16 Oct 2014 17:42:10 +0000"  >&lt;p&gt;30 minutes was pretty good for this size of cluster, in my personal opinion.&lt;/p&gt;</comment>
                            <comment id="96500" author="hilljjornl" created="Thu, 16 Oct 2014 17:43:55 +0000"  >&lt;p&gt;jinshan:&lt;/p&gt;

&lt;p&gt;We are happy with the second outage&apos;s performance. the 1.5 hours for the first outage&apos;s performance is where we have problems.&lt;/p&gt;
</comment>
                            <comment id="96533" author="hilljjornl" created="Thu, 16 Oct 2014 21:09:32 +0000"  >&lt;p&gt;We&apos;re currently in the holding pattern of the MDS having 0s remaining on the recovery timer but not exiting recovery. &lt;/p&gt;

&lt;p&gt;status: RECOVERING&lt;br/&gt;
recovery_start: 1413488962&lt;br/&gt;
time_remaining: 0&lt;br/&gt;
connected_clients: 20177/20178&lt;br/&gt;
req_replay_clients: 409&lt;br/&gt;
lock_repay_clients: 481&lt;br/&gt;
completed_clients: 19696&lt;br/&gt;
evicted_clients: 1&lt;br/&gt;
replayed_requests: 269&lt;br/&gt;
queued_requests: 408&lt;br/&gt;
next_transno: 148691449625&lt;/p&gt;

&lt;p&gt;Oct 16 17:05:17 atlas-mds3.ccs.ornl.gov kernel: [ 4769.501597] Lustre: mdt: This server is not able to keep up with request traffic (cpu-bound).&lt;br/&gt;
Oct 16 17:05:17 atlas-mds3.ccs.ornl.gov kernel: [ 4769.503568] Lustre: 14922:0:(service.c:1507:ptlrpc_at_check_timed()) earlyQ=1 reqQ=0 recA=0, svcEst=250, delay=0(jiff)&lt;br/&gt;
Oct 16 17:05:17 atlas-mds3.ccs.ornl.gov kernel: [ 4769.503571] Lustre: 14922:0:(service.c:1507:ptlrpc_at_check_timed()) Skipped 8 previous similar messages&lt;br/&gt;
Oct 16 17:05:17 atlas-mds3.ccs.ornl.gov kernel: [ 4769.503579] Lustre: 14922:0:(service.c:1304:ptlrpc_at_send_early_reply()) @@@ Already past deadline (&lt;del&gt;62s), not sending early reply. Consider increasing at_early_margin (5)? req@ffff883ff8de8400 x1481332130881448/t0(148691450005) o36&lt;/del&gt;&amp;gt;1304f9d3-b2ef-59fa-e0f8-5662fc869d82@173@gni3:0/0 lens 488/0 e 1 to 0 dl 1413493455 ref 2 fl Complete:/6/ffffffff rc 0/-1&lt;/p&gt;</comment>
                            <comment id="96534" author="ezell" created="Thu, 16 Oct 2014 21:16:05 +0000"  >&lt;p&gt;I just uploaded some debug logs to your FTP server:&lt;br/&gt;
/uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5724&quot; title=&quot;IR recovery doesn&amp;#39;t behave properly with Lustre 2.5&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5724&quot;&gt;&lt;del&gt;LU-5724&lt;/del&gt;&lt;/a&gt;/atlas-mds3-recovery-20141016-all.gz&lt;/p&gt;</comment>
                            <comment id="96570" author="hongchao.zhang" created="Fri, 17 Oct 2014 09:35:37 +0000"  >&lt;p&gt;according the log &quot;atlas-mds3-recovery-20141016-all.gz&quot;, there is one client failing to connect during the first phase of recovery(900s),&lt;br/&gt;
and the process of replay request is very slow (and there are so much reconnection requests from clients), which prolongs the recovery&lt;br/&gt;
to exceed the &quot;hard&quot; limit of the recovery (obd_recovery_time_hard), then the &quot;time_remaining&quot; in the &quot;recovery_status&quot; is &quot;0&quot;.&lt;/p&gt;

&lt;p&gt;Hi Matt,&lt;br/&gt;
Is there any difference between this recovery and the first one in the ticket (such as outputting more logs, etc)? and that one is much&lt;br/&gt;
faster in the second phase (VBR phase), and the recovery expired at the hard recovery limit. but in this new recovery instance, &lt;br/&gt;
replay requests are processed slowly and extend the recovery timer step by step to exceed the &quot;hard&quot; limit, the recovery is not aborted&lt;br/&gt;
for there are still health clients to recover (only very slow).&lt;/p&gt;

&lt;p&gt;the recovery speed depends on the clients for given the same &quot;sys.timeout&quot; parameter,  if some clients failed to connect for recovery,&lt;br/&gt;
the recovery could need long time to make sure the client is failed and to evict it.&lt;/p&gt;</comment>
                            <comment id="96623" author="ezell" created="Fri, 17 Oct 2014 18:43:24 +0000"  >&lt;p&gt;We run with:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[root@atlas-mds3 ~]# cat /proc/sys/lnet/debug
ioctl neterror warning error emerg ha config console
[root@atlas-mds3 ~]# cat /proc/sys/lnet/printk 
warning error emerg console
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;In the previous scenario, we only provided the output of what printk()&apos;ed.  For the more recent instance, I used &apos;lctl dk&apos; regularly to create the logs I provided.  I think for diagnosing recovery, the D_HA messages are helpful.&lt;/p&gt;

&lt;p&gt;In both instances, we had hardware issues that caused the MDS to go down.  I think we followed the same procedures when we tried to bring them back up.  It&apos;s possible that in the most recent instance there were still some lingering hardware issues.&lt;/p&gt;


&lt;p&gt;Is it normal for the stale client to be reported as just a UUID and not a NID?&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;atlas2-MDT0000: disconnect stale client 57ffbae2-6467-b467-fad5-58b51832006c@&amp;lt;unknown&amp;gt;
atlas2-MDT0000: EVICTING ffff881eaf853800 57ffbae2-6467-b467-fad5-58b51832006c (no nid) 2 (0 0 0) 0 0 1 0: (&lt;span class=&quot;code-keyword&quot;&gt;null&lt;/span&gt;)  146091266917
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;After the fact now, can we tell which node that was? (looking in the exports now, that UUID doesn&apos;t exist).  I&apos;d like to see what messages it logged.&lt;/p&gt;</comment>
                            <comment id="96638" author="simmonsja" created="Fri, 17 Oct 2014 21:14:44 +0000"  >&lt;p&gt;On our smaller scale test bed system applying the patch for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5079&quot; title=&quot;conf-sanity test_47 timeout&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5079&quot;&gt;&lt;del&gt;LU-5079&lt;/del&gt;&lt;/a&gt; seems to help resolve the recovery issues. Next week we will move to testing to a large size system. It looks like the patch for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4578&quot; title=&quot;Early replies do not honor at_max&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4578&quot;&gt;&lt;del&gt;LU-4578&lt;/del&gt;&lt;/a&gt; is what is causing the breakage. It is also impacting Lustre 2.7 as reported in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5077&quot; title=&quot;insanity test_1: out of memory on MDT in crypto_create_tfm()&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5077&quot;&gt;&lt;del&gt;LU-5077&lt;/del&gt;&lt;/a&gt;. We might have a fix &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;</comment>
                            <comment id="100223" author="pjones" created="Fri, 28 Nov 2014 13:55:17 +0000"  >&lt;p&gt;James&lt;/p&gt;

&lt;p&gt;Are you comfortable to close this as a duplicate of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5079&quot; title=&quot;conf-sanity test_47 timeout&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5079&quot;&gt;&lt;del&gt;LU-5079&lt;/del&gt;&lt;/a&gt;?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="100304" author="simmonsja" created="Mon, 1 Dec 2014 15:28:03 +0000"  >&lt;p&gt;The cause of our recovery issues was three things. They are &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5079&quot; title=&quot;conf-sanity test_47 timeout&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5079&quot;&gt;&lt;del&gt;LU-5079&lt;/del&gt;&lt;/a&gt;, &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5287&quot; title=&quot;(ldlm_lib.c:2253:target_queue_recovery_request()) ASSERTION( req-&amp;gt;rq_export-&amp;gt;exp_lock_replay_needed ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5287&quot;&gt;&lt;del&gt;LU-5287&lt;/del&gt;&lt;/a&gt;, and lastly &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5651&quot; title=&quot;ASSERTION( req-&amp;gt;rq_export-&amp;gt;exp_lock_replay_needed ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5651&quot;&gt;&lt;del&gt;LU-5651&lt;/del&gt;&lt;/a&gt;. Of those only &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5651&quot; title=&quot;ASSERTION( req-&amp;gt;rq_export-&amp;gt;exp_lock_replay_needed ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5651&quot;&gt;&lt;del&gt;LU-5651&lt;/del&gt;&lt;/a&gt; is left to be merged to b2_5. So this ticket should remain open until that patch lands.&lt;/p&gt;</comment>
                            <comment id="101661" author="simmonsja" created="Mon, 15 Dec 2014 22:42:49 +0000"  >&lt;p&gt;Today we tested the latest 2.5 lustre code with the following patches:&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-793&quot; title=&quot;Reconnections should not be refused when there is a request in progress from this client.&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-793&quot;&gt;&lt;del&gt;LU-793&lt;/del&gt;&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3338&quot; title=&quot;IOC_MDC_GETFILESTRIPE can abuse vmalloc()&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3338&quot;&gt;&lt;del&gt;LU-3338&lt;/del&gt;&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5485&quot; title=&quot;first mount always fail with avoid_asym_router_failure&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5485&quot;&gt;&lt;del&gt;LU-5485&lt;/del&gt;&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5651&quot; title=&quot;ASSERTION( req-&amp;gt;rq_export-&amp;gt;exp_lock_replay_needed ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5651&quot;&gt;&lt;del&gt;LU-5651&lt;/del&gt;&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5740&quot; title=&quot;Kernel upgrade [RHEL6.6 2.6.32-504.el6]&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5740&quot;&gt;&lt;del&gt;LU-5740&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;with a 500 client node. Recovery completely failed to complete. After an hour and 22 minutes we gave up and ended recovery. During recovery we lost an OSS node which I attached the lustre log it dumped. We also have a core I can post from that OSS as well.&lt;/p&gt;</comment>
                            <comment id="101726" author="simmonsja" created="Tue, 16 Dec 2014 17:52:50 +0000"  >&lt;p&gt;Some more info from today&apos;s testing. The failure to recover occurred when both the MDS and an OSS were failed over. If we did just an MDS or an OSS, recovery would complete. When we did the second round of testing with a single server node we noticed that IR was reported as disabled even though we have no non-IR clients. We checked that on the MGS.&lt;/p&gt;</comment>
                            <comment id="101740" author="jay" created="Tue, 16 Dec 2014 18:56:45 +0000"  >&lt;p&gt;Does &quot;single server node&quot; mean that the MGS was also restarted in the test?&lt;/p&gt;</comment>
                            <comment id="101742" author="simmonsja" created="Tue, 16 Dec 2014 19:15:09 +0000"  >&lt;p&gt;No. Only the MDS and OSS were restarted.&lt;/p&gt;</comment>
                            <comment id="102426" author="simmonsja" created="Tue, 30 Dec 2014 20:37:51 +0000"  >&lt;p&gt;We did another test run for recovery in the case of both MDS and OSS fail. I collected logs and placed them at ftp.whamcloud.com/uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5724&quot; title=&quot;IR recovery doesn&amp;#39;t behave properly with Lustre 2.5&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5724&quot;&gt;&lt;del&gt;LU-5724&lt;/del&gt;&lt;/a&gt;/*.log. The OSS seem to recovery but the MDS did not recovery properly.&lt;/p&gt;</comment>
                            <comment id="102520" author="hongchao.zhang" created="Mon, 5 Jan 2015 10:10:59 +0000"  >&lt;p&gt;as per the log &quot;dump_atlas-tds-mds1-after-recovery.log&quot;,  there are 3 out of 4 clients completed the recovery at MDT.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00010000:02000000:13.0:1419964653.561987:0:15786:0:(ldlm_lib.c:1392:target_finish_recovery()) atlastds-MDT0000: Recovery over after 30:00, of 4 clients 3 recovered and 1 was evicted.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;which nodes does the client log &quot;client-dump.log&quot; contain? no eviction record was found in this log.&lt;/p&gt;

&lt;p&gt;btw, do you use 4 clients and a separated MGS in this test? and could you please attach the console/sys logs along with&lt;br/&gt;
those debug logs?&lt;/p&gt;

&lt;p&gt;Thanks!&lt;/p&gt;</comment>
                            <comment id="102563" author="simmonsja" created="Mon, 5 Jan 2015 19:05:33 +0000"  >&lt;p&gt;The OSS reconnected to the MDS but none of the clients ever reconnected. The clients appeared stuck. The client logs are from the client nodes we used. As for the configuration, the MGS is a stand-alone node and we tested with 4 nodes. Will grab the logs.&lt;/p&gt;</comment>
                            <comment id="102579" author="simmonsja" created="Mon, 5 Jan 2015 21:57:26 +0000"  >&lt;p&gt;Here you go. These are the logs from the clients and servers.&lt;/p&gt;</comment>
                            <comment id="102727" author="hongchao.zhang" created="Wed, 7 Jan 2015 09:37:05 +0000"  >&lt;p&gt;is there only one Lustre client at 10.38.144.11 in this configuration? are these logs in the same failover test above? &lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[ 2267.379541] Lustre: atlastds-MDT0000: Will be in recovery for at least 30:00, or until 1 client reconnects
Dec 29 14:31:02 atlas-tds-mds1.ccs.ornl.gov kernel: [ 2267.409294] Lustre: atlastds-MDT0000: Denying connection for new client 3ae0ecec-84ef-cf8f-c128-51873c53d1ad (at 10.38.144.11@o2ib4), waiting for all 1 known clients (0 recovered, 0 in progress, and 0 evicted) to recover in 29:59
Dec 29 14:31:08 atlas-tds-mds1.ccs.ornl.gov kernel: [ 2272.910080] Lustre: atlastds-MDT0000: Denying connection for new client 5116891d-0ace-dffd-7497-218db0b23e98 (at 10.38.144.11@o2ib4), waiting for all 1 known clients (0 recovered, 0 in progress, and 0 evicted) to recover in 29:54
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;the MDT &amp;amp; OSSs are waiting for the client to reconnect for recovery, but it somehow failed to reconnect and seems to connect as&lt;br/&gt;
a new Lustre client, which was denied by the MDS&amp;amp;OSSs because they were recovering from failover.&lt;/p&gt;

&lt;p&gt;Could you please attach the console&amp;amp;sys logs of the client? Thanks!&lt;/p&gt;</comment>
                            <comment id="102771" author="simmonsja" created="Wed, 7 Jan 2015 17:43:44 +0000"  >&lt;p&gt;Here are the kern logs for a client and a router. If you want the logs for all the clients let me know.&lt;/p&gt;</comment>
                            <comment id="102853" author="hongchao.zhang" created="Thu, 8 Jan 2015 10:52:52 +0000"  >&lt;p&gt;Is the client (rhea513, 10.38.146.45) the Lustre client connected to &quot;atlastds&quot;?&lt;br/&gt;
this client mounted at &quot;Dec 29 12:55:33&quot; and unmounted at &quot;Dec 29 14:04:54&quot;, but the MDT failed over at &quot;Dec 29 13:55:12&quot; and Lustre was started only at &quot;Dec 29 14:26:55&quot;.&lt;/p&gt;

&lt;p&gt;at Client (rhea513, 10.38.146.45)&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt; 
Dec 29 12:55:33 rhea513.ccs.ornl.gov kernel: Lustre: client wants to enable acl, but mdt not!
Dec 29 12:55:33 rhea513.ccs.ornl.gov kernel: Lustre: Layout lock feature supported.
Dec 29 12:55:33 rhea513.ccs.ornl.gov kernel: Lustre: Mounted atlastds-client
Dec 29 14:00:00 rhea513.ccs.ornl.gov kernel: Lustre: 5367:0:(client.c:1940:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1419879033/real 1419879033]  req@ffff881000328000 x1487757986577588/t0(0) o400-&amp;gt;atlastds-OST0033-osc-ffff880820a28c00@10.36.226.67@o2ib:28/4 lens 224/224 e 0 to 1 dl 1419879600 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1
Dec 29 14:00:00 rhea513.ccs.ornl.gov kernel: Lustre: atlastds-OST0023-osc-ffff880820a28c00: Connection to atlastds-OST0023 (at 10.36.226.67@o2ib) was lost; in progress operations using this service will wait for recovery to complete
Dec 29 14:00:00 rhea513.ccs.ornl.gov kernel: Lustre: atlastds-OST002e-osc-ffff880820a28c00: Connection to atlastds-OST002e (at 10.36.226.70@o2ib) was lost; in progress operations using this service will wait for recovery to complete
Dec 29 14:00:00 rhea513.ccs.ornl.gov kernel: LustreError: 166-1: MGC10.36.226.79@o2ib: Connection to MGS (at 10.36.226.79@o2ib) was lost; in progress operations using this service will fail
Dec 29 14:00:00 rhea513.ccs.ornl.gov kernel: Lustre: 5367:0:(client.c:1940:ptlrpc_expire_one_request()) Skipped 52 previous similar messages
Dec 29 14:04:40 rhea513.ccs.ornl.gov kernel: Lustre: 5358:0:(client.c:1940:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1419879600/real 1419879600]  req@ffff88102a64b800 x1487757986596960/t0(0) o8-&amp;gt;atlastds-OST0033-osc-ffff880820a28c00@10.36.226.67@o2ib:28/4 lens 400/544 e 0 to 1 dl 1419879880 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1
Dec 29 14:04:40 rhea513.ccs.ornl.gov kernel: Lustre: 5358:0:(client.c:1940:ptlrpc_expire_one_request()) Skipped 1 previous similar message
Dec 29 14:04:54 rhea513.ccs.ornl.gov kernel: Lustre: Unmounted atlastds-client
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;at MDT (10.36.226.69)&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Dec 29 13:56:16 atlas-tds-oss6.ccs.ornl.gov kernel: [  117.814766]  sdbd:
Dec 29 13:56:16 atlas-tds-oss6.ccs.ornl.gov kernel: [  117.814952] sd 7:0:0:98: [sdaj] Attached SCSI disk
Dec 29 13:56:16 atlas-tds-oss6.ccs.ornl.gov kernel: [  117.835086]  unknown partition table
Dec 29 13:56:16 atlas-tds-oss6.ccs.ornl.gov kernel: [  117.845364]  unknown partition table
Dec 29 13:56:16 atlas-tds-oss6.ccs.ornl.gov kernel: [  117.864189] sd 6:0:0:71: [sdau] Attached SCSI disk
Dec 29 13:56:16 atlas-tds-oss6.ccs.ornl.gov kernel: [  117.864257] sd 6:0:0:101: [sdbd] Attached SCSI disk
Dec 29 13:56:16 atlas-tds-oss6.ccs.ornl.gov kernel: [  117.970252] device-mapper: multipath round-robin: version 1.0.0 loaded
Dec 29 14:26:55 atlas-tds-oss6.ccs.ornl.gov kernel: [ 1957.955421] LNet: HW CPU cores: 16, npartitions: 4
Dec 29 14:26:55 atlas-tds-oss6.ccs.ornl.gov kernel: [ 1957.963523] alg: No test for crc32 (crc32-table)
Dec 29 14:26:55 atlas-tds-oss6.ccs.ornl.gov kernel: [ 1957.977845] alg: No test for adler32 (adler32-zlib)
Dec 29 14:26:55 atlas-tds-oss6.ccs.ornl.gov kernel: [ 1957.988788] alg: No test for crc32 (crc32-pclmul)
Dec 29 14:26:59 atlas-tds-oss6.ccs.ornl.gov kernel: [ 1962.022800] padlock: VIA PadLock Hash Engine not detected.
Dec 29 14:27:04 atlas-tds-oss6.ccs.ornl.gov kernel: [ 1966.277906] LNet: Added LNI 10.36.226.69@o2ib [63/2560/0/180]
Dec 29 14:27:04 atlas-tds-oss6.ccs.ornl.gov kernel: [ 1966.814213] LNet: Added LNI 10.36.226.69@o2ib200 [63/2560/0/180]
Dec 29 14:27:04 atlas-tds-oss6.ccs.ornl.gov kernel: [ 1967.054096] Lustre: Lustre: Build Version: 2.5.3-g6158f83-CHANGED-2.6.32-431.29.2.el6.atlas.x86_64
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;btw, in this failover test both the MDT and OSS were rebooted, and recovery will be slow because the OSSs will wait to evict the client from the MDT, which won&apos;t reconnect to recover.&lt;br/&gt;
and Lustre seems not to support both MDS and OSS to failover, IIRC.&lt;/p&gt;</comment>
                            <comment id="102880" author="simmonsja" created="Thu, 8 Jan 2015 15:55:22 +0000"  >&lt;p&gt;We have had success before with MDS+OSS failing over at the same time in the past. We would really like to have that functionality restored.&lt;/p&gt;</comment>
                            <comment id="102908" author="green" created="Thu, 8 Jan 2015 19:31:08 +0000"  >&lt;p&gt;I think what HongChao tries to say is that when MDS and OST both go down, then since MDS is a client of OST, the OST recovery can never complete because of the missing client.&lt;/p&gt;

&lt;p&gt;But on the other hand we took two steps to help this. First, MDS client UUID should be always the same, so even after restart it still should be allowed to reconnect as the old known client (this is assuming it actually got up and into reconecting state in time for OST recovery. if your MDS takes ages to reboot, for example, it might miss this window, esp. if it&apos;s a shortened window thanks to IR).&lt;/p&gt;

&lt;p&gt;Second, we have VBR to deal with missing clients during recovery, which is esp. easy with MDT client, since it never has any outstanding uncommitted transactions to replay.&lt;/p&gt;</comment>
                            <comment id="102913" author="yujian" created="Thu, 8 Jan 2015 20:16:16 +0000"  >&lt;p&gt;In Lustre test suite, the following sub-tests in insanity.sh test failing MDS and OSS at the same time:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;run_test 2 &quot;Second Failure Mode: MDS/OST `date`&quot;
run_test 4 &quot;Fourth Failure Mode: OST/MDS `date`&quot;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The basic test steps for sub-test 2 are:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;fail MDS
fail OSS
start OSS
start MDS
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;And the basic test steps for sub-test 4 are:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;fail OSS
fail MDS
start OSS
start MDS
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Here is the insanity test report for Lustre b2_5 build #107: &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/bfd812b0-8a4d-11e4-a10b-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/bfd812b0-8a4d-11e4-a10b-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="102954" author="hongchao.zhang" created="Fri, 9 Jan 2015 02:32:07 +0000"  >&lt;p&gt;sorry for misunderstanding! Yes, Lustre supports both MDT and OSS to fail over.&lt;/p&gt;</comment>
                            <comment id="103513" author="simmonsja" created="Wed, 14 Jan 2015 19:13:42 +0000"  >&lt;p&gt;Testing failover and we see with just the MDS being failed over:&lt;/p&gt;

&lt;p&gt;Every 1.0s: cat recovery_status                                                                                                                                                        Wed Jan 14 14:11:11 2015&lt;/p&gt;

&lt;p&gt;status: COMPLETE  &lt;br/&gt;
recovery_start: 1421262582&lt;br/&gt;
recovery_duration: 60&lt;br/&gt;
completed_clients: 82/82&lt;br/&gt;
replayed_requests: 0&lt;br/&gt;
last_transno: 90194315377&lt;br/&gt;
VBR: DISABLED&lt;br/&gt;
IR: DISABLED&lt;/p&gt;

&lt;p&gt;All clients are 2.5+ so there should be no reason that IR is disabled. Can you reproduce this problem on your side?&lt;/p&gt;

&lt;p&gt;On the MGS we see during failover.&lt;/p&gt;

&lt;p&gt;root@atlas-tds-mds1 MGC10.36.226.79@o2ib]# cat ir_state&lt;br/&gt;
imperative_recovery: ENABLED&lt;br/&gt;
client_state:&lt;/p&gt;
&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;{ client: atlastds-MDT0000, nidtbl_version: 957 }&lt;/li&gt;
&lt;/ul&gt;
</comment>
                            <comment id="103582" author="hongchao.zhang" created="Thu, 15 Jan 2015 10:34:10 +0000"  >&lt;p&gt;Are both MGS and MDS failed over in this test?&lt;/p&gt;

&lt;p&gt;the IR status will be set IR_STARTUP after MGS is started and will be changed to IR_FULL after &quot;ir_timeout&quot;  seconds&lt;br/&gt;
(default is OBD_IR_MGS_TIMEOUT = &quot;4*obd_timeout&quot;).  the target(MDT or OST) registered to MGS will only be marked as&lt;br/&gt;
&quot;LDD_F_IR_CAPABLE&quot; if the IR status is IR_FULL, and &quot;IR&quot; will be printed as &quot;DISABLED&quot; in this case.&lt;/p&gt;

&lt;p&gt;for the client side, the imperative_recovery will be marked as &quot;Enabled&quot; if the connection with the server supports recovery&lt;br/&gt;
(imp-&amp;gt;imp_connect_data &amp;amp; OBD_CONNECT_IMP_RECOV == TRUE).&lt;/p&gt;</comment>
                            <comment id="103590" author="simmonsja" created="Thu, 15 Jan 2015 14:22:23 +0000"  >&lt;p&gt;No the MGS is left up. We failed over the MDS and OSS together.&lt;/p&gt;</comment>
                            <comment id="103708" author="hongchao.zhang" created="Fri, 16 Jan 2015 03:00:10 +0000"  >&lt;p&gt;Is the failover mode the same for both tests at Dec 31, 2014 and at Jan 14, 2015? That is, there is a separate node&lt;br/&gt;
running only the MGS, connected to by the MDS, OSSs and client nodes, and the MDS and OSSs are failed over together? &lt;/p&gt;</comment>
                            <comment id="104573" author="yujian" created="Fri, 23 Jan 2015 21:26:41 +0000"  >&lt;p&gt;With the patch for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4119&quot; title=&quot;recovery time hard doesn&amp;#39;t limit recovery duration&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4119&quot;&gt;&lt;del&gt;LU-4119&lt;/del&gt;&lt;/a&gt;, the recovery issue did not occur at small scale testing in ORNL. Large scale testing will be performed.&lt;/p&gt;</comment>
                            <comment id="105555" author="yujian" created="Tue, 3 Feb 2015 18:43:35 +0000"  >&lt;p&gt;Large scale testing passed.&lt;/p&gt;

&lt;p&gt;Hi James, can we close this ticket now?&lt;/p&gt;</comment>
                            <comment id="105900" author="yujian" created="Thu, 5 Feb 2015 18:48:18 +0000"  >&lt;p&gt;Close this ticket as a duplicate of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4119&quot; title=&quot;recovery time hard doesn&amp;#39;t limit recovery duration&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4119&quot;&gt;&lt;del&gt;LU-4119&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                            <outwardlinks description="duplicates">
                                        <issuelink>
            <issuekey id="21496">LU-4119</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="24752">LU-5079</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="15927" name="atlas-mds1.log" size="683574" author="simmonsja" created="Fri, 10 Oct 2014 19:56:07 +0000"/>
                            <attachment id="16650" name="atlas-tds-kernel-logs_20141229.tar.gz" size="271052" author="simmonsja" created="Mon, 5 Jan 2015 21:57:26 +0000"/>
                            <attachment id="16599" name="atlas-tds-oss1_recovery_lustre-log.1418679242.16958" size="304" author="simmonsja" created="Mon, 15 Dec 2014 22:42:49 +0000"/>
                            <attachment id="16658" name="rhea-rtr1_kern_12292014.log" size="374403" author="simmonsja" created="Wed, 7 Jan 2015 17:43:44 +0000"/>
                            <attachment id="16659" name="rhea513_kern_12292014.log" size="493748" author="simmonsja" created="Wed, 7 Jan 2015 17:43:44 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzwya7:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>16076</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10021"><![CDATA[2]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>