<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:10:12 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-7588] endless recovery on lustre 2.7</title>
                <link>https://jira.whamcloud.com/browse/LU-7588</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;After upgrading Lustre from 2.5.1 to 2.7.61 on snx11117 the clients can not be mounted (mount client hangs) because of endless recovery:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[423225.578209] Lustre: snx11117-MDT0000: Denying connection for new client 3ec3f6c4-e172-39d7-383c-d4c19737f54c(at 10.9.100.9@o2ib3), waitin
g for 2 known clients (2 recovered, 0 in progress, and 0 evicted) to recover in 21188498:05                                                  
[423225.601237] Lustre: Skipped 41 previous similar messages   &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;It seems &quot;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3540&quot; title=&quot;recovery for cross-MDT operation&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3540&quot;&gt;&lt;del&gt;LU-3540&lt;/del&gt;&lt;/a&gt; lod: update recovery thread&quot; broke recovery_time_hard functionality.&lt;br/&gt;
check_for_recovery_ready causes endless loop in target_recovery_overseer when flag tdtd_replay_ready is not set:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;static int check_for_recovery_ready(struct lu_target *lut)
...
       if (!obd-&amp;gt;obd_abort_recovery &amp;amp;&amp;amp; !obd-&amp;gt;obd_recovery_expired) {
                LASSERT(clnts &amp;lt;= obd-&amp;gt;obd_max_recoverable_clients);
                if (clnts + obd-&amp;gt;obd_stale_clients &amp;lt;
                    obd-&amp;gt;obd_max_recoverable_clients)
                        return 0;
        }    

        if (lut-&amp;gt;lut_tdtd != NULL) {
                if (!lut-&amp;gt;lut_tdtd-&amp;gt;tdtd_replay_ready) {
                        /* Let&apos;s extend recovery timer, in case the recovery
                         * timer expired, and some clients got evicted */
                        extend_recovery_timer(obd, obd-&amp;gt;obd_recovery_timeout,
                                              true);
                        return 0;
                } else {
                        dtrq_list_dump(lut-&amp;gt;lut_tdtd, D_HA);
                }    
        }    
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;check_for_recovery_ready doesn&apos;t return 1 despite the fact that all clients have already connected and recovery has expired:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00010000:00080000:0.0:1450170133.405945:0:243397:0:(ldlm_lib.c:2081:check_for_recovery_ready()) connected 2 stale 0 max_recoverable_clients 2 abort 0 expired 1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Because the tdtd_replay_ready flag is not set, check_for_recovery_ready returns 0 and tries to extend the recovery timer (without success):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00010000:00080000:0.0:1450170133.405947:0:243397:0:(ldlm_lib.c:1745:extend_recovery_timer()) snx11117-MDT0000: recovery timer will expire in 4294905278 seconds&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Imo the lines below break the previous logic of target_recovery_overseer and recovery_time_hard:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;        if (!obd-&amp;gt;obd_abort_recovery &amp;amp;&amp;amp; !obd-&amp;gt;obd_recovery_expired) {
                LASSERT(clnts &amp;lt;= obd-&amp;gt;obd_max_recoverable_clients);
                if (clnts + obd-&amp;gt;obd_stale_clients &amp;lt;
                    obd-&amp;gt;obd_max_recoverable_clients)
                        return 0;
        }&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;See the difference with check_for_clients used before &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3540&quot; title=&quot;recovery for cross-MDT operation&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3540&quot;&gt;&lt;del&gt;LU-3540&lt;/del&gt;&lt;/a&gt;:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;static int check_for_clients(struct obd_device *obd)
{
       unsigned int clnts = atomic_read(&amp;amp;obd-&amp;gt;obd_connected_clients);

       if (obd-&amp;gt;obd_abort_recovery || obd-&amp;gt;obd_recovery_expired)
               return 1;
       LASSERT(clnts &amp;lt;= obd-&amp;gt;obd_max_recoverable_clients);
       return (clnts + obd-&amp;gt;obd_stale_clients ==
               obd-&amp;gt;obd_max_recoverable_clients);
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="33801">LU-7588</key>
            <summary>endless recovery on lustre 2.7</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="di.wang">Di Wang</assignee>
                                    <reporter username="scherementsev">Sergey Cheremencev</reporter>
                        <labels>
                    </labels>
                <created>Mon, 21 Dec 2015 18:54:46 +0000</created>
                <updated>Fri, 11 May 2018 18:27:42 +0000</updated>
                            <resolved>Fri, 9 Sep 2016 17:48:14 +0000</resolved>
                                    <version>Lustre 2.7.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>5</watches>
                                                                            <comments>
                            <comment id="137206" author="pjones" created="Tue, 22 Dec 2015 18:32:17 +0000"  >&lt;p&gt;Di&lt;/p&gt;

&lt;p&gt;I know that things have changed in this area recently. Is this a duplicate of an existing issue or something new?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="137210" author="di.wang" created="Tue, 22 Dec 2015 18:48:25 +0000"  >&lt;p&gt;Yes, the protocol has been changed here. For DNE system, cross-MDT recovery will not expire by normal timeout, until sys admin abort recovery manually (obd_force_abort_recovery == 1), to avoid filesystem inconsistency. Is this multiple MDT filesystem?  I think we need to know why the recovery hang there (or even failed)  Could you please post the stack trace and console log here?&lt;/p&gt;

&lt;p&gt;Though this console message is misleading, and needs to be fixed.&lt;/p&gt;
</comment>
                            <comment id="137595" author="sergey" created="Tue, 29 Dec 2015 20:12:17 +0000"  >&lt;p&gt;Thanks for explanation. &lt;/p&gt;

&lt;p&gt;Yes, it was DNE system.&lt;br/&gt;
Recovery continued for several days before was aborted and lustre logs don&apos;t cover the moment of recovery start.&lt;br/&gt;
Console logs also don&apos;t include any information about the failure. Both are spammed with:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;309642.675887] Lustre: snx11117-MDT0000: Client snx11117-MDT0001-mdtlov_UUID (at 10.9.100.15@o2ib3) reconnecting, waiting for 22 clients in recovery for 21183383:17
...
and
00010000:00080000:4.0:1449753210.886072:0:174125:0:(ldlm_lib.c:2081:check_for_recovery_ready()) connected 2 stale 0 max_recoverable_clients 22 abort 0 expired 1
00010000:00080000:4.0:1449753210.886074:0:174125:0:(ldlm_lib.c:1745:extend_recovery_timer()) snx11117-MDT0000: recovery timer will expire in 4294901676 seconds
00010000:00080000:4.0:1449753220.469393:0:174125:0:(ldlm_lib.c:2081:check_for_recovery_ready()) connected 2 stale 0 max_recoverable_clients 22 abort 0 expired 1
00010000:00080000:4.0:1449753220.469397:0:174125:0:(ldlm_lib.c:1745:extend_recovery_timer()) snx11117-MDT0000: recovery timer will expire in 4294901666 seconds&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;If it is expected behaviour that recovery should be aborted manually in such a case, I agree we can lower the ticket priority.&lt;br/&gt;
I asked our testers to try to reproduce the problem and gather more logs.&lt;/p&gt;

&lt;blockquote&gt;&lt;p&gt;For DNE system, cross-MDT recovery will not expire by normal timeout, until sys admin abort recovery manually (obd_force_abort_recovery == 1), to avoid filesystem inconsistency.&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;Do you have any documentation about this ?&lt;/p&gt;

&lt;p&gt;Want to note that delay above recovery_time_hard will cause recovery abort and loss of all replays as result.&lt;br/&gt;
Is it reasonable to arm recovery_timer only when tdtd_replay_ready is set ?&lt;br/&gt;
I mean we can add to recovery_time_hard the difference between the start_of_recovery and the time when tdtd_replay_ready is set.&lt;br/&gt;
This gives the ability to complete recovery in case when too much time spent to update logs from all MDTs.&lt;/p&gt;

&lt;p&gt;Also suggest to add following fix:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;--- a/lustre/ldlm/ldlm_lib.c
+++ b/lustre/ldlm/ldlm_lib.c
@@ -1740,8 +1740,9 @@ static void extend_recovery_timer(struct obd_device *obd, int drt, bool extend)
         }
        spin_unlock(&amp;amp;obd-&amp;gt;obd_dev_lock);
 
-       CDEBUG(D_HA, &quot;%s: recovery timer will expire in %u seconds\n&quot;,
-               obd-&amp;gt;obd_name, (unsigned)cfs_time_sub(end, now));
+       if (end &amp;gt; now)
+               CDEBUG(D_HA, &quot;%s: recovery timer will expire in %u seconds\n&quot;,
+                       obd-&amp;gt;obd_name, (unsigned)cfs_time_sub(end, now));
 }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;And something should be done with &quot;to recover in 21188498:05&quot;:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c
index 13e3e59..65f4a72 100644
--- a/lustre/ldlm/ldlm_lib.c
+++ b/lustre/ldlm/ldlm_lib.c
@@ -1216,17 +1216,25 @@ no_export:
                        i = atomic_read(&amp;amp;target-&amp;gt;obd_lock_replay_clients);
                        k = target-&amp;gt;obd_max_recoverable_clients;
                        s = target-&amp;gt;obd_stale_clients;
-                       t = cfs_timer_deadline(&amp;amp;target-&amp;gt;obd_recovery_timer);
-                       t = cfs_time_sub(t, cfs_time_current());
-                       t = cfs_duration_sec(t);
                        LCONSOLE_WARN(&quot;%s: Denying connection for new client %s&quot;
                                      &quot;(at %s), waiting for %d known clients &quot;
                                      &quot;(%d recovered, %d in progress, and %d &quot;
-                                     &quot;evicted) to recover in %d:%.02d\n&quot;,
+                                     &quot;evicted) to recover\n&quot;,
                                      target-&amp;gt;obd_name, cluuid.uuid,
                                      libcfs_nid2str(req-&amp;gt;rq_peer.nid), k,
-                                     c - i, i, s, (int)t / 60,
-                                     (int)t % 60);
+                                     c - i, i, s);
+                       t = cfs_timer_deadline(&amp;amp;target-&amp;gt;obd_recovery_timer);
+                       if (cfs_time_before(cfs_time_current(), t)) {
+                               t = cfs_time_sub(t, cfs_time_current());
+                               t = cfs_duration_sec(t);
+                               LCONSOLE_WARN(&quot;%s: Recover expired in %d:%.02d\n&quot;,
+                                       target-&amp;gt;obd_name, (int)t / 60,
+                                       (int)t % 60);
+                       } else {
+                               LCONSOLE_WARN(&quot;%s: Recover expired in unknown time\n&quot;,
+                                       target-&amp;gt;obd_name);
+                       }
+
                        rc = -EBUSY;
                } else {
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
<comment id="142088" author="di.wang" created="Fri, 12 Feb 2016 18:44:06 +0000"  >&lt;p&gt;Sorry for the delayed response.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Do you have any documentation about this ?
{noformat}

Sorry, no formal document yet. But check this patch http://review.whamcloud.com/17885 LU-7638, the console message has been improved to notify the user to abort or wait a few more minutes.

{noformat}
Want to note that delay above recovery_time_hard will cause recovery abort and loss of all replays as result.
Is it reasonable to arm recovery_timer only when tdtd_replay_ready is set ?
I mean we can add to recovery_time_hard the difference between the start_of_recovery and the time when tdtd_replay_ready is set.
This gives the ability to complete recovery in case when too much time spent to update logs from all MDTs.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;recovery_time_hard is something the user provides (or is defined by the system) to limit the time of recovery, and I am not sure we should extend it during recovery. Hmm, the reason we do not abort DNE recovery is to keep the namespace consistent, but after COS is landed, maybe we should stop DNE recovery as well when it reaches obd_recovery_time_hard.&lt;/p&gt;

&lt;p&gt;Anyway, please check the latest master, which already includes &lt;a href=&quot;http://review.whamcloud.com/17885&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/17885&lt;/a&gt;, see if you can still reproduce the problem. Thanks.&lt;/p&gt;
</comment>
                            <comment id="165507" author="di.wang" created="Fri, 9 Sep 2016 17:48:14 +0000"  >&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/17885&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/17885&lt;/a&gt; is already landed.  Let&apos;s close the ticket for now.  Sergey, if you do not think this  solve your problem, feel free to re-open it.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                                        </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzxwhr:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>