<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:08:42 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-14318] Add the option to limit the overall recovery time</title>
                <link>https://jira.whamcloud.com/browse/LU-14318</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Currently, the recovery time can be extended to several hours if the recovery&lt;br/&gt;
between MDTs is stuck or for some other reason, which can leave the cluster&lt;br/&gt;
unavailable for a long time if no one notices and uses &quot;lctl&quot; to abort the recovery.&lt;br/&gt;
An option should therefore be added to limit the overall recovery time.&lt;/p&gt;</description>
                <environment></environment>
        <key id="62297">LU-14318</key>
            <summary>Add the option to limit the overall recovery time</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="hongchao.zhang">Hongchao Zhang</assignee>
                                    <reporter username="hongchao.zhang">Hongchao Zhang</reporter>
                        <labels>
                    </labels>
                <created>Mon, 11 Jan 2021 07:34:54 +0000</created>
                <updated>Tue, 14 Sep 2021 12:07:06 +0000</updated>
                                                                                <due></due>
                            <votes>0</votes>
                                    <watches>8</watches>
                                                                            <comments>
                            <comment id="289465" author="hongchao.zhang" created="Thu, 14 Jan 2021 09:59:10 +0000"  >&lt;p&gt;Hongchao Zhang (hongchao@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/41171&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/41171&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-14318&quot; title=&quot;Add the option to limit the overall recovery time&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-14318&quot;&gt;LU-14318&lt;/a&gt; ldlm: add recovery time limit&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 2&lt;br/&gt;
Commit: 3f8c53ec33e518658d91d7ff3d07e13f5ad43327&lt;/p&gt;</comment>
                            <comment id="289711" author="hongchao.zhang" created="Mon, 18 Jan 2021 02:31:35 +0000"  >&lt;p&gt;Currently, if some other MDT can&apos;t be connected to during MDT recovery, the recovery process can be extended to&lt;br/&gt;
last several hours (possibly forever, if not aborted with lctl) in &quot;check_for_recovery_ready&quot;:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;static int check_for_recovery_ready(struct lu_target *lut)
{
        struct obd_device *obd = lut-&amp;gt;lut_obd;
        unsigned int clnts = atomic_read(&amp;amp;obd-&amp;gt;obd_connected_clients);

        CDEBUG(D_HA,
               &quot;connected %d stale %d max_recoverable_clients %d abort %d expired %d\n&quot;,
               clnts, obd-&amp;gt;obd_stale_clients,
               atomic_read(&amp;amp;obd-&amp;gt;obd_max_recoverable_clients),
               obd-&amp;gt;obd_abort_recovery, obd-&amp;gt;obd_recovery_expired);

        if (!obd-&amp;gt;obd_abort_recovery &amp;amp;&amp;amp; !obd-&amp;gt;obd_recovery_expired) {
                LASSERT(clnts &amp;lt;=
                        atomic_read(&amp;amp;obd-&amp;gt;obd_max_recoverable_clients));
                if (clnts + obd-&amp;gt;obd_stale_clients &amp;lt;
                    atomic_read(&amp;amp;obd-&amp;gt;obd_max_recoverable_clients))
                        return 0;
        }

        if (!obd-&amp;gt;obd_abort_recov_mdt &amp;amp;&amp;amp; lut-&amp;gt;lut_tdtd != NULL) {
                if (!lut-&amp;gt;lut_tdtd-&amp;gt;tdtd_replay_ready &amp;amp;&amp;amp;
                    !obd-&amp;gt;obd_abort_recovery &amp;amp;&amp;amp; !obd-&amp;gt;obd_stopping) {
                        /*
                         * Let&apos;s extend recovery timer, in case the recovery
                         * timer expired, and some clients got evicted
                         */
                        extend_recovery_timer(obd, obd-&amp;gt;obd_recovery_timeout, &amp;lt;--- the recovery will be extended even if the timer expired
                                              true);
                        CDEBUG(D_HA,
                               &quot;%s update recovery is not ready, extend recovery %d\n&quot;,
                               obd-&amp;gt;obd_name, obd-&amp;gt;obd_recovery_timeout);
                        return 0;
                }
        }

        return 1;
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="290751" author="adilger" created="Fri, 29 Jan 2021 23:44:36 +0000"  >&lt;p&gt;Hongchao, is this patch only needed for the case when the remote MDT is not available at all, or is there some other problem like &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13608&quot; title=&quot;MDT stuck in WAITING, abort_recov stuck too&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13608&quot;&gt;&lt;del&gt;LU-13608&lt;/del&gt;&lt;/a&gt; that is causing recovery to be stuck for a long time even when all of the MDTs are available?&lt;/p&gt;

&lt;p&gt;Rather than timing out recovery for a remote MDT completely, it would probably be better to keep the recovery &lt;b&gt;for that MDT&lt;/b&gt; pending until the MDT is available again, and then do the remote recovery when the MDTs reconnect.  That might only be a small (or no) difference in the normal case when all of the MDTs are available at mount, but I think this may give a very important improvement when some MDTs are unavailable.&lt;/p&gt;

&lt;p&gt;The big improvement would be if MDT0000 and other MDTs are available at restart time, it would complete recovery with all those MDTs quickly, and not block access to files/directories that are on available MDTs.  It would allow most client access to work, and only remote/striped directories would be blocked and/or time out (allow CTRL-C for client processes).  This would be better for users, if they are mostly using remote directories for subtrees of the filesystem, since only the subtrees on the missing MDTs would be inaccessible.&lt;/p&gt;

&lt;p&gt;Eventually, having mirrored entries for &lt;tt&gt;ROOT/&lt;/tt&gt; on several/all MDTs could allow the filesystem to be accessible even if MDT0000 is unavailable, but that is definitely a separate project.&lt;/p&gt;</comment>
                            <comment id="290824" author="hongchao.zhang" created="Mon, 1 Feb 2021 05:33:51 +0000"  >&lt;p&gt;Hi,&lt;br/&gt;
This patch is meant to solve the recovery hang caused by an unresponsive MDT during recovery, whereas the patch in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13608&quot; title=&quot;MDT stuck in WAITING, abort_recov stuck too&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13608&quot;&gt;&lt;del&gt;LU-13608&lt;/del&gt;&lt;/a&gt; is meant to fix&lt;br/&gt;
the &quot;lctl abort_recovery&quot; hang that occurs when it is used to abort the recovery.&lt;/p&gt;

&lt;p&gt;Okay, I will create another patch to allow the recovery to continue if some of the MDTs other than MDT0000 are unavailable during recovery.&lt;br/&gt;
Thanks!&lt;/p&gt;</comment>
                            <comment id="291337" author="gerrit" created="Fri, 5 Feb 2021 09:58:11 +0000"  >&lt;p&gt;Hongchao Zhang (hongchao@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/41424&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/41424&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-14318&quot; title=&quot;Add the option to limit the overall recovery time&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-14318&quot;&gt;LU-14318&lt;/a&gt; ldlm: don&apos;t wait other MDT forever&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 4c2b2b26bd29b1a8f522fbf705a3d145cb06eb9f&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="59362">LU-13608</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                                        </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i01j1r:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>