<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:50:02 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
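For instance, the full request URL for this issue might look like the following (a hypothetical example, assuming the standard JIRA issue-XML view path):
https://jira.whamcloud.com/si/jira.issueviews:issue-xml/LU-12144/LU-12144.xml?field=key&field=summary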
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-12144] Lustre Stuck on Recovery</title>
                <link>https://jira.whamcloud.com/browse/LU-12144</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;We&apos;ve been stuck in MDT recovery on our EXAScaler host for about 2 hours.&#160; We&apos;ve been wrestling with it back and forth, but it will not finish recovering.&#160; The following errors are seen:&lt;/p&gt;

&lt;p&gt;Apr 1 16:20:55 scratchlfsmds01 kernel: Lustre: scrlfs-MDT0000: Denying connection for new client f3980ca0-c3a8-4c99-749a-9f58c2850ad2(at 10.242.104.147@tcp), waiting for 4041 known clients (4012 recovered, 29 in progress, and 0 evicted) to recover in 0:36&lt;br/&gt;
Apr 1 16:20:55 scratchlfsmds01 kernel: Lustre: Skipped 1 previous similar message&lt;br/&gt;
Apr 1 16:21:00 scratchlfsmds01 corosync[13445]: [TOTEM ] Retransmit List: 222 &lt;br/&gt;
Apr 1 16:21:00 scratchlfsmds01 corosync[13445]: [TOTEM ] Retransmit List: 226 &lt;br/&gt;
Apr 1 16:21:03 scratchlfsmds01 kernel: LustreError: 19352:0:(tgt_handler.c:509:tgt_filter_recovery_request()) @@@ not permitted during recovery req@ffff9dbb52bcb440 x1629643355468768/t0(0) o601-&amp;gt;scrlfs-MDT0000-lwp-OST0004_UUID@10.31.164.250@o2ib:735/0 lens 336/0 e 0 to 0 dl 1554150115 ref 1 fl Interpret:/0/ffffffff rc 0/-1&lt;br/&gt;
Apr 1 16:21:03 scratchlfsmds01 kernel: LustreError: 19352:0:(tgt_handler.c:509:tgt_filter_recovery_request()) Skipped 47 previous similar messages&lt;br/&gt;
Apr 1 16:21:10 scratchlfsmds01 corosync[13445]: [TOTEM ] Retransmit List: 22b &lt;br/&gt;
Apr 1 16:21:18 scratchlfsmds01 corosync[13445]: [TOTEM ] Retransmit List: 231 &lt;br/&gt;
Apr 1 16:21:18 scratchlfsmds01 corosync[13445]: [TOTEM ] Retransmit List: 233 &lt;br/&gt;
Apr 1 16:21:21 scratchlfsmds01 corosync[13445]: [TOTEM ] Retransmit List: 235 &lt;br/&gt;
Apr 1 16:21:21 scratchlfsmds01 corosync[13445]: [TOTEM ] Retransmit List: 238 &lt;br/&gt;
Apr 1 16:21:31 scratchlfsmds01 corosync[13445]: [TOTEM ] Retransmit List: 23a &lt;br/&gt;
Apr 1 16:21:31 scratchlfsmds01 corosync[13445]: [TOTEM ] Retransmit List: 23d &lt;br/&gt;
Apr 1 16:21:36 scratchlfsmds01 corosync[13445]: [TOTEM ] Retransmit List: 240 &lt;br/&gt;
Apr 1 16:21:41 scratchlfsmds01 corosync[13445]: [TOTEM ] Retransmit List: 244 &lt;br/&gt;
Apr 1 16:21:51 scratchlfsmds01 corosync[13445]: [TOTEM ] Retransmit List: 249 &lt;br/&gt;
Apr 1 16:21:55 scratchlfsmds01 corosync[13445]: [TOTEM ] Retransmit List: 24e &lt;br/&gt;
Apr 1 16:21:55 scratchlfsmds01 corosync[13445]: [TOTEM ] Marking ringid 0 interface 10.31.72.11 FAULTY&lt;br/&gt;
Apr 1 16:21:56 scratchlfsmds01 corosync[13445]: [TOTEM ] Automatically recovered ring 0&lt;br/&gt;
Apr 1 16:22:01 scratchlfsmds01 corosync[13445]: [TOTEM ] Retransmit List: 253 &lt;br/&gt;
Apr 1 16:22:01 scratchlfsmds01 systemd[1]: Starting Cleanup of Temporary Directories...&lt;br/&gt;
Apr 1 16:22:01 scratchlfsmds01 systemd[1]: Started Cleanup of Temporary Directories.&lt;br/&gt;
Apr 1 16:22:11 scratchlfsmds01 corosync[13445]: [TOTEM ] Retransmit List: 259 &lt;br/&gt;
Apr 1 16:22:10 scratchlfsmds01 kernel: Lustre: scrlfs-MDT0000: Denying connection for new client f3980ca0-c3a8-4c99-749a-9f58c2850ad2(at 10.242.104.147@tcp), waiting for 4041 known clients (4012 recovered, 29 in progress, and 0 evicted) to recover in 21188504:41&lt;br/&gt;
Apr 1 16:22:10 scratchlfsmds01 kernel: Lustre: Skipped 2 previous similar messages&lt;br/&gt;
Apr 1 16:22:13 scratchlfsmds01 corosync[13445]: [TOTEM ] Retransmit List: 25d &lt;br/&gt;
Apr 1 16:22:13 scratchlfsmds01 corosync[13445]: [TOTEM ] Retransmit List: 25f &lt;br/&gt;
Apr 1 16:22:13 scratchlfsmds01 kernel: LustreError: 19370:0:(ldlm_request.c:130:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554149835, 300s ago); not entering recovery in server code, just going back to sleep ns: MGS lock: ffff9dd3daa01000/0x6bc5271fecb20ca6 lrc: 3/0,1 mode: --/EX res: [0x73666c726373:0x2:0x0].0x0 rrc: 2037 type: PLN flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 19370 timeout: 0 lvb_type: 0&lt;br/&gt;
Apr 1 16:22:13 scratchlfsmds01 kernel: Lustre: MGS: Received new LWP connection from 10.31.163.202@o2ib, removing former export from same NID&lt;br/&gt;
Apr 1 16:22:13 scratchlfsmds01 kernel: Lustre: MGS: Connection restored to 988e49b9-7fc3-aa82-6ab5-3ad6ec520922 (at 10.31.163.202@o2ib)&lt;br/&gt;
Apr 1 16:22:13 scratchlfsmds01 kernel: Lustre: Skipped 396 previous similar messages&lt;br/&gt;
Apr 1 16:22:13 scratchlfsmds01 kernel: LustreError: dumping log to /tmp/lustre-log.1554150135.19370&lt;br/&gt;
Apr 1 16:22:14 scratchlfsmds01 kernel: Lustre: MGS: Received new LWP connection from 10.31.161.45@o2ib, removing former export from same NID&lt;br/&gt;
Apr 1 16:22:14 scratchlfsmds01 kernel: Lustre: Skipped 56 previous similar messages&lt;br/&gt;
Apr 1 16:22:15 scratchlfsmds01 kernel: Lustre: MGS: Received new LWP connection from 10.31.160.124@o2ib, removing former export from same NID&lt;br/&gt;
Apr 1 16:22:15 scratchlfsmds01 kernel: Lustre: Skipped 165 previous similar messages&lt;br/&gt;
Apr 1 16:22:16 scratchlfsmds01 kernel: LustreError: 166-1: MGC10.31.164.248@o2ib: Connection to MGS (at 0@lo) was lost; in progress operations using this service will fail&lt;br/&gt;
Apr 1 16:22:16 scratchlfsmds01 kernel: LustreError: 19118:0:(ldlm_request.c:148:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554149838, 300s ago), entering recovery for MGS@10.31.164.248@o2ib ns: MGC10.31.164.248@o2ib lock: ffff9dd3e6932c00/0x6bc5271fecb2391c lrc: 4/1,0 mode: --/CR res: [0x73666c726373:0x2:0x0].0x0 rrc: 2 type: PLN flags: 0x1000000000000 nid: local remote: 0x6bc5271fecb23923 expref: -99 pid: 19118 timeout: 0 lvb_type: 0&lt;br/&gt;
Apr 1 16:22:16 scratchlfsmds01 kernel: LustreError: 23294:0:(ldlm_resource.c:1100:ldlm_resource_complain()) MGC10.31.164.248@o2ib: namespace resource [0x73666c726373:0x2:0x0].0x0 (ffff9dd3e6b4e900) refcount nonzero (2) after lock cleanup; forcing cleanup.&lt;br/&gt;
Apr 1 16:22:16 scratchlfsmds01 kernel: LustreError: 23294:0:(ldlm_resource.c:1682:ldlm_resource_dump()) --- Resource: [0x73666c726373:0x2:0x0].0x0 (ffff9dd3e6b4e900) refcount = 3&lt;br/&gt;
Apr 1 16:22:16 scratchlfsmds01 kernel: LustreError: 23294:0:(ldlm_resource.c:1703:ldlm_resource_dump()) Waiting locks:&lt;br/&gt;
Apr 1 16:22:16 scratchlfsmds01 kernel: LustreError: 23294:0:(ldlm_resource.c:1705:ldlm_resource_dump()) ### ### ns: MGC10.31.164.248@o2ib lock: ffff9dd3e6932c00/0x6bc5271fecb2391c lrc: 4/1,0 mode: --/CR res: [0x73666c726373:0x2:0x0].0x0 rrc: 4 type: PLN flags: 0x1106400000000 nid: local remote: 0x6bc5271fecb23923 expref: -99 pid: 19118 timeout: 0 lvb_type: 0&lt;br/&gt;
Apr 1 16:22:17 scratchlfsmds01 kernel: Lustre: MGS: Received new LWP connection from 10.242.128.221@tcp, removing former export from same NID&lt;br/&gt;
Apr 1 16:22:17 scratchlfsmds01 kernel: Lustre: Skipped 327 previous similar messages&lt;br/&gt;
Apr 1 16:22:21 scratchlfsmds01 corosync[13445]: [TOTEM ] Retransmit List: 262&lt;/p&gt;

&lt;p&gt;It then gives a bogus time for recovery, which equals roughly 40 years:&lt;/p&gt;

&lt;p&gt;Apr 1 16:24:23 scratchlfsmds01 kernel: Lustre: scrlfs-MDT0000: Denying connection for new client f3980ca0-c3a8-4c99-749a-9f58c2850ad2(at 10.242.104.147@tcp), waiting for 4041 known clients (4012 recovered, 29 in progress, and 0 evicted) to recover in 21188502:27&lt;/p&gt;
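
&lt;p&gt;(Sanity-checking that countdown, assuming it is formatted as minutes:seconds: 21188504 minutes / 60 / 24 / 365 works out to roughly 40.3 years.)&lt;/p&gt;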

&lt;p&gt;That is clearly impossible.&#160; We eventually see this:&lt;/p&gt;

&lt;p&gt;Apr 1 16:35:09 scratchlfsmds01 kernel: Lustre: scrlfs-MDT0000: Recovery already passed deadline 13:38, It is most likely due to DNE recovery is failed or stuck, please wait a few more minutes or abort the recovery.&lt;/p&gt;

&lt;p&gt;As you can imagine, that is not helpful.&#160; It seems we have 29 clients that will not recover and that Lustre will not evict.&#160; We tried forcing an abort of the recovery via lctl, but that did not work.&#160; Right now it is just sitting and waiting.&lt;/p&gt;
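
&lt;p&gt;For the record, the abort was attempted roughly as follows (a sketch; the exact invocation on our system may have differed):&lt;/p&gt;

&lt;pre&gt;# Check how far recovery has progressed (status and recovered/evicted client counts)
lctl get_param mdt.scrlfs-MDT0000.recovery_status

# Stop waiting for the remaining clients and force recovery to complete
lctl --device scrlfs-MDT0000 abort_recovery&lt;/pre&gt;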

&lt;p&gt;This filesystem is our main scratch filesystem; with it down, our cluster is dead in the water.&#160; So we need help ASAP.&#160; Thanks in advance.&lt;/p&gt;

&lt;p&gt;-Paul Edmon&lt;/p&gt;</description>
                <environment>DDN Exascaler</environment>
        <key id="55322">LU-12144</key>
            <summary>Lustre Stuck on Recovery</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="3">Duplicate</resolution>
                                        <assignee username="wc-triage">WC Triage</assignee>
                                    <reporter username="pedmon">Paul Edmon</reporter>
                        <labels>
                    </labels>
                <created>Mon, 1 Apr 2019 20:44:05 +0000</created>
                <updated>Tue, 2 Apr 2019 02:50:05 +0000</updated>
                            <resolved>Mon, 1 Apr 2019 22:51:43 +0000</resolved>
                                    <version>Lustre 2.10.5</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>5</watches>
                                                                            <comments>
                            <comment id="245053" author="simmonsja" created="Mon, 1 Apr 2019 21:07:58 +0000"  >&lt;p&gt;Can you try patch&#160;&lt;a href=&quot;https://review.whamcloud.com/#/c/33883/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/33883/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="245055" author="pjones" created="Mon, 1 Apr 2019 21:51:45 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=pedmon&quot; class=&quot;user-hover&quot; rel=&quot;pedmon&quot;&gt;pedmon&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;If this is for an EXAScaler deployment then you should go through DDN support channels to make sure that you get advice appropriate for your setup. The LU project is for tracking items relating to the community releases of Lustre, so engineers who are active here, like James (from ORNL), may not have the full context of your setup.&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="245056" author="pedmon" created="Mon, 1 Apr 2019 21:54:21 +0000"  >&lt;p&gt;Yes, we&apos;ve reached out to DDN as well.&#160; That bug though looks like it may be what we are hitting.&lt;/p&gt;</comment>
                            <comment id="245062" author="pjones" created="Mon, 1 Apr 2019 22:51:43 +0000"  >&lt;p&gt;Regardless of the fix, you will still need it in an EXA build to be able to leverage it &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&#160;I&apos;ll close this ticket and expect to see an escalation through DDN channels shortly.&lt;/p&gt;</comment>
                            <comment id="245065" author="adilger" created="Mon, 1 Apr 2019 23:58:21 +0000"  >&lt;p&gt;James, two notes here:&lt;/p&gt;
&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;the reported problem is on 2.10.5, while your patch is based on master (with a bunch of patches in between), so it would not apply (see the fetch/cherry-pick sketch below);&lt;/li&gt;
	&lt;li&gt;your patch (AFAICS) only affects the message printed on the console; it doesn&apos;t actually change the behavior of the system.&lt;/li&gt;
&lt;/ul&gt;
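
&lt;p&gt;For reference, checking whether the change applies to a 2.10.5 tree would look something like this (the project path and patchset number in the change ref are assumptions):&lt;/p&gt;

&lt;pre&gt;# Gerrit change refs follow refs/changes/NN/CHANGE/PATCHSET, where NN is the last two digits of the change number
git fetch https://review.whamcloud.com/fs/lustre-release refs/changes/83/33883/1
# Attempt to apply the change onto the checked-out 2.10.5 tree
git cherry-pick FETCH_HEAD&lt;/pre&gt;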


&lt;p&gt;Paul,&lt;br/&gt;
note that &quot;system down&quot; tickets should be filed with &quot;Severity 1&quot;, rather than &quot;Severity 4&quot; (which is the least important).&lt;/p&gt;</comment>
                            <comment id="245066" author="pjones" created="Tue, 2 Apr 2019 00:22:20 +0000"  >&lt;p&gt;This is getting help through Ddn support channels so dropping dev to avoid confusion&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00ebr:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10023"><![CDATA[4]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                </customfields>
    </item>
</channel>
</rss>