<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:28:53 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-9748] DNE recovery hangs, blocks Lustre recovery</title>
                <link>https://jira.whamcloud.com/browse/LU-9748</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Sequence: &lt;/p&gt;
&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;MDS failover occurs.&lt;/li&gt;
	&lt;li&gt;failover nodes complete.&lt;/li&gt;
	&lt;li&gt;recovery across all MDS blocks
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Jul  7 15:34:17 soak-9 kernel: LDISKFS-fs warning (device dm-6): ldiskfs_multi_mount_protect:322: MMP interval 42 higher than expected, please wait.
Jul  7 15:35:00 soak-9 kernel: LDISKFS-fs (dm-6): recovery complete
Jul  7 15:35:00 soak-9 kernel: LDISKFS-fs (dm-6): mounted filesystem with ordered data mode. Opts: user_xattr,errors=remount-ro,user_xattr,no_mbcache,nodelalloc
Jul  7 15:35:06 soak-9 kernel: LustreError: 137-5: soaked-MDT0001_UUID: not available &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; connect from 192.168.1.128@o2ib (no target). If you are running an HA pair check that the target is mounted on the other server.
Jul  7 15:35:06 soak-9 kernel: Lustre: soaked-MDT0001: Not available &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; connect from 192.168.1.132@o2ib (not set up)
Jul  7 15:35:06 soak-9 kernel: LustreError: 11-0: soaked-MDT0000-osp-MDT0001: operation mds_connect to node 192.168.1.108@o2ib failed: rc = -114
Jul  7 15:35:07 soak-9 kernel: Lustre: soaked-MDT0001: Imperative Recovery enabled, recovery window shrunk from 300-900 down to 150-900
Jul  7 15:35:09 soak-9 kernel: Lustre: soaked-MDT0001: Will be in recovery &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; at least 2:30, or until 37 clients reconnect
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;The failover node stays in a WAITING state:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;soak-10
----------------
mdt.soaked-MDT0002.recovery_status=
status: WAITING
non-ready MDTs:  0003
recovery_start: 1499451258
time_waited: 2147

Jul  7 18:29:12 soak-10 kernel: LustreError: 11-0: soaked-MDT0003-osp-MDT0002: operation mds_connect to node 192.168.1.111@o2ib failed: rc = -114
Jul  7 18:29:12 soak-10 kernel: LustreError: Skipped 11 previous similar messages
Jul  7 18:29:13 soak-10 kernel: Lustre: 3682:0:(ldlm_lib.c:1784:extend_recovery_timer()) soaked-MDT0002: extended recovery timer reaching hard limit: 900, extend: 1
Jul  7 18:29:13 soak-10 kernel: Lustre: 3682:0:(ldlm_lib.c:1784:extend_recovery_timer()) Skipped 9 previous similar messages
Jul  7 18:29:29 soak-10 kernel: Lustre: soaked-MDT0002: Recovery already passed deadline 0:08, It is most likely due to DNE recovery is failed or stuck, please wait a few more minutes or abort the recovery.

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;dumped lustre-logs on the MDS multiple times during this, dumped stacks, attached&lt;/p&gt;</description>
                <environment>Soak test cluster, lustre-master build 3606 version=2.9.59_32_g62bc3af</environment>
        <key id="47156">LU-9748</key>
            <summary>DNE recovery hangs, blocks Lustre recovery</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="laisiyao">Lai Siyao</assignee>
                                    <reporter username="cliffw">Cliff White</reporter>
                        <labels>
                            <label>soak</label>
                    </labels>
                <created>Fri, 7 Jul 2017 18:56:00 +0000</created>
                <updated>Wed, 9 Aug 2017 16:13:51 +0000</updated>
                            <resolved>Wed, 9 Aug 2017 16:13:51 +0000</resolved>
                                    <version>Lustre 2.11.0</version>
                                    <fixVersion>Lustre 2.11.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>4</watches>
                                                                            <comments>
                            <comment id="201395" author="cliffw" created="Fri, 7 Jul 2017 18:57:42 +0000"  >&lt;p&gt;Further details in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9274&quot; title=&quot;LBUG: (recover.c:157:ptlrpc_replay_next()) ASSERTION( !list_empty(&amp;amp;req-&amp;gt;rq_cli.cr_unreplied_list) ) failed:&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9274&quot;&gt;&lt;del&gt;LU-9274&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="201396" author="cliffw" created="Fri, 7 Jul 2017 19:02:38 +0000"  >&lt;p&gt;Also during this recovery time, clients are blocked, entire system is wedged waiting for recovery to complete. &lt;/p&gt;</comment>
                            <comment id="201400" author="cliffw" created="Fri, 7 Jul 2017 19:44:21 +0000"  >&lt;p&gt;After rebooting soak-8 (MGS) and soak-11 (mentioned in soak-10 errors) soak-10 completed recovery.&lt;br/&gt;
However, soak-8 and soak-11 are now wedged in wait.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;----------------
soak-8
----------------
mdt.soaked-MDT0000.recovery_status=
status: WAITING
non-ready MDTs:  0000 0001 0002 0003
recovery_start: 1499455762
time_waited: 832
----------------
soak-9
----------------
mdt.soaked-MDT0001.recovery_status=
status: COMPLETE
recovery_start: 1499446112
recovery_duration: 47
completed_clients: 7/37
replayed_requests: 0
last_transno: 1245546027187
VBR: DISABLED
IR: ENABLED
----------------
soak-10
----------------
mdt.soaked-MDT0002.recovery_status=
status: COMPLETE
recovery_start: 1499451258
recovery_duration: 4604
completed_clients: 35/37
replayed_requests: 0
last_transno: 1189768212435
VBR: ENABLED
IR: ENABLED
----------------
soak-11
----------------
mdt.soaked-MDT0003.recovery_status=
status: WAITING
non-ready MDTs:  0001
recovery_start: 1499455317
time_waited: 1277
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="201401" author="cliffw" created="Fri, 7 Jul 2017 19:45:18 +0000"  >&lt;p&gt;Further set of lustre/console logs attached, I will leave the system in this state if anyone wishes to examine it. &lt;/p&gt;</comment>
                            <comment id="201402" author="cliffw" created="Fri, 7 Jul 2017 20:01:43 +0000"  >&lt;p&gt;With system completely idle, recovery finally completes:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;soak-8
----------------
mdt.soaked-MDT0000.recovery_status=
status: COMPLETE
recovery_start: 1499455762
recovery_duration: 1500
completed_clients: 36/37
replayed_requests: 0
last_transno: 1297102100712
VBR: DISABLED
IR: DISABLED
----------------
soak-9
----------------
mdt.soaked-MDT0001.recovery_status=
status: COMPLETE
recovery_start: 1499446112
recovery_duration: 47
completed_clients: 7/37
replayed_requests: 0
last_transno: 1245546027187
VBR: DISABLED
IR: ENABLED
----------------
soak-10
----------------
mdt.soaked-MDT0002.recovery_status=
status: COMPLETE
recovery_start: 1499451258
recovery_duration: 4604
completed_clients: 35/37
replayed_requests: 0
last_transno: 1189768212435
VBR: ENABLED
IR: ENABLED
----------------
soak-11
----------------
mdt.soaked-MDT0003.recovery_status=
status: COMPLETE
recovery_start: 1499455317
recovery_duration: 1678
completed_clients: 37/37
replayed_requests: 0
last_transno: 1262744109605
VBR: DISABLED
IR: ENABLED
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="201673" author="laisiyao" created="Tue, 11 Jul 2017 14:31:32 +0000"  >&lt;p&gt;&apos;grep lod_sub_recovery_ *&apos; shows:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;soak-10.lustre.log.txt:00000004:00080000:16.0:1499451252.837946:0:3678:0:(lod_dev.c:433:lod_sub_recovery_thread()) soaked-MDT0002-osd retrieve update log: rc = 0
soak-10.2.log:00000004:00080000:19.0:1499351735.916949:0:3600:0:(lod_dev.c:433:lod_sub_recovery_thread()) soaked-MDT0001-osp-MDT0002 retrieve update log: rc = 0
soak-10.2.log:00000004:00000010:19.0:1499351735.916950:0:3600:0:(lod_dev.c:463:lod_sub_recovery_thread()) kfreed &apos;lrd&apos;: 32 at ffff8804177b0ba0.
soak-10.2.log:00000004:00000001:19.0:1499351735.916982:0:3600:0:(lod_dev.c:469:lod_sub_recovery_thread()) Process leaving (rc=0 : 0 : 0)
soak-10.2.log:00000004:00080000:16.0:1499351736.870806:0:3599:0:(lod_dev.c:433:lod_sub_recovery_thread()) soaked-MDT0000-osp-MDT0002 retrieve update log: rc = 0
soak-10.2.log:00000004:00000010:16.0:1499351736.870813:0:3599:0:(lod_dev.c:463:lod_sub_recovery_thread()) kfreed &apos;lrd&apos;: 32 at ffff8804177b0b80.
soak-10.2.log:00000004:00000001:16.0:1499351736.870858:0:3599:0:(lod_dev.c:469:lod_sub_recovery_thread()) Process leaving (rc=0 : 0 : 0)
soak-10.lustre.log.4.txt:00000004:00080000:3.0:1499455861.692338:0:3681:0:(lod_dev.c:433:lod_sub_recovery_thread()) soaked-MDT0003-osp-MDT0002 retrieve update log: rc = 0
soak-10.lustre.log.4.txt:00000004:00080000:3.0:1499455861.692340:0:3681:0:(lod_dev.c:456:lod_sub_recovery_thread()) soaked-MDT0002 got update logs from all MDTs.
soak-10.lustre.log.4.txt:00000004:00000010:3.0:1499455861.692358:0:3681:0:(lod_dev.c:463:lod_sub_recovery_thread()) kfreed &apos;lrd&apos;: 32 at ffff88082c4bccc0.
soak-10.lustre.log.4.txt:00000004:00000001:3.0:1499455861.692412:0:3681:0:(lod_dev.c:469:lod_sub_recovery_thread()) Process leaving (rc=0 : 0 : 0)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;MDT0002 retrieved the remote update log from MDT0000 and MDT0001 at 1499351735, the local update log at 1499451252, and the log from MDT0003 at 1499455861. This means it took 28 hours to fetch update logs from the other MDTs, which is really unacceptable. Since this lasted so long, the logs of MDT0, MDT1 and MDT3 don&apos;t contain any useful information.&lt;/p&gt;</comment>
                            <comment id="201833" author="gerrit" created="Wed, 12 Jul 2017 14:19:16 +0000"  >&lt;p&gt;Lai Siyao (lai.siyao@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/28000&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/28000&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9748&quot; title=&quot;DNE recovery hangs, blocks Lustre recovery&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9748&quot;&gt;&lt;del&gt;LU-9748&lt;/del&gt;&lt;/a&gt; lod: safely access update log stat&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: ceaae52b221d0986599796420add36305f01eff3&lt;/p&gt;</comment>
                            <comment id="204864" author="gerrit" created="Wed, 9 Aug 2017 04:18:26 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/28000/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/28000/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9748&quot; title=&quot;DNE recovery hangs, blocks Lustre recovery&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9748&quot;&gt;&lt;del&gt;LU-9748&lt;/del&gt;&lt;/a&gt; lod: safely access update log stat&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: a446cbb8b1e2fa73c30938d043f79f644c13efe7&lt;/p&gt;</comment>
                            <comment id="204915" author="mdiep" created="Wed, 9 Aug 2017 16:13:51 +0000"  >&lt;p&gt;Landed in 2.11&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="45140">LU-9274</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="27496" name="soak-10.2.log.gz" size="1185695" author="cliffw" created="Fri, 7 Jul 2017 18:55:55 +0000"/>
                            <attachment id="27495" name="soak-10.lustre.log.3.txt.gz" size="2730457" author="cliffw" created="Fri, 7 Jul 2017 18:55:53 +0000"/>
                            <attachment id="27511" name="soak-10.lustre.log.4.txt.gz" size="4891639" author="cliffw" created="Fri, 7 Jul 2017 19:46:13 +0000"/>
                            <attachment id="27512" name="soak-10.lustre.log.5.txt.gz" size="4029269" author="cliffw" created="Fri, 7 Jul 2017 19:46:11 +0000"/>
                            <attachment id="27521" name="soak-10.lustre.log.6.txt.gz" size="3885875" author="cliffw" created="Fri, 7 Jul 2017 20:05:27 +0000"/>
                            <attachment id="27522" name="soak-10.lustre.log.7.txt.gz" size="3579065" author="cliffw" created="Fri, 7 Jul 2017 20:05:23 +0000"/>
                            <attachment id="27494" name="soak-10.lustre.log.txt.gz" size="76244" author="cliffw" created="Fri, 7 Jul 2017 18:55:47 +0000"/>
                            <attachment id="27513" name="soak-10.postMGSreboot.log.gz" size="3336" author="cliffw" created="Fri, 7 Jul 2017 19:46:00 +0000"/>
                            <attachment id="27493" name="soak-10.stacks.and.console.txt.gz" size="172696" author="cliffw" created="Fri, 7 Jul 2017 18:55:48 +0000"/>
                            <attachment id="27492" name="soak-11.2.log.gz" size="3616862" author="cliffw" created="Fri, 7 Jul 2017 18:55:56 +0000"/>
                            <attachment id="27491" name="soak-11.lustre.log.3.txt.gz" size="3649358" author="cliffw" created="Fri, 7 Jul 2017 18:55:56 +0000"/>
                            <attachment id="27514" name="soak-11.lustre.log.4.txt.gz" size="460205" author="cliffw" created="Fri, 7 Jul 2017 19:46:02 +0000"/>
                            <attachment id="27515" name="soak-11.lustre.log.5.txt.gz" size="20255" author="cliffw" created="Fri, 7 Jul 2017 19:46:03 +0000"/>
                            <attachment id="27523" name="soak-11.lustre.log.6.txt.gz" size="21016" author="cliffw" created="Fri, 7 Jul 2017 20:05:18 +0000"/>
                            <attachment id="27524" name="soak-11.lustre.log.7.txt.gz" size="3647470" author="cliffw" created="Fri, 7 Jul 2017 20:05:27 +0000"/>
                            <attachment id="27490" name="soak-11.lustre.log.txt.gz" size="12368" author="cliffw" created="Fri, 7 Jul 2017 18:55:48 +0000"/>
                            <attachment id="27516" name="soak-11.postreboot.log.gz" size="3506" author="cliffw" created="Fri, 7 Jul 2017 19:46:03 +0000"/>
                            <attachment id="27489" name="soak-11.stacks.and.console.txt.gz" size="887498" author="cliffw" created="Fri, 7 Jul 2017 18:55:52 +0000"/>
                            <attachment id="27504" name="soak-8.2.log.gz" size="3342158" author="cliffw" created="Fri, 7 Jul 2017 18:53:53 +0000"/>
                            <attachment id="27503" name="soak-8.lustre.log.3.txt.gz" size="4407183" author="cliffw" created="Fri, 7 Jul 2017 18:53:54 +0000"/>
                            <attachment id="27505" name="soak-8.lustre.log.4.txt.gz" size="481753" author="cliffw" created="Fri, 7 Jul 2017 19:46:00 +0000"/>
                            <attachment id="27506" name="soak-8.lustre.log.5.txt.gz" size="1726910" author="cliffw" created="Fri, 7 Jul 2017 19:46:04 +0000"/>
                            <attachment id="27517" name="soak-8.lustre.log.6.txt.gz" size="1857474" author="cliffw" created="Fri, 7 Jul 2017 20:05:18 +0000"/>
                            <attachment id="27518" name="soak-8.lustre.log.7.txt.gz" size="4116566" author="cliffw" created="Fri, 7 Jul 2017 20:05:25 +0000"/>
                            <attachment id="27502" name="soak-8.lustre.log.txt.gz" size="12834264" author="cliffw" created="Fri, 7 Jul 2017 18:54:01 +0000"/>
                            <attachment id="27507" name="soak-8.postreboot.log.gz" size="2056" author="cliffw" created="Fri, 7 Jul 2017 19:45:59 +0000"/>
                            <attachment id="27501" name="soak-8.stacks.and.console.txt.gz" size="779614" author="cliffw" created="Fri, 7 Jul 2017 18:53:47 +0000"/>
                            <attachment id="27500" name="soak-9.2.log.gz" size="2584652" author="cliffw" created="Fri, 7 Jul 2017 18:55:31 +0000"/>
                            <attachment id="27499" name="soak-9.lustre.log.3.txt.gz" size="3552292" author="cliffw" created="Fri, 7 Jul 2017 18:55:32 +0000"/>
                            <attachment id="27508" name="soak-9.lustre.log.4.txt.gz" size="5259775" author="cliffw" created="Fri, 7 Jul 2017 19:46:13 +0000"/>
                            <attachment id="27509" name="soak-9.lustre.log.5.txt.gz" size="3937719" author="cliffw" created="Fri, 7 Jul 2017 19:46:12 +0000"/>
                            <attachment id="27519" name="soak-9.lustre.log.6.txt.gz" size="4121848" author="cliffw" created="Fri, 7 Jul 2017 20:05:26 +0000"/>
                            <attachment id="27520" name="soak-9.lustre.log.7.txt.gz" size="3686618" author="cliffw" created="Fri, 7 Jul 2017 20:05:26 +0000"/>
                            <attachment id="27498" name="soak-9.lustre.log.txt.gz" size="555" author="cliffw" created="Fri, 7 Jul 2017 18:55:28 +0000"/>
                            <attachment id="27510" name="soak-9.postreboot.log.gz" size="4515" author="cliffw" created="Fri, 7 Jul 2017 19:45:58 +0000"/>
                            <attachment id="27497" name="soak-9.stacks.and.console.txt.gz" size="328515" author="cliffw" created="Fri, 7 Jul 2017 18:55:28 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzzg9z:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>