<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:01:30 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-13464] MDT0000 remount in recovery 40 hours</title>
                <link>https://jira.whamcloud.com/browse/LU-13464</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;&lt;b&gt;Summary&lt;/b&gt;&lt;br/&gt;
metadata server mds01 encountered ldiskfs error which caused the mdt0000 to be remounted read-only.&lt;br/&gt;
Exascaler HA remounted mdt0000 volume, which was in recovery 40+ hours.&lt;/p&gt;

&lt;p&gt;&lt;b&gt;Details&lt;/b&gt;&lt;/p&gt;

&lt;p&gt;mds01 encountered ldiskfs error &quot;illegal pblock&quot;, which caused mdt0000 to be remounted read-only&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Mar 21 14:41:20 mds01 kernel: LDISKFS-fs error (device dm-18): ldiskfs_map_blocks:592: inode #1426025594: block 1836017711: comm mdt00_094: lblock 0 mapped to illegal pblock 1836017711 (length 1)
Mar 21 14:41:20 mds01 kernel: Aborting journal on device dm-18-8.
Mar 21 14:41:20 mds01 kernel: LustreError: 47100:0:(osd_handler.c:1727:osd_trans_commit_cb()) transaction @0xffff9575746da700 commit error: 2
Mar 21 14:41:20 mds01 kernel: LDISKFS-fs (dm-18): Remounting filesystem read-only
Mar 21 14:41:20 mds01 kernel: LustreError: 47222:0:(osd_io.c:1833:osd_ldiskfs_read()) dm-18: can&apos;t read 59@0 on ino 1426025594: rc = -5
Mar 21 14:41:20 mds01 kernel: LustreError: 47222:0:(mdd_dir.c:4507:mdd_migrate()) eaglefs-MDD0000: [0x2001457f2:0xbb37:0x0] readlink failed: rc = -5
Mar 21 14:41:20 mds01 kernel: LDISKFS-fs warning (device dm-18): kmmpd:186: kmmpd being stopped since filesystem has been remounted as readonly.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Then Exascaler HA remounted mdt0000 volume&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Mar 21 14:42:25 mds01 kernel: LDISKFS-fs warning (device dm-18): ldiskfs_multi_mount_protect:321: MMP interval 42 higher than expected, please wait.\x0a
Mar 21 14:43:08 mds01 kernel: LDISKFS-fs warning (device dm-18): ldiskfs_clear_journal_err:4994: Filesystem error recorded from previous mount: IO failure
Mar 21 14:43:08 mds01 kernel: LDISKFS-fs warning (device dm-18): ldiskfs_clear_journal_err:4995: Marking fs in need of filesystem check.
Mar 21 14:43:08 mds01 kernel: LDISKFS-fs (dm-18): warning: mounting fs with errors, running e2fsck is recommended
Mar 21 14:43:08 mds01 kernel: LDISKFS-fs (dm-18): recovery complete
Mar 21 14:43:08 mds01 kernel: LDISKFS-fs (dm-18): mounted filesystem with ordered data mode. Opts: user_xattr,errors=remount-ro,no_mbcache,nodelalloc
Mar 21 14:43:09 mds01 kernel: Lustre: eaglefs-MDT0000: Imperative Recovery enabled, recovery window shrunk from 300-900 down to 150-900
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;mds01 began recovery&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Mar 21 14:43:09 mds01 kernel: Lustre: eaglefs-MDT0000: Will be in recovery for at least 2:30, or until 2199 clients reconnect
Mar 21 14:48:46 mds01 kernel: Lustre: eaglefs-MDT0000: Denying connection for new client 4b73dcdb-c341-5e93-6e2d-56df59cccb3c (at 10.148.4.147@o2ib), waiting for 2199 known clients (2152 recovered, 44 in progress, and 3 evicted) to recover in 0:53
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;mds01 recovery encountered &quot;hard timeout&quot; 41 hours later&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Mar 23 08:31:14 mds01 kernel: Lustre: eaglefs-MDT0000: Denying connection for new client de9156e9-00e3-cbf3-a651-7566c63216df (at 10.148.0.125@o2ib), waiting for 2199 known clients (2152 recovered, 44 in progress, and 3 evicted) already passed deadline 2507:31
Mar 23 08:37:35 mds01 kernel: Lustre: eaglefs-OST0014-osc-MDT0000: Connection to eaglefs-OST0014 (at 10.148.66.34@o2ib) was lost; in progress operations using this service will wait for recovery to complete
Mar 23 08:38:00 mds01 kernel: Lustre: eaglefs-OST003c-osc-MDT0000: Connection to eaglefs-OST003c (at 10.148.66.84@o2ib) was lost; in progress operations using this service will wait for recovery to complete
Mar 23 08:38:47 mds01 kernel: Lustre: eaglefs-OST000a-osc-MDT0000: Connection to eaglefs-OST000a (at 10.148.66.22@o2ib) was lost; in progress operations using this service will wait for recovery to complete
Mar 23 08:41:12 mds01 kernel: Lustre: eaglefs-MDT0000: Denying connection for new client de9156e9-00e3-cbf3-a651-7566c63216df (at 10.148.0.125@o2ib), waiting for 2199 known clients (2152 recovered, 44 in progress, and 3 evicted) already passed deadline 2517:31

Mar 23 08:44:39 mds01 kernel: Lustre: 64029:0:(ldlm_lib.c:2046:target_recovery_overseer()) eaglefs-MDT0000 recovery is aborted by hard timeout
Mar 23 08:44:39 mds01 kernel: Lustre: 64029:0:(ldlm_lib.c:2056:target_recovery_overseer()) recovery is aborted, evict exports in recovery
Mar 23 08:44:41 mds01 kernel: Lustre: eaglefs-MDT0000: Recovery over after 2527:32, of 2199 clients 0 recovered and 2199 were evicted.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="58819">LU-13464</key>
            <summary>MDT0000 remount in recovery 40 hours</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="hongchao.zhang">Hongchao Zhang</assignee>
                                    <reporter username="hongchao.zhang">Hongchao Zhang</reporter>
                        <labels>
                    </labels>
                <created>Sun, 19 Apr 2020 12:28:54 +0000</created>
                <updated>Fri, 8 Jan 2021 08:31:29 +0000</updated>
                            <resolved>Wed, 27 May 2020 15:19:14 +0000</resolved>
                                                    <fixVersion>Lustre 2.14.0</fixVersion>
                    <fixVersion>Lustre 2.12.6</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>3</watches>
                                                                            <comments>
                            <comment id="267984" author="gerrit" created="Sun, 19 Apr 2020 14:00:51 +0000"  >&lt;p&gt;Hongchao Zhang (hongchao@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/38277&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/38277&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13464&quot; title=&quot;MDT0000 remount in recovery 40 hours&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13464&quot;&gt;&lt;del&gt;LU-13464&lt;/del&gt;&lt;/a&gt; target: abort recovery if timer fail&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 34ca5b9121654a0a125e2dbd0e3984faf74b6804&lt;/p&gt;</comment>
                            <comment id="271244" author="gerrit" created="Wed, 27 May 2020 05:04:55 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/38277/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/38277/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13464&quot; title=&quot;MDT0000 remount in recovery 40 hours&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13464&quot;&gt;&lt;del&gt;LU-13464&lt;/del&gt;&lt;/a&gt; target: abort recovery if timer fail&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 87443d9c27e8535c3e17d6bf142ad68d4449b93f&lt;/p&gt;</comment>
                            <comment id="271296" author="pjones" created="Wed, 27 May 2020 15:19:14 +0000"  >&lt;p&gt;Landed for 2.14&lt;/p&gt;</comment>
                            <comment id="282639" author="gerrit" created="Mon, 19 Oct 2020 19:02:37 +0000"  >&lt;p&gt;Jian Yu (yujian@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/40303&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/40303&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13464&quot; title=&quot;MDT0000 remount in recovery 40 hours&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13464&quot;&gt;&lt;del&gt;LU-13464&lt;/del&gt;&lt;/a&gt; target: abort recovery if timer fail&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 6ccd96714ce1061c1018086ae5ac1f228f618ed0&lt;/p&gt;</comment>
                            <comment id="283583" author="gerrit" created="Thu, 29 Oct 2020 07:49:56 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/40303/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/40303/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13464&quot; title=&quot;MDT0000 remount in recovery 40 hours&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13464&quot;&gt;&lt;del&gt;LU-13464&lt;/del&gt;&lt;/a&gt; target: abort recovery if timer fail&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: d1f4de9bb568affc523dcbc46d82f4a6676990de&lt;/p&gt;</comment>
                            <comment id="289011" author="gerrit" created="Fri, 8 Jan 2021 08:31:29 +0000"  >&lt;p&gt;Hongchao Zhang (hongchao@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/41171&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/41171&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13464&quot; title=&quot;MDT0000 remount in recovery 40 hours&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13464&quot;&gt;&lt;del&gt;LU-13464&lt;/del&gt;&lt;/a&gt; ldlm: add recovery time limit&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: ea7a9006d1fe7d3ac5b8a027325901bfbad43a21&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                                        </outwardlinks>
                                                                <inwardlinks description="is related to">
                                                        </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00y7z:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>