<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:33:08 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-10222] DNE recovery is failed or stuck</title>
                <link>https://jira.whamcloud.com/browse/LU-10222</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;After an MDT is stopped on one node and brought up on another, recovery fails to complete and attempts to access the filesystem from clients hang.&lt;/p&gt;

&lt;p&gt;In the console log of the affected MDT:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Nov 8 13:49:41 jet15 kernel: [11181.278822] Lustre: lquake-MDT000e: Recovery already passed deadline 5:59. It is due to DNE recovery failed/stuck on the 1 MDT(s): 0001. Please wait until all MDTs recovered or abort the recovery by force.

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;In the recovery_status procfile of the affected MDT, lquake-MDT000e (on host jet15):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;status: WAITING
non-ready MDTs: 0001
recovery_start: 1510186333
time_waited: 76934

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;In the recovery_status procfile of lquake-MDT0001 (on host jet2)&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;status: COMPLETE
recovery_start: 1509398176
recovery_duration: 39
completed_clients: 91/91
replayed_requests: 0
last_transno: 163209257851
VBR: DISABLED
IR: DISABLED

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;At 13:47 on jet2, the kernel watchdog reports several blocked threads, whose stacks all look like this:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;INFO: task z_wr_iss:15546 blocked for more than 120 seconds.
&quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
z_wr_iss D ffff883f6eaeaf70 0 15546 2 0x00000080 
 ffff883f6671bbc0 0000000000000046 ffff883f6671bfd8 ffff883f6671bfd8 
 ffff883f6671bfd8 ffff883f6671bfd8 ffff882c14c70fd0 ffff883f6eaeaf70
 ffffffff00000001 ffff887f7471cac0 ffff883f6671bfd8 ffffffff00000000
Call Trace:
 [&amp;lt;ffffffff816ca759&amp;gt;] schedule+0x29/0x70
 [&amp;lt;ffffffff816cc335&amp;gt;] rwsem_down_write_failed+0x285/0x3f0
 [&amp;lt;ffffffff813481d7&amp;gt;] call_rwsem_down_write_failed+0x17/0x30
 [&amp;lt;ffffffffc03fef85&amp;gt;] ? spl_kmem_free+0x35/0x40 [spl]
 [&amp;lt;ffffffff816c9b40&amp;gt;] down_write+0x40/0x50
 [&amp;lt;ffffffffc07beb27&amp;gt;] dbuf_write_ready+0x207/0x310 [zfs]
 [&amp;lt;ffffffffc07b8b26&amp;gt;] arc_write_ready+0xa6/0x310 [zfs]
 [&amp;lt;ffffffff816c88d5&amp;gt;] ? mutex_lock+0x25/0x42
 [&amp;lt;ffffffffc0885ec4&amp;gt;] zio_ready+0x94/0x420 [zfs]
 [&amp;lt;ffffffffc040783e&amp;gt;] ? tsd_get_by_thread+0x2e/0x50 [spl]
 [&amp;lt;ffffffffc04013c8&amp;gt;] ? taskq_member+0x18/0x30 [spl]
 [&amp;lt;ffffffffc087f7ac&amp;gt;] zio_execute+0x9c/0x100 [zfs]
 [&amp;lt;ffffffffc0402326&amp;gt;] taskq_thread+0x246/0x470 [spl]
 [&amp;lt;ffffffff810c9de0&amp;gt;] ? wake_up_state+0x20/0x20
 [&amp;lt;ffffffffc04020e0&amp;gt;] ? taskq_thread_spawn+0x60/0x60 [spl]
 [&amp;lt;ffffffff810b4eef&amp;gt;] kthread+0xcf/0xe0
 [&amp;lt;ffffffff810b4e20&amp;gt;] ? insert_kthread_work+0x40/0x40
 [&amp;lt;ffffffff816d6818&amp;gt;] ret_from_fork+0x58/0x90
 [&amp;lt;ffffffff810b4e20&amp;gt;] ? insert_kthread_work+0x40/0x40

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;I believe this means they were blocked in dbuf_write_ready() at rw_enter():&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt; &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (!BP_IS_EMBEDDED(bp))
 BP_SET_FILL(bp, fill);

mutex_exit(&amp;amp;db-&amp;gt;db_mtx);

rw_enter(&amp;amp;dn-&amp;gt;dn_struct_rwlock, RW_WRITER);
 *db-&amp;gt;db_blkptr = *bp;
 rw_exit(&amp;amp;dn-&amp;gt;dn_struct_rwlock);

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;</description>
                <environment>fs/lustre-release-fe&lt;br/&gt;
lustre-2.8.0_13&lt;br/&gt;
zfs-0.7.2-4llnl.ch6.x86_64&lt;br/&gt;
kernel-3.10.0-693.5.2.1chaos.ch6.x86_64&lt;br/&gt;
DNE1 file system with 16 MDTs</environment>
        <key id="49227">LU-10222</key>
            <summary>DNE recovery is failed or stuck</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="6" iconUrl="https://jira.whamcloud.com/images/icons/statuses/closed.png" description="The issue is considered finished, the resolution is correct. Issues which are closed can be reopened.">Closed</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="6">Not a Bug</resolution>
                                        <assignee username="pjones">Peter Jones</assignee>
                                    <reporter username="ofaaland">Olaf Faaland</reporter>
                        <labels>
                            <label>llnl</label>
                    </labels>
                <created>Thu, 9 Nov 2017 23:06:02 +0000</created>
                <updated>Mon, 20 Nov 2017 17:01:03 +0000</updated>
                            <resolved>Mon, 20 Nov 2017 17:00:57 +0000</resolved>
                                                                        <due></due>
                            <votes>0</votes>
                                    <watches>3</watches>
                                                                            <comments>
                            <comment id="213302" author="ofaaland" created="Fri, 10 Nov 2017 01:36:41 +0000"  >&lt;p&gt;Don&apos;t do any work on this yet.&#160; I&apos;m attempting to find out what process is holding the lock.&lt;/p&gt;</comment>
                            <comment id="213320" author="pjones" created="Fri, 10 Nov 2017 13:16:42 +0000"  >&lt;p&gt;ok Olaf. We&apos;ll hold tight for now.&lt;/p&gt;</comment>
                            <comment id="214153" author="ofaaland" created="Mon, 20 Nov 2017 16:58:40 +0000"  >&lt;p&gt;We concluded this was not caused by Lustre, as our 2.8.0-based Lustre build does not take this lock itself.&#160; Closing.&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzznl3:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>