<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:10:39 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-7640] stuck mdt thread required reboot of mds</title>
                <link>https://jira.whamcloud.com/browse/LU-7640</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;MDS reported stuck mdt threads and dump stack trace&lt;/p&gt;

&lt;p&gt;&amp;lt;code&amp;gt;&lt;br/&gt;
Jan  7 20:02:53 nbp8-mds1 kernel: LNet: Service thread pid 16286 was inactive for 464.00s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:&lt;br/&gt;
Jan  7 20:02:53 nbp8-mds1 kernel: LNet: Skipped 4 previous similar messages&lt;br/&gt;
Jan  7 20:02:57 nbp8-mds1 kernel: Pid: 16286, comm: mdt02_020&lt;br/&gt;
Jan  7 20:02:57 nbp8-mds1 kernel: &lt;br/&gt;
Jan  7 20:02:57 nbp8-mds1 kernel: Call Trace:&lt;br/&gt;
Jan  7 20:02:57 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa04eee01&amp;gt;&amp;#93;&lt;/span&gt; ? libcfs_debug_msg+0x41/0x50 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jan  7 20:02:57 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa078af70&amp;gt;&amp;#93;&lt;/span&gt; ? ldlm_expired_completion_wait+0x0/0x360 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jan  7 20:02:57 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa078f835&amp;gt;&amp;#93;&lt;/span&gt; ldlm_completion_ast+0x545/0x920 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jan  7 20:02:57 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81061fe0&amp;gt;&amp;#93;&lt;/span&gt; ? default_wake_function+0x0/0x20&lt;br/&gt;
Jan  7 20:02:57 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa078ef00&amp;gt;&amp;#93;&lt;/span&gt; ldlm_cli_enqueue_local+0x1f0/0x5e0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jan  7 20:02:57 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa078f2f0&amp;gt;&amp;#93;&lt;/span&gt; ? ldlm_completion_ast+0x0/0x920 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jan  7 20:02:57 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e72de0&amp;gt;&amp;#93;&lt;/span&gt; ? mdt_blocking_ast+0x0/0x2a0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jan  7 20:03:00 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e7cc06&amp;gt;&amp;#93;&lt;/span&gt; mdt_object_lock0+0x1b6/0xb30 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jan  7 20:03:00 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e72de0&amp;gt;&amp;#93;&lt;/span&gt; ? mdt_blocking_ast+0x0/0x2a0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jan  7 20:03:00 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa078f2f0&amp;gt;&amp;#93;&lt;/span&gt; ? ldlm_completion_ast+0x0/0x920 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jan  7 20:03:00 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e7d644&amp;gt;&amp;#93;&lt;/span&gt; mdt_object_lock+0x14/0x20 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jan  7 20:03:00 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e85b8e&amp;gt;&amp;#93;&lt;/span&gt; mdt_getattr_name_lock+0x8fe/0x19d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jan  7 20:03:00 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa07df766&amp;gt;&amp;#93;&lt;/span&gt; ? __req_capsule_get+0x166/0x710 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jan  7 20:03:00 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa07ba7b4&amp;gt;&amp;#93;&lt;/span&gt; ? lustre_msg_get_flags+0x34/0xb0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jan  7 20:03:00 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e86ef9&amp;gt;&amp;#93;&lt;/span&gt; mdt_intent_getattr+0x299/0x480 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jan  7 20:03:00 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e75c3e&amp;gt;&amp;#93;&lt;/span&gt; mdt_intent_policy+0x3ae/0x770 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jan  7 20:03:00 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa076f2c5&amp;gt;&amp;#93;&lt;/span&gt; ldlm_lock_enqueue+0x135/0x980 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jan  7 20:03:00 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0798ebb&amp;gt;&amp;#93;&lt;/span&gt; ldlm_handle_enqueue0+0x51b/0x10c0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jan  7 20:03:00 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e76106&amp;gt;&amp;#93;&lt;/span&gt; mdt_enqueue+0x46/0xe0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jan  7 20:03:00 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e7aada&amp;gt;&amp;#93;&lt;/span&gt; mdt_handle_common+0x52a/0x1470 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jan  7 20:03:00 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0eb74a5&amp;gt;&amp;#93;&lt;/span&gt; mds_regular_handle+0x15/0x20 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jan  7 20:03:00 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa07c80c5&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_server_handle_request+0x385/0xc00 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jan  7 20:03:00 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa04f08d5&amp;gt;&amp;#93;&lt;/span&gt; ? lc_watchdog_touch+0x65/0x170 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jan  7 20:03:00 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa07c0a69&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_wait_event+0xa9/0x2d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jan  7 20:03:00 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa07ca89d&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_main+0xafd/0x1780 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jan  7 20:03:00 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c20a&amp;gt;&amp;#93;&lt;/span&gt; child_rip+0xa/0x20&lt;br/&gt;
Jan  7 20:03:00 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa07c9da0&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_main+0x0/0x1780 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jan  7 20:03:01 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c200&amp;gt;&amp;#93;&lt;/span&gt; ? child_rip+0x0/0x20&lt;br/&gt;
Jan  7 20:03:01 nbp8-mds1 kernel: &lt;br/&gt;
Jan  7 20:03:01 nbp8-mds1 kernel: LustreError: dumping log to /tmp/lustre-log.1452225773.16286&lt;br/&gt;
&amp;lt;code&amp;gt;&lt;/p&gt;

&lt;p&gt;I am attaching  /var/log/messages and lustre debug dump.&lt;/p&gt;

&lt;p&gt;The mds need to be rebooted to clear up the error state.&lt;/p&gt;</description>
                <environment></environment>
        <key id="34007">LU-7640</key>
            <summary>stuck mdt thread required reboot of mds</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="3">Duplicate</resolution>
                                        <assignee username="bobijam">Zhenyu Xu</assignee>
                                    <reporter username="mhanafi">Mahmoud Hanafi</reporter>
                        <labels>
                    </labels>
                <created>Fri, 8 Jan 2016 06:46:16 +0000</created>
                <updated>Wed, 26 Apr 2017 14:45:56 +0000</updated>
                            <resolved>Thu, 4 Feb 2016 22:19:34 +0000</resolved>
                                    <version>Lustre 2.5.3</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>5</watches>
                                                                            <comments>
                            <comment id="138291" author="bobijam" created="Fri, 8 Jan 2016 06:49:45 +0000"  >&lt;p&gt;it&apos;s dup of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7372&quot; title=&quot;replay-dual test_26: test failed to respond and timed out&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7372&quot;&gt;&lt;del&gt;LU-7372&lt;/del&gt;&lt;/a&gt;, and there is patch at  &lt;a href=&quot;http://review.whamcloud.com/17853&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/17853&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="139523" author="jaylan" created="Wed, 20 Jan 2016 23:47:01 +0000"  >&lt;p&gt;How did you think this is a dup of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7372&quot; title=&quot;replay-dual test_26: test failed to respond and timed out&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7372&quot;&gt;&lt;del&gt;LU-7372&lt;/del&gt;&lt;/a&gt;? The stack trace do not look alike.&lt;/p&gt;</comment>
                            <comment id="139539" author="bobijam" created="Thu, 21 Jan 2016 04:21:43 +0000"  >&lt;p&gt;The thread is waiting for a lock get granted or cancelled (ldlm_completion_ast()), and that never happens. And #17853 has fix about ldlm_expired_completion_wait() returning -ETIMEDOUT other than 0, so that the thread won&apos;t stuck.&lt;/p&gt;</comment>
                            <comment id="139646" author="jaylan" created="Thu, 21 Jan 2016 19:26:17 +0000"  >&lt;p&gt;Thank you Zhenyu!&lt;br/&gt;
After a series of problems last night to the mds/mgs of one of our lustre filesystems, it was upgraded to run with &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7372&quot; title=&quot;replay-dual test_26: test failed to respond and timed out&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7372&quot;&gt;&lt;del&gt;LU-7372&lt;/del&gt;&lt;/a&gt; patch. We will see &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/wink.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;</comment>
                            <comment id="139666" author="mhanafi" created="Thu, 21 Jan 2016 19:48:08 +0000"  >&lt;p&gt;We had a crash after the patch was applied.&lt;/p&gt;</comment>
                            <comment id="139702" author="bobijam" created="Fri, 22 Jan 2016 02:24:34 +0000"  >&lt;p&gt;what&apos;s the crash backtrace?&lt;/p&gt;</comment>
                            <comment id="141260" author="pjones" created="Thu, 4 Feb 2016 22:19:34 +0000"  >&lt;p&gt;duplicate of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7692&quot; title=&quot;LNet: Service thread Hung&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7692&quot;&gt;&lt;del&gt;LU-7692&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="193592" author="jhammond" created="Wed, 26 Apr 2017 14:45:56 +0000"  >&lt;p&gt;Just to clarify, recent versions of &lt;a href=&quot;http://review.whamcloud.com/17853&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/17853&lt;/a&gt; no longer contain the change to ldlm_expired_completion_wait() mentioned above and this should not be considered a duplicate of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7372&quot; title=&quot;replay-dual test_26: test failed to respond and timed out&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7372&quot;&gt;&lt;del&gt;LU-7372&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="32965">LU-7372</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="20063" name="lustre-log.1452225773.16286.gz" size="257" author="mhanafi" created="Fri, 8 Jan 2016 06:46:16 +0000"/>
                            <attachment id="20064" name="messages.gz" size="18077" author="mhanafi" created="Fri, 8 Jan 2016 06:46:16 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzxxkn:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>