<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:25:02 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-2419] mdt threads stuck in ldlm_expired_completion_wait</title>
                <link>https://jira.whamcloud.com/browse/LU-2419</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;One of our production MDS nodes is in trouble, causing application hangs.  It looks like CPU usage is low, but the node has mdt threads hanging for 800+ seconds before timeout.  It is frequently printing backtraces like so:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2012-11-30 16:32:02 Lustre: Service thread pid 4557 was inactive for 808.00s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:
2012-11-30 16:32:02 Lustre: Skipped 4 previous similar messages
2012-11-30 16:32:02 Pid: 4557, comm: mdt_294
2012-11-30 16:32:02 
2012-11-30 16:32:02 Call Trace:
2012-11-30 16:32:02  [&amp;lt;ffffffffa071c590&amp;gt;] ? ldlm_expired_completion_wait+0x0/0x270 [ptlrpc]
2012-11-30 16:32:02  [&amp;lt;ffffffffa04913f1&amp;gt;] ? libcfs_debug_vmsg1+0x41/0x50 [libcfs]
2012-11-30 16:32:02  [&amp;lt;ffffffffa071c590&amp;gt;] ? ldlm_expired_completion_wait+0x0/0x270 [ptlrpc]
2012-11-30 16:32:02  [&amp;lt;ffffffffa048854e&amp;gt;] cfs_waitq_wait+0xe/0x10 [libcfs]
2012-11-30 16:32:02  [&amp;lt;ffffffffa071fe6a&amp;gt;] ldlm_completion_ast+0x4da/0x690 [ptlrpc]
2012-11-30 16:32:02  [&amp;lt;ffffffff8105ea30&amp;gt;] ? default_wake_function+0x0/0x20
2012-11-30 16:32:02  [&amp;lt;ffffffffa071f706&amp;gt;] ldlm_cli_enqueue_local+0x1e6/0x470 [ptlrpc]
2012-11-30 16:32:02  [&amp;lt;ffffffffa071f990&amp;gt;] ? ldlm_completion_ast+0x0/0x690 [ptlrpc]
2012-11-30 16:32:02  [&amp;lt;ffffffffa0c59180&amp;gt;] ? mdt_blocking_ast+0x0/0x230 [mdt]
2012-11-30 16:32:02  [&amp;lt;ffffffffa0c5ae5f&amp;gt;] mdt_object_lock+0x28f/0x980 [mdt]
2012-11-30 16:32:02  [&amp;lt;ffffffffa0c59180&amp;gt;] ? mdt_blocking_ast+0x0/0x230 [mdt]
2012-11-30 16:32:02  [&amp;lt;ffffffffa071f990&amp;gt;] ? ldlm_completion_ast+0x0/0x690 [ptlrpc]
2012-11-30 16:32:02  [&amp;lt;ffffffffa0c5b871&amp;gt;] mdt_object_find_lock+0x61/0x100 [mdt]
2012-11-30 16:32:02  [&amp;lt;ffffffffa0c70fe2&amp;gt;] mdt_md_create+0x102/0x5a0 [mdt]
2012-11-30 16:32:02  [&amp;lt;ffffffffa03af96c&amp;gt;] ? lprocfs_counter_add+0x11c/0x190 [lvfs]
2012-11-30 16:32:02  [&amp;lt;ffffffffa0c71598&amp;gt;] mdt_reint_create+0x118/0x5e0 [mdt]
2012-11-30 16:32:02  [&amp;lt;ffffffffa0c6f2d0&amp;gt;] mdt_reint_rec+0x40/0xb0 [mdt]
2012-11-30 16:32:02  [&amp;lt;ffffffffa0740eb4&amp;gt;] ? lustre_msg_get_flags+0x34/0x70 [ptlrpc]
2012-11-30 16:32:02  [&amp;lt;ffffffffa0c6a0c8&amp;gt;] mdt_reint_internal+0x4f8/0x770 [mdt]
2012-11-30 16:32:02  [&amp;lt;ffffffffa0c6a384&amp;gt;] mdt_reint+0x44/0xc0 [mdt]
2012-11-30 16:32:03  [&amp;lt;ffffffffa0c5e79d&amp;gt;] mdt_handle_common+0x73d/0x12c0 [mdt]
2012-11-30 16:32:03  [&amp;lt;ffffffffa0740cc4&amp;gt;] ? lustre_msg_get_transno+0x54/0x90 [ptlrpc]
2012-11-30 16:32:03  [&amp;lt;ffffffffa0c5f3f5&amp;gt;] mdt_regular_handle+0x15/0x20 [mdt]
2012-11-30 16:32:03  [&amp;lt;ffffffffa074cd64&amp;gt;] ptlrpc_main+0xd24/0x1740 [ptlrpc]
2012-11-30 16:32:03  [&amp;lt;ffffffffa074c040&amp;gt;] ? ptlrpc_main+0x0/0x1740 [ptlrpc]
2012-11-30 16:32:03  [&amp;lt;ffffffff8100c14a&amp;gt;] child_rip+0xa/0x20
2012-11-30 16:32:03  [&amp;lt;ffffffffa074c040&amp;gt;] ? ptlrpc_main+0x0/0x1740 [ptlrpc]
2012-11-30 16:32:03  [&amp;lt;ffffffffa074c040&amp;gt;] ? ptlrpc_main+0x0/0x1740 [ptlrpc]
2012-11-30 16:32:03  [&amp;lt;ffffffff8100c140&amp;gt;] ? child_rip+0x0/0x20
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;See attached file console.momus-mds1.txt for more of the console log, including backtraces from the processes on the system.&lt;/p&gt;</description>
                <environment>Lustre 2.1.2-3chaos (github.com/chaos/lustre)</environment>
        <key id="16834">LU-2419</key>
            <summary>mdt threads stuck in ldlm_expired_completion_wait</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="6" iconUrl="https://jira.whamcloud.com/images/icons/statuses/closed.png" description="The issue is considered finished, the resolution is correct. Issues which are closed can be reopened.">Closed</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="5">Cannot Reproduce</resolution>
                                        <assignee username="bzzz">Alex Zhuravlev</assignee>
                                    <reporter username="morrone">Christopher Morrone</reporter>
                        <labels>
                            <label>llnl</label>
                    </labels>
                <created>Fri, 30 Nov 2012 20:28:12 +0000</created>
                <updated>Thu, 16 Apr 2020 08:29:36 +0000</updated>
                            <resolved>Thu, 16 Apr 2020 08:29:36 +0000</resolved>
                                    <version>Lustre 2.1.2</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>4</watches>
                                                                            <comments>
                            <comment id="48642" author="pjones" created="Sat, 1 Dec 2012 10:27:48 +0000"  >&lt;p&gt;Alex could you please assess this one?&lt;/p&gt;</comment>
                            <comment id="48854" author="bzzz" created="Thu, 6 Dec 2012 07:53:29 +0000"  >&lt;p&gt;2012-11-30 14:38:46 LustreError: 4187:0:(mdt_recovery.c:1011:mdt_steal_ack_locks()) Resent req xid 1417917631054119 has mismatched opc: new 101 old 0&lt;br/&gt;
2012-11-30 14:38:46 Lustre: All locks stolen from rs ffff880129196000 x1417917631054119.t281940978801 o0 NID 172.16.65.148@tcp&lt;br/&gt;
2012-11-30 14:38:47 LustreError: 4303:0:(mdt_recovery.c:1011:mdt_steal_ack_locks()) Resent req xid 1417917631054855 has mismatched opc: new 101 old 0&lt;br/&gt;
2012-11-30 14:38:47 Lustre: All locks stolen from rs ffff8801567d8000 x1417917631054855.t281940981086 o0 NID 172.16.65.148@tcp&lt;/p&gt;

&lt;p&gt;This does not look normal: resends from the same client (172.16.65.148@tcp) within 1s, even though the mdc semaphore serializes requests on the client side?&lt;/p&gt;

&lt;p&gt;The backtraces were the result of ldlm lock enqueues taking too long.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="22968">LU-4572</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="17823">LU-2944</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="12077" name="console.momus-mds1.txt" size="1215738" author="morrone" created="Fri, 30 Nov 2012 20:28:12 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10490" key="com.atlassian.jira.plugin.system.customfieldtypes:datepicker">
                        <customfieldname>End date</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>Thu, 26 Jun 2014 20:28:12 +0000</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                            <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvd5z:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>5736</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                        <customfield id="customfield_10493" key="com.atlassian.jira.plugin.system.customfieldtypes:datepicker">
                        <customfieldname>Start date</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>Fri, 30 Nov 2012 20:28:12 +0000</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                    </customfields>
    </item>
</channel>
</rss>