<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:43:56 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
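<!--
As a quick illustration of the field restriction described above (a minimal sketch; the
/si/jira.issueviews:issue-xml/KEY/KEY.xml export URL pattern is assumed from standard JIRA
behaviour and is not confirmed by this feed):

    curl -o LU-4572.xml "https://jira.whamcloud.com/si/jira.issueviews:issue-xml/LU-4572/LU-4572.xml?field=key&field=summary"

The returned document should then contain only the key and summary elements for the issue.
-->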
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-4572] hung mdt threads</title>
                <link>https://jira.whamcloud.com/browse/LU-4572</link>
                <project id="10000" key="LU">Lustre</project>
            <description>&lt;p&gt;mdt threads hung. Forced a reboot of the MDS at 2 different times.&lt;/p&gt;

&lt;p&gt;Uploading the following to the ftp site:&lt;br/&gt;
lustre-log.1391239242.7851.txt.gz&lt;br/&gt;
vmcore-dmesg.txt.gz&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Lustre: MGS: haven&apos;t heard from client c546719d-1bcc-571f-a4e3-17f67dc35b50 (at 10.151.31.4@o2ib) in 199 seconds. I think it&apos;s dead, and I am evicting it. exp ffff880fd0587800, cur 1391236411 expire 1391236261 last 1391236212
LNet: Service thread pid 7851 was inactive for 200.00s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:
Pid: 7851, comm: mdt01_055

Call Trace:
 [&amp;lt;ffffffff815404c2&amp;gt;] schedule_timeout+0x192/0x2e0
 [&amp;lt;ffffffff81080610&amp;gt;] ? process_timeout+0x0/0x10
 [&amp;lt;ffffffffa04156d1&amp;gt;] cfs_waitq_timedwait+0x11/0x20 [libcfs]
 [&amp;lt;ffffffffa06d201d&amp;gt;] ldlm_completion_ast+0x4ed/0x960 [ptlrpc]
 [&amp;lt;ffffffffa06cd790&amp;gt;] ? ldlm_expired_completion_wait+0x0/0x390 [ptlrpc]
 [&amp;lt;ffffffff81063be0&amp;gt;] ? default_wake_function+0x0/0x20
 [&amp;lt;ffffffffa06d1758&amp;gt;] ldlm_cli_enqueue_local+0x1f8/0x5d0 [ptlrpc]
 [&amp;lt;ffffffffa06d1b30&amp;gt;] ? ldlm_completion_ast+0x0/0x960 [ptlrpc]
 [&amp;lt;ffffffffa0dd7a90&amp;gt;] ? mdt_blocking_ast+0x0/0x2a0 [mdt]
 [&amp;lt;ffffffffa0dddc0c&amp;gt;] mdt_object_lock0+0x28c/0xaf0 [mdt]
 [&amp;lt;ffffffffa0dd7a90&amp;gt;] ? mdt_blocking_ast+0x0/0x2a0 [mdt]
 [&amp;lt;ffffffffa06d1b30&amp;gt;] ? ldlm_completion_ast+0x0/0x960 [ptlrpc]
 [&amp;lt;ffffffffa0dde534&amp;gt;] mdt_object_lock+0x14/0x20 [mdt]
 [&amp;lt;ffffffffa0dde5a1&amp;gt;] mdt_object_find_lock+0x61/0x170 [mdt]
 [&amp;lt;ffffffffa0e0c80c&amp;gt;] mdt_reint_open+0x8cc/0x20e0 [mdt]
 [&amp;lt;ffffffffa043185e&amp;gt;] ? upcall_cache_get_entry+0x28e/0x860 [libcfs]
 [&amp;lt;ffffffffa06fadcc&amp;gt;] ? lustre_msg_add_version+0x6c/0xc0 [ptlrpc]
 [&amp;lt;ffffffffa05921b0&amp;gt;] ? lu_ucred+0x20/0x30 [obdclass]
 [&amp;lt;ffffffffa0dd7015&amp;gt;] ? mdt_ucred+0x15/0x20 [mdt]
 [&amp;lt;ffffffffa0df31cc&amp;gt;] ? mdt_root_squash+0x2c/0x410 [mdt]
 [&amp;lt;ffffffffa0df7981&amp;gt;] mdt_reint_rec+0x41/0xe0 [mdt]
 [&amp;lt;ffffffffa0ddcb03&amp;gt;] mdt_reint_internal+0x4c3/0x780 [mdt]
 [&amp;lt;ffffffffa0ddd090&amp;gt;] mdt_intent_reint+0x1f0/0x530 [mdt]
 [&amp;lt;ffffffffa0ddaf3e&amp;gt;] mdt_intent_policy+0x39e/0x720 [mdt]
 [&amp;lt;ffffffffa06b2831&amp;gt;] ldlm_lock_enqueue+0x361/0x8d0 [ptlrpc]
 [&amp;lt;ffffffffa06d91ef&amp;gt;] ldlm_handle_enqueue0+0x4ef/0x10b0 [ptlrpc]
 [&amp;lt;ffffffffa0ddb3c6&amp;gt;] mdt_enqueue+0x46/0xe0 [mdt]
 [&amp;lt;ffffffffa0de1ad7&amp;gt;] mdt_handle_common+0x647/0x16d0 [mdt]
 [&amp;lt;ffffffffa0e1b615&amp;gt;] mds_regular_handle+0x15/0x20 [mdt]
 [&amp;lt;ffffffffa070b3c8&amp;gt;] ptlrpc_server_handle_request+0x398/0xc60 [ptlrpc]
 [&amp;lt;ffffffffa04155de&amp;gt;] ? cfs_timer_arm+0xe/0x10 [libcfs]
 [&amp;lt;ffffffffa0426d9f&amp;gt;] ? lc_watchdog_touch+0x6f/0x170 [libcfs]
 [&amp;lt;ffffffffa0702729&amp;gt;] ? ptlrpc_wait_event+0xa9/0x290 [ptlrpc]
 [&amp;lt;ffffffff81055813&amp;gt;] ? __wake_up+0x53/0x70
 [&amp;lt;ffffffffa070c75e&amp;gt;] ptlrpc_main+0xace/0x1700 [ptlrpc]
 [&amp;lt;ffffffffa070bc90&amp;gt;] ? ptlrpc_main+0x0/0x1700 [ptlrpc]
 [&amp;lt;ffffffff8100c0ca&amp;gt;] child_rip+0xa/0x20
 [&amp;lt;ffffffffa070bc90&amp;gt;] ? ptlrpc_main+0x0/0x1700 [ptlrpc]
 [&amp;lt;ffffffffa070bc90&amp;gt;] ? ptlrpc_main+0x0/0x1700 [ptlrpc]
 [&amp;lt;ffffffff8100c0c0&amp;gt;] ? child_rip+0x0/0x20
LustreError: dumping log to /tmp/lustre-log.1391239242.7851
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
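                <!--
                The binary dump referenced above (/tmp/lustre-log.1391239242.7851) can be converted to the
                uploaded text form (lustre-log.1391239242.7851.txt) with lctl; a minimal sketch, assuming
                the dump file is still present on the MDS:

                    lctl debug_file /tmp/lustre-log.1391239242.7851 /tmp/lustre-log.1391239242.7851.txt
                -->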
                <environment>Our Lustre source tree is at:&lt;br/&gt;
&lt;a href=&quot;https://github.com/jlan/lustre-nas&quot;&gt;https://github.com/jlan/lustre-nas&lt;/a&gt;</environment>
        <key id="22968">LU-4572</key>
            <summary>hung mdt threads</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="3">Duplicate</resolution>
                                        <assignee username="green">Oleg Drokin</assignee>
                                    <reporter username="mhanafi">Mahmoud Hanafi</reporter>
                        <labels>
                    </labels>
                <created>Sat, 1 Feb 2014 23:42:05 +0000</created>
                <updated>Wed, 6 Jan 2016 20:22:56 +0000</updated>
                            <resolved>Wed, 6 Jan 2016 20:22:56 +0000</resolved>
                                    <version>Lustre 2.4.1</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>9</watches>
                                                                            <comments>
                            <comment id="76047" author="mhanafi" created="Sat, 1 Feb 2014 23:56:02 +0000"  >&lt;p&gt;also uploaded bt of all tasks: bt.all.lu-4572.gz&lt;/p&gt;</comment>
                            <comment id="76054" author="pjones" created="Sun, 2 Feb 2014 13:17:26 +0000"  >&lt;p&gt;Nathaniel&lt;/p&gt;

&lt;p&gt;Could you please look into this one?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="76173" author="utopiabound" created="Tue, 4 Feb 2014 13:26:55 +0000"  >&lt;p&gt;This looks like it may be the same issue as &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2419&quot; title=&quot;mdt threads stuck in ldlm_expired_completion_wait&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2419&quot;&gt;&lt;del&gt;LU-2419&lt;/del&gt;&lt;/a&gt; (the stack traces are very similar).&lt;/p&gt;</comment>
                            <comment id="76212" author="mhanafi" created="Tue, 4 Feb 2014 19:37:48 +0000"  >&lt;p&gt;Do you need any additional info to debug the issue?&lt;/p&gt;</comment>
                            <comment id="77561" author="green" created="Thu, 20 Feb 2014 23:46:28 +0000"  >&lt;p&gt;What&apos;s your at_max?&lt;/p&gt;</comment>
                            <comment id="77562" author="green" created="Thu, 20 Feb 2014 23:50:56 +0000"  >&lt;p&gt;Also: do you run 2.4 clients from that source as well? Can you please show me dmesg from e.g. 10.151.17.136 during all of this happening?&lt;br/&gt;
Did you just crashdump the node to get all of this info or did you let it stay alive and just live-dumped?&lt;/p&gt;</comment>
                            <comment id="77665" author="mhanafi" created="Fri, 21 Feb 2014 21:08:29 +0000"  >&lt;p&gt;I have attached the console logs from 10.151.17.136 (r445i5n0.conlog.gz). &lt;/p&gt;

&lt;p&gt;We have both 2.4 and 2.1.5 clients.&lt;/p&gt;

&lt;p&gt;at_max = 600. &lt;/p&gt;

&lt;p&gt;The info was taken from a crash dump.&lt;/p&gt;</comment>
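                            <!--
                            For reference, at_max discussed above is the ptlrpc adaptive-timeout ceiling. A minimal
                            sketch of how it can be inspected and configured (assuming a stock Lustre 2.x server
                            where at_max is exposed as a ptlrpc module parameter; the modprobe.d file name is only
                            illustrative):

                                cat /sys/module/ptlrpc/parameters/at_max
                                echo "options ptlrpc at_max=600" > /etc/modprobe.d/lustre-at.conf

                            The modprobe.d setting takes effect when the ptlrpc module is next loaded.
                            -->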
                            <comment id="77684" author="green" created="Sat, 22 Feb 2014 05:20:02 +0000"  >&lt;p&gt;Well, the timeout you posted in the ticket seems to be related to client 10.151.31.4@o2ib being unresponsive, since the client is unresponsive, it does not send any lock cancels too I assume, so the locks it holds could not be cacelled and others who need said locks are forced to wait.&lt;br/&gt;
I also see a number of network errors in logs (and client logs too), are you having some network problems there?&lt;/p&gt;

&lt;p&gt;Further on I see a different kind of problem:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;&amp;lt;4&amp;gt;Lustre: 65757:0:(client.c:1868:ptlrpc_expire_one_request()) @@@ Request sent has timed out &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; slow reply: [sent 1391239042/real 1391239042]  req@ffff8806c8f5d800 x1458619173294584/t0(0) o601-&amp;gt;nbp7-MDT0000-lwp-MDT0000@0@lo:12/10 lens 336/336 e 3 to 1 dl 1391240399 ref 2 fl Rpc:XN/0/ffffffff rc 0/-1
&amp;lt;4&amp;gt;Lustre: 65757:0:(client.c:1868:ptlrpc_expire_one_request()) Skipped 5 previous similar messages
&amp;lt;4&amp;gt;Lustre: 65757:0:(service.c:2031:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (1110:247s); client may timeout.  req@ffff8806d18a2000 x1458067638265204/t0(0) o101-&amp;gt;446c8dd2-5b53-9b80-51b7-1573c85a021b@10.151.17.177@o2ib:0/0 lens 616/544 e 3 to 0 dl 1391240152 ref 1 fl Complete:/0/0 rc 301/301
&amp;lt;4&amp;gt;LNet: Service thread pid 7918 completed after 1357.11s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources).
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;So here we are trying to send a quota RPC over the locally connected LWP and it&apos;s timing out? That doesn&apos;t sound right. The timeout for this local connection is also huge for whatever reason, and I do not see it being assigned any sane value either.&lt;/p&gt;

&lt;p&gt;We can also see that over time the timeout there grows even more.&lt;/p&gt;

&lt;p&gt;The other worrying sign is this:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;&amp;lt;4&amp;gt;Clocksource tsc unstable (delta = -68719428120 ns).  Enable clocksource failover by adding clocksource_failover kernel parameter.
&amp;lt;4&amp;gt;Lustre: 6577:0:(service.c:1889:ptlrpc_server_handle_req_in()) @@@ Slow req_in handling 76s  req@ffff880f7ded2850 x1447159886197772/t0(0) o400-&amp;gt;449796f8-1d26-c76e-7df6-0123fb984781@10.151.57.111@o2ib:0/0 lens 192/0 e 0 to 0 dl 0 ref 1 fl New:/0/ffffffff rc 0/-1
&amp;lt;4&amp;gt;Lustre: mdt: This server is not able to keep up with request traffic (cpu-bound).
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;So 70+ seconds to handle the request_in code is way too excessive. When this happens, do you experience huge CPU utilization?&lt;br/&gt;
Also, I see the tsc unstable message. I think the kernel uses that as its internal timestamp counter, so if it deviates this hugely, could it be that all the internal timeouts are simply not working properly? Is this something you can address, just to be sure?&lt;/p&gt;</comment>
                            <comment id="77686" author="green" created="Sat, 22 Feb 2014 05:40:33 +0000"  >&lt;p&gt;One more question, how many OSTs do you have in your system?&lt;/p&gt;</comment>
                            <comment id="77844" author="mhanafi" created="Tue, 25 Feb 2014 18:29:10 +0000"  >&lt;p&gt;1. I am working through the logs to see if there were any IB fabric issues.&lt;/p&gt;

&lt;p&gt;2. When the system was in that state, CPU utilization was minimal.&lt;/p&gt;

&lt;p&gt;3. We have seen the clocksource message in our logs, but we haven&apos;t found it to cause any issue. I will switch it to hpet anyway.&lt;/p&gt;

&lt;p&gt;4. This filesystem has 84 OSTs. &lt;/p&gt;
</comment>
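                            <!--
                            A minimal sketch of the clocksource switch mentioned in item 3 above (standard Linux
                            sysfs interface; availability of hpet on these nodes is assumed):

                                cat /sys/devices/system/clocksource/clocksource0/available_clocksource
                                echo hpet > /sys/devices/system/clocksource/clocksource0/current_clocksource

                            For a permanent change, add clocksource=hpet to the kernel command line and reboot.
                            -->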
                            <comment id="77870" author="green" created="Tue, 25 Feb 2014 22:48:13 +0000"  >&lt;p&gt;I see. Let&apos;s see if switching to HPET helps any, I guess.&lt;br/&gt;
With no CPU usage when this happened, I do not see why things would be delayed by 70 seconds, other than a broken clock.&lt;/p&gt;</comment>
                            <comment id="97849" author="mhanafi" created="Wed, 29 Oct 2014 16:26:52 +0000"  >&lt;p&gt;please close not seen the issue again.&lt;/p&gt;</comment>
                            <comment id="97854" author="pjones" created="Wed, 29 Oct 2014 16:31:22 +0000"  >&lt;p&gt;ok thanks Mahmoud&lt;/p&gt;</comment>
                            <comment id="138115" author="jay" created="Wed, 6 Jan 2016 19:29:37 +0000"  >&lt;p&gt;I reopened this ticket because I tend to think ldlm_expired_completion_wait() should return an error code at least on client side so that the client enqueue process won&apos;t be stuck on completion AST forever.&lt;/p&gt;

&lt;p&gt;The same issue is seen on &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7372&quot; title=&quot;replay-dual test_26: test failed to respond and timed out&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7372&quot;&gt;&lt;del&gt;LU-7372&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="138130" author="jay" created="Wed, 6 Jan 2016 20:22:56 +0000"  >&lt;p&gt;I decided to fix this issue in the ticket U-7372&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                            <outwardlinks description="duplicates">
                                        <issuelink>
            <issuekey id="16834">LU-2419</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="17823">LU-2944</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="32965">LU-7372</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="14152" name="r445i5n0.conlog.gz" size="2964" author="mhanafi" created="Fri, 21 Feb 2014 21:01:34 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzwe33:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>12492</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>