<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:52:16 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-5530] MDS thread lockup witrh patched 2.5 server</title>
                <link>https://jira.whamcloud.com/browse/LU-5530</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;While testing 2.5 servers for a possible upcoming test shot I hit this bug while running simul.&lt;/p&gt;

&lt;p&gt;&amp;lt;6&amp;gt;[ 3724.190222] mdt00_003     D 0000000000000010     0 15957      2 0x00000000&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.197292]  ffff8807e1f118b8 0000000000000046 0000000000000000 ffffffffa0f5b6eb&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.205018]  ffff8810021cf190 ffff8810021cf138 ffff8808326a0538 ffffffffa0f5b6eb&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.212759]  ffff8807f846fab8 ffff8807e1f11fd8 000000000000fbc8 ffff8807f846fab8&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.220491] Call Trace:&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.223032]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8152a6d5&amp;gt;&amp;#93;&lt;/span&gt; rwsem_down_failed_common+0x95/0x1d0&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.229597]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0d9d3fb&amp;gt;&amp;#93;&lt;/span&gt; ? ldiskfs_xattr_trusted_get+0x2b/0x30 &lt;span class=&quot;error&quot;&gt;&amp;#91;ldiskfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.237247]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff811ae017&amp;gt;&amp;#93;&lt;/span&gt; ? generic_getxattr+0x87/0x90&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.243199]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8152a866&amp;gt;&amp;#93;&lt;/span&gt; rwsem_down_read_failed+0x26/0x30&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.249499]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0fe8083&amp;gt;&amp;#93;&lt;/span&gt; ? lod_xattr_get+0x153/0x420 &lt;span class=&quot;error&quot;&gt;&amp;#91;lod&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.255867]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8128eab4&amp;gt;&amp;#93;&lt;/span&gt; call_rwsem_down_read_failed+0x14/0x30&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.262580]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81529d64&amp;gt;&amp;#93;&lt;/span&gt; ? down_read+0x24/0x30&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.267923]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0f2569d&amp;gt;&amp;#93;&lt;/span&gt; mdt_object_open_lock+0x1ed/0x9d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.274736]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0f077e0&amp;gt;&amp;#93;&lt;/span&gt; ? mdt_attr_get_complex+0x520/0x7f0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.281720]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0f2dcc7&amp;gt;&amp;#93;&lt;/span&gt; mdt_reint_open+0x15b7/0x2150 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.288187]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05e9f76&amp;gt;&amp;#93;&lt;/span&gt; ? upcall_cache_get_entry+0x296/0x880 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.295688]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa073fc10&amp;gt;&amp;#93;&lt;/span&gt; ? lu_ucred+0x20/0x30 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.301900]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0f16611&amp;gt;&amp;#93;&lt;/span&gt; mdt_reint_rec+0x41/0xe0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.307916]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0efbe63&amp;gt;&amp;#93;&lt;/span&gt; mdt_reint_internal+0x4c3/0x780 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.314551]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0efc3ee&amp;gt;&amp;#93;&lt;/span&gt; mdt_intent_reint+0x1ee/0x520 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.321023]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0ef9bce&amp;gt;&amp;#93;&lt;/span&gt; mdt_intent_policy+0x3ae/0x770 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.327619]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa085a2e5&amp;gt;&amp;#93;&lt;/span&gt; ldlm_lock_enqueue+0x135/0x950 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.334445]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0883ccf&amp;gt;&amp;#93;&lt;/span&gt; ldlm_handle_enqueue0+0x50f/0x10c0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.341608]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0efa096&amp;gt;&amp;#93;&lt;/span&gt; mdt_enqueue+0x46/0xe0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.347484]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0efec5a&amp;gt;&amp;#93;&lt;/span&gt; mdt_handle_common+0x52a/0x1470 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.354147]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0f3b945&amp;gt;&amp;#93;&lt;/span&gt; mds_regular_handle+0x15/0x20 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.360670]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa08b4015&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_server_handle_request+0x385/0xc00 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.368508]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05ce4ce&amp;gt;&amp;#93;&lt;/span&gt; ? cfs_timer_arm+0xe/0x10 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.374864]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05df3cf&amp;gt;&amp;#93;&lt;/span&gt; ? lc_watchdog_touch+0x6f/0x170 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.381791]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa08ab699&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_wait_event+0xa9/0x2d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.388691]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff810546b9&amp;gt;&amp;#93;&lt;/span&gt; ? __wake_up_common+0x59/0x90&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.394666]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa08b537d&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_main+0xaed/0x1920 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.402340]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa08b4890&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_main+0x0/0x1920 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.408702]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8109ab56&amp;gt;&amp;#93;&lt;/span&gt; kthread+0x96/0xa0&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.413684]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c20a&amp;gt;&amp;#93;&lt;/span&gt; child_rip+0xa/0x20&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.418754]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8109aac0&amp;gt;&amp;#93;&lt;/span&gt; ? kthread+0x0/0xa0&lt;br/&gt;
&amp;lt;4&amp;gt;[ 3724.423841]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c200&amp;gt;&amp;#93;&lt;/span&gt; ? child_rip+0x0/0x20&lt;/p&gt;</description>
                <environment>RHEL6.5 MDS server running latest 2.5 with patches for&lt;br/&gt;
&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-793&quot; title=&quot;Reconnections should not be refused when there is a request in progress from this client.&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-793&quot;&gt;&lt;strike&gt;LU-793&lt;/strike&gt;&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2827&quot; title=&quot;mdt_intent_fixup_resent() cannot find the proper lock in hash&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2827&quot;&gt;&lt;strike&gt;LU-2827&lt;/strike&gt;&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3338&quot; title=&quot;IOC_MDC_GETFILESTRIPE can abuse vmalloc()&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3338&quot;&gt;&lt;strike&gt;LU-3338&lt;/strike&gt;&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4933&quot; title=&quot;Automatically tune the max_dirty_mb&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4933&quot;&gt;&lt;strike&gt;LU-4933&lt;/strike&gt;&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5266&quot; title=&quot;LBUG on Failover -ldlm_process_extent_lock()) ASSERTION( lock-&amp;gt;l_granted_mode != lock-&amp;gt;l_req_mode ) &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5266&quot;&gt;&lt;strike&gt;LU-5266&lt;/strike&gt;&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5496&quot; title=&quot;fix for LU-5266&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5496&quot;&gt;&lt;strike&gt;LU-5496&lt;/strike&gt;&lt;/a&gt;&lt;br/&gt;
</environment>
        <key id="26108">LU-5530</key>
            <summary>MDS thread lockup witrh patched 2.5 server</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="green">Oleg Drokin</assignee>
                                    <reporter username="simmonsja">James A Simmons</reporter>
                        <labels>
                    </labels>
                <created>Thu, 21 Aug 2014 18:58:52 +0000</created>
                <updated>Wed, 13 Oct 2021 03:05:46 +0000</updated>
                            <resolved>Sat, 11 Oct 2014 04:11:11 +0000</resolved>
                                    <version>Lustre 2.5.2</version>
                                    <fixVersion>Lustre 2.7.0</fixVersion>
                    <fixVersion>Lustre 2.5.4</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>11</watches>
                                                                            <comments>
                            <comment id="92161" author="simmonsja" created="Thu, 21 Aug 2014 19:07:17 +0000"  >&lt;p&gt;I uploaded a crash dump to ftp.whamcloud.com/uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5530&quot; title=&quot;MDS thread lockup witrh patched 2.5 server&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5530&quot;&gt;&lt;del&gt;LU-5530&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Also you will see the debuginfo rpms as well.&lt;/p&gt;</comment>
                            <comment id="92179" author="pjones" created="Thu, 21 Aug 2014 21:29:56 +0000"  >&lt;p&gt;Niu&lt;/p&gt;

&lt;p&gt;Could you please advise on this one?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="92203" author="niu" created="Fri, 22 Aug 2014 09:29:09 +0000"  >&lt;p&gt;Hi, James&lt;/p&gt;

&lt;p&gt;I downloaded the vmcore-dmesg.txt, it looks lots service threads are blocked on the inode-&amp;gt;xattr_sem when trying to read xattr. unfortunately, downloading those files are extremely slow for me, it could take more than 10 hours to download a core file, I tried few times, and gave up at the end.&lt;/p&gt;

&lt;p&gt;Could you send me the output of the &quot;bt -a&quot; and &quot;log&quot; (two crash commands)? Thanks in advance.&lt;/p&gt;</comment>
                            <comment id="92735" author="simmonsja" created="Thu, 28 Aug 2014 18:17:34 +0000"  >&lt;p&gt;I did get the bt-a and log data but it didn&apos;t reveal much. The bt-a looked like the dmesg output. Perhaps we should approach this with kernel lockdep?&lt;/p&gt;</comment>
                            <comment id="92776" author="niu" created="Fri, 29 Aug 2014 03:01:42 +0000"  >&lt;p&gt;Is it possible that the messages in vmcore-dmesg.txt was truncated? I think the output of bt-a and log could be still valuable. I never used lockdep, so I&apos;m afraid that I can&apos;t give any advice on this.&lt;/p&gt;</comment>
                            <comment id="92977" author="simmonsja" created="Tue, 2 Sep 2014 17:18:01 +0000"  >&lt;p&gt;crash&amp;gt; bt ffff8807e2630080&lt;br/&gt;
PID: 15771  TASK: ffff8807e2630080  CPU: 20  COMMAND: &quot;mdt01_000&quot;&lt;br/&gt;
 #0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8807e26817f8&amp;#93;&lt;/span&gt; schedule at ffffffff81528020&lt;br/&gt;
 #1 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8807e26818c0&amp;#93;&lt;/span&gt; rwsem_down_failed_common at ffffffff8152a6d5&lt;br/&gt;
 #2 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8807e2681920&amp;#93;&lt;/span&gt; rwsem_down_read_failed at ffffffff8152a866&lt;br/&gt;
 #3 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8807e2681960&amp;#93;&lt;/span&gt; call_rwsem_down_read_failed at ffffffff8128eab4&lt;br/&gt;
 #4 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8807e26819c8&amp;#93;&lt;/span&gt; mdt_object_open_lock at ffffffffa0f2569d &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #5 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8807e2681a38&amp;#93;&lt;/span&gt; mdt_reint_open at ffffffffa0f2dcc7 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #6 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8807e2681b18&amp;#93;&lt;/span&gt; mdt_reint_rec at ffffffffa0f16611 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #7 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8807e2681b38&amp;#93;&lt;/span&gt; mdt_reint_internal at ffffffffa0efbe63 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #8 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8807e2681b78&amp;#93;&lt;/span&gt; mdt_intent_reint at ffffffffa0efc3ee &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #9 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8807e2681bc8&amp;#93;&lt;/span&gt; mdt_intent_policy at ffffffffa0ef9bce &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
#10 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8807e2681c08&amp;#93;&lt;/span&gt; ldlm_lock_enqueue at ffffffffa085a2e5 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
#11 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8807e2681c78&amp;#93;&lt;/span&gt; ldlm_handle_enqueue0 at ffffffffa0883ccf &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
#12 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8807e2681ce8&amp;#93;&lt;/span&gt; mdt_enqueue at ffffffffa0efa096 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
#13 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8807e2681d08&amp;#93;&lt;/span&gt; mdt_handle_common at ffffffffa0efec5a &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
#14 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8807e2681d58&amp;#93;&lt;/span&gt; mds_regular_handle at ffffffffa0f3b945 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
#15 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8807e2681d68&amp;#93;&lt;/span&gt; ptlrpc_server_handle_request at ffffffffa08b4015 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
#16 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8807e2681e48&amp;#93;&lt;/span&gt; ptlrpc_main at ffffffffa08b537d &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
#17 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8807e2681ee8&amp;#93;&lt;/span&gt; kthread at ffffffff8109ab56&lt;br/&gt;
#18 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8807e2681f48&amp;#93;&lt;/span&gt; kernel_thread at ffffffff8100c20a&lt;/p&gt;

&lt;p&gt;info&lt;/p&gt;

&lt;p&gt;[ 3724.166886] INFO: task mdt00_003:15957 blocked for more than 120 seconds.&lt;br/&gt;
[ 3724.173775]       Tainted: P           ---------------    2.6.32-431.17.1.el6.wc.x86_64 #1&lt;br/&gt;
[ 3724.182236] &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.&lt;br/&gt;
[ 3724.190222] mdt00_003     D 0000000000000010     0 15957      2 0x00000000&lt;br/&gt;
[ 3724.197292]  ffff8807e1f118b8 0000000000000046 0000000000000000 ffffffffa0f5b6eb&lt;br/&gt;
[ 3724.205018]  ffff8810021cf190 ffff8810021cf138 ffff8808326a0538 ffffffffa0f5b6eb&lt;br/&gt;
[ 3724.212759]  ffff8807f846fab8 ffff8807e1f11fd8 000000000000fbc8 ffff8807f846fab8&lt;br/&gt;
[ 3724.220491] Call Trace:&lt;br/&gt;
[ 3724.223032]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8152a6d5&amp;gt;&amp;#93;&lt;/span&gt; rwsem_down_failed_common+0x95/0x1d0&lt;br/&gt;
[ 3724.229597]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0d9d3fb&amp;gt;&amp;#93;&lt;/span&gt; ? ldiskfs_xattr_trusted_get+0x2b/0x30 &lt;span class=&quot;error&quot;&gt;&amp;#91;ldiskfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
[ 3724.237247]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff811ae017&amp;gt;&amp;#93;&lt;/span&gt; ? generic_getxattr+0x87/0x90&lt;br/&gt;
[ 3724.243199]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8152a866&amp;gt;&amp;#93;&lt;/span&gt; rwsem_down_read_failed+0x26/0x30&lt;br/&gt;
[ 3724.249499]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0fe8083&amp;gt;&amp;#93;&lt;/span&gt; ? lod_xattr_get+0x153/0x420 &lt;span class=&quot;error&quot;&gt;&amp;#91;lod&amp;#93;&lt;/span&gt;&lt;br/&gt;
[ 3724.255867]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8128eab4&amp;gt;&amp;#93;&lt;/span&gt; call_rwsem_down_read_failed+0x14/0x30&lt;br/&gt;
[ 3724.262580]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81529d64&amp;gt;&amp;#93;&lt;/span&gt; ? down_read+0x24/0x30&lt;br/&gt;
[ 3724.267923]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0f2569d&amp;gt;&amp;#93;&lt;/span&gt; mdt_object_open_lock+0x1ed/0x9d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
[ 3724.274736]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0f077e0&amp;gt;&amp;#93;&lt;/span&gt; ? mdt_attr_get_complex+0x520/0x7f0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
[ 3724.281720]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0f2dcc7&amp;gt;&amp;#93;&lt;/span&gt; mdt_reint_open+0x15b7/0x2150 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
[ 3724.288187]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05e9f76&amp;gt;&amp;#93;&lt;/span&gt; ? upcall_cache_get_entry+0x296/0x880 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
[ 3724.295688]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa073fc10&amp;gt;&amp;#93;&lt;/span&gt; ? lu_ucred+0x20/0x30 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
[ 3724.301900]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0f16611&amp;gt;&amp;#93;&lt;/span&gt; mdt_reint_rec+0x41/0xe0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
[ 3724.307916]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0efbe63&amp;gt;&amp;#93;&lt;/span&gt; mdt_reint_internal+0x4c3/0x780 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
[ 3724.314551]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0efc3ee&amp;gt;&amp;#93;&lt;/span&gt; mdt_intent_reint+0x1ee/0x520 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
[ 3724.321023]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0ef9bce&amp;gt;&amp;#93;&lt;/span&gt; mdt_intent_policy+0x3ae/0x770 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
[ 3724.327619]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa085a2e5&amp;gt;&amp;#93;&lt;/span&gt; ldlm_lock_enqueue+0x135/0x950 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;[ 3724.334445]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0883ccf&amp;gt;&amp;#93;&lt;/span&gt; ldlm_handle_enqueue0+0x50f/0x10c0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
[ 3724.341608]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0efa096&amp;gt;&amp;#93;&lt;/span&gt; mdt_enqueue+0x46/0xe0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
[ 3724.347484]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0efec5a&amp;gt;&amp;#93;&lt;/span&gt; mdt_handle_common+0x52a/0x1470 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
[ 3724.354147]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0f3b945&amp;gt;&amp;#93;&lt;/span&gt; mds_regular_handle+0x15/0x20 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
[ 3724.360670]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa08b4015&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_server_handle_request+0x385/0xc00 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
[ 3724.368508]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05ce4ce&amp;gt;&amp;#93;&lt;/span&gt; ? cfs_timer_arm+0xe/0x10 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
[ 3724.374864]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05df3cf&amp;gt;&amp;#93;&lt;/span&gt; ? lc_watchdog_touch+0x6f/0x170 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
[ 3724.381791]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa08ab699&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_wait_event+0xa9/0x2d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
[ 3724.388691]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff810546b9&amp;gt;&amp;#93;&lt;/span&gt; ? __wake_up_common+0x59/0x90&lt;br/&gt;
[ 3724.394666]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa08b537d&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_main+0xaed/0x1920 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
[ 3724.402340]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa08b4890&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_main+0x0/0x1920 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
[ 3724.408702]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8109ab56&amp;gt;&amp;#93;&lt;/span&gt; kthread+0x96/0xa0&lt;br/&gt;
[ 3724.413684]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c20a&amp;gt;&amp;#93;&lt;/span&gt; child_rip+0xa/0x20&lt;br/&gt;
[ 3724.418754]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8109aac0&amp;gt;&amp;#93;&lt;/span&gt; ? kthread+0x0/0xa0&lt;br/&gt;
[ 3724.423841]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c200&amp;gt;&amp;#93;&lt;/span&gt; ? child_rip+0x0/0x20&lt;/p&gt;</comment>
                            <comment id="93549" author="green" created="Tue, 9 Sep 2014 16:58:31 +0000"  >&lt;p&gt;So I have looked in the core file, checked all mds threads.&lt;br/&gt;
There is no obvious culprit for the deadlock observed. Nothing sleeping or otherwise blocked with this lock held.&lt;br/&gt;
So it appears that the semaphore is somehow leaked.&lt;br/&gt;
I inspected all the callers and users of the open semaphore and there does not seem to be any possible way for the leakage.&lt;br/&gt;
I double-checked using ORNL patched 2.5 tree and it seems to be the case there as well.&lt;/p&gt;

&lt;p&gt;As such I am switching mode now.&lt;br/&gt;
The plan is to trigger resending of the client requests at all times due to too small of a buffer. Either by making the server to always send huge replies or by the client to always provide too little ones.&lt;/p&gt;

&lt;p&gt;I am getting some interesting crashes this way already, though possibly this is a bit self-inflicted.&lt;br/&gt;
This is also useful so that we can execute this code even on our small test clusters during normal operations.&lt;/p&gt;</comment>
                            <comment id="93637" author="green" created="Wed, 10 Sep 2014 01:51:47 +0000"  >&lt;p&gt;My &quot;force frequent resends to facilitate lu2827-like issues&quot; patch really worked as expected ( &lt;a href=&quot;http://review.whamcloud.com/11842&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/11842&lt;/a&gt; ).&lt;/p&gt;

&lt;p&gt;Now I was able to reproduce this problem and found out that it is &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2827&quot; title=&quot;mdt_intent_fixup_resent() cannot find the proper lock in hash&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2827&quot;&gt;&lt;del&gt;LU-2827&lt;/del&gt;&lt;/a&gt; related too - mismatched release of a semaphore (so that it was released more than it was taken) in mdt_reint_open.&lt;br/&gt;
Proposed patch for this issue: &lt;a href=&quot;http://review.whamcloud.com/11841&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/11841&lt;/a&gt; (also applies to b2_5)&lt;/p&gt;

&lt;p&gt;In addition, after fixing this I also hit  &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5579&quot; title=&quot;MDS crashed by &amp;quot;mdt_check_resent_lock()) ASSERTION( lock != NULL ) failed&amp;quot;&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5579&quot;&gt;&lt;del&gt;LU-5579&lt;/del&gt;&lt;/a&gt; while testing with racer. And &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5496&quot; title=&quot;fix for LU-5266&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5496&quot;&gt;&lt;del&gt;LU-5496&lt;/del&gt;&lt;/a&gt; is also a bugfix in a related area so both of those patches should be adopted too.&lt;/p&gt;</comment>
                            <comment id="96173" author="pjones" created="Sat, 11 Oct 2014 04:11:11 +0000"  >&lt;p&gt;Landed for 2.5.4 and 2.7&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                                        </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="17609">LU-2827</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="26782">LU-5686</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzwu9j:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>15395</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>