<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:04:14 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
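For example, a full request URL might look like https://jira.whamcloud.com/si/jira.issueviews:issue-xml/LU-6898/LU-6898.xml?field=key&field=summary
(the /si/jira.issueviews:issue-xml/<KEY>/<KEY>.xml path is assumed here from JIRA's standard XML issue view; adjust if your instance differs).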
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-6898] ldlm_resource_dump()) Granted locks (in reverse order)</title>
                <link>https://jira.whamcloud.com/browse/LU-6898</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;&lt;b&gt;On the client we see errors&lt;/b&gt;&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[1437673998.848774] LustreError: 11-0: nbp8-MDT0000-mdc-ffff8806cb247400: Communicating with 10.151.27.60@o2ib, operation obd_ping failed with -107.
[1437673998.860774] Lustre: nbp8-MDT0000-mdc-ffff8806cb247400: Connection to nbp8-MDT0000 (at 10.151.27.60@o2ib) was lost; in progress operations using this service will wait for recovery to complete
[1437673998.880773] LustreError: 167-0: nbp8-MDT0000-mdc-ffff8806cb247400: This client was evicted by nbp8-MDT0000; in progress operations using this service will fail.
[1437673998.916773] LustreError: 81375:0:(ldlm_resource.c:809:ldlm_resource_complain()) nbp8-MDT0000-mdc-ffff8806cb247400: namespace resource [0x360375393:0xe66d:0x0].0 (ffff8fc07bee8a80) refcount nonzero (1) after lock cleanup; forcing cleanup.
[1437673998.940772] LustreError: 81375:0:(ldlm_resource.c:809:ldlm_resource_complain()) Skipped 2587 previous similar messages
[1437673998.952772] LustreError: 81375:0:(ldlm_resource.c:1448:ldlm_resource_dump()) --- Resource: [0x360375393:0xe66d:0x0].0 (ffff8fc07bee8a80) refcount = 2
[1437673998.952772] LustreError: 81375:0:(ldlm_resource.c:1451:ldlm_resource_dump()) Granted locks (in reverse order):
[1437673998.952772] LustreError: 81375:0:(ldlm_resource.c:1454:ldlm_resource_dump()) ### ### ns: nbp8-MDT0000-mdc-ffff8806cb247400 lock: ffff8fc0fa26dbc0/0x7f099458bb92bf52 lrc: 2/0,0 mode: PR/PR res: [0x360375393:0xe66d:0x0].0 bits 0x1b rrc: 2 type: IBT flags: 0x12e400000000 nid: local remote: 0x551d423294fa4bce expref: -99 pid: 46426 timeout: 0 lvb_type: 3
[1437673998.952772] LustreError: 81375:0:(ldlm_resource.c:1454:ldlm_resource_dump()) Skipped 3648 previous similar messages
[1437673998.952772] LustreError: 81375:0:(ldlm_resource.c:1448:ldlm_resource_dump()) --- Resource: [0x3603755cc:0x6454:0x0].0 (ffff8b075a9a8bc0) refcount = 2
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;&lt;b&gt;Server&lt;/b&gt;&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Jul 23 10:53:08 nbp8-mds1 kernel: LustreError: 0:0:(ldlm_lockd.c:344:waiting_locks_callback()) ### lock callback timer expired after 226s: evicting client at 10.151.63.50@o2ib  ns: mdt-nbp8-MDT0000_UUID lock: ffff882a2c6794c0/0x551d4232f4dfbb5e lrc: 3/0,0 mode: PR/PR res: [0x4976d01:0xe2d4a4f3:0x0].0 bits 0x13 rrc: 848 type: IBT flags: 0x60200000000020 nid: 10.151.63.50@o2ib remote: 0x7f099458bb9c07c4 expref: 9 pid: 9672 timeout: 8029699438 lvb_type: 0
Jul 23 10:53:09 nbp8-mds1 kernel: LNet: 5828:0:(lib-move.c:865:lnet_post_send_locked()) Aborting message for 12345-10.151.12.174@o2ib: LNetM[DE]Unlink() already called on the MD/ME.
Jul 23 10:53:09 nbp8-mds1 kernel: LNet: 5828:0:(lib-move.c:865:lnet_post_send_locked()) Skipped 41 previous similar messages
Jul 23 10:53:39 nbp8-mds1 kernel: format at ldlm_pool.c:628:ldlm_pool_recalc doesn&apos;t end in newline
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;



&lt;p&gt;On the client all ldlm_bl threads are stuck &lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;0xffff8a0739b06080    21185        2  1  711   R  0xffff8a0739b066f0  ldlm_bl_110
 [&amp;lt;ffffffff814760e8&amp;gt;] _raw_spin_unlock_irqrestore+0x8/0x10
 [&amp;lt;ffffffffa0d7a807&amp;gt;] osc_page_delete+0xe7/0x360 [osc]
 [&amp;lt;ffffffffa0ad14d5&amp;gt;] cl_page_delete0+0xc5/0x4e0 [obdclass]
 [&amp;lt;ffffffffa0ad192a&amp;gt;] cl_page_delete+0x3a/0x120 [obdclass]
 [&amp;lt;ffffffffa0ee16a6&amp;gt;] ll_invalidatepage+0x96/0x160 [lustre]
 [&amp;lt;ffffffffa0ef314d&amp;gt;] vvp_page_discard+0x8d/0x120 [lustre]
 [&amp;lt;ffffffffa0acda58&amp;gt;] cl_page_invoid+0x78/0x170 [obdclass]
 [&amp;lt;ffffffffa0ad490c&amp;gt;] discard_cb+0xbc/0x1e0 [obdclass]
 [&amp;lt;ffffffffa0ad2467&amp;gt;] cl_page_gang_lookup+0x1f7/0x3f0 [obdclass]
 [&amp;lt;ffffffffa0ad471a&amp;gt;] cl_lock_discard_pages+0xfa/0x1d0 [obdclass]
 [&amp;lt;ffffffffa0d7c0d2&amp;gt;] osc_lock_flush+0xf2/0x260 [osc]
 [&amp;lt;ffffffffa0d7c339&amp;gt;] osc_lock_cancel+0xf9/0x1e0 [osc]
 [&amp;lt;ffffffffa0ad2bd5&amp;gt;] cl_lock_cancel0+0x65/0x150 [obdclass]
 [&amp;lt;ffffffffa0ad394b&amp;gt;] cl_lock_cancel+0x14b/0x150 [obdclass]
 [&amp;lt;ffffffffa0d7cc1d&amp;gt;] osc_lock_blocking+0x5d/0xf0 [osc]
 [&amp;lt;ffffffffa0d7dff9&amp;gt;] osc_dlm_blocking_ast0+0xf9/0x210 [osc]
 [&amp;lt;ffffffffa0d7e15c&amp;gt;] osc_ldlm_blocking_ast+0x4c/0x100 [osc]
 [&amp;lt;ffffffffa0be4eef&amp;gt;] ldlm_cancel_callback+0x5f/0x180 [ptlrpc]
 [&amp;lt;ffffffffa0bf380f&amp;gt;] ldlm_cli_cancel_local+0x7f/0x480 [ptlrpc]
 [&amp;lt;ffffffffa0bf6b82&amp;gt;] ldlm_cli_cancel_list_local+0xf2/0x290 [ptlrpc]
 [&amp;lt;ffffffffa0bfba07&amp;gt;] ldlm_bl_thread_main+0xf7/0x450 [ptlrpc]
 [&amp;lt;ffffffff81083ae6&amp;gt;] kthread+0x96/0xa0
 [&amp;lt;ffffffff8147f164&amp;gt;] kernel_thread_helper+0x4/0x10
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;These events will cause the MDS IO to stop for a few minutes. &lt;/p&gt;</description>
                <environment>CENTOS6 Lustre2.5.3  Server MOFED2.4&lt;br/&gt;
SLES11 Lustre2.5.3 Client MOFED3.0</environment>
        <key id="31202">LU-6898</key>
            <summary>ldlm_resource_dump()) Granted locks (in reverse order)</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="5">Cannot Reproduce</resolution>
                                        <assignee username="jay">Jinshan Xiong</assignee>
                                    <reporter username="mhanafi">Mahmoud Hanafi</reporter>
                        <labels>
                    </labels>
                <created>Fri, 24 Jul 2015 01:41:55 +0000</created>
                <updated>Fri, 29 Apr 2016 17:58:44 +0000</updated>
                            <resolved>Thu, 10 Mar 2016 22:02:17 +0000</resolved>
                                    <version>Lustre 2.5.3</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>13</watches>
                                                                            <comments>
                            <comment id="122148" author="pjones" created="Fri, 24 Jul 2015 17:21:35 +0000"  >&lt;p&gt;Bobijam&lt;/p&gt;

&lt;p&gt;Could you please look into this one?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="122248" author="bobijam" created="Mon, 27 Jul 2015 12:08:12 +0000"  >&lt;p&gt;the log shows that a client hadn&apos;t finished a lock cancellation while MDT thought that the client is dead and evicted it.  &lt;/p&gt;</comment>
                            <comment id="122313" author="mhanafi" created="Mon, 27 Jul 2015 19:37:14 +0000"  >&lt;p&gt;why was this causing a pause on the MDS? &lt;br/&gt;
FYI, the client is a 1024 core sharemem host. 100&apos;s of  ldlm_bl threads are all stuck at osc_page_delete.&lt;/p&gt;</comment>
                            <comment id="122356" author="bobijam" created="Tue, 28 Jul 2015 01:14:17 +0000"  >&lt;p&gt;Do you have log from the MDS?&lt;/p&gt;</comment>
                            <comment id="122468" author="mhanafi" created="Tue, 28 Jul 2015 19:51:56 +0000"  >&lt;p&gt;There isn&apos;t much more on the MDS logs other than the &quot;lock callback timer expired&quot; I tried to get a lustre debug dump but wan&apos;t able to capture it quickly enough. &lt;/p&gt;</comment>
                            <comment id="122537" author="bobijam" created="Wed, 29 Jul 2015 12:43:58 +0000"  >&lt;p&gt;I think it could be that client&apos;s too busy cancelling locks but still miss the lock callback timeout, can you try &lt;a href=&quot;http://review.whamcloud.com/#/c/14342/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/14342/&lt;/a&gt; and &lt;a href=&quot;http://review.whamcloud.com/#/c/12603&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/12603&lt;/a&gt; on the client node?&lt;/p&gt;</comment>
                            <comment id="122602" author="jaylan" created="Wed, 29 Jul 2015 19:06:07 +0000"  >&lt;p&gt;I need a b2_5 port of #12603. I have a conflict in lustre/ldlm/ldlm_request.c.&lt;/p&gt;</comment>
                            <comment id="122642" author="bobijam" created="Thu, 30 Jul 2015 04:53:48 +0000"  >&lt;p&gt;here &lt;a href=&quot;http://review.whamcloud.com/15800&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/15800&lt;/a&gt; is the b2_5 port of #12603&lt;/p&gt;</comment>
                            <comment id="122767" author="jaylan" created="Thu, 30 Jul 2015 19:45:14 +0000"  >&lt;p&gt;Thank you , Zhenyu.&lt;/p&gt;</comment>
                            <comment id="123078" author="mhanafi" created="Mon, 3 Aug 2015 20:35:57 +0000"  >&lt;p&gt;We tried the patch and it didn&apos;t help. Running a 312 cpu ior job and canceling the ior run would cause all the ldlm treads to lockup in _raw_spin_unlock_irqrestore. Uploading debug logs&lt;/p&gt;

&lt;p&gt;debug.out.withpatch.ofed3.5.2.1438633632.bz2&lt;br/&gt;
debug.out.withpatch.mofed.1438631502.bz2&lt;br/&gt;
debug.out.withpatch.mofed.secondrun.1438631826.bz2&lt;/p&gt;</comment>
                            <comment id="123138" author="jay" created="Tue, 4 Aug 2015 04:00:58 +0000"  >&lt;p&gt;Those threads were not stuck but simply busy. Also the eviction happened on MDC. How many threads did you notice on this state?&lt;/p&gt;</comment>
                            <comment id="123148" author="mhanafi" created="Tue, 4 Aug 2015 07:04:58 +0000"  >&lt;p&gt;I had a 312 CPU job so they or at least that many threads in that state. The threads never finish and cause the client to get evicted.&lt;/p&gt;</comment>
                            <comment id="123881" author="mhanafi" created="Tue, 11 Aug 2015 18:36:46 +0000"  >&lt;p&gt;please update this case.&lt;/p&gt;</comment>
                            <comment id="123892" author="jay" created="Tue, 11 Aug 2015 21:00:12 +0000"  >&lt;p&gt;I will work on this issue.&lt;/p&gt;</comment>
                            <comment id="123917" author="jay" created="Wed, 12 Aug 2015 00:40:26 +0000"  >&lt;p&gt;There are 128 ldlm block threads and all of them are busy at discarding pages. There was probably a long queue out there and it would take long waiting time before this blocking AST from MDT gets handled.&lt;/p&gt;

&lt;p&gt;Based on the reality that this node has 1024 cores, I will increase the number of ldlm threads and see how it goes. At present there is a kernel module parameter ldlm_num_threads but it has a hard limit as 128. Nobody could predict that there exists such fat node.&lt;/p&gt;

&lt;p&gt;If it doesn&apos;t help by increasing number dlm threads, I will make blocking callback of inodebits lock to have higher priority.&lt;/p&gt;</comment>
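
&lt;p&gt;A minimal sketch of how the ldlm_num_threads parameter mentioned above could be inspected and raised at module load time. The sysfs path and module name (ptlrpc) are assumptions here, as is the value 512, which only takes effect once the LDLM_NTHRS_MAX=128 cap has been lifted:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;# read the current value (path assumes the parameter lives in the ptlrpc module)
cat /sys/module/ptlrpc/parameters/ldlm_num_threads

# request more ldlm threads at the next module load
echo &quot;options ptlrpc ldlm_num_threads=512&quot; &amp;gt; /etc/modprobe.d/lustre-ldlm.conf
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;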
                            <comment id="123920" author="mhanafi" created="Wed, 12 Aug 2015 02:12:44 +0000"  >&lt;p&gt;We don&apos;t see this issue in 2.4.3 and can reproduce very easily in 2.5.3. So the behavior has changed. Any patches that may have caused this change?&lt;/p&gt;

</comment>
                            <comment id="123922" author="gerrit" created="Wed, 12 Aug 2015 03:20:34 +0000"  >&lt;p&gt;Jinshan Xiong (jinshan.xiong@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/15960&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/15960&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6898&quot; title=&quot;ldlm_resource_dump()) Granted locks (in reverse order)&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6898&quot;&gt;&lt;del&gt;LU-6898&lt;/del&gt;&lt;/a&gt; ldlm: increase LDLM_NTHRS_MAX of ldlm threads&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_5&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 6ef32d9684b78d04bf67b936c28992a154f14345&lt;/p&gt;</comment>
                            <comment id="123923" author="jay" created="Wed, 12 Aug 2015 03:22:36 +0000"  >&lt;p&gt;I will check it. Meanwhile, if you have hardware resource, you can figure it out by &apos;git bisect&apos;.&lt;/p&gt;</comment>
                            <comment id="123944" author="tomtervo" created="Wed, 12 Aug 2015 13:21:18 +0000"  >&lt;p&gt;We see this same issue on compute nodes with lot of memory (1.5TB, 32 CPU&apos;s) (2.6.32-504.12.2.el6.x86_64, MOFED 2.3 and lustre 2.5.3.90)&lt;/p&gt;</comment>
                            <comment id="124190" author="mhanafi" created="Fri, 14 Aug 2015 21:00:57 +0000"  >&lt;p&gt;patch: &lt;a href=&quot;http://review.whamcloud.com/15960&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/15960&lt;/a&gt; helped some. The client will eventually worked through all the waiting ldlm_bl threads but it takes time. Where in 2.4.3 this was never the case.  So still not fit for production on a large cpu node.&lt;/p&gt;

&lt;p&gt;bt during the wait.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[0]kdb&amp;gt; btp 31094
Stack traceback for pid 31094
0xffff8ac79e562540    31094        2  1  753   R  0xffff8ac79e562bb0  ldlm_bl_169
 [&amp;lt;ffffffff814760e8&amp;gt;] _raw_spin_unlock_irqrestore+0x8/0x10
 [&amp;lt;ffffffffa0d08817&amp;gt;] osc_page_delete+0xe7/0x360 [osc]
 [&amp;lt;ffffffffa0c2f615&amp;gt;] cl_page_delete0+0xc5/0x4e0 [obdclass]
 [&amp;lt;ffffffffa0c2fa6a&amp;gt;] cl_page_delete+0x3a/0x120 [obdclass]
 [&amp;lt;ffffffffa1056776&amp;gt;] ll_invalidatepage+0x96/0x160 [lustre]
 [&amp;lt;ffffffffa106821d&amp;gt;] vvp_page_discard+0x8d/0x120 [lustre]
 [&amp;lt;ffffffffa0c2ba68&amp;gt;] cl_page_invoid+0x78/0x170 [obdclass]
 [&amp;lt;ffffffffa0c32d0c&amp;gt;] discard_cb+0xbc/0x1e0 [obdclass]
 [&amp;lt;ffffffffa0c305a7&amp;gt;] cl_page_gang_lookup+0x1f7/0x3f0 [obdclass]
 [&amp;lt;ffffffffa0c32b1a&amp;gt;] cl_lock_discard_pages+0xfa/0x1d0 [obdclass]
 [&amp;lt;ffffffffa0d0a0f2&amp;gt;] osc_lock_flush+0xf2/0x260 [osc]
 [&amp;lt;ffffffffa0d0a359&amp;gt;] osc_lock_cancel+0xf9/0x1e0 [osc]
 [&amp;lt;ffffffffa0c30fd5&amp;gt;] cl_lock_cancel0+0x65/0x150 [obdclass]
 [&amp;lt;ffffffffa0c31d4b&amp;gt;] cl_lock_cancel+0x14b/0x150 [obdclass]
 [&amp;lt;ffffffffa0d0ac3d&amp;gt;] osc_lock_blocking+0x5d/0xf0 [osc]
 [&amp;lt;ffffffffa0d0c019&amp;gt;] osc_dlm_blocking_ast0+0xf9/0x210 [osc]
 [&amp;lt;ffffffffa0d0c17c&amp;gt;] osc_ldlm_blocking_ast+0x4c/0x100 [osc]
 [&amp;lt;ffffffffa0ec7670&amp;gt;] ldlm_handle_bl_callback+0xc0/0x420 [ptlrpc]
 [&amp;lt;ffffffffa0ec7bd1&amp;gt;] ldlm_bl_thread_main+0x201/0x450 [ptlrpc]
 [&amp;lt;ffffffff81083ae6&amp;gt;] kthread+0x96/0xa0
 [&amp;lt;ffffffff8147f164&amp;gt;] kernel_thread_helper+0x4/0x10
     r15 = 0xffff8fc79e187db0      r14 = 0xffff88a092987558 
     r13 = 0xffff8fc79e1875e0      r12 = 0xffffffff8147f31e 
      bp = 0xffff88a092987558       bx = 0x0000000100033e68 
     r11 = 0xffffffffa1060bb0      r10 = 0x00000000000001e1 
      r9 = 0xffff88a736d76200       r8 = 0xffffffffa0d36e40 
      ax = 0xffffffffa0d36e58       cx = 0x0000000000000000 
      dx = 0xffffffffa0d36e58       si = 0x0000000000000282 
      di = 0x0000000000000282  orig_ax = 0xffffffffffffff01 
      ip = 0xffffffff814760e8       cs = 0x0000000000000010 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Number of lock threads&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[mhanafi@endeavour2:~]$ ps -ef |grep ldlm_bl | wc -l
1295
[mhanafi@endeavour2:~]$ ps -ef |grep ldlm_cb | wc -l
438
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="124196" author="jay" created="Fri, 14 Aug 2015 22:29:38 +0000"  >&lt;p&gt;Hi Mahmoud,&lt;/p&gt;

&lt;p&gt;Do you have a rough idea what type of locks (read or write) are being canceled? For read locks, clients have to do more work to check whether the pages being destroyed are covered by another lock.&lt;/p&gt;</comment>
                            <comment id="124354" author="jay" created="Mon, 17 Aug 2015 20:39:22 +0000"  >&lt;p&gt;hmm.. this is write lock.&lt;/p&gt;

&lt;p&gt;What&apos;s the spinlock at  &apos;osc_page_delete+0xe7/0x360&apos;? it looks like this lock is being highly contended.&lt;/p&gt;</comment>
                            <comment id="124636" author="jaylan" created="Wed, 19 Aug 2015 18:57:40 +0000"  >&lt;p&gt;Which commit or tag of 2.5.3 (ie b2_5) would be a good starting point to establish the &apos;git bisect good&apos; for this problem as suggested in &lt;br/&gt;
&lt;a href=&quot;https://jira.hpdd.intel.com/browse/LU-6898?focusedCommentId=123923&amp;amp;page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-123923&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://jira.hpdd.intel.com/browse/LU-6898?focusedCommentId=123923&amp;amp;page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-123923&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="124656" author="jay" created="Wed, 19 Aug 2015 21:16:29 +0000"  >&lt;p&gt;what was the exact version number of 2.4 that was running okay on the node?&lt;/p&gt;</comment>
                            <comment id="124662" author="jaylan" created="Wed, 19 Aug 2015 23:09:58 +0000"  >&lt;p&gt;Our version of 2.4 is based on 2.4.3 with some extra patches. You can see our git repo at&lt;br/&gt;
&lt;a href=&quot;https://github.com/jlan/lustre-nas/tree/nas-2.4.3&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/jlan/lustre-nas/tree/nas-2.4.3&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The system in question is running a build at tag 2.4.3-12nasC, which corresponds to this:&lt;br/&gt;
8d46eeb - &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4604&quot; title=&quot;improve LFSCK async RPCs control&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4604&quot;&gt;&lt;del&gt;LU-4604&lt;/del&gt;&lt;/a&gt; obdclass: handle ldt_device_nr/ldt_linkage properly&lt;br/&gt;
which was committed to nas-2.4.3 on Dec 23, 2014.&lt;/p&gt;</comment>
                            <comment id="124725" author="jay" created="Thu, 20 Aug 2015 18:46:01 +0000"  >&lt;p&gt;I didn&apos;t find that area of code changed much from tag v2_4_3 to v2_5_3(I checked the extra patches applied to your branch as well).&lt;/p&gt;

&lt;p&gt;From the stack trace, it looks like all canceling threads are contended at LRU list lock at osc_lru_del(), which you can verify it by running the reproduction program with &apos;perf top&apos; for example. But the thing is this area of code haven&apos;t changed that much so probably there exists a hidden issue, therefore I would like to hold the effort to create a patch now.&lt;/p&gt;

&lt;p&gt;There are roughly around 800 commits between v2_4_3 and v2_5_3. We can probably identify the problematic patch(es) in less than 10 iterations.&lt;/p&gt;</comment>
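
&lt;p&gt;The iteration estimate follows from bisection halving the range each step (log2(800) is just under 10). A minimal sketch of the procedure, assuming the reproducer is the IOR-cancel test described above:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;git bisect start
git bisect bad v2_5_3      # the problem reproduces here
git bisect good v2_4_3     # known-good baseline
# at each step: build and install the client, run the reproducer, then mark
git bisect good            # or &apos;git bisect bad&apos;, until the culprit commit is printed
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;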
                            <comment id="124738" author="jaylan" created="Thu, 20 Aug 2015 20:07:44 +0000"  >&lt;p&gt;b2_5 branch branched out  at roughly&lt;br/&gt;
&quot;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1617&quot; title=&quot;have git ignore generated files via .gitignore&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1617&quot;&gt;&lt;del&gt;LU-1617&lt;/del&gt;&lt;/a&gt; build: ignore automatically generated files,&quot;&lt;br/&gt;
which is close to v2_4_0 (&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2784&quot; title=&quot;Provide RPMs for automatic installation on Intel(R) Xeon Phi(TM) card&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2784&quot;&gt;&lt;del&gt;LU-2784&lt;/del&gt;&lt;/a&gt;).&lt;/p&gt;

&lt;p&gt;We tested a branch of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2784&quot; title=&quot;Provide RPMs for automatic installation on Intel(R) Xeon Phi(TM) card&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2784&quot;&gt;&lt;del&gt;LU-2784&lt;/del&gt;&lt;/a&gt; from the b2_5 tree and (YEAH!!!) it did not show the problem. The branch from v2_4_52 did show the problem!&lt;/p&gt;

&lt;p&gt;Our dedicated time on our big system ran out. We will continue tomorrow (6pm - 8pm) to narrow it down. If you can spot the culprit by examining the code, that would be even better!&lt;/p&gt;</comment>
                            <comment id="124840" author="jaylan" created="Fri, 21 Aug 2015 23:45:09 +0000"  >&lt;p&gt;We have it! It is commit c8fd9c3 &quot;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2850&quot; title=&quot;Client support for upstream 3.8 kernel&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2850&quot;&gt;&lt;del&gt;LU-2850&lt;/del&gt;&lt;/a&gt; kernel: 3.8 upstream kills daemonize()&quot; that caused the ldlm lock threads taking up 100% cpu time and not going away on large CPU-size systems!&lt;/p&gt;

&lt;p&gt;We tested commit a35113b - &quot;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2901&quot; title=&quot;Duplicate filename on the same ldiskfs directory on MDS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2901&quot;&gt;&lt;del&gt;LU-2901&lt;/del&gt;&lt;/a&gt; mdt: duplicate link names in directory&quot;, the one immediately before c8fd9c3, and it did not show the problem.&lt;/p&gt;

&lt;p&gt;It is a new kernel feature from Linux 3.8 that you brought in. Somewhere your code needs to adapt to this new feature.&lt;/p&gt;</comment>
                            <comment id="124841" author="pjones" created="Fri, 21 Aug 2015 23:49:38 +0000"  >&lt;p&gt;Nice detective work Jay! &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;</comment>
                            <comment id="124852" author="jay" created="Sat, 22 Aug 2015 21:35:35 +0000"  >&lt;p&gt;This is a huge patch - I&apos;m looking at it.&lt;/p&gt;</comment>
                            <comment id="124853" author="jay" created="Sat, 22 Aug 2015 22:06:29 +0000"  >&lt;p&gt;Is that because there are signals pending for ldlm lock canceling threads?&lt;/p&gt;</comment>
                            <comment id="124910" author="gerrit" created="Mon, 24 Aug 2015 17:20:25 +0000"  >&lt;p&gt;Jinshan Xiong (jinshan.xiong@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/16063&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/16063&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6898&quot; title=&quot;ldlm_resource_dump()) Granted locks (in reverse order)&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6898&quot;&gt;&lt;del&gt;LU-6898&lt;/del&gt;&lt;/a&gt; ldlm: block signal for dlm block thread&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_5&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 3e676c6dd7d279fb846a20fd6750c45f14d617d6&lt;/p&gt;</comment>
                            <comment id="124911" author="jay" created="Mon, 24 Aug 2015 17:21:56 +0000"  >&lt;p&gt;patch 16063 is to verify the idea by blocking signals of ldlm blocking threads. If it can fix the problem, I&apos;m going to check updates of commit c8fd9c3 and decide which threads will need signals to be blocked.&lt;/p&gt;</comment>
                            <comment id="125111" author="jaylan" created="Tue, 25 Aug 2015 20:17:38 +0000"  >&lt;p&gt;We do not have the system to test until tomorrow.&lt;/p&gt;

&lt;p&gt;BTW, I looked at the task_struct of one of the ldlm threads and saw its sigpending struct as below:&lt;/p&gt;

&lt;p&gt;  shared_pending = {&lt;br/&gt;
    list = {&lt;br/&gt;
      next = 0xffff8d87bb5ce160,&lt;br/&gt;
      prev = 0xffff8d87bb5ce160&lt;br/&gt;
    },&lt;br/&gt;
    signal = {&lt;br/&gt;
      sig = {18446618212836368752}&lt;br/&gt;
    }&lt;br/&gt;
  },&lt;/p&gt;

&lt;p&gt;The hex value of 18446618212836368752 is 0xffff8d87bb5ce170.&lt;br/&gt;
Does this make sense?&lt;/p&gt;</comment>
                            <comment id="125140" author="jaylan" created="Wed, 26 Aug 2015 00:53:50 +0000"  >&lt;p&gt;Any suggestion of testing if our problem still exhibits with patch 16063 tomorrow?&lt;/p&gt;</comment>
                            <comment id="125150" author="jay" created="Wed, 26 Aug 2015 07:18:38 +0000"  >&lt;p&gt;we should check the sigset task_struct::blocked of the thread in question.&lt;/p&gt;

&lt;p&gt;Most likely this is the problem. I will take further look if patch 16063 is not working.&lt;/p&gt;</comment>
                            <comment id="125251" author="jaylan" created="Wed, 26 Aug 2015 18:29:09 +0000"  >&lt;p&gt;Jishan. Patch 16063 did not help. &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/sad.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;

&lt;p&gt;We have another block of time reserved for Friday 4-6pm. Please advise whether we should keep that reservation. A block of time on Friday afternoon would prevent jobs that need more than 2 days from starting.&lt;/p&gt;</comment>
                            <comment id="125259" author="jay" created="Wed, 26 Aug 2015 18:44:40 +0000"  >&lt;p&gt;Do you still own the test node? It would be helpful to get a coredump and upload to our ftp site so that I can take a further look.&lt;/p&gt;</comment>
                            <comment id="125261" author="jay" created="Wed, 26 Aug 2015 18:48:27 +0000"  >&lt;p&gt;Yes, please keep the reservation.&lt;/p&gt;

&lt;p&gt;I found a problem with the patch. I am updating it now.&lt;/p&gt;</comment>
                            <comment id="125262" author="jaylan" created="Wed, 26 Aug 2015 18:55:11 +0000"  >&lt;p&gt;Taking a vmcore of a system of this size takes times. I am looking for your next patch. If that still fails we then will take a kdump.&lt;/p&gt;

&lt;p&gt;Also, &quot;please keep the reservation&quot; do you meant the block time now of the block time on Friday?&lt;/p&gt;</comment>
                            <comment id="125271" author="jay" created="Wed, 26 Aug 2015 19:34:20 +0000"  >&lt;p&gt;That was an answer to the question &apos;Please advise if we should keep that reservation.&apos;&lt;/p&gt;</comment>
                            <comment id="125289" author="jaylan" created="Wed, 26 Aug 2015 21:34:00 +0000"  >&lt;p&gt;Please take a look at the btall.gz we uploaded on July 27, it was a backtrace of all kernel threads from a machine with 1024 cores and 4TB.&lt;/p&gt;

&lt;p&gt;We still have the vmcore of that incident.&lt;br/&gt;
Checking the blocked signals in the vmcore, I found:&lt;br/&gt;
1) all kernel threads of &quot;ldlm_bl_***&quot; with a stack trace like this do NOT have a blocked sigset:&lt;/p&gt;

&lt;p&gt;#16 [ffff8fa79de1b9f8] osc_page_delete at ffffffffa0d7a807 [osc]&lt;br/&gt;
#17 [ffff8fa79de1ba28] cl_page_delete0 at ffffffffa0ad14d5 [obdclass]&lt;br/&gt;
#18 [ffff8fa79de1ba68] cl_page_delete at ffffffffa0ad192a [obdclass]&lt;br/&gt;
#19 [ffff8fa79de1ba88] ll_invalidatepage at ffffffffa0ee16a6 [lustre]&lt;br/&gt;
#20 [ffff8fa79de1bab8] vvp_page_discard at ffffffffa0ef314d [lustre]&lt;br/&gt;
#21 [ffff8fa79de1bad8] cl_page_invoid at ffffffffa0acda58 [obdclass]&lt;br/&gt;
#22 [ffff8fa79de1bb18] check_and_discard_cb at ffffffffa0ad5dac [obdclass]&lt;br/&gt;
#23 [ffff8fa79de1bb58] cl_page_gang_lookup at ffffffffa0ad2467 [obdclass]&lt;br/&gt;
#24 [ffff8fa79de1bbf8] cl_lock_discard_pages at ffffffffa0ad471a [obdclass]&lt;br/&gt;
#25 [ffff8fa79de1bc38] osc_lock_flush at ffffffffa0d7c0d2 [osc]&lt;br/&gt;
#26 [ffff8fa79de1bc88] osc_lock_cancel at ffffffffa0d7c339 [osc]&lt;br/&gt;
#27 [ffff8fa79de1bcc8] cl_lock_cancel0 at ffffffffa0ad2bd5 [obdclass]&lt;br/&gt;
#28 [ffff8fa79de1bce8] cl_lock_cancel at ffffffffa0ad394b [obdclass]&lt;br/&gt;
#29 [ffff8fa79de1bd08] osc_lock_blocking at ffffffffa0d7cc1d [osc]&lt;br/&gt;
#30 [ffff8fa79de1bd38] osc_dlm_blocking_ast0 at ffffffffa0d7dff9 [osc]&lt;br/&gt;
#31 [ffff8fa79de1bd78] osc_ldlm_blocking_ast at ffffffffa0d7e15c [osc]&lt;br/&gt;
#32 [ffff8fa79de1bdb8] ldlm_cancel_callback at ffffffffa0be4eef [ptlrpc]&lt;br/&gt;
#33 [ffff8fa79de1bdc8] ldlm_cli_cancel_local at ffffffffa0bf380f [ptlrpc]&lt;br/&gt;
#34 [ffff8fa79de1bde8] ldlm_cli_cancel_list_local at ffffffffa0bf6b82 [ptlrpc]&lt;br/&gt;
#35 [ffff8fa79de1be48] ldlm_bl_thread_main at ffffffffa0bfba07 [ptlrpc]&lt;br/&gt;
#36 [ffff8fa79de1bee8] kthread at ffffffff81083ae6&lt;br/&gt;
#37 [ffff8fa79de1bf48] kernel_thread_helper at ffffffff8147f164&lt;/p&gt;


&lt;p&gt;2) all kernel threads of &quot;ldlm_cb*_*&quot; with a stack trace like this:&lt;br/&gt;
PID: 4677   TASK: ffff8834baff84c0  CPU: 18  COMMAND: &quot;ldlm_cb00_008&quot;&lt;br/&gt;
 #0 [ffff88366f143cb0] schedule at ffffffff81473d9b&lt;br/&gt;
 #1 [ffff88366f143df8] ptlrpc_wait_event at ffffffffa0c25445 [ptlrpc]&lt;br/&gt;
 #2 [ffff88366f143e78] ptlrpc_main at ffffffffa0c2e91b [ptlrpc]&lt;br/&gt;
 #3 [ffff88366f143ee8] kthread at ffffffff81083ae6&lt;br/&gt;
 #4 [ffff88366f143f48] kernel_thread_helper at ffffffff8147f164&lt;br/&gt;
DOES have a blocked sigset:&lt;br/&gt;
  blocked = {&lt;br/&gt;
    sig = {18446744073709551615}&lt;br/&gt;
  },&lt;/p&gt;

&lt;p&gt;3) all kernel threads of &quot;ptlrpc_hr*_*&quot; with a stack trace of:&lt;br/&gt;
 #0 [ffff88679ecb9ce0] schedule at ffffffff81473d9b&lt;br/&gt;
 #1 [ffff88679ecb9d88] cfs_cpt_bind at ffffffffa098a0bd [libcfs]&lt;br/&gt;
 #2 [ffff88679ecb9e28] ptlrpc_hr_main at ffffffffa0c28ad7 [ptlrpc]&lt;br/&gt;
 #3 [ffff88679ecb9ee8] kthread at ffffffff81083ae6&lt;br/&gt;
 #4 [ffff88679ecb9f48] kernel_thread_helper at ffffffff8147f164&lt;br/&gt;
DOES have a blocked sigset:&lt;br/&gt;
  blocked = {&lt;br/&gt;
    sig = {18446744073709551615}&lt;br/&gt;
  },&lt;/p&gt;

&lt;p&gt;4) all kernel threads of &quot;kiblnd_sd_*_*&quot; with a stack trace of:&lt;br/&gt;
PID: 25939  TASK: ffff88e768552040  CPU: 24  COMMAND: &quot;kiblnd_sd_00_00&quot;&lt;br/&gt;
 #0 [ffff88e768555cb0] schedule at ffffffff81473d9b&lt;br/&gt;
 #1 [ffff88e768555df8] kiblnd_scheduler at ffffffffa0ba52a0 [ko2iblnd]&lt;br/&gt;
 #2 [ffff88e768555ee8] kthread at ffffffff81083ae6&lt;br/&gt;
 #3 [ffff88e768555f48] kernel_thread_helper at ffffffff8147f164&lt;br/&gt;
DOES have a blocked sigset:&lt;br/&gt;
  blocked = {&lt;br/&gt;
    sig = {18446744073709551615}&lt;br/&gt;
  },&lt;/p&gt;

&lt;p&gt;5) all kernel threads of &quot;ptlrpcd_????&quot; with a stack trace of:&lt;br/&gt;
PID: 27092  TASK: ffff8f079ef16340  CPU: 1019  COMMAND: &quot;ptlrpcd_1023&quot;&lt;br/&gt;
 #0 [ffff8f079ef19c30] schedule at ffffffff81473d9b&lt;br/&gt;
 #1 [ffff8f079ef19d78] schedule_timeout at ffffffff81474550&lt;br/&gt;
 #2 [ffff8f079ef19e08] ptlrpcd at ffffffffa0c3d7c5 [ptlrpc]&lt;br/&gt;
 #3 [ffff8f079ef19ee8] kthread at ffffffff81083ae6&lt;br/&gt;
 #4 [ffff8f079ef19f48] kernel_thread_helper at ffffffff8147f164&lt;br/&gt;
DOES have a blocked sigset:&lt;br/&gt;
  blocked = {&lt;br/&gt;
    sig = {18446744073709551615}&lt;br/&gt;
  },&lt;/p&gt;

&lt;p&gt;It seems that all Lustre kernel threads except category #1 (&quot;ldlm_bl_???&quot;) have a blocked sigset. The content of the sigset is the same in every case.&lt;/p&gt;</comment>
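
&lt;p&gt;For reference, the value above (18446744073709551615 = 0xffffffffffffffff) is a mask with every signal bit set. A minimal way to spot-check the blocked masks on a live node, assuming the thread names match the patterns above (these commands are a sketch, not from the original report):&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;# confirm the decimal value is an all-ones 64-bit mask
printf &apos;%x\n&apos; 18446744073709551615        # prints ffffffffffffffff

# compare the blocked-signal mask of an ldlm_bl thread with a ptlrpcd one
grep SigBlk /proc/$(pgrep -n ldlm_bl)/status
grep SigBlk /proc/$(pgrep -n ptlrpcd)/status
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;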
                            <comment id="125313" author="jay" created="Thu, 27 Aug 2015 00:26:46 +0000"  >&lt;p&gt;Hi Jay,&lt;/p&gt;

&lt;p&gt;I realized that the new patch set of 16063 won&apos;t fix the problem either. I have no idea how commit c8fd9c3 could cause this problem. I will do further investigation and update this ticket if I find something new.&lt;/p&gt;</comment>
                            <comment id="125314" author="jaylan" created="Thu, 27 Aug 2015 00:35:34 +0000"  >&lt;p&gt;Hi Jinshan,&lt;/p&gt;

&lt;p&gt;If you reached that conclusion because of the comment I made a few hours ago, please be advised that the blocked sigset data were from the vmcore dated July 27. I have not had a chance to test your new patch 16063 (patch set #2) yet.&lt;/p&gt;</comment>
                            <comment id="125322" author="jay" created="Thu, 27 Aug 2015 01:40:33 +0000"  >&lt;p&gt;In that case, please try the 2nd patch anyway as I&apos;m investigating the problem.&lt;/p&gt;

&lt;p&gt;Please take a coredump if the problem still exists so that we can do postmortem analysis.&lt;/p&gt;</comment>
                            <comment id="125610" author="jaylan" created="Sat, 29 Aug 2015 01:35:09 +0000"  >&lt;p&gt;We reproduced the problem with  patch at&lt;br/&gt;
&lt;a href=&quot;http://review.whamcloud.com/#/c/16063/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/16063/&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;We took a vmcore. Either Mahmoud or Herbert will&lt;br/&gt;
encrypt it and ftp it to your server. Please note that the vmcore is restricted to US citizens.&lt;/p&gt;

&lt;p&gt;The git repo is at&lt;br/&gt;
&lt;a href=&quot;https://github.com/jlan/lustre-nas/tree/nas-2.5.3-LU6898&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/jlan/lustre-nas/tree/nas-2.5.3-LU6898&lt;/a&gt;&lt;/p&gt;
</comment>
                            <comment id="125627" author="jay" created="Sun, 30 Aug 2015 01:40:07 +0000"  >&lt;p&gt;hmm.. I will take a look at coredump after it&apos;s uploaded.&lt;/p&gt;

&lt;p&gt;Do you have another chance to run a test again to make sure that the problem does not exist in commit a35113b? I can&apos;t figure out why commit c8fd9c3 could cause this problem otherwise I will escalate this ticket.&lt;/p&gt;</comment>
                            <comment id="125928" author="gerrit" created="Tue, 1 Sep 2015 18:41:29 +0000"  >&lt;p&gt;Jinshan Xiong (jinshan.xiong@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/16165&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/16165&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6898&quot; title=&quot;ldlm_resource_dump()) Granted locks (in reverse order)&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6898&quot;&gt;&lt;del&gt;LU-6898&lt;/del&gt;&lt;/a&gt; ldlm: debug patch&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_5&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 7bcd94d94c915f6f3670f6355494daaa66662149&lt;/p&gt;</comment>
                            <comment id="125929" author="jay" created="Tue, 1 Sep 2015 18:43:06 +0000"  >&lt;p&gt;I pushed patch 16165 to revert the changes to ldlm and ptlrpc in c8fd9c3. Please check if you can still see the problem by applying this patch.&lt;/p&gt;</comment>
                            <comment id="125930" author="simmonsja" created="Tue, 1 Sep 2015 19:02:04 +0000"  >&lt;p&gt;Patch 16165 can&apos;t be the final fix since daemonize is actually gone for new kernels like RHEL7 and SLES12. Jay Lan how are your producing this problem. I like to see if I can duplicate it.&lt;/p&gt;</comment>
                            <comment id="125939" author="jay" created="Tue, 1 Sep 2015 19:44:42 +0000"  >&lt;p&gt;Obviously it isn&apos;t - it&apos;s a debug patch to identify the problem as the patch title says.&lt;/p&gt;</comment>
                            <comment id="125941" author="jaylan" created="Tue, 1 Sep 2015 19:55:57 +0000"  >&lt;p&gt;Tommi Tervo also reported the problem at 12/Aug/15 6:21 AM. His system is a 1.5TB,32-cores.&lt;/p&gt;

&lt;p&gt;We cannot reproduce the problem on small systems, but can easily reproduce it on our 2TB, 512-core SGI UV2000 system.&lt;/p&gt;

&lt;p&gt;In Mahmoud&apos;s reproducer, the Lustre fs uses stripe-count=10. His script submits a PBS job that runs 312 copies of IOR via mpiexec.&lt;/p&gt;

&lt;p&gt;Use &apos;top&apos; to monitor the run. When you see IOR take over the &apos;top&apos; page, Ctrl-C to kill the job. Then, if you see all the ldlm_bl_xxx threads showing up in the &apos;top&apos; pages taking ~100% CPU time, the problem is reproduced. Sometimes it takes a couple of runs to reproduce the problem.&lt;/p&gt;</comment>
                            <comment id="125951" author="mhanafi" created="Tue, 1 Sep 2015 21:46:29 +0000"  >&lt;p&gt;The reproducer runs on single stripe file per process IOR. Not stripe count of 10.&lt;/p&gt;
</comment>
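
&lt;p&gt;Putting the two comments above together, a rough sketch of the reproducer; the IOR flags, paths, and timing here are assumptions for illustration, not the exact NAS script:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;lfs setstripe -c 1 /mnt/lustre/iortest    # single-stripe files, per the correction above
mpiexec -np 312 ior -F -o /mnt/lustre/iortest/testfile &amp;amp;
sleep 60; kill -INT $!                    # interrupt the run mid-write, like Ctrl-C
top -b -n 1 | grep ldlm_bl                # look for ldlm_bl_* threads pinned near 100% CPU
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;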
                            <comment id="126104" author="jaylan" created="Wed, 2 Sep 2015 21:27:08 +0000"  >&lt;p&gt;Jinshan,&lt;/p&gt;

&lt;p&gt;Eight files of your patch caused conflicts.&lt;/p&gt;

&lt;p&gt;Could you create a branch at commit c8fd9c3 and create your patch from there? Commit c8fd9c3 is where the problem starts.&lt;/p&gt;</comment>
                            <comment id="126113" author="jay" created="Wed, 2 Sep 2015 22:22:20 +0000"  >&lt;p&gt;the patch is based on b2_5, please apply it to the top of b2_5.&lt;/p&gt;

&lt;p&gt;Is it possible to enable lock stat and collect perf(1) data so that we know if there are contention in the code?&lt;/p&gt;</comment>
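
&lt;p&gt;A minimal sketch of the perf(1) collection suggested above, to be run on the client while the reproducer is active; the 30-second window is an arbitrary choice:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;perf top -g                   # live view: watch for hot spinlock paths
perf record -a -g sleep 30    # sample all CPUs with call graphs for 30s
perf report --sort symbol     # then look for osc_lru_del / spinlock symbols
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;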
                            <comment id="126287" author="jaylan" created="Thu, 3 Sep 2015 20:49:21 +0000"  >&lt;p&gt;Have you found anything from the crash dump, Jinshan and Oleg?&lt;/p&gt;</comment>
                            <comment id="126538" author="green" created="Sun, 6 Sep 2015 16:07:56 +0000"  >&lt;p&gt;We found that the patch from Jinshan managed to set blocking callback just like it was supposed to, so there now should be no differences prior to the patch you mention.&lt;br/&gt;
I also see that all ldlm_bl callbacks are having very uniform backtraces all of them culminating at released spinlock which is kind of strange to be a coincidence considering there are 128 of them, but I have no explanation for it yet.&lt;/p&gt;

&lt;p&gt;We are still thinking of the reasons, but meanwhile Jinshan is very interested in b2_5 + the patch he just reverted to be run on your system to see how it helps.&lt;/p&gt;</comment>
                            <comment id="126708" author="jaylan" created="Tue, 8 Sep 2015 18:41:54 +0000"  >&lt;p&gt;After applying the debug patch to nas-2.5.3, we can not reproduce the problem.&lt;/p&gt;

&lt;p&gt;What feature requires commit c8fd9c3? Do we need this patch on sles11sp3?&lt;/p&gt;</comment>
                            <comment id="126714" author="jay" created="Tue, 8 Sep 2015 19:18:46 +0000"  >&lt;p&gt;As far as I know, there is no new feature that requires commit c8fd9c3. It will just make lustre code more in line with kernel code.&lt;/p&gt;

&lt;p&gt;I will make further investigation on this issue. Please apply the debug patch in your release while we&apos;re investigating this issue.&lt;/p&gt;</comment>
                            <comment id="126719" author="jaylan" created="Tue, 8 Sep 2015 20:08:21 +0000"  >&lt;p&gt;The ldlm_bl_xxx callbacks were the ones stalled the system. They all ran at ~100% cpu, with commit c8fd9c3.&lt;/p&gt;</comment>
                            <comment id="138023" author="wanglu" created="Wed, 6 Jan 2016 01:48:34 +0000"  >&lt;p&gt;Hi Jinshan and Mahamoud,&lt;br/&gt;
Recently our MDS servers are frequently stucked by some Lustre clients. The symptom is same as Mahmoud Hanafi reported in this track and perhaps &lt;a href=&quot;https://jira.hpdd.intel.com/browse/LU-4572&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://jira.hpdd.intel.com/browse/LU-4572&lt;/a&gt;&lt;br/&gt;
It happens in this sequence:&lt;br/&gt;
1. one client reports -107 connection error with a MDS server&lt;br/&gt;
2. it starts to release locks, a lot of ldlm_resource_dump found in syslog&lt;br/&gt;
3. During these period ldlm_bl threads take 100% CPU usage on the client, and mds reports &quot;### lock expired XXX seconds&quot; with this client. &lt;br/&gt;
4. Other clients find the MDS stuck.&lt;br/&gt;
Since we have 7 Lustre instances, sometimes, one client will has 107 error with more than one MDS server, then this client would stuck more than one Lustre instance. &lt;br/&gt;
Our current solution is manually 1) restart the client or 2) remount the MDT device. Do you have any updates or workaround on this problem?&lt;/p&gt;

&lt;p&gt;By the way, our Lustre version is 2.5.3 without any modification/patch. &lt;/p&gt;</comment>
                            <comment id="138050" author="wanglu" created="Wed, 6 Jan 2016 07:49:57 +0000"  >&lt;p&gt;The back trace of ldlm_bl:&lt;br/&gt;
&amp;lt;6&amp;gt;ldlm_bl_19    R  running task        0  8801      2 0x00000000&lt;br/&gt;
&amp;lt;4&amp;gt; ffff8802d24b3b30 ffffffff8129120a 000000000003ffff ffff880201c33a00&lt;br/&gt;
&amp;lt;4&amp;gt; ffff8802d24b3b10 ffffffffa0527a12 ffff8802d24b3b30 ffffffffa0868809&lt;br/&gt;
&amp;lt;4&amp;gt; 000000000000000e 000000000000000e ffff8802d24b3be0 ffffffffa052a687&lt;br/&gt;
&amp;lt;4&amp;gt;Call Trace:&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8129120a&amp;gt;&amp;#93;&lt;/span&gt; ? radix_tree_gang_lookup+0x7a/0xf0&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0527a12&amp;gt;&amp;#93;&lt;/span&gt; ? cl_page_vmpage+0x82/0xb0 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0868809&amp;gt;&amp;#93;&lt;/span&gt; ? weigh_cb+0x19/0x50 &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa052a687&amp;gt;&amp;#93;&lt;/span&gt; cl_page_gang_lookup+0x277/0x3f0 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa08687f0&amp;gt;&amp;#93;&lt;/span&gt; ? weigh_cb+0x0/0x50 &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa08689b7&amp;gt;&amp;#93;&lt;/span&gt; osc_ldlm_weigh_ast+0x177/0x400 &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa084d05d&amp;gt;&amp;#93;&lt;/span&gt; osc_cancel_for_recovery+0x5d/0xa0 &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa064011b&amp;gt;&amp;#93;&lt;/span&gt; ldlm_cancel_lrur_policy+0xeb/0x100 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0643f2b&amp;gt;&amp;#93;&lt;/span&gt; ldlm_prepare_lru_list+0x1cb/0x460 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0640030&amp;gt;&amp;#93;&lt;/span&gt; ? ldlm_cancel_lrur_policy+0x0/0x100 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0644344&amp;gt;&amp;#93;&lt;/span&gt; ldlm_cancel_lru_local+0x24/0x40 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0644d02&amp;gt;&amp;#93;&lt;/span&gt; ldlm_cli_cancel+0x122/0x360 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0869877&amp;gt;&amp;#93;&lt;/span&gt; osc_ldlm_blocking_ast+0x1b7/0x350 &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8152a61d&amp;gt;&amp;#93;&lt;/span&gt; ? wait_for_completion+0x1d/0x20&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa06483c0&amp;gt;&amp;#93;&lt;/span&gt; ldlm_handle_bl_callback+0x130/0x400 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa06488f1&amp;gt;&amp;#93;&lt;/span&gt; ldlm_bl_thread_main+0x261/0x3c0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81064b90&amp;gt;&amp;#93;&lt;/span&gt; ? default_wake_function+0x0/0x20&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0648690&amp;gt;&amp;#93;&lt;/span&gt; ? ldlm_bl_thread_main+0x0/0x3c0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8109e66e&amp;gt;&amp;#93;&lt;/span&gt; kthread+0x9e/0xc0&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c20a&amp;gt;&amp;#93;&lt;/span&gt; child_rip+0xa/0x20&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8109e5d0&amp;gt;&amp;#93;&lt;/span&gt; ? kthread+0x0/0xc0&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c200&amp;gt;&amp;#93;&lt;/span&gt; ? child_rip+0x0/0x20&lt;br/&gt;
&amp;lt;6&amp;gt;ldlm_bl_19    R  running task        0  8802      2 0x00000000&lt;br/&gt;
&amp;lt;4&amp;gt; ffff88027f49bbb0 0000000000000046 000000000003ffff ffff880391a08000&lt;br/&gt;
&amp;lt;4&amp;gt; ffff88027f49bb10 ffffffffa0527a12 ffff88027f49bb30 ffffffffa0868809&lt;br/&gt;
&amp;lt;4&amp;gt; 000000000000000e 000000000000000e ffff88082aa4f098 ffff88027f49bfd8&lt;br/&gt;
&amp;lt;4&amp;gt;Call Trace:&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0527a12&amp;gt;&amp;#93;&lt;/span&gt; ? cl_page_vmpage+0x82/0xb0 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0868809&amp;gt;&amp;#93;&lt;/span&gt; ? weigh_cb+0x19/0x50 &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa08687f0&amp;gt;&amp;#93;&lt;/span&gt; ? weigh_cb+0x0/0x50 &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8106c85a&amp;gt;&amp;#93;&lt;/span&gt; __cond_resched+0x2a/0x40&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8152a3d0&amp;gt;&amp;#93;&lt;/span&gt; _cond_resched+0x30/0x40&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0868b25&amp;gt;&amp;#93;&lt;/span&gt; osc_ldlm_weigh_ast+0x2e5/0x400 &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa084d05d&amp;gt;&amp;#93;&lt;/span&gt; osc_cancel_for_recovery+0x5d/0xa0 &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa064011b&amp;gt;&amp;#93;&lt;/span&gt; ldlm_cancel_lrur_policy+0xeb/0x100 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0643f2b&amp;gt;&amp;#93;&lt;/span&gt; ldlm_prepare_lru_list+0x1cb/0x460 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0640030&amp;gt;&amp;#93;&lt;/span&gt; ? ldlm_cancel_lrur_policy+0x0/0x100 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0644344&amp;gt;&amp;#93;&lt;/span&gt; ldlm_cancel_lru_local+0x24/0x40 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0644d02&amp;gt;&amp;#93;&lt;/span&gt; ldlm_cli_cancel+0x122/0x360 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0869877&amp;gt;&amp;#93;&lt;/span&gt; osc_ldlm_blocking_ast+0x1b7/0x350 &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa06483c0&amp;gt;&amp;#93;&lt;/span&gt; ldlm_handle_bl_callback+0x130/0x400 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa06488f1&amp;gt;&amp;#93;&lt;/span&gt; ldlm_bl_thread_main+0x261/0x3c0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81064b90&amp;gt;&amp;#93;&lt;/span&gt; ? default_wake_function+0x0/0x20&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0648690&amp;gt;&amp;#93;&lt;/span&gt; ? ldlm_bl_thread_main+0x0/0x3c0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8109e66e&amp;gt;&amp;#93;&lt;/span&gt; kthread+0x9e/0xc0&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c20a&amp;gt;&amp;#93;&lt;/span&gt; child_rip+0xa/0x20&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8109e5d0&amp;gt;&amp;#93;&lt;/span&gt; ? kthread+0x0/0xc0&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c200&amp;gt;&amp;#93;&lt;/span&gt; ? child_rip+0x0/0x20&lt;/p&gt;

&lt;p&gt;The client syslog:&lt;br/&gt;
Jan  5 18:12:49 bwsjnws020 kernel: LustreError: 11-0: bes3fs-OST0051-osc-ffff881070edf000: Communicating with 192.168.50.165@tcp, operation obd_ping failed with -107.&lt;br/&gt;
Jan  5 18:12:49 bwsjnws020 kernel: Lustre: pubfs-MDT0000-mdc-ffff88105ab8e400: Connection to pubfs-MDT0000 (at 192.168.50.67@tcp) was lost; in progress operations using this service will wait for recovery to complete&lt;br/&gt;
Jan  5 18:12:49 bwsjnws020 kernel: Lustre: Skipped 3 previous similar messages&lt;br/&gt;
Jan  5 18:12:49 bwsjnws020 kernel: LustreError: Skipped 6 previous similar messages&lt;br/&gt;
Jan  5 18:12:49 bwsjnws020 kernel: LustreError: 167-0: pubfs-MDT0000-mdc-ffff88105ab8e400: This client was evicted by pubfs-MDT0000; in progress operations using this service will fail.&lt;br/&gt;
Jan  5 18:12:49 bwsjnws020 kernel: LustreError: Skipped 3 previous similar messages&lt;br/&gt;
Jan  5 18:12:49 bwsjnws020 kernel: LustreError: 1058:0:(ldlm_resource.c:809:ldlm_resource_complain()) pubfs-MDT0000-mdc-ffff88105ab8e400: namespace resource &amp;#91;0x200007898:0x1:0x0&amp;#93;.0 (ffff8806de1651c0) refcount nonzero (2) after lock cleanup; forcing cleanup.&lt;br/&gt;
Jan  5 18:12:49 bwsjnws020 kernel: LustreError: 1058:0:(ldlm_resource.c:809:ldlm_resource_complain()) Skipped 7 previous similar messages&lt;br/&gt;
Jan  5 18:12:49 bwsjnws020 kernel: LustreError: 1058:0:(ldlm_resource.c:1448:ldlm_resource_dump()) --- Resource: &amp;#91;0x200007898:0x1:0x0&amp;#93;.0 (ffff8806de1651c0) refcount = 3&lt;br/&gt;
Jan  5 18:12:49 bwsjnws020 kernel: LustreError: 1058:0:(ldlm_resource.c:1448:ldlm_resource_dump()) --- Resource: &amp;#91;0x2000074d1:0x94e:0x0&amp;#93;.0 (ffff8801d79be340) refcount = 3&lt;br/&gt;
Jan  5 18:12:49 bwsjnws020 kernel: Lustre: pubfs-MDT0000-mdc-ffff88105ab8e400: Connection restored to pubfs-MDT0000 (at 192.168.50.67@tcp)&lt;br/&gt;
Jan  5 18:12:49 bwsjnws020 kernel: Lustre: Skipped 3 previous similar messages&lt;br/&gt;
Jan  5 18:12:49 bwsjnws020 kernel: LustreError: 1059:0:(ldlm_resource.c:1448:ldlm_resource_dump()) --- Resource: &amp;#91;0x1b995:0x0:0x0&amp;#93;.0 (ffff880721859700) refcount = 2&lt;/p&gt;

&lt;p&gt;The server log:&lt;br/&gt;
Jan  5 18:23:27 smds kernel: Lustre: 10627:0:(service.c:1347:ptlrpc_at_send_early_reply()) @@@ Couldn&apos;t add any time (5/5), not sending early reply&lt;br/&gt;
Jan  5 18:23:27 smds kernel:  req@ffff8812ab48ec00 x1518974973017848/t0(0) o101-&amp;gt;c82eb4b4-c8b9-07c2-e3fc-87539e7b2adf@192.168.54.29@tcp:0/0 lens 576/3384 e 6 to 0 dl 1451989412 ref 2 fl Interpret:/0/0 rc 0/0&lt;br/&gt;
Jan  5 18:23:27 smds kernel: Lustre: 10627:0:(service.c:1347:ptlrpc_at_send_early_reply()) Skipped 12 previous similar messages&lt;br/&gt;
Jan  5 18:23:28 smds kernel: Lustre: 10819:0:(service.c:1347:ptlrpc_at_send_early_reply()) @@@ Couldn&apos;t add any time (5/5), not sending early reply&lt;br/&gt;
Jan  5 18:23:28 smds kernel:  req@ffff880650196400 x1513068049228408/t0(0) o101-&amp;gt;5c631268-d0fc-8c0d-05f8-c3d7a7faae99@192.168.57.199@tcp:0/0 lens 576/3384 e 6 to 0 dl 1451989413 ref 2 fl Interpret:/0/0 rc 0/0&lt;br/&gt;
Jan  5 18:23:28 smds kernel: Lustre: 10819:0:(service.c:1347:ptlrpc_at_send_early_reply()) Skipped 2 previous similar messages&lt;br/&gt;
Jan  5 18:23:34 smds kernel: Lustre: 10534:0:(service.c:1347:ptlrpc_at_send_early_reply()) @@@ Couldn&apos;t add any time (5/5), not sending early reply&lt;br/&gt;
Jan  5 18:23:34 smds kernel:  req@ffff88169949fc00 x1509295010536664/t0(0) o101-&amp;gt;9dda6a2d-4137-995a-cad9-a28186cc8b73@192.168.57.44@tcp:0/0 lens 576/3384 e 6 to 0 dl 1451989419 ref 2 fl Interpret:/0/0 rc 0/0&lt;br/&gt;
Jan  5 18:23:34 smds kernel: Lustre: 10534:0:(service.c:1347:ptlrpc_at_send_early_reply()) Skipped 1 previous similar message&lt;br/&gt;
Jan  5 18:23:47 smds kernel: Lustre: 10819:0:(service.c:1347:ptlrpc_at_send_early_reply()) @@@ Couldn&apos;t add any time (5/5), not sending early reply&lt;br/&gt;
Jan  5 18:23:47 smds kernel:  req@ffff880213478800 x1520526088261116/t0(0) o101-&amp;gt;fd06b39c-b81c-d983-43f8-89c8672bb709@202.122.33.204@tcp:0/0 lens 544/3384 e 6 to 0 dl 1451989432 ref 2 fl Interpret:/0/0 rc 0/0&lt;br/&gt;
Jan  5 18:24:03 smds kernel: Lustre: 10819:0:(service.c:1347:ptlrpc_at_send_early_reply()) @@@ Couldn&apos;t add any time (5/5), not sending early reply&lt;br/&gt;
Jan  5 18:24:03 smds kernel:  req@ffff880a487c0000 x1520538965607805/t0(0) o101-&amp;gt;1a2f04d1-18eb-410e-f207-5222c830edf2@202.122.33.209@tcp:0/0 lens 544/3384 e 4 to 0 dl 1451989448 ref 2 fl Interpret:/0/0 rc 0/0&lt;br/&gt;
Jan  5 18:24:51 smds kernel: Lustre: 10819:0:(service.c:1347:ptlrpc_at_send_early_reply()) @@@ Couldn&apos;t add any time (5/5), not sending early reply&lt;br/&gt;
Jan  5 18:24:51 smds kernel:  req@ffff8803751db800 x1520526088262835/t0(0) o101-&amp;gt;fd06b39c-b81c-d983-43f8-89c8672bb709@202.122.33.204@tcp:0/0 lens 544/3384 e 2 to 0 dl 1451989496 ref 2 fl Interpret:/0/0 rc 0/0&lt;br/&gt;
Jan  5 18:25:03 smds kernel: Lustre: 10554:0:(service.c:2039:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (600:90s); client may timeout.  req@ffff8817e6235000 x1509380311299204/t0(0) o101-&amp;gt;d0626c3e-a7a7-7421-ed73-e28aa9242043@192.168.61.234@tcp:0/0 lens 576/536 e 6 to 0 dl 1451989413 ref 1 fl Complete:/0/0 rc 0/0&lt;br/&gt;
Jan  5 18:25:03 smds kernel: Lustre: 10554:0:(service.c:2039:ptlrpc_server_handle_request()) Skipped 12 previous similar messages&lt;br/&gt;
Jan 05 18:33:01 smds ihep_heartbeat_log: the heplog system is collecting logs. @Jan 05 18:33:01&lt;br/&gt;
Jan  5 18:34:59 smds kernel: Lustre: 10547:0:(service.c:1347:ptlrpc_at_send_early_reply()) @@@ Couldn&apos;t add any time (5/4), not sending early reply&lt;br/&gt;
Jan  5 18:34:59 smds kernel:  req@ffff8815354d3000 x1519342037432032/t0(0) o101-&amp;gt;740c17eb-17dd-83c0-ad32-6236642ecf32@202.122.33.224@tcp:0/0 lens 576/3384 e 0 to 0 dl 1451990104 ref 2 fl Interpret:/0/0 rc 0/0&lt;br/&gt;
Jan  5 18:35:04 smds kernel: LustreError: 0:0:(ldlm_lockd.c:344:waiting_locks_callback()) ### lock callback timer expired after 601s: evicting client at 192.168.85.14@tcp  ns: mdt-scrafs-MDT0000_UUID lock: ffff880b43a1e1c0/0x24184c57e1317020 lrc: 3/0,0 mode: PR/PR res: &amp;#91;0xa000001:0xb432f5:0x0&amp;#93;.0 bits 0x13 rrc: 560 type: IBT flags: 0x60200000000020 nid: 192.168.85.14@tcp remote: 0x42dffa022b197861 expref: 22 pid: 10741 timeout: 15444565589 lvb_type: 0&lt;br/&gt;
Jan  5 18:35:04 smds kernel: LustreError: 0:0:(ldlm_lockd.c:344:waiting_locks_callback()) Skipped 4 previous similar messages&lt;br/&gt;
Jan  5 18:35:04 smds kernel: LustreError: 10673:0:(ldlm_lockd.c:1335:ldlm_handle_enqueue0()) ### lock on destroyed export ffff880bbf050800 ns: mdt-scrafs-MDT0000_UUID lock: ffff8807bae9db80/0x24184c57e1317090 lrc: 3/0,0 mode: PR/PR res: &amp;#91;0xa000001:0xb432f5:0x0&amp;#93;.0 bits 0x13 rrc: 628 type: IBT flags: 0x50200400000020 nid: 192.168.85.14@tcp remote: 0x42dffa022b197868 expref: 7 pid: 10673 timeout: 0 lvb_type: 0&lt;/p&gt;
</comment>
                            <comment id="138117" author="jay" created="Wed, 6 Jan 2016 19:32:09 +0000"  >&lt;p&gt;Hi Lu,&lt;/p&gt;

&lt;p&gt;Have you noticed whether there was heavy contention on some locks when this issue occurred? It looks like the OSC in question was in a recovery state, and it therefore picked some read locks to cancel before recovery.&lt;/p&gt;</comment>
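                            <!--
                            The stack trace above runs exactly this path: ldlm_bl_thread_main picks up the
                            blocking callback, ldlm_cli_cancel walks the lock LRU via ldlm_prepare_lru_list
                            and ldlm_cancel_lrur_policy, and osc_cancel_for_recovery together with
                            osc_ldlm_weigh_ast decides whether each granted lock is cheap enough to drop
                            before recovery. Below is a minimal C sketch of that selection step; the
                            function name with the _sketch suffix is hypothetical, and the body is modelled
                            on the upstream ldlm structures rather than being the verbatim Lustre source.

                            static int osc_cancel_for_recovery_sketch(struct ldlm_lock *lock)
                            {
                                    /* The OSC only volunteers extent (file data) locks for early cancel. */
                                    if (lock->l_resource->lr_type != LDLM_EXTENT)
                                            return 0;

                                    /* Skip locks still being enqueued; only fully granted locks qualify. */
                                    if (lock->l_granted_mode != lock->l_req_mode)
                                            return 0;

                                    /*
                                     * The weigh callback counts pages still cached under the lock.
                                     * Zero weight means the lock protects no cached data, so it can be
                                     * cancelled before replay at no cost; unused read (PR) locks are the
                                     * usual candidates, matching the observation in the comment above.
                                     */
                                    if (osc_ldlm_weigh_ast(lock) == 0)
                                            return 1;   /* safe to cancel before recovery */

                                    return 0;
                            }

                            Cancelling such zero-weight locks shrinks the set of locks the client must
                            replay, which is why the LRU walk in the trace runs while the import is
                            recovering.
                            -->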
                            <comment id="141761" author="jfilizetti" created="Wed, 10 Feb 2016 15:09:01 +0000"  >&lt;p&gt;Has there been any progress on fixing this issue?&lt;/p&gt;</comment>
                            <comment id="145180" author="mhanafi" created="Thu, 10 Mar 2016 20:12:14 +0000"  >&lt;p&gt;We haven&apos;t seen this issue. Close will reopen if needed.&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="18493" name="btall.gz" size="361229" author="mhanafi" created="Mon, 27 Jul 2015 22:28:15 +0000"/>
                            <attachment id="18554" name="debug.out.mofed.withpatch.1438631502.bz2" size="275" author="mhanafi" created="Mon, 3 Aug 2015 20:40:01 +0000"/>
                            <attachment id="18553" name="debug.out.withpatch.mofed.secondrun.1438631826.bz2" size="296" author="mhanafi" created="Mon, 3 Aug 2015 20:40:00 +0000"/>
                            <attachment id="18552" name="debug.out.withpatch.ofed3.5.2.1438633632.bz2" size="284" author="mhanafi" created="Mon, 3 Aug 2015 20:40:00 +0000"/>
                            <attachment id="18494" name="dmesg.out.gz" size="426737" author="mhanafi" created="Mon, 27 Jul 2015 22:28:15 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzxivz:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10021"><![CDATA[2]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>