<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:03:24 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-13692] MDS slow/hung threads at mdt_object_local_lock</title>
                <link>https://jira.whamcloud.com/browse/LU-13692</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt; MDS threads hang/slow getting stack trace dumps&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
Jun 16 14:20:20 nbp8-mds1 kernel: [1660354.377038] Lustre: MGS: Connection restored to 09397a7f-3fe2-2dc8-d25a-74d157cb2008 (at 10.151.50.72@o2ib)
Jun 16 14:20:20 nbp8-mds1 kernel: [1660354.377044] Lustre: Skipped 277 previous similar messages
Jun 16 14:22:20 nbp8-mds1 kernel: [1660474.852981] Lustre: 8579:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn&apos;t add any time (30/30), not sending early reply
Jun 16 14:22:20 nbp8-mds1 kernel: [1660474.852981]   req@ffff8979f9e16300 x1668974534442592/t1531807577764(0) o36-&amp;gt;67852282-a3ed-5acb-a9e2-3cae43fe0406@10.151.0.201@o2ib:5/0 lens 488/3152 e 0 to 0 dl 1592342570 ref 2 fl Interpret:/0/0 rc 0/0
Jun 16 14:22:20 nbp8-mds1 kernel: [1660474.952277] Lustre: 8579:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 134 previous similar messages
Jun 16 14:26:14 nbp8-mds1 kernel: [1660708.333617] LNet: Service thread pid 14118 was inactive &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 551.86s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; debugging purposes:
Jun 16 14:26:14 nbp8-mds1 kernel: [1660708.389992] LNet: Skipped 3 previous similar messages
Jun 16 14:26:14 nbp8-mds1 kernel: [1660708.407185] Pid: 14118, comm: mdt00_115 3.10.0-1062.12.1.el7_lustre2124.x86_64 #1 SMP Tue Mar 17 13:32:19 PDT 2020
Jun 16 14:26:14 nbp8-mds1 kernel: [1660708.407190] Call Trace:
Jun 16 14:26:14 nbp8-mds1 kernel: [1660708.407202]  [&amp;lt;ffffffffc0f17c90&amp;gt;] ldlm_completion_ast+0x430/0x860 [ptlrpc]
Jun 16 14:26:14 nbp8-mds1 kernel: [1660708.412473]  [&amp;lt;ffffffffc0f187b1&amp;gt;] ldlm_cli_enqueue_local+0x231/0x830 [ptlrpc]
Jun 16 14:26:14 nbp8-mds1 kernel: [1660708.412496]  [&amp;lt;ffffffffc16205cb&amp;gt;] mdt_object_local_lock+0x50b/0xb20 [mdt]
Jun 16 14:26:14 nbp8-mds1 kernel: [1660708.412506]  [&amp;lt;ffffffffc1620c50&amp;gt;] mdt_object_lock_internal+0x70/0x360 [mdt]
Jun 16 14:26:14 nbp8-mds1 kernel: [1660708.412516]  [&amp;lt;ffffffffc1620f9c&amp;gt;] mdt_reint_object_lock+0x2c/0x60 [mdt]
Jun 16 14:26:14 nbp8-mds1 kernel: [1660708.412530]  [&amp;lt;ffffffffc1639efc&amp;gt;] mdt_reint_striped_lock+0x8c/0x510 [mdt]
Jun 16 14:26:14 nbp8-mds1 kernel: [1660708.412541]  [&amp;lt;ffffffffc163d866&amp;gt;] mdt_reint_setattr+0x676/0x1290 [mdt]
Jun 16 14:26:14 nbp8-mds1 kernel: [1660708.412552]  [&amp;lt;ffffffffc163f963&amp;gt;] mdt_reint_rec+0x83/0x210 [mdt]
Jun 16 14:26:14 nbp8-mds1 kernel: [1660708.412562]  [&amp;lt;ffffffffc161c273&amp;gt;] mdt_reint_internal+0x6e3/0xaf0 [mdt]
Jun 16 14:26:14 nbp8-mds1 kernel: [1660708.412571]  [&amp;lt;ffffffffc16276e7&amp;gt;] mdt_reint+0x67/0x140 [mdt]
Jun 16 14:26:14 nbp8-mds1 kernel: [1660708.412623]  [&amp;lt;ffffffffc0fb73ca&amp;gt;] tgt_request_handle+0xada/0x1570 [ptlrpc]
Jun 16 14:26:14 nbp8-mds1 kernel: [1660708.412659]  [&amp;lt;ffffffffc0f5b47b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
Jun 16 14:26:14 nbp8-mds1 kernel: [1660708.412691]  [&amp;lt;ffffffffc0f5ede4&amp;gt;] ptlrpc_main+0xb34/0x1470 [ptlrpc]
Jun 16 14:26:14 nbp8-mds1 kernel: [1660708.412696]  [&amp;lt;ffffffff858c61f1&amp;gt;] kthread+0xd1/0xe0
Jun 16 14:26:14 nbp8-mds1 kernel: [1660708.412699]  [&amp;lt;ffffffff85f8dd37&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
Jun 16 14:26:14 nbp8-mds1 kernel: [1660708.412722]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
Jun 16 14:26:14 nbp8-mds1 kernel: [1660708.412725] LustreError: dumping log to /tmp/lustre-log.1592342774.14118
Jun 16 14:26:14 nbp8-mds1 sec[2849]: SEC_EVENT |msg lustre hung thread
Jun 16 14:26:15 nbp8-mds1 kernel: [1660709.178343] LNet: Service thread pid 8575 was inactive &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 552.75s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; debugging purposes:
Jun 16 14:26:15 nbp8-mds1 kernel: [1660709.234458] Pid: 8575, comm: mdt00_043 3.10.0-1062.12.1.el7_lustre2124.x86_64 #1 SMP Tue Mar 17 13:32:19 PDT 2020
Jun 16 14:26:15 nbp8-mds1 kernel: [1660709.234459] Call Trace:
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.234472]  [&amp;lt;ffffffffc0f17c90&amp;gt;] ldlm_completion_ast+0x430/0x860 [ptlrpc]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.239700]  [&amp;lt;ffffffffc0f187b1&amp;gt;] ldlm_cli_enqueue_local+0x231/0x830 [ptlrpc]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.239749]  [&amp;lt;ffffffffc16205cb&amp;gt;] mdt_object_local_lock+0x50b/0xb20 [mdt]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.239763]  [&amp;lt;ffffffffc1620c50&amp;gt;] mdt_object_lock_internal+0x70/0x360 [mdt]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.239773]  [&amp;lt;ffffffffc1620f9c&amp;gt;] mdt_reint_object_lock+0x2c/0x60 [mdt]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.239785]  [&amp;lt;ffffffffc1639efc&amp;gt;] mdt_reint_striped_lock+0x8c/0x510 [mdt]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.239797]  [&amp;lt;ffffffffc163d866&amp;gt;] mdt_reint_setattr+0x676/0x1290 [mdt]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.239809]  [&amp;lt;ffffffffc163f963&amp;gt;] mdt_reint_rec+0x83/0x210 [mdt]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.239819]  [&amp;lt;ffffffffc161c273&amp;gt;] mdt_reint_internal+0x6e3/0xaf0 [mdt]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.239829]  [&amp;lt;ffffffffc16276e7&amp;gt;] mdt_reint+0x67/0x140 [mdt]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.239874]  [&amp;lt;ffffffffc0fb73ca&amp;gt;] tgt_request_handle+0xada/0x1570 [ptlrpc]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.239909]  [&amp;lt;ffffffffc0f5b47b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.239945]  [&amp;lt;ffffffffc0f5ede4&amp;gt;] ptlrpc_main+0xb34/0x1470 [ptlrpc]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.239952]  [&amp;lt;ffffffff858c61f1&amp;gt;] kthread+0xd1/0xe0
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.239958]  [&amp;lt;ffffffff85f8dd37&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.239982]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.239988] Pid: 14083, comm: mdt00_088 3.10.0-1062.12.1.el7_lustre2124.x86_64 #1 SMP Tue Mar 17 13:32:19 PDT 2020
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.239988] Call Trace:
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240020]  [&amp;lt;ffffffffc0f17c90&amp;gt;] ldlm_completion_ast+0x430/0x860 [ptlrpc]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240050]  [&amp;lt;ffffffffc0f187b1&amp;gt;] ldlm_cli_enqueue_local+0x231/0x830 [ptlrpc]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240060]  [&amp;lt;ffffffffc16205cb&amp;gt;] mdt_object_local_lock+0x50b/0xb20 [mdt]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240071]  [&amp;lt;ffffffffc1620c50&amp;gt;] mdt_object_lock_internal+0x70/0x360 [mdt]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240082]  [&amp;lt;ffffffffc1620f9c&amp;gt;] mdt_reint_object_lock+0x2c/0x60 [mdt]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240093]  [&amp;lt;ffffffffc1639efc&amp;gt;] mdt_reint_striped_lock+0x8c/0x510 [mdt]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240104]  [&amp;lt;ffffffffc163d866&amp;gt;] mdt_reint_setattr+0x676/0x1290 [mdt]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240116]  [&amp;lt;ffffffffc163f963&amp;gt;] mdt_reint_rec+0x83/0x210 [mdt]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240127]  [&amp;lt;ffffffffc161c273&amp;gt;] mdt_reint_internal+0x6e3/0xaf0 [mdt]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240137]  [&amp;lt;ffffffffc16276e7&amp;gt;] mdt_reint+0x67/0x140 [mdt]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240174]  [&amp;lt;ffffffffc0fb73ca&amp;gt;] tgt_request_handle+0xada/0x1570 [ptlrpc]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240207]  [&amp;lt;ffffffffc0f5b47b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240240]  [&amp;lt;ffffffffc0f5ede4&amp;gt;] ptlrpc_main+0xb34/0x1470 [ptlrpc]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240244]  [&amp;lt;ffffffff858c61f1&amp;gt;] kthread+0xd1/0xe0
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240247]  [&amp;lt;ffffffff85f8dd37&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240251]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240254] Pid: 10527, comm: mdt00_069 3.10.0-1062.12.1.el7_lustre2124.x86_64 #1 SMP Tue Mar 17 13:32:19 PDT 2020
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240254] Call Trace:
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240285]  [&amp;lt;ffffffffc0f17c90&amp;gt;] ldlm_completion_ast+0x430/0x860 [ptlrpc]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240313]  [&amp;lt;ffffffffc0f187b1&amp;gt;] ldlm_cli_enqueue_local+0x231/0x830 [ptlrpc]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240325]  [&amp;lt;ffffffffc16205cb&amp;gt;] mdt_object_local_lock+0x50b/0xb20 [mdt]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240335]  [&amp;lt;ffffffffc1620c50&amp;gt;] mdt_object_lock_internal+0x70/0x360 [mdt]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240344]  [&amp;lt;ffffffffc1620f9c&amp;gt;] mdt_reint_object_lock+0x2c/0x60 [mdt]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240357]  [&amp;lt;ffffffffc1639efc&amp;gt;] mdt_reint_striped_lock+0x8c/0x510 [mdt]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240369]  [&amp;lt;ffffffffc163d866&amp;gt;] mdt_reint_setattr+0x676/0x1290 [mdt]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240380]  [&amp;lt;ffffffffc163f963&amp;gt;] mdt_reint_rec+0x83/0x210 [mdt]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240389]  [&amp;lt;ffffffffc161c273&amp;gt;] mdt_reint_internal+0x6e3/0xaf0 [mdt]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240400]  [&amp;lt;ffffffffc16276e7&amp;gt;] mdt_reint+0x67/0x140 [mdt]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240439]  [&amp;lt;ffffffffc0fb73ca&amp;gt;] tgt_request_handle+0xada/0x1570 [ptlrpc]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240478]  [&amp;lt;ffffffffc0f5b47b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240510]  [&amp;lt;ffffffffc0f5ede4&amp;gt;] ptlrpc_main+0xb34/0x1470 [ptlrpc]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240514]  [&amp;lt;ffffffff858c61f1&amp;gt;] kthread+0xd1/0xe0
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240516]  [&amp;lt;ffffffff85f8dd37&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240520]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240523] Pid: 14084, comm: mdt00_089 3.10.0-1062.12.1.el7_lustre2124.x86_64 #1 SMP Tue Mar 17 13:32:19 PDT 2020
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240523] Call Trace:
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240554]  [&amp;lt;ffffffffc0f17c90&amp;gt;] ldlm_completion_ast+0x430/0x860 [ptlrpc]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240585]  [&amp;lt;ffffffffc0f187b1&amp;gt;] ldlm_cli_enqueue_local+0x231/0x830 [ptlrpc]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240598]  [&amp;lt;ffffffffc16205cb&amp;gt;] mdt_object_local_lock+0x50b/0xb20 [mdt]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240610]  [&amp;lt;ffffffffc1620c50&amp;gt;] mdt_object_lock_internal+0x70/0x360 [mdt]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240620]  [&amp;lt;ffffffffc1620f9c&amp;gt;] mdt_reint_object_lock+0x2c/0x60 [mdt]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240631]  [&amp;lt;ffffffffc1639efc&amp;gt;] mdt_reint_striped_lock+0x8c/0x510 [mdt]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240647]  [&amp;lt;ffffffffc163d866&amp;gt;] mdt_reint_setattr+0x676/0x1290 [mdt]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240657]  [&amp;lt;ffffffffc163f963&amp;gt;] mdt_reint_rec+0x83/0x210 [mdt]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240666]  [&amp;lt;ffffffffc161c273&amp;gt;] mdt_reint_internal+0x6e3/0xaf0 [mdt]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240677]  [&amp;lt;ffffffffc16276e7&amp;gt;] mdt_reint+0x67/0x140 [mdt]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240716]  [&amp;lt;ffffffffc0fb73ca&amp;gt;] tgt_request_handle+0xada/0x1570 [ptlrpc]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240749]  [&amp;lt;ffffffffc0f5b47b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240781]  [&amp;lt;ffffffffc0f5ede4&amp;gt;] ptlrpc_main+0xb34/0x1470 [ptlrpc]
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240784]  [&amp;lt;ffffffff858c61f1&amp;gt;] kthread+0xd1/0xe0
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240788]  [&amp;lt;ffffffff85f8dd37&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240792]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
Jun 16 14:26:16 nbp8-mds1 kernel: [1660709.240796] LNet: Service thread pid 8568 was inactive &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 552.81s. Watchdog stack traces are limited to 3 per 300 seconds, skipping &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; one.
 &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment>clients running l2.12.3</environment>
        <key id="59623">LU-13692</key>
            <summary>MDS slow/hung threads at mdt_object_local_lock</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="green">Oleg Drokin</assignee>
                                    <reporter username="mhanafi">Mahmoud Hanafi</reporter>
                        <labels>
                    </labels>
                <created>Wed, 17 Jun 2020 20:56:15 +0000</created>
                <updated>Tue, 18 May 2021 18:50:24 +0000</updated>
                            <resolved>Thu, 29 Oct 2020 12:06:18 +0000</resolved>
                                    <version>Lustre 2.12.4</version>
                                    <fixVersion>Lustre 2.14.0</fixVersion>
                    <fixVersion>Lustre 2.12.6</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>15</watches>
                                                                            <comments>
                            <comment id="273151" author="mhanafi" created="Wed, 17 Jun 2020 21:17:54 +0000"  >&lt;p&gt;This filesystem is not using DOM.&lt;/p&gt;</comment>
                            <comment id="273197" author="adilger" created="Thu, 18 Jun 2020 14:52:12 +0000"  >&lt;p&gt;Mahmoud, could you please attach a larger chunk of the &lt;tt&gt;/var/log/message&lt;/tt&gt; file before these stacks are dumped. It looks like there were client connection issues before these stacks were dumped?  &lt;/p&gt;

&lt;p&gt;Do you still have the &lt;tt&gt;/tmp/lustre-log.*&lt;/tt&gt; files that were saved on the MDS from the first stack dumps? It would be useful to attach at least the first ones (lower time stamp).&lt;/p&gt;

&lt;p&gt;Did this issue resolve over time, or is it still ongoing?&lt;/p&gt;</comment>
                            <comment id="273214" author="mhanafi" created="Thu, 18 Jun 2020 17:12:52 +0000"  >&lt;p&gt;Attaching logs files.&lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/35214/35214_vmcore-dmesg.txt&quot; title=&quot;vmcore-dmesg.txt attached to LU-13692&quot;&gt;vmcore-dmesg.txt&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;&lt;/p&gt;</comment>
                            <comment id="273335" author="pjones" created="Fri, 19 Jun 2020 17:34:25 +0000"  >&lt;p&gt;Amir&lt;/p&gt;

&lt;p&gt;What do you recommend here?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="273674" author="mhanafi" created="Wed, 24 Jun 2020 19:18:37 +0000"  >&lt;p&gt;Any updates?&lt;/p&gt;</comment>
                            <comment id="273786" author="ashehata" created="Thu, 25 Jun 2020 22:25:00 +0000"  >&lt;p&gt;The messages which could be relevant to the slow down are:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 (o2iblnd_cb.c:3351:kiblnd_check_txs_locked()) Timed out tx: active_txs, 1 seconds&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;This means that transmits have been posted but haven&apos;t completed yet. This indicates that the OFED on the passive side is unable to process these transmits in a timely manner.&lt;/p&gt;

&lt;p&gt;What do you have the lnet_transaction_timeout set to?&lt;/p&gt;

&lt;p&gt;How many CPTs are configured? Looking at top, do you see only a subset of the ko2iblnd scheduler threads working? Or do you see the work spread across all the threads across all the CPTs?&lt;/p&gt;

&lt;p&gt;I&apos;m just trying to identify if there is a bottleneck some where.&lt;/p&gt;</comment>
                            <comment id="274594" author="mhanafi" created="Tue, 7 Jul 2020 08:41:26 +0000"  >&lt;p&gt;Past 2 days we been hitting this issue several times. We have identified a user&apos;s code that is causing this issue. I don&apos;t see any network errors or timeouts. This may not be a networking issue. More like a ldlm deadlock...&lt;/p&gt;

&lt;p&gt;The issue start like this...&lt;/p&gt;

&lt;p&gt;&lt;b&gt;Client Logs&lt;/b&gt;&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 Mon Jul  6 16:38:30 2020 C r135i2n18 [1594078710.191034] Lustre: 43689:0:(client.c:2133:ptlrpc_expire_one_request()) @@@ Request sent has timed out &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; slow reply: [sent 1594078087/real 1594078087]  req@ffff8c2eda903240 x1669114153850208/t0(0) o101-&amp;gt;nbp13-MDT0000-mdc-ffff8c16da19e000@10.151.26.183@o2ib:12/10 lens 376/1392 e 0 to 1 dl 1594078710 ref 2 fl Rpc:X/0/ffffffff rc 0/-1
Mon Jul  6 16:38:30 2020 C r135i2n18 [1594078710.219034] Lustre: nbp13-MDT0000-mdc-ffff8c16da19e000: Connection to nbp13-MDT0000 (at 10.151.26.183@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Mon Jul  6 16:38:30 2020 C r135i4n11 [1594078710.803698] Lustre: 44292:0:(client.c:2133:ptlrpc_expire_one_request()) @@@ Request sent has timed out &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; slow reply: [sent 1594078087/real 1594078087]  req@ffff8aa99b843680 x1669114204037872/t0(0) o101-&amp;gt;nbp13-MDT0000-mdc-ffff8a92f11a6800@10.151.26.183@o2ib:12/10 lens 376/1392 e 0 to 1 dl 1594078710 ref 2 fl Rpc:X/0/ffffffff rc 0/-1
Mon Jul  6 16:38:30 2020 C r135i4n11 [1594078710.831698] Lustre: nbp13-MDT0000-mdc-ffff8a92f11a6800: Connection to nbp13-MDT0000 (at 10.151.26.183@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Mon Jul  6 16:38:30 2020 C r147i0n28 [1594078710.161451] Lustre: 55178:0:(client.c:2133:ptlrpc_expire_one_request()) @@@ Request sent has timed out &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; slow reply: [sent 1594078087/real 1594078087]  req@ffff9ecf51918000 x1668975755824896/t0(0) o101-&amp;gt;nbp13-MDT0000-mdc-ffff9ed0d75e9800@10.151.26.183@o2ib:12/10 lens 376/1392 e 0 to 1 dl 1594078710 ref 2 fl Rpc:X/0/ffffffff rc 0/-1
Mon Jul  6 16:38:30 2020 C r147i0n28 [1594078710.189451] Lustre: nbp13-MDT0000-mdc-ffff9ed0d75e9800: Connection to nbp13-MDT0000 (at 10.151.26.183@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Mon Jul  6 16:38:30 2020 C r147i0n28 [1594078710.205451] Lustre: 55178:0:(client.c:2133:ptlrpc_expire_one_request()) Skipped 15 previous similar messages
Mon Jul  6 16:38:30 2020 C r147i0n30 [1594078710.924656] Lustre: 1843:0:(client.c:2133:ptlrpc_expire_one_request()) @@@ Request sent has timed out &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; slow reply: [sent 1594078087/real 1594078087]  req@ffff92d658d45b00 x1668975460885376/t0(0) o101-&amp;gt;nbp13-MDT0000-mdc-ffff92c2161d7800@10.151.26.183@o2ib:12/10 lens 376/1392 e 0 to 1 dl 1594078710 ref 2 fl Rpc:X/0/ffffffff rc 0/-1
Mon Jul  6 16:38:30 2020 C r147i0n30 [1594078710.952656] Lustre: nbp13-MDT0000-mdc-ffff92c2161d7800: Connection to nbp13-MDT0000 (at 10.151.26.183@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Mon Jul  6 16:38:30 2020 C r147i0n30 [1594078710.968656] Lustre: 1843:0:(client.c:2133:ptlrpc_expire_one_request()) Skipped 15 previous similar messages
Mon Jul  6 16:38:30 2020 C r147i3n11 [1594078710.345202] Lustre: 99945:0:(client.c:2133:ptlrpc_expire_one_request()) @@@ Request sent has timed out &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; slow reply: [sent 1594078087/real 1594078087]  req@ffff8cea213b2480 x1668975668763616/t0(0) o101-&amp;gt;nbp13-MDT0000-mdc-ffff8cfdc2e07800@10.151.26.183@o2ib:12/10 lens 376/1392 e 0 to 1 dl 1594078710 ref 2 fl Rpc:X/0/ffffffff rc 0/-1
Mon Jul  6 16:38:30 2020 C r147i3n11 [1594078710.373202] Lustre: nbp13-MDT0000-mdc-ffff8cfdc2e07800: Connection to nbp13-MDT0000 (at 10.151.26.183@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Mon Jul  6 16:38:30 2020 C r147i3n11 [1594078710.389202] Lustre: 99945:0:(client.c:2133:ptlrpc_expire_one_request()) Skipped 3 previous similar messages
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&lt;b&gt;Server Logs&lt;/b&gt;&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 Jul 6 15:39:18 nbp13-srv1 kernel: [14768.604982] Lustre: 34157:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn&apos;t add any time (30/30), not sending early reply
 Jul 6 15:39:18 nbp13-srv1 kernel: [14768.604982] req@ffff9f6a31f58900 x1668977731129264/t0(0) o101-&amp;gt;10e00de0-1f20-c94c-9a9b-c9d0c3e9a774@10.149.10.9@o2ib313:653/0 lens 376/1392 e 0 to 0 dl 1594075188 ref 2 fl Interpret:/0/0 rc 0/0
 Jul 6 15:39:18 nbp13-srv1 kernel: [14768.693110] Lustre: 34157:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 193 previous similar messages
 Jul 6 15:39:52 nbp13-srv1 kernel: [14802.511304] Lustre: 32519:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (348:4s); client may timeout. req@ffff9f7c4bd93f00 x1668975755407840/t0(0) o101-&amp;gt;8dc79bfc-5f46-cac5-08bb-3869eb835eca@10.149.9.136@o2ib313:653/0 lens 376/832 e 0 to 0 dl 1594075188 ref 1 fl Complete:/0/0 rc 0/0
 Jul 6 15:39:52 nbp13-srv1 kernel: [14802.597869] Lustre: 32519:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 91 previous similar messages
 Jul 6 15:42:05 nbp13-srv1 kernel: [14935.249217] LustreError: 137-5: nbp13-OST000c_UUID: not available &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; connect from 10.149.3.193@o2ib313 (no target). If you are running an HA pair check that the target is mounted on the other server.
 Jul 6 15:44:07 nbp13-srv1 kernel: [15057.419615] LustreError: 14961:0:(tgt_grant.c:758:tgt_grant_check()) nbp13-OST0007: cli 1dd3e76b-132a-6719-83dc-8060654c4e5c claims 40960 GRANT, real grant 0
 Jul 6 15:44:07 nbp13-srv1 kernel: [15057.461996] LustreError: 14961:0:(tgt_grant.c:758:tgt_grant_check()) Skipped 110 previous similar messages
 Jul 6 15:45:00 nbp13-srv1 kernel: [15109.616188] Lustre: nbp13-OST000a: Connection restored to 13dd3081-fbf9-b7c0-0398-194130227ad4 (at 10.151.56.237@o2ib)
 Jul 6 15:45:00 nbp13-srv1 kernel: [15109.616191] Lustre: Skipped 2324 previous similar messages
 Jul 6 15:54:09 nbp13-srv1 kernel: [15657.879212] LustreError: 15055:0:(tgt_grant.c:758:tgt_grant_check()) nbp13-OST0005: cli 60265261-ea48-de16-a49c-beb94df9c24a claims 28672 GRANT, real grant 0
 Jul 6 15:54:09 nbp13-srv1 kernel: [15657.921581] LustreError: 15055:0:(tgt_grant.c:758:tgt_grant_check()) Skipped 92 previous similar messages
 Jul 6 15:55:00 nbp13-srv1 kernel: [15708.471963] Lustre: nbp13-OST000a: Connection restored to d6946db5-2ac8-b89e-14d0-4b161627dac1 (at 10.151.56.222@o2ib)
 Jul 6 15:55:00 nbp13-srv1 kernel: [15708.471966] Lustre: Skipped 2289 previous similar messages
 Jul 6 15:59:24 nbp13-srv1 kernel: [15972.045956] Lustre: 16461:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn&apos;t add any time (30/30), not sending early reply
 Jul 6 15:59:24 nbp13-srv1 kernel: [15972.045956] req@ffff9f80ae28da00 x1668975743536160/t0(0) o101-&amp;gt;3f04c178-1d91-8836-3e9e-0c4282050daf@10.149.9.137@o2ib313:349/0 lens 376/1392 e 0 to 0 dl 1594076394 ref 2 fl Interpret:/0/0 rc 0/0
 Jul 6 15:59:24 nbp13-srv1 kernel: [15972.134340] Lustre: 16461:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 234 previous similar messages
 Jul 6 16:03:18 nbp13-srv1 kernel: [16204.969982] LNet: Service thread pid 34212 was inactive &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 551.64s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; debugging purposes:
 Jul 6 16:03:18 nbp13-srv1 kernel: [16205.020985] Pid: 34212, comm: mdt02_064 3.10.0-1127.10.1.el7_lustre2125.x86_64 #1 SMP Wed Jun 17 08:01:50 PDT 2020
 Jul 6 16:03:18 nbp13-srv1 kernel: [16205.020986] Call Trace:
 Jul 6 16:03:18 nbp13-srv1 kernel: [16205.020997] [&amp;lt;ffffffffc1341c90&amp;gt;] ldlm_completion_ast+0x430/0x860 [ptlrpc]
 Jul 6 16:03:18 nbp13-srv1 kernel: [16205.041743] [&amp;lt;ffffffffc13427b1&amp;gt;] ldlm_cli_enqueue_local+0x231/0x830 [ptlrpc]
 Jul 6 16:03:18 nbp13-srv1 kernel: [16205.041759] [&amp;lt;ffffffffc18715cb&amp;gt;] mdt_object_local_lock+0x50b/0xb20 [mdt]
 Jul 6 16:03:18 nbp13-srv1 kernel: [16205.041765] [&amp;lt;ffffffffc1871c50&amp;gt;] mdt_object_lock_internal+0x70/0x360 [mdt]
 Jul 6 16:03:18 nbp13-srv1 kernel: [16205.041774] [&amp;lt;ffffffffc1871f9c&amp;gt;] mdt_reint_object_lock+0x2c/0x60 [mdt]
 Jul 6 16:03:18 nbp13-srv1 kernel: [16205.041782] [&amp;lt;ffffffffc187228b&amp;gt;] mdt_layout_change+0x20b/0x480 [mdt]
 Jul 6 16:03:18 nbp13-srv1 kernel: [16205.041790] [&amp;lt;ffffffffc187a7d0&amp;gt;] mdt_intent_layout+0x8a0/0xe00 [mdt]
 Jul 6 16:03:18 nbp13-srv1 kernel: [16205.041798] [&amp;lt;ffffffffc1877d35&amp;gt;] mdt_intent_policy+0x435/0xd80 [mdt]
 Jul 6 16:03:18 nbp13-srv1 kernel: [16205.041812] [&amp;lt;ffffffffc1328e06&amp;gt;] ldlm_lock_enqueue+0x356/0xa20 [ptlrpc]
 Jul 6 16:03:18 nbp13-srv1 kernel: [16205.041829] [&amp;lt;ffffffffc1351516&amp;gt;] ldlm_handle_enqueue0+0xa56/0x15f0 [ptlrpc]
 Jul 6 16:03:18 nbp13-srv1 kernel: [16205.041860] [&amp;lt;ffffffffc13da5d2&amp;gt;] tgt_enqueue+0x62/0x210 [ptlrpc]
 Jul 6 16:03:18 nbp13-srv1 kernel: [16205.041884] [&amp;lt;ffffffffc13e13ea&amp;gt;] tgt_request_handle+0xada/0x1570 [ptlrpc]
 Jul 6 16:03:18 nbp13-srv1 kernel: [16205.041904] [&amp;lt;ffffffffc138548b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
 Jul 6 16:03:18 nbp13-srv1 kernel: [16205.041922] [&amp;lt;ffffffffc1388df4&amp;gt;] ptlrpc_main+0xb34/0x1470 [ptlrpc]
 Jul 6 16:03:18 nbp13-srv1 kernel: [16205.041926] [&amp;lt;ffffffffbdac6691&amp;gt;] kthread+0xd1/0xe0
 Jul 6 16:03:18 nbp13-srv1 kernel: [16205.041929] [&amp;lt;ffffffffbe192d1d&amp;gt;] ret_from_fork_nospec_begin+0x7/0x21
 Jul 6 16:03:18 nbp13-srv1 kernel: [16205.041944] [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
 Jul 6 16:03:18 nbp13-srv1 kernel: [16205.041946] LustreError: dumping log to /tmp/lustre-log.1594076598.34212&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;I looked at the debug dump. The debug buffer was to small, it only has about 1min of logs.&lt;/p&gt;

&lt;p&gt;RPC history show RPC in &apos;Interpret&apos; state.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
6846513251564060672:10.151.26.183@o2ib:12345-10.149.9.229@o2ib313:x1670506690195456:376:Interpret:1594078087.153533:-1594078088.846466s(-1594080927.0s) opc 101
6846513251571204096:10.151.26.183@o2ib:12345-10.149.10.50@o2ib313:x1669038774702880:376:Interpret:1594078087.155269:-1594078088.844730s(-1594081550.0s) opc 101
6846513251575857152:10.151.26.183@o2ib:12345-10.149.10.19@o2ib313:x1668983977176208:376:Interpret:1594078087.156408:-1594078088.843591s(-1594080927.0s) opc 101
6846513251675078656:10.151.26.183@o2ib:12345-10.149.10.52@o2ib313:x1669038696609136:376:Interpret:1594078087.180633:-1594078088.819366s(-1594081551.0s) opc 101
6846513251688841216:10.151.26.183@o2ib:12345-10.149.9.232@o2ib313:x1668975691189088:376:Interpret:1594078087.183999:-1594078088.816000s(-1594081550.0s) opc 101
6846515834582925312:10.151.26.183@o2ib:12345-10.149.9.225@o2ib313:x1668975407998848:376:Interpret:1594078688.579198:-1594078689.420801s(-1594084920.0s) opc 101
6846515834807320576:10.151.26.183@o2ib:12345-10.149.10.15@o2ib313:x1668984073580016:376:Interpret:1594078688.633968:-1594078689.366031s(-1594080991.0s) opc 101
6846515834814791680:10.151.26.183@o2ib:12345-10.149.10.43@o2ib313:x1671120375878864:376:Interpret:1594078688.635799:-1594078689.364200s(-1594088664.0s) opc 101
6846515834852278272:10.151.26.183@o2ib:12345-10.149.10.9@o2ib313:x1668977731546976:376:Interpret:1594078688.644950:-1594078689.355049s(-1594081802.0s) opc 101
6846515834871939072:10.151.26.183@o2ib:12345-10.149.10.51@o2ib313:x1669038774709456:376:Interpret:1594078688.649757:-1594078689.350242s(-1594085129.0s) opc 101
6846515834965852160:10.151.26.183@o2ib:12345-10.149.10.45@o2ib313:x1669038855886144:376:Interpret:1594078688.672678:-1594078689.327321s(-1594080998.0s) opc 101
6846515835105443840:10.151.26.183@o2ib:12345-10.149.9.228@o2ib313:x1671120242121136:376:Interpret:1594078688.706757:-1594078689.293242s(-1594080997.0s) opc 101
6846515835166261248:10.151.26.183@o2ib:12345-10.149.9.135@o2ib313:x1668975409880352:376:Interpret:1594078688.721603:-1594078689.278396s(-1594080998.0s) opc 101
6846515835180023808:10.151.26.183@o2ib:12345-10.149.10.10@o2ib313:x1668977813158432:376:Interpret:1594078688.724960:-1594078689.275039s(-1594084003.0s) opc 101
.....
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Evicting the first client on the list breaks the deadlock and the MDT returns to healthy state.&lt;/p&gt;</comment>
                            <comment id="274746" author="green" created="Wed, 8 Jul 2020 14:04:36 +0000"  >&lt;p&gt;unfortunately it&apos;s not enough to just look at the server side syslogs to see what&apos;s going on and why clients don&apos;t release the locks.&lt;/p&gt;

&lt;p&gt;At first I Thought this might be similar to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13131&quot; title=&quot;Partial writes on multi-client strided files&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13131&quot;&gt;&lt;del&gt;LU-13131&lt;/del&gt;&lt;/a&gt;, but in your case it&apos;s mdt locks that are not being released, so it seems to be something else.&lt;/p&gt;

&lt;p&gt;To better understand this we&apos;ll need a solid reproducer so we can get detailed logs on our systems or you will need to collect detailed Lustre debug logs from both the MDS server and all clients (esp. the one that when evicted breaks the deadlock). You said you isolated the job, any idea of what it does?&lt;/p&gt;

&lt;p&gt;Ideally the debug log would be at -1 with no gaps on both client and servers but that might not be feasible depending on activity levels so it might be necessary to drop a few notches down (we definitely need dlmtrace and rpctrace at the very least).&lt;/p&gt;

&lt;p&gt;Also I see your clients are at 2.12.3, there was a number of fixes in .4 and .5 so if possible upgrading the clients to 2.12.5 first might be a good idea.&lt;/p&gt;</comment>
                            <comment id="274808" author="mhanafi" created="Wed, 8 Jul 2020 22:54:33 +0000"  >&lt;p&gt;The code is FORTRAN mpi code. It is writing HDF5 and binary restart files. I am trying to reproduce the issue on our test filesystem and gather debug info.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="274905" author="mhanafi" created="Thu, 9 Jul 2020 18:41:28 +0000"  >&lt;p&gt;I was able to reproduce using IOR. This was using lustre-2.12.5 server and lustre-2.12.3 clients.&lt;/p&gt;

&lt;p&gt;This is using HPE-mpt 2.17r13 and intel 2016.2.181&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
$  mpiexec /u/mhanafi/bin/IOR.MPIIO -a MPIIO -w  -t 4k -b 10m -i 100
IOR-2.10.3: MPI Coordinated Test of Parallel I/ORun began: Thu Jul  9 11:15:25 2020
Command line used: /u/mhanafi/bin/IOR.MPIIO -a MPIIO -w -t 4k -b 10m -i 100
Machine: Linux r901i3n10Summary:
	api                = MPIIO (version=3, subversion=1)
	test filename      = testFile
	access             = single-shared-file
	ordering in a file = sequential offsets
	ordering inter file= no tasks offsets
	clients            = 1800 (30 per node)
	repetitions        = 100
	xfersize           = 4096 bytes
	blocksize          = 10 MiB
	aggregate filesize = 17.58 GiB
	Lustre stripe size = Use &lt;span class=&quot;code-keyword&quot;&gt;default&lt;/span&gt;
	      stripe count = Use &lt;span class=&quot;code-keyword&quot;&gt;default&lt;/span&gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;I got debug logs from server and client I will have to upload them to the ftp site.&lt;br/&gt;
 ftp:/uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13692&quot; title=&quot;MDS slow/hung threads at mdt_object_local_lock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13692&quot;&gt;&lt;del&gt;LU-13692&lt;/del&gt;&lt;/a&gt;/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13692&quot; title=&quot;MDS slow/hung threads at mdt_object_local_lock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13692&quot;&gt;&lt;del&gt;LU-13692&lt;/del&gt;&lt;/a&gt;.tgz&lt;br/&gt;
 I include file &quot;hostname-ip-map&quot; to resolve nid address to hostnames.&lt;br/&gt;
 Please let me know if it captured necessary debug info.&lt;/p&gt;

&lt;p&gt;It will take some time to get set up for testing using 2.12.5 clients.&lt;/p&gt;

&lt;p&gt;Here is the service side logs&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 Jul  9 11:20:44 nbptest3-srv1 kernel: [ 7545.162610] Lustre: 16185:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn&apos;t add any time (30/30), not sending early reply
Jul  9 11:20:44 nbptest3-srv1 kernel: [ 7545.162610]   req@ffff9cfc80aa2400 x1668974508088176/t0(0) o101-&amp;gt;0e53cdce-c2ce-279c-1bda-abae6ca8fc35@10.141.6.196@o2ib417:474/0 lens 376/0 e 1 to 0 dl 1594318874 ref 2 fl New:/0/ffffffff rc 0/-1
Jul  9 11:20:44 nbptest3-srv1 kernel: [ 7545.259022] Lustre: 16185:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 424 previous similar messages
Jul  9 11:21:20 nbptest3-srv1 kernel: [ 7581.233493] Lustre: 16192:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn&apos;t add any time (30/30), not sending early reply
Jul  9 11:21:20 nbptest3-srv1 kernel: [ 7581.233493]   req@ffff9d0e01b73600 x1667404859964912/t0(0) o101-&amp;gt;93fc8911-9a6c-9428-f682-a88016025469@10.151.27.22@o2ib:510/0 lens 576/0 e 1 to 0 dl 1594318910 ref 2 fl New:/0/ffffffff rc 0/-1
Jul  9 11:22:17 nbptest3-srv1 kernel: [ 7638.354601] LustreError: 6611:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 412s: evicting client at 10.141.6.236@o2ib417  ns: mdt-nbptest3-MDT0000_UUID lock: ffff9cfc82e99200/0xf0e4f75150665c0f lrc: 3/0,0 mode: CR/CR res: [0x200004a51:0x2:0x0].0x0 bits 0x8/0x0 rrc: 490 type: IBT flags: 0x60200400000020 nid: 10.141.6.236@o2ib417 remote: 0x9bea31f0e36eb3a0 expref: 19 pid: 16200 timeout: 7638 lvb_type: 0
Jul  9 11:22:17 nbptest3-srv1 kernel: [ 7638.482253] Lustre: 16103:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (349:63s); client may timeout.  req@ffff9d0e02b05580 x1668974021539200/t0(0) o101-&amp;gt;b6231da5-843d-7974-af9e-038000d0baef@10.141.6.192@o2ib417:474/0 lens 376/1000 e 1 to 0 dl 1594318874 ref 1 fl Complete:/0/0 rc 0/0
Jul  9 11:22:18 nbptest3-srv1 kernel: [ 7638.577573] LustreError: 16103:0:(service.c:2128:ptlrpc_server_handle_request()) @@@ Dropping timed-out request from 12345-10.141.6.213@o2ib417: deadline 349:63s ago
Jul  9 11:22:18 nbptest3-srv1 kernel: [ 7638.577573]   req@ffff9d0e02b36c00 x1668974504934624/t0(0) o101-&amp;gt;9b7650f7-c3aa-0860-7a72-aa3fe5ce22a6@10.141.6.213@o2ib417:474/0 lens 376/0 e 1 to 0 dl 1594318874 ref 1 fl Interpret:/0/ffffffff rc 0/-1
Jul  9 11:24:37 nbptest3-srv1 kernel: [ 7777.590810] LNet: Service thread pid 16057 was inactive &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 551.09s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; debugging purposes:
Jul  9 11:24:37 nbptest3-srv1 kernel: [ 7777.646622] Pid: 16057, comm: mdt00_053 3.10.0-1127.10.1.el7_lustre2125.x86_64 #1 SMP Wed Jun 17 08:01:50 PDT 2020
Jul  9 11:24:37 nbptest3-srv1 kernel: [ 7777.646623] Call Trace:
Jul  9 11:24:37 nbptest3-srv1 kernel: [ 7777.646636]  [&amp;lt;ffffffffc0ce1c90&amp;gt;] ldlm_completion_ast+0x430/0x860 [ptlrpc]
Jul  9 11:24:37 nbptest3-srv1 kernel: [ 7777.669376]  [&amp;lt;ffffffffc0ce27b1&amp;gt;] ldlm_cli_enqueue_local+0x231/0x830 [ptlrpc]
Jul  9 11:24:37 nbptest3-srv1 kernel: [ 7777.669396]  [&amp;lt;ffffffffc11ef5cb&amp;gt;] mdt_object_local_lock+0x50b/0xb20 [mdt]
Jul  9 11:24:37 nbptest3-srv1 kernel: [ 7777.669406]  [&amp;lt;ffffffffc11efc50&amp;gt;] mdt_object_lock_internal+0x70/0x360 [mdt]
Jul  9 11:24:37 nbptest3-srv1 kernel: [ 7777.669416]  [&amp;lt;ffffffffc11eff9c&amp;gt;] mdt_reint_object_lock+0x2c/0x60 [mdt]
Jul  9 11:24:37 nbptest3-srv1 kernel: [ 7777.669425]  [&amp;lt;ffffffffc11f028b&amp;gt;] mdt_layout_change+0x20b/0x480 [mdt]
Jul  9 11:24:37 nbptest3-srv1 kernel: [ 7777.669435]  [&amp;lt;ffffffffc11f87d0&amp;gt;] mdt_intent_layout+0x8a0/0xe00 [mdt]
Jul  9 11:24:37 nbptest3-srv1 kernel: [ 7777.669444]  [&amp;lt;ffffffffc11f5d35&amp;gt;] mdt_intent_policy+0x435/0xd80 [mdt]
Jul  9 11:24:37 nbptest3-srv1 kernel: [ 7777.669469]  [&amp;lt;ffffffffc0cc8e06&amp;gt;] ldlm_lock_enqueue+0x356/0xa20 [ptlrpc]
Jul  9 11:24:37 nbptest3-srv1 kernel: [ 7777.669497]  [&amp;lt;ffffffffc0cf1516&amp;gt;] ldlm_handle_enqueue0+0xa56/0x15f0 [ptlrpc]
Jul  9 11:24:37 nbptest3-srv1 kernel: [ 7777.669539]  [&amp;lt;ffffffffc0d7a5d2&amp;gt;] tgt_enqueue+0x62/0x210 [ptlrpc]
Jul  9 11:24:37 nbptest3-srv1 kernel: [ 7777.669576]  [&amp;lt;ffffffffc0d813ea&amp;gt;] tgt_request_handle+0xada/0x1570 [ptlrpc]
Jul  9 11:24:37 nbptest3-srv1 kernel: [ 7777.669607]  [&amp;lt;ffffffffc0d2548b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
Jul  9 11:24:37 nbptest3-srv1 kernel: [ 7777.669637]  [&amp;lt;ffffffffc0d28df4&amp;gt;] ptlrpc_main+0xb34/0x1470 [ptlrpc]
Jul  9 11:24:37 nbptest3-srv1 kernel: [ 7777.669644]  [&amp;lt;ffffffff880c6691&amp;gt;] kthread+0xd1/0xe0
Jul  9 11:24:37 nbptest3-srv1 kernel: [ 7777.669649]  [&amp;lt;ffffffff88792d37&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
Jul  9 11:24:37 nbptest3-srv1 kernel: [ 7777.669669]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="274929" author="mhanafi" created="Thu, 9 Jul 2020 23:41:59 +0000"  >&lt;p&gt;Our PFL settings&lt;/p&gt;

&lt;p&gt;&#160;lfs setstripe -E 10M -c 1 -E 16G -c 4 -E 64G -c 8 -E -1 -c 16 /nobackuptest&lt;/p&gt;</comment>
                            <comment id="274949" author="green" created="Fri, 10 Jul 2020 05:29:24 +0000"  >&lt;p&gt;so looking through the logs the picture is really strange.&lt;/p&gt;

&lt;p&gt;we can see the lock that&apos;s being evicted with remote handle 0x9bea31f0e36eb3a0 being requested by the client; the server grants it, immediately contends it, and then returns it; the client receives the blocking AST and that&apos;s it, no more mentions of 0x9bea31f0e36eb3a0.&lt;/p&gt;

&lt;p&gt;Does not even indicate that the reply was received, and it&apos;s unclear if it&apos;s because some log messages got dropped or because the message was lost.&lt;/p&gt;

&lt;p&gt;Can you please reproduce again with debug_daemon in place to ensure nothing is dropped or if dropped - we receive a very visible message.&lt;/p&gt;

&lt;p&gt;Also please include client and server dmesg/syslog so we can see the evicted messages and only stop the lustre debug logging once there&apos;s eviction on the server (or even better if both the server and the client).&lt;/p&gt;

&lt;p&gt;Also as a follow-up from the call - hopefully you can provide two sets of logs, once with pfl and once without (from that other system)&lt;/p&gt;</comment>
                            <comment id="275275" author="mhanafi" created="Mon, 13 Jul 2020 19:50:37 +0000"  >&lt;p&gt;Uploaded new logs to ftp site. This was PFL. I&apos;ll try to get debug logs for non-PFL&lt;/p&gt;

&lt;p&gt;ftp:/uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13692&quot; title=&quot;MDS slow/hung threads at mdt_object_local_lock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13692&quot;&gt;&lt;del&gt;LU-13692&lt;/del&gt;&lt;/a&gt;_07_13_2020.tgz&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="275570" author="mhanafi" created="Thu, 16 Jul 2020 16:53:33 +0000"  >&lt;p&gt;I was able to reproduce the issue using 2.12.5 clients with 2.12.5 servers.&lt;/p&gt;

&lt;p&gt;I couldn&apos;t get a reproducer for non-PFL.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="276008" author="green" created="Thu, 23 Jul 2020 02:15:02 +0000"  >&lt;p&gt;So after reviewing the logs it looks like we are dealing with a missing RPC.&lt;/p&gt;

&lt;p&gt;We see the response is being sent by the server after finishing processing:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00010000:00010000:6.0:1594668516.184157:0:45180:0:(ldlm_lockd.c:1401:ldlm_handle_enqueue0()) ### server-side enqueue handler, sending reply(err=0, rc=0) ns: mdt-nbptest3-MDT0000_UUID lock: ffff9c746ca00000/0xba2cc1762941d1ca lrc: 4/0,0 mode: CR/CR res: [0x2000061c1:0x2:0x0].0x0 bits 0x8/0x0 rrc: 88 type: IBT flags: 0x60200400000020 nid: 10.141.6.235@o2ib417 remote: 0x8dae759881f1de9c expref: 90 pid: 45180 timeout: 260195 lvb_type: 0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;But it&apos;s never seen on the client and the wait is interrupted because of the evicted:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000100:00100000:17.0:1594668927.514597:0:4814:0:(client.c:3145:ptlrpc_abort_inflight()) @@@ inflight  req@ffff93bbc978adc0 x1672126583040528/t0(0) o101-&amp;gt;nbptest3-MDT0000-mdc-ffff93c15a74d800@10.151.27.53@o2ib:12/10 lens 376/1192 e 0 to 0 dl 1594669139 ref 2 fl Rpc:/2/ffffffff rc 0/-1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Not sure what this was supposed to signify, but it&apos;s about 40 seconds from the moment the reply was lost:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[259827.263246] Lustre: DEBUG MARKER: hung rpcs
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;so I assume you started to see stuff stop working as it waited for the lock to clear?&lt;/p&gt;

&lt;p&gt;Now the big question is how come that RPC reply was lost? How good is your network?&lt;/p&gt;
                            <comment id="276009" author="mhanafi" created="Thu, 23 Jul 2020 02:33:36 +0000"  >&lt;p&gt;I was watching the mdt rpc history; at the &apos;hung rpcs&apos; mark I started seeing rpcs not getting cleared off the history.&lt;/p&gt;

&lt;p&gt;These nodes are across routers.&lt;/p&gt;

&lt;p&gt;Does the server get an ack for the rpc?&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;I will try to reproduce using nettrace.&lt;/p&gt;
                            <comment id="276013" author="green" created="Thu, 23 Jul 2020 05:51:54 +0000"  >&lt;p&gt;well, it&apos;s a &quot;difficult reply&quot; for enqueue RPCs so I believe it should get ack, but there is no resending of replies so if ack did not come we&apos;ll just keep the slot occupied for longer waiting for the client to resend their request so we can reconstruct from the slot.&lt;/p&gt;

&lt;p&gt;I was hoping it would have been a -1 log and you would only drop to reduced logs if a -1 log could not be obtained due to size&lt;/p&gt;
                            <comment id="276092" author="mhanafi" created="Fri, 24 Jul 2020 18:48:08 +0000"  >&lt;p&gt;I tried to reproduce with &quot;-1&quot; but I couldn&apos;t reproduce the issue. I tried with just +nettrace and still couldn&apos;t reproduce it.&lt;/p&gt;

&lt;p&gt;As soon as I turned off nettrace it hung.&lt;/p&gt;

&lt;p&gt;I will try again +nettrace.&lt;/p&gt;

&lt;p&gt;We don&apos;t see any errors on the IB network or routers. I think the rpc never leaves the server.&lt;/p&gt;</comment>
                            <comment id="276195" author="mhanafi" created="Tue, 28 Jul 2020 05:53:13 +0000"  >&lt;p&gt;I was able to get debug with +net. But it is 27GB of logs. Would you like all of it?&lt;/p&gt;</comment>
                            <comment id="276207" author="pjones" created="Tue, 28 Jul 2020 12:47:03 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=mhanafi&quot; class=&quot;user-hover&quot; rel=&quot;mhanafi&quot;&gt;mhanafi&lt;/a&gt; Oleg is out of the office this week so will be slower to respond. My suggestion is to upload everything to the ftp site so it is available if that is the most useful option.&lt;/p&gt;</comment>
                            <comment id="276347" author="mhanafi" created="Wed, 29 Jul 2020 23:00:10 +0000"  >&lt;p&gt;I uploaded logs to ftp site.&lt;/p&gt;

&lt;p&gt;/uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13692&quot; title=&quot;MDS slow/hung threads at mdt_object_local_lock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13692&quot;&gt;&lt;del&gt;LU-13692&lt;/del&gt;&lt;/a&gt;/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13692&quot; title=&quot;MDS slow/hung threads at mdt_object_local_lock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13692&quot;&gt;&lt;del&gt;LU-13692&lt;/del&gt;&lt;/a&gt;_7_28_2020_v2.tgz&lt;/p&gt;

&lt;p&gt;Please ignore /uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13692&quot; title=&quot;MDS slow/hung threads at mdt_object_local_lock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13692&quot;&gt;&lt;del&gt;LU-13692&lt;/del&gt;&lt;/a&gt;/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13692&quot; title=&quot;MDS slow/hung threads at mdt_object_local_lock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13692&quot;&gt;&lt;del&gt;LU-13692&lt;/del&gt;&lt;/a&gt;_7_28_2020.tgz&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="276688" author="green" created="Wed, 5 Aug 2020 06:17:57 +0000"  >&lt;p&gt;wow, 500G of logs, no wonder decompression took forever.&lt;/p&gt;

&lt;p&gt;The dmesg does not include the eviction so tracing which lock got &quot;hung&quot; is going to be hard I suspect. Did you wait until the eviction by any chance and can provide me with the lock handle?&lt;/p&gt;</comment>
                            <comment id="276724" author="mhanafi" created="Wed, 5 Aug 2020 15:27:28 +0000"  >&lt;p&gt;There wasn&apos;t any evictions.&lt;/p&gt;

&lt;p&gt;Writes that caused the threads to hang started at 20:01:22. That should help to narrow down the timing. Also it looks like the RPCs that never complete are sent at least twice and the server marks them as duplicate.&lt;/p&gt;
                            <comment id="276726" author="green" created="Wed, 5 Aug 2020 16:36:16 +0000"  >&lt;p&gt;Well, the client side resend logic at work here with the duplicate RPCs, but the issue at least with the previous set of logs is the RPC reply sent by the server was never arriving to the client.&lt;/p&gt;

&lt;p&gt;Now it was relatively easy to find that because eventually the lock was evicted for nonresponsiveness and so we could trace the whole lifecycle on both ends by the lock handle.&lt;/p&gt;

&lt;p&gt;I&apos;ll try to see if I can pick any of the &quot;hung&quot; RPCs and see what it is waiting for&lt;/p&gt;
                            <comment id="276783" author="green" created="Thu, 6 Aug 2020 04:11:59 +0000"  >&lt;p&gt;I sifted through logs for all the timed out threads and they all are stuck on &quot;&lt;span class=&quot;error&quot;&gt;&amp;#91;0x20000813c:0xf36:0x0&amp;#93;&lt;/span&gt;&quot; resource, there are no granted locks on it &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/warning.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;

&lt;p&gt;So the problem you hit this time appears to be different than what this all started with.&lt;/p&gt;

&lt;p&gt;from the log we can see:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00010000:00010000:14.0:1595991684.497678:0:18783:0:(ldlm_request.c:204:ldlm_completion_tail()) ### client-side enqueue: granted ns: mdt-nbptest3-MDT0000_UUID lock: ffff8f483e40ee40/0x28e6d526063f9c9 lrc: 3/0,1 mode: EX/EX res: [0x20000813c:0xf36:0x0].0x0 bits 0x8/0x0 rrc: 531 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 18783 timeout: 0 lvb_type: 0
00010000:00010000:14.0:1595991684.497683:0:18783:0:(ldlm_request.c:516:ldlm_cli_enqueue_local()) ### client-side local enqueue handler, new lock created ns: mdt-nbptest3-MDT0000_UUID lock: ffff8f483e40ee40/0x28e6d526063f9c9 lrc: 3/0,1 mode: EX/EX res: [0x20000813c:0xf36:0x0].0x0 bits 0x8/0x0 rrc: 531 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 18783 timeout: 0 lvb_type: 0
00010000:00010000:14.0:1595991684.497705:0:18783:0:(ldlm_lock.c:1073:ldlm_granted_list_add_lock()) ### About to add lock: ns: mdt-nbptest3-MDT0000_UUID lock: ffff8f483e40ee40/0x28e6d526063f9c9 lrc: 3/0,0 mode: CR/CR res: [0x20000813c:0xf36:0x0].0x0 bits 0x8/0x0 rrc: 534 type: IBT flags: 0x50200400000000 nid: 10.141.6.181@o2ib417 remote: 0xbcf4e69b81010fa6 expref: 939 pid: 18783 timeout: 0 lvb_type: 0
00010000:00010000:14.0:1595991684.497763:0:18783:0:(ldlm_inodebits.c:95:ldlm_reprocess_inodebits_queue()) --- Reprocess resource [0x20000813c:0xf36:0x0].0x0 (ffff8f41e16095c0)
00010000:00010000:14.0:1595991684.497764:0:18783:0:(ldlm_inodebits.c:112:ldlm_reprocess_inodebits_queue()) ### Reprocessing lock from queue 3 ns: mdt-nbptest3-MDT0000_UUID lock: ffff8f4877c15c40/0x28e6d526063f9e5 lrc: 3/0,1 mode: --/EX res: [0x20000813c:0xf36:0x0].0x0 bits 0x8/0x0 rrc: 533 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 18943 timeout: 0 lvb_type: 0
00010000:00010000:14.0:1595991684.497773:0:18783:0:(ldlm_lock.c:210:ldlm_lock_put()) ### final lock_put on destroyed lock, freeing it. ns: mdt-nbptest3-MDT0000_UUID lock: ffff8f36d003e0c0/0x28e6d526063f9c2 lrc: 0/0,0 mode: --/EX res: [0x20000813c:0xf36:0x0].0x0 bits 0x8/0x0 rrc: 534 type: IBT flags: 0x44000000000000 nid: 10.141.6.181@o2ib417 remote: 0xbcf4e69b81010fa6 expref: 939 pid: 18783 timeout: 0 lvb_type: 3
00010000:00010000:14.0:1595991684.497781:0:18783:0:(ldlm_lockd.c:465:ldlm_add_waiting_lock()) ### adding to wait list(timeout: 412, AT: on) ns: mdt-nbptest3-MDT0000_UUID lock: ffff8f483e40ee40/0x28e6d526063f9c9 lrc: 5/0,0 mode: CR/CR res: [0x20000813c:0xf36:0x0].0x0 bits 0x8/0x0 rrc: 532 type: IBT flags: 0x70200400000020 nid: 10.141.6.181@o2ib417 remote: 0xbcf4e69b81010fa6 expref: 938 pid: 18783 timeout: 11704 lvb_type: 0
00010000:00010000:14.0:1595991684.497787:0:18783:0:(ldlm_lockd.c:1401:ldlm_handle_enqueue0()) ### server-side enqueue handler, sending reply(err=0, rc=0) ns: mdt-nbptest3-MDT0000_UUID lock: ffff8f483e40ee40/0x28e6d526063f9c9 lrc: 5/0,0 mode: CR/CR res: [0x20000813c:0xf36:0x0].0x0 bits 0x8/0x0 rrc: 533 type: IBT flags: 0x70200400000020 nid: 10.141.6.181@o2ib417 remote: 0xbcf4e69b81010fa6 expref: 938 pid: 18783 timeout: 11704 lvb_type: 0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;So the lock 0x28e6d526063f9c9 was just granted and converted from EX to CR to be sent back to the client causing 0x28e6d526063f9e5 to be reprocessed (in turn sending blocking AST to the just granted lock).&lt;/p&gt;

&lt;p&gt;In parallel there&apos;s another conflicting request for this lock that arrived and instantly queued blocking ast for this newly granted lock:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00010000:00010000:11.0:1595991684.497758:0:18888:0:(ldlm_resource.c:1633:ldlm_resource_add_lock()) ### About to add this lock ns: mdt-nbptest3-MDT0000_UUID lock: ffff8f3540318480/0x28e6d5260640c1b lrc: 5/0,1 mode: --/EX res: [0x20000813c:0xf36:0x0].0x0 bits 0x8/0x0 rrc: 533 type: IBT flags: 0x50210000000000 nid: local remote: 0x0 expref: -99 pid: 18888 timeout: 0 lvb_type: 0
00010000:00010000:11.0:1595991684.497787:0:18888:0:(ldlm_lockd.c:903:ldlm_server_blocking_ast()) ### server preparing blocking AST ns: mdt-nbptest3-MDT0000_UUID lock: ffff8f483e40ee40/0x28e6d526063f9c9 lrc: 5/0,0 mode: CR/CR res: [0x20000813c:0xf36:0x0].0x0 bits 0x8/0x0 rrc: 533 type: IBT flags: 0x70200400000020 nid: 10.141.6.181@o2ib417 remote: 0xbcf4e69b81010fa6 expref: 938 pid: 18783 timeout: 11704 lvb_type: 0
00010000:00010000:11.0:1595991684.497793:0:18888:0:(ldlm_lockd.c:465:ldlm_add_waiting_lock()) ### not re-adding to wait list(timeout: 412, AT: on) ns: mdt-nbptest3-MDT0000_UUID lock: ffff8f483e40ee40/0x28e6d526063f9c9 lrc: 5/0,0 mode: CR/CR res: [0x20000813c:0xf36:0x0].0x0 bits 0x8/0x0 rrc: 532 type: IBT flags: 0x70200400000020 nid: 10.141.6.181@o2ib417 remote: 0xbcf4e69b81010fa6 expref: 938 pid: 18783 timeout: 11704 lvb_type: 0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Now as the second process is sending the BL ast the client has already returned the lock (as it came with the BL AST bit set on grant):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00010000:00010000:11.0:1595991684.498535:0:18888:0:(ldlm_lockd.c:695:ldlm_handle_ast_error()) ### client (nid 10.141.6.181@o2ib417) returned -22 from blocking AST (req@ffff8f40cc10da00 x1673509146453120) - normal race ns: mdt-nbptest3-MDT0000_UUID lock: ffff8f483e40ee40/0x28e6d526063f9c9 lrc: 4/0,0 mode: CR/CR res: [0x20000813c:0xf36:0x0].0x0 bits 0x8/0x0 rrc: 532 type: IBT flags: 0x60200400000020 nid: 10.141.6.181@o2ib417 remote: 0xbcf4e69b81010fa6 expref: 938 pid: 18783 timeout: 11704 lvb_type: 0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;(we can see in client log how the lock was being canceled right away so when the bl ast came - it replied with the error)&lt;/p&gt;

&lt;p&gt;and then we see how both the error in ast causes the lock to be destroyed while the incoming cancel that the client has sent decided that &quot;we got a cancel for the lock that does not exist&quot;:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00010000:00010000:11.0:1595991684.498549:0:18888:0:(ldlm_lockd.c:531:ldlm_del_waiting_lock()) ### removed ns: mdt-nbptest3-MDT0000_UUID lock: ffff8f483e40ee40/0x28e6d526063f9c9 lrc: 4/0,0 mode: CR/CR res: [0x20000813c:0xf36:0x0].0x0 bits 0x8/0x0 rrc: 533 type: IBT flags: 0x50200400000020 nid: 10.141.6.181@o2ib417 remote: 0xbcf4e69b81010fa6 expref: 939 pid: 18783 timeout: 11704 lvb_type: 0
00010000:00010000:0.0:1595991684.498549:0:17908:0:(ldlm_lockd.c:2436:ldlm_cancel_hpreq_check()) ### hpreq cancel/convert lock ns: mdt-nbptest3-MDT0000_UUID lock: ffff8f483e40ee40/0x28e6d526063f9c9 lrc: 4/0,0 mode: CR/CR res: [0x20000813c:0xf36:0x0].0x0 bits 0x8/0x0 rrc: 533 type: IBT flags: 0x50200400000020 nid: 10.141.6.181@o2ib417 remote: 0xbcf4e69b81010fa6 expref: 939 pid: 18783 timeout: 11704 lvb_type: 0
00010000:00010000:11.0:1595991684.498557:0:18888:0:(ldlm_lock.c:210:ldlm_lock_put()) ### final lock_put on destroyed lock, freeing it. ns: mdt-nbptest3-MDT0000_UUID lock: ffff8f483e40ee40/0x28e6d526063f9c9 lrc: 0/0,0 mode: --/CR res: [0x20000813c:0xf36:0x0].0x0 bits 0x8/0x0 rrc: 532 type: IBT flags: 0x44a01400000020 nid: 10.141.6.181@o2ib417 remote: 0xbcf4e69b81010fa6 expref: 939 pid: 18783 timeout: 11704 lvb_type: 0
00000100:00000200:0.0:1595991684.498557:0:17908:0:(nrs.c:896:ptlrpc_nrs_hpreq_add_nolock()) @@@ high priority req  req@ffff8f3573015580 x1673025973253184/t0(0) o103-&amp;gt;6357b1f7-e6ee-7152-9b79-4bef4e870e84@10.141.6.181@o2ib417:552/0 lens 328/0 e 0 to 0 dl 1595992032 ref 1 fl New:H/0/ffffffff rc 0/-1
00000100:00100000:11.0:1595991684.498563:0:18888:0:(client.c:2060:ptlrpc_check_set()) Completed RPC pname:cluuid:pid:xid:nid:opc mdt01_098:nbptest3-MDT0000_UUID:18888:1673509146453120:10.141.6.181@o2ib417:104
00000100:00100000:0.0:1595991684.498566:0:17908:0:(nrs_fifo.c:179:nrs_fifo_req_get()) NRS start fifo request from 12345-10.141.6.181@o2ib417, seq: 2884733
00000100:00100000:0.0:1595991684.498569:0:17908:0:(service.c:2140:ptlrpc_server_handle_request()) Handling RPC pname:cluuid+ref:pid:xid:nid:opc ldlm_cn00_004:6357b1f7-e6ee-7152-9b79-4bef4e870e84+938:3865:x1673025973253184:12345-10.141.6.181@o2ib417:103
00000100:00000200:0.0:1595991684.498571:0:17908:0:(service.c:2145:ptlrpc_server_handle_request()) got req 1673025973253184
00010000:00010000:0.0:1595991684.498573:0:17908:0:(ldlm_lockd.c:1626:ldlm_request_cancel()) ### server-side cancel handler START: 1 locks, starting at 0
00010000:00010000:0.0:1595991684.498576:0:17908:0:(ldlm_lockd.c:1633:ldlm_request_cancel()) ### server-side cancel handler stale lock (cookie 184204835340679625)
00010000:00010000:0.0:1595991684.498577:0:17908:0:(ldlm_lockd.c:1677:ldlm_request_cancel()) ### server-side cancel handler END
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;and this is it, no more activity on this resource even though what one would expect is when the lock is canceled it would wake up whatever was waiting on it which should have been 0x28e6d526063f9e5&lt;/p&gt;

&lt;p&gt;So it looks like the reprocess was somehow missed, normally it would recover because some other lock request for this resource would come, BUT coincidentally there were no more lock requests for this resource too so we got stuck.&lt;/p&gt;

&lt;p&gt;I quickly inspected the code and I don&apos;t immediately see any path where handle_ast_error/ldlm_run_ast_work would ignore the ERESTART that should call reprocess at all times.&lt;/p&gt;</comment>
                            <comment id="276814" author="green" created="Thu, 6 Aug 2020 14:12:57 +0000"  >&lt;p&gt;Just to confirm, this is a vanilla 2.12.4 on the server just like it seems in the logs, there are no patches applied?&lt;/p&gt;</comment>
                            <comment id="276838" author="mhanafi" created="Thu, 6 Aug 2020 16:53:09 +0000"  >&lt;p&gt;Here is our branch.&#160;&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://github.com/jlan/lustre-nas/tree/nas-2.12.4&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/jlan/lustre-nas/tree/nas-2.12.4&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="276863" author="jaylan" created="Thu, 6 Aug 2020 20:11:03 +0000"  >&lt;p&gt;I do not know 2.12.4-1nas or 2.12.4-2nas was used when the problem was reported.&lt;/p&gt;

&lt;p&gt;These patches were cherry-picked for 2.12.4-1nas (ie, on top of 2.12.4 release)&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11922&quot; title=&quot;mkfs.lustre in 1.44.3.wc1 causes corruption if &amp;#39;metadata_csum&amp;#39; option enabled&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11922&quot;&gt;&lt;del&gt;LU-11922&lt;/del&gt;&lt;/a&gt; ldiskfs: make dirdata work with metadata_csum&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13037&quot; title=&quot;print tbf stats&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13037&quot;&gt;LU-13037&lt;/a&gt; nrs: dump stats of TBF clients&lt;/p&gt;

&lt;p&gt;These patches were cherry-picked between 2.12.4-1nas and 2.12.4-2nas:&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12424&quot; title=&quot;LNet MR routing: possible loop when discovery is off&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12424&quot;&gt;LU-12424&lt;/a&gt; lnet: prevent loop in LNetPrimaryNID()&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9971&quot; title=&quot;MR: ABA problem in lnet_discover_peer_locked&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9971&quot;&gt;&lt;del&gt;LU-9971&lt;/del&gt;&lt;/a&gt; lnet: use after free in lnet_discover_peer_locked()&lt;/p&gt;</comment>
                            <comment id="276918" author="gerrit" created="Fri, 7 Aug 2020 07:42:36 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/39598&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/39598&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13692&quot; title=&quot;MDS slow/hung threads at mdt_object_local_lock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13692&quot;&gt;&lt;del&gt;LU-13692&lt;/del&gt;&lt;/a&gt; ldlm: Ensure we reprocess the resource on ast error&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: ea93a5e7d21d4ea3a184291d2d862f6096480cc2&lt;/p&gt;</comment>
                            <comment id="276919" author="green" created="Fri, 7 Aug 2020 07:44:49 +0000"  >&lt;p&gt;I found a gap in lock processing logic that would lead to the problem exposed by the latest set of logs.&lt;/p&gt;

&lt;p&gt;This patch should also apply to 2.12.x branch. it is a server-only patch.&lt;/p&gt;

&lt;p&gt;Now this is a different problem than what was reported originally with the lost RPC so I hope you can now adopt that patch and run the reproducer again with the elevated log levels to hopefully catch the missing RPC trace now.&lt;/p&gt;

&lt;p&gt;Thank you.&lt;/p&gt;</comment>
                            <comment id="276920" author="green" created="Fri, 7 Aug 2020 08:11:25 +0000"  >&lt;p&gt;Ok I guess I spoke too soon and the patch might need a bit more work so don&apos;t apply it just yet.&lt;/p&gt;</comment>
                            <comment id="276970" author="green" created="Fri, 7 Aug 2020 17:43:11 +0000"  >&lt;p&gt;The current iteration of patch 39598 (set 3) is what I&apos;d like you to try please.&lt;/p&gt;</comment>
                            <comment id="277239" author="mhanafi" created="Tue, 11 Aug 2020 21:57:15 +0000"  >&lt;p&gt;Progress... The server doesn&apos;t get into a complete deadlock, like before when it would require a reboot. But I was able to reproduce a similar issue. The&#160; rpcs hung until timeout and the client gets evicted. At the eviction the application crashes with an I/O error.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
[1046354.720641] LustreError: 40059:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 412s: evicting client at 10.141.6.218@o2ib417  ns: mdt-nbptest3-MDT0000_UUID lock: ffff8e4f95824140/0x4015736665bf730e lrc: 3/0,0 mode: EX/EX res: [0x20000b7b2:0x86:0x0].0x0 bits 0x8/0x0 rrc: 134 type: IBT flags: 0x60200000000020 nid: 10.141.6.218@o2ib417 remote: 0xe8b7391db938d471 expref: 130 pid: 44433 timeout: 1046339 lvb_type: 0
[1046354.851994] LustreError: 44288:0:(ldlm_lockd.c:1348:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8e4f6bef9800 ns: mdt-nbptest3-MDT0000_UUID lock: ffff8e57506a69c0/0x4015736665bf755a lrc: 1/0,0 mode: --/CR res: [0x20000b7b2:0x86:0x0].0x0 bits 0x8/0x0 rrc: 131 type: IBT flags: 0x54a01400000000 nid: 10.141.6.218@o2ib417 remote: 0xe8b7391db938d478 expref: 15 pid: 44288 timeout: 0 lvb_type: 0
[1046354.969040] LustreError: 44288:0:(ldlm_lockd.c:1348:ldlm_handle_enqueue0()) Skipped 1 previous similar message

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Uploaded debug logs to: ftp.whamcloud.com:/uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13692&quot; title=&quot;MDS slow/hung threads at mdt_object_local_lock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13692&quot;&gt;&lt;del&gt;LU-13692&lt;/del&gt;&lt;/a&gt;/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13692&quot; title=&quot;MDS slow/hung threads at mdt_object_local_lock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13692&quot;&gt;&lt;del&gt;LU-13692&lt;/del&gt;&lt;/a&gt;_patch_testing.tgz&lt;/p&gt;</comment>
                            <comment id="277771" author="mhanafi" created="Wed, 19 Aug 2020 21:53:40 +0000"  >&lt;p&gt;Any updates?&lt;/p&gt;</comment>
                            <comment id="277821" author="green" created="Thu, 20 Aug 2020 19:34:23 +0000"  >&lt;p&gt;from the affected client log:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00010000:00010000:63.0:1597179887.674436:0:15383:0:(ldlm_lockd.c:1844:ldlm_handle_cp_callback()) ### Double grant race happened ns: nbptest3-MDT0000-mdc-ffff8c1b58e0c800 lock: ffff8c1aa526ad80/0xe8b7391db938d471 lrc: 2/0,0 mode: EX/EX res: [0x20000b7b2:0x86:0x0].0x0 bits 0x8/0x0 rrc: 10 type: IBT flags: 0x0 nid: local remote: 0x4015736665bf730e expref: -99 pid: 17795 timeout: 0 lvb_type: 3
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This is certainly a message I have not seen before that seems to be the source of the bug (seems to be another bug again compared to the initial report!)&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (ldlm_is_destroyed(lock) ||
            ldlm_is_granted(lock)) {
                &lt;span class=&quot;code-comment&quot;&gt;/* b=11300: the lock has already been granted */&lt;/span&gt;
                unlock_res_and_lock(lock);
                LDLM_DEBUG(lock, &lt;span class=&quot;code-quote&quot;&gt;&quot;&lt;span class=&quot;code-object&quot;&gt;Double&lt;/span&gt; grant race happened&quot;&lt;/span&gt;);
                GOTO(out, rc = 0);
        }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This completely skips over later check for AST_SENT/CBPENDING flags leading the client to miss server&apos;s desire for the client to drop the lock.&lt;/p&gt;

&lt;p&gt;Here&apos;s server side story:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00010000:00010000:11.0:1597179887.630195:0:44433:0:(ldlm_lockd.c:1401:ldlm_handle_enqueue0()) ### server-side enqueue handler, sending reply(err=0, rc=0) ns: mdt-nbptest3-MDT0000_UUID lock: ffff8e4f95824140/0x4015736665bf730e lrc: 3/0,0 mode: --/EX res: [0x20000b7b2:0x86:0x0].0x0 bits 0x8/0x0 rrc: 111 type: IBT flags: 0x40200000000000 nid: 10.141.6.218@o2ib417 remote: 0xe8b7391db938d471 expref: 162 pid: 44433 timeout: 0 lvb_type: 0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;this is the initial rpc reply no AST_SENT flag.&lt;/p&gt;

&lt;p&gt;actually what&apos;s weird is it&apos;s not even granted yet and would not be for some time.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00010000:00010000:13.0:1597179887.630275:0:44299:0:(ldlm_lockd.c:891:ldlm_server
_blocking_ast()) ### lock not granted, not sending blocking AST ns: mdt-nbptest3
-MDT0000_UUID lock: ffff8e4f95824140/0x4015736665bf730e lrc: 3/0,0 mode: --/EX r
es: [0x20000b7b2:0x86:0x0].0x0 bits 0x8/0x0 rrc: 114 type: IBT flags: 0x60200000
000020 nid: 10.141.6.218@o2ib417 remote: 0xe8b7391db938d471 expref: 161 pid: 444
33 timeout: 0 lvb_type: 0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;here it was blocked shortly after&lt;/p&gt;

&lt;p&gt;and eventually once granted, the cp ast was sent:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00010000:00010000:7.0:1597179887.674021:0:45141:0:(ldlm_lock.c:1073:ldlm_granted_list_add_lock()) ### About to add lock: ns: mdt-nbptest3-MDT0000_UUID lock: ffff8e4f95824140/0x4015736665bf730e lrc: 3/0,0 mode: EX/EX res: [0x20000b7b2:0x86:0
x0].0x0 bits 0x8/0x0 rrc: 135 type: IBT flags: 0x60280000000020 nid: 10.141.6.218@o2ib417 remote: 0xe8b7391db938d471 expref: 165 pid: 44433 timeout: 0 lvb_type: 0
00010000:00010000:7.0:1597179887.674036:0:45141:0:(ldlm_lockd.c:1014:ldlm_server
_completion_ast()) ### server preparing completion AST ns: mdt-nbptest3-MDT0000_
UUID lock: ffff8e4f95824140/0x4015736665bf730e lrc: 3/0,0 mode: EX/EX res: [0x20
000b7b2:0x86:0x0].0x0 bits 0x8/0x0 rrc: 136 type: IBT flags: 0x60200000000020 ni
d: 10.141.6.218@o2ib417 remote: 0xe8b7391db938d471 expref: 165 pid: 44433 timeou
t: 0 lvb_type: 0
00010000:00010000:7.0:1597179887.674042:0:45141:0:(ldlm_lockd.c:465:ldlm_add_waiting_lock()) ### adding to wait list(timeout: 412, AT: on) ns: mdt-nbptest3-MDT0000_UUID lock: ffff8e4f95824140/0x4015736665bf730e lrc: 4/0,0 mode: EX/EX res: [0x20000b7b2:0x86:0x0].0x0 bits 0x8/0x0 rrc: 136 type: IBT flags: 0x70200000000020 nid: 10.141.6.218@o2ib417 remote: 0xe8b7391db938d471 expref: 165 pid: 44433 timeout: 1046339 lvb_type: 0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;


&lt;p&gt;Hm, so this actually exposes the &quot;we tell the client it got a granted lock&quot; when it did not somehow.&lt;/p&gt;

&lt;p&gt;I can confirm from the log the path triggered specifically that was patched in my patch and that somehow apparently converted into intent lock request into a regular EX by not yet obvious means.&lt;/p&gt;

&lt;p&gt;So can you please give patchset 2 for the same patch a try (patchset 3 is what you tried and got this result)? The difference is that patchset 2 only adds a reprocess back, and patchset 3 also removes some visibly unneeded logic in addition.&lt;/p&gt;</comment>
                            <comment id="277902" author="mhanafi" created="Fri, 21 Aug 2020 23:02:18 +0000"  >&lt;p&gt;Uploaded debug for patchset2 - ftp:/uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13692&quot; title=&quot;MDS slow/hung threads at mdt_object_local_lock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13692&quot;&gt;&lt;del&gt;LU-13692&lt;/del&gt;&lt;/a&gt;/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13692&quot; title=&quot;MDS slow/hung threads at mdt_object_local_lock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13692&quot;&gt;&lt;del&gt;LU-13692&lt;/del&gt;&lt;/a&gt;_patch2_2020_08_21.tgz.&lt;/p&gt;

&lt;p&gt;I was able to reproduce the issue. I stopped debugging after the first client eviction. Maybe it was too soon... Please take a look and let me know if you want me to re-run it.&lt;/p&gt;</comment>
                            <comment id="277915" author="green" created="Fri, 21 Aug 2020 23:54:27 +0000"  >&lt;p&gt;so this last log 100% matches the first log - server receives request xid 1675670653565120, processes it and sends a reply to the client. The client never receives the reply, so when the server sends bl ast, the client, not aware of the lock being granted, does not release the lock which leads to eviction.&lt;/p&gt;

&lt;p&gt;This log does not include nettrace/neterrors so I cannot see what actually happened to the reply - i.e. was it lost on the wire or got stuck in some outgoing queue.&lt;/p&gt;</comment>
                            <comment id="277921" author="adilger" created="Sat, 22 Aug 2020 01:23:28 +0000"  >&lt;p&gt;Oleg, if the client gets a blocking callback for a lock that it doesn&apos;t think is granted, should it &quot;grant&quot; the lock at that time with the blocking flag, and reply to the server AST, so that the lock is cancelled immediately after use but doesn&apos;t cause the client to stall? &lt;/p&gt;

&lt;p&gt;&#160;Alternately, the client could drop the lock but then it doesn&apos;t make any forward progress. &lt;/p&gt;</comment>
                            <comment id="277923" author="mhanafi" created="Sat, 22 Aug 2020 03:36:04 +0000"  >&lt;p&gt;It is highly unlikely that we would have a drop on the net. I will re-run with nettrace. &lt;/p&gt;</comment>
                            <comment id="277935" author="green" created="Sat, 22 Aug 2020 21:03:54 +0000"  >&lt;p&gt;Andreas, while we can grant the lock on receipt of the bl callback, I am not sure that alone would unstuck the enqueue thread that&apos;s waiting for the RPC to return first. If we force-resume it then I imagine all sorts of strange behavior will crop up in normal races.&lt;/p&gt;

&lt;p&gt;Also just granting the lock will not make it go away because the stuck thread is holding a reference on it.&lt;/p&gt;</comment>
                            <comment id="277991" author="mhanafi" created="Mon, 24 Aug 2020 20:58:45 +0000"  >&lt;p&gt;Uploaded new debug logs with nettrace&lt;br/&gt;
ftp:/uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13692&quot; title=&quot;MDS slow/hung threads at mdt_object_local_lock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13692&quot;&gt;&lt;del&gt;LU-13692&lt;/del&gt;&lt;/a&gt;/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13692&quot; title=&quot;MDS slow/hung threads at mdt_object_local_lock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13692&quot;&gt;&lt;del&gt;LU-13692&lt;/del&gt;&lt;/a&gt;_patch2_testing_2020_08_24.tgz&lt;/p&gt;

&lt;p&gt;Server side threads remain stuck until server reboot.&lt;/p&gt;</comment>
                            <comment id="278215" author="mhanafi" created="Thu, 27 Aug 2020 16:09:21 +0000"  >&lt;p&gt;Any updates?&lt;/p&gt;</comment>
                            <comment id="278260" author="green" created="Fri, 28 Aug 2020 03:58:59 +0000"  >&lt;p&gt;I don&apos;t see anything different in this log vs the previous one&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00010000:00010000:4.0:1598299915.176480:0:14307:0:(ldlm_lockd.c:1401:ldlm_handle_enqueue0()) ### server-side enqueue handler, sending reply(err=0, rc=0) ns: mdt-nbptest3-MDT0000_UUID lock: ffff9c95908f1200/0xcb77a9c20dcbc88e lrc: 4/0,0 mode: CR/CR res: [0x20000d6f1:0x1:0x0].0x0 bits 0x8/0x0 rrc: 488 type: IBT flags: 0x60200400000020 nid: 10.141.6.214@o2ib417 remote: 0xdd261a7834f00346 expref: 98 pid: 14307 timeout: 6658 lvb_type: 0
00010000:00010000:4.0:1598299915.176495:0:14307:0:(ldlm_lockd.c:1479:ldlm_handle_enqueue0()) ### server-side enqueue handler END (lock ffff9c95908f1200, rc 0)
00000100:00100000:4.0:1598299915.176508:0:14307:0:(service.c:2190:ptlrpc_server_handle_request()) Handled RPC pname:cluuid+ref:pid:xid:nid:opc mdt00_005:221c8e76-3a4a-d7bb-7d3c-6ea3526db183+98:4765:x1675936175078016:12345-10.141.6.214@o2ib417:101 Request processed in 8860us (8880us total) trans 107374186758 rc 0/0
00000100:00100000:4.0:1598299915.176512:0:14307:0:(nrs_fifo.c:241:nrs_fifo_req_stop()) NRS stop fifo request from 12345-10.141.6.214@o2ib417, seq: 2640
00000100:00100000:4.0:1598299915.176518:0:14307:0:(nrs_fifo.c:179:nrs_fifo_req_get()) NRS start fifo request from 12345-10.141.3.114@o2ib417, seq: 2761
00000100:00100000:4.0:1598299915.176521:0:14307:0:(service.c:2140:ptlrpc_server_handle_request()) Handling RPC pname:cluuid+ref:pid:xid:nid:opc mdt00_005:44cfaec0-6c8c-6d1b-0b71-5fc557b7218e+97:4797:x1675936177175232:12345-10.141.3.114@o2ib417:101
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Request handled, reply is supposedly sent (no messages from lnet indicating otherwise at least), yet the client did not receive it.&lt;/p&gt;</comment>
                            <comment id="278264" author="adilger" created="Fri, 28 Aug 2020 05:24:21 +0000"  >&lt;p&gt;Oleg, in addition to the BL AST &quot;granting&quot; the lock with &lt;tt&gt;LDLM_FL_CBPENDING&lt;/tt&gt;, could it also wake up the thread waiting for the lock reply to allow it to finish processing?&#160; There would need to be some way to trigger this so that the client thinks the RPC was received successfully.&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;Probably a better idea is if the RPC wait in that thread could have a periodic timeout check if the lock is already granted, and exit the wait itself?&#160; Then the BL AST can mark the lock granted with &lt;tt&gt;CBPENDING&lt;/tt&gt;, and when the waiting thread next wakes up it will see the lock is granted and clean itself up and continue processing the syscall.&lt;/p&gt;</comment>
                            <comment id="278306" author="mhanafi" created="Fri, 28 Aug 2020 19:00:27 +0000"  >&lt;p&gt;In the last set of logs I had &quot;nettrace&quot; and &quot;neterror&quot;. We didn&apos;t see any network errors. I can run with +net, if you would like. &lt;br/&gt;
Would it make sense to see if you can reproduce this locally?&lt;/p&gt;</comment>
                            <comment id="278316" author="green" created="Fri, 28 Aug 2020 20:39:19 +0000"  >&lt;p&gt;I discussed with Amir and he thinks your situation resembles a case where there&apos;s not enough peer credits so messages are dropped.&lt;/p&gt;

&lt;p&gt;I checked the logs and I do see this:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000800:00000200:5.0:1595883687.321977:0:15687:0:(o2iblnd_cb.c:900:kiblnd_post_tx_locked()) 10.151.26.238@o2ib: no credits
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;but the date is strange, its from July. No newer messages, but it&apos;s only output at trace level &quot;net&quot; that you&apos;ve not been including later (I guess you were not removing older server.dk files?)&lt;/p&gt;

&lt;p&gt;He says you need to check peer_credits_hiw and peer_credits values and peer_credits_hiw should be set to 1/2 of the peer_credits to ensure proper reclaim of credits for certain types of messages (perhaps growing peer credits might not be a bad idea depending on what your settings are now)&lt;/p&gt;

&lt;p&gt;&quot;lnetctl net show -v 4&quot;, &quot;lnetctl peer show -v&quot; should show some related stats and min credits.&lt;/p&gt;</comment>
                            <comment id="278317" author="green" created="Fri, 28 Aug 2020 20:51:06 +0000"  >&lt;p&gt;Andreas, the thread is sleeping in ptlrpc_set_wait in ptlrpc, so waking it up from ldlm is going to be very nontrivial. We can find the thread easily as it&apos;s recorded in the lock. But then we need to determine the ptlrpc set that&apos;s privately allocated and referenced on stack of that thread only.&lt;/p&gt;</comment>
                            <comment id="278560" author="mhanafi" created="Tue, 1 Sep 2020 21:36:19 +0000"  >&lt;p&gt;I think we have 2 issues here. First is the ldlm deadlock and second the credit issue.&lt;br/&gt;
I am going to retest with different credit values. Should I test with Patch#2 or Patch#3?&lt;/p&gt;</comment>
                            <comment id="278651" author="green" created="Wed, 2 Sep 2020 17:08:10 +0000"  >&lt;p&gt;patchset 4 if you can please.&lt;/p&gt;

&lt;p&gt;I agree we are having at least two issues at hand.&lt;/p&gt;</comment>
                            <comment id="278776" author="mhanafi" created="Thu, 3 Sep 2020 17:20:00 +0000"  >&lt;p&gt;Uploaded logs running with patch#4. &lt;/p&gt;

&lt;p&gt;debug=&quot;ioctl neterror net warning dlmtrace error emerg ha rpctrace config console lfsck&quot;&lt;/p&gt;

&lt;p&gt;ftp:/uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13692&quot; title=&quot;MDS slow/hung threads at mdt_object_local_lock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13692&quot;&gt;&lt;del&gt;LU-13692&lt;/del&gt;&lt;/a&gt;/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13692&quot; title=&quot;MDS slow/hung threads at mdt_object_local_lock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13692&quot;&gt;&lt;del&gt;LU-13692&lt;/del&gt;&lt;/a&gt;_patch4_20200902.tgz&lt;/p&gt;</comment>
                            <comment id="278953" author="green" created="Mon, 7 Sep 2020 01:52:40 +0000"  >&lt;p&gt;I checked for the first eviction and that seems to have been removed from the client log by the time it was dumped so it&apos;s not possible to see what was going on on the client at the time.&lt;/p&gt;

&lt;p&gt;I see there&apos;s still a bunch of stuck threads after that that I need to see if I can find why - probably another AST lost but it&apos;s not a give that client log (whatever it was) survived either I guess.&lt;/p&gt;</comment>
                            <comment id="279044" author="mhanafi" created="Tue, 8 Sep 2020 18:04:22 +0000"  >&lt;p&gt;With Patch#4 we see hung threads at the start of the first iteration of the MPI run. Without any patches it would take a number of iterations before we would see hung threads. I re-ran things today with peer_credits=128 to make sure we don&apos;t have any credit issues. As with the last set of logs I am setting the debug_daemon size very big so as not to miss anything - I am not sure why we missed the first eviction. This set of logs shouldn&apos;t be missing anything. &lt;br/&gt;
/uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13692&quot; title=&quot;MDS slow/hung threads at mdt_object_local_lock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13692&quot;&gt;&lt;del&gt;LU-13692&lt;/del&gt;&lt;/a&gt;/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13692&quot; title=&quot;MDS slow/hung threads at mdt_object_local_lock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13692&quot;&gt;&lt;del&gt;LU-13692&lt;/del&gt;&lt;/a&gt;_patch4_2020_09_08.tgz&lt;/p&gt;</comment>
                            <comment id="279281" author="green" created="Thu, 10 Sep 2020 20:42:08 +0000"  >&lt;p&gt;Thank you, this is a good trace.&lt;/p&gt;

&lt;p&gt;Now we see the lost RPC and we see it was being sent off from the server.&lt;/p&gt;

&lt;p&gt;The thing is - it&apos;s not sent to the client, but to an intermediate router (not in the hostlist too):&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000400:00000200:1.0:1599585761.690684:0:15434:0:(lib-move.c:1752:lnet_handle_s
end()) TRACE: 10.151.27.53@o2ib(10.151.27.53@o2ib:10.151.27.53@o2ib) -&amp;gt; 10.141.6
.236@o2ib417(10.141.6.236@o2ib417:10.151.26.246@o2ib) : PUT try# 0
00000800:00000200:1.0:1599585761.690687:0:15434:0:(o2iblnd_cb.c:1663:kiblnd_send
()) sending 1000 bytes in 1 frags to 12345-10.151.26.246@o2ib
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;So this router (10.151.26.246) is the node that ate the message then I imagine? Any useful messages there?&lt;/p&gt;</comment>
                            <comment id="279284" author="mhanafi" created="Thu, 10 Sep 2020 21:11:37 +0000"  >&lt;p&gt;There is nothing in the logs on that router. I guess I need to run debug trace on the routers, clients, and server.&lt;/p&gt;

&lt;p&gt;Router debug logs may be really big...&lt;/p&gt;

&lt;p&gt;I don&apos;t understand why these two clients send rpc with same xid.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
00000100:00100000:0.0:1599585761.680712:0:15434:0:(service.c:2140:ptlrpc_server_handle_request()) Handling RPC pname:cluuid+ref:pid:xid:nid:opc mdt00_071:64f13ca0-a798-8be7-76ba-6d1f388635bd+88:6761:x1676790192962432:12345-10.141.6.236@o2ib417:101
00000100:00000200:0.0:1599585761.680714:0:15434:0:(service.c:2145:ptlrpc_server_handle_request()) got req 1676790192962432
00000100:00100000:8.0:1599585761.683587:0:15309:0:(service.c:1989:ptlrpc_server_handle_req_in()) got req x1676790192962432
00000100:00100000:8.0:1599585761.683601:0:15309:0:(service.c:2140:ptlrpc_server_handle_request()) Handling RPC pname:cluuid+ref:pid:xid:nid:opc mdt01_039:673a32db-c752-20f9-dd67-f80a34f5fd8d+90:6813:x1676790192962432:12345-10.141.6.235@o2ib417:101
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="279366" author="ashehata" created="Fri, 11 Sep 2020 17:41:18 +0000"  >&lt;p&gt;I was talking with Oleg, there are a few thing we can try to collect/do to see if there are issues on the LNet/networking side.&lt;/p&gt;

&lt;p&gt;First of all, let&apos;s grab the LNet statistics from the node (server/routers/clients with the problem), to see if there are drops:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
lnetctl net show -v 4
lnetctl stats show &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Oleg was saying that he sees the message depart the server but never arrives on the client. No router logs.&lt;/p&gt;

&lt;p&gt;If we get the above stats from the router, we can see if there are drops happening on the routers. We can also monitor the stats on some interval and see if the errors/drops increase at the same time frame as the problem. We can do the same type of monitoring on the servers and clients. I think if we&apos;re able to correlate LNet errors with the time frame of the problem, we&apos;ll have some lead to go on. We can at least look at the type of errors which are happening during that time and see how we handle them.&lt;/p&gt;

&lt;p&gt;The other thing I&apos;m thinking is to try and reduce the complexity of LNet. Can we try disabling health, if we haven&apos;t already:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
lnetctl set health_sensitivity 0 &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;This avoids any resends at the LNet layer.&lt;/p&gt;</comment>
                            <comment id="279371" author="mhanafi" created="Fri, 11 Sep 2020 18:03:58 +0000"  >&lt;p&gt;We do run with &apos;health_sensitivity=0&apos;. I saved &quot;net show&quot; and &quot;stats show&quot; These were not cleared before. I&apos;ll try to re-run and clear the stats before the test. &lt;/p&gt;

&lt;p&gt;ftp uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13692&quot; title=&quot;MDS slow/hung threads at mdt_object_local_lock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13692&quot;&gt;&lt;del&gt;LU-13692&lt;/del&gt;&lt;/a&gt;/router.netshow.2020_09_11&lt;br/&gt;
 ftp&#160; uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13692&quot; title=&quot;MDS slow/hung threads at mdt_object_local_lock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13692&quot;&gt;&lt;del&gt;LU-13692&lt;/del&gt;&lt;/a&gt;/router.netshow.2020_09_11&lt;/p&gt;

&lt;p&gt;I was able to re-test today with router logs. See ftp:/uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13692&quot; title=&quot;MDS slow/hung threads at mdt_object_local_lock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13692&quot;&gt;&lt;del&gt;LU-13692&lt;/del&gt;&lt;/a&gt;/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13692&quot; title=&quot;MDS slow/hung threads at mdt_object_local_lock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13692&quot;&gt;&lt;del&gt;LU-13692&lt;/del&gt;&lt;/a&gt;_patch4_2020_09_11.tgz&lt;/p&gt;

&lt;p&gt;&lt;b&gt;What I saw the server send the rpc&lt;/b&gt;&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 --- SERVER ---
00000100:00100000:7.0:1599839141.481820:0:8199:0:(nrs_fifo.c:179:nrs_fifo_req_get()) NRS start fifo request from 12345-10.141.6.236@o2ib417, seq: 2361
00000100:00100000:7.0:1599839141.481824:0:8199:0:(service.c:2140:ptlrpc_server_handle_request()) Handling RPC pname:cluuid+ref:pid:xid:nid:opc mdt00_012:9308d1b4-ec54-b743-7579-5daa9b33d4cd+88:4780:x1677516826795584:12345-10.141.6.236@o2ib417:101
00010000:00010000:7.0:1599839141.507240:0:8199:0:(ldlm_lock.c:1073:ldlm_granted_list_add_lock()) ### About to add lock: ns: mdt-nbptest3-MDT0000_UUID lock: ffff99b56d30a1c0/0x8e13058be0aade00 lrc: 3/0,0 mode: CR/CR res: [0x200019271:0x2:0x0].0x0 bits 0x8/0x0 rrc: 489 type: IBT flags: 0x50200400000000 nid: 10.141.6.236@o2ib417 remote: 0xd532d2c8ba1b0d1 expref: 97 pid: 8199 timeout: 0 lvb_type: 0
00010000:00010000:7.0:1599839141.507253:0:8199:0:(ldlm_lock.c:681:ldlm_add_bl_work_item()) ### lock incompatible; sending blocking AST. ns: mdt-nbptest3-MDT0000_UUID lock: ffff99b56d30a1c0/0x8e13058be0aade00 lrc: 3/0,0 mode: CR/CR res: [0x200019271:0x2:0x0].0x0 bits 0x8/0x0 rrc: 489 type: IBT flags: 0x40200400000000 nid: 10.141.6.236@o2ib417 remote: 0xd532d2c8ba1b0d1 expref: 97 pid: 8199 timeout: 0 lvb_type: 0
00010000:00010000:7.0:1599839141.507263:0:8199:0:(ldlm_lockd.c:903:ldlm_server_blocking_ast()) ### server preparing blocking AST ns: mdt-nbptest3-MDT0000_UUID lock: ffff99b56d30a1c0/0x8e13058be0aade00 lrc: 4/0,0 mode: CR/CR res: [0x200019271:0x2:0x0].0x0 bits 0x8/0x0 rrc: 489 type: IBT flags: 0x50200400000020 nid: 10.141.6.236@o2ib417 remote: 0xd532d2c8ba1b0d1 expref: 97 pid: 8199 timeout: 0 lvb_type: 0
00010000:00010000:7.0:1599839141.507269:0:8199:0:(ldlm_lockd.c:465:ldlm_add_waiting_lock()) ### adding to wait list(timeout: 412, AT: on) ns: mdt-nbptest3-MDT0000_UUID lock: ffff99b56d30a1c0/0x8e13058be0aade00 lrc: 5/0,0 mode: CR/CR res: [0x200019271:0x2:0x0].0x0 bits 0x8/0x0 rrc: 489 type: IBT flags: 0x70200400000020 nid: 10.141.6.236@o2ib417 remote: 0xd532d2c8ba1b0d1 expref: 97 pid: 8199 timeout: 1350 lvb_type: 0
00000100:00100000:7.0:1599839141.507276:0:8199:0:(client.c:1630:ptlrpc_send_new_req()) Sending RPC pname:cluuid:pid:xid:nid:opc mdt00_012:nbptest3-MDT0000_UUID:8199:1677552166027200:10.141.6.236@o2ib417:104
00000400:00000200:7.0:1599839141.507285:0:8199:0:(lib-move.c:4684:LNetPut()) LNetPut -&amp;gt; 12345-10.141.6.236@o2ib417
00000400:00000200:7.0:1599839141.507288:0:8199:0:(lib-move.c:2478:lnet_handle_send_case_locked()) Source ANY to NMR:  10.141.6.236@o2ib417 routed destination
00000400:00000200:7.0:1599839141.507296:0:8199:0:(lib-move.c:1752:lnet_handle_send()) TRACE: 10.151.27.53@o2ib(10.151.27.53@o2ib:&amp;lt;?&amp;gt;) -&amp;gt; 10.141.6.236@o2ib417(10.141.6.236@o2ib417:10.151.26.234@o2ib) : PUT &lt;span class=&quot;code-keyword&quot;&gt;try&lt;/span&gt;# 0
00000100:00000200:7.0:1599839141.507889:0:8199:0:(events.c:93:reply_in_callback()) @@@ type 6, status 0  req@ffff99b5661a0900 x1677552166027200/t0(0) o104-&amp;gt;nbptest3-MDT0000@10.141.6.236@o2ib417:15/16 lens 296/224 e 0 to 0 dl 1599839764 ref 1 fl Rpc:R/0/ffffffff rc 0/-1
00000100:00000200:7.0:1599839141.507895:0:8199:0:(events.c:114:reply_in_callback()) @@@ unlink  req@ffff99b5661a0900 x1677552166027200/t0(0) o104-&amp;gt;nbptest3-MDT0000@10.141.6.236@o2ib417:15/16 lens 296/224 e 0 to 0 dl 1599839764 ref 1 fl Rpc:R/0/ffffffff rc 0/-1
00000100:00100000:7.0:1599839141.507904:0:8199:0:(client.c:2060:ptlrpc_check_set()) Completed RPC pname:cluuid:pid:xid:nid:opc mdt00_012:nbptest3-MDT0000_UUID:8199:1677552166027200:10.141.6.236@o2ib417:104
00010000:00010000:7.0:1599839141.507912:0:8199:0:(ldlm_lock.c:210:ldlm_lock_put()) ### &lt;span class=&quot;code-keyword&quot;&gt;final&lt;/span&gt; lock_put on destroyed lock, freeing it. ns: mdt-nbptest3-MDT0000_UUID lock: ffff99b56d30f740/0x8e13058be0aaddf2 lrc: 0/0,0 mode: --/EX res: [0x200019271:0x2:0x0].0x0 bits 0x8/0x0 rrc: 489 type: IBT flags: 0x44000000000000 nid: 10.141.6.236@o2ib417 remote: 0xd532d2c8ba1b0d1 expref: 97 pid: 8199 timeout: 0 lvb_type: 3
00010000:00010000:7.0:1599839141.507918:0:8199:0:(ldlm_lockd.c:465:ldlm_add_waiting_lock()) ### not re-adding to wait list(timeout: 412, AT: on) ns: mdt-nbptest3-MDT0000_UUID lock: ffff99b56d30a1c0/0x8e13058be0aade00 lrc: 4/0,0 mode: CR/CR res: [0x200019271:0x2:0x0].0x0 bits 0x8/0x0 rrc: 488 type: IBT flags: 0x70200400000020 nid: 10.141.6.236@o2ib417 remote: 0xd532d2c8ba1b0d1 expref: 96 pid: 8199 timeout: 1350 lvb_type: 0
00010000:00010000:7.0:1599839141.507924:0:8199:0:(ldlm_lockd.c:1401:ldlm_handle_enqueue0()) ### server-side enqueue handler, sending reply(err=0, rc=0) ns: mdt-nbptest3-MDT0000_UUID lock: ffff99b56d30a1c0/0x8e13058be0aade00 lrc: 4/0,0 mode: CR/CR res: [0x200019271:0x2:0x0].0x0 bits 0x8/0x0 rrc: 488 type: IBT flags: 0x60200400000020 nid: 10.141.6.236@o2ib417 remote: 0xd532d2c8ba1b0d1 expref: 96 pid: 8199 timeout: 1350 lvb_type: 0
00010000:00000200:7.0:1599839141.507935:0:8199:0:(ldlm_lib.c:2967:target_send_reply_msg()) @@@ sending reply  req@ffff99b5702d2d00 x1677516826795584/t201863467188(0) o101-&amp;gt;9308d1b4-ec54-b743-7579-5daa9b33d4cd@10.141.6.236@o2ib417:131/0 lens 376/1032 e 0 to 0 dl 1599839846 ref 1 fl Interpret:/0/0 rc 0/0
00000400:00000200:7.0:1599839141.507944:0:8199:0:(lib-move.c:4684:LNetPut()) LNetPut -&amp;gt; 12345-10.141.6.236@o2ib417
00000400:00000200:7.0:1599839141.507946:0:8199:0:(lib-move.c:2478:lnet_handle_send_case_locked()) Source Specified: 10.151.27.53@o2ib to NMR:  10.141.6.236@o2ib417 routed destination
00000400:00000200:7.0:1599839141.507955:0:8199:0:(lib-move.c:1752:lnet_handle_send()) TRACE: 10.151.27.53@o2ib(10.151.27.53@o2ib:10.151.27.53@o2ib) -&amp;gt; 10.141.6.236@o2ib417(10.141.6.236@o2ib417:10.151.26.236@o2ib) : PUT &lt;span class=&quot;code-keyword&quot;&gt;try&lt;/span&gt;# 0
00000100:00100000:7.0:1599839141.507968:0:8199:0:(service.c:2190:ptlrpc_server_handle_request()) Handled RPC pname:cluuid+ref:pid:xid:nid:opc mdt00_012:9308d1b4-ec54-b743-7579-5daa9b33d4cd+96:4780:x1677516826795584:12345-10.141.6.236@o2ib417:101 Request processed in 26145us (26195us total) trans 201863467188 rc 0/0
00000100:00100000:7.0:1599839141.507973:0:8199:0:(nrs_fifo.c:241:nrs_fifo_req_stop()) NRS stop fifo request from 12345-10.141.6.236@o2ib417, seq: 2361
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;*The router gets the requests and (tries) to send them out. I wasn&apos;t sure if they actually make it out of the router. *&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
00000800:00000200:71.0:1599839141.508041:0:25250:0:(o2iblnd_cb.c:343:kiblnd_handle_rx()) Received d1[0] from 10.151.27.53@o2ib
00000400:00000200:71.0:1599839141.508045:0:25250:0:(lib-move.c:4190:lnet_parse()) TRACE: 10.141.6.236@o2ib417(10.151.26.236@o2ib) &amp;lt;- 10.151.27.53@o2ib : PUT - routed
00000400:00000200:71.0:1599839141.508050:0:25250:0:(lib-msg.c:918:lnet_is_health_check()) health check = 0, status = 0, hstatus = 0
00000400:00000200:71.0:1599839141.508053:0:25250:0:(lib-move.c:2478:lnet_handle_send_case_locked()) Source ANY to NMR:  10.141.6.236@o2ib417 local destination
00000400:00000200:71.0:1599839141.508061:0:25250:0:(lib-move.c:1752:lnet_handle_send()) TRACE: 10.151.27.53@o2ib(10.141.26.236@o2ib417:&amp;lt;?&amp;gt;) -&amp;gt; 10.141.6.236@o2ib417(10.141.6.236@o2ib417:10.141.6.236@o2ib417) : PUT &lt;span class=&quot;code-keyword&quot;&gt;try&lt;/span&gt;# 0
00000800:00000200:71.0:1599839141.508065:0:25250:0:(o2iblnd_cb.c:1663:kiblnd_send()) sending 1000 bytes in 1 frags to 12345-10.141.6.236@o2ib417
00000800:00000200:71.0:1599839141.508071:0:25250:0:(o2iblnd.c:405:kiblnd_find_peer_locked()) got peer_ni [ffff9c5a323c9d80] -&amp;gt; 10.141.6.236@o2ib417 (2) version: 12
00000800:00000200:71.0:1599839141.508072:0:25250:0:(o2iblnd_cb.c:1543:kiblnd_launch_tx()) conn[ffff9c4700f2ea00] (68)++
00000800:00000200:71.0:1599839141.508074:0:25250:0:(o2iblnd_cb.c:1316:kiblnd_queue_tx_locked()) conn[ffff9c4700f2ea00] (69)++
00000800:00000200:71.0:1599839141.508077:0:25250:0:(o2iblnd_cb.c:1549:kiblnd_launch_tx()) conn[ffff9c4700f2ea00] (70)--
00000800:00000200:71.0:1599839141.508078:0:25250:0:(o2iblnd_cb.c:205:kiblnd_post_rx()) conn[ffff9c5b223f4200] (69)++
00000800:00000200:71.0:1599839141.508080:0:25250:0:(o2iblnd_cb.c:239:kiblnd_post_rx()) conn[ffff9c5b223f4200] (70)--
00000800:00000200:71.0:1599839141.508081:0:25250:0:(o2iblnd_cb.c:3875:kiblnd_scheduler()) conn[ffff9c5b223f4200] (69)--
00000800:00000200:71.0:1599839141.508457:0:25250:0:(o2iblnd_cb.c:3859:kiblnd_scheduler()) conn[ffff9c5ad91c0200] (70)++
00000800:00000200:71.0:1599839141.508461:0:25250:0:(o2iblnd_cb.c:3875:kiblnd_scheduler()) conn[ffff9c5ad91c0200] (71)--
00000800:00000200:71.0:1599839141.508462:0:25250:0:(o2iblnd_cb.c:3859:kiblnd_scheduler()) conn[ffff9c5ad91c0200] (70)++
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;There is no completion for conn&lt;span class=&quot;error&quot;&gt;&amp;#91;ffff9c5ad91c0200&amp;#93;&lt;/span&gt;.&lt;/p&gt;</comment>
                            <comment id="279385" author="ashehata" created="Fri, 11 Sep 2020 20:36:49 +0000"  >&lt;p&gt;I think if we have a script to monitor the statistics on the router and prove that we have increment of drop counters or other errors around the time of the issue that would be helpful.&lt;/p&gt;

&lt;p&gt;Looking at the stats, it appears that some of the routers have a very high number of drops, &amp;gt;12K. Do you think that would indicate a problem? The stats can&apos;t be cleared, so these numbers are from the time the routers came up. But a script can be written to figure out the delta, and see if they get incremented in bursts.&lt;/p&gt;

&lt;p&gt;I&apos;m also seeing a lot of:&lt;/p&gt;

&lt;p&gt;remote_dropped_count in the stats show.&lt;/p&gt;

&lt;p&gt;There are only 2 places where that could get incremented.&lt;/p&gt;
&lt;ol&gt;
	&lt;li&gt;If the tx completes with failure. Basically message is posted but the underlying stack fails to complete the message&lt;/li&gt;
	&lt;li&gt;838 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;CNETERR(&quot;Dropping message for %s: peer not alive\n&quot;,&lt;br/&gt;
 839 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;libcfs_id2str(msg-&amp;gt;msg_target));&lt;br/&gt;
 840 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;msg-&amp;gt;msg_health_status = LNET_MSG_STATUS_REMOTE_DROPPED;&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;For 2 that should be easy to verify. On the router logs, do you see that message?&lt;/p&gt;

&lt;p&gt;I think that could explain why some of the messages are not making it to the clients. They could be dropped by the router.&lt;/p&gt;</comment>
                            <comment id="279590" author="mhanafi" created="Tue, 15 Sep 2020 06:01:01 +0000"  >&lt;p&gt;Amir,&lt;/p&gt;

&lt;p&gt;I re-ran the test with saving the router stats before and after. None of the drop counter were incremented. Please look at the router debug logs from the last set uploaded to see if you can find anything unusual.&lt;/p&gt;</comment>
                            <comment id="280430" author="mhanafi" created="Wed, 23 Sep 2020 18:28:08 +0000"  >&lt;p&gt;I have been trying to re-test but have been unable to reproduce the issue. I will keep trying.&lt;/p&gt;</comment>
                            <comment id="281582" author="mhanafi" created="Tue, 6 Oct 2020 17:59:44 +0000"  >&lt;p&gt;Patch#4 appears stable - I haven&apos;t seen any more hung threads.&#160;&lt;/p&gt;</comment>
                            <comment id="283338" author="gerrit" created="Tue, 27 Oct 2020 05:36:57 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/40412&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/40412&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13692&quot; title=&quot;MDS slow/hung threads at mdt_object_local_lock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13692&quot;&gt;&lt;del&gt;LU-13692&lt;/del&gt;&lt;/a&gt; ldlm: Ensure we reprocess the resource on ast error&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: ac40c31fd49b17ad809652cbd85d39617906d213&lt;/p&gt;</comment>
                            <comment id="283568" author="gerrit" created="Thu, 29 Oct 2020 04:37:58 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/39598/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/39598/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13692&quot; title=&quot;MDS slow/hung threads at mdt_object_local_lock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13692&quot;&gt;&lt;del&gt;LU-13692&lt;/del&gt;&lt;/a&gt; ldlm: Ensure we reprocess the resource on ast error&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 24e3b5395bc61333a32b1e9725a0d7273925ef05&lt;/p&gt;</comment>
                            <comment id="283611" author="pjones" created="Thu, 29 Oct 2020 12:06:18 +0000"  >&lt;p&gt;Landed for 2.14&lt;/p&gt;</comment>
                            <comment id="284175" author="gerrit" created="Tue, 3 Nov 2020 20:09:12 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/40412/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/40412/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13692&quot; title=&quot;MDS slow/hung threads at mdt_object_local_lock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13692&quot;&gt;&lt;del&gt;LU-13692&lt;/del&gt;&lt;/a&gt; ldlm: Ensure we reprocess the resource on ast error&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: d3dfcba36ea04809d09d38ccd97da053be85404c&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                                        </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="59006">LU-13500</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="61325">LU-14069</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="35216" name="lustre-log.1592343544.8525" size="4981410" author="mhanafi" created="Thu, 18 Jun 2020 17:11:07 +0000"/>
                            <attachment id="35215" name="messages" size="52678" author="mhanafi" created="Thu, 18 Jun 2020 17:10:51 +0000"/>
                            <attachment id="35214" name="vmcore-dmesg.txt" size="1045495" author="mhanafi" created="Thu, 18 Jun 2020 17:10:53 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i0132f:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10021"><![CDATA[2]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>