<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:49:49 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
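An example request restricted to those two fields (URL pattern assumed from JIRA's standard XML issue view, shown for illustration):
https://jira.whamcloud.com/si/jira.issueviews:issue-xml/LU-12120/LU-12120.xml?field=key&field=summary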
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-12120] LustreError: 15069:0:(tgt_grant.c:561:tgt_grant_incoming()) LBUG </title>
                <link>https://jira.whamcloud.com/browse/LU-12120</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This is the first time we have seen this LBUG. We have a crash dump.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
[11168227.609396] Lustre: Skipped 4149 previous similar messages
[11168546.559600] LustreError: 15069:0:(tgt_grant.c:559:tgt_grant_incoming()) nbp2-OST0121: cli 811dc111-6b22-b447-9144-bcd1173a571d/ffff881d57bff000 dirty -610055168 pend 0 grant -643806208
[11168546.609562] LustreError: 15069:0:(tgt_grant.c:561:tgt_grant_incoming()) LBUG
[11168546.631425] Pid: 15069, comm: ll_ost_io01_084 3.10.0-693.21.1.el7.20180508.x86_64.lustre2105 #1 SMP Mon Aug 27 23:04:41 UTC 2018
[11168546.631428] Call Trace:
[11168546.631439]  [&amp;lt;ffffffff8103a1f2&amp;gt;] save_stack_trace_tsk+0x22/0x40
[11168546.650223]  [&amp;lt;ffffffffa08597cc&amp;gt;] libcfs_call_trace+0x8c/0xc0 [libcfs]
[11168546.650228]  [&amp;lt;ffffffffa085987c&amp;gt;] lbug_with_loc+0x4c/0xa0 [libcfs]
[11168546.650292]  [&amp;lt;ffffffffa0e8c270&amp;gt;] tgt_grant_prepare_read+0x0/0x3b0 [ptlrpc]
[11168546.650325]  [&amp;lt;ffffffffa0e8c37b&amp;gt;] tgt_grant_prepare_read+0x10b/0x3b0 [ptlrpc]
[11168546.650335]  [&amp;lt;ffffffffa0c86590&amp;gt;] ofd_preprw+0x450/0x1170 [ofd]
[11168546.650365]  [&amp;lt;ffffffffa0e702a5&amp;gt;] tgt_brw_read+0x975/0x1860 [ptlrpc]
[11168546.650394]  [&amp;lt;ffffffffa0e6deca&amp;gt;] tgt_request_handle+0x92a/0x1370 [ptlrpc]
[11168546.650420]  [&amp;lt;ffffffffa0e164bb&amp;gt;] ptlrpc_server_handle_request+0x23b/0xaa0 [ptlrpc]
[11168546.650445]  [&amp;lt;ffffffffa0e1a4a2&amp;gt;] ptlrpc_main+0xa92/0x1e40 [ptlrpc]
[11168546.650449]  [&amp;lt;ffffffff810b1131&amp;gt;] kthread+0xd1/0xe0
[11168546.650452]  [&amp;lt;ffffffff816a14f7&amp;gt;] ret_from_fork+0x77/0xb0
[11168546.650468]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
[11168546.650469] Kernel panic - not syncing: LBUG 
 &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="55257">LU-12120</key>
            <summary>LustreError: 15069:0:(tgt_grant.c:561:tgt_grant_incoming()) LBUG </summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="pfarrell">Patrick Farrell</assignee>
                                    <reporter username="mhanafi">Mahmoud Hanafi</reporter>
                        <labels>
                    </labels>
                <created>Tue, 26 Mar 2019 16:14:41 +0000</created>
                <updated>Fri, 7 May 2021 03:27:48 +0000</updated>
                            <resolved>Tue, 4 Jun 2019 12:01:04 +0000</resolved>
                                    <version>Lustre 2.10.5</version>
                                    <fixVersion>Lustre 2.13.0</fixVersion>
                    <fixVersion>Lustre 2.12.3</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>14</watches>
                                                                            <comments>
                            <comment id="244673" author="pfarrell" created="Tue, 26 Mar 2019 16:24:32 +0000"  >&lt;p&gt;Mahmoud,&lt;/p&gt;

&lt;p&gt;Can you attach dmesg from the crash dump to this ticket?&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;edit&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;Ah, sorry - I think you&apos;ll need to upload this to the FTP site, as per usual for your stuff.&#160; I&apos;m a US citizen, so I should be good to do your usual.&lt;/p&gt;</comment>
                            <comment id="244762" author="mhanafi" created="Wed, 27 Mar 2019 18:31:53 +0000"  >&lt;p&gt;There is not much in dmesg. if you need the crash dump i can upload that.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;

[11157553.559563] Lustre: nbp2-OST0009: Connection restored to  (at 10.151.47.247@o2ib)
[11157553.559567] Lustre: Skipped 2229 previous similar messages
[11158158.187717] Lustre: nbp2-OST00e5: Connection restored to  (at 10.151.11.61@o2ib)
[11158158.187721] Lustre: Skipped 2109 previous similar messages
[11158781.287068] Lustre: nbp2-OST0031: Connection restored to  (at 10.151.10.92@o2ib)
[11158781.287072] Lustre: Skipped 89 previous similar messages
[11159441.850269] Lustre: nbp2-OST0009: Connection restored to  (at 10.151.11.61@o2ib)
[11159441.850273] Lustre: Skipped 91 previous similar messages
[11160063.955535] Lustre: nbp2-OST0009: Connection restored to  (at 10.151.11.50@o2ib)
[11160063.955539] Lustre: Skipped 1131 previous similar messages
[11160670.013646] Lustre: nbp2-OST0009: Connection restored to  (at 10.151.11.62@o2ib)
[11160670.013651] Lustre: Skipped 1944 previous similar messages
[11161386.121609] Lustre: nbp2-OST00d1: Connection restored to 48a54ddc-bf2a-1d44-4076-327f70b4e827 (at 10.151.1.239@o2ib)
[11161386.121614] Lustre: Skipped 594 previous similar messages
[11162015.156665] Lustre: nbp2-OST0009: Connection restored to  (at 10.151.32.210@o2ib)
[11162015.156670] Lustre: Skipped 74 previous similar messages
[11162637.629822] Lustre: nbp2-OST00d1: Connection restored to  (at 10.151.11.60@o2ib)
[11162637.629826] Lustre: Skipped 1555 previous similar messages
[11163270.344192] Lustre: nbp2-OST0009: Connection restored to 9b31c728-933f-18fc-bcda-5b799a087513 (at 10.151.1.242@o2ib)
[11163270.344197] Lustre: Skipped 1528 previous similar messages
[11163888.595384] Lustre: nbp2-OST00e5: Connection restored to  (at 10.151.11.57@o2ib)
[11163888.595388] Lustre: Skipped 288 previous similar messages
[11164490.836549] Lustre: nbp2-OST006d: Connection restored to c856fa67-0701-b7f3-02cf-f69db41e8d1d (at 10.151.1.240@o2ib)
[11164490.836554] Lustre: Skipped 1637 previous similar messages
[11165102.952430] Lustre: nbp2-OST0045: Connection restored to  (at 10.151.11.62@o2ib)
[11165102.952434] Lustre: Skipped 8443 previous similar messages
[11165310.128550] Lustre: nbp2-OST015d: haven&apos;t heard from client 192bb260-2e08-d556-5bbc-551d3c81b303 (at 10.153.17.23@o2ib233) in 227 seconds. I think it&apos;s dead, and I am evicting it. exp ffff881a0c74b400, cur 1553565825 expire 1553565675 last 1553565598
[11165310.195673] Lustre: Skipped 17 previous similar messages
[11165312.124432] Lustre: nbp2-OST0009: haven&apos;t heard from client 192bb260-2e08-d556-5bbc-551d3c81b303 (at 10.153.17.23@o2ib233) in 227 seconds. I think it&apos;s dead, and I am evicting it. exp ffff881f31954400, cur 1553565827 expire 1553565677 last 1553565600
[11165674.088234] Lustre: nbp2-OST0009: haven&apos;t heard from client e5d71d7a-c600-4ced-e3c1-f2fef9c9cd6e (at 10.153.16.135@o2ib233) in 227 seconds. I think it&apos;s dead, and I am evicting it. exp ffff8806b193f800, cur 1553566189 expire 1553566039 last 1553565962
[11165674.155621] Lustre: Skipped 16 previous similar messages
[11165685.085120] Lustre: nbp2-OST001d: haven&apos;t heard from client e5d71d7a-c600-4ced-e3c1-f2fef9c9cd6e (at 10.153.16.135@o2ib233) in 227 seconds. I think it&apos;s dead, and I am evicting it. exp ffff8819ea256400, cur 1553566200 expire 1553566050 last 1553565973
[11165685.152521] Lustre: Skipped 16 previous similar messages
[11165702.912656] Lustre: nbp2-OST0009: Connection restored to e9cef456-58ec-d630-9d0f-ff38ed086c7c (at 10.151.54.96@o2ib)
[11165702.912660] Lustre: Skipped 2391 previous similar messages
[11165999.048926] Lustre: nbp2-OST0081: haven&apos;t heard from client a6d6cf05-8260-34cd-b6d2-dc585513fd8b (at 10.153.16.179@o2ib233) in 227 seconds. I think it&apos;s dead, and I am evicting it. exp ffff8805c91b9800, cur 1553566514 expire 1553566364 last 1553566287
[11166363.594085] Lustre: nbp2-OST001d: Connection restored to 122590e3-9341-2c74-ed0b-68dfd0a810e0 (at 10.151.1.236@o2ib)
[11166363.594089] Lustre: Skipped 1438 previous similar messages
[11166981.021408] Lustre: nbp2-OST006d: Connection restored to c856fa67-0701-b7f3-02cf-f69db41e8d1d (at 10.151.1.240@o2ib)
[11166981.021413] Lustre: Skipped 566 previous similar messages
[11167620.549034] Lustre: nbp2-OST0081: Connection restored to 198b10d9-fc0c-7da5-dd67-f2e64d011f82 (at 10.151.2.18@o2ib)
[11167620.549038] Lustre: Skipped 190 previous similar messages
[11168227.609392] Lustre: nbp2-OST00bd: Connection restored to 9b31c728-933f-18fc-bcda-5b799a087513 (at 10.151.1.242@o2ib)
[11168227.609396] Lustre: Skipped 4149 previous similar messages
[11168546.559600] LustreError: 15069:0:(tgt_grant.c:559:tgt_grant_incoming()) nbp2-OST0121: cli 811dc111-6b22-b447-9144-bcd1173a571d/ffff881d57bff000 dirty -610055168 pend 0 grant -643806208
[11168546.609562] LustreError: 15069:0:(tgt_grant.c:561:tgt_grant_incoming()) LBUG
[11168546.631425] Pid: 15069, comm: ll_ost_io01_084 3.10.0-693.21.1.el7.20180508.x86_64.lustre2105 #1 SMP Mon Aug 27 23:04:41 UTC 2018
[11168546.631428] Call Trace:
[11168546.631439]  [&amp;lt;ffffffff8103a1f2&amp;gt;] save_stack_trace_tsk+0x22/0x40
[11168546.650223]  [&amp;lt;ffffffffa08597cc&amp;gt;] libcfs_call_trace+0x8c/0xc0 [libcfs]
[11168546.650228]  [&amp;lt;ffffffffa085987c&amp;gt;] lbug_with_loc+0x4c/0xa0 [libcfs]
[11168546.650292]  [&amp;lt;ffffffffa0e8c270&amp;gt;] tgt_grant_prepare_read+0x0/0x3b0 [ptlrpc]
[11168546.650325]  [&amp;lt;ffffffffa0e8c37b&amp;gt;] tgt_grant_prepare_read+0x10b/0x3b0 [ptlrpc]
[11168546.650335]  [&amp;lt;ffffffffa0c86590&amp;gt;] ofd_preprw+0x450/0x1170 [ofd]
[11168546.650365]  [&amp;lt;ffffffffa0e702a5&amp;gt;] tgt_brw_read+0x975/0x1860 [ptlrpc]
[11168546.650394]  [&amp;lt;ffffffffa0e6deca&amp;gt;] tgt_request_handle+0x92a/0x1370 [ptlrpc]
[11168546.650420]  [&amp;lt;ffffffffa0e164bb&amp;gt;] ptlrpc_server_handle_request+0x23b/0xaa0 [ptlrpc]
[11168546.650445]  [&amp;lt;ffffffffa0e1a4a2&amp;gt;] ptlrpc_main+0xa92/0x1e40 [ptlrpc]
[11168546.650449]  [&amp;lt;ffffffff810b1131&amp;gt;] kthread+0xd1/0xe0
[11168546.650452]  [&amp;lt;ffffffff816a14f7&amp;gt;] ret_from_fork+0x77/0xb0
[11168546.650468]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
[11168546.650469] Kernel panic - not syncing: LBUG
[11168546.650472] CPU: 18 PID: 15069 Comm: ll_ost_io01_084 Tainted: G           OE  ------------   3.10.0-693.21.1.el7.20180508.x86_64.lustre2105 #1
[11168546.650472] Hardware name: SGI.COM CH-C2112-GP2/X10DRU-i+, BIOS 1.0b 05/08/2015
[11168546.650473] Call Trace:
[11168546.650477]  [&amp;lt;ffffffff8168f4b8&amp;gt;] dump_stack+0x19/0x1b
[11168546.650480]  [&amp;lt;ffffffff81689aa2&amp;gt;] panic+0xe8/0x21f
[11168546.650486]  [&amp;lt;ffffffffa08598cb&amp;gt;] lbug_with_loc+0x9b/0xa0 [libcfs]
[11168546.650515]  [&amp;lt;ffffffffa0e8c270&amp;gt;] tgt_grant_incoming.isra.5+0x540/0x540 [ptlrpc]
[11168546.650544]  [&amp;lt;ffffffffa0e8c37b&amp;gt;] tgt_grant_prepare_read+0x10b/0x3b0 [ptlrpc]
[11168546.650549]  [&amp;lt;ffffffffa0c86590&amp;gt;] ofd_preprw+0x450/0x1170 [ofd]
[11168546.650576]  [&amp;lt;ffffffffa0e3082f&amp;gt;] ? __req_capsule_get+0x15f/0x740 [ptlrpc]
[11168546.650605]  [&amp;lt;ffffffffa0e702a5&amp;gt;] tgt_brw_read+0x975/0x1860 [ptlrpc]
[11168546.650632]  [&amp;lt;ffffffffa099c709&amp;gt;] ? lprocfs_counter_add+0xf9/0x160 [obdclass]
[11168546.650658]  [&amp;lt;ffffffffa0e42fb6&amp;gt;] ? null_alloc_rs+0x176/0x330 [ptlrpc]
[11168546.650683]  [&amp;lt;ffffffffa0e0b04f&amp;gt;] ? lustre_pack_reply_v2+0x14f/0x280 [ptlrpc]
[11168546.650707]  [&amp;lt;ffffffffa0e0b1ef&amp;gt;] ? lustre_pack_reply_flags+0x6f/0x1e0 [ptlrpc]
[11168546.650731]  [&amp;lt;ffffffffa0e0b371&amp;gt;] ? lustre_pack_reply+0x11/0x20 [ptlrpc]
[11168546.650760]  [&amp;lt;ffffffffa0e6deca&amp;gt;] tgt_request_handle+0x92a/0x1370 [ptlrpc]
[11168546.650786]  [&amp;lt;ffffffffa0e164bb&amp;gt;] ptlrpc_server_handle_request+0x23b/0xaa0 [ptlrpc]
[11168546.650789]  [&amp;lt;ffffffff810bad2b&amp;gt;] ? __wake_up_common+0x5b/0x90
[11168546.650813]  [&amp;lt;ffffffffa0e1a4a2&amp;gt;] ptlrpc_main+0xa92/0x1e40 [ptlrpc]
[11168546.650837]  [&amp;lt;ffffffffa0e19a10&amp;gt;] ? ptlrpc_register_service+0xe30/0xe30 [ptlrpc]
[11168546.650839]  [&amp;lt;ffffffff810b1131&amp;gt;] kthread+0xd1/0xe0
[11168546.650841]  [&amp;lt;ffffffff810b1060&amp;gt;] ? insert_kthread_work+0x40/0x40
[11168546.650843]  [&amp;lt;ffffffff816a14f7&amp;gt;] ret_from_fork+0x77/0xb0
[11168546.650844]  [&amp;lt;ffffffff810b1060&amp;gt;] ? insert_kthread_work+0x40/0x40
 &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="244768" author="pfarrell" created="Wed, 27 Mar 2019 19:41:33 +0000"  >&lt;p&gt;Hmm, OK, yeah - I was looking for evictions, and I see them.&#160; There is a known (not yet resolved, but known) bug in the grant code that sometimes happens with evictions.&lt;/p&gt;

&lt;p&gt;While the LBUG is not good (we have a bug open to try to improve the behavior here in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11939&quot; title=&quot;ASSERTION( tgd-&amp;gt;tgd_tot_granted &amp;gt;= ted-&amp;gt;ted_grant ) on OSS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11939&quot;&gt;&lt;del&gt;LU-11939&lt;/del&gt;&lt;/a&gt;), the more serious problem is all of those (thousands of) lost connections and the evictions.&#160; Those look like they are probably network errors, and there are a &lt;b&gt;lot&lt;/b&gt; of them.&#160; Do you have any insight on why those are happening?&#160; If you can resolve those, you shouldn&apos;t see this crash again.&#160; (And, also, evictions will cause job failures.)&lt;/p&gt;

&lt;p&gt;Without more info, all I can say is they might be network failures.&lt;/p&gt;</comment>
                            <comment id="244781" author="mhanafi" created="Thu, 28 Mar 2019 02:56:06 +0000"  >&lt;p&gt;Yeah the evictions is part of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11644&quot; title=&quot;LNet: Service thread inactive for 300  causes client evictions &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11644&quot;&gt;LU-11644&lt;/a&gt;. We are planning on trying reverting &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9810&quot; title=&quot;Melanox OFED 4.1 support&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9810&quot;&gt;&lt;del&gt;LU-9810&lt;/del&gt;&lt;/a&gt; next week. &lt;/p&gt;</comment>
                            <comment id="245045" author="pfarrell" created="Mon, 1 Apr 2019 16:28:08 +0000"  >&lt;p&gt;Mahmoud,&lt;/p&gt;

&lt;p&gt;Sorry for the delayed reply, got lost in the shuffle.&lt;/p&gt;

&lt;p&gt;So as I noted, this crash is &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11939&quot; title=&quot;ASSERTION( tgd-&amp;gt;tgd_tot_granted &amp;gt;= ted-&amp;gt;ted_grant ) on OSS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11939&quot;&gt;&lt;del&gt;LU-11939&lt;/del&gt;&lt;/a&gt;, and this should go away once the evictions are resolved.&#160; Are you OK with that as a resolution for your issue?&#160; We will get &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11939&quot; title=&quot;ASSERTION( tgd-&amp;gt;tgd_tot_granted &amp;gt;= ted-&amp;gt;ted_grant ) on OSS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11939&quot;&gt;&lt;del&gt;LU-11939&lt;/del&gt;&lt;/a&gt; fixed (and we plan to put that in our 2.12 maintenance release), but it&apos;s not been a priority because it only happens in situations where severe problems are &lt;b&gt;already&lt;/b&gt; occurring (specifically, high numbers of evictions).&lt;/p&gt;</comment>
                            <comment id="246143" author="sthiell" created="Mon, 22 Apr 2019 04:23:51 +0000"  >&lt;p&gt;We hit this today on 6 OSS, several times with 2.12.0+patches. It was painful, it would be nice to avoid the LBUG. But like Patrick said, severe problems were already happening (DOM locking issue in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12037&quot; title=&quot;Possible DNE issue leading to hung filesystem&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12037&quot;&gt;&lt;del&gt;LU-12037&lt;/del&gt;&lt;/a&gt;/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11359&quot; title=&quot;racer test 1 times out with client hung in dir_create.sh, ls, &#8230; and MDS in ldlm_completion_ast()&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11359&quot;&gt;&lt;del&gt;LU-11359&lt;/del&gt;&lt;/a&gt;).&lt;/p&gt;</comment>
                            <comment id="246171" author="sthiell" created="Mon, 22 Apr 2019 17:33:43 +0000"  >&lt;p&gt;Hit again after about 1 hour uptime:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[Mon Apr 22 10:20:58 2019][ 6643.068873] LustreError: 81562:0:(tgt_grant.c:563:tgt_grant_incoming()) fir-OST0011: cli b214bbaa-dfa8-5550-6c23-a4a7f34b2d1a/ffff970be7f44800 dirty 4169728 pend 0 grant -12582912
[Mon Apr 22 10:20:58 2019][ 6643.084869] LustreError: 81562:0:(tgt_grant.c:565:tgt_grant_incoming()) LBUG
[Mon Apr 22 10:20:58 2019][ 6643.091935] Pid: 81562, comm: ll_ost_io01_025 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018
[Mon Apr 22 10:20:58 2019][ 6643.102364] Call Trace:
[Mon Apr 22 10:20:58 2019][ 6643.104866]  [&amp;lt;ffffffffc09947cc&amp;gt;] libcfs_call_trace+0x8c/0xc0 [libcfs]
[Mon Apr 22 10:20:58 2019][ 6643.111440]  [&amp;lt;ffffffffc099487c&amp;gt;] lbug_with_loc+0x4c/0xa0 [libcfs]
[Mon Apr 22 10:20:58 2019][ 6643.117678]  [&amp;lt;ffffffffc0dd66f0&amp;gt;] tgt_grant_prepare_read+0x0/0x3b0 [ptlrpc]
[Mon Apr 22 10:20:58 2019][ 6643.124732]  [&amp;lt;ffffffffc0dd81d4&amp;gt;] tgt_grant_prepare_write+0x2f4/0xf70 [ptlrpc]
[Mon Apr 22 10:20:58 2019][ 6643.132032]  [&amp;lt;ffffffffc13aea9c&amp;gt;] ofd_preprw_write.isra.31+0x14c/0xe60 [ofd]
[Mon Apr 22 10:20:58 2019][ 6643.139121]  [&amp;lt;ffffffffc13afbd2&amp;gt;] ofd_preprw+0x422/0x1160 [ofd]
[Mon Apr 22 10:20:58 2019][ 6643.145075]  [&amp;lt;ffffffffc0db54be&amp;gt;] tgt_brw_write+0xc7e/0x1a90 [ptlrpc]
[Mon Apr 22 10:20:58 2019][ 6643.151584]  [&amp;lt;ffffffffc0db910a&amp;gt;] tgt_request_handle+0xaea/0x1580 [ptlrpc]
[Mon Apr 22 10:20:58 2019][ 6643.158566]  [&amp;lt;ffffffffc0d5d6db&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
[Mon Apr 22 10:20:58 2019][ 6643.166284]  [&amp;lt;ffffffffc0d6100c&amp;gt;] ptlrpc_main+0xafc/0x1fc0 [ptlrpc]
[Mon Apr 22 10:20:58 2019][ 6643.172657]  [&amp;lt;ffffffff8aac1c31&amp;gt;] kthread+0xd1/0xe0
[Mon Apr 22 10:20:58 2019][ 6643.177576]  [&amp;lt;ffffffff8b174c24&amp;gt;] ret_from_fork_nospec_begin+0xe/0x21
[Mon Apr 22 10:20:58 2019][ 6643.184081]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
[Mon Apr 22 10:20:58 2019][ 6643.189112] Kernel panic - not syncing: LBUG
[Mon Apr 22 10:20:58 2019][ 6643.193392] CPU: 13 PID: 81562 Comm: ll_ost_io01_025 Kdump: loaded Tainted: G           OE  ------------   3.10.0-957.1.3.el7_lustre.x86_64 #1
[Mon Apr 22 10:20:58 2019][ 6643.206155] Hardware name: Dell Inc. PowerEdge R6415/065PKD, BIOS 1.6.7 10/29/2018
[Mon Apr 22 10:20:58 2019][ 6643.213723] Call Trace:
[Mon Apr 22 10:20:58 2019][ 6643.216183]  [&amp;lt;ffffffff8b161e41&amp;gt;] dump_stack+0x19/0x1b
[Mon Apr 22 10:20:58 2019][ 6643.221326]  [&amp;lt;ffffffff8b15b550&amp;gt;] panic+0xe8/0x21f
[Mon Apr 22 10:20:59 2019][ 6643.226124]  [&amp;lt;ffffffffc09948cb&amp;gt;] lbug_with_loc+0x9b/0xa0 [libcfs]
[Mon Apr 22 10:20:59 2019][ 6643.232358]  [&amp;lt;ffffffffc0dd66f0&amp;gt;] tgt_grant_incoming.isra.6+0x540/0x540 [ptlrpc]
[Mon Apr 22 10:20:59 2019][ 6643.239788]  [&amp;lt;ffffffffc0dd81d4&amp;gt;] tgt_grant_prepare_write+0x2f4/0xf70 [ptlrpc]
[Mon Apr 22 10:20:59 2019][ 6643.247038]  [&amp;lt;ffffffffc0b18423&amp;gt;] ? lu_object_find_at+0x1b3/0x280 [obdclass]
[Mon Apr 22 10:20:59 2019][ 6643.254091]  [&amp;lt;ffffffffc13aea9c&amp;gt;] ofd_preprw_write.isra.31+0x14c/0xe60 [ofd]
[Mon Apr 22 10:20:59 2019][ 6643.261144]  [&amp;lt;ffffffffc13afbd2&amp;gt;] ofd_preprw+0x422/0x1160 [ofd]
[Mon Apr 22 10:20:59 2019][ 6643.267102]  [&amp;lt;ffffffffc0db54be&amp;gt;] tgt_brw_write+0xc7e/0x1a90 [ptlrpc]
[Mon Apr 22 10:20:59 2019][ 6643.273571]  [&amp;lt;ffffffffc0d4f0a7&amp;gt;] ? lustre_msg_buf+0x17/0x60 [ptlrpc]
[Mon Apr 22 10:20:59 2019][ 6643.280047]  [&amp;lt;ffffffffc0d76b4f&amp;gt;] ? __req_capsule_get+0x15f/0x740 [ptlrpc]
[Mon Apr 22 10:20:59 2019][ 6643.286957]  [&amp;lt;ffffffffc0af8279&amp;gt;] ? class_handle2object+0xb9/0x1c0 [obdclass]
[Mon Apr 22 10:20:59 2019][ 6643.294095]  [&amp;lt;ffffffff8aae011c&amp;gt;] ? update_curr+0x14c/0x1e0
[Mon Apr 22 10:20:59 2019][ 6643.299673]  [&amp;lt;ffffffff8aadca7e&amp;gt;] ? account_entity_dequeue+0xae/0xd0
[Mon Apr 22 10:20:59 2019][ 6643.306025]  [&amp;lt;ffffffff8aadb738&amp;gt;] ? __enqueue_entity+0x78/0x80
[Mon Apr 22 10:20:59 2019][ 6643.311859]  [&amp;lt;ffffffff8b165e92&amp;gt;] ? mutex_lock+0x12/0x2f
[Mon Apr 22 10:20:59 2019][ 6643.317212]  [&amp;lt;ffffffffc0db910a&amp;gt;] tgt_request_handle+0xaea/0x1580 [ptlrpc]
[Mon Apr 22 10:20:59 2019][ 6643.324118]  [&amp;lt;ffffffffc0d92801&amp;gt;] ? ptlrpc_nrs_req_get_nolock0+0xd1/0x170 [ptlrpc]
[Mon Apr 22 10:20:59 2019][ 6643.331694]  [&amp;lt;ffffffffc0994bde&amp;gt;] ? ktime_get_real_seconds+0xe/0x10 [libcfs]
[Mon Apr 22 10:20:59 2019][ 6643.338772]  [&amp;lt;ffffffffc0d5d6db&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
[Mon Apr 22 10:20:59 2019][ 6643.346462]  [&amp;lt;ffffffffc0d5a565&amp;gt;] ? ptlrpc_wait_event+0xa5/0x360 [ptlrpc]
[Mon Apr 22 10:20:59 2019][ 6643.353251]  [&amp;lt;ffffffff8aad67c2&amp;gt;] ? default_wake_function+0x12/0x20
[Mon Apr 22 10:20:59 2019][ 6643.359519]  [&amp;lt;ffffffff8aacba9b&amp;gt;] ? __wake_up_common+0x5b/0x90
[Mon Apr 22 10:20:59 2019][ 6643.365380]  [&amp;lt;ffffffffc0d6100c&amp;gt;] ptlrpc_main+0xafc/0x1fc0 [ptlrpc]
[Mon Apr 22 10:20:59 2019][ 6643.371679]  [&amp;lt;ffffffffc0d60510&amp;gt;] ? ptlrpc_register_service+0xf80/0xf80 [ptlrpc]
[Mon Apr 22 10:20:59 2019][ 6643.379080]  [&amp;lt;ffffffff8aac1c31&amp;gt;] kthread+0xd1/0xe0
[Mon Apr 22 10:20:59 2019][ 6643.383957]  [&amp;lt;ffffffff8aac1b60&amp;gt;] ? insert_kthread_work+0x40/0x40
[Mon Apr 22 10:20:59 2019][ 6643.390052]  [&amp;lt;ffffffff8b174c24&amp;gt;] ret_from_fork_nospec_begin+0xe/0x21
[Mon Apr 22 10:20:59 2019][ 6643.396497]  [&amp;lt;ffffffff8aac1b60&amp;gt;] ? insert_kthread_work+0x40/0x40
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;The filesystem was down last night due to this LBUG; in the morning we ran an fsck on this OST (with e2fsprogs-1.44.5.wc1-0.el7.x86_64), which fixed the following things:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Apr 22 08:32:31 fir-io2-s2 journal: Shine.Lustre.Actions.Fsck INFO  fir-OST0011      Starting fsck
Apr 22 08:32:32 fir-io2-s2 journal: Shine.Lustre.Actions.Fsck INFO  fir-OST0011      MMP interval is 10 seconds and total wait time is 42 seconds. Please wait...
Apr 22 08:33:15 fir-io2-s2 journal: Shine.Lustre.Actions.Fsck INFO  fir-OST0011      fir-OST0011: recovering journal
Apr 22 08:33:42 fir-io2-s2 journal: Shine.Lustre.Actions.Fsck INFO  fir-OST0011      Pass 1: Checking inodes, blocks, and sizes
Apr 22 08:36:28 fir-io2-s2 journal: Shine.Lustre.Actions.Fsck INFO  fir-OST0011      Pass 2: Checking directory structure
Apr 22 08:37:14 fir-io2-s2 journal: Shine.Lustre.Actions.Fsck INFO  fir-OST0011      Pass 3: Checking directory connectivity
Apr 22 08:37:17 fir-io2-s2 journal: Shine.Lustre.Actions.Fsck INFO  fir-OST0011      Pass 4: Checking reference counts
Apr 22 08:37:25 fir-io2-s2 journal: Shine.Lustre.Actions.Fsck INFO  fir-OST0011      Pass 5: Checking group summary information
Apr 22 08:37:38 fir-io2-s2 journal: Shine.Lustre.Actions.Fsck INFO  fir-OST0011      Free blocks count wrong (10628562260, counted=10628555100).
Apr 22 08:37:38 fir-io2-s2 journal: Shine.Lustre.Actions.Fsck INFO  fir-OST0011      Fix? yes
Apr 22 08:37:38 fir-io2-s2 journal: Shine.Lustre.Actions.Fsck INFO  fir-OST0011
Apr 22 08:37:46 fir-io2-s2 journal: Shine.Lustre.Actions.Fsck INFO  fir-OST0011      Free inodes count wrong (53346944, counted=53345769).
Apr 22 08:37:46 fir-io2-s2 journal: Shine.Lustre.Actions.Fsck INFO  fir-OST0011      Fix? yes
Apr 22 08:37:46 fir-io2-s2 journal: Shine.Lustre.Actions.Fsck INFO  fir-OST0011
Apr 22 08:37:46 fir-io2-s2 journal: Shine.Lustre.Actions.Fsck INFO  fir-OST0011      Update quota info for quota type 0? yes
Apr 22 08:37:46 fir-io2-s2 journal: Shine.Lustre.Actions.Fsck INFO  fir-OST0011
Apr 22 08:37:47 fir-io2-s2 journal: Shine.Lustre.Actions.Fsck INFO  fir-OST0011      Update quota info for quota type 1? yes
Apr 22 08:37:47 fir-io2-s2 journal: Shine.Lustre.Actions.Fsck INFO  fir-OST0011
Apr 22 08:38:13 fir-io2-s2 journal: Shine.Lustre.Actions.Fsck INFO  fir-OST0011
Apr 22 08:38:13 fir-io2-s2 journal: Shine.Lustre.Actions.Fsck INFO  fir-OST0011      fir-OST0011: ***** FILE SYSTEM WAS MODIFIED *****
Apr 22 08:38:13 fir-io2-s2 journal: Shine.Lustre.Actions.Fsck INFO  fir-OST0011      fir-OST0011: 7700375/61046144 files (4.1% non-contiguous), 4999233699/15627788800 blocks
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;But it just crashed anyway. Any recommendations are welcome. We&apos;re running 2.12.0 + the following patches on all servers:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2eec4f8 LU-12018 quota: do not start a thread under memory pressure
1819063 LU-11359 mdt: fix mdt_dom_discard_data() timeouts
3ed10f4 LU-11964 mdc: prevent glimpse lock count grow
565011c LU-12037 mdt: add option for cross-MDT rename
b6be1d9 LU-12065 lnd: increase CQ entries
6b2c97b LU-12037 mdt: call mdt_dom_discard_data() after rename unlock
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Those crashes seem to have started only after we applied &quot;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11359&quot; title=&quot;racer test 1 times out with client hung in dir_create.sh, ls, &#8230; and MDS in ldlm_completion_ast()&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11359&quot;&gt;&lt;del&gt;LU-11359&lt;/del&gt;&lt;/a&gt; mdt: fix mdt_dom_discard_data() timeouts&quot;, which seems very weird... not sure how it could be related...&lt;/p&gt;</comment>
                            <comment id="246173" author="sthiell" created="Mon, 22 Apr 2019 17:44:23 +0000"  >&lt;p&gt;Do you think disabling ost quota enforcement could help avoiding these LBUG (for now)?&lt;/p&gt;</comment>
                            <comment id="246174" author="sthiell" created="Mon, 22 Apr 2019 18:19:23 +0000"  >&lt;p&gt;We&apos;re actually investigating if this could be caused by &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11644&quot; title=&quot;LNet: Service thread inactive for 300  causes client evictions &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11644&quot;&gt;LU-11644&lt;/a&gt; reported by NASA, as I noticed a few similar call traces:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Apr 22 10:48:49 fir-io3-s2 kernel: LustreError: 78496:0:(ldlm_lockd.c:682:ldlm_handle_ast_error()) ### client (nid 10.9.101.28@o2ib4) failed to reply to blocking AST (req@ffff981d18ba8c00 x1631526698598368 status 0 rc -110), evict it ns: filter-fir-OST0023_UUID lock: ffff98016d218b40/0xd80148300515f99f lrc: 4/0,0 mo
Apr 22 10:48:49 fir-io3-s2 kernel: LustreError: 138-a: fir-OST001b: A client on nid 10.9.101.28@o2ib4 was evicted due to a lock blocking callback time out: rc -110
Apr 22 10:48:49 fir-io3-s2 kernel: LustreError: 77026:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 105s: evicting client at 10.9.101.28@o2ib4  ns: filter-fir-OST001b_UUID lock: ffff97f619e7af40/0xd80148300516127a lrc: 3/0,0 mode: PR/PR res: [0xd00000401:0x91e8cf:0x0].0x0 rrc: 3 type
Apr 22 10:48:49 fir-io3-s2 kernel: LustreError: 78496:0:(ldlm_lockd.c:682:ldlm_handle_ast_error()) Skipped 1 previous similar message
Apr 22 10:49:49 fir-io3-s2 kernel: LNet: Service thread pid 77058 was inactive for 200.48s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:
Apr 22 10:49:49 fir-io3-s2 kernel: Pid: 77058, comm: ll_ost00_000 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018
Apr 22 10:49:49 fir-io3-s2 kernel: Call Trace:
Apr 22 10:49:49 fir-io3-s2 kernel:  [&amp;lt;ffffffffc0f9b640&amp;gt;] ptlrpc_set_wait+0x500/0x8d0 [ptlrpc]
Apr 22 10:49:49 fir-io3-s2 kernel:  [&amp;lt;ffffffffc0f58fe5&amp;gt;] ldlm_run_ast_work+0xd5/0x3a0 [ptlrpc]
Apr 22 10:49:49 fir-io3-s2 kernel:  [&amp;lt;ffffffffc0f7a69b&amp;gt;] ldlm_glimpse_locks+0x3b/0x100 [ptlrpc]
Apr 22 10:49:49 fir-io3-s2 kernel:  [&amp;lt;ffffffffc162910b&amp;gt;] ofd_intent_policy+0x69b/0x920 [ofd]
Apr 22 10:49:49 fir-io3-s2 kernel:  [&amp;lt;ffffffffc0f59d26&amp;gt;] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc]
Apr 22 10:49:49 fir-io3-s2 kernel:  [&amp;lt;ffffffffc0f826d7&amp;gt;] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc]
Apr 22 10:49:49 fir-io3-s2 kernel:  [&amp;lt;ffffffffc10090b2&amp;gt;] tgt_enqueue+0x62/0x210 [ptlrpc]
Apr 22 10:49:49 fir-io3-s2 kernel:  [&amp;lt;ffffffffc101010a&amp;gt;] tgt_request_handle+0xaea/0x1580 [ptlrpc]
Apr 22 10:49:49 fir-io3-s2 kernel:  [&amp;lt;ffffffffc0fb46db&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
Apr 22 10:49:49 fir-io3-s2 kernel:  [&amp;lt;ffffffffc0fb800c&amp;gt;] ptlrpc_main+0xafc/0x1fc0 [ptlrpc]
Apr 22 10:49:49 fir-io3-s2 kernel:  [&amp;lt;ffffffff8b8c1c31&amp;gt;] kthread+0xd1/0xe0
Apr 22 10:49:49 fir-io3-s2 kernel:  [&amp;lt;ffffffff8bf74c24&amp;gt;] ret_from_fork_nospec_begin+0xe/0x21
Apr 22 10:49:49 fir-io3-s2 kernel:  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
Apr 22 10:49:49 fir-io3-s2 kernel: LustreError: dumping log to /tmp/lustre-log.1555955389.77058
Apr 22 10:49:49 fir-io3-s2 kernel: Pid: 78291, comm: ll_ost00_019 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018
Apr 22 10:49:49 fir-io3-s2 kernel: Call Trace:
Apr 22 10:49:49 fir-io3-s2 kernel:  [&amp;lt;ffffffffc0f9b640&amp;gt;] ptlrpc_set_wait+0x500/0x8d0 [ptlrpc]
Apr 22 10:49:49 fir-io3-s2 kernel:  [&amp;lt;ffffffffc0f58fe5&amp;gt;] ldlm_run_ast_work+0xd5/0x3a0 [ptlrpc]
Apr 22 10:49:49 fir-io3-s2 kernel:  [&amp;lt;ffffffffc0f7a69b&amp;gt;] ldlm_glimpse_locks+0x3b/0x100 [ptlrpc]
Apr 22 10:49:49 fir-io3-s2 kernel:  [&amp;lt;ffffffffc162910b&amp;gt;] ofd_intent_policy+0x69b/0x920 [ofd]
Apr 22 10:49:49 fir-io3-s2 kernel:  [&amp;lt;ffffffffc0f59d26&amp;gt;] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc]
Apr 22 10:49:49 fir-io3-s2 kernel:  [&amp;lt;ffffffffc0f826d7&amp;gt;] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc]
Apr 22 10:49:49 fir-io3-s2 kernel:  [&amp;lt;ffffffffc10090b2&amp;gt;] tgt_enqueue+0x62/0x210 [ptlrpc]
Apr 22 10:49:49 fir-io3-s2 kernel:  [&amp;lt;ffffffffc101010a&amp;gt;] tgt_request_handle+0xaea/0x1580 [ptlrpc]
Apr 22 10:49:49 fir-io3-s2 kernel:  [&amp;lt;ffffffffc0fb46db&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
Apr 22 10:49:49 fir-io3-s2 kernel:  [&amp;lt;ffffffffc0fb800c&amp;gt;] ptlrpc_main+0xafc/0x1fc0 [ptlrpc]
Apr 22 10:49:49 fir-io3-s2 kernel:  [&amp;lt;ffffffff8b8c1c31&amp;gt;] kthread+0xd1/0xe0
Apr 22 10:49:49 fir-io3-s2 kernel:  [&amp;lt;ffffffff8bf74c24&amp;gt;] ret_from_fork_nospec_begin+0xe/0x21
Apr 22 10:49:49 fir-io3-s2 kernel:  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
Apr 22 10:49:49 fir-io3-s2 kernel: Lustre: fir-OST0019: Connection restored to 708f469b-f504-7275-d236-e90aa8bcec67 (at 10.9.0.64@o2ib4)
Apr 22 10:49:49 fir-io3-s2 kernel: Lustre: Skipped 24 previous similar messages
Apr 22 10:49:55 fir-io3-s2 kernel: LNet: Service thread pid 78318 was inactive for 200.37s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:
Apr 22 10:49:55 fir-io3-s2 kernel: LNet: Skipped 1 previous similar message
Apr 22 10:49:55 fir-io3-s2 kernel: Pid: 78318, comm: ll_ost02_032 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018
Apr 22 10:49:55 fir-io3-s2 kernel: Call Trace:
Apr 22 10:49:55 fir-io3-s2 kernel:  [&amp;lt;ffffffffc0f9b640&amp;gt;] ptlrpc_set_wait+0x500/0x8d0 [ptlrpc]
Apr 22 10:49:55 fir-io3-s2 kernel:  [&amp;lt;ffffffffc0f58fe5&amp;gt;] ldlm_run_ast_work+0xd5/0x3a0 [ptlrpc]
Apr 22 10:49:55 fir-io3-s2 kernel:  [&amp;lt;ffffffffc0f7a69b&amp;gt;] ldlm_glimpse_locks+0x3b/0x100 [ptlrpc]
Apr 22 10:49:55 fir-io3-s2 kernel:  [&amp;lt;ffffffffc162910b&amp;gt;] ofd_intent_policy+0x69b/0x920 [ofd]
Apr 22 10:49:55 fir-io3-s2 kernel:  [&amp;lt;ffffffffc0f59d26&amp;gt;] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc]
Apr 22 10:49:55 fir-io3-s2 kernel:  [&amp;lt;ffffffffc0f826d7&amp;gt;] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc]
Apr 22 10:49:55 fir-io3-s2 kernel:  [&amp;lt;ffffffffc10090b2&amp;gt;] tgt_enqueue+0x62/0x210 [ptlrpc]
Apr 22 10:49:55 fir-io3-s2 kernel:  [&amp;lt;ffffffffc101010a&amp;gt;] tgt_request_handle+0xaea/0x1580 [ptlrpc]
Apr 22 10:49:55 fir-io3-s2 kernel:  [&amp;lt;ffffffffc0fb46db&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
Apr 22 10:49:55 fir-io3-s2 kernel:  [&amp;lt;ffffffffc0fb800c&amp;gt;] ptlrpc_main+0xafc/0x1fc0 [ptlrpc]
Apr 22 10:49:55 fir-io3-s2 kernel:  [&amp;lt;ffffffff8b8c1c31&amp;gt;] kthread+0xd1/0xe0
Apr 22 10:49:55 fir-io3-s2 kernel:  [&amp;lt;ffffffff8bf74c24&amp;gt;] ret_from_fork_nospec_begin+0xe/0x21
Apr 22 10:49:55 fir-io3-s2 kernel:  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
Apr 22 10:49:55 fir-io3-s2 kernel: LustreError: dumping log to /tmp/lustre-log.1555955395.78318
Apr 22 10:49:55 fir-io3-s2 kernel: Pid: 78868, comm: ll_ost03_105 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018
Apr 22 10:49:55 fir-io3-s2 kernel: Call Trace:
Apr 22 10:49:55 fir-io3-s2 kernel:  [&amp;lt;ffffffffc0f9b640&amp;gt;] ptlrpc_set_wait+0x500/0x8d0 [ptlrpc]
Apr 22 10:49:55 fir-io3-s2 kernel:  [&amp;lt;ffffffffc0f58fe5&amp;gt;] ldlm_run_ast_work+0xd5/0x3a0 [ptlrpc]
Apr 22 10:49:55 fir-io3-s2 kernel:  [&amp;lt;ffffffffc0f7a69b&amp;gt;] ldlm_glimpse_locks+0x3b/0x100 [ptlrpc]
Apr 22 10:49:55 fir-io3-s2 kernel:  [&amp;lt;ffffffffc162910b&amp;gt;] ofd_intent_policy+0x69b/0x920 [ofd]
Apr 22 10:49:55 fir-io3-s2 kernel:  [&amp;lt;ffffffffc0f59d26&amp;gt;] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc]
Apr 22 10:49:55 fir-io3-s2 kernel:  [&amp;lt;ffffffffc0f826d7&amp;gt;] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc]
Apr 22 10:49:55 fir-io3-s2 kernel:  [&amp;lt;ffffffffc10090b2&amp;gt;] tgt_enqueue+0x62/0x210 [ptlrpc]
Apr 22 10:49:55 fir-io3-s2 kernel:  [&amp;lt;ffffffffc101010a&amp;gt;] tgt_request_handle+0xaea/0x1580 [ptlrpc]
Apr 22 10:49:55 fir-io3-s2 kernel:  [&amp;lt;ffffffffc0fb46db&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
Apr 22 10:49:56 fir-io3-s2 kernel:  [&amp;lt;ffffffffc0fb800c&amp;gt;] ptlrpc_main+0xafc/0x1fc0 [ptlrpc]
Apr 22 10:49:56 fir-io3-s2 kernel:  [&amp;lt;ffffffff8b8c1c31&amp;gt;] kthread+0xd1/0xe0
Apr 22 10:49:56 fir-io3-s2 kernel:  [&amp;lt;ffffffff8bf74c24&amp;gt;] ret_from_fork_nospec_begin+0xe/0x21
Apr 22 10:49:56 fir-io3-s2 kernel:  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
Apr 22 10:50:12 fir-io3-s2 kernel: Lustre: fir-OST0021: haven&apos;t heard from client 83316b44-aad9-bbe2-1580-9c149ecd135e (at 10.9.101.28@o2ib4) in 227 seconds. I think it&apos;s dead, and I am evicting it. exp ffff97fd659af000, cur 1555955412 expire 1555955262 last 1555955185
Apr 22 10:50:12 fir-io3-s2 kernel: Lustre: Skipped 19 previous similar messages
Apr 22 10:50:12 fir-io3-s2 kernel: LNet: Service thread pid 77058 completed after 223.36s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources).
Apr 22 10:50:12 fir-io3-s2 kernel: LustreError: 78624:0:(client.c:1175:ptlrpc_import_delay_req()) @@@ IMP_CLOSED   req@ffff981b3d9bad00 x1631526704122896/t0(0) o104-&amp;gt;fir-OST0021@10.9.101.28@o2ib4:15/16 lens 296/224 e 0 to 0 dl 0 ref 1 fl Rpc:/0/ffffffff rc 0/-1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I have enabled +lnet debug on all servers to see how it goes.&lt;/p&gt;</comment>
                            <comment id="246175" author="pfarrell" created="Mon, 22 Apr 2019 18:39:59 +0000"  >&lt;p&gt;Stephane,&lt;/p&gt;

&lt;p&gt;If true, that would seem to put you in &lt;b&gt;exactly&lt;/b&gt; the same situation as NASA, who are also hitting that LBUG because of the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11644&quot; title=&quot;LNet: Service thread inactive for 300  causes client evictions &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11644&quot;&gt;LU-11644&lt;/a&gt; evictions.&lt;/p&gt;

&lt;p&gt;By the way, I meant to close this bug, was just waiting on a reply from Mahmoud - the LBUG is tracked under &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11939&quot; title=&quot;ASSERTION( tgd-&amp;gt;tgd_tot_granted &amp;gt;= ted-&amp;gt;ted_grant ) on OSS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11939&quot;&gt;&lt;del&gt;LU-11939&lt;/del&gt;&lt;/a&gt; if you want to follow that.&lt;/p&gt;</comment>
                            <comment id="246188" author="sthiell" created="Tue, 23 Apr 2019 02:42:31 +0000"  >&lt;p&gt;Patrick, I think this may actually be due to another thing, likely an OSS deadlock. Please see &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12215&quot; title=&quot;OSS network errors 2.12&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12215&quot;&gt;&lt;del&gt;LU-12215&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="247763" author="pjones" created="Mon, 27 May 2019 14:39:53 +0000"  >&lt;p&gt;Note that some comments that may assist in the analysis have been added to the resolved unassigned ticket &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12322&quot; title=&quot;negative grant and tgt_grant.c:561:tgt_grant_incoming() LBUG&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12322&quot;&gt;&lt;del&gt;LU-12322&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="247930" author="bougetq" created="Wed, 29 May 2019 11:36:10 +0000"  >&lt;p&gt;I am fairly certain that the code that generates the negative value is in &lt;tt&gt;tgt_grant_shrink()&lt;/tt&gt;.&lt;/p&gt;

&lt;p&gt;@Stephane, as a temporary fix, you can try deactivating grant shrinking on clients with:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lctl set_param osc.*.grant_shrink=0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
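
&lt;p&gt;To verify that the setting took effect on a client (assuming the tunable reads back through the usual lctl interface):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lctl get_param osc.*.grant_shrink
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;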

&lt;p&gt;I do not know how much of an impact this has on OSSes, but considering the feature was disabled by default up until July 2018 (cf. &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8708&quot; title=&quot;Grant shrinking disabled all the time&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8708&quot;&gt;&lt;del&gt;LU-8708&lt;/del&gt;&lt;/a&gt;), I assume it should be bearable.&lt;/p&gt;</comment>
                            <comment id="247946" author="tappro" created="Wed, 29 May 2019 13:25:42 +0000"  >&lt;p&gt;Quentin, I agree with your analysis and also think that &lt;tt&gt;tgt_grant_shrink&lt;/tt&gt; should not decrease &lt;tt&gt;ted_grant&lt;/tt&gt; blindly but do sanity checks there. I will prepare patch today&lt;/p&gt;</comment>
                            <comment id="247957" author="bougetq" created="Wed, 29 May 2019 14:58:46 +0000"  >&lt;p&gt;Ok, thanks Mikhail!&lt;/p&gt;

&lt;p&gt;From what I understand, clients and servers maintain separate views of grants, and this LU relates to (client, server) pairs whose views drift apart from each other. At the next shrink request, that results in the client asking to give back more space than the server remembers granting.&lt;/p&gt;
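
&lt;p&gt;As a purely illustrative sketch (not the actual Lustre code; the numbers are only picked so the result matches the &quot;grant -12582912&quot; value in the fir-OST0011 log above), the failing arithmetic boils down to:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-c&quot;&gt;
/* Illustration only: a shrink request larger than the grant the server
 * has on record drives the per-export grant counter negative, which is
 * the state the assertion in tgt_grant_incoming() trips over. */
#include &amp;lt;stdio.h&amp;gt;

int main(void)
{
        long ted_grant     = 4  * 1024 * 1024;  /* server view: 4 MiB still granted */
        long client_shrink = 16 * 1024 * 1024;  /* client offers to release 16 MiB  */

        ted_grant -= client_shrink;
        printf(&quot;grant %ld\n&quot;, ted_grant);   /* grant -12582912, as in the log */
        return 0;
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;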

&lt;p&gt;&amp;#8212;&lt;/p&gt;

&lt;p&gt;Does anyone know if there is a valid reason why, in &lt;tt&gt;tgt_grant_incoming()&lt;/tt&gt;:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-c&quot;&gt;
&lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; &lt;span class=&quot;code-keyword&quot;&gt;&lt;span class=&quot;code-object&quot;&gt;void&lt;/span&gt;&lt;/span&gt; tgt_grant_incoming(...)
{
        ...
        CDEBUG(D_CACHE, &lt;span class=&quot;code-quote-red&quot;&gt;&quot;%s: cli %s/%p reports grant %llu dropped %u, local %lu\n&quot;&lt;/span&gt;,
               obd-&amp;gt;obd_name, exp-&amp;gt;exp_client_uuid.uuid, exp, oa-&amp;gt;o_grant,
               oa-&amp;gt;o_dropped, ted-&amp;gt;ted_grant);
        ...
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The value printed after &quot;reports grant&quot; would not be the same as the value printed after &quot;local&quot;?&lt;br/&gt;
Can this be used to track the desynchronization?&lt;/p&gt;</comment>
                            <comment id="248041" author="gerrit" created="Thu, 30 May 2019 09:34:30 +0000"  >&lt;p&gt;Mike Pershin (mpershin@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/34996&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/34996&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12120&quot; title=&quot;LustreError: 15069:0:(tgt_grant.c:561:tgt_grant_incoming()) LBUG &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12120&quot;&gt;&lt;del&gt;LU-12120&lt;/del&gt;&lt;/a&gt; grants: prevent negative ted_grant value&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 296ac6b910880dc65c9cee8be2552bba759cea57&lt;/p&gt;</comment>
                            <comment id="248327" author="gerrit" created="Tue, 4 Jun 2019 05:26:55 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/34996/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/34996/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12120&quot; title=&quot;LustreError: 15069:0:(tgt_grant.c:561:tgt_grant_incoming()) LBUG &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12120&quot;&gt;&lt;del&gt;LU-12120&lt;/del&gt;&lt;/a&gt; grants: prevent negative ted_grant value&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 7e08317ef5cbed5cd587017cbe343eb4cc52822c&lt;/p&gt;</comment>
                            <comment id="248357" author="pjones" created="Tue, 4 Jun 2019 12:01:04 +0000"  >&lt;p&gt;Landed for 2.13&lt;/p&gt;</comment>
                            <comment id="248564" author="gerrit" created="Thu, 6 Jun 2019 16:56:47 +0000"  >&lt;p&gt;Minh Diep (mdiep@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/35084&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/35084&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12120&quot; title=&quot;LustreError: 15069:0:(tgt_grant.c:561:tgt_grant_incoming()) LBUG &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12120&quot;&gt;&lt;del&gt;LU-12120&lt;/del&gt;&lt;/a&gt; grants: prevent negative ted_grant value&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 82ad1cf4fd9298f95a8f233ee18feef363b10ebb&lt;/p&gt;</comment>
                            <comment id="249535" author="gerrit" created="Thu, 20 Jun 2019 03:56:12 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/35084/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/35084/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12120&quot; title=&quot;LustreError: 15069:0:(tgt_grant.c:561:tgt_grant_incoming()) LBUG &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12120&quot;&gt;&lt;del&gt;LU-12120&lt;/del&gt;&lt;/a&gt; grants: prevent negative ted_grant value&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 83d43ecc947fa8f43bd5c7f792f3788cadd0d3ef&lt;/p&gt;</comment>
                            <comment id="254300" author="mhanafi" created="Fri, 6 Sep 2019 21:47:38 +0000"  >&lt;p&gt;We need a 2.10 backport. Our 2.10.7 server this lbug&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
[7955125.139926] LustreError: 23678:0:(tgt_grant.c:1068:tgt_grant_discard()) ASSERTION( tgd-&amp;gt;tgd_tot_granted &amp;gt;= ted-&amp;gt;ted_grant ) failed: nbp7-OST0017: tot_granted 7448219072 cli 4817e87d-d95a-1480-0cbd-835e2c3966b1/ffff880f68fdc400 ted_grant -49152
[7955125.164200] LustreError: 23678:0:(tgt_grant.c:1068:tgt_grant_discard()) LBUG
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="254301" author="sthiell" created="Fri, 6 Sep 2019 21:59:24 +0000"  >&lt;p&gt;We used the patch from&#160;&lt;a href=&quot;https://review.whamcloud.com/35084/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/35084/&lt;/a&gt;&#160;which applied against 2.10.8 with no change (or it was trivial).&lt;/p&gt;</comment>
                            <comment id="254302" author="jaylan" created="Fri, 6 Sep 2019 22:00:41 +0000"  >&lt;p&gt;The patch was applied cleanly to nas-2.10.8. No need of a backport.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="54805">LU-11939</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="55693">LU-12322</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="56908">LU-12759</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                                        </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00dxr:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>