<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:51:33 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-12322] negative grant and tgt_grant.c:561:tgt_grant_incoming() LBUG</title>
                <link>https://jira.whamcloud.com/browse/LU-12322</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;New LBUG tonight with 2.10.7, likely a duplicate of&#160;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12120&quot; title=&quot;LustreError: 15069:0:(tgt_grant.c:561:tgt_grant_incoming()) LBUG &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12120&quot;&gt;&lt;del&gt;LU-12120&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[219417.266382] LustreError: 281433:0:(tgt_grant.c:559:tgt_grant_incoming()) oak-OST0053: cli 0eb5afeb-9924-327f-d61d-428dac6cb441/ffff883c7ef04800 dirty 0 pend 0 grant -29360128
[219417.283969] LustreError: 281433:0:(tgt_grant.c:561:tgt_grant_incoming()) LBUG
[219417.292035] Pid: 281433, comm: ll_ost00_045 3.10.0-693.2.2.el7_lustre.pl3.x86_64 #1 SMP Thu Mar 15 13:06:45 PDT 2018
[219417.303881] Call Trace:
[219417.306716] [&amp;lt;ffffffff8103a212&amp;gt;] save_stack_trace_tsk+0x22/0x40
[219417.313560] [&amp;lt;ffffffffc08087cc&amp;gt;] libcfs_call_trace+0x8c/0xc0 [libcfs]
[219417.320982] [&amp;lt;ffffffffc080887c&amp;gt;] lbug_with_loc+0x4c/0xa0 [libcfs]
[219417.328013] [&amp;lt;ffffffffc0bf81b0&amp;gt;] tgt_grant_prepare_read+0x0/0x3b0 [ptlrpc]
[219417.335979] [&amp;lt;ffffffffc0bf82bb&amp;gt;] tgt_grant_prepare_read+0x10b/0x3b0 [ptlrpc]
[219417.344121] [&amp;lt;ffffffffc119df6d&amp;gt;] ofd_set_info_hdl+0x23d/0x4a0 [ofd]
[219417.351343] [&amp;lt;ffffffffc0bda115&amp;gt;] tgt_request_handle+0x925/0x1370 [ptlrpc]
[219417.359202] [&amp;lt;ffffffffc0b82dd6&amp;gt;] ptlrpc_server_handle_request+0x236/0xa90 [ptlrpc]
[219417.367930] [&amp;lt;ffffffffc0b86512&amp;gt;] ptlrpc_main+0xa92/0x1e40 [ptlrpc]
[219417.375101] [&amp;lt;ffffffff810b098f&amp;gt;] kthread+0xcf/0xe0
[219417.380671] [&amp;lt;ffffffff816b4f58&amp;gt;] ret_from_fork+0x58/0x90
[219417.386823] [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
[219417.392529] Kernel panic - not syncing: LBUG
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Prior to this, there were a lot of evictions and network errors:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[218920.410034] LustreError: 257942:0:(events.c:449:server_bulk_callback()) event type 5, status -125, desc ffff881d0b310a00
[218923.029069] LustreError: 257941:0:(events.c:449:server_bulk_callback()) event type 5, status -125, desc ffff882566547000
[218928.063611] LustreError: 257941:0:(events.c:449:server_bulk_callback()) event type 5, status -125, desc ffff880050289600
[218957.709743] Lustre: oak-OST004d: haven&apos;t heard from client ed27e8aa-82e0-d7bd-37eb-95d83ba476b8 (at 10.8.7.32@o2ib6) in 227 seconds. I think it&apos;s dead, and I am evicting it. exp ffff883c83a64000, cur 1558412848 expire 1558412698 last 1558412621 &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;Attaching vmcore-dmesg, output of basic crash commands and foreach bt.&lt;/p&gt;

&lt;p&gt;Please note that (just in case) I didn&apos;t have the patch for&#160;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12018&quot; title=&quot;deadlock on OSS: quota reintegration vs memory release&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12018&quot;&gt;&lt;del&gt;LU-12018&lt;/del&gt;&lt;/a&gt; on this OSS. It was a stock 2.10.7. Now the OSS has been updated.&lt;/p&gt;</description>
                <environment>CentOS 7.6</environment>
        <key id="55693">LU-12322</key>
            <summary>negative grant and tgt_grant.c:561:tgt_grant_incoming() LBUG</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="3">Duplicate</resolution>
                                        <assignee username="wc-triage">WC Triage</assignee>
                                    <reporter username="sthiell">Stephane Thiell</reporter>
                        <labels>
                    </labels>
                <created>Tue, 21 May 2019 05:40:14 +0000</created>
                <updated>Mon, 27 May 2019 14:38:45 +0000</updated>
                            <resolved>Tue, 21 May 2019 17:18:42 +0000</resolved>
                                    <version>Lustre 2.10.7</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>3</watches>
                                                                            <comments>
                            <comment id="247453" author="pjones" created="Tue, 21 May 2019 17:18:42 +0000"  >&lt;p&gt;Hi Stephane&lt;/p&gt;

&lt;p&gt;We reviewed the ticket and agree that this looks to be a duplicate of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12120&quot; title=&quot;LustreError: 15069:0:(tgt_grant.c:561:tgt_grant_incoming()) LBUG &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12120&quot;&gt;&lt;del&gt;LU-12120&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="247679" author="sthiell" created="Sat, 25 May 2019 01:58:47 +0000"  >&lt;p&gt;I got the same kind of crash with 2.12 today. Exact same scenario as with 2.10: a bunch of bulk IO errors, &lt;tt&gt;event type 5, status -125&lt;/tt&gt;, negative grant, then crash. More than 5 OSSes crashed today on Fir. We had to remount the OSTs with &lt;tt&gt;-o abort_recov&lt;/tt&gt;. Then everything went back to normal.&lt;/p&gt;</comment>
                            <comment id="247728" author="sthiell" created="Sat, 25 May 2019 16:32:58 +0000"  >&lt;p&gt;Peter,&lt;br/&gt;
 Is it possible to increase the severity of this issue or &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12120&quot; title=&quot;LustreError: 15069:0:(tgt_grant.c:561:tgt_grant_incoming()) LBUG &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12120&quot;&gt;&lt;del&gt;LU-12120&lt;/del&gt;&lt;/a&gt;? It hit us again this morning with 2.12. No massive evictions or network issues were seen prior to the LBUG:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[Sat May 25 07:46:41 2019]fir-io2-s1 login: [ 3622.566306] Lustre: fir-OST000e: haven&apos;t heard from client df23de56-dd2e-8a63-a8d5-338686a662ec (at 10.8.17.18@o2ib6) in 227 seconds. I think it&apos;s dead, and I am evicting it. exp ffff9530d128d000, cur 1558795971 expire 1558795821 last 1558795744^M
[Sat May 25 08:04:13 2019][ 4304.680584] Lustre: fir-OST000c: Connection restored to d7e6489c-4ac3-8ad5-951f-ff4462d87a95 (at 10.8.15.2@o2ib6)^M
[Sat May 25 08:04:13 2019][ 4304.690868] Lustre: Skipped 17 previous similar messages^M
[Sat May 25 08:05:29 2019][ 4380.571168] Lustre: fir-OST0014: haven&apos;t heard from client 25c68a06-73fb-c817-93ec-98f34b969f96 (at 10.8.23.14@o2ib6) in 227 seconds. I think it&apos;s dead, and I am evicting it. exp ffff952a0dde9c00, cur 1558796729 expire 1558796579 last 1558796502^M
[Sat May 25 08:05:29 2019][ 4380.592942] Lustre: Skipped 53 previous similar messages^M
[Sat May 25 08:20:16 2019][ 5267.591216] Lustre: fir-OST000c: Connection restored to 25c68a06-73fb-c817-93ec-98f34b969f96 (at 10.8.23.14@o2ib6)^M
[Sat May 25 08:20:16 2019][ 5267.601571] Lustre: Skipped 5 previous similar messages^M
[Sat May 25 08:36:11 2019][ 6223.053938] Lustre: fir-OST000c: Connection restored to f0ba9527-4000-78d5-07ea-44ae00898a99 (at 10.8.15.5@o2ib6)^M
[Sat May 25 08:36:11 2019][ 6223.064213] Lustre: Skipped 5 previous similar messages^M
[Sat May 25 08:36:36 2019][ 6247.582854] Lustre: fir-OST000c: Connection restored to e267eed0-45c3-44be-99a6-206acecf07ea (at 10.8.15.6@o2ib6)^M
[Sat May 25 08:36:36 2019][ 6247.593114] Lustre: Skipped 5 previous similar messages^M
[Sat May 25 08:40:43 2019][ 6495.331104] Lustre: fir-OST000c: Connection restored to 85720600-89f3-ddb2-047a-1b853ce6c3da (at 10.8.21.24@o2ib6)^M
[Sat May 25 08:40:43 2019][ 6495.341461] Lustre: Skipped 5 previous similar messages^M
[Sat May 25 08:41:10 2019][ 6521.769152] Lustre: fir-OST000c: Connection restored to b8bfc82f-1f8e-8596-4178-06354c0fe0f1 (at 10.8.17.15@o2ib6)^M
[Sat May 25 08:41:10 2019][ 6521.779522] Lustre: Skipped 5 previous similar messages^M
[Sat May 25 08:41:19 2019][ 6531.255514] Lustre: fir-OST000c: Connection restored to e6c11c37-f182-6c12-4d0c-27449ff35ca8 (at 10.8.17.16@o2ib6)^M
[Sat May 25 08:41:19 2019][ 6531.265922] Lustre: Skipped 5 previous similar messages^M
[Sat May 25 08:41:37 2019][ 6548.836936] Lustre: fir-OST000c: Connection restored to 834edbb7-1073-058c-c5af-7281bda3b502 (at 10.8.17.17@o2ib6)^M
[Sat May 25 08:41:37 2019][ 6548.847289] Lustre: Skipped 9 previous similar messages^M
[Sat May 25 08:41:56 2019][ 6568.361589] Lustre: fir-OST000c: Connection restored to 29c83ccc-4cd0-0a75-8213-b42166238c97 (at 10.8.17.19@o2ib6)^M
[Sat May 25 08:41:56 2019][ 6568.371960] Lustre: Skipped 17 previous similar messages^M
[Sat May 25 08:42:49 2019][ 6620.986599] Lustre: fir-OST000c: Connection restored to f70dbc8f-f692-ceed-bbff-b6639edcae72 (at 10.9.113.8@o2ib4)^M
[Sat May 25 08:42:49 2019][ 6620.996948] Lustre: Skipped 5 previous similar messages^M
[Sat May 25 08:44:04 2019][ 6696.251816] Lustre: fir-OST000c: Connection restored to dc489b81-8258-354a-1cb8-0896baf24eea (at 10.8.12.24@o2ib6)^M
[Sat May 25 08:44:04 2019][ 6696.262177] Lustre: Skipped 17 previous similar messages^M
[Sat May 25 08:47:59 2019][ 6931.536750] Lustre: fir-OST000c: Connection restored to dca21337-fdba-5128-347e-592b37646902 (at 10.9.108.60@o2ib4)^M
[Sat May 25 08:47:59 2019][ 6931.547220] Lustre: Skipped 5 previous similar messages^M
[Sat May 25 08:55:44 2019][ 7396.501593] LustreError: 79137:0:(tgt_grant.c:563:tgt_grant_incoming()) fir-OST0012: cli ebef6758-802b-3d88-0fb7-39f9e3a97c72/ffff952bb4d08000 dirty 0 pend 0 grant -29360128^M
[Sat May 25 08:55:44 2019][ 7396.517073] LustreError: 79137:0:(tgt_grant.c:565:tgt_grant_incoming()) LBUG^M
[Sat May 25 08:55:44 2019][ 7396.524130] Pid: 79137, comm: ll_ost_io02_069 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018^M
[Sat May 25 08:55:44 2019][ 7396.534427] Call Trace:^M
[Sat May 25 08:55:44 2019][ 7396.536895]  [&amp;lt;ffffffffc0c217cc&amp;gt;] libcfs_call_trace+0x8c/0xc0 [libcfs]^M
[Sat May 25 08:55:44 2019][ 7396.543496]  [&amp;lt;ffffffffc0c2187c&amp;gt;] lbug_with_loc+0x4c/0xa0 [libcfs]^M
[Sat May 25 08:55:44 2019][ 7396.549728]  [&amp;lt;ffffffffc1475d20&amp;gt;] tgt_grant_prepare_read+0x0/0x3b0 [ptlrpc]^M
[Sat May 25 08:55:44 2019][ 7396.556804]  [&amp;lt;ffffffffc1475e2b&amp;gt;] tgt_grant_prepare_read+0x10b/0x3b0 [ptlrpc]^M
[Sat May 25 08:55:44 2019][ 7396.564043]  [&amp;lt;ffffffffc1861c00&amp;gt;] ofd_preprw+0x450/0x1160 [ofd]^M
[Sat May 25 08:55:44 2019][ 7396.569998]  [&amp;lt;ffffffffc1459bab&amp;gt;] tgt_brw_read+0x9db/0x1e50 [ptlrpc]^M
[Sat May 25 08:55:44 2019][ 7396.576479]  [&amp;lt;ffffffffc145873a&amp;gt;] tgt_request_handle+0xaea/0x1580 [ptlrpc]^M
[Sat May 25 08:55:45 2019][ 7396.583429]  [&amp;lt;ffffffffc13fcd0b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]^M
[Sat May 25 08:55:45 2019][ 7396.591167]  [&amp;lt;ffffffffc140063c&amp;gt;] ptlrpc_main+0xafc/0x1fc0 [ptlrpc]^M
[Sat May 25 08:55:45 2019][ 7396.597506]  [&amp;lt;ffffffffabcc1c31&amp;gt;] kthread+0xd1/0xe0^M
[Sat May 25 08:55:45 2019][ 7396.602421]  [&amp;lt;ffffffffac374c24&amp;gt;] ret_from_fork_nospec_begin+0xe/0x21^M
[Sat May 25 08:55:45 2019][ 7396.608915]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff^M
[Sat May 25 08:55:45 2019][ 7396.613949] Kernel panic - not syncing: LBUG^M
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;A patch to avoid the OSS LBUG would be a good first step I guess.&lt;/p&gt;</comment>
                            <comment id="247729" author="pjones" created="Sat, 25 May 2019 16:41:46 +0000"  >&lt;p&gt;Sure. We&apos;ll get someone who is not out for Memorial Day long weekend to look at &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12120&quot; title=&quot;LustreError: 15069:0:(tgt_grant.c:561:tgt_grant_incoming()) LBUG &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12120&quot;&gt;&lt;del&gt;LU-12120&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="247744" author="bougetq" created="Mon, 27 May 2019 00:40:56 +0000"  >&lt;p&gt;Hello,&lt;/p&gt;

&lt;p&gt;I&apos;ve looked at the code a little, and here is what I found (I hope it helps):&lt;/p&gt;

&lt;p&gt;I suppose this is now obvious to anyone who read Stephane&apos;s description: the LBUG is triggered by a negative value of &lt;tt&gt;ted_grant&lt;/tt&gt;.&lt;/p&gt;

&lt;p&gt;I first thought it happened in &lt;tt&gt;tgt_grant_incoming()&lt;/tt&gt; itself, because of this line:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
        ted-&amp;gt;ted_grant -= dropped;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;But this assignment is actually protected:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
        dropped = oa-&amp;gt;oa_dropped;
        ...
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (ted-&amp;gt;ted_grant &amp;lt; dropped) {
                CDEBUG(D_CACHE, ...);
                dropped = 0;
        }
        ...
        ted-&amp;gt;ted_grant -= dropped;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;Guided by &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6877&quot; title=&quot;Potential integer overflow in osc_shrink_grant and osc_shrink_grant_to_target&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6877&quot;&gt;LU-6877&lt;/a&gt;, I looked for a potential error during an arithmetic conversion:&lt;/p&gt;
&lt;div class=&apos;table-wrap&apos;&gt;
&lt;table class=&apos;confluenceTable&apos;&gt;&lt;tbody&gt;
&lt;tr&gt;
&lt;th class=&apos;confluenceTh&apos;&gt;variable&lt;/th&gt;
&lt;td class=&apos;confluenceTd&apos;&gt;data type&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td class=&apos;confluenceTd&apos;&gt;&lt;tt&gt;oa-&amp;gt;oa_dropped&lt;/tt&gt;&lt;/td&gt;
&lt;td class=&apos;confluenceTd&apos;&gt;&lt;tt&gt;__u32&lt;/tt&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td class=&apos;confluenceTd&apos;&gt;&lt;tt&gt;dropped&lt;/tt&gt;&lt;/td&gt;
&lt;td class=&apos;confluenceTd&apos;&gt;&lt;tt&gt;long long&lt;/tt&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td class=&apos;confluenceTd&apos;&gt;&lt;tt&gt;ted-&amp;gt;ted_grant&lt;/tt&gt;&lt;/td&gt;
&lt;td class=&apos;confluenceTd&apos;&gt;&lt;tt&gt;long&lt;/tt&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;&lt;/table&gt;
&lt;/div&gt;


&lt;p&gt;I could not find any (though that looked promising at first with &lt;tt&gt;long&lt;/tt&gt; being at least an &lt;tt&gt;int32_t&lt;/tt&gt; and &lt;tt&gt;long long&lt;/tt&gt; an &lt;tt&gt;int64_t&lt;/tt&gt;).&lt;/p&gt;

&lt;p&gt;From there I concluded that &lt;tt&gt;ted-&amp;gt;ted_grant&lt;/tt&gt; must be set to a negative value elsewhere/earlier. Repeating a similar analysis, I could only find one place where&#160;this can happen &lt;span class=&quot;error&quot;&gt;&amp;#91;*&amp;#93;&lt;/span&gt;:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
&lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; void tgt_grant_shrink(struct obd_export *exp, struct obdo *oa,
			     u64 left_space)
{
        struct tg_export_data *ted = &amp;amp;exp-&amp;gt;exp_target_data;
        struct obd_device     *obd = exp-&amp;gt;exp_obd;
        struct tg_grants_data *tgd = &amp;amp;obd-&amp;gt;u.obt.obt_lut-&amp;gt;lut_tgd;
        &lt;span class=&quot;code-object&quot;&gt;long&lt;/span&gt; grant_shrink;

        assert_spin_locked(&amp;amp;tgd-&amp;gt;tgd_grant_lock);
        LASSERT(exp);
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (left_space &amp;gt;= tgd-&amp;gt;tgd_tot_granted_clients *
                          TGT_GRANT_SHRINK_LIMIT(exp))
                &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt;;
        grant_shrink = oa-&amp;gt;o_grant;
        ted-&amp;gt;ted_grant -= grant_shrink;
        tgd-&amp;gt;tgd_tot_granted -= grant_shrink;
        CDEBUG(D_CACHE, &lt;span class=&quot;code-quote&quot;&gt;&quot;%s: cli %s/%p shrink %ld ted_grant %ld total %llu\n&quot;&lt;/span&gt;,
               obd-&amp;gt;obd_name, exp-&amp;gt;exp_client_uuid.uuid, exp, grant_shrink,
               ted-&amp;gt;ted_grant, tgd-&amp;gt;tgd_tot_granted);

        &lt;span class=&quot;code-comment&quot;&gt;/* client has just released some grant, don&apos;t grant any space back */&lt;/span&gt;
        oa-&amp;gt;o_grant = 0;
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;All the other places where &lt;tt&gt;ted_grant&lt;/tt&gt; is set seem to be protected with appropriate checks.&lt;/p&gt;

&lt;p&gt;Now &lt;tt&gt;tgt_grant_shrink()&lt;/tt&gt; is only called in two places: &lt;tt&gt;tgt_grant_prepare_read()&lt;/tt&gt; and &lt;tt&gt;tgt_grant_prepare_write()&lt;/tt&gt;. The odd part is that in &lt;tt&gt;tgt_grant_prepare_read()&lt;/tt&gt; it is called &lt;b&gt;after&lt;/b&gt; &lt;tt&gt;tgt_grant_incoming()&lt;/tt&gt; (the function that raises the LBUG).&lt;/p&gt;

&lt;p&gt;My understanding is that there is a workqueue on the client side that periodically updates a client&apos;s grant size (cf. &lt;tt&gt;ost_grant_work_handler()&lt;/tt&gt;). The update RPC is sent asynchronously and a callback is set to process the server&apos;s response:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-c&quot;&gt;
&lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; &lt;span class=&quot;code-keyword&quot;&gt;&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;&lt;/span&gt; osc_shrink_grant_interpret(&lt;span class=&quot;code-keyword&quot;&gt;const&lt;/span&gt; &lt;span class=&quot;code-keyword&quot;&gt;struct&lt;/span&gt; lu_env *env,
				      &lt;span class=&quot;code-keyword&quot;&gt;struct&lt;/span&gt; ptlrpc_request *req,
				      &lt;span class=&quot;code-keyword&quot;&gt;&lt;span class=&quot;code-object&quot;&gt;void&lt;/span&gt;&lt;/span&gt; *args, &lt;span class=&quot;code-keyword&quot;&gt;&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;&lt;/span&gt; rc)
{
	&lt;span class=&quot;code-keyword&quot;&gt;struct&lt;/span&gt; osc_grant_args *aa = args;
	&lt;span class=&quot;code-keyword&quot;&gt;struct&lt;/span&gt; client_obd *cli = &amp;amp;req-&amp;gt;rq_import-&amp;gt;imp_obd-&amp;gt;u.cli;
	&lt;span class=&quot;code-keyword&quot;&gt;struct&lt;/span&gt; ost_body *body;

	if (rc != 0) {
		__osc_update_grant(cli, aa-&amp;gt;aa_oa-&amp;gt;o_grant);
		GOTO(out, rc);
	}

	body = req_capsule_server_get(&amp;amp;req-&amp;gt;rq_pill, &amp;amp;RMF_OST_BODY);
	LASSERT(body);
	osc_update_grant(cli, body);
out:
	OBD_SLAB_FREE_PTR(aa-&amp;gt;aa_oa, osc_obdo_kmem);

	&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; rc;
}

...

&lt;span class=&quot;code-keyword&quot;&gt;&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;&lt;/span&gt; osc_set_info_async(...)
{
        ...
        if (KEY_IS(KEY_GRANT_SHRINK)) {
                ...
                req-&amp;gt;rq_interpret_reply = osc_shrink_grant_interpret;
        }
        ...
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;So at some point the client probably receives a bogus value from the server, and the next RPC triggers the LBUG.&lt;/p&gt;

&lt;p&gt;&lt;b&gt;Where to go from there?&lt;/b&gt;&lt;/p&gt;

&lt;p&gt;I could not figure out where &lt;tt&gt;exp-&amp;gt;exp_target_data&lt;/tt&gt; comes from in &lt;tt&gt;tgt_grant_shrink()&lt;/tt&gt;. If someone can, we should be able to find the root cause of all this.&lt;/p&gt;

&lt;p&gt;&amp;#8212;&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;*&amp;#93;&lt;/span&gt; I only looked at places in the code where &lt;tt&gt;ted_grant&lt;/tt&gt; is explicitly set (&lt;tt&gt;grep &amp;#45;R &apos;ted-&amp;gt;ted_grant\s+&amp;#91;-+&amp;#93;?=&apos; lustre&lt;/tt&gt;). However, it is possible that somewhere, the whole &lt;tt&gt;ted&lt;/tt&gt; structure is set at once and I missed it (for example during RPC packing / unpacking).&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                            <outwardlinks description="duplicates">
                                        <issuelink>
            <issuekey id="55257">LU-12120</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="32601" name="oak-io2-s2-crash-cmd-20190520.txt" size="8537" author="sthiell" created="Tue, 21 May 2019 05:39:54 +0000"/>
                            <attachment id="32602" name="oak-io2-s2-foreach-bt-20190520.txt" size="1656040" author="sthiell" created="Tue, 21 May 2019 05:39:58 +0000"/>
                            <attachment id="32603" name="oak-io2-s2-vmcore-dmesg-20190520.txt" size="875338" author="sthiell" created="Tue, 21 May 2019 05:39:56 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00gm7:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>