<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:48:15 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-11939] ASSERTION( tgd-&gt;tgd_tot_granted &gt;= ted-&gt;ted_grant ) on OSS</title>
                <link>https://jira.whamcloud.com/browse/LU-11939</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;We just hit the following LBUG with Lustre 2.12 on an OSS (Fir). All clients are running Lustre 2.12 also.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[1708550.581820] LustreError: 123124:0:(tgt_grant.c:1079:tgt_grant_discard()) ASSERTION( tgd-&amp;gt;tgd_tot_granted &amp;gt;= ted-&amp;gt;ted_grant ) failed: fir-OST001b: tot_granted 50041695803 cli d5e4b60f-fe33-b991-7d48-5b8db7e07ab0/ffff926b10975c00 ted_grant -49152
[1708550.603611] LustreError: 123124:0:(tgt_grant.c:1079:tgt_grant_discard()) LBUG
[1708550.610923] Pid: 123124, comm: ll_ost00_019 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018
[1708550.621180] Call Trace:
[1708550.623814]  [&amp;lt;ffffffffc0aa37cc&amp;gt;] libcfs_call_trace+0x8c/0xc0 [libcfs]
[1708550.630548]  [&amp;lt;ffffffffc0aa387c&amp;gt;] lbug_with_loc+0x4c/0xa0 [libcfs]
[1708550.636935]  [&amp;lt;ffffffffc0f220bc&amp;gt;] tgt_grant_discard+0x1dc/0x1e0 [ptlrpc]
[1708550.643892]  [&amp;lt;ffffffffc14c81d4&amp;gt;] ofd_obd_disconnect+0x74/0x220 [ofd]
[1708550.650541]  [&amp;lt;ffffffffc0e60157&amp;gt;] target_handle_disconnect+0xd7/0x450 [ptlrpc]
[1708550.658005]  [&amp;lt;ffffffffc0efeb77&amp;gt;] tgt_disconnect+0x37/0x140 [ptlrpc]
[1708550.664609]  [&amp;lt;ffffffffc0f0635a&amp;gt;] tgt_request_handle+0xaea/0x1580 [ptlrpc]
[1708550.671734]  [&amp;lt;ffffffffc0eaa92b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
[1708550.679628]  [&amp;lt;ffffffffc0eae25c&amp;gt;] ptlrpc_main+0xafc/0x1fc0 [ptlrpc]
[1708550.686136]  [&amp;lt;ffffffff8dcc1c31&amp;gt;] kthread+0xd1/0xe0
[1708550.691224]  [&amp;lt;ffffffff8e374c24&amp;gt;] ret_from_fork_nospec_begin+0xe/0x21
[1708550.697873]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
[1708550.703065] Kernel panic - not syncing: LBUG
[1708550.707509] CPU: 20 PID: 123124 Comm: ll_ost00_019 Kdump: loaded Tainted: G           OE  ------------   3.10.0-957.1.3.el7_lustre.x86_64 #1
[1708550.720273] Hardware name: Dell Inc. PowerEdge R6415/065PKD, BIOS 1.6.7 10/29/2018
[1708550.728015] Call Trace:
[1708550.730645]  [&amp;lt;ffffffff8e361e41&amp;gt;] dump_stack+0x19/0x1b
[1708550.735962]  [&amp;lt;ffffffff8e35b550&amp;gt;] panic+0xe8/0x21f
[1708550.740937]  [&amp;lt;ffffffffc0aa38cb&amp;gt;] lbug_with_loc+0x9b/0xa0 [libcfs]
[1708550.747346]  [&amp;lt;ffffffffc0f220bc&amp;gt;] tgt_grant_discard+0x1dc/0x1e0 [ptlrpc]
[1708550.754230]  [&amp;lt;ffffffffc14c81d4&amp;gt;] ofd_obd_disconnect+0x74/0x220 [ofd]
[1708550.760880]  [&amp;lt;ffffffffc0e9ed81&amp;gt;] ? lustre_pack_reply+0x11/0x20 [ptlrpc]
[1708550.767783]  [&amp;lt;ffffffffc0ec3933&amp;gt;] ? req_capsule_server_pack+0x43/0xf0 [ptlrpc]
[1708550.775207]  [&amp;lt;ffffffffc0e60157&amp;gt;] target_handle_disconnect+0xd7/0x450 [ptlrpc]
[1708550.782634]  [&amp;lt;ffffffffc0efeb77&amp;gt;] tgt_disconnect+0x37/0x140 [ptlrpc]
[1708550.789194]  [&amp;lt;ffffffffc0f0635a&amp;gt;] tgt_request_handle+0xaea/0x1580 [ptlrpc]
[1708550.796272]  [&amp;lt;ffffffffc0edfa51&amp;gt;] ? ptlrpc_nrs_req_get_nolock0+0xd1/0x170 [ptlrpc]
[1708550.804022]  [&amp;lt;ffffffffc0aa3bde&amp;gt;] ? ktime_get_real_seconds+0xe/0x10 [libcfs]
[1708550.811281]  [&amp;lt;ffffffffc0eaa92b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
[1708550.819142]  [&amp;lt;ffffffffc0ea77b5&amp;gt;] ? ptlrpc_wait_event+0xa5/0x360 [ptlrpc]
[1708550.826110]  [&amp;lt;ffffffff8dcd67c2&amp;gt;] ? default_wake_function+0x12/0x20
[1708550.832548]  [&amp;lt;ffffffff8dccba9b&amp;gt;] ? __wake_up_common+0x5b/0x90
[1708550.838589]  [&amp;lt;ffffffffc0eae25c&amp;gt;] ptlrpc_main+0xafc/0x1fc0 [ptlrpc]
[1708550.845068]  [&amp;lt;ffffffffc0ead760&amp;gt;] ? ptlrpc_register_service+0xf80/0xf80 [ptlrpc]
[1708550.852636]  [&amp;lt;ffffffff8dcc1c31&amp;gt;] kthread+0xd1/0xe0
[1708550.857688]  [&amp;lt;ffffffff8dcc1b60&amp;gt;] ? insert_kthread_work+0x40/0x40
[1708550.863956]  [&amp;lt;ffffffff8e374c24&amp;gt;] ret_from_fork_nospec_begin+0xe/0x21
[1708550.870567]  [&amp;lt;ffffffff8dcc1b60&amp;gt;] ? insert_kthread_work+0x40/0x40
 &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
</description>
                <environment>CentOS 7.6, 3.10.0-957.1.3.el7_lustre.x86_64</environment>
        <key id="54805">LU-11939</key>
            <summary>ASSERTION( tgd-&gt;tgd_tot_granted &gt;= ted-&gt;ted_grant ) on OSS</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="tappro">Mikhail Pershin</assignee>
                                    <reporter username="sthiell">Stephane Thiell</reporter>
                        <labels>
                    </labels>
                <created>Wed, 6 Feb 2019 21:25:33 +0000</created>
                <updated>Mon, 22 Nov 2021 15:52:40 +0000</updated>
                            <resolved>Thu, 18 Nov 2021 00:11:57 +0000</resolved>
                                    <version>Lustre 2.12.0</version>
                                    <fixVersion>Lustre 2.14.0</fixVersion>
                    <fixVersion>Lustre 2.12.8</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>7</watches>
                                                                            <comments>
                            <comment id="241524" author="sthiell" created="Thu, 7 Feb 2019 05:37:31 +0000"  >&lt;p&gt;Same crash happened on OST unmount while trying to fix another issue (@tcp announcing clients, &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11888&quot; title=&quot;Unreachable client NID confusing Lustre 2.12&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11888&quot;&gt;LU-11888&lt;/a&gt;):&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[1739660.322150] Lustre: Failing over fir-OST000b
[1739660.326649] Lustre: Skipped 5 previous similar messages
[1739660.337361] LustreError: 24209:0:(tgt_grant.c:1079:tgt_grant_discard()) ASSERTION( tgd-&amp;gt;tgd_tot_granted &amp;gt;= ted-&amp;gt;ted_grant ) failed: fir-OST0005: tot_granted 2114072895 cli e43da944-d239-923f-8f68-10646264727b/ffff90a28977ac00 ted_grant -12582912
[1739660.359265] LustreError: 24209:0:(tgt_grant.c:1079:tgt_grant_discard()) LBUG
[1739660.366492] Pid: 24209, comm: umount 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018
[1739660.376143] Call Trace:
[1739660.378780]  [&amp;lt;ffffffffc0afd7cc&amp;gt;] libcfs_call_trace+0x8c/0xc0 [libcfs]
[1739660.385519]  [&amp;lt;ffffffffc0afd87c&amp;gt;] lbug_with_loc+0x4c/0xa0 [libcfs]
[1739660.391906]  [&amp;lt;ffffffffc13fb0bc&amp;gt;] tgt_grant_discard+0x1dc/0x1e0 [ptlrpc]
[1739660.398876]  [&amp;lt;ffffffffc12631d4&amp;gt;] ofd_obd_disconnect+0x74/0x220 [ofd]
[1739660.405522]  [&amp;lt;ffffffffc0d2e7d6&amp;gt;] class_disconnect_export_list+0x1c6/0x680 [obdclass]
[1739660.413582]  [&amp;lt;ffffffffc0d2eda5&amp;gt;] class_disconnect_exports+0x115/0x310 [obdclass]
[1739660.421279]  [&amp;lt;ffffffffc0d493e7&amp;gt;] class_cleanup+0x297/0xbd0 [obdclass]
[1739660.428031]  [&amp;lt;ffffffffc0d4a9ac&amp;gt;] class_process_config+0x65c/0x2830 [obdclass]
[1739660.435465]  [&amp;lt;ffffffffc0d4cd46&amp;gt;] class_manual_cleanup+0x1c6/0x710 [obdclass]
[1739660.442816]  [&amp;lt;ffffffffc0d7dc2e&amp;gt;] server_put_super+0x8de/0xcd0 [obdclass]
[1739660.449827]  [&amp;lt;ffffffff84243dbd&amp;gt;] generic_shutdown_super+0x6d/0x100
[1739660.456303]  [&amp;lt;ffffffff842441b2&amp;gt;] kill_anon_super+0x12/0x20
[1739660.462077]  [&amp;lt;ffffffffc0d4f8b2&amp;gt;] lustre_kill_super+0x32/0x50 [obdclass]
[1739660.468989]  [&amp;lt;ffffffff8424456e&amp;gt;] deactivate_locked_super+0x4e/0x70
[1739660.475456]  [&amp;lt;ffffffff84244cf6&amp;gt;] deactivate_super+0x46/0x60
[1739660.481314]  [&amp;lt;ffffffff8426326f&amp;gt;] cleanup_mnt+0x3f/0x80
[1739660.486739]  [&amp;lt;ffffffff84263302&amp;gt;] __cleanup_mnt+0x12/0x20
[1739660.492338]  [&amp;lt;ffffffff840be79b&amp;gt;] task_work_run+0xbb/0xe0
[1739660.497948]  [&amp;lt;ffffffff8402bc65&amp;gt;] do_notify_resume+0xa5/0xc0
[1739660.503813]  [&amp;lt;ffffffff84775124&amp;gt;] int_signal+0x12/0x17
[1739660.509154]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
[1739660.514347] Kernel panic - not syncing: LBUG
[1739660.518793] CPU: 0 PID: 24209 Comm: umount Kdump: loaded Tainted: G           OE  ------------   3.10.0-957.1.3.el7_lustre.x86_64 #1
[1739660.530863] Hardware name: Dell Inc. PowerEdge R6415/065PKD, BIOS 1.6.7 10/29/2018
[1739660.538601] Call Trace:
[1739660.541232]  [&amp;lt;ffffffff84761e41&amp;gt;] dump_stack+0x19/0x1b
[1739660.546543]  [&amp;lt;ffffffff8475b550&amp;gt;] panic+0xe8/0x21f
[1739660.551512]  [&amp;lt;ffffffffc0afd8cb&amp;gt;] lbug_with_loc+0x9b/0xa0 [libcfs]
[1739660.557903]  [&amp;lt;ffffffffc13fb0bc&amp;gt;] tgt_grant_discard+0x1dc/0x1e0 [ptlrpc]
[1739660.564783]  [&amp;lt;ffffffffc12631d4&amp;gt;] ofd_obd_disconnect+0x74/0x220 [ofd]
[1739660.571411]  [&amp;lt;ffffffffc0c89f02&amp;gt;] ? libcfs_nid2str_r+0xe2/0x130 [lnet]
[1739660.578124]  [&amp;lt;ffffffffc0d2e7d6&amp;gt;] class_disconnect_export_list+0x1c6/0x680 [obdclass]
[1739660.586143]  [&amp;lt;ffffffffc0d2eda5&amp;gt;] class_disconnect_exports+0x115/0x310 [obdclass]
[1739660.593814]  [&amp;lt;ffffffffc0d493e7&amp;gt;] class_cleanup+0x297/0xbd0 [obdclass]
[1739660.600517]  [&amp;lt;ffffffffc0b03f07&amp;gt;] ? libcfs_debug_msg+0x57/0x80 [libcfs]
[1739660.607324]  [&amp;lt;ffffffffc0d30836&amp;gt;] ? class_name2dev_nolock+0x46/0xb0 [obdclass]
[1739660.614735]  [&amp;lt;ffffffffc0d4a9ac&amp;gt;] class_process_config+0x65c/0x2830 [obdclass]
[1739660.622131]  [&amp;lt;ffffffffc0b03f07&amp;gt;] ? libcfs_debug_msg+0x57/0x80 [libcfs]
[1739660.628939]  [&amp;lt;ffffffffc0d4cd46&amp;gt;] class_manual_cleanup+0x1c6/0x710 [obdclass]
[1739660.636265]  [&amp;lt;ffffffffc0d7dc2e&amp;gt;] server_put_super+0x8de/0xcd0 [obdclass]
[1739660.643223]  [&amp;lt;ffffffff84243dbd&amp;gt;] generic_shutdown_super+0x6d/0x100
[1739660.649661]  [&amp;lt;ffffffff842441b2&amp;gt;] kill_anon_super+0x12/0x20
[1739660.655427]  [&amp;lt;ffffffffc0d4f8b2&amp;gt;] lustre_kill_super+0x32/0x50 [obdclass]
[1739660.662296]  [&amp;lt;ffffffff8424456e&amp;gt;] deactivate_locked_super+0x4e/0x70
[1739660.668736]  [&amp;lt;ffffffff84244cf6&amp;gt;] deactivate_super+0x46/0x60
[1739660.674571]  [&amp;lt;ffffffff8426326f&amp;gt;] cleanup_mnt+0x3f/0x80
[1739660.679968]  [&amp;lt;ffffffff84263302&amp;gt;] __cleanup_mnt+0x12/0x20
[1739660.685541]  [&amp;lt;ffffffff840be79b&amp;gt;] task_work_run+0xbb/0xe0
[1739660.691115]  [&amp;lt;ffffffff8402bc65&amp;gt;] do_notify_resume+0xa5/0xc0
[1739660.696947]  [&amp;lt;ffffffff84775124&amp;gt;] int_signal+0x12/0x17
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;vmcore is available if needed&lt;/p&gt;</comment>
                            <comment id="241556" author="pjones" created="Thu, 7 Feb 2019 18:27:46 +0000"  >&lt;p&gt;Patrick is investigating&lt;/p&gt;</comment>
                            <comment id="241558" author="pfarrell" created="Thu, 7 Feb 2019 18:32:46 +0000"  >&lt;p&gt;A vmcore would be welcome; first off, though, the dmesg output from both crashes would be great to have.&lt;/p&gt;



&lt;p&gt;Few questions.&lt;/p&gt;
&lt;ol&gt;
	&lt;li&gt;OST backing file system - ZFS or ldiskfs?&lt;/li&gt;
	&lt;li&gt;Any evictions?&#160; (Hoping to see that in dmesg)&lt;/li&gt;
&lt;/ol&gt;
</comment>
                            <comment id="241573" author="sthiell" created="Thu, 7 Feb 2019 23:29:21 +0000"  >&lt;p&gt;Hi Patrick,&lt;/p&gt;

&lt;p&gt;Thanks for investigating and congrats on your new position!! &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;

&lt;p&gt;1. ldiskfs&lt;br/&gt;
2. many evictions due to clients configured with tcp0 nids even though the servers are IB only, see &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11888&quot; title=&quot;Unreachable client NID confusing Lustre 2.12&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11888&quot;&gt;LU-11888&lt;/a&gt;/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11937&quot; title=&quot;lnet.service randomly load tcp NIDs&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11937&quot;&gt;&lt;del&gt;LU-11937&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Attached the dmesg to the ticket:&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;first crash is vmcore-dmesg_fir-io3-s2_2019-02-06_12_50_35.txt&lt;/li&gt;
	&lt;li&gt;second crash while unmounting is vmcore-dmesg_fir-io1-s2_2019-02-06_21_29_56.txt&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;I also just uploaded the two vmcores to the ftp:&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;vmcore_fir-io3-s2_2019-02-06_12_50_35.gz&lt;/li&gt;
	&lt;li&gt;vmcore_fir-io1-s2_2019-02-06_21_29_56.gz&lt;br/&gt;
I believe the debuginfo rpms are already available in the ftp:&lt;/li&gt;
	&lt;li&gt;kernel-debuginfo-3.10.0-957.1.3.el7_lustre.x86_64.rpm&lt;/li&gt;
	&lt;li&gt;kernel-debuginfo-common-x86_64-3.10.0-957.1.3.el7_lustre.x86_64.rpm&lt;br/&gt;
and I also uploaded our lustre debuginfo rpm:&lt;/li&gt;
	&lt;li&gt;lustre-debuginfo-2.12.0-1.el7.x86_64.rpm&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;No more crashes so far since we restarted all servers and fixed our clients that were announcing *@tcp NIDs.&lt;/p&gt;

&lt;p&gt;Thanks!&lt;br/&gt;
Stephane&lt;/p&gt;
</comment>
                            <comment id="241575" author="pfarrell" created="Fri, 8 Feb 2019 01:41:26 +0000"  >&lt;p&gt;Thanks, Stephane!&lt;/p&gt;

&lt;p&gt;So as you&apos;ve probably guessed, there&apos;s some sort of bug related to grant handling on evictions.&#160; If you&apos;re not having evictions, you shouldn&apos;t see this.&#160; So you&apos;re probably good to go from a &quot;site reliability&quot; perspective.&lt;/p&gt;

&lt;p&gt;I&apos;m going to look into the grant behavior at eviction, and also see about not asserting here:&#160; clean up and print an error instead.&lt;/p&gt;</comment>
                            <comment id="241601" author="gerrit" created="Fri, 8 Feb 2019 17:07:11 +0000"  >&lt;p&gt;Patrick Farrell (pfarrell@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/34215&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/34215&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11939&quot; title=&quot;ASSERTION( tgd-&amp;gt;tgd_tot_granted &amp;gt;= ted-&amp;gt;ted_grant ) on OSS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11939&quot;&gt;&lt;del&gt;LU-11939&lt;/del&gt;&lt;/a&gt; tgt: Do not assert during grant cleanup&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: f44d793b8a4786042a4cf38cf967ce9686da5b81&lt;/p&gt;</comment>
                            <comment id="241605" author="pfarrell" created="Fri, 8 Feb 2019 17:17:11 +0000"  >&lt;p&gt;Stephane,&lt;/p&gt;

&lt;p&gt;Looking a little into the underlying bug...&lt;/p&gt;

&lt;p&gt;What are your max_pages_per_rpc, max_dirty_mb, and max_rpcs_in_flight settings on the client?&lt;/p&gt;</comment>
                            <comment id="241628" author="sthiell" created="Fri, 8 Feb 2019 20:13:21 +0000"  >&lt;p&gt;Thanks Patrick!&lt;/p&gt;

&lt;p&gt;On our clients, we have:&lt;/p&gt;
&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;max_rpcs_in_flight=8 (default), only the data transfer nodes and robinhood server have max_rpcs_in_flight=32&lt;/li&gt;
	&lt;li&gt;max_dirty_mb=2000 (default), only the data transfer nodes and robinhood server have max_dirty_mb=128&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;As for max_pages_per_rpc, it should be set to 4096 and brw_size=16, but I noticed that it doesn&apos;t seem to be the case on all clients:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@sh-101-20 ~]# cd /proc/fs/lustre/osc; for o in fir-*; do echo -n &quot;$o:&quot;; cat $o/max_pages_per_rpc; done
fir-OST0000-osc-ffff9d0cad3de000:4096
fir-OST0001-osc-ffff9d0cad3de000:4096
fir-OST0002-osc-ffff9d0cad3de000:1024
fir-OST0003-osc-ffff9d0cad3de000:1024
fir-OST0004-osc-ffff9d0cad3de000:1024
fir-OST0005-osc-ffff9d0cad3de000:1024
fir-OST0006-osc-ffff9d0cad3de000:1024
fir-OST0007-osc-ffff9d0cad3de000:4096
fir-OST0008-osc-ffff9d0cad3de000:1024
fir-OST0009-osc-ffff9d0cad3de000:1024
fir-OST000a-osc-ffff9d0cad3de000:1024
fir-OST000b-osc-ffff9d0cad3de000:1024
fir-OST000c-osc-ffff9d0cad3de000:1024
fir-OST000d-osc-ffff9d0cad3de000:1024
fir-OST000e-osc-ffff9d0cad3de000:1024
fir-OST000f-osc-ffff9d0cad3de000:1024
fir-OST0010-osc-ffff9d0cad3de000:4096
fir-OST0011-osc-ffff9d0cad3de000:4096
fir-OST0012-osc-ffff9d0cad3de000:4096
fir-OST0013-osc-ffff9d0cad3de000:1024
fir-OST0014-osc-ffff9d0cad3de000:1024
fir-OST0015-osc-ffff9d0cad3de000:4096
fir-OST0016-osc-ffff9d0cad3de000:1024
fir-OST0017-osc-ffff9d0cad3de000:1024
fir-OST0018-osc-ffff9d0cad3de000:4096
fir-OST0019-osc-ffff9d0cad3de000:4096
fir-OST001a-osc-ffff9d0cad3de000:1024
fir-OST001b-osc-ffff9d0cad3de000:1024
fir-OST001c-osc-ffff9d0cad3de000:4096
fir-OST001d-osc-ffff9d0cad3de000:4096
fir-OST001e-osc-ffff9d0cad3de000:1024
fir-OST001f-osc-ffff9d0cad3de000:1024
fir-OST0020-osc-ffff9d0cad3de000:4096
fir-OST0021-osc-ffff9d0cad3de000:4096
fir-OST0022-osc-ffff9d0cad3de000:1024
fir-OST0023-osc-ffff9d0cad3de000:1024
fir-OST0024-osc-ffff9d0cad3de000:1024
fir-OST0025-osc-ffff9d0cad3de000:1024
fir-OST0026-osc-ffff9d0cad3de000:1024
fir-OST0027-osc-ffff9d0cad3de000:1024
fir-OST0028-osc-ffff9d0cad3de000:1024
fir-OST0029-osc-ffff9d0cad3de000:1024
fir-OST002a-osc-ffff9d0cad3de000:4096
fir-OST002b-osc-ffff9d0cad3de000:4096
fir-OST002c-osc-ffff9d0cad3de000:4096
fir-OST002d-osc-ffff9d0cad3de000:1024
fir-OST002e-osc-ffff9d0cad3de000:1024
fir-OST002f-osc-ffff9d0cad3de000:1024
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;We used this on the MGS:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lctl set_param -P fir-OST*.osc.max_pages_per_rpc=4096
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;So this isn&apos;t good. I just re-applied this command on the MGS:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[138882.544463] Lustre: Modifying parameter osc.fir-OST*.osc.max_pages_per_rpc in log params
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;and a newly mounted client is now set up at 4096. Do you think that could have caused this issue?&lt;/p&gt;

&lt;p&gt;As for brw_size:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# clush -w @oss -b &apos;cat /proc/fs/lustre/obdfilter/*/brw_size&apos;
---------------
fir-io[1-4]-s[1-2] (8)
---------------
16
16
16
16
16
16
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="241635" author="pfarrell" created="Fri, 8 Feb 2019 21:23:03 +0000"  >&lt;p&gt;Pages per RPC isn&apos;t a big deal (so, no, probably not), but max_dirty_mb may be.&lt;/p&gt;

&lt;p&gt;Hmm, 2000 is &lt;b&gt;not&lt;/b&gt; the default (the default is, I think, max_rpcs_in_flight * RPC size...&#160; It&apos;s certainly much smaller than this value.).&#160; So that&apos;s getting set somewhere.&lt;/p&gt;

&lt;p&gt;It&apos;s also potentially too high.&#160; max_dirty_mb is used in calculating grant requests, and there are some grant overflow bugs that occur when it&apos;s set that high.&#160; (Particularly with 16 MiB RPCs.)&#160; All the ones I know of are fixed in 2.12, but...&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;I strongly suspect you may have hit an overflow, leading to the grant inconsistency, leading to this crash.&#160;&lt;/p&gt;

&lt;p&gt;The grant value reported for the export for this client is negative -&#160;ted_grant -49152 in one case, in the other ted_grant -12582912.&#160; These small-ish negative values strongly suggest overflow.&#160; The server side value being compared against (tot_granted) is unsigned, and comparison with this negative value is why the &quot;total grant &amp;gt;= grant for this export&quot; assert we hit failed.&#160; (The fact that your max_dirty_mb is at 2 GiB just makes the overflow explanation more likely.)&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;Your max_dirty_mb is above what should help performance, so tuning it down is a good idea.&lt;/p&gt;

&lt;p&gt;The rule I use for max_dirty_mb is 2 * mb_per_rpc * rpcs_in_flight - the idea being that you can accumulate some dirty data so you&apos;re always ready to make an RPC when one completes, but you don&apos;t have tons of dirty data sitting around if it isn&apos;t getting processed fast enough.&#160; (There are some docs floating around that say 4*, but with RPC sizes and counts increasing, that tends to be too much data.&#160; 2* should be plenty for good performance.)&lt;/p&gt;

&lt;p&gt;So for you, that&apos;s 2*16*8 = 256, or in the case of your RBH nodes, that&apos;s 2*16*32=1024.&lt;/p&gt;

&lt;p&gt;So I&apos;d suggest turning down your max_dirty_mb to no more than 1 GiB.&lt;/p&gt;</comment>
                            <comment id="241650" author="sthiell" created="Sat, 9 Feb 2019 01:01:59 +0000"  >&lt;p&gt;Wow, thanks so much for the detailed explanation. This is SUPER helpful. But... I don&apos;t think we have explicitly changed the value of &lt;tt&gt;max_dirty_mb&lt;/tt&gt;. So I&apos;ve been trying to track down why it is so high for ALL of our Lustre filesystems mounted on Sherlock (regal, oak and fir). If I understand correctly,&#160;&lt;tt&gt;/sys/fs/lustre/max_dirty_mb&lt;/tt&gt; is used by the udev script provided by the lustre-client RPM, right? And then the values probably max out at 2000?&lt;br/&gt;
&#160;&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@sh-ln06 ~]# cat /sys/fs/lustre/version
2.12.0
[root@sh-ln06 ~]# cat /sys/fs/lustre/max_dirty_mb 
32107
[root@sh-ln06 ~]# ls -l /sys/fs/lustre/max_dirty_mb
-rw-r--r-- 1 root root 4096 Feb&#160; 8 16:54 /sys/fs/lustre/max_dirty_mb
[root@sh-ln06 ~]# cat /etc/udev/rules.d/99-lustre.rules
KERNEL==&quot;obd&quot;, MODE=&quot;0666&quot;

# set sysfs values on client
SUBSYSTEM==&quot;lustre&quot;, ACTION==&quot;change&quot;, ENV{PARAM}==&quot;?*&quot;, RUN+=&quot;/usr/sbin/lctl set_param &apos;$env{PARAM}=$env{SETTING}&apos;&quot;


[root@sh-ln06 ~]# rpm -q --info lustre-client
Name        : lustre-client
Version     : 2.12.0
Release     : 1.el7
Architecture: x86_64
Install Date: Tue 05 Feb 2019 05:17:58 PM PST
Group       : System Environment/Kernel
Size        : 2007381
License     : GPL
Signature   : (none)
Source RPM  : lustre-client-2.12.0-1.el7.src.rpm
Build Date  : Fri 21 Dec 2018 01:53:18 PM PST
Build Host  : trevis-307-el7-x8664-3.trevis.whamcloud.com
Relocations : (not relocatable)
URL         : https://wiki.whamcloud.com/
Summary     : Lustre File System
Description :
Userspace tools and files for the Lustre file system.


[root@sh-ln06 ~]# lctl get_param osc.*.max_dirty_mb
osc.fir-OST0000-osc-ffff9bad01395000.max_dirty_mb=2000
osc.fir-OST0001-osc-ffff9bad01395000.max_dirty_mb=2000
...
osc.fir-OST002e-osc-ffff9bad01395000.max_dirty_mb=2000
osc.fir-OST002f-osc-ffff9bad01395000.max_dirty_mb=2000
osc.oak-OST0000-osc-ffff9baceaa3d800.max_dirty_mb=2000
osc.oak-OST0001-osc-ffff9baceaa3d800.max_dirty_mb=2000
...
osc.oak-OST0071-osc-ffff9baceaa3d800.max_dirty_mb=2000
osc.regal-OST0000-osc-ffff9bace6e28800.max_dirty_mb=2000
...
osc.regal-OST006a-osc-ffff9bace6e28800.max_dirty_mb=2000
osc.regal-OST006b-osc-ffff9bace6e28800.max_dirty_mb=2000
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="241652" author="pfarrell" created="Sat, 9 Feb 2019 01:57:49 +0000"  >&lt;p&gt;Hmm, I&apos;m not familiar with the script, so I don&apos;t really know.&#160; I don&apos;t think so, though...?&lt;/p&gt;

&lt;p&gt;It&apos;s possible you&apos;re hitting:&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11919&quot; class=&quot;external-link&quot; rel=&quot;nofollow&quot;&gt;https://jira.whamcloud.com/browse/LU-11919&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Which is basically &quot;cl_max_dirty_mb is supposed to start at zero, but instead starts with whatever was in memory&quot;.&#160; Then, whatever was in memory is processed like it was a setting from userspace.&#160; So if it&apos;s not zero (the most likely case, especially at startup), it&apos;s reasonably likely (though not guaranteed - it&apos;s more complicated than just &quot;existing value in memory is &amp;gt; 2000 means 2000&quot;) to get set to the max.&lt;/p&gt;

&lt;p&gt;Anyway, you can override that with a set_param -P.&lt;/p&gt;</comment>
                            <comment id="241659" author="sthiell" created="Sat, 9 Feb 2019 07:41:19 +0000"  >&lt;p&gt;OK, we never noticed that before (with 2.10 clients). Thanks for your help! I used set_param -P on the MGS of Fir to set max_dirty_mb to 256 and it did work.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lctl set_param -P osc.*.max_dirty_mb=256
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@sh-ln06 ~]# lctl get_param osc.*.max_dirty_mb
osc.fir-OST0000-osc-ffff9bad01395000.max_dirty_mb=256
osc.fir-OST0001-osc-ffff9bad01395000.max_dirty_mb=256
...
osc.fir-OST002e-osc-ffff9bad01395000.max_dirty_mb=256
osc.fir-OST002f-osc-ffff9bad01395000.max_dirty_mb=256
osc.oak-OST0000-osc-ffff9baceaa3d800.max_dirty_mb=256
osc.oak-OST0001-osc-ffff9baceaa3d800.max_dirty_mb=256
...
osc.oak-OST0070-osc-ffff9baceaa3d800.max_dirty_mb=256
osc.oak-OST0071-osc-ffff9baceaa3d800.max_dirty_mb=256
osc.regal-OST0000-osc-ffff9bace6e28800.max_dirty_mb=256
osc.regal-OST0001-osc-ffff9bace6e28800.max_dirty_mb=256
osc.regal-OST0002-osc-ffff9bace6e28800.max_dirty_mb=256
...
osc.regal-OST006b-osc-ffff9bace6e28800.max_dirty_mb=256
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;So that should be much better. I&apos;ll report any new event regarding this issue, but so far so good. Thanks again.&lt;/p&gt;</comment>
                            <comment id="242110" author="pjones" created="Sat, 16 Feb 2019 16:47:52 +0000"  >&lt;p&gt;So ok to close this one as a duplicate of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11919&quot; title=&quot;cl_max_dirty_pages not zeroed on startup&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11919&quot;&gt;&lt;del&gt;LU-11919&lt;/del&gt;&lt;/a&gt;?&lt;/p&gt;</comment>
                            <comment id="245445" author="pfarrell" created="Mon, 8 Apr 2019 19:03:38 +0000"  >&lt;p&gt;Nah, we&apos;ve still got a patch to track under this&lt;/p&gt;</comment>
                            <comment id="251242" author="pfarrell" created="Fri, 12 Jul 2019 14:13:41 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=tappro&quot; class=&quot;user-hover&quot; rel=&quot;tappro&quot;&gt;tappro&lt;/a&gt;:&lt;/p&gt;

&lt;p&gt;Mike,&lt;/p&gt;

&lt;p&gt;Didn&apos;t you fix this grant bug in another LU?&#160; I can&apos;t find it right now...&lt;/p&gt;</comment>
                            <comment id="251287" author="tappro" created="Fri, 12 Jul 2019 18:42:31 +0000"  >&lt;p&gt;Patrick, do you mean patch from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12120&quot; title=&quot;LustreError: 15069:0:(tgt_grant.c:561:tgt_grant_incoming()) LBUG &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12120&quot;&gt;&lt;del&gt;LU-12120&lt;/del&gt;&lt;/a&gt;?&lt;/p&gt;</comment>
                            <comment id="251289" author="pfarrell" created="Fri, 12 Jul 2019 18:51:40 +0000"  >&lt;p&gt;Yes, that looks like the right one.&#160; Do you agree that should take care of this issue as well?&lt;/p&gt;</comment>
                            <comment id="254964" author="pjones" created="Wed, 18 Sep 2019 11:20:29 +0000"  >&lt;p&gt;Mike confirms that this is a duplicate of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12120&quot; title=&quot;LustreError: 15069:0:(tgt_grant.c:561:tgt_grant_incoming()) LBUG &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12120&quot;&gt;&lt;del&gt;LU-12120&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="262879" author="gerrit" created="Sat, 8 Feb 2020 03:59:35 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/34215/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/34215/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11939&quot; title=&quot;ASSERTION( tgd-&amp;gt;tgd_tot_granted &amp;gt;= ted-&amp;gt;ted_grant ) on OSS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11939&quot;&gt;&lt;del&gt;LU-11939&lt;/del&gt;&lt;/a&gt; tgt: Do not assert during grant cleanup&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: af2d3ac30eafead6b47c5db20d76433c091d89de&lt;/p&gt;</comment>
                            <comment id="317669" author="gerrit" created="Mon, 8 Nov 2021 18:47:49 +0000"  >&lt;p&gt;&quot;Mike Pershin &amp;lt;mpershin@whamcloud.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/45489&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/45489&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11939&quot; title=&quot;ASSERTION( tgd-&amp;gt;tgd_tot_granted &amp;gt;= ted-&amp;gt;ted_grant ) on OSS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11939&quot;&gt;&lt;del&gt;LU-11939&lt;/del&gt;&lt;/a&gt; tgt: Do not assert during grant cleanup&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 81f17cf04fc7d4d4bd7ab87cfe572b7f59cf81f3&lt;/p&gt;</comment>
                            <comment id="318439" author="gerrit" created="Wed, 17 Nov 2021 18:44:19 +0000"  >&lt;p&gt;&quot;Oleg Drokin &amp;lt;green@whamcloud.com&amp;gt;&quot; merged in patch &lt;a href=&quot;https://review.whamcloud.com/45489/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/45489/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11939&quot; title=&quot;ASSERTION( tgd-&amp;gt;tgd_tot_granted &amp;gt;= ted-&amp;gt;ted_grant ) on OSS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11939&quot;&gt;&lt;del&gt;LU-11939&lt;/del&gt;&lt;/a&gt; tgt: Do not assert during grant cleanup&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 372c77f0a11573e9f8818751c24735e151aafc74&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                            <outwardlinks description="duplicates">
                                        <issuelink>
            <issuekey id="55257">LU-12120</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                                        </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="31955" name="vmcore-dmesg_fir-io1-s2_2019-02-06_21_29_56.txt" size="952218" author="sthiell" created="Thu, 7 Feb 2019 23:14:05 +0000"/>
                            <attachment id="31954" name="vmcore-dmesg_fir-io3-s2_2019-02-06_12_50_35.txt" size="934279" author="sthiell" created="Thu, 7 Feb 2019 23:14:02 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00b5r:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>