<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:45:41 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-11644] LNet: Service thread inactive for 300  causes client evictions </title>
                <link>https://jira.whamcloud.com/browse/LU-11644</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Update to 2.10.5 now we are seeing periods of mass evictions from servers. On the server we have the following stack trace&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
Nov  7 11:33:12 nbp8-oss7 kernel: [531465.033253] Pid: 11080, comm: ll_ost01_220 3.10.0-693.21.1.el7.20180508.x86_64.lustre2105 #1 SMP Mon Aug 27 23:04:41 UTC 2018
Nov  7 11:33:12 nbp8-oss7 kernel: [531465.033260] Call Trace:
Nov  7 11:33:12 nbp8-oss7 kernel: [531465.033274]  [&amp;lt;ffffffffa0c1d0e0&amp;gt;] ptlrpc_set_wait+0x4c0/0x920 [ptlrpc]
Nov  7 11:33:12 nbp8-oss7 kernel: [531465.038258]  [&amp;lt;ffffffffa0bdae43&amp;gt;] ldlm_run_ast_work+0xd3/0x3a0 [ptlrpc]
Nov  7 11:33:12 nbp8-oss7 kernel: [531465.038285]  [&amp;lt;ffffffffa0bfbabb&amp;gt;] ldlm_glimpse_locks+0x3b/0x100 [ptlrpc]
Nov  7 11:33:12 nbp8-oss7 kernel: [531465.038294]  [&amp;lt;ffffffffa10e78a4&amp;gt;] ofd_intent_policy+0x444/0xa40 [ofd]
Nov  7 11:33:12 nbp8-oss7 kernel: [531465.038318]  [&amp;lt;ffffffffa0bda2ba&amp;gt;] ldlm_lock_enqueue+0x38a/0x980 [ptlrpc]
Nov  7 11:33:12 nbp8-oss7 kernel: [531465.038346]  [&amp;lt;ffffffffa0c03b53&amp;gt;] ldlm_handle_enqueue0+0x9d3/0x16a0 [ptlrpc]
Nov  7 11:33:12 nbp8-oss7 kernel: [531465.038390]  [&amp;lt;ffffffffa0c89262&amp;gt;] tgt_enqueue+0x62/0x210 [ptlrpc]
Nov  7 11:33:12 nbp8-oss7 kernel: [531465.038425]  [&amp;lt;ffffffffa0c8ceca&amp;gt;] tgt_request_handle+0x92a/0x1370 [ptlrpc]
Nov  7 11:33:12 nbp8-oss7 kernel: [531465.038455]  [&amp;lt;ffffffffa0c354bb&amp;gt;] ptlrpc_server_handle_request+0x23b/0xaa0 [ptlrpc]
Nov  7 11:33:12 nbp8-oss7 kernel: [531465.038484]  [&amp;lt;ffffffffa0c394a2&amp;gt;] ptlrpc_main+0xa92/0x1e40 [ptlrpc]
Nov  7 11:33:12 nbp8-oss7 kernel: [531465.038489]  [&amp;lt;ffffffff810b1131&amp;gt;] kthread+0xd1/0xe0
Nov  7 11:33:12 nbp8-oss7 kernel: [531465.038492]  [&amp;lt;ffffffff816a14f7&amp;gt;] ret_from_fork+0x77/0xb0
Nov  7 11:33:12 nbp8-oss7 kernel: [531465.038512]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
Nov  7 11:33:12 nbp8-oss7 kernel: [531465.038515] LustreError: dumping log to /tmp/lustre-log.1541619192.11080
Nov  7 11:33:14 nbp8-oss7 kernel: [531467.254898] LNet: Service thread pid 9724 was inactive &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 303.19s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; debugging purposes:
Nov  7 11:33:14 nbp8-oss7 kernel: [531467.310852] Pid: 9724, comm: ll_ost01_019 3.10.0-693.21.1.el7.20180508.x86_64.lustre2105 #1 SMP Mon Aug 27 23:04:41 UTC 2018
Nov  7 11:33:14 nbp8-oss7 kernel: [531467.310854] Call Trace:
Nov  7 11:33:14 nbp8-oss7 kernel: [531467.310866]  [&amp;lt;ffffffffa0c1d0e0&amp;gt;] ptlrpc_set_wait+0x4c0/0x920 [ptlrpc]
Nov  7 11:33:14 nbp8-oss7 kernel: [531467.332869]  [&amp;lt;ffffffffa0bdae43&amp;gt;] ldlm_run_ast_work+0xd3/0x3a0 [ptlrpc]
Nov  7 11:33:14 nbp8-oss7 kernel: [531467.332902]  [&amp;lt;ffffffffa0bfbabb&amp;gt;] ldlm_glimpse_locks+0x3b/0x100 [ptlrpc]
Nov  7 11:33:14 nbp8-oss7 kernel: [531467.332912]  [&amp;lt;ffffffffa10e78a4&amp;gt;] ofd_intent_policy+0x444/0xa40 [ofd]
Nov  7 11:33:14 nbp8-oss7 kernel: [531467.332936]  [&amp;lt;ffffffffa0bda2ba&amp;gt;] ldlm_lock_enqueue+0x38a/0x980 [ptlrpc]
Nov  7 11:33:15 nbp8-oss7 kernel: [531467.332988]  [&amp;lt;ffffffffa0c03b53&amp;gt;] ldlm_handle_enqueue0+0x9d3/0x16a0 [ptlrpc]
Nov  7 11:33:15 nbp8-oss7 kernel: [531467.333032]  [&amp;lt;ffffffffa0c89262&amp;gt;] tgt_enqueue+0x62/0x210 [ptlrpc]
Nov  7 11:33:15 nbp8-oss7 kernel: [531467.333067]  [&amp;lt;ffffffffa0c8ceca&amp;gt;] tgt_request_handle+0x92a/0x1370 [ptlrpc]
Nov  7 11:33:15 nbp8-oss7 kernel: [531467.333099]  [&amp;lt;ffffffffa0c354bb&amp;gt;] ptlrpc_server_handle_request+0x23b/0xaa0 [ptlrpc]
Nov  7 11:33:15 nbp8-oss7 kernel: [531467.333128]  [&amp;lt;ffffffffa0c394a2&amp;gt;] ptlrpc_main+0xa92/0x1e40 [ptlrpc]
Nov  7 11:33:15 nbp8-oss7 kernel: [531467.333134]  [&amp;lt;ffffffff810b1131&amp;gt;] kthread+0xd1/0xe0
Nov  7 11:33:15 nbp8-oss7 kernel: [531467.333137]  [&amp;lt;ffffffff816a14f7&amp;gt;] ret_from_fork+0x77/0xb0
Nov  7 11:33:15 nbp8-oss7 kernel: [531467.333158]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;will upload to ftp:/uploads/LU11613/lustre-log.1541619192.11080&lt;/p&gt;

&lt;p&gt;we didn&apos;t have rpctrace or dlmtrace so may not be very useful.&lt;/p&gt;

&lt;p&gt;Could be related to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11613&quot; class=&quot;external-link&quot; rel=&quot;nofollow&quot;&gt;https://jira.whamcloud.com/browse/LU-11613&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</description>
                <environment></environment>
        <key id="53967">LU-11644</key>
            <summary>LNet: Service thread inactive for 300  causes client evictions </summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="ashehata">Amir Shehata</assignee>
                                    <reporter username="mhanafi">Mahmoud Hanafi</reporter>
                        <labels>
                    </labels>
                <created>Thu, 8 Nov 2018 04:03:54 +0000</created>
                <updated>Fri, 14 May 2021 22:53:56 +0000</updated>
                                            <version>Lustre 2.10.5</version>
                    <version>Lustre 2.12.1</version>
                                                        <due></due>
                            <votes>1</votes>
                                    <watches>15</watches>
                                                                            <comments>
                            <comment id="236631" author="mhanafi" created="Thu, 8 Nov 2018 04:34:16 +0000"  >&lt;p&gt;Better logs, on a different servers. Uploaded ftp:/uploads/LU11644_1/lustre-log.1541648790.87846&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
Nov  7 19:44:29 nbp2-oss3 kernel: [820650.532934] Lustre: Skipped 3581 previous similar messages
Nov  7 19:44:29 nbp2-oss3 kernel: [820650.853265] Lustre: nbp2-OST012e: Client cedc88fe-8812-7c23-9986-b50c84150b0e (at 10.151.17.70@o2ib) reconnecting
Nov  7 19:44:29 nbp2-oss3 kernel: [820650.853266] Lustre: nbp2-OST0106: Client cedc88fe-8812-7c23-9986-b50c84150b0e (at 10.151.17.70@o2ib) reconnecting
Nov  7 19:44:29 nbp2-oss3 kernel: [820650.915564] Lustre: Skipped 91 previous similar messages
Nov  7 19:44:29 nbp2-oss3 kernel: [820650.975975] LustreError: 92324:0:(ldlm_lockd.c:2365:ldlm_cancel_handler()) ldlm_cancel from 10.149.11.42@o2ib313 arrived at 1541648669
 with bad export cookie 2655206210869360537
Nov  7 19:44:29 nbp2-oss3 kernel: [820651.023594] LustreError: 92324:0:(ldlm_lockd.c:2365:ldlm_cancel_handler()) Skipped 26315 previous similar messages
Nov  7 19:44:30 nbp2-oss3 kernel: [820651.424409] Lustre: nbp2-OST002a: haven&apos;t heard from client 7788fc55-4957-f8c6-7825-fd1fdbcba946 (at 10.149.2.70@o2ib313) in 206 secon
ds. I think it&apos;s dead, and I am evicting it. exp ffff88185cdcf400, cur 1541648670 expire 1541648520 last 1541648464
Nov  7 19:44:30 nbp2-oss3 kernel: [820651.490743] Lustre: Skipped 1602 previous similar messages
Nov  7 19:46:30 nbp2-oss3 kernel: [820771.470035] LustreError: 92941:0:(ldlm_lib.c:3237:target_bulk_io()) @@@ timeout on bulk WRITE after 300+0s  req@ffff8816d7e53c50 x1615
791089168704/t0(0) o4-&amp;gt;fde038a9-ef4b-50d5-6625-3411db4c7f00@10.149.15.68@o2ib313:717/0 lens 608/448 e 5 to 0 dl 1541648807 ref 1 fl Interpret:/0/0 rc 0/0
Nov  7 19:46:30 nbp2-oss3 kernel: [820771.546266] Lustre: nbp2-OST012e: Bulk IO write error with fde038a9-ef4b-50d5-6625-3411db4c7f00 (at 10.149.15.68@o2ib313), client will
 retry: rc = -110
Nov  7 19:46:30 nbp2-oss3 kernel: [820771.587136] Lustre: Skipped 7 previous similar messages
Nov  7 19:46:30 nbp2-oss3 kernel: [820771.607054] LNet: Service thread pid 87846 was inactive &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 301.04s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; debugging purposes:
Nov  7 19:46:30 nbp2-oss3 kernel: [820771.658059] Pid: 87846, comm: ll_ost01_184 3.10.0-693.21.1.el7.20180508.x86_64.lustre2105 #1 SMP Mon Aug 27 23:04:41 UTC 2018
Nov  7 19:46:30 nbp2-oss3 kernel: [820771.658060] Call Trace:
Nov  7 19:46:30 nbp2-oss3 kernel: [820771.658069]  [&amp;lt;ffffffffa10540e0&amp;gt;] ptlrpc_set_wait+0x4c0/0x920 [ptlrpc]
Nov  7 19:46:30 nbp2-oss3 kernel: [820771.677999]  [&amp;lt;ffffffffa1011e43&amp;gt;] ldlm_run_ast_work+0xd3/0x3a0 [ptlrpc]
Nov  7 19:46:30 nbp2-oss3 kernel: [820771.678029]  [&amp;lt;ffffffffa1032abb&amp;gt;] ldlm_glimpse_locks+0x3b/0x100 [ptlrpc]
Nov  7 19:46:30 nbp2-oss3 kernel: [820771.678046]  [&amp;lt;ffffffffa0d4d8a4&amp;gt;] ofd_intent_policy+0x444/0xa40 [ofd]
Nov  7 19:46:30 nbp2-oss3 kernel: [820771.678074]  [&amp;lt;ffffffffa10112ba&amp;gt;] ldlm_lock_enqueue+0x38a/0x980 [ptlrpc]
Nov  7 19:46:30 nbp2-oss3 kernel: [820771.678104]  [&amp;lt;ffffffffa103ab53&amp;gt;] ldlm_handle_enqueue0+0x9d3/0x16a0 [ptlrpc]
Nov  7 19:46:30 nbp2-oss3 kernel: [820771.678147]  [&amp;lt;ffffffffa10c0262&amp;gt;] tgt_enqueue+0x62/0x210 [ptlrpc]
Nov  7 19:46:30 nbp2-oss3 kernel: [820771.678184]  [&amp;lt;ffffffffa10c3eca&amp;gt;] tgt_request_handle+0x92a/0x1370 [ptlrpc]
Nov  7 19:46:30 nbp2-oss3 kernel: [820771.678217]  [&amp;lt;ffffffffa106c4bb&amp;gt;] ptlrpc_server_handle_request+0x23b/0xaa0 [ptlrpc]
Nov  7 19:46:30 nbp2-oss3 kernel: [820771.678250]  [&amp;lt;ffffffffa10704a2&amp;gt;] ptlrpc_main+0xa92/0x1e40 [ptlrpc]
Nov  7 19:46:30 nbp2-oss3 kernel: [820771.678255]  [&amp;lt;ffffffff810b1131&amp;gt;] kthread+0xd1/0xe0
Nov  7 19:46:30 nbp2-oss3 kernel: [820771.678261]  [&amp;lt;ffffffff816a14f7&amp;gt;] ret_from_fork+0x77/0xb0
Nov  7 19:46:30 nbp2-oss3 kernel: [820771.678279]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
Nov  7 19:46:30 nbp2-oss3 kernel: [820771.678282] LustreError: dumping log to /tmp/lustre-log.1541648790.87846
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="236678" author="pjones" created="Thu, 8 Nov 2018 18:23:48 +0000"  >&lt;p&gt;Oleg&lt;/p&gt;

&lt;p&gt;Could you please advise?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="236786" author="mhanafi" created="Sat, 10 Nov 2018 07:05:50 +0000"  >&lt;p&gt;From server metrics there is 3 to 4 mins where the server is locked up. I think this is the window where all the lnet pings get dropped. I enabled +net debug and it seems to prevent us hitting this bug. I disabled and not long after that we saw mass evictions. I will enable it for the next few days and see if we still hit the bug.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;Found this looks similar&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10035&quot; class=&quot;external-link&quot; rel=&quot;nofollow&quot;&gt;https://jira.whamcloud.com/browse/LU-10035&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="237162" author="green" created="Mon, 19 Nov 2018 02:15:37 +0000"  >&lt;p&gt;Well, if you have locked up servers that don&apos;t even respond to pings, that does not look like &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10035&quot; title=&quot;Many threads hanging on OST, lustre-log dumps&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10035&quot;&gt;LU-10035&lt;/a&gt; or &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11613&quot; title=&quot;MDS and OSS locked up wait_transaction_locked+0x85/0xd0 [jbd2]&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11613&quot;&gt;&lt;del&gt;LU-11613&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;If you increase lnet logging and it makes the problem go away, that sounds like some sort of a lockup inside the lnet. It would be great if you can get a crashdump while in this state so every thread could be examined (sadly cannot just do sysrq-t since threads spinning on CPU cannot be captured in this way - so if in this state and the node is alive enough that you still can issue commands - check if there&apos;s a bunch of Lustre threads hogging cpu, if not - then sysrq-t is useful, if yes or if you cannot get console access either - trigger a crashdump).&lt;/p&gt;

&lt;p&gt;I guess it&apos;s great news that you have a workaround of enabling _net logging so you can at least make sure this does not happen outside of when you actually want to for debug purposes.&lt;/p&gt;</comment>
                            <comment id="238211" author="bruno" created="Sat, 8 Dec 2018 07:37:29 +0000"  >&lt;p&gt;+1 at&#160;&lt;a href=&quot;https://testing.whamcloud.com/test_sessions/d7f0ec38-7b1d-4cba-8e5f-ca45754a694d&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sessions/d7f0ec38-7b1d-4cba-8e5f-ca45754a694d&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="238711" author="lflis" created="Mon, 17 Dec 2018 22:14:59 +0000"  >&lt;p&gt;I can confirm seeing the same issue in Cyfronet with 2.10.6.RC3&lt;/p&gt;</comment>
                            <comment id="238767" author="mhanafi" created="Tue, 18 Dec 2018 17:25:47 +0000"  >&lt;p&gt;@Lukasz Do you have a reproducer&lt;/p&gt;</comment>
                            <comment id="242417" author="mhanafi" created="Thu, 21 Feb 2019 06:46:48 +0000"  >&lt;p&gt;what does this mean?&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 Feb 20 04:30:59 nbp8-oss18 kernel: [68015.261353] LustreError: 32857:0:(ldlm_lockd.c:2365:ldlm_cancel_handler()) ldlm_cancel from 10.149.14.197@o2ib313 arrived at 1550665859 with bad export cookie 2159411342194476890
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Could this be related to &lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11931&quot; class=&quot;external-link&quot; rel=&quot;nofollow&quot;&gt;https://jira.whamcloud.com/browse/LU-11931&lt;/a&gt;&lt;br/&gt;
Because the evictions are preceded by kiblnd_check_txs_locked around the time that obd pings should be arriving from the clients.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
Feb 20 04:24:30 nbp8-oss18 kernel: [67626.491267] LNetError: 16978:0:(o2iblnd_cb.c:3147:kiblnd_check_txs_locked()) Timed out tx: active_txs, 2 seconds
Feb 20 04:24:30 nbp8-oss18 kernel: [67626.524744] LNetError: 16978:0:(o2iblnd_cb.c:3147:kiblnd_check_txs_locked()) Skipped 12 previous similar messages
Feb 20 04:24:30 nbp8-oss18 kernel: [67626.558524] LNetError: 16978:0:(o2iblnd_cb.c:3222:kiblnd_check_conns()) Timed out RDMA with 10.151.15.142@o2ib (268): c: 62, oc: 0, rc: 63
Feb 20 04:24:30 nbp8-oss18 kernel: [67626.599434] LNetError: 16978:0:(o2iblnd_cb.c:3222:kiblnd_check_conns()) Skipped 12 previous similar messages
Feb 20 04:24:51 nbp8-oss18 kernel: [67647.490633] LNetError: 16978:0:(o2iblnd_cb.c:3147:kiblnd_check_txs_locked()) Timed out tx: active_txs, 2 seconds
Feb 20 04:24:51 nbp8-oss18 kernel: [67647.524115] LNetError: 16978:0:(o2iblnd_cb.c:3222:kiblnd_check_conns()) Timed out RDMA with 10.151.14.36@o2ib (290): c: 62, oc: 0, rc: 63

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I uploaded lustre-log.1550676361.85441.gz to ftp.whamcloud.com/uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11644&quot; title=&quot;LNet: Service thread inactive for 300  causes client evictions &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11644&quot;&gt;LU-11644&lt;/a&gt; it shows &quot;no credits&apos; messages &lt;/p&gt;</comment>
                            <comment id="242486" author="mhanafi" created="Thu, 21 Feb 2019 22:49:02 +0000"  >&lt;p&gt;Here is what nis stats look like on the server during the eviction&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 ========== /proc/sys/lnet/nis ==========
nid                      status alive refs peer  rtr   max    tx   min
0@lo                         up     0    2    0    0     0     0     0
0@lo                         up     0    0    0    0     0     0     0
10.151.27.84@o2ib            up    -1 12874   63    0 31384 31384 -86290
10.151.27.84@o2ib            up    -1 13237   63    0 31384 31384 -84712
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt; our current ko2iblnd setting&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
options ko2iblnd ntx=125536 credits=62768 fmr_pool_size=31385 
options ko2iblnd timeout=150 retry_count=7 peer_timeout=0 map_on_demand=32 peer_credits=63 concurrent_sends=63

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="242492" author="ashehata" created="Fri, 22 Feb 2019 00:35:38 +0000"  >&lt;p&gt;When did this start happening? Was this triggered by some change in the network?&lt;/p&gt;

&lt;p&gt;From the logs in the comment above messages are timing out while on the active_txs. This means the message has been posted on the wire, but hasn&apos;t been completed yet. This usually indicates an underlying network issue? I&apos;m thinking that when you enable net logging it slows down the flow just enough. You have set the timeout to 150s, which means messages remain in incomplete state for that long before they expire.&lt;/p&gt;

&lt;p&gt;Are there any IB statistics we can take a look at? see if there are issues with the fabric? Drops? failures?&lt;/p&gt;</comment>
                            <comment id="242540" author="mhanafi" created="Fri, 22 Feb 2019 20:08:52 +0000"  >&lt;p&gt;This started when we moved from 2.7.3 mofed3.3 to 2.10.5 mofed4.4.2. The IB logs look clean. We don&apos;t see any drops or failures.&lt;/p&gt;

&lt;p&gt;What do you think about the large negative values for &quot;min&quot; in the nis file? Should we increase our credits? Our cluster has &amp;gt;12K clients and our OSSes have 13 to 19 OST per oss.&lt;/p&gt;

&lt;p&gt;Why is it timing out the active_txs after 2 seconds?&lt;/p&gt;</comment>
                            <comment id="242551" author="ashehata" created="Fri, 22 Feb 2019 22:33:39 +0000"  >&lt;p&gt;The large negative values are a result of the o2iblnd not completing messages. Basically messages are not complete. You run out of credits at the LNet level, then messages start to get queued. From the stats shown you have 86K and 84K messages queued up.&lt;/p&gt;

&lt;p&gt;By the way, why are there two NIs with the same NID?&lt;/p&gt;

&lt;p&gt;Would you be able to share your lnet configuration?&lt;/p&gt;

&lt;p&gt;Also would you be able to provide the output from &quot;lnetctl net show -v 3&quot; and &quot;lnetctl peer show -v 3&quot;. Maybe we can see if there is a particular peer which is causing the problem.&lt;/p&gt;

&lt;p&gt;The 2 seconds is the time after the deadline. So if you set the msg deadline to 150s after posting, then the 2 means it actually expired 2 seconds after the deadline has passed.&lt;/p&gt;

&lt;p&gt;I don&apos;t think increasing the credits will resolve the issue in this particular case.&lt;/p&gt;

&lt;p&gt;Would we be able to try and reduce the concurrent_sends to 32. I&apos;d like to see what would happen if we throttle down the number of in flight messages.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="242568" author="mhanafi" created="Sat, 23 Feb 2019 00:36:06 +0000"  >&lt;p&gt;@Amir Shehata: sent you the output for lnetctl.&lt;/p&gt;

&lt;p&gt;What is 1 credit? 1 RDMA message? How does that related to an RPC? &lt;br/&gt;
How does peer_credits relate to map_on_demand and fmr?&lt;/p&gt;</comment>
                            <comment id="242573" author="ashehata" created="Sat, 23 Feb 2019 02:21:56 +0000"  >&lt;p&gt;At the LNet level, 1 credit is consumed for every LNet message sent, IE: PUT, GET, ACK, REPLY&lt;/p&gt;

&lt;p&gt;So in the above case when you see large negative values, these represent lnet messages being queued because there are no credits available on the NI. You set the credits to: 62768, which gets divided between two CPTs., that&apos;s why you see 31384. An RPC can be formed of multiple LNet messages, depending on the RPC being sent.&lt;/p&gt;

&lt;p&gt;Peer credits is primarily used in the o2iblnd, and it determines the queue depth of the QP created.&lt;/p&gt;

&lt;p&gt;FMR is the memory registration type being used. You can control the the number of these FMR pools allocated when the NI is brought up. These pools can dynamically grow&lt;/p&gt;

&lt;p&gt;map-on-demand originally was used to decide when the LND should use FMR memory registration or FastReg memory registration instead of global memory regions. So when it was set to 32 and the RDMA would need to be broken up to more than 32 fragments, then instead of using global memory regions, the RDMA data is mapped to device memory using FMR or FastReg. However since now we use FMR (or fastreg) all the time (global memory regions is no longer supported in RHEL 7.4 and later I believe), the map_on_demand has no impact on how LNet works.&lt;/p&gt;

&lt;p&gt;But to be absolutely sure can you verify if the kernel you&apos;re using has HAVE_IB_GET_DMA_MR defined. That&apos;s what we use to determine if we should try and use global memory regions.&lt;/p&gt;

&lt;p&gt;You can also confirm which memory registration you&apos;re using. You should see one of these two messages when you first configure the NI&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
Using FMR &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; registration
#or
Using FastReg &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; registration&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;As to regards to the lnetctl output, I don&apos;t see any peer consuming too much credits. There is just a lot of them.&lt;/p&gt;

&lt;p&gt;Will you be able to try reducing concurrent_sends to see if it helps with the timeout?&lt;/p&gt;</comment>
                            <comment id="242651" author="mhanafi" created="Mon, 25 Feb 2019 06:00:47 +0000"  >&lt;p&gt;What should be the ratio/relation between ko2iblnd timeout and lnet at_min timeout setting.&lt;/p&gt;</comment>
                            <comment id="242729" author="ashehata" created="Mon, 25 Feb 2019 19:24:13 +0000"  >&lt;p&gt;at_min is part of the adaptive timeout mechanism and defines the minimum timeout for RPC responses.&lt;/p&gt;

&lt;p&gt;The timeout grows with slow connections and decreases with healthy connection. ptlrpc keeps track of the RTT + handling time of the RPC messages and based on that it changes the adaptive timeout.&lt;/p&gt;

&lt;p&gt;The ko2iblnd timeout is really below all that. Each RPC message is composed of an LNet message (PUT, GET). A PUT or a GET in turn is composed of multiple ko2iblnd transmits, depending on the message type. The ko2iblnd timeout applies to the ko2iblnd transmits. If the ko2iblnd timeout is larger than the maximum RPC timeout, then it essentially means that RPCs will timeout before the ko2iblnd tx times out. I think it makes more sense for the ko2iblnd tx to time out before the RPC times out.&lt;/p&gt;

&lt;p&gt;However, that said, after looking at the logs, I don&apos;t see except 6 timeouts in the o2iblnd:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
Closing conn to 10.151.7.78@o2ib: error -110(waiting)
Closing conn to 10.151.7.82@o2ib: error -110(waiting)
Closing conn to 10.151.6.43@o2ib: error -110(waiting)
Closing conn to 10.151.7.105@o2ib: error -110(waiting)
Closing conn to 10.151.7.39@o2ib: error -110(waiting)
Closing conn to 10.151.7.50@o2ib: error -110(waiting)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;I don&apos;t think that would be responsible for the mass evictions you&apos;re seeing.&lt;/p&gt;

&lt;p&gt;Usually if the network is an issue, you&apos;ll be seeing a lot of o2iblnd timeouts occurring. Based on the logs you uploaded this is not the case. (unless these are routers which many clients connect through?)&lt;/p&gt;</comment>
                            <comment id="242732" author="mhanafi" created="Mon, 25 Feb 2019 20:01:30 +0000"  >&lt;p&gt;We do have clients connected through routers. The clients don&apos;t see any timeouts until they are evicted. &lt;/p&gt;

&lt;p&gt;Should ko2iblnd timeout  be less than at_min?&lt;br/&gt;
Does changing concurrent_sends require all clients and servers much be change at the same time?&lt;br/&gt;
Should we consider lowering ko2iblnd timeout to 100?&lt;br/&gt;
Currently we have ko2iblnd timeout=150 and our at_min=150. I did change the at_min to 200.&lt;/p&gt;</comment>
                            <comment id="242754" author="ashehata" created="Mon, 25 Feb 2019 23:42:18 +0000"  >&lt;p&gt;Are all the clients experiencing evictions connect through routers? if so, are there any issues on routers? drops? timeouts?&lt;/p&gt;

&lt;p&gt;I think if we set at_min to 200 and ko2iblnd timeout to 150 should be sane.&lt;/p&gt;

&lt;p&gt;Unfortunately concurrent sends are not negotiated and therefore I recommend the change across the cluster.&lt;/p&gt;

&lt;p&gt;&#160;Another question: have you applied &lt;a href=&quot;https://review.whamcloud.com/#/c/33975/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/33975/&lt;/a&gt; on your build? We&apos;re currently investigating an issue on another site and we&apos;re suspecting that this patch might be causing the timeouts. Although the other site is OPA.&lt;/p&gt;</comment>
                            <comment id="242759" author="mhanafi" created="Tue, 26 Feb 2019 00:39:29 +0000"  >&lt;p&gt;Our 2.10.5 didn&apos;t have this patch. But our most current 2.10.6-2nas does have that patch. But we are seeing the evictions on both versions.&lt;/p&gt;

&lt;p&gt;The client evictions are all over not just clients behind the router. I think this is a server side issues. Before the eviction the rpcs rates drop to almost zero. It seems like there is a dead lock on the server for 30 to 120 seconds.  &lt;/p&gt;</comment>
                            <comment id="242852" author="mhanafi" created="Tue, 26 Feb 2019 20:22:31 +0000"  >&lt;p&gt;I wonder if &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5368&quot; title=&quot;errors in/from ldlm_run_ast_work() ignored&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5368&quot;&gt;LU-5368&lt;/a&gt; is related.&lt;/p&gt;</comment>
                            <comment id="242853" author="ashehata" created="Tue, 26 Feb 2019 20:59:30 +0000"  >&lt;p&gt;From the logs, these are the clients having RDMA time outs:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
10.151.7.53@o2ib
10.151.7.39@o2ib
10.151.6.43@o2ib
10.151.7.82@o2ib
10.151.4.110@o2ib
10.151.7.50@o2ib
10.151.7.105@o2ib
10.151.7.78@o2ib &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Is it possible to take a look at the logs from the clients which get evicted?&lt;/p&gt;

&lt;p&gt;The timeouts ocurring on the active_tx leads me to believe that there was some sort of a connection breakdown between the server and these clients. Basically the server tries to send messages which never make it to the client. The server gets stuck on that connection waiting for a notification that the tx has completed, but doesn&apos;t it get it in 150 seconds.&lt;/p&gt;

&lt;p&gt;If we can get logs from the clients during that time, maybe we can see more details.&lt;/p&gt;</comment>
                            <comment id="243045" author="mhanafi" created="Thu, 28 Feb 2019 16:25:26 +0000"  >&lt;p&gt;@Amir I will try to get you the logs. What I been able to determine before the eviction there is a period of over 2 mins that the server doesn&apos;t receive any obdping rpcs. For example on the server I was looking at it get ~120K obdping rpcs over 60sec.&#160; But it didn&apos;t get any for over 2 mins before the eviction. Although, it gets zero obdping rpcs it still is getting ost_io (read/write) rpcs.&lt;/p&gt;

&lt;p&gt;Do the obdping rpcs have to do any ipoib lookup? (we been have some ipoib issues on our fabric)&lt;/p&gt;

&lt;p&gt;How are obdping rpcs different than ost_io rpcs?&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="243060" author="ashehata" created="Thu, 28 Feb 2019 17:51:33 +0000"  >&lt;p&gt;I&apos;ll let Oleg answer the rpc/ost_io question.&lt;/p&gt;

&lt;p&gt;Regarding IPoIB, it is used when establishing a connection, address resolution and route resolution. If there are issues on the fabric, then some connections may fail. This will lead to reconnect attempts. If you have a storm of reconnects, it&apos;s possible that&apos;ll interfere with normal operations . Do you see reconnects happening during the lock up time?&lt;/p&gt;</comment>
                            <comment id="243198" author="green" created="Sat, 2 Mar 2019 00:19:33 +0000"  >&lt;p&gt;obd pings are only sent from clients that did not have a recent communication with a server (there are two states - we sent some data and it was not committed, that would trigger an obd ping in 7 seconds, for clients that don&apos;t have any uncommitted data on a server we ping 4 times per obd_timeout.&lt;/p&gt;

&lt;p&gt;so it&apos;s totally normal not to get any pings when there&apos;s an active io ongoing from a client.&lt;/p&gt;</comment>
                            <comment id="243582" author="mhanafi" created="Sat, 9 Mar 2019 00:16:47 +0000"  >&lt;p&gt;The stack trace look very similar to&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5368&quot; class=&quot;external-link&quot; rel=&quot;nofollow&quot;&gt;https://jira.whamcloud.com/browse/LU-5368&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;And there is evidence that at the time the last ping is received there is a spike in load and lustre.portals.ost.ldlm_extend_enqueue.&lt;/p&gt;
                            <comment id="243782" author="mhanafi" created="Tue, 12 Mar 2019 23:26:49 +0000"  >&lt;p&gt;We had an additional mass evictions. These all correspond to a spike in ldlm_glimpse_enqueue rpcs.&lt;/p&gt;

&lt;p&gt;Is it possible for ldlm_reclaim to block all rpcs&#160;until it is finished? I think ldlm_reclaim may be involved here.&#160;&#160;&lt;/p&gt;</comment>
                            <comment id="243904" author="mhanafi" created="Thu, 14 Mar 2019 06:31:17 +0000"  >&lt;p&gt;I uploaded debug and /var/log/messages for a server that experienced the mass eviction. Please take a look.&lt;/p&gt;

&lt;p&gt;ftp.whamcloud.com:/uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11644&quot; title=&quot;LNet: Service thread inactive for 300  causes client evictions &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11644&quot;&gt;LU-11644&lt;/a&gt;/&lt;/p&gt;

&lt;p&gt;&#160;s604.lctl.dump.gz&lt;/p&gt;

&lt;p&gt;&#160;s604.var.log.messages.gz&lt;/p&gt;

&lt;p&gt;s608.lctl.dump.gz&lt;/p&gt;

&lt;p&gt;s608.var.log.messages.gz&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="243953" author="ashehata" created="Thu, 14 Mar 2019 21:36:46 +0000"  >&lt;p&gt;I&apos;m looking at a similar symptom reported here: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12065&quot; title=&quot;Client got evicted when  lock callback timer expired  on OSS &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12065&quot;&gt;&lt;del&gt;LU-12065&lt;/del&gt;&lt;/a&gt;. We also see RDMA timeout on active_tx in 2.10.7-RC1. The last clean run was with 2.10.6. I&apos;m suspecting a particular patch, which reduces the number of CQ entries. My current suspicion is if there aren&apos;t enough available CQ entries, we could have a boost in completion events, which might not be processed, leaving the corresponding txs to timeout. This would explain why we&apos;re seeing RDMA timeouts for transmits on active_txs.&lt;/p&gt;

&lt;p&gt;This change came in:&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9810&quot; title=&quot;Melanox OFED 4.1 support&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9810&quot;&gt;&lt;del&gt;LU-9810&lt;/del&gt;&lt;/a&gt; lnd: use less CQ entries for each connection&lt;br/&gt;
I&apos;m going to push a patch to revert the patch. It&apos;s not a straight revert as some other code changes happened after.&lt;/p&gt;

&lt;p&gt;Do you have this patch on your systems?&lt;/p&gt;

&lt;p&gt;Would you be able to try with the fix (when available) to see if it resolves the issue?&lt;/p&gt;</comment>
                            <comment id="243962" author="mhanafi" created="Thu, 14 Mar 2019 23:34:19 +0000"  >&lt;p&gt;our 2.10.5 and 2.10.6 build does have &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9810&quot; title=&quot;Melanox OFED 4.1 support&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9810&quot;&gt;&lt;del&gt;LU-9810&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;$ git lg |grep &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9810&quot; title=&quot;Melanox OFED 4.1 support&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9810&quot;&gt;&lt;del&gt;LU-9810&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;3970a8c4f1 - &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9810&quot; title=&quot;Melanox OFED 4.1 support&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9810&quot;&gt;&lt;del&gt;LU-9810&lt;/del&gt;&lt;/a&gt; lnet: fix build with M-OFED 4.1 (7 weeks ago)&lt;/li&gt;
	&lt;li&gt;31e16f27cc - &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9810&quot; title=&quot;Melanox OFED 4.1 support&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9810&quot;&gt;&lt;del&gt;LU-9810&lt;/del&gt;&lt;/a&gt; lnd: use less CQ entries for each connection (8 weeks ago)&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="243966" author="ashehata" created="Fri, 15 Mar 2019 00:31:13 +0000"  >&lt;p&gt;We&apos;re seeing similar symptoms (IE client evictions) on our soak as I mentioned above. What we&apos;re trying to do is apply: &lt;a href=&quot;https://review.whamcloud.com/34427&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/34427&lt;/a&gt;, and see if this resolves the issue. This change simply reverts the functionality of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9810&quot; title=&quot;Melanox OFED 4.1 support&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9810&quot;&gt;&lt;del&gt;LU-9810&lt;/del&gt;&lt;/a&gt;. If it doesn&apos;t resolve the issue then the next step for us would be to revert both:&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9810&quot; title=&quot;Melanox OFED 4.1 support&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9810&quot;&gt;&lt;del&gt;LU-9810&lt;/del&gt;&lt;/a&gt; lnd: use less CQ entries for each connection&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10213&quot; title=&quot;o2iblnd: Potential discrepancy when allocating qp&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10213&quot;&gt;&lt;del&gt;LU-10213&lt;/del&gt;&lt;/a&gt; lnd: calculate qp max_send_wrs properly&lt;br/&gt;
This way we can isolate the issue to one of these changes.&lt;/p&gt;

&lt;p&gt;If you guys have a test clustre which you&apos;re able to reproduce the problem on, maybe you can try the same approach. Otherwise, I&apos;ll update the ticket when we have more details on the debugging on our end.&lt;/p&gt;</comment>
                            <comment id="244166" author="pjones" created="Mon, 18 Mar 2019 21:28:04 +0000"  >&lt;p&gt;Mahmoud&lt;/p&gt;

&lt;p&gt;Have you tried applying this revert into production?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="244176" author="jaylan" created="Mon, 18 Mar 2019 23:07:02 +0000"  >&lt;p&gt;I just did a build of nas-2.10.5 by backing out &lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9810&quot; title=&quot;Melanox OFED 4.1 support&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9810&quot;&gt;&lt;del&gt;LU-9810&lt;/del&gt;&lt;/a&gt; lnd: use less CQ entries for each connection&lt;/p&gt;

&lt;p&gt;Our nas-2.10.5 did not have &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10213&quot; title=&quot;o2iblnd: Potential discrepancy when allocating qp&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10213&quot;&gt;&lt;del&gt;LU-10213&lt;/del&gt;&lt;/a&gt; patch installed, so no need to back that out. (&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10213&quot; title=&quot;o2iblnd: Potential discrepancy when allocating qp&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10213&quot;&gt;&lt;del&gt;LU-10213&lt;/del&gt;&lt;/a&gt; is in nas-2.10.6.) Since we reported the problem on 2.10.5, &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10213&quot; title=&quot;o2iblnd: Potential discrepancy when allocating qp&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10213&quot;&gt;&lt;del&gt;LU-10213&lt;/del&gt;&lt;/a&gt; should not be the culprit. &lt;/p&gt;</comment>
                            <comment id="244330" author="ashehata" created="Wed, 20 Mar 2019 18:13:35 +0000"  >&lt;p&gt;Our testing on soak narrowed down the RDMA timeouts and subsequent evictions to&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9810&quot; title=&quot;Melanox OFED 4.1 support&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9810&quot;&gt;&lt;del&gt;LU-9810&lt;/del&gt;&lt;/a&gt; lnd: use less CQ entries for each connection&lt;/p&gt;

&lt;p&gt;Please let me us know if the situation is improved.&lt;/p&gt;</comment>
                            <comment id="244948" author="pjones" created="Sat, 30 Mar 2019 14:30:29 +0000"  >&lt;p&gt;NASA&lt;/p&gt;

&lt;p&gt;Any updates?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="245539" author="mhanafi" created="Wed, 10 Apr 2019 19:06:18 +0000"  >&lt;p&gt;Finally got to try it on 1 server. It didn&apos;t help. During your testing did you have to do both clients and servers?&lt;/p&gt;
                            <comment id="245550" author="jaylan" created="Thu, 11 Apr 2019 01:26:49 +0000"  >&lt;p&gt;I have reverted&#160;&quot;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9810&quot; title=&quot;Melanox OFED 4.1 support&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9810&quot;&gt;&lt;del&gt;LU-9810&lt;/del&gt;&lt;/a&gt; lnd: use less CQ entries for each connection&quot; in our nas-2.10.5 and nas-2.10.6 for Mahmoud to test.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;The revert failed on 2.12.0 though. It would bring back&lt;br/&gt;
+#define IBLND_SEND_WRS(c) &#160;&#160;&#160;&#160;&#160;\ &lt;br/&gt;
+ &#160;&#160;&#160;&#160;&#160;&#160;((c-&amp;gt;ibc_max_frags + 1) * kiblnd_concurrent_sends(c-&amp;gt;ibc_version, \ &lt;br/&gt;
+ &#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;c-&amp;gt;ibc_peer-&amp;gt;ibp_ni)) &lt;br/&gt;
+#define IBLND_CQ_ENTRIES(c) &#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;(IBLND_RECV_WRS(c) + IBLND_SEND_WRS(c))&lt;br/&gt;
and&#160;kiblnd_concurrent_sends is no longer defined in&#160;lnet/klnds/o2iblnd/o2iblnd.h.&lt;/p&gt;</comment>
                            <comment id="245606" author="ashehata" created="Thu, 11 Apr 2019 17:04:41 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=mhanafi&quot; class=&quot;user-hover&quot; rel=&quot;mhanafi&quot;&gt;mhanafi&lt;/a&gt;, yes this will need to be done on clients and servers. There is also another patch which ORNL is running with, but not 100% sure if it&apos;s applicable in your case: &lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11931&quot; title=&quot;RDMA packets sent from client to MGS are timing out &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11931&quot;&gt;&lt;del&gt;LU-11931&lt;/del&gt;&lt;/a&gt;: &lt;a href=&quot;https://review.whamcloud.com/34396&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/34396&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="245607" author="mhanafi" created="Thu, 11 Apr 2019 17:11:00 +0000"  >&lt;p&gt;I think will pickup &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11931&quot; title=&quot;RDMA packets sent from client to MGS are timing out &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11931&quot;&gt;&lt;del&gt;LU-11931&lt;/del&gt;&lt;/a&gt; also. Are there any other major changes in the ko2ib from 2.7 to 2.10? &lt;/p&gt;</comment>
                            <comment id="245608" author="ashehata" created="Thu, 11 Apr 2019 17:25:37 +0000"  >&lt;p&gt;I believe these two are the ones which were causing problems. Technically the CQE issue is due to a problem with the way OFED works. We&apos;re still investigating that.&lt;/p&gt;</comment>
                            <comment id="245796" author="mhanafi" created="Mon, 15 Apr 2019 22:25:24 +0000"  >&lt;p&gt;We got all the servers for one of our large file systems rebooted into the version without &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9810&quot; title=&quot;Melanox OFED 4.1 support&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9810&quot;&gt;&lt;del&gt;LU-9810&lt;/del&gt;&lt;/a&gt;. We are going to reboot the clients next. But with just the servers done I was surprised that by itself it didn&apos;t help the eviction issue. Do you still think that a reboot of the clients will help?&lt;/p&gt;
                            <comment id="245939" author="ashehata" created="Wed, 17 Apr 2019 17:17:56 +0000"  >&lt;p&gt;I believe that we will need both clients and servers to have these changes to ensure the problem is resolved on both sides of the connection.&lt;/p&gt;</comment>
                            <comment id="245972" author="mhanafi" created="Thu, 18 Apr 2019 02:20:16 +0000"  >&lt;p&gt;@Amir Shehata, I was able to capture lnet metrics during the eviction. See the attached chart. &lt;br/&gt;
This was captured with the server running without &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9180&quot; title=&quot;Upstream ko2iblnd does not work with map_on_demand &amp;lt;256&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9180&quot;&gt;&lt;del&gt;LU-9180&lt;/del&gt;&lt;/a&gt;. But the clients were still running with the patch. &lt;br/&gt;
At the start client message loss, lnet_msg_alloc starts to grow sharply while send and receive count drop too low levels. &lt;br/&gt;
When the evictions happen, the msg_alloc drops to zero and there is a large spike in the send and receive counts. After which everything returns to normal.&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/32440/32440_lnet_metrics_during_eviction.pdf&quot; title=&quot;lnet_metrics_during_eviction.pdf attached to LU-11644&quot;&gt;lnet_metrics_during_eviction.pdf&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt; &lt;/p&gt;
</comment>
                            <comment id="246135" author="mhanafi" created="Mon, 22 Apr 2019 00:48:27 +0000"  >&lt;p&gt;The revert of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9180&quot; title=&quot;Upstream ko2iblnd does not work with map_on_demand &amp;lt;256&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9180&quot;&gt;&lt;del&gt;LU-9180&lt;/del&gt;&lt;/a&gt; has NOT fixed this issue for us. If we turn off &quot;net&quot; debugging we hit the mass client eviction.&lt;/p&gt;</comment>
                            <comment id="246241" author="ashehata" created="Tue, 23 Apr 2019 19:49:19 +0000"  >&lt;p&gt;I&apos;ll work on a debug patch to see if we can narrow down what&apos;s going on.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="248601" author="mhanafi" created="Thu, 6 Jun 2019 23:16:07 +0000"  >&lt;p&gt;I have been trying to see if I can reproduce some lnet failures using lnettest. In my test case I had 120 clients read/writing to a single server &lt;br/&gt;
 I get this type of bandwidth&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
[LNet Rates of servers]
[R] Avg: 13910    RPC/s Min: 13910    RPC/s Max: 13910    RPC/s
[W] Avg: 18334    RPC/s Min: 18334    RPC/s Max: 18334    RPC/s
[LNet Bandwidth of servers]
[R] Avg: 4755.38  MiB/s Min: 4755.38  MiB/s Max: 4755.38  MiB/s 
[W] Avg: 4410.69  MiB/s Min: 4410.69  MiB/s Max: 4410.69  MiB/s 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;At the same time I tried to lctl ping the server from the same set of clients. At different times the lctl ping will fail like this&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
r467i1n3: failed to ping 10.151.27.53@o2ib: Input/output error &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;and in the debug on the client i will get this messages&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
00000400:00000100:36.0F:1559862327.071451:0:3733:0:(lib-move.c:2153:lnet_parse_reply()) 10.151.23.139@o2ib: Dropping REPLY from 12345-10.151.27.53@o2ib &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; invalid MD 0x159da02807b48584.0x10ab844d

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Not sure if this is significant or relevant.&lt;/p&gt;</comment>
                            <comment id="249143" author="ashehata" created="Wed, 12 Jun 2019 18:20:01 +0000"  >&lt;p&gt;I&apos;ve been looking at the logs and I see&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 Feb 11 09:12:32 nbp2-oss3 kernel: [9077322.900796] LNetError: 14085:0:(o2iblnd_cb.c:3147:kiblnd_check_txs_locked()) Timed out tx: active_txs, 19 seconds
Feb 11 09:12:32 nbp2-oss3 kernel: [9077322.932031] LNetError: 14085:0:(o2iblnd_cb.c:3222:kiblnd_check_conns()) Timed out RDMA with 10.151.33.79@o2ib (169): c: 61, oc: 0, rc: 63
Feb 11 09:12:32 nbp2-oss3 kernel: [9077322.969509] LNet: 14085:0:(o2iblnd_cb.c:1914:kiblnd_close_conn_locked()) Closing conn to 10.151.33.79@o2ib: error -110(waiting)
Feb 11 09:12:32 nbp2-oss3 kernel: [9077322.969916] LustreError: 14089:0:(events.c:449:server_bulk_callback()) event type 5, status -5, desc ffff880aea612600
Feb 11 09:12:50 nbp2-oss3 kernel: [9077340.858729] LNet: 89480:0:(o2iblnd_cb.c:3013:kiblnd_cm_callback()) 10.151.33.79@o2ib: ROUTE ERROR -22
Feb 11 09:12:50 nbp2-oss3 kernel: [9077340.858735] LNet: 89480:0:(o2iblnd_cb.c:3013:kiblnd_cm_callback()) Skipped 66022 previous similar messages
Feb 11 09:12:50 nbp2-oss3 kernel: [9077340.858746] LNet: 89480:0:(o2iblnd_cb.c:2094:kiblnd_peer_connect_failed()) Deleting messages &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 10.151.33.79@o2ib: connection failed
Feb 11 09:12:50 nbp2-oss3 kernel: [9077340.858749] LNet: 89480:0:(o2iblnd_cb.c:2094:kiblnd_peer_connect_failed()) Skipped 66022 previous similar messages
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;There appears to be significant number of route errors with -22. There are multiple places on the verbs/infiniband driver path where an -EINVAL can occur. Is this an error you can explain?&lt;/p&gt;

&lt;p&gt;If not, we should probably compare other logs from other instances where the evictions occurred and see if we consistently see these errors.&lt;/p&gt;

&lt;p&gt;Can we look at the opensm log (usually located in /var/log/opensm.log)? Are there any errors related to routing?&lt;/p&gt;</comment>
                            <comment id="249240" author="ashehata" created="Fri, 14 Jun 2019 00:21:53 +0000"  >&lt;p&gt;Summary of suggestions from today&apos;s call:&lt;/p&gt;
&lt;ol&gt;
	&lt;li&gt;Try 3.3 MOFED or a version before 4.2 which does not have the routing issue reported. (possibly with the selftest reproducer Mahmoud mentioned above)
	&lt;ol&gt;
		&lt;li&gt;The idea is if we can prove that rolling back MOFED resolves the issue, then we can narrow down the problem to MOFED.&lt;/li&gt;
	&lt;/ol&gt;
	&lt;/li&gt;
	&lt;li&gt;Take a look at opensm log to see if there are any errors related to route resolution during the eviction time or before. Maybe we can compare with the time stamp where we start seeing the route resolution problem in the lustre log.&lt;/li&gt;
	&lt;li&gt;Do a patch to keep track of the active_tx queue length. This can be monitored and if it starts increasing it could be an indication that a problem has occurred.&lt;/li&gt;
	&lt;li&gt;Patch can be used to trigger MLNX debug procedure discussed&lt;/li&gt;
	&lt;li&gt;Capture perf data as described here: &lt;a href=&quot;http://www.brendangregg.com/FlameGraphs/cpuflamegraphs.html.&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://www.brendangregg.com/FlameGraphs/cpuflamegraphs.html.&lt;/a&gt;
	&lt;ol&gt;
		&lt;li&gt;Flamegraphs are a good way to visualize the call stack information. This could show us what&apos;s happening on the server when the problem is hit.&lt;/li&gt;
		&lt;li&gt;Perf capture can be triggered periodically, or it can be triggered when the active_tx starts growing beyond the average.&lt;/li&gt;
	&lt;/ol&gt;
	&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;I also looked at the rdma_resolve_route call.&lt;/p&gt;

&lt;p&gt;It basically queries the sm to get the path record:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
160 struct sa_path_rec_ib {
161 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;__be16       dlid;
162 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;__be16       slid;
163 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;u8           raw_traffic;
164 };

190 struct sa_path_rec {                                                            
191 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;union ib_gid dgid;                                                      
192 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;union ib_gid sgid;                                                      
193 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;__be64       service_id;                                                
194 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&lt;span class=&quot;code-comment&quot;&gt;/* reserved */&lt;/span&gt;                                                          
195 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;__be32       flow_label;                                                
196 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;u8           hop_limit;                                                 
197 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;u8           traffic_class;                                             
198 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;u8           reversible;                                                
199 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;u8           numb_path;                                                 
200 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;__be16       pkey;                                                      
201 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;__be16       qos_class;                                                 
202 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;u8           sl;                                                        
203 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;u8           mtu_selector;                                              
204 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;u8           mtu;                                                       
205 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;u8           rate_selector;                                             
206 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;u8           rate;                                                      
207 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;u8           packet_life_time_selector;                                 
208 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;u8           packet_life_time;                                          
209 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;u8           preference;                                                
210 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;union {                                                                 
211 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;struct sa_path_rec_ib ib;                                       
212 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;struct sa_path_rec_roce roce;                                   
213 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;struct sa_path_rec_opa opa;                                     
214 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;};                                                                      
215 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&lt;span class=&quot;code-keyword&quot;&gt;enum&lt;/span&gt; sa_path_rec_type rec_type;                                         
216 };  &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;This data is stored in the cmid and used in the rdma_connect() to create the connection. Eventually calls into cma_connect_ib()&lt;/p&gt;

&lt;p&gt;I don&apos;t think this call can just be bypassed.&lt;/p&gt;</comment>
                            <comment id="249286" author="mhanafi" created="Fri, 14 Jun 2019 17:29:01 +0000"  >&lt;p&gt;Here is our git tree.&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://github.com/jlan/lustre-nas/tree/nas-2.10.8&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/jlan/lustre-nas/tree/nas-2.10.8&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;&#160;I check the SM logs they were clean with no errors during the last 4 client eviction events.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="249287" author="mhanafi" created="Fri, 14 Jun 2019 17:55:00 +0000"  >&lt;p&gt;&lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/32805/32805_eviction_s611.06.05.19&quot; title=&quot;eviction_s611.06.05.19 attached to LU-11644&quot;&gt;eviction_s611.06.05.19&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;The evictions happened at&lt;/p&gt;

&lt;p&gt;12:18:00&lt;/p&gt;

&lt;p&gt;12:27:30&lt;/p&gt;

&lt;p&gt;12:42:44&lt;/p&gt;

&lt;p&gt;The file width is very wide.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;Re-ran my lnet tests with L2.10.5/mofed4.5/el74 was able to get the&#160;-22 errors.&lt;br/&gt;
But when I tried L2.7/mofed3.3/el6.8 couldn&apos;t reproduce the errors. We will try to do a build of mofed4.1 and re-run tests.&lt;/p&gt;</comment>
                            <comment id="249443" author="ashehata" created="Tue, 18 Jun 2019 14:21:11 +0000"  >&lt;p&gt;Would you be able to try L2.10.5/mofed3.3/el6.8?&lt;/p&gt;</comment>
                            <comment id="249453" author="mhanafi" created="Tue, 18 Jun 2019 18:32:16 +0000"  >&lt;p&gt;we will try to do a mofed3.3 and mofed4.1 build.&lt;/p&gt;</comment>
                            <comment id="249456" author="ashehata" created="Tue, 18 Jun 2019 19:18:47 +0000"  >&lt;p&gt;I&apos;m looking at the git tree you shared, if this is what&apos;s deployed on your nodes, then you still have the CQ problem.&lt;/p&gt;

&lt;p&gt;On your tree&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 168 &lt;span class=&quot;code-comment&quot;&gt;/* 2 = LNet msg + Transfer chain */&lt;/span&gt;                                                          
 169 #define IBLND_CQ_ENTRIES(c)&#187;&#183;&#183;&#183;&#183;\                                                            
 170 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;(IBLND_RECV_WRS(c) + 2 * kiblnd_concurrent_sends(c-&amp;gt;ibc_version, \
 171 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183; c-&amp;gt;ibc_peer-&amp;gt;ibp_ni))   &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;It should be&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 #define IBLND_CQ_ENTRIES(c)         (IBLND_RECV_WRS(c) + IBLND_SEND_WRS(c))&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;If you can revert&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
37919179dc61ebb7a63b8ca1c9e1bf76dd3356e8 LU-9810 lnd: use less CQ entries &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; each connection&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="249459" author="mhanafi" created="Tue, 18 Jun 2019 19:57:24 +0000"  >&lt;p&gt;you should look at 2.10.8. That version reverts &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9810&quot; title=&quot;Melanox OFED 4.1 support&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9810&quot;&gt;&lt;del&gt;LU-9810&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://github.com/jlan/lustre-nas/commits/nas-2.10.8&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/jlan/lustre-nas/commits/nas-2.10.8&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="249460" author="ashehata" created="Tue, 18 Jun 2019 21:41:36 +0000"  >&lt;p&gt;You&apos;re right. I was looking at 2.10.5.&lt;/p&gt;

&lt;p&gt;There is one issue with 2.10.8 though. It does two things. It has:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 b856230fd2ac2a7b0d0b39a347fa981de424839a Revert &lt;span class=&quot;code-quote&quot;&gt;&quot;LU-9810 lnd: use less CQ entries &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; each connection&quot;&lt;/span&gt;&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;and&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
6c12330a350d3e082c835fb61cfcc5c9c93e9bdd LU-12065 lnd: increase CQ entries&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;When I was suggesting reverting &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9810&quot; title=&quot;Melanox OFED 4.1 support&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9810&quot;&gt;&lt;del&gt;LU-9810&lt;/del&gt;&lt;/a&gt; the understanding is that it would be reverted on 2.10.5 and 2.10.6&lt;/p&gt;

&lt;p&gt;2.10.8 has the actual fix which is the second commit above. I think to make the branch more closely resemble the official 2.10.8, you should only have:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
6c12330a350d3e082c835fb61cfcc5c9c93e9bdd LU-12065 lnd: increase CQ entries&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;I&apos;ll have the debug patch we discussed available by tomorrow.&lt;/p&gt;</comment>
                            <comment id="249630" author="ashehata" created="Thu, 20 Jun 2019 23:53:19 +0000"  >&lt;p&gt;took me a bit longer to get through.&lt;/p&gt;

&lt;p&gt;Here is a debug patch which can be used to monitor the internal iblnd queues and trigger an action when the queues get too large. I tested it locally, but I don&apos;t have a large cluster. So it&apos;ll be a good idea to test it on a larger cluster before deploying it on a live system.&lt;/p&gt;

&lt;p&gt;Patch is &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/32830/32830_nasa_lu11644.patch&quot; title=&quot;nasa_lu11644.patch attached to LU-11644&quot;&gt;nasa_lu11644.patch&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;.&lt;/p&gt;

&lt;p&gt;you can run:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
lnetctl peer show --lnd --net &amp;lt;net type: ex o2ib1&amp;gt; &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;This will dump output like&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 [root@trevis-407 ~]# lnetctl peer show --lnd --net o2ib
lnd_peer:
    - nid: 172.16.1.6@o2ib
      ni_nid: 172.16.1.7@o2ib
      num_conns: 1
      tx_queue: 0
      accepting: 0
      connecting: 0
      reconnecting: 0
      conn_races: 0
      reconnected: 0
      refcount: 2
      max_frags: 256
      queue_depth: 8
      conns:
        - refcount: 0
          credits: 20
          outstanding_credits: 8
          reserved_credits: 1
          early_rxs: 8
          tx_noops: 0
          tx_active: 0
          tx_queue_nocred: 0
          tx_queue_rsrvd: 0
          tx_queue: 0
          conn_state: 0
          sends_posted: 3
          queue_depth: 0
          max_frags: 0
    - nid: 172.16.1.7@o2ib
      ni_nid: 172.16.1.7@o2ib
      num_conns: 2
      tx_queue: 0
      accepting: 0
      connecting: 0
      reconnecting: 0
      conn_races: 0
      reconnected: 0
      refcount: 3
      max_frags: 256
      queue_depth: 8
      conns:
        - refcount: 0
          credits: 20
          outstanding_credits: 7
          reserved_credits: 0
          early_rxs: 8
          tx_noops: 0
          tx_active: 0
          tx_queue_nocred: 0
          tx_queue_rsrvd: 0
          tx_queue: 0
          conn_state: 0
          sends_posted: 3
          queue_depth: 0
          max_frags: 0
        - refcount: 16777224
          credits: 20
          outstanding_credits: 8
          reserved_credits: 1
          early_rxs: 8
          tx_noops: 0
          tx_active: 0
          tx_queue_nocred: 0
          tx_queue_rsrvd: 0
          tx_queue: 0
          conn_state: 0
          sends_posted: 3
          queue_depth: 0
          max_frags: 0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Under the conns section you can monitor the different queue sizes: tx_active is the one of interest at the moment. This will require a bit of experimentation on a heavily loaded system, to see the average size of these queues. Then you can have a python script (or similar) to trigger an action whenever the queues grow beyond the expected average. The discussed action was to initiate some MLNX debugging to capture more data.&lt;/p&gt;

&lt;p&gt;Note the patch grabs the first 4096 peers only.&lt;/p&gt;

&lt;p&gt;An example python script is below. I used o2ib for the network and 300 for the expected average queue size. For the action, I simply print an output.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
&lt;span class=&quot;code-keyword&quot;&gt;import&lt;/span&gt; yaml
&lt;span class=&quot;code-keyword&quot;&gt;import&lt;/span&gt; subprocess
&lt;span class=&quot;code-keyword&quot;&gt;import&lt;/span&gt; time

&lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; True:
    output = subprocess.check_output([&lt;span class=&quot;code-quote&quot;&gt;&apos;lnetctl&apos;&lt;/span&gt;, &lt;span class=&quot;code-quote&quot;&gt;&apos;peer&apos;&lt;/span&gt;, &lt;span class=&quot;code-quote&quot;&gt;&apos;show&apos;&lt;/span&gt;, &lt;span class=&quot;code-quote&quot;&gt;&apos;--lnd&apos;&lt;/span&gt;, &lt;span class=&quot;code-quote&quot;&gt;&apos;--net&apos;&lt;/span&gt;, &lt;span class=&quot;code-quote&quot;&gt;&apos;o2ib&apos;&lt;/span&gt;])
    y = yaml.safe_load(output)
    &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; i in range (0, len(y[&lt;span class=&quot;code-quote&quot;&gt;&apos;lnd_peer&apos;&lt;/span&gt;])):
        peer = y[&lt;span class=&quot;code-quote&quot;&gt;&apos;lnd_peer&apos;&lt;/span&gt;][i]
        &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; j in range(0, len(peer[&lt;span class=&quot;code-quote&quot;&gt;&apos;conns&apos;&lt;/span&gt;])):
           &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; peer[&lt;span class=&quot;code-quote&quot;&gt;&apos;conns&apos;&lt;/span&gt;][j][&lt;span class=&quot;code-quote&quot;&gt;&apos;tx_active&apos;&lt;/span&gt;] &amp;gt; 300:
              print(&lt;span class=&quot;code-quote&quot;&gt;&quot;peer: &quot;&lt;/span&gt;, peer[&lt;span class=&quot;code-quote&quot;&gt;&apos;nid&apos;&lt;/span&gt;], &lt;span class=&quot;code-quote&quot;&gt;&quot;active_tx is growing too large: &quot;&lt;/span&gt;, peer[&lt;span class=&quot;code-quote&quot;&gt;&apos;conns&apos;&lt;/span&gt;][j][&lt;span class=&quot;code-quote&quot;&gt;&apos;tx_active&apos;&lt;/span&gt;])
    time.sleep(10)

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="249669" author="mhanafi" created="Fri, 21 Jun 2019 17:09:06 +0000"  >&lt;p&gt;Thanks, will test and report back results.&lt;/p&gt;</comment>
                            <comment id="250030" author="mhanafi" created="Wed, 26 Jun 2019 00:02:35 +0000"  >&lt;p&gt;I tested the patch with +8000 clients and no issues. It will take a few weeks to schedule dedicated time on our production filesystem. Will report back once I have more data.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="250093" author="mhanafi" created="Wed, 26 Jun 2019 17:35:36 +0000"  >&lt;p&gt;Found an issue with the debug patch. It is only reporting the first 4097 peers.&lt;/p&gt;

&lt;p&gt;nbptest2-srv1 ~ # ls -l /proc/fs/lustre/mdt/nbptest2-MDT0000/exports/| wc -l&lt;br/&gt;
 14260&lt;br/&gt;
 nbptest2-srv1 ~ # lnetctl peer show --lnd --net o2ib |grep &apos; nid:&apos; | wc -l&lt;br/&gt;
 4097&lt;/p&gt;

&lt;p&gt;&#160;We recompiled with larger value for peers.lndprs_num_peers = 16k.&lt;/p&gt;</comment>
                            <comment id="250137" author="ashehata" created="Thu, 27 Jun 2019 14:06:19 +0000"  >&lt;p&gt;Hi Mahmoud, I noted this limitation in my initial comment about the patch. I wanted to get the patch out and it would&apos;ve taken longer to implement an iterative way of pulling up the peers in 4K chunks. But as long at the 16K works, it should be ok.&lt;/p&gt;</comment>
                            <comment id="251341" author="pjones" created="Sat, 13 Jul 2019 15:42:49 +0000"  >&lt;p&gt;Mahmoud&lt;/p&gt;

&lt;p&gt;Do you have the dedicated time scheduled to run this test yet?&lt;/p&gt;

&lt;p&gt;Peter &lt;/p&gt;</comment>
                            <comment id="251345" author="mhanafi" created="Sat, 13 Jul 2019 18:37:36 +0000"  >&lt;p&gt;Because we had the crash with the larger peers.lndprs_num_peers we wanted to do more testing before installing on production filesystem. I haven&apos;t had time to get back to this yet... Since things are stable with +net debugging it&apos;s a bit lower on the priority.&lt;/p&gt;</comment>
                            <comment id="259048" author="mhanafi" created="Mon, 2 Dec 2019 21:46:31 +0000"  >&lt;p&gt;I was able to capture some rpc rates before client evictions. It showed that the server gets a large spike in ldlm_glimpse_enqueue rpcs that starves out the ping rpcs. I have two charts that show this. So somehow when we enable +net debugging it slows down things and the ping rpcs don&apos;t get blocked. &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/33951/33951_client_evictions_charts.pdf&quot; title=&quot;client_evictions_charts.pdf attached to LU-11644&quot;&gt;client_evictions_charts.pdf&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt; &lt;br/&gt;
What can cause such a large spike in ldlm_glimpse_enqueue?&lt;/p&gt;</comment>
                            <comment id="260695" author="mhanafi" created="Tue, 7 Jan 2020 18:43:46 +0000"  >&lt;p&gt;I am trying to create a reproducer for the case where an OSS gets a large spike in ldlm_glimpse_enqueue RPCs. What is the best way to recreate this RPC workload?&lt;/p&gt;</comment>
                            <comment id="260817" author="mhanafi" created="Thu, 9 Jan 2020 05:33:02 +0000"  >&lt;p&gt;I was able to get a backtrace of all threads when the server&apos;s ping rpcs drop to zero. It shows 508 out of 512 ll_ost threads in ldlm_run_ast_work. This must block receiving all other RPCs. &lt;/p&gt;

&lt;p&gt;What options do we have to slow down the rate of ldlm_glimpse_enqueues?&lt;/p&gt;

&lt;p&gt; &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/34092/34092_s214_bt.20200108.18.21.23&quot; title=&quot;s214_bt.20200108.18.21.23 attached to LU-11644&quot;&gt;s214_bt.20200108.18.21.23&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt; &lt;/p&gt;</comment>
                            <comment id="260942" author="adilger" created="Fri, 10 Jan 2020 00:32:49 +0000"  >&lt;p&gt;Mahmoud, do you know what the client application is doing at this point in the run?  Glimpse RPCs are generated when clients do &quot;&lt;tt&gt;stat()&lt;/tt&gt;&quot; operations on files to get the size, that send the LDLM glimpse RPC for the OST object(s) in the file, which may in turn cause the OST to send RPCs to the client(s) holding the locks for the file if it is actively being written.  So if there were a multiple clients doing parallel directory tree traversal in the same directory where other clients are writing it could generate a lot of glimpses, or if the application was malformed and calling &lt;tt&gt;stat()&lt;/tt&gt; repeatedly on a shared file for some reason (e.g. to poll for updates/completion)?&lt;/p&gt;</comment>
                            <comment id="260947" author="green" created="Fri, 10 Jan 2020 01:26:09 +0000"  >&lt;p&gt;can you please describe what workload is it? some big activity on shared files but enough of them to generate plenty of glimpses? something else?&lt;/p&gt;

&lt;p&gt;You can try scaling up number of ost threads and see if that helps though obviously it&apos;s not ideal. there&apos;s probably a number of other workarounds we can use to either elevate ping rpc priority if it&apos;s not high yet or such.&lt;/p&gt;</comment>
                            <comment id="261040" author="mhanafi" created="Fri, 10 Jan 2020 16:35:29 +0000"  >&lt;p&gt;I haven&apos;t been able to track down the user or application creating the glimpse RPCs. Part of the issue is enabling RPCTRACE slows things down enough to prevent the evictions. I tried to limit rpcs using tbf but all rpcs are issued as user root. So that didn&apos;t work.&#160;&lt;/p&gt;

&lt;p&gt;I will try to setup some additional debugging triggers to determine the application/user.&lt;/p&gt;</comment>
                            <comment id="261067" author="adilger" created="Fri, 10 Jan 2020 22:35:44 +0000"  >&lt;p&gt;It looks like 2.10.x has the patch &lt;a href=&quot;https://review.whamcloud.com/17345&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/17345&lt;/a&gt; &quot;&lt;tt&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7470&quot; title=&quot;Extend TBF policy with NID/JobID expressions&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7470&quot;&gt;&lt;del&gt;LU-7470&lt;/del&gt;&lt;/a&gt; nrs: extend TBF with NID/JobID/OPCode expression&lt;/tt&gt;&quot; which allows you to rate-limit the RPCs by opcode.  While I would not normally suggest to limit LDLM traffic, you could try this for opc 101 (&lt;tt&gt;LDLM_ENQUEUE&lt;/tt&gt;) to see if the problem can be avoided.  I&apos;m not sure of the specific rate limits you want, but looking at the recently-attached graph you could try 4000 as a reasonable cap (i.e. above normal usage, but enough to slow down the huge spike of incoming RPCs.  You may have more fine-grained breakdown of RPCs by type to see how many of the RPCs in that spike are from &lt;tt&gt;LDLM_ENQUEUE&lt;/tt&gt;.&lt;/p&gt;

&lt;p&gt;If enabling &lt;tt&gt;+rpctrace&lt;/tt&gt; debugging is too heavy-weight, you could also try to enable the RPC history functionality, via &quot;&lt;tt&gt;lctl set_param ost.OSS.&amp;#42;.req_buffer_history_max=10000&lt;/tt&gt;&quot; and then grab it every second via &quot;&lt;tt&gt;lctl get_param ost.OSS.&amp;#42;.req_history &amp;gt;&amp;gt; /var/log/lustre_oss_history-$(date +%Y%m%d%H%M)&lt;/tt&gt;&quot; (make sure you have a rule in &lt;tt&gt;/etc/logrotate.d/lustre.conf&lt;/tt&gt; to clean up those logs if you cannot reproduce it quickly) and then post-process it after an event for a detailed RPC log (&lt;tt&gt;sort -u&lt;/tt&gt; would be needed to discard duplicate records in a given logfile).  The fields are:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;timestamp:target_nid:client_nid:client_xid:req_length:req_state:arrival_time:expiry_time opc number
6780438299461025792:192.168.20.1@tcp:12345-192.168.20.159@tcp:x1654112309314912:328:Complete:1578693813.544069:-1.455930s(-43.0s) opc 101
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Since the RPC history does not involve printing to the logs (just saving the request into a list), it &lt;em&gt;may&lt;/em&gt; be less overhead than the kernel debug logging, and allow you to capture information when the problem happens.&lt;/p&gt;</comment>
                            <comment id="261073" author="gerrit" created="Fri, 10 Jan 2020 23:19:59 +0000"  >&lt;p&gt;Andreas Dilger (adilger@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/37193&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/37193&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11644&quot; title=&quot;LNet: Service thread inactive for 300  causes client evictions &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11644&quot;&gt;LU-11644&lt;/a&gt; ptlrpc: show target name in req_history&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 3d7e754be0a0aec62cd74c977cab159796b4b8a8&lt;/p&gt;</comment>
                            <comment id="261199" author="mhanafi" created="Tue, 14 Jan 2020 20:20:56 +0000"  >&lt;p&gt;For now we have enabled rpctrace. Long term I think elevating ping rpc priority may be a good option. For very large clusters it would also be nice to set obd_ping rate and eviction timeout independent of at_min. At 10k nodes our OSS average ~4000 obd_ping rpcs/sec&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="261246" author="adilger" created="Wed, 15 Jan 2020 11:43:00 +0000"  >&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;For very large clusters it would also be nice to set obd_ping rate and eviction timeout independent of at_min. At 10k nodes our OSS average ~4000 obd_ping rpcs/sec
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;For newer Lustre versions (2.12 client+server), clients will disconnect from an OST in the background if they are not actively communicating with it.  This will reduce the number of pings from an idle client, and will also reduce the load during recovery due to fewer clients being connected.&lt;/p&gt;</comment>
                            <comment id="261965" author="gerrit" created="Tue, 28 Jan 2020 06:02:32 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/37193/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/37193/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11644&quot; title=&quot;LNet: Service thread inactive for 300  causes client evictions &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11644&quot;&gt;LU-11644&lt;/a&gt; ptlrpc: show target name in req_history&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 83b6c6608e94c05766ed1eddad1b7f2fee5e79fa&lt;/p&gt;</comment>
                            <comment id="262023" author="pjones" created="Tue, 28 Jan 2020 15:59:02 +0000"  >&lt;p&gt;Do I understand correctly that the patch that landed was not a fix for the issue but just made the issue easier to debug?&lt;/p&gt;</comment>
                            <comment id="262221" author="adilger" created="Thu, 30 Jan 2020 19:20:45 +0000"  >&lt;p&gt;Correct, patch is only to improve diagnostics for similar situations. &lt;/p&gt;</comment>
                            <comment id="264132" author="mhanafi" created="Thu, 27 Feb 2020 04:28:27 +0000"  >&lt;p&gt;What about raising the priority of obd ping rpcs. This will ensure that they are not dropped.&#160;&lt;/p&gt;</comment>
                            <comment id="268399" author="adilger" created="Thu, 23 Apr 2020 17:36:53 +0000"  >&lt;p&gt;One issue about elevating &lt;tt&gt;OBD_PING&lt;/tt&gt; priority is that these RPCs are mostly &quot;dead weight&quot; just to keep the servers informed that the clients are still alive. I think the &lt;tt&gt;idle_timeout&lt;/tt&gt; feature would instead allow idle clients to disconnect and reduce the &lt;tt&gt;OBD_PING&lt;/tt&gt; traffic on the servers.&lt;/p&gt;

&lt;p&gt;Another approach is to prevent evicting clients if the service threads are not making progress on the RPC queue, or if the clients already have a pending &lt;tt&gt;OBD_PING&lt;/tt&gt; RPC in the request queue, even if it hasn&apos;t been processed yet.  That avoids interrupting useful work by prioritizing &lt;tt&gt;OBD_PING&lt;/tt&gt; processing under normal usage, but also avoids mass evictions in the rare case that the server threads are overloaded.&lt;/p&gt;

&lt;p&gt;As for the ldlm contention issue, it would be useful to get the &lt;tt&gt;+rpctrace&lt;/tt&gt; (and &lt;tt&gt;+ldlmtrace&lt;/tt&gt; if possible) debug log from one of these events, to see what the root of the problem is. Fixing &lt;tt&gt;OBD_PING&lt;/tt&gt; is only the symptom. &lt;/p&gt;</comment>
                            <comment id="268415" author="mhanafi" created="Thu, 23 Apr 2020 21:58:13 +0000"  >&lt;p&gt;By enabling +rpctrace it slow the server just enough for the issue to not occur.&lt;/p&gt;</comment>
                            <comment id="271389" author="mhanafi" created="Thu, 28 May 2020 06:01:02 +0000"  >&lt;p&gt;We need a way to debug this issue. We are still hitting this; the only way to work around it is by enabling rpctrace.&lt;/p&gt;</comment>
                            <comment id="290867" author="mhanafi" created="Mon, 1 Feb 2021 17:54:25 +0000"  >&lt;p&gt;We have a large user job (1844 nodes/ 73760cores) triggering client evictions even with our workaround of enabling RPCTRACE. I was able to gather debug logs from the last event. We have mitigated the issue by enabling TBF QoS to rate limit ost_io rpcs. The logs showed that all ll_ost_io threads sit in &apos;D&apos; state and all i/o rates drop to zero. This starts at 19:31:36 and ends at 19:35:10. At which time the server starts to evict clients running the job (see s618.out). See zero.io.top.20210130.19.31.37 for thread states. I uploaded lustre debug file to ftp.whamcloud.com/uploads/zero.io.lctl.dk.20210130.19.34.56.gz.new. The evictions only happen on OSS. This filesystem has 20 OSSes and 342 OSTs.&lt;/p&gt;</comment>
                            <comment id="291018" author="green" created="Tue, 2 Feb 2021 21:27:03 +0000"  >&lt;p&gt;I checked the debug log and I don&apos;t think I see any traces of requests being stuck in the incoming queue for a long time. various &quot;Already past deadline&quot; sort of messages are not part of the logs.&lt;/p&gt;

&lt;p&gt;I picked the very first evicted client which is &quot;10.149.3.186@o2ib313&quot; talking to &quot;nbp2-OST0025&quot; and I see last message from it as expected - getting a PR lock.&lt;/p&gt;

&lt;p&gt;I do see a bunch of ping requests from this same nid/client across time, but there&apos;s no way to know what particular OST they were sent to (and there&apos;s more than one on this node). The important part is - there&apos;s no significant delay in request processing between its arrival and it being taken into actual processing.&lt;/p&gt;

&lt;p&gt;E.g. the eviction happened at:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000100:00080000:11.0:1612064007.003210:0:97446:0:(service.c:1128:ptlrpc_update_export_timer()) nbp2-OST0025: Think about evicting 10.149.3.186@o2ib313 from 1612063856
...
00000100:00080000:9.0:1612064083.110023:0:50801:0:(pinger.c:610:ping_evictor_main()) evicting all exports of obd nbp2-OST0025 older than 1612063933
00000100:00100000:12.0:1612064083.110024:0:21995:0:(nrs_tbf.c:3127:nrs_tbf_req_get()) TBF dequeues: class@ffffa08b32e6d200 rate 5000 gen 2 token 2, rule@ffffa095af158100 rate 5000 gen 2
00000100:02000400:9.0:1612064083.110025:0:50801:0:(pinger.c:636:ping_evictor_main()) nbp2-OST0025: haven&apos;t heard from client b58b2f27-aabc-f700-545a-8d4bc7d4c55c (at 10.149.3.186@o2ib313) in 227 seconds. I think it&apos;s dead, and I am evicting it. exp ffffa07a96360000, cur 1612064083 expire 1612063933 last 1612063856
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;now looking back from that looking for traffic to 10.149.3.186@o2ib313:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000400:00000200:9.0:1612064056.837794:0:34195:0:(lib-move.c:4114:lnet_parse()) TRACE: 10.151.26.114@o2ib(10.151.26.114@o2ib) &amp;lt;- 10.149.3.186@o2ib313 : PUT - for me
00000400:00000200:9.0:1612064056.837798:0:34195:0:(lib-ptl.c:571:lnet_ptl_match_md()) Request from 12345-10.149.3.186@o2ib313 of length 224 into portal 28 MB=0x6011a1473a440
00000400:00000200:9.0:1612064056.837801:0:34195:0:(lib-ptl.c:200:lnet_try_match_md()) Incoming put index 1c from 12345-10.149.3.186@o2ib313 of length 224/224 into md 0x86246a0871 [1] + 0
00000400:00000200:9.0:1612064056.837803:0:34195:0:(lib-md.c:65:lnet_md_unlink()) Queueing unlink of md ffffa07b4745a2a8
00000100:00000200:9.0:1612064056.837804:0:34195:0:(events.c:305:request_in_callback()) event type 2, status 0, service ost
00000100:00100000:9.0:1612064056.837807:0:34195:0:(events.c:351:request_in_callback()) peer: 12345-10.149.3.186@o2ib313 (source: 12345-10.149.3.186@o2ib313)
00000400:00000200:9.0:1612064056.837809:0:34195:0:(lib-md.c:69:lnet_md_unlink()) Unlinking md ffffa07b4745a2a8
00000400:00000200:9.0:1612064056.837810:0:34195:0:(lib-msg.c:816:lnet_is_health_check()) health check = 0, status = 0, hstatus = 0
00000800:00000200:9.0:1612064056.837811:0:34195:0:(o2iblnd_cb.c:205:kiblnd_post_rx()) conn[ffffa081f0c66000] (69)++
00000100:00100000:10.0:1612064056.837812:0:37831:0:(service.c:1989:ptlrpc_server_handle_req_in()) got req x1690061384164416
00000800:00000200:9.0:1612064056.837812:0:34195:0:(o2iblnd_cb.c:239:kiblnd_post_rx()) conn[ffffa081f0c66000] (70)--
00000800:00000200:9.0:1612064056.837813:0:34195:0:(o2iblnd_cb.c:3859:kiblnd_scheduler()) conn[ffffa081f0c66000] (69)--
00000100:00100000:10.0:1612064056.837817:0:37831:0:(nrs_tbf.c:3210:nrs_tbf_req_add()) TBF enqueues: class@ffffa081e90ff600 rate 5000 gen 2 token 0, rule@ffffa07df979db00 rate 5000 gen 2
00000100:00100000:10.0:1612064056.837821:0:37831:0:(nrs_tbf.c:3127:nrs_tbf_req_get()) TBF dequeues: class@ffffa081e90ff600 rate 5000 gen 2 token 1, rule@ffffa07df979db00 rate 5000 gen 2
00000100:00100000:10.0:1612064056.837828:0:37831:0:(service.c:2140:ptlrpc_server_handle_request()) Handling RPC pname:cluuid+ref:pid:xid:nid:opc ll_ost00_250:b58b2f27-aabc-f700-545a-8d4bc7d4c55c+7:3580:x1690061384164416:12345-10.149.3.186@o2ib313:400
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;so as you can see - processing is instantaneous.&lt;/p&gt;

&lt;p&gt;What does the dmesg on the client side say? Could the RPCs be dropped somewhere along the way like routers?&lt;/p&gt;

&lt;p&gt;I don&apos;t see any obvious signs of anything being slow - no complaints about slow io nor any threads being stuck for a long time like the original bugreport shows (the &quot;LNet: Service thread pid XXX was inactive for&quot;)&lt;/p&gt;</comment>
                            <comment id="291019" author="mhanafi" created="Tue, 2 Feb 2021 21:41:36 +0000"  >&lt;p&gt;I don&apos;t think this is due to dropped messages. If we had dropped messages it would affect multiple servers and osts. There are no errors on the client until they get evicted. No errors on the routers and there are no lnet errors on the servers.&lt;/p&gt;

&lt;p&gt;What is troubling is why all I/O halts while all ll_ost_io threads are sitting in &apos;D&apos; state. I will try to get a dump of the stacks of the threads.&lt;/p&gt;

&lt;p&gt;If an I/O request takes longer to service (while io threads are sitting in &apos;D&apos; state) than the obdping timeout, will the client get evicted?&#160;&lt;/p&gt;</comment>
                            <comment id="291027" author="green" created="Tue, 2 Feb 2021 22:20:32 +0000"  >&lt;p&gt;First of all I do not think you have any outstanding io requests from this particular client. If you did - we&apos;d see aborted bulk messages and I don&apos;t think I saw any?&lt;/p&gt;

&lt;p&gt;if the io thread is taking too long in the actual filesystem layer we&apos;d see messages like this for writes: &quot;transaction handle %p was open for too long: now %lld, alloced %lld, started %lld, closed %lld&quot; and there were a slow io of other kind messages that I cannot readily find. In addition the lnet &quot;thread took too long&quot; would also happen and I think you did not have any incidents of any watchdogs triggering this time?&lt;/p&gt;

&lt;p&gt;If you have requests sitting too long in the serving queue there&apos;d be the &quot;Already past deadline&quot; messages.&lt;/p&gt;

&lt;p&gt;If you have requests taking to long actually being serviced.- the client would timeout (you&apos;d also have watchdogs trigger).&lt;/p&gt;

&lt;p&gt;Looking at the code we prolong export several times: when a request comes in and is added to the req in queue, when we start serving it and when we finish serving it.&lt;br/&gt;
So while it&apos;s possible for a too long request to end up in ping eviction in absence of other requests, the fact that the export&apos;s &quot;last served&quot; time did not advance in all this time and we can see it matching last confirmed request from this client, I think it&apos;s a pretty solid evidence that at least this particular client did not really send a ping request for this OST or if it did - it did not arrive to the OSS node - note if this was the case you must see a timeout for the opc 400 on the client though.&lt;/p&gt;

&lt;p&gt;3rd option is if the client and server idea of obdtimeout is different (with client having a significantly longer timeout than the server) then client might be sending pings a lot less frequently than the server expects.&lt;/p&gt;</comment>
                            <comment id="291029" author="mhanafi" created="Tue, 2 Feb 2021 22:39:21 +0000"  >&lt;p&gt;This filesystem never sees more than 1 second of zero reads and writes, while during this event it sees zero I/O for 2 mins, so something must be locked up. I will try to get a stack trace of all threads.&lt;/p&gt;

&lt;p&gt;The file I attached yesterday (s618.out) show that all I/O drops to zero and server load starts to climb starting at 19:31:36&#160; to 19:34:56, when the evictions happen.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;The watchdog timers (500 seconds) are much higher than the obdping timeout so we wouldn&apos;t see those warnings.&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="291042" author="adilger" created="Wed, 3 Feb 2021 00:29:18 +0000"  >&lt;p&gt;Oleg, is it possible that something in the NRS TBF path is bypassing/blocking the update of the export and preventing the request from being processed by the server?  &lt;/p&gt;

&lt;p&gt;In your previous example, it shows that this client is active for other pings and yet it is being evicted shortly thereafter.  At the time of eviction, it looks like a request is being dequeued for the export, but It probably make sense for the target to evict a client that still has a pending request in the RPC queue:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;nrs_tbf_req_get()) TBF dequeues: class@ffffa08b32e6d200 gen 2 token 2, rule@ffffa095af158100 rate 5000 gen 2
ping_evictor_main()) nbp2-OST0025: haven&apos;t heard from client (at 10.149.3.186@o2ib313) in 227 seconds. I think it&apos;s dead, and I am evicting it. cur 1612064083 expire 1612063933 last 1612063856
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;so it might be useful to print a bit more information in &lt;tt&gt;nrs_tbf_req_get()&lt;/tt&gt; about the request so that we can see what it is.  Maybe change the &lt;tt&gt;CDEBUG(D_RPCTRACE, ...)&lt;/tt&gt; there to a &lt;tt&gt;DEBUG_REQ(D_RPCTRACE, ...)&lt;/tt&gt;.  Also adding the target name into the &lt;tt&gt;ptlrpc_server_handle_request()&lt;/tt&gt; &quot;&lt;tt&gt;Handling&lt;/tt&gt;&quot; and &quot;&lt;tt&gt;Handled&lt;/tt&gt;&quot; lines (from &lt;tt&gt;req-&amp;gt;rq_export-&amp;gt;exp_obd-&amp;gt;obd_name&lt;/tt&gt;, if &lt;tt&gt;rq_export&lt;/tt&gt; isn&apos;t NULL) would definitely be very useful for lots of reasons.&lt;/p&gt;</comment>
                            <comment id="291043" author="mhanafi" created="Wed, 3 Feb 2021 00:33:24 +0000"  >&lt;p&gt;I don&apos;t think tbf is involved because we saw this with fifo.&#160;&lt;/p&gt;</comment>
                            <comment id="291048" author="adilger" created="Wed, 3 Feb 2021 01:18:05 +0000"  >&lt;blockquote&gt;
&lt;p&gt;3rd option is if the client and server idea of obd timeout is different (with client having a significantly longer timeout than the server) then client might be sending pings a lot less frequently than the server expects.&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;Mahmoud, do you have multiple different filesystems mounted on the same client, possibly with different timeout values configured by their MGS config logs?  There is an old problem &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9912&quot; title=&quot;fix multiple client mounts with different server timeouts&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9912&quot;&gt;LU-9912&lt;/a&gt; that relates to this, in the uncommon case where different filesystems have been tuned with different timeouts...&lt;/p&gt;</comment>
                            <comment id="291106" author="mhanafi" created="Wed, 3 Feb 2021 16:49:09 +0000"  >&lt;p&gt;We do mount multiple filesystems, but we don&apos;t save those timeout values on the MGS. I double-checked the MGS config logs; there were no timeout settings.&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="291156" author="green" created="Wed, 3 Feb 2021 20:19:36 +0000"  >&lt;p&gt;Andreas, other pings are apparently coming to other OSTs on this node. We don&apos;t print the target information so I don&apos;t know how to 100% confirm that theory but it seems to be the most likely one.&lt;/p&gt;

&lt;p&gt;Mahmoud - can you please directly check the timeout value on the client I looked at vs the server just to be sure?&lt;/p&gt;

&lt;p&gt;The watchdog timeout is set based on AT values, so it varies, and the bigger it gets, the more latency there was in the system beforehand. But doing sysrq-t as you see fit should show you where everything is stuck.&lt;/p&gt;</comment>
                            <comment id="291164" author="mhanafi" created="Wed, 3 Feb 2021 20:50:16 +0000"  >&lt;p&gt;I checked at_min, at_max, and time on all servers and clients; they are the same.&lt;/p&gt;

&lt;p&gt;timeout=100&lt;/p&gt;

&lt;p&gt;at_min=275&lt;/p&gt;

&lt;p&gt;at_max=600&lt;/p&gt;

&lt;p&gt;We know that the server receives obdping from other clients and sends replies. These clients, which get evicted, are sending pings to the other OSTs; otherwise we would see evictions on more than one OST.&#160;&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="54277">LU-11768</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                                        </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="33951" name="client_evictions_charts.pdf" size="65011" author="mhanafi" created="Mon, 2 Dec 2019 21:45:10 +0000"/>
                            <attachment id="32805" name="eviction_s611.06.05.19" size="23917" author="mhanafi" created="Fri, 14 Jun 2019 17:54:27 +0000"/>
                            <attachment id="32440" name="lnet_metrics_during_eviction.pdf" size="407096" author="mhanafi" created="Thu, 18 Apr 2019 02:17:41 +0000"/>
                            <attachment id="32830" name="nasa_lu11644.patch" size="15019" author="ashehata" created="Thu, 20 Jun 2019 23:54:23 +0000"/>
                            <attachment id="34092" name="s214_bt.20200108.18.21.23" size="1013701" author="mhanafi" created="Thu, 9 Jan 2020 05:31:52 +0000"/>
                            <attachment id="37342" name="s618.out" size="38312" author="mhanafi" created="Mon, 1 Feb 2021 17:38:45 +0000"/>
                            <attachment id="37343" name="zero.io.top.20210130.19.31.37" size="491393" author="mhanafi" created="Mon, 1 Feb 2021 17:38:57 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i0060v:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10021"><![CDATA[2]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>