<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:58:36 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-6251] Mellanox / O2ib lnd causes an OOM on OST node</title>
                <link>https://jira.whamcloud.com/browse/LU-6251</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;During investigation of an OOM on a node, we found a large number of allocations done with sizes of 532480 and 266240 bytes.&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Example of vm_struct &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; memory region with size 266240:
crash&amp;gt; vm_struct ffff880019c542c0
struct vm_struct {
  next = 0xffff880588f29900, 
  addr = 0xffffc904a626d000, 
  size = 266240, 
  flags = 4, 
  pages = 0x0, 
  nr_pages = 0, 
  phys_addr = 0, 
  caller = 0xffffffffa00b7136 &amp;lt;mlx4_buf_alloc+870&amp;gt;
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;99% of memory regions with size 266240 and 532480 have caller = 0xffffffffa00b7136 &amp;lt;mlx4_buf_alloc+870&amp;gt;.&lt;/p&gt;

&lt;p&gt;The number of such regions is 31042 / 31296.&lt;br/&gt;
I found strange backtraces in the kernel.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;PID: 83859  TASK: ffff8807d64ca040  CPU: 0   COMMAND: &lt;span class=&quot;code-quote&quot;&gt;&quot;kiblnd_connd&quot;&lt;/span&gt;
 #0 [ffff8807b2835a90] schedule at ffffffff815253c0
 #1 [ffff8807b2835b58] schedule_timeout at ffffffff815262a5
 #2 [ffff8807b2835c08] wait_for_common at ffffffff81525f23
 #3 [ffff8807b2835c98] wait_for_completion at ffffffff8152603d
 #4 [ffff8807b2835ca8] synchronize_sched at ffffffff81096e88
 #5 [ffff8807b2835cf8] mlx4_cq_free at ffffffffa00bf188 [mlx4_core]
 #6 [ffff8807b2835d68] mlx4_ib_destroy_cq at ffffffffa04725f5 [mlx4_ib]
 #7 [ffff8807b2835d88] ib_destroy_cq at ffffffffa043de99 [ib_core]
 #8 [ffff8807b2835d98] kiblnd_destroy_conn at ffffffffa0acbafc [ko2iblnd]
 #9 [ffff8807b2835dd8] kiblnd_connd at ffffffffa0ad5fe1 [ko2iblnd]
#10 [ffff8807b2835ee8] kthread at ffffffff8109ac66
#11 [ffff8807b2835f48] kernel_thread at ffffffff8100c20a
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;So the thread is blocked on something while destroying an IB connection.&lt;br/&gt;
Inspecting the task:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;crash&amp;gt; p ((struct task_struct *)0xffff8807d64ca040)-&amp;gt;se.cfs_rq-&amp;gt;rq-&amp;gt;clock
$25 = 230339336880160
 crash&amp;gt; p ((struct task_struct *)0xffff8807d64ca040)-&amp;gt;se.block_start
$26 = 230337329685261
 &amp;gt;&amp;gt;&amp;gt; (230339336880160-230337329685261)/10**9
2
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;But more interestingly, in the o2ib lnd statistics I found:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;crash&amp;gt; kib_net 0xffff8808325e9dc0
struct kib_net {
  ibn_list = {
    next = 0xffff8807b40a2f40, 
    prev = 0xffff8807b40a2f40
  }, 
  ibn_incarnation = 1423478059211439, 
  ibn_init = 2, 
  ibn_shutdown = 0, 
  ibn_npeers = {
    counter = 31042
  }, 
  ibn_nconns = {
    counter = 31041
  },
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;So 31k peers - but the tests are run on a cluster with 14 real clients and 5 server nodes, so no more than 20 connections should exist.&lt;br/&gt;
But where are they placed?&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;crash&amp;gt; p &amp;amp;kiblnd_data.kib_connd_zombies
$7 = (struct list_head *) 0xffffffffa0ae7e70 &amp;lt;kiblnd_data+112&amp;gt;
crash&amp;gt; list -H 0xffffffffa0ae7e70 -o kib_conn.ibc_list | wc -l
31030
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;So all memory is consumed by zombie connections, each of which needs more than 2s to destroy.&lt;/p&gt;</description>
                <environment>2.5.1 based Lustre code.</environment>
        <key id="28713">LU-6251</key>
                <summary>Mellanox / O2ib lnd causes an OOM on OST node</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="2">Won&apos;t Fix</resolution>
                                        <assignee username="wc-triage">WC Triage</assignee>
                                    <reporter username="shadow">Alexey Lyashkov</reporter>
                        <labels>
                    </labels>
                <created>Mon, 16 Feb 2015 16:59:52 +0000</created>
                <updated>Wed, 13 Oct 2021 01:35:50 +0000</updated>
                            <resolved>Wed, 13 Oct 2021 01:35:50 +0000</resolved>
                                    <version>Lustre 2.5.1</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>16</watches>
                                                                            <comments>
                            <comment id="107091" author="sergey" created="Mon, 16 Feb 2015 18:44:09 +0000"  >&lt;p&gt;mlnx version: mlnx-ofa_kernel-2.3&lt;/p&gt;</comment>
                            <comment id="124776" author="bruno.travouillon" created="Fri, 21 Aug 2015 12:42:17 +0000"  >&lt;p&gt;Alexey, Sergey,&lt;/p&gt;

&lt;p&gt;Have you been able to troubleshoot this issue? We are hitting a similar issue with 2.5.3.90 and OFED 3.12.&lt;/p&gt;</comment>
                            <comment id="124870" author="shadow" created="Mon, 24 Aug 2015 09:18:16 +0000"  >&lt;p&gt;Bruno,&lt;/p&gt;

&lt;p&gt;Not yet. We have hit this issue once. Liang created patch &lt;a href=&quot;http://review.whamcloud.com/#/c/14600/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/14600/&lt;/a&gt; which may help with it, but it does not help with the original bug this ticket was created for. So if you can reproduce this issue, try that patch and report how it works for you.&lt;/p&gt;</comment>
                            <comment id="124998" author="shadow" created="Tue, 25 Aug 2015 12:03:39 +0000"  >&lt;p&gt;I had a discussion with the IB guys today; they say LNet has a bug in IB connect error handling, as it lacks something like&lt;br/&gt;
&lt;a href=&quot;http://sourceforge.net/p/scst/svn/HEAD/tree/trunk/iscsi-scst/kernel/isert-scst/iser_rdma.c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://sourceforge.net/p/scst/svn/HEAD/tree/trunk/iscsi-scst/kernel/isert-scst/iser_rdma.c&lt;/a&gt;&lt;br/&gt;
line 782 &lt;/p&gt;

&lt;p&gt;It may put the connection in a wrong state if some packets were lost during the connect handshake.&lt;/p&gt;</comment>
                            <comment id="125337" author="bruno.travouillon" created="Thu, 27 Aug 2015 06:49:08 +0000"  >&lt;p&gt;Thanks Alexey. We still need to reproduce the issue on a test cluster, I should test with the patch afterwards.&lt;/p&gt;

&lt;p&gt;However, it will only avoid the memory pressure and won&apos;t solve the underlying issue with the zombie connection.&lt;/p&gt;</comment>
                            <comment id="125383" author="shadow" created="Thu, 27 Aug 2015 15:14:48 +0000"  >&lt;p&gt;From my point of view, the zombies are the result of handshake packet loss, so if we fix that problem we will fix the zombies.&lt;/p&gt;</comment>
                            <comment id="127054" author="wesley" created="Fri, 11 Sep 2015 08:41:47 +0000"  >&lt;p&gt;Alexey, Bruno,&lt;br/&gt;
We also encountered  this OOM recently since memory was exhausted by mlx4_buf_alloc.&lt;br/&gt;
Do you know how the issue is triggered and can we do something helpful for it?&lt;/p&gt;</comment>
                            <comment id="128918" author="bruno.travouillon" created="Wed, 30 Sep 2015 17:10:26 +0000"  >&lt;p&gt;Wesley,&lt;/p&gt;

&lt;p&gt;We are still investigating. I should try  &lt;a href=&quot;http://review.whamcloud.com/#/c/14600/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/14600/&lt;/a&gt; soon.&lt;/p&gt;</comment>
                            <comment id="134622" author="doug" created="Thu, 26 Nov 2015 16:47:28 +0000"  >&lt;p&gt;Hi Alexey,  Has there been any progress on this issue?  We are seeing more cases of this OOM with mlx5.  I&apos;m starting to suspect that mlx5 is even more aggressive in memory usage making this problem even worse with newer Mellanox cards.  Patch &lt;a href=&quot;http://review.whamcloud.com/#/c/14600/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/14600/&lt;/a&gt; does not seem to be enough of a break to allow memory to be released.&lt;/p&gt;</comment>
                            <comment id="136209" author="sergey" created="Mon, 14 Dec 2015 17:07:29 +0000"  >&lt;p&gt;The issue is fixed at Seagate after updating mlnx-ofa_kernel from 2.3 to 3.1-1.0.3.&lt;br/&gt;
In a few words, the problem is caused by an internal mlx4 IB driver error.&lt;br/&gt;
This error causes the creation of a huge number of zombie connections (about 300,000).&lt;br/&gt;
These connections consume all the memory on the server.&lt;br/&gt;
Below is an example of such error symptoms:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@windu-head ~]# pdsh -S -w windu-client[02-11] perfquery | grep PortXmitWait
windu-client02: PortXmitWait:....................0
windu-client07: PortXmitWait:....................0
windu-client03: PortXmitWait:....................0
windu-client10: PortXmitWait:....................0
windu-client06: PortXmitWait:....................0
windu-client11: PortXmitWait:....................298906
windu-client08: PortXmitWait:....................0
windu-client04: PortXmitWait:....................0
windu-client09: PortXmitWait:....................0
windu-client05: PortXmitWait:....................0
# ibqueryerrors
Errors for &quot;winduoem mlx4_0&quot;
   GUID 0x2c903004cb42d port 1: [LinkDownedCounter == 16] [PortXmitDiscards == 32]
Errors for &quot;windu02 mlx4_0&quot;
   GUID 0x1e67030066eb15 port 1: [PortXmitWait == 1]
Errors for &quot;windu07 mlx4_0&quot;
   GUID 0x1e67030066ed9d port 1: [PortXmitWait == 1]
Errors for &quot;MT25408 ConnectX Mellanox Technologies&quot;
   GUID 0x2590ffffdfac0d port 1: [PortXmitWait == 298906]
Errors for 0xf452140300838740 &quot;SwitchX -  Mellanox Technologies&quot;
   GUID 0xf452140300838740 port ALL: [LinkDownedCounter == 255] [PortRcvSwitchRelayErrors == 4951] [PortXmitWait == 4294967295]
   GUID 0xf452140300838740 port 1: [LinkDownedCounter == 1] [PortRcvSwitchRelayErrors == 1]
   GUID 0xf452140300838740 port 4: [LinkDownedCounter == 38] [PortRcvSwitchRelayErrors == 485] [PortXmitWait == 25194846]
   GUID 0xf452140300838740 port 6: [LinkDownedCounter == 75] [PortXmitWait == 2742]
   GUID 0xf452140300838740 port 11: [LinkDownedCounter == 98] [PortRcvSwitchRelayErrors == 352] [PortXmitWait == 4484903]
   GUID 0xf452140300838740 port 12: [LinkDownedCounter == 96] [PortRcvSwitchRelayErrors == 352] [PortXmitWait == 95352145]
   GUID 0xf452140300838740 port 18: [PortXmitWait == 3689472946]
   GUID 0xf452140300838740 port 20: [LinkDownedCounter == 27]
   GUID 0xf452140300838740 port 22: [LinkDownedCounter == 204] [PortXmitWait == 206951]
   GUID 0xf452140300838740 port 26: [LinkDownedCounter == 42] [PortRcvSwitchRelayErrors == 1009] [PortXmitWait == 7327592]
   GUID 0xf452140300838740 port 28: [LinkDownedCounter == 42] [PortRcvSwitchRelayErrors == 1006] [PortXmitWait == 7555602]
   GUID 0xf452140300838740 port 30: [LinkDownedCounter == 41] [PortRcvSwitchRelayErrors == 1000] [PortXmitWait == 6546015]
   GUID 0xf452140300838740 port 32: [LinkDownedCounter == 194]
   GUID 0xf452140300838740 port 34: [LinkDownedCounter == 36] [PortRcvSwitchRelayErrors == 746] [PortXmitWait == 6957849]
   GUID 0xf452140300838740 port 36: [LinkDownedCounter == 184] [PortXmitWait == 4294967295]
Errors for &quot;windu08 mlx4_0&quot;
   GUID 0x1e670300670add port 1: [PortXmitWait == 1]
Errors for 0xf4521403008386c0 &quot;SwitchX -  Mellanox Technologies&quot;
   GUID 0xf4521403008386c0 port ALL: [LinkDownedCounter == 255] [PortRcvSwitchRelayErrors == 5742] [PortXmitWait == 4294967295]
   GUID 0xf4521403008386c0 port 6: [LinkDownedCounter == 26] [PortXmitWait == 2687]
   GUID 0xf4521403008386c0 port 8: [LinkDownedCounter == 176] [PortXmitWait == 1849083011]
   GUID 0xf4521403008386c0 port 10: [LinkDownedCounter == 71] [PortXmitWait == 3901]
   GUID 0xf4521403008386c0 port 11: [LinkDownedCounter == 94] [PortRcvSwitchRelayErrors == 352] [PortXmitWait == 14950798]
   GUID 0xf4521403008386c0 port 12: [LinkDownedCounter == 98] [PortRcvSwitchRelayErrors == 352] [PortXmitWait == 5555905]
   GUID 0xf4521403008386c0 port 13: [LinkDownedCounter == 38] [PortRcvSwitchRelayErrors == 1007] [PortXmitWait == 7829732]
   GUID 0xf4521403008386c0 port 14: [LinkDownedCounter == 38] [PortRcvSwitchRelayErrors == 1014] [PortXmitWait == 6837648]
   GUID 0xf4521403008386c0 port 15: [LinkDownedCounter == 39] [PortRcvSwitchRelayErrors == 1006] [PortXmitWait == 7558563]
   GUID 0xf4521403008386c0 port 16: [LinkDownedCounter == 42] [PortRcvSwitchRelayErrors == 1004] [PortXmitWait == 6831755]
   GUID 0xf4521403008386c0 port 17: [LinkDownedCounter == 39] [PortRcvSwitchRelayErrors == 1006] [PortXmitWait == 7336329]
   GUID 0xf4521403008386c0 port 20: [PortXmitWait == 3261894682]
   GUID 0xf4521403008386c0 port 30: [LinkDownedCounter == 10] [PortRcvSwitchRelayErrors == 1]
## Summary: 26 nodes checked, 7 bad nodes found
##          96 ports checked, 31 ports have errors beyond threshold
## Thresholds: 
## Suppressed:
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;On the other hand each connection should be destroyed and freed much faster.&lt;br/&gt;
According to my investigation, kiblnd_connd spends about 2 seconds to destroy each connection! (see description in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6251&quot; title=&quot;Melanox / O2ib lnd cause a OOM on OST node&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6251&quot;&gt;&lt;del&gt;LU-6251&lt;/del&gt;&lt;/a&gt;).&lt;br/&gt;
Possible reason is RCU mechanism used in mlx4_cq_free.&lt;br/&gt;
RCU locking in mlx4_cq_free is replaced by spin locks in newer mlnx drivers.&lt;br/&gt;
I found discussion of similar problem at &lt;a href=&quot;http://permalink.gmane.org/gmane.linux.drivers.rdma/22243&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://permalink.gmane.org/gmane.linux.drivers.rdma/22243&lt;/a&gt;.&lt;br/&gt;
There is a patch to solve the issue. But it can&apos;t be applied for 2.3 or for 3.1.&lt;br/&gt;
Anyway changing RCU locking to spin locking that is done in 3.1 is enough here.&lt;/p&gt;

&lt;p&gt;We could reproduce the problem on last master with kernel 2.6.32-431.17.1 and default mlnx-ofa-kernel-2.3.&lt;br/&gt;
After updating mlnx-ofa_kernel from 2.3 to 3.1 the problem is not seen anymore.&lt;/p&gt;

&lt;p&gt;Also want to point out that mlnx-ofa_kernel 3.1-1.0.3, for unknown reasons, has the needed changes from RCU to spin locking only for mlx4.&lt;br/&gt;
mlx5 is still not fixed in 3.1-1.0.3!&lt;/p&gt;</comment>
                            <comment id="136261" author="doug" created="Mon, 14 Dec 2015 21:29:57 +0000"  >&lt;p&gt;I&apos;ve been investigating a similar issue.  Here is what I think I am seeing:&lt;/p&gt;

&lt;p&gt;1- Two nodes have a race condition as they try to connect to each other (a reconnect actually).&lt;br/&gt;
2- Liang&apos;s patch &lt;a href=&quot;http://review.whamcloud.com/#/c/14600/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/14600/&lt;/a&gt; has the lower NID side back off so the higher NID can reconnect successfully.&lt;br/&gt;
3- The connection cleanup is delayed on the passive side due to 14600.&lt;br/&gt;
4- Leaving the connection around in a closing state means the RDMA queue and CQ are both still in play.&lt;br/&gt;
5- All the Rx buffers begin to fail due to IB_WC_WR_FLUSH_ERROR  (Alexey refers to this in his link above).&lt;br/&gt;
6- At the same time as the Rx buffers are failing on the CQ, we are seeing stale connection failures on the RDMA queue (they are matching up with the Rx buffer failures).&lt;br/&gt;
7- Because of the logic in the code, we are doing a reconnect due to the RDMA queue stale connection failures.  Since there are many Rx buffers, we end up with many reconnects occurring at the same time.  And these are themselves failing (not sure why) which triggers a new batch of escalating reconnects.&lt;/p&gt;

&lt;p&gt;Over a short time, the number of reconnects to one node is generating a huge number of zombies which are occupying all the memory.&lt;/p&gt;

&lt;p&gt;I&apos;m hypothesizing a two part fix for now given mlx5 is not fixed as you mention above:&lt;/p&gt;

&lt;p&gt;1- When we fail a connection (i.e. due to race), immediately close the RDMA queue (cmid) so it cannot trigger a bunch of reconnects.&lt;br/&gt;
2- Check the connecting counter for the peer and only allow one reconnect to be in flight at any given moment.&lt;/p&gt;

&lt;p&gt;In theory, 1 should prevent 2, but I think doing 2 is good programming to prevent any unexpected failures of this sort.&lt;/p&gt;

&lt;p&gt;Comments?&lt;/p&gt;</comment>
                            <comment id="315362" author="adilger" created="Wed, 13 Oct 2021 01:35:50 +0000"  >&lt;p&gt;MOFED 2.x is no longer of interest.&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzx6en:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>17504</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>