<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:08:29 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-7390] Router memory leak if we start a new router on an operational configuration</title>
                <link>https://jira.whamcloud.com/browse/LU-7390</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;&#160;Router memory leak if we start a new router on an operational configuration.&lt;/p&gt;

&lt;p&gt;Configuration:&lt;/p&gt;

&lt;p&gt;Lustre server 2.5.3.90 with one IB card and 2 IP addresses: QQ.P.BBO.SY, QQ.P.BBB.SY&lt;/p&gt;

&lt;p&gt;2 Lustre 2.7 routers, each with 4 IB cards and 4 IP addresses:&lt;br/&gt;
 IB0 - JO.BOO.RX.RY &lt;br/&gt;
 IB1 - QQ.P.BBO.RY&lt;br/&gt;
 IB2 - JO.BOB.RX.RY&lt;br/&gt;
 IB3 - QQ.P.BBB.RY&lt;/p&gt;

&lt;p&gt;~130 Lustre 2.7 clients, each with one IB card and 2 IP addresses: JO.BOO.CX.CY, JO.BOB.CX.CY&lt;/p&gt;

&lt;p&gt;We start all the servers, one router, and all the clients, and wait for&lt;br/&gt;
 production to start.&lt;/p&gt;

&lt;p&gt;Then we start the second router with modprobe lustre; it never starts&lt;br/&gt;
 correctly and panics with &quot;Out of memory and no killable processes...&quot;&lt;/p&gt;

&lt;p&gt;KERNEL: /usr/lib/debug/lib/modules/3.10.0-229.7.2.el7.x86_64/vmlinux&lt;br/&gt;
 DUMPFILE: /var/crash/127.0.0.1-2015.09.23-09:00:12/vmcore &lt;span class=&quot;error&quot;&gt;&amp;#91;PARTIAL DUMP&amp;#93;&lt;/span&gt;&lt;br/&gt;
 CPUS: 32&lt;br/&gt;
 DATE: Wed Sep 23 08:59:56 2015&lt;br/&gt;
 UPTIME: 14:49:59&lt;br/&gt;
 LOAD AVERAGE: 11.71, 10.11, 5.64&lt;br/&gt;
 TASKS: 547&lt;br/&gt;
 NODENAME: neel121&lt;br/&gt;
 RELEASE: 3.10.0-229.7.2.el7.x86_64&lt;br/&gt;
 VERSION: #1 SMP Fri May 15 21:38:46 EDT 2015&lt;br/&gt;
 MACHINE: x86_64 (2299 Mhz)&lt;br/&gt;
 MEMORY: 127.9 GB&lt;br/&gt;
 PANIC: &quot;Kernel panic - not syncing: Out of memory and no killable processes...&quot;&lt;br/&gt;
 PID: 5002&lt;br/&gt;
 COMMAND: &quot;kworker/u64:1&quot;&lt;br/&gt;
 TASK: ffff8810154816c0 &lt;span class=&quot;error&quot;&gt;&amp;#91;THREAD_INFO: ffff882028314000&amp;#93;&lt;/span&gt;&lt;br/&gt;
 CPU: 23&lt;br/&gt;
 STATE: TASK_RUNNING (PANIC)&lt;/p&gt;

&lt;p&gt;crash&amp;gt; kmem -i&lt;br/&gt;
 PAGES TOTAL PERCENTAGE&lt;br/&gt;
 TOTAL MEM 32900006 125.5 GB ----&lt;br/&gt;
 FREE 131353 513.1 MB 0% of TOTAL MEM&lt;br/&gt;
 USED 32768653 125 GB 99% of TOTAL MEM&lt;br/&gt;
 SHARED 79 316 KB 0% of TOTAL MEM&lt;br/&gt;
 BUFFERS 0 0 0% of TOTAL MEM&lt;br/&gt;
 CACHED 6497 25.4 MB 0% of TOTAL MEM&lt;br/&gt;
 SLAB 993205 3.8 GB 3% of TOTAL MEM&lt;/p&gt;

&lt;p&gt;TOTAL SWAP 0 0 ----&lt;br/&gt;
 SWAP USED 0 0 100% of TOTAL SWAP&lt;br/&gt;
 SWAP FREE 0 0 0% of TOTAL SWAP&lt;br/&gt;
 crash&amp;gt; bt&lt;br/&gt;
 PID: 5002 TASK: ffff8810154816c0 CPU: 23 COMMAND: &quot;kworker/u64:1&quot;&lt;br/&gt;
 #0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff882028317690&amp;#93;&lt;/span&gt; machine_kexec at ffffffff8104c4eb&lt;br/&gt;
 #1 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8820283176f0&amp;#93;&lt;/span&gt; crash_kexec at ffffffff810e2052&lt;br/&gt;
 #2 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8820283177c0&amp;#93;&lt;/span&gt; panic at ffffffff815fdc31&lt;br/&gt;
 #3 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff882028317840&amp;#93;&lt;/span&gt; out_of_memory at ffffffff8115a96a&lt;br/&gt;
 #4 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8820283178d8&amp;#93;&lt;/span&gt; __alloc_pages_nodemask at ffffffff81160af5&lt;br/&gt;
 #5 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff882028317a10&amp;#93;&lt;/span&gt; dma_generic_alloc_coherent at ffffffff8101981f&lt;br/&gt;
 #6 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff882028317a58&amp;#93;&lt;/span&gt; x86_swiotlb_alloc_coherent at ffffffff810560e1&lt;br/&gt;
 #7 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff882028317a88&amp;#93;&lt;/span&gt; mlx5_dma_zalloc_coherent_node at ffffffffa012607d &lt;span class=&quot;error&quot;&gt;&amp;#91;mlx5_core&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #8 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff882028317ac8&amp;#93;&lt;/span&gt; mlx5_buf_alloc_node at ffffffffa0126627 &lt;span class=&quot;error&quot;&gt;&amp;#91;mlx5_core&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #9 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff882028317b18&amp;#93;&lt;/span&gt; mlx5_buf_alloc at ffffffffa0126755 &lt;span class=&quot;error&quot;&gt;&amp;#91;mlx5_core&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #10 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff882028317b28&amp;#93;&lt;/span&gt; create_kernel_qp at ffffffffa0158903 &lt;span class=&quot;error&quot;&gt;&amp;#91;mlx5_ib&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #11 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff882028317ba0&amp;#93;&lt;/span&gt; create_qp_common at ffffffffa0159236 &lt;span class=&quot;error&quot;&gt;&amp;#91;mlx5_ib&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #12 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff882028317c38&amp;#93;&lt;/span&gt; __create_qp at ffffffffa0159ab1 &lt;span class=&quot;error&quot;&gt;&amp;#91;mlx5_ib&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #13 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff882028317c98&amp;#93;&lt;/span&gt; mlx5_ib_create_qp at ffffffffa015a023 &lt;span class=&quot;error&quot;&gt;&amp;#91;mlx5_ib&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #14 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff882028317cc8&amp;#93;&lt;/span&gt; ib_create_qp at ffffffffa00ed3b2 &lt;span class=&quot;error&quot;&gt;&amp;#91;ib_core&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #15 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff882028317d00&amp;#93;&lt;/span&gt; rdma_create_qp at ffffffffa0549999 &lt;span class=&quot;error&quot;&gt;&amp;#91;rdma_cm&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #16 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff882028317d28&amp;#93;&lt;/span&gt; kiblnd_create_conn at ffffffffa0926747 &lt;span class=&quot;error&quot;&gt;&amp;#91;ko2iblnd&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #17 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff882028317d90&amp;#93;&lt;/span&gt; kiblnd_cm_callback at ffffffffa0934b89 &lt;span class=&quot;error&quot;&gt;&amp;#91;ko2iblnd&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #18 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff882028317df8&amp;#93;&lt;/span&gt; cma_work_handler at ffffffffa054c98c &lt;span class=&quot;error&quot;&gt;&amp;#91;rdma_cm&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #19 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff882028317e20&amp;#93;&lt;/span&gt; process_one_work at ffffffff8108f0bb&lt;br/&gt;
 #20 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff882028317e68&amp;#93;&lt;/span&gt; worker_thread at ffffffff8108fe8b&lt;br/&gt;
 #21 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff882028317ec8&amp;#93;&lt;/span&gt; kthread at ffffffff8109726f&lt;br/&gt;
 #22 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff882028317f50&amp;#93;&lt;/span&gt; ret_from_fork at ffffffff81614158&lt;/p&gt;

&lt;p&gt;There are a lot of zombie connections on the list:&lt;/p&gt;

&lt;p&gt;crash&amp;gt; p kiblnd_data.kib_connd_zombies&lt;br/&gt;
 $48 = {&lt;br/&gt;
 next = 0xffff881fac9ed418, &lt;br/&gt;
 prev = 0xffff8810aae96818&lt;br/&gt;
 }&lt;br/&gt;
 crash&amp;gt; list 0xffff881fac9ed418 | wc -l&lt;br/&gt;
 122060&lt;br/&gt;
 crash&amp;gt;&lt;/p&gt;

&lt;p&gt;All the connections have ibc_state = 0x5;&lt;br/&gt;
 120688 of them have ibc_comms_error = 0xfffffffb (-5, EIO Input/output error) and the other 1372 have ibc_comms_error = 0.&lt;/p&gt;
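
&lt;p&gt;(These fields can be read per connection in crash; a minimal sketch, assuming the kib_conn field names from o2iblnd.h and using conn addresses taken from the debug trace shown below:)&lt;/p&gt;

&lt;p&gt;crash&amp;gt; struct kib_conn.ibc_state,ibc_comms_error ffff8810b37a6000&lt;br/&gt;
 crash&amp;gt; struct kib_conn.ibc_state,ibc_comms_error ffff88109266f600&lt;/p&gt;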

&lt;p&gt;We can see some faulted connections in the Lustre debug trace:&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@neel121 127.0.0.1-2015.09.23-09:54:45&amp;#93;&lt;/span&gt;# grep kiblnd_rx_complete lustre.log&lt;br/&gt;
 00000800:00000100:18.0:1442994842.103700:0:4513:0:(o2iblnd_cb.c:491:kiblnd_rx_complete()) Rx from JO.BOO.BZP.LW@o2ib3 failed: 5&lt;br/&gt;
 00000800:00000200:18.0:1442994842.103701:0:4513:0:(o2iblnd_cb.c:537:kiblnd_rx_complete()) rx ffff881080c31000 conn ffff8810b37a6000&lt;br/&gt;
 00000800:00000100:23.0:1442994846.067198:0:4517:0:(o2iblnd_cb.c:491:kiblnd_rx_complete()) Rx from JO.BOB.BZP.BLP@o2ib30 failed: 5&lt;br/&gt;
 00000800:00000200:23.0:1442994846.067199:0:4517:0:(o2iblnd_cb.c:537:kiblnd_rx_complete()) rx ffff8810819cc000 conn ffff88109266f600&lt;br/&gt;
 00000800:00000100:18.0:1442994863.480144:0:4511:0:(o2iblnd_cb.c:491:kiblnd_rx_complete()) Rx from JO.BOO.BZZ.FL@o2ib3 failed: 5&lt;br/&gt;
 00000800:00000200:18.0:1442994863.480144:0:4511:0:(o2iblnd_cb.c:537:kiblnd_rx_complete()) rx ffff881085047000 conn ffff8810b31ccc00&lt;/p&gt;

&lt;p&gt;I don&apos;t understand why so many connections get an EIO error, but that explains the memory leak...&lt;/p&gt;

&lt;p&gt;The routers work fine if we start all the routers before starting Lustre on the clients.&lt;br/&gt;
 The issue is reproducible only if we start the second router after real production has started.&lt;/p&gt;

&lt;p&gt;I found &lt;del&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5718&quot; title=&quot;RDMA too fragmented with router&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5718&quot;&gt;&lt;del&gt;LU-5718&lt;/del&gt;&lt;/a&gt;&lt;/del&gt; in the Intel Lustre JIRA database. Could you confirm whether &lt;del&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5718&quot; title=&quot;RDMA too fragmented with router&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5718&quot;&gt;&lt;del&gt;LU-5718&lt;/del&gt;&lt;/a&gt;&lt;/del&gt; could help with this issue?&lt;/p&gt;

&lt;p&gt;Lustre versions:&lt;br/&gt;
 For clients and routers:&lt;br/&gt;
 lustre-modules-2.7.0-3.10.0_229.7.2.el7.x86_64_1.el7.Bull.0.005.20150727.x86_64.rpm&lt;br/&gt;
 For servers:&lt;br/&gt;
 lustre-modules_H-2.5.3.90-2.6.32_573.1.1.el6.Bull.80.x86_64_Bull.4.113.el6.20150731.x86_64.rpm&lt;/p&gt;

&lt;p&gt;Lustre configuration&lt;/p&gt;

&lt;p&gt;Router:&lt;br/&gt;
 networks.conf&lt;br/&gt;
 LNET_OPTIONS=&apos;networks=o2ib3(ib0),o2ib30(ib2),o2ib2(ib1.8110),o2ib20(ib3.8111)&apos;&lt;br/&gt;
 routers.conf&lt;br/&gt;
 LNET_ROUTER_OPTIONS=&apos;forwarding=&quot;enabled&quot;&apos;&lt;/p&gt;

&lt;p&gt;Client:&lt;br/&gt;
 networks.conf&lt;br/&gt;
 LNET_OPTIONS=&apos;o2ib3(ib0),o2ib30(ib0:1)&apos;&lt;br/&gt;
 routers.conf&lt;br/&gt;
 LNET_ROUTER_OPTIONS=&apos;routes=&quot;o2ib2 JO.BOO.184.&lt;span class=&quot;error&quot;&gt;&amp;#91;121-122&amp;#93;&lt;/span&gt;@o2ib3;o2ib20 JO.BOB.184.&lt;span class=&quot;error&quot;&gt;&amp;#91;121-122&amp;#93;&lt;/span&gt;@o2ib30&quot; dead_router_check_interval=59 live_router_check_interval=107 check_routers_before_use=1&apos;&lt;/p&gt;

&lt;p&gt;Server:&lt;br/&gt;
 networks.conf&lt;br/&gt;
 LNET_OPTIONS=&apos;o2ib2(ib0.8110),o2ib20(ib0.8111)&apos;&lt;br/&gt;
 routers.conf&lt;br/&gt;
 LNET_ROUTER_OPTIONS=&apos;routes=&quot;o2ib3 QQ.P.BBO.&lt;span class=&quot;error&quot;&gt;&amp;#91;121-122&amp;#93;&lt;/span&gt;@o2ib2;o2ib30 QQ.P.BBB.&lt;span class=&quot;error&quot;&gt;&amp;#91;121-122&amp;#93;&lt;/span&gt;@o2ib30&quot; dead_router_check_interval=59 live_router_check_interval=107 check_routers_before_use=1&apos;&lt;/p&gt;
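
&lt;p&gt;For reference, a minimal sketch of how the server settings above would map onto standard lnet module options (e.g. in an /etc/modprobe.d file; the LNET_OPTIONS/LNET_ROUTER_OPTIONS variables are wrappers from our distribution, so the exact file name may differ):&lt;/p&gt;

&lt;p&gt;options lnet networks=&quot;o2ib2(ib0.8110),o2ib20(ib0.8111)&quot; routes=&quot;o2ib3 QQ.P.BBO.[121-122]@o2ib2; o2ib30 QQ.P.BBB.[121-122]@o2ib30&quot; dead_router_check_interval=59 live_router_check_interval=107 check_routers_before_use=1&lt;/p&gt;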

&lt;p&gt;On the server side there are a lot of other routes that I did not report in LNET_ROUTER_OPTIONS,&lt;br/&gt;
 and the IB configuration on the server IB network also uses PKEYs.&lt;/p&gt;</description>
                <environment>RHEL 7, mlx5, EDR and Connect-IB</environment>
        <key id="33008">LU-7390</key>
            <summary>Router memory leak if we start a new router on an operational configuration</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="ashehata">Amir Shehata</assignee>
                                    <reporter username="apercher">Antoine Percher</reporter>
                        <labels>
                    </labels>
                <created>Thu, 5 Nov 2015 09:47:53 +0000</created>
                <updated>Wed, 28 Jun 2017 21:17:07 +0000</updated>
                                            <version>Lustre 2.7.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>14</watches>
                                                                            <comments>
                            <comment id="132731" author="bfaccini" created="Thu, 5 Nov 2015 14:59:22 +0000"  >&lt;p&gt;Since I have also been involved on this problem when being on-site, I can complete the whole problem description by adding what occurs on Clients after the problem on the new router.&lt;/p&gt;

&lt;p&gt;So first of all, it seems that the problem can be reproduced simply by &lt;span class=&quot;error&quot;&gt;&amp;#91;re-&amp;#93;&lt;/span&gt;starting one of the LNet routers in the config.&lt;/p&gt;

&lt;p&gt;When this occurs, the router quite quickly triggers an OOM situation, which is likely caused by the huge number of allocations in the LNet/ko2iblnd layers seen in the Lustre debug trace, possibly corresponding to numerous buffers in the IB layer&#8230;&lt;/p&gt;

&lt;p&gt;Then the clients fall into a situation where Lustre cannot be fully shut down: the filesystem can be unmounted OK, but lustre_rmmod then stalls because the underlying rmmod is stuck with the following stack trace:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;__schedule()
schedule()
schedule_timeout()
__down_common()
__down()
lnet_router_checker_stop()
LNetNIFini()
ptlrpc_ni_fini()
ptlrpc_exit_portals()
cleanup_module()
sys_delete_module()
system_call_fastpath()
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;while at the same time the &#171; router_checker &#187; thread is stuck with the following stack trace:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;__schedule()
schedule()
schedule_timeout()
lnet_prune_rc_data()
lnet_router_checker()
kthread()
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;At the same time, the &#171; Waiting for rc buffers to unlink\n &#187; message is repeatedly printed on the console of the clients where the &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre_&amp;#93;&lt;/span&gt;rmmod is stuck.&lt;/p&gt;

&lt;p&gt;It is also interesting that this same behavior can occur even if the new router being started has first been dynamically removed from the clients&apos; LNet config using the &quot;lnetctl route del &lt;span class=&quot;error&quot;&gt;&amp;#91;--net,--gateway&amp;#93;&lt;/span&gt;&quot; commands, as sketched below.&lt;/p&gt;
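
&lt;p&gt;As a sketch, the removal commands were of the following form (the gateway NIDs here are placeholders taken from the client routes config above, one per configured network):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lnetctl route del --net o2ib2 --gateway JO.BOO.184.122@o2ib3
lnetctl route del --net o2ib20 --gateway JO.BOB.184.122@o2ib30
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;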

&lt;p&gt;Amir has already suggested running the &quot;lctl net down&quot; command before the lustre_rmmod command to see if it helps, but the site currently has no more dedicated test slots to give it a try.&lt;/p&gt;
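
&lt;p&gt;The suggested sequence on a client would be something like the following (a sketch, still to be validated when a test slot becomes available):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;umount -a -t lustre    # unmount the filesystem first
lctl net down          # shut LNet down before unloading modules
lustre_rmmod
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;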

&lt;p&gt;Lastly, I got an interesting update from the site about their current network config/cabling which may be of interest:&lt;br/&gt;
	_ each client/server has only one IB board/attachment, but with 2 IP aliases (for 2 different networks) on it;&lt;br/&gt;
	_ each router has 2 IB boards/attachments on each (client, server) side, both connected to the same fabric but each with an IP address in a different client/server network.&lt;/p&gt;</comment>
                            <comment id="132760" author="bruno.travouillon" created="Thu, 5 Nov 2015 18:06:50 +0000"  >&lt;p&gt;FTR, we hit a similar issue with RHEL6/OFED3.12/Lustre 2.5.3.90 on some OSSs.&lt;/p&gt;

&lt;p&gt;_ each OSS has 2 IB boards/attachments, both connected to the same fabric but each with an IP address in a different client network.&lt;br/&gt;
_ each client has only one IB board/attachment, but with 2 IP aliases (for 2 different networks) on it.&lt;/p&gt;

&lt;p&gt;The memory leak happened during production. We were not trying to failover the OSTs or stopping the Lustre filesystem.&lt;/p&gt;

&lt;p&gt;During my research, I found &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6251&quot; title=&quot;Melanox / O2ib lnd cause a OOM on OST node&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6251&quot;&gt;&lt;del&gt;LU-6251&lt;/del&gt;&lt;/a&gt; in which Alexey pointed out the patch &lt;a href=&quot;http://review.whamcloud.com/#/c/14600/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/14600/&lt;/a&gt; to fix the memory leak.&lt;/p&gt;

&lt;p&gt;I have not been able to reproduce yet, nor to test the proposed patch.&lt;/p&gt;</comment>
                            <comment id="132763" author="jgmitter" created="Thu, 5 Nov 2015 18:15:43 +0000"  >&lt;p&gt;Hi Amir,&lt;br/&gt;
Can you investigate this issue?&lt;br/&gt;
Thanks.&lt;br/&gt;
Joe&lt;/p&gt;</comment>
                            <comment id="132794" author="ashehata" created="Thu, 5 Nov 2015 21:16:27 +0000"  >&lt;p&gt;I agree with Bruno. When a router is started it&apos;ll get many connection requests, which could exploit the issue fixed in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5718&quot; title=&quot;RDMA too fragmented with router&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5718&quot;&gt;&lt;del&gt;LU-5718&lt;/del&gt;&lt;/a&gt;. The fact that there is so many connection on the connd zombie list seem to point to this issue. &lt;/p&gt;

&lt;p&gt;Could you apply this patch and see if it resolves the issue?&lt;br/&gt;
&lt;a href=&quot;http://review.whamcloud.com/#/c/14600&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/14600&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="132896" author="ashehata" created="Fri, 6 Nov 2015 21:04:09 +0000"  >&lt;p&gt;After discussing it internally, it seems that we&apos;re seeing this OOM issues on multiple different sites. All sites are using mlx5 stack. Is it possible to roll back to mlx4 on the routers?&lt;/p&gt;</comment>
                            <comment id="132904" author="simmonsja" created="Fri, 6 Nov 2015 21:54:46 +0000"  >&lt;p&gt;Not if you have a Connect-IB card. Mind you most of systems use mlx5 and we don&apos;t see this problem.&lt;/p&gt;</comment>
                            <comment id="132956" author="bruno.travouillon" created="Sun, 8 Nov 2015 10:39:59 +0000"  >&lt;p&gt;We have ConnectX-4 and Connect-IB cards on the routers, so we are stuck with mlx5.&lt;/p&gt;

&lt;p&gt;The similar memory leaks reported in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6251&quot; title=&quot;Melanox / O2ib lnd cause a OOM on OST node&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6251&quot;&gt;&lt;del&gt;LU-6251&lt;/del&gt;&lt;/a&gt; indicate that it happens with mlx4 as well. I&apos;ve seen this memory leak with both mlx drivers (mlx5 on these RHEL7/MLNX_OFED/Lustre 2.7 routers, mlx4 on the RHEL6/OFED3.12/Lustre 2.5.3.90 OSSs).&lt;/p&gt;

&lt;p&gt;As per your request, our engineering should apply &lt;a href=&quot;http://review.whamcloud.com/#/c/14600&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/14600&lt;/a&gt; quickly to see if it can solve the issue.&lt;/p&gt;</comment>
                            <comment id="133082" author="doug" created="Mon, 9 Nov 2015 23:59:35 +0000"  >&lt;p&gt;Does this system have the patch to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3322&quot; title=&quot;ko2iblnd support for different map_on_demand and peer_credits between systems&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3322&quot;&gt;&lt;del&gt;LU-3322&lt;/del&gt;&lt;/a&gt; applied?  If so, it was a &quot;partial&quot; patch and could cause OOM situations.  See patch &lt;a href=&quot;http://review.whamcloud.com/#/c/17074/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/17074/&lt;/a&gt; for the necessary addition.&lt;/p&gt;</comment>
                            <comment id="133428" author="pichong" created="Fri, 13 Nov 2015 10:11:40 +0000"  >&lt;p&gt;No, the Lustre version installed on this system does not include the patch to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3322&quot; title=&quot;ko2iblnd support for different map_on_demand and peer_credits between systems&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3322&quot;&gt;&lt;del&gt;LU-3322&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="136765" author="hornc" created="Thu, 17 Dec 2015 20:32:37 +0000"  >&lt;p&gt;FWIW, we (Cray) seem to be hitting this issue as well and the patch &lt;a href=&quot;http://review.whamcloud.com/#/c/14600&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/14600&lt;/a&gt; did not resolve the issue.&lt;/p&gt;</comment>
                            <comment id="136788" author="doug" created="Thu, 17 Dec 2015 22:08:30 +0000"  >&lt;p&gt;Patch 14600 has been &quot;reinvented&quot; as: &lt;a href=&quot;http://review.whamcloud.com/#/c/17661&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/17661&lt;/a&gt;.  This is new and needs validation.  I need to spend some time to determine if it can address this ticket.  However, if you have time, please remove 14600 and apply 17661 and see if this addresses your problem.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="33736">LU-7569</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                                        </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10490" key="com.atlassian.jira.plugin.system.customfieldtypes:datepicker">
                        <customfieldname>End date</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>Fri, 8 Jan 2016 09:47:53 +0000</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                            <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzxsan:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                        <customfield id="customfield_10493" key="com.atlassian.jira.plugin.system.customfieldtypes:datepicker">
                        <customfieldname>Start date</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>Thu, 5 Nov 2015 09:47:53 +0000</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                    </customfields>
    </item>
</channel>
</rss>