<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:54:59 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-5841] Lustre 2.4.2 MDS, hitting OOM errors </title>
                <link>https://jira.whamcloud.com/browse/LU-5841</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Lustre 2.4.2, MDS reports OOM:&lt;/p&gt;

&lt;p&gt;Please see attached logs.&lt;/p&gt;</description>
                <environment>Linux meerkat-mds-10-1.local 2.6.32-358.23.2.el6_lustre.x86_64 #1 SMP Thu Dec 19 19:57:45 PST 2013 x86_64 x86_64 x86_64 GNU/Linux</environment>
        <key id="27409">LU-5841</key>
            <summary>Lustre 2.4.2 MDS, hitting OOM errors </summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="3">Duplicate</resolution>
                                        <assignee username="niu">Niu Yawei</assignee>
                                    <reporter username="haisong">Haisong Cai</reporter>
                        <labels>
                            <label>sdsc</label>
                    </labels>
                <created>Sat, 1 Nov 2014 19:01:31 +0000</created>
                <updated>Mon, 21 Nov 2016 02:41:50 +0000</updated>
                            <resolved>Mon, 21 Nov 2016 02:41:50 +0000</resolved>
                                    <version>Lustre 2.4.2</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="98171" author="pjones" created="Mon, 3 Nov 2014 13:30:33 +0000"  >&lt;p&gt;Niu&lt;/p&gt;

&lt;p&gt;Could you please advise on this one?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="98203" author="adilger" created="Mon, 3 Nov 2014 18:39:47 +0000"  >&lt;p&gt;This might be related to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5726&quot; title=&quot;MDS buffer not freed when deleting files&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5726&quot;&gt;&lt;del&gt;LU-5726&lt;/del&gt;&lt;/a&gt;?  There were a couple of other MDS memory issues (&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5079&quot; title=&quot;conf-sanity test_47 timeout&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5079&quot;&gt;&lt;del&gt;LU-5079&lt;/del&gt;&lt;/a&gt;, &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5727&quot; title=&quot;MDS OOMs with 2.5.3 clients and lru_size != 0&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5727&quot;&gt;&lt;del&gt;LU-5727&lt;/del&gt;&lt;/a&gt;) but they only affected 2.5 and later, unless you have backported patches to your 2.4.2 release?&lt;/p&gt;</comment>
                            <comment id="98206" author="haisong" created="Mon, 3 Nov 2014 18:50:28 +0000"  >&lt;p&gt;Andreas,&lt;/p&gt;

&lt;p&gt;It does look very similar.&lt;/p&gt;

&lt;p&gt;thanks,&lt;br/&gt;
Haisong&lt;/p&gt;</comment>
                            <comment id="98247" author="haisong" created="Tue, 4 Nov 2014 00:43:29 +0000"  >&lt;p&gt;Right now I have a MDS server that looks like is heading to a memory problem.&lt;/p&gt;

&lt;p&gt;== Here is &quot;top&quot; output&lt;/p&gt;

&lt;p&gt;Tasks: 820 total,   3 running, 817 sleeping,   0 stopped,   0 zombie&lt;br/&gt;
Cpu(s):  0.0%us,  7.3%sy,  0.0%ni, 91.4%id,  1.2%wa,  0.0%hi,  0.0%si,  0.0%st&lt;br/&gt;
Mem:  24730000k total, 19445820k used,  5284180k free, 16473344k buffers&lt;br/&gt;
Swap:  1020116k total,    16056k used,  1004060k free,    12956k cached&lt;/p&gt;

&lt;p&gt;  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND                                                                                 &lt;br/&gt;
 3305 root      20   0     0    0    0 R 99.9  0.0 276:03.80 socknal_sd00_00                                                                          &lt;br/&gt;
 3314 root      20   0     0    0    0 S  4.0  0.0  98:28.48 socknal_sd03_00                                                                          &lt;br/&gt;
 3960 root      20   0     0    0    0 S  1.7  0.0   4:25.62 mdt_rdpg03_003                                                                           &lt;br/&gt;
18062 root      20   0     0    0    0 S  1.3  0.0   0:47.64 mdt03_018                                                                                &lt;br/&gt;
 3428 root      20   0     0    0    0 S  1.0  0.0   9:56.10 mdt03_001                                                                                &lt;br/&gt;
 3429 root      20   0     0    0    0 S  1.0  0.0  11:26.84 mdt03_002                                                                                &lt;br/&gt;
 3708 root      20   0     0    0    0 S  1.0  0.0  11:32.89 mdt03_005                                                                                &lt;br/&gt;
 6209 root      20   0     0    0    0 S  1.0  0.0  10:31.10 mdt03_007                                                                                &lt;br/&gt;
16559 root      20   0     0    0    0 S  1.0  0.0   3:36.86 mdt03_013                                                                                &lt;br/&gt;
16746 root      20   0     0    0    0 S  1.0  0.0   3:35.11 mdt03_014                                                                                &lt;br/&gt;
 3427 root      20   0     0    0    0 S  0.7  0.0  11:14.18 mdt03_000                                                                                &lt;br/&gt;
 3641 root      20   0     0    0    0 S  0.7  0.0  10:59.76 mdt03_003                                                                                &lt;br/&gt;
 3703 root      20   0     0    0    0 S  0.7  0.0   9:50.36 mdt03_004                                                                                &lt;br/&gt;
 7181 root      20   0     0    0    0 S  0.7  0.0   9:41.32 mdt03_009                                                                                &lt;br/&gt;
 8921 root      20   0     0    0    0 S  0.7  0.0   7:57.07 mdt03_012                                                                                &lt;br/&gt;
18061 root      20   0     0    0    0 S  0.7  0.0   0:52.57 mdt03_017                                                                                &lt;br/&gt;
18405 root      20   0 15560 1832  940 R  0.7  0.0   0:00.13 top                                                                                      &lt;br/&gt;
  234 root      39  19     0    0    0 S  0.3  0.0   9:27.27 kipmi0                                                                                   &lt;br/&gt;
 3187 root      20   0     0    0    0 S  0.3  0.0  92:39.94 md0_raid10                                                                               &lt;br/&gt;
 3306 root      20   0     0    0    0 S  0.3  0.0  48:36.17 socknal_sd00_01                                                                          &lt;br/&gt;
 3309 root      20   0     0    0    0 S  0.3  0.0  17:21.89 socknal_sd01_01                                                                          &lt;br/&gt;
 3339 root      20   0     0    0    0 S  0.3  0.0   1:30.66 ptlrpcd_15                                                                               &lt;br/&gt;
 6214 root      20   0     0    0    0 S  0.3  0.0  14:40.83 mdt01_011         &lt;br/&gt;
...&lt;/p&gt;

&lt;p&gt;== 2 socknal_sd* processes are hanging:&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@meerkat-mds-10-1 tmp&amp;#93;&lt;/span&gt;# ps -ef | grep 3314&lt;br/&gt;
root      3314     2  3 Nov01 ?        01:38:28 &lt;span class=&quot;error&quot;&gt;&amp;#91;socknal_sd03_00&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@meerkat-mds-10-1 tmp&amp;#93;&lt;/span&gt;# ps -ef | grep 3305&lt;br/&gt;
root      3305     2  8 Nov01 ?        04:36:15 &lt;span class=&quot;error&quot;&gt;&amp;#91;socknal_sd00_00&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;== dmesg of MDS shows clients, as well as OSS servers,  timing out:&lt;/p&gt;

&lt;p&gt;LustreError: 138-a: meerkat-MDT0000: A client on nid 192.168.230.53@tcp was evicted due to a lock blocking callback time out: rc -107&lt;br/&gt;
LustreError: 3630:0:(ldlm_lockd.c:2348:ldlm_cancel_handler()) ldlm_cancel from 192.168.230.53@tcp arrived at 1415059768 with bad export cookie 459840027438824761&lt;br/&gt;
Lustre: meerkat-MDT0000: Client e016f72b-cc4a-cee3-5faa-cdb0f5a24764 (at 10.7.102.192@o2ib) reconnecting&lt;br/&gt;
Lustre: Skipped 11 previous similar messages&lt;br/&gt;
Lustre: meerkat-MDT0000: Client d1fdcea8-c2ec-897d-75dc-7b3fe95da5a3 (at 10.7.102.119@o2ib) refused reconnection, still busy with 1 active RPCs&lt;br/&gt;
Lustre: meerkat-MDT0000: Client 677d7d3a-37ea-920c-c096-20a623186fa9 (at 10.7.103.114@o2ib) reconnecting&lt;br/&gt;
Lustre: Skipped 21 previous similar messages&lt;br/&gt;
LustreError: 13042:0:(ldlm_lockd.c:391:waiting_locks_callback()) ### lock callback timer expired after 100s: evicting client at 192.168.230.53@tcp  ns: mdt-meerkat-MDT0000_UUID lock: ffff8800acc2f480/0x661ae11b70ceedf lrc: 3/0,0 mode: PR/PR res: &lt;span class=&quot;error&quot;&gt;&amp;#91;0x2000060bc:0x5508:0x0&amp;#93;&lt;/span&gt;.0 bits 0x2 rrc: 2 type: IBT flags: 0x20 nid: 192.168.230.53@tcp remote: 0x7185dbd0be13beea expref: 36 pid: 3419 timeout: 4486388246 lvb_type: 0&lt;br/&gt;
LustreError: 13042:0:(ldlm_lib.c:2730:target_bulk_io()) @@@ bulk PUT failed: rc &lt;del&gt;107  req@ffff8800a3513800 x1474469876758132/t0(0) o37&lt;/del&gt;&amp;gt;453cd0d9-c8e3-0e50-da9e-7953a9c89205@192.168.230.53@tcp:0/0 lens 448/440 e 0 to 0 dl 1415060499 ref 1 fl Interpret:/0/0 rc 0/0&lt;br/&gt;
Lustre: 3328:0:(client.c:1868:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: &lt;span class=&quot;error&quot;&gt;&amp;#91;sent 1415060429/real 1415060430&amp;#93;&lt;/span&gt;  req@ffff88031321c800 x1483597813199484/t0(0) o13-&amp;gt;meerkat-OST001c-osc@172.25.32.115@tcp:7/4 lens 224/368 e 0 to 1 dl 1415060438 ref 1 fl Rpc:X/0/ffffffff rc 0/-1&lt;br/&gt;
Lustre: meerkat-OST001c-osc: Connection to meerkat-OST001c (at 172.25.32.115@tcp) was lost; in progress operations using this service will wait for recovery to complete&lt;br/&gt;
Lustre: Skipped 2 previous similar messages&lt;br/&gt;
Lustre: 3325:0:(client.c:1868:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: &lt;span class=&quot;error&quot;&gt;&amp;#91;sent 1415060429/real 1415060430&amp;#93;&lt;/span&gt;  req@ffff880636df9400 x1483597813199476/t0(0) o13-&amp;gt;meerkat-OST0026-osc@172.25.32.243@tcp:7/4 lens 224/368 e 0 to 1 dl 1415060438 ref 1 fl Rpc:X/0/ffffffff rc 0/-1&lt;br/&gt;
Lustre: meerkat-OST0026-osc: Connection to meerkat-OST0026 (at 172.25.32.243@tcp) was lost; in progress operations using this service will wait for recovery to complete&lt;br/&gt;
Lustre: 3336:0:(client.c:1868:ptlrpc_expire_one_request()) @@@ Request sent has timed out for sent delay: &lt;span class=&quot;error&quot;&gt;&amp;#91;sent 1415060432/real 0&amp;#93;&lt;/span&gt;  req@ffff8800b8211400 x1483597813201928/t0(0) o6-&amp;gt;meerkat-OST0034-osc@172.25.32.115@tcp:28/4 lens 664/432 e 0 to 1 dl 1415060440 ref 2 fl Rpc:X/0/ffffffff rc 0/-1&lt;br/&gt;
Lustre: meerkat-OST0034-osc: Connection to meerkat-OST0034 (at 172.25.32.115@tcp) was lost; in progress operations using this service will wait for recovery to complete&lt;br/&gt;
Lustre: MGS: Client 340f1fa1-9370-bc71-a6e3-834f520374a2 (at 10.7.103.181@o2ib) reconnecting&lt;br/&gt;
Lustre: Skipped 7 previous similar messages&lt;br/&gt;
Lustre: meerkat-OST0014-osc: Connection to meerkat-OST0014 (at 172.25.32.115@tcp) was lost; in progress operations using this service will wait for recovery to complete&lt;br/&gt;
Lustre: Skipped 5 previous similar messages&lt;br/&gt;
Lustre: 3325:0:(client.c:1868:ptlrpc_expire_one_request()) @@@ Request sent has timed out for sent delay: &lt;span class=&quot;error&quot;&gt;&amp;#91;sent 1415060432/real 0&amp;#93;&lt;/span&gt;  req@ffff8800aa611400 x1483597813202612/t0(0) o6-&amp;gt;meerkat-OST000e-osc@172.25.32.243@tcp:28/4 lens 664/432 e 0 to 1 dl 1415060443 ref 2 fl Rpc:X/0/ffffffff rc 0/-1&lt;br/&gt;
Lustre: 3325:0:(client.c:1868:ptlrpc_expire_one_request()) Skipped 82 previous similar messages&lt;br/&gt;
Lustre: 3319:0:(client.c:1868:ptlrpc_expire_one_request()) @@@ Request sent has timed out for sent delay: &lt;span class=&quot;error&quot;&gt;&amp;#91;sent 1415060440/real 0&amp;#93;&lt;/span&gt;  req@ffff8801efc24800 x1483597813206052/t0(0) o8-&amp;gt;meerkat-OST0036-osc@172.25.32.243@tcp:28/4 lens 400/544 e 0 to 1 dl 1415060447 ref 2 fl Rpc:XN/0/ffffffff rc 0/-1&lt;br/&gt;
Lustre: 3319:0:(client.c:1868:ptlrpc_expire_one_request()) Skipped 52 previous similar messages&lt;br/&gt;
Lustre: meerkat-MDT0000: Client 06bc4379-10ca-76ad-cd98-1d1013f1b911 (at 10.7.103.252@o2ib) refused reconnection, still busy with 1 active RPCs&lt;br/&gt;
Lustre: 18052:0:(client.c:1868:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: &lt;span class=&quot;error&quot;&gt;&amp;#91;sent 1415060449/real 1415060451&amp;#93;&lt;/span&gt;  req@ffff8802ebaad800 x1483597813206540/t0(0) o104-&amp;gt;meerkat-MDT0000@10.7.103.252@o2ib:15/16 lens 296/224 e 0 to 1 dl 1415060456 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1&lt;br/&gt;
Lustre: 18052:0:(client.c:1868:ptlrpc_expire_one_request()) Skipped 18 previous similar messages&lt;br/&gt;
Lustre: meerkat-OST0004-osc: Connection restored to meerkat-OST0004 (at 172.25.32.115@tcp)&lt;br/&gt;
Lustre: Skipped 2 previous similar messages&lt;br/&gt;
Lustre: meerkat-OST0006-osc: Connection restored to meerkat-OST0006 (at 172.25.32.243@tcp)&lt;br/&gt;
Lustre: meerkat-OST000c-osc: Connection restored to meerkat-OST000c (at 172.25.32.115@tcp)&lt;br/&gt;
Lustre: meerkat-MDT0000: Client 7a7ab9a5-c8e6-abb6-2f14-1ebe9b1fdab3 (at 10.7.104.32@o2ib) reconnecting&lt;br/&gt;
Lustre: Skipped 218 previous similar messages&lt;br/&gt;
Lustre: 3325:0:(client.c:1868:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: &lt;span class=&quot;error&quot;&gt;&amp;#91;sent 1415060911/real 1415060921&amp;#93;&lt;/span&gt;  req@ffff8800a7a9fc00 x1483597814687272/t0(0) o6-&amp;gt;meerkat-OST000c-osc@172.25.32.115@tcp:28/4 lens 664/432 e 0 to 1 dl 1415060925 ref 1 fl Rpc:X/0/ffffffff rc 0/-1&lt;br/&gt;
Lustre: meerkat-OST000c-osc: Connection to meerkat-OST000c (at 172.25.32.115@tcp) was lost; in progress operations using this service will wait for recovery to complete&lt;br/&gt;
Lustre: Skipped 7 previous similar messages&lt;br/&gt;
Lustre: meerkat-OST003c-osc: Connection to meerkat-OST003c (at 172.25.32.115@tcp) was lost; in progress operations using this service will wait for recovery to complete&lt;br/&gt;
Lustre: meerkat-OST000c-osc: Connection restored to meerkat-OST000c (at 172.25.32.115@tcp)&lt;br/&gt;
Lustre: Skipped 13 previous similar messages&lt;br/&gt;
LustreError: 11-0: meerkat-OST0006-osc: Communicating with 172.25.32.243@tcp, operation ost_connect failed with -16.&lt;br/&gt;
LustreError: Skipped 2 previous similar messages&lt;br/&gt;
Lustre: meerkat-OST0024-osc: Connection restored to meerkat-OST0024 (at 172.25.32.115@tcp)&lt;br/&gt;
Lustre: Skipped 6 previous similar messages&lt;br/&gt;
Lustre: meerkat-MDT0000: Client 22670471-b57b-0d1a-cd38-f4f39735b005 (at 10.7.103.146@o2ib) reconnecting&lt;br/&gt;
Lustre: Skipped 34 previous similar messages&lt;/p&gt;

&lt;p&gt;== kill -9 of processes 3305 and 3314 (socknal_sd00_00 &amp;amp; socknal_sd03_00) fails&lt;/p&gt;
</comment>
                            <comment id="98250" author="niu" created="Tue, 4 Nov 2014 01:26:59 +0000"  >&lt;p&gt;Hi, Cai&lt;/p&gt;

&lt;p&gt;What&apos;s the total memory of MDS and the value of min_free_kbytes? Could you try to increase the min_free_kbytes as I suggested in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5726&quot; title=&quot;MDS buffer not freed when deleting files&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5726&quot;&gt;&lt;del&gt;LU-5726&lt;/del&gt;&lt;/a&gt; to see if it helps? Thanks.&lt;/p&gt;</comment>
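                            <!--
                            A minimal shell sketch of the min_free_kbytes tuning Niu refers to above,
                            assuming the "about 5% of total RAM" rule of thumb from the LU-5726 discussion;
                            the exact value is site-specific.

                                # compute roughly 5% of total RAM, in kilobytes
                                total_kb=$(awk '/^MemTotal:/ {print $2}' /proc/meminfo)
                                target_kb=$(( total_kb / 20 ))

                                # apply it at runtime
                                sysctl -w vm.min_free_kbytes=${target_kb}

                                # persist it across reboots
                                echo "vm.min_free_kbytes = ${target_kb}" >> /etc/sysctl.conf
                            -->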
                            <comment id="98255" author="haisong" created="Tue, 4 Nov 2014 02:56:48 +0000"  >&lt;p&gt;Hi Yawei,&lt;/p&gt;

&lt;p&gt;Here is the min_free_kbytes setting, about 5% of total RAM as you suggested.&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@meerkat-mds-10-1 ~&amp;#93;&lt;/span&gt;# sysctl -a | grep free_kbytes&lt;br/&gt;
vm.min_free_kbytes = 1228800&lt;br/&gt;
vm.extra_free_kbytes = 0&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@meerkat-mds-10-1 ~&amp;#93;&lt;/span&gt;# free&lt;br/&gt;
             total       used       free     shared    buffers     cached&lt;br/&gt;
Mem:      24730000   22800600    1929400          0   20016692      15880&lt;br/&gt;
-/+ buffers/cache:    2768028   21961972&lt;br/&gt;
Swap:      1020116      24828     995288&lt;/p&gt;

&lt;p&gt;Haisong&lt;/p&gt;

</comment>
                            <comment id="98285" author="haisong" created="Tue, 4 Nov 2014 16:53:05 +0000"  >&lt;p&gt;Overnight into this morning, MDS has dumped more errors and some of the messages I haven&apos;t seen before.&lt;br/&gt;
I am include dmesg output here for debugging purpose. &lt;/p&gt;

&lt;p&gt;Haisong&lt;/p&gt;</comment>
                            <comment id="98301" author="haisong" created="Tue, 4 Nov 2014 18:33:09 +0000"  >
&lt;p&gt;The MDS reached a point where it became unresponsive, with the system load at 65 and buffer memory at 20GB out of the 24GB total that it wouldn&apos;t release.&lt;br/&gt;
I attempted to unmount the MDT and reboot the MDS, at which point the server kernel panicked.&lt;/p&gt;

&lt;p&gt;Screen dump attached here.&lt;/p&gt;</comment>
                            <comment id="98465" author="rmohr" created="Wed, 5 Nov 2014 20:15:50 +0000"  >&lt;p&gt;Do you have vm.zone_reclaim_mode=0 set on your MDS server?  I ran into issues with sluggish MDS server performance earlier this year that were fixed by setting that parameter.&lt;/p&gt;</comment>
                            <comment id="98484" author="haisong" created="Wed, 5 Nov 2014 22:54:10 +0000"  >
&lt;p&gt;Rick,&lt;/p&gt;

&lt;p&gt;Thank you for the note. I saw your comments in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5726&quot; title=&quot;MDS buffer not freed when deleting files&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5726&quot;&gt;&lt;del&gt;LU-5726&lt;/del&gt;&lt;/a&gt; today and have disabled vm.zone_reclaim_mode.&lt;br/&gt;
In &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5726&quot; title=&quot;MDS buffer not freed when deleting files&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5726&quot;&gt;&lt;del&gt;LU-5726&lt;/del&gt;&lt;/a&gt;, you commented on disabling vm.zone_reclaim_mode  &quot;... just took longer for the same underlying problem to become evident again&quot;. Had the problem reoccurring in your MDS?&lt;/p&gt;

&lt;p&gt;thanks,&lt;br/&gt;
Haisong &lt;/p&gt;</comment>
                            <comment id="98496" author="niu" created="Thu, 6 Nov 2014 01:26:56 +0000"  >&lt;p&gt;Hi, Haisong&lt;/p&gt;

&lt;p&gt;The log &amp;amp; stack trace show that the server ran into an OOM situation at the end, and that the initial cause is an unstable network. We can see lots of client reconnects and bulk IO timeout errors on the MDT at the beginning; could you check whether your network is healthy?&lt;/p&gt;

&lt;p&gt;The last crash in lu_context_key_degister() is a dup of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3806&quot; title=&quot;separate SESSION FLAG for server and client stack.&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3806&quot;&gt;&lt;del&gt;LU-3806&lt;/del&gt;&lt;/a&gt;, I think.&lt;/p&gt;</comment>
                            <comment id="98505" author="haisong" created="Thu, 6 Nov 2014 04:37:51 +0000"  >&lt;p&gt;Hi Yawei,&lt;/p&gt;

&lt;p&gt;The typical symptoms of this problem, in our case at least, have been hanging processes, whether LNET, MDT, or MGC. Not only do processes hang; a lot of the time the MDS OS itself would hang for a few minutes at a time. What you are seeing, I believe, are the results of some hanging LNET or Lustre network processes, followed by disconnections from the OSS/OST servers and clients.&lt;/p&gt;

&lt;p&gt;We have implemented the suggestion from Rick Mohr by disabling vm.zone_reclaim_mode on the MDS. So far the MDS has been behaving. We will continue monitoring.&lt;/p&gt;

&lt;p&gt;thanks,&lt;br/&gt;
Haisong&lt;/p&gt;</comment>
                            <comment id="98726" author="adilger" created="Sat, 8 Nov 2014 23:02:46 +0000"  >&lt;p&gt;Haisong, to clarify, you are now running your MDS with &lt;tt&gt;vm.zone_reclaim_mode=0&lt;/tt&gt; and that has resolved, or at least reduced the memory problems?&lt;/p&gt;

&lt;p&gt;We should consider setting this tunable by default on MDS nodes via mount.lustre, as we do with other tunables. There is some concern that this would go against the tunings of the administrator, and I&apos;m not sure how to best handle that...&lt;/p&gt;</comment>
                            <comment id="98743" author="haisong" created="Mon, 10 Nov 2014 04:57:58 +0000"  >&lt;p&gt;Andreas,&lt;/p&gt;

&lt;p&gt;Indeed, we have had vm.zone_reclaim_mode=0 set on our MDS servers since last Wednesday. From observation&lt;br/&gt;
using &quot;collectl -sM&quot;, there are two noticeable changes:&lt;/p&gt;

&lt;p&gt;1) buffer memory doesn&apos;t grow like it used to, and&lt;br/&gt;
2) used memory is balanced between the 2 CPU nodes, whereas before one was 2 or 3 times higher than the other.&lt;/p&gt;

&lt;p&gt;Here is a sample output I got just now:&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@meerkat-mds-10-2 ~&amp;#93;&lt;/span&gt;# collectl -sM -i 10&lt;br/&gt;
waiting for 10 second sample...&lt;/p&gt;

&lt;p&gt;# MEMORY STATISTICS&lt;br/&gt;
# Node    Total     Used     Free     Slab   Mapped     Anon   Locked    Inact   Hit%&lt;br/&gt;
     0   12279M   10422M    1856M    2458M    3140K   41836K        0    3528M 100.00&lt;br/&gt;
     1   12288M    9831M    2456M    3529M    2988K   33116K        0    2768M 100.00&lt;br/&gt;
     0   12279M   10422M    1856M    2458M    3140K   41840K        0    3528M 100.00&lt;br/&gt;
     1   12288M    9832M    2455M    3529M    2988K   33112K        0    2767M 100.00&lt;br/&gt;
     0   12279M   10422M    1856M    2457M    3048K   41836K        0    3528M 100.00&lt;br/&gt;
     1   12288M    9833M    2454M    3530M    2988K   33004K        0    2767M 100.00&lt;br/&gt;
     0   12279M   10423M    1855M    2458M    3140K   41844K        0    3528M 100.00&lt;br/&gt;
     1   12288M    9835M    2452M    3532M    2988K   33108K        0    2768M 100.00&lt;/p&gt;



&lt;p&gt;Haisong &lt;/p&gt;</comment>
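                            <!--
                            A short sketch of other ways to watch per-NUMA-node memory like the
                            collectl -sM output above; numastat comes from the numactl package and
                            the 10-second interval is just the example used in the comment.

                                # per-node meminfo-style counters (MB)
                                numastat -m

                                # raw per-node counters from sysfs
                                cat /sys/devices/system/node/node*/meminfo

                                # sample every 10 seconds, as in the comment above
                                collectl -sM -i 10
                            -->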
                            <comment id="98825" author="rmohr" created="Mon, 10 Nov 2014 23:00:31 +0000"  >&lt;p&gt;Haisong,&lt;/p&gt;

&lt;p&gt;Disabling zone_reclaim_mode seemed to fix our original issue with sluggish MDS performance, although I really don&apos;t know if this is in any way directly related to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5726&quot; title=&quot;MDS buffer not freed when deleting files&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5726&quot;&gt;&lt;del&gt;LU-5726&lt;/del&gt;&lt;/a&gt; or not.&lt;/p&gt;</comment>
                            <comment id="105989" author="niu" created="Fri, 6 Feb 2015 02:32:03 +0000"  >&lt;p&gt;Could you try if the fix of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5726&quot; title=&quot;MDS buffer not freed when deleting files&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5726&quot;&gt;&lt;del&gt;LU-5726&lt;/del&gt;&lt;/a&gt; can resolve your problem as well? Thanks.&lt;/p&gt;</comment>
                            <comment id="174412" author="niu" created="Mon, 21 Nov 2016 02:41:50 +0000"  >&lt;p&gt;Dup of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5726&quot; title=&quot;MDS buffer not freed when deleting files&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5726&quot;&gt;&lt;del&gt;LU-5726&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="26970">LU-5726</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="31065">LU-6842</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="16306" name="Screenshot-meerkat-1104.png" size="31647" author="haisong" created="Tue, 4 Nov 2014 18:32:56 +0000"/>
                            <attachment id="16294" name="debug_kernel.1101.gz" size="233" author="haisong" created="Sat, 1 Nov 2014 19:01:31 +0000"/>
                            <attachment id="16304" name="dmesg-meerkat-11-04" size="508793" author="haisong" created="Tue, 4 Nov 2014 16:52:59 +0000"/>
                            <attachment id="16295" name="dmesg.1101" size="485514" author="haisong" created="Sat, 1 Nov 2014 19:01:31 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10490" key="com.atlassian.jira.plugin.system.customfieldtypes:datepicker">
                        <customfieldname>End date</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>Tue, 14 Jul 2015 19:01:31 +0000</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                            <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzwzzb:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>16369</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                        <customfield id="customfield_10493" key="com.atlassian.jira.plugin.system.customfieldtypes:datepicker">
                        <customfieldname>Start date</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>Mon, 3 Nov 2014 19:01:31 +0000</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                    </customfields>
    </item>
</channel>
</rss>