<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:39:04 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
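<!--
Editor's note: a concrete request example for the 'field' restriction described
above; the issue-xml view base URL is an assumption about the standard JIRA XML
view and is shown for illustration only.

curl 'https://jira.whamcloud.com/si/jira.issueviews:issue-xml/LU-4033/LU-4033.xml?field=key&field=summary'
-->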
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-4033] Failure on test suite parallel-scale-nfsv4 test_iorssf: MDS oom</title>
                <link>https://jira.whamcloud.com/browse/LU-4033</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This issue was created by maloo for sarah &amp;lt;sarah@whamcloud.com&amp;gt;&lt;/p&gt;

&lt;p&gt;This issue relates to the following test suite run: &lt;a href=&quot;http://maloo.whamcloud.com/test_sets/960b8b64-2915-11e3-b598-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://maloo.whamcloud.com/test_sets/960b8b64-2915-11e3-b598-52540035b04c&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;The sub-test test_iorssf failed with the following error:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;test failed to respond and timed out&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;MDS console&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;17:14:54:ptlrpcd_0: page allocation failure. order:1, mode:0x40
17:14:55:Pid: 2780, comm: ptlrpcd_0 Not tainted 2.6.32-358.18.1.el6_lustre.x86_64 #1
17:14:56:Call Trace:
17:14:57: [&amp;lt;ffffffff8112c257&amp;gt;] ? __alloc_pages_nodemask+0x757/0x8d0
17:14:58: [&amp;lt;ffffffff81166d92&amp;gt;] ? kmem_getpages+0x62/0x170
17:14:59: [&amp;lt;ffffffff811679aa&amp;gt;] ? fallback_alloc+0x1ba/0x270
17:14:59: [&amp;lt;ffffffff811673ff&amp;gt;] ? cache_grow+0x2cf/0x320
17:14:59: [&amp;lt;ffffffff81167729&amp;gt;] ? ____cache_alloc_node+0x99/0x160
17:14:59: [&amp;lt;ffffffffa0538ed7&amp;gt;] ? LNetMDAttach+0x157/0x5a0 [lnet]
17:14:59: [&amp;lt;ffffffff811684f9&amp;gt;] ? __kmalloc+0x189/0x220
17:14:59: [&amp;lt;ffffffffa0538ed7&amp;gt;] ? LNetMDAttach+0x157/0x5a0 [lnet]
17:15:00: [&amp;lt;ffffffffa0771b35&amp;gt;] ? ptlrpc_register_bulk+0x265/0x9d0 [ptlrpc]
17:15:00: [&amp;lt;ffffffffa0773a12&amp;gt;] ? ptl_send_rpc+0x232/0xc40 [ptlrpc]
17:15:00: [&amp;lt;ffffffff81281b74&amp;gt;] ? snprintf+0x34/0x40
17:15:01: [&amp;lt;ffffffffa0488761&amp;gt;] ? libcfs_debug_msg+0x41/0x50 [libcfs]
17:15:01: [&amp;lt;ffffffffa07685f4&amp;gt;] ? ptlrpc_send_new_req+0x454/0x790 [ptlrpc]
17:15:02: [&amp;lt;ffffffffa076c368&amp;gt;] ? ptlrpc_check_set+0x888/0x1b40 [ptlrpc]
17:15:02: [&amp;lt;ffffffffa079801b&amp;gt;] ? ptlrpcd_check+0x53b/0x560 [ptlrpc]
17:15:03: [&amp;lt;ffffffffa079853b&amp;gt;] ? ptlrpcd+0x20b/0x370 [ptlrpc]
17:15:03: [&amp;lt;ffffffff81063410&amp;gt;] ? default_wake_function+0x0/0x20
17:15:03: [&amp;lt;ffffffffa0798330&amp;gt;] ? ptlrpcd+0x0/0x370 [ptlrpc]
17:15:03: [&amp;lt;ffffffff81096a36&amp;gt;] ? kthread+0x96/0xa0
17:15:03: [&amp;lt;ffffffff8100c0ca&amp;gt;] ? child_rip+0xa/0x20
17:15:04: [&amp;lt;ffffffff810969a0&amp;gt;] ? kthread+0x0/0xa0
17:15:04: [&amp;lt;ffffffff8100c0c0&amp;gt;] ? child_rip+0x0/0x20
17:15:06:Mem-Info:
17:15:06:Node 0 DMA per-cpu:
17:15:06:CPU    0: hi:    0, btch:   1 usd:   0
17:15:06:Node 0 DMA32 per-cpu:
17:15:06:CPU    0: hi:  186, btch:  31 usd:  42
17:15:06:active_anon:2345 inactive_anon:2732 isolated_anon:0
17:15:07: active_file:110430 inactive_file:238985 isolated_file:0
17:15:07: unevictable:0 dirty:3 writeback:0 unstable:0
17:15:07: free:14257 slab_reclaimable:7260 slab_unreclaimable:76976
17:15:07: mapped:2551 shmem:41 pagetables:794 bounce:0
17:15:08:Node 0 DMA free:8264kB min:332kB low:412kB high:496kB active_anon:0kB inactive_anon:0kB active_file:272kB inactive_file:5444kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15324kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:36kB slab_unreclaimable:1700kB kernel_stack:16kB pagetables:0kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
17:15:08:lowmem_reserve[]: 0 2003 2003 2003
17:15:09:Node 0 DMA32 free:48764kB min:44720kB low:55900kB high:67080kB active_anon:9380kB inactive_anon:10928kB active_file:441448kB inactive_file:950496kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:2052064kB mlocked:0kB dirty:12kB writeback:0kB mapped:10204kB shmem:164kB slab_reclaimable:29004kB slab_unreclaimable:306204kB kernel_stack:1984kB pagetables:3176kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
17:15:09:lowmem_reserve[]: 0 0 0 0
17:15:10:Node 0 DMA: 58*4kB 104*8kB 102*16kB 42*32kB 6*64kB 2*128kB 2*256kB 2*512kB 0*1024kB 1*2048kB 0*4096kB = 8264kB
17:15:11:Node 0 DMA32: 10659*4kB 2*8kB 2*16kB 2*32kB 2*64kB 0*128kB 1*256kB 1*512kB 1*1024kB 0*2048kB 1*4096kB = 48764kB
17:15:11:269122 total pagecache pages
17:15:11:28 pages in swap cache
17:15:11:Swap cache stats: add 62, delete 34, find 18/22
17:15:11:Free swap  = 4128648kB
17:15:12:Total swap = 4128760kB
17:15:12:524284 pages RAM
17:15:12:43669 pages reserved
17:15:13:282260 pages shared
17:15:13:194054 pages non-shared
17:15:14:LNetError: 2780:0:(lib-lnet.h:457:lnet_md_alloc()) LNET: out of memory at /var/lib/jenkins/workspace/lustre-master/arch/x86_64/build_type/server/distro/el6/ib_stack/inkernel/BUILD/BUILD/lustre-2.4.93/lnet/include/lnet/lib-lnet.h:457 (tried to alloc &apos;(md)&apos; = 4208)
17:15:14:LNetError: 2780:0:(lib-lnet.h:457:lnet_md_alloc()) LNET: 55064047 total bytes allocated by lnet
17:15:15:LustreError: 2780:0:(niobuf.c:376:ptlrpc_register_bulk()) lustre-OST0002-osc-ffff88006f296400: LNetMDAttach failed x1447417177531472/0: rc = -12
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment>server and client: lustre-master build # 1687&lt;br/&gt;
client is running SLES11 SP2</environment>
        <key id="21211">LU-4033</key>
            <summary>Failure on test suite parallel-scale-nfsv4 test_iorssf: MDS oom</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="3">Duplicate</resolution>
                                        <assignee username="bogl">Bob Glossman</assignee>
                                    <reporter username="maloo">Maloo</reporter>
                        <labels>
                    </labels>
                <created>Mon, 30 Sep 2013 23:21:32 +0000</created>
                <updated>Wed, 21 May 2014 19:02:27 +0000</updated>
                            <resolved>Thu, 13 Feb 2014 22:05:33 +0000</resolved>
                                    <version>Lustre 2.5.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>9</watches>
                    <comments>
                            <comment id="68020" author="sarah" created="Tue, 1 Oct 2013 02:50:58 +0000"  >&lt;p&gt;parallel-scale-nfsv3 test_connectathon failed with similar error&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/d951fa2a-2915-11e3-b598-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/d951fa2a-2915-11e3-b598-52540035b04c&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;mds dmesg:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;ptlrpcd_1: page allocation failure. order:1, mode:0x40
Pid: 2773, comm: ptlrpcd_1 Not tainted 2.6.32-358.18.1.el6_lustre.x86_64 #1
Call Trace:
 [&amp;lt;ffffffff8112c257&amp;gt;] ? __alloc_pages_nodemask+0x757/0x8d0
 [&amp;lt;ffffffff81166d92&amp;gt;] ? kmem_getpages+0x62/0x170
 [&amp;lt;ffffffff811679aa&amp;gt;] ? fallback_alloc+0x1ba/0x270
 [&amp;lt;ffffffff811673ff&amp;gt;] ? cache_grow+0x2cf/0x320
 [&amp;lt;ffffffff81167729&amp;gt;] ? ____cache_alloc_node+0x99/0x160
 [&amp;lt;ffffffffa0538ed7&amp;gt;] ? LNetMDAttach+0x157/0x5a0 [lnet]
 [&amp;lt;ffffffff811684f9&amp;gt;] ? __kmalloc+0x189/0x220
 [&amp;lt;ffffffffa0538ed7&amp;gt;] ? LNetMDAttach+0x157/0x5a0 [lnet]
 [&amp;lt;ffffffffa0771b35&amp;gt;] ? ptlrpc_register_bulk+0x265/0x9d0 [ptlrpc]
 [&amp;lt;ffffffffa0773a12&amp;gt;] ? ptl_send_rpc+0x232/0xc40 [ptlrpc]
 [&amp;lt;ffffffff81281b74&amp;gt;] ? snprintf+0x34/0x40
 [&amp;lt;ffffffffa0488761&amp;gt;] ? libcfs_debug_msg+0x41/0x50 [libcfs]
 [&amp;lt;ffffffffa07685f4&amp;gt;] ? ptlrpc_send_new_req+0x454/0x790 [ptlrpc]
 [&amp;lt;ffffffffa076c368&amp;gt;] ? ptlrpc_check_set+0x888/0x1b40 [ptlrpc]
 [&amp;lt;ffffffffa079801b&amp;gt;] ? ptlrpcd_check+0x53b/0x560 [ptlrpc]
 [&amp;lt;ffffffffa079853b&amp;gt;] ? ptlrpcd+0x20b/0x370 [ptlrpc]
 [&amp;lt;ffffffff81063410&amp;gt;] ? default_wake_function+0x0/0x20
 [&amp;lt;ffffffffa0798330&amp;gt;] ? ptlrpcd+0x0/0x370 [ptlrpc]
 [&amp;lt;ffffffff81096a36&amp;gt;] ? kthread+0x96/0xa0
 [&amp;lt;ffffffff8100c0ca&amp;gt;] ? child_rip+0xa/0x20
 [&amp;lt;ffffffff810969a0&amp;gt;] ? kthread+0x0/0xa0
 [&amp;lt;ffffffff8100c0c0&amp;gt;] ? child_rip+0x0/0x20
Mem-Info:
Node 0 DMA per-cpu:
CPU    0: hi:    0, btch:   1 usd:   0
Node 0 DMA32 per-cpu:
CPU    0: hi:  186, btch:  31 usd: 100
active_anon:2425 inactive_anon:2637 isolated_anon:0
 active_file:18661 inactive_file:310952 isolated_file:0
 unevictable:0 dirty:24 writeback:0 unstable:0
 free:18497 slab_reclaimable:4305 slab_unreclaimable:95385
 mapped:2054 shmem:41 pagetables:790 bounce:0
Node 0 DMA free:8276kB min:332kB low:412kB high:496kB active_anon:0kB inactive_anon:0kB active_file:92kB inactive_file:5752kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15324kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:32kB slab_unreclaimable:1572kB kernel_stack:8kB pagetables:0kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 2003 2003 2003
Node 0 DMA32 free:65712kB min:44720kB low:55900kB high:67080kB active_anon:9700kB inactive_anon:10548kB active_file:74552kB inactive_file:1238056kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:2052064kB mlocked:0kB dirty:96kB writeback:0kB mapped:8216kB shmem:164kB slab_reclaimable:17188kB slab_unreclaimable:379968kB kernel_stack:2040kB pagetables:3160kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
Node 0 DMA: 25*4kB 16*8kB 23*16kB 10*32kB 3*64kB 2*128kB 5*256kB 3*512kB 2*1024kB 1*2048kB 0*4096kB = 8276kB
Node 0 DMA32: 12818*4kB 329*8kB 116*16kB 87*32kB 20*64kB 0*128kB 1*256kB 1*512kB 1*1024kB 0*2048kB 1*4096kB = 65712kB
329671 total pagecache pages
0 pages in swap cache
Swap cache stats: add 0, delete 0, find 0/0
Free swap  = 4128760kB
Total swap = 4128760kB
524284 pages RAM
43669 pages reserved
339843 pages shared
128622 pages non-shared
LNetError: 2773:0:(lib-lnet.h:457:lnet_md_alloc()) LNET: out of memory at /var/lib/jenkins/workspace/lustre-master/arch/x86_64/build_type/server/distro/el6/ib_stack/inkernel/BUILD/BUILD/lustre-2.4.93/lnet/include/lnet/lib-lnet.h:457 (tried to alloc &apos;(md)&apos; = 4192)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="68060" author="jlevi" created="Tue, 1 Oct 2013 16:11:43 +0000"  >&lt;p&gt;Bob,&lt;br/&gt;
Could you have a look at this one please?&lt;br/&gt;
Thank you!&lt;/p&gt;</comment>
                            <comment id="68062" author="adilger" created="Tue, 1 Oct 2013 16:21:54 +0000"  >&lt;p&gt;We need to get some information from the MDS, like /proc/slabinfo and &quot;lctl get_param memused&quot; to determine where the memory is being used.  This should be saved every minute or so, since it will not be able to collect this info when the MDS node runs out of memory.&lt;/p&gt;</comment>
                            <comment id="68109" author="bogl" created="Tue, 1 Oct 2013 20:33:15 +0000"  >&lt;p&gt;A couple of items of info gleaned from discussion with sarah&lt;/p&gt;

&lt;p&gt;1) This is a new failure.  Only seen during testing on the 2.4.93 tag.  Not seen in earlier tests.&lt;/p&gt;

&lt;p&gt;2) Going to try to reproduce it with SLES11 SP3 clients to see if it happens there too.&lt;/p&gt;</comment>
                            <comment id="68173" author="bogl" created="Wed, 2 Oct 2013 17:38:55 +0000"  >&lt;p&gt;Seen only on full tests, not review.&lt;br/&gt;
Seen only after several other test failures, including ones with client side alloc failures.  Suspect other failed tests aren&apos;t cleaning up after themselves nicely.&lt;/p&gt;</comment>
                            <comment id="68193" author="adilger" created="Wed, 2 Oct 2013 19:57:10 +0000"  >&lt;p&gt;Bob, though this still concerns me if either the client or the MDS are getting OOM problems.  There shouldn&apos;t be a client workload or failure condition that causes the MDS to run out of memory.  The fact that it is only seen under full tests and not review just means that we don&apos;t have as much testing for each patch as we really need, and doesn&apos;t at all imply that this isn&apos;t a problem that will be hit under normal usage.  We reformat and restart our filesystems so often under review testing that it doesn&apos;t really resemble a normal workload for the servers.&lt;/p&gt;</comment>
                            <comment id="68194" author="bogl" created="Wed, 2 Oct 2013 20:05:10 +0000"  >&lt;p&gt;Andreas, I agree with you.  Shouldn&apos;t be seeing alloc failures in clients or servers with any reasonable kind of load.  Shouldn&apos;t see effects persisting on MDS after client side failures.&lt;/p&gt;

&lt;p&gt;So far haven&apos;t been able to reproduce this failure on a small scale using only VMs. No free nodes available on Toro to test at bigger scale at the moment.  No loadable profiles for SLES11 SP2 or SP3 on Rosso, so I can&apos;t test there.&lt;/p&gt;</comment>
                            <comment id="68202" author="sarah" created="Wed, 2 Oct 2013 22:14:14 +0000"  >&lt;p&gt;I ran iorssf two times with SLES11 SP3 client, cannot reproduce:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/5f03fc72-2ba0-11e3-a203-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/5f03fc72-2ba0-11e3-a203-52540035b04c&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Do I need to run the full test suite on SLES11 SP3?&lt;/p&gt;</comment>
                            <comment id="68204" author="bogl" created="Wed, 2 Oct 2013 22:21:20 +0000"  >&lt;p&gt;I hate to waste the time, but I think we do need to make best effort to do the same test sequence that failed in SP2 to see if it reproduces in SP3.  So yes, please run the full test suite.&lt;/p&gt;

&lt;p&gt;The fact that the single test alone passed repeatedly is a good indication that it by itself isn&apos;t the direct cause of resource exhaustion on the MDS.  I&apos;m betting that if you just ran this test alone on SP2 a few times it would pass too.&lt;/p&gt;

&lt;p&gt;I&apos;m worried that there may be a slow memory leak somewhere that we are hitting only in the case of a full test and a particular sequence of failures in different places.  Not sure how we go about tracking down such a thing.&lt;/p&gt;</comment>
                            <comment id="68230" author="bogl" created="Thu, 3 Oct 2013 13:40:54 +0000"  >&lt;p&gt;I&apos;m wondering if this bug may be related to the leak behavior Andreas is looking at in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4053&quot; title=&quot;client leaking objects/locks during IO&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4053&quot;&gt;&lt;del&gt;LU-4053&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="68267" author="adilger" created="Thu, 3 Oct 2013 16:56:44 +0000"  >&lt;p&gt;Running dbench for 1h on a single-node test setup in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4053&quot; title=&quot;client leaking objects/locks during IO&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4053&quot;&gt;&lt;del&gt;LU-4053&lt;/del&gt;&lt;/a&gt; provides a very similar memory usage profile.  We can focus &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4053&quot; title=&quot;client leaking objects/locks during IO&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4053&quot;&gt;&lt;del&gt;LU-4053&lt;/del&gt;&lt;/a&gt; for the client-side memory usage, and this bug can focus on the MDS-side memory usage.&lt;/p&gt;

&lt;p&gt;After deleting all of the files in the filesystem and unmounting the client (freeing all of the client-side locks and slabs), the MDS still has a huge number of objects allocated.  During the test run there were never more than about 1000 files in existence, but a relatively large number of files were created and deleted.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;  OBJS ACTIVE  USE OBJ SIZE  SLABS OBJ/SLAB CACHE SIZE NAME                   
289460 132395  45%    0.19K  14473       20     57892K size-192
203463 179354  88%    0.10K   5499       37     21996K buffer_head
128894 127934  99%    0.11K   3791       34     15164K lod_obj
128880 127934  99%    0.08K   2685       48     10740K mdd_obj
128869 127934  99%    0.28K   9913       13     39652K mdt_obj
 71760  50205  69%    0.19K   3588       20     14352K dentry
  1491   1176  78%    1.03K    497        3      1988K ldiskfs_inode_cache
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;After leaving the system idle overnight, there are still a large number of objects, but some have been freed:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;  OBJS ACTIVE  USE OBJ SIZE  SLABS OBJ/SLAB CACHE SIZE NAME                   
221280  91056  41%    0.19K  11064       20     44256K size-192
203463 147927  72%    0.10K   5499       37     21996K buffer_head
 94944  86590  91%    0.08K   1978       48      7912K mdd_obj
 94418  86590  91%    0.11K   2777       34     11108K lod_obj
 93106  86590  93%    0.28K   7162       13     28648K mdt_obj
   987    955  96%    1.03K    329        3      1316K ldiskfs_inode_cache
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;It doesn&apos;t make sense to me that the MDT is caching objects that have been deleted.  This is just causing useless memory pressure, even if they are eventually freed.&lt;/p&gt;</comment>
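<!--
Editor's note: a rough way to pull the object slabs discussed above out of
/proc/slabinfo and estimate their footprint; assumes the standard slabinfo 2.x
column layout (name, active_objs, num_objs, objsize, ...).

grep -E 'mdt_obj|mdd_obj|lod_obj|size-192|ldiskfs_inode_cache' /proc/slabinfo |
    awk '{ printf "%-24s %8d active objs  %8.1f MB\n", $1, $2, $2 * $4 / 1048576 }'
-->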
                            <comment id="68330" author="sarah" created="Thu, 3 Oct 2013 21:36:41 +0000"  >&lt;p&gt;I just pushed a for test only patch to run full suite with SLES11 SP3 client:&lt;br/&gt;
&lt;a href=&quot;http://review.whamcloud.com/#/c/7688/1&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/7688/1&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="68372" author="bogl" created="Fri, 4 Oct 2013 15:22:34 +0000"  >&lt;p&gt;I&apos;m wondering if there&apos;s a low rate flaw in the reference counting of objects.  It occurs to me that all it would take is one seldom used code path with one too many xxx_object_get() calls or one too few xxx_object_put() calls and the reference count will never drop to 0.  If the reference count is wrong that could explain never freeing objects.  Over time the population of unreleased objects would grow.&lt;/p&gt;</comment>
                            <comment id="68384" author="green" created="Fri, 4 Oct 2013 16:57:21 +0000"  >&lt;p&gt;I think it&apos;s still not related to any real object leakage, otherwise we would have noticed that by other means.&lt;br/&gt;
Most likely it&apos;s just like LU-4053: since 2.3 we no longer proactively clear inodes from the cache.&lt;/p&gt;</comment>
                            <comment id="68386" author="adilger" created="Fri, 4 Oct 2013 17:07:00 +0000"  >&lt;p&gt;This bug is intended to track the problem with MDS-side objects not being freed (mdd_obj, lod_obj, mdt_obj slabs).  The &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4053&quot; title=&quot;client leaking objects/locks during IO&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4053&quot;&gt;&lt;del&gt;LU-4053&lt;/del&gt;&lt;/a&gt; ticket is to track the client-side CLIO objects not being freed.&lt;/p&gt;

&lt;p&gt;I think there is just something wrong in the MDS stack, in that it is not destroying the whole lu_obj (or whatever?) when an object is unlinked; the object is only freed at unmount time, or possibly very slowly under memory pressure.  It doesn&apos;t make any sense to keep objects in memory for FIDs that have been deleted.&lt;/p&gt;</comment>
                            <comment id="68389" author="bzzz" created="Fri, 4 Oct 2013 17:22:50 +0000"  >&lt;p&gt;I did a simple test:&lt;/p&gt;

&lt;ol&gt;
	&lt;li&gt;grep mdt_obj /proc/slabinfo&lt;br/&gt;
mdt_obj               28     28    280   14    1 : tunables   32   16    8 : slabdata      2      2      0 : globalstat   19938   5040  1413   23 	&lt;/li&gt;
	&lt;li&gt;./createmany -o /mnt/lustre/d0/f 10000&lt;br/&gt;
total: 10000 creates in 2.97 seconds: 3366.59 creates/second&lt;/li&gt;
	&lt;li&gt;grep mdt_obj /proc/slabinfo&lt;br/&gt;
mdt_obj            10038  10038    280   14    1 : tunables   32   16    8 : slabdata    717    717      0 : globalstat   29948  10038  2128   23 				 &lt;/li&gt;
	&lt;li&gt;./unlinkmany /mnt/lustre/d0/f 10000&lt;/li&gt;
&lt;/ol&gt;
&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;unlinked 0 (time 1380765784 ; total 0 ; last 0)&lt;/li&gt;
&lt;/ul&gt;
&lt;ol start=&quot;5&quot;&gt;
	&lt;li&gt;grep mdt_obj /proc/slabinfo&lt;br/&gt;
mdt_obj              206    280    280   14    1 : tunables   32   16    8 : slabdata     17     20    128 : globalstat   39958  10038  2843   32 				   0    4    0    0    0 : cpustat  37165   2866  37516   2487&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;then, a few dozen seconds later:&lt;/p&gt;
&lt;ol start=&quot;6&quot;&gt;
	&lt;li&gt;grep mdt_obj /proc/slabinfo&lt;br/&gt;
mdt_obj               28     28    280   14    1 : tunables   32   16    8 : slabdata      2      2      0 : globalstat   39958  10038  2843   41 	&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;Probably the MM just retains lots of actually-free pieces in per-cpu caches, or something like that?&lt;/p&gt;</comment>
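<!--
Editor's note: the reproduction above collected into one sketch; createmany and
unlinkmany are the helpers from the Lustre test suite (lustre/tests), the mount
point, directory and file count are illustrative, and a single-node setup is
assumed so the mdt_obj slab can be sampled on the same host.

grep mdt_obj /proc/slabinfo              # baseline object count
./createmany -o /mnt/lustre/d0/f 10000   # create 10000 files
grep mdt_obj /proc/slabinfo              # count grows roughly with the creates
./unlinkmany /mnt/lustre/d0/f 10000      # unlink them all again
grep mdt_obj /proc/slabinfo              # objects linger for a short while
sleep 60
grep mdt_obj /proc/slabinfo              # then drain back towards the baseline
-->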
                            <comment id="68452" author="adilger" created="Sat, 5 Oct 2013 18:33:38 +0000"  >&lt;p&gt;If the VM was just retaining the slabs, then they would not be marked active, I think. Also, near the end of my 1h dbench run there was starting to be some considerable memory pressure on the other slabs, so these should have been shrunk at that time if they were just in percpu cache.&lt;/p&gt;

&lt;p&gt;It may be that a workload different from createmany/unlinkmany is needed?  For example, neither of these operations does a lookup or readdir or a stat, or any number of other combinations. I don&apos;t think a 3600s dbench run is needed, I was just doing that to see if there is a long-term increase in memory use (which there is). Probably even a short run with full +malloc tracing would be enough.&lt;/p&gt;</comment>
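<!--
Editor's note: one possible way to capture the "+malloc" tracing mentioned above,
assuming the standard lctl debug controls; the buffer size, output path and the
workload placeholder are illustrative only.

lctl set_param debug=+malloc     # add allocation/free tracing to the debug mask
lctl set_param debug_mb=512      # enlarge the debug buffer so traces are not lost
lctl clear                       # start from an empty debug log
# run a short dbench (or similar) workload here
lctl dk /tmp/debug.malloc        # dump the kernel debug log for later analysis
-->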
                            <comment id="68453" author="bzzz" created="Sat, 5 Oct 2013 18:42:42 +0000"  >&lt;p&gt;sure, I&apos;ll try to reproduce with dbench.&lt;/p&gt;</comment>
                            <comment id="68464" author="bzzz" created="Sun, 6 Oct 2013 17:31:17 +0000"  >&lt;p&gt;the root cause seems to be remaining dirty pages on the client which cause ENQUEUEs for the layouts which in turn populate MDS cache with lu-objects for already removed files.&lt;/p&gt;</comment>
                            <comment id="68486" author="bzzz" created="Mon, 7 Oct 2013 10:35:37 +0000"  >&lt;p&gt;to clarify a bit.. given OST_DESTROY isn&apos;t executed immediately (but after commit on MDS at least), there is a window when MDT object is destroyed (in cache), but OST objects aren&apos;t. if the kernel decides to flush dirty pages at that point, the client code will try to revalidate layout (which was invalidated by UNLINK). this way we get lu-objects in the memory. they are supposed to be purged at some point (given no access can be made after OST_DESTROY). I&apos;d think this isn&apos;t a big issue, but it&apos;d be cool if we can use layout lock for this purpose as well.&lt;/p&gt;</comment>
                            <comment id="69318" author="bzzz" created="Fri, 18 Oct 2013 18:43:09 +0000"  >&lt;p&gt;Andreas, have a look at the proto: &lt;a href=&quot;http://review.whamcloud.com/#/c/8003/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/8003/&lt;/a&gt; - the idea is to signal the client the file is removed, so the client can reset nlink and let the kernel to drop the inode and the pages. it&apos;s not intended for landing yet, but I&apos;d like to hear your opinion on the approach.&lt;/p&gt;</comment>
                            <comment id="72783" author="sarah" created="Wed, 4 Dec 2013 04:19:43 +0000"  >&lt;p&gt;hit this bug in interop testing between 2.6 server and 2.5 server&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/f380a6a4-5beb-11e3-8bdd-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/f380a6a4-5beb-11e3-8bdd-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                            <outwardlinks description="duplicates">
                                        <issuelink>
            <issuekey id="22373">LU-4357</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="21245">LU-4053</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzw4hj:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>10835</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                </customfields>
    </item>
</channel>
</rss>