<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:35:59 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
<language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-3680] OOM crash: null_alloc_rs()) ASSERTION( rs-&gt;rs_size &gt;= rs_size ) failed</title>
                <link>https://jira.whamcloud.com/browse/LU-3680</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Hit this running sanity in a loop:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;&amp;lt;4&amp;gt;[80900.195000] ldlm_cn01_000: page allocation failure. order:1, mode:0x40
&amp;lt;4&amp;gt;[80900.195002] Pid: 17587, comm: ldlm_cn01_000 Not tainted 2.6.32-rhe6.4-debug #2
&amp;lt;4&amp;gt;[80900.195003] Call Trace:
&amp;lt;4&amp;gt;[80900.195008]  [&amp;lt;ffffffff8112a666&amp;gt;] ? __alloc_pages_nodemask+0x7c6/0x980
&amp;lt;4&amp;gt;[80900.195011]  [&amp;lt;ffffffff811658f2&amp;gt;] ? kmem_getpages+0x62/0x170
&amp;lt;4&amp;gt;[80900.195013]  [&amp;lt;ffffffff8116834a&amp;gt;] ? fallback_alloc+0x1ba/0x270
&amp;lt;4&amp;gt;[80900.195015]  [&amp;lt;ffffffff81167bf7&amp;gt;] ? cache_grow+0x4d7/0x520
&amp;lt;4&amp;gt;[80900.195017]  [&amp;lt;ffffffff81168038&amp;gt;] ? ____cache_alloc_node+0xa8/0x200
&amp;lt;4&amp;gt;[80900.195018]  [&amp;lt;ffffffff81168943&amp;gt;] ? kmem_cache_alloc_trace+0x1c3/0x250
&amp;lt;4&amp;gt;[80900.195029]  [&amp;lt;ffffffffa099cbc5&amp;gt;] ? osd_key_init+0x25/0x4e0 [osd_ldiskfs]
&amp;lt;4&amp;gt;[80900.195035]  [&amp;lt;ffffffffa099cbc5&amp;gt;] ? osd_key_init+0x25/0x4e0 [osd_ldiskfs]
&amp;lt;4&amp;gt;[80900.195060]  [&amp;lt;ffffffffa0bdd27f&amp;gt;] ? keys_fill+0x6f/0x190 [obdclass]
&amp;lt;4&amp;gt;[80900.195090]  [&amp;lt;ffffffffa0be132e&amp;gt;] ? lu_context_init+0x4e/0x240 [obdclass]
&amp;lt;4&amp;gt;[80900.195109]  [&amp;lt;ffffffffa0be1383&amp;gt;] ? lu_context_init+0xa3/0x240 [obdclass]
&amp;lt;4&amp;gt;[80900.195111]  [&amp;lt;ffffffff811665be&amp;gt;] ? cache_free_debugcheck+0x2ae/0x360
&amp;lt;4&amp;gt;[80900.195130]  [&amp;lt;ffffffffa0be153e&amp;gt;] ? lu_env_init+0x1e/0x30 [obdclass]
&amp;lt;4&amp;gt;[80900.195140]  [&amp;lt;ffffffffa0e3d69a&amp;gt;] ? ofd_lvbo_update+0x7a/0xea8 [ofd]
&amp;lt;4&amp;gt;[80900.195164]  [&amp;lt;ffffffffa04ac434&amp;gt;] ? ldlm_resource_putref+0x1d4/0x280 [ptlrpc]
&amp;lt;4&amp;gt;[80900.195186]  [&amp;lt;ffffffffa04c97b7&amp;gt;] ? ldlm_request_cancel+0x247/0x410 [ptlrpc]
&amp;lt;4&amp;gt;[80900.195206]  [&amp;lt;ffffffffa04c9abd&amp;gt;] ? ldlm_handle_cancel+0x13d/0x240 [ptlrpc]
&amp;lt;4&amp;gt;[80900.195226]  [&amp;lt;ffffffffa04cefb9&amp;gt;] ? ldlm_cancel_handler+0x1e9/0x500 [ptlrpc]
&amp;lt;4&amp;gt;[80900.195250]  [&amp;lt;ffffffffa04ffad1&amp;gt;] ? ptlrpc_server_handle_request+0x3b1/0xc70 [ptlrpc]
&amp;lt;4&amp;gt;[80900.195260]  [&amp;lt;ffffffffa0a2355e&amp;gt;] ? cfs_timer_arm+0xe/0x10 [libcfs]
&amp;lt;4&amp;gt;[80900.195270]  [&amp;lt;ffffffffa0a34b6f&amp;gt;] ? lc_watchdog_touch+0x6f/0x170 [libcfs]
&amp;lt;4&amp;gt;[80900.195340]  [&amp;lt;ffffffffa04f6bb1&amp;gt;] ? ptlrpc_wait_event+0xb1/0x2a0 [ptlrpc]
&amp;lt;4&amp;gt;[80900.195345]  [&amp;lt;ffffffff81054613&amp;gt;] ? __wake_up+0x53/0x70
&amp;lt;4&amp;gt;[80900.195367]  [&amp;lt;ffffffffa0500db2&amp;gt;] ? ptlrpc_main+0xa22/0x1650 [ptlrpc]
&amp;lt;4&amp;gt;[80900.195437]  [&amp;lt;ffffffffa0500390&amp;gt;] ? ptlrpc_main+0x0/0x1650 [ptlrpc]
&amp;lt;4&amp;gt;[80900.195441]  [&amp;lt;ffffffff81094606&amp;gt;] ? kthread+0x96/0xa0
&amp;lt;4&amp;gt;[80900.195444]  [&amp;lt;ffffffff8100c10a&amp;gt;] ? child_rip+0xa/0x20
&amp;lt;4&amp;gt;[80900.195447]  [&amp;lt;ffffffff81094570&amp;gt;] ? kthread+0x0/0xa0
&amp;lt;4&amp;gt;[80900.195448]  [&amp;lt;ffffffff8100c100&amp;gt;] ? child_rip+0x0/0x20
&amp;lt;6&amp;gt;[80900.195449] Mem-Info:
&amp;lt;4&amp;gt;[80900.195450] Node 0 DMA per-cpu:
&amp;lt;4&amp;gt;[80900.195451] CPU    0: hi:    0, btch:   1 usd:   0
&amp;lt;4&amp;gt;[80900.195452] CPU    1: hi:    0, btch:   1 usd:   0
&amp;lt;4&amp;gt;[80900.195453] CPU    2: hi:    0, btch:   1 usd:   0
&amp;lt;4&amp;gt;[80900.195454] CPU    3: hi:    0, btch:   1 usd:   0
&amp;lt;4&amp;gt;[80900.195455] CPU    4: hi:    0, btch:   1 usd:   0
&amp;lt;4&amp;gt;[80900.195456] CPU    5: hi:    0, btch:   1 usd:   0
&amp;lt;4&amp;gt;[80900.195458] CPU    6: hi:    0, btch:   1 usd:   0
&amp;lt;4&amp;gt;[80900.195459] CPU    7: hi:    0, btch:   1 usd:   0
&amp;lt;4&amp;gt;[80900.195459] Node 0 DMA32 per-cpu:
&amp;lt;4&amp;gt;[80900.195460] CPU    0: hi:  186, btch:  31 usd:  51
&amp;lt;4&amp;gt;[80900.195461] CPU    1: hi:  186, btch:  31 usd:  26
&amp;lt;4&amp;gt;[80900.195462] CPU    2: hi:  186, btch:  31 usd:   0
&amp;lt;4&amp;gt;[80900.195463] CPU    3: hi:  186, btch:  31 usd:   0
&amp;lt;4&amp;gt;[80900.195464] CPU    4: hi:  186, btch:  31 usd:  57
&amp;lt;4&amp;gt;[80900.195465] CPU    5: hi:  186, btch:  31 usd: 174
&amp;lt;4&amp;gt;[80900.195466] CPU    6: hi:  186, btch:  31 usd: 162
&amp;lt;4&amp;gt;[80900.195467] CPU    7: hi:  186, btch:  31 usd:  32
&amp;lt;4&amp;gt;[80900.195470] active_anon:61548 inactive_anon:61459 isolated_anon:0
&amp;lt;4&amp;gt;[80900.195470]  active_file:94797 inactive_file:74222 isolated_file:0
&amp;lt;4&amp;gt;[80900.195471]  unevictable:0 dirty:20 writeback:0 unstable:0
&amp;lt;4&amp;gt;[80900.195471]  free:43025 slab_reclaimable:75111 slab_unreclaimable:271092
&amp;lt;4&amp;gt;[80900.195472]  mapped:577 shmem:119300 pagetables:383 bounce:0
&amp;lt;4&amp;gt;[80900.195473] Node 0 DMA free:9692kB min:136kB low:168kB high:204kB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:9296kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? yes
&amp;lt;4&amp;gt;[80900.195478] lowmem_reserve[]: 0 2967 2967 2967
&amp;lt;4&amp;gt;[80900.195479] Node 0 DMA32 free:162408kB min:44916kB low:56144kB high:67372kB active_anon:246192kB inactive_anon:245836kB active_file:379188kB inactive_file:296888kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:3039076kB mlocked:0kB dirty:80kB writeback:0kB mapped:2308kB shmem:477200kB slab_reclaimable:300444kB slab_unreclaimable:1084368kB kernel_stack:3296kB pagetables:1532kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
&amp;lt;4&amp;gt;[80900.195485] lowmem_reserve[]: 0 0 0 0
&amp;lt;4&amp;gt;[80900.195486] Node 0 DMA: 3*4kB 0*8kB 3*16kB 1*32kB 2*64kB 0*128kB 1*256kB 0*512kB 1*1024kB 0*2048kB 2*4096kB = 9692kB
&amp;lt;4&amp;gt;[80900.195490] Node 0 DMA32: 37378*4kB 1032*8kB 32*16kB 1*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 1*4096kB = 162408kB
&amp;lt;4&amp;gt;[80900.195495] 129723 total pagecache pages
&amp;lt;4&amp;gt;[80900.195496] 925 pages in swap cache
&amp;lt;4&amp;gt;[80900.195497] Swap cache stats: add 376365, delete 375440, find 1323456/1328202
&amp;lt;4&amp;gt;[80900.195498] Free swap  = 1869104kB
&amp;lt;4&amp;gt;[80900.195499] Total swap = 2097144kB
&amp;lt;6&amp;gt;[80900.198861] 774396 pages RAM
&amp;lt;6&amp;gt;[80900.198861] 38583 pages reserved
&amp;lt;6&amp;gt;[80900.198861] 11942 pages shared
&amp;lt;6&amp;gt;[80900.198861] 675636 pages non-shared
&amp;lt;4&amp;gt;[80900.226747] 129650 total pagecache pages
&amp;lt;4&amp;gt;[80900.226747] 1136 pages in swap cache
&amp;lt;4&amp;gt;[80900.226747] Swap cache stats: add 376650, delete 375514, find 1323456/1328202
&amp;lt;4&amp;gt;[80900.226747] Free swap  = 1867964kB
&amp;lt;4&amp;gt;[80900.226747] Total swap = 2097144kB
&amp;lt;6&amp;gt;[80900.226747] 774396 pages RAM
&amp;lt;6&amp;gt;[80900.226747] 38583 pages reserved
&amp;lt;6&amp;gt;[80900.226747] 11963 pages shared
&amp;lt;6&amp;gt;[80900.226747] 668761 pages non-shared
&amp;lt;0&amp;gt;[80900.502883] LustreError: 17604:0:(sec_null.c:318:null_alloc_rs()) ASSERTION( rs-&amp;gt;rs_size &amp;gt;= rs_size ) failed: 
&amp;lt;0&amp;gt;[80900.504111] LustreError: 17604:0:(sec_null.c:318:null_alloc_rs()) LBUG
&amp;lt;4&amp;gt;[80900.504782] Pid: 17604, comm: mdt01_002
&amp;lt;4&amp;gt;[80900.505352] 
&amp;lt;4&amp;gt;[80900.505353] Call Trace:
&amp;lt;4&amp;gt;[80900.506312]  [&amp;lt;ffffffffa0a228a5&amp;gt;] libcfs_debug_dumpstack+0x55/0x80 [libcfs]
&amp;lt;4&amp;gt;[80900.507011]  [&amp;lt;ffffffffa0a22ea7&amp;gt;] lbug_with_loc+0x47/0xb0 [libcfs]
&amp;lt;4&amp;gt;[80900.507716]  [&amp;lt;ffffffffa052a382&amp;gt;] null_alloc_rs+0x272/0x390 [ptlrpc]
&amp;lt;4&amp;gt;[80900.508419]  [&amp;lt;ffffffffa0518f19&amp;gt;] sptlrpc_svc_alloc_rs+0x1d9/0x2a0 [ptlrpc]
&amp;lt;4&amp;gt;[80900.509166]  [&amp;lt;ffffffffa04ef218&amp;gt;] lustre_pack_reply_v2+0x98/0x2a0 [ptlrpc]
&amp;lt;4&amp;gt;[80900.509906]  [&amp;lt;ffffffffa04ef4ce&amp;gt;] lustre_pack_reply_flags+0xae/0x1f0 [ptlrpc]
&amp;lt;4&amp;gt;[80900.510935]  [&amp;lt;ffffffffa04ef621&amp;gt;] lustre_pack_reply+0x11/0x20 [ptlrpc]
&amp;lt;4&amp;gt;[80900.511642]  [&amp;lt;ffffffffa0516603&amp;gt;] req_capsule_server_pack+0x53/0x100 [ptlrpc]
&amp;lt;4&amp;gt;[80900.513248]  [&amp;lt;ffffffffa0d472e5&amp;gt;] mdt_getxattr+0x585/0x13c0 [mdt]
&amp;lt;4&amp;gt;[80900.514017]  [&amp;lt;ffffffffa0d2570e&amp;gt;] mdt_intent_getxattr+0x9e/0x160 [mdt]
&amp;lt;4&amp;gt;[80900.514572]  [&amp;lt;ffffffffa0d2265e&amp;gt;] mdt_intent_policy+0x3ae/0x770 [mdt]
&amp;lt;4&amp;gt;[80900.515391]  [&amp;lt;ffffffffa04a735a&amp;gt;] ldlm_lock_enqueue+0x2ea/0x860 [ptlrpc]
&amp;lt;4&amp;gt;[80900.516099]  [&amp;lt;ffffffffa04cfc7f&amp;gt;] ldlm_handle_enqueue0+0x4ef/0x10c0 [ptlrpc]
&amp;lt;4&amp;gt;[80900.518542]  [&amp;lt;ffffffffa0d22b26&amp;gt;] mdt_enqueue+0x46/0xe0 [mdt]
&amp;lt;4&amp;gt;[80900.519098]  [&amp;lt;ffffffffa0d28ca7&amp;gt;] mdt_handle_common+0x647/0x16d0 [mdt]
&amp;lt;4&amp;gt;[80900.519529]  [&amp;lt;ffffffffa0d63335&amp;gt;] mds_regular_handle+0x15/0x20 [mdt]
&amp;lt;4&amp;gt;[80900.519959]  [&amp;lt;ffffffffa04ffad1&amp;gt;] ptlrpc_server_handle_request+0x3b1/0xc70 [ptlrpc]
&amp;lt;4&amp;gt;[80900.520740]  [&amp;lt;ffffffffa0a2355e&amp;gt;] ? cfs_timer_arm+0xe/0x10 [libcfs]
&amp;lt;4&amp;gt;[80900.521190]  [&amp;lt;ffffffffa0a34b6f&amp;gt;] ? lc_watchdog_touch+0x6f/0x170 [libcfs]
&amp;lt;4&amp;gt;[80900.521644]  [&amp;lt;ffffffffa04f6bb1&amp;gt;] ? ptlrpc_wait_event+0xb1/0x2a0 [ptlrpc]
&amp;lt;4&amp;gt;[80900.522146]  [&amp;lt;ffffffff81054613&amp;gt;] ? __wake_up+0x53/0x70
&amp;lt;4&amp;gt;[80900.522670]  [&amp;lt;ffffffffa0500db2&amp;gt;] ptlrpc_main+0xa22/0x1650 [ptlrpc]
&amp;lt;4&amp;gt;[80900.523097]  [&amp;lt;ffffffffa0500390&amp;gt;] ? ptlrpc_main+0x0/0x1650 [ptlrpc]
&amp;lt;4&amp;gt;[80900.523532]  [&amp;lt;ffffffff81094606&amp;gt;] kthread+0x96/0xa0
&amp;lt;4&amp;gt;[80900.526690]  [&amp;lt;ffffffff8100c10a&amp;gt;] child_rip+0xa/0x20
&amp;lt;4&amp;gt;[80900.527224]  [&amp;lt;ffffffff81094570&amp;gt;] ? kthread+0x0/0xa0
&amp;lt;4&amp;gt;[80900.527603]  [&amp;lt;ffffffff8100c100&amp;gt;] ? child_rip+0x0/0x20
&amp;lt;4&amp;gt;[80900.528010] 
&amp;lt;0&amp;gt;[80900.528950] Kernel panic - not syncing: LBUG
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="20139">LU-3680</key>
            <summary>OOM crash: null_alloc_rs()) ASSERTION( rs-&gt;rs_size &gt;= rs_size ) failed</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="green">Oleg Drokin</assignee>
                                    <reporter username="green">Oleg Drokin</reporter>
                        <labels>
                    </labels>
                <created>Thu, 1 Aug 2013 01:25:57 +0000</created>
                <updated>Wed, 16 Mar 2016 05:22:47 +0000</updated>
                            <resolved>Sun, 19 Jan 2014 04:09:44 +0000</resolved>
                                    <version>Lustre 2.5.0</version>
                                    <fixVersion>Lustre 2.6.0</fixVersion>
                    <fixVersion>Lustre 2.5.1</fixVersion>
                    <fixVersion>Lustre 2.4.3</fixVersion>
                    <fixVersion>Lustre 2.8.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>13</watches>
                                                                            <comments>
                            <comment id="63894" author="green" created="Thu, 8 Aug 2013 17:16:34 +0000"  >&lt;p&gt;At the very minimum we need to fix the error handling for this allocation.&lt;/p&gt;</comment>
                            <comment id="64122" author="keith" created="Mon, 12 Aug 2013 19:20:50 +0000"  >&lt;p&gt;Part of the issue is we never check the return state of the call and seem to just pass it along. &lt;/p&gt;

&lt;p&gt;This svc-&amp;gt;srv_ops.so_req_handler(request) should be failing with ENOMEM, but we don&apos;t check and just call ptlrpc_rqphase_move like everything is fine. &lt;br/&gt;
from ptlrpc_server_handle_request&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;        rc = svc-&amp;gt;srv_ops.so_req_handler(request);

        ptlrpc_rqphase_move(request, RQ_PHASE_COMPLETE);

put_conn:

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The only mention of rc to the end of the function is a log message. I will make a patch to spur conversation; it seems we only return 0 or 1 out of this function. &lt;/p&gt;</comment>
                            <comment id="64131" author="keith" created="Mon, 12 Aug 2013 22:39:51 +0000"  >&lt;p&gt;Hmm I am not sure these 2 backtraces are 100% related as they are from different threads, but there is an odd thing I see on the lbug error call chain that confuses me a bit. &lt;/p&gt;


&lt;p&gt;In: &lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;int lustre_pack_reply_v2(struct ptlrpc_request *req, int count,
                         __u32 *lens, char **bufs, int flags)
{   
        struct ptlrpc_reply_state *rs;
        int                        msg_len, rc; 
        ENTRY;
    
        LASSERT(req-&amp;gt;rq_reply_state == NULL);   &amp;lt;====  LASSERT rq_reply_state is NULL

        if ((flags &amp;amp; LPRFL_EARLY_REPLY) == 0) {
                spin_lock(&amp;amp;req-&amp;gt;rq_lock);
                req-&amp;gt;rq_packed_final = 1;
                spin_unlock(&amp;amp;req-&amp;gt;rq_lock);
        }

        msg_len = lustre_msg_size_v2(count, lens);
        rc = sptlrpc_svc_alloc_rs(req, msg_len);    &amp;lt;==== pass that same req on. 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;sptlrpc_svc_alloc_rs does a little safety check and calls this:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;static
int null_alloc_rs(struct ptlrpc_request *req, int msgsize)
{
        struct ptlrpc_reply_state *rs;
        int rs_size = sizeof(*rs) + msgsize;

        LASSERT(msgsize % 8 == 0);

        rs = req-&amp;gt;rq_reply_state;     &amp;lt;=== LASSERTED as NULL just a little while ago. 

        if (rs) { 
                /* pre-allocated */
                LASSERT(rs-&amp;gt;rs_size &amp;gt;= rs_size);   &amp;lt;==== we hit this and I am confused as to why. 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Is some separate thread acting on this request at the same time? &lt;/p&gt;

&lt;p&gt;You don&apos;t happen to have the a Lustre Debug log hanging around do you Oleg? &lt;/p&gt;</comment>
                            <comment id="67137" author="adilger" created="Fri, 20 Sep 2013 16:43:47 +0000"  >&lt;p&gt;I&apos;m more concerned about why running sanity.sh in a loop is running out of memory?&lt;/p&gt;</comment>
                            <comment id="67927" author="lixi" created="Sun, 29 Sep 2013 15:46:04 +0000"  >&lt;p&gt;We hit the same LBUG on MDS too. Maybe service-&amp;gt;srv_max_reply_size is not big enough to hold the reply message? When memory is under pressure and the buffer is returned by lustre_get_emerg_rs, this LBUG happens.&lt;/p&gt;</comment>
                            <comment id="68428" author="green" created="Sat, 5 Oct 2013 01:26:43 +0000"  >&lt;p&gt;here&apos;s a debug log from recent instance.&lt;/p&gt;

&lt;p&gt;The crash info is:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;&amp;lt;4&amp;gt;[19273.725502] mdt01_000: page allocation failure. order:2, mode:0x40
&amp;lt;4&amp;gt;[19273.726180] Pid: 14141, comm: mdt01_000 Not tainted 2.6.32-rhe6.4-debug #2
&amp;lt;4&amp;gt;[19273.726873] Call Trace:
&amp;lt;4&amp;gt;[19273.727380]  [&amp;lt;ffffffff8112a666&amp;gt;] ? __alloc_pages_nodemask+0x7c6/0x980
&amp;lt;4&amp;gt;[19273.728063]  [&amp;lt;ffffffff8127f456&amp;gt;] ? vsnprintf+0x336/0x5e0
&amp;lt;4&amp;gt;[19273.728697]  [&amp;lt;ffffffff811658f2&amp;gt;] ? kmem_getpages+0x62/0x170
&amp;lt;4&amp;gt;[19273.729334]  [&amp;lt;ffffffff8116834a&amp;gt;] ? fallback_alloc+0x1ba/0x270
&amp;lt;4&amp;gt;[19273.729334]  [&amp;lt;ffffffff81167bf7&amp;gt;] ? cache_grow+0x4d7/0x520
&amp;lt;4&amp;gt;[19273.729334]  [&amp;lt;ffffffff81168038&amp;gt;] ? ____cache_alloc_node+0xa8/0x200
&amp;lt;4&amp;gt;[19273.729334]  [&amp;lt;ffffffff811686e8&amp;gt;] ? __kmalloc+0x208/0x2a0
&amp;lt;4&amp;gt;[19273.729334]  [&amp;lt;ffffffffa118d515&amp;gt;] ? null_alloc_rs+0xc5/0x390 [ptlrpc]
&amp;lt;4&amp;gt;[19273.729334]  [&amp;lt;ffffffffa118d515&amp;gt;] ? null_alloc_rs+0xc5/0x390 [ptlrpc]
&amp;lt;4&amp;gt;[19273.729334]  [&amp;lt;ffffffffa117c484&amp;gt;] ? sptlrpc_svc_alloc_rs+0x74/0x2a0 [ptlrpc]
&amp;lt;4&amp;gt;[19273.729334]  [&amp;lt;ffffffffa1153678&amp;gt;] ? lustre_pack_reply_v2+0x98/0x2a0 [ptlrpc]
&amp;lt;4&amp;gt;[19273.729334]  [&amp;lt;ffffffffa115392e&amp;gt;] ? lustre_pack_reply_flags+0xae/0x1f0 [ptlrpc]
&amp;lt;4&amp;gt;[19273.738776]  [&amp;lt;ffffffffa1153a81&amp;gt;] ? lustre_pack_reply+0x11/0x20 [ptlrpc]
&amp;lt;4&amp;gt;[19273.738776]  [&amp;lt;ffffffffa1179c93&amp;gt;] ? req_capsule_server_pack+0x53/0x100 [ptlrpc]
&amp;lt;4&amp;gt;[19273.738776]  [&amp;lt;ffffffffa05a6765&amp;gt;] ? mdt_getxattr+0x545/0x1490 [mdt]
&amp;lt;4&amp;gt;[19273.738776]  [&amp;lt;ffffffffa05847be&amp;gt;] ? mdt_intent_getxattr+0x9e/0x160 [mdt]
&amp;lt;4&amp;gt;[19273.738776]  [&amp;lt;ffffffffa0581e2e&amp;gt;] ? mdt_intent_policy+0x3ae/0x770 [mdt]
&amp;lt;4&amp;gt;[19273.738776]  [&amp;lt;ffffffffa110b34a&amp;gt;] ? ldlm_lock_enqueue+0x2ea/0x860 [ptlrpc]
&amp;lt;4&amp;gt;[19273.738776]  [&amp;lt;ffffffffa1133c6f&amp;gt;] ? ldlm_handle_enqueue0+0x4ef/0x10b0 [ptlrpc]
&amp;lt;4&amp;gt;[19273.738776]  [&amp;lt;ffffffffa05822f6&amp;gt;] ? mdt_enqueue+0x46/0xe0 [mdt]
&amp;lt;4&amp;gt;[19273.738776]  [&amp;lt;ffffffffa058897a&amp;gt;] ? mdt_handle_common+0x52a/0x1470 [mdt]
&amp;lt;4&amp;gt;[19273.738776]  [&amp;lt;ffffffffa05c2005&amp;gt;] ? mds_regular_handle+0x15/0x20 [mdt]
&amp;lt;4&amp;gt;[19273.738776]  [&amp;lt;ffffffffa11633f5&amp;gt;] ? ptlrpc_server_handle_request+0x395/0xc20 [ptlrpc]
&amp;lt;4&amp;gt;[19273.738776]  [&amp;lt;ffffffffa0eca35f&amp;gt;] ? lc_watchdog_touch+0x6f/0x170 [libcfs]
&amp;lt;4&amp;gt;[19273.738776]  [&amp;lt;ffffffffa115ad61&amp;gt;] ? ptlrpc_wait_event+0xc1/0x2e0 [ptlrpc]
&amp;lt;4&amp;gt;[19273.738776]  [&amp;lt;ffffffffa11646da&amp;gt;] ? ptlrpc_main+0xa5a/0x1690 [ptlrpc]
&amp;lt;4&amp;gt;[19273.738776]  [&amp;lt;ffffffffa1163c80&amp;gt;] ? ptlrpc_main+0x0/0x1690 [ptlrpc]
&amp;lt;4&amp;gt;[19273.738776]  [&amp;lt;ffffffff81094606&amp;gt;] ? kthread+0x96/0xa0
&amp;lt;4&amp;gt;[19273.738776]  [&amp;lt;ffffffff8100c10a&amp;gt;] ? child_rip+0xa/0x20
&amp;lt;4&amp;gt;[19273.738776]  [&amp;lt;ffffffff81094570&amp;gt;] ? kthread+0x0/0xa0
&amp;lt;4&amp;gt;[19273.738776]  [&amp;lt;ffffffff8100c100&amp;gt;] ? child_rip+0x0/0x20
&amp;lt;6&amp;gt;[19273.738776] Mem-Info:
&amp;lt;4&amp;gt;[19273.738776] Node 0 DMA per-cpu:
&amp;lt;4&amp;gt;[19273.738776] CPU    0: hi:    0, btch:   1 usd:   0
&amp;lt;4&amp;gt;[19273.738776] CPU    1: hi:    0, btch:   1 usd:   0
&amp;lt;4&amp;gt;[19273.738776] CPU    2: hi:    0, btch:   1 usd:   0
&amp;lt;4&amp;gt;[19273.738776] CPU    3: hi:    0, btch:   1 usd:   0
&amp;lt;4&amp;gt;[19273.738776] CPU    4: hi:    0, btch:   1 usd:   0
&amp;lt;4&amp;gt;[19273.738776] CPU    5: hi:    0, btch:   1 usd:   0
&amp;lt;4&amp;gt;[19273.738776] CPU    6: hi:    0, btch:   1 usd:   0
&amp;lt;4&amp;gt;[19273.738776] CPU    7: hi:    0, btch:   1 usd:   0
&amp;lt;4&amp;gt;[19273.738776] Node 0 DMA32 per-cpu:
&amp;lt;4&amp;gt;[19273.738776] CPU    0: hi:  186, btch:  31 usd:  33
&amp;lt;4&amp;gt;[19273.738776] CPU    1: hi:  186, btch:  31 usd: 172
&amp;lt;4&amp;gt;[19273.738776] CPU    2: hi:  186, btch:  31 usd: 147
&amp;lt;4&amp;gt;[19273.738776] CPU    3: hi:  186, btch:  31 usd: 154
&amp;lt;4&amp;gt;[19273.738776] CPU    4: hi:  186, btch:  31 usd: 126
&amp;lt;4&amp;gt;[19273.738776] CPU    5: hi:  186, btch:  31 usd:  33
&amp;lt;4&amp;gt;[19273.738776] CPU    6: hi:  186, btch:  31 usd: 155
&amp;lt;4&amp;gt;[19273.738776] CPU    7: hi:  186, btch:  31 usd: 123
&amp;lt;4&amp;gt;[19273.738776] active_anon:96976 inactive_anon:80133 isolated_anon:0
&amp;lt;4&amp;gt;[19273.738776]  active_file:49462 inactive_file:38723 isolated_file:0
&amp;lt;4&amp;gt;[19273.738776]  unevictable:0 dirty:1065 writeback:0 unstable:0
&amp;lt;4&amp;gt;[19273.738776]  free:25017 slab_reclaimable:77546 slab_unreclaimable:312838
&amp;lt;4&amp;gt;[19273.738776]  mapped:625 shmem:174432 pagetables:389 bounce:0
&amp;lt;4&amp;gt;[19273.738776] Node 0 DMA free:9692kB min:136kB low:168kB high:204kB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:9296kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? yes
&amp;lt;4&amp;gt;[19273.738776] lowmem_reserve[]: 0 2967 2967 2967
&amp;lt;4&amp;gt;[19273.738776] Node 0 DMA32 free:91616kB min:44916kB low:56144kB high:67372kB active_anon:387904kB inactive_anon:320532kB active_file:196476kB inactive_file:154892kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:3039076kB mlocked:0kB dirty:4260kB writeback:0kB mapped:2500kB shmem:697728kB slab_reclaimable:310184kB slab_unreclaimable:1251744kB kernel_stack:3304kB pagetables:1556kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
&amp;lt;4&amp;gt;[19273.738776] lowmem_reserve[]: 0 0 0 0
&amp;lt;4&amp;gt;[19273.738776] Node 0 DMA: 3*4kB 0*8kB 3*16kB 1*32kB 2*64kB 0*128kB 1*256kB 0*512kB 1*1024kB 0*2048kB 2*4096kB = 9692kB
&amp;lt;4&amp;gt;[19273.738776] Node 0 DMA32: 10341*4kB 4432*8kB 23*16kB 19*32kB 10*64kB 2*128kB 6*256kB 2*512kB 2*1024kB 2*2048kB 1*4096kB = 91492kB
&amp;lt;4&amp;gt;[19273.738776] 232691 total pagecache pages
&amp;lt;4&amp;gt;[19273.738776] 53 pages in swap cache
&amp;lt;4&amp;gt;[19273.738776] Swap cache stats: add 1502, delete 1449, find 18/20
&amp;lt;4&amp;gt;[19273.738776] Free swap  = 2091200kB
&amp;lt;4&amp;gt;[19273.738776] Total swap = 2097144kB
&amp;lt;6&amp;gt;[19273.738776] 774396 pages RAM
&amp;lt;6&amp;gt;[19273.738776] 38583 pages reserved
&amp;lt;6&amp;gt;[19273.738776] 15526 pages shared
&amp;lt;6&amp;gt;[19273.738776] 689850 pages non-shared
&amp;lt;0&amp;gt;[19273.940246] LustreError: 14141:0:(sec_null.c:318:null_alloc_rs()) ASSERTION( rs-&amp;gt;rs_size &amp;gt;= rs_size ) failed: 
&amp;lt;0&amp;gt;[19273.941315] LustreError: 14141:0:(sec_null.c:318:null_alloc_rs()) LBUG
&amp;lt;4&amp;gt;[19273.941915] Pid: 14141, comm: mdt01_000
&amp;lt;4&amp;gt;[19273.942396] 
&amp;lt;4&amp;gt;[19273.942396] Call Trace:
&amp;lt;4&amp;gt;[19273.945170]  [&amp;lt;ffffffffa0eb88a5&amp;gt;] libcfs_debug_dumpstack+0x55/0x80 [libcfs]
&amp;lt;4&amp;gt;[19273.945793]  [&amp;lt;ffffffffa0eb8ea7&amp;gt;] lbug_with_loc+0x47/0xb0 [libcfs]
&amp;lt;4&amp;gt;[19273.950476]  [&amp;lt;ffffffffa118d6c2&amp;gt;] null_alloc_rs+0x272/0x390 [ptlrpc]
&amp;lt;4&amp;gt;[19273.951103]  [&amp;lt;ffffffffa117c5e9&amp;gt;] sptlrpc_svc_alloc_rs+0x1d9/0x2a0 [ptlrpc]
&amp;lt;4&amp;gt;[19273.955634]  [&amp;lt;ffffffffa1153678&amp;gt;] lustre_pack_reply_v2+0x98/0x2a0 [ptlrpc]
&amp;lt;4&amp;gt;[19273.956346]  [&amp;lt;ffffffffa115392e&amp;gt;] lustre_pack_reply_flags+0xae/0x1f0 [ptlrpc]
&amp;lt;4&amp;gt;[19273.957072]  [&amp;lt;ffffffffa1153a81&amp;gt;] lustre_pack_reply+0x11/0x20 [ptlrpc]
&amp;lt;4&amp;gt;[19273.957780]  [&amp;lt;ffffffffa1179c93&amp;gt;] req_capsule_server_pack+0x53/0x100 [ptlrpc]
&amp;lt;4&amp;gt;[19273.958495]  [&amp;lt;ffffffffa05a6765&amp;gt;] mdt_getxattr+0x545/0x1490 [mdt]
&amp;lt;4&amp;gt;[19273.970919]  [&amp;lt;ffffffffa05847be&amp;gt;] mdt_intent_getxattr+0x9e/0x160 [mdt]
&amp;lt;4&amp;gt;[19273.971619]  [&amp;lt;ffffffffa0581e2e&amp;gt;] mdt_intent_policy+0x3ae/0x770 [mdt]
&amp;lt;4&amp;gt;[19273.972322]  [&amp;lt;ffffffffa110b34a&amp;gt;] ldlm_lock_enqueue+0x2ea/0x860 [ptlrpc]
&amp;lt;4&amp;gt;[19273.973035]  [&amp;lt;ffffffffa1133c6f&amp;gt;] ldlm_handle_enqueue0+0x4ef/0x10b0 [ptlrpc]
&amp;lt;4&amp;gt;[19273.973881]  [&amp;lt;ffffffffa05822f6&amp;gt;] mdt_enqueue+0x46/0xe0 [mdt]
&amp;lt;4&amp;gt;[19273.995450]  [&amp;lt;ffffffffa058897a&amp;gt;] mdt_handle_common+0x52a/0x1470 [mdt]
&amp;lt;4&amp;gt;[19273.996181]  [&amp;lt;ffffffffa05c2005&amp;gt;] mds_regular_handle+0x15/0x20 [mdt]
&amp;lt;4&amp;gt;[19274.008857]  [&amp;lt;ffffffffa11633f5&amp;gt;] ptlrpc_server_handle_request+0x395/0xc20 [ptlrpc]
&amp;lt;4&amp;gt;[19274.011971]  [&amp;lt;ffffffffa0eca35f&amp;gt;] ? lc_watchdog_touch+0x6f/0x170 [libcfs]
&amp;lt;4&amp;gt;[19274.012692]  [&amp;lt;ffffffffa115ad61&amp;gt;] ? ptlrpc_wait_event+0xc1/0x2e0 [ptlrpc]
&amp;lt;4&amp;gt;[19274.018425]  [&amp;lt;ffffffffa11646da&amp;gt;] ptlrpc_main+0xa5a/0x1690 [ptlrpc]
&amp;lt;4&amp;gt;[19274.019129]  [&amp;lt;ffffffffa1163c80&amp;gt;] ? ptlrpc_main+0x0/0x1690 [ptlrpc]
&amp;lt;4&amp;gt;[19274.019808]  [&amp;lt;ffffffff81094606&amp;gt;] kthread+0x96/0xa0
&amp;lt;4&amp;gt;[19274.020415]  [&amp;lt;ffffffff8100c10a&amp;gt;] child_rip+0xa/0x20
&amp;lt;4&amp;gt;[19274.021025]  [&amp;lt;ffffffff81094570&amp;gt;] ? kthread+0x0/0xa0
&amp;lt;4&amp;gt;[19274.021639]  [&amp;lt;ffffffff8100c100&amp;gt;] ? child_rip+0x0/0x20
&amp;lt;4&amp;gt;[19274.022265] 
&amp;lt;0&amp;gt;[19274.095286] Kernel panic - not syncing: LBUG
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I also have a crashdump.&lt;/p&gt;</comment>
                            <comment id="70743" author="paf" created="Tue, 5 Nov 2013 16:44:15 +0000"  >&lt;p&gt;Hit this twice in the last few days on an MDS running master from Nov. 1st during testing for another bug.  I&apos;ve got dumps of both events if they&apos;d be of any use.  (Logging was set to default levels)&lt;/p&gt;

&lt;p&gt;The stack traces look the same for both of them, here&apos;s one:&lt;br/&gt;
&amp;lt;0&amp;gt;LustreError: 2252:0:(sec_null.c:318:null_alloc_rs()) ASSERTION( rs-&amp;gt;rs_size &amp;gt;= rs_size ) failed:&lt;br/&gt;
&amp;lt;0&amp;gt;LustreError: 2252:0:(sec_null.c:318:null_alloc_rs()) LBUG&lt;br/&gt;
&amp;lt;4&amp;gt;Pid: 2252, comm: mdt00_001&lt;br/&gt;
&amp;lt;4&amp;gt;&lt;br/&gt;
&amp;lt;4&amp;gt;Call Trace:&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa032d895&amp;gt;&amp;#93;&lt;/span&gt; libcfs_debug_dumpstack+0x55/0x80 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa032de97&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0x47/0xb0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0627722&amp;gt;&amp;#93;&lt;/span&gt; null_alloc_rs+0x272/0x390 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa06161e9&amp;gt;&amp;#93;&lt;/span&gt; sptlrpc_svc_alloc_rs+0x1d9/0x2a0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05ecd03&amp;gt;&amp;#93;&lt;/span&gt; lustre_pack_reply_v2+0x93/0x280 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05ecf9e&amp;gt;&amp;#93;&lt;/span&gt; lustre_pack_reply_flags+0xae/0x1f0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05ed0f1&amp;gt;&amp;#93;&lt;/span&gt; lustre_pack_reply+0x11/0x20 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa06138b3&amp;gt;&amp;#93;&lt;/span&gt; req_capsule_server_pack+0x53/0x100 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0c3eea5&amp;gt;&amp;#93;&lt;/span&gt; mdt_getxattr+0x545/0x1490 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0c1c5ee&amp;gt;&amp;#93;&lt;/span&gt; mdt_intent_getxattr+0x9e/0x160 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0c198ce&amp;gt;&amp;#93;&lt;/span&gt; mdt_intent_policy+0x3ae/0x770 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05a4461&amp;gt;&amp;#93;&lt;/span&gt; ldlm_lock_enqueue+0x361/0x8c0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05cd17f&amp;gt;&amp;#93;&lt;/span&gt; ldlm_handle_enqueue0+0x4ef/0x10a0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0c19d96&amp;gt;&amp;#93;&lt;/span&gt; mdt_enqueue+0x46/0xe0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0c20a8a&amp;gt;&amp;#93;&lt;/span&gt; mdt_handle_common+0x52a/0x1470 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0c5ac55&amp;gt;&amp;#93;&lt;/span&gt; mds_regular_handle+0x15/0x20 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05fce25&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_server_handle_request+0x385/0xc00 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa032e4ce&amp;gt;&amp;#93;&lt;/span&gt; ? cfs_timer_arm+0xe/0x10 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa033f27f&amp;gt;&amp;#93;&lt;/span&gt; ? lc_watchdog_touch+0x6f/0x170 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05f44c9&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_wait_event+0xa9/0x2d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81051439&amp;gt;&amp;#93;&lt;/span&gt; ? __wake_up_common+0x59/0x90&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05fe18d&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_main+0xaed/0x1740 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05fd6a0&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_main+0x0/0x1740 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81096a36&amp;gt;&amp;#93;&lt;/span&gt; kthread+0x96/0xa0&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c0ca&amp;gt;&amp;#93;&lt;/span&gt; child_rip+0xa/0x20&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff810969a0&amp;gt;&amp;#93;&lt;/span&gt; ? kthread+0x0/0xa0&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c0c0&amp;gt;&amp;#93;&lt;/span&gt; ? child_rip+0x0/0x20&lt;/p&gt;</comment>
                            <comment id="70904" author="paf" created="Wed, 6 Nov 2013 20:10:08 +0000"  >&lt;p&gt;Just occurred to me I should mention:  I can reproduce this fairly consistently.  If requested, I can try to do this with debugging turned up.&lt;/p&gt;</comment>
                            <comment id="70930" author="keith" created="Wed, 6 Nov 2013 23:18:10 +0000"  >&lt;p&gt;Patrick.  &lt;/p&gt;

&lt;p&gt;  What is the memory usage like when you hit this error?&lt;/p&gt;</comment>
                            <comment id="70931" author="paf" created="Wed, 6 Nov 2013 23:22:58 +0000"  >&lt;p&gt;Keith,&lt;/p&gt;

&lt;p&gt;Very high - Memory pressure, definitely.  Not actually OOM, but as low as it can go (At the point where the kernel starts freeing memory and I see memory usage bounce around in the tens of MB free.).&lt;/p&gt;

&lt;p&gt;I&apos;ve been working around it (still testing my other, unrelated bug) by adding more RAM to the virtual machine running the MDS and also dumping caches periodically to keep from going OOM.  That&apos;s worked for me.&lt;/p&gt;</comment>
                            <comment id="70934" author="keith" created="Wed, 6 Nov 2013 23:32:35 +0000"  >&lt;p&gt;Low memory is a tricky space in Lustre. Are you seeing other errors or just this one? &lt;/p&gt;</comment>
                            <comment id="70936" author="paf" created="Thu, 7 Nov 2013 00:02:20 +0000"  >&lt;p&gt;This is the only one I&apos;ve seen on the MDS.  However, I&apos;ve seen this one at least three times, and can reproduce it readily, so it may be obscuring others.&lt;/p&gt;

&lt;p&gt;On the client, we&apos;re chasing a recurrence of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3027&quot; title=&quot;Failure on test suite parallel-scale test_write_disjoint: invalid file size 140329 instead of 160376 = 20047 * 8&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3027&quot;&gt;&lt;del&gt;LU-3027&lt;/del&gt;&lt;/a&gt;/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3889&quot; title=&quot; LBUG: (osc_lock.c:497:osc_lock_upcall()) ASSERTION( lock-&amp;gt;cll_state &amp;gt;= CLS_QUEUING ) &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3889&quot;&gt;&lt;del&gt;LU-3889&lt;/del&gt;&lt;/a&gt;, in which low memory can also play a role, but that&apos;s the client, not the MDS.  (Oddly, the client testing for THAT was what led to our low memory issues on the MDS, but that&apos;s mostly due to an underspecced MDS in the testing system.)&lt;/p&gt;</comment>
                            <comment id="70941" author="lixi" created="Thu, 7 Nov 2013 01:14:45 +0000"  >&lt;p&gt;Hi Patrick,&lt;/p&gt;

&lt;p&gt;Would you please apply following patch and try to reproduce the problem again? I am not able to reproduce this consistently.&lt;br/&gt;
&lt;a href=&quot;http://review.whamcloud.com/#/c/8200/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/8200/&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Thank you very much!&lt;/p&gt;</comment>
                            <comment id="70980" author="paf" created="Thu, 7 Nov 2013 16:21:03 +0000"  >&lt;p&gt;Li,&lt;/p&gt;

&lt;p&gt;Sure.  I&apos;m doing that now - I&apos;ve just tested to confirm I can still hit the problem, now I&apos;m going to test the patch in the same situation.&lt;/p&gt;</comment>
                            <comment id="70989" author="paf" created="Thu, 7 Nov 2013 16:56:35 +0000"  >&lt;p&gt;Li,&lt;/p&gt;

&lt;p&gt;Still hit the same crash with that patch in place.  Stack trace is the same as before.  I can make the dump available if you&apos;d like.  (Does Intel have an FTP site I could use?  Otherwise I can find another way to make it available)&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;Edit&amp;#93;&lt;/span&gt;&lt;br/&gt;
It occurred to me it&apos;s past midnight in China, so I&apos;ve uploaded the dump rather than potentially making you wait another day if you do need it.&lt;/p&gt;

&lt;p&gt;It&apos;s at:&lt;br/&gt;
ftp.us.cray.com&lt;br/&gt;
Username:&lt;br/&gt;
anonymous&lt;br/&gt;
Password:&lt;br/&gt;
anonymous&lt;/p&gt;

&lt;p&gt;The file is at:&lt;br/&gt;
outbound/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3680&quot; title=&quot;OOM crash: null_alloc_rs()) ASSERTION( rs-&amp;gt;rs_size &amp;gt;= rs_size ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3680&quot;&gt;&lt;del&gt;LU-3680&lt;/del&gt;&lt;/a&gt;/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3680&quot; title=&quot;OOM crash: null_alloc_rs()) ASSERTION( rs-&amp;gt;rs_size &amp;gt;= rs_size ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3680&quot;&gt;&lt;del&gt;LU-3680&lt;/del&gt;&lt;/a&gt;-mds-dump.tar.gz&lt;/p&gt;

&lt;p&gt;You cannot ls in outbound or &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3680&quot; title=&quot;OOM crash: null_alloc_rs()) ASSERTION( rs-&amp;gt;rs_size &amp;gt;= rs_size ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3680&quot;&gt;&lt;del&gt;LU-3680&lt;/del&gt;&lt;/a&gt;, but you should be able to grab the dump.&lt;/p&gt;</comment>
                            <comment id="71057" author="lixi" created="Fri, 8 Nov 2013 00:23:50 +0000"  >&lt;p&gt;Hi Patrick,&lt;/p&gt;

&lt;p&gt;Thank you very much for your test. It seems that the problem is more complex than I thought. I will investigate more.&lt;/p&gt;</comment>

&lt;p&gt;We can share the debug files by uploading it to &lt;a href=&quot;ftp://ftp.whamcloud.com/uploads/LU-*&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;ftp://ftp.whamcloud.com/uploads/LU-*&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="71059" author="lixi" created="Fri, 8 Nov 2013 00:58:05 +0000"  >&lt;p&gt;Hi Patrick,&lt;/p&gt;

&lt;p&gt;It seems that there is a problem in lustre_get_emerg_rs(). The size of the buffer is set to zero, which will cause this LBUG when the memory is under high pressure. I&apos;ve updated the patch. Would you please test whether it works? Thank you very much!&lt;/p&gt;
</comment>
                            <comment id="71074" author="paf" created="Fri, 8 Nov 2013 02:27:23 +0000"  >&lt;p&gt;Li,&lt;/p&gt;

&lt;p&gt;Sure.  I&apos;ll probably be testing this tomorrow.  I expect you&apos;ve noticed this, but you&apos;ve got a small mistake in the current patch that&apos;s causing it not to build on RHEL6:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;&lt;span class=&quot;code-comment&quot;&gt;/* Just &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; failure &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; the size is too big */&lt;/span&gt;
CERROR(&lt;span class=&quot;code-quote&quot;&gt;&quot;size of message is too big (%lu), %d allowed&quot;&lt;/span&gt;,
msglen + sizeof(struct ptlrpc_reply_state),
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;You&apos;re printing %lu, but msglen + sizeof(&lt;span class=&quot;error&quot;&gt;&amp;#91;...&amp;#93;&lt;/span&gt;) is an int.  I&apos;d just do a cast to a long unsigned int, I suppose.&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;(unsigned &lt;span class=&quot;code-object&quot;&gt;long&lt;/span&gt;) (msglen + sizeof(struct ptlrpc_reply_state)),
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Should do it.&lt;/p&gt;

&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;Patrick&lt;/li&gt;
&lt;/ul&gt;
</comment>
                            <comment id="71075" author="paf" created="Fri, 8 Nov 2013 02:37:35 +0000"  >&lt;p&gt;Li,&lt;/p&gt;

&lt;p&gt;Just testing the FTP site...&lt;br/&gt;
I&apos;m connected to ftp.whamcloud.com as anonymous, but I can&apos;t create files in /upload, and I can&apos;t seem to cd to any LU directories.  I tried &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3680&quot; title=&quot;OOM crash: null_alloc_rs()) ASSERTION( rs-&amp;gt;rs_size &amp;gt;= rs_size ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3680&quot;&gt;&lt;del&gt;LU-3680&lt;/del&gt;&lt;/a&gt;, &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3027&quot; title=&quot;Failure on test suite parallel-scale test_write_disjoint: invalid file size 140329 instead of 160376 = 20047 * 8&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3027&quot;&gt;&lt;del&gt;LU-3027&lt;/del&gt;&lt;/a&gt;, &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4152&quot; title=&quot; layout locks can cause deadlock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4152&quot;&gt;&lt;del&gt;LU-4152&lt;/del&gt;&lt;/a&gt; and a few others.  Also can&apos;t make a new directory.&lt;/p&gt;

&lt;p&gt;Is there something I&apos;ve missed here?&lt;/p&gt;

&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;Patrick&lt;/li&gt;
&lt;/ul&gt;
</comment>
                            <comment id="71080" author="lixi" created="Fri, 8 Nov 2013 03:00:18 +0000"  >&lt;p&gt;Hi Patrick,&lt;/p&gt;

&lt;p&gt;I&apos;ve just tested the FTP. It works well. Following are the commands.&lt;/p&gt;

&lt;p&gt;lixitekiMacBook-Pro:~ lixi$ ftp ftp.whamcloud.com&lt;br/&gt;
Connected to eric.whamcloud.com.&lt;br/&gt;
220 (vsFTPd 2.2.2)&lt;br/&gt;
Name (ftp.whamcloud.com:lixi): anonymous&lt;br/&gt;
331 Please specify the password.&lt;br/&gt;
Password: &lt;br/&gt;
230 Login successful.&lt;br/&gt;
Remote system type is UNIX.&lt;br/&gt;
Using binary mode to transfer files.&lt;br/&gt;
ftp&amp;gt; ls&lt;br/&gt;
229 Entering Extended Passive Mode (|||38409|).&lt;br/&gt;
150 Here comes the directory listing.&lt;br/&gt;
d-wxrwx---   68 123      840000001     4096 Nov 05 08:12 uploads&lt;br/&gt;
226 Directory send OK.&lt;br/&gt;
ftp&amp;gt; cd uploads&lt;br/&gt;
250 Directory successfully changed.&lt;br/&gt;
ftp&amp;gt; mkdir &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3680&quot; title=&quot;OOM crash: null_alloc_rs()) ASSERTION( rs-&amp;gt;rs_size &amp;gt;= rs_size ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3680&quot;&gt;&lt;del&gt;LU-3680&lt;/del&gt;&lt;/a&gt;&lt;br/&gt;
257 &quot;/uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3680&quot; title=&quot;OOM crash: null_alloc_rs()) ASSERTION( rs-&amp;gt;rs_size &amp;gt;= rs_size ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3680&quot;&gt;&lt;del&gt;LU-3680&lt;/del&gt;&lt;/a&gt;&quot; created&lt;br/&gt;
ftp&amp;gt; cd &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3680&quot; title=&quot;OOM crash: null_alloc_rs()) ASSERTION( rs-&amp;gt;rs_size &amp;gt;= rs_size ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3680&quot;&gt;&lt;del&gt;LU-3680&lt;/del&gt;&lt;/a&gt;&lt;br/&gt;
250 Directory successfully changed.&lt;br/&gt;
ftp&amp;gt; lpwd&lt;br/&gt;
Local directory: /Users/lixi&lt;br/&gt;
ftp&amp;gt; put test.rtf &lt;br/&gt;
local: test.rtf remote: test.rtf&lt;br/&gt;
229 Entering Extended Passive Mode (|||53942|).&lt;br/&gt;
150 Ok to send data.&lt;br/&gt;
100% |***********************************|   315      397.43 KiB/s    00:00 ETA&lt;br/&gt;
226 Transfer complete.&lt;br/&gt;
315 bytes sent in 00:00 (0.49 KiB/s)&lt;br/&gt;
ftp&amp;gt; ls&lt;br/&gt;
229 Entering Extended Passive Mode (|||40673|).&lt;br/&gt;
150 Here comes the directory listing.&lt;br/&gt;
&lt;del&gt;rw-r&lt;/del&gt;&lt;del&gt;r&lt;/del&gt;-    1 123      114           315 Nov 07 18:57 test.rtf&lt;br/&gt;
226 Directory send OK.&lt;/p&gt;</comment>
                            <comment id="71084" author="paf" created="Fri, 8 Nov 2013 03:20:00 +0000"  >&lt;p&gt;Li,&lt;/p&gt;

&lt;p&gt;You&apos;re right - I was using a Windows FTP command line client, and now that I&apos;ve switched to a Unix one, everything is fine.&lt;/p&gt;

&lt;p&gt;Thanks!&lt;/p&gt;</comment>
                            <comment id="71100" author="paf" created="Fri, 8 Nov 2013 05:30:13 +0000"  >&lt;p&gt;Li,&lt;/p&gt;

&lt;p&gt;Different LBUG, same function:&lt;br/&gt;
&amp;lt;0&amp;gt;LustreError: 12155:0:(sec_null.c:318:null_alloc_rs()) ASSERTION( rs-&amp;gt;rs_size &amp;gt;= rs_size ) failed:&lt;br/&gt;
&amp;lt;0&amp;gt;LustreError: 12155:0:(sec_null.c:318:null_alloc_rs()) LBUG&lt;br/&gt;
&amp;lt;4&amp;gt;Pid: 12155, comm: mdt00_000&lt;br/&gt;
&amp;lt;4&amp;gt;&lt;br/&gt;
&amp;lt;4&amp;gt;Call Trace:&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa032d895&amp;gt;&amp;#93;&lt;/span&gt; libcfs_debug_dumpstack+0x55/0x80 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa032de97&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0x47/0xb0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa06296a2&amp;gt;&amp;#93;&lt;/span&gt; null_alloc_rs+0x272/0x390 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0617527&amp;gt;&amp;#93;&lt;/span&gt; sptlrpc_svc_alloc_rs+0x1e7/0x350 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05edfb3&amp;gt;&amp;#93;&lt;/span&gt; lustre_pack_reply_v2+0x93/0x280 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05ee24e&amp;gt;&amp;#93;&lt;/span&gt; lustre_pack_reply_flags+0xae/0x1f0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05ee3a1&amp;gt;&amp;#93;&lt;/span&gt; lustre_pack_reply+0x11/0x20 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0614be3&amp;gt;&amp;#93;&lt;/span&gt; req_capsule_server_pack+0x53/0x100 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0c3ef65&amp;gt;&amp;#93;&lt;/span&gt; mdt_getxattr+0x545/0x1490 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0c212ae&amp;gt;&amp;#93;&lt;/span&gt; mdt_intent_getxattr+0x9e/0x160 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa046a356&amp;gt;&amp;#93;&lt;/span&gt; ? lu_object_find+0x16/0x20 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0c1b659&amp;gt;&amp;#93;&lt;/span&gt; mdt_intent_policy+0x499/0xca0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05a5441&amp;gt;&amp;#93;&lt;/span&gt; ldlm_lock_enqueue+0x361/0x8c0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05ce17f&amp;gt;&amp;#93;&lt;/span&gt; ldlm_handle_enqueue0+0x4ef/0x10a0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0643b12&amp;gt;&amp;#93;&lt;/span&gt; tgt_enqueue+0x62/0x1d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa064203f&amp;gt;&amp;#93;&lt;/span&gt; tgt_request_handle+0x5ff/0x1200 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05ef2bc&amp;gt;&amp;#93;&lt;/span&gt; ? lustre_msg_get_transno+0x8c/0x100 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05fe0d5&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_server_handle_request+0x385/0xc00 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa032e4ce&amp;gt;&amp;#93;&lt;/span&gt; ? cfs_timer_arm+0xe/0x10 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa033f3df&amp;gt;&amp;#93;&lt;/span&gt; ? lc_watchdog_touch+0x6f/0x170 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05f5779&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_wait_event+0xa9/0x2d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81051439&amp;gt;&amp;#93;&lt;/span&gt; ? __wake_up_common+0x59/0x90&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05ff43d&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_main+0xaed/0x1740 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05fe950&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_main+0x0/0x1740 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81096a36&amp;gt;&amp;#93;&lt;/span&gt; kthread+0x96/0xa0&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c0ca&amp;gt;&amp;#93;&lt;/span&gt; child_rip+0xa/0x20&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff810969a0&amp;gt;&amp;#93;&lt;/span&gt; ? kthread+0x0/0xa0&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c0c0&amp;gt;&amp;#93;&lt;/span&gt; ? child_rip+0x0/0x20&lt;/p&gt;

&lt;p&gt;Dump is going up on the WC FTP site momentarily.&lt;/p&gt;

&lt;p&gt;Dump is in /&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3680&quot; title=&quot;OOM crash: null_alloc_rs()) ASSERTION( rs-&amp;gt;rs_size &amp;gt;= rs_size ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3680&quot;&gt;&lt;del&gt;LU-3680&lt;/del&gt;&lt;/a&gt;, file is named:&lt;br/&gt;
lu-3680-mds-dump 2.tar.gz&lt;/p&gt;</comment>
                            <comment id="71502" author="lixi" created="Thu, 14 Nov 2013 03:42:50 +0000"  >&lt;p&gt;Patrick,&lt;/p&gt;

&lt;p&gt;Thanks for your test. I am sorry that this LBUG happened again. You said that this is a different LBUG, but the trace looks the same to me. Is there anything I missed?&lt;br/&gt;
It surprises me that the patch does not help at all. I thought it fixed a problem and might eliminate the LBUG. Would you please double check that the patch is applied properly?&lt;br/&gt;
Since the problem happens when the memory is under heavy pressure, I think the buffer is likely to be allocated by lustre_get_emerg_rs(). And that&apos;s why I am surprised that the patch didn&apos;t help...&lt;/p&gt;

&lt;p&gt;Thanks!&lt;/p&gt;
</comment>
                            <comment id="71503" author="paf" created="Thu, 14 Nov 2013 03:56:12 +0000"  >&lt;p&gt;Li,&lt;/p&gt;

&lt;p&gt;I&apos;m sorry, you&apos;re right - I misread my own dump.  I&apos;m fairly sure I had the patch in place, but I&apos;m happy to try again.  It&apos;s always possible I made a mistake in setting things up.&lt;/p&gt;

&lt;p&gt;I should be able to test that tomorrow.&lt;/p&gt;

&lt;p&gt;By the way, if you&apos;re interested in trying to reproduce this, I&apos;m just running this script from a client to generate activity:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;&lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; idx in $(seq 0 10000); &lt;span class=&quot;code-keyword&quot;&gt;do&lt;/span&gt;
    time ls -laR &amp;gt; /dev/&lt;span class=&quot;code-keyword&quot;&gt;null&lt;/span&gt;
    touch somefile
    rm -f somefiles
    echo $idx: $(date +%T) $(grep MemFree /proc/meminfo)
done
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Then running this code on the MDS to create memory pressure (you have to hold down enter, it consumes real memory much more slowly than the rate at which it&apos;s allocating memory).  Once memory gets low, just keep running this code and you should see the bug within a minute or so:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;#include &amp;lt;stdio.h&amp;gt;
#include &amp;lt;unistd.h&amp;gt;

&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; main()
{
    &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; i;
    &lt;span class=&quot;code-object&quot;&gt;char&lt;/span&gt;* junk;

start: i = 0;

    &lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt;(i &amp;lt; 50) { 
        printf(&lt;span class=&quot;code-quote&quot;&gt;&quot;Malloc!\n&quot;&lt;/span&gt;); 
        junk = malloc(1024*1024*1024); 
        junk[0] = i; 
        i++; 
    }

    printf(&lt;span class=&quot;code-quote&quot;&gt;&quot;Mallocced 50 GB. Press enter to malloc another 50.\n&quot;&lt;/span&gt;);
    printf(&lt;span class=&quot;code-quote&quot;&gt;&quot;Note: This seems to use roughly 10 MB of real memory each time.\n&quot;&lt;/span&gt;);
    getchar();
    &lt;span class=&quot;code-keyword&quot;&gt;goto&lt;/span&gt; start;
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="71573" author="paf" created="Thu, 14 Nov 2013 20:55:52 +0000"  >&lt;p&gt;Li,&lt;/p&gt;

&lt;p&gt;It looks like you&apos;re right - I must have made a mistake in my build/install process before.  With your patch, I am no longer able to hit this bug.&lt;/p&gt;

&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;Patrick&lt;/li&gt;
&lt;/ul&gt;
</comment>
                            <comment id="71605" author="lixi" created="Fri, 15 Nov 2013 02:04:00 +0000"  >&lt;p&gt;Patrick,&lt;/p&gt;

&lt;p&gt;Thank you very much for confirming this! I am glad that we&apos;ve made some progress finally.&lt;/p&gt;

&lt;p&gt;Li Xi&lt;/p&gt;</comment>
                            <comment id="72148" author="paf" created="Fri, 22 Nov 2013 16:49:09 +0000"  >&lt;p&gt;Re-pushed patch to get Maloo to test again.  If tests pass, I&apos;m planning to invite Oleg to review.&lt;/p&gt;</comment>
                            <comment id="75249" author="green" created="Sun, 19 Jan 2014 04:08:25 +0000"  >&lt;p&gt;Patch landed to master for 2.6.0, and to b2_4 for 2.4.3 (should it ever happen) and b2_5 for 2.5.1.&lt;/p&gt;</comment>
                            <comment id="77026" author="adilger" created="Thu, 13 Feb 2014 22:06:35 +0000"  >&lt;p&gt;Shows mode:0x40 == __GFP_IO, but missing __GFP_WAIT from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4357&quot; title=&quot;page allocation failure. mode:0x40 caused by missing __GFP_WAIT flag&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4357&quot;&gt;&lt;del&gt;LU-4357&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="79677" author="bzzz" created="Wed, 19 Mar 2014 17:09:09 +0000"  >&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/#/c/9726/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/9726/&lt;/a&gt; - osd_thread_info fits 4K&lt;/p&gt;</comment>
                            <comment id="108518" author="gerrit" created="Tue, 3 Mar 2015 02:17:14 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;http://review.whamcloud.com/9726/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/9726/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3680&quot; title=&quot;OOM crash: null_alloc_rs()) ASSERTION( rs-&amp;gt;rs_size &amp;gt;= rs_size ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3680&quot;&gt;&lt;del&gt;LU-3680&lt;/del&gt;&lt;/a&gt; osd: reduce osd_thread_info in ldiskfs osd&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 56875fde8c66a4a937b173ccb065a9a6a3c67419&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                            <outwardlinks description="duplicates">
                                        <issuelink>
            <issuekey id="22373">LU-4357</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="29542">LU-6472</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                                        </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="28931">LU-6324</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="13594" name="lu3680.txt.gz" size="852506" author="green" created="Sat, 5 Oct 2013 01:26:43 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvwyf:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9502</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>