<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:09:11 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
<language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-659] Experiencing heavy IO load, client eviction and RPC timeouts after upgrade to lustre-1.8.5.0-5 (chaos release)</title>
                <link>https://jira.whamcloud.com/browse/LU-659</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Since upgrading from TOSS-1.3.4 we have been experiencing MAJOR problems with stability. The one common denominator seems to be latency between lustre clients and servers. Both servers and clients are dumping lots of syslog/dmesg messages about timeouts and heavy IO loads.In particular we are seeing large volumes of messages like this from clients:&lt;/p&gt;

&lt;p&gt;2011-09-02 10:18:49 rs249 INFO: task xsolver:23906 blocked for more than 120 seconds. &amp;lt;kern.err&amp;gt;&lt;br/&gt;
2011-09-02 10:18:49 rs249 &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message. &amp;lt;kern.err&amp;gt;&lt;br/&gt;
2011-09-02 10:18:49 rs249 xsolver       D ffff81000101d640     0 23906  23896         23915 23905 (NOTLB) &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 10:18:49 rs249 ffff810162bc9be8 0000000000000046 0000000000000000 0000000000400000 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 10:18:49 rs249 ffff8101d9eff000 0000000000000007 ffff8101eb2ce7f0 ffff81020554c7f0 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 10:18:49 rs249 0000460dc26b8914 000000000000b9f5 ffff8101eb2ce9d8 00000003de55f1e8 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 10:18:49 rs249 Call Trace: &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 10:18:49 rs249 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8002960b&amp;gt;&amp;#93;&lt;/span&gt; sync_page+0x0/0x42 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 10:18:49 rs249 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff80066812&amp;gt;&amp;#93;&lt;/span&gt; io_schedule+0x3f/0x63 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 10:18:49 rs249 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff80029649&amp;gt;&amp;#93;&lt;/span&gt; sync_page+0x3e/0x42 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 10:18:49 rs249 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff80066975&amp;gt;&amp;#93;&lt;/span&gt; __wait_on_bit_lock+0x42/0x78 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 10:18:49 rs249 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff80041222&amp;gt;&amp;#93;&lt;/span&gt; __lock_page+0x64/0x6b &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 10:18:49 rs249 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff800a822d&amp;gt;&amp;#93;&lt;/span&gt; wake_bit_function+0x0/0x2a &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 10:18:49 rs249 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff800140ac&amp;gt;&amp;#93;&lt;/span&gt; find_lock_page+0x69/0xa3 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 10:18:49 rs249 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff888c468d&amp;gt;&amp;#93;&lt;/span&gt; :lustre:ll_file_readv+0xbcd/0x2100 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 10:18:49 rs249 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff888f60f8&amp;gt;&amp;#93;&lt;/span&gt; :lustre:ll_stats_ops_tally+0x48/0xf0 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 10:18:49 rs249 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff888c5bde&amp;gt;&amp;#93;&lt;/span&gt; :lustre:ll_file_read+0x1e/0x20 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 10:18:49 rs249 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8000b80f&amp;gt;&amp;#93;&lt;/span&gt; vfs_read+0xcc/0x172 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 10:18:49 rs249 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff80011fef&amp;gt;&amp;#93;&lt;/span&gt; sys_read+0x47/0x6f &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 10:18:49 rs249 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff80060116&amp;gt;&amp;#93;&lt;/span&gt; system_call+0x7e/0x83 &amp;lt;kern.warning&amp;gt;&lt;/p&gt;

&lt;p&gt;and:&lt;/p&gt;

&lt;p&gt;2011-09-02 03:36:28 rs2166 LustreError: 4284:0:(o2iblnd_cb.c:2984:kiblnd_check_txs()) Timed out tx: tx_queue, 106 seconds &amp;lt;kern.err&lt;br/&gt;
&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 LustreError: 4284:0:(o2iblnd_cb.c:3001:kiblnd_conn_timed_out()) Timed out RDMA on queue ibc_tx_queue (se&lt;br/&gt;
nds that need a credit) &amp;lt;kern.err&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre: conn&lt;span class=&quot;error&quot;&gt;&amp;#91;29&amp;#93;&lt;/span&gt; ffff81037c66e1c0 &lt;span class=&quot;error&quot;&gt;&amp;#91;version 12&amp;#93;&lt;/span&gt; -&amp;gt; 10.1.36.9@o2ib:  &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:    state 3 nposted 1/1 cred 0 o_cred 0 r_cred 8 &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:    ready 0 scheduled -1 comms_err 0 last_send 1033d0ce5 &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:    early_rxs: &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:    tx_queue_nocred: &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:    tx_queue_rsrvd: &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:    tx_queue: &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:       ffffc20010790d10 snd 0 q 1 w 0 rc 0 dl 1033b6b9a cookie 0xd3368 msg !- type d1 cred 2 aqt&lt;br/&gt;
10339d4e1 &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:       ffffc2001078fd70 snd 0 q 1 w 0 rc 0 dl 1033b6b9a cookie 0xd3369 msg !- type d1 cred 0 aqt&lt;br/&gt;
10339d3ea &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:       ffffc20010790798 snd 0 q 1 w 0 rc 0 dl 1033c177a cookie 0xd38c4 msg !- type d1 cred 1 aqt&lt;br/&gt;
1033a9074 &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:       ffffc2001078eb78 snd 0 q 1 w 0 rc 0 dl 1033c177a cookie 0xd38c5 msg !- type d1 cred 0 aqt&lt;br/&gt;
1033a851c &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:       ffffc2001078ffc8 snd 0 q 1 w 0 rc 0 dl 1033ccb2a cookie 0xd3dd9 msg !- type d1 cred 2 aqt&lt;br/&gt;
1033b31b1 &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:       ffffc2001078dca0 snd 0 q 1 w 0 rc 0 dl 1033ccb2a cookie 0xd3ddb msg !- type d1 cred 0 aqt&lt;br/&gt;
1033b2a31 &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:       ffffc20010791350 snd 0 q 1 w 0 rc 0 dl 1033d86aa cookie 0xd43bb msg !- type d1 cred 1 aqt&lt;br/&gt;
1033bff6d &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:       ffffc2001078fb18 snd 0 q 1 w 0 rc 0 dl 1033d86aa cookie 0xd43bc msg !- type d1 cred 1 aqt&lt;br/&gt;
1033bf8fc &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:    active_txs: &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:       ffffc20010790860 snd 1 q 0 w 0 rc 0 dl 1033e9385 cookie 0xd4fcd msg &amp;#8211; type d0 cred 0 aqt&lt;br/&gt;
1033d0ce5 &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:    rxs: &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:       ffff81037defd000 status 0 msg_type d1 cred 2 &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:       ffff81037defd068 status 0 msg_type d1 cred 2 &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:       ffff81037defd0d0 status 0 msg_type d1 cred 0 &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:       ffff81037defd138 status 0 msg_type d1 cred 0 &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:       ffff81037defd1a0 status 0 msg_type d1 cred 2 &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:       ffff81037defd208 status 0 msg_type d1 cred 2 &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:       ffff81037defd270 status 0 msg_type d1 cred 0 &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:       ffff81037defd2d8 status 0 msg_type d1 cred 0 &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:       ffff81037defd340 status 0 msg_type d0 cred 0 &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:       ffff81037defd3a8 status 0 msg_type d1 cred 2 &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:       ffff81037defd410 status 0 msg_type d1 cred 0 &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:       ffff81037defd478 status 0 msg_type d1 cred 2 &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:       ffff81037defd4e0 status 0 msg_type d1 cred 0 &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:       ffff81037defd548 status 0 msg_type d1 cred 0 &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:       ffff81037defd5b0 status 0 msg_type d1 cred 0 &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:       ffff81037defd618 status 0 msg_type d1 cred 2 &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:       ffff81037defd680 status 0 msg_type d1 cred 2 &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:       ffff81037defd6e8 status 0 msg_type d1 cred 0 &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:    ib_qp: qp_state 3 cur_qp_state 3 mtu 4 mig_state 0 &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:    ib_qp: qkey 3411518464 rq_psn 10726368 sq_psn 13326244 dest_qp_num 2623659 &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:    ib_qp: qkey 3411518464 rq_psn 10726368 sq_psn 13326244 dest_qp_num 2623659 &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:    ib_qp_cap: swr 4096 rwr 32 ssge 1 rsge 1 inline 0 &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:    ib_ah_attr     : dlid 21 sl 0 s_p_bits 0 rate 2 flags 0 port 1 &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:    ib_ah_attr(alt): dlid 12433 sl 14 s_p_bits 24 rate 0 flags 1 port 1 &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:    ib_qp: pkey 0 alt_pkey 17 en 3 sq 0 &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:    ib_qp: max_rd 1 max_dest 1 min_rnr 27 port 1 &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 Lustre:    ib_qp: timeout 19 retry 5 rnr_re 6 alt_port 1 alt_timeout 14 &amp;lt;kern.info&amp;gt;&lt;br/&gt;
2011-09-02 03:36:28 rs2166 LustreError: 4284:0:(o2iblnd_cb.c:3079:kiblnd_check_conns()) Timed out RDMA with 10.1.36.9@o2ib (0) &amp;lt;ker&lt;br/&gt;
n.err&amp;gt;&lt;/p&gt;

&lt;p&gt;On the server side we are seeing lot of messages similar to this:&lt;/p&gt;

&lt;p&gt;2011-09-02 05:30:59 oss-scratch14 Lustre: Service thread pid 11707 was inactive for 600.00s. The thread might be hung, or it might &lt;br/&gt;
only be slow and will resume later. Dumping the stack trace for debugging purposes: &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 Pid: 11707, comm: ll_ost_io_318 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14  &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 Call Trace: &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff80066031&amp;gt;&amp;#93;&lt;/span&gt; thread_return+0x5e/0xf6 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff80093b67&amp;gt;&amp;#93;&lt;/span&gt; default_wake_function+0xd/0xf &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff80091f8c&amp;gt;&amp;#93;&lt;/span&gt; __wake_up_common+0x3e/0x68 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff800532b5&amp;gt;&amp;#93;&lt;/span&gt; __wake_up_locked+0x13/0x15 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8029d1c9&amp;gt;&amp;#93;&lt;/span&gt; __down_trylock+0x1c/0x5a &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff80093b5a&amp;gt;&amp;#93;&lt;/span&gt; default_wake_function+0x0/0xf &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff800677dd&amp;gt;&amp;#93;&lt;/span&gt; __down_failed+0x35/0x3a &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff887df498&amp;gt;&amp;#93;&lt;/span&gt; .text.lock.ldlm_pool+0x55/0x7d &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff800227da&amp;gt;&amp;#93;&lt;/span&gt; __up_read+0x7a/0x83 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff887dd3f2&amp;gt;&amp;#93;&lt;/span&gt; ldlm_pools_srv_shrink+0x12/0x20 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff800409ea&amp;gt;&amp;#93;&lt;/span&gt; shrink_slab+0xd3/0x15c &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff800d487f&amp;gt;&amp;#93;&lt;/span&gt; zone_reclaim+0x25f/0x306 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff800d0af1&amp;gt;&amp;#93;&lt;/span&gt; __rmqueue+0x47/0xcb &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8000a96a&amp;gt;&amp;#93;&lt;/span&gt; get_page_from_freelist+0xb6/0x411 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8000f5ad&amp;gt;&amp;#93;&lt;/span&gt; __alloc_pages+0x78/0x30e &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8001662f&amp;gt;&amp;#93;&lt;/span&gt; alloc_pages_current+0x9f/0xa8 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff800ce803&amp;gt;&amp;#93;&lt;/span&gt; __page_cache_alloc+0x6d/0x71 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff800265fe&amp;gt;&amp;#93;&lt;/span&gt; find_or_create_page+0x37/0x7b &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff88af34a8&amp;gt;&amp;#93;&lt;/span&gt; filter_get_page+0x38/0x70 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdfilter&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff88af569a&amp;gt;&amp;#93;&lt;/span&gt; filter_preprw+0x146a/0x1d30 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdfilter&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff887ae1f9&amp;gt;&amp;#93;&lt;/span&gt; lock_handle_addref+0x9/0x10 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff88745c91&amp;gt;&amp;#93;&lt;/span&gt; class_handle2object+0xe1/0x170 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff887ae192&amp;gt;&amp;#93;&lt;/span&gt; lock_res_and_lock+0xc2/0xe0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff88a9ff77&amp;gt;&amp;#93;&lt;/span&gt; ost_brw_write+0xf67/0x2410 &lt;span class=&quot;error&quot;&gt;&amp;#91;ost&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff887ef928&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_send_reply+0x5f8/0x610 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff887f3eb0&amp;gt;&amp;#93;&lt;/span&gt; lustre_msg_check_version_v2+0x10/0x30 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff887f4642&amp;gt;&amp;#93;&lt;/span&gt; lustre_msg_check_version+0x22/0x80 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff88aa4053&amp;gt;&amp;#93;&lt;/span&gt; ost_handle+0x2c33/0x5690 &lt;span class=&quot;error&quot;&gt;&amp;#91;ost&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8015f9f8&amp;gt;&amp;#93;&lt;/span&gt; __next_cpu+0x19/0x28 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8007a442&amp;gt;&amp;#93;&lt;/span&gt; smp_send_reschedule+0x4a/0x50 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff887f3cf5&amp;gt;&amp;#93;&lt;/span&gt; lustre_msg_get_opc+0x35/0xf0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8880342e&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_server_handle_request+0x96e/0xdc0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff88803b8a&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_wait_event+0x30a/0x320 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff88804b06&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_main+0xf66/0x1110 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8006101d&amp;gt;&amp;#93;&lt;/span&gt; child_rip+0xa/0x11 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff88803ba0&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_main+0x0/0x1110 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 05:30:59 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff80061013&amp;gt;&amp;#93;&lt;/span&gt; child_rip+0x0/0x11 &amp;lt;kern.warning&amp;gt;&lt;/p&gt;

&lt;p&gt;and:&lt;/p&gt;

&lt;p&gt;2011-09-01 19:47:14 oss-scratch14  &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 LustreError: dumping log to /lustre-tmp/oss-scratch14.1314928034.10857 &amp;lt;kern.alert&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 Lustre: scratch1-OST0034: slow start_page_write 600s due to heavy IO load &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 Lustre: Service thread pid 11729 was inactive for 600.00s. The thread might be hung, or it might &lt;br/&gt;
only be slow and will resume later. Dumping the stack trace for debugging purposes: &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 Pid: 11729, comm: ll_ost_io_340 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14  &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 Call Trace: &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff80067b32&amp;gt;&amp;#93;&lt;/span&gt; __down+0xc5/0xd9 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff80093b5a&amp;gt;&amp;#93;&lt;/span&gt; default_wake_function+0x0/0xf &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff800677dd&amp;gt;&amp;#93;&lt;/span&gt; __down_failed+0x35/0x3a &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff887df498&amp;gt;&amp;#93;&lt;/span&gt; .text.lock.ldlm_pool+0x55/0x7d &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff800227da&amp;gt;&amp;#93;&lt;/span&gt; __up_read+0x7a/0x83 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff887dd3f2&amp;gt;&amp;#93;&lt;/span&gt; ldlm_pools_srv_shrink+0x12/0x20 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff800409ea&amp;gt;&amp;#93;&lt;/span&gt; shrink_slab+0xd3/0x15c &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff800d487f&amp;gt;&amp;#93;&lt;/span&gt; zone_reclaim+0x25f/0x306 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff800d0af1&amp;gt;&amp;#93;&lt;/span&gt; __rmqueue+0x47/0xcb &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8000a96a&amp;gt;&amp;#93;&lt;/span&gt; get_page_from_freelist+0xb6/0x411 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8000f5ad&amp;gt;&amp;#93;&lt;/span&gt; __alloc_pages+0x78/0x30e &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8001662f&amp;gt;&amp;#93;&lt;/span&gt; alloc_pages_current+0x9f/0xa8 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff800ce803&amp;gt;&amp;#93;&lt;/span&gt; __page_cache_alloc+0x6d/0x71 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff800265fe&amp;gt;&amp;#93;&lt;/span&gt; find_or_create_page+0x37/0x7b &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff88af34a8&amp;gt;&amp;#93;&lt;/span&gt; filter_get_page+0x38/0x70 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdfilter&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff88af569a&amp;gt;&amp;#93;&lt;/span&gt; filter_preprw+0x146a/0x1d30 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdfilter&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff887ae1f9&amp;gt;&amp;#93;&lt;/span&gt; lock_handle_addref+0x9/0x10 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff88745c91&amp;gt;&amp;#93;&lt;/span&gt; class_handle2object+0xe1/0x170 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff887ae192&amp;gt;&amp;#93;&lt;/span&gt; lock_res_and_lock+0xc2/0xe0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff88a9ff77&amp;gt;&amp;#93;&lt;/span&gt; ost_brw_write+0xf67/0x2410 &lt;span class=&quot;error&quot;&gt;&amp;#91;ost&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff887ef928&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_send_reply+0x5f8/0x610 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff887f3eb0&amp;gt;&amp;#93;&lt;/span&gt; lustre_msg_check_version_v2+0x10/0x30 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff887f4642&amp;gt;&amp;#93;&lt;/span&gt; lustre_msg_check_version+0x22/0x80 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff88aa4053&amp;gt;&amp;#93;&lt;/span&gt; ost_handle+0x2c33/0x5690 &lt;span class=&quot;error&quot;&gt;&amp;#91;ost&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8015f9f8&amp;gt;&amp;#93;&lt;/span&gt; __next_cpu+0x19/0x28 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8007a442&amp;gt;&amp;#93;&lt;/span&gt; smp_send_reschedule+0x4a/0x50 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff887f3cf5&amp;gt;&amp;#93;&lt;/span&gt; lustre_msg_get_opc+0x35/0xf0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8880342e&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_server_handle_request+0x96e/0xdc0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff88803b8a&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_wait_event+0x30a/0x320 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff88804b06&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_main+0xf66/0x1110 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8006101d&amp;gt;&amp;#93;&lt;/span&gt; child_rip+0xa/0x11 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff88803ba0&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_main+0x0/0x1110 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff80061013&amp;gt;&amp;#93;&lt;/span&gt; child_rip+0x0/0x11 &amp;lt;kern.warning&amp;gt;&lt;/p&gt;

&lt;p&gt;and:&lt;/p&gt;

&lt;p&gt;2011-09-02 10:28:09 oss-scratch14 Lustre: Skipped 11 previous similar messages &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-02 10:28:09 oss-scratch14 Lustre: scratch1-OST0034: Client fb914147-23d5-58f1-2781-03d1a9f96701 (at 10.1.3.226@o2ib) refuse&lt;br/&gt;
d reconnection, still busy with 1 active RPCs &amp;lt;kern.warning&amp;gt;&lt;/p&gt;

&lt;p&gt;When a particular server starts getting bound we can see the load average go up to greater than 700 (on an 8 core server). &lt;/p&gt;

&lt;p&gt;Another point: We also implemented quotas as a means to quickly see disk usage (to replace du as the defacto method), but when problems presented themselves we removed quotas from the configuration (i.e., used tunefs.lustre to reset the parameters). There is a question as to whether the quota files that may persist on the block devices may be contributing. I did see some discussion on lustre-discuss with regard to these files but there was no real information on impact if quotas were disabled or how to remove them if there was.&lt;/p&gt;

&lt;p&gt;I have also attached one of the many lustre dumps. Hopefully this information is sufficient to at least set a starting point for analyzing the problem. &lt;/p&gt;</description>
                <environment>Redsky Cluster - Oracle (Sun) C48 Blade servers ~2700 nodes, running LLNL Chaos version 4.4.3 (TOSS 1.4.3) with lustre version 1.8.5.0-5. Blades consist of 2 nahalem 4 core procs with 12GB mem. Networking is QDR-IB only using a 3D torus routing algorithm. Storage is software RAID 6 8+2 (mdadm) running on Oracle (Sun) J4400 JBODs. More specifically:&lt;br/&gt;
&lt;br/&gt;
OFED1.5&lt;br/&gt;
Kernel 2.6.18-107 + two patches:&lt;br/&gt;
Lustre kernel patch (raid10-soft-lockups.patch)&lt;br/&gt;
OFED kernel patch (mad-qp-size-tunable.patch)&lt;br/&gt;
Lustre-1.8.5.0-3chaos, which seems to be  Lustre-1.8.5.0-3 + five patches:&lt;br/&gt;
ff2ef0c &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-337&quot; title=&quot;Processes stuck in sync_page on lustre client&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-337&quot;&gt;&lt;strike&gt;LU-337&lt;/strike&gt;&lt;/a&gt; Fix alloc mask in alloc_qinfo()&lt;br/&gt;
f9e0e36 &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-234&quot; title=&quot;OOM killer causes node hang&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-234&quot;&gt;&lt;strike&gt;LU-234&lt;/strike&gt;&lt;/a&gt; OOM killer causes node hang.&lt;br/&gt;
09eb8f9 &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-286&quot; title=&quot;racer: general protection fault: 0000 [1] SMP RIP: __wake_up_common+60}&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-286&quot;&gt;&lt;strike&gt;LU-286&lt;/strike&gt;&lt;/a&gt; racer: general protection fault.&lt;br/&gt;
f5a9068 &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-274&quot; title=&quot;Client delayed file status (cache meta-data) causing job failures&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-274&quot;&gt;&lt;strike&gt;LU-274&lt;/strike&gt;&lt;/a&gt; Update LVB from disk when glimpse callback return error&lt;br/&gt;
c4d695f Add IP to error message when peer&amp;#39;s IB port is not privileged.&lt;br/&gt;
</environment>
        <key id="11665">LU-659</key>
            <summary>Experiencing heavy IO load, client eviction and RPC timeouts after upgrade to lustre-1.8.5.0-5 (chaos release)</summary>
                <type id="5" iconUrl="https://jira.whamcloud.com/images/icons/issuetypes/epic.png">Epic</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="5">Cannot Reproduce</resolution>
                                        <assignee username="niu">Niu Yawei</assignee>
                                    <reporter username="jamervi">Joe Mervini</reporter>
                        <labels>
                            <label>o2iblnd</label>
                    </labels>
                <created>Fri, 2 Sep 2011 14:36:47 +0000</created>
                <updated>Wed, 16 Nov 2011 14:37:52 +0000</updated>
                            <resolved>Thu, 27 Oct 2011 09:44:13 +0000</resolved>
                                                                        <due></due>
                            <votes>0</votes>
                                    <watches>16</watches>
                                                                            <comments>
                            <comment id="19898" author="green" created="Fri, 2 Sep 2011 17:17:59 +0000"  >&lt;p&gt;Did you always get this many IB errors?&lt;br/&gt;
Do you have read only cache enabled on the OST? the OST trace shows things are stuck in memory allocation, and I remember there were some fixes in that area, ie both the cache and in mem callbacks since 1.8.5&lt;/p&gt;</comment>
                            <comment id="19900" author="jamervi" created="Fri, 2 Sep 2011 17:31:42 +0000"  >&lt;p&gt;&lt;br/&gt;
We have read_cache_enable set to 1 (enabled) if that is what you mean. Early on (1.8.0.1) we had this disabled because of CPU soft lockups but re-enabled it when we went to 1.8.3.  &lt;/p&gt;</comment>
                            <comment id="19901" author="jamervi" created="Fri, 2 Sep 2011 17:40:08 +0000"  >&lt;p&gt;Oleg, did you mean that we should disable read_cache_enable?&lt;/p&gt;</comment>
                            <comment id="19902" author="cliffw" created="Fri, 2 Sep 2011 17:41:50 +0000"  >&lt;p&gt;I would suggest turning off the read cache to see if it helps.&lt;/p&gt;</comment>
                            <comment id="19903" author="adilger" created="Fri, 2 Sep 2011 17:48:38 +0000"  >&lt;p&gt;Joe, do you know what else changed in your environment as part of the TOSS 1.3.4 -&amp;gt; 1.4.3 upgrade?  Did this include a new major version of OFED?  I&apos;m just wondering whether there are perhaps some complications at the IB layer that are causing the timeouts and reconnection?  What was the previous kernel version?&lt;/p&gt;

&lt;p&gt;IIRC (not being a networking expert, and only hearing about this tangentially) that the Sun IB topology for Red Sky was non-standard and involved changes to the subnet manager to implement the torus topology?  Maybe I&apos;m recalling some other site, but I thought I would inquire.&lt;/p&gt;

&lt;p&gt;Secondly, with the older setup, were there limits put on the number of OST threads active at one time, that might have been lost with the upgrade (e.g. &quot;option ost ost_num_threads=128&quot; in /etc/modprobe.conf or similar)?  How many OSTs per OSS are there?  I know Kevin had done this at some of the J4400 sites in order to avoid overloading the OSTs, with the optimum performance being seen at about 32 threads per OST.&lt;/p&gt;</comment>
                            <comment id="19904" author="jamervi" created="Fri, 2 Sep 2011 18:29:33 +0000"  >&lt;p&gt;Andreas, There were many changes to the environment as a result of the upgrade. I believe this also included changes to OFED. Also you are correct in your recollection of our IB configuration. Matthew Bohnsack is going to respond more authoritatively on that and the network topology and torus configuration.&lt;/p&gt;

&lt;p&gt;Our configuration is 4 OSTs/OSS. With regard to the number of threads, we had always had the option of reducing the thread count in our back pocket. But we never changed it because the file system was fairly tolerant with the 512 thread pool and we wanted maximum performance. But we can always pull that trigger if you all determine that is the best course of action.&lt;/p&gt;</comment>
                            <comment id="19927" author="mpbohns" created="Fri, 2 Sep 2011 19:19:08 +0000"  >&lt;p&gt;Andreas,&lt;/p&gt;

&lt;p&gt;Before upgrade:&lt;/p&gt;

&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;Distro: chaos-release-4.3-4.ch4.3&lt;/li&gt;
	&lt;li&gt;Kernel: 2.6.18-93.2redsky_chaos&lt;/li&gt;
	&lt;li&gt;OFED: 1.4&lt;/li&gt;
	&lt;li&gt;Lustre: 1.8.3.0-7&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;After upgrade:&lt;/p&gt;

&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;Distro: chaos-release-4.4-3.ch4.4&lt;/li&gt;
	&lt;li&gt;Kernel: 2.6.18-107redsky_chaos&lt;/li&gt;
	&lt;li&gt;OFED: 1.5&lt;/li&gt;
	&lt;li&gt;Lustre: 1.8.5.0-5&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;There could be complications at the IB layer causing timeouts and reconnections, but we&apos;re not seeing them at the lower IB layers.  We monitor the state of the IB fabric very carefully, and while these Lustre problems are occurring, a) the SM is stable, and b) we see almost zero symbol errors, receive errors, etc. on the fabric.  We see zero IB counters that would indicate congestion or deadlock.&lt;/p&gt;

&lt;p&gt;Red Sky is arranged as a 3-torus and this involved changes to the SM to make things work.  In addition to the SM being aware of the torus, we needed to make sure that IB ULPs did path record queries that honored the bits set in the returned service level (SL).  For example, MPI didn&apos;t do this correctly, before we fixed it.  None of this has been an issue for us for a couple years, but who knows, something may have come up in the most recent OFED/kernel/Lustre that is biting us.&lt;/p&gt;

&lt;p&gt;There&apos;s a published presentation on the Red Sky torus implementation, if you&apos;re interested.  It covers things in more detail: &lt;a href=&quot;http://www.openfabrics.org/archives/spring2010sonoma/Wednesday/9.00%20Marcus%20Epperson%20Red%20Sky/OFED%202010%20RedSky%20IB.pdf&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://www.openfabrics.org/archives/spring2010sonoma/Wednesday/9.00%20Marcus%20Epperson%20Red%20Sky/OFED%202010%20RedSky%20IB.pdf&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;I think Joe answered your OST thread question.&lt;/p&gt;</comment>
                            <comment id="19928" author="adilger" created="Fri, 2 Sep 2011 19:34:14 +0000"  >&lt;p&gt;Looking at this more closely, it isn&apos;t clear that the server thread count is the real culprit, though I was confused to see the server load at 700, even though the maximum thread count is 512.  That may be caused by the clients timing out and trying to reconnect via a separate thread, while all of the IO threads are blocked.&lt;/p&gt;

&lt;p&gt;The stack trace shows a potential deadlock, possibly caused by &quot;lockless IO&quot;, where the OST thread is grabbing the DLM lock on behalf of the client, but the memory pressure is causing the page allocations to recurse into the DLM and getting stuck on the lock the thread is already holding (or some other combination of A/B/C/.../A lock deadlock).&lt;/p&gt;

&lt;p&gt; __down_failed+0x35/0x3a &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
 .text.lock.ldlm_pool+0x55/0x7d &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
 ldlm_pools_srv_shrink+0x12/0x20 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
 shrink_slab+0xd3/0x15c &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
 alloc_pages_current+0x9f/0xa8 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
 __page_cache_alloc+0x6d/0x71 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
 find_or_create_page+0x37/0x7b &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
 filter_get_page+0x38/0x70 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdfilter&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
 filter_preprw+0x146a/0x1d30 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdfilter&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
 lock_handle_addref+0x9/0x10 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
 lock_res_and_lock+0xc2/0xe0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
 ost_brw_write+0xf67/0x2410 &lt;span class=&quot;error&quot;&gt;&amp;#91;ost&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;/p&gt;

&lt;p&gt;Two potential short-term workarounds for this:&lt;/p&gt;
&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;disable the read cache on the OSS nodes, to avoid memory pressure when&lt;br/&gt;
  allocating the pages from ost_brw_write().  They currently do not use&lt;br/&gt;
  GFP_NOFS, because otherwise there is no memory pressure on the OSS to&lt;br/&gt;
  free up pages on the OSSes.&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;  lctl set_param obdfilter.*.readcache_enable=0&lt;/p&gt;

&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;disable lockless IO on the OSSes for the time being.  Check if this is&lt;br/&gt;
  enabled first, to see if this is really the culprit (though from the&lt;br/&gt;
  stack I would say yes):&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;  lctl get_param ldlm.namespaces.*.max_nolock_bytes&lt;/p&gt;

&lt;p&gt;  and disable it via:&lt;br/&gt;
  lctl set_param ldlm.namespaces.*.max_nolock_bytes=0&lt;/p&gt;</comment>
                            <comment id="19929" author="adilger" created="Fri, 2 Sep 2011 19:42:19 +0000"  >&lt;p&gt;Hmm, I&apos;m not 100% positive about the max_nolock_bytes=0 setting, since this appears to be the default.  Disabling the read cache should avoid this problem I think.&lt;/p&gt;</comment>
                            <comment id="19930" author="adilger" created="Fri, 2 Sep 2011 19:44:13 +0000"  >&lt;p&gt;Also, I don&apos;t see it mentioned anywhere what the previous version of Lustre was?&lt;/p&gt;</comment>
                            <comment id="19931" author="adilger" created="Fri, 2 Sep 2011 19:44:35 +0000"  >&lt;p&gt;Doh, 1.8.3, if I actually open my eyes...&lt;/p&gt;</comment>
                            <comment id="19932" author="pjones" created="Fri, 2 Sep 2011 19:58:47 +0000"  >&lt;p&gt;Johann,&lt;/p&gt;

&lt;p&gt;Can you please comment?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="19933" author="jamervi" created="Fri, 2 Sep 2011 22:10:22 +0000"  >&lt;p&gt;Andres - I meant to mention in my last post that I &lt;em&gt;did&lt;/em&gt; turn off read_cache_enable poking the proc entry via &quot;cat 0 &amp;gt; /proc/fs/lustre/obdfilter/&amp;lt;OST&amp;gt;/read_cache_enable&quot;. I am assuming this accomplishes the same thing as &quot;lctl set_param obdfilter.*.readcache_enable=0&quot;. Doing this on the live file system did not provide any positive results.&lt;/p&gt;

&lt;p&gt;I also checked the state of /proc/fs/lustre/ldlm/namespaces/filter-&amp;lt;OST&amp;gt;/max_nolock_bytes on each of the 4 OSTs and they are all set to 0.&lt;/p&gt;</comment>
                            <comment id="19934" author="adilger" created="Sat, 3 Sep 2011 05:36:12 +0000"  >&lt;p&gt;Joe, when you write &quot;did not provide any positive results&quot; does that mean the OSS nodes are still hanging?  Had they already dumped stacks (watchdog stack dumps due to inactivity, with ldlm_pools_srv_shrink() on the stack)?  If that is the case, then disabling read cache would be too late, and would not &quot;unblock&quot; the OSS.  Any OSS nodes that are already hung need to be rebooted, and the &quot;lctl set_param&quot; (or equivalent &quot;echo 0 &amp;gt;&quot;) needs to be run right after boot.&lt;/p&gt;</comment>
                            <comment id="19936" author="jamervi" created="Sat, 3 Sep 2011 14:36:33 +0000"  >&lt;p&gt;Andres, the short answer is: yes, the OSSs were still hanging. Today I preemptively restarted all the OSS servers disabling read_cache_enable at mount time. We&apos;ll see if there are any positive effects.&lt;/p&gt;

&lt;p&gt;One thing that I did notice that I hadn&apos;t before is that I am also seeing blocked tasks on the OSS nodes during boot time (below). Since we do not start either lustre or mdadm automatically (we do it via nodeup scripts) I personally can not see how either are contributing to these messages being generated. But that is beyond my understanding of both linux and lustre internals. &lt;/p&gt;

&lt;p&gt;INFO: task modprobe:1778 blocked for more than 120 seconds.&lt;br/&gt;
&quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.&lt;br/&gt;
modprobe      D ffff810001004440     0  1778   1742                     (NOTLB)&lt;br/&gt;
ffff8101dcf0fd78 0000000000000046 ffff81025e5eefff ffff8101de3de678&lt;br/&gt;
0000000000000001 0000000000000007 ffff8101ff54b040 ffffffff80324b60&lt;br/&gt;
00000055e1c307ae 0000000000002c68 ffff8101ff54b228 0000000080091f8c&lt;br/&gt;
Call Trace:&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff80067b32&amp;gt;&amp;#93;&lt;/span&gt; __down+0xc5/0xd9&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff80093b5a&amp;gt;&amp;#93;&lt;/span&gt; default_wake_function+0x0/0xf&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff800677dd&amp;gt;&amp;#93;&lt;/span&gt; __down_failed+0x35/0x3a&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff801dc223&amp;gt;&amp;#93;&lt;/span&gt; __driver_attach+0x0/0xb6&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff801dc2c2&amp;gt;&amp;#93;&lt;/span&gt; __driver_attach+0x9f/0xb6&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff801dc223&amp;gt;&amp;#93;&lt;/span&gt; __driver_attach+0x0/0xb6&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff801db9e9&amp;gt;&amp;#93;&lt;/span&gt; bus_for_each_dev+0x49/0x7a&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff801dc051&amp;gt;&amp;#93;&lt;/span&gt; driver_attach+0x1c/0x1e&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff801db5ee&amp;gt;&amp;#93;&lt;/span&gt; bus_add_driver+0x78/0x116&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff801dc4fc&amp;gt;&amp;#93;&lt;/span&gt; driver_register+0x8f/0x93&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8016dff8&amp;gt;&amp;#93;&lt;/span&gt; __pci_register_driver+0x5a/0xb7&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff882f4017&amp;gt;&amp;#93;&lt;/span&gt; :pata_acpi:pacpi_init+0x17/0x19&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff800aeb64&amp;gt;&amp;#93;&lt;/span&gt; sys_init_module+0xb0/0x1f5&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff80060116&amp;gt;&amp;#93;&lt;/span&gt; system_call+0x7e/0x83&lt;/p&gt;

&lt;p&gt;INFO: task modprobe:1901 blocked for more than 120 seconds.&lt;br/&gt;
&quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.&lt;br/&gt;
modprobe      D ffff810205462040     0  1901   1809                     (NOTLB)&lt;br/&gt;
ffff8101dce3fd58 0000000000000046 00000002fe821000 0000000000000002&lt;br/&gt;
00000006dce3fd48 0000000000000007 ffff8101fe24c7f0 ffff8102056bb7f0&lt;br/&gt;
00000055e16baed4 0000000000002556 ffff8101fe24c9d8 0000000600000096&lt;br/&gt;
Call Trace:&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff80067b32&amp;gt;&amp;#93;&lt;/span&gt; __down+0xc5/0xd9&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff80093b5a&amp;gt;&amp;#93;&lt;/span&gt; default_wake_function+0x0/0xf&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff800677dd&amp;gt;&amp;#93;&lt;/span&gt; __down_failed+0x35/0x3a&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff801dc2c2&amp;gt;&amp;#93;&lt;/span&gt; __driver_attach+0x9f/0xb6&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff801dc223&amp;gt;&amp;#93;&lt;/span&gt; __driver_attach+0x0/0xb6&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff801db9e9&amp;gt;&amp;#93;&lt;/span&gt; bus_for_each_dev+0x49/0x7a&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff801dc051&amp;gt;&amp;#93;&lt;/span&gt; driver_attach+0x1c/0x1e&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff801db5ee&amp;gt;&amp;#93;&lt;/span&gt; bus_add_driver+0x78/0x116&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff801dc4fc&amp;gt;&amp;#93;&lt;/span&gt; driver_register+0x8f/0x93&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8016dff8&amp;gt;&amp;#93;&lt;/span&gt; __pci_register_driver+0x5a/0xb7&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff883d5074&amp;gt;&amp;#93;&lt;/span&gt; :i7core_edac:i7core_init+0x74/0x98&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff800aeb64&amp;gt;&amp;#93;&lt;/span&gt; sys_init_module+0xb0/0x1f5&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff80060116&amp;gt;&amp;#93;&lt;/span&gt; system_call+0x7e/0x83&lt;/p&gt;</comment>
                            <comment id="19937" author="jamervi" created="Sat, 3 Sep 2011 18:42:57 +0000"  >&lt;p&gt;The scratch file system was only usable for a short period of time after I posted my last remarks (I was away for several hours but others who were monitoring the system reported hung clients and load averages on the servers in the hundreds.) So I think it is safe to say that turning off read cache provided no relief.&lt;/p&gt;</comment>
                            <comment id="19940" author="adilger" created="Sun, 4 Sep 2011 01:19:32 +0000"  >&lt;p&gt;Joe, I&apos;m assuming that since you run the chaos release that this is the same code that is being run at LLNL?  Is this problem specific to a particular user job, or is it happening regardless of the type of job being run?&lt;/p&gt;

&lt;p&gt;If I provided you with a Lustre patch to avoid the allocations recursing into the DLM, can you build it, or do you normally get pre-built RPMs?&lt;/p&gt;

&lt;p&gt;The other question is if it would be worthwhile to try the 1.8.6.wc1 release on a handful of OSS nodes to see if it avoids the problem?&lt;/p&gt;</comment>
                            <comment id="19941" author="pjones" created="Sun, 4 Sep 2011 07:46:01 +0000"  >&lt;p&gt;Niu&lt;/p&gt;

&lt;p&gt;Could you please help out with this one?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="19943" author="adilger" created="Sun, 4 Sep 2011 12:09:22 +0000"  >&lt;p&gt;Joe, if you build your own Lustre packages, you could first try running configure with the &quot;--disable-lru-resize&quot; option (along with any other options you normally use) and installing this on some/all of the OSS nodes. &lt;/p&gt;</comment>
                            <comment id="19944" author="jamervi" created="Sun, 4 Sep 2011 12:54:29 +0000"  >&lt;p&gt;Andres, At this point we have decided to pull back on any modifications to the system in favor of  trying to isolate the problem by removing the local software raid lustre scratch file system from the equation and having users run out of our site file system. I think for now, we should plan on taking this up again Tuesday as this is a long holiday weekend for us. But I was wondering if you had any opinion on the blocked tasks during boot up? One thing that we failed to mention that is relevant to the environment is that the system is essentially a diskless configuration where the OS is delivered to all nodes - with the exception of bootnodes - via NFS over IB.   &lt;/p&gt;</comment>
                            <comment id="19945" author="adilger" created="Sun, 4 Sep 2011 14:43:40 +0000"  >&lt;p&gt;If there is no swap on the nodes, then the kernel has to work much harder in case of memory pressure. If you are booting diskless I would still recommend having a small swap partition on the server nodes.&lt;/p&gt;

&lt;p&gt;The pauses during boot may well relate to this, if the boot code is doing any upcalls to load configs or firmware. &lt;/p&gt;</comment>
                            <comment id="19957" author="johann" created="Mon, 5 Sep 2011 08:01:39 +0000"  >&lt;p&gt;&amp;gt; Two potential short-term workarounds for this:&lt;br/&gt;
&amp;gt;&lt;br/&gt;
&amp;gt; disable the read cache on the OSS nodes, to avoid memory pressure when&lt;br/&gt;
&amp;gt; allocating the pages from ost_brw_write(). They currently do not use&lt;br/&gt;
&amp;gt; GFP_NOFS, because otherwise there is no memory pressure on the OSS to&lt;br/&gt;
&amp;gt; free up pages on the OSSes.&lt;br/&gt;
&amp;gt; lctl set_param obdfilter.*.readcache_enable=0&lt;/p&gt;

&lt;p&gt;Actually this just disables the cache for bulk reads, but leaves pages in the page cache for bulk writes.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;e.g.:
# lctl get_param obdfilter.*.*cache_enable
obdfilter.lustre-OST0001.read_cache_enable=4294967295
obdfilter.lustre-OST0001.writethrough_cache_enable=4294967295
# lctl set_param obdfilter.*.read_cache_enable=0
obdfilter.lustre-OST0001.read_cache_enable=0
# lctl get_param obdfilter.*.*cache_enable
obdfilter.lustre-OST0001.read_cache_enable=0
obdfilter.lustre-OST0001.writethrough_cache_enable=4294967295
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;To disable both, you should run:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# lctl set_param obdfilter.*.*cache_enable=0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;For pages which are already in cache, you can free them with &quot;echo 1 &amp;gt; /proc/sys/vm/drop_caches&quot;&lt;br/&gt;
After that, you should really no longer see any memory pressure on the OSSs.&lt;/p&gt;

&lt;p&gt;Could you please runs those two commands and check that the OSSs are never under memory pressure again (with top, free, vmstat, ...)?&lt;/p&gt;

&lt;p&gt;&amp;gt; disable lockless IO on the OSSes for the time being. Check if this is&lt;br/&gt;
&amp;gt; enabled first, to see if this is really the culprit (though from the&lt;br/&gt;
&amp;gt; stack I would say yes):&lt;br/&gt;
&amp;gt; lctl get_param ldlm.namespaces.*.max_nolock_bytes&lt;br/&gt;
&amp;gt;&lt;br/&gt;
&amp;gt; and disable it via:&lt;br/&gt;
&amp;gt; lctl set_param ldlm.namespaces.*.max_nolock_bytes=0&lt;/p&gt;

&lt;p&gt;Unless this has been explicitly enabled, lockless I/Os are disabled by default, BUT are still used for direct I/Os. &lt;/p&gt;</comment>
                            <comment id="19974" author="morrone" created="Tue, 6 Sep 2011 19:24:20 +0000"  >&lt;p&gt;FYI, if you are using our tree and having o2iblnd problems, we have an additional couple of histograms that might be useful in:&lt;/p&gt;

&lt;p&gt;  /proc/sys/lnet/o2iblnd/stats&lt;/p&gt;
</comment>
                            <comment id="20035" author="niu" created="Wed, 7 Sep 2011 22:25:17 +0000"  >&lt;p&gt;Hi, Joe&lt;/p&gt;

&lt;p&gt;Any news on the problem isolation? I think it&apos;s worth while to try disabling the lru-resize as Andreas suggested, and it&apos;ll be great if we can catch the full stack on mds when you hit the problem again.&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;</comment>
                            <comment id="20039" author="jamervi" created="Thu, 8 Sep 2011 08:42:58 +0000"  >&lt;p&gt;Niu, We are starting to look at user(s) jobs as the source of the problem. We had a similar problem that occurred on another one of our lustre file systems where the MDS was getting buried by multiple user jobs (same user) were running chgrp recursively on a directory with ~4.5M files/directories (this was reported separately in another LU).&lt;/p&gt;

&lt;p&gt;WRT rebuilding lustre with lru-resize disabled, we have not gone down that path yet. Because of the complexity of this particular environment (diskless, IB only network, etc.) it is more complicated than a simple drop-in of the new bits on the servers. However, we have implemented the non-invasive changes that Johann has recommended and will start testing today.&lt;/p&gt;</comment>
                            <comment id="20041" author="jamervi" created="Thu, 8 Sep 2011 08:58:59 +0000"  >&lt;p&gt;One other observation: It appears that we have been kind of chasing our tails with the &quot;...blocked for more than 120 seconds&quot; messages we were seeing in dmesg and syslogs. Come to find out that this particular error message is new with the new kernel (We checked through the source of the previous kernel and this reporting didn&apos;t exist). Based on the volume of questions posted on the web, it seems like this is a commonly observed condition. The question is, is there reason for concern? &lt;/p&gt;</comment>
                            <comment id="20043" author="niu" created="Thu, 8 Sep 2011 09:50:38 +0000"  >&lt;blockquote&gt;
&lt;p&gt;One other observation: It appears that we have been kind of chasing our tails with the &quot;...blocked for more than 120 seconds&quot; messages we were seeing in dmesg and syslogs. Come to find out that this particular error message is new with the new kernel (We checked through the source of the previous kernel and this reporting didn&apos;t exist). Based on the volume of questions posted on the web, it seems like this is a commonly observed condition. The question is, is there reason for concern? &lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;When some process hasn&apos;t been scheduled for a long time (usually caused by deadlock or the system is overload), kernel will print such kind of message to warn user that there is something wrong.&lt;/p&gt;</comment>
                            <comment id="20045" author="niu" created="Thu, 8 Sep 2011 10:00:29 +0000"  >&lt;blockquote&gt;
&lt;p&gt;Niu, We are starting to look at user(s) jobs as the source of the problem. We had a similar problem that occurred on another one of our lustre file systems where the MDS was getting buried by multiple user jobs (same user) were running chgrp recursively on a directory with ~4.5M files/directories (this was reported separately in another LU).&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;What&apos;s the LU number?&lt;/p&gt;</comment>
                            <comment id="20048" author="jamervi" created="Thu, 8 Sep 2011 12:33:47 +0000"  >&lt;p&gt;The other LU is &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-667&quot; title=&quot;Experiencing sluggish, intermittently unresponsive, and OOM killed MDS nodes&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-667&quot;&gt;&lt;del&gt;LU-667&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="20101" author="niu" created="Thu, 8 Sep 2011 23:28:31 +0000"  >&lt;blockquote&gt;
&lt;p&gt;the MDS was getting buried by multiple user jobs (same user) were running chgrp recursively on a directory with ~4.5M files/directories&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;Hi, Joe&lt;/p&gt;

&lt;p&gt;How many clients are performing such jobs? are they working on same directory? and what&apos;s the client&apos;s memory size and MDS memory size? Did you run the same jobs and hit the same problem before upgrading?&lt;/p&gt;

&lt;p&gt;Now I&apos;m suspecting that with lru-resize enabled, client can cache much more dlm locks, though MDS can inform client to cancel locks when it&apos;s getting overloaded, the client might not be able to respond quickly enough to relieve the situation.&lt;/p&gt;

&lt;p&gt;If it&apos;s possible, it&apos;s better to get the full stack trace on MDS and the statistics of /proc/fs/lustre/ldlm/namespaces/xxx/* and pool on both client and MDS when you hit the problem again. Thank you.&lt;/p&gt;</comment>
                            <comment id="20208" author="smonk" created="Wed, 14 Sep 2011 10:09:01 +0000"  >&lt;p&gt;FYI..To allow our users to get some work done, we are limiting the use of this Red Sky local Lustre file system in favor of our site Lustre file system.  We plan on testing the fixes mentioned next week as part of a maintenance time and will report back accordingly.&lt;br/&gt;
Thanks,&lt;br/&gt;
Steve&lt;/p&gt;</comment>
                            <comment id="20513" author="jamervi" created="Mon, 26 Sep 2011 14:11:13 +0000"  >&lt;p&gt;During our dedicated application time we were able to recreate the high load and lustre hang condition limited to 4 user jobs - with the suspicion that either one or both of 2 codes being the trigger. Other fallout that occurred with this hang condition that wasn&apos;t mentioned before is that we began having problems with booting servers and client nodes via gpxe over IB, and in the case when the problem first arose, a complete system restart including a flush of the subnet manager was required to get the system operational again. &lt;/p&gt;

&lt;p&gt;Today we are going to try and duplicate the condition on our test platform. If we are successful, we try to zero in on which code is the culprit. Otherwise we suspect it might be a scaling issue.&lt;/p&gt;

&lt;p&gt;One question that came up in a meeting this morning: Is it possible to assign LNET its own P_KEY when running over IB? The idea is that if we isolate LNET traffic to its own lane and QOS we can see whether it is causing contention on the fabric.&lt;/p&gt;</comment>
                            <comment id="20514" author="adilger" created="Mon, 26 Sep 2011 14:56:16 +0000"  >&lt;p&gt;.bq &amp;gt; Is it possible to assign LNET its own P_KEY when running over IB? The idea is that if we isolate LNET traffic to it&apos;s own lane and QOS we can see whether it is causing contention on the fabric.&lt;/p&gt;

&lt;p&gt;I believe this is possible, and some discussion/patches about this a year ago on lustre-discuss, IIRC.  I don&apos;t know what the state of that is today, however.  I&apos;ve CC&apos;d Liang to see if he recalls the details.&lt;/p&gt;</comment>
                            <comment id="20631" author="jamervi" created="Mon, 3 Oct 2011 14:57:34 +0000"  >&lt;p&gt;We are experiencing another interruption due to high load averages on another one of our file systems. The parameters are essentially the same with the exception that this file system is running RAID10 OSTs vs. RAID6. &lt;/p&gt;

&lt;p&gt;The predominant factors are the load is above 150 and there are a couple of ll_ost_io threads and a kjournald thread are eating up a large percentage of the CPU utilization:&lt;/p&gt;

&lt;p&gt;top - 12:40:17 up 31 days,  4:50,  3 users,  load average: 151.00, 150.86, 150.49&lt;br/&gt;
Tasks: 1176 total,   2 running, 1174 sleeping,   0 stopped,   0 zombie&lt;br/&gt;
Cpu(s):  0.0%us, 23.7%sy,  0.0%ni, 75.5%id,  0.6%wa,  0.0%hi,  0.1%si,  0.0%st&lt;br/&gt;
Mem:  11814704k total, 11753832k used,    60872k free,   355376k buffers&lt;br/&gt;
Swap:        0k total,        0k used,        0k free,  8289304k cached&lt;/p&gt;

&lt;p&gt;  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND                                         &lt;br/&gt;
 7747 root      11  -5     0    0    0 D 63.1  0.0 452:00.23 kjournald                                        &lt;br/&gt;
 8004 root      15   0     0    0    0 D 62.8  0.0 352:06.50 ll_ost_io_94                                     &lt;br/&gt;
10329 root      15   0     0    0    0 R 61.5  0.0 397:11.57 ll_ost_io_243                                    &lt;br/&gt;
24954 root      16   0 11664 2096  888 S  0.7  0.0   0:02.97 top                                              &lt;br/&gt;
 8083 root      10  -5     0    0    0 S  0.6  0.0 240:03.48 md2_raid10                                       &lt;br/&gt;
 7569 root      10  -5     0    0    0 S  0.5  0.0 239:52.87 md1_raid10                                    &lt;/p&gt;

&lt;p&gt;I have captured the outputs from sysrq -t and -m as well as the slabinfo and have them attached. I have tried unsuccessfully to kill the ll_ost_io threads. I saw some mention of an OOM killer but it looks like that is only in 2.1. At this point I need to reboot the system to restore services. Hopefully the information provided can give some insight into what is happening.&lt;/p&gt;</comment>
                            <comment id="20648" author="green" created="Mon, 3 Oct 2011 15:43:22 +0000"  >&lt;p&gt;It does look like a high mem pressure on the node.&lt;br/&gt;
In the backtraces you provided I can only see ll_ost_io_243 in the mempressure path, but not the io_94 or journald, so I don&apos;t know what the other two are doing.&lt;/p&gt;

&lt;p&gt;From the slabinfo it looks like you have 400k locks or so too.&lt;br/&gt;
Any chance you can limit your clients to a certain preset number of locks and see what the impact would be?&lt;/p&gt;

&lt;p&gt;Just echo some number like 600 to /proc/fs/lustre/ldlm/namespaces/&lt;b&gt;osc&lt;/b&gt;/lru_size on all clients to achieve that.&lt;/p&gt;

&lt;p&gt;Also it is a bad idea to try to kill ost threads, so please don&apos;t do it (it won&apos;t work anyway too).&lt;/p&gt;</comment>
                            <comment id="20711" author="jamervi" created="Mon, 3 Oct 2011 16:39:38 +0000"  >&lt;p&gt;Thanks, Oleg. I&apos;ll give that a try. Also I know that killing off ost threads is not a good idea but I was hoping that if I was able to I could find the node or job that was creating the problem. &lt;/p&gt;</comment>
                            <comment id="21588" author="mdiep" created="Fri, 21 Oct 2011 00:34:33 +0000"  >&lt;p&gt;Hi Joe,&lt;/p&gt;

&lt;p&gt;Have you tried disable oss read cache?&lt;/p&gt;</comment>
                            <comment id="21968" author="johann" created="Wed, 26 Oct 2011 14:23:48 +0000"  >&lt;p&gt;&amp;gt; Have you tried disable oss read cache?&lt;/p&gt;

&lt;p&gt;I suggested this a while back (05/Sep/11 8:01 AM), however i don&apos;t know whether or not it has been tried.&lt;/p&gt;</comment>
                            <comment id="21975" author="jamervi" created="Wed, 26 Oct 2011 20:34:48 +0000"  >&lt;p&gt;Sorry for the slow response. Basically we had been running for most of the life of this file system with read caching turned off because of earlier 1.8 issues. We re-enabled it for a brief time before we started encountering these problems and part of our trouble shooting efforts were to return the system to the environmental conditions (with the exception of changes due to the OS and other upgrades mentioned above). It didn&apos;t have any impact.&lt;/p&gt;

&lt;p&gt;We now believe that a combination of user jobs/code (most likely single node) and software RAID are creating a deadlock condition that impacts the entire system catastrophically. Because it renders the system unusable we have opted to sideline this particular file system and are using our global scratch file system to handle that need.&lt;/p&gt;

&lt;p&gt;I think for the time being we should do the same with this ticket. We have seen evidence that the same problem might be occurring in another fashion and it may be something for which we can possibly create a duplicator. If we can I will either reopen this case or create a new one depending on how you would like to approach it.&lt;/p&gt;

&lt;p&gt;Thanks. &lt;/p&gt;</comment>
                            <comment id="21976" author="jamervi" created="Wed, 26 Oct 2011 20:38:12 +0000"  >&lt;p&gt;Man - after a long day you don&apos;t notice all your typos until you see it posted. Hopefully you get the gist of my message. If not or if there are any questions just let me know.&lt;/p&gt;</comment>
                            <comment id="22003" author="pjones" created="Thu, 27 Oct 2011 09:44:13 +0000"  >&lt;p&gt;ok Joe. Thanks for the update. We&apos;ll close this ticket for now and either reopen it or open a new ticket if you have some more information in the future.&lt;/p&gt;</comment>
                            <comment id="23096" author="mdiep" created="Wed, 16 Nov 2011 14:37:51 +0000"  >&lt;p&gt;Just FYI, We no longer see the following watchdog when we disable oss read cache&lt;/p&gt;

&lt;p&gt;2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff887dd3f2&amp;gt;&amp;#93;&lt;/span&gt; ldlm_pools_srv_shrink+0x12/0x20 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt; &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff800409ea&amp;gt;&amp;#93;&lt;/span&gt; shrink_slab+0xd3/0x15c &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff800d487f&amp;gt;&amp;#93;&lt;/span&gt; zone_reclaim+0x25f/0x306 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff800d0af1&amp;gt;&amp;#93;&lt;/span&gt; __rmqueue+0x47/0xcb &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8000a96a&amp;gt;&amp;#93;&lt;/span&gt; get_page_from_freelist+0xb6/0x411 &amp;lt;kern.warning&amp;gt;&lt;br/&gt;
2011-09-01 19:47:14 oss-scratch14 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8000f5ad&amp;gt;&amp;#93;&lt;/span&gt; __alloc_pages+0x78/0x30e &amp;lt;kern.warning&amp;gt;&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="10520" name="dmesg" size="123797" author="jamervi" created="Mon, 3 Oct 2011 14:57:21 +0000"/>
                            <attachment id="10408" name="oss-scratch16.1314981155.10959" size="27248" author="jamervi" created="Fri, 2 Sep 2011 14:36:47 +0000"/>
                            <attachment id="10519" name="slabinfo" size="17781" author="jamervi" created="Mon, 3 Oct 2011 14:57:21 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10040" key="com.atlassian.jira.plugin.system.customfieldtypes:labels">
                        <customfieldname>Epic</customfieldname>
                        <customfieldvalues>
                                        <label>hang</label>
            <label>server</label>
            <label>timeout</label>
    
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                    <customfield id="customfield_10094" key="com.pyxis.greenhopper.jira:gh-epic-status">
                        <customfieldname>Epic Status</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10052"><![CDATA[Done]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                    <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzw04v:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>10101</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                </customfields>
    </item>
</channel>
</rss>