<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:28:07 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
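For instance, a request of the form (illustrative only; the exact issue-XML view path depends on the JIRA instance)
https://jira.whamcloud.com/si/jira.issueviews:issue-xml/LU-2779/LU-2779.xml?field=key&field=summary
would return only those two fields for this issue.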
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-2779] LBUG in discard_cb: !(page-&gt;cp_type == CPT_CACHEABLE) || (!PageWriteback(cl_page_vmpage(env, page)))</title>
                <link>https://jira.whamcloud.com/browse/LU-2779</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;We hit this on Sequoia at shutdown time; I don&apos;t see an existing bug open for this crash either:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2013-02-01 14:35:53.812430 {R4-llnl} [bgqadmin]{5}.3.1: 
2013-02-01 14:35:53.812843 {R4-llnl} [bgqadmin]{5}.3.1: Broadcast message from root@seqio262-ib0
2013-02-01 14:35:53.813165 {R4-llnl} [bgqadmin]{5}.3.1: 	(unknown) at 14:35 ...
2013-02-01 14:35:53.813752 {R4-llnl} [bgqadmin]{5}.3.1: The system is going down for halt NOW!
2013-02-01 14:35:53.814093 {R4-llnl} [bgqadmin]{5}.2.3: Stopping Common I/O Services: LustreError: 4653:0:(cl_lock.c:1967:discard_cb()) ASSERTION( (!(page-&amp;gt;cp_type == CPT_CACHEABLE) || (!PageWriteback(cl_page_vmpage(env, page)))) ) failed: 
2013-02-01 14:35:53.814429 {R4-llnl} [bgqadmin]{5}.2.3: LustreError: 4653:0:(cl_lock.c:1967:discard_cb()) LBUG
2013-02-01 14:35:53.814746 {R4-llnl} [bgqadmin]{5}.2.3: Call Trace:
2013-02-01 14:35:53.815076 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e392b40] [c000000000008d1c] .show_stack+0x7c/0x184 (unreliable)
2013-02-01 14:35:53.815397 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e392bf0] [8000000000ab0c88] .libcfs_debug_dumpstack+0xd8/0x150 [libcfs]
2013-02-01 14:35:53.815717 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e392ca0] [8000000000ab1450] .lbug_with_loc+0x50/0xc0 [libcfs]
2013-02-01 14:35:53.816042 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e392d30] [80000000024f15f8] .discard_cb+0x238/0x240 [obdclass]
2013-02-01 14:35:53.816392 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e392dd0] [80000000024ecadc] .cl_page_gang_lookup+0x26c/0x600 [obdclass]
2013-02-01 14:35:53.816732 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e392ef0] [80000000024f11f8] .cl_lock_discard_pages+0x188/0x2c0 [obdclass]
2013-02-01 14:35:53.817047 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e392fa0] [80000000046aa390] .osc_lock_flush+0x290/0x4a0 [osc]
2013-02-01 14:35:53.817363 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e393090] [80000000046aa6dc] .osc_lock_cancel+0x13c/0x2c0 [osc]
2013-02-01 14:35:53.817877 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e393160] [80000000024eda90] .cl_lock_cancel0+0xd0/0x2b0 [obdclass]
2013-02-01 14:35:53.818248 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e393220] [80000000024f09f8] .cl_lock_hold_release+0x258/0x450 [obdclass]
2013-02-01 14:35:53.818565 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e3932f0] [80000000024f36fc] .cl_lock_unhold+0x8c/0x270 [obdclass]
2013-02-01 14:35:53.818901 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e3933c0] [800000000513e5b4] .lov_sublock_release+0x244/0x370 [lov]
2013-02-01 14:35:53.819221 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e393480] [8000000005141f68] .lov_lock_enqueue+0x388/0xb20 [lov]
2013-02-01 14:35:53.819535 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e3935c0] [80000000024f4d88] .cl_enqueue_try+0x1d8/0x510 [obdclass]
2013-02-01 14:35:53.819908 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e3936d0] [80000000024f6d88] .cl_enqueue_locked+0xa8/0x2c0 [obdclass]
2013-02-01 14:35:53.820387 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e393780] [80000000024f72b0] .cl_lock_request+0xe0/0x370 [obdclass]
2013-02-01 14:35:53.820707 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e393850] [800000000695efb4] .cl_glimpse_lock+0x2b4/0x640 [lustre]
2013-02-01 14:35:53.821021 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e393940] [800000000695f538] .cl_glimpse_size0+0x1f8/0x270 [lustre]
2013-02-01 14:35:53.821337 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e393a10] [80000000068f1510] .ll_inode_revalidate_it+0x220/0x2c0 [lustre]
2013-02-01 14:35:53.821652 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e393ad0] [80000000068f15f0] .ll_getattr_it+0x40/0x180 [lustre]
2013-02-01 14:35:53.821966 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e393b70] [80000000068f1774] .ll_getattr+0x44/0x60 [lustre]
2013-02-01 14:35:53.822282 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e393c20] [c0000000000d57d8] .vfs_getattr+0x38/0x60
2013-02-01 14:35:53.822595 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e393c90] [c0000000000d5e4c] .vfs_fstatat+0x78/0xa8
2013-02-01 14:35:53.822909 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e393d30] [c0000000000d5f00] .SyS_newfstatat+0x2c/0x58
2013-02-01 14:35:53.823222 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e393e30] [c000000000000580] syscall_exit+0x0/0x2c
2013-02-01 14:35:53.823534 {R4-llnl} [bgqadmin]{5}.2.3: Kernel panic - not syncing: LBUG
2013-02-01 14:35:53.823844 {R4-llnl} [bgqadmin]{5}.2.3: Call Trace:
2013-02-01 14:35:53.824153 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e392b60] [c000000000008d1c] .show_stack+0x7c/0x184 (unreliable)
2013-02-01 14:35:53.824466 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e392c10] [c000000000431ef4] .panic+0x80/0x1ac
2013-02-01 14:35:53.824776 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e392ca0] [8000000000ab14b0] .lbug_with_loc+0xb0/0xc0 [libcfs]
2013-02-01 14:35:53.825089 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e392d30] [80000000024f15f8] .discard_cb+0x238/0x240 [obdclass]
2013-02-01 14:35:53.825401 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e392dd0] [80000000024ecadc] .cl_page_gang_lookup+0x26c/0x600 [obdclass]
2013-02-01 14:35:53.825721 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e392ef0] [80000000024f11f8] .cl_lock_discard_pages+0x188/0x2c0 [obdclass]
2013-02-01 14:35:53.826045 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e392fa0] [80000000046aa390] .osc_lock_flush+0x290/0x4a0 [osc]
2013-02-01 14:35:53.826358 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e393090] [80000000046aa6dc] .osc_lock_cancel+0x13c/0x2c0 [osc]
2013-02-01 14:35:53.826670 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e393160] [80000000024eda90] .cl_lock_cancel0+0xd0/0x2b0 [obdclass]
2013-02-01 14:35:53.826982 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e393220] [80000000024f09f8] .cl_lock_hold_release+0x258/0x450 [obdclass]
2013-02-01 14:35:53.827295 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e3932f0] [80000000024f36fc] .cl_lock_unhold+0x8c/0x270 [obdclass]
2013-02-01 14:35:53.827608 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e3933c0] [800000000513e5b4] .lov_sublock_release+0x244/0x370 [lov]
2013-02-01 14:35:53.827920 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e393480] [8000000005141f68] .lov_lock_enqueue+0x388/0xb20 [lov]
2013-02-01 14:35:53.828232 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e3935c0] [80000000024f4d88] .cl_enqueue_try+0x1d8/0x510 [obdclass]
2013-02-01 14:35:53.828649 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e3936d0] [80000000024f6d88] .cl_enqueue_locked+0xa8/0x2c0 [obdclass]
2013-02-01 14:35:53.829092 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e393780] [80000000024f72b0] .cl_lock_request+0xe0/0x370 [obdclass]
2013-02-01 14:35:53.829361 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e393850] [800000000695efb4] .cl_glimpse_lock+0x2b4/0x640 [lustre]
2013-02-01 14:35:53.829629 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e393940] [800000000695f538] .cl_glimpse_size0+0x1f8/0x270 [lustre]
2013-02-01 14:35:53.829892 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e393a10] [80000000068f1510] .ll_inode_revalidate_it+0x220/0x2c0 [lustre]
2013-02-01 14:35:53.830155 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e393ad0] [80000000068f15f0] .ll_getattr_it+0x40/0x180 [lustre]
2013-02-01 14:35:53.830420 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e393b70] [80000000068f1774] .ll_getattr+0x44/0x60 [lustre]
2013-02-01 14:35:53.830686 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e393c20] [c0000000000d57d8] .vfs_getattr+0x38/0x60
2013-02-01 14:35:53.830952 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e393c90] [c0000000000d5e4c] .vfs_fstatat+0x78/0xa8
2013-02-01 14:35:53.831217 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e393d30] [c0000000000d5f00] .SyS_newfstatat+0x2c/0x58
2013-02-01 14:35:53.831483 {R4-llnl} [bgqadmin]{5}.2.3: [c00000038e393e30] [c000000000000580] syscall_exit+0x0/0x2c
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="17498">LU-2779</key>
            <summary>LBUG in discard_cb: !(page-&gt;cp_type == CPT_CACHEABLE) || (!PageWriteback(cl_page_vmpage(env, page)))</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="jay">Jinshan Xiong</assignee>
                                    <reporter username="prakash">Prakash Surya</reporter>
                        <labels>
                            <label>sequoia</label>
                            <label>topsequoia</label>
                    </labels>
                <created>Thu, 7 Feb 2013 13:39:40 +0000</created>
                <updated>Fri, 7 Aug 2015 17:25:55 +0000</updated>
                            <resolved>Wed, 25 Sep 2013 21:48:59 +0000</resolved>
                                    <version>Lustre 2.4.0</version>
                                    <fixVersion>Lustre 2.5.0</fixVersion>
                    <fixVersion>Lustre 2.4.2</fixVersion>
                                        <due></due>
                            <votes>2</votes>
                                    <watches>15</watches>
                                                                            <comments>
                            <comment id="52031" author="bfaccini" created="Fri, 8 Feb 2013 08:57:26 +0000"  >&lt;p&gt;This Assert looks familiar to me since I triggered it during &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-874&quot; title=&quot;Client eviction on lock callback timeout &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-874&quot;&gt;&lt;del&gt;LU-874&lt;/del&gt;&lt;/a&gt; tests and attempts to reproduce, running with a Lustre version that was builded for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1680&quot; title=&quot;LBUG cl_lock.c:1949:discard_cb()) (ORI-726)&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1680&quot;&gt;&lt;del&gt;LU-1680&lt;/del&gt;&lt;/a&gt; patches ...&lt;/p&gt;

&lt;p&gt;I will first do some historical research to understand the conditions leading to this problem.&lt;/p&gt;</comment>
                            <comment id="52152" author="prakash" created="Mon, 11 Feb 2013 14:53:32 +0000"  >&lt;p&gt;This might end up being related to, or a duplicate of, &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2683&quot; title=&quot;Client deadlock in cl_lock_mutex_get&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2683&quot;&gt;&lt;del&gt;LU-2683&lt;/del&gt;&lt;/a&gt;. I&apos;m looking at a console log for a node which hit this a few days ago, and it triggered when a reboot was attempted after what &lt;em&gt;appears&lt;/em&gt; to be the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2683&quot; title=&quot;Client deadlock in cl_lock_mutex_get&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2683&quot;&gt;&lt;del&gt;LU-2683&lt;/del&gt;&lt;/a&gt; hang.&lt;/p&gt;

&lt;p&gt;For example, I see these messages:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2013-02-04 10:36:46.046647 {DefaultControlEventListener} [mmcs]{739}.14.1: LustreError: 35299:0:(import.c:329:ptlrpc_invalidate_import()) ls1-OST0132_UUID: rc = -110 waiting for callback (1 != 0)
2013-02-04 10:36:46.047324 {DefaultControlEventListener} [mmcs]{739}.14.1: LustreError: 35299:0:(import.c:329:ptlrpc_invalidate_import()) Skipped 91 previous similar messages
2013-02-04 10:36:46.047897 {DefaultControlEventListener} [mmcs]{739}.14.1: LustreError: 35299:0:(import.c:355:ptlrpc_invalidate_import()) @@@ still on sending list  req@c0000002818f5000 x1425746435123190/t0(0) o101-&amp;gt;ls1-OST0132-osc-c0000003e98eb000@172.20.2.106@o2ib500:28/4 lens 328/368 e 0 to 0 dl 1359967526 ref 1 fl Interpret:RE/0/0 rc -5/301
2013-02-04 10:36:46.048227 {DefaultControlEventListener} [mmcs]{739}.14.1: LustreError: 35299:0:(import.c:355:ptlrpc_invalidate_import()) Skipped 98 previous similar messages
2013-02-04 10:36:46.048870 {DefaultControlEventListener} [mmcs]{739}.14.1: LustreError: 35299:0:(import.c:371:ptlrpc_invalidate_import()) ls1-OST0132_UUID: RPCs in &quot;Unregistering&quot; phase found (0). Network is sluggish? Waiting them to error out.
2013-02-04 10:36:46.049450 {DefaultControlEventListener} [mmcs]{739}.14.1: LustreError: 35299:0:(import.c:371:ptlrpc_invalidate_import()) Skipped 101 previous similar messages
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;There are also many other network/recovery-related messages in the logs for hours before the LBUG was triggered.&lt;/p&gt;</comment>
                            <comment id="52232" author="morrone" created="Tue, 12 Feb 2013 16:14:51 +0000"  >&lt;p&gt;We&apos;ve been hitting this more frequently recently, on login nodes in particular.&lt;/p&gt;</comment>
                            <comment id="52233" author="prakash" created="Tue, 12 Feb 2013 16:31:30 +0000"  >&lt;p&gt;Chris, do we see this in the absence of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2683&quot; title=&quot;Client deadlock in cl_lock_mutex_get&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2683&quot;&gt;&lt;del&gt;LU-2683&lt;/del&gt;&lt;/a&gt; on the LAC nodes? All of the occurrences I&apos;ve seen in the ION logs are at reboot time after &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2683&quot; title=&quot;Client deadlock in cl_lock_mutex_get&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2683&quot;&gt;&lt;del&gt;LU-2683&lt;/del&gt;&lt;/a&gt; likely occurred.&lt;/p&gt;</comment>
                            <comment id="52243" author="prakash" created="Tue, 12 Feb 2013 20:34:35 +0000"  >&lt;p&gt;Jinshan, when you get some time, can you look over this ticket? We&apos;re constantly hitting this in production, as of the past couple days.&lt;/p&gt;</comment>
                            <comment id="52249" author="jay" created="Wed, 13 Feb 2013 00:18:30 +0000"  >&lt;p&gt;I&apos;m looking at this one.&lt;/p&gt;</comment>
                            <comment id="52252" author="jay" created="Wed, 13 Feb 2013 01:55:18 +0000"  >&lt;p&gt;I have known the problem, I will work out a patch tomorrow.&lt;/p&gt;</comment>
                            <comment id="52255" author="bfaccini" created="Wed, 13 Feb 2013 02:40:49 +0000"  >&lt;p&gt;Sorry to be late on this but it took me sometimes to retrieve the infos from the times I hit this bug weeks ago ...&lt;/p&gt;

&lt;p&gt;In fact I have already seen it when I tried to shut down nodes that had hung threads, with the following kind of stack that I was able to dump:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;==============================================================================================
Nov 27 01:09:17 client-15 kernel: reproducer S 0000000000000001 0 8231 1 0x00000080
Nov 27 01:09:17 client-15 kernel: ffff8802e399d538 0000000000000082 ffff8802e399d538 ffffffff814fea8a
Nov 27 01:09:17 client-15 kernel: ffff8802e399d4b8 0000000000000286 000000000000001c 00000000000000ff
Nov 27 01:09:17 client-15 kernel: ffff88032f0e3af8 ffff8802e399dfd8 000000000000fb88 ffff88032f0e3af8
Nov 27 01:09:17 client-15 kernel: Call Trace:
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffff814fea8a&amp;gt;] ? schedule_timeout+0x19a/0x2e0
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffffa0e0d77e&amp;gt;] cfs_waitq_wait+0xe/0x10 [libcfs]
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffffa05f6d27&amp;gt;] osc_extent_wait+0x5f7/0x6c0 [osc]
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffff81060250&amp;gt;] ? default_wake_function+0x0/0x20
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffffa05f714c&amp;gt;] osc_cache_wait_range+0x35c/0x9a0 [osc]
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffffa0f85a3e&amp;gt;] ? cl_object_put+0xe/0x10 [obdclass]
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffffa05f38b8&amp;gt;] ? osc_io_unplug0+0x748/0x12a0 [osc]
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffff8127ce66&amp;gt;] ? vsnprintf+0x2b6/0x5f0
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffffa05fc731&amp;gt;] osc_cache_writeback_range+0xd61/0x1160 [osc]
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffffa0e0c27b&amp;gt;] ? cfs_set_ptldebug_header+0x2b/0xc0 [libcfs]
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffffa0e22d97&amp;gt;] ? cfs_hash_bd_lookup_intent+0x37/0x130 [libcfs]
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffffa0e222d2&amp;gt;] ? cfs_hash_bd_add_locked+0x62/0x90 [libcfs]
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffffa0f8709d&amp;gt;] ? cl_env_nested_get+0x5d/0xc0 [obdclass]
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffffa0e25cbb&amp;gt;] ? cfs_hash_add_unique+0x1b/0x40 [libcfs]
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffffa05e40f6&amp;gt;] osc_lock_flush+0x86/0x200 [osc]
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffffa05e4357&amp;gt;] osc_lock_cancel+0xe7/0x1c0 [osc]
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffffa0f8d0a5&amp;gt;] cl_lock_cancel0+0x75/0x160 [obdclass]
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffffa0f8ece6&amp;gt;] cl_lock_hold_release+0x1c6/0x2b0 [obdclass]
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffffa0f90837&amp;gt;] cl_lock_unhold+0x37/0x130 [obdclass]
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffffa067e5f8&amp;gt;] lov_sublock_release+0x1a8/0x2a0 [lov]
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffffa06811fb&amp;gt;] lov_lock_enqueue+0x2cb/0x830 [lov]
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffffa0f9149c&amp;gt;] cl_enqueue_try+0xfc/0x310 [obdclass]
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffffa0f9295d&amp;gt;] cl_enqueue_locked+0x6d/0x210 [obdclass]
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffffa0f9363e&amp;gt;] cl_lock_request+0x7e/0x280 [obdclass]
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffffa0be3deb&amp;gt;] cl_glimpse_lock+0x17b/0x4a0 [lustre]
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffffa0be4677&amp;gt;] cl_glimpse_size0+0x187/0x190 [lustre]
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffffa0ba0ee2&amp;gt;] ll_inode_revalidate_it+0xf2/0x1c0 [lustre]
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffffa0e22134&amp;gt;] ? cfs_hash_dual_bd_unlock+0x34/0x60 [libcfs]
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffffa0ba0ff9&amp;gt;] ll_getattr_it+0x49/0x170 [lustre]
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffffa0ba1157&amp;gt;] ll_getattr+0x37/0x40 [lustre]
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffff812143d3&amp;gt;] ? security_inode_getattr+0x23/0x30
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffff81180571&amp;gt;] vfs_getattr+0x51/0x80
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffff8118082f&amp;gt;] vfs_fstat+0x3f/0x60
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffff81180874&amp;gt;] sys_newfstat+0x24/0x40
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffff810d6b12&amp;gt;] ? audit_syscall_entry+0x272/0x2a0
Nov 27 01:09:17 client-15 kernel: [&amp;lt;ffffffff8100b0f2&amp;gt;] system_call_fastpath+0x16/0x1b
==============================================================================================&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;before the node crashed during shutdown, within the same task context and with the following stack, which is the same one you got:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;LustreError: 8231:0:(cl_lock.c:1960:discard_cb()) ASSERTION( (!(page-&amp;gt;cp_type == CPT_CACHEABLE) || (!PageWriteback(cl_page_vmpage(env, page)))) ) failed: 
LustreError: 8231:0:(cl_lock.c:1960:discard_cb()) LBUG
[ OK Pid: 8231, comm: reproducer
Call Trace:
[&amp;lt;ffffffffa0e0c905&amp;gt;] libcfs_debug_dumpstack+0x55/0x80 [libcfs]
[&amp;lt;ffffffffa0e0cf17&amp;gt;] lbug_with_loc+0x47/0xb0 [libcfs]
[&amp;lt;ffffffffa0f8f66c&amp;gt;] discard_cb+0x1ec/0x1f0 [obdclass]
[&amp;lt;ffffffffa0f8c864&amp;gt;] cl_page_gang_lookup+0x1f4/0x400 [obdclass]
[&amp;lt;ffffffffa0f8f480&amp;gt;] ? discard_cb+0x0/0x1f0 [obdclass]
[&amp;lt;ffffffffa0f8f480&amp;gt;] ? discard_cb+0x0/0x1f0 [obdclass]
[&amp;lt;ffffffffa0f8f33e&amp;gt;] cl_lock_discard_pages+0x11e/0x1f0 [obdclass]
[&amp;lt;ffffffffa05e4180&amp;gt;] osc_lock_flush+0x110/0x200 [osc]
[&amp;lt;ffffffffa05e4357&amp;gt;] osc_lock_cancel+0xe7/0x1c0 [osc]
[&amp;lt;ffffffffa0f8d0a5&amp;gt;] cl_lock_cancel0+0x75/0x160 [obdclass]
[&amp;lt;ffffffffa0f8ece6&amp;gt;] cl_lock_hold_release+0x1c6/0x2b0 [obdclass]
[&amp;lt;ffffffffa0f90837&amp;gt;] cl_lock_unhold+0x37/0x130 [obdclass]
[&amp;lt;ffffffffa067e5f8&amp;gt;] lov_sublock_release+0x1a8/0x2a0 [lov]
[&amp;lt;ffffffffa06811fb&amp;gt;] lov_lock_enqueue+0x2cb/0x830 [lov]
[&amp;lt;ffffffffa0f9149c&amp;gt;] cl_enqueue_try+0xfc/0x310 [obdclass]
[&amp;lt;ffffffffa0f9295d&amp;gt;] cl_enqueue_locked+0x6d/0x210 [obdclass]
[&amp;lt;ffffffffa0f9363e&amp;gt;] cl_lock_request+0x7e/0x280 [obdclass]
[&amp;lt;ffffffffa0be3deb&amp;gt;] cl_glimpse_lock+0x17b/0x4a0 [lustre]
[&amp;lt;ffffffffa0be4677&amp;gt;] cl_glimpse_size0+0x187/0x190 [lustre]
[&amp;lt;ffffffffa0ba0ee2&amp;gt;] ll_inode_revalidate_it+0xf2/0x1c0 [lustre]
[&amp;lt;ffffffffa0e22134&amp;gt;] ? cfs_hash_dual_bd_unlock+0x34/0x60 [libcfs]
[&amp;lt;ffffffffa0ba0ff9&amp;gt;] ll_getattr_it+0x49/0x170 [lustre]
[&amp;lt;ffffffffa0ba1157&amp;gt;] ll_getattr+0x37/0x40 [lustre]
[&amp;lt;ffffffff812143d3&amp;gt;] ? security_inode_getattr+0x23/0x30
[&amp;lt;ffffffff81180571&amp;gt;] vfs_getattr+0x51/0x80
[&amp;lt;ffffffff8118082f&amp;gt;] vfs_fstat+0x3f/0x60
[&amp;lt;ffffffff81180874&amp;gt;] sys_newfstat+0x24/0x40
[&amp;lt;ffffffff810d6b12&amp;gt;] ? audit_syscall_entry+0x272/0x2a0
[&amp;lt;ffffffff8100b0f2&amp;gt;] system_call_fastpath+0x16/0x1b
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;and I confirm that there were also recent messages, like yours, complaining about &quot;Unregistering&quot; RPCs and a possibly sluggish network.&lt;/p&gt;

&lt;p&gt;Can you confirm whether you have the same kind of stacks for hung threads in the crash-dumps from previous occurrences?&lt;/p&gt;

&lt;p&gt;If so, Jinshan, could it be possible that we are again handling pages that should already have been discarded but are still stuck (orphaned?) awaiting write-back?&lt;/p&gt;

&lt;p&gt;BTW, I got this LBUG when trying to reproduce the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-874&quot; title=&quot;Client eviction on lock callback timeout &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-874&quot;&gt;&lt;del&gt;LU-874&lt;/del&gt;&lt;/a&gt; problem with its known reproducer, and &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-874&quot; title=&quot;Client eviction on lock callback timeout &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-874&quot;&gt;&lt;del&gt;LU-874&lt;/del&gt;&lt;/a&gt; has also been linked to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2683&quot; title=&quot;Client deadlock in cl_lock_mutex_get&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2683&quot;&gt;&lt;del&gt;LU-2683&lt;/del&gt;&lt;/a&gt;. Since then I have also installed &lt;a href=&quot;http://review.whamcloud.com/5208&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5208&lt;/a&gt; from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2683&quot; title=&quot;Client deadlock in cl_lock_mutex_get&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2683&quot;&gt;&lt;del&gt;LU-2683&lt;/del&gt;&lt;/a&gt; and I have not been able to reproduce it anymore.&lt;/p&gt;

&lt;p&gt;Is it possible for you to run the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-874&quot; title=&quot;Client eviction on lock callback timeout &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-874&quot;&gt;&lt;del&gt;LU-874&lt;/del&gt;&lt;/a&gt; reproducer on your system where the failure occurs? I can help refresh the details about it if required, but the code is very simple, and so is the way to run it, as described in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-874&quot; title=&quot;Client eviction on lock callback timeout &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-874&quot;&gt;&lt;del&gt;LU-874&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;It would also help to have the &quot;dlmtrace&quot;, &quot;cache&quot; and &quot;page&quot; debug flags set in addition on client nodes likely to reproduce the problem.&lt;/p&gt;

</comment>
                            <comment id="52264" author="bfaccini" created="Wed, 13 Feb 2013 07:38:42 +0000"  >&lt;p&gt;Jinshan, just some thoughts after re-reading source code involved and talking about it with Johann, don&apos;t you think it could be much safer to call cl_page_own() BEFORE any ASSERTs in discard_cb() ??&lt;/p&gt;

&lt;p&gt;It seems to me that this would avoid situations where we exited with an error (timeout?) from osc_cache_writeback_range(), whose return value is not tested in osc_lock_flush(); osc_lock_flush() then calls cl_lock_discard_pages()/cl_page_gang_lookup()/discard_cb(), re-parses the same pages, and thus finds some that trigger the assert.&lt;/p&gt;
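
&lt;p&gt;To make the suggested ordering concrete, here is a minimal, illustrative sketch only (the callback signature and return value follow the cl_page_gang_lookup() usage shown in the traces above; the real discard_cb() in cl_lock.c contains additional logic and may differ in detail):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;static int discard_cb(const struct lu_env *env, struct cl_io *io,
                      struct cl_page *page, void *cbdata)
{
        /* Take ownership of the page before asserting anything about it,
         * so a page still under writeback is handled by cl_page_own()
         * instead of tripping the assertion below. */
        if (cl_page_own(env, io, page) == 0) {
                /* only assert once we own the page */
                LASSERT(!(page-&amp;gt;cp_type == CPT_CACHEABLE) ||
                        !PageWriteback(cl_page_vmpage(env, page)));
                /* ... discard/delete the page as before ... */
        }
        return CLP_GANG_OKAY; /* let cl_page_gang_lookup() continue */
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;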
</comment>
                            <comment id="52287" author="jay" created="Wed, 13 Feb 2013 12:52:51 +0000"  >&lt;p&gt;Hi Bruno, you&apos;re right about the code. Obviously the process waiting for the IO to finish received a signal from init because the system is shutting down so yes, this problem can be fixed by taking away this LASSERT - this is what I planned to do last right. However, the process will still have to wait page writeback to finish at cl_page_own(), and it will hang forever if the OST is already shutdown at the moment. Therefore you have to hit the power button to shut it down.&lt;/p&gt;

&lt;p&gt;This led me to think about how to shut down a client node gracefully. Obviously we should notify the ptlrpcd threads of this event, then abort all of the write RPCs with an error; then the waiting processes can be notified and the mount point can be cleaned up. This is out of the scope of this issue.&lt;/p&gt;

&lt;p&gt;BTW, I pushed a new patch for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2683&quot; title=&quot;Client deadlock in cl_lock_mutex_get&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2683&quot;&gt;&lt;del&gt;LU-2683&lt;/del&gt;&lt;/a&gt;/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-874&quot; title=&quot;Client eviction on lock callback timeout &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-874&quot;&gt;&lt;del&gt;LU-874&lt;/del&gt;&lt;/a&gt;, will you please verify it again? Thanks.&lt;/p&gt;</comment>
                            <comment id="52297" author="prakash" created="Wed, 13 Feb 2013 13:59:28 +0000"  >&lt;p&gt;I also want to note, that we hit this on the login nodes &lt;b&gt;without&lt;/b&gt; a manual reboot or restart. There are traces of network issues (perhaps &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2683&quot; title=&quot;Client deadlock in cl_lock_mutex_get&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2683&quot;&gt;&lt;del&gt;LU-2683&lt;/del&gt;&lt;/a&gt;, the logs are consistent) and evictions, followed by the ASSERT triggering on its own. Without an in depth knowledge of the client code, I&apos;m inclined to think that the client was trying to clean up after the evictions, and happened try and discard a page which still had its writeback bit set.&lt;/p&gt;

&lt;p&gt;Bruno, I can probably get some time to run your reproducer, but I&apos;m not sure it will be enlightening in any way. Jinshan, I don&apos;t see an updated patch set on &lt;a href=&quot;http://review.whamcloud.com/5208&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5208&lt;/a&gt; yet (still patch set 2); have you pushed the new patch somewhere else?&lt;/p&gt;</comment>
                            <comment id="52300" author="jay" created="Wed, 13 Feb 2013 14:19:09 +0000"  >&lt;p&gt;The only path I can think of right now is that osc_extent_wait() is interrupted so that it can&apos;t wait for the IO to finish. Can you try to reproduce it with DLMTRACE and CACHE enabled on the client side?&lt;/p&gt;</comment>
                            <comment id="52304" author="prakash" created="Wed, 13 Feb 2013 14:45:56 +0000"  >&lt;p&gt;I can try. Although, it&apos;s not dumping a lustre debug log when it ASSERTS, so I need to remind myself if it is supposed to or not. Also, it&apos;s hitting it without any messages to the console in nearly two hours prior to the crash. So maybe this isn&apos;t as closely related to evictions as I previously thought.&lt;/p&gt;</comment>
                            <comment id="52305" author="prakash" created="Wed, 13 Feb 2013 14:47:53 +0000"  >&lt;p&gt;I do see a couple messages from osc_extent_wait nearly two hours before one specific crash:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2013-02-13 01:45:43 LustreError: 47189:0:(osc_cache.c:896:osc_extent_wait()) extent c000000c3d8c2990@{[0 -&amp;gt; 0/15], [3|0|+|rpc|wihY|c000000c08582450], [65536|1|+|-|c000000bf0414730|16|c000000f53444090]} lsa-OST0101-osc-c000000f544e2180: wait ext to 0 timedout, recovery in progress?
2013-02-13 01:45:48 LustreError: 3843:0:(osc_cache.c:896:osc_extent_wait()) extent c000000c3d8c6158@{[0 -&amp;gt; 0/15], [3|0|+|rpc|wihY|c000000c08581a30], [65536|1|+|-|c000000bf041fe10|16|c000000f37c8b4e0]} lsa-OST00eb-osc-c000000f544e2180: wait ext to 0 timedout, recovery in progress?
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Not sure if that&apos;s related though, as I haven&apos;t looked at the code to decipher its meaning yet.&lt;/p&gt;</comment>
                            <comment id="52307" author="jay" created="Wed, 13 Feb 2013 15:06:07 +0000"  >&lt;p&gt;please apply this patch: &lt;a href=&quot;http://review.whamcloud.com/5419&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5419&lt;/a&gt; and monitor console while you&apos;re doing test, thanks.&lt;/p&gt;</comment>
                            <comment id="52315" author="prakash" created="Wed, 13 Feb 2013 16:46:50 +0000"  >&lt;p&gt;OK. I&apos;ll pull that into a tag and see if I can get it installed soon. It will be going onto one of our production machines, so I just want to check that it&apos;s safe to remove that assertion, right?&lt;/p&gt;</comment>
                            <comment id="52318" author="jay" created="Wed, 13 Feb 2013 17:24:58 +0000"  >&lt;p&gt;won&apos;t be worse.&lt;/p&gt;</comment>
                            <comment id="52373" author="bfaccini" created="Thu, 14 Feb 2013 10:31:31 +0000"  >&lt;p&gt;Prakash, right, don&apos;t waste time running &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-874&quot; title=&quot;Client eviction on lock callback timeout &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-874&quot;&gt;&lt;del&gt;LU-874&lt;/del&gt;&lt;/a&gt; reproducer, re-reading all our last comments and source code I am sure now that it was only one more occurrence of the same scenario we track here and you able to trigger easily with your own work-load.&lt;br/&gt;
So we only need now to wait for the debug traces and Jinshan&apos;s added ones, if you applied his patch from &lt;a href=&quot;http://review.whamcloud.com/5419&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5419&lt;/a&gt;, to learn why we returned from osc_extent_wait().&lt;br/&gt;
But then, could this be simply and again because of a signal (SIGINT/SIGHUP from interactive session, Slurm for batch, ...) ??&lt;/p&gt;

&lt;p&gt;Jinshan, &lt;a href=&quot;http://review.whamcloud.com/5208&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5208&lt;/a&gt; patch #2 testing is running now; I will update you on how it goes.&lt;/p&gt;</comment>
                            <comment id="52718" author="prakash" created="Tue, 19 Feb 2013 18:07:30 +0000"  >&lt;p&gt;Jinshan, I&apos;ve attached &quot;console.rzuseqlac2.bz2&quot; which contains console output from a node running with &lt;a href=&quot;http://review.whamcloud.com/5419&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;5419&lt;/a&gt;. There are many messages in the logs that are similar to what has been previously reported.&lt;/p&gt;</comment>
                            <comment id="52967" author="bfaccini" created="Mon, 25 Feb 2013 11:08:01 +0000"  >&lt;p&gt;Jinshan, in the last trace from Prakash, I find only ETIMEDOUT/-110 msgs from osc_extent_wait() !! &lt;/p&gt;

&lt;p&gt;Prakash, it would also have been interesting to have a crash-dump/Alt-SysRq-T output to see whether there were hung tasks in cl_page_own() (or better, in the 2nd call to l_wait_event() in osc_extent_wait()), which can already be highly suspected since you have not been able to shut down gracefully.&lt;/p&gt;

&lt;p&gt;But anyway, clearly during error or time-out handling we need to avoid the LBUG and at least wait or retry forever, or decide what to do with these orphaned pages.&lt;/p&gt;</comment>
                            <comment id="53003" author="jay" created="Mon, 25 Feb 2013 23:23:42 +0000"  >&lt;p&gt;After reading the backtrace, I realize this is the same problem with &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2683&quot; title=&quot;Client deadlock in cl_lock_mutex_get&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2683&quot;&gt;&lt;del&gt;LU-2683&lt;/del&gt;&lt;/a&gt;. The IO has already been finished but the reply can&apos;t be handled because ptlrpcd thread was blocked at cl_mutex_lock_get(). I&apos;m going to find a solution for both problem.&lt;/p&gt;

&lt;p&gt;Just in case, did you apply both patches 5419 and 5208 in your test?&lt;/p&gt;

&lt;p&gt;Sorry for the delayed response, BTW.&lt;/p&gt;</comment>
                            <comment id="53004" author="bfaccini" created="Mon, 25 Feb 2013 23:37:23 +0000"  >&lt;p&gt;Jinshan, ptlrpcd being blocked can explain/cause the time-out, is that what you mean? But anyway in all cases don&apos;t you think we need to wait/retry unless to trigger the LBUG/Assert ??&lt;/p&gt;

&lt;p&gt;Also, your latest patch for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2683&quot; title=&quot;Client deadlock in cl_lock_mutex_get&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2683&quot;&gt;&lt;del&gt;LU-2683&lt;/del&gt;&lt;/a&gt; was successful, wasn&apos;t it?&lt;/p&gt;</comment>
                            <comment id="53024" author="jay" created="Tue, 26 Feb 2013 04:07:06 +0000"  >&lt;p&gt;Yes, it is. Anyway I&apos;ve understood the root cause of this problem, including why it hit assertion. Patch for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2683&quot; title=&quot;Client deadlock in cl_lock_mutex_get&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2683&quot;&gt;&lt;del&gt;LU-2683&lt;/del&gt;&lt;/a&gt; caused another problem which I will take a look at.&lt;/p&gt;

&lt;p&gt;I&apos;m going to work out a new patch to address the LASSERT problem.&lt;/p&gt;</comment>
                            <comment id="53044" author="prakash" created="Tue, 26 Feb 2013 12:14:55 +0000"  >&lt;p&gt;Jinshan, the attached console was from a machine running the 2.3.58-13chaos tag which &lt;b&gt;does&lt;/b&gt; have both 5419 and 5208 applied.&lt;/p&gt;</comment>
                            <comment id="53093" author="jay" created="Wed, 27 Feb 2013 03:37:56 +0000"  >&lt;p&gt;I saw some suspicious backtrace from console message. It&apos;ll be really useful if you happened to collect backtrace of all processes in the system? Also did you see some suspicious messages on the correctponding OSS?&lt;/p&gt;

&lt;p&gt;The OSS ran into a problem:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2013-02-16 01:37:23 LustreError: 11-0: lsa-OST0048-osc-c000000f546f0600: Communicating with 172.16.69.25@tcp1, operation obd_ping failed with -107
2013-02-16 01:37:23 Lustre: lsa-OST0048-osc-c000000f546f0600: Connection to lsa-OST0048 (at 172.16.69.25@tcp1) was lost; in progress operations using this service will wait for recovery to complete
2013-02-16 01:37:23 LustreError: 167-0: lsa-OST0048-osc-c000000f546f0600: This client was evicted by lsa-OST0048; in progress operations using this service will fail.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;And the backtrace from the console:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2013-02-16 01:39:03 INFO: task ldlm_bl_13:28757 blocked for more than 120 seconds.
2013-02-16 01:39:03 &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
2013-02-16 01:39:03 ldlm_bl_13    D 0000000000000000     0 28757      2 0x00008000
2013-02-16 01:39:03 Call Trace:
2013-02-16 01:39:03 [c000000f3d813550] [c000000f3d813600] 0xc000000f3d813600 (unreliable)
2013-02-16 01:39:03 [c000000f3d813720] [c0000000000142d8] .__switch_to+0xf8/0x1d0
2013-02-16 01:39:03 [c000000f3d8137b0] [c0000000005bb0a8] .schedule+0x3f8/0xd30
2013-02-16 01:39:03 [c000000f3d813ab0] [c0000000005bcb4c] .__mutex_lock_slowpath+0x1bc/0x2d0
2013-02-16 01:39:03 [c000000f3d813b90] [c0000000005bd42c] .mutex_lock+0x5c/0x60
2013-02-16 01:39:03 [c000000f3d813c10] [d00000000f292398] .cl_lock_mutex_get+0xc8/0x110 [obdclass]
2013-02-16 01:39:03 [c000000f3d813ca0] [d0000000111ecf98] .osc_ldlm_blocking_ast+0x98/0x4e0 [osc]
2013-02-16 01:39:03 [c000000f3d813d80] [d00000001095b440] .ldlm_handle_bl_callback+0x1b0/0x7d0 [ptlrpc]
2013-02-16 01:39:03 [c000000f3d813e40] [d00000001095bdd0] .ldlm_bl_thread_main+0x370/0x610 [ptlrpc]
2013-02-16 01:39:03 [c000000f3d813f90] [c000000000032fd4] .kernel_thread+0x54/0x70
2013-02-16 01:39:03 INFO: task ldlm_bl_14:28758 blocked for more than 120 seconds.
2013-02-16 01:39:03 &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
2013-02-16 01:39:03 ldlm_bl_14    D 0000000000000000     0 28758      2 0x00008000
2013-02-16 01:39:03 Call Trace:
2013-02-16 01:39:03 [c000000dadbfb550] [c000000dadbfb600] 0xc000000dadbfb600 (unreliable)
2013-02-16 01:39:03 [c000000dadbfb720] [c0000000000142d8] .__switch_to+0xf8/0x1d0
2013-02-16 01:39:03 [c000000dadbfb7b0] [c0000000005bb0a8] .schedule+0x3f8/0xd30
2013-02-16 01:39:03 [c000000dadbfbab0] [c0000000005bcb4c] .__mutex_lock_slowpath+0x1bc/0x2d0
2013-02-16 01:39:03 [c000000dadbfbb90] [c0000000005bd42c] .mutex_lock+0x5c/0x60
2013-02-16 01:39:03 [c000000dadbfbc10] [d00000000f292398] .cl_lock_mutex_get+0xc8/0x110 [obdclass]
2013-02-16 01:39:03 [c000000dadbfbca0] [d0000000111ecf98] .osc_ldlm_blocking_ast+0x98/0x4e0 [osc]
2013-02-16 01:39:03 [c000000dadbfbd80] [d00000001095b440] .ldlm_handle_bl_callback+0x1b0/0x7d0 [ptlrpc]
2013-02-16 01:39:03 [c000000dadbfbe40] [d00000001095bdd0] .ldlm_bl_thread_main+0x370/0x610 [ptlrpc]
2013-02-16 01:39:03 [c000000dadbfbf90] [c000000000032fd4] .kernel_thread+0x54/0x70
2013-02-16 01:39:03 INFO: task atsmercury_back:21298 blocked for more than 120 seconds.
2013-02-16 01:39:03 &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
2013-02-16 01:39:03 atsmercury_ba D 00000080538c8c60     0 21298  21297 0x00008000
2013-02-16 01:39:03 Call Trace:
2013-02-16 01:39:03 [c0000005af6b2ed0] [c0000005af6b2f80] 0xc0000005af6b2f80 (unreliable)
2013-02-16 01:39:03 [c0000005af6b30a0] [c0000000000142d8] .__switch_to+0xf8/0x1d0
2013-02-16 01:39:03 [c0000005af6b3130] [c0000000005bb0a8] .schedule+0x3f8/0xd30
2013-02-16 01:39:03 [c0000005af6b3430] [c0000000005bcb4c] .__mutex_lock_slowpath+0x1bc/0x2d0
2013-02-16 01:39:03 [c0000005af6b3510] [c0000000005bd42c] .mutex_lock+0x5c/0x60
2013-02-16 01:39:03 [c0000005af6b3590] [d00000000f292398] .cl_lock_mutex_get+0xc8/0x110 [obdclass]
2013-02-16 01:39:03 [c0000005af6b3620] [d00000000f295c7c] .cl_lock_hold_mutex+0x11c/0x960 [obdclass]
2013-02-16 01:39:03 [c0000005af6b3750] [d00000000f297d60] .cl_lock_request+0xc0/0x370 [obdclass]
2013-02-16 01:39:03 [c0000005af6b3820] [d00000001369fb94] .cl_glimpse_lock+0x2b4/0x640 [lustre]
2013-02-16 01:39:03 [c0000005af6b3910] [d0000000136a0118] .cl_glimpse_size0+0x1f8/0x270 [lustre]
2013-02-16 01:39:03 [c0000005af6b39e0] [d0000000136315f0] .ll_inode_revalidate_it+0x220/0x2c0 [lustre]
2013-02-16 01:39:03 [c0000005af6b3aa0] [d0000000136316d0] .ll_getattr_it+0x40/0x180 [lustre]
2013-02-16 01:39:03 [c0000005af6b3b40] [d000000013631854] .ll_getattr+0x44/0x60 [lustre]
2013-02-16 01:39:03 [c0000005af6b3bf0] [c0000000001c8fe4] .vfs_getattr+0x74/0xf0
2013-02-16 01:39:03 [c0000005af6b3c90] [c0000000001c90e0] .vfs_fstatat+0x80/0xb0
2013-02-16 01:39:03 [c0000005af6b3d30] [c0000000001c9274] .SyS_newlstat+0x24/0x50
2013-02-16 01:39:03 [c0000005af6b3e30] [c000000000008564] syscall_exit+0x0/0x40
2013-02-16 01:39:03 LustreError: 21507:0:(import.c:329:ptlrpc_invalidate_import()) lsa-OST0048_UUID: rc = -110 waiting for callback (1 != 0)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="53104" author="jay" created="Wed, 27 Feb 2013 04:53:00 +0000"  >&lt;p&gt;From the stacktrace above, the blocking_ast was blocked at acquiring cl_lock mutex, so the reply of write RPC shouldn&apos;t be blocked by it because different ptlrpc portal should be used. The status on the OST is also very important.&lt;/p&gt;</comment>
                            <comment id="53124" author="prakash" created="Wed, 27 Feb 2013 12:36:58 +0000"  >&lt;p&gt;Here&apos;s the only messages on the OSS console I see regarding this client on 2013-02-16:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2013-02-16 01:36:56 LustreError: 0:0:(ldlm_lockd.c:358:waiting_locks_callback()) ### lock callback timer expired after 84460s: evicting client at 172.21.1.62@o2ib200  ns: filter-lsa-OST0048_UUID lock: ffff880078fbf800/0x63cba3a8ff7a6537 lrc: 3/0,0 mode: PW/PW res: 19309611/0 rrc: 2 type: EXT [0-&amp;gt;18446744073709551615] (req 0-&amp;gt;65535) flags: 0x10020 remote: 0x4a6511c74b338406 expref: 6 pid: 3504 timeout 19649874112
2013-02-16 01:41:10 Lustre: lsa-OST0048: haven&apos;t heard from client a15eb878-e92e-c8f0-0c41-9520d958df72 (at 172.21.1.62@o2ib200) in 227 seconds. I think it&apos;s dead, and I am evicting it. exp ffff8801102ff000, cur 1361007670 expire 1361007520 last 1361007443
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="53239" author="jay" created="Sun, 3 Mar 2013 12:14:59 +0000"  >&lt;p&gt;Hi Prakash,&lt;/p&gt;

&lt;p&gt;Bruno will try to reproduce this issue locally. Can you please reproduce this issue and collect backtraces of all processes? Also, please apply patch 2638 and &lt;b&gt;patch set 2&lt;/b&gt; of 5419 when doing the test; thanks in advance.&lt;/p&gt;</comment>
                            <comment id="53444" author="bfaccini" created="Wed, 6 Mar 2013 10:48:39 +0000"  >&lt;p&gt;Jinshan, Prakash,&lt;/p&gt;

&lt;p&gt;Since the end of last week, I have done intensive testing running with the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-874&quot; title=&quot;Client eviction on lock callback timeout &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-874&quot;&gt;&lt;del&gt;LU-874&lt;/del&gt;&lt;/a&gt;/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2683&quot; title=&quot;Client deadlock in cl_lock_mutex_get&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2683&quot;&gt;&lt;del&gt;LU-2683&lt;/del&gt;&lt;/a&gt; patch (aka change 5208), and I have not been able to reproduce this problem anymore, even when simulating/injecting some networking issues to cause the thread hangs where bad things start ... Maybe you have a better idea of how to get into the same situation?&lt;/p&gt;

&lt;p&gt;Prakash, maybe you can help too if you can characterize the behavior of the workload that was running at the time of the crashes?&lt;/p&gt;
</comment>
                            <comment id="57111" author="adrian" created="Fri, 26 Apr 2013 11:09:38 +0000"  >&lt;p&gt;We recently started to hit this issue (3 crashes so far) while running the git-v2_3_61 client.&lt;/p&gt;

&lt;p&gt;I could upload the dumped `vmcore&apos; from (at least) two crashes if someone would like to have a look at them.&lt;/p&gt;</comment>
                            <comment id="57119" author="pjones" created="Fri, 26 Apr 2013 13:09:14 +0000"  >&lt;p&gt;Yes please Adrian!&lt;/p&gt;</comment>
                            <comment id="57122" author="adrian" created="Fri, 26 Apr 2013 13:23:03 +0000"  >&lt;p&gt;Does whamcloud have a private/write-only FTP server to upload the cores? (~16GB) -&amp;gt; The crash data is from a login-node and will therefore include private user data.&lt;/p&gt;

&lt;p&gt;If not, just give me a PGP key ID and I&apos;ll encrypt and upload them to our own webserver.&lt;/p&gt;</comment>
                            <comment id="57123" author="pjones" created="Fri, 26 Apr 2013 13:27:24 +0000"  >&lt;p&gt;yes we have a private ftp area. I will mail you the details directly&lt;/p&gt;</comment>
                            <comment id="57124" author="adrian" created="Fri, 26 Apr 2013 14:24:14 +0000"  >&lt;p&gt;Thanks: I started to upload the first core to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2779&quot; title=&quot;LBUG in discard_cb: !(page-&amp;gt;cp_type == CPT_CACHEABLE) || (!PageWriteback(cl_page_vmpage(env, page)))&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2779&quot;&gt;&lt;del&gt;LU-2779&lt;/del&gt;&lt;/a&gt;/127.0.0.1-2013-04-17-10:22:30/&lt;/p&gt;

&lt;p&gt;I&apos;ll upload the 2nd vmcore file on Monday (the first one is still uploading).&lt;/p&gt;</comment>
                            <comment id="57125" author="pjones" created="Fri, 26 Apr 2013 14:27:40 +0000"  >&lt;p&gt;Thanks Adrian!&lt;/p&gt;</comment>
                            <comment id="57481" author="jay" created="Wed, 1 May 2013 23:53:46 +0000"  >&lt;p&gt;Hi Adrian, can you please try patch:&lt;/p&gt;

&lt;p&gt;commit 2448de6c51ceccea6a308d73d7960f236e0c0847&lt;br/&gt;
Author: Jinshan Xiong &amp;lt;jinshan.xiong@intel.com&amp;gt;&lt;br/&gt;
Date:   Tue Jan 29 16:35:49 2013 -0800&lt;/p&gt;

&lt;p&gt;    &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2683&quot; title=&quot;Client deadlock in cl_lock_mutex_get&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2683&quot;&gt;&lt;del&gt;LU-2683&lt;/del&gt;&lt;/a&gt; lov: release all locks in closure to release sublock&lt;/p&gt;

&lt;p&gt;and see if it helps.&lt;/p&gt;</comment>
                            <comment id="57510" author="adrian" created="Thu, 2 May 2013 13:19:03 +0000"  >&lt;p&gt;We already have this patch: Our RPM was compiled using the v2_3_63_0 tag (which includes commit 2448de6c51ceccea6a308d73d7960f236e0c0847)&lt;/p&gt;

&lt;p&gt;Btw: We just had yet another crash, so I could provide yet another 8GB vmcore ;-)&lt;/p&gt;
</comment>
                            <comment id="57529" author="jay" created="Thu, 2 May 2013 15:48:03 +0000"  >&lt;p&gt;Can you please upload the vmcore along with the kernel modules on the client side? and what&apos;s the kernel version #? Just in case, the tip of your git tree is f3ef9ea9, right?&lt;/p&gt;</comment>
                            <comment id="57534" author="adrian" created="Thu, 2 May 2013 16:02:04 +0000"  >&lt;p&gt;I&apos;ll upload the latest crash + the lustre *.ko-modules to &quot;uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2779&quot; title=&quot;LBUG in discard_cb: !(page-&amp;gt;cp_type == CPT_CACHEABLE) || (!PageWriteback(cl_page_vmpage(env, page)))&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2779&quot;&gt;&lt;del&gt;LU-2779&lt;/del&gt;&lt;/a&gt;-2&quot; in a few minutes. (The upload will probably take a few hours to complete :/)&lt;/p&gt;

&lt;p&gt;We are running 2.6.32-358.0.1.el6.x86_64&lt;/p&gt;

&lt;p&gt;&lt;del&gt;our git-head is slightly newer:&lt;/del&gt;&lt;/p&gt;

&lt;p&gt;&lt;del&gt;$ git rev-parse HEAD&lt;/del&gt;&lt;br/&gt;
&lt;del&gt;b859a51e5fa580797dd833bb8e5ec7d6e41411af&lt;/del&gt;&lt;/p&gt;

&lt;p&gt;Uhm: Just noticed that the crashed node was still running 2.3.62 :)&lt;br/&gt;
So the HEAD used was 87ee788bd137d0d82ca107a7615f18f420a3699a; sorry for mixing this up.&lt;/p&gt;</comment>
                            <comment id="57538" author="jay" created="Thu, 2 May 2013 16:18:56 +0000"  >&lt;p&gt;Thank you. I will take a look at it.&lt;/p&gt;</comment>
                            <comment id="57674" author="jay" created="Fri, 3 May 2013 21:05:50 +0000"  >&lt;p&gt;Hi Adrian,&lt;/p&gt;

&lt;p&gt;Please check the patch at &lt;a href=&quot;http://review.whamcloud.com/6262&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/6262&lt;/a&gt;; with this patch, there should be no assertion failures any more.&lt;/p&gt;

&lt;p&gt;However, there are other problems, from what I can see in the console messages. The client was evicted by some OSTs; for example, this kind of message:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;&amp;lt;4&amp;gt;Lustre: nero-OST0006-osc-ffff881c3a770400: Connection to nero-OST0006 (at 10.201.62.37@o2ib) was lost; in progress operations using this service will wait for recovery to complete
&amp;lt;3&amp;gt;LustreError: 167-0: nero-OST0006-osc-ffff881c3a770400: This client was evicted by nero-OST0006; in progress operations using this service will fail.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I need the log on the OST side to know what happened.&lt;/p&gt;

&lt;p&gt;Also, there are a couple of OOM messages. Can you please show me the output of lctl get_param llite.&amp;#42;.max_cached_mb and lctl get_param osc.&amp;#42;.cached_mb on the client side? Thanks!&lt;/p&gt;</comment>
                            <comment id="57719" author="adrian" created="Mon, 6 May 2013 12:53:10 +0000"  >&lt;p&gt;Thanks for the patch! I&apos;ll give it a try.&lt;/p&gt;

&lt;p&gt;I know about the eviction and the OOM errors: this is from a login node where people run really ugly stuff (forkbombing themselves, allocating gigabytes of RAM, etc.).&lt;/p&gt;

&lt;p&gt;lctl output:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;$ lctl get_param llite.nero-ffff8804388ba800.max_cached_mb
 llite.nero-ffff8804388ba800.max_cached_mb=
 users: 32
 max_cached_mb: 96767
 used_mb: 49481
 unused_mb: 47286
 reclaim_count: 0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;$ lctl get_param osc.*.osc_cached_mb
osc.nero-OST0000-osc-ffff8804388ba800.osc_cached_mb=used_mb: 1474
busy_cnt: 0
osc.nero-OST0001-osc-ffff8804388ba800.osc_cached_mb=used_mb: 1297
busy_cnt: 2
osc.nero-OST0002-osc-ffff8804388ba800.osc_cached_mb=used_mb: 1387
busy_cnt: 1
osc.nero-OST0003-osc-ffff8804388ba800.osc_cached_mb=used_mb: 1611
busy_cnt: 2
osc.nero-OST0004-osc-ffff8804388ba800.osc_cached_mb=used_mb: 1593
busy_cnt: 3
osc.nero-OST0005-osc-ffff8804388ba800.osc_cached_mb=used_mb: 1369
busy_cnt: 1
osc.nero-OST0006-osc-ffff8804388ba800.osc_cached_mb=used_mb: 1361
busy_cnt: 0
osc.nero-OST0007-osc-ffff8804388ba800.osc_cached_mb=used_mb: 1253
busy_cnt: 0
osc.nero-OST0008-osc-ffff8804388ba800.osc_cached_mb=used_mb: 1044
busy_cnt: 29
osc.nero-OST0009-osc-ffff8804388ba800.osc_cached_mb=used_mb: 1358
busy_cnt: 2
osc.nero-OST000a-osc-ffff8804388ba800.osc_cached_mb=used_mb: 1075
busy_cnt: 0
osc.nero-OST000b-osc-ffff8804388ba800.osc_cached_mb=used_mb: 947
busy_cnt: 1
osc.nero-OST000c-osc-ffff8804388ba800.osc_cached_mb=used_mb: 1029
busy_cnt: 1
osc.nero-OST000d-osc-ffff8804388ba800.osc_cached_mb=used_mb: 1015
busy_cnt: 25
osc.nero-OST000e-osc-ffff8804388ba800.osc_cached_mb=used_mb: 872
busy_cnt: 2
osc.nero-OST000f-osc-ffff8804388ba800.osc_cached_mb=used_mb: 1324
busy_cnt: 0
osc.nero-OST0010-osc-ffff8804388ba800.osc_cached_mb=used_mb: 1001
busy_cnt: 1
osc.nero-OST0011-osc-ffff8804388ba800.osc_cached_mb=used_mb: 1651
busy_cnt: 1
osc.nero-OST0012-osc-ffff8804388ba800.osc_cached_mb=used_mb: 1379
busy_cnt: 0
osc.nero-OST0013-osc-ffff8804388ba800.osc_cached_mb=used_mb: 1558
busy_cnt: 3
osc.nero-OST0014-osc-ffff8804388ba800.osc_cached_mb=used_mb: 2142
busy_cnt: 1
osc.nero-OST0015-osc-ffff8804388ba800.osc_cached_mb=used_mb: 2085
busy_cnt: 2
osc.nero-OST0016-osc-ffff8804388ba800.osc_cached_mb=used_mb: 1758
busy_cnt: 0
osc.nero-OST0017-osc-ffff8804388ba800.osc_cached_mb=used_mb: 1167
busy_cnt: 1
osc.nero-OST0018-osc-ffff8804388ba800.osc_cached_mb=used_mb: 1551
busy_cnt: 1
osc.nero-OST0019-osc-ffff8804388ba800.osc_cached_mb=used_mb: 1236
busy_cnt: 1
osc.nero-OST001a-osc-ffff8804388ba800.osc_cached_mb=used_mb: 1200
busy_cnt: 2
osc.nero-OST001b-osc-ffff8804388ba800.osc_cached_mb=used_mb: 1463
busy_cnt: 1
osc.nero-OST001c-osc-ffff8804388ba800.osc_cached_mb=used_mb: 1130
busy_cnt: 1
osc.nero-OST001d-osc-ffff8804388ba800.osc_cached_mb=used_mb: 1304
busy_cnt: 2
osc.nero-OST001e-osc-ffff8804388ba800.osc_cached_mb=used_mb: 1516
busy_cnt: 1
osc.nero-OST001f-osc-ffff8804388ba800.osc_cached_mb=used_mb: 1704
busy_cnt: 0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The relevant logs on the OSS during the eviction:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;2013-05-02T17:25:40+02:00 n-oss02 LustreError: 0:0:(ldlm_lockd.c:357:waiting_locks_callback()) ### lock callback timer expired after 12291s: evicting client at 10.201.32.34@o2ib  ns: filter-nero-OST0019_UUID lock: ffff8801b1d256c0/0x35306a2f3a794cd
3 lrc: 3/0,0 mode: PW/PW res: 76295681/0 rrc: 2 type: EXT [0-&amp;gt;18446744073709551615] (req 0-&amp;gt;4095) flags: 0x20 remote: 0x63d4bbb0c510ffb7 expref: 3351 pid: 4333 timeout 27550303246
2013-05-02T17:25:45+02:00 n-oss02 LustreError: 4489:0:(ldlm_lib.c:2239:target_send_reply_msg()) @@@ processing error (-107)  req@ffff8805f1b0e000 x1431201755470460/t0(0) o400-&amp;gt;&amp;lt;?&amp;gt;@&amp;lt;?&amp;gt;:0/0 lens 224/0 e 0 to 0 dl 1367508351 ref 1 fl Interpret:H/0/ffffffff rc -107/-1
2013-05-02T17:38:47+02:00 n-oss02 LustreError: 2752:0:(o2iblnd_cb.c:2991:kiblnd_check_txs_locked()) Timed out tx: tx_queue, 11 seconds
2013-05-02T17:38:47+02:00 n-oss02 LustreError: 2752:0:(o2iblnd_cb.c:3054:kiblnd_check_conns()) Timed out RDMA with 10.201.32.34@o2ib (68): c: 0, oc: 0, rc: 8
2013-05-02T17:39:32+02:00 n-oss02 LustreError: 138-a: nero-OST0001: A client on nid 10.201.32.34@o2ib was evicted due to a lock glimpse callback time out: rc -4
2013-05-02T18:47:14+02:00 n-oss02 LustreError: 0:0:(ldlm_lockd.c:357:waiting_locks_callback()) ### lock callback timer expired after 7026s: evicting client at 10.201.43.27@o2ib  ns: filter-nero-OST0001_UUID lock: ffff88047bd866c0/0x35306a2f3c0c65ec lrc: 3/0,0 mode: PW/PW res: 76322164/0 rrc: 2 type: EXT [0-&amp;gt;18446744073709551615] (req 0-&amp;gt;18446744073709551615) flags: 0x80000020 remote: 0xf6d8f9457b07fb6a expref: 176 pid: 3961 timeout 27555197173
2013-05-02T18:47:32+02:00 n-oss02 LustreError: 4288:0:(ldlm_lib.c:2239:target_send_reply_msg()) @@@ processing error (-107)  req@ffff8805d7e86400 x1427319253357050/t0(0) o400-&amp;gt;&amp;lt;?&amp;gt;@&amp;lt;?&amp;gt;:0/0 lens 224/0 e 0 to 0 dl 1367513258 ref 1 fl Interpret:H/0/ffffffff rc -107/-1
2013-05-02T20:34:41+02:00 n-oss02 LustreError: 0:0:(ldlm_lockd.c:357:waiting_locks_callback()) ### lock callback timer expired after 20590s: evicting client at 10.201.35.45@o2ib  ns: filter-nero-OST0009_UUID lock: ffff88082d127900/0x35306a2f3b4ac151 lrc: 3/0,0 mode: PW/PW res: 76311076/0 rrc: 2 type: EXT [0-&amp;gt;18446744073709551615] (req 8388608-&amp;gt;18446744073709551615) flags: 0x20 remote: 0x60a148bfb337040c expref: 208 pid: 4312 timeout 27561644539
2013-05-02T20:35:27+02:00 n-oss02 LustreError: 4288:0:(ldlm_lib.c:2239:target_send_reply_msg()) @@@ processing error (-107)  req@ffff8805ff210400 x1427382795207052/t0(0) o400-&amp;gt;&amp;lt;?&amp;gt;@&amp;lt;?&amp;gt;:0/0 lens 224/0 e 0 to 0 dl 1367519733 ref 1 fl Interpret:H/0/ffffffff rc -107/-1

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="57780" author="jay" created="Mon, 6 May 2013 22:34:45 +0000"  >&lt;p&gt;What&apos;s the ip address of the node where crash happened?&lt;/p&gt;</comment>
                            <comment id="57808" author="adrian" created="Tue, 7 May 2013 06:36:05 +0000"  >&lt;p&gt;The NID of the evicted and crashed client is: 10.201.32.34@o2ib&lt;/p&gt;

&lt;p&gt;I&apos;m not very surprised about the eviction: the client is known to have a pretty bad InfiniBand interface, and we have already observed random IB stalls on this hardware type.&lt;/p&gt;</comment>
                            <comment id="64254" author="paf" created="Wed, 14 Aug 2013 16:24:00 +0000"  >&lt;p&gt;Jinshan,&lt;/p&gt;

&lt;p&gt;Cray has been hitting what appears to be this bug recently.  We&apos;ll be running with &lt;a href=&quot;http://review.whamcloud.com/#/c/5419/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/5419/&lt;/a&gt; on our test systems and will report back with what we see.&lt;/p&gt;</comment>
                            <comment id="64802" author="cheng_shao" created="Wed, 21 Aug 2013 23:15:54 +0000"  >&lt;p&gt;I understand that we revamped the osc_lock_flush code path to replace the page-based approach as shown in cl_lock_page_out with extent-based one now. In the old code path, we will end up waiting in cl_sync_io_wait and if the first wait timed out, we will enter the second infinite uninterruptible wait anyway. That is equivalent to the effect of applying Jinshan&apos;s patch above. In another word, the simple fix doesn&apos;t make it worse. Therefore, should we move forward to get it landed? &lt;/p&gt;</comment>
                            <comment id="64868" author="spitzcor" created="Thu, 22 Aug 2013 18:29:25 +0000"  >&lt;p&gt;This bug has two patches, #5419 and #6262.  One should be abandoned.&lt;/p&gt;</comment>
                            <comment id="66356" author="paf" created="Wed, 11 Sep 2013 15:49:33 +0000"  >&lt;p&gt;Forgot to update this with our results. We haven&apos;t had this issue since landing &lt;a href=&quot;http://review.whamcloud.com/#/c/5419/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/5419/&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;In addition, we haven&apos;t noticed any of the possible issues with unkillable threads.&lt;/p&gt;</comment>
                            <comment id="66367" author="morrone" created="Wed, 11 Sep 2013 17:25:44 +0000"  >&lt;p&gt;LLNL has been carrying the &lt;a href=&quot;http://review.whamcloud.com/5419&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;5419&lt;/a&gt; patch in our tree and running with it in production.&lt;/p&gt;

&lt;p&gt;On one of our &lt;em&gt;smaller&lt;/em&gt; BG/Q systems I counted the number of hits of the &quot;wait ext to %d timedout, recovery in progress?&quot; message from the console logs and found that it hit 1258 times.  There is a fair bit of clustering that I didn&apos;t spend the time collapsing, so that may be more like 20-100 times since May.&lt;/p&gt;</comment>
                            <comment id="67436" author="pjones" created="Tue, 24 Sep 2013 19:01:47 +0000"  >&lt;p&gt;So, a patch just landed to master for this issue. Is that enough to warrant marking the issue as resolved or is something further required?&lt;/p&gt;</comment>
                            <comment id="67440" author="paf" created="Tue, 24 Sep 2013 19:11:14 +0000"  >&lt;p&gt;From the Cray perspective, I don&apos;t see anything further needed.  LLNL might feel differently.&lt;/p&gt;</comment>
                            <comment id="67474" author="morrone" created="Tue, 24 Sep 2013 21:56:12 +0000"  >&lt;p&gt;Oh, hmm...actually we are running any earlier version of the patch.  I have no idea what to make of the one that landed.&lt;/p&gt;</comment>
                            <comment id="67475" author="m.magrys" created="Tue, 24 Sep 2013 22:02:22 +0000"  >&lt;p&gt;We also hit this bug on 2.4.0 clients and 2.4.1RC2 servers. I think that if possible the patch should be added to 2.4.2 release, as it is severe.&lt;/p&gt;</comment>
                            <comment id="67554" author="jay" created="Wed, 25 Sep 2013 16:09:21 +0000"  >&lt;p&gt;Hi Chris,&lt;/p&gt;

&lt;p&gt;What&apos;s the version you&apos;re running right now?&lt;/p&gt;

&lt;p&gt;Jinshan&lt;/p&gt;</comment>
                            <comment id="67580" author="morrone" created="Wed, 25 Sep 2013 18:33:01 +0000"  >&lt;p&gt;Patch set 2.&lt;/p&gt;</comment>
                            <comment id="67624" author="jay" created="Wed, 25 Sep 2013 21:48:46 +0000"  >&lt;p&gt;Patch set 2 has more debug information. The real fix is the same.&lt;/p&gt;</comment>
                            <comment id="67629" author="jay" created="Wed, 25 Sep 2013 22:03:34 +0000"  >&lt;p&gt;The reason why this patch was made is that OSC has to wait for the IO RPC to finish anyway, no matter how much time it will need; otherwise it will hit the assertion in discard_cb(). If the OST is in recovery, it may take really long time for that OST to finish the RPC.&lt;/p&gt;</comment>
                            <comment id="68018" author="morrone" created="Tue, 1 Oct 2013 01:04:04 +0000"  >&lt;p&gt;But the fix looks different.  Patch set 4 changes the LWI_INTR interrupt handler from LWI_ON_SIGNAL_NOOP to NULL, and does nothing else.  Patch set 2 did not do that.  Instead the only significant change in patch set 2 was the removal of the KLASSERT.&lt;/p&gt;

&lt;p&gt;I&apos;m missing how those two things are equivalent.&lt;/p&gt;</comment>
                            <comment id="68185" author="jay" created="Wed, 2 Oct 2013 18:48:53 +0000"  >&lt;p&gt;indeed. patch set 2 allows the interrupt by setting interruption callback to LWI_ON_SIGNAL_NOOP. In that case, it makes sense to upgrade to patch set 4.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                            <outwardlinks description="duplicates">
                                        <issuelink>
            <issuekey id="17314">LU-2683</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="23001">LU-4581</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                                        </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="12261" name="console.rzuseqlac2.bz2" size="10267" author="prakash" created="Tue, 19 Feb 2013 18:07:30 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvitr:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>6732</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>