<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:26:22 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
<language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-2576] Hangs in osc_enter_cache due to dirty pages not being flushed</title>
                <link>https://jira.whamcloud.com/browse/LU-2576</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;We&apos;ve had reports of IO writes hanging in Sequoia, and after some initial debugging, have narrowed the hung threads to getting stuck with the following stack trace:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt; sysiod        S 00000fffa842633c     0 27910   3135 0x00000000                    
 Call Trace:                                                                       
 [c0000003ee34e480] [c0000003ee34e590] 0xc0000003ee34e590 (unreliable)             
 [c0000003ee34e650] [c000000000009b8c] .__switch_to+0xc4/0x100                     
 [c0000003ee34e6e0] [c0000000004364c8] .schedule+0x858/0x9c0                       
 [c0000003ee34e990] [8000000000a720c0] .cfs_waitq_wait+0x10/0x30 [libcfs]          
 [c0000003ee34ea00] [8000000004533560] .osc_enter_cache+0x880/0x12c0 [osc]         
 [c0000003ee34ebd0] [800000000453b210] .osc_queue_async_io+0xd10/0x1a40 [osc]   
 [c0000003ee34edf0] [8000000004516fe8] .osc_page_cache_add+0xf8/0x2a0 [osc]        
 [c0000003ee34eeb0] [80000000024468b8] .cl_page_cache_add+0xe8/0x3b0 [obdclass] 
 [c0000003ee34efe0] [8000000004facae8] .lov_page_cache_add+0xc8/0x340 [lov]        
 [c0000003ee34f0b0] [80000000024468b8] .cl_page_cache_add+0xe8/0x3b0 [obdclass] 
 [c0000003ee34f1e0] [80000000067ecda4] .vvp_io_commit_write+0x474/0x8a0 [lustre]
 [c0000003ee34f300] [800000000246098c] .cl_io_commit_write+0x11c/0x2d0 [obdclass]
 [c0000003ee34f3c0] [80000000067af410] .ll_commit_write+0x120/0x3e0 [lustre]       
 [c0000003ee34f490] [80000000067d1634] .ll_write_end+0x34/0x80 [lustre]            
 [c0000003ee34f520] [c000000000098bac] .generic_file_buffered_write+0x1ec/0x374 
 [c0000003ee34f660] [c000000000099290] .__generic_file_aio_write+0x374/0x3d8       
 [c0000003ee34f760] [c00000000009936c] .generic_file_aio_write+0x78/0xe8           
 [c0000003ee34f810] [80000000067eff9c] .vvp_io_write_start+0xfc/0x3e0 [lustre]  
 [c0000003ee34f8e0] [800000000245aedc] .cl_io_start+0xcc/0x220 [obdclass]          
 [c0000003ee34f980] [8000000002462cf4] .cl_io_loop+0x194/0x2c0 [obdclass]          
 [c0000003ee34fa30] [800000000676a278] .ll_file_io_generic+0x498/0x670 [lustre] 
 [c0000003ee34fb30] [800000000676a8d4] .ll_file_aio_write+0x1d4/0x3a0 [lustre]  
 [c0000003ee34fc00] [800000000676abf0] .ll_file_write+0x150/0x320 [lustre]         
 [c0000003ee34fce0] [c0000000000d5968] .vfs_write+0xd0/0x1c4                       
 [c0000003ee34fd80] [c0000000000d5b58] .SyS_write+0x54/0x98                        
 [c0000003ee34fe30] [c000000000000580] syscall_exit+0x0/0x2c
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The specific line it&apos;s stuck at is here:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;(gdb) l *osc_enter_cache+0x880
0x47f50 is in osc_enter_cache (/builddir/build/BUILD/lustre-2.3.56/lustre/osc/osc_cache.c:1526).
1521    /builddir/build/BUILD/lustre-2.3.56/lustre/osc/osc_cache.c: No such file or directory.
        in /builddir/build/BUILD/lustre-2.3.56/lustre/osc/osc_cache.c
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;1523                 CDEBUG(D_CACHE, &quot;%s: sleeping for cache space @ %p for %p\n&quot;,   
1524                        cli-&amp;gt;cl_import-&amp;gt;imp_obd-&amp;gt;obd_name, &amp;amp;ocw, oap);           
1525                                                                                 
1526                 rc = l_wait_event(ocw.ocw_waitq,                                
1527                                   cfs_list_empty(&amp;amp;ocw.ocw_entry), &amp;amp;lwi);        
1528                                                                                 
1529                 client_obd_list_lock(&amp;amp;cli-&amp;gt;cl_loi_list_lock);                   
1530                 cfs_list_del_init(&amp;amp;ocw.ocw_entry); 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;There were specific OSCs on a couple of the problem clients showing outstanding dirty pages for a significant amount of time:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;seqlac2@root:pdsh -w vulcanio[42-43] &apos;grep -v 0 /proc/fs/lustre/osc/*/cur_dirty_bytes&apos; | dshbak -c
----------------                                                                
vulcanio42                                                                      
----------------                                                                
/proc/fs/lustre/osc/ls1-OST0026-osc-c0000003c1100000/cur_dirty_bytes:65536         
/proc/fs/lustre/osc/ls1-OST0027-osc-c0000003c1100000/cur_dirty_bytes:65536         
/proc/fs/lustre/osc/ls1-OST0028-osc-c0000003c1100000/cur_dirty_bytes:65536         
/proc/fs/lustre/osc/ls1-OST002b-osc-c0000003c1100000/cur_dirty_bytes:65536         
----------------                                                                
vulcanio43                                                                      
----------------                                                                
/proc/fs/lustre/osc/ls1-OST0053-osc-c0000003e336b000/cur_dirty_bytes:65536
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Manually dropping caches on the problem clients seems to have cleared things up:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# echo 3 &amp;gt; /proc/sys/vm/drop_caches
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;In case it is useful, attached is a dump of the lustre pages taken prior to dropping caches on one of the problem clients:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# cat /proc/fs/lustre/llite/ls1-c0000003e336b000/dump_page_cache &amp;gt; vulcanio43-dump_page_cache-`date +%s`
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I also ran the same command after dropping caches, but the file was empty, so there&apos;s no reason to post it.&lt;/p&gt;</description>
                <environment></environment>
        <key id="17090">LU-2576</key>
            <summary>Hangs in osc_enter_cache due to dirty pages not being flushed</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="niu">Niu Yawei</assignee>
                                    <reporter username="prakash">Prakash Surya</reporter>
                        <labels>
                            <label>MB</label>
                            <label>ptr</label>
                            <label>sequoia</label>
                            <label>topsequoia</label>
                    </labels>
                <created>Fri, 4 Jan 2013 18:37:48 +0000</created>
                <updated>Sat, 14 Sep 2013 17:28:46 +0000</updated>
                            <resolved>Fri, 15 Feb 2013 21:33:29 +0000</resolved>
                                    <version>Lustre 2.4.0</version>
                                    <fixVersion>Lustre 2.4.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>9</watches>
                                                                            <comments>
                            <comment id="49979" author="prakash" created="Fri, 4 Jan 2013 19:02:15 +0000"  >&lt;p&gt;I should note we&apos;re running &lt;a href=&quot;https://github.com/chaos/lustre/commits/2.3.57-2chaos&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;2.3.57-2chaos&lt;/a&gt; on these clients.&lt;/p&gt;</comment>
                            <comment id="50022" author="pjones" created="Sat, 5 Jan 2013 10:28:38 +0000"  >&lt;p&gt;Niu&lt;/p&gt;

&lt;p&gt;Could you please look into this one?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="50035" author="niu" created="Sun, 6 Jan 2013 03:09:28 +0000"  >&lt;p&gt;In osc_enter_cache():&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
                rc = l_wait_event(ocw.ocw_waitq,
                                  cfs_list_empty(&amp;amp;ocw.ocw_entry), &amp;amp;lwi);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;I think we should take cl_loi_list_lock when checking the cfs_list_empty(&amp;amp;ocw.ocw_entry), otherwise, the wakeup could be missed.&lt;/p&gt;</comment>
                            <comment id="50036" author="niu" created="Sun, 6 Jan 2013 03:25:02 +0000"  >&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/4963&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/4963&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="50069" author="morrone" created="Mon, 7 Jan 2013 13:42:47 +0000"  >&lt;p&gt;Did you do any sanity testing on 4963?  We have testing time tomorrow on Sequoia, and I&apos;m willing to pull it in if you have done some minimal testing.   Otherwise I would wait for it to run through autotest and maybe test it on Thursday.&lt;/p&gt;</comment>
                            <comment id="50079" author="jay" created="Mon, 7 Jan 2013 16:15:28 +0000"  >&lt;p&gt;Apparently this problem is that a dirty page couldn&apos;t be written out so that there is no I/O RPC between client and OST; this caused the writing process to keep waiting for grants.&lt;/p&gt;

&lt;p&gt;I don&apos;t know why the dirty page was not written back. What are the value of /proc/sys/vm/dirty_XXX on the client node?&lt;/p&gt;</comment>
                            <comment id="50092" author="niu" created="Mon, 7 Jan 2013 23:45:22 +0000"  >&lt;p&gt;Chris, please hold on. On second thought, I realized that 4963 might not be the real fix. Thanks.&lt;/p&gt;</comment>
                            <comment id="50244" author="niu" created="Wed, 9 Jan 2013 23:50:21 +0000"  >&lt;p&gt;I think the wakeup condition in osc_enter_cache() is problematic: It checks only if the ocw is removed from the waiting list, however, in the wakeup function osc_wake_cache_waiters(), it&apos;s possible to return without removing the ocw if the global obd_dirty_pages is too high.&lt;/p&gt;

&lt;p&gt;I think we&apos;d change the wakeup condition as before: list_empty(&amp;amp;ocw-&amp;gt;ocw_entry) || rpcs_in_flight(cli) == 0, then if the obd_dirty_pages is too high &amp;amp; the dirty pages of this osc has been flushed, osc_enter_cache() will not go to sleep.&lt;/p&gt;</comment>
                            <comment id="50247" author="jay" created="Thu, 10 Jan 2013 02:31:34 +0000"  >&lt;p&gt;not sure I understand, if there is already too much dirty_pages, it doesn&apos;t help anyway&lt;/p&gt;</comment>
                            <comment id="50254" author="niu" created="Thu, 10 Jan 2013 04:12:42 +0000"  >&lt;blockquote&gt;
&lt;p&gt;not sure I understand, if there is already too much dirty_pages, it doesn&apos;t help anyway&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;If there are too many dirty pages (for the whole client but not this specific osc, obd_dirty_pages &amp;gt; obd_max_dirty_pages), osc_enter_cache() should return -EDQUOT to make this osc to start sync write, however with current code, osc_enter_cache() will go to sleep. (see osc_wake_cache_waiters(), no wakeup if obd_dirty_pages is too high)&lt;/p&gt;</comment>
                            <comment id="50297" author="prakash" created="Thu, 10 Jan 2013 18:32:42 +0000"  >&lt;blockquote&gt;
&lt;p&gt;If there are too many dirty pages (for the whole client but not this specific osc, obd_dirty_pages &amp;gt; obd_max_dirty_pages), osc_enter_cache() should return -EDQUOT to make this osc to start sync write&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;I don&apos;t see how the updated patch will cause &lt;tt&gt;osc_enter_cache()&lt;/tt&gt; to return &lt;tt&gt;-EDQUOT&lt;/tt&gt; in this case. If &lt;tt&gt;obd_dirty_pages &amp;gt; obd_max_dirty_pages&lt;/tt&gt; then &lt;tt&gt;ocw.ocw_rc&lt;/tt&gt; will not get set to &lt;tt&gt;-EDQUOT&lt;/tt&gt;, right? So then, how would &lt;tt&gt;osc_enter_cache()&lt;/tt&gt; return &lt;tt&gt;-EDQUOT&lt;/tt&gt;?&lt;/p&gt;</comment>
                            <comment id="50320" author="niu" created="Thu, 10 Jan 2013 23:01:10 +0000"  >&lt;blockquote&gt;
&lt;p&gt;I don&apos;t see how the updated patch will cause osc_enter_cache() to return -EDQUOT in this case. If obd_dirty_pages &amp;gt; obd_max_dirty_pages then ocw.ocw_rc will not get set to -EDQUOT, right? So then, how would osc_enter_cache() return -EDQUOT?&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;I think you are right, we shouldn&apos;t override rc by ocw_rc in such case.&lt;/p&gt;</comment>
                            <comment id="50568" author="prakash" created="Wed, 16 Jan 2013 14:30:31 +0000"  >&lt;p&gt;Revision 5 looks like it might fix the issue. I&apos;ll pull it in to our branch for testing once it goes through Maloo without issues.&lt;/p&gt;</comment>
                            <comment id="50606" author="prakash" created="Wed, 16 Jan 2013 18:33:52 +0000"  >&lt;blockquote&gt;
&lt;p&gt;I don&apos;t know why the dirty page was not written back. What are the value of /proc/sys/vm/dirty_XXX on the client node?&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;I haven&apos;t had access to a system hung in this state, but here&apos;s the values from one of the nodes:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;$ grep . /proc/sys/vm/dirty_*
/proc/sys/vm/dirty_background_bytes:0
/proc/sys/vm/dirty_background_ratio:10
/proc/sys/vm/dirty_bytes:0
/proc/sys/vm/dirty_expire_centisecs:3000
/proc/sys/vm/dirty_ratio:20
/proc/sys/vm/dirty_writeback_centisecs:500
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="50877" author="niu" created="Mon, 21 Jan 2013 03:43:23 +0000"  >&lt;p&gt;Seems the patchset 5 reveals another problem:&lt;/p&gt;

&lt;p&gt;In osc_enter_cache(), it&apos;s going to wait for grant when cli-&amp;gt;cl_dirty &amp;gt; 0, however osc_io_unplug() doesn&apos;t trigger any IO because of oo_nr_writes == 0. I think there should be something wrong in the new extent code. Jingshan, any idea?&lt;/p&gt;

&lt;p&gt;It can be reproduced by replay-single test_88.&lt;/p&gt;</comment>
                            <comment id="50915" author="jay" created="Mon, 21 Jan 2013 13:44:11 +0000"  >&lt;blockquote&gt;
&lt;p&gt;In osc_enter_cache(), it&apos;s going to wait for grant when cli-&amp;gt;cl_dirty &amp;gt; 0, however osc_io_unplug() doesn&apos;t trigger any IO because of oo_nr_writes == 0. I think there should be something wrong in the new extent code. Jingshan, any idea?&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;This is possible. If an extent is in OES_ACTIVE state, cl_dirty will be greater than 0 but oo_nr_writes can be 0. However, when the IO is finished, osc_extent_release() will detect that someone is waiting for grant and then unplug the queue immediately.&lt;/p&gt;
</comment>
                            <comment id="50971" author="niu" created="Tue, 22 Jan 2013 08:57:28 +0000"  >&lt;p&gt;I see the problem, in osc_enter_cache():&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt; &lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; (cli-&amp;gt;cl_dirty &amp;gt; 0 || cli-&amp;gt;cl_w_in_flight &amp;gt; 0) {
   osc_io_unplug();
   wait_for_grant();
 }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Actually we shouldn&apos;t use &apos;while&apos; here, because osc_io_unplug() can&apos;t trigger dirty flush for the object which isn&apos;t on the cl_loi_ready_list or cl_loi_hp_ready_list list (they do have few dirty pages, but not optimal to write out yet), in such case, I think we&apos;d simply return -EDQUOT but not wait in the loop until the dirty pages were flushed.&lt;/p&gt;</comment>
                            <comment id="50978" author="jay" created="Tue, 22 Jan 2013 13:26:16 +0000"  >&lt;p&gt;I don&apos;t know what the root cause is but apparently the above comment is incorrect because the queue will be unplugged no matter how many dirty pages there are, if there is cache waiters.&lt;/p&gt;</comment>
                            <comment id="51005" author="niu" created="Tue, 22 Jan 2013 21:25:16 +0000"  >&lt;p&gt;Xiong, indeed, my previous comment was wrong. The real problem is that we don&apos;t have any API to trigger dirty flush on all objects now, we were using osc_check_rpcs() for this purpose before, but now it&apos;s wrapped in osc_io_unplug0(), and unfortunately, what osc_io_unplug0() does is: If current object needs be flushed, flush all dirty object, otherwise, do nothing.&lt;/p&gt;

&lt;p&gt;So if the current object in osc_enter_cache() has no dirty pages (it&apos;s the first page to be put in cache), dirty flush will not be triggered by osc_io_unplug().&lt;/p&gt;</comment>
                            <comment id="51006" author="niu" created="Tue, 22 Jan 2013 21:29:01 +0000"  >&lt;p&gt;I realized that passing NULL to osc_io_unplug() can trigger dirty flush on all objects, will cook a new fix.&lt;/p&gt;</comment>
                            <comment id="51517" author="niu" created="Thu, 31 Jan 2013 03:04:41 +0000"  >&lt;p&gt;Prakash, does the patch fix your problem? Thanks.&lt;/p&gt;</comment>
                            <comment id="51541" author="morrone" created="Thu, 31 Jan 2013 11:48:51 +0000"  >&lt;p&gt;We don&apos;t know yet.  Other bugs keep preventing reasonable testing of the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2576&quot; title=&quot;Hangs in osc_enter_cache due to dirty pages not being flushed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2576&quot;&gt;&lt;del&gt;LU-2576&lt;/del&gt;&lt;/a&gt; patch.  I see it landed to master.  My hope is to get a good test run in today and leave a version of lustre with the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2576&quot; title=&quot;Hangs in osc_enter_cache due to dirty pages not being flushed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2576&quot;&gt;&lt;del&gt;LU-2576&lt;/del&gt;&lt;/a&gt; patch installed for users to try and see if things improve.&lt;/p&gt;</comment>
                            <comment id="51638" author="morrone" created="Fri, 1 Feb 2013 13:30:13 +0000"  >&lt;p&gt;Testing for this issue went well yesterday.  We&apos;ll see how the patch holds up to real users over the next week, but I am cautiously optimistic that the problem is fixed.&lt;/p&gt;</comment>
                            <comment id="52469" author="jlevi" created="Fri, 15 Feb 2013 14:18:32 +0000"  >&lt;p&gt;Has the testing continued to be successful and we can close this ticket?&lt;/p&gt;</comment>
                            <comment id="52499" author="morrone" created="Fri, 15 Feb 2013 18:32:16 +0000"  >&lt;p&gt;Yes, so far no repeats of this bug.&lt;/p&gt;</comment>
                            <comment id="52517" author="niu" created="Fri, 15 Feb 2013 21:33:29 +0000"  >&lt;p&gt;patch landed for 2.4&lt;/p&gt;</comment>
                            <comment id="66676" author="lflis" created="Sat, 14 Sep 2013 11:59:57 +0000"  >&lt;p&gt;Dear All,&lt;/p&gt;

&lt;p&gt;Has this patch (#7) been really landed for 2.4 and 2.4.1-RC2?&lt;br/&gt;
&lt;a href=&quot;http://review.whamcloud.com/#/c/4963/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/4963/&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;We have observed the issue for 2.4.0 and 2.4.1-RC2 so looks like the problem is still there.&lt;/p&gt;

&lt;p&gt;Here&apos;s our process stack:&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: ost_check     S 0000000000000005     0 10821      1 0x00000080&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: ffff880127051548 0000000000000086 0000000000000000 0000000000000000&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: 0000000000000000 0000000000000082 ffff880127051528 ffffffff8106327e&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: ffff88020d5a3af8 ffff880127051fd8 000000000000fb88 ffff88020d5a3af8&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: Call Trace:&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8106327e&amp;gt;&amp;#93;&lt;/span&gt; ? try_to_wake_up+0x24e/0x3e0&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa03236fe&amp;gt;&amp;#93;&lt;/span&gt; cfs_waitq_wait+0xe/0x10 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa08d09ea&amp;gt;&amp;#93;&lt;/span&gt; osc_enter_cache+0x85a/0xb40 &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0338917&amp;gt;&amp;#93;&lt;/span&gt; ? cfs_hash_bd_lookup_intent+0x37/0x130 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81063410&amp;gt;&amp;#93;&lt;/span&gt; ? default_wake_function+0x0/0x20&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa08d7fd1&amp;gt;&amp;#93;&lt;/span&gt; osc_queue_async_io+0x1081/0x1c18 &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff810573c5&amp;gt;&amp;#93;&lt;/span&gt; ? select_idle_sibling+0x95/0x150&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8121ba89&amp;gt;&amp;#93;&lt;/span&gt; ? security_capable+0x29/0x30&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8107baba&amp;gt;&amp;#93;&lt;/span&gt; ? capable+0x2a/0x60&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa04841c5&amp;gt;&amp;#93;&lt;/span&gt; ? cl_page_slice_add+0x55/0x140 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa08bd2ec&amp;gt;&amp;#93;&lt;/span&gt; ? osc_page_init+0xec/0x890 &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa08be2e9&amp;gt;&amp;#93;&lt;/span&gt; osc_page_cache_add+0xc9/0x1d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa04848af&amp;gt;&amp;#93;&lt;/span&gt; cl_page_cache_add+0x7f/0x2a0 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa094e335&amp;gt;&amp;#93;&lt;/span&gt; lov_page_cache_add+0x85/0x200 &lt;span class=&quot;error&quot;&gt;&amp;#91;lov&amp;#93;&lt;/span&gt;&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa04848af&amp;gt;&amp;#93;&lt;/span&gt; cl_page_cache_add+0x7f/0x2a0 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa04889fc&amp;gt;&amp;#93;&lt;/span&gt; ? cl_page_find0+0x44c/0x810 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0481439&amp;gt;&amp;#93;&lt;/span&gt; ? cl_env_hops_keycmp+0x19/0x70 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa03389bd&amp;gt;&amp;#93;&lt;/span&gt; ? cfs_hash_bd_lookup_intent+0xdd/0x130 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0a26225&amp;gt;&amp;#93;&lt;/span&gt; vvp_io_commit_write+0x3e5/0x5b0 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0493cbd&amp;gt;&amp;#93;&lt;/span&gt; cl_io_commit_write+0xad/0x1d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa09fb37e&amp;gt;&amp;#93;&lt;/span&gt; ll_commit_write+0xee/0x320 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0a13650&amp;gt;&amp;#93;&lt;/span&gt; ll_write_end+0x30/0x60 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8111a81a&amp;gt;&amp;#93;&lt;/span&gt; generic_file_buffered_write+0x18a/0x2e0&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81075887&amp;gt;&amp;#93;&lt;/span&gt; ? current_fs_time+0x27/0x30&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8111c210&amp;gt;&amp;#93;&lt;/span&gt; __generic_file_aio_write+0x260/0x490&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8111c4c8&amp;gt;&amp;#93;&lt;/span&gt; generic_file_aio_write+0x88/0x100&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0a283db&amp;gt;&amp;#93;&lt;/span&gt; vvp_io_write_start+0xcb/0x2e0 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa049072a&amp;gt;&amp;#93;&lt;/span&gt; cl_io_start+0x6a/0x140 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0494e64&amp;gt;&amp;#93;&lt;/span&gt; cl_io_loop+0xb4/0x1b0 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa09ce9e0&amp;gt;&amp;#93;&lt;/span&gt; ll_file_io_generic+0x450/0x600 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa09cf922&amp;gt;&amp;#93;&lt;/span&gt; ll_file_aio_write+0x142/0x2c0 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa09cfc0c&amp;gt;&amp;#93;&lt;/span&gt; ll_file_write+0x16c/0x2a0 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81181368&amp;gt;&amp;#93;&lt;/span&gt; vfs_write+0xb8/0x1a0&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81181c61&amp;gt;&amp;#93;&lt;/span&gt; sys_write+0x51/0x90&lt;br/&gt;
Sep 14 13:40:12 n0269-g6l kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100b072&amp;gt;&amp;#93;&lt;/span&gt; system_call_fastpath+0x16/0x1b&lt;/p&gt;


&lt;p&gt;Unfortunately dropping caches doesn&apos;t unblock the process. &lt;br/&gt;
The only way to get OST communication to work again is to deactivate and reactivate the osc&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@n0269-g6l ~&amp;#93;&lt;/span&gt;# echo 0 &amp;gt; /proc/fs/lustre/osc/scratch-OST0003-osc-ffff8802126ba400/active &lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@n0269-g6l ~&amp;#93;&lt;/span&gt;# echo 1 &amp;gt; /proc/fs/lustre/osc/scratch-OST0003-osc-ffff8802126ba400/active&lt;/p&gt;

&lt;p&gt;Could you please have a look?&lt;/p&gt;

&lt;p&gt;In case more debugging info is needed please let us know. We still keep some &lt;br/&gt;
clients hanging for debugging purposes.&lt;/p&gt;

&lt;p&gt;Best Regards&lt;br/&gt;
&amp;#8211;&lt;br/&gt;
Lukasz Flis&lt;br/&gt;
ACC Cyfronet&lt;/p&gt;</comment>
                            <comment id="66678" author="jay" created="Sat, 14 Sep 2013 17:11:02 +0000"  >&lt;p&gt;Hi Lukasz,&lt;/p&gt;


&lt;p&gt;Do you have &lt;a href=&quot;http://review.whamcloud.com/6554&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/6554&lt;/a&gt; in your tree? It looks like we need to cherry pick this patch into b2_4 as well.&lt;/p&gt;</comment>
                            <comment id="66679" author="jay" created="Sat, 14 Sep 2013 17:16:41 +0000"  >&lt;p&gt;patch for b2_4 is at: &lt;a href=&quot;http://review.whamcloud.com/7657&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/7657&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="66680" author="lflis" created="Sat, 14 Sep 2013 17:28:46 +0000"  >&lt;p&gt;Hi Jinshan,&lt;/p&gt;

&lt;p&gt;Thank you very much for pointing this one out and for patch for 2.4&lt;br/&gt;
we&apos;ll give it a try ASAP. There was no sign of this change in 2.4.0 and 2.4.1RC2&lt;/p&gt;

&lt;p&gt;I see that 2.4.1 is ready for release (packages are ready and published) - maybe it&apos;s worth &lt;br/&gt;
to consider holding (or updating) release a bit as this bug can manifest itself in few hours &lt;br/&gt;
(i have an impression that oss recoveries raise the chance of getting this one)&lt;/p&gt;


&lt;p&gt;Thank you very much for the quick patch. I&apos;ll try to give a feedback asap&lt;/p&gt;

&lt;p&gt;&amp;#8211;&lt;br/&gt;
Lukasz Flis&lt;/p&gt;
</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="15971">LU-2139</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="18673">LU-3261</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="19217">LU-3416</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="18712">LU-3277</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="12140" name="vulcanio43-dump_page_cache-1357340735" size="275" author="prakash" created="Fri, 4 Jan 2013 18:37:48 +0000"/>
                    </attachments>
                <subtasks>
                            <subtask id="17097">LU-2582</subtask>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzven3:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>6013</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>