<?xml version="1.0" encoding="UTF-8"?>
<!--
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:26:46 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-2622] All CPUs spinning on cl_envs_guard lock under ll_releasepage during memory reclaim</title>
                <link>https://jira.whamcloud.com/browse/LU-2622</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;During testing on Sequoia, we&apos;ve seen specific IO nodes heavily contend on the &lt;tt&gt;cl_envs_guard&lt;/tt&gt; lock while memory reclaim is happening. As a result, this severely degrades the performance of the problem client.&lt;/p&gt;

&lt;p&gt;An example stack trace of a thread spinning on the lock is below:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;CPU56:                                                                             
Call Trace:                                                                        
[c00000000fe3bb30] [c000000000008d1c] .show_stack+0x7c/0x184 (unreliable)          
[c00000000fe3bbe0] [c00000000027604c] .showacpu+0x64/0x94                          
[c00000000fe3bc70] [c000000000068b30] .generic_smp_call_function_interrupt+0x10c/0x230
[c00000000fe3bd40] [c00000000001d11c] .smp_message_recv+0x34/0x78                  
[c00000000fe3bdc0] [c00000000002526c] .bgq_ipi_dispatch+0x118/0x18c                
[c00000000fe3be50] [c00000000007b20c] .handle_IRQ_event+0x88/0x18c                 
[c00000000fe3bf00] [c00000000007dc90] .handle_percpu_irq+0x8c/0x100                
[c00000000fe3bf90] [c00000000001b808] .call_handle_irq+0x1c/0x2c                   
[c0000003e1c4a4c0] [c0000000000059f0] .do_IRQ+0x154/0x1e0                          
[c0000003e1c4a570] [c0000000000144dc] exc_external_input_book3e+0x110/0x114        
--- Exception: 501 at ._raw_spin_lock+0xd8/0x1a8                                   
    LR = ._raw_spin_lock+0x104/0x1a8                                               
[c0000003e1c4a860] [8000000000b04f38] libcfs_nidstrings+0x2acc/0xfffffffffffe5824 [libcfs] (unreliable)
[c0000003e1c4a910] [c00000000042d4cc] ._spin_lock+0x10/0x24                        
[c0000003e1c4a980] [80000000024c2f4c] .cl_env_get+0xec/0x480 [obdclass]            
[c0000003e1c4aa60] [80000000024c336c] .cl_env_nested_get+0x8c/0xf0 [obdclass]   
[c0000003e1c4aaf0] [800000000692070c] .ll_releasepage+0xbc/0x200 [lustre]          
[c0000003e1c4aba0] [c000000000094110] .try_to_release_page+0x68/0x8c               
[c0000003e1c4ac10] [c0000000000a4190] .shrink_page_list.clone.0+0x3d8/0x63c        
[c0000003e1c4adc0] [c0000000000a47d8] .shrink_inactive_list+0x3e4/0x690            
[c0000003e1c4af90] [c0000000000a4f54] .shrink_zone+0x4d0/0x4d4                     
[c0000003e1c4b0c0] [c0000000000a5a68] .try_to_free_pages+0x204/0x3d0               
[c0000003e1c4b220] [c00000000009d044] .__alloc_pages_nodemask+0x460/0x738          
[c0000003e1c4b3a0] [c000000000095af4] .grab_cache_page_write_begin+0x7c/0xec       
[c0000003e1c4b450] [8000000006920964] .ll_write_begin+0x94/0x270 [lustre]          
[c0000003e1c4b520] [c0000000000968c8] .generic_file_buffered_write+0x148/0x374  
[c0000003e1c4b660] [c000000000097050] .__generic_file_aio_write+0x374/0x3d8        
[c0000003e1c4b760] [c00000000009712c] .generic_file_aio_write+0x78/0xe8            
[c0000003e1c4b810] [800000000693ed4c] .vvp_io_write_start+0xfc/0x3e0 [lustre]   
[c0000003e1c4b8e0] [80000000024d9c6c] .cl_io_start+0xcc/0x220 [obdclass]           
[c0000003e1c4b980] [80000000024e1a84] .cl_io_loop+0x194/0x2c0 [obdclass]           
[c0000003e1c4ba30] [80000000068ba1d8] .ll_file_io_generic+0x498/0x670 [lustre]  
[c0000003e1c4bb30] [80000000068ba834] .ll_file_aio_write+0x1d4/0x3a0 [lustre]   
[c0000003e1c4bc00] [80000000068bab50] .ll_file_write+0x150/0x320 [lustre]          
[c0000003e1c4bce0] [c0000000000d1ba8] .vfs_write+0xd0/0x1c4                        
[c0000003e1c4bd80] [c0000000000d1d98] .SyS_write+0x54/0x98                         
[c0000003e1c4be30] [c000000000000580] syscall_exit+0x0/0x2c
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I also see this for calls in &lt;tt&gt;cl_env_put&lt;/tt&gt;:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;CPU63:                                                                          
Call Trace:                                                                     
[c00000000fe03b30] [c000000000008d1c] .show_stack+0x7c/0x184 (unreliable)       
[c00000000fe03be0] [c00000000027604c] .showacpu+0x64/0x94                       
[c00000000fe03c70] [c000000000068b30] .generic_smp_call_function_interrupt+0x10c/0x230
[c00000000fe03d40] [c00000000001d11c] .smp_message_recv+0x34/0x78               
[c00000000fe03dc0] [c00000000002526c] .bgq_ipi_dispatch+0x118/0x18c             
[c00000000fe03e50] [c00000000007b20c] .handle_IRQ_event+0x88/0x18c              
[c00000000fe03f00] [c00000000007dc90] .handle_percpu_irq+0x8c/0x100             
[c00000000fe03f90] [c00000000001b808] .call_handle_irq+0x1c/0x2c                
[c0000003c4f0a510] [c0000000000059f0] .do_IRQ+0x154/0x1e0                       
[c0000003c4f0a5c0] [c0000000000144dc] exc_external_input_book3e+0x110/0x114     
--- Exception: 501 at ._raw_spin_lock+0xdc/0x1a8                                
    LR = ._raw_spin_lock+0x104/0x1a8                                             
[c0000003c4f0a8b0] [800000000697a578] msgdata.87439+0x20/0xfffffffffffccf88 [lustre] (unreliable)
[c0000003c4f0a960] [c00000000042d4cc] ._spin_lock+0x10/0x24                     
[c0000003c4f0a9d0] [80000000024c17e8] .cl_env_put+0x178/0x420 [obdclass]        
[c0000003c4f0aa70] [80000000024c1ab0] .cl_env_nested_put+0x20/0x40 [obdclass]   
[c0000003c4f0aaf0] [8000000006920794] .ll_releasepage+0x144/0x200 [lustre]      
[c0000003c4f0aba0] [c000000000094110] .try_to_release_page+0x68/0x8c            
[c0000003c4f0ac10] [c0000000000a4190] .shrink_page_list.clone.0+0x3d8/0x63c     
[c0000003c4f0adc0] [c0000000000a47d8] .shrink_inactive_list+0x3e4/0x690         
[c0000003c4f0af90] [c0000000000a4f54] .shrink_zone+0x4d0/0x4d4                  
[c0000003c4f0b0c0] [c0000000000a5a68] .try_to_free_pages+0x204/0x3d0            
[c0000003c4f0b220] [c00000000009d044] .__alloc_pages_nodemask+0x460/0x738       
[c0000003c4f0b3a0] [c000000000095af4] .grab_cache_page_write_begin+0x7c/0xec    
[c0000003c4f0b450] [8000000006920964] .ll_write_begin+0x94/0x270 [lustre]       
[c0000003c4f0b520] [c0000000000968c8] .generic_file_buffered_write+0x148/0x374  
[c0000003c4f0b660] [c000000000097050] .__generic_file_aio_write+0x374/0x3d8     
[c0000003c4f0b760] [c00000000009712c] .generic_file_aio_write+0x78/0xe8         
[c0000003c4f0b810] [800000000693ed4c] .vvp_io_write_start+0xfc/0x3e0 [lustre]   
[c0000003c4f0b8e0] [80000000024d9c6c] .cl_io_start+0xcc/0x220 [obdclass]        
[c0000003c4f0b980] [80000000024e1a84] .cl_io_loop+0x194/0x2c0 [obdclass]        
[c0000003c4f0ba30] [80000000068ba1d8] .ll_file_io_generic+0x498/0x670 [lustre]  
[c0000003c4f0bb30] [80000000068ba834] .ll_file_aio_write+0x1d4/0x3a0 [lustre]   
[c0000003c4f0bc00] [80000000068bab50] .ll_file_write+0x150/0x320 [lustre]       
[c0000003c4f0bce0] [c0000000000d1ba8] .vfs_write+0xd0/0x1c4                     
[c0000003c4f0bd80] [c0000000000d1d98] .SyS_write+0x54/0x98                      
[c0000003c4f0be30] [c000000000000580] syscall_exit+0x0/0x2c
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Listing the &lt;tt&gt;cl_env_*&lt;/tt&gt; addresses:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedHeader panelHeader&quot; style=&quot;border-bottom-width: 1px;&quot;&gt;&lt;b&gt;cl_env_get+0xec&lt;/b&gt;&lt;/div&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;(gdb) l *cl_env_get+0xec
0xa2f4c is in cl_env_get (/builddir/build/BUILD/lustre-2.3.58/lustre/obdclass/cl_object.c:804).

 803         ENTRY;                                                                  
 804         spin_lock(&amp;amp;cl_envs_guard);                                              
 805         LASSERT(equi(cl_envs_cached_nr == 0, cfs_list_empty(&amp;amp;cl_envs))); 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedHeader panelHeader&quot; style=&quot;border-bottom-width: 1px;&quot;&gt;&lt;b&gt;cl_env_put+0x178&lt;/b&gt;&lt;/div&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;(gdb) l *cl_env_put+0x178
0xa17e8 is in cl_env_put (/builddir/build/BUILD/lustre-2.3.58/lustre/obdclass/cl_object.c:979).

 978                     (env-&amp;gt;le_ses-&amp;gt;lc_tags &amp;amp; ~LCT_HAS_EXIT) == LCT_SESSION) {    
 979                         spin_lock(&amp;amp;cl_envs_guard);                              
 980                         cfs_list_add(&amp;amp;cle-&amp;gt;ce_linkage, &amp;amp;cl_envs);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Dumping the stacks for all active CPUs shows nearly every one of them contending on this lock.&lt;/p&gt;</description>
                <environment></environment>
        <key id="17179">LU-2622</key>
            <summary>All CPUs spinning on cl_envs_guard lock under ll_releasepage during memory reclaim</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="3">Duplicate</resolution>
                                        <assignee username="jay">Jinshan Xiong</assignee>
                                    <reporter username="prakash">Prakash Surya</reporter>
                        <labels>
                            <label>MB</label>
                            <label>llnl</label>
                    </labels>
                <created>Tue, 15 Jan 2013 17:55:05 +0000</created>
                <updated>Wed, 3 May 2017 20:04:48 +0000</updated>
                            <resolved>Wed, 3 May 2017 20:04:48 +0000</resolved>
                                    <version>Lustre 2.4.0</version>
                                    <fixVersion>Lustre 2.4.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>8</watches>
                                                                            <comments>
                            <comment id="50511" author="pjones" created="Tue, 15 Jan 2013 18:31:30 +0000"  >&lt;p&gt;Jinshan&lt;/p&gt;

&lt;p&gt;Could you please comment on this one?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="50512" author="jay" created="Tue, 15 Jan 2013 18:52:35 +0000"  >&lt;p&gt;Let&apos;s improve the logic in cl_env_cache_purge() to delist a couple of cl_env inside cl_envs_guard.&lt;/p&gt;</comment>
                            <comment id="50517" author="morrone" created="Tue, 15 Jan 2013 19:48:57 +0000"  >&lt;p&gt;I&apos;m not sure why we started seeing this problem recently, but today it was so bad that both writes and reads averaged 16 GB/s on Sequoia to all of grove.  Two weeks ago we were running at 850 GB/s.&lt;/p&gt;</comment>
                            <comment id="50522" author="jay" created="Tue, 15 Jan 2013 20:38:42 +0000"  >&lt;p&gt;This sounds like recent patches degraded the performance. It will be great if you can do a comparison and collect performance data.&lt;/p&gt;

&lt;p&gt;Anyway, I pushed a patch to reduce contention on this lock at: &lt;a href=&quot;http://review.whamcloud.com/5034&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5034&lt;/a&gt;, please give it a try.&lt;/p&gt;</comment>
                            <comment id="50555" author="prakash" created="Wed, 16 Jan 2013 13:13:16 +0000"  >&lt;p&gt;Jinshan, why do you think optimizing &lt;tt&gt;cl_env_cache_purge()&lt;/tt&gt; will fix our issue? I didn&apos;t see any threads down in that function, so I&apos;m skeptical. I think it&apos;s the &lt;tt&gt;cl_env_{put|get}&lt;/tt&gt; functions that we need to take a closer look at.&lt;/p&gt;

&lt;p&gt;Also, we don&apos;t have enough evidence to make the statement that we haven&apos;t seen this previously, IMO. We may have seen it on a much larger portion of the IO nodes than previously, but there&apos;s a chance we&apos;ve been seeing it sporadically all along and didn&apos;t notice.&lt;/p&gt;</comment>
                            <comment id="50557" author="jay" created="Wed, 16 Jan 2013 13:36:47 +0000"  >&lt;p&gt;You&apos;re right. I thought cache shrinker is linked to cl_env_cache_purge(), but this patch does optimization anyway. I will work out a new patch soon.&lt;/p&gt;</comment>
                            <comment id="50564" author="prakash" created="Wed, 16 Jan 2013 14:27:10 +0000"  >&lt;p&gt;OK. I&apos;m glad you agree.&lt;/p&gt;

&lt;p&gt;Slightly related, is it even safe to call &lt;tt&gt;cl_env_get&lt;/tt&gt; from within shrinker context? I worry about the case where there are no available &lt;tt&gt;env&lt;/tt&gt; objects in the cache, thus causing an allocation to occur via &lt;tt&gt;cl_env_new&lt;/tt&gt; while we&apos;re trying to free memory. Is that a valid concern?&lt;/p&gt;</comment>
                            <comment id="50717" author="jay" created="Thu, 17 Jan 2013 14:53:35 +0000"  >&lt;p&gt;Shrinker should be okay because it&apos;s called when the memory is under pressure.&lt;/p&gt;

&lt;p&gt;For this problem, I think we should use a percpu data structure to manage caching cl_envs. However, I&apos;m wondering what kind of job you were running because usually there should be no so many cl_env_get/put calls.&lt;/p&gt;</comment>
                            <comment id="50723" author="prakash" created="Thu, 17 Jan 2013 16:19:39 +0000"  >&lt;p&gt;I believe the workload was just a simple IOR. We somehow got into a low memory situation where each CPU was trying to call ll_releasepage, causing the contention on cl_env_get/put.&lt;/p&gt;

&lt;p&gt;I also thought about using percpu caches, but how do you suppose we handle threads being rescheduled to different cpus? The first solution that comes to mind:&lt;/p&gt;

&lt;p&gt;1. Thread A on CPU1 cl_env_get&apos;s from CPU1&apos;s cache&lt;br/&gt;
2. Thread A sleeps and is rescheduled to CPU2&lt;br/&gt;
3. Thread A on CPU2 cl_env_put&apos;s to CPU2 cache&lt;/p&gt;

&lt;p&gt;There would be no need to lock the cache during insertion and deletions, but could lead to an uneven distribution of objects on the different CPU caches (i.e. unmatched number of get&apos;s and put&apos;s to any CPU cache). Although, that might not be much of a problem in practice.&lt;/p&gt;</comment>
                            <comment id="51425" author="prakash" created="Tue, 29 Jan 2013 19:10:08 +0000"  >&lt;p&gt;Jinshan, Do we absolutely need to use a &lt;tt&gt;env&lt;/tt&gt; in the &lt;tt&gt;ll_releasepage&lt;/tt&gt; call path? Rather than making the code more complex, it would be great if we could just eliminate the need for the &lt;tt&gt;env&lt;/tt&gt; structure at all in this call path.&lt;/p&gt;</comment>
                            <comment id="51428" author="morrone" created="Tue, 29 Jan 2013 19:41:58 +0000"  >&lt;p&gt;This does seem to be a very serious performance problem for Sequoia.  I don&apos;t know the reason, but under 2.3.58-7chaos the problem appears to be even worse.  Basically &lt;em&gt;all&lt;/em&gt; IONs easily get into the state where many threads are thrashing on a spin lock.  While oprofile on ppc64 won&apos;t give me a backtrace, the next highest cpu users are cl_env_get and cl_env_put.  At the same time, the sysrq-l backtraces show many active tasks spinning under either cl_env_get or cl_env_put.&lt;/p&gt;

&lt;p&gt;The remaining writers that are not currently active on a cpu tend to be in either this spot:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2013-01-29 12:40:05.157376 {DefaultControlEventListener} [mmcs]{760}.0.0: sysiod        S 00000fffa956633c     0  3926   3075 0x00000000
2013-01-29 12:40:05.157427 {DefaultControlEventListener} [mmcs]{760}.0.0: Call Trace:
2013-01-29 12:40:05.157478 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e06de340] [c0000003e06de400] 0xc0000003e06de400 (unreliable)
2013-01-29 12:40:05.157534 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e06de510] [c000000000009b2c] .__switch_to+0xc4/0x100
2013-01-29 12:40:05.157586 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e06de5a0] [c00000000042a418] .schedule+0x7d4/0x944
2013-01-29 12:40:05.157636 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e06de850] [c00000000042ab04] .schedule_timeout+0x1e0/0x228
2013-01-29 12:40:05.157689 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e06de930] [8000000000ab2034] .cfs_waitq_timedwait+0x14/0x30 [libcfs]
2013-01-29 12:40:05.157751 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e06de9a0] [80000000046a39c8] .osc_enter_cache+0xc78/0x16a0 [osc]
2013-01-29 12:40:05.157809 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e06debd0] [80000000046ab440] .osc_queue_async_io+0xd10/0x1a20 [osc]
2013-01-29 12:40:05.157860 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e06dedf0] [8000000004686d48] .osc_page_cache_add+0xf8/0x2a0 [osc]
2013-01-29 12:40:05.160971 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e06deeb0] [80000000024c59c8] .cl_page_cache_add+0xe8/0x3b0 [obdclass]
2013-01-29 12:40:05.161028 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e06defe0] [800000000511c7f8] .lov_page_cache_add+0xc8/0x340 [lov]
2013-01-29 12:40:05.161079 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e06df0b0] [80000000024c59c8] .cl_page_cache_add+0xe8/0x3b0 [obdclass]
2013-01-29 12:40:05.161130 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e06df1e0] [800000000694bb54] .vvp_io_commit_write+0x474/0x8a0 [lustre]
2013-01-29 12:40:05.161181 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e06df300] [80000000024df71c] .cl_io_commit_write+0x11c/0x2d0 [obdclass]
2013-01-29 12:40:05.161232 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e06df3c0] [800000000690eaa0] .ll_commit_write+0x120/0x3e0 [lustre]
2013-01-29 12:40:05.161283 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e06df490] [8000000006930884] .ll_write_end+0x34/0x80 [lustre]
2013-01-29 12:40:05.161334 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e06df520] [c00000000009696c] .generic_file_buffered_write+0x1ec/0x374
2013-01-29 12:40:05.161384 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e06df660] [c000000000097050] .__generic_file_aio_write+0x374/0x3d8
2013-01-29 12:40:05.161435 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e06df760] [c00000000009712c] .generic_file_aio_write+0x78/0xe8
2013-01-29 12:40:05.161486 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e06df810] [800000000694ed4c] .vvp_io_write_start+0xfc/0x3e0 [lustre]
2013-01-29 12:40:05.161537 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e06df8e0] [80000000024d9c6c] .cl_io_start+0xcc/0x220 [obdclass]
2013-01-29 12:40:05.161588 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e06df980] [80000000024e1a84] .cl_io_loop+0x194/0x2c0 [obdclass]
2013-01-29 12:40:05.161638 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e06dfa30] [80000000068ca1d8] .ll_file_io_generic+0x498/0x670 [lustre]
2013-01-29 12:40:05.161689 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e06dfb30] [80000000068ca834] .ll_file_aio_write+0x1d4/0x3a0 [lustre]
2013-01-29 12:40:05.161740 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e06dfc00] [80000000068cab50] .ll_file_write+0x150/0x320 [lustre]
2013-01-29 12:40:05.161791 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e06dfce0] [c0000000000d1ba8] .vfs_write+0xd0/0x1c4
2013-01-29 12:40:05.161842 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e06dfd80] [c0000000000d1d98] .SyS_write+0x54/0x98
2013-01-29 12:40:05.161893 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e06dfe30] [c000000000000580] syscall_exit+0x0/0x2c
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;or waiting here:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2013-01-29 12:40:05.155420 {DefaultControlEventListener} [mmcs]{760}.0.0: sysiod        R  running task        0  3957   3075 0x00000000
2013-01-29 12:40:05.155470 {DefaultControlEventListener} [mmcs]{760}.0.0: Call Trace:
2013-01-29 12:40:05.155522 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e083ef90] [80000000024c148c] .cl_object_attr_unlock+0x1c/0x30 [obdclass] (unreliable)
2013-01-29 12:40:05.155572 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e083f160] [c000000000009b2c] .__switch_to+0xc4/0x100
2013-01-29 12:40:05.155624 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e083f1f0] [c00000000042a418] .schedule+0x7d4/0x944
2013-01-29 12:40:05.155675 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e083f4a0] [c00000000042a890] ._cond_resched+0x38/0x64
2013-01-29 12:40:05.155726 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e083f520] [c00000000009699c] .generic_file_buffered_write+0x21c/0x374
2013-01-29 12:40:05.155776 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e083f660] [c000000000097050] .__generic_file_aio_write+0x374/0x3d8
2013-01-29 12:40:05.155835 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e083f760] [c00000000009712c] .generic_file_aio_write+0x78/0xe8
2013-01-29 12:40:05.155897 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e083f810] [800000000694ed4c] .vvp_io_write_start+0xfc/0x3e0 [lustre]
2013-01-29 12:40:05.155952 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e083f8e0] [80000000024d9c6c] .cl_io_start+0xcc/0x220 [obdclass]
2013-01-29 12:40:05.156003 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e083f980] [80000000024e1a84] .cl_io_loop+0x194/0x2c0 [obdclass]
2013-01-29 12:40:05.156055 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e083fa30] [80000000068ca1d8] .ll_file_io_generic+0x498/0x670 [lustre]
2013-01-29 12:40:05.156105 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e083fb30] [80000000068ca834] .ll_file_aio_write+0x1d4/0x3a0 [lustre]
2013-01-29 12:40:05.156156 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e083fc00] [80000000068cab50] .ll_file_write+0x150/0x320 [lustre]
2013-01-29 12:40:05.156206 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e083fce0] [c0000000000d1ba8] .vfs_write+0xd0/0x1c4
2013-01-29 12:40:05.156257 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e083fd80] [c0000000000d1d98] .SyS_write+0x54/0x98
2013-01-29 12:40:05.156308 {DefaultControlEventListener} [mmcs]{760}.0.0: [c0000003e083fe30] [c000000000000580] syscall_exit+0x0/0x2c
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The net effect of the thrashing clients is that we tend to get less than 200GB/s aggregate throughput seen by the servers.&lt;/p&gt;

&lt;p&gt;Under 2.3.57-2chaos, we would see 850GB/s at the beginning of ior runs.  However 2.3.57-2chaos had its own problem.  After the bulk of the IO was finished, there was a very long straggler period where a relatively small number of the servers would be handling a only a few MB/s.  Today I tracked down the cause of that, and it seemed to be this same bug.  Under 2.3.57-2chaos, it seems like perhaps it was just less likely, and only a couple of client nodes would begin thrashing.&lt;/p&gt;

&lt;p&gt;I think that back around 2.3.54 we didn&apos;t see this pretty much at all, but I can&apos;t remember exactly which version had neither straggler nor constant thrashing problems.&lt;/p&gt;

&lt;p&gt;But all signs seem to point to this cl_env_put/get lock being a major problem.&lt;/p&gt;</comment>
                            <comment id="51443" author="jay" created="Wed, 30 Jan 2013 01:33:57 +0000"  >&lt;blockquote&gt;
&lt;p&gt;There would be no need to lock the cache during insertion and deletions, but could lead to an uneven distribution of objects on the different CPU caches (i.e. unmatched number of get&apos;s and put&apos;s to any CPU cache). Although, that might not be much of a problem in practice.&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;I think uneven distribution shouldn&apos;t be a problem. I think it&apos;s really worth trying.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;Jinshan, Do we absolutely need to use a env in the ll_releasepage call path? Rather than making the code more complex, it would be great if we could just eliminate the need for the env structure at all in this call path.&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;The main purpose of env is that some local variables are stored there. We used to have problem for stack use.&lt;/p&gt;

&lt;p&gt;To Chris, I guess the current grant algorithm is broken. Once I saw -ENOSPC is returned from writing back of caching pages. Johann should be the right person for what&apos;s updated recently for this piece of code. I will forward this question to him tomorrow.&lt;/p&gt;
</comment>
                            <comment id="51466" author="prakash" created="Wed, 30 Jan 2013 12:06:38 +0000"  >&lt;blockquote&gt;
&lt;p&gt;The main purpose of env is that some local variables are stored there. We used to have problem for stack use.&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;Right. But can we simply remove this dependency on the env in this callpath? For example, pass in NULL and fix up the call path to accept this as a valid value. There is a comment in &lt;tt&gt;ll_releasepage&lt;/tt&gt; which makes it sound like we &lt;b&gt;need&lt;/b&gt; an env for &lt;tt&gt;cl_page_put&lt;/tt&gt;. I&apos;m curious if we can just remove this need, and refrain from taking an env from the cache altogether in &lt;tt&gt;ll_releasepage&lt;/tt&gt;.&lt;/p&gt;</comment>
                            <comment id="51469" author="jay" created="Wed, 30 Jan 2013 13:42:34 +0000"  >&lt;p&gt;there may be lots of related call behind cl_page_put() which needs env. I would suggest to use a percpu preallocated env array for ll_releasepage() purpose. We just need to disable preempt(should be disabled by default for server kernels) to call it.&lt;/p&gt;</comment>
                            <comment id="51471" author="morrone" created="Wed, 30 Jan 2013 14:06:53 +0000"  >&lt;p&gt;I don&apos;t think that you should rely on kernel preemption being disabled for client code.  We&apos;re already patchless on the client side, so that assumption may be false.&lt;/p&gt;</comment>
                            <comment id="51473" author="jay" created="Wed, 30 Jan 2013 14:51:56 +0000"  >&lt;p&gt;no, no. I meant we will disable preemption by our own.&lt;/p&gt;

&lt;p&gt;I&apos;m pretty interested in the backtrace of sleep in the function osc_enter_cache(). Do you have any use case to reproduce it?&lt;/p&gt;</comment>
                            <comment id="51474" author="morrone" created="Wed, 30 Jan 2013 15:03:29 +0000"  >&lt;p&gt;Yes, yesterday ior reproduced it quite well under 2.3.58-7chaos on Sequoia.  Command was:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;ior -F -e -g -t1m -b512m -o /p/lsfull/morrone/f&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;But smaller transfer sizes might be even better:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;ior -F -e -g -t32k -b128m -o /p/lsfull/morrone/f&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;That was 96K MPI tasks of ior that I was using, which means 128 writers writing to 128 files per lustre client.&lt;/p&gt;</comment>
                            <comment id="51477" author="prakash" created="Wed, 30 Jan 2013 15:54:57 +0000"  >&lt;p&gt;I pushed a preliminary patch of the per CPU lists idea: &lt;a href=&quot;http://review.whamcloud.com/5215&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5215&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;It&apos;s still far from being complete, but can be a starting point for discussion on implementation details.&lt;/p&gt;</comment>
                            <comment id="52240" author="prakash" created="Tue, 12 Feb 2013 18:59:16 +0000"  >&lt;p&gt;I have also pushed this patch which completely removes the list, as I now think it is unnecessary: &lt;a href=&quot;http://review.whamcloud.com/5204&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5204&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="54325" author="jay" created="Tue, 19 Mar 2013 00:26:05 +0000"  >&lt;p&gt;Hi Prakash, can you please add one more tag: LCT_ATOMIC for cl_env. This tag will be used in ll_releasepage() so that keys will be allocated with GFP_ATOMIC; then a few number of preallocated cl_env list will be used in case the allocation fails.&lt;/p&gt;

&lt;p&gt;We make this effort so that ll_releasepage() can make progress even under the worst situation.&lt;/p&gt;</comment>
                            <comment id="54390" author="prakash" created="Tue, 19 Mar 2013 16:58:06 +0000"  >&lt;p&gt;The &lt;a href=&quot;http://review.whamcloud.com/5204&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;patch to remove the list&lt;/a&gt; has already landed. Should there be a follow up patch to add the LCT_ATOMIC flag when allocating cl_env&apos;s? Or add that to the &lt;a href=&quot;http://review.whamcloud.com/5446&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;preallocated cl_env per cpu patch&lt;/a&gt;?&lt;/p&gt;</comment>
                            <comment id="54614" author="jay" created="Thu, 21 Mar 2013 21:48:03 +0000"  >&lt;p&gt;Hi Prakash, I&apos;d like to have a LCT_ATOMIC one. Sorry for delay response.&lt;/p&gt;</comment>
                            <comment id="54941" author="pjones" created="Wed, 27 Mar 2013 18:02:08 +0000"  >&lt;p&gt;Prakash&lt;/p&gt;

&lt;p&gt;Does the remaining work left to land on this ticket warrant it remaining as a blocker?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="54944" author="prakash" created="Wed, 27 Mar 2013 18:16:34 +0000"  >&lt;p&gt;I don&apos;t think it&apos;s ready to be closed, but I&apos;m OK dropping the priority. &lt;/p&gt;</comment>
                            <comment id="54947" author="pjones" created="Wed, 27 Mar 2013 18:24:16 +0000"  >&lt;p&gt;ok thanks Prakash!&lt;/p&gt;</comment>
                            <comment id="55025" author="prakash" created="Thu, 28 Mar 2013 17:46:41 +0000"  >&lt;p&gt;Jinshan, I briefly looked into making the memory allocations atomic under ll_releasepage, but it doesn&apos;t look as simple as I hoped. From what I can tell there will potentially be 3 or more memory allocations when creating a new lu_env structure: 1. for the lu_env itself, 2. for the lu_env&apos;s keys, 3. for the lu_global_key 4. any allocations for other keys. So I don&apos;t think just making steps 1 and 2 atomic will provide us any benefit. We also need to ensure steps 3+ are atomically done as well, right? With that said, I&apos;m not sure making these allocations atomic buys us anything. Shouldn&apos;t we just pass in the correct gfp flags to ensure we loop back on ourselves (e.g GFP_NOIO)?&lt;/p&gt;

&lt;p&gt;Also, I made a debug patch to simply drop the lu_env altogether from ll_releasepage, and pushed that through Maloo, &lt;a href=&quot;http://review.whamcloud.com/5847&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5847&lt;/a&gt;. I didn&apos;t get any failures, so that is probably a better approach to spend time on, IMO. Although merging a change like that will involve changes to the debug infrastructure. I opened &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3033&quot; title=&quot;Remove dependency for lu_env structure in lu_cdebug_printer&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3033&quot;&gt;&lt;del&gt;LU-3033&lt;/del&gt;&lt;/a&gt; to facilitate discussion on that topic.&lt;/p&gt;</comment>
                            <comment id="55064" author="jay" created="Fri, 29 Mar 2013 00:21:41 +0000"  >&lt;blockquote&gt;
&lt;p&gt;So I don&apos;t think just making steps 1 and 2 atomic will provide us any benefit. We also need to ensure steps 3+ are atomically done as well, right? With that said, I&apos;m not sure making these allocations atomic buys us anything. Shouldn&apos;t we just pass in the correct gfp flags to ensure we loop back on ourselves (e.g GFP_NOIO)?&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;We should define a new context tag, say LCT_ATOMIC and this tag will be set when calling cl_env_alloc() in ll_releasepage(). Of course, key allocation functions should be changed to use GFP_ATOMIC if LCT_ATOMIC tag is set.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;Also, I made a debug patch to simply drop the lu_env altogether from ll_releasepage, and pushed that through Maloo, &lt;a href=&quot;http://review.whamcloud.com/5847&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5847&lt;/a&gt;. I didn&apos;t get any failures, so that is probably a better approach to spend time on, IMO. Although merging a change like that will involve changes to the debug infrastructure. I opened &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3033&quot; title=&quot;Remove dependency for lu_env structure in lu_cdebug_printer&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3033&quot;&gt;&lt;del&gt;LU-3033&lt;/del&gt;&lt;/a&gt; to facilitate discussion on that topic.&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;The major problem is that cl_page_put() can sleep, the calling path is as follows: cl_page_put -&amp;gt; cl_page_free -&amp;gt; cl_object_put -&amp;gt; lu_object_put -&amp;gt; lu_object_free -&amp;gt; lov_delete_raid0 -&amp;gt; cl_locks_prune. I&apos;m quite sure env will be used in this calling path.&lt;/p&gt;

&lt;p&gt;Update: after taking a further look, I realize ll_releasepage() may not sleep in the above code path. I will write a patch to make sure.&lt;/p&gt;</comment>
                            <comment id="58526" author="jfilizetti" created="Wed, 15 May 2013 01:27:10 +0000"  >&lt;p&gt;While testing the patch for this bug: &lt;a href=&quot;http://review.whamcloud.com/#change,5446&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#change,5446&lt;/a&gt; for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3321&quot; title=&quot;2.x single thread/process throughput degraded from 1.8&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3321&quot;&gt;&lt;del&gt;LU-3321&lt;/del&gt;&lt;/a&gt; I came across an lbug.  I think it happened dropping the caches manually (echo 1 &amp;gt; /proc/sys/vm/drop_caches).&lt;/p&gt;

&lt;p&gt; kernel:LustreError: 28127:0:(cl_object.c:1104:cl_env_percpu_put()) ASSERTION( cle == &amp;amp;cl_env_percpu&lt;span class=&quot;error&quot;&gt;&amp;#91;cpu&amp;#93;&lt;/span&gt; ) failed:&lt;/p&gt;</comment>
                            <comment id="58582" author="prakash" created="Wed, 15 May 2013 16:54:43 +0000"  >&lt;p&gt;Hmm.. I&apos;m assuming you&apos;re running with patch set 5. Is there a back trace associated with the crash that you can post? That assertion makes me think the process was rescheduled to a different CPU in between the get() and put() calls, but preemption should have been disabled, so that shouldn&apos;t have happened..&lt;/p&gt;</comment>
                            <comment id="58584" author="jay" created="Wed, 15 May 2013 17:01:43 +0000"  >&lt;p&gt;Hi Prakash, it turns out that your original patch is correct. I realize that it can cause reschedule by cl_page_put(). Can you please reinstate your patch so that Jeremy can give it a try?&lt;/p&gt;

&lt;p&gt;I will try to fix this problem in CLIO simplification project.&lt;/p&gt;

&lt;p&gt;Thanks in advance, &lt;/p&gt;</comment>
                            <comment id="58593" author="prakash" created="Wed, 15 May 2013 17:44:04 +0000"  >&lt;p&gt;Sure, looking at the history, I think patch set 2 was where I left off. I&apos;ll move that to the front of the queue sometime today, in the mean time, Jeremy should be able to cherry pick from the specific patch set the same way he would have gotten patch set 5. Specifically, I think this command will do it: &lt;tt&gt;git fetch &lt;a href=&quot;http://review.whamcloud.com/p/fs/lustre-release&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/p/fs/lustre-release&lt;/a&gt; refs/changes/46/5446/2 &amp;amp;&amp;amp; git cherry-pick FETCH_HEAD&lt;/tt&gt;&lt;/p&gt;</comment>
                            <comment id="71159" author="jay" created="Fri, 8 Nov 2013 20:19:38 +0000"  >&lt;p&gt;Hi Prakash,&lt;/p&gt;

&lt;p&gt;I reposted the percpu patch at &lt;a href=&quot;http://review.whamcloud.com/8174&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/8174&lt;/a&gt;. I have fixed the reschedule problem so it should be working now.&lt;/p&gt;</comment>
                            <comment id="194367" author="adilger" created="Wed, 3 May 2017 20:04:48 +0000"  >&lt;p&gt;Fixed via &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3321&quot; title=&quot;2.x single thread/process throughput degraded from 1.8&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3321&quot;&gt;&lt;del&gt;LU-3321&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="18906">LU-3321</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10490" key="com.atlassian.jira.plugin.system.customfieldtypes:datepicker">
                        <customfieldname>End date</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>Thu, 26 Jun 2014 17:55:05 +0000</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                            <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvfcn:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>6133</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                        <customfield id="customfield_10493" key="com.atlassian.jira.plugin.system.customfieldtypes:datepicker">
                        <customfieldname>Start date</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>Tue, 15 Jan 2013 17:55:05 +0000</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                    </customfields>
    </item>
</channel>
</rss>