<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:48:51 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-5138] hang in osc_lru_reserve despite recoverable state</title>
                <link>https://jira.whamcloud.com/browse/LU-5138</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This process has been waiting in osc_lru_reserve for a very long time:&lt;/p&gt;

&lt;p&gt;PID: 22025  TASK: ffff88017aeba480  CPU: 5   COMMAND: &quot;reads&quot;&lt;br/&gt;
 #0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88017b2ff838&amp;#93;&lt;/span&gt; schedule at ffffffff8145ec7b&lt;br/&gt;
 #1 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88017b2ff980&amp;#93;&lt;/span&gt; osc_lru_reserve at ffffffffa0e16ee5 &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #2 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88017b2ffa00&amp;#93;&lt;/span&gt; osc_page_init at ffffffffa0e1710d &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #3 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88017b2ffa40&amp;#93;&lt;/span&gt; lov_page_init_raid0 at ffffffffa0ea48b0 &lt;span class=&quot;error&quot;&gt;&amp;#91;lov&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #4 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88017b2ffaa0&amp;#93;&lt;/span&gt; cl_page_alloc at ffffffffa0aae632 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #5 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88017b2ffae0&amp;#93;&lt;/span&gt; cl_page_find at ffffffffa0aae91b &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #6 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88017b2ffb30&amp;#93;&lt;/span&gt; ll_write_begin at ffffffffa0f96f8d &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #7 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88017b2ffb90&amp;#93;&lt;/span&gt; generic_perform_write at ffffffff810f8242&lt;br/&gt;
 #8 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88017b2ffc10&amp;#93;&lt;/span&gt; generic_file_buffered_write at ffffffff810f83a1&lt;br/&gt;
 #9 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88017b2ffc60&amp;#93;&lt;/span&gt; __generic_file_aio_write at ffffffff810fb336&lt;br/&gt;
#10 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88017b2ffd10&amp;#93;&lt;/span&gt; generic_file_aio_write at ffffffff810fb57c&lt;br/&gt;
#11 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88017b2ffd50&amp;#93;&lt;/span&gt; vvp_io_write_start at ffffffffa0faae48 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
#12 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88017b2ffda0&amp;#93;&lt;/span&gt; cl_io_start at ffffffffa0ab65f9 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
#13 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88017b2ffdd0&amp;#93;&lt;/span&gt; cl_io_loop at ffffffffa0aba123 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
#14 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88017b2ffe00&amp;#93;&lt;/span&gt; ll_file_io_generic at ffffffffa0f46af1 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
#15 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88017b2ffe70&amp;#93;&lt;/span&gt; ll_file_aio_write at ffffffffa0f47037 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
#16 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88017b2ffec0&amp;#93;&lt;/span&gt; ll_file_write at ffffffffa0f47a00 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
#17 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88017b2fff10&amp;#93;&lt;/span&gt; vfs_write at ffffffff8115aeae&lt;br/&gt;
#18 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88017b2fff40&amp;#93;&lt;/span&gt; sys_write at ffffffff8115b023&lt;br/&gt;
#19 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88017b2fff80&amp;#93;&lt;/span&gt; system_call_fastpath at ffffffff81468d92&lt;br/&gt;
   RIP: 00002aaaaad99630  RSP: 00007fffffffc568  RFLAGS: 00010246&lt;br/&gt;
   RAX: 0000000000000001  RBX: ffffffff81468d92  RCX: 00007fffffffc510&lt;br/&gt;
   RDX: 0000000000010000  RSI: 0000000000603040  RDI: 0000000000000003&lt;br/&gt;
   RBP: 0000000000010000   R8: 0000000000000000   R9: 0101010101010101&lt;br/&gt;
   R10: 00007fffffffc3b0  R11: 0000000000000246  R12: 0000000000010000&lt;br/&gt;
   R13: 0000000000000001  R14: 00000000063b0000  R15: 00000000063c0000&lt;br/&gt;
   ORIG_RAX: 0000000000000001  CS: 0033  SS: 002b&lt;/p&gt;

&lt;p&gt;While testing for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4856&quot; title=&quot;osc_lru_reserve()) ASSERTION( atomic_read(cli-&amp;gt;cl_lru_left) &amp;gt;= 0 ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4856&quot;&gt;&lt;del&gt;LU-4856&lt;/del&gt;&lt;/a&gt;, the bug described in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5123&quot; title=&quot;lprocfs_write_frac_u64_helper does not respect multiplier&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5123&quot;&gt;&lt;del&gt;LU-5123&lt;/del&gt;&lt;/a&gt; caused sanity 101a to run with ccc_lru_max = 32 (pages),  I have not tried, but it should be possible to reproduce this in master by modifying 101a to set max_dirty_mb to 128k.&lt;/p&gt;

&lt;p&gt;This is a pathological condition, but I think it exposed a real bug.  Namely, it appears that the wakeup from the sleep in osc_lru_reserve can be incidental - caused by another process that just happens to do something that triggers an osc_lru_shrink, rather than the deliberate and specific process of the conditions which caused the sleep being addressed when it becomes possible to do so. &lt;/p&gt;

&lt;p&gt;I have a core and debug log from a system in this state, and will attach the debug log, and paste my notes in a comment.&lt;/p&gt;</description>
                <environment>patch for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4856&quot; title=&quot;osc_lru_reserve()) ASSERTION( atomic_read(cli-&amp;gt;cl_lru_left) &amp;gt;= 0 ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4856&quot;&gt;&lt;strike&gt;LU-4856&lt;/strike&gt;&lt;/a&gt; + without a patch for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5123&quot; title=&quot;lprocfs_write_frac_u64_helper does not respect multiplier&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5123&quot;&gt;&lt;strike&gt;LU-5123&lt;/strike&gt;&lt;/a&gt;</environment>
        <key id="24997">LU-5138</key>
            <summary>hang in osc_lru_reserve despite recoverable state</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="2">Won&apos;t Fix</resolution>
                                        <assignee username="pjones">Peter Jones</assignee>
                                    <reporter username="schamp">Stephen Champion</reporter>
                        <labels>
                    </labels>
                <created>Mon, 2 Jun 2014 18:26:02 +0000</created>
                <updated>Mon, 17 Nov 2014 19:03:34 +0000</updated>
                            <resolved>Mon, 17 Nov 2014 19:03:34 +0000</resolved>
                                                                        <due></due>
                            <votes>0</votes>
                                    <watches>5</watches>
                                                                            <comments>
                            <comment id="85496" author="schamp" created="Mon, 2 Jun 2014 18:27:52 +0000"  >&lt;p&gt;The last we hear from the hung process is that it has no free pages.  It goes to sleep immediately afterwards.  At this point in time. there are no free slots.  All lru items are busy:&lt;/p&gt;


&lt;p&gt; 00000008:00000020:5.0:1400133972.928506:0:22025:0:(osc_page.c:833:osc_lru_reclaim()) accfs1-OST0002-osc-ffff88022017e000: cli ffff880431ea5538 no free slots, pages: 0, busy: 30.&lt;br/&gt;
 00000008:00000020:5.0:1400133972.928507:0:22025:0:(osc_page.c:849:osc_lru_reclaim()) accfs1-OST0003-osc-ffff88022017e000: cli ffff8803653cd578 LRU pages: 0, busy: 0.&lt;br/&gt;
 00000008:00000020:5.0:1400133972.928509:0:22025:0:(osc_page.c:849:osc_lru_reclaim()) accfs1-OST0001-osc-ffff88022017e000: cli ffff8804316575f8 LRU pages: 0, busy: 2.&lt;br/&gt;
 00000008:00000020:5.0:1400133972.928510:0:22025:0:(osc_page.c:849:osc_lru_reclaim()) accfs1-OST0000-osc-ffff88022017e000: cli ffff880430d09538 LRU pages: 0, busy: 0.&lt;br/&gt;
 00000020:00001000:5.0:1400133972.928511:0:22025:0:(cl_object.c:966:cl_env_put()) 1@ffff880179bdee58&lt;br/&gt;
 00000008:00000020:5.0:1400133972.928513:0:22025:0:(osc_page.c:867:osc_lru_reclaim()) accfs1-OST0000-osc-ffff88022017e000: cli ffff880430d09538 freed 0 pages.&lt;/p&gt;


&lt;p&gt;Then a sixteen page extent is completed.&lt;/p&gt;

&lt;p&gt; 00000008:00000020:12.0:1400134007.925975:0:19741:0:(osc_cache.c:810:osc_extent_finish()) extent ffff8803803091e0@&lt;/p&gt;
{[0 -&amp;gt; 15/255], [2|0|-|rpc|wiuY|ffff880352ecfc48], [65536|16|+|-|ffff8803823a0660|256|ffff880166486640]}
&lt;p&gt; extent finished.&lt;/p&gt;

&lt;p&gt;And the lru_queue work is requested for ptlrpc_0&lt;/p&gt;

&lt;p&gt; 00000100:00000040:12.0:1400134007.925983:0:19741:0:(ptlrpcd.c:271:ptlrpcd_add_req()) @@@ add req &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880347956000&amp;#93;&lt;/span&gt; to pc &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpcd_0:0&amp;#93;&lt;/span&gt;  req@ffff880347956000 x1468144665523584/t0(0) o-1-&amp;gt;accfs1-OST0002-osc-ffff88022017e000@192.168.21.8@o2ib1:0/0 lens 0/0 e 0 to 0 dl 1400134027 ref 2 fl Interpret:N/ffffffff/ffffffff rc 0/-1&lt;/p&gt;

&lt;p&gt;The req is processed&lt;/p&gt;

&lt;p&gt; 00000008:00000020:8.0:1400134007.926058:0:19727:0:(osc_page.c:568:lru_queue_work()) Run LRU work for client obd ffff880431ea5538.&lt;/p&gt;

&lt;p&gt;It goes to shrink&lt;/p&gt;

&lt;p&gt; 00000008:00000001:8.0:1400134007.926061:0:19727:0:(osc_page.c:687:osc_lru_shrink()) Process entered&lt;/p&gt;

&lt;p&gt;It leaves shrink with nothing&lt;/p&gt;

&lt;p&gt; 00000008:00000001:8.0:1400134007.926066:0:19727:0:(osc_page.c:801:osc_lru_shrink()) Process leaving (rc=0 : 0 : 0)&lt;/p&gt;

&lt;p&gt;And finishes the req&lt;/p&gt;

&lt;p&gt; 00000008:00000001:8.0:1400134007.926068:0:19727:0:(osc_page.c:573:lru_queue_work()) Process leaving (rc=0 : 0 : 0)&lt;br/&gt;
 00000100:00000001:8.0:1400134007.926069:0:19727:0:(client.c:1888:ptlrpc_check_set()) Process leaving (rc=1 : 1 : 1)&lt;/p&gt;

&lt;p&gt;It should have had 16 pages to shrink, if they are not referenced.&lt;/p&gt;


&lt;p&gt;Currently, all of the pages have a ref count of 1, and would be shrunk.  This may not have been the case the last time we called the shrink, though...&lt;/p&gt;

&lt;p&gt;The last time we called shrink was &lt;br/&gt;
 00000008:00000001:8.0:1400134007.926066:0:19727:0:(osc_page.c:801:osc_lru_shrink()) Process leaving (rc=0 : 0 : 0)&lt;/p&gt;

&lt;p&gt;Since then, we&apos;ve put 32 pages.&lt;/p&gt;

&lt;p&gt; 00000020:00000001:12.0:1400134007.926074:0:19741:0:(cl_page.c:385:cl_page_put()) page@ffff8803772e6800&lt;span class=&quot;error&quot;&gt;&amp;#91;2 ffff880381c2c0f0 0 0 1           (null)           (null) 0x0&amp;#93;&lt;/span&gt;&lt;br/&gt;
 00000020:00000001:12.0:1400134007.926075:0:19741:0:(cl_page.c:385:cl_page_put()) 2&lt;br/&gt;
 00000020:00000001:12.0:1400134007.926103:0:19741:0:(cl_page.c:385:cl_page_put()) page@ffff8803772e6400&lt;span class=&quot;error&quot;&gt;&amp;#91;2 ffff880381c2c0f0 0 0 1           (null)           (null) 0x0&amp;#93;&lt;/span&gt;&lt;br/&gt;
 00000020:00000001:12.0:1400134007.926104:0:19741:0:(cl_page.c:385:cl_page_put()) 2&lt;br/&gt;
 00000020:00000001:12.0:1400134007.926130:0:19741:0:(cl_page.c:385:cl_page_put()) page@ffff88036f5c8000&lt;span class=&quot;error&quot;&gt;&amp;#91;2 ffff880381c2c0f0 0 0 1           (null)           (null) 0x0&amp;#93;&lt;/span&gt;&lt;br/&gt;
 00000020:00000001:12.0:1400134007.926131:0:19741:0:(cl_page.c:385:cl_page_put()) 2&lt;br/&gt;
 00000020:00000001:12.0:1400134007.926157:0:19741:0:(cl_page.c:385:cl_page_put()) page@ffff88036f5c8800&lt;span class=&quot;error&quot;&gt;&amp;#91;2 ffff880381c2c0f0 0 0 1           (null)           (null) 0x0&amp;#93;&lt;/span&gt;&lt;br/&gt;
 00000020:00000001:12.0:1400134007.926158:0:19741:0:(cl_page.c:385:cl_page_put()) 2&lt;br/&gt;
 00000020:00000001:12.0:1400134007.926185:0:19741:0:(cl_page.c:385:cl_page_put()) page@ffff88036f5c8400&lt;span class=&quot;error&quot;&gt;&amp;#91;2 ffff880381c2c0f0 0 0 1           (null)           (null) 0x0&amp;#93;&lt;/span&gt;&lt;br/&gt;
 00000020:00000001:12.0:1400134007.926185:0:19741:0:(cl_page.c:385:cl_page_put()) 2&lt;br/&gt;
 00000020:00000001:12.0:1400134007.926212:0:19741:0:(cl_page.c:385:cl_page_put()) page@ffff880377565c00&lt;span class=&quot;error&quot;&gt;&amp;#91;2 ffff880381c2c0f0 0 0 1           (null)           (null) 0x0&amp;#93;&lt;/span&gt;&lt;br/&gt;
 00000020:00000001:12.0:1400134007.926213:0:19741:0:(cl_page.c:385:cl_page_put()) 2&lt;br/&gt;
 00000020:00000001:12.0:1400134007.926239:0:19741:0:(cl_page.c:385:cl_page_put()) page@ffff8803820d9800&lt;span class=&quot;error&quot;&gt;&amp;#91;2 ffff880381c2c0f0 0 0 1           (null)           (null) 0x0&amp;#93;&lt;/span&gt;&lt;br/&gt;
 00000020:00000001:12.0:1400134007.926239:0:19741:0:(cl_page.c:385:cl_page_put()) 2&lt;br/&gt;
 00000020:00000001:12.0:1400134007.926266:0:19741:0:(cl_page.c:385:cl_page_put()) page@ffff88037856b000&lt;span class=&quot;error&quot;&gt;&amp;#91;2 ffff880381c2c0f0 0 0 1           (null)           (null) 0x0&amp;#93;&lt;/span&gt;&lt;br/&gt;
 00000020:00000001:12.0:1400134007.926267:0:19741:0:(cl_page.c:385:cl_page_put()) 2&lt;br/&gt;
 00000020:00000001:12.0:1400134007.926293:0:19741:0:(cl_page.c:385:cl_page_put()) page@ffff880380d12400&lt;span class=&quot;error&quot;&gt;&amp;#91;2 ffff880381c2c0f0 0 0 1           (null)           (null) 0x0&amp;#93;&lt;/span&gt;&lt;br/&gt;
 00000020:00000001:12.0:1400134007.926294:0:19741:0:(cl_page.c:385:cl_page_put()) 2&lt;br/&gt;
 00000020:00000001:12.0:1400134007.926320:0:19741:0:(cl_page.c:385:cl_page_put()) page@ffff8803811ef400&lt;span class=&quot;error&quot;&gt;&amp;#91;2 ffff880381c2c0f0 0 0 1           (null)           (null) 0x0&amp;#93;&lt;/span&gt;&lt;br/&gt;
 00000020:00000001:12.0:1400134007.926321:0:19741:0:(cl_page.c:385:cl_page_put()) 2&lt;br/&gt;
 00000020:00000001:12.0:1400134007.926347:0:19741:0:(cl_page.c:385:cl_page_put()) page@ffff8803811ef000&lt;span class=&quot;error&quot;&gt;&amp;#91;2 ffff880381c2c0f0 0 0 1           (null)           (null) 0x0&amp;#93;&lt;/span&gt;&lt;br/&gt;
 00000020:00000001:12.0:1400134007.926348:0:19741:0:(cl_page.c:385:cl_page_put()) 2&lt;br/&gt;
 00000020:00000001:12.0:1400134007.926374:0:19741:0:(cl_page.c:385:cl_page_put()) page@ffff88037f49f800&lt;span class=&quot;error&quot;&gt;&amp;#91;2 ffff880381c2c0f0 0 0 1           (null)           (null) 0x0&amp;#93;&lt;/span&gt;&lt;br/&gt;
 00000020:00000001:12.0:1400134007.926375:0:19741:0:(cl_page.c:385:cl_page_put()) 2&lt;br/&gt;
 00000020:00000001:12.0:1400134007.926401:0:19741:0:(cl_page.c:385:cl_page_put()) page@ffff880362cf4000&lt;span class=&quot;error&quot;&gt;&amp;#91;2 ffff880381c2c0f0 0 0 1           (null)           (null) 0x0&amp;#93;&lt;/span&gt;&lt;br/&gt;
 00000020:00000001:12.0:1400134007.926402:0:19741:0:(cl_page.c:385:cl_page_put()) 2&lt;br/&gt;
 00000020:00000001:12.0:1400134007.926428:0:19741:0:(cl_page.c:385:cl_page_put()) page@ffff880376c65c00&lt;span class=&quot;error&quot;&gt;&amp;#91;2 ffff880381c2c0f0 0 0 1           (null)           (null) 0x0&amp;#93;&lt;/span&gt;&lt;br/&gt;
 00000020:00000001:12.0:1400134007.926429:0:19741:0:(cl_page.c:385:cl_page_put()) 2&lt;br/&gt;
 00000020:00000001:12.0:1400134007.926455:0:19741:0:(cl_page.c:385:cl_page_put()) page@ffff880386096000&lt;span class=&quot;error&quot;&gt;&amp;#91;2 ffff880381c2c0f0 0 0 1           (null)           (null) 0x0&amp;#93;&lt;/span&gt;&lt;br/&gt;
 00000020:00000001:12.0:1400134007.926456:0:19741:0:(cl_page.c:385:cl_page_put()) 2&lt;br/&gt;
 00000020:00000001:12.0:1400134007.926482:0:19741:0:(cl_page.c:385:cl_page_put()) page@ffff880431d7e800&lt;span class=&quot;error&quot;&gt;&amp;#91;2 ffff880381c2c0f0 0 0 1           (null)           (null) 0x0&amp;#93;&lt;/span&gt;&lt;br/&gt;
 00000020:00000001:12.0:1400134007.926483:0:19741:0:(cl_page.c:385:cl_page_put()) 2&lt;/p&gt;

&lt;p&gt;And all of them had a refcnt of 2 at the time, so they were unshrinkable at the last shrink, are now shrinkable, but nobody has called shrink since the pages were put...&lt;/p&gt;

&lt;p&gt;So how did put get called?&lt;/p&gt;

&lt;p&gt;Here is the last call exiting, followed by its call chain:&lt;/p&gt;

&lt;p&gt; 00000020:00000001:12.0:1400134007.926483:0:19741:0:(cl_page.c:400:cl_page_put()) Process leaving&lt;br/&gt;
 00000020:00000001:12.0:1400134007.926484:0:19741:0:(cl_page.c:985:cl_page_completion()) Process leaving&lt;br/&gt;
 00000008:00000001:12.0:1400134007.926484:0:19741:0:(osc_cache.c:1316:osc_completion()) Process leaving (rc=0 : 0 : 0)&lt;br/&gt;
 00000008:00000001:12.0:1400134007.926486:0:19741:0:(osc_cache.c:1896:osc_ap_completion()) Process leaving&lt;br/&gt;
 [ ... ]&lt;br/&gt;
 00000008:00000001:12.0:1400134007.926499:0:19741:0:(osc_cache.c:851:osc_extent_finish()) Process leaving (rc=0 : 0 : 0)&lt;/p&gt;

&lt;p&gt;So... osc_extent_finish() called osc_lru_add_batch, responsible for moving pages from the busy list to the regular list - then scheduling a shrink - right off the bat.  Since then, the pages have been put, and could be shrunk, but nobody has requested a shrink.&lt;/p&gt;

&lt;p&gt;That seems wrong to me.&lt;/p&gt;</comment>
                            <comment id="85497" author="schamp" created="Mon, 2 Jun 2014 18:31:54 +0000"  >&lt;p&gt;Extra notes, with structure dumps, disassembly, etc.&lt;/p&gt;</comment>
                            <comment id="85501" author="jay" created="Mon, 2 Jun 2014 19:10:16 +0000"  >&lt;p&gt;Indeed, we should schedule a shrink outside of osc_lru_add_batch(), it should be in osc_extent_finish() after osc_ap_completion() is called.&lt;/p&gt;

&lt;p&gt;Can you please create a patch for this, Stephen?&lt;/p&gt;</comment>
                            <comment id="85524" author="schamp" created="Mon, 2 Jun 2014 22:33:19 +0000"  >&lt;p&gt;The other part is adding a test, which should just be a modified sanity 101a.  I may be able to cook something up this week.&lt;/p&gt;

&lt;p&gt;Do you have an opinion on whether the shrink should be inline or a via ptlrpcd request?  I&apos;m thinking this can be inline:&lt;/p&gt;

&lt;p&gt;if osc_cache_too_much&lt;br/&gt;
  osc_lru_shrink&lt;/p&gt;

&lt;p&gt;but there may be advantages to queuing a request instead:&lt;/p&gt;

&lt;p&gt;if osc_cache_too_much&lt;br/&gt;
  ptlrpcd_queue_work cl_lru_work&lt;/p&gt;</comment>
                            <comment id="86635" author="schamp" created="Sat, 14 Jun 2014 01:55:31 +0000"  >&lt;p&gt;Just an update:&lt;/p&gt;

&lt;p&gt;I wrote a test for this, which worked.  It also failed (in a different way) with the patch I wrote.&lt;br/&gt;
The patch I wrote also failed on another test...  So there&apos;s a good bit of work to do understanding these cases.&lt;/p&gt;

&lt;p&gt;I intend to get back to this, but have some pressing customer issues to deal with.&lt;/p&gt;</comment>
                            <comment id="89432" author="jfc" created="Fri, 18 Jul 2014 00:00:48 +0000"  >&lt;p&gt;Hello Stephen,&lt;br/&gt;
Is this still a live issue for you?&lt;br/&gt;
If so, can you please give us an update on where you are with it.&lt;br/&gt;
Many thanks,&lt;br/&gt;
~ jfc.&lt;/p&gt;</comment>
                            <comment id="89435" author="schamp" created="Fri, 18 Jul 2014 00:13:30 +0000"  >&lt;p&gt;This is not really a problem for us - it&apos;s just a bug I stumbled across.  I wrote a test that triggers another bug that I have not fully investigated, and a fix that crashes in a later acceptance test.&lt;/p&gt;

&lt;p&gt;Our customers suffer from too much memory, not too little, so I am unlikely to be able to spend additional time on this.  I&apos;d be happy to pass my incomplete work along for someone to pick up.&lt;/p&gt;</comment>
                            <comment id="90100" author="jfc" created="Fri, 25 Jul 2014 23:39:54 +0000"  >&lt;p&gt;Thank you Stephen.&lt;br/&gt;
We&apos;ll leave the ticket open for the time being and please let us know if it rises in priority in the future.&lt;br/&gt;
~ jfc.&lt;/p&gt;</comment>
                            <comment id="99378" author="jfc" created="Mon, 17 Nov 2014 19:03:34 +0000"  >&lt;p&gt;We can reopen this ticket if this issue becomes a priority.&lt;/p&gt;

&lt;p&gt;~ jfc.&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="15108" name="debug.txt" size="205" author="schamp" created="Mon, 2 Jun 2014 18:26:02 +0000"/>
                            <attachment id="15109" name="notes.txt" size="70126" author="schamp" created="Mon, 2 Jun 2014 18:31:54 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzwnlj:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>14168</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>