<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:59:48 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-6390] lru_size on the OSC is not honored</title>
                <link>https://jira.whamcloud.com/browse/LU-6390</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Here are all results with 200K files.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# lctl set_param ldlm.namespaces.*.lru_size=1000
ldlm.namespaces.MGC10.0.10.153@o2ib.lru_size=1000
ldlm.namespaces.lustre-MDT0000-mdc-ffff881fccd75800.lru_size=1000
ldlm.namespaces.lustre-OST0000-osc-ffff881fccd75800.lru_size=1000
ldlm.namespaces.lustre-OST0001-osc-ffff881fccd75800.lru_size=1000
ldlm.namespaces.lustre-OST0002-osc-ffff881fccd75800.lru_size=1000
ldlm.namespaces.lustre-OST0003-osc-ffff881fccd75800.lru_size=1000

# ls -lR /lustre
# lctl get_param ldlm.namespaces.*.lock_count
ldlm.namespaces.MGC10.0.10.153@o2ib.lock_count=4
ldlm.namespaces.lustre-MDT0000-mdc-ffff881fccd75800.lock_count=1002
ldlm.namespaces.lustre-OST0000-osc-ffff881fccd75800.lock_count=50003
ldlm.namespaces.lustre-OST0001-osc-ffff881fccd75800.lock_count=50002
ldlm.namespaces.lustre-OST0002-osc-ffff881fccd75800.lock_count=50003
ldlm.namespaces.lustre-OST0003-osc-ffff881fccd75800.lock_count=50004
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="29176">LU-6390</key>
            <summary>lru_size on the OSC is not honored</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="jay">Jinshan Xiong</assignee>
                                    <reporter username="jay">Jinshan Xiong</reporter>
                        <labels>
                            <label>patch</label>
                    </labels>
                <created>Thu, 19 Mar 2015 23:34:14 +0000</created>
                <updated>Thu, 1 Nov 2018 07:28:34 +0000</updated>
                            <resolved>Fri, 19 Jun 2015 12:01:18 +0000</resolved>
                                    <version>Lustre 2.8.0</version>
                                    <fixVersion>Lustre 2.8.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>11</watches>
                                                                            <comments>
                            <comment id="111432" author="gerrit" created="Thu, 2 Apr 2015 19:07:15 +0000"  >&lt;p&gt;Vitaly Fertman (vitaly_fertman@xyratex.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/14342&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/14342&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6390&quot; title=&quot;lru_size on the OSC is not honored&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6390&quot;&gt;&lt;del&gt;LU-6390&lt;/del&gt;&lt;/a&gt; ldlm: restore the ELC for enqueue&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 3f9b5d1aea04d30a76f874edc5048689cd98308a&lt;/p&gt;</comment>
                            <comment id="116722" author="amk" created="Thu, 28 May 2015 17:47:33 +0000"  >&lt;p&gt;Customer site installed  &lt;a href=&quot;http://review.whamcloud.com/14342&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/14342&lt;/a&gt; on their clients. After ~6 hours of running, a data mover client hung. The node is not configured to take memory dumps so we captured stack traces from /proc. The stack traces are reminiscent of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4300&quot; title=&quot;ptlrpcd threads deadlocked in cl_lock_mutex_get&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4300&quot;&gt;&lt;del&gt;LU-4300&lt;/del&gt;&lt;/a&gt;. 11 out of 32 ptlrcpd threads and 47 out of 60 ldlm_bl threads are waiting in cl_lock_mutex_get. And 2 ll_agl threads are stuck in osc_extent_wait:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;10730 ll_agl_21508
11228 ll_agl_21487
[&amp;lt;ffffffffa0f4eb90&amp;gt;] osc_extent_wait+0x420/0x670 [osc]
[&amp;lt;ffffffffa0f4f0af&amp;gt;] osc_cache_wait_range+0x2cf/0x890 [osc]
[&amp;lt;ffffffffa0f50281&amp;gt;] osc_cache_writeback_range+0xc11/0xfb0 [osc]
[&amp;lt;ffffffffa0f3b6f4&amp;gt;] osc_lock_flush+0x84/0x280 [osc]
[&amp;lt;ffffffffa0f3b9d7&amp;gt;] osc_lock_cancel+0xe7/0x1c0 [osc]
[&amp;lt;ffffffffa0b4cbf5&amp;gt;] cl_lock_cancel0+0x75/0x160 [obdclass]
[&amp;lt;ffffffffa0b4d7ab&amp;gt;] cl_lock_cancel+0x13b/0x140 [obdclass]
[&amp;lt;ffffffffa0f3cf1a&amp;gt;] osc_ldlm_blocking_ast+0x13a/0x350 [osc]
[&amp;lt;ffffffffa0cf703c&amp;gt;] ldlm_cancel_callback+0x6c/0x1a0 [ptlrpc]
[&amp;lt;ffffffffa0d06eaa&amp;gt;] ldlm_cli_cancel_local+0x8a/0x470 [ptlrpc]
[&amp;lt;ffffffffa0d0a1ae&amp;gt;] ldlm_cli_cancel_list_local+0xee/0x290 [ptlrpc]
[&amp;lt;ffffffffa0d0b055&amp;gt;] ldlm_cancel_lru_local+0x35/0x40 [ptlrpc]
[&amp;lt;ffffffffa0d0c4cc&amp;gt;] ldlm_prep_elc_req+0x3ec/0x4b0 [ptlrpc]
[&amp;lt;ffffffffa0d0c5b8&amp;gt;] ldlm_prep_enqueue_req+0x28/0x30 [ptlrpc]
[&amp;lt;ffffffffa0f205d9&amp;gt;] osc_enqueue_base+0x109/0x5a0 [osc]
[&amp;lt;ffffffffa0f3c5cd&amp;gt;] osc_lock_enqueue+0x1ed/0x890 [osc]
[&amp;lt;ffffffffa0b50c2c&amp;gt;] cl_enqueue_try+0xfc/0x300 [obdclass]
[&amp;lt;ffffffffa0fce64a&amp;gt;] lov_lock_enqueue+0x21a/0xf10 [lov]
[&amp;lt;ffffffffa0b50c2c&amp;gt;] cl_enqueue_try+0xfc/0x300 [obdclass]
[&amp;lt;ffffffffa0b51b4f&amp;gt;] cl_enqueue_locked+0x6f/0x1f0 [obdclass]
[&amp;lt;ffffffffa0b5279e&amp;gt;] cl_lock_request+0x7e/0x270 [obdclass]
[&amp;lt;ffffffffa109d000&amp;gt;] cl_glimpse_lock+0x180/0x490 [lustre]
[&amp;lt;ffffffffa109d875&amp;gt;] cl_glimpse_size0+0x1a5/0x1d0 [lustre]
[&amp;lt;ffffffffa1095ffb&amp;gt;] ll_agl_trigger+0x1db/0x4b0 [lustre]
[&amp;lt;ffffffffa1096e6e&amp;gt;] ll_agl_thread+0x15e/0x490 [lustre]
[&amp;lt;ffffffff8109abf6&amp;gt;] kthread+0x96/0xa0
[&amp;lt;ffffffff8100c20a&amp;gt;] child_rip+0xa/0x20
[&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff

6143 ptlrpcd_0 + 10 more ptlrpcd threads (out of 32)
[&amp;lt;ffffffffa0b4e6df&amp;gt;] cl_lock_mutex_get+0x6f/0xd0 [obdclass]
[&amp;lt;ffffffffa0fd5b19&amp;gt;] lovsub_parent_lock+0x49/0x120 [lov]
[&amp;lt;ffffffffa0fd6c4f&amp;gt;] lovsub_lock_modify+0x7f/0x1e0 [lov]
[&amp;lt;ffffffffa0b4e108&amp;gt;] cl_lock_modify+0x98/0x310 [obdclass]
[&amp;lt;ffffffffa0f3de32&amp;gt;] osc_lock_granted+0x1e2/0x2b0 [osc]
[&amp;lt;ffffffffa0f3e308&amp;gt;] osc_lock_upcall+0x408/0x600 [osc]
[&amp;lt;ffffffffa0f1e7a6&amp;gt;] osc_enqueue_fini+0x106/0x240 [osc]
[&amp;lt;ffffffffa0f23272&amp;gt;] osc_enqueue_interpret+0xe2/0x1e0 [osc]
[&amp;lt;ffffffffa0d2487c&amp;gt;] ptlrpc_check_set+0x2bc/0x1b50 [ptlrpc]
[&amp;lt;ffffffffa0d500cb&amp;gt;] ptlrpcd_check+0x53b/0x560 [ptlrpc]
[&amp;lt;ffffffffa0d5071b&amp;gt;] ptlrpcd+0x33b/0x3f0 [ptlrpc]
[&amp;lt;ffffffff8109abf6&amp;gt;] kthread+0x96/0xa0
[&amp;lt;ffffffff8100c20a&amp;gt;] child_rip+0xa/0x20
[&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff

6209  ldlm_bl_00 + 46 other ldlm_bl threads (out of 60)
[&amp;lt;ffffffffa0b4e6df&amp;gt;] cl_lock_mutex_get+0x6f/0xd0 [obdclass]
[&amp;lt;ffffffffa0f3ce5a&amp;gt;] osc_ldlm_blocking_ast+0x7a/0x350 [osc]
[&amp;lt;ffffffffa0d0f0c0&amp;gt;] ldlm_handle_bl_callback+0x130/0x400 [ptlrpc]
[&amp;lt;ffffffffa0d0f5f1&amp;gt;] ldlm_bl_thread_main+0x261/0x3c0 [ptlrpc]
[&amp;lt;ffffffff8109abf6&amp;gt;] kthread+0x96/0xa0
[&amp;lt;ffffffff8100c20a&amp;gt;] child_rip+0xa/0x20
[&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;I&apos;ll attach the complete list of stack traces to this ticket.&lt;/p&gt;

&lt;p&gt;Let me know whether you need a dump and I&apos;ll see if we can reproduce the bug on a test system.&lt;/p&gt;</comment>
                            <comment id="116723" author="amk" created="Thu, 28 May 2015 17:51:49 +0000"  >&lt;p&gt;ps output followed by &lt;br/&gt;
for each pid in /proc;  do cat /proc/$pid/stack; done&lt;/p&gt;</comment>
                            <comment id="116724" author="amk" created="Thu, 28 May 2015 17:52:46 +0000"  >&lt;p&gt;Unique stack traces for Lustre processes extracted from bt.all&lt;/p&gt;</comment>
                            <comment id="116725" author="amk" created="Thu, 28 May 2015 17:58:59 +0000"  >&lt;p&gt;dmesg file from the data mover node. Shows partial output from &lt;br/&gt;
echo t &amp;gt; /proc/sysrq-trigger&lt;/p&gt;

&lt;p&gt;The forced stack trace dump was done at least 30 minutes before the /proc/pid/stack output was captured. So comparisons between dmesg and bt.all show that the threads are indeed hung.&lt;/p&gt;</comment>
                            <comment id="116748" author="vitaly_fertman" created="Thu, 28 May 2015 20:36:27 +0000"  >&lt;p&gt;cl_lock_mutex_get does not exist in 2.8, since the CLIO simplification, so what you tested was not Lustre 2.8.&lt;/p&gt;

&lt;p&gt;the patch by itself is supposed to be correct, I think the problem is related to the issue raised in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5781&quot; title=&quot;endless loop in osc_lock_weight()&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5781&quot;&gt;&lt;del&gt;LU-5781&lt;/del&gt;&lt;/a&gt;, cancel_lru_policy is called not atomically with set_cbpending, so some dirty pages could be added in between.&lt;/p&gt;</comment>
                            <comment id="116751" author="amk" created="Thu, 28 May 2015 20:55:47 +0000"  >&lt;p&gt;Sorry, forgot to mention, the client is running 2.5.1 on CentOS 2.6.32-431.20.3.el6.x86_64. (Same system as in LELUS-294).&lt;/p&gt;

&lt;p&gt;So are you saying that the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5781&quot; title=&quot;endless loop in osc_lock_weight()&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5781&quot;&gt;&lt;del&gt;LU-5781&lt;/del&gt;&lt;/a&gt; patch is needed in addition to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6390&quot; title=&quot;lru_size on the OSC is not honored&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6390&quot;&gt;&lt;del&gt;LU-6390&lt;/del&gt;&lt;/a&gt;? And that the 2 together will fix the problems with too many locks in the LRU? Certainly would explain why we didn&apos;t see the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4300&quot; title=&quot;ptlrpcd threads deadlocked in cl_lock_mutex_get&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4300&quot;&gt;&lt;del&gt;LU-4300&lt;/del&gt;&lt;/a&gt;-like hang in our internal testing, since the versions we typically run have both patches.&lt;/p&gt;</comment>
                            <comment id="119056" author="gerrit" created="Fri, 19 Jun 2015 07:02:02 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;http://review.whamcloud.com/14342/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/14342/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6390&quot; title=&quot;lru_size on the OSC is not honored&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6390&quot;&gt;&lt;del&gt;LU-6390&lt;/del&gt;&lt;/a&gt; ldlm: restore the ELC for enqueue&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: ac5abd46e95edd97316ff0e9563288636e7c42bc&lt;/p&gt;</comment>
                            <comment id="119086" author="pjones" created="Fri, 19 Jun 2015 12:01:18 +0000"  >&lt;p&gt;Landed for 2.8&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="29728">LU-6529</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="53598">LU-11518</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="17996" name="bt.all" size="323275" author="amk" created="Thu, 28 May 2015 17:51:49 +0000"/>
                            <attachment id="17997" name="bt.uniq" size="10636" author="amk" created="Thu, 28 May 2015 17:52:46 +0000"/>
                            <attachment id="17998" name="dmesg" size="494816" author="amk" created="Thu, 28 May 2015 17:58:59 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzx8zj:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10021"><![CDATA[2]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>