<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:38:32 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-10826] Regression in LU-9372 on OPA enviroment and no recovery triggered</title>
                <link>https://jira.whamcloud.com/browse/LU-10826</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Looks like &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9372&quot; title=&quot;OOM happens on OSS during Lustre recovery for more than 5000 clients&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9372&quot;&gt;&lt;del&gt;LU-9372&lt;/del&gt;&lt;/a&gt; has a regression on OPA environment and somehow when test_req_buffer_pressure is enabled (test_req_buffer_pressure=1) on OSS or MDS, the client never reconnects and Lustre recovery is not triggered. &lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@es14k-vm1 ~]# umount -t lustre -a
[root@es14k-vm1 ~]# lustre_rmmod 
[root@es14k-vm1 ~]# vi /etc/modprobe.d/lustre.conf

options ptlrpc test_req_buffer_pressure=1

[root@es14k-vm1 ~]# mount -t lustre /dev/ddn/scratch0_ost0000 /lustre/scratch0/ost0000

[root@es14k-vm1 ~]# cat /proc/fs/lustre/obdfilter/scratch0-OST0000/recovery_status 
status: WAITING_FOR_CLIENTS
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The client never reconnected and no Lustre recovery was triggered&lt;/p&gt;</description>
                <environment>master, centos7.4 &lt;br/&gt;
1 x MDS/MDT, 1 x OSS/OST and 1 client with OPA-10.6</environment>
        <key id="51413">LU-10826</key>
            <summary>Regression in LU-9372 on OPA enviroment and no recovery triggered</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="bfaccini">Bruno Faccini</assignee>
                                    <reporter username="ihara">Shuichi Ihara</reporter>
                        <labels>
                    </labels>
                <created>Mon, 19 Mar 2018 14:45:33 +0000</created>
                <updated>Tue, 24 Jul 2018 17:50:11 +0000</updated>
                            <resolved>Mon, 9 Apr 2018 21:00:43 +0000</resolved>
                                                    <fixVersion>Lustre 2.12.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="223948" author="bfaccini" created="Mon, 19 Mar 2018 15:45:40 +0000"  >&lt;p&gt;Hmm, I may have been too restrictive in my &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9372&quot; title=&quot;OOM happens on OSS during Lustre recovery for more than 5000 clients&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9372&quot;&gt;&lt;del&gt;LU-9372&lt;/del&gt;&lt;/a&gt; patches when&#160;test_req_buffer_pressure is being configured.&lt;/p&gt;

&lt;p&gt;I will double-check on a test platform.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="224014" author="gerrit" created="Tue, 20 Mar 2018 10:11:51 +0000"  >&lt;p&gt;Faccini Bruno (bruno.faccini@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/31690&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/31690&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10826&quot; title=&quot;Regression in LU-9372 on OPA enviroment and no recovery triggered&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10826&quot;&gt;&lt;del&gt;LU-10826&lt;/del&gt;&lt;/a&gt; ptlrpc: fix test_req_buffer_pressure behavior&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 8420c6b4f04ee29c4374f9fdbe216c3f87e6fc26&lt;/p&gt;</comment>
                            <comment id="224080" author="ihara" created="Wed, 21 Mar 2018 03:57:34 +0000"  >&lt;p&gt;Thanks Bruno, patch &lt;a href=&quot;https://review.whamcloud.com/31690&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/31690&lt;/a&gt; works and now triggers Lustre recovery.&lt;br/&gt;
However, the main concern is tuning of resource allocation for all client requests with test_req_buffer_pressure=1.&lt;br/&gt;
In fact, as &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9372&quot; title=&quot;OOM happens on OSS during Lustre recovery for more than 5000 clients&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9372&quot;&gt;&lt;del&gt;LU-9372&lt;/del&gt;&lt;/a&gt; described, if a single OSS has many OSTs under a limited amount of memory resources, Lustre recovery triggers OOM when Lustre recovery is started in parallel for many OSTs.  We want to prevent OOM in that situation. However, currently, once we set test_req_buffer_pressure=1, it&apos;s controlled by memory size or we use req_buffers_max for manual setting. It is still not an automated setting.&lt;/p&gt;</comment>
                            <comment id="224089" author="bfaccini" created="Wed, 21 Mar 2018 08:33:47 +0000"  >&lt;p&gt;Shuichi, I am already looking for a way to auto-tune&#160;req_buffers_max, possibly based on the current number of active OSTs/targets, number of connected/to-be-recovered Clients, memory available, ...&lt;/p&gt;

&lt;p&gt;I also want to check some other options which may help to limit each rqbd-buffer memory footprint (presently allocated in a 32K slab when its size is only 17K, since the patch for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4755&quot; title=&quot;ASSERTION( req-&amp;gt;rq_reqbuf_len &amp;gt;= msgsize ) failed when using 4MB RPC&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4755&quot;&gt;&lt;del&gt;LU-4755&lt;/del&gt;&lt;/a&gt; &quot;ptlrpc: enlarge OST_MAXREQSIZE for 4MB RPC&quot;), like using a dedicated kmem_cache (if no underlying interference/drift from the Slab/Slub layer), or trying to reduce OST_MAXREQSIZE in order to have a&#160;full buffer size down to 16K (if it keeps the &quot;4MB RPC&quot; capability) and thus have it allocated in a 16K Slab.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="225508" author="gerrit" created="Mon, 9 Apr 2018 19:51:11 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/31690/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/31690/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10826&quot; title=&quot;Regression in LU-9372 on OPA enviroment and no recovery triggered&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10826&quot;&gt;&lt;del&gt;LU-10826&lt;/del&gt;&lt;/a&gt; ptlrpc: fix test_req_buffer_pressure behavior&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 040eca67f8d5422b0099d1b70594b5eb40a0f9ef&lt;/p&gt;</comment>
                            <comment id="225552" author="pjones" created="Mon, 9 Apr 2018 21:00:43 +0000"  >&lt;p&gt;&lt;font color=&quot;#000000&quot;&gt;Landed for 2.12&lt;/font&gt;&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="45601">LU-9372</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="52099">LU-10993</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzzujb:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>