<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:23:08 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-16002] Ping evictor delayed client eviction for 3 ping interval more than defined</title>
                <link>https://jira.whamcloud.com/browse/LU-16002</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;The ping evictor adds 3 ping intervals to the eviction time (6 * ping interval), PING_EVICT_TIMEOUT. For obd_timeout 300 the resulting eviction time becomes 670 instead of 450. This confuses and delays all conflicting requests on the server side.&lt;/p&gt;</description>
                <environment></environment>
        <key id="71085">LU-16002</key>
            <summary>Ping evictor delayed client eviction for 3 ping interval more than defined</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="4" iconUrl="https://jira.whamcloud.com/images/icons/statuses/reopened.png" description="This issue was once resolved, but the resolution was deemed incorrect. From here issues are either marked assigned or resolved.">Reopened</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="aboyko">Alexander Boyko</assignee>
                                    <reporter username="aboyko">Alexander Boyko</reporter>
                        <labels>
                            <label>patch</label>
                    </labels>
                <created>Sun, 10 Jul 2022 10:47:00 +0000</created>
                <updated>Wed, 13 Sep 2023 06:11:24 +0000</updated>
                                            <version>Lustre 2.16.0</version>
                                    <fixVersion>Lustre 2.16.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="339993" author="gerrit" created="Sun, 10 Jul 2022 11:50:12 +0000"  >&lt;p&gt;&quot;Alexander Boyko &amp;lt;alexander.boyko@hpe.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/47928&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/47928&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16002&quot; title=&quot;Ping evictor delayed client eviction for 3 ping interval more than defined&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16002&quot;&gt;LU-16002&lt;/a&gt; ptlrpc: reduce pinger eviction time&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 747f6d6f7dfad19a9340275e905e79152978cf35&lt;/p&gt;</comment>
                            <comment id="340831" author="gerrit" created="Tue, 19 Jul 2022 12:30:31 +0000"  >&lt;p&gt;&quot;Alexander Boyko &amp;lt;alexander.boyko@hpe.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/47982&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/47982&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16002&quot; title=&quot;Ping evictor delayed client eviction for 3 ping interval more than defined&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16002&quot;&gt;LU-16002&lt;/a&gt; ptlrpc: adds configurable ping interval&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: f53e72d9b8ad713f5bb509d6e3ff3765eef8f587&lt;/p&gt;</comment>
                            <comment id="346998" author="gerrit" created="Sat, 17 Sep 2022 06:23:38 +0000"  >&lt;p&gt;&quot;Oleg Drokin &amp;lt;green@whamcloud.com&amp;gt;&quot; merged in patch &lt;a href=&quot;https://review.whamcloud.com/47982/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/47982/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16002&quot; title=&quot;Ping evictor delayed client eviction for 3 ping interval more than defined&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16002&quot;&gt;LU-16002&lt;/a&gt; ptlrpc: adds configurable ping interval&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 8e66f061c01e53cda84ce80af3860f488e927210&lt;/p&gt;</comment>
                            <comment id="349774" author="gerrit" created="Sat, 15 Oct 2022 05:54:38 +0000"  >&lt;p&gt;&quot;Oleg Drokin &amp;lt;green@whamcloud.com&amp;gt;&quot; merged in patch &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/47928/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/47928/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16002&quot; title=&quot;Ping evictor delayed client eviction for 3 ping interval more than defined&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16002&quot;&gt;LU-16002&lt;/a&gt; ptlrpc: reduce pinger eviction time&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 6bdeda7afe92d61db56367875774fa074aaac0fd&lt;/p&gt;</comment>
                            <comment id="349802" author="pjones" created="Sun, 16 Oct 2022 01:02:18 +0000"  >&lt;p&gt;Landed for 2.16&lt;/p&gt;</comment>
                            <comment id="385544" author="adilger" created="Mon, 11 Sep 2023 23:03:58 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=aboyko&quot; class=&quot;user-hover&quot; rel=&quot;aboyko&quot;&gt;aboyko&lt;/a&gt;, could you please provide some more background on why a tunable &lt;tt&gt;ping_interval&lt;/tt&gt; is needed?  I&apos;m concerned that allowing &lt;tt&gt;ping_interval&lt;/tt&gt; to be tuned separately from &lt;tt&gt;obd_timeout&lt;/tt&gt; can lead to random client eviction when clients are not sending RPCs or &lt;tt&gt;OBD_PING&lt;/tt&gt; in a timely manner.  This might be hard to notice if it works out like e.g. &lt;tt&gt;ping_interval = obd_timeout - 10&lt;/tt&gt; and this is OK while an import is active and sending RPCs or &lt;tt&gt;OBD_PING&lt;/tt&gt;, but fails intermittently if the import becomes idle and a ping is also lost.&lt;/p&gt;

&lt;p&gt;I&apos;d much prefer to have a per-device &lt;tt&gt;obd_timeout&lt;/tt&gt; value as implemented in patch &lt;a href=&quot;https://review.whamcloud.com/50519&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/50519&lt;/a&gt; &quot;&lt;tt&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9912&quot; title=&quot;fix multiple client mounts with different server timeouts&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9912&quot;&gt;LU-9912&lt;/a&gt; ptlrpc: make obd timeout a per-device param&lt;/tt&gt;&quot;, and then the &lt;tt&gt;ping_interval&lt;/tt&gt; for each import is controlled by &lt;tt&gt;obd-&amp;gt;obd_timeout / 4&lt;/tt&gt;.  This would work properly for clients that mount multiple filesystems, unlike having a global &lt;tt&gt;obd_timeout&lt;/tt&gt; (and now global &lt;tt&gt;ping_interval&lt;/tt&gt;).&lt;/p&gt;


&lt;p&gt;However, before we change anything with the global &lt;tt&gt;ping_interval&lt;/tt&gt; that was added in 2.16, I&apos;d like to understand why it was added and what problem it was solving.  I&apos;d prefer to avoid having a tunable &lt;tt&gt;ping_interval&lt;/tt&gt; entirely, just because it can go badly.  If this was needed to solve some specific problem, would a per-device &lt;tt&gt;obd_timeout&lt;/tt&gt; also solve this same issue?  Also, &lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=hornc&quot; class=&quot;user-hover&quot; rel=&quot;hornc&quot;&gt;hornc&lt;/a&gt; landed patch &lt;a href=&quot;https://review.whamcloud.com/49807&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/49807&lt;/a&gt; &quot;&lt;tt&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16483&quot; title=&quot;Loss of idle ping causes reconnect even if subsequent ping succeeds&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16483&quot;&gt;&lt;del&gt;LU-16483&lt;/del&gt;&lt;/a&gt; ptlrpc: Track highest reply XID&lt;/tt&gt;&quot; that also solves a longstanding problem where clients reconnect on a ping failure, even though they have successfully sent other pings in the meantime.&lt;/p&gt;


&lt;p&gt;I&apos;m thinking we should remove the global &lt;tt&gt;ping_interval&lt;/tt&gt; tunable completely (so that pings are always tied to &lt;tt&gt;obd_timeout&lt;/tt&gt;), and use something like:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
#define PING_INTERVAL(obd) (obd_timeout(obd) / 4)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;It would still be possible to keep &lt;tt&gt;evict_multiplier&lt;/tt&gt; if that is important, something like:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
#define PING_INTERVAL(obd) (obd_timeout(obd) * 3 / (evict_multiplier * 2))
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;but before we add complexity I&apos;d like to understand what this was needed for.&lt;/p&gt;</comment>
                            <comment id="385600" author="aboyko" created="Tue, 12 Sep 2023 12:54:03 +0000"  >&lt;p&gt;We had an issue where cascading failures brought timeouts to ~1700s (blocking callback timeout). Something like this: one client node with an LDLM lock crashed, the server waited for it and increased AT. The crash and eviction were not a problem for the whole system, but it highly increased AT and shared lock for a root directory. We detected 3-6 problems during it: bl timeouts, eviction logic, etc.&#160; One way to prevent such a case is to detect a crashed client early and evict it by pinger_evictor; we can reduce ping_interval and the evictor multiplier for this. By default the eviction time is 6 ping intervals, so the server would not evict a client if 5 pings are lost. For a perfect network this is overhead, and could be reduced (eviction multiplier). The same applies to ping_interval. If obd_timeout is 300s (a value used in practice), the ping interval is 75s. To detect a client failure faster, the ping interval should be reduced.&lt;/p&gt;

&lt;p&gt;I&apos;ve made a comment about obd_timeout at LU. From my point of view obd_timeout is primarily a recovery timeout, but recovery and the pinger are not related to each other, other than for historical reasons.&lt;/p&gt;</comment>
                            <comment id="385728" author="adilger" created="Wed, 13 Sep 2023 06:11:24 +0000"  >&lt;p&gt;I agree that it is useful in such cases to be able to tune &lt;tt&gt;ping_interval&lt;/tt&gt; and/or &lt;tt&gt;evict_multiplier&lt;/tt&gt;, but it would make sense to ensure that &lt;tt&gt;ping_interval &amp;lt; obd_timeout / 2&lt;/tt&gt; and &lt;tt&gt;evict_multiplier &amp;gt;= 2&lt;/tt&gt; so that it cannot be set to a value where the client will be evicted easily.  Even so, it still makes sense to use &lt;tt&gt;obd_timeout(exp)&lt;/tt&gt; directly by default, unless &lt;tt&gt;ping_interval&lt;/tt&gt; is explicitly set:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
#define PING_INTERVAL(obd) (ping_interval ?: (obd_timeout(obd) * 3 / (evict_multiplier * 2)))
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Also, for future reference, it is possible to evict specific clients from the MDS with &quot;&lt;tt&gt;lctl set_param mdt.&amp;#42;.evict_client=UUID&lt;/tt&gt;&quot; or &quot;&lt;tt&gt;lctl set_param mdt.&amp;#42;.evict_client=nid:NID&lt;/tt&gt;&quot;.  It will evict the client UUID/NID from all the targets if &quot;&lt;tt&gt;mdt.&amp;#42;.evict_tgt_nids&lt;/tt&gt;&quot; is set.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="47950">LU-9912</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="72954">LU-16271</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i02u7j:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>