<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:57:40 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-13020] ko2iblnd tuning</title>
                <link>https://jira.whamcloud.com/browse/LU-13020</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;We have been setting ko2iblnd timeout = 150 (default of 50) for our cluster. From reading the code this is no longer being used and instead lnet_lnd_timeout is used.&lt;/p&gt;

&lt;p&gt;For example in kiblnd_queue_tx_locked&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
    timeout_ns = lnet_get_lnd_timeout() * NSEC_PER_SEC;
    tx-&amp;gt;tx_queued = 1;
    tx-&amp;gt;tx_deadline = ktime_add_ns(ktime_get(), timeout_ns);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;and &lt;br/&gt;
lnet_get_lnd_timeout() returns the new default of 5. Does this mean we went from 150 to 5! &lt;/p&gt;

&lt;p&gt;In the documentation it says that lnet_lnd_timeout is derived from lnet_transaction_timeout and retry_count. But that is not getting set for tx-&amp;gt;tx_deadline.&lt;/p&gt;

&lt;p&gt;Am I reading the code correctly?&lt;/p&gt;</description>
                <environment></environment>
        <key id="57477">LU-13020</key>
            <summary>ko2iblnd tuning</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="5">Cannot Reproduce</resolution>
                                        <assignee username="ashehata">Amir Shehata</assignee>
                                    <reporter username="mhanafi">Mahmoud Hanafi</reporter>
                        <labels>
                    </labels>
                <created>Tue, 26 Nov 2019 20:34:29 +0000</created>
                <updated>Wed, 6 Jan 2021 13:34:54 +0000</updated>
                            <resolved>Wed, 6 Jan 2021 13:34:54 +0000</resolved>
                                    <version>Lustre 2.12.2</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>8</watches>
                                                                            <comments>
                            <comment id="258862" author="ashehata" created="Tue, 26 Nov 2019 22:01:38 +0000"  >&lt;p&gt;If you look at retry_count_set() and transaction_to_set() you&apos;ll see that it&apos;s setting the lnet_lnd_timeout based on the following calculation&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
retry_count_set()
 482 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (value == 0)
 483 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;lnet_lnd_timeout = lnet_transaction_timeout;
 484 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&lt;span class=&quot;code-keyword&quot;&gt;else&lt;/span&gt;
 485 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;lnet_lnd_timeout = lnet_transaction_timeout / value;

transaction_to_set()
 436 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;*transaction_to = value;
 437 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (lnet_retry_count == 0)
 438 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;lnet_lnd_timeout = value;
 439 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&lt;span class=&quot;code-keyword&quot;&gt;else&lt;/span&gt;
 440 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;lnet_lnd_timeout = value / lnet_retry_count;&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;I checked master and if you specify&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 options lnet lnet_transaction_timeout=150&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;transaction_to_set() gets called:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 (api-ni.c:442:transaction_to_set()) lnet_lnd_timeout = 50

# it&apos;s 50 because retry_count=3
# however &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; you &lt;span class=&quot;code-keyword&quot;&gt;do&lt;/span&gt; &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt;:
options lnet lnet_retry_count=0
options lnet lnet_transaction_timeout=150

# you should see:
(api-ni.c:442:transaction_to_set()) lnet_lnd_timeout = 50
(api-ni.c:489:retry_count_set()) lnet_lnd_timeout = 150

# note the above is debug code I added.&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Is this not working for you?&lt;/p&gt;</comment>
                            <comment id="258864" author="mhanafi" created="Tue, 26 Nov 2019 22:34:10 +0000"  >&lt;p&gt;So &lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt; 
#define LNET_LND_DEFAULT_TIMEOUT 5
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;is not used? Because when lnet module loads it sets it to 50?&lt;/p&gt;


&lt;p&gt;Our cluster is running 2.12.2 everywhere and we still get client evictions. And enabling &apos;debug=+net&apos; makes the issue go away (ref: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11644&quot; title=&quot;LNet: Service thread inactive for 300  causes client evictions &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11644&quot;&gt;LU-11644&lt;/a&gt;)&lt;/p&gt;

&lt;p&gt;The other issue, on our remote HDR cluster we are getting RDMA timeouts to the lustre routers. Running lnet_selftest within compute nodes we get timeouts. This may be a fabric issue. But something is not quite right.... &lt;br/&gt;
I&apos;ll run some tests changing lnet_transaction_timeout to see if I can see some difference.&lt;/p&gt;</comment>
                            <comment id="258931" author="mhanafi" created="Wed, 27 Nov 2019 19:55:48 +0000"  >&lt;p&gt;The ko2iblnd timeout should be removed and Docs should be updated. This would eliminate some of the confusion.&lt;/p&gt;</comment>
                            <comment id="258932" author="ashehata" created="Wed, 27 Nov 2019 20:38:18 +0000"  >&lt;p&gt;did increasing the timeout resolve the client evictions issue (ref: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11644&quot; title=&quot;LNet: Service thread inactive for 300  causes client evictions &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11644&quot;&gt;LU-11644&lt;/a&gt;)?&lt;/p&gt;</comment>
                            <comment id="259040" author="mhanafi" created="Mon, 2 Dec 2019 18:53:28 +0000"  >&lt;p&gt;I ended up setting &lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
    transaction_timeout: 200
   at_min=275
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Things look ok for now. I&apos;ll know more in a few days. &lt;br/&gt;
I tried setting retry_count=2 with transaction_timeout=200; it causes a lot of timeouts, worse than setting transaction_timeout=100 and retry_count=0.&lt;/p&gt;</comment>
                            <comment id="259047" author="ashehata" created="Mon, 2 Dec 2019 20:59:06 +0000"  >&lt;p&gt;Would it be possible to get some net logging around the timeouts that occur with:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
lnet_transaction_timeout=200
lnet_retry_count=2 &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;It would be very useful for me to understand the cause of these timeouts, so I can fine tune the feature if required.&lt;/p&gt;

&lt;p&gt;The idea with this config is that the LND timeout will be set to 100. We&apos;ll attempt 2 retries within that 200s window.&lt;/p&gt;

&lt;p&gt;I&apos;d like to see if we do attempt the retry and for what reason and the implications of the retries.&lt;/p&gt;</comment>
                            <comment id="259253" author="gerrit" created="Thu, 5 Dec 2019 23:36:17 +0000"  >&lt;p&gt;Amir Shehata (ashehata@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/36944&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/36944&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13020&quot; title=&quot;ko2iblnd tuning&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13020&quot;&gt;&lt;del&gt;LU-13020&lt;/del&gt;&lt;/a&gt; o2iblnd: timeout is now obsolete&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: d6cbd02ff5d17b0cf53683e1c8db8364850ac4cc&lt;/p&gt;</comment>
                            <comment id="259535" author="mhanafi" created="Tue, 10 Dec 2019 18:07:05 +0000"  >&lt;p&gt;I will try to reproduce the issue with retry_count=2 on our test filesystem and gather logs.&lt;/p&gt;</comment>
                            <comment id="288756" author="mhanafi" created="Wed, 6 Jan 2021 01:42:29 +0000"  >&lt;p&gt;We can close this&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="57816">LU-13145</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00q13:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10021"><![CDATA[2]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>