<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:48:58 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-5151] Oops in lnet_return_rx_credits_locked</title>
                <link>https://jira.whamcloud.com/browse/LU-5151</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;While testing 2.6 in my Cray test environment I keep losing my routers; sending them an NMI produces the following backtraces:&lt;/p&gt;

&lt;p&gt;2014-06-05T16:45:09.828951-04:00 c0-0c0s2n3 Pid: 4554, comm: kiblnd_sd_01_01 Tainted: P           N  3.0.82-0.7.9_1.0502.7780-cray_gem_s #1  &lt;br/&gt;
2014-06-05T16:45:09.828965-04:00 c0-0c0s2n3 RIP: 0010:&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0341831&amp;gt;&amp;#93;&lt;/span&gt;  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0341831&amp;gt;&amp;#93;&lt;/span&gt; lnet_return_rx_credits_locked+0x171/0x310 &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;br/&gt;
2014-06-05T16:45:09.828971-04:00 c0-0c0s2n3 RSP: 0018:ffff8803ea379bb0  EFLAGS: 00010286&lt;br/&gt;
2014-06-05T16:45:09.858936-04:00 c0-0c0s2n3 RAX: dead000000200200 RBX: ffff880317d5a800 RCX: 00000000ffffffff&lt;br/&gt;
2014-06-05T16:45:09.858949-04:00 c0-0c0s2n3 RDX: dead000000100100 RSI: 0000000000000001 RDI: ffff880317d5a800&lt;br/&gt;
2014-06-05T16:45:09.858960-04:00 c0-0c0s2n3 RBP: ffff8803ea379be0 R08: ffff8803e821c860 R09: ffff880317d5a850&lt;br/&gt;
2014-06-05T16:45:09.858970-04:00 c0-0c0s2n3 R10: 0000000000000000 R11: 0000000000000000 R12: ffff880317d5a800&lt;br/&gt;
2014-06-05T16:45:09.858977-04:00 c0-0c0s2n3 R13: ffff8803daf91880 R14: 00000000fffffff5 R15: 0000000000000001&lt;br/&gt;
2014-06-05T16:45:09.888794-04:00 c0-0c0s2n3 FS:  00007f28c44457a0(0000) GS:ffff880407cc0000(0000) knlGS:0000000000000000&lt;br/&gt;
2014-06-05T16:45:09.888807-04:00 c0-0c0s2n3 CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b&lt;br/&gt;
2014-06-05T16:45:09.888818-04:00 c0-0c0s2n3 CR2: 000000000063c800 CR3: 000000031f33f000 CR4: 00000000000007e0&lt;br/&gt;
2014-06-05T16:45:09.888824-04:00 c0-0c0s2n3 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000&lt;br/&gt;
2014-06-05T16:45:09.888834-04:00 c0-0c0s2n3 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400&lt;br/&gt;
2014-06-05T16:45:09.918910-04:00 c0-0c0s2n3 Process kiblnd_sd_01_01 (pid: 4554, threadinfo ffff8803ea378000, task ffff8803e89480c0)&lt;br/&gt;
2014-06-05T16:45:09.918924-04:00 c0-0c0s2n3 Stack:&lt;br/&gt;
2014-06-05T16:45:09.918940-04:00 c0-0c0s2n3 ffff8803ea379bd0 ffff880317d5a800 0000000000000001 0000000000000001&lt;br/&gt;
2014-06-05T16:45:09.918951-04:00 c0-0c0s2n3 00000000fffffff5 0000000000000001 ffff8803ea379c10 ffffffffa0338b28&lt;br/&gt;
2014-06-05T16:45:09.918956-04:00 c0-0c0s2n3 ffff880317d5a918 dead000000200200 ffff880317d5a800 ffff8803e9b18d80&lt;br/&gt;
2014-06-05T16:45:09.918961-04:00 c0-0c0s2n3 Call Trace:&lt;br/&gt;
2014-06-05T16:45:09.918966-04:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0338b28&amp;gt;&amp;#93;&lt;/span&gt; lnet_msg_decommit+0xf8/0x6b0 &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;br/&gt;
2014-06-05T16:45:09.948770-04:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0339b47&amp;gt;&amp;#93;&lt;/span&gt; lnet_finalize+0x297/0x7d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;br/&gt;
2014-06-05T16:45:09.948783-04:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa03465ed&amp;gt;&amp;#93;&lt;/span&gt; lnet_parse+0xc2d/0x1b80 &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;br/&gt;
2014-06-05T16:45:09.948794-04:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa03db68a&amp;gt;&amp;#93;&lt;/span&gt; kiblnd_handle_rx+0x30a/0x690 &lt;span class=&quot;error&quot;&gt;&amp;#91;ko2iblnd&amp;#93;&lt;/span&gt;&lt;br/&gt;
2014-06-05T16:45:09.948805-04:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa03e03af&amp;gt;&amp;#93;&lt;/span&gt; kiblnd_rx_complete+0x34f/0x420 &lt;span class=&quot;error&quot;&gt;&amp;#91;ko2iblnd&amp;#93;&lt;/span&gt;&lt;br/&gt;
2014-06-05T16:45:09.948815-04:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa03e0d25&amp;gt;&amp;#93;&lt;/span&gt; kiblnd_scheduler+0x7c5/0x970 &lt;span class=&quot;error&quot;&gt;&amp;#91;ko2iblnd&amp;#93;&lt;/span&gt;&lt;br/&gt;
2014-06-05T16:45:09.948821-04:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff810672fe&amp;gt;&amp;#93;&lt;/span&gt; kthread+0x9e/0xb0&lt;br/&gt;
2014-06-05T16:45:09.978765-04:00 c0-0c0s2n3 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81481874&amp;gt;&amp;#93;&lt;/span&gt; kernel_thread_helper+0x4/0x10&lt;br/&gt;
2014-06-05T16:45:09.978785-04:00 c0-0c0s2n3 Code: c2 0f 85 2b 01 00 00 8d 41 01 85 c0 41 89 45 48 0f 8f dc fe ff ff 49 8b 7d 20 be 01 00 00 00 48 83 &lt;br/&gt;
ef 10 48 8b 47 18 48 8b 57 10 &lt;br/&gt;
2014-06-05T16:45:10.004304-04:00 c0-0c0s2n3 89 42 08 48 89 10 48 b8 00 01 10 00 00 00 ad de 48 89 47 10 &lt;br/&gt;
2014-06-05T16:45:10.004326-04:00 c0-0c0s2n3 RIP  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0341831&amp;gt;&amp;#93;&lt;/span&gt; lnet_return_rx_credits_locked+0x171/0x310 &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;br/&gt;
2014-06-05T16:45:10.004333-04:00 c0-0c0s2n3 RSP &amp;lt;ffff8803ea379bb0&amp;gt;&lt;br/&gt;
2014-06-05T16:45:10.029888-04:00 c0-0c0s2n3 --&lt;del&gt;[ end trace 17126666cf42dece ]&lt;/del&gt;--&lt;/p&gt;</description>
                <environment>Cray router to connect infiniband to gemini interconnect.</environment>
        <key id="25046">LU-5151</key>
            <summary>Oops in lnet_return_rx_credits_locked</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="liang">Liang Zhen</assignee>
                                    <reporter username="simmonsja">James A Simmons</reporter>
                        <labels>
                            <label>lnet</label>
                    </labels>
                <created>Thu, 5 Jun 2014 22:24:47 +0000</created>
                <updated>Wed, 11 Jun 2014 13:41:54 +0000</updated>
                            <resolved>Wed, 11 Jun 2014 13:41:54 +0000</resolved>
                                    <version>Lustre 2.6.0</version>
                                    <fixVersion>Lustre 2.6.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>8</watches>
                                                                            <comments>
                            <comment id="85942" author="jamesanunez" created="Thu, 5 Jun 2014 23:22:12 +0000"  >&lt;p&gt;Liang, &lt;/p&gt;

&lt;p&gt;Would you please comment on this ticket?&lt;/p&gt;

&lt;p&gt;Thank you, &lt;br/&gt;
James&lt;/p&gt;</comment>
                            <comment id="85974" author="liang" created="Fri, 6 Jun 2014 05:47:04 +0000"  >&lt;p&gt;Hi James, to narrow down the problem, have you ever seen this issue with other versions between 2.4 and 2.6? thanks&lt;/p&gt;</comment>
                            <comment id="85988" author="liang" created="Fri, 6 Jun 2014 09:00:44 +0000"  >&lt;p&gt;I see the reason here, I think it&apos;s because this patch changed returned value of lnet_post_routed_recv_locked() from positive to negative (&lt;a href=&quot;http://review.whamcloud.com/#/c/9369/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/9369/&lt;/a&gt;)&lt;br/&gt;
which means lnet_parse_forward_locked()-&amp;gt;lnet_post_routed_recv_locked() will return -EAGAIN instead of EAGAIN, and it will be treated as real error in lnet_parse, although it is not a real error because EAGAIN means message is waiting for router buffer, and we will try to finalise a message which is still queued:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;        if (!for_me) {
                rc = lnet_parse_forward_locked(ni, msg);
                lnet_net_unlock(cpt);

                if (rc &amp;lt; 0)
                        goto free_drop;
                if (rc == 0) {
                        lnet_ni_recv(ni, msg-&amp;gt;msg_private, msg, 0,
                                     0, payload_length, payload_length);
                }
                return 0;
        }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="85996" author="simmonsja" created="Fri, 6 Jun 2014 12:26:06 +0000"  >&lt;p&gt;I have only seen the problem with 2.6.&lt;/p&gt;</comment>
                            <comment id="86000" author="liang" created="Fri, 6 Jun 2014 13:13:31 +0000"  >&lt;p&gt;patch is here: &lt;a href=&quot;http://review.whamcloud.com/#/c/10625/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/10625/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="86038" author="simmonsja" created="Fri, 6 Jun 2014 17:56:25 +0000"  >&lt;p&gt;I changed it from EAGAIN to -EAGAIN so it matches the behavior in the upstream kernel. It is frowned on to use EAGAIN in kernel space. How about instead we just do:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c
index 6097ae0..5fcc19b 100644
--- a/lnet/lnet/lib-move.c
+++ b/lnet/lnet/lib-move.c
@@ -1961,12 +1961,14 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid,
                rc = lnet_parse_forward_locked(ni, msg);
                lnet_net_unlock(cpt);
 
-               if (rc &amp;lt; 0)
+               if (rc == -EAGAIN) /* waiting for buffer */
+                       return 0;
+
+               if (rc != 0)
                        goto free_drop;
-               if (rc == 0) {
-                       lnet_ni_recv(ni, msg-&amp;gt;msg_private, msg, 0,
-                                    0, payload_length, payload_length);
-               }
+
+               lnet_ni_recv(ni, msg-&amp;gt;msg_private, msg, 0,
+                            0, payload_length, payload_length);
                return 0;
        }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="86087" author="simmonsja" created="Mon, 9 Jun 2014 13:05:28 +0000"  >&lt;p&gt;I tried the above version of my patch and it resolved the issue. Liang, do you mind if we go with that version instead?&lt;/p&gt;</comment>
                            <comment id="86099" author="liang" created="Mon, 9 Jun 2014 15:13:37 +0000"  >&lt;p&gt;Hi James, both ways should be fine. My concern about -EAGAIN is that we would need to impose another condition on the LND: -EAGAIN should never be returned by lnet_ni_eager_recv.&lt;br/&gt;
It is true for now, but it&apos;s probably better not to have this limit?&lt;/p&gt;</comment>
                            <comment id="86116" author="simmonsja" created="Mon, 9 Jun 2014 16:41:06 +0000"  >&lt;p&gt;I wouldn&apos;t consider that a huge limitation. The LASSERT in lnet_ni_eager_recv would handle this case today. So a potential driver writer would know not to return a -EAGAIN.&lt;/p&gt;</comment>
                            <comment id="86164" author="simmonsja" created="Mon, 9 Jun 2014 21:59:38 +0000"  >&lt;p&gt;I see there are strong opinions on this. I propose that if you want to continue using positive values in some of the functions, we call them something else besides EAGAIN and ENOENT. This will be flagged by the HPPD checker, and upstream using those values will be frowned on. So I suggest you define your own errors so EAGAIN will become LNET_RETRY and ENOENT becomes LNET_MISMATCH. Can you live with this compromise?&lt;/p&gt;</comment>
                            <comment id="86200" author="liang" created="Tue, 10 Jun 2014 11:52:46 +0000"  >&lt;p&gt;sorry I didn&apos;t notice there is update and review comments on patch, and overwrote it. &lt;br/&gt;
After rethink, I agree with Isaac and still tend to keep positive value, it will be much cleaner to me.&lt;/p&gt;</comment>
                            <comment id="86212" author="spitzcor" created="Tue, 10 Jun 2014 14:15:01 +0000"  >&lt;p&gt;James, thanks for reporting this problem.  We have seen this at Cray as well, of course, and only on routers as you had.  We have been working around the problem by using b2_5 vintage routers.&lt;/p&gt;</comment>
                            <comment id="86215" author="simmonsja" created="Tue, 10 Jun 2014 14:49:51 +0000"  >&lt;p&gt;Thank you Liang for working with me on this issue. IMHO the new LNET_CREDIT_*  values make it far more clear what is going on than using EAGAIN or just returning zero.&lt;/p&gt;</comment>
                            <comment id="86295" author="liang" created="Wed, 11 Jun 2014 02:59:57 +0000"  >&lt;p&gt;welcome, it&apos;s assigned to me anyway &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/wink.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;. I think we can close it now because it&apos;s landed on master, which is the only place we have this issue.&lt;/p&gt;</comment>
                            <comment id="86314" author="jlevi" created="Wed, 11 Jun 2014 13:41:54 +0000"  >&lt;p&gt;Patch landed to Master.&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10040" key="com.atlassian.jira.plugin.system.customfieldtypes:labels">
                        <customfieldname>Epic</customfieldname>
                        <customfieldvalues>
                                        <label>lnet</label>
    
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzwnuv:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>14214</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>