<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:31:20 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-10020] mlx5_warn:mlx5_1:dump_cqe:257:(pid 4031): dump error cqe</title>
                <link>https://jira.whamcloud.com/browse/LU-10020</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;We have the patch from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8752&quot; title=&quot;mlx5_warn:mlx5_0:dump_cqe:257:&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8752&quot;&gt;&lt;del&gt;LU-8752&lt;/del&gt;&lt;/a&gt; applied.&lt;/p&gt;

&lt;p&gt;We are testing lustre2.10.1 pre-release on a mlx5 hca host. lnet_selftest fails and mounting filesystem produced this error.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[ 435.503071] mlx5_warn:mlx5_1:dump_cqe:257:(pid 4031): dump error cqe
[ 435.503072] 00000000 00000000 00000000 00000000
[ 435.503072] 00000000 00000000 00000000 00000000
[ 435.503073] 00000000 00000000 00000000 00000000
[ 435.503075] 00000000 9d005304 08000069 005878d2
[ 435.503078] LNet: 4031:0:(o2iblnd_cb.c:3475:kiblnd_complete()) RDMA (tx: ffffc90063356f28) failed: 4
[ 435.503292] LNet: 4029:0:(o2iblnd_cb.c:967:kiblnd_tx_complete()) Tx -&amp;gt; 10.151.20.103@o2ib cookie 0x67 sending 1 waiting 0: failed 5
[ 435.503295] LNet: 4029:0:(o2iblnd_cb.c:1919:kiblnd_close_conn_locked()) Closing conn to 10.151.20.103@o2ib: error -5(waiting)
[ 435.503304] LNet: 4029:0:(rpc.c:1413:srpc_lnet_ev_handler()) LNet event status -5 type 1, RPC errors 11
[ 435.503306] LNet: 4029:0:(rpc.c:1413:srpc_lnet_ev_handler()) Skipped 1 previous similar message
[ 435.503396] LNet: 4151:0:(rpc.c:1143:srpc_client_rpc_done()) Client RPC done: service 5, peer 12345-10.151.20.103@o2ib, status SWI_STATE_REQUEST_SUBMITTED:1:-4
[ 440.503751] LNet: 4152:0:(lib-move.c:830:lnet_post_send_locked()) Dropping message &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 12345-10.151.20.103@o2ib: peer not alive
[ 440.503754] LNet: 4152:0:(lib-move.c:2827:LNetPut()) Error sending PUT to 12345-10.151.20.103@o2ib: -113
[ 440.503757] LNet: 4152:0:(rpc.c:1413:srpc_lnet_ev_handler()) LNet event status -113 type 5, RPC errors 16
[ 440.503758] LNet: 4152:0:(rpc.c:1413:srpc_lnet_ev_handler()) Skipped 4 previous similar messages
[ 440.503765] LNet: 4152:0:(rpc.c:1143:srpc_client_rpc_done()) Client RPC done: service 5, peer 12345-10.151.20.103@o2ib, status SWI_STATE_REQUEST_SUBMITTED:1:-4
[ 506.581347] LNet: 4173:0:(rpc.c:1069:srpc_client_rpc_expired()) Client RPC expired: service 11, peer 12345-10.151.20.103@o2ib, timeout 64.
[ 506.581363] LNet: 4147:0:(rpc.c:1143:srpc_client_rpc_done()) Client RPC done: service 11, peer 12345-10.151.20.103@o2ib, status SWI_STATE_REQUEST_SENT:1:-4
[ 506.581367] LustreError: 4147:0:(brw_test.c:344:brw_client_done_rpc()) BRW RPC to 12345-10.151.20.103@o2ib failed with -110


&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</description>
                <environment></environment>
        <key id="48427">LU-10020</key>
            <summary>mlx5_warn:mlx5_1:dump_cqe:257:(pid 4031): dump error cqe</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="6">Not a Bug</resolution>
                                        <assignee username="ashehata">Amir Shehata</assignee>
                                    <reporter username="mhanafi">Mahmoud Hanafi</reporter>
                        <labels>
                    </labels>
                <created>Fri, 22 Sep 2017 00:55:36 +0000</created>
                <updated>Tue, 1 Sep 2020 14:13:25 +0000</updated>
                            <resolved>Tue, 18 Dec 2018 17:43:25 +0000</resolved>
                                    <version>Lustre 2.10.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>7</watches>
                                                                            <comments>
                            <comment id="209166" author="bhoagland" created="Fri, 22 Sep 2017 01:16:04 +0000"  >&lt;p&gt;Hello,&lt;/p&gt;

&lt;p&gt;Can you please confirm you are using 2.10.1 pre-release on a production system?&lt;/p&gt;</comment>
                            <comment id="209167" author="mhanafi" created="Fri, 22 Sep 2017 01:25:30 +0000"  >&lt;p&gt;This particular system is a router and not in production yet. We are testing multirail for production use.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="209170" author="pjones" created="Fri, 22 Sep 2017 02:11:41 +0000"  >&lt;p&gt;Then let&apos;s move to sev 2 - sev 1 is just for production outages&lt;/p&gt;</comment>
                            <comment id="209178" author="pjones" created="Fri, 22 Sep 2017 03:15:35 +0000"  >&lt;p&gt;Amir&lt;/p&gt;

&lt;p&gt;Can you please advise on this one?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="209283" author="mhanafi" created="Fri, 22 Sep 2017 13:47:28 +0000"  >&lt;p&gt;Additional Info:&lt;/p&gt;

&lt;p&gt;HCA:&#160; Mellanox Technologies MT27700 Family &lt;span class=&quot;error&quot;&gt;&amp;#91;ConnectX-4&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;OS: Sles12 SP2 4.4.74-92.32.1.20170808-nasa&lt;/p&gt;

&lt;p&gt;OFED: mlnx ofed 3.4.2&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;Lustre 2.9 Works&lt;/p&gt;

&lt;p&gt;Lustre 2.10.0 Works.&lt;/p&gt;

&lt;p&gt;Lustre 2.10.1 Does not work&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="209312" author="ashehata" created="Fri, 22 Sep 2017 18:16:02 +0000"  >&lt;p&gt;I wonder if commit:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;commit f87c7c2cee6fc5a0864a757917a414dc605554b3
Author: Doug Oucharek &amp;lt;doug.s.oucharek@intel.com&amp;gt;
Date:   Tue May 16 16:00:53 2017 -0700

    LU-9500 lnd: Don&apos;t Page Align remote_addr with FastReg
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Is the problem. &lt;/p&gt;

&lt;p&gt;Can you take out this commit from your 2.10.1 tree and try it out?&lt;/p&gt;

&lt;p&gt;I&apos;ll try and reproduce locally as well&lt;/p&gt;</comment>
                            <comment id="209341" author="mhanafi" created="Fri, 22 Sep 2017 22:11:23 +0000"  >&lt;p&gt;removing commit f87c7c2cee6fc5a0864a757917a414dc605554b3 fixed the problem in mofed3.4.2. We are building mofed4.1 and will test soon.&lt;br/&gt;
It may be required with mofed4.x.&lt;/p&gt;</comment>
                            <comment id="209346" author="ashehata" created="Fri, 22 Sep 2017 23:45:06 +0000"  >&lt;p&gt;Ok. &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9500&quot; title=&quot;MOFED 4/mlx5: Aligning non-aligned page addresses trigger dump_cqe&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9500&quot;&gt;&lt;del&gt;LU-9500&lt;/del&gt;&lt;/a&gt; was intended to get mofed 4.1 working, but it shouldn&apos;t have broken mofed3.4.2. We&apos;ll need to resolve that.&lt;/p&gt;</comment>
                            <comment id="209370" author="mhanafi" created="Sun, 24 Sep 2017 03:07:20 +0000"  >&lt;p&gt;I tested mofed4.1 and lustre2.10.1. Didn&apos;t get the error.&lt;/p&gt;</comment>
                            <comment id="209439" author="ashehata" created="Mon, 25 Sep 2017 16:26:44 +0000"  >&lt;p&gt;Just to clarify, is the passing test with mofed 4.1, Lustre 2.10.1 minus &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9500&quot; title=&quot;MOFED 4/mlx5: Aligning non-aligned page addresses trigger dump_cqe&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9500&quot;&gt;&lt;del&gt;LU-9500&lt;/del&gt;&lt;/a&gt;? Or with &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9500&quot; title=&quot;MOFED 4/mlx5: Aligning non-aligned page addresses trigger dump_cqe&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9500&quot;&gt;&lt;del&gt;LU-9500&lt;/del&gt;&lt;/a&gt;?&lt;/p&gt;</comment>
                            <comment id="209494" author="ashehata" created="Mon, 25 Sep 2017 22:10:27 +0000"  >&lt;p&gt;I couldn&apos;t reproduce the failure with lnet-selftest on RHEL 7.3/MOFED 3.4.2/Lustre 2.10.1-RC1.&lt;br/&gt;
are you able to consistently reproduce this with mofed3.4.2 + 2.10.1? This patch specifically addresses fastreg.&lt;/p&gt;

&lt;p&gt;would you be able to provide us with net/neterror logs for this problem?&lt;/p&gt;</comment>
                            <comment id="209500" author="mhanafi" created="Mon, 25 Sep 2017 22:50:57 +0000"  >&lt;p&gt;With Rhel7.3/Mofed3.4.2/lustre2.10.1-Rc1 I didn&apos;t always get the &quot;dump error cqe.&quot; But lnet-selftest wasn&apos;t working with lots of lnet errors.&lt;/p&gt;

&lt;p&gt;With Rhel7.3/Mofed3.4.2/lustre 2.10 and  Rhel7.3/Mofed4.1/lustre 2.10.1-RC1 it always works&lt;/p&gt;

&lt;p&gt;And Removing &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9500&quot; title=&quot;MOFED 4/mlx5: Aligning non-aligned page addresses trigger dump_cqe&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9500&quot;&gt;&lt;del&gt;LU-9500&lt;/del&gt;&lt;/a&gt; with Rhel7.3/Mofed3.4.2/lustre2.10.1-Rc1 it always worked.&lt;/p&gt;

&lt;p&gt;Currently I am running with Rhel7.3/Mofed4.1/lustre 2.10.1-RC1. I&apos;ll revert and gather some logs.&lt;/p&gt;

&lt;p&gt;I think to reproduce it map_on_demand needs to be configured&lt;br/&gt;
Here is our config.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;options ko2iblnd timeout=150 retry_count=7 peer_timeout=0 map_on_demand=32 peer_credits=63 concurrent_sends=63

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="209505" author="ashehata" created="Tue, 26 Sep 2017 00:00:13 +0000"  >&lt;p&gt;ok, I&apos;m able to reproduce with map_on_demand=32.&lt;/p&gt;

&lt;p&gt;I&apos;ll investigate further.&lt;/p&gt;</comment>
                            <comment id="209512" author="ashehata" created="Tue, 26 Sep 2017 01:48:37 +0000"  >&lt;p&gt;It looks like the issue is:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;kiblnd_fmr_map_tx()
...
    &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (!is_fastreg) 
         rd-&amp;gt;rd_frags[0].rf_addr &amp;amp;= ~hdev-&amp;gt;ibh_page_mask;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;for MOFED 3.4.2 it appears that we need to page align the remote_addr even when fast_reg is enabled.&lt;/p&gt;

&lt;p&gt;Next step is to install MOFED 4.1 and see if page aligning the remote_address will trigger failure. If it does then we might need to check for MOFED version to see if we need to page align.&lt;/p&gt;</comment>
                            <comment id="210328" author="ashehata" created="Wed, 4 Oct 2017 20:49:48 +0000"  >&lt;p&gt;Please look at &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9983&quot; title=&quot;LBUG llog_osd.c:327:llog_osd_declare_write_rec() - all DNE MDS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9983&quot;&gt;&lt;del&gt;LU-9983&lt;/del&gt;&lt;/a&gt;. I believe this would be the same problem. &lt;a href=&quot;https://review.whamcloud.com/29290&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/29290&lt;/a&gt; should resolve that issue.&lt;/p&gt;

&lt;p&gt;Let me know if the patch helps in your case.&lt;/p&gt;</comment>
                            <comment id="210760" author="mhanafi" created="Tue, 10 Oct 2017 21:07:33 +0000"  >&lt;p&gt;Will &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9983&quot; title=&quot;LBUG llog_osd.c:327:llog_osd_declare_write_rec() - all DNE MDS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9983&quot;&gt;&lt;del&gt;LU-9983&lt;/del&gt;&lt;/a&gt; land as the solution for this issue? We have moved up to mofed4 so we are no longer seeing the issue.&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="210914" author="ashehata" created="Thu, 12 Oct 2017 00:20:54 +0000"  >&lt;p&gt;Please take a look at my comment here:&lt;br/&gt;
&lt;a href=&quot;https://jira.hpdd.intel.com/browse/LU-10089?focusedCommentId=210745&amp;amp;page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-210745&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://jira.hpdd.intel.com/browse/LU-10089?focusedCommentId=210745&amp;amp;page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-210745&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="238765" author="mhanafi" created="Tue, 18 Dec 2018 17:23:03 +0000"  >&lt;p&gt;I think this can be closed. All fixes have been pushed to mofed 4.4.2&lt;/p&gt;</comment>
                            <comment id="238770" author="pjones" created="Tue, 18 Dec 2018 17:43:25 +0000"  >&lt;p&gt;ok - thanks&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                                        </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzzkm7:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10021"><![CDATA[2]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>