<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:31:49 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
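A concrete illustration (URL pattern assumed from the standard JIRA issue-xml view; verify against this instance):
https://jira.whamcloud.com/si/jira.issueviews:issue-xml/LU-10073/LU-10073.xml?field=key&field=summary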
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
<language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-10073] lnet-selftest test_smoke: lst Error found</title>
                <link>https://jira.whamcloud.com/browse/LU-10073</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;&lt;a href=&quot;https://testing.whamcloud.com/test_sets/87032fec-9d50-11e7-b778-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/87032fec-9d50-11e7-b778-5254006e85c2&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Seen previously in 2.9 testing (&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6622&quot; title=&quot;lnet-selftest test_smoke: @@@@@@ FAIL: lst Error found&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6622&quot;&gt;&lt;del&gt;LU-6622&lt;/del&gt;&lt;/a&gt;).&lt;/p&gt;

&lt;p&gt;From test_log:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Batch is stopped
12345-10.9.0.84@tcp: [Session 0 brw errors, 30 ping errors] [RPC: 0 errors, 0 dropped, 30 expired]
12345-10.9.0.85@tcp: [Session 0 brw errors, 30 ping errors] [RPC: 0 errors, 0 dropped, 30 expired]
c:
Total 2 error nodes in c
12345-10.9.5.24@tcp: [Session 0 brw errors, 30 ping errors] [RPC: 0 errors, 0 dropped, 30 expired]
12345-10.9.5.25@tcp: [Session 0 brw errors, 30 ping errors] [RPC: 0 errors, 0 dropped, 30 expired]
s:
Total 2 error nodes in s
session is ended
Total 2 error nodes in c
Total 2 error nodes in s
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;  and &lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Started clients trevis-77vm3.trevis.hpdd.intel.com,trevis-77vm4: 
CMD: trevis-77vm3.trevis.hpdd.intel.com,trevis-77vm4 mount | grep /mnt/lustre&apos; &apos;
10.9.5.25@tcp:/lustre on /mnt/lustre type lustre (rw,flock,user_xattr,lazystatfs)
10.9.5.25@tcp:/lustre on /mnt/lustre type lustre (rw,flock,user_xattr,lazystatfs)
CMD: trevis-77vm4 PATH=/usr/lib64/lustre/tests:/usr/lib/lustre/tests:/usr/lib64/lustre/tests:/opt/iozone/bin:/opt/iozone/bin:/usr/lib64/lustre/tests/mpi:/usr/lib64/lustre/tests/racer:/usr/lib64/lustre/../lustre-iokit/sgpdd-survey:/usr/lib64/lustre/tests:/usr/lib64/lustre/utils/gss:/usr/lib64/lustre/utils:/usr/lib64/qt-3.3/bin:/usr/lib64/openmpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/usr/sbin:/sbin:/bin::/sbin:/bin:/usr/sbin: NAME=autotest_config sh rpc.sh set_default_debug \&quot;vfstrace rpctrace dlmtrace neterror ha config ioctl super lfsck\&quot; \&quot;all\&quot; 4 
trevis-77vm4: h2tcp: deprecated, use h2nettype instead
trevis-77vm4: trevis-77vm4.trevis.hpdd.intel.com: executing set_default_debug vfstrace rpctrace dlmtrace neterror ha config ioctl super lfsck all 4
 lnet-selftest test_smoke: @@@@@@ FAIL: lst Error found 
  Trace dump:
  = /usr/lib64/lustre/tests/test-framework.sh:5289:error()
  = /usr/lib64/lustre/tests/lnet-selftest.sh:153:check_lst_err()
  = /usr/lib64/lustre/tests/lnet-selftest.sh:179:test_smoke()
  = /usr/lib64/lustre/tests/test-framework.sh:5565:run_one()
  = /usr/lib64/lustre/tests/test-framework.sh:5604:run_one_logged()
  = /usr/lib64/lustre/tests/test-framework.sh:5451:run_test()
  = /usr/lib64/lustre/tests/lnet-selftest.sh:182:main()
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment>trevis, full, x86_64 servers, ppc clients&lt;br/&gt;
servers: el7.4, ldiskfs, branch master, v2.10.53.1, b3642&lt;br/&gt;
clients: el7.4, branch master, v2.10.53.1, b3642&lt;br/&gt;
</environment>
        <key id="48588">LU-10073</key>
            <summary>lnet-selftest test_smoke: lst Error found</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="simmonsja">James A Simmons</assignee>
                                    <reporter username="jcasper">James Casper</reporter>
                        <labels>
                            <label>LTS12</label>
                            <label>arm</label>
                            <label>ppc</label>
                            <label>rhel8</label>
                            <label>ubuntu</label>
                    </labels>
                <created>Wed, 4 Oct 2017 16:40:37 +0000</created>
                <updated>Sat, 20 Jan 2024 00:46:14 +0000</updated>
                            <resolved>Tue, 31 May 2022 01:46:11 +0000</resolved>
                                    <version>Lustre 2.11.0</version>
                    <version>Lustre 2.13.0</version>
                    <version>Lustre 2.10.7</version>
                    <version>Lustre 2.12.1</version>
                    <version>Lustre 2.12.3</version>
                    <version>Lustre 2.12.4</version>
                    <version>Lustre 2.12.5</version>
                    <version>Lustre 2.12.6</version>
                                    <fixVersion>Lustre 2.16.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>13</watches>
                                                                            <comments>
                            <comment id="237546" author="adilger" created="Tue, 27 Nov 2018 19:16:18 +0000"  >&lt;p&gt;Still seeing these failures on a regular basis - &lt;a href=&quot;https://testing.whamcloud.com/sub_tests/query?utf8=%E2%9C%93&amp;amp;warn%5Bnotice%5D=&amp;amp;test_set_script_id=c24874b2-4a56-11e0-a7f6-52540025f9af&amp;amp;status%5B%5D=FAIL&amp;amp;horizon=518400&amp;amp;commit=Update+results&amp;amp;buggable_class=SubTest&amp;amp;num_results=250&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;50 failures in the past week&lt;/a&gt;, which is about 23% of all runs, increasing to 45% of ARM runs:&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/sub_tests/021c4e14-f01d-11e8-bfe1-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/sub_tests/021c4e14-f01d-11e8-bfe1-52540065bddc&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/sub_tests/0512c886-f1c7-11e8-815b-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/sub_tests/0512c886-f1c7-11e8-815b-52540065bddc&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/sub_tests/f7e4f2a6-edac-11e8-815b-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/sub_tests/f7e4f2a6-edac-11e8-815b-52540065bddc&lt;/a&gt;&lt;br/&gt;
:&lt;br/&gt;
:&lt;/p&gt;</comment>
                            <comment id="237556" author="sharmaso" created="Tue, 27 Nov 2018 20:19:13 +0000"  >&lt;p&gt;I would say these current failures are different from what is mentioned in the description of this ticket. In the description, it shows to have ping errors but in all the above test failure links, the errors are the brw errors.&lt;br/&gt;
Here is the ticket that James had opened for similar brw errors that were seen.&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11389&quot; title=&quot;lnet-setltest test smoke fails with &#8216;lst Error found&#8217;&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11389&quot;&gt;&lt;del&gt;LU-11389&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="238440" author="artem_blagodarenko" created="Wed, 12 Dec 2018 08:21:27 +0000"  >&lt;p&gt;Happened again&#160;&lt;a href=&quot;https://testing.whamcloud.com/test_sets/e1d460c4-f7e7-11e8-b67f-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/e1d460c4-f7e7-11e8-b67f-52540065bddc&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="238501" author="sharmaso" created="Thu, 13 Dec 2018 01:13:49 +0000"  >&lt;p&gt;Ideally this again looks more related to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11389&quot; title=&quot;lnet-setltest test smoke fails with &#8216;lst Error found&#8217;&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11389&quot;&gt;&lt;del&gt;LU-11389&lt;/del&gt;&lt;/a&gt; than the ping errors reported in this ticket description.&lt;/p&gt;

&lt;p&gt;But in this particular case I also see socklnd error messages in the logs, like the ones below, which I did not see in the other reported cases:&#160;&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
LNetError: 19630:0:(socklnd.c:1679:ksocknal_destroy_conn()) Completing partial receive from 12345-10.9.8.62@tcp[2], ip 10.9.8.62:1021, with error, wanted: 811328, left: 811328, last alive is 5 secs ago

LNet: 2475:0:(rpc.c:1250:srpc_send_rpc()) Remote error 74 at 12345-10.9.8.62@tcp, unlink bulk buffer in &lt;span class=&quot;code-keyword&quot;&gt;case&lt;/span&gt; peer didn&apos;t initiate bulk transfer&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;And these -&#160;&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
00000800:00020000:0.0:1543943185.853200:0:19630:0:(socklnd.c:1679:ksocknal_destroy_conn()) Completing partial receive from 12345-10.9.8.62@tcp[2], ip 10.9.8.62:1021, with error, wanted: 811328, left: 811328, last alive is 5 secs ago
00000400:00000100:0.0:1543943185.858360:0:19630:0:(rpc.c:1418:srpc_lnet_ev_handler()) LNet event status -5 type 2, RPC errors 1
00000400:00000100:0.0:1543943185.865739:0:19632:0:(lib-move.c:3856:lnet_parse_reply()) 10.9.4.72@tcp: Dropping REPLY from 12345-10.9.8.62@tcp &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; invalid MD 0x156d2737da0adbc9.0x483fb9
00000400:00000100:0.0:1543943185.866100:0:19632:0:(lib-move.c:3856:lnet_parse_reply()) 10.9.4.72@tcp: Dropping REPLY from 12345-10.9.8.62@tcp &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; invalid MD 0x156d2737da0adbc9.0x483fbd
00000400:00000100:0.0:1543943185.866322:0:19632:0:(lib-move.c:3856:lnet_parse_reply()) 10.9.4.72@tcp: Dropping REPLY from 12345-10.9.8.62@tcp &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; invalid MD 0x156d2737da0adbc9.0x483fc1
00000400:00000100:0.0:1543943185.866328:0:19632:0:(lib-move.c:3856:lnet_parse_reply()) 10.9.4.72@tcp: Dropping REPLY from 12345-10.9.8.62@tcp &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; invalid MD 0x156d2737da0adbc9.0x483fc5
00000400:00000100:0.0:1543943185.866333:0:19632:0:(lib-move.c:3856:lnet_parse_reply()) 10.9.4.72@tcp: Dropping REPLY from 12345-10.9.8.62@tcp &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; invalid MD 0x156d2737da0adbc9.0x483fc9
00000800:00000100:0.0:1543943185.866336:0:19632:0:(socklnd_cb.c:972:ksocknal_launch_packet()) No usable routes to 12345-10.9.8.62@tcp
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Investigating on these errors.&lt;/p&gt;</comment>
                            <comment id="238905" author="adilger" created="Thu, 20 Dec 2018 06:39:06 +0000"  >&lt;p&gt;Still seeing lnet-selftest on master after patch &lt;a href=&quot;https://review.whamcloud.com/33231&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/33231&lt;/a&gt; &quot;&lt;tt&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11389&quot; title=&quot;lnet-setltest test smoke fails with &#8216;lst Error found&#8217;&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11389&quot;&gt;&lt;del&gt;LU-11389&lt;/del&gt;&lt;/a&gt; lnet: increase lnet transaction timeout&lt;/tt&gt;&quot; has landed.&lt;/p&gt;

&lt;p&gt;This particular failure is with an Ubuntu 18.04 client for a regular review-ldiskfs test run:&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/test_sets/9e8fb032-040f-11e9-b970-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/9e8fb032-040f-11e9-b970-52540065bddc&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Since bug &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11389&quot; title=&quot;lnet-setltest test smoke fails with &#8216;lst Error found&#8217;&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11389&quot;&gt;&lt;del&gt;LU-11389&lt;/del&gt;&lt;/a&gt; has been closed for 2.12, and autotest is flagging this ticket for all of the current test failures, let&apos;s just use this ticket for the BRW errors currently being seen and reported here.&lt;/p&gt;

&lt;p&gt;The following messages are in the MDS log:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[ 7418.687141] LNetError: 14039:0:(socklnd.c:1679:ksocknal_destroy_conn()) Completing partial receive from 12345-10.9.4.230@tcp[2], ip 10.9.4.230:1021, with error, wanted: 1013616, left: 1013616, last alive is 0 secs ago
[ 7418.688754] LustreError: 5373:0:(brw_test.c:415:brw_bulk_ready()) BRW bulk READ failed for RPC from 12345-10.9.4.230@tcp: -5
[ 7418.688761] LustreError: 5373:0:(brw_test.c:389:brw_server_rpc_done()) Bulk transfer to 12345-10.9.4.230@tcp has failed: -5
[ 7469.603687] LustreError: 5373:0:(brw_test.c:415:brw_bulk_ready()) BRW bulk WRITE failed for RPC from 12345-10.9.4.230@tcp: -4
[ 7469.605632] LustreError: 5373:0:(brw_test.c:415:brw_bulk_ready()) Skipped 2 previous similar messages
[ 7469.607239] LustreError: 5373:0:(brw_test.c:389:brw_server_rpc_done()) Bulk transfer from 12345-10.9.4.230@tcp has failed: -5
[ 7469.609038] LustreError: 5373:0:(brw_test.c:389:brw_server_rpc_done()) Skipped 2 previous similar messages
[ 7481.461431] LNet: 5389:0:(rpc.c:1072:srpc_client_rpc_expired()) Client RPC expired: service 11, peer 12345-10.9.4.230@tcp, timeout 64.
[ 7481.463637] LustreError: 5373:0:(brw_test.c:344:brw_client_done_rpc()) BRW RPC to 12345-10.9.4.230@tcp failed with -110
[ 7595.562071] LustreError: 5373:0:(brw_test.c:415:brw_bulk_ready()) BRW bulk READ failed for RPC from 12345-10.9.4.231@tcp: -5
[ 7595.562075] LNetError: 14039:0:(socklnd.c:1679:ksocknal_destroy_conn()) Completing partial receive from 12345-10.9.4.231@tcp[2], ip 10.9.4.231:1021, with error, wanted: 487400, left: 487400, last alive is 0 secs ago
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="238977" author="adilger" created="Thu, 20 Dec 2018 19:03:16 +0000"  >&lt;p&gt;I&apos;m actually seeing &lt;b&gt;both&lt;/b&gt; the ping and BRW timeout errors in the same run, so it doesn&apos;t seem like they are different problems in the end.  Some logs from &lt;a href=&quot;https://testing.whamcloud.com/sub_tests/4acfe152-0102-11e9-b970-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/sub_tests/4acfe152-0102-11e9-b970-52540065bddc&lt;/a&gt; :&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[ 6714.891483] LNet: 27908:0:(rpc.c:612:srpc_service_add_buffers()) waiting for adding buffer
[ 6715.091612] LNet: 27908:0:(rpc.c:612:srpc_service_add_buffers()) waiting for adding buffer
[ 6715.292141] LNet: 27908:0:(rpc.c:612:srpc_service_add_buffers()) waiting for adding buffer
[ 6715.492122] LNet: 27908:0:(rpc.c:612:srpc_service_add_buffers()) waiting for adding buffer
[ 6715.692075] LNet: 27908:0:(rpc.c:612:srpc_service_add_buffers()) waiting for adding buffer
[ 6715.892115] LNet: 27908:0:(rpc.c:612:srpc_service_add_buffers()) waiting for adding buffer
[ 6716.092499] LNet: 27908:0:(rpc.c:612:srpc_service_add_buffers()) waiting for adding buffer
[ 6716.291542] LNet: 27908:0:(rpc.c:612:srpc_service_add_buffers()) waiting for adding buffer
[ 6716.491536] LNet: 27908:0:(rpc.c:612:srpc_service_add_buffers()) waiting for adding buffer
[ 6716.691533] LNet: 27908:0:(rpc.c:612:srpc_service_add_buffers()) waiting for adding buffer
[ 6780.283118] LNet: 27912:0:(rpc.c:1072:srpc_client_rpc_expired()) Client RPC expired: service 12, peer 12345-10.9.5.35@tcp, timeout 64.
[ 6780.286872] LustreError: 27910:0:(ping_test.c:132:ping_client_done_rpc()) Unable to ping 12345-10.9.5.35@tcp (0): -110
[ 7260.303131] LNet: 27912:0:(rpc.c:1072:srpc_client_rpc_expired()) Client RPC expired: service 11, peer 12345-10.9.5.35@tcp, timeout 64.
[ 7260.303351] LNet: 27912:0:(rpc.c:1072:srpc_client_rpc_expired()) Skipped 29 previous similar messages
[ 7260.303813] LustreError: 27910:0:(brw_test.c:344:brw_client_done_rpc()) BRW RPC to 12345-10.9.5.35@tcp failed with -110
[ 7268.313051] LNet: 27912:0:(rpc.c:1072:srpc_client_rpc_expired()) Client RPC expired: service 11, peer 12345-10.9.5.35@tcp, timeout 64.
[ 7268.313209] LNet: 27912:0:(rpc.c:1072:srpc_client_rpc_expired()) Skipped 80 previous similar messages
[ 7268.313397] LustreError: 27910:0:(brw_test.c:344:brw_client_done_rpc()) BRW RPC to 12345-10.9.5.35@tcp failed with -110
[ 7268.313470] LustreError: 27910:0:(brw_test.c:344:brw_client_done_rpc()) Skipped 80 previous similar messages
[ 8519.063017] LNetError: 20589:0:(lib-msg.c:811:lnet_is_health_check()) Msg is in inconsistent state, don&apos;t perform health checking (-125, 0)
[ 8520.507784] Lustre: 27908:0:(ping_test.c:79:ping_client_fini()) 30 pings have failed.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Over the past 4 weeks we have seen about 10% failures in lnet-selftest overall and for x86/RHEL 7.5 in general, jumping to 20% for ARM (4.14 kernel) and 100% failures for Ubuntu 18.04 (4.15 kernel) and PPC kernels.  I think this issue needs more attention.&lt;/p&gt;</comment>
                            <comment id="239040" author="sharmaso" created="Fri, 21 Dec 2018 21:30:24 +0000"  >&lt;p&gt;I investigated the lnet_selftest failure with this patch&#160;&lt;a href=&quot;https://testing.whamcloud.com/test_sets/e1d460c4-f7e7-11e8-b67f-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/e1d460c4-f7e7-11e8-b67f-52540065bddc&lt;/a&gt;&#160;(&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7159&quot; title=&quot;sanity 224c always passes&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7159&quot;&gt;&lt;del&gt;LU-7159&lt;/del&gt;&lt;/a&gt;). I wasn&apos;t able to reproduce it using the same build on 2 ARM nodes and just running selftest between them. So I tried changing the order of the tests in gerrit by listing these test parameters.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
Test-Parameters: clientarch=aarch64 testlist=node-provisioning-1,lustre-initialization-1,lnet-selftest,sanity
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;With the order of execution changed, i.e. running lnet_selftest before sanity, lnet_selftest passes but test 103b in sanity crashes.&lt;br/&gt;
So I would think the issue is not with lnet_selftest in this particular failure case.&lt;/p&gt;

&lt;p&gt;Still investigating selftest failure on Ubuntu.&lt;/p&gt;</comment>
                            <comment id="241504" author="adilger" created="Wed, 6 Feb 2019 22:12:27 +0000"  >&lt;p&gt;The test_103b failure is a known issue - &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11878&quot; title=&quot;sanity test 103b: OOM because of too many bash processes: page allocation stalls for 18420ms&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11878&quot;&gt;&lt;del&gt;LU-11878&lt;/del&gt;&lt;/a&gt;.  I&apos;d be reasonably content if we changed the test order to run lnet-selftest before sanity, since that also seems more logical.&lt;/p&gt;</comment>
                            <comment id="241514" author="adilger" created="Wed, 6 Feb 2019 23:05:19 +0000"  >&lt;p&gt;I pushed patch &lt;a href=&quot;https://review.whamcloud.com/34201&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/34201&lt;/a&gt; to change the order of the tests for &lt;tt&gt;review-ldiskfs&lt;/tt&gt; and &lt;tt&gt;review-ldiskfs-arm&lt;/tt&gt; so that &lt;tt&gt;lnet-selftest&lt;/tt&gt; runs before &lt;tt&gt;sanity&lt;/tt&gt;.  Not ideal, but it may allow ARM testing to be more reliable.&lt;/p&gt;</comment>
                            <comment id="242864" author="simmonsja" created="Tue, 26 Feb 2019 23:29:32 +0000"  >&lt;p&gt;I see this Ubuntu18 testing as well.&lt;/p&gt;</comment>
                            <comment id="243274" author="adilger" created="Mon, 4 Mar 2019 06:23:21 +0000"  >&lt;p&gt;Sonia, &lt;tt&gt;lnet-selftest&lt;/tt&gt; is still failing even after 34201 landed, putting it at the start of the test sessions.&lt;/p&gt;</comment>
                            <comment id="243282" author="sharmaso" created="Mon, 4 Mar 2019 12:22:21 +0000"  >&lt;p&gt;Are the failures for ARM as well? Any recent test-failure I can look at?&lt;/p&gt;</comment>
                            <comment id="243284" author="adilger" created="Mon, 4 Mar 2019 12:38:06 +0000"  >&lt;p&gt;It seems more prevalent on ARM, but it also happens on x86 client/server as well, please see:&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://testing.whamcloud.com/sub_tests/query?utf8=%E2%9C%93&amp;amp;warn%5Bnotice%5D=&amp;amp;test_set_script_id=c24874b2-4a56-11e0-a7f6-52540025f9af&amp;amp;sub_test_script_id=c252c2c8-4a56-11e0-a7f6-52540025f9af&amp;amp;status%5B%5D=FAIL&amp;amp;horizon=518400&amp;amp;commit=Update+results&amp;amp;buggable_class=SubTest&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/sub_tests/query?utf8=%E2%9C%93&amp;amp;warn%5Bnotice%5D=&amp;amp;test_set_script_id=c24874b2-4a56-11e0-a7f6-52540025f9af&amp;amp;sub_test_script_id=c252c2c8-4a56-11e0-a7f6-52540025f9af&amp;amp;status%5B%5D=FAIL&amp;amp;horizon=518400&amp;amp;commit=Update+results&amp;amp;buggable_class=SubTest&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="243433" author="sharmaso" created="Wed, 6 Mar 2019 22:57:25 +0000"  >&lt;p&gt;I checked the logs of the various failure cases in that link. There are two kind of logs we see depending on if it is the brw_errors or the ping errors. We also have cases where&#160; both ping errors and brw errors are there.&lt;/p&gt;

&lt;p&gt;One example of the ping error case:&#160;&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/sub_tests/92ecef78-3e26-11e9-9720-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/sub_tests/92ecef78-3e26-11e9-9720-52540065bddc&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The ping error logs look like the following.&#160;They just suggest that pings are timing out.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
00000400:00000100:1.0:1551576978.818367:2480:6658:0:(lib-move.c:3753:lnet_parse_put()) Dropping PUT from 12345-10.9.5.36@tcp portal 52 match 11025656312733141039 offset 0 length 160: 4
00000400:00000100:1.0:1551576978.818373:2480:6658:0:(lib-move.c:3753:lnet_parse_put()) Dropping PUT from 
00000400:00000400:0.0:1551577048.385694:1680:16992:0:(rpc.c:1072:srpc_client_rpc_expired()) Client RPC expired: service 12, peer 12345-10.9.5.35@tcp, timeout 64.
00000400:00000400:0.0:1551577048.385846:1680:16992:0:(rpc.c:1072:srpc_client_rpc_expired()) Client RPC expired: service 12, peer 12345-10.9.5.36@tcp, timeout 64.
00000400:00000400:0.0:1551577048.385848:1680:16992:0:(rpc.c:1072:srpc_client_rpc_expired()) Client RPC expired: service 12, peer 12345-10.9.5.35@tcp, timeout 64
00000400:00000100:0.0:1551577048.385892:2032:16988:0:(rpc.c:1146:srpc_client_rpc_done()) Client RPC done: service 12, peer 12345-10.9.5.35@tcp, status SWI_STATE_REQUEST_SENT:1:-4
00000001:00020000:0.0:1551577048.385895:2432:16988:0:(ping_test.c:132:ping_client_done_rpc()) Unable to ping 12345-10.9.5.35@tcp (0): -110
00000400:00000100:0.0:1551577048.385970:2032:16988:0:(rpc.c:1146:srpc_client_rpc_done()) Client RPC done: service 12, peer 12345-10.9.5.36@tcp, status SWI_STATE_REQUEST_SENT:1:-4
00000001:00020000:0.0:1551577048.385972:2432:16988:0:(ping_test.c:132:ping_client_done_rpc()) Unable to ping 12345-10.9.5.36@tcp (1): -110
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;I tried to reproduce locally again but could not.&#160;&lt;/p&gt;

&lt;p&gt;At this point I would like to check whether there is any issue with the way maloo sets up lnet-selftest, or with the network connectivity between the nodes the selftest runs on. For example, whether the IP addresses of the nodes are picked up correctly.&lt;/p&gt;

&lt;p&gt;I think it would be best to first rule out any issue with the way maloo runs this test.&lt;/p&gt;</comment>
                            <comment id="244533" author="mdiep" created="Fri, 22 Mar 2019 15:03:36 +0000"  >&lt;p&gt;+1 on b2_12 but not on ARM&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/test_sets/4ad11d4c-4c6b-11e9-a256-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/4ad11d4c-4c6b-11e9-a256-52540065bddc&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="244861" author="adilger" created="Thu, 28 Mar 2019 23:12:20 +0000"  >&lt;p&gt;Have hit this five times in a row on non-ARM review-ldiskfs when testing Ubuntu 18.04 on the client for patch &lt;a href=&quot;https://review.whamcloud.com/34456&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/34456&lt;/a&gt; which is only changing the Debian build files:&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/test_sessions/34b7097c-f756-48b9-8e32-26d9ff4122a6&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sessions/34b7097c-f756-48b9-8e32-26d9ff4122a6&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/test_sessions/270c5ece-da24-4307-9001-6989c1a8c5c5&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sessions/270c5ece-da24-4307-9001-6989c1a8c5c5&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/test_sessions/6706e3d9-698a-4346-98cd-3b4a6c45d6e0&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sessions/6706e3d9-698a-4346-98cd-3b4a6c45d6e0&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/test_sessions/c3e2410d-f9c5-4aba-996e-371dcb810afb&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sessions/c3e2410d-f9c5-4aba-996e-371dcb810afb&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/test_sessions/2151adcf-c785-46c4-973f-683f1f7561c7&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sessions/2151adcf-c785-46c4-973f-683f1f7561c7&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;It looks like Minh&apos;s recent failure is Ubuntu 18.04 as well, so that may be a better way to debug this issue.&lt;/p&gt;</comment>
                            <comment id="244927" author="gerrit" created="Fri, 29 Mar 2019 19:01:11 +0000"  >&lt;p&gt;James Nunez (jnunez@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/34543&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/34543&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10073&quot; title=&quot;lnet-selftest test_smoke: lst Error found&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10073&quot;&gt;&lt;del&gt;LU-10073&lt;/del&gt;&lt;/a&gt; tests: stop running smoke test for ARM&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: c890e0700c671258c6c3105b1eaa05c331ebca3e&lt;/p&gt;</comment>
                            <comment id="245350" author="gerrit" created="Mon, 8 Apr 2019 05:31:36 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/34543/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/34543/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10073&quot; title=&quot;lnet-selftest test_smoke: lst Error found&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10073&quot;&gt;&lt;del&gt;LU-10073&lt;/del&gt;&lt;/a&gt; tests: stop running smoke test&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: ddf3c0416790f74e10abc39543843e0de49b176e&lt;/p&gt;</comment>
                            <comment id="245438" author="pjones" created="Mon, 8 Apr 2019 17:38:50 +0000"  >&lt;p&gt;Landed for 2.13&lt;/p&gt;</comment>
                            <comment id="245440" author="jamesanunez" created="Mon, 8 Apr 2019 17:41:01 +0000"  >&lt;p&gt;The patch that landed adds test smoke to the ALWAYS_EXCEPT list.&lt;/p&gt;

&lt;p&gt;We do not have a solution for these failures. Thus, I&apos;m reopening the ticket.&lt;/p&gt;</comment>
                            <comment id="246766" author="adilger" created="Mon, 6 May 2019 23:22:55 +0000"  >&lt;p&gt;The &lt;tt&gt;lnet-selftest&lt;/tt&gt; has failed 5x in a row with patch &lt;a href=&quot;https://review.whamcloud.com/34800&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/34800&lt;/a&gt; &quot;&lt;tt&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11960&quot; title=&quot;Add missing libssl-dev DEB package&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11960&quot;&gt;&lt;del&gt;LU-11960&lt;/del&gt;&lt;/a&gt; build: Add missing libssl-dev DEB package&lt;/tt&gt;&quot; when running an Ubuntu 18.04 client, so it seems like Ubuntu is a 100% reproducer for this issue, and should allow debugging what is going wrong.&lt;/p&gt;</comment>
                            <comment id="246767" author="gerrit" created="Mon, 6 May 2019 23:25:25 +0000"  >&lt;p&gt;Andreas Dilger (adilger@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/34814&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/34814&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10073&quot; title=&quot;lnet-selftest test_smoke: lst Error found&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10073&quot;&gt;&lt;del&gt;LU-10073&lt;/del&gt;&lt;/a&gt; tests: stop running smoke test&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 8e3aa2e7ca5e35a68100aa564f901f49868b8a84&lt;/p&gt;</comment>
                            <comment id="247000" author="gerrit" created="Fri, 10 May 2019 21:03:56 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/34814/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/34814/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10073&quot; title=&quot;lnet-selftest test_smoke: lst Error found&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10073&quot;&gt;&lt;del&gt;LU-10073&lt;/del&gt;&lt;/a&gt; tests: stop running smoke test&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 0411b8139e18c243547535b5733b6408461ec72a&lt;/p&gt;</comment>
                            <comment id="250853" author="mdiep" created="Mon, 8 Jul 2019 19:17:23 +0000"  >&lt;p&gt;also hit it on el8 client &lt;a href=&quot;https://testing.whamcloud.com/test_sessions/39b4428a-bf50-4283-b6c2-bcc389e99b19&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sessions/39b4428a-bf50-4283-b6c2-bcc389e99b19&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="250915" author="jgmitter" created="Tue, 9 Jul 2019 17:28:08 +0000"  >&lt;p&gt;Hi Amir,&lt;/p&gt;

&lt;p&gt;Can you look into this failure?&lt;/p&gt;

&lt;p&gt;Thanks.&lt;br/&gt;
Joe&lt;/p&gt;</comment>
                            <comment id="251313" author="pjones" created="Fri, 12 Jul 2019 22:20:55 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=simmonsja&quot; class=&quot;user-hover&quot; rel=&quot;simmonsja&quot;&gt;simmonsja&lt;/a&gt; is this issue something on your radar?&lt;/p&gt;</comment>
                            <comment id="251314" author="simmonsja" created="Fri, 12 Jul 2019 22:22:26 +0000"  >&lt;p&gt;Does this happen on Power with RHEL 3.10 kernels?&lt;/p&gt;</comment>
                            <comment id="251315" author="pjones" created="Fri, 12 Jul 2019 22:26:23 +0000"  >&lt;p&gt;I &lt;b&gt;think&lt;/b&gt; that the ppc tests that hit it were running the same kernel as Arm - 4.14&lt;/p&gt;</comment>
                            <comment id="251574" author="gerrit" created="Wed, 17 Jul 2019 17:50:20 +0000"  >&lt;p&gt;Amir Shehata (ashehata@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/35540&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/35540&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10073&quot; title=&quot;lnet-selftest test_smoke: lst Error found&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10073&quot;&gt;&lt;del&gt;LU-10073&lt;/del&gt;&lt;/a&gt; lnet: increase transaction timeout&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 07b40bdd960d579a70c933f4d17faec844b211a4&lt;/p&gt;</comment>
                            <comment id="251604" author="ashehata" created="Thu, 18 Jul 2019 03:48:56 +0000"  >&lt;p&gt;I was able to reproduce this on onyx-121vm&lt;span class=&quot;error&quot;&gt;&amp;#91;1-4&amp;#93;&lt;/span&gt;.&#160; I then run &lt;tt&gt;lnetctl set transaction_timeout 60&lt;/tt&gt; once lnet_selftest is loaded up and when I re-ran lnet-selftest.sh the problem did not occur. I pushed the same change for test with autotest and the problem happens. Once the test failed I took over the cluster: onyx-30vm&lt;span class=&quot;error&quot;&gt;&amp;#91;1-4&amp;#93;&lt;/span&gt;, and ran lnet-selftest.sh manually (the same way I do with on onyx-121vm&lt;span class=&quot;error&quot;&gt;&amp;#91;1-4&amp;#93;&lt;/span&gt;) and it failed. I made sure that both clusters are setup with the same kernel and the same lustre build&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
onyx-121vm[1-2] 
----------------
lustre-iokit-2.12.56_1_g07b40bd-1.el8.x86_64
kmod-lustre-client-2.12.56_1_g07b40bd-1.el8.x86_64
lustre-client-tests-2.12.56_1_g07b40bd-1.el8.x86_64
kmod-lustre-client-tests-2.12.56_1_g07b40bd-1.el8.x86_64
lustre-client-2.12.56_1_g07b40bd-1.el8.x86_64
kernel-4.18.0-80.4.2.el8_0.x86_64
kernel-tools-libs-4.18.0-80.el8.x86_64
kernel-tools-4.18.0-80.el8.x86_64
kernel-4.18.0-80.el8.x86_64
kernel-core-4.18.0-80.4.2.el8_0.x86_64
kernel-core-4.18.0-80.el8.x86_64
kernel-modules-4.18.0-80.el8.x86_64
kernel-headers-4.18.0-80.el8.x86_64
kernel-modules-4.18.0-80.4.2.el8_0.x86_64

onyx-121vm[3-4]
----------------
kernel-tools-libs-3.10.0-957.21.3.el7_lustre.x86_64
kmod-lustre-osd-ldiskfs-2.12.56_1_g07b40bd-1.el7.x86_64
lustre-tests-2.12.56_1_g07b40bd-1.el7.x86_64
python-perf-3.10.0-957.21.3.el7_lustre.x86_64
kernel-tools-3.10.0-957.21.3.el7_lustre.x86_64
kernel-devel-3.10.0-957.21.3.el7_lustre.x86_64
lustre-osd-ldiskfs-mount-2.12.56_1_g07b40bd-1.el7.x86_64
lustre-2.12.56_1_g07b40bd-1.el7.x86_64
lustre-iokit-2.12.56_1_g07b40bd-1.el7.x86_64
kernel-3.10.0-957.21.3.el7_lustre.x86_64
kmod-lustre-2.12.56_1_g07b40bd-1.el7.x86_64
kmod-lustre-tests-2.12.56_1_g07b40bd-1.el7.x86_64
kernel-headers-3.10.0-957.21.3.el7_lustre.x86_64&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
onyx-30vm[1-2]
---------------
kmod-lustre-client-2.12.56_1_g07b40bd-1.el8.x86_64
lustre-client-2.12.56_1_g07b40bd-1.el8.x86_64
lustre-iokit-2.12.56_1_g07b40bd-1.el8.x86_64
lustre-client-tests-2.12.56_1_g07b40bd-1.el8.x86_64
kmod-lustre-client-tests-2.12.56_1_g07b40bd-1.el8.x86_64
kernel-modules-4.18.0-80.4.2.el8_0.x86_64
kernel-headers-4.18.0-80.el8.x86_64
kernel-core-4.18.0-80.el8.x86_64
kernel-modules-4.18.0-80.el8.x86_64
kernel-tools-libs-4.18.0-80.el8.x86_64
kernel-4.18.0-80.el8.x86_64
kernel-core-4.18.0-80.4.2.el8_0.x86_64
kernel-tools-4.18.0-80.el8.x86_64
kernel-4.18.0-80.4.2.el8_0.x86_64


onyx-30vm[3-4]
--------------
lustre-tests-2.12.56_1_g07b40bd-1.el7.x86_64
kmod-lustre-2.12.56_1_g07b40bd-1.el7.x86_64
kmod-lustre-tests-2.12.56_1_g07b40bd-1.el7.x86_64
lustre-2.12.56_1_g07b40bd-1.el7.x86_64
lustre-iokit-2.12.56_1_g07b40bd-1.el7.x86_64
kernel-3.10.0-957.21.3.el7_lustre.x86_64
lustre-osd-ldiskfs-mount-2.12.56_1_g07b40bd-1.el7.x86_64
kmod-lustre-osd-ldiskfs-2.12.56_1_g07b40bd-1.el7.x86_64
kernel-headers-3.10.0-957.1.3.el7.x86_64
kernel-tools-libs-3.10.0-957.1.3.el7.x86_64
kernel-3.10.0-957.1.3.el7.x86_64
kernel-tools-3.10.0-957.1.3.el7.x86_64
kernel-devel-3.10.0-957.1.3.el7.x86_64

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;The cluster set up by autotest has some differences in the rpms installed, but nothing that I would expect to cause this problem.&lt;/p&gt;

&lt;p&gt;To make sure it&apos;s not the transaction_timeout (health code) causing this, I turned the transaction timeout up to 9000 seconds, turned off retries, turned off health sensitivity, and reran the test. It still failed on the cluster set up by autotest.&lt;/p&gt;
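
&lt;p&gt;A sketch of the settings used for this check (option names assumed to be the standard lnetctl health parameters):&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
# effectively take the LNet health machinery out of the picture for this run
lnetctl set transaction_timeout 9000   # 60 was used in the earlier manual run
lnetctl set retry_count 0              # turn off retries
lnetctl set health_sensitivity 0       # turn off health sensitivity
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;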

&lt;p&gt;Results from onyx-121vm&lt;span class=&quot;error&quot;&gt;&amp;#91;1-4&amp;#93;&lt;/span&gt;:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 1 batch in stopping
Batch is stopped
c:
Total 0 error nodes in c
s:
Total 0 error nodes in s
session is ended
Total 0 error nodes in c
Total 0 error nodes in s
onyx-121vm4: onyx-121vm4.onyx.whamcloud.com: executing lst_cleanup
onyx-121vm3: onyx-121vm3.onyx.whamcloud.com: executing lst_cleanup
onyx-121vm1: onyx-121vm1.onyx.whamcloud.com: executing lst_cleanup
onyx-121vm1: onyx-121vm1.onyx.whamcloud.com: executing lst_cleanup
onyx-121vm2: onyx-121vm2.onyx.whamcloud.com: executing lst_cleanup
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Results from onyx-30vm&lt;span class=&quot;error&quot;&gt;&amp;#91;1-4&amp;#93;&lt;/span&gt;:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 killing 14697 ...
2 batch in stopping
Batch is stopped
12345-10.2.4.93@tcp: [Session 206 brw errors, 15 ping errors] [RPC: 117 errors, 113 dropped, 216 expired]
12345-10.2.4.94@tcp: [Session 230 brw errors, 30 ping errors] [RPC: 62 errors, 144 dropped, 253 expired]
c:
Total 2 error nodes in c
12345-10.2.4.95@tcp: [Session 232 brw errors, 15 ping errors] [RPC: 95 errors, 151 dropped, 247 expired]
12345-10.2.4.96@tcp: [Session 239 brw errors, 30 ping errors] [RPC: 72 errors, 101 dropped, 266 expired]
s:
Total 2 error nodes in s
session is ended
Total 2 error nodes in c
Total 2 error nodes in s
Starting client onyx-30vm1,onyx-30vm1.onyx.whamcloud.com,onyx-30vm2:  -o user_xattr,flock onyx-30vm4@tcp:/lustre /mnt/lustre
Started clients onyx-30vm1,onyx-30vm1.onyx.whamcloud.com,onyx-30vm2: 
10.2.4.96@tcp:/lustre on /mnt/lustre type lustre (rw,flock,user_xattr,lazystatfs)
10.2.4.96@tcp:/lustre on /mnt/lustre type lustre (rw,flock,user_xattr,lazystatfs)
10.2.4.96@tcp:/lustre on /mnt/lustre type lustre (rw,flock,user_xattr,lazystatfs)
10.2.4.96@tcp:/lustre on /mnt/lustre type lustre (rw,flock,user_xattr,lazystatfs)
10.2.4.96@tcp:/lustre on /mnt/lustre type lustre (rw,flock,user_xattr,lazystatfs)
onyx-30vm1: onyx-30vm1.onyx.whamcloud.com: executing set_default_debug vfstrace rpctrace dlmtrace neterror ha config ioctl &lt;span class=&quot;code-keyword&quot;&gt;super&lt;/span&gt; lfsck all 4
onyx-30vm2: onyx-30vm2.onyx.whamcloud.com: executing set_default_debug vfstrace rpctrace dlmtrace neterror ha config ioctl &lt;span class=&quot;code-keyword&quot;&gt;super&lt;/span&gt; lfsck all 4
 lnet-selftest test_smoke: @@@@@@ FAIL: lst Error found 
  Trace dump:
  = /usr/lib64/lustre/tests/test-framework.sh:6030:error()
  = /usr/lib64/lustre/tests/lnet-selftest.sh:161:check_lst_err()
  = /usr/lib64/lustre/tests/lnet-selftest.sh:187:test_smoke()
  = /usr/lib64/lustre/tests/test-framework.sh:6332:run_one()
  = /usr/lib64/lustre/tests/test-framework.sh:6371:run_one_logged()
  = /usr/lib64/lustre/tests/test-framework.sh:6217:run_test()
  = /usr/lib64/lustre/tests/lnet-selftest.sh:190:main()&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;When the test fails, I see two issues:&lt;/p&gt;

&lt;p&gt;1) BRW timeouts. Basically lots of:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
(socklnd_cb.c:2593:ksocknal_check_peer_timeouts()) Total 2 stale ZC_REQs &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; peer_ni 10.2.4.95@tcp detected; the oldest(0000000025bf6991) timed out 1 secs ago, resid: 0, wmem: 977752

(socklnd.c:1665:ksocknal_destroy_conn()) Completing partial receive from 12345-10.2.4.95@tcp[2], ip 10.2.4.95:7988, with error, wanted: 523368, left: 523368, last alive is 0 secs ago&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;2) lnet_selftest outputs a message stating that it&apos;s unable to get stats from 1 or 2 of the nodes.&lt;/p&gt;
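
&lt;p&gt;For reference, a sketch of the lst queries behind the per-node output shown earlier (group names &lt;tt&gt;c&lt;/tt&gt; and &lt;tt&gt;s&lt;/tt&gt; as used by lnet-selftest.sh):&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
# per-node session and RPC error counters, as printed in the console log
lst show_error c s
# live traffic counters per group; an unreachable node reports no stats
lst stat c s
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;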

&lt;p&gt;I tried to check the differences in HW between onyx-30 and onyx-121.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
onyx-121
---------
 [root@onyx-121 ~]# lscpu
Architecture:          x86_64
CPU op-mode(s):        32-bit, 64-bit
&lt;span class=&quot;code-object&quot;&gt;Byte&lt;/span&gt; Order:            Little Endian
CPU(s):                24
On-line CPU(s) list:   0-23
&lt;span class=&quot;code-object&quot;&gt;Thread&lt;/span&gt;(s) per core:    1
Core(s) per socket:    12
Socket(s):             2
NUMA node(s):          2
Vendor ID:             GenuineIntel
CPU family:            6
Model:                 62
Model name:            Intel(R) Xeon(R) CPU E5-2697 v2 @ 2.70GHz
Stepping:              4
CPU MHz:               3000.058
CPU max MHz:           3500.0000
CPU min MHz:           1200.0000
BogoMIPS:              5387.10
Virtualization:        VT-x
L1d cache:             32K
L1i cache:             32K
L2 cache:              256K
L3 cache:              30720K
NUMA node0 CPU(s):     0-11
NUMA node1 CPU(s):     12-23
Flags:                 fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm epb tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms xsaveopt dtherm ida arat pln pts

[root@onyx-121 ~]# lspci | grep -i eth
02:00.0 Ethernet controller: Intel Corporation I350 Gigabit Network Connection (rev 01)
02:00.1 Ethernet controller: Intel Corporation I350 Gigabit Network Connection (rev 01)
02:00.2 Ethernet controller: Intel Corporation I350 Gigabit Network Connection (rev 01)
02:00.3 Ethernet controller: Intel Corporation I350 Gigabit Network Connection (rev 01)&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 [root@onyx-30 ~]# lscpu
Architecture:          x86_64
CPU op-mode(s):        32-bit, 64-bit
&lt;span class=&quot;code-object&quot;&gt;Byte&lt;/span&gt; Order:            Little Endian
CPU(s):                32
On-line CPU(s) list:   0-31
&lt;span class=&quot;code-object&quot;&gt;Thread&lt;/span&gt;(s) per core:    2
Core(s) per socket:    8
Socket(s):             2
NUMA node(s):          2
Vendor ID:             GenuineIntel
CPU family:            6
Model:                 45
Model name:            Genuine Intel(R) CPU  @ 2.60GHz
Stepping:              5
CPU MHz:               2999.902
CPU max MHz:           3300.0000
CPU min MHz:           1200.0000
BogoMIPS:              5187.48
Virtualization:        VT-x
L1d cache:             32K
L1i cache:             32K
L2 cache:              256K
L3 cache:              20480K
NUMA node0 CPU(s):     0-7,16-23
NUMA node1 CPU(s):     8-15,24-31
Flags:                 fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx lahf_lm epb tpr_shadow vnmi flexpriority ept vpid xsaveopt dtherm ida arat pln pts

[root@onyx-30 ~]# lspci | grep -i eth
04:00.0 Ethernet controller: Intel Corporation I350 Gigabit Network Connection (rev 01)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;It appears that onyx-30 has only one ethernet device. trevis-41, another cluster the test failed on, has only 2 ethernet devices. onyx-121 has 4.&lt;/p&gt;

&lt;p&gt;It&apos;s also important to note that lnet-selftest.sh adds 71 tests. Running just 1 test doesn&apos;t cause a problem, so it appears that batching up so many tests in one go is what triggers the problem.&lt;/p&gt;
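
&lt;p&gt;For context, a rough sketch of how such a batch is built with lst (illustrative arguments, not the exact ones lnet-selftest.sh uses; NIDs taken from the log above):&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
lst new_session smoke
lst add_group c 10.2.4.93@tcp 10.2.4.94@tcp
lst add_group s 10.2.4.95@tcp 10.2.4.96@tcp
lst add_batch bulk_rw
# each add_test adds one more concurrent test to the same batch
lst add_test --batch bulk_rw --from c --to s ping
lst add_test --batch bulk_rw --concurrency 8 --from c --to s brw write size=4K
lst run bulk_rw
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;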

&lt;p&gt;I&apos;m investigating whether the number of interfaces accounts for the difference in behavior. So far this appears to be the most significant difference between the two clusters. Software-wise, both clusters have the same set of software installed.&lt;/p&gt;</comment>
                            <comment id="251734" author="ashehata" created="Fri, 19 Jul 2019 23:43:10 +0000"  >&lt;p&gt;After moving onyx-30vm&lt;span class=&quot;error&quot;&gt;&amp;#91;1-2&amp;#93;&lt;/span&gt; from RHEL8 clients to RHEL7.6 clients the issue is not reproducible any longer.&lt;/p&gt;

&lt;p&gt;Will start backing out socklnd patches to see if any of them is the problem.&lt;/p&gt;</comment>
                            <comment id="251738" author="adilger" created="Sat, 20 Jul 2019 09:16:03 +0000"  >&lt;blockquote&gt;
&lt;p&gt;After moving onyx-30vm&lt;span class=&quot;error&quot;&gt;&amp;#91;1-2&amp;#93;&lt;/span&gt; from RHEL8 clients to RHEL7.6 clients the issue is not reproducible any longer.&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;This is also true of ARM and Ubuntu clients - they cause constant failures, but x86 clients do not.  My thinking is that this relates to the specific kernel version used; ARM, Ubuntu, and RHEL8 just happen to run newer kernels that somehow changed the kernel socket interfaces.&lt;/p&gt;</comment>
                            <comment id="252037" author="ashehata" created="Thu, 25 Jul 2019 18:17:40 +0000"  >&lt;p&gt;I believe so too.&lt;/p&gt;

&lt;p&gt;I tried RHEL8 across all the VMs and the problem persists.&lt;/p&gt;

&lt;p&gt;I then increased the lnet_selftest rpc timeout to 256 seconds, and the test passed, i.e. no RPC errors or drops.&lt;/p&gt;

&lt;p&gt;I measured the time it takes to complete RPCs from lnet_selftest&apos;s perspective and noticed the following behavior:&lt;/p&gt;
&lt;ol&gt;
	&lt;li&gt;with one test in the batch, RPCs take a max of 1 second to complete&lt;/li&gt;
	&lt;li&gt;with two tests in the batch I see RPCs taking close to 10 seconds&lt;/li&gt;
	&lt;li&gt;As I increase the number of tests in the batch, RPCs take longer and longer to complete. With 40+ tests in the batch (which is what lnet-selftest.sh does) I see RPCs taking up to 130 seconds to complete.&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;I then went back to the previous setup with 2 RHEL8 clients and 2 RHEL7.6 servers, captured performance data using perf, and generated flame graphs:&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/33272/33272_perf-kernel-vm1.svg&quot; title=&quot;perf-kernel-vm1.svg attached to LU-10073&quot;&gt;perf-kernel-vm1.svg&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/33273/33273_perf-kernel-vm2.svg&quot; title=&quot;perf-kernel-vm2.svg attached to LU-10073&quot;&gt;perf-kernel-vm2.svg&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/33274/33274_perf-kernel-vm3.svg&quot; title=&quot;perf-kernel-vm3.svg attached to LU-10073&quot;&gt;perf-kernel-vm3.svg&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/33275/33275_perf-kernel-vm4.svg&quot; title=&quot;perf-kernel-vm4.svg attached to LU-10073&quot;&gt;perf-kernel-vm4.svg&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;&lt;/p&gt;
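
&lt;p&gt;(A typical capture sequence for such graphs, assuming Brendan Gregg&apos;s FlameGraph scripts; shown only to document the method:)&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
perf record -F 99 -a -g -- sleep 60    # sample all CPUs while the batch runs
perf script &gt; out.perf
./stackcollapse-perf.pl out.perf &gt; out.folded
./flamegraph.pl out.folded &gt; perf-kernel.svg
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;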

&lt;p&gt;There appears to be a key difference between the flame graphs captured on the RHEL8 VMs and the RHEL7.6 VMs: ksoftirqd/1 appears significantly less often on the RHEL7.6 VMs (~43 samples) than on the RHEL8 VMs (~7000 samples).&lt;/p&gt;

&lt;p&gt;My next steps are&lt;/p&gt;
&lt;ol&gt;
	&lt;li&gt;Attempt to reproduce this on physical nodes&lt;/li&gt;
	&lt;li&gt;Investigate why interrupt handling on RHEL 8 happens much more frequently, and whether it does so only on VMs or on physical machines as well&lt;/li&gt;
&lt;/ol&gt;


</comment>
                            <comment id="252101" author="ashehata" created="Fri, 26 Jul 2019 23:57:57 +0000"  >&lt;p&gt;I setup 2 nodes with Ubuntu 18: 4.15.0-45-generic and two nodes with centos 7.6: 3.10.0-957.21.3.el7_lustre.x86_64&lt;/p&gt;

&lt;p&gt;We couldn&apos;t install RHEL8 on the physical nodes, so the setup is not 100% the same as the VM one. Once the RHEL8 installation issue is resolved, I&apos;ll attempt the test again.&lt;/p&gt;

&lt;p&gt;The exact same script that failed on the VMs passed on the physical setup.&lt;/p&gt;

&lt;p&gt;I collected the flame graphs below, and there are no significant differences between Ubuntu 18 and CentOS 7.6 with regard to softirq handling.&lt;/p&gt;

&lt;p&gt;Ubuntu 18 client: &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/33281/33281_perf-kernel-121.svg&quot; title=&quot;perf-kernel-121.svg attached to LU-10073&quot;&gt;perf-kernel-121.svg&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;&lt;br/&gt;
Ubuntu 18 client: &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/33279/33279_perf-kernel-122.svg&quot; title=&quot;perf-kernel-122.svg attached to LU-10073&quot;&gt;perf-kernel-122.svg&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;&lt;br/&gt;
RHEL 7.6 server: &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/33278/33278_perf-kernel-123.svg&quot; title=&quot;perf-kernel-123.svg attached to LU-10073&quot;&gt;perf-kernel-123.svg&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;&lt;br/&gt;
RHEL 7.6 server: &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/33280/33280_perf-kernel-124.svg&quot; title=&quot;perf-kernel-124.svg attached to LU-10073&quot;&gt;perf-kernel-124.svg&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;This issue appears to be localized to VM setups. As far as I know, there haven&apos;t been any reports of this test failing on a physical setup.&lt;/p&gt;

&lt;p&gt;The VMs are started on a RHEL7.5 host. Could there be an interaction issue between host and VM? One thing to try is deploying the VMs on a RHEL8 host and re-running the test. There is currently a problem installing RHEL8 on physical nodes; I will try this once that issue is resolved.&lt;/p&gt;</comment>
                            <comment id="253640" author="yujian" created="Tue, 27 Aug 2019 00:50:05 +0000"  >&lt;p&gt;The failure occurred on RHEL 8.0 vm client+server against master branch:&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/test_sets/380744c0-c709-11e9-a25b-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/380744c0-c709-11e9-a25b-52540065bddc&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="256620" author="hornc" created="Fri, 18 Oct 2019 02:22:22 +0000"  >&lt;p&gt;+1 on master &lt;a href=&quot;https://testing.whamcloud.com/test_sessions/26e84ad7-8e0a-4307-a1f8-1c5281550588&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sessions/26e84ad7-8e0a-4307-a1f8-1c5281550588&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="261548" author="yujian" created="Tue, 21 Jan 2020 06:35:42 +0000"  >&lt;p&gt;+1 on RHEL 8.1 client testing: &lt;a href=&quot;https://testing.whamcloud.com/test_sets/e47aaeaa-3ba7-11ea-bb75-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/e47aaeaa-3ba7-11ea-bb75-52540065bddc&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="261554" author="yujian" created="Tue, 21 Jan 2020 08:17:26 +0000"  >&lt;p&gt;The failure also occurred on RHEL 7.7 client + server testing session: &lt;a href=&quot;https://testing.whamcloud.com/test_sets/10d17588-3bb7-11ea-adca-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/10d17588-3bb7-11ea-adca-52540065bddc&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="262682" author="gerrit" created="Wed, 5 Feb 2020 22:52:41 +0000"  >&lt;p&gt;James Nunez (jnunez@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/37450&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/37450&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10073&quot; title=&quot;lnet-selftest test_smoke: lst Error found&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10073&quot;&gt;&lt;del&gt;LU-10073&lt;/del&gt;&lt;/a&gt; tests: skip test smoke for PPC&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: c28c9cabd71b8b0d4e45e909acfd4c797176ed59&lt;/p&gt;</comment>
                            <comment id="263660" author="gerrit" created="Thu, 20 Feb 2020 07:15:33 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/37450/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/37450/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10073&quot; title=&quot;lnet-selftest test_smoke: lst Error found&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10073&quot;&gt;&lt;del&gt;LU-10073&lt;/del&gt;&lt;/a&gt; tests: skip test smoke for PPC&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 5ab2220687f4ef3a1d5b435f1e34f808723a9bf5&lt;/p&gt;</comment>
                            <comment id="272177" author="gerrit" created="Mon, 8 Jun 2020 00:25:46 +0000"  >&lt;p&gt;James Simmons (jsimmons@infradead.org) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/38857&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/38857&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10073&quot; title=&quot;lnet-selftest test_smoke: lst Error found&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10073&quot;&gt;&lt;del&gt;LU-10073&lt;/del&gt;&lt;/a&gt; tests: re-enable lnet selftest smoke test for PPC + ARM&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: ff47b8bfb0d507c8a75338b7ddfde4eef99d5bb6&lt;/p&gt;</comment>
                            <comment id="282516" author="pjones" created="Sun, 18 Oct 2020 14:49:52 +0000"  >&lt;p&gt;James&lt;/p&gt;

&lt;p&gt;It looks like this is an area that you are still looking into.&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="322232" author="gerrit" created="Mon, 10 Jan 2022 23:43:05 +0000"  >&lt;p&gt;&quot;James Simmons &amp;lt;jsimmons@infradead.org&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/46037&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/46037&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10073&quot; title=&quot;lnet-selftest test_smoke: lst Error found&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10073&quot;&gt;&lt;del&gt;LU-10073&lt;/del&gt;&lt;/a&gt; tests: re-enable lnet selftest smoke test 4.4+ kernels&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 29bb14ccb62a1bccf066d558d35658fb57ffca11&lt;/p&gt;</comment>
                            <comment id="322294" author="simmonsja" created="Tue, 11 Jan 2022 14:05:49 +0000"  >&lt;p&gt;Good news is that in my testing of LNet selftest on Ubuntu I didn&apos;t see any issues with newer kernels. Enabling it for RHEL8 maloo also passed. Most likely some recent LNet bug that was resolved fixed this issue. Due to the lack of ARM / PPC available for testing I can&apos;t prove if this is working on those platforms.&lt;/p&gt;</comment>
                            <comment id="322415" author="xinliang" created="Wed, 12 Jan 2022 06:40:40 +0000"  >&lt;p&gt;Let me check again, James. Last patch update time I didn&apos;t encounter this issue. I will run this test suite on arm again on master branch.&lt;/p&gt;</comment>
                            <comment id="322417" author="xinliang" created="Wed, 12 Jan 2022 07:27:11 +0000"  >&lt;p&gt;Tested with the patch on arm64. It passes.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
$ ./auster -vr lnet-selftest
...Batch is stopped
c:
Total 0 error nodes in c
s:
Total 0 error nodes in s
session is ended
Total 0 error nodes in c
Total 0 error nodes in s
lustre-aio: xxxx clients: &lt;span class=&quot;code-quote&quot;&gt;&apos;lustre-aio lustre-aio mds-01&apos;&lt;/span&gt;
lustre-aio: 1xxxx clients: &apos;lustre-aio
lustre-aio: mds-01&apos;
lustre-aio: 2xxxx clients: 2 &lt;span class=&quot;code-quote&quot;&gt;&apos;lustre-aio,mds-01&apos;&lt;/span&gt;
lustre-aio: lustre-aio: executing lst_cleanup
mds-01: liuxl-mds-test-01.novalocal: executing lst_cleanup
PASS smoke (322s) &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="322441" author="simmonsja" created="Wed, 12 Jan 2022 14:05:28 +0000"  >&lt;p&gt;Once PowerPC comes back I will test on that platform. I suspect this issue is resolved but we will have to wait and see.&lt;/p&gt;</comment>
                            <comment id="322970" author="gerrit" created="Tue, 18 Jan 2022 09:07:51 +0000"  >&lt;p&gt;&quot;Oleg Drokin &amp;lt;green@whamcloud.com&amp;gt;&quot; merged in patch &lt;a href=&quot;https://review.whamcloud.com/46037/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/46037/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10073&quot; title=&quot;lnet-selftest test_smoke: lst Error found&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10073&quot;&gt;&lt;del&gt;LU-10073&lt;/del&gt;&lt;/a&gt; tests: re-enable lnet selftest smoke test 4.4+ kernels&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 24424791ac7233e393a32be189fe77102857653b&lt;/p&gt;</comment>
                            <comment id="322998" author="simmonsja" created="Tue, 18 Jan 2022 14:08:40 +0000"  >&lt;p&gt;We still have Power8 to test on to verify it works.&lt;/p&gt;</comment>
                            <comment id="323001" author="gerrit" created="Tue, 18 Jan 2022 14:30:17 +0000"  >&lt;p&gt;&quot;James Simmons &amp;lt;jsimmons@infradead.org&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/46169&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/46169&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10073&quot; title=&quot;lnet-selftest test_smoke: lst Error found&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10073&quot;&gt;&lt;del&gt;LU-10073&lt;/del&gt;&lt;/a&gt; tests: re-enable lnet selftest smoke test for PPC + ARM&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 187eef8fd9cc5dce0014afe7db45c60caf7ee605&lt;/p&gt;</comment>
                            <comment id="336323" author="gerrit" created="Mon, 30 May 2022 18:43:50 +0000"  >&lt;p&gt;&quot;Oleg Drokin &amp;lt;green@whamcloud.com&amp;gt;&quot; merged in patch &lt;a href=&quot;https://review.whamcloud.com/38857/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/38857/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10073&quot; title=&quot;lnet-selftest test_smoke: lst Error found&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10073&quot;&gt;&lt;del&gt;LU-10073&lt;/del&gt;&lt;/a&gt; tests: re-enable lnet selftest smoke test for PPC + ARM&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: c25a4e1fd92523247a0fa5a5c2809321765df263&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="48924">LU-10157</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="53339">LU-11389</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="54620">LU-11878</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="49488">LU-10300</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="57609">LU-13063</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="55582">LU-12269</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="69981">LU-15781</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="33281" name="perf-kernel-121.svg" size="967699" author="ashehata" created="Fri, 26 Jul 2019 23:55:55 +0000"/>
                            <attachment id="33279" name="perf-kernel-122.svg" size="697288" author="ashehata" created="Fri, 26 Jul 2019 23:55:16 +0000"/>
                            <attachment id="33278" name="perf-kernel-123.svg" size="1193446" author="ashehata" created="Fri, 26 Jul 2019 23:55:08 +0000"/>
                            <attachment id="33280" name="perf-kernel-124.svg" size="775604" author="ashehata" created="Fri, 26 Jul 2019 23:55:28 +0000"/>
                            <attachment id="33272" name="perf-kernel-vm1.svg" size="120624" author="ashehata" created="Thu, 25 Jul 2019 18:08:25 +0000"/>
                            <attachment id="33273" name="perf-kernel-vm2.svg" size="134552" author="ashehata" created="Thu, 25 Jul 2019 18:08:36 +0000"/>
                            <attachment id="33274" name="perf-kernel-vm3.svg" size="205367" author="ashehata" created="Thu, 25 Jul 2019 18:08:52 +0000"/>
                            <attachment id="33275" name="perf-kernel-vm4.svg" size="193030" author="ashehata" created="Thu, 25 Jul 2019 18:08:59 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzzl93:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>