<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:40:24 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-4181] lnet-selftest test_smoke: lst error found</title>
                <link>https://jira.whamcloud.com/browse/LU-4181</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This issue was created by maloo for wangdi &amp;lt;di.wang@intel.com&amp;gt;&lt;/p&gt;

&lt;p&gt;This issue relates to the following test suite run: &lt;a href=&quot;http://maloo.whamcloud.com/test_sets/794b62ea-40b4-11e3-af95-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://maloo.whamcloud.com/test_sets/794b62ea-40b4-11e3-af95-52540035b04c&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;The sub-test test_smoke failed with the following error:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;Total 1 error nodes in c&lt;br/&gt;
Total 3 error nodes in s&lt;br/&gt;
 lnet-selftest test_smoke: @@@@@@ FAIL: lst Error found &lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;Info required for matching: lnet-selftest smoke&lt;/p&gt;</description>
                <environment></environment>
        <key id="21713">LU-4181</key>
            <summary>lnet-selftest test_smoke: lst error found</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="isaac">Isaac Huang</assignee>
                                    <reporter username="maloo">Maloo</reporter>
                        <labels>
                            <label>mn4</label>
                            <label>yuc2</label>
                    </labels>
                <created>Tue, 29 Oct 2013 20:41:08 +0000</created>
                <updated>Thu, 13 Oct 2016 18:45:18 +0000</updated>
                            <resolved>Tue, 5 Jan 2016 07:52:05 +0000</resolved>
                                    <version>Lustre 2.6.0</version>
                    <version>Lustre 2.4.2</version>
                    <version>Lustre 2.7.0</version>
                    <version>Lustre 2.4.3</version>
                    <version>Lustre 2.8.0</version>
                                    <fixVersion>Lustre 2.7.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>15</watches>
                                                                            <comments>
                            <comment id="70268" author="pjones" created="Wed, 30 Oct 2013 16:19:26 +0000"  >&lt;p&gt;Amir&lt;/p&gt;

&lt;p&gt;Could you please comment on this one?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="70449" author="di.wang" created="Thu, 31 Oct 2013 22:40:58 +0000"  >&lt;p&gt;Just disable lnet_selftest now for DNE &lt;a href=&quot;http://review.whamcloud.com/8130&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/8130&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="70459" author="adilger" created="Fri, 1 Nov 2013 04:07:06 +0000"  >&lt;p&gt;It looks like lnet-selftest has only been failing since 2013-10-25, but only for review-dne and review-dne-zfs tests.&lt;/p&gt;

&lt;p&gt;In &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/794b62ea-40b4-11e3-af95-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/794b62ea-40b4-11e3-af95-52540035b04c&lt;/a&gt; it looks like there is some kind of communication problem between the DNE nodes, possibly because there are more of nodes?  It seems some of the nodes are having problems with .218, except for the one that reports problems with .215.&lt;/p&gt;

&lt;p&gt;MDS 1 (wtm-21vm3):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;02:51:19:LNet: 12592:0:(rpc.c:1119:srpc_client_rpc_expired()) Client RPC expired: service 11, peer 12345-10.10.16.218@tcp, timeout 64.
02:51:19:LNet: 12592:0:(rpc.c:1119:srpc_client_rpc_expired()) Client RPC expired: service 11, peer 12345-10.10.16.218@tcp, timeout 64.
02:51:19:LustreError: 12591:0:(brw_test.c:333:brw_client_done_rpc()) BRW RPC to 12345-10.10.16.218@tcp failed with -110
02:51:19:LustreError: 12591:0:(brw_test.c:333:brw_client_done_rpc()) BRW RPC to 12345-10.10.16.218@tcp failed with -110
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Client 2 (wtm-21vm6):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;02:24:09:LNet: 18251:0:(rpc.c:1119:srpc_client_rpc_expired()) Client RPC expired: service 11, peer 12345-10.10.16.215@tcp, timeout 64.
02:24:09:LNet: 18251:0:(rpc.c:1119:srpc_client_rpc_expired()) Client RPC expired: service 11, peer 12345-10.10.16.215@tcp, timeout 64.
02:24:09:LustreError: 18249:0:(brw_test.c:333:brw_client_done_rpc()) BRW RPC to 12345-10.10.16.215@tcp failed with -110
02:24:09:LustreError: 18249:0:(brw_test.c:333:brw_client_done_rpc()) BRW RPC to 12345-10.10.16.215@tcp failed with -110
02:24:09:LustreError: 18249:0:(ping_test.c:135:ping_client_done_rpc()) Unable to ping 12345-10.10.16.220@tcp (21): -110
02:24:09:LustreError: 18249:0:(ping_test.c:135:ping_client_done_rpc()) Unable to ping 12345-10.10.16.220@tcp (22): -110
02:51:29:Lustre: 18248:0:(ping_test.c:82:ping_client_fini()) 16 pings have failed.
02:51:29:Lustre: DEBUG MARKER: /usr/sbin/lctl mark  lnet-selftest test_smoke: @@@@@@ FAIL: lst Error found 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;OST 9-16 (wtm-21vm8):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;02:51:15:LNet: 581:0:(rpc.c:1119:srpc_client_rpc_expired()) Client RPC expired: service 11, peer 12345-10.10.16.218@tcp, timeout 64.
02:51:15:LustreError: 580:0:(brw_test.c:333:brw_client_done_rpc()) BRW RPC to 12345-10.10.16.218@tcp failed with -110
02:51:15:LustreError: 580:0:(brw_test.c:333:brw_client_done_rpc()) BRW RPC to 12345-10.10.16.218@tcp failed with -110
02:51:15:Lustre: 579:0:(ping_test.c:82:ping_client_fini()) 8 pings have failed.
02:51:15:Lustre: 579:0:(ping_test.c:82:ping_client_fini()) 8 pings have failed.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;MDS 4 (wtm-21vm1):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;02:51:15:LNet: 28972:0:(rpc.c:1119:srpc_client_rpc_expired()) Client RPC expired: service 11, peer 12345-10.10.16.218@tcp, timeout 64.
02:51:15:LNet: 28972:0:(rpc.c:1119:srpc_client_rpc_expired()) Client RPC expired: service 11, peer 12345-10.10.16.218@tcp, timeout 64.
02:51:15:LustreError: 28971:0:(brw_test.c:333:brw_client_done_rpc()) BRW RPC to 12345-10.10.16.218@tcp failed with -110
02:51:15:LustreError: 28971:0:(brw_test.c:333:brw_client_done_rpc()) BRW RPC to 12345-10.10.16.218@tcp failed with -110
02:51:15:Lustre: 28970:0:(ping_test.c:82:ping_client_fini()) 5 pings have failed.
02:51:15:Lustre: 28970:0:(ping_test.c:82:ping_client_fini()) 5 pings have failed.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="71658" author="adilger" created="Fri, 15 Nov 2013 19:11:02 +0000"  >&lt;p&gt;Amir, any chance to look at this failure?&lt;/p&gt;</comment>
                            <comment id="71712" author="ashehata" created="Sat, 16 Nov 2013 00:51:06 +0000"  >&lt;p&gt;Is there a way to rerun the test and add some debug statements in the test.  the test gets the list of nids of servers and clients, I want to make sure that this list is sane and what is expected.  Could there be some assumptions based on the order of the nids, that is no longer true for DNE systems?&lt;/p&gt;</comment>
                            <comment id="71730" author="adilger" created="Sat, 16 Nov 2013 22:58:13 +0000"  >&lt;p&gt;Amir, the best bet would be to submit a patch to lnet-selftest.sh that adds the debugging you want and the run the test with:&lt;/p&gt;

&lt;p&gt; Test-Parameters: fortestonly testlist=lnet-selftest mdscount=4 osscount=2&lt;/p&gt;

&lt;p&gt;Or similar (please check wiki page to verify exact parameter names).&lt;/p&gt;</comment>
                            <comment id="71743" author="adilger" created="Mon, 18 Nov 2013 01:47:36 +0000"  >&lt;p&gt;The patch to disable lnet-selftest for DNE shows as landed on 11-08, but I still see four failures in review-dne in the past few days. Is that because those patches have not been rebased, or is the test not being skipped for some reason?&lt;/p&gt;</comment>
                            <comment id="72156" author="bogl" created="Fri, 22 Nov 2013 18:23:24 +0000"  >&lt;p&gt;maloo says &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/2d3946f0-539d-11e3-9901-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/2d3946f0-539d-11e3-9901-52540035b04c&lt;/a&gt; is this bug, but it was seen in review not in review-dne&lt;/p&gt;</comment>
                            <comment id="72171" author="ashehata" created="Fri, 22 Nov 2013 23:02:05 +0000"  >&lt;p&gt;Discussed this issue with Isaac, and we&apos;re suspecting that it could be a network issue.  In the original report there are many ping failures, which would indicate that one of the clients is unable to communicate with the server.  Is there a way to hop onto the test system that&apos;s having the problem and try and do lctl ping from the client to the server.  &lt;br/&gt;
Beside that, I&apos;m making a few changes to lnet-selftest to dump out more debug information, so I can tell if one of the clients is simply failing to communicate with the servers.&lt;/p&gt;</comment>
                            <comment id="73836" author="yujian" created="Thu, 19 Dec 2013 12:23:41 +0000"  >&lt;p&gt;Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_4/69/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b2_4/69/&lt;/a&gt; (2.4.2 RC1)&lt;br/&gt;
MDSCOUNT=4&lt;/p&gt;

&lt;p&gt;The same failure occurred:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/6af5f312-6879-11e3-a9a3-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/6af5f312-6879-11e3-a9a3-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="74522" author="adilger" created="Tue, 7 Jan 2014 22:45:19 +0000"  >&lt;p&gt;Since Chris changed the review-dne node configuration to 2x MDS + 1x OSS on 2013-12-26 as part of TEI-1312, it seems this is allowing lnet-selftest to pass again (on b2_5 where it is not excluded):&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/b040595e-7269-11e3-ab15-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/b040595e-7269-11e3-ab15-52540035b04c&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;It is probably worthwhile to submit a patch to re-enable this test to see if it is now working for review-dne runs on master.&lt;/p&gt;</comment>
                            <comment id="74839" author="sarah" created="Mon, 13 Jan 2014 18:37:05 +0000"  >&lt;p&gt;I just submit a patch to re-enable it: &lt;a href=&quot;http://review.whamcloud.com/#/c/8823/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/8823/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="75223" author="sarah" created="Fri, 17 Jan 2014 22:02:20 +0000"  >&lt;p&gt;the following result shows that this test can pass with current DNE config&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://maloo.whamcloud.com/test_sessions/4753c032-7dab-11e3-91f7-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sessions/4753c032-7dab-11e3-91f7-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="75243" author="adilger" created="Sat, 18 Jan 2014 17:46:59 +0000"  >&lt;p&gt;So it seems the problem is due to the number of nodes in the old review-dne config. We have since reduced the node count in the review-dne config to just one more than review-ldiskfs. &lt;/p&gt;

&lt;p&gt;However, since review-dne already takes much more time to run than review-ldiskfs (14h vs. 3h) it doesn&apos;t make sense to enable this on review-dne at this time, since the testing is equivalent.&lt;/p&gt;</comment>
                            <comment id="78137" author="sarah" created="Fri, 28 Feb 2014 23:05:37 +0000"  >&lt;p&gt;Hit this error in zfs testing&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/9235abcc-9f18-11e3-934b-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/9235abcc-9f18-11e3-934b-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="79498" author="yujian" created="Mon, 17 Mar 2014 08:26:15 +0000"  >&lt;p&gt;Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_4/73/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b2_4/73/&lt;/a&gt; (2.4.3 RC1)&lt;br/&gt;
Distro/Arch: RHEL6.4/x86_64&lt;br/&gt;
MDSCOUNT=4&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/00d71476-abee-11e3-bcad-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/00d71476-abee-11e3-bcad-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="81139" author="bogl" created="Mon, 7 Apr 2014 20:16:30 +0000"  >&lt;p&gt;another, in b2_5&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/0cf3118c-be8c-11e3-b5bd-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/0cf3118c-be8c-11e3-b5bd-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="94719" author="ashehata" created="Tue, 23 Sep 2014 15:25:58 +0000"  >&lt;p&gt;Digging through some of the dmesg log files I see the error messages below.  Some of these errors are repeated all over the log.  Probably will need someone more familiar with the lustre codebase to take a look and determine the cause of these errors.&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;LustreError: 11-0: lustre-MDT0000-mdc-ffff880037c15400: Communicating with 10.2.4.158@tcp, operation mds_reint failed with -19.
LustreError: 17968:0:(file.c:171:ll_close_inode_openhandle()) inode 144115205272520296 mdc close failed: rc = -4
LustreError: 166-1: MGC10.2.4.158@tcp: Connection to MGS (at 10.2.4.158@tcp) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will fail
LustreError: 21700:0:(lmv_obd.c:1524:lmv_statfs()) can&apos;t stat MDS #0 (lustre-MDT0000-mdc-ffff880037c15400), error -5
LustreError: 21700:0:(llite_lib.c:1610:ll_statfs_internal()) md_statfs fails: rc = -5
LustreError: 167-0: lustre-OST0000-osc-ffff880037c15400: This client was evicted by lustre-OST0000; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will fail.
LustreError: 11-0: MGC10.2.4.158@tcp: Communicating with 10.2.4.158@tcp, operation obd_ping failed with -107.
LustreError: 26751:0:(fail.c:133:__cfs_fail_timeout_set()) cfs_fail_timeout id 801 sleeping &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 20000ms
LustreError: 26751:0:(fail.c:137:__cfs_fail_timeout_set()) cfs_fail_timeout id 801 awake
LustreError: 11-0: lustre-OST0000-osc-ffff8800701a3400: Communicating with 10.2.4.159@tcp, operation ost_connect failed with -16.

LustreError: 167-0: lustre-OST0000-osc-ffff8800701a3400: This client was evicted by lustre-OST0000; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will fail.
LustreError: 29724:0:(osc_lock.c:833:osc_ldlm_completion_ast()) lock@ffff88007ac36ed0[3 3 0 1 1 00000000] W(2):[0, 18446744073709551615]@[0x100000000:0xe22:0x0] {
LustreError: 29724:0:(osc_lock.c:833:osc_ldlm_completion_ast())     lovsub@ffff88006cec7a20: [0 ffff88007bd6a470 W(2):[0, 18446744073709551615]@[0x200002b11:0x14:0x0]] 
LustreError: 29724:0:(osc_lock.c:833:osc_ldlm_completion_ast())     osc@ffff88006efaca28: ffff88007cf06d40    0x20040000001 0x27bfcba3c3cbb9cb 3 ffff8800698c9dd8 size: 0 mtime: 0 atime: 0 ctime: 0 blocks: 0
LustreError: 29724:0:(osc_lock.c:833:osc_ldlm_completion_ast()) } lock@ffff88007ac36ed0
LustreError: 29724:0:(osc_lock.c:833:osc_ldlm_completion_ast()) dlmlock returned -5
LustreError: 29724:0:(ldlm_resource.c:809:ldlm_resource_complain()) lustre-OST0000-osc-ffff8800701a3400: namespace resource [0xe22:0x0:0x0].0 (ffff88006af141c0) refcount nonzero (1) after lock cleanup; forcing cleanup.
LustreError: 29724:0:(ldlm_resource.c:1448:ldlm_resource_dump()) --- Resource: [0xe22:0x0:0x0].0 (ffff88006af141c0) refcount = 2
LustreError: 29724:0:(ldlm_resource.c:1451:ldlm_resource_dump()) Granted locks (in reverse order):
LustreError: 29724:0:(ldlm_resource.c:1454:ldlm_resource_dump()) ### ### ns: lustre-OST0000-osc-ffff8800701a3400 lock: ffff88007cf06d40/0x27bfcba3c3cbb9cb lrc: 4/0,1 mode: PW/PW res: [0xe22:0x0:0x0].0 rrc: 2 type: EXT [0-&amp;gt;18446744073709551615] (req 0-&amp;gt;4095) flags: 0x126400000000 nid: local remote: 0xbb105070585d6d1c expref: -99 pid: 29703 timeout: 0 lvb_type: 1
LustreError: 29703:0:(cl_lock.c:1422:cl_unuse_try()) result = -5, &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; is unlikely!
Lustre: lustre-OST0000-osc-ffff8800701a3400: Connection restored to lustre-OST0000 (at 10.2.4.159@tcp)
LustreError: 29703:0:(cl_lock.c:1437:cl_unuse_locked()) lock@ffff88007ac36df8[2 0 0 1 0 00000000] W(2):[0, 18446744073709551615]@[0x200002b11:0x14:0x0] {
LustreError: 29703:0:(cl_lock.c:1437:cl_unuse_locked())     vvp@ffff880037c86650: 
LustreError: 29703:0:(cl_lock.c:1437:cl_unuse_locked())     lov@ffff88007bd6a470: 1
LustreError: 29703:0:(cl_lock.c:1437:cl_unuse_locked())     0 0: ---
LustreError: 29703:0:(cl_lock.c:1437:cl_unuse_locked()) 
LustreError: 29703:0:(cl_lock.c:1437:cl_unuse_locked()) } lock@ffff88007ac36df8
LustreError: 29703:0:(cl_lock.c:1437:cl_unuse_locked()) unuse &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; -5

LustreError: 9132:0:(mdc_locks.c:918:mdc_enqueue()) ldlm_cli_enqueue: -2
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="95481" author="doug" created="Wed, 1 Oct 2014 22:26:20 +0000"  >&lt;p&gt;I thought when we test lnet_selftest, we are only loading the modules lnet and lnet_selftest.  Those errors are from Lustre.  What is it doing running during an lnet_selftest?&lt;/p&gt;</comment>
                            <comment id="95660" author="ashehata" created="Fri, 3 Oct 2014 19:44:08 +0000"  >&lt;p&gt;Continuing to look at the LustreErrors in the dmesg on the client on the latest failures.  Currently looking at: &lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sessions/2bf8aa7e-4ad7-11e4-8d48-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sessions/2bf8aa7e-4ad7-11e4-8d48-5254006e85c2&lt;/a&gt;&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;LustreError: 11-0: lustre-MDT0000-mdc-ffff88007c2b8800: Communicating with 10.2.4.131@tcp, operation obd_ping failed with -107. 
## 107 ENOTCONN
LustreError: 166-1: MGC10.2.4.131@tcp: Connection to MGS (at 10.2.4.131@tcp) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will fail
LustreError: 2373:0:(client.c:2804:ptlrpc_replay_interpret()) @@@ status 301, old was 0  req@ffff88007bb64c00 x1480913582033212/t4294967470(4294967470) o101-&amp;gt;lustre-MDT0000-mdc-ffff88007c2b8800@10.2.4.131@tcp:12/10 lens 632/544 e 0 to 0 dl 1412309449 ref 2 fl Interpret:RP/4/0 rc 301/301
LustreError: 5833:0:(mgc_request.c:517:do_requeue()) failed processing log: -5
LustreError: 11-0: lustre-MDT0000-mdc-ffff88007c2b8800: Communicating with 10.2.4.131@tcp, operation ldlm_enqueue failed with -95.

LustreError: 22804:0:(fail.c:132:__cfs_fail_timeout_set()) cfs_fail_timeout id 412 sleeping &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 1000ms
LustreError: 22804:0:(fail.c:136:__cfs_fail_timeout_set()) cfs_fail_timeout id 412 awake

LustreError: 29859:0:(osc_request.c:1983:osc_build_rpc()) prep_req failed: -12

LustreError: 133-1: lustre-OST0004-osc-ffff88007c2b8800: BAD READ CHECKSUM: from 10.2.4.132@tcp inode [0x0:0x0:0x0] object 0x0:1925 extent [0-1048575]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;These errors are repeated throughout the log.&lt;/p&gt;

&lt;p&gt;One observation, is that all the errors are on the tcp network not IB.  I&apos;m not familiar with the test configuration, so that could simply be because the test runs only on tcp network.&lt;/p&gt;

&lt;p&gt;On the server side I see the following LustreErrors:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;LustreError: 11-0: lustre-MDT0000-lwp-OST0000: Communicating with 10.2.4.95@tcp, operation mds_connect failed with -16.
LustreError: 11-0: MGC10.2.4.95@tcp: Communicating with 10.2.4.95@tcp, operation obd_ping failed with -107.
LustreError: 137-5: lustre-OST0006_UUID: not available &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; connect from 10.2.4.98@tcp (no target). If you are running an HA pair check that the target is mounted on the other server.
LustreError: 31991:0:(ldlm_lib.c:2106:target_stop_recovery_thread()) lustre-OST0000: Aborting recovery
LustreError: 168-f: BAD WRITE CHECKSUM: lustre-OST0000 from 12345-10.2.4.98@tcp inode [0x200000bd0:0xe626:0x0] object 0x0:1899 extent [0-1048575]: client csum f1e7c95b, server csum f1e7c95a
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="95998" author="ashehata" created="Thu, 9 Oct 2014 08:21:15 +0000"  >&lt;p&gt;We created a patch, where lnet-selftest.sh unmounts the file system (which it already does) and unloads the lustre modules (new), then ran lnet-selftest.sh on it 22 times.  And it passed.  I would like to commit that change to see if it resolves the errors that are occurring.&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/#/c/12214/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/12214/&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sessions/f9be7532-4f0b-11e4-92ea-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sessions/f9be7532-4f0b-11e4-92ea-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="96512" author="ashehata" created="Thu, 16 Oct 2014 18:24:09 +0000"  >&lt;p&gt;looked at the test failure from Jian Yu on 4/Oct/14 and it looks like there are many 108 (ESHUTDOWN) errors on one of the nodes, which could indicate that the lnet/self-test modules are being unloaded before the test is completed.  This would result in the lst failure.&lt;/p&gt;

&lt;p&gt;Does the test system attempt to unload modules after the test?  That could explain the errors.  As mentioned before, when I ran lnetselftest on its own 20 consecutive times, everything works.&lt;/p&gt;
</comment>
                            <comment id="96629" author="jhammond" created="Fri, 17 Oct 2014 19:59:52 +0000"  >&lt;p&gt;Please see &lt;a href=&quot;http://review.whamcloud.com/12332&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/12332&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="97814" author="jlevi" created="Wed, 29 Oct 2014 12:31:02 +0000"  >&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/#/c/12469/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/12469/&lt;/a&gt;&lt;/p&gt;
</comment>
                            <comment id="98029" author="yong.fan" created="Fri, 31 Oct 2014 04:21:12 +0000"  >&lt;p&gt;Another failure instance:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/034d0816-60b4-11e4-a66b-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/034d0816-60b4-11e4-a66b-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="98138" author="yong.fan" created="Sat, 1 Nov 2014 11:10:19 +0000"  >&lt;p&gt;Another failure instance:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/94b38298-61a0-11e4-8b49-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/94b38298-61a0-11e4-8b49-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="98282" author="jlevi" created="Tue, 4 Nov 2014 16:02:15 +0000"  >&lt;p&gt;Patch landed to Master.&lt;/p&gt;</comment>
                            <comment id="98967" author="jhammond" created="Wed, 12 Nov 2014 15:58:31 +0000"  >&lt;p&gt;The patch landed was a debug patch.&lt;/p&gt;</comment>
                            <comment id="98968" author="jhammond" created="Wed, 12 Nov 2014 16:00:00 +0000"  >&lt;p&gt;I saw an instance of this today with the new debug patch.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;https://testing.hpdd.intel.com/test_sets/8f6d3750-6a53-11e4-a7b0-5254006e85c2
https://testing.hpdd.intel.com/test_logs/9b2922d4-6a53-11e4-a7b0-5254006e85c2

00000001:02000400:1.0:1415666453.647581:0:10468:0:(debug.c:345:libcfs_debug_mark_buffer()) DEBUG MARKER: -----============= acceptance-small: lnet-selftest ============----- Tue Nov 11 00:40:53 UTC 2014
00000001:02000400:1.0:1415666454.125144:0:10613:0:(debug.c:345:libcfs_debug_mark_buffer()) DEBUG MARKER: /usr/sbin/lctl mark == lnet-selftest test complete, duration -o sec == 00:40:53 \(1415666453\)
00000001:02000400:0.0:1415666454.269086:0:10672:0:(debug.c:345:libcfs_debug_mark_buffer()) DEBUG MARKER: == lnet-selftest test complete, duration -o sec == 00:40:53 (1415666453)
00000001:02000400:0.0:1415666454.844848:0:10856:0:(debug.c:345:libcfs_debug_mark_buffer()) DEBUG MARKER: /usr/sbin/lctl list_nids | grep tcp | cut -f 1 -d &apos;@&apos;
00000001:02000400:0.0:1415666455.483429:0:11174:0:(debug.c:345:libcfs_debug_mark_buffer()) DEBUG MARKER: /usr/sbin/lctl mark excepting tests:
00000001:02000400:1.0:1415666455.618475:0:11232:0:(debug.c:345:libcfs_debug_mark_buffer()) DEBUG MARKER: excepting tests:

00000001:02000400:0.0:1415666458.017200:0:11346:0:(debug.c:345:libcfs_debug_mark_buffer()) DEBUG MARKER: /usr/sbin/lctl mark == lnet-selftest test smoke: lst regression test == 00:40:57 \(1415666457\)
00000001:02000400:0.0:1415666458.154280:0:11404:0:(debug.c:345:libcfs_debug_mark_buffer()) DEBUG MARKER: == lnet-selftest test smoke: lst regression test == 00:40:57 (1415666457)
00000001:02000400:0.0:1415666458.424825:0:11506:0:(debug.c:345:libcfs_debug_mark_buffer()) DEBUG MARKER: PATH=/usr/lib64/lustre/tests:/usr/lib/lustre/tests:/usr/lib64/lustre/tests:/opt/iozone/bin:/opt/iozone/bin:/usr/lib64/lustre/tests/mpi:/usr/lib64/lustre/tests/racer:/usr/lib64/lustre/../lustre-iokit/sgpdd-survey:/usr/lib64/lustre/tests:/usr/lib64/lustre/u
00000001:02000400:1.0:1415666459.001668:0:11759:0:(debug.c:345:libcfs_debug_mark_buffer()) DEBUG MARKER: PATH=/usr/lib64/lustre/tests:/usr/lib/lustre/tests:/usr/lib64/lustre/tests:/opt/iozone/bin:/opt/iozone/bin:/usr/lib64/lustre/tests/mpi:/usr/lib64/lustre/tests/racer:/usr/lib64/lustre/../lustre-iokit/sgpdd-survey:/usr/lib64/lustre/tests:/usr/lib64/lustre/u

00000400:00020000:1.0:1415666764.476159:0:414:0:(rpc.c:1470:srpc_lnet_ev_handler()) ev-&amp;gt;status = -125, ev-&amp;gt;type = 5, errors = 1, rpcs_sent = 17427, rpcs_rcvd = 16841, rpcs_dropped = 0, rpcs_expired = 0
00000001:02000400:1.0:1415666768.744246:0:12074:0:(debug.c:345:libcfs_debug_mark_buffer()) DEBUG MARKER: /usr/sbin/lctl mark  lnet-selftest test_smoke: @@@@@@ FAIL: lst Error found
00000001:02000400:1.0:1415666768.878468:0:12132:0:(debug.c:345:libcfs_debug_mark_buffer()) DEBUG MARKER: lnet-selftest test_smoke: @@@@@@ FAIL: lst Error found
00000001:02000400:1.0:1415666769.107358:0:12233:0:(debug.c:345:libcfs_debug_mark_buffer()) DEBUG MARKER: /usr/sbin/lctl dk &amp;gt; /logdir/test_logs/2014-11-10/lustre-reviews-el6-x86_64--review-ldiskfs--2_4_1__28281__-70078148216900-145241/lnet-selftest.test_smoke.debug_log.$(hostname -\
s).1415666768.log;

ev-&amp;gt;type == 5 == LNET_EVENT_SEND

t:lustre-release# errno 125
#define ECANCELED       125     /* Operation Canceled */
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Since the status is -ECANCELED and is reported about 300 seconds after the start of test smoke this supports the idea that the error is due to the test being shutdown.&lt;/p&gt;</comment>
                            <comment id="99713" author="yujian" created="Thu, 20 Nov 2014 18:47:58 +0000"  >&lt;p&gt;More instance on master: &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/c9b21b00-70bc-11e4-95d2-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/c9b21b00-70bc-11e4-95d2-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="99949" author="ashehata" created="Mon, 24 Nov 2014 18:37:38 +0000"  >&lt;p&gt;I looked at the other instances of these issues over the past few days and it appears that the failure is exactly the same across the different tests:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;ev-&amp;gt;status = -125 and ev-&amp;gt;type = 5
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Also the failure rate hasn&apos;t really changed after the patch landed. Approximately, 1 failure/day&lt;/p&gt;

&lt;p&gt;ECANCELED is set when the operation is canceled either due to a test being shutdown or selftest module being unloaded.  I don&apos;t think there is any issues with selftest, as far as I could tell.  &lt;/p&gt;

&lt;p&gt;Another possibility is that it could be a test system issue.&lt;/p&gt;

&lt;p&gt;All occurrences of the debug message added is towards the end of the log file.  Could it be that the test system is shutting down the test after it thinks it is complete, but the traffic hasn&apos;t been stopped yet, causing this error to occur?&lt;/p&gt;</comment>
                            <comment id="100254" author="yong.fan" created="Sun, 30 Nov 2014 12:22:51 +0000"  >&lt;p&gt;Another failure instance:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/8f027262-76ce-11e4-b1ab-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/8f027262-76ce-11e4-b1ab-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="100329" author="yujian" created="Mon, 1 Dec 2014 18:36:30 +0000"  >&lt;blockquote&gt;&lt;p&gt;Could it be that the test system is shutting down the test after it thinks it is complete, but the traffic hasn&apos;t been stopped yet, causing this error to occur?&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;In lnet-selftest.sh test_smoke():&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;    run_lst $runlst | tee $log
    rc=${PIPESTATUS[0]}
    [ $rc = 0 ] || error &lt;span class=&quot;code-quote&quot;&gt;&quot;$runlst failed: $rc&quot;&lt;/span&gt;
    
    lst_end_session --verbose | tee -a $log
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;We can add some codes before running lst_end_session to make sure the sessions are really finished. And it looks like the changes need to be made in the following codes in test_smoke_sub():&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;    echo $LST run b
    echo sleep 1
    echo &lt;span class=&quot;code-quote&quot;&gt;&quot;$LST stat --delay 10 --timeout 10 c s &amp;amp;&quot;&lt;/span&gt;
    echo &lt;span class=&quot;code-quote&quot;&gt;&apos;pid=$!&apos;&lt;/span&gt;
    echo &lt;span class=&quot;code-quote&quot;&gt;&apos;trap &lt;span class=&quot;code-quote&quot;&gt;&quot;cleanup $pid&quot;&lt;/span&gt; INT TERM&apos;&lt;/span&gt;
    echo sleep $smoke_DURATION
    echo &lt;span class=&quot;code-quote&quot;&gt;&apos;cleanup $pid&apos;&lt;/span&gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="101490" author="jlevi" created="Fri, 12 Dec 2014 18:33:54 +0000"  >&lt;p&gt;Isaac,&lt;br/&gt;
Can you please take this one?&lt;br/&gt;
Thank you&lt;/p&gt;</comment>
                            <comment id="102092" author="isaac" created="Fri, 19 Dec 2014 19:07:00 +0000"  >&lt;p&gt;I believe the root cause was that there&apos;s no way for a script to know whether tests are still running when end_session is called. In other words a script can&apos;t figure out when to end_session. The lst was mainly designed for interactive use - when the r/w stats dwindle down to near zero, the tests have completed and it&apos;s time for further actions (e.g. running more tests, or end_session and stop).&lt;/p&gt;

&lt;p&gt;The fix here is not to count it as an error, if lst tests are stopped from administrative actions (e.g. end_session from the remote test console, or local root user trying to unload the lnet_selftest kernel module). Such aren&apos;t network errors, and shouldn&apos;t be counted anyway.&lt;/p&gt;

&lt;p&gt;Increasing smoke_DURATION would reduce the chances of hitting such bogus errors but not eliminate them. It also comes at the cost of increasing test run time unnecessarily. So I wouldn&apos;t recommend it.&lt;/p&gt;</comment>
                            <comment id="102554" author="adilger" created="Mon, 5 Jan 2015 18:28:12 +0000"  >&lt;p&gt;Since test_smoke is just to verify that lnet selftest is working, it would probably be better to reduce the amount of work that is being done, instead of increasing the test duration further (it is already taking 2000s to complete).&lt;/p&gt;

&lt;p&gt;It might be possible to make the test size conditional on whether it is running in a VM or not. &lt;/p&gt;</comment>
                            <comment id="102566" author="sarah" created="Mon, 5 Jan 2015 19:21:16 +0000"  >&lt;p&gt;Hit this in tag-2.6.92 zfs testing:&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/d45d80fc-9363-11e4-b7aa-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/d45d80fc-9363-11e4-b7aa-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="102816" author="gerrit" created="Wed, 7 Jan 2015 22:06:19 +0000"  >&lt;p&gt;Isaac Huang (he.huang@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/13279&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/13279&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4181&quot; title=&quot;lnet-selftest test_smoke: lst error found&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4181&quot;&gt;&lt;del&gt;LU-4181&lt;/del&gt;&lt;/a&gt; lnet_selftest: bogus lst errors&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 827440ea6db076aceb2be3a65fd17cbcdcee0c2e&lt;/p&gt;</comment>
                            <comment id="103710" author="gerrit" created="Fri, 16 Jan 2015 03:25:56 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;http://review.whamcloud.com/13279/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/13279/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4181&quot; title=&quot;lnet-selftest test_smoke: lst error found&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4181&quot;&gt;&lt;del&gt;LU-4181&lt;/del&gt;&lt;/a&gt; lnet_selftest: bogus lst errors&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 8cfa7f80d3975298d5738091da7d9c28d6f5c9f5&lt;/p&gt;</comment>
                            <comment id="103833" author="sarah" created="Mon, 19 Jan 2015 06:12:28 +0000"  >&lt;p&gt;Didn&apos;t see this issue in build #2821(include the fix) and #2822, verified.&lt;/p&gt;</comment>
                            <comment id="103846" author="pjones" created="Mon, 19 Jan 2015 13:46:26 +0000"  >&lt;p&gt;Thanks Sarah.&lt;/p&gt;</comment>
                            <comment id="112020" author="sarah" created="Mon, 13 Apr 2015 18:29:16 +0000"  >&lt;p&gt;Hit this master branch build #2983 with EL7 client&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/f7370aee-df8a-11e4-bf2e-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/f7370aee-df8a-11e4-bf2e-5254006e85c2&lt;/a&gt;&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[LNet Rates of c]
[R] Avg: 272      RPC/s Min: 272      RPC/s Max: 272      RPC/s
[W] Avg: 265      RPC/s Min: 265      RPC/s Max: 265      RPC/s
[LNet Bandwidth of c]
[R] Avg: 23.06    MB/s  Min: 23.06    MB/s  Max: 23.06    MB/s
[W] Avg: 20.04    MB/s  Min: 20.04    MB/s  Max: 20.04    MB/s
killing 5477 ...
RPC failure, can&apos;t show error on 12345-10.2.4.205@tcp
12345-10.2.4.206@tcp: [Session 0 brw errors, 0 ping errors] [RPC: 2 errors, 0 dropped, 0 expired]
c:
Total 2 error nodes in c
12345-10.2.4.203@tcp: [Session 120 brw errors, 15 ping errors] [RPC: 62 errors, 58 dropped, 131 expired]
12345-10.2.4.204@tcp: [Session 120 brw errors, 15 ping errors] [RPC: 1 errors, 0 dropped, 135 expired]
s:
Total 2 error nodes in s
stop batch RPC failed on 12345-10.2.4.205@tcp: Unknown error -110
session is ended
Total 2 error nodes in c
Total 2 error nodes in s
 lnet-selftest test_smoke: @@@@@@ FAIL: lst Error found 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="112038" author="isaac" created="Mon, 13 Apr 2015 22:43:55 +0000"  >&lt;p&gt;This appeared to be a different issue: while previously the errors were bogus, here there were more than one hundred expired RPCs.&lt;/p&gt;</comment>
                            <comment id="137913" author="adilger" created="Tue, 5 Jan 2016 07:52:05 +0000"  >&lt;p&gt;Close this old ticket again.  New test failures are being tracked under &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6622&quot; title=&quot;lnet-selftest test_smoke: @@@@@@ FAIL: lst Error found&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6622&quot;&gt;&lt;del&gt;LU-6622&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                                        </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="40585">LU-8705</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzw78v:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>11318</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>