<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:18:44 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-8573] IOR: niobuf.c:319:ptlrpc_register_bulk()) ASSERTION( desc-&gt;bd_md_count == 0 ) failed</title>
                <link>https://jira.whamcloud.com/browse/LU-8573</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Attempted IOR test on both sets of OSTs. &lt;br/&gt;
No progress made on test.&lt;br/&gt;
Client and OST logs are full of:&lt;br/&gt;
OST&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;LustreError: 5245:0:(events.c:449:server_bulk_callback()) event type 3, status -113, desc ffff88104dc53200
LustreError: 9071:0:(events.c:449:server_bulk_callback()) event type 3, status -103, desc ffff880b0c956200
LustreError: 5245:0:(events.c:449:server_bulk_callback()) event type 5, status -113, desc ffff880b13e95200
LustreError: 5245:0:(events.c:449:server_bulk_callback()) event type 3, status -113, desc ffff880b13e95200
LustreError: 5245:0:(events.c:449:server_bulk_callback()) event type 5, status -113, desc ffff881032b5a800
LustreError: 5245:0:(events.c:449:server_bulk_callback()) event type 3, status -113, desc ffff881032b5a800
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Client&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[12378.313197] LustreError: 3138:0:(events.c:203:client_bulk_callback()) event type 1, status -5, desc ffff8807442f9c00
[12378.313223] LustreError: 3136:0:(events.c:203:client_bulk_callback()) event type 1, status -5, desc ffff8807442f9800
[12378.337649] LustreError: 3136:0:(events.c:203:client_bulk_callback()) event type 1, status -5, desc ffff8810102c4200
[12378.337671] LustreError: 3136:0:(events.c:203:client_bulk_callback()) event type 1, status -5, desc ffff880f55547400
[12378.337677] LustreError: 3138:0:(events.c:203:client_bulk_callback()) event type 1, status -5, desc ffff880729970000
[12378.362167] LustreError: 3137:0:(events.c:203:client_bulk_callback()) event type 1, status -5, desc ffff880804b78800
[12378.374324] LustreError: 3137:0:(events.c:203:client_bulk_callback()) event type 1, status -5, desc ffff8807299e5200
[12378.410886] LustreError: 3137:0:(events.c:203:client_bulk_callback()) event type 1, status -5, desc ffff880f53ce1c00
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Eventually, one client LBUGS&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[12377.942289] LustreError: 3153:0:(niobuf.c:319:ptlrpc_register_bulk()) ASSERTION( desc-&amp;gt;bd_md_count == 0 ) failed: 
[12378.564210] LustreError: 3153:0:(niobuf.c:319:ptlrpc_register_bulk()) LBUG
[12378.564210] LustreError: 3153:0:(niobuf.c:319:ptlrpc_register_bulk()) LBUG
[12378.571890] Pid: 3153, comm: ptlrpcd_01_02
[12378.576468]
Call Trace:
[12378.580868]  [&amp;lt;ffffffffa08557d3&amp;gt;] libcfs_debug_dumpstack+0x53/0x80 [libcfs]
[12378.588652]  [&amp;lt;ffffffffa0855d75&amp;gt;] lbug_with_loc+0x45/0xc0 [libcfs]
[12378.595597]  [&amp;lt;ffffffffa0c3e661&amp;gt;] ptlrpc_register_bulk+0x831/0x9c0 [ptlrpc]
[12378.603389]  [&amp;lt;ffffffffa08cace2&amp;gt;] ? LNetMDUnlink+0xe2/0x180 [lnet]
[12378.610322]  [&amp;lt;ffffffffa0c6be76&amp;gt;] ? sptlrpc_import_sec_ref+0x36/0x40 [ptlrpc]
[12378.618321]  [&amp;lt;ffffffffa0c3f1af&amp;gt;] ptl_send_rpc+0x1ff/0xda0 [ptlrpc]
[12378.625361]  [&amp;lt;ffffffffa0c39256&amp;gt;] ptlrpc_check_set.part.23+0x1896/0x1dd0 [ptlrpc]
[12378.633743]  [&amp;lt;ffffffffa0c397eb&amp;gt;] ptlrpc_check_set+0x5b/0xe0 [ptlrpc]
[12378.640976]  [&amp;lt;ffffffffa0c643fb&amp;gt;] ptlrpcd_check+0x4eb/0x5e0 [ptlrpc]
[12378.648095]  [&amp;lt;ffffffffa0c647ab&amp;gt;] ptlrpcd+0x2bb/0x560 [ptlrpc]
[12378.654613]  [&amp;lt;ffffffff810b8910&amp;gt;] ? default_wake_function+0x0/0x20
[12378.661550]  [&amp;lt;ffffffffa0c644f0&amp;gt;] ? ptlrpcd+0x0/0x560 [ptlrpc]
[12378.668067]  [&amp;lt;ffffffff810a5b2f&amp;gt;] kthread+0xcf/0xe0
[12378.673515]  [&amp;lt;ffffffff810a5a60&amp;gt;] ? kthread+0x0/0xe0
[12378.679058]  [&amp;lt;ffffffff81646a98&amp;gt;] ret_from_fork+0x58/0x90
[12378.685086]  [&amp;lt;ffffffff810a5a60&amp;gt;] ? kthread+0x0/0xe0
[12378.690626]
[12378.692396] Kernel panic - not syncing: LBUG
[12378.697164] CPU: 9 PID: 3153 Comm: ptlrpcd_01_02 Tainted: G           OE  ------------   3.10.0-327.28.3.el7.x86_64 #1
[12378.709098] Hardware name: Intel Corporation S2600GZ/S2600GZ, BIOS SE5C600.86B.02.01.0002.082220131453 08/22/2013
[12378.720548]  ffffffffa0872def 000000003210ab53 ffff881000c1ba80 ffffffff81636453
[12378.728852]  ffff881000c1bb00 ffffffff8162fce7 ffffffff00000008 ffff881000c1bb10
[12378.737165]  ffff881000c1bab0 000000003210ab53 ffffffffa0c90d30 0000000000000246
[12378.745463] Call Trace:
[12378.748204]  [&amp;lt;ffffffff81636453&amp;gt;] dump_stack+0x19/0x1b
[12378.753939]  [&amp;lt;ffffffff8162fce7&amp;gt;] panic+0xd8/0x1e7
[12378.759295]  [&amp;lt;ffffffffa0855ddb&amp;gt;] lbug_with_loc+0xab/0xc0 [libcfs]
[12378.766235]  [&amp;lt;ffffffffa0c3e661&amp;gt;] ptlrpc_register_bulk+0x831/0x9c0 [ptlrpc]
[12378.774009]  [&amp;lt;ffffffffa08cace2&amp;gt;] ? LNetMDUnlink+0xe2/0x180 [lnet]
[12378.780949]  [&amp;lt;ffffffffa0c6be76&amp;gt;] ? sptlrpc_import_sec_ref+0x36/0x40 [ptlrpc]
[12378.788950]  [&amp;lt;ffffffffa0c3f1af&amp;gt;] ptl_send_rpc+0x1ff/0xda0 [ptlrpc]
[12378.795981]  [&amp;lt;ffffffffa0c39256&amp;gt;] ptlrpc_check_set.part.23+0x1896/0x1dd0 [ptlrpc]
[12378.804368]  [&amp;lt;ffffffffa0c397eb&amp;gt;] ptlrpc_check_set+0x5b/0xe0 [ptlrpc]
[12378.811596]  [&amp;lt;ffffffffa0c643fb&amp;gt;] ptlrpcd_check+0x4eb/0x5e0 [ptlrpc]
[12378.818725]  [&amp;lt;ffffffffa0c647ab&amp;gt;] ptlrpcd+0x2bb/0x560 [ptlrpc]
[12378.825237]  [&amp;lt;ffffffff810b8910&amp;gt;] ? wake_up_state+0x20/0x20
[12378.831492]  [&amp;lt;ffffffffa0c644f0&amp;gt;] ? ptlrpcd_check+0x5e0/0x5e0 [ptlrpc]
[12378.838778]  [&amp;lt;ffffffff810a5b2f&amp;gt;] kthread+0xcf/0xe0
[12378.844224]  [&amp;lt;ffffffff810a5a60&amp;gt;] ? kthread_create_on_node+0x140/0x140
[12378.851510]  [&amp;lt;ffffffff81646a98&amp;gt;] ret_from_fork+0x58/0x90
[12378.857534]  [&amp;lt;ffffffff810a5a60&amp;gt;] ? kthread_create_on_node+0x140/0x140
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;vmcore is available on Spirit for analysis&lt;/p&gt;</description>
                <environment>Lustre-master 2.8.56_68_gd4a4c07 build 3430, RHEL7, Spirit performance cluster Revision: d4a4c0795b3befb87d47a5bf441adeba3b1c36f8&lt;br/&gt;
</environment>
        <key id="39247">LU-8573</key>
            <summary>IOR: niobuf.c:319:ptlrpc_register_bulk()) ASSERTION( desc-&gt;bd_md_count == 0 ) failed</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="hongchao.zhang">Hongchao Zhang</assignee>
                                    <reporter username="cliffw">Cliff White</reporter>
                        <labels>
                    </labels>
                <created>Wed, 31 Aug 2016 06:23:40 +0000</created>
                <updated>Sun, 2 Dec 2018 16:18:47 +0000</updated>
                            <resolved>Sat, 1 Dec 2018 05:34:34 +0000</resolved>
                                    <version>Lustre 2.9.0</version>
                    <version>Lustre 2.10.3</version>
                                    <fixVersion>Lustre 2.9.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>12</watches>
                                                                            <comments>
                            <comment id="163746" author="green" created="Wed, 31 Aug 2016 18:49:00 +0000"  >&lt;p&gt;What&apos;s the path to vmcore and the debuginfo kernel and modules to make use of the vmcore&lt;/p&gt;</comment>
                            <comment id="163765" author="cliffw" created="Wed, 31 Aug 2016 20:24:46 +0000"  >&lt;p&gt;The vmcore is in /scratch/dumps/spirit-33* on any Spirit node. The related RPMS are at /scratch/rpms or /scratch/hudson.&lt;/p&gt;</comment>
                            <comment id="164817" author="cliffw" created="Fri, 2 Sep 2016 15:06:28 +0000"  >&lt;p&gt;File a DCO ticket requesting access, and include your public key (DSA/RSA). The login is passwordless. &lt;br/&gt;
The spirit head node (where you login) has all the relevant data under /scratch&lt;br/&gt;
/scratch/dumps/&amp;lt;hostname&amp;gt;&lt;br/&gt;
/scratch/rpms/ or /scratch/hudson has the RPMS for each install. I am going on sabbatical, Frank Heckes is taking over and should be able to help you. &lt;/p&gt;</comment>
                            <comment id="165283" author="gerrit" created="Thu, 8 Sep 2016 11:36:08 +0000"  >&lt;p&gt;Hongchao Zhang (hongchao.zhang@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/22378&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/22378&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8573&quot; title=&quot;IOR: niobuf.c:319:ptlrpc_register_bulk()) ASSERTION( desc-&amp;gt;bd_md_count == 0 ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8573&quot;&gt;&lt;del&gt;LU-8573&lt;/del&gt;&lt;/a&gt; ptlrpc: always unregister bulk&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 5b1832e89bd0c9432592602149e8496e7c5d4fdf&lt;/p&gt;</comment>
                            <comment id="167617" author="heckes" created="Wed, 28 Sep 2016 17:48:57 +0000"  >&lt;p&gt;the error also occurs running &lt;a href=&quot;https://build.hpdd.intel.com/job/lustre-reviews/41665/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://build.hpdd.intel.com/job/lustre-reviews/41665/&lt;/a&gt; (&lt;a href=&quot;http://review.whamcloud.com/#/c/22739/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/22739/&lt;/a&gt; )&lt;br/&gt;
Test was running on centOS-7.2 using patch set 2 of original fix #22738&lt;br/&gt;
I&apos;ll attach debug logs later (approximately in 1 hour)&lt;/p&gt;

&lt;p&gt;P.S.: Sorry for using your account on spirit to push the change&lt;/p&gt;</comment>
                            <comment id="167705" author="hongchao.zhang" created="Thu, 29 Sep 2016 11:26:53 +0000"  >&lt;p&gt;Hi Frank,&lt;br/&gt;
Where do you put the debug logs?&lt;br/&gt;
Thanks!&lt;/p&gt;</comment>
                            <comment id="168131" author="cliffw" created="Mon, 3 Oct 2016 21:53:04 +0000"  >&lt;p&gt;Here are the debug logs from the OSTs, MDS and couple of clients&lt;/p&gt;</comment>
                            <comment id="168132" author="cliffw" created="Mon, 3 Oct 2016 21:57:09 +0000"  >&lt;p&gt;Syslog and console messages from the run&lt;/p&gt;</comment>
                            <comment id="168845" author="hongchao.zhang" created="Sun, 9 Oct 2016 09:15:09 +0000"  >&lt;p&gt;Hi Cliff,&lt;/p&gt;

&lt;p&gt;Thanks for the logs!&lt;br/&gt;
I&apos;m sorry that I can&apos;t find the logs of the client which encountered the &quot;ASSERTION( desc-&amp;gt;bd_md_count == 0 )&quot; in the attached logs,&lt;br/&gt;
could you attach the logs (syslog, console and debug log) of the client which encountered the ASSERTION?&lt;br/&gt;
Thanks!&lt;/p&gt;</comment>
                            <comment id="168943" author="cliffw" created="Mon, 10 Oct 2016 16:16:10 +0000"  >&lt;p&gt;Here is the system log with the assertion. It looks like the debug logs are no longer on the system.&lt;/p&gt;</comment>
                            <comment id="169072" author="hongchao.zhang" created="Tue, 11 Oct 2016 09:27:25 +0000"  >&lt;p&gt;Hi Cliff,&lt;/p&gt;

&lt;p&gt;In the newly attached log of spirit-33, the assertion occurred on Aug 31, and there is no more recent assertion.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Aug 31 01:36:14 spirit-33 kernel: Pid: 3153, comm: ptlrpcd_01_02
Aug 31 01:36:14 spirit-33 kernel: #012Call Trace:
Aug 31 01:36:14 spirit-33 kernel: [&amp;lt;ffffffffa08557d3&amp;gt;] libcfs_debug_dumpstack+0x53/0x80 [libcfs]
Aug 31 01:36:14 spirit-33 kernel: [&amp;lt;ffffffffa0855d75&amp;gt;] lbug_with_loc+0x45/0xc0 [libcfs]
Aug 31 01:36:14 spirit-33 kernel: [&amp;lt;ffffffffa0c3e661&amp;gt;] ptlrpc_register_bulk+0x831/0x9c0 [ptlrpc]
Aug 31 01:36:14 spirit-33 kernel: [&amp;lt;ffffffffa08cace2&amp;gt;] ? LNetMDUnlink+0xe2/0x180 [lnet]
Aug 31 01:36:14 spirit-33 kernel: [&amp;lt;ffffffffa0c6be76&amp;gt;] ? sptlrpc_import_sec_ref+0x36/0x40 [ptlrpc]
Aug 31 01:36:14 spirit-33 kernel: [&amp;lt;ffffffffa0c3f1af&amp;gt;] ptl_send_rpc+0x1ff/0xda0 [ptlrpc]
Aug 31 01:36:14 spirit-33 kernel: [&amp;lt;ffffffffa0c39256&amp;gt;] ptlrpc_check_set.part.23+0x1896/0x1dd0 [ptlrpc]
Aug 31 01:36:14 spirit-33 kernel: [&amp;lt;ffffffffa0c397eb&amp;gt;] ptlrpc_check_set+0x5b/0xe0 [ptlrpc]
Aug 31 01:36:14 spirit-33 kernel: [&amp;lt;ffffffffa0c643fb&amp;gt;] ptlrpcd_check+0x4eb/0x5e0 [ptlrpc]
Aug 31 01:36:14 spirit-33 kernel: [&amp;lt;ffffffffa0c647ab&amp;gt;] ptlrpcd+0x2bb/0x560 [ptlrpc]
Aug 31 01:36:14 spirit-33 kernel: [&amp;lt;ffffffff810b8910&amp;gt;] ? default_wake_function+0x0/0x20
Aug 31 01:36:14 spirit-33 kernel: [&amp;lt;ffffffffa0c644f0&amp;gt;] ? ptlrpcd+0x0/0x560 [ptlrpc]
Aug 31 01:36:14 spirit-33 kernel: [&amp;lt;ffffffff810a5b2f&amp;gt;] kthread+0xcf/0xe0
Aug 31 01:36:14 spirit-33 kernel: [&amp;lt;ffffffff810a5a60&amp;gt;] ? kthread+0x0/0xe0
Aug 31 01:36:14 spirit-33 kernel: [&amp;lt;ffffffff81646a98&amp;gt;] ret_from_fork+0x58/0x90
Aug 31 01:36:14 spirit-33 kernel: [&amp;lt;ffffffff810a5a60&amp;gt;] ? kthread+0x0/0xe0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="169185" author="cliffw" created="Tue, 11 Oct 2016 17:45:26 +0000"  >&lt;p&gt;Yes, August 31 was the last time this test was run.&lt;/p&gt;</comment>
                            <comment id="169255" author="hongchao.zhang" created="Wed, 12 Oct 2016 10:28:56 +0000"  >&lt;p&gt;But Frank said it should run around Sept 29.&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;Frank Heckes added a comment - 29/Sep/16 1:48 AM&lt;/p&gt;

&lt;p&gt;the error also occurs running &lt;a href=&quot;https://build.hpdd.intel.com/job/lustre-reviews/41665/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://build.hpdd.intel.com/job/lustre-reviews/41665/&lt;/a&gt; (&lt;a href=&quot;http://review.whamcloud.com/#/c/22739/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/22739/&lt;/a&gt; )&lt;br/&gt;
Test was running on centOS-7.2 using patch set 2 of original fix #22738&lt;/p&gt;&lt;/blockquote&gt;</comment>
                            <comment id="169981" author="cliffw" created="Mon, 17 Oct 2016 18:14:42 +0000"  >&lt;p&gt;Test again with latest master, still have this issue. LDEV-510 has more details&lt;/p&gt;</comment>
                            <comment id="169992" author="cliffw" created="Mon, 17 Oct 2016 18:48:42 +0000"  >&lt;p&gt;Repeated test with latest master, same result. Nodes are currently halted with this issue, can be examined.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Oct 17 17:13:27 spirit-5 kernel: Lustre: DEBUG MARKER: == parallel-scale test iorfpp: iorfpp ================================================================ 17:13:46 (1476724426)
Oct 17 17:13:27 spirit-5 sshd[23824]: Received disconnect from 10.10.1.11: 11: disconnected by user
Oct 17 17:13:27 spirit-5 sshd[23824]: pam_unix(sshd:session): session closed &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; user root
Oct 17 17:13:27 spirit-5 systemd-logind: Removed session 94.
Oct 17 17:13:31 spirit-5 kernel: Lustre: lustre-OST000c: Client 2af8c060-dda3-f8ed-5532-73d48d5bec75 (at 192.168.1.21@o2ib) reconnecting
Oct 17 17:13:31 spirit-5 kernel: Lustre: lustre-OST000c: Connection restored to 78757673-8c34-2ea0-d2f0-d29a18f40898 (at 192.168.1.16@o2ib)
Oct 17 17:13:31 spirit-5 kernel: Lustre: Skipped 64 previous similar messages
Oct 17 17:13:31 spirit-5 kernel: LustreError: 19542:0:(ldlm_lib.c:3184:target_bulk_io()) @@@ bulk WRITE failed: rc -107  req@ffff88074fd57c50 x1548457026272736/t0(0) o4-&amp;gt;b765b4c7-de22-04f2-d85c-19567447ab65@192.168.1.15@o2ib:-1/-1 lens 608/448 e 0 to 0 dl 1476724421 ref 1 fl Interpret:/0/0 rc 0/0
Oct 17 17:13:31 spirit-5 kernel: LustreError: 19542:0:(ldlm_lib.c:3184:target_bulk_io()) Skipped 12 previous similar messages
Oct 17 17:13:31 spirit-5 kernel: Lustre: lustre-OST0000: Bulk IO write error with b765b4c7-de22-04f2-d85c-19567447ab65 (at 192.168.1.15@o2ib), client will retry: rc -107
Oct 17 17:13:31 spirit-5 kernel: Lustre: Skipped 145 previous similar messages
Oct 17 17:13:31 spirit-5 kernel: LustreError: 19363:0:(ldlm_lib.c:3234:target_bulk_io()) @@@ Reconnect on bulk WRITE  req@ffff88074fd53050 x1548457087089584/t0(0) o4-&amp;gt;2af8c060-dda3-f8ed-5532-73d48d5bec75@192.168.1.21@o2ib:-1/-1 lens 608/448 e 0 to 0 dl 1476724421 ref 1 fl Interpret:/0/0 rc 0/0
Oct 17 17:13:31 spirit-5 kernel: LustreError: 19363:0:(ldlm_lib.c:3234:target_bulk_io()) Skipped 1319 previous similar messages
...
Oct 17 17:14:28 spirit-5 kernel: LNetError: 14218:0:(o2iblnd_cb.c:3131:kiblnd_check_txs_locked()) Timed out tx: active_txs, 7 seconds
Oct 17 17:14:28 spirit-5 kernel: LNetError: 14218:0:(o2iblnd_cb.c:3131:kiblnd_check_txs_locked()) Skipped 3 previous similar messages
Oct 17 17:14:28 spirit-5 kernel: LNetError: 14218:0:(o2iblnd_cb.c:3194:kiblnd_check_conns()) Timed out RDMA with 192.168.1.11@o2ib (1): c: 103, oc: 0, rc: 124
Oct 17 17:14:28 spirit-5 kernel: LNetError: 14218:0:(o2iblnd_cb.c:3194:kiblnd_check_conns()) Skipped 3 previous similar messages
Oct 17 17:14:28 spirit-5 kernel: LustreError: 14218:0:(events.c:446:server_bulk_callback()) event type 5, status -103, desc ffff880faf285a00
Oct 17 17:14:28 spirit-5 kernel: LustreError: 14218:0:(events.c:446:server_bulk_callback()) event type 3, status -103, desc ffff880faf285a00
Oct 17 17:14:28 spirit-5 kernel: LustreError: 14218:0:(events.c:446:server_bulk_callback()) event type 5, status -103, desc ffff88084d834e00
Oct 17 17:14:28 spirit-5 kernel: LustreError: 14218:0:(events.c:446:server_bulk_callback()) event type 3, status -103, desc ffff88084d834e00
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Client side&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Oct 17 17:14:47 spirit-11 kernel: LNetError: 762:0:(o2iblnd_cb.c:1074:kiblnd_init_rdma()) RDMA is too large &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; peer 192.168.1.5@o2ib (131072), src size: 1024000 dst size: 1024000
Oct 17 17:14:47 spirit-11 kernel: LNetError: 762:0:(o2iblnd_cb.c:1698:kiblnd_reply()) Can&apos;t setup rdma &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; GET from 192.168.1.5@o2ib: -90
Oct 17 17:14:47 spirit-11 kernel: LNetError: 762:0:(o2iblnd_cb.c:1698:kiblnd_reply()) Skipped 10 previous similar messages
Oct 17 17:14:47 spirit-11 kernel: LustreError: 762:0:(events.c:199:client_bulk_callback()) event type 1, status -5, desc ffff88080cef7a00
Oct 17 17:14:47 spirit-11 kernel: Lustre: 4800:0:(client.c:2111:ptlrpc_expire_one_request()) @@@ Request sent has failed due to network error: [sent 1476724429/real 1476724429]  req@ffff8807c3730300 x1548457007399104/t0(0) o4-&amp;gt;lustre-OST000c-osc-ffff880036709000@192.168.1.5@o2ib:6/4 lens 608/448 e 6 to 1 dl 1476724517 ref 2 fl Rpc:eX/2/ffffffff rc 0/-1
Oct 17 17:14:47 spirit-11 kernel: LustreError: 4787:0:(events.c:199:client_bulk_callback()) event type 1, status -5, desc ffff88080c746600
Oct 17 17:14:47 spirit-11 kernel: Lustre: lustre-OST0012-osc-ffff880036709000: Connection to lustre-OST0012 (at 192.168.1.5@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Oct 17 17:14:47 spirit-11 kernel: Lustre: Skipped 9 previous similar messages
Oct 17 17:14:47 spirit-11 kernel: LustreError: 4785:0:(events.c:199:client_bulk_callback()) event type 1, status -5, desc ffff880ffa415800
Oct 17 17:14:47 spirit-11 kernel: Lustre: lustre-OST0012-osc-ffff880036709000: Connection restored to 192.168.1.5@o2ib (at 192.168.1.5@o2ib)
Oct 17 17:14:47 spirit-11 kernel: Lustre: Skipped 9 previous similar messages
Oct 17 17:14:47 spirit-11 kernel: Lustre: 4800:0:(client.c:2111:ptlrpc_expire_one_request()) Skipped 12 previous similar messages
Oct 17 17:14:47 spirit-11 kernel: LustreError: 4791:0:(events.c:199:client_bulk_callback()) event type 1, status -5, desc ffff8807c360b800
Oct 17 17:14:47 spirit-11 kernel: LustreError: 4788:0:(events.c:199:client_bulk_callback()) event type 1, status -5, desc ffff8807c360ba00
Oct 17 17:14:47 spirit-11 kernel: LustreError: 4791:0:(events.c:199:client_bulk_callback()) event type 1, status -5, desc ffff8807c3711e00
Oct 17 17:14:47 spirit-11 kernel: LustreError: 4791:0:(events.c:199:client_bulk_callback()) event type 1, status -5, desc ffff88080cdc2a00
Oct 17 17:14:47 spirit-11 kernel: LustreError: 4789:0:(events.c:199:client_bulk_callback()) event type 1, status -5, desc ffff88080c014400
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="169993" author="cliffw" created="Mon, 17 Oct 2016 18:49:34 +0000"  >&lt;p&gt;Lustre logs dumped from client (spirit-11) and OSS nodes with debug=-1. Logs dumped by hand after failure. &lt;/p&gt;</comment>
                            <comment id="170039" author="hongchao.zhang" created="Tue, 18 Oct 2016 00:53:50 +0000"  >&lt;p&gt;Hi Cliff,&lt;/p&gt;

&lt;p&gt;Thanks!&lt;br/&gt;
Could you please attach the logs of the node where the &quot;ASSERTION( desc-&amp;gt;bd_md_count == 0 )&quot; is encountered? I can&apos;t find it in the attached logs (spirit-5,6,11).&lt;/p&gt;</comment>
                            <comment id="170198" author="cliffw" created="Tue, 18 Oct 2016 17:43:55 +0000"  >&lt;p&gt;The logs were taken several hours after the failure, if the ASSERT isn&apos;t in those logs, then it must have been over-written. I can re-run the test, and you can login to the nodes if you wish.&lt;/p&gt;</comment>
                            <comment id="170708" author="hongchao.zhang" created="Mon, 24 Oct 2016 07:41:01 +0000"  >&lt;p&gt;Hi Cliff, &lt;/p&gt;

&lt;p&gt;I saw there are several tests of &quot;parallel-scale test iorfpp: iorfpp&quot; after &quot;Oct 18 00:00:00&quot; at spirit-11. Do you still use spirit-11 to test&lt;br/&gt;
this issue? There are no &quot;client_bulk_callback&quot; errors in the following tests and no &quot;ASSERTION&quot; was found in /var/log/messages&lt;br/&gt;
(the messages after Sep 26).&lt;/p&gt;</comment>
                            <comment id="170718" author="yong.fan" created="Mon, 24 Oct 2016 08:54:41 +0000"  >&lt;p&gt;With the patch &lt;a href=&quot;http://review.whamcloud.com/#/c/22739/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/22739/&lt;/a&gt; applied, there should be no &quot;ASSERTION( desc-&amp;gt;bd_md_count == 0 )&quot; any longer, because the patch deregisters the RPC&apos;s MD unconditionally before such ASSERTION() check. That is almost equal to removing the ASSERTION(). But in fact, such ASSERTION() means we assume that the RPC environment should be sane, with no need to force cleanup by deregistering the RPC&apos;s MD. So we should find out why the RPC environment is not sane at that time, not simply &apos;bypass&apos; such ASSERTION().&lt;/p&gt;

&lt;p&gt;On the other hand, this is NOT to say that the patch &lt;a href=&quot;http://review.whamcloud.com/#/c/22739/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/22739/&lt;/a&gt; is meaningless, because if we still trigger the ASSERTION() with the patch applied, that means there are other logic paths not handled properly.&lt;/p&gt;</comment>
                            <comment id="170730" author="bzzz" created="Mon, 24 Oct 2016 12:44:28 +0000"  >&lt;p&gt;I&apos;m not familiar with the related code (still studying..), but it looks like ptlrpc_unregister_bulk() is missing in some codepath..&lt;/p&gt;</comment>
                            <comment id="170921" author="gerrit" created="Tue, 25 Oct 2016 07:47:50 +0000"  >&lt;p&gt;Fan Yong (fan.yong@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/23354&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/23354&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8573&quot; title=&quot;IOR: niobuf.c:319:ptlrpc_register_bulk()) ASSERTION( desc-&amp;gt;bd_md_count == 0 ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8573&quot;&gt;&lt;del&gt;LU-8573&lt;/del&gt;&lt;/a&gt; ptlrpc: cleanup bulks for ptlrpc_register_bulk failure&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 683841cc2ea9ee0d16f9fe1c5baf77ed668c15fb&lt;/p&gt;</comment>
                            <comment id="171240" author="cliffw" created="Wed, 26 Oct 2016 19:27:47 +0000"  >&lt;p&gt;IOR continues to fail&lt;br/&gt;
Server logs from start of tests&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Oct 26 18:45:05 spirit-aeon-1 kernel: Lustre: zfstest-OST0002: Client de07d034-8d22-f659-1cbd-768d0030c923 (at 192.168.1.26@o2ib) reconnecting
Oct 26 18:45:05 spirit-aeon-1 kernel: Lustre: zfstest-OST0002: Connection restored to de07d034-8d22-f659-1cbd-768d0030c923 (at 192.168.1.26@o2ib)
Oct 26 18:45:05 spirit-aeon-1 kernel: Lustre: Skipped 9 previous similar messages
Oct 26 18:45:05 spirit-aeon-1 kernel: LustreError: 29216:0:(ldlm_lib.c:3183:target_bulk_io()) @@@ bulk WRITE failed: rc -107  req@ffff8808089b4850 x1549270578154064/t0(0) o4-&amp;gt;7522edd5-eea9-2c67-68cd-2b45f3964653@192.168.1.17@o2ib:-1/-1 lens 608/448 e 0 to 0 dl 1477507516 ref 1 fl Interpret:/0/0 rc 0/0
Oct 26 18:45:05 spirit-aeon-1 kernel: Lustre: zfstest-OST0002: Bulk IO write error with 7522edd5-eea9-2c67-68cd-2b45f3964653 (at 192.168.1.17@o2ib), client will retry: rc -107
Oct 26 18:45:05 spirit-aeon-1 kernel: Lustre: zfstest-OST0000: Client ec385605-4e60-f4be-b60f-cd3e019315dd (at 192.168.1.21@o2ib) reconnecting
Oct 26 18:45:05 spirit-aeon-1 kernel: Lustre: Skipped 23 previous similar messages
Oct 26 18:45:06 spirit-aeon-1 kernel: LustreError: 25234:0:(ldlm_lib.c:3233:target_bulk_io()) @@@ Reconnect on bulk WRITE  req@ffff8808089b7850 x1549270581336640/t0(0) o4-&amp;gt;de07d034-8d22-f659-1cbd-768d0030c923@192.168.1.26@o2ib:-1/-1 lens 608/448 e 0 to 0 dl 1477507516 ref 1 fl Interpret:/0/0 rc 0/0
Oct 26 18:45:06 spirit-aeon-1 kernel: LustreError: 25234:0:(ldlm_lib.c:3233:target_bulk_io()) Skipped 2 previous similar messages
Oct 26 18:45:06 spirit-aeon-1 kernel: Lustre: zfstest-OST0002: Connection restored to 027d9577-68a8-de83-149f-38f3670e21f2 (at 192.168.1.14@o2ib)
Oct 26 18:45:06 spirit-aeon-1 kernel: Lustre: Skipped 27 previous similar messages
Oct 26 18:45:06 spirit-aeon-1 kernel: LustreError: 29238:0:(ldlm_lib.c:3233:target_bulk_io()) @@@ Reconnect on bulk WRITE  req@ffff880808853850 x1549270577110928/t0(0) o4-&amp;gt;77fffece-ca00-9432-9fd5-857674816ee4@192.168.1.28@o2ib:-1/-1 lens 608/448 e 0 to 0 dl 1477507516 ref 1 fl Interpret:/0/0 rc 0/0
Oct 26 18:45:06 spirit-aeon-1 kernel: Lustre: zfstest-OST0000: Client de07d034-8d22-f659-1cbd-768d0030c923 (at 192.168.1.26@o2ib) reconnecting
Oct 26 18:45:06 spirit-aeon-1 kernel: Lustre: Skipped 19 previous similar messages
Oct 26 18:45:06 spirit-aeon-1 kernel: LustreError: 29238:0:(ldlm_lib.c:3233:target_bulk_io()) Skipped 24 previous similar messages
Oct 26 18:45:07 spirit-aeon-1 kernel: LustreError: 29274:0:(ldlm_lib.c:3183:target_bulk_io()) @@@ bulk WRITE failed: rc -107  req@ffff88007fd76850 x1549270578161152/t0(0) o4-&amp;gt;1b29c57b-535c-46b6-486f-7405da7c1796@192.168.1.18@o2ib:-1/-1 lens 608/448 e 0 to 0 dl 1477507516 ref 1 fl Interpret:/2/0 rc 0/0
Oct 26 18:45:07 spirit-aeon-1 kernel: Lustre: zfstest-OST0000: Bulk IO write error with 1b29c57b-535c-46b6-486f-7405da7c1796 (at 192.168.1.18@o2ib), client will retry: rc -107
Oct 26 18:45:07 spirit-aeon-1 kernel: LustreError: 29261:0:(ldlm_lib.c:3233:target_bulk_io()) @@@ Reconnect on bulk WRITE  req@ffff88079810d050 x1549270582366336/t0(0) o4-&amp;gt;f71ccf2b-e961-3680-30e4-be748de9fb17@192.168.1.27@o2ib:-1/-1 lens 608/448 e 0 to 0 dl 1477507516 ref 1 fl Interpret:/2/0 rc 0/0
Oct 26 18:45:07 spirit-aeon-1 kernel: LustreError: 29261:0:(ldlm_lib.c:3233:target_bulk_io()) Skipped 21 previous similar messages
Oct 26 18:45:08 spirit-aeon-1 kernel: Lustre: zfstest-OST0000: Connection restored to 51f91233-8032-15d2-cd74-03a850dab34b (at 192.168.1.11@o2ib)
Oct 26 18:45:08 spirit-aeon-1 kernel: Lustre: Skipped 38 previous similar messages
Oct 26 18:45:08 spirit-aeon-1 kernel: Lustre: zfstest-OST0000: Client 7f61e263-a95b-3193-8481-b48b561e6e29 (at 192.168.1.25@o2ib) reconnecting
Oct 26 18:45:08 spirit-aeon-1 kernel: Lustre: Skipped 35 previous similar messages
Oct 26 18:45:10 spirit-aeon-1 kernel: LustreError: 29306:0:(ldlm_lib.c:3233:target_bulk_io()) @@@ Reconnect on bulk WRITE  req@ffff88072a949850 x1549270584459216/t0(0) o4-&amp;gt;b38521f1-8de6-5f02-305a-3c91f4caeb2d@192.168.1.24@o2ib:-1/-1 lens 608/448 e 0 to 0 dl 1477507518 ref 1 fl Interpret:/2/0 rc 0/0
Oct 26 18:45:10 spirit-aeon-1 kernel: LustreError: 29306:0:(ldlm_lib.c:3233:target_bulk_io()) Skipped 44 previous similar messages
Oct 26 18:45:12 spirit-aeon-1 kernel: Lustre: zfstest-OST0000: Connection restored to a8341b8c-a880-ca64-9830-6416dc1b988f (at 192.168.1.15@o2ib)
Oct 26 18:45:12 spirit-aeon-1 kernel: Lustre: Skipped 54 previous similar messages
Oct 26 18:45:12 spirit-aeon-1 kernel: Lustre: zfstest-OST0000: Client de07d034-8d22-f659-1cbd-768d0030c923 (at 192.168.1.26@o2ib) reconnecting
Oct 26 18:45:12 spirit-aeon-1 kernel: Lustre: Skipped 56 previous similar messages
Oct 26 18:45:14 spirit-aeon-1 kernel: LustreError: 29376:0:(ldlm_lib.c:3233:target_bulk_io()) @@@ Reconnect on bulk WRITE  req@ffff88075d19bc50 x1549270582366368/t0(0) o4-&amp;gt;f71ccf2b-e961-3680-30e4-be748de9fb17@192.168.1.27@o2ib:-1/-1 lens 608/448 e 0 to 0 dl 1477507522 ref 1 fl Interpret:/2/0 rc 0/0
Oct 26 18:45:14 spirit-aeon-1 kernel: LustreError: 29376:0:(ldlm_lib.c:3233:target_bulk_io()) Skipped 68 previous similar messages
Oct 26 18:45:56 spirit-aeon-1 kernel: LNetError: 21200:0:(o2iblnd_cb.c:3131:kiblnd_check_txs_locked()) Timed out tx: active_txs, 0 seconds
Oct 26 18:45:56 spirit-aeon-1 kernel: LNetError: 21200:0:(o2iblnd_cb.c:3194:kiblnd_check_conns()) Timed out RDMA with 192.168.1.11@o2ib (6): c: 127, oc: 0, rc: 94
Oct 26 18:45:56 spirit-aeon-1 kernel: LustreError: 21200:0:(events.c:446:server_bulk_callback()) event type 5, status -103, desc ffff8804c83c8000
Oct 26 18:45:56 spirit-aeon-1 kernel: LustreError: 21200:0:(events.c:446:server_bulk_callback()) event type 3, status -103, desc ffff8804c83c8000
Oct 26 18:45:56 spirit-aeon-1 kernel: LustreError: 21200:0:(events.c:446:server_bulk_callback()) event type 5, status -103, desc ffff8804c83c8000
Oct 26 18:45:56 spirit-aeon-1 kernel: LustreError: 21200:0:(events.c:446:server_bulk_callback()) event type 3, status -103, desc ffff8804c83c8000
Oct 26 18:45:56 spirit-aeon-1 kernel: LustreError: 21200:0:(events.c:446:server_bulk_callback()) event type 5, status -103, desc ffff8804c83c8000
Oct 26 18:45:56 spirit-aeon-1 kernel: LustreError: 21200:0:(events.c:446:server_bulk_callback()) event type 3, status -103, desc ffff8804c83c8000
Oct 26 18:45:56 spirit-aeon-1 kernel: LustreError: 21200:0:(events.c:446:server_bulk_callback()) event type 5, status -103, desc ffff8804c83c8000
Oct 26 18:45:56 spirit-aeon-1 kernel: LustreError: 21200:0:(events.c:446:server_bulk_callback()) event type 3, status -103, desc ffff8804c83c8000
Oct 26 18:45:56 spirit-aeon-1 kernel: LustreError: 21200:0:(events.c:446:server_bulk_callback()) event type 5, status -103, desc ffff88054023b400
Oct 26 18:45:56 spirit-aeon-1 kernel: Lustre: zfstest-OST0002: Bulk IO write error with 51f91233-8032-15d2-cd74-03a850dab34b (at 192.168.1.11@o2ib), client will retry: rc -110
Oct 26 18:45:56 spirit-aeon-1 kernel: LustreError: 212
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="171241" author="cliffw" created="Wed, 26 Oct 2016 19:28:49 +0000"  >&lt;p&gt;Client-side messages&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Oct 26 18:45:02 spirit-11 kernel: Lustre: DEBUG MARKER: == parallel-scale test iorfpp: iorfpp ================================================================ 18:45:02 (1477507502)
Oct 26 18:45:02 spirit-11 sshd[21261]: Received disconnect from 10.10.1.11: 11: disconnected by user
Oct 26 18:45:02 spirit-11 sshd[21261]: pam_unix(sshd:session): session closed &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; user root
Oct 26 18:45:02 spirit-11 systemd-logind: Removed session 42.
Oct 26 18:45:05 spirit-11 kernel: LNetError: 3587:0:(o2iblnd_cb.c:1074:kiblnd_init_rdma()) RDMA is too large &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; peer 192.168.2.2@o2ib (131072), src size: 1048576 dst size: 1048576
Oct 26 18:45:05 spirit-11 kernel: LNetError: 3586:0:(o2iblnd_cb.c:1698:kiblnd_reply()) Can&apos;t setup rdma &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; GET from 192.168.2.2@o2ib: -90
Oct 26 18:45:05 spirit-11 kernel: LustreError: 3586:0:(events.c:199:client_bulk_callback()) event type 1, status -5, desc ffff88100d076800
Oct 26 18:45:05 spirit-11 kernel: LustreError: 3586:0:(events.c:199:client_bulk_callback()) event type 1, status -5, desc ffff88100d074e00
Oct 26 18:45:05 spirit-11 kernel: LustreError: 3585:0:(events.c:199:client_bulk_callback()) event type 1, status -5, desc ffff88100d074e00
Oct 26 18:45:05 spirit-11 kernel: Lustre: 3616:0:(client.c:2111:ptlrpc_expire_one_request()) @@@ Request sent has failed due to network error: [sent 1477507505/real 1477507505]  req@ffff88077d8d6600 x1549270577199552/t0(0) o4-&amp;gt;zfstest-OST0001-osc-ffff880ff9be7000@192.168.2.2@o2ib:6/4 lens 608/448 e 0 to 1 dl 1477507517 ref 2 fl Rpc:eX/0/ffffffff rc 0/-1
Oct 26 18:45:05 spirit-11 kernel: Lustre: zfstest-OST0001-osc-ffff880ff9be7000: Connection to zfstest-OST0001 (at 192.168.2.2@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Oct 26 18:45:05 spirit-11 kernel: Lustre: zfstest-OST0001-osc-ffff880ff9be7000: Connection restored to 192.168.2.2@o2ib (at 192.168.2.2@o2ib)
Oct 26 18:45:05 spirit-11 kernel: LustreError: 3582:0:(events.c:199:client_bulk_callback()) event type 1, status -5, desc ffff88100d074400
Oct 26 18:45:05 spirit-11 kernel: LustreError: 3582:0:(events.c:199:client_bulk_callback()) event type 1, status -5, desc ffff88100d074400
Oct 26 18:45:05 spirit-11 kernel: LustreError: 3581:0:(events.c:199:client_bulk_callback()) event type 1, status -5, desc ffff88100d074400
Oct 26 18:45:05 spirit-11 kernel: LustreError: 3584:0:(events.c:199:client_bulk_callback()) event type 1, status -5, desc ffff88100d074400
Oct 26 18:45:05 spirit-11 kernel: LNetError: 3587:0:(o2iblnd_cb.c:1074:kiblnd_init_rdma()) Skipped 7 previous similar messages
Oct 26 18:45:05 spirit-11 kernel: LustreError: 3587:0:(events.c:199:client_bulk_callback()) event type 1, status -5, desc ffff88100d076800
Oct 26 18:45:05 spirit-11 kernel: LustreError: 3588:0:(events.c:199:client_bulk_callback()) event type 1, status -5, desc ffff88003666fc00
Oct 26 18:45:05 spirit-11 kernel: LustreError: 3586:0:(events.c:199:client_bulk_callback()) event type 1, status -5, desc ffff88003666fc00
Oct 26 18:45:05 spirit-11 kernel: LustreError: 3585:0:(events.c:199:client_bulk_callback()) event type 1, status -5, desc ffff88003666fc00
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="171289" author="cliffw" created="Thu, 27 Oct 2016 00:00:57 +0000"  >&lt;p&gt;Lustre logs from both OSS nodes.  backfstype was ZFS for this test, error can be reproduced on either ZFS or ldiskfs&lt;/p&gt;</comment>
                            <comment id="171290" author="cliffw" created="Thu, 27 Oct 2016 00:01:32 +0000"  >&lt;p&gt;MDS and sample client lustre logs. &lt;/p&gt;</comment>
                            <comment id="171399" author="green" created="Thu, 27 Oct 2016 17:24:33 +0000"  >&lt;p&gt;I think the IO errors aside, it&apos;s still important to fix the assertion since we don&apos;t want to have the crashes.&lt;br/&gt;
Now if io errors happen to be due to network errors later - then we can dig into this separately, but at least we won&apos;t be crashing anything.&lt;/p&gt;</comment>
                            <comment id="171404" author="cliffw" created="Thu, 27 Oct 2016 17:36:59 +0000"  >&lt;p&gt;One note: the IOR test makes no progress due to these errors, it may not be hitting an ASSERT, but the code is completely unusable. &lt;/p&gt;</comment>
                            <comment id="171419" author="cliffw" created="Thu, 27 Oct 2016 18:37:38 +0000"  >&lt;p&gt;Repeated the fault with ldiskfs, set debug=-1 and dumped lustre logs after the hang occurred. &lt;br/&gt;
MDS = spirit-4 OSS= spirit-5/6, all others are clients. &lt;/p&gt;</comment>
                            <comment id="171562" author="cliffw" created="Fri, 28 Oct 2016 14:52:59 +0000"  >&lt;p&gt;Received patch from Alex, &quot;Doug thinks that can be introduced by &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7650&quot; title=&quot;ko2iblnd map_on_demand can&amp;#39;t negotitate when page sizes are different between nodes.&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7650&quot;&gt;&lt;del&gt;LU-7650&lt;/del&gt;&lt;/a&gt;.. I just pushed a patch to revert that: &lt;a href=&quot;http://review.whamcloud.com/23439&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/23439&lt;/a&gt;&quot;&lt;br/&gt;
I was able to successfully complete IOR runs on both sets of servers, will retry ZFS today.&lt;/p&gt;</comment>
                            <comment id="171682" author="gerrit" created="Fri, 28 Oct 2016 23:51:11 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;http://review.whamcloud.com/23439/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/23439/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8573&quot; title=&quot;IOR: niobuf.c:319:ptlrpc_register_bulk()) ASSERTION( desc-&amp;gt;bd_md_count == 0 ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8573&quot;&gt;&lt;del&gt;LU-8573&lt;/del&gt;&lt;/a&gt; lnet: Revert &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7650&quot; title=&quot;ko2iblnd map_on_demand can&amp;#39;t negotitate when page sizes are different between nodes.&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7650&quot;&gt;&lt;del&gt;LU-7650&lt;/del&gt;&lt;/a&gt; patches&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 59fbfa5f3fa925f676636f1b78c986d05cd295bc&lt;/p&gt;</comment>
                            <comment id="171700" author="pjones" created="Sat, 29 Oct 2016 00:25:54 +0000"  >&lt;p&gt;Landed for 2.9&lt;/p&gt;</comment>
                            <comment id="233508" author="hongchao.zhang" created="Fri, 14 Sep 2018 08:40:30 +0000"  >&lt;p&gt;Hongchao Zhang (hongchao@whamcloud.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/33167&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/33167&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8573&quot; title=&quot;IOR: niobuf.c:319:ptlrpc_register_bulk()) ASSERTION( desc-&amp;gt;bd_md_count == 0 ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8573&quot;&gt;&lt;del&gt;LU-8573&lt;/del&gt;&lt;/a&gt; ptlrpc: race with reply_in_callback&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: ac6b543a08033ddb5879706c0b3c941d3cdfbef0&lt;/p&gt;</comment>
                            <comment id="237788" author="adilger" created="Sat, 1 Dec 2018 05:34:34 +0000"  >&lt;p&gt;Closing this ticket again, and using &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11647&quot; title=&quot;niobuf.c:330:ptlrpc_register_bulk()) ASSERTION( desc-&amp;gt;bd_md_count == 0 ) failed:&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11647&quot;&gt;&lt;del&gt;LU-11647&lt;/del&gt;&lt;/a&gt; for the new patch.  This avoids confusing the fix version for the patch.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="53981">LU-11647</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="54091">LU-11692</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="34042">LU-7650</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                                        </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="23809" name="client.spirit-28.log.gz" size="3123369" author="cliffw" created="Thu, 27 Oct 2016 00:01:32 +0000"/>
                            <attachment id="23306" name="lu-8573-syslog.tar" size="230" author="cliffw" created="Mon, 3 Oct 2016 21:57:09 +0000"/>
                            <attachment id="23307" name="lu-8573.console.tar" size="232" author="cliffw" created="Mon, 3 Oct 2016 21:57:09 +0000"/>
                            <attachment id="23810" name="mds.sprit-3.log.gz" size="232" author="cliffw" created="Thu, 27 Oct 2016 00:01:32 +0000"/>
                            <attachment id="23807" name="oss.aeon-1.log.gz" size="230" author="cliffw" created="Thu, 27 Oct 2016 00:00:56 +0000"/>
                            <attachment id="23808" name="oss.aeon-2.log.gz" size="2396344" author="cliffw" created="Thu, 27 Oct 2016 00:00:57 +0000"/>
                            <attachment id="23822" name="spirit-11.log.gz" size="1296043" author="cliffw" created="Thu, 27 Oct 2016 18:37:38 +0000"/>
                            <attachment id="23301" name="spirit-14-lustre-log.20160928_2029-4-lu-8573.bz2" size="1054109" author="cliffw" created="Mon, 3 Oct 2016 21:53:03 +0000"/>
                            <attachment id="23302" name="spirit-15-lustre-log.20160928_2029-4-lu-8573.bz2" size="976711" author="cliffw" created="Mon, 3 Oct 2016 21:53:03 +0000"/>
                            <attachment id="23303" name="spirit-16-lustre-log.20160928_2029-4-lu-8573.bz2" size="1090141" author="cliffw" created="Mon, 3 Oct 2016 21:53:03 +0000"/>
                            <attachment id="23823" name="spirit-17.log.gz" size="1103251" author="cliffw" created="Thu, 27 Oct 2016 18:37:38 +0000"/>
                            <attachment id="23824" name="spirit-20.log.gz" size="1290004" author="cliffw" created="Thu, 27 Oct 2016 18:37:38 +0000"/>
                            <attachment id="23300" name="spirit-3-lustre-log.20160928_2029-4-lu-8573.bz2" size="290" author="cliffw" created="Mon, 3 Oct 2016 21:53:03 +0000"/>
                            <attachment id="23368" name="spirit-33.log-20160904" size="15756" author="cliffw" created="Mon, 10 Oct 2016 16:16:10 +0000"/>
                            <attachment id="23819" name="spirit-4.log.gz" size="2304107" author="cliffw" created="Thu, 27 Oct 2016 18:37:38 +0000"/>
                            <attachment id="23530" name="spirit-5.callback.txt.gz" size="244" author="cliffw" created="Mon, 17 Oct 2016 18:49:34 +0000"/>
                            <attachment id="23820" name="spirit-5.log.gz" size="1342884" author="cliffw" created="Thu, 27 Oct 2016 18:37:38 +0000"/>
                            <attachment id="23531" name="spirit-6.callback.lustre.log.gz" size="258" author="cliffw" created="Mon, 17 Oct 2016 18:49:34 +0000"/>
                            <attachment id="23821" name="spirit-6.log.gz" size="1485289" author="cliffw" created="Thu, 27 Oct 2016 18:37:38 +0000"/>
                            <attachment id="23304" name="spirit-aeon-1-lustre-log.20160928_2029-4-lu-8573.bz2" size="301" author="cliffw" created="Mon, 3 Oct 2016 21:53:03 +0000"/>
                            <attachment id="23305" name="spirit-aeon-2-lustre-log.20160928_2029-4-lu-8573.bz2" size="301" author="cliffw" created="Mon, 3 Oct 2016 21:53:04 +0000"/>
                            <attachment id="23532" name="sprit-11.callbacks.lustre.log.gz" size="2078992" author="cliffw" created="Mon, 17 Oct 2016 18:49:34 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzympj:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>