<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:31:01 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-9983] LBUG llog_osd.c:327:llog_osd_declare_write_rec() - all DNE MDS</title>
                <link>https://jira.whamcloud.com/browse/LU-9983</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;soak is started, jobs begin to run. &lt;br/&gt;
soak-8 (MDT0000) reports an error:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Sep 13 15:53:37 soak-8 kernel: LustreError: 4873:0:(out_handler.c:597:out_write()) soaked-MDT0000: empty or wrong size 0 pos: rc = -61
Sep 13 15:53:37 soak-8 kernel: LustreError: 4990:0:(out_handler.c:1000:out_handle()) soaked-MDT0000: invalid update buffer magic 0 expect bdde0002: rc = -71
Sep 13 15:53:37 soak-8 kernel: LustreError: 4873:0:(out_handler.c:597:out_write()) Skipped 2 previous similar messages
Sep 13 15:53:44 soak-8 kernel: LustreError: 5117:0:(mdt_lvb.c:163:mdt_lvbo_fill()) soaked-MDT0000: expected 968 actual 344.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;All other DNE MDTS (01-03) promptly LBUG:&lt;br/&gt;
soak-9&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Sep 13 15:53:37 soak-9 kernel: LustreError: 11-0: soaked-MDT0000-osp-MDT0001: operation out_update to node 192.168.1.108@o2ib failed: rc = -71
Sep 13 15:53:37 soak-9 kernel: LustreError: 4734:0:(layout.c:2082:__req_capsule_get()) @@@ Wrong buffer &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; field `object_update_reply&lt;span class=&quot;code-quote&quot;&gt;&apos; (1 of 1) in format `OUT_UPDATE&apos;&lt;/span&gt;: 0 vs. 4096 (server)#012  req@ffff880808e79500 x1578438980902080/t0(0) o1000-&amp;gt;soaked-MDT0002-osp-MDT0001@192.168.1.110@o2ib:24/4 lens 392/192 e 0 to 0 dl 1505318024 ref 2 fl Interpret:RM/0/0 rc -71/-71
Sep 13 15:53:37 soak-9 kernel: LustreError: Skipped 1 previous similar message
Sep 13 15:53:47 soak-9 kernel: LustreError: 4719:0:(llog_osd.c:327:llog_osd_declare_write_rec()) ASSERTION( rec ) failed:
Sep 13 15:53:47 soak-9 kernel: LustreError: 4719:0:(llog_osd.c:327:llog_osd_declare_write_rec()) LBUG
Sep 13 15:53:47 soak-9 kernel: Pid: 4719, comm: dist_txn-1
Sep 13 15:53:47 soak-9 kernel: #012Call Trace:
Sep 13 15:53:47 soak-9 kernel: [&amp;lt;ffffffffc0e667ae&amp;gt;] libcfs_call_trace+0x4e/0x60 [libcfs]
Sep 13 15:53:47 soak-9 kernel: [&amp;lt;ffffffffc0e6683c&amp;gt;] lbug_with_loc+0x4c/0xb0 [libcfs]
Sep 13 15:53:47 soak-9 kernel: [&amp;lt;ffffffffc0f658f5&amp;gt;] llog_osd_declare_write_rec+0x3c5/0x3d0 [obdclass]
Sep 13 15:53:47 soak-9 kernel: [&amp;lt;ffffffffc0f5a0e4&amp;gt;] llog_declare_write_rec+0x84/0x200 [obdclass]
Sep 13 15:53:47 soak-9 kernel: [&amp;lt;ffffffffc0f5a862&amp;gt;] llog_cancel_rec+0xe2/0x870 [obdclass]
Sep 13 15:53:47 soak-9 kernel: [&amp;lt;ffffffffc0f60f6a&amp;gt;] llog_cat_cancel_records+0x13a/0x2e0 [obdclass]
Sep 13 15:53:47 soak-9 kernel: [&amp;lt;ffffffffc0e71ba7&amp;gt;] ? libcfs_debug_msg+0x57/0x80 [libcfs]
Sep 13 15:53:47 soak-9 kernel: [&amp;lt;ffffffffc11f22d9&amp;gt;] distribute_txn_commit_thread+0x599/0xca0 [ptlrpc]
Sep 13 15:53:47 soak-9 kernel: [&amp;lt;ffffffff810c12c8&amp;gt;] ? check_preempt_curr+0x78/0xa0 
Sep 13 15:53:47 soak-9 kernel: [&amp;lt;ffffffff810c4810&amp;gt;] ? default_wake_function+0x0/0x20
Sep 13 15:53:47 soak-9 kernel: [&amp;lt;ffffffffc11f1d40&amp;gt;] ? distribute_txn_commit_thread+0x0/0xca0 [ptlrpc]
Sep 13 15:53:47 soak-9 kernel: [&amp;lt;ffffffff810b098f&amp;gt;] kthread+0xcf/0xe0
Sep 13 15:53:47 soak-9 kernel: [&amp;lt;ffffffff810b08c0&amp;gt;] ? kthread+0x0/0xe0
Sep 13 15:53:47 soak-9 kernel: [&amp;lt;ffffffff816b4f18&amp;gt;] ret_from_fork+0x58/0x90
Sep 13 15:53:47 soak-9 kernel: [&amp;lt;ffffffff810b08c0&amp;gt;] ? kthread+0x0/0xe0
Sep 13 15:53:47 soak-9 kernel:
Sep 13 15:53:47 soak-9 kernel: Kernel panic - not syncing: LBUG
Sep 13 15:53:50 soak-9 kernel: CPU: 25 PID: 4719 Comm: dist_txn-1 Tainted: P           OE  ------------   3.10.0-693.1.1.el7_lustre.x86_64 #1
Sep 13 15:53:50 soak-9 kernel: Hardware name: Intel Corporation S2600GZ ........../S2600GZ, BIOS SE5C600.86B.01.08.0003.022620131521 02/26/2013
Sep 13 15:53:50 soak-9 kernel: ffff880419612f00 000000001ae378cc ffff880418eafb90 ffffffff816a3d6d
Sep 13 15:53:50 soak-9 kernel: ffff880418eafc10 ffffffff8169dc54 ffffffff00000008 ffff880418eafc20
Sep 13 15:53:50 soak-9 kernel: ffff880418eafbc0 000000001ae378cc 000000001ae378cc ffff88082da4f8b8
Sep 13 15:53:50 soak-9 kernel: Call Trace:
Sep 13 15:53:50 soak-9 kernel: [&amp;lt;ffffffff816a3d6d&amp;gt;] dump_stack+0x19/0x1b
Sep 13 15:53:50 soak-9 kernel: [&amp;lt;ffffffff8169dc54&amp;gt;] panic+0xe8/0x20d
Sep 13 15:53:50 soak-9 kernel: [&amp;lt;ffffffffc0e66854&amp;gt;] lbug_with_loc+0x64/0xb0 [libcfs]
Sep 13 15:53:50 soak-9 kernel: [&amp;lt;ffffffffc0f658f5&amp;gt;] llog_osd_declare_write_rec+0x3c5/0x3d0 [obdclass]
Sep 13 15:53:50 soak-9 kernel: [&amp;lt;ffffffffc0f5a0e4&amp;gt;] llog_declare_write_rec+0x84/0x200 [obdclass]
Sep 13 15:53:50 soak-9 kernel: [&amp;lt;ffffffffc0f5a862&amp;gt;] llog_cancel_rec+0xe2/0x870 [obdclass]
Sep 13 15:53:50 soak-9 kernel: [&amp;lt;ffffffffc0f60f6a&amp;gt;] llog_cat_cancel_records+0x13a/0x2e0 [obdclass]
Sep 13 15:53:50 soak-9 kernel: [&amp;lt;ffffffffc0e71ba7&amp;gt;] ? libcfs_debug_msg+0x57/0x80 [libcfs]
Sep 13 15:53:50 soak-9 kernel: [&amp;lt;ffffffffc11f22d9&amp;gt;] distribute_txn_commit_thread+0x599/0xca0 [ptlrpc]
Sep 13 15:53:50 soak-9 kernel: [&amp;lt;ffffffff810c12c8&amp;gt;] ? check_preempt_curr+0x78/0xa0
Sep 13 15:53:50 soak-9 kernel: [&amp;lt;ffffffff810c4810&amp;gt;] ? wake_up_state+0x20/0x20
Sep 13 15:53:50 soak-9 kernel: [&amp;lt;ffffffffc11f1d40&amp;gt;] ? sub_trans_commit_cb+0x20/0x20 [ptlrpc]
Sep 13 15:53:50 soak-9 kernel: [&amp;lt;ffffffff810b098f&amp;gt;] kthread+0xcf/0xe0
Sep 13 15:53:50 soak-9 kernel: [&amp;lt;ffffffff810b08c0&amp;gt;] ? insert_kthread_work+0x40/0x40
Sep 13 15:53:50 soak-9 kernel: [&amp;lt;ffffffff816b4f18&amp;gt;] ret_from_fork+0x58/0x90
Sep 13 15:53:50 soak-9 kernel: [&amp;lt;ffffffff810b08c0&amp;gt;] ? insert_kthread_work+0x40/0x40
Sep 13 15:53:50 soak-9 kernel: Kernel Offset: disabled
Sep 13 15:53:50 soak-9 kernel: ------------[ cut here ]------------
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;soak-10&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Sep 13 15:53:37 soak-10 kernel: LustreError: 11-0: soaked-MDT0000-osp-MDT0002: operation out_update to node 192.168.1.108@o2ib failed: rc = -71
Sep 13 15:53:37 soak-10 kernel: LustreError: 14955:0:(layout.c:2082:__req_capsule_get()) @@@ Wrong buffer &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; field `object_update_reply&lt;span class=&quot;code-quote&quot;&gt;&apos; (1 of 1) in format `OUT_UPDATE&apos;&lt;/span&gt;: 0 vs. 4096 (server)#012  req@ffff880818bb5400 x1578438986144240/t0(0) o1000-&amp;gt;soaked-MDT0000-osp-MDT0002@192.168.1.108@o2ib:24/4 lens 392/192 e 0 to 0 dl 1505318024 ref 2 fl Interpret:RM/0/0 rc -71/-71
Sep 13 15:53:37 soak-10 kernel: LustreError: 14939:0:(out_handler.c:597:out_write()) soaked-MDT0002: empty or wrong size 0 pos: rc = -61
Sep 13 15:53:37 soak-10 kernel: LustreError: 14939:0:(out_handler.c:1000:out_handle()) soaked-MDT0002: invalid update buffer magic 0 expect bdde0002: rc = -71
Sep 13 15:53:44 soak-10 kernel: LustreError: 14961:0:(llog_osd.c:327:llog_osd_declare_write_rec()) ASSERTION( rec ) failed:
Sep 13 15:53:44 soak-10 kernel: LustreError: 14961:0:(llog_osd.c:327:llog_osd_declare_write_rec()) LBUG
Sep 13 15:53:44 soak-10 kernel: Pid: 14961, comm: dist_txn-2
Sep 13 15:53:44 soak-10 kernel: #012Call Trace:
Sep 13 15:53:44 soak-10 kernel: [&amp;lt;ffffffffc0e087ae&amp;gt;] libcfs_call_trace+0x4e/0x60 [libcfs]
Sep 13 15:53:44 soak-10 kernel: [&amp;lt;ffffffffc0e0883c&amp;gt;] lbug_with_loc+0x4c/0xb0 [libcfs]
Sep 13 15:53:44 soak-10 kernel: [&amp;lt;ffffffffc0f208f5&amp;gt;] llog_osd_declare_write_rec+0x3c5/0x3d0 [obdclass]
Sep 13 15:53:44 soak-10 kernel: [&amp;lt;ffffffffc0f150e4&amp;gt;] llog_declare_write_rec+0x84/0x200 [obdclass]
Sep 13 15:53:44 soak-10 kernel: [&amp;lt;ffffffffc0f15862&amp;gt;] llog_cancel_rec+0xe2/0x870 [obdclass]
Sep 13 15:53:44 soak-10 kernel: [&amp;lt;ffffffffc0f1bf6a&amp;gt;] llog_cat_cancel_records+0x13a/0x2e0 [obdclass]
Sep 13 15:53:44 soak-10 kernel: [&amp;lt;ffffffffc0e13ba7&amp;gt;] ? libcfs_debug_msg+0x57/0x80 [libcfs]
Sep 13 15:53:44 soak-10 kernel: [&amp;lt;ffffffffc11fc2d9&amp;gt;] distribute_txn_commit_thread+0x599/0xca0 [ptlrpc]
Sep 13 15:53:44 soak-10 kernel: [&amp;lt;ffffffff810c4810&amp;gt;] ? default_wake_function+0x0/0x20
Sep 13 15:53:44 soak-10 kernel: [&amp;lt;ffffffffc11fbd40&amp;gt;] ? distribute_txn_commit_thread+0x0/0xca0 [ptlrpc]
Sep 13 15:53:44 soak-10 kernel: [&amp;lt;ffffffff810b098f&amp;gt;] kthread+0xcf/0xe0
Sep 13 15:53:44 soak-10 kernel: [&amp;lt;ffffffff810b08c0&amp;gt;] ? kthread+0x0/0xe0
Sep 13 15:53:44 soak-10 kernel: [&amp;lt;ffffffff816b4f18&amp;gt;] ret_from_fork+0x58/0x90
Sep 13 15:53:44 soak-10 kernel: [&amp;lt;ffffffff810b08c0&amp;gt;] ? kthread+0x0/0xe0
Sep 13 15:53:44 soak-10 kernel:
Sep 13 15:53:44 soak-10 kernel: Kernel panic - not syncing: LBUG
Sep 13 15:53:45 soak-10 kernel: CPU: 10 PID: 14961 Comm: dist_txn-2 Tainted: P           OE  ------------   3.10.0-693.1.1.el7_lustre.x86_64 #1
Sep 13 15:53:45 soak-10 kernel: Hardware name: Intel Corporation S2600GZ ........../S2600GZ, BIOS SE5C600.86B.01.08.0003.022620131521 02/26/2013
Sep 13 15:53:45 soak-10 kernel: ffff8803f8e7bf00 000000009b9e8911 ffff8803f8ea7b90 ffffffff816a3d6d
Sep 13 15:53:45 soak-10 kernel: ffff8803f8ea7c10 ffffffff8169dc54 ffffffff00000008 ffff8803f8ea7c20
Sep 13 15:53:45 soak-10 kernel: ffff8803f8ea7bc0 000000009b9e8911 000000009b9e8911 ffff88082d88f8b8
Sep 13 15:53:45 soak-10 kernel: Call Trace:
Sep 13 15:53:45 soak-10 kernel: [&amp;lt;ffffffff816a3d6d&amp;gt;] dump_stack+0x19/0x1b
Sep 13 15:53:45 soak-10 kernel: [&amp;lt;ffffffff8169dc54&amp;gt;] panic+0xe8/0x20d
Sep 13 15:53:45 soak-10 kernel: [&amp;lt;ffffffffc0e08854&amp;gt;] lbug_with_loc+0x64/0xb0 [libcfs]
Sep 13 15:53:45 soak-10 kernel: [&amp;lt;ffffffffc0f208f5&amp;gt;] llog_osd_declare_write_rec+0x3c5/0x3d0 [obdclass]
Sep 13 15:53:45 soak-10 kernel: [&amp;lt;ffffffffc0f150e4&amp;gt;] llog_declare_write_rec+0x84/0x200 [obdclass]
Sep 13 15:53:45 soak-10 kernel: [&amp;lt;ffffffffc0f15862&amp;gt;] llog_cancel_rec+0xe2/0x870 [obdclass]
Sep 13 15:53:45 soak-10 kernel: [&amp;lt;ffffffffc0f1bf6a&amp;gt;] llog_cat_cancel_records+0x13a/0x2e0 [obdclass]
Sep 13 15:53:45 soak-10 kernel: [&amp;lt;ffffffffc0e13ba7&amp;gt;] ? libcfs_debug_msg+0x57/0x80 [libcfs]
Sep 13 15:53:45 soak-10 kernel: [&amp;lt;ffffffffc11fc2d9&amp;gt;] distribute_txn_commit_thread+0x599/0xca0 [ptlrpc]
Sep 13 15:53:45 soak-10 kernel: [&amp;lt;ffffffff810c4810&amp;gt;] ? wake_up_state+0x20/0x20
Sep 13 15:53:45 soak-10 kernel: [&amp;lt;ffffffffc11fbd40&amp;gt;] ? sub_trans_commit_cb+0x20/0x20 [ptlrpc]
Sep 13 15:53:45 soak-10 kernel: [&amp;lt;ffffffff810b098f&amp;gt;] kthread+0xcf/0xe0
Sep 13 15:53:45 soak-10 kernel: [&amp;lt;ffffffff810b08c0&amp;gt;] ? insert_kthread_work+0x40/0x40
Sep 13 15:53:45 soak-10 kernel: [&amp;lt;ffffffff816b4f18&amp;gt;] ret_from_fork+0x58/0x90
Sep 13 15:53:45 soak-10 kernel: [&amp;lt;ffffffff810b08c0&amp;gt;] ? insert_kthread_work+0x40/0x40
Sep 13 15:53:45 soak-10 kernel: Kernel Offset: disabled
Sep 13 15:53:45 soak-10 kernel: ------------[ cut here ]------------
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;soak-11&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Sep 13 15:53:43 soak-11 kernel: LustreError: 4755:0:(llog_osd.c:327:llog_osd_declare_write_rec()) ASSERTION( rec ) failed:
Sep 13 15:53:43 soak-11 kernel: LustreError: 4755:0:(llog_osd.c:327:llog_osd_declare_write_rec()) LBUG
Sep 13 15:53:43 soak-11 kernel: Pid: 4755, comm: dist_txn-3
Sep 13 15:53:43 soak-11 kernel: #012Call Trace:
Sep 13 15:53:43 soak-11 kernel: [&amp;lt;ffffffffc0d6f7ae&amp;gt;] libcfs_call_trace+0x4e/0x60 [libcfs]
Sep 13 15:53:43 soak-11 kernel: [&amp;lt;ffffffffc0d6f83c&amp;gt;] lbug_with_loc+0x4c/0xb0 [libcfs]
Sep 13 15:53:44 soak-11 kernel: [&amp;lt;ffffffffc0e878f5&amp;gt;] llog_osd_declare_write_rec+0x3c5/0x3d0 [obdclass]
Sep 13 15:53:44 soak-11 kernel: [&amp;lt;ffffffffc0e7c0e4&amp;gt;] llog_declare_write_rec+0x84/0x200 [obdclass]
Sep 13 15:53:44 soak-11 kernel: [&amp;lt;ffffffffc0e7c862&amp;gt;] llog_cancel_rec+0xe2/0x870 [obdclass]
Sep 13 15:53:44 soak-11 kernel: [&amp;lt;ffffffffc0e82f6a&amp;gt;] llog_cat_cancel_records+0x13a/0x2e0 [obdclass]
Sep 13 15:53:44 soak-11 kernel: [&amp;lt;ffffffffc0d7aba7&amp;gt;] ? libcfs_debug_msg+0x57/0x80 [libcfs]
Sep 13 15:53:44 soak-11 kernel: [&amp;lt;ffffffffc11632d9&amp;gt;] distribute_txn_commit_thread+0x599/0xca0 [ptlrpc]
Sep 13 15:53:44 soak-11 kernel: [&amp;lt;ffffffff810c4810&amp;gt;] ? default_wake_function+0x0/0x20
Sep 13 15:53:44 soak-11 kernel: [&amp;lt;ffffffffc1162d40&amp;gt;] ? distribute_txn_commit_thread+0x0/0xca0 [ptlrpc]
Sep 13 15:53:44 soak-11 kernel: [&amp;lt;ffffffff810b098f&amp;gt;] kthread+0xcf/0xe0
Sep 13 15:53:44 soak-11 kernel: [&amp;lt;ffffffff810b08c0&amp;gt;] ? kthread+0x0/0xe0
Sep 13 15:53:44 soak-11 kernel: [&amp;lt;ffffffff816b4f18&amp;gt;] ret_from_fork+0x58/0x90
Sep 13 15:53:44 soak-11 kernel: [&amp;lt;ffffffff810b08c0&amp;gt;] ? kthread+0x0/0xe0
Sep 13 15:53:44 soak-11 kernel:
Sep 13 15:53:44 soak-11 kernel: Kernel panic - not syncing: LBUG
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment>version=2.10.52_83_g1fc4ed3 lustre-master build 3637 - RHEL 7.3 distro with 7.4 kernels</environment>
        <key id="48300">LU-9983</key>
            <summary>LBUG llog_osd.c:327:llog_osd_declare_write_rec() - all DNE MDS</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="wc-triage">WC Triage</assignee>
                                    <reporter username="cliffw">Cliff White</reporter>
                        <labels>
                            <label>soak</label>
                    </labels>
                <created>Wed, 13 Sep 2017 16:07:50 +0000</created>
                <updated>Wed, 2 Jan 2019 20:44:07 +0000</updated>
                            <resolved>Sun, 17 Dec 2017 15:56:19 +0000</resolved>
                                    <version>Lustre 2.10.1</version>
                    <version>Lustre 2.11.0</version>
                                    <fixVersion>Lustre 2.11.0</fixVersion>
                    <fixVersion>Lustre 2.10.6</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>15</watches>
                                                                            <comments>
                            <comment id="208638" author="cliffw" created="Mon, 18 Sep 2017 16:42:19 +0000"  >&lt;p&gt;Hit this again on two MDS over the weekend&lt;/p&gt;</comment>
                            <comment id="209324" author="cliffw" created="Fri, 22 Sep 2017 19:50:52 +0000"  >&lt;p&gt;Hit this again after the RHEL 7.4 update - soak really is not useable with this bug.&lt;/p&gt;</comment>
                            <comment id="209338" author="gerrit" created="Fri, 22 Sep 2017 21:52:38 +0000"  >&lt;p&gt;John L. Hammond (john.hammond@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/29177&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/29177&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9983&quot; title=&quot;LBUG llog_osd.c:327:llog_osd_declare_write_rec() - all DNE MDS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9983&quot;&gt;&lt;del&gt;LU-9983&lt;/del&gt;&lt;/a&gt; llog: revert &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9848&quot; title=&quot;LBUG: ASSERTION( len &amp;gt;= (24) &amp;amp;&amp;amp; (len &amp;amp; 0x7) == 0 ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9848&quot;&gt;&lt;del&gt;LU-9848&lt;/del&gt;&lt;/a&gt;&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_10&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 1c11b55cc8d600db09bd7502b01bf8515821a9a3&lt;/p&gt;</comment>
                            <comment id="209348" author="cliffw" created="Sat, 23 Sep 2017 00:16:27 +0000"  >&lt;p&gt;Loaded that build, immediately hit LBUG.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Sep 23 00:09:42 soak-9 kernel: LustreError: 2294:0:(layout.c:2085:__req_capsule_get()) @@@ Wrong buffer &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; field `object_update_reply&lt;span class=&quot;code-quote&quot;&gt;&apos; (1 of 1) in format `OUT_UPDATE&apos;&lt;/span&gt;: 0 vs. 4096 (server)#012  req@ffff8808103d1800 x1579283258584816/t0(0) o1000-&amp;gt;soaked-MDT0000-osp-MDT0001@192.168.1.108@o2ib:24/4 lens 392/192 e 0 to 0 dl 1506125390 ref 2 fl Interpret:RM/0/0 rc -71/-71
Sep 23 00:09:53 soak-9 kernel: LustreError: 2351:0:(llog_osd.c:327:llog_osd_declare_write_rec()) ASSERTION( rec ) failed:
Sep 23 00:09:53 soak-9 kernel: LustreError: 2351:0:(llog_osd.c:327:llog_osd_declare_write_rec()) LBUG
Sep 23 00:09:53 soak-9 kernel: Pid: 2351, comm: dist_txn-1
Sep 23 00:09:53 soak-9 kernel: #012Call Trace:
Sep 23 00:09:53 soak-9 kernel: [&amp;lt;ffffffffc08907ae&amp;gt;] libcfs_call_trace+0x4e/0x60 [libcfs]
Sep 23 00:09:53 soak-9 kernel: [&amp;lt;ffffffffc089083c&amp;gt;] lbug_with_loc+0x4c/0xb0 [libcfs]
Sep 23 00:09:53 soak-9 kernel: [&amp;lt;ffffffffc0a16b75&amp;gt;] llog_osd_declare_write_rec+0x3c5/0x3d0 [obdclass]
Sep 23 00:09:53 soak-9 kernel: [&amp;lt;ffffffffc0a0b0e4&amp;gt;] llog_declare_write_rec+0x84/0x200 [obdclass]
Sep 23 00:09:53 soak-9 kernel: [&amp;lt;ffffffffc0a0b862&amp;gt;] llog_cancel_rec+0xe2/0x870 [obdclass]
Sep 23 00:09:53 soak-9 kernel: [&amp;lt;ffffffffc0a1233a&amp;gt;] llog_cat_cancel_records+0x13a/0x2e0 [obdclass]
Sep 23 00:09:53 soak-9 kernel: [&amp;lt;ffffffffc089bbc7&amp;gt;] ? libcfs_debug_msg+0x57/0x80 [libcfs]
Sep 23 00:09:54 soak-9 kernel: [&amp;lt;ffffffffc0cf2139&amp;gt;] distribute_txn_commit_thread+0x599/0xca0 [ptlrpc]
Sep 23 00:09:54 soak-9 kernel: [&amp;lt;ffffffff810c4810&amp;gt;] ? default_wake_function+0x0/0x20
Sep 23 00:09:54 soak-9 kernel: [&amp;lt;ffffffffc0cf1ba0&amp;gt;] ? distribute_txn_commit_thread+0x0/0xca0 [ptlrpc]
Sep 23 00:09:54 soak-9 kernel: [&amp;lt;ffffffff810b098f&amp;gt;] kthread+0xcf/0xe0
Sep 23 00:09:54 soak-9 kernel: [&amp;lt;ffffffff810b08c0&amp;gt;] ? kthread+0x0/0xe0
Sep 23 00:09:54 soak-9 kernel: [&amp;lt;ffffffff816b4f58&amp;gt;] ret_from_fork+0x58/0x90
Sep 23 00:09:54 soak-9 kernel: [&amp;lt;ffffffff810b08c0&amp;gt;] ? kthread+0x0/0xe0
Sep 23 00:09:54 soak-9 kernel:
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="209349" author="cliffw" created="Sat, 23 Sep 2017 00:16:45 +0000"  >&lt;p&gt;Will restart with panic_on_lbug=0&lt;/p&gt;</comment>
                            <comment id="209351" author="di.wang" created="Sat, 23 Sep 2017 01:58:02 +0000"  >&lt;p&gt;This actually include 2 bugs.&lt;/p&gt;

&lt;p&gt;1. Bulk transfer between MDTs is somehow screwed up, so those update requests cannot pass the audit. &lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Sep 13 15:53:37 soak-8 kernel: LustreError: 4873:0:(out_handler.c:597:out_write()) soaked-MDT0000: empty or wrong size 0 pos: rc = -61
Sep 13 15:53:37 soak-8 kernel: LustreError: 4990:0:(out_handler.c:1000:out_handle()) soaked-MDT0000: invalid update buffer magic 0 expect bdde0002: rc = -71
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;2. When the master MDT gets a failure of the remote request caused by 1, there seems to be a race between invalidating the request and cancelling the log. &lt;/p&gt;

&lt;p&gt;2 seems easy to fix, but 1 is the major issue here. Did we land something recently that might break the bulk transfer between MDTs?&lt;/p&gt;
</comment>
                            <comment id="209354" author="gerrit" created="Sat, 23 Sep 2017 06:15:06 +0000"  >&lt;p&gt;wangdi (di.wang@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/29178&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/29178&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9983&quot; title=&quot;LBUG llog_osd.c:327:llog_osd_declare_write_rec() - all DNE MDS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9983&quot;&gt;&lt;del&gt;LU-9983&lt;/del&gt;&lt;/a&gt; llog: skip non-exist log cancellation&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_10&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 7e54ccd5d638cddd77d1aee76aeeae0e26361974&lt;/p&gt;</comment>
                            <comment id="209356" author="gerrit" created="Sat, 23 Sep 2017 06:36:23 +0000"  >&lt;p&gt;wangdi (di.wang@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/29179&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/29179&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9983&quot; title=&quot;LBUG llog_osd.c:327:llog_osd_declare_write_rec() - all DNE MDS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9983&quot;&gt;&lt;del&gt;LU-9983&lt;/del&gt;&lt;/a&gt; kernel: revert kernel upgrade&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_10&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 144226402da2a74b1ab5391393ba4ff64962d11a&lt;/p&gt;</comment>
                            <comment id="209357" author="di.wang" created="Sat, 23 Sep 2017 06:39:42 +0000"  >&lt;p&gt; &lt;a href=&quot;https://review.whamcloud.com/29178&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/29178&lt;/a&gt; is the fix for issue 2.&lt;/p&gt;

&lt;p&gt;Unfortunately, I am still not sure what causes issue 1. It is also very interesting: according to the debug log, the update requests are still valid when packing the RPC, but somehow on the server side the magic number of those requests is reset to 0.&lt;/p&gt;

&lt;p&gt;On the source MDT, see the magic values:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000004:00080000:15.0:1506127961.257426:0:3000:0:(osp_trans.c:1330:osp_get_next_request()) ou ffff88042ece15a0 version 3 rpc_version 3
00000004:00000001:15.0:1506127961.257444:0:3000:0:(osp_trans.c:1119:osp_send_update_req()) Process entered
00000004:00000001:15.0:1506127961.257446:0:3000:0:(osp_trans.c:361:osp_prep_update_req()) Process entered
00000004:00000040:15.0:1506127961.257448:0:3000:0:(osp_trans.c:272:object_update_request_dump()) i = 0 fid = [0x2c0003ab2:0x2:0x0] op = create params = 1 batchid = 0 size = 256 repsize 0
00000004:00000040:15.0:1506127961.257453:0:3000:0:(osp_trans.c:272:object_update_request_dump()) i = 1 fid = [0x2c0003ab2:0x2:0x0] op = ref_add params = 0 batchid = 0 size = 40 repsize 0
00000004:00000040:15.0:1506127961.257456:0:3000:0:(osp_trans.c:272:object_update_request_dump()) i = 2 fid = [0x2c0003ab2:0x2:0x0] op = insert params = 3 batchid = 0 size = 96 repsize 0
00000004:00000040:15.0:1506127961.257458:0:3000:0:(osp_trans.c:272:object_update_request_dump()) i = 3 fid = [0x2c0003ab2:0x2:0x0] op = insert params = 3 batchid = 0 size = 96 repsize 0
00000004:00000040:15.0:1506127961.257461:0:3000:0:(osp_trans.c:272:object_update_request_dump()) i = 4 fid = [0x2c0003ab2:0x2:0x0] op = xattr_set params = 3 batchid = 0 size = 144 repsize 0
00000004:00000040:15.0:1506127961.257464:0:3000:0:(osp_trans.c:272:object_update_request_dump()) i = 5 fid = [0x2c0003ab2:0x2:0x0] op = xattr_set params = 3 batchid = 0 size = 160 repsize 0
00000004:00000040:15.0:1506127961.257467:0:3000:0:(osp_trans.c:272:object_update_request_dump()) i = 6 fid = [0x2c0003ab2:0x2:0x0] op = xattr_set params = 3 batchid = 0 size = 96 repsize 0
00000004:00000040:15.0:1506127961.257469:0:3000:0:(osp_trans.c:272:object_update_request_dump()) i = 7 fid = [0x2c0003ab2:0x3:0x0] op = create params = 1 batchid = 0 size = 256 repsize 0
00000004:00000040:15.0:1506127961.257472:0:3000:0:(osp_trans.c:272:object_update_request_dump()) i = 8 fid = [0x20000000a:0x3:0x0] op = insert params = 3 batchid = 0 size = 112 repsize 0
00000004:00000040:15.0:1506127961.257475:0:3000:0:(osp_trans.c:278:object_update_request_dump()) updates = ffff8804b92e0000 magic = bdde0002 count = 9 size = 1256
00000004:00000040:15.0:1506127961.257478:0:3000:0:(osp_trans.c:272:object_update_request_dump()) i = 0 fid = [0x2c0003ab2:0x1:0x0] op = write params = 2 batchid = 0 size = 32832 repsize 0
00000004:00000040:15.0:1506127961.257482:0:3000:0:(osp_trans.c:278:object_update_request_dump()) updates = ffff880806030000 magic = bdde0002 count = 1 size = 32832
00000004:00000040:15.0:1506127961.257484:0:3000:0:(osp_trans.c:272:object_update_request_dump()) i = 0 fid = [0x2c0003ab2:0x1:0x0] op = write params = 2 batchid = 0 size = 128 repsize 0
00000004:00000040:15.0:1506127961.257487:0:3000:0:(osp_trans.c:278:object_update_request_dump()) updates = ffff8808067f6000 magic = bdde0002 count = 1 size = 128
00000004:00000040:15.0:1506127961.257489:0:3000:0:(osp_trans.c:272:object_update_request_dump()) i = 0 fid = [0x2c0003ab2:0x3:0x0] op = write params = 2 batchid = 0 size = 32832 repsize 0
00000004:00000040:15.0:1506127961.257492:0:3000:0:(osp_trans.c:278:object_update_request_dump()) updates = ffff880806040000 magic = bdde0002 count = 1 size = 32832
00000004:00000040:15.0:1506127961.257494:0:3000:0:(osp_trans.c:272:object_update_request_dump()) i = 0 fid = [0x2c0003ab2:0x3:0x0] op = write params = 2 batchid = 0 size = 2696 repsize 0
00000004:00000040:15.0:1506127961.257497:0:3000:0:(osp_trans.c:278:object_update_request_dump()) updates = ffff8808067f7000 magic = bdde0002 count = 1 size = 2696
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;


&lt;p&gt;But on the receiver, the magic has been changed to zero.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000800:00000200:10.0:1506127961.258292:0:3044:0:(o2iblnd_cb.c:1386:kiblnd_launch_tx()) conn[ffff88082dfb7200] (20)++
00000800:00000200:10.0:1506127961.258294:0:3044:0:(o2iblnd_cb.c:1156:kiblnd_queue_tx_locked()) conn[ffff88082dfb7200] (21)++
00000800:00000200:10.0:1506127961.258299:0:3044:0:(o2iblnd_cb.c:1392:kiblnd_launch_tx()) conn[ffff88082dfb7200] (22)--
00000100:00000200:10.0:1506127961.258302:0:3044:0:(niobuf.c:262:ptlrpc_start_bulk_transfer()) Transferring 5 pages 77970 bytes via portal 14 id 12345-192.168.1.109@o2ib mbits 0x59c5acbf08b80-0x59c5acbf08b80
00000100:00000001:10.0:1506127961.258306:0:3044:0:(niobuf.c:264:ptlrpc_start_bulk_transfer()) Process leaving (rc=0 : 0 : 0)
00010000:00000001:10.0:1506127961.258595:0:3044:0:(ldlm_lib.c:3268:target_bulk_io()) Process leaving (rc=0 : 0 : 0)
00000020:00020000:10.0:1506127961.258600:0:3044:0:(out_handler.c:1000:out_handle()) soaked-MDT0003: invalid update buffer magic 0 expect bdde0002: rc = -71
00000020:00000001:10.0:1506127961.279044:0:3044:0:(out_handler.c:1001:out_handle()) Process leaving via out_free (rc=18446744073709547449 : -4167 : 0xffffffffffffefb9)
00000020:00000010:10.0:1506127961.279048:0:3044:0:(out_handler.c:1165:out_handle()) kfreed &apos;update_bufs[i]&apos;: 4096 at ffff88083fcb1000.
00000020:00000010:10.0:1506127961.279066:0:3044:0:(out_handler.c:1165:out_handle()) kfreed &apos;update_bufs[i]&apos;: 32841 at ffff88080f180000.
00000020:00000010:10.0:1506127961.279086:0:3044:0:(out_handler.c:1165:out_handle()) kfreed &apos;update_bufs[i]&apos;: 4096 at ffff88080f54f000.
00000020:00000010:10.0:1506127961.279090:0:3044:0:(out_handler.c:1165:out_handle()) kfreed &apos;update_bufs[i]&apos;: 32841 at ffff88080f190000.
00000020:00000010:10.0:1506127961.279115:0:3044:0:(out_handler.c:1165:out_handle()) kfreed &apos;update_bufs[i]&apos;: 4096 at ffff880817a78000.
00000020:00000010:10.0:1506127961.279117:0:3044:0:(out_handler.c:1169:out_handle()) kfreed &apos;update_bufs&apos;: 40 at ffff88042b7fc340.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Sarah told me this failure did not happen for build 18,  whose top is&lt;br/&gt;
           &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9903&quot; title=&quot;kernel update [RHEL6.9 2.6.32-696.10.1.el6]&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9903&quot;&gt;&lt;del&gt;LU-9903&lt;/del&gt;&lt;/a&gt; kernel: kernel update RHEL6.9 &lt;span class=&quot;error&quot;&gt;&amp;#91;2.6.32-696.10.1.el6&amp;#93;&lt;/span&gt; &lt;/p&gt;

&lt;p&gt;So I checked all of the commits since &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9725&quot; title=&quot;Mount commands don&amp;#39;t return for targets in LFS with DNE and 3 MDTs &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9725&quot;&gt;&lt;del&gt;LU-9725&lt;/del&gt;&lt;/a&gt;; the most suspicious patches are the kernel upgrades, so I reverted them in &lt;a href=&quot;https://review.whamcloud.com/#/c/29179/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/29179/&lt;/a&gt;. Cliff: could you please try this? Thanks.&lt;/p&gt;




</comment>
                            <comment id="209373" author="cliffw" created="Sun, 24 Sep 2017 17:29:07 +0000"  >&lt;p&gt;That patch produced the old kernel, (3.10.0-514.26.2) which doesn&apos;t work with CENTOS 7.4, can you re-base on a current kernel? (3.10.0-693.2.2)&lt;br/&gt;
I just finished up-grading soak to 7.4, really don&apos;t want to downgrade. &lt;/p&gt;</comment>
                            <comment id="209375" author="gerrit" created="Mon, 25 Sep 2017 00:16:33 +0000"  >&lt;p&gt;wangdi (di.wang@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/29185&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/29185&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9983&quot; title=&quot;LBUG llog_osd.c:327:llog_osd_declare_write_rec() - all DNE MDS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9983&quot;&gt;&lt;del&gt;LU-9983&lt;/del&gt;&lt;/a&gt; lustre: performance v1&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 16343af007dc503f38926fae3a7ebd4e7d66e5a0&lt;/p&gt;</comment>
                            <comment id="209376" author="gerrit" created="Mon, 25 Sep 2017 00:22:38 +0000"  >&lt;p&gt;wangdi (di.wang@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/29186&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/29186&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9983&quot; title=&quot;LBUG llog_osd.c:327:llog_osd_declare_write_rec() - all DNE MDS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9983&quot;&gt;&lt;del&gt;LU-9983&lt;/del&gt;&lt;/a&gt; lustre: 9983 try v2&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: d7a390d2f9bb337476c8a1a2b20b524ebc11e7c9&lt;/p&gt;</comment>
                            <comment id="209377" author="di.wang" created="Mon, 25 Sep 2017 00:26:34 +0000"  >&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt; can you re-base on a current kernel? (3.10.0-693.2.2)
I just finished up-grading soak to 7.4, really don&apos;t want to downgrade.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Ok, then let&apos;s use bisect to find out which patch caused this issue. Could you please try patches 29185 and 29186? And please remove the update log before re-running. Thanks.&lt;/p&gt;</comment>
                            <comment id="209422" author="cliffw" created="Mon, 25 Sep 2017 14:38:32 +0000"  >&lt;p&gt;Sunday, I re-loaded build 26 and reformatted without DNE. Soak has run for 14 hours without any issues. &lt;/p&gt;</comment>
                            <comment id="209445" author="di.wang" created="Mon, 25 Sep 2017 17:04:50 +0000"  >&lt;p&gt;Cliff tried 29185 and it also hit this LBUG. Build 29185 is basically build 18 + 2 kernel upgrade patches,  &lt;br/&gt;
       &lt;a href=&quot;https://review.whamcloud.com/29002&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/29002&lt;/a&gt;&lt;br/&gt;
       &lt;a href=&quot;https://review.whamcloud.com/28532&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/28532&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Since we did not hit this problem with build 18, it seems the kernel upgrade is the culprit here (somehow it breaks the bulk transfer between MDTs)?&lt;/p&gt;

&lt;p&gt;I will upload the client/server debug log here, could some network guys have a look?&lt;/p&gt;</comment>
                            <comment id="209452" author="adilger" created="Mon, 25 Sep 2017 17:46:52 +0000"  >&lt;p&gt;Cliff will also try running IOR with data verification (which we should probably be running on soak all the time), so we can verify if bulk transfers of data are working properly or not.  This would help isolate if there are problems with the core bulk data transfer mechanism (which would point more in the direction of LNet or OFED changes), or if this is isolated to OUT.&lt;/p&gt;

&lt;p&gt;Di, it is also bad if the MDS is LASSERTing on data that it got over the network.  It should only report an error in this case.&lt;/p&gt;</comment>
                            <comment id="209458" author="di.wang" created="Mon, 25 Sep 2017 17:50:10 +0000"  >&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Di, it is also bad if the MDS is LASSERTing on data that it got over the network. It should only report an error in this case.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Oh, I posted a patch to fix this LASSERT  &lt;a href=&quot;https://review.whamcloud.com/29178&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/29178&lt;/a&gt; , please check. thanks.&lt;/p&gt;</comment>
                            <comment id="209460" author="cliffw" created="Mon, 25 Sep 2017 17:56:30 +0000"  >&lt;p&gt;Loaded patch 29185 - immediate LBUG. loading next patch&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Sep 25 16:22:16 soak-10 kernel: LustreError: 13509:0:(llog_osd.c:327:llog_osd_declare_write_rec()) ASSERTION( rec ) failed:
Sep 25 16:22:16 soak-10 kernel: LustreError: 13509:0:(llog_osd.c:327:llog_osd_declare_write_rec()) LBUG
Sep 25 16:22:16 soak-10 kernel: Pid: 13509, comm: dist_txn-2
Sep 25 16:22:16 soak-10 kernel: #012Call Trace:
Sep 25 16:22:16 soak-10 kernel: [&amp;lt;ffffffffc09187ae&amp;gt;] libcfs_call_trace+0x4e/0x60 [libcfs]
Sep 25 16:22:16 soak-10 kernel: [&amp;lt;ffffffffc091883c&amp;gt;] lbug_with_loc+0x4c/0xb0 [libcfs]
Sep 25 16:22:16 soak-10 kernel: [&amp;lt;ffffffffc0a3bac5&amp;gt;] llog_osd_declare_write_rec+0x3c5/0x3d0 [obdclass]
Sep 25 16:22:16 soak-10 kernel: [&amp;lt;ffffffffc0a300e4&amp;gt;] llog_declare_write_rec+0x84/0x200 [obdclass]
Sep 25 16:22:16 soak-10 kernel: [&amp;lt;ffffffffc0a30862&amp;gt;] llog_cancel_rec+0xe2/0x870 [obdclass]
Sep 25 16:22:16 soak-10 kernel: [&amp;lt;ffffffffc0a3728a&amp;gt;] llog_cat_cancel_records+0x13a/0x2e0 [obdclass]
Sep 25 16:22:16 soak-10 kernel: [&amp;lt;ffffffffc0923bc7&amp;gt;] ? libcfs_debug_msg+0x57/0x80 [libcfs]
Sep 25 16:22:17 soak-10 kernel: [&amp;lt;ffffffffc0d170e9&amp;gt;] distribute_txn_commit_thread+0x599/0xca0 [ptlrpc]
Sep 25 16:22:17 soak-10 kernel: [&amp;lt;ffffffff810c4810&amp;gt;] ? default_wake_function+0x0/0x20
Sep 25 16:22:17 soak-10 kernel: [&amp;lt;ffffffffc0d16b50&amp;gt;] ? distribute_txn_commit_thread+0x0/0xca0 [ptlrpc]
Sep 25 16:22:17 soak-10 kernel: [&amp;lt;ffffffff810b098f&amp;gt;] kthread+0xcf/0xe0
Sep 25 16:22:17 soak-10 kernel: [&amp;lt;ffffffff810b08c0&amp;gt;] ? kthread+0x0/0xe0
Sep 25 16:22:17 soak-10 kernel: [&amp;lt;ffffffff816b4f18&amp;gt;] ret_from_fork+0x58/0x90
Sep 25 16:22:17 soak-10 kernel: [&amp;lt;ffffffff810b08c0&amp;gt;] ? kthread+0x0/0xe0
Sep 25 16:22:17 soak-10 kernel:
Sep 25 16:22:17 soak-10 kernel: Kernel panic - not syncing: LBUG
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="209461" author="cliffw" created="Mon, 25 Sep 2017 17:57:11 +0000"  >&lt;p&gt;Errors from soak-8&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Sep 25 16:21:41 soak-8 systemd: Removed slice User Slice of root.
Sep 25 16:21:41 soak-8 systemd: Stopping User Slice of root.
Sep 25 16:22:05 soak-8 kernel: LustreError: 13225:0:(out_handler.c:597:out_write()) soaked-MDT0000: empty or wrong size 0 pos: rc = -61
Sep 25 16:22:05 soak-8 kernel: LustreError: 13706:0:(out_handler.c:1000:out_handle()) soaked-MDT0000: invalid update buffer magic 0 expect bdde0002: rc = -71
Sep 25 16:22:47 soak-8 sshd[13757]: error: Could not load host key: /etc/ssh/ssh_host_dsa_key
Sep 25 16:22:47 soak-8 sshd[13757]: Accepted publickey &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; root from 10.10.1.116 port 50002 ssh2: RSA SHA256:VGwjPuk53LIsLKjhGizbClh9X4HNRiAOs+XaQdKAWxM
Sep 25 16:22:47 soak-8 systemd: Created slice User Slice of root.
Sep 25 16:22:47 soak-8 systemd: Starting User Slice of root.
Sep 25 16:22:47 soak-8 systemd-logind: New session 26 of user root.
Sep 25 16:22:47 soak-8 systemd: Started Session 26 of user root.
Sep 25 16:22:47 soak-8 systemd: Starting Session 26 of user root.
Sep 25 16:22:47 soak-8 sshd[13757]: pam_unix(sshd:session): session opened &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; user root by (uid=0)
Sep 25 16:22:47 soak-8 sshd[13757]: pam_unix(sshd:session): session closed &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; user root
Sep 25 16:22:47 soak-8 systemd-logind: Removed session 26.
Sep 25 16:22:47 soak-8 systemd: Removed slice User Slice of root.
Sep 25 16:22:47 soak-8 systemd: Stopping User Slice of root.
Sep 25 16:22:49 soak-8 kernel: Lustre: 12799:0:(client.c:2114:ptlrpc_expire_one_request()) @@@ Request sent has timed out &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; sent delay: [sent 1506356561/real 0]  req@ffff8803f3a70f00 x1579526007953840/t0(0) o400-&amp;gt;soaked-MDT0002-osp-MDT0000@192.168.1.110@o2ib:24/4 lens 224/224 e 0 to 1 dl 1506356568 ref 2 fl Rpc:XN/0/ffffffff rc 0/-1
Sep 25 16:22:49 soak-8 kernel: Lustre: soaked-MDT0003-osp-MDT0000: Connection to soaked-MDT0003 (at 192.168.1.111@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Sep 25 16:22:49 soak-8 kernel: Lustre: 12799:0:(client.c:2114:ptlrpc_expire_one_request()) Skipped 1 previous similar message
Sep 25 16:22:50 soak-8 sshd[13780]: error: Could not load host key: /etc/ssh/ssh_host_dsa_key
Sep 25 16:22:50 soak-8 sshd[13780]: Accepted publickey &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; root from 10.10.1.116 port 50026 ssh2: RSA SHA256:VGwjPuk53LIsLKjhGizbClh9X4HNRiAOs+XaQdKAWxM
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="209463" author="cliffw" created="Mon, 25 Sep 2017 18:11:52 +0000"  >&lt;p&gt;Attempt to run the second patch with IOR and data verification. Removed all old update_logs from MDTs. Soak started 2017-09-25 18:02:05. One IOR single-shared-file test completed without error. soak-9 and soak-11 immediately crashed: &amp;lt;15 seconds after start of test. &lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;soak-9.log:Sep 25 18:02:17 soak-9 kernel: LustreError: 2686:0:(llog_osd.c:327:llog_osd_declare_write_rec()) LBUG
Sep 25 18:01:01 soak-9 systemd: Stopping User Slice of root.
Sep 25 18:02:06 soak-9 kernel: LustreError: 2615:0:(out_handler.c:597:out_write()) soaked-MDT0001: empty or wrong size 0 pos: rc = -61
Sep 25 18:02:06 soak-9 kernel: LustreError: 11-0: soaked-MDT0000-osp-MDT0001: operation out_update to node 192.168.1.108@o2ib failed: rc = -71
Sep 25 18:02:06 soak-9 kernel: LustreError: 2630:0:(layout.c:2085:__req_capsule_get()) @@@ Wrong buffer &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; field `object_update_reply&lt;span class=&quot;code-quote&quot;&gt;&apos; (1 of 1) in format `OUT_UPDATE&apos;&lt;/span&gt;: 0 vs. 4096 (server)#012  req@ffff88080daa3900 x1579533157282240/t0(0) o1000-&amp;gt;soaked-MDT0000-osp-MDT0001@192.168.1.108@o2ib:24/4 lens 392/192 e 0 to 0 dl 1506362537 ref 2 fl Interpret:RM/0/0 rc -71/-71
Sep 25 18:02:17 soak-9 kernel: LustreError: 2686:0:(llog_osd.c:327:llog_osd_declare_write_rec()) ASSERTION( rec ) failed:
Sep 25 18:02:17 soak-9 kernel: LustreError: 2686:0:(llog_osd.c:327:llog_osd_declare_write_rec()) LBUG
Sep 25 18:02:17 soak-9 kernel: Pid: 2686, comm: dist_txn-1
Sep 25 18:02:17 soak-9 kernel: #012Call Trace:
Sep 25 18:02:17 soak-9 kernel: [&amp;lt;ffffffffc09097ae&amp;gt;] libcfs_call_trace+0x4e/0x60 [libcfs]
Sep 25 18:02:17 soak-9 kernel: [&amp;lt;ffffffffc090983c&amp;gt;] lbug_with_loc+0x4c/0xb0 [libcfs]
Sep 25 18:02:17 soak-9 kernel: [&amp;lt;ffffffffc0a93ac5&amp;gt;] llog_osd_declare_write_rec+0x3c5/0x3d0 [obdclass]
Sep 25 18:02:17 soak-9 kernel: [&amp;lt;ffffffffc0a880e4&amp;gt;] llog_declare_write_rec+0x84/0x200 [obdclass]
Sep 25 18:02:17 soak-9 kernel: [&amp;lt;ffffffffc0a88862&amp;gt;] llog_cancel_rec+0xe2/0x870 [obdclass]
Sep 25 18:02:17 soak-9 kernel: [&amp;lt;ffffffffc0a8f28a&amp;gt;] llog_cat_cancel_records+0x13a/0x2e0 [obdclass]
Sep 25 18:02:17 soak-9 kernel: [&amp;lt;ffffffffc0914bc7&amp;gt;] ? libcfs_debug_msg+0x57/0x80 [libcfs]
Sep 25 18:02:17 soak-9 kernel: [&amp;lt;ffffffffc0d20139&amp;gt;] distribute_txn_commit_thread+0x599/0xca0 [ptlrpc]
Sep 25 18:02:17 soak-9 kernel: [&amp;lt;ffffffff810c12c8&amp;gt;] ? check_preempt_curr+0x78/0xa0
Sep 25 18:02:17 soak-9 kernel: [&amp;lt;ffffffff810c4810&amp;gt;] ? default_wake_function+0x0/0x20
Sep 25 18:02:17 soak-9 kernel: [&amp;lt;ffffffffc0d1fba0&amp;gt;] ? distribute_txn_commit_thread+0x0/0xca0 [ptlrpc]
Sep 25 18:02:17 soak-9 kernel: [&amp;lt;ffffffff810b098f&amp;gt;] kthread+0xcf/0xe0
Sep 25 18:02:17 soak-9 kernel: [&amp;lt;ffffffff810b08c0&amp;gt;] ? kthread+0x0/0xe0
Sep 25 18:02:17 soak-9 kernel: [&amp;lt;ffffffff816b4f58&amp;gt;] ret_from_fork+0x58/0x90
Sep 25 18:02:17 soak-9 kernel: [&amp;lt;ffffffff810b08c0&amp;gt;] ? kthread+0x0/0xe0
Sep 25 18:02:17 soak-9 kernel:
Sep 25 18:02:17 soak-9 kernel: Kernel panic - not syncing: LBUG
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Next steps?&lt;/p&gt;</comment>
                            <comment id="209464" author="cliffw" created="Mon, 25 Sep 2017 18:14:55 +0000"  >&lt;p&gt;Per request, lustre-log from soak-8 attached. &lt;/p&gt;</comment>
                            <comment id="209465" author="di.wang" created="Mon, 25 Sep 2017 18:15:42 +0000"  >&lt;p&gt;Ah, so this seems to be isolated to OUT. Hmm, let me add more debug logging.&lt;/p&gt;</comment>
                            <comment id="209475" author="gerrit" created="Mon, 25 Sep 2017 19:07:35 +0000"  >&lt;p&gt;wangdi (di.wang@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/29202&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/29202&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9983&quot; title=&quot;LBUG llog_osd.c:327:llog_osd_declare_write_rec() - all DNE MDS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9983&quot;&gt;&lt;del&gt;LU-9983&lt;/del&gt;&lt;/a&gt; out: try 3&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_10&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 4e13d3da5beffabe3cb69268349c366acd36a825&lt;/p&gt;</comment>
                            <comment id="209499" author="gerrit" created="Mon, 25 Sep 2017 22:48:53 +0000"  >&lt;p&gt;wangdi (di.wang@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/29207&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/29207&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9983&quot; title=&quot;LBUG llog_osd.c:327:llog_osd_declare_write_rec() - all DNE MDS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9983&quot;&gt;&lt;del&gt;LU-9983&lt;/del&gt;&lt;/a&gt; out: try 3&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 6fcce248254b265f7521deaacb300420445e826b&lt;/p&gt;

&lt;p&gt;Please ignore this&lt;/p&gt;</comment>
                            <comment id="209502" author="di.wang" created="Mon, 25 Sep 2017 22:59:58 +0000"  >&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Cliff will also try running IOR with data verification (which we should probably be running on soak all the time), so we can verify if bulk transfers of data are working properly or not. This would help isolate if there are problems with the core bulk data transfer mechanism (which would point more in the direction of LNet or OFED changes), or if this is isolated to OUT.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Note: bulk transfers between MDTs use different iovec ops than OSC &amp;lt;--&amp;gt; OST. See &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5835&quot; title=&quot;Introduce the IOVEC in ptlrpc&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5835&quot;&gt;&lt;del&gt;LU-5835&lt;/del&gt;&lt;/a&gt; &lt;a href=&quot;http://review.whamcloud.com/12525/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/12525/&lt;/a&gt;  &lt;/p&gt;

&lt;p&gt;If bulk transfers between OSC and OST prove to be correct, then maybe we should check those iovec ops for MDTs.&lt;/p&gt;</comment>
                            <comment id="209506" author="gerrit" created="Tue, 26 Sep 2017 00:28:27 +0000"  >&lt;p&gt;wangdi (di.wang@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/29208&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/29208&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9983&quot; title=&quot;LBUG llog_osd.c:327:llog_osd_declare_write_rec() - all DNE MDS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9983&quot;&gt;&lt;del&gt;LU-9983&lt;/del&gt;&lt;/a&gt; osp: align the OSP request size by 4k&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_10&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: d741a50ea589969c92685b0fff63e22488b44bb4&lt;/p&gt;

&lt;p&gt;This patch avoids the LBUG, but the root cause still needs to be understood. It looks like PTLRPC (or LNet with the RHEL 7.4 kernel) cannot handle non-page-size-aligned buffers on the receiver side. &lt;/p&gt;</comment>
                            <comment id="209524" author="cliffw" created="Tue, 26 Sep 2017 04:16:23 +0000"  >&lt;p&gt;Previous patch (build 50791) ran 4 hours in soak without LBUG. Switching to &lt;a href=&quot;https://review.whamcloud.com/#/c/29208/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/29208/&lt;/a&gt; - build 50793&lt;/p&gt;</comment>
                            <comment id="209606" author="gerrit" created="Tue, 26 Sep 2017 17:18:47 +0000"  >&lt;p&gt;wangdi (di.wang@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/29218&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/29218&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9983&quot; title=&quot;LBUG llog_osd.c:327:llog_osd_declare_write_rec() - all DNE MDS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9983&quot;&gt;&lt;del&gt;LU-9983&lt;/del&gt;&lt;/a&gt; llog: skip non-exist log cancellation&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 4df15f1acd5ccdb0d5628a2ce620d03d90ea9171&lt;/p&gt;</comment>
                            <comment id="209654" author="gerrit" created="Tue, 26 Sep 2017 20:25:26 +0000"  >&lt;p&gt;Amir Shehata (amir.shehata@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/29222&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/29222&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9983&quot; title=&quot;LBUG llog_osd.c:327:llog_osd_declare_write_rec() - all DNE MDS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9983&quot;&gt;&lt;del&gt;LU-9983&lt;/del&gt;&lt;/a&gt; lnet: test to see if this is the cause&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 6f109b6b5d3fadd077df0414292c52f33659c6e0&lt;/p&gt;</comment>
                            <comment id="209802" author="di.wang" created="Thu, 28 Sep 2017 03:13:45 +0000"  >&lt;p&gt;Finally, I got two nodes with IB in Onyx, and can reproduce the issue easily. It also works fine with the tcp LND, so it is definitely an O2ib issue.&lt;/p&gt;

&lt;p&gt;If needed, I can ask Sarah to test it in RHEL7.3 to see if it works there.&lt;/p&gt;</comment>
                            <comment id="209803" author="gerrit" created="Thu, 28 Sep 2017 04:18:55 +0000"  >&lt;p&gt;wangdi (di.wang@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/29238&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/29238&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9983&quot; title=&quot;LBUG llog_osd.c:327:llog_osd_declare_write_rec() - all DNE MDS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9983&quot;&gt;&lt;del&gt;LU-9983&lt;/del&gt;&lt;/a&gt; llog: skip non-exist log cancellation&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_10&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 27fe551a1c18e8d90c2c04ebb5a2dee315b26321&lt;/p&gt;</comment>
                            <comment id="209804" author="gerrit" created="Thu, 28 Sep 2017 04:39:52 +0000"  >&lt;p&gt;wangdi (di.wang@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/29240&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/29240&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9983&quot; title=&quot;LBUG llog_osd.c:327:llog_osd_declare_write_rec() - all DNE MDS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9983&quot;&gt;&lt;del&gt;LU-9983&lt;/del&gt;&lt;/a&gt; osp: align the OSP request size by 4k&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: d6bc57faa857d2fb5353476afbf416cf3fde255f&lt;/p&gt;</comment>
                            <comment id="209877" author="cliffw" created="Thu, 28 Sep 2017 20:46:28 +0000"  >&lt;p&gt;Running  &lt;a href=&quot;https://build.hpdd.intel.com/job/lustre-reviews/50862/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://build.hpdd.intel.com/job/lustre-reviews/50862/&lt;/a&gt;, &lt;/p&gt;
&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;connection drops have stopped&lt;/li&gt;
	&lt;li&gt;random job failures have stopped.&lt;/li&gt;
	&lt;li&gt;so far no LBUGS.&lt;br/&gt;
This build is way better than the previous tests. &lt;/li&gt;
&lt;/ul&gt;
</comment>
                            <comment id="209982" author="gerrit" created="Fri, 29 Sep 2017 19:26:34 +0000"  >&lt;p&gt;John L. Hammond (john.hammond@intel.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/29208/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/29208/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9983&quot; title=&quot;LBUG llog_osd.c:327:llog_osd_declare_write_rec() - all DNE MDS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9983&quot;&gt;&lt;del&gt;LU-9983&lt;/del&gt;&lt;/a&gt; osp: align the OSP request size by 4k&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_10&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: c535d7a4021e3c520246bdde118be97414938266&lt;/p&gt;</comment>
                            <comment id="209986" author="gerrit" created="Fri, 29 Sep 2017 20:04:50 +0000"  >&lt;p&gt;Bob Glossman (bob.glossman@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/29270&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/29270&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9983&quot; title=&quot;LBUG llog_osd.c:327:llog_osd_declare_write_rec() - all DNE MDS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9983&quot;&gt;&lt;del&gt;LU-9983&lt;/del&gt;&lt;/a&gt; osp: align the OSP request size by 4k&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 9994585785eeb46fdb42b6b409764c03ea5a7651&lt;/p&gt;</comment>
                            <comment id="210047" author="cliffw" created="Mon, 2 Oct 2017 00:57:39 +0000"  >&lt;p&gt;Running the (almost) latest patch, seeing quite a few of these:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;/scratch/logs/syslog/soak-8.log:Oct  1 22:37:25 soak-8 kernel: LustreError: 8097:0:(mdt_lvb.c:163:mdt_lvbo_fill()) Skipped 4 previous similar messages
/scratch/logs/syslog/soak-8.log:Oct  1 22:37:25 soak-8 kernel: LustreError: 8097:0:(mdt_lvb.c:163:mdt_lvbo_fill()) soaked-MDT0000: expected 944 actual 416.
/scratch/logs/syslog/soak-9.log:Oct  1 22:42:25 soak-9 kernel: LustreError: 2165:0:(mdt_lvb.c:163:mdt_lvbo_fill()) Skipped 6 previous similar messages
/scratch/logs/syslog/soak-9.log:Oct  1 22:42:25 soak-9 kernel: LustreError: 2165:0:(mdt_lvb.c:163:mdt_lvbo_fill()) soaked-MDT0001: expected 872 actual 416.
/scratch/logs/syslog/soak-10.log:Oct  1 22:42:26 soak-10 kernel: LustreError: 2401:0:(mdt_lvb.c:163:mdt_lvbo_fill()) Skipped 10 previous similar messages
/scratch/logs/syslog/soak-10.log:Oct  1 22:42:26 soak-10 kernel: LustreError: 2401:0:(mdt_lvb.c:163:mdt_lvbo_fill()) soaked-MDT0002: expected 872 actual 416.
/scratch/logs/syslog/soak-10.log:Oct  1 22:42:26 soak-10 kernel: LustreError: 4181:0:(mdt_lvb.c:163:mdt_lvbo_fill()) soaked-MDT0002: expected 872 actual 416.
/scratch/logs/syslog/soak-10.log:Oct  1 22:44:04 soak-10 kernel: LustreError: 2351:0:(mdt_lvb.c:163:mdt_lvbo_fill()) soaked-MDT0002: expected 872 actual 416.
/scratch/logs/syslog/soak-9.log:Oct  1 22:44:04 soak-9 kernel: LustreError: 2351:0:(mdt_lvb.c:163:mdt_lvbo_fill()) soaked-MDT0001: expected 848 actual 416.
/scratch/logs/syslog/soak-10.log:Oct  1 22:57:27 soak-10 kernel: LustreError: 4296:0:(mdt_lvb.c:163:mdt_lvbo_fill()) Skipped 8 previous similar messages
/scratch/logs/syslog/soak-10.log:Oct  1 22:57:27 soak-10 kernel: LustreError: 4296:0:(mdt_lvb.c:163:mdt_lvbo_fill()) soaked-MDT0002: expected 872 actual 416.
/scratch/logs/syslog/soak-9.log:Oct  1 22:57:27 soak-9 kernel: LustreError: 2329:0:(mdt_lvb.c:163:mdt_lvbo_fill()) Skipped 9 previous similar messages
/scratch/logs/syslog/soak-9.log:Oct  1 22:57:27 soak-9 kernel: LustreError: 2329:0:(mdt_lvb.c:163:mdt_lvbo_fill()) soaked-MDT0001: expected 800 actual 416.
/scratch/logs/syslog/soak-9.log:Oct  1 22:59:06 soak-9 kernel: LustreError: 2357:0:(mdt_lvb.c:163:mdt_lvbo_fill()) soaked-MDT0001: expected 776 actual 416.

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;We see some timeouts/lustre-log dumps, but no LBUGs. Further details tomorrow, timeouts may be failover-related.&lt;/p&gt;</comment>
                            <comment id="210056" author="adilger" created="Mon, 2 Oct 2017 09:21:09 +0000"  >&lt;p&gt;I think the mdt_lvbo_fill() error is fallout from PFL and has been seen on master (&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9825&quot; title=&quot;Multiple errors on OST/MDS &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9825&quot;&gt;LU-9825&lt;/a&gt;). That said, it is so commonly printed as to make it very annoying, and IMHO something that should be fixed. Lai, could you please take a look.&lt;/p&gt;</comment>
                            <comment id="210058" author="pjones" created="Mon, 2 Oct 2017 10:41:31 +0000"  >&lt;p&gt;Lai will be out this week so is there someone else who could confirm that this is benign enough to live with in 2.10.1 or not?&lt;/p&gt;</comment>
                            <comment id="210134" author="gerrit" created="Mon, 2 Oct 2017 21:26:09 +0000"  >&lt;p&gt;John L. Hammond (john.hammond@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/29290&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/29290&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9983&quot; title=&quot;LBUG llog_osd.c:327:llog_osd_declare_write_rec() - all DNE MDS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9983&quot;&gt;&lt;del&gt;LU-9983&lt;/del&gt;&lt;/a&gt; ko2iblnd: allow for discontiguous fragments&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: cfce4303780fb6a12d2b08b784e333a2de7de07b&lt;/p&gt;</comment>
                            <comment id="210140" author="cliffw" created="Mon, 2 Oct 2017 22:22:43 +0000"  >&lt;p&gt;Should I move soak to this patch? &lt;/p&gt;</comment>
                            <comment id="210541" author="jgmitter" created="Fri, 6 Oct 2017 17:52:11 +0000"  >&lt;p&gt;Yes, please test the viability of John&apos;s patch as a fix for this issue.   It has been validated already by Amir/John on their side.  After a successful run at the soak scale, we can create a tmp patch to revert Di&apos;s fix and have only John&apos;s patch present for a final verification.&lt;/p&gt;</comment>
                            <comment id="210560" author="adilger" created="Fri, 6 Oct 2017 21:01:18 +0000"  >&lt;p&gt;I don&apos;t think we need to revert Di&apos;s patch, even if John&apos;s patch works. &lt;/p&gt;</comment>
                            <comment id="210564" author="cliffw" created="Fri, 6 Oct 2017 23:11:53 +0000"  >&lt;p&gt;I&apos;ll do this on Monday.&lt;/p&gt;</comment>
                            <comment id="210572" author="jgmitter" created="Sat, 7 Oct 2017 02:20:40 +0000"  >&lt;p&gt;Thanks Cliff.&lt;/p&gt;

&lt;p&gt;Di - do you agree with Andreas that your fix is useful independent of John&apos;s proposed fix?&lt;/p&gt;</comment>
                            <comment id="210574" author="di.wang" created="Sat, 7 Oct 2017 02:45:39 +0000"  >&lt;p&gt;Yes, Joe.  I agree with Andreas.&lt;/p&gt;</comment>
                            <comment id="210634" author="cliffw" created="Mon, 9 Oct 2017 18:10:25 +0000"  >&lt;p&gt;With this patch, multiple errors when attempting client mounts. Servers mount.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[ 4377.206482] Lustre: 10866:0:(client.c:2113:ptlrpc_expire_one_request()) @@@ Request sent has timed out &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; slow reply: [sent 1507572416/real 1507572416]  req@ffff880825ec0300 x1580800666501216/t0(0) o503-&amp;gt;MGC192.168.1.108@o2ib@192.168.1.108@o2ib:26/25 lens 272/8416 e 0 to 1 dl 1507572460 ref 2 fl Rpc:X/0/ffffffff rc 0/-1
[ 4377.238308] LustreError: 166-1: MGC192.168.1.108@o2ib: Connection to MGS (at 192.168.1.108@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will fail
[ 4377.254213] LustreError: 15c-8: MGC192.168.1.108@o2ib: The configuration from log &lt;span class=&quot;code-quote&quot;&gt;&apos;soaked-client&apos;&lt;/span&gt; failed (-5). This may be the result of communication errors between &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; node and the MGS, a bad configuration, or other errors. See the syslog &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; more information.
[ 4377.254668] Lustre: MGC192.168.1.108@o2ib: Connection restored to MGC192.168.1.108@o2ib_0 (at 192.168.1.108@o2ib)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="210641" author="cliffw" created="Mon, 9 Oct 2017 18:42:19 +0000"  >&lt;p&gt;Clients do not mount with this patch. I set debug to -1 and dumped the lustre-log from the MDS and the client after the failure; both logs are attached.&lt;/p&gt;</comment>
                            <comment id="210871" author="gerrit" created="Wed, 11 Oct 2017 19:52:49 +0000"  >&lt;p&gt;John L. Hammond (john.hammond@intel.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/29238/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/29238/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9983&quot; title=&quot;LBUG llog_osd.c:327:llog_osd_declare_write_rec() - all DNE MDS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9983&quot;&gt;&lt;del&gt;LU-9983&lt;/del&gt;&lt;/a&gt; llog: skip non-exist log cancellation&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_10&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 858f0ee15eb0ba2eca525297d510213ba6c5ec5e&lt;/p&gt;</comment>
                            <comment id="211147" author="gerrit" created="Mon, 16 Oct 2017 03:23:08 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/29290/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/29290/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9983&quot; title=&quot;LBUG llog_osd.c:327:llog_osd_declare_write_rec() - all DNE MDS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9983&quot;&gt;&lt;del&gt;LU-9983&lt;/del&gt;&lt;/a&gt; ko2iblnd: allow for discontiguous fragments&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 7954a520428002de8629916d4ae7b6660818dc6c&lt;/p&gt;</comment>
                            <comment id="213001" author="hornc" created="Tue, 7 Nov 2017 18:18:00 +0000"  >&lt;p&gt;Cray is seeing this issue in our 2.10 testing. I read through the comments in this ticket but I did not see a clear description of the root cause of this issue. Could someone provide a description? I see a ko2iblnd change was made in association with this patch, and I&apos;m concerned that gnilnd may be vulnerable as well.&lt;/p&gt;</comment>
                            <comment id="213362" author="jhammond" created="Fri, 10 Nov 2017 22:29:00 +0000"  >&lt;p&gt;Chris, the root cause is the bug addressed by &lt;a href=&quot;https://review.whamcloud.com/29290/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/29290&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;See the commit message for an explanation. This in turn triggers a second bug which is addressed by &lt;a href=&quot;https://review.whamcloud.com/29238/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/29238/&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;You can also see the failed assertion in some OOM cases on an MDS.&lt;/p&gt;</comment>
                            <comment id="213364" author="hornc" created="Fri, 10 Nov 2017 22:34:28 +0000"  >&lt;p&gt;Got it. Thanks, John.&lt;/p&gt;</comment>
                            <comment id="214388" author="gerrit" created="Wed, 22 Nov 2017 03:54:37 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/29218/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/29218/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9983&quot; title=&quot;LBUG llog_osd.c:327:llog_osd_declare_write_rec() - all DNE MDS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9983&quot;&gt;&lt;del&gt;LU-9983&lt;/del&gt;&lt;/a&gt; llog: skip non-exist log cancellation&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 16d7737894c0a8a111f64f04d7c6bdf0ddc86167&lt;/p&gt;</comment>
                            <comment id="216495" author="gerrit" created="Sun, 17 Dec 2017 06:18:02 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/29270/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/29270/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9983&quot; title=&quot;LBUG llog_osd.c:327:llog_osd_declare_write_rec() - all DNE MDS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9983&quot;&gt;&lt;del&gt;LU-9983&lt;/del&gt;&lt;/a&gt; osp: align the OSP request size by 4k&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 1f50b1e494ff1b4988508c6d6398ee6769467931&lt;/p&gt;</comment>
                            <comment id="216534" author="pjones" created="Sun, 17 Dec 2017 15:56:19 +0000"  >&lt;p&gt;It looks like all the parts of this have finally landed on master.&lt;/p&gt;</comment>
                            <comment id="216621" author="gerrit" created="Mon, 18 Dec 2017 17:51:53 +0000"  >&lt;p&gt;Minh Diep (minh.diep@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/30580&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/30580&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9983&quot; title=&quot;LBUG llog_osd.c:327:llog_osd_declare_write_rec() - all DNE MDS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9983&quot;&gt;&lt;del&gt;LU-9983&lt;/del&gt;&lt;/a&gt; ko2iblnd: allow for discontiguous fragments&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_10&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 0d10af5408ef98801920c04daf86d1d21c8d6dda&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="46078">LU-9500</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="43055">LU-9026</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="47572">LU-9810</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="48627">LU-10089</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="48582">LU-10068</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="28424" name="MDS.lustre.log.txt.gz" size="13450472" author="cliffw" created="Mon, 9 Oct 2017 18:42:43 +0000"/>
                            <attachment id="28349" name="client.log" size="82023" author="di.wang" created="Mon, 25 Sep 2017 17:07:12 +0000"/>
                            <attachment id="28423" name="client.lustre.log.txt.gz" size="988215" author="cliffw" created="Mon, 9 Oct 2017 18:42:32 +0000"/>
                            <attachment id="28348" name="server.log" size="466051" author="di.wang" created="Mon, 25 Sep 2017 17:07:12 +0000"/>
                            <attachment id="28352" name="soak-8.lustre.log.gz" size="168766" author="cliffw" created="Mon, 25 Sep 2017 18:14:40 +0000"/>
                            <attachment id="28351" name="vmcore-dmesg.txt" size="142306" author="cliffw" created="Mon, 25 Sep 2017 17:38:54 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzzk4f:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>