<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:24:44 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary, append 'field=key&field=summary' to the URL of your request.
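For instance, a request such as https://jira.whamcloud.com/si/jira.issueviews:issue-xml/LU-9274/LU-9274.xml?field=key&field=summary
would return only those two fields (the exact export path can vary with the JIRA configuration).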
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-9274] LBUG: (recover.c:157:ptlrpc_replay_next()) ASSERTION( !list_empty(&amp;req-&gt;rq_cli.cr_unreplied_list) ) failed:</title>
                <link>https://jira.whamcloud.com/browse/LU-9274</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Soak is running, performing a successful OSS failover; partitions are being recovered:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;2017-03-30 00:47:10,761:fsmgmt.fsmgmt:INFO     Next recovery check in 15s...
2017-03-30 00:47:32,586:fsmgmt.fsmgmt:DEBUG    Recovery Result Record: {&apos;soak-4&apos;: {&apos;soaked-OST000f&apos;: &apos;RECOVERING&apos;, &apos;soaked-OST000e&apos;: &apos;INACTIVE&apos;, &apos;soaked-OST0008&apos;: &apos;INACTIVE&apos;, &apos;soaked-OST0009&apos;: &apos;RECOVERING&apos;, &apos;soaked-OST0002&apos;: &apos;INACTIVE&apos;, &apos;soaked-OST0003&apos;: &apos;RECOVERING&apos;, &apos;soaked-OST0015&apos;: &apos;COMPLETE&apos;, &apos;soaked-OST0014&apos;: &apos;INACTIVE&apos;}}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;A single client hit an LBUG after recovering some partitions:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Mar 30 00:47:33 soak-36 kernel: Lustre: soaked-OST0009-osc-ffff88085b72c000: Connection restored to 192.168.1.104@o2ib10 (at 192.168.1.104@o2ib10)
Mar 30 00:48:31 soak-36 kernel: LustreError: 4753:0:(recover.c:157:ptlrpc_replay_next()) ASSERTION( !list_empty(&amp;amp;req-&amp;gt;rq_cli.cr_unreplied_list) ) failed:
Mar 30 00:48:31 soak-36 kernel: LustreError: 4753:0:(recover.c:157:ptlrpc_replay_next()) LBUG
Mar 30 00:48:31 soak-36 kernel: Pid: 4753, comm: ptlrpcd_rcv
Mar 30 00:48:31 soak-36 kernel: #012Call Trace:
Mar 30 00:48:31 soak-36 kernel: [&amp;lt;ffffffffa092c7f3&amp;gt;] libcfs_debug_dumpstack+0x53/0x80 [libcfs]
Mar 30 00:48:31 soak-36 kernel: [&amp;lt;ffffffffa092c861&amp;gt;] lbug_with_loc+0x41/0xb0 [libcfs]
Mar 30 00:48:31 soak-36 kernel: [&amp;lt;ffffffffa0d31c87&amp;gt;] ptlrpc_replay_next+0x447/0x450 [ptlrpc]
Mar 30 00:48:31 soak-36 kernel: [&amp;lt;ffffffffa0d55682&amp;gt;] ptlrpc_import_recovery_state_machine+0x1d2/0xbc0 [ptlrpc]
Mar 30 00:48:31 soak-36 kernel: [&amp;lt;ffffffffa0d2a2ff&amp;gt;] ptlrpc_replay_interpret+0x17f/0x7d0 [ptlrpc]
Mar 30 00:48:31 soak-36 kernel: [&amp;lt;ffffffffa0d2e0b5&amp;gt;] ptlrpc_check_set.part.23+0x425/0x1dd0 [ptlrpc]
Mar 30 00:48:31 soak-36 kernel: [&amp;lt;ffffffffa0d2fabb&amp;gt;] ptlrpc_check_set+0x5b/0xe0 [ptlrpc]
Mar 30 00:48:31 soak-36 kernel: [&amp;lt;ffffffffa0d5bb8b&amp;gt;] ptlrpcd_check+0x4db/0x5d0 [ptlrpc]
Mar 30 00:48:31 soak-36 kernel: [&amp;lt;ffffffffa0d5bf3b&amp;gt;] ptlrpcd+0x2bb/0x560 [ptlrpc]
Mar 30 00:48:31 soak-36 kernel: [&amp;lt;ffffffff810c4fd0&amp;gt;] ? default_wake_function+0x0/0x20
Mar 30 00:48:31 soak-36 kernel: [&amp;lt;ffffffffa0d5bc80&amp;gt;] ? ptlrpcd+0x0/0x560 [ptlrpc]
Mar 30 00:48:32 soak-36 kernel: [&amp;lt;ffffffff810b064f&amp;gt;] kthread+0xcf/0xe0
Mar 30 00:48:32 soak-36 kernel: [&amp;lt;ffffffff810b0580&amp;gt;] ? kthread+0x0/0xe0
Mar 30 00:48:32 soak-36 kernel: [&amp;lt;ffffffff81696958&amp;gt;] ret_from_fork+0x58/0x90
Mar 30 00:48:32 soak-36 kernel: [&amp;lt;ffffffff810b0580&amp;gt;] ? kthread+0x0/0xe0
Mar 30 00:48:32 soak-36 kernel:
Mar 30 00:48:32 soak-36 kernel: Kernel panic - not syncing: LBUG
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;vmcore-dmesg.txt attached. A full crash dump is available on soak-36.&lt;/p&gt;</description>
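<!--
For context, a paraphrased C sketch of the code path where this assertion fires, based on the
assertion text and the function names in the trace (not verbatim Lustre source):
ptlrpc_replay_next() walks the import's committed-request list from a saved cursor and expects
every replay candidate to still be linked on the unreplied list:

    struct ptlrpc_request *req;

    spin_lock(&imp->imp_lock);
    list_for_each_entry(req, &imp->imp_committed_list, rq_replay_list) {
            /* recover.c:157: the candidate chosen for replay must still be
             * linked on the unreplied list, otherwise this LASSERT trips,
             * the client LBUGs, and the node panics. */
            LASSERT(!list_empty(&req->rq_cli.cr_unreplied_list));
    }
    spin_unlock(&imp->imp_lock);
-->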
                <environment>Soak cluster version=lustre: 2.9.52_73_gb5c4f03</environment>
        <key id="45140">LU-9274</key>
            <summary>LBUG: (recover.c:157:ptlrpc_replay_next()) ASSERTION( !list_empty(&amp;req-&gt;rq_cli.cr_unreplied_list) ) failed:</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="niu">Niu Yawei</assignee>
                                    <reporter username="cliffw">Cliff White</reporter>
                        <labels>
                            <label>soak</label>
                    </labels>
                <created>Thu, 30 Mar 2017 15:10:34 +0000</created>
                <updated>Mon, 3 Sep 2018 08:01:00 +0000</updated>
                            <resolved>Tue, 18 Jul 2017 13:39:35 +0000</resolved>
                                    <version>Lustre 2.10.0</version>
                                    <fixVersion>Lustre 2.10.0</fixVersion>
                    <fixVersion>Lustre 2.11.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>9</watches>
                                                                            <comments>
                            <comment id="190161" author="jgmitter" created="Thu, 30 Mar 2017 17:21:38 +0000"  >&lt;p&gt;Hi Niu,&lt;/p&gt;

&lt;p&gt;Would you be able to look into this ptlrpc issue?  It looks like you had added this assertion.&lt;/p&gt;

&lt;p&gt;Thanks.&lt;br/&gt;
Joe&lt;/p&gt;</comment>
                            <comment id="199884" author="cliffw" created="Wed, 21 Jun 2017 20:20:14 +0000"  >&lt;p&gt;Hit this again on the latest tip of master &lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Jun 21 20:12:45 soak-23 kernel: LustreError: 2921:0:(recover.c:157:ptlrpc_replay_next()) ASSERTION( !list_empty(&amp;amp;req-&amp;gt;rq_cli.cr_unreplied_list) ) failed:
Jun 21 20:12:45 soak-23 kernel: LustreError: 2921:0:(recover.c:157:ptlrpc_replay_next()) LBUG
Jun 21 20:12:45 soak-23 kernel: Pid: 2921, comm: ptlrpcd_rcv
Jun 21 20:12:45 soak-23 kernel: #012Call Trace:
Jun 21 20:12:45 soak-23 kernel: [&amp;lt;ffffffffa08057ee&amp;gt;] libcfs_call_trace+0x4e/0x60 [libcfs]
Jun 21 20:12:45 soak-23 kernel: [&amp;lt;ffffffffa080587c&amp;gt;] lbug_with_loc+0x4c/0xb0 [libcfs]
Jun 21 20:12:45 soak-23 kernel: [&amp;lt;ffffffffa0b376b7&amp;gt;] ptlrpc_replay_next+0x447/0x450 [ptlrpc]
Jun 21 20:12:45 soak-23 kernel: [&amp;lt;ffffffffa0b5ac92&amp;gt;] ptlrpc_import_recovery_state_machine+0x1d2/0xbc0 [ptlrpc]
Jun 21 20:12:45 soak-23 kernel: [&amp;lt;ffffffffa0b2fdbf&amp;gt;] ptlrpc_replay_interpret+0x17f/0x7d0 [ptlrpc]
Jun 21 20:12:45 soak-23 kernel: [&amp;lt;ffffffffa0b33b1c&amp;gt;] ptlrpc_check_set.part.23+0x42c/0x1da0 [ptlrpc]
Jun 21 20:12:45 soak-23 kernel: [&amp;lt;ffffffffa0b354eb&amp;gt;] ptlrpc_check_set+0x5b/0xe0 [ptlrpc]
Jun 21 20:12:45 soak-23 kernel: [&amp;lt;ffffffffa0b6116b&amp;gt;] ptlrpcd_check+0x4db/0x5d0 [ptlrpc]
Jun 21 20:12:45 soak-23 kernel: [&amp;lt;ffffffffa0b6151b&amp;gt;] ptlrpcd+0x2bb/0x560 [ptlrpc]
Jun 21 20:12:45 soak-23 kernel: [&amp;lt;ffffffff810c5080&amp;gt;] ? default_wake_function+0x0/0x20
Jun 21 20:12:45 soak-23 kernel: [&amp;lt;ffffffffa0b61260&amp;gt;] ? ptlrpcd+0x0/0x560 [ptlrpc]
Jun 21 20:12:45 soak-23 kernel: [&amp;lt;ffffffff810b06ff&amp;gt;] kthread+0xcf/0xe0
Jun 21 20:12:45 soak-23 kernel: [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread+0x0/0xe0
Jun 21 20:12:45 soak-23 kernel: [&amp;lt;ffffffff81696a58&amp;gt;] ret_from_fork+0x58/0x90
Jun 21 20:12:45 soak-23 kernel: [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread+0x0/0xe0
Jun 21 20:12:45 soak-23 kernel:
Jun 21 20:12:45 soak-23 kernel: Kernel panic - not syncing: LBUG
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="199885" author="cliffw" created="Wed, 21 Jun 2017 20:22:01 +0000"  >&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[88503.634729] LNet: 2911:0:(o2iblnd_cb.c:3207:kiblnd_check_conns()) Timed out tx for 192.168.1.102@o2ib: 11 seconds
[88503.647523] LNet: 2911:0:(o2iblnd_cb.c:3207:kiblnd_check_conns()) Skipped 1 previous similar message
[88529.635079] LNet: 2911:0:(o2iblnd_cb.c:3207:kiblnd_check_conns()) Timed out tx for 192.168.1.102@o2ib: 12 seconds
[88567.330684] Lustre: soaked-OST0000-osc-ffff88101b206800: Connection restored to 192.168.1.102@o2ib (at 192.168.1.102@o2ib)
[88578.129087] LustreError: 2921:0:(recover.c:157:ptlrpc_replay_next()) ASSERTION( !list_empty(&amp;amp;req-&amp;gt;rq_cli.cr_unreplied_list) ) failed:
[88578.146490] LustreError: 2921:0:(recover.c:157:ptlrpc_replay_next()) LBUG
[88578.156002] Pid: 2921, comm: ptlrpcd_rcv
[88578.162345]
Call Trace:
[88578.170516]  [&amp;lt;ffffffffa08057ee&amp;gt;] libcfs_call_trace+0x4e/0x60 [libcfs]
[88578.179661]  [&amp;lt;ffffffffa080587c&amp;gt;] lbug_with_loc+0x4c/0xb0 [libcfs]
[88578.188506]  [&amp;lt;ffffffffa0b376b7&amp;gt;] ptlrpc_replay_next+0x447/0x450 [ptlrpc]
[88578.197244]  [&amp;lt;ffffffffa0b5ac92&amp;gt;] ptlrpc_import_recovery_state_machine+0x1d2/0xbc0 [ptlrpc]
[88578.207713]  [&amp;lt;ffffffffa0b2fdbf&amp;gt;] ptlrpc_replay_interpret+0x17f/0x7d0 [ptlrpc]
[88578.216902]  [&amp;lt;ffffffffa0b33b1c&amp;gt;] ptlrpc_check_set.part.23+0x42c/0x1da0 [ptlrpc]
[88578.226271]  [&amp;lt;ffffffffa0b354eb&amp;gt;] ptlrpc_check_set+0x5b/0xe0 [ptlrpc]
[88578.234573]  [&amp;lt;ffffffffa0b6116b&amp;gt;] ptlrpcd_check+0x4db/0x5d0 [ptlrpc]
[88578.242748]  [&amp;lt;ffffffffa0b6151b&amp;gt;] ptlrpcd+0x2bb/0x560 [ptlrpc]
[88578.250303]  [&amp;lt;ffffffff810c5080&amp;gt;] ? default_wake_function+0x0/0x20
[88578.258243]  [&amp;lt;ffffffffa0b61260&amp;gt;] ? ptlrpcd+0x0/0x560 [ptlrpc]
[88578.265777]  [&amp;lt;ffffffff810b06ff&amp;gt;] kthread+0xcf/0xe0
[88578.272224]  [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread+0x0/0xe0
[88578.278746]  [&amp;lt;ffffffff81696a58&amp;gt;] ret_from_fork+0x58/0x90
[88578.285737]  [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread+0x0/0xe0
[88578.292212]
[88578.294807] Kernel panic - not syncing: LBUG
[88578.300515] CPU: 22 PID: 2921 Comm: ptlrpcd_rcv Tainted: G           OE  ------------   3.10.0-514.10.2.el7.x86_64 #1
[88578.313346] Hardware name: Intel Corporation S2600GZ ........../S2600GZ, BIOS SE5C600.86B.01.08.0003.022620131521 02/26/2013
[88578.326859]  ffffffffa0823e8b 0000000059ebae47 ffff880805e53b30 ffffffff816864ef
[88578.336126]  ffff880805e53bb0 ffffffff8167f8f6 ffffffff00000008 ffff880805e53bc0
[88578.345376]  ffff880805e53b60 0000000059ebae47 0000000059ebae47 ffff88081e38f838
[88578.354607] Call Trace:
[88578.358250]  [&amp;lt;ffffffff816864ef&amp;gt;] dump_stack+0x19/0x1b
[88578.364899]  [&amp;lt;ffffffff8167f8f6&amp;gt;] panic+0xe3/0x1f2
[88578.371145]  [&amp;lt;ffffffffa0805894&amp;gt;] lbug_with_loc+0x64/0xb0 [libcfs]
[88578.378944]  [&amp;lt;ffffffffa0b376b7&amp;gt;] ptlrpc_replay_next+0x447/0x450 [ptlrpc]
[88578.387403]  [&amp;lt;ffffffffa0b5ac92&amp;gt;] ptlrpc_import_recovery_state_machine+0x1d2/0xbc0 [ptlrpc]
[88578.397595]  [&amp;lt;ffffffffa0b2fdbf&amp;gt;] ptlrpc_replay_interpret+0x17f/0x7d0 [ptlrpc]
[88578.406511]  [&amp;lt;ffffffffa0b33b1c&amp;gt;] ptlrpc_check_set.part.23+0x42c/0x1da0 [ptlrpc]
[88578.415621]  [&amp;lt;ffffffffa0b354eb&amp;gt;] ptlrpc_check_set+0x5b/0xe0 [ptlrpc]
[88578.423637]  [&amp;lt;ffffffffa0b6116b&amp;gt;] ptlrpcd_check+0x4db/0x5d0 [ptlrpc]
[88578.431535]  [&amp;lt;ffffffffa0b6151b&amp;gt;] ptlrpcd+0x2bb/0x560 [ptlrpc]
[88578.438818]  [&amp;lt;ffffffff810c5080&amp;gt;] ? wake_up_state+0x20/0x20
[88578.445879]  [&amp;lt;ffffffffa0b61260&amp;gt;] ? ptlrpcd_check+0x5d0/0x5d0 [ptlrpc]
[88578.453914]  [&amp;lt;ffffffff810b06ff&amp;gt;] kthread+0xcf/0xe0
[88578.460085]  [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread_create_on_node+0x140/0x140
[88578.468086]  [&amp;lt;ffffffff81696a58&amp;gt;] ret_from_fork+0x58/0x90
[88578.474806]  [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread_create_on_node+0x140/0x140
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;crash dump is available on soak-23&lt;/p&gt;</comment>
                            <comment id="199928" author="niu" created="Thu, 22 Jun 2017 07:03:23 +0000"  >&lt;p&gt;I&apos;ll make a debug patch to collect more information about the request.&lt;/p&gt;</comment>
                            <comment id="199929" author="gerrit" created="Thu, 22 Jun 2017 07:07:37 +0000"  >&lt;p&gt;Niu Yawei (yawei.niu@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/27776&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/27776&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9274&quot; title=&quot;LBUG: (recover.c:157:ptlrpc_replay_next()) ASSERTION( !list_empty(&amp;amp;req-&amp;gt;rq_cli.cr_unreplied_list) ) failed:&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9274&quot;&gt;&lt;del&gt;LU-9274&lt;/del&gt;&lt;/a&gt; ptlrpc: more debug info in ptlrpc_replay_next&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: ad81008de7a866ebf96676fae03ddc213ac9a020&lt;/p&gt;</comment>
                            <comment id="200070" author="cliffw" created="Fri, 23 Jun 2017 14:35:32 +0000"  >&lt;p&gt;Peter requested a run of master. Hit this again, will see about the debug patch. System log attached&lt;/p&gt;</comment>
                            <comment id="200071" author="cliffw" created="Fri, 23 Jun 2017 14:36:05 +0000"  >&lt;p&gt;Crash dump is available on soak-16&lt;/p&gt;</comment>
                            <comment id="200150" author="niu" created="Mon, 26 Jun 2017 00:38:13 +0000"  >&lt;p&gt;Cliff, if it&apos;s easy to be reproduced, could you apply the debug patch and run again? Thanks.&lt;/p&gt;</comment>
                            <comment id="200223" author="cliffw" created="Mon, 26 Jun 2017 15:05:46 +0000"  >&lt;p&gt;Ran the patch. &lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Jun 24 01:36:21 soak-11 kernel: LustreError: 4559:0:(service.c:2228:ptlrpc_handle_rs()) ASSERTION( lock != ((void *)0) ) failed:
Jun 24 01:36:21 soak-11 kernel: LustreError: 4559:0:(service.c:2228:ptlrpc_handle_rs()) LBUG
Jun 24 01:36:21 soak-11 kernel: Pid: 4559, comm: ptlrpc_hr00_001
Jun 24 01:36:21 soak-11 kernel: #012Call Trace:
Jun 24 01:36:21 soak-11 kernel: [&amp;lt;ffffffffa0bbd7ee&amp;gt;] libcfs_call_trace+0x4e/0x60 [libcfs]
Jun 24 01:36:21 soak-11 kernel: [&amp;lt;ffffffffa0bbd87c&amp;gt;] lbug_with_loc+0x4c/0xb0 [libcfs]  
Jun 24 01:36:21 soak-11 kernel: [&amp;lt;ffffffffa0f3282d&amp;gt;] ptlrpc_hr_main+0x83d/0x8f0 [ptlrpc]
Jun 24 01:36:21 soak-11 kernel: [&amp;lt;ffffffff810c8395&amp;gt;] ? sched_clock_cpu+0x85/0xc0
Jun 24 01:36:21 soak-11 kernel: [&amp;lt;ffffffff810c54e0&amp;gt;] ? default_wake_function+0x0/0x20  
Jun 24 01:36:21 soak-11 kernel: [&amp;lt;ffffffffa0f31ff0&amp;gt;] ? ptlrpc_hr_main+0x0/0x8f0 [ptlrpc]
Jun 24 01:36:21 soak-11 kernel: [&amp;lt;ffffffff810b0a4f&amp;gt;] kthread+0xcf/0xe0
Jun 24 01:36:21 soak-11 kernel: [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread+0x0/0xe0
Jun 24 01:36:21 soak-11 kernel: [&amp;lt;ffffffff81697798&amp;gt;] ret_from_fork+0x58/0x90
Jun 24 01:36:21 soak-11 kernel: [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread+0x0/0xe0
Jun 24 01:36:21 soak-11 kernel:
Jun 24 01:36:21 soak-11 kernel: Kernel panic - not syncing: LBUG
Jun 24 01:36:22 soak-11 kernel: CPU: 1 PID: 4559 Comm: ptlrpc_hr00_001 Tainted: P           OE  ------------   3.10.0-514.21.1.el7_lustre.x86_64 #1
Jun 24 01:36:22 soak-11 kernel: Hardware name: Intel Corporation S2600GZ ........../S2600GZ, BIOS SE5C600.86B.01.08.0003.022620131521 02/26/2013
Jun 24 01:36:22 soak-11 kernel: ffffffffa0bdbe8b 00000000de1e9805 ffff88081bdd7d30 ffffffff8168717f
Jun 24 01:36:22 soak-11 kernel: ffff88081bdd7db0 ffffffff816805aa ffffffff00000008 ffff88081bdd7dc0
Jun 24 01:36:22 soak-11 kernel: ffff88081bdd7d60 00000000de1e9805 00000000de1e9805 ffff88042e04f838
Jun 24 01:36:22 soak-11 kernel: Call Trace:
Jun 24 01:36:22 soak-11 kernel: [&amp;lt;ffffffff8168717f&amp;gt;] dump_stack+0x19/0x1b
Jun 24 01:36:22 soak-11 kernel: [&amp;lt;ffffffff816805aa&amp;gt;] panic+0xe3/0x1f2
Jun 24 01:36:22 soak-11 kernel: [&amp;lt;ffffffffa0bbd894&amp;gt;] lbug_with_loc+0x64/0xb0 [libcfs]  
Jun 24 01:36:22 soak-11 kernel: [&amp;lt;ffffffffa0f3282d&amp;gt;] ptlrpc_hr_main+0x83d/0x8f0 [ptlrpc]
Jun 24 01:36:22 soak-11 kernel: [&amp;lt;ffffffff810c8395&amp;gt;] ? sched_clock_cpu+0x85/0xc0
Jun 24 01:36:22 soak-11 kernel: [&amp;lt;ffffffff810c54e0&amp;gt;] ? wake_up_state+0x20/0x20 
Jun 24 01:36:22 soak-11 kernel: [&amp;lt;ffffffffa0f31ff0&amp;gt;] ? ptlrpc_svcpt_stop_threads+0x590/0x590 [ptlrpc]
Jun 24 01:36:22 soak-11 kernel: [&amp;lt;ffffffff810b0a4f&amp;gt;] kthread+0xcf/0xe0
Jun 24 01:36:22 soak-11 kernel: [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread_create_on_node+0x140/0x140
Jun 24 01:36:22 soak-11 kernel: [&amp;lt;ffffffff81697798&amp;gt;] ret_from_fork+0x58/0x90
Jun 24 01:36:22 soak-11 kernel: [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread_create_on_node+0x140/0x140
Jun 24 01:36:22 soak-11 kernel: ------------[ cut here ]------------
Jun 24 01:36:22 soak-11 kernel: WARNING: at arch/x86/kernel/smp.c:125 native_smp_send_reschedule+0x5f/0x70()
Jun 24 01:36:22 soak-11 kernel: Modules linked in: osp(OE) mdd(OE) lod(OE) mdt(OE) lfsck(OE) mgc(OE) osd_ldiskfs(OE) ldiskfs(OE) lquota(OE) fid(OE) fld(OE) ko2iblnd(OE) ptlrpc(OE) obdclass(OE) lnet(OE) libcfs(OE) zfs(POE) zunicode(POE) zavl(POE) zcommon(POE) znvpair(POE) spl(OE) zlib_deflate 8021q garp mrp stp llc rpcrdma ib_isert iscsi_target_mod ib_iser libiscsi scsi_transport_iscsi ib_srpt target_core_mod ib_srp scsi_transport_srp scsi_tgt ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm mlx4_ib ib_core intel_powerclamp coretemp intel_rapl iosf_mbi kvm dm_round_robin irqbypass crc32_pclmul ghash_clmulni_intel ntb aesni_intel sg lrw gf128mul glue_helper ablk_helper cryptd sb_edac shpchp iTCO_wdt ipmi_devintf ipmi_ssif iTCO_vendor_support wmi ipmi_si ioatdma edac_core i2c_i801 lpc_ich pcspkr mei_me ipmi_msghandler
Jun 24 01:36:22 soak-11 kernel: mei dm_multipath dm_mod nfsd nfs_acl lockd grace auth_rpcgss sunrpc ip_tables ext4 mbcache jbd2 mlx4_en sd_mod crc_t10dif crct10dif_generic mgag200 drm_kms_helper syscopyarea igb sysfillrect sysimgblt isci fb_sys_fops ahci crct10dif_pclmul ttm crct10dif_common libsas ptp crc32c_intel libahci pps_core drm mlx4_core mpt3sas libata dca raid_class i2c_algo_bit scsi_transport_sas devlink i2c_core fjes
Jun 24 01:36:22 soak-11 kernel: CPU: 1 PID: 13 Comm: migration/1 Tainted: P           OE  ------------   3.10.0-514.21.1.el7_lustre.x86_64 #1
Jun 24 01:36:22 soak-11 kernel: Hardware name: Intel Corporation S2600GZ ........../S2600GZ, BIOS SE5C600.86B.01.08.0003.022620131521 02/26/2013
Jun 24 01:36:22 soak-11 kernel: 0000000000000000 00000000073b2e60 ffff880179ccbc70 ffffffff8168717f
[10535.721248] WARNING: at arch/x86/kernel/smp.c:125 native_smp_send_reschedule+0x5f/0x70()^M
[10535.731298] Modules linked in: osp(OE) mdd(OE) lod(OE) mdt(OE) lfsck(OE) mgc(OE) osd_ldiskfs(OE) ldiskfs(OE) lquota(OE) fid(OE) fld(OE) ko2iblnd(OE) ptlrpc(OE) obdclass(OE) lnet(OE) libcfs(OE) zfs(POE) zunicode(POE) zavl(POE) zcommon(POE) znvpair(POE) spl(OE) zlib_deflate 8021q garp mrp stp llc rpcrdma ib_isert iscsi_target_mod ib_iser libiscsi scsi_transport_iscsi ib_srpt target_core_mod ib_srp scsi_transport_srp scsi_tgt ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm mlx4_ib ib_core intel_powerclamp coretemp intel_rapl iosf_mbi kvm dm_round_robin irqbypass crc32_pclmul ghash_clmulni_intel ntb aesni_intel sg lrw gf128mul glue_helper ablk_helper cryptd sb_edac shpchp iTCO_wdt ipmi_devintf ipmi_ssif iTCO_vendor_support wmi ipmi_si ioatdma edac_core i2c_i801 lpc_ich pcspkr mei_me ipmi_msghandler mei dm_multipath dm_mod nfsd nfs_acl lockd grace auth_rpcgss sunrpc ip_tables ext4 mbcache jbd2 mlx4_en sd_mod crc_t10dif crct10dif_generic mgag200 drm_kms_helper syscopyarea igb sysfillrect sysimgblt isci fb_sys_fops ahci crct10dif_pclmul ttm crct10dif_common libsas ptp crc32c_intel libahci pps_core drm mlx4_core mpt3sas libata dca raid_class i2c_algo_bit scsi_transport_sas devlink i2c_core fjes^M
[10535.863515] CPU: 1 PID: 13 Comm: migration/1 Tainted: P           OE  ------------   3.10.0-514.21.1.el7_lustre.x86_64 #1^M
[10535.876807] Hardware name: Intel Corporation S2600GZ ........../S2600GZ, BIOS SE5C600.86B.01.08.0003.022620131521 02/26/2013^M
[10535.890410]  0000000000000000 00000000073b2e60 ffff880179ccbc70 ffffffff8168717f^M
[10535.899786]  ffff880179ccbca8 ffffffff81085cb0 0000000000000011 ffff880179ccbd38^M
[10535.909179]  ffff88042e256c40 ffff88017a20edd0 ffff88081be6de20 ffff880179ccbcb8^M
[10535.918571] Call Trace:^M
[10535.922389]  [&amp;lt;ffffffff8168717f&amp;gt;] dump_stack+0x19/0x1b^M
[10535.929237]  [&amp;lt;ffffffff81085cb0&amp;gt;] warn_slowpath_common+0x70/0xb0^M
[10535.937041]  [&amp;lt;ffffffff81085dfa&amp;gt;] warn_slowpath_null+0x1a/0x20^M
[10535.944645]  [&amp;lt;ffffffff8104e10f&amp;gt;] native_smp_send_reschedule+0x5f/0x70^M
[10535.953022]  [&amp;lt;ffffffff810c1128&amp;gt;] resched_task+0xb8/0xd0^M
[10535.960003]  [&amp;lt;ffffffff810c1f55&amp;gt;] check_preempt_curr+0x75/0xa0^M
[10535.967565]  [&amp;lt;ffffffff810c9d5b&amp;gt;] move_task+0x4b/0x50^M
[10535.974246]  [&amp;lt;ffffffff810cccbf&amp;gt;] active_load_balance_cpu_stop+0x1ff/0x290^M
[10535.982984]  [&amp;lt;ffffffff810ccac0&amp;gt;] ? can_migrate_task+0x500/0x500^M
[10535.990751]  [&amp;lt;ffffffff81116a56&amp;gt;] cpu_stopper_thread+0x96/0x170^M
[10535.998423]  [&amp;lt;ffffffff8168c240&amp;gt;] ? __schedule+0x3b0/0x990^M
[10536.005608]  [&amp;lt;ffffffff810b95bf&amp;gt;] smpboot_thread_fn+0x12f/0x180^M
[10536.013263]  [&amp;lt;ffffffff810b9490&amp;gt;] ? lg_double_unlock+0x90/0x90^M
[10536.020798]  [&amp;lt;ffffffff810b0a4f&amp;gt;] kthread+0xcf/0xe0^M
[10536.027233]  [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread_create_on_node+0x140/0x140^M
[10536.035501]  [&amp;lt;ffffffff81697798&amp;gt;] ret_from_fork+0x58/0x90^M
[10536.042505]  [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread_create_on_node+0x140/0x140^M
[10536.050768] ---[ end trace 163e77165449f3fa ]---^M
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="200228" author="niu" created="Mon, 26 Jun 2017 15:49:10 +0000"  >&lt;p&gt;This is &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9504&quot; title=&quot;LBUG ptlrpc_handle_rs()) ASSERTION( lock != ((void *)0) ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9504&quot;&gt;&lt;del&gt;LU-9504&lt;/del&gt;&lt;/a&gt;, looks we&apos;d apply the fix of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9504&quot; title=&quot;LBUG ptlrpc_handle_rs()) ASSERTION( lock != ((void *)0) ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9504&quot;&gt;&lt;del&gt;LU-9504&lt;/del&gt;&lt;/a&gt; as well.&lt;/p&gt;</comment>
                            <comment id="200229" author="cliffw" created="Mon, 26 Jun 2017 15:56:32 +0000"  >&lt;p&gt;Can you rebase this on latest master? &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9504&quot; title=&quot;LBUG ptlrpc_handle_rs()) ASSERTION( lock != ((void *)0) ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9504&quot;&gt;&lt;del&gt;LU-9504&lt;/del&gt;&lt;/a&gt; has landed there.&lt;/p&gt;</comment>
                            <comment id="200303" author="niu" created="Tue, 27 Jun 2017 02:22:25 +0000"  >&lt;blockquote&gt;
&lt;p&gt;Can you rebase this on latest master? &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9504&quot; title=&quot;LBUG ptlrpc_handle_rs()) ASSERTION( lock != ((void *)0) ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9504&quot;&gt;&lt;del&gt;LU-9504&lt;/del&gt;&lt;/a&gt; has landed there.&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;Rebase done.&lt;/p&gt;</comment>
                            <comment id="200494" author="cliffw" created="Wed, 28 Jun 2017 14:51:24 +0000"  >&lt;p&gt;System ran for 24 hours, no LBUGS, but soak-29 client had a hard crash with NMI watchdogs. &lt;br/&gt;
Console log attached, crash dump is available on soak-29. &lt;/p&gt;</comment>
                            <comment id="200561" author="niu" created="Thu, 29 Jun 2017 03:39:45 +0000"  >&lt;p&gt;Looks we hit another bug, lots of cpus stuck on net lock.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[50631.252941] CPU: 13 PID: 43744 Comm: simul Tainted: G          IOE  ------------   3.10.0-514.10.2.el7.x86_64 #1^M
[50631.252942] Hardware name: Intel Corporation S2600WTT/S2600WTT, BIOS SE5C610.86B.01.01.0008.021120151325 02/11/2015^M
[50631.252943] task: ffff880856d15e20 ti: ffff880564080000 task.ti: ffff880564080000^M
[50631.252943] RIP: 0010:[&amp;lt;ffffffff8168de5a&amp;gt;]  [&amp;lt;ffffffff8168de5a&amp;gt;] _raw_spin_lock+0x3a/0x50^M
[50631.252948] RSP: 0018:ffff8805640831e8  EFLAGS: 00000206^M
[50631.252949] RAX: 0000000000006893 RBX: ffff880564083fd8 RCX: 0000000000008478^M
[50631.252949] RDX: 00000000000084c6 RSI: 00000000000084c6 RDI: ffff88105e021fc0^M
[50631.252950] RBP: ffff8805640831e8 R08: 0000000000000001 R09: 0001fd60baec016f^M
[50631.252950] R10: ffff88084073c9ad R11: ffff880564083186 R12: ffffffffa09c0200^M
[50631.252951] R13: ffffffffa08386d8 R14: ffff880564083290 R15: ffffffffa09c6c60^M
[50631.252951] FS:  00007f590af73740(0000) GS:ffff88085ef40000(0000) knlGS:0000000000000000^M
[50631.252952] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033^M
[50631.252952] CR2: 00007f2f1b6cb6a8 CR3: 0000000350e07000 CR4: 00000000001407e0^M
[50631.252953] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000^M
[50631.252953] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400^M
[50631.252954] Stack:^M
[50631.252954]  ffff880564083210 ffffffffa08447c8 ffffffffa08448b6 0000000000000001^M
[50631.252955]  ffff88104fb88400 ffff8805640832d0 ffffffffa09869b5 00000add00000000^M
[50631.252956]  0000000007a55a10 ffffffffff0a0004 0000000007a55a10 ffffffffffffffff^M
[50631.252957] Call Trace:^M
[50631.252959]  [&amp;lt;ffffffffa08447c8&amp;gt;] cfs_percpt_lock+0x58/0x110 [libcfs]^M
[50631.252970]  [&amp;lt;ffffffffa08448b6&amp;gt;] ? cfs_percpt_unlock+0x36/0xc0 [libcfs]^M
[50631.252976]  [&amp;lt;ffffffffa09869b5&amp;gt;] lnet_select_pathway+0x235/0x1140 [lnet]^M
[50631.252988]  [&amp;lt;ffffffffa0988fb1&amp;gt;] lnet_send+0x51/0x180 [lnet]^M
[50631.252994]  [&amp;lt;ffffffffa0989325&amp;gt;] LNetPut+0x245/0x7a0 [lnet]^M
[50631.253001]  [&amp;lt;ffffffffa0bc1996&amp;gt;] ptl_send_buf+0x146/0x530 [ptlrpc]^M
[50631.253028]  [&amp;lt;ffffffffa0bc358d&amp;gt;] ptl_send_rpc+0x67d/0xe60 [ptlrpc]^M
[50631.253046]  [&amp;lt;ffffffffa0bb8ed0&amp;gt;] ptlrpc_send_new_req+0x460/0xa60 [ptlrpc]^M
[50631.253064]  [&amp;lt;ffffffffa0bbdaa1&amp;gt;] ptlrpc_set_wait+0x3d1/0x900 [ptlrpc]^M
[50631.253082]  [&amp;lt;ffffffffa09fd1e5&amp;gt;] ? lustre_get_jobid+0x215/0x4d0 [obdclass]^M
[50631.253103]  [&amp;lt;ffffffffa0bc8fe5&amp;gt;] ? lustre_msg_set_jobid+0x95/0x100 [ptlrpc]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Could you open another ticket for this? I think we should ask an LNet expert to take a look. Thanks.&lt;/p&gt;</comment>
                            <comment id="200599" author="cliffw" created="Thu, 29 Jun 2017 15:32:26 +0000"  >&lt;p&gt;I switched to b2.10 build 2 last night, and hit this again. &lt;br/&gt;
Can you rebase to latest b2.10? I am wondering if your debug patch is preventing the error from hitting. I ran &amp;gt;24 hours on the debug patch, no errors, b2.10 hits it immediately.&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[52683.724559] LustreError: 2896:0:(recover.c:157:ptlrpc_replay_next()) ASSERTION( !list_empty(&amp;amp;req-&amp;gt;rq_cli.cr_unreplied_list) ) failed: ^M
[52683.740443] LustreError: 2896:0:(recover.c:157:ptlrpc_replay_next()) LBUG^M
[52683.749161] Pid: 2896, comm: ptlrpcd_rcv^M
[52683.754657] ^M
[52683.754657] Call Trace:^M
[52683.761275]  [&amp;lt;ffffffffa07d77ee&amp;gt;] libcfs_call_trace+0x4e/0x60 [libcfs]^M
[52683.769670]  [&amp;lt;ffffffffa07d787c&amp;gt;] lbug_with_loc+0x4c/0xb0 [libcfs]^M
[52683.777689]  [&amp;lt;ffffffffa0b1b817&amp;gt;] ptlrpc_replay_next+0x447/0x450 [ptlrpc]^M
[52683.786387]  [&amp;lt;ffffffffa0b3ede2&amp;gt;] ptlrpc_import_recovery_state_machine+0x1d2/0xbc0 [ptlrpc]^M
[52683.796819]  [&amp;lt;ffffffffa0b13f1f&amp;gt;] ptlrpc_replay_interpret+0x17f/0x7d0 [ptlrpc]^M
[52683.805973]  [&amp;lt;ffffffffa0b17c7c&amp;gt;] ptlrpc_check_set.part.23+0x42c/0x1da0 [ptlrpc]^M
[52683.815312]  [&amp;lt;ffffffffa0b1964b&amp;gt;] ptlrpc_check_set+0x5b/0xe0 [ptlrpc]^M
[52683.823568]  [&amp;lt;ffffffffa0b452bb&amp;gt;] ptlrpcd_check+0x4db/0x5d0 [ptlrpc]^M
[52683.831704]  [&amp;lt;ffffffffa0b4566b&amp;gt;] ptlrpcd+0x2bb/0x560 [ptlrpc]^M
[52683.839210]  [&amp;lt;ffffffff810c5080&amp;gt;] ? default_wake_function+0x0/0x20^M
[52683.847110]  [&amp;lt;ffffffffa0b453b0&amp;gt;] ? ptlrpcd+0x0/0x560 [ptlrpc]^M
[52683.854587]  [&amp;lt;ffffffff810b06ff&amp;gt;] kthread+0xcf/0xe0^M
[52683.860982]  [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread+0x0/0xe0^M
[52683.867455]  [&amp;lt;ffffffff81696a58&amp;gt;] ret_from_fork+0x58/0x90^M
[52683.874426]  [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread+0x0/0xe0^M
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="200616" author="adilger" created="Thu, 29 Jun 2017 17:40:09 +0000"  >&lt;p&gt;Niu, I just noticed that &lt;tt&gt;ptlrpc_connect_interpret()&lt;/tt&gt; is changing &lt;tt&gt;imp_replay_cursor&lt;/tt&gt; without &lt;tt&gt;imp_lock&lt;/tt&gt; held.  There may be other places that this is modified, and would mean that multiple threads changing this pointer, or &lt;tt&gt;imp_committed_list&lt;/tt&gt; or &lt;tt&gt;rq_replay_list&lt;/tt&gt; could cause the &lt;tt&gt;LASSERT()&lt;/tt&gt; to trip?&lt;/p&gt;</comment>
                            <comment id="200627" author="green" created="Thu, 29 Jun 2017 18:09:00 +0000"  >&lt;p&gt;the debug that seems useful in this case is:&lt;br/&gt;
&quot;ha net info rpctrace&quot;, jsut add them to default debug with something like this on the node that you expect to crash:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;for i in ha net info rpctrace ; do echo +$i &amp;gt;/proc/sys/lnet/debug ; done
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="200663" author="niu" created="Fri, 30 Jun 2017 00:47:21 +0000"  >&lt;blockquote&gt;
&lt;p&gt;Can you rebase to latest b2.10? I am wondering if your debug patch is preventing the error from hitting. I ran &amp;gt;24 hours on the debug patch, no errors, b2.10 hits it immediately.&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;The debug patch only changed the LASSERT to an &apos;if&apos; check; I don&apos;t think it would prevent the error from hitting.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;Niu, I just noticed that ptlrpc_connect_interpret() is changing imp_replay_cursor without imp_lock held. There may be other places that this is modified, and would mean that multiple threads changing this pointer, or imp_committed_list or rq_replay_list could cause the LASSERT() to trip?&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;In ptlrpc_connect_interpret(), no replay request is in flight, so I think it&apos;s OK to change imp_replay_cursor without holding the lock.&lt;/p&gt;

&lt;p&gt;Given that this bug is rarely hit, I think we can just land the debug patch so that we can collect debug information the next time we hit it in soak testing.&lt;/p&gt;</comment>
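<!--
A hypothetical sketch of what "changed the LASSERT to an 'if' check" looks like, consistent with
the recover.c:158/159 log lines seen later in this ticket ("Not on unreplied list!" followed by
LBUG) but not the verbatim patch:

    /* before (recover.c:157) */
    LASSERT(!list_empty(&req->rq_cli.cr_unreplied_list));

    /* after: dump the offending request before crashing */
    if (list_empty(&req->rq_cli.cr_unreplied_list)) {
            DEBUG_REQ(D_ERROR, req, "Not on unreplied list!");
            LBUG();
    }
-->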
                            <comment id="200802" author="cliffw" created="Fri, 30 Jun 2017 19:35:24 +0000"  >&lt;p&gt;The LBUG does not trigger with the debug patch, which may be significant. &lt;br/&gt;
Tried lustre-master build 3606 at Peter&apos;s request, hit the LBUG again:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Jun 30 19:07:41 soak-16 kernel: Lustre: 2972:0:(client.c:2114:ptlrpc_expire_one_request()) @@@ Request sent has timed out for sent delay: [sent 1498849630/real 0]  req@ffff8806cf0b4200 x1571647094223296/t0(0) o8-&amp;gt;soaked-OST0016-osc-ffff88082f4ec800@192.168.1.106@o2ib:28/4 lens 520/544 e 0 to 1 dl 1498849661 ref 2 fl Rpc:XN/0/ffffffff rc 0/-1
Jun 30 19:07:41 soak-16 kernel: Lustre: 2972:0:(client.c:2114:ptlrpc_expire_one_request()) Skipped 7 previous similar messages
Jun 30 19:08:14 soak-16 kernel: LustreError: 2972:0:(recover.c:157:ptlrpc_replay_next()) ASSERTION( !list_empty(&amp;amp;req-&amp;gt;rq_cli.cr_unreplied_list) ) failed:
Jun 30 19:08:14 soak-16 kernel: LustreError: 2972:0:(recover.c:157:ptlrpc_replay_next()) LBUG
Jun 30 19:08:14 soak-16 kernel: Pid: 2972, comm: ptlrpcd_rcv
Jun 30 19:08:14 soak-16 kernel: #012Call Trace:
Jun 30 19:08:14 soak-16 kernel: [&amp;lt;ffffffffa082a7ee&amp;gt;] libcfs_call_trace+0x4e/0x60 [libcfs]
Jun 30 19:08:14 soak-16 kernel: [&amp;lt;ffffffffa082a87c&amp;gt;] lbug_with_loc+0x4c/0xb0 [libcfs]
Jun 30 19:08:14 soak-16 kernel: [&amp;lt;ffffffffa0b5c817&amp;gt;] ptlrpc_replay_next+0x447/0x450 [ptlrpc]
Jun 30 19:08:14 soak-16 kernel: [&amp;lt;ffffffffa0b7fde2&amp;gt;] ptlrpc_import_recovery_state_machine+0x1d2/0xbc0 [ptlrpc]
Jun 30 19:08:14 soak-16 kernel: [&amp;lt;ffffffffa0b54f1f&amp;gt;] ptlrpc_replay_interpret+0x17f/0x7d0 [ptlrpc]
Jun 30 19:08:14 soak-16 kernel: [&amp;lt;ffffffffa0b58c7c&amp;gt;] ptlrpc_check_set.part.23+0x42c/0x1da0 [ptlrpc]
Jun 30 19:08:14 soak-16 kernel: [&amp;lt;ffffffffa0b5a64b&amp;gt;] ptlrpc_check_set+0x5b/0xe0 [ptlrpc]
Jun 30 19:08:14 soak-16 kernel: [&amp;lt;ffffffffa0b862bb&amp;gt;] ptlrpcd_check+0x4db/0x5d0 [ptlrpc]
Jun 30 19:08:14 soak-16 kernel: [&amp;lt;ffffffffa0b8666b&amp;gt;] ptlrpcd+0x2bb/0x560 [ptlrpc]
Jun 30 19:08:14 soak-16 kernel: [&amp;lt;ffffffff810c5080&amp;gt;] ? default_wake_function+0x0/0x20
Jun 30 19:08:14 soak-16 kernel: [&amp;lt;ffffffffa0b863b0&amp;gt;] ? ptlrpcd+0x0/0x560 [ptlrpc]
Jun 30 19:08:14 soak-16 kernel: [&amp;lt;ffffffff810b06ff&amp;gt;] kthread+0xcf/0xe0
Jun 30 19:08:14 soak-16 kernel: [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread+0x0/0xe0
Jun 30 19:08:14 soak-16 kernel: [&amp;lt;ffffffff81696a58&amp;gt;] ret_from_fork+0x58/0x90
Jun 30 19:08:14 soak-16 kernel: [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread+0x0/0xe0
Jun 30 19:08:14 soak-16 kernel:
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="200803" author="cliffw" created="Fri, 30 Jun 2017 19:35:57 +0000"  >&lt;p&gt;Crash dump is available on soak-16&lt;/p&gt;</comment>
                            <comment id="200859" author="cliffw" created="Mon, 3 Jul 2017 14:40:02 +0000"  >&lt;p&gt;Ran with debug=0&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Jul  1 12:01:11 soak-16 kernel: LustreError: 2905:0:(recover.c:158:ptlrpc_replay_next()) @@@ Not on unreplied list!#012  req@ffff88060fac0f00 x1571667100760512/t738743039794(738743039794) o4-&amp;gt;soaked-OST0012-osc-ffff880829008000@192.168.1.102@o2ib:6/4 lens 608/416 e 0 to 0 dl 1498910140 ref 1 fl Complete:R/4/0 rc 0/0
Jul  1 12:01:11 soak-16 kernel: LustreError: 2905:0:(recover.c:159:ptlrpc_replay_next()) LBUG
Jul  1 12:01:11 soak-16 kernel: Pid: 2905, comm: ptlrpcd_rcv
Jul  1 12:01:11 soak-16 kernel: #012Call Trace:
Jul  1 12:01:11 soak-16 kernel: [&amp;lt;ffffffffa08077ee&amp;gt;] libcfs_call_trace+0x4e/0x60 [libcfs]
Jul  1 12:01:11 soak-16 kernel: [&amp;lt;ffffffffa080787c&amp;gt;] lbug_with_loc+0x4c/0xb0 [libcfs]
Jul  1 12:01:11 soak-16 kernel: [&amp;lt;ffffffffa0b4d81d&amp;gt;] ptlrpc_replay_next+0x44d/0x450 [ptlrpc]
Jul  1 12:01:11 soak-16 kernel: [&amp;lt;ffffffffa0b70de2&amp;gt;] ptlrpc_import_recovery_state_machine+0x1d2/0xbc0 [ptlrpc]
Jul  1 12:01:11 soak-16 kernel: [&amp;lt;ffffffffa0b45f1f&amp;gt;] ptlrpc_replay_interpret+0x17f/0x7d0 [ptlrpc]
Jul  1 12:01:11 soak-16 kernel: [&amp;lt;ffffffffa0b49c7c&amp;gt;] ptlrpc_check_set.part.23+0x42c/0x1da0 [ptlrpc]
Jul  1 12:01:11 soak-16 kernel: [&amp;lt;ffffffffa0b4b64b&amp;gt;] ptlrpc_check_set+0x5b/0xe0 [ptlrpc]
Jul  1 12:01:11 soak-16 kernel: [&amp;lt;ffffffffa0b772bb&amp;gt;] ptlrpcd_check+0x4db/0x5d0 [ptlrpc]
Jul  1 12:01:11 soak-16 kernel: [&amp;lt;ffffffffa0b7766b&amp;gt;] ptlrpcd+0x2bb/0x560 [ptlrpc]
Jul  1 12:01:11 soak-16 kernel: [&amp;lt;ffffffff810c5080&amp;gt;] ? default_wake_function+0x0/0x20
Jul  1 12:01:11 soak-16 kernel: [&amp;lt;ffffffffa0b773b0&amp;gt;] ? ptlrpcd+0x0/0x560 [ptlrpc]
Jul  1 12:01:11 soak-16 kernel: [&amp;lt;ffffffff810b06ff&amp;gt;] kthread+0xcf/0xe0
Jul  1 12:01:11 soak-16 kernel: [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread+0x0/0xe0
Jul  1 12:01:11 soak-16 kernel: [&amp;lt;ffffffff81696a58&amp;gt;] ret_from_fork+0x58/0x90
Jul  1 12:01:11 soak-16 kernel: [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread+0x0/0xe0
Jul  1 12:01:11 soak-16 kernel:
Jul  1 12:01:11 soak-16 kernel: LustreError: dumping log to /tmp/lustre-log.1498910471.2905
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="200861" author="cliffw" created="Mon, 3 Jul 2017 14:41:46 +0000"  >&lt;p&gt;lustre-log attached. &lt;/p&gt;</comment>
                            <comment id="200879" author="gerrit" created="Tue, 4 Jul 2017 04:10:18 +0000"  >&lt;p&gt;Niu Yawei (yawei.niu@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/27920&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/27920&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9274&quot; title=&quot;LBUG: (recover.c:157:ptlrpc_replay_next()) ASSERTION( !list_empty(&amp;amp;req-&amp;gt;rq_cli.cr_unreplied_list) ) failed:&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9274&quot;&gt;&lt;del&gt;LU-9274&lt;/del&gt;&lt;/a&gt; ptlrpc: add replay request into unreplied list&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 8b36f176677da1c584410e45f27229fa8c749113&lt;/p&gt;</comment>
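<!--
The patch subject, "add replay request into unreplied list", suggests the shape of the fix: make
sure a request selected for replay is linked back onto the import's unreplied list before
ptlrpc_replay_next() asserts on it. A hypothetical C sketch under that assumption (field names
are taken from the assertion text; the helper name, list name, and placement are illustrative,
not the actual change, which also keeps the list ordered):

    static void replay_req_add_unreplied(struct ptlrpc_request *req)
    {
            struct obd_import *imp = req->rq_import;

            spin_lock(&imp->imp_lock);
            /* Assumption: a replayed request may have been unlinked when its
             * original reply was processed; re-add it so the LASSERT in
             * ptlrpc_replay_next() holds. */
            if (list_empty(&req->rq_cli.cr_unreplied_list))
                    list_add_tail(&req->rq_cli.cr_unreplied_list,
                                  &imp->imp_unreplied_list);
            spin_unlock(&imp->imp_lock);
    }
-->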
                            <comment id="200930" author="adilger" created="Tue, 4 Jul 2017 17:33:55 +0000"  >&lt;p&gt;Cliff, can you please test on soak with this patch.&lt;/p&gt;</comment>
                            <comment id="201045" author="cliffw" created="Wed, 5 Jul 2017 20:10:52 +0000"  >&lt;p&gt;I am not seeing LBUGS, so far, but am seeing a consistent pattern where after a failover, most/all clients are evicted and recovery is aborted. Some sample messages&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Jul  5 15:05:21 soak-8 kernel: Lustre: 4763:0:(ldlm_lib.c:2059:target_recovery_overseer()) recovery is aborted, evict exports in recovery
Jul  5 15:05:21 soak-8 kernel: Lustre: soaked-MDT0000: Recovery over after 2:18, of 37 clients 3 recovered and 34 were evicted.
Jul  5 15:37:02 soak-8 kernel: Lustre: MGS: haven&apos;t heard from client 1b0cd471-e969-acd0-bd39-45b87774c0c3 (at 192.168.1.107@o2ib) in 227 seconds. I think it&apos;s dead, and I am evicting it. exp ffff8804b978ac00, cur 1499269022 expire 1499268872 last 1499268795
Jul  5 16:47:07 soak-8 kernel: Lustre: MGS: haven&apos;t heard from client adfa67c6-0880-1a84-5ecf-172c556a5dd6 (at 192.168.1.111@o2ib) in 227 seconds. I think it&apos;s dead, and I am evicting it. exp ffff8807dd474c00, cur 1499273227 expire 1499273077 last 1499273000
Jul  5 16:47:16 soak-8 kernel: Lustre: soaked-MDT0000: haven&apos;t heard from client soaked-MDT0003-mdtlov_UUID (at 192.168.1.111@o2ib) in 228 seconds. I think it&apos;s dead, and I am evicting it. exp ffff8807dea70400, cur 1499273236 expire 1499273086 last 1499273008
Jul  5 16:50:06 soak-8 kernel: LustreError: 167-0: soaked-MDT0003-osp-MDT0000: This client was evicted by soaked-MDT0003; in progress operations using this service will fail.
Jul  5 17:18:19 soak-8 kernel: Lustre: MGS: haven&apos;t heard from client 8fe08516-dbc0-85e3-6fb4-93c94095a877 (at 192.168.1.106@o2ib) in 227 seconds. I think it&apos;s dead, and I am evicting it. exp ffff8807dd475800, cur 1499275099 expire 1499274949 last 1499274872
Jul  5 18:14:24 soak-8 kernel: Lustre: 4771:0:(ldlm_lib.c:2059:target_recovery_overseer()) recovery is aborted, evict exports in recovery
Jul  5 18:14:24 soak-8 kernel: Lustre: soaked-MDT0000: Denying connection for new client a1e375f2-7cfb-7e06-98b7-7ea8a7f2eca4(at 192.168.1.130@o2ib), waiting for 37 known clients (13 recovered, 0 in progress, and 24 evicted) to recover in 4:02
Jul  5 18:14:24 soak-8 kernel: Lustre: soaked-MDT0000: Recovery over after 11:00, of 37 clients 13 recovered and 24 were evicted.
Jul  5 18:44:53 soak-8 kernel: Lustre: MGS: haven&apos;t heard from client e13056ec-cbfc-e019-c014-c1a0cfd67815 (at 192.168.1.103@o2ib) in 229 seconds. I think it&apos;s dead, and I am evicting it. exp ffff88017917cc00, cur 1499280293 expire 1499280143 last 1499280064
Jul  5 19:07:31 soak-8 kernel: Lustre: MGS: haven&apos;t heard from client 9b21a845-8a0f-5a86-e846-32a309c2b726 (at 192.168.1.111@o2ib) in 233 seconds. I think it&apos;s dead, and I am evicting it. exp ffff88017976dc00, cur 1499281651 expire 1499281501 last 1499281418
Jul  5 19:10:18 soak-8 kernel: LustreError: 167-0: soaked-MDT0003-osp-MDT0000: This client was evicted by soaked-MDT0003; in progress operations using this service will fail.
Jul  5 19:35:10 soak-8 kernel: Lustre: MGS: haven&apos;t heard from client 687666be-1c95-4869-bffb-261e5c2ec0e7 (at 192.168.1.111@o2ib) in 228 seconds. I think it&apos;s dead, and I am evicting it. exp ffff8807dbcfe000, cur 1499283310 expire 1499283160 last 1499283082
Jul  5 19:37:55 soak-8 kernel: LustreError: 167-0: soaked-MDT0003-osp-MDT0000: This client was evicted by soaked-MDT0003; in progress operations using this service will fail.
Jul  5 20:06:07 soak-8 kernel: Lustre: MGS: haven&apos;t heard from client a43f28ec-dc91-d483-5a19-102f4ecfe255 (at 192.168.1.109@o2ib) in 228 seconds. I think it&apos;s dead, and I am evicting it. exp ffff880404926c00, cur 1499285167 expire 1499285017 last 1499284939
Jul  5 20:07:15 soak-8 kernel: Lustre: 15754:0:(ldlm_lib.c:2059:target_recovery_overseer()) recovery is aborted, evict exports in recovery
Jul  5 20:08:00 soak-8 kernel: LustreError: 167-0: soaked-MDT0001-osp-MDT0000: This client was evicted by soaked-MDT0001; in progress operations using this service will fail.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;It would be good if someone could log in and have a look.&lt;/p&gt;</comment>
                            <comment id="201090" author="niu" created="Thu, 6 Jul 2017 01:55:52 +0000"  >&lt;p&gt;All the evicted clients are MDT-MDT client, it seems like a DNE recovery problem, and not related to this patch. Cliff, did you ever observe similar recovery failures during past soak testings?&lt;/p&gt;

&lt;p&gt;Are the clients evicted by the MGS (192.168.1.103@o2ib, 106, 109 and 111) MDTs or Lustre clients? Did you observe any hung MDT during recovery? Could you attach the logs from all MDTs to the ticket? It&apos;s quite slow from here to log in and operate on the soak nodes. (If any MDT hangs during recovery, could you dump its full stack trace and attach it to the ticket as well.) Thanks in advance.&lt;/p&gt;</comment>
                            <comment id="201218" author="cliffw" created="Thu, 6 Jul 2017 18:15:17 +0000"  >&lt;p&gt;I am seeing two things constantly with this build:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;soak-10
----------------
mdt.soaked-MDT0002.recovery_status=
status: WAITING
non-ready MDTs:  0000 0001 0003
recovery_start: 1499361565
time_waited: 3037
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;1. After a failover, recovery takes a very long time on the MDS.&lt;/p&gt;

&lt;p&gt;2. When LFSCK is started, it times out and the abort command is issued by the framework. The abort takes a very long time.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;2017-07-06 17:29:30,838:fsmgmt.fsmgmt:INFO     executing cmd: lctl lfsck_stop -M soaked-MDT0000
2017-07-06 18:13:57,169:fsmgmt.fsmgmt:INFO     lfsck stopped on soak-8
2017-07-06 18:13:57,170:fsmgmt.fsmgmt:INFO     mds_restart completed, lfsck completed
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Lustre-log dumps from all MDS and further stack traces attached.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[ 3113.596481] Lustre: soaked-MDT0002: Recovery already passed deadline 30:28, It is most likely due to DNE recovery is failed or stuck, please wait a few more minutes or abort the recovery.^M
[ 3113.617684] Lustre: soaked-MDT0002: Connection restored to a953293b-361e-a006-a3a3-0e98a9c1b777 (at 172.16.1.49@o2ib1)^M
[ 3318.928937] Lustre: 3702:0:(ldlm_lib.c:1784:extend_recovery_timer()) soaked-MDT0002: extended recovery timer reaching hard limit: 900, extend: 1^M
[ 3318.946182] Lustre: 3702:0:(ldlm_lib.c:1784:extend_recovery_timer()) Skipped 31 previous similar messages^M
[ 3628.813529] Lustre: soaked-MDT0002: recovery is timed out, evict stale exports^M
[ 3628.824816] Lustre: soaked-MDT0002: disconnecting 2 stale clients^M
[ 3628.836011] Lustre: 3702:0:(ldlm_lib.c:1630:abort_req_replay_queue()) @@@ aborted:  req@ffff8803e67c8300 x1572096734652000/t0(1181119750277) o35-&amp;gt;13a5519c-4a88-30e7-e970-6d733706fa78@172.16.1.45@o2ib1:501/0 lens 512/0 e 129 to 0 dl 1499364816 ref 1 fl Complete:/4/ffffffff rc 0/-1^M
[ 3628.893326] Lustre: soaked-MDT0002: Denying connection for new client a519ea7f-c98c-a97e-e2cf-40376209045c(at 192.168.1.116@o2ib), waiting for 37 known clients (34 recovered, 1 in progress, and 2 evicted) to recover in 21188466:16^M
[ 3628.900998] Lustre: soaked-MDT0002: Received new LWP connection from 192.168.1.108@o2ib, removing former export from same NID^M
[ 3628.901105] Lustre: soaked-MDT0002: Connection restored to 192.168.1.108@o2ib (at 192.168.1.108@o2ib)^M
[ 3628.908451] Lustre: 3702:0:(ldlm_lib.c:2049:target_recovery_overseer()) soaked-MDT0002 recovery is aborted by hard timeout^M
[ 3628.908459] Lustre: 3702:0:(ldlm_lib.c:2059:target_recovery_overseer()) recovery is aborted, evict exports in recovery^M
[ 3628.917834] Lustre: soaked-MDT0002: Recovery over after 54:04, of 37 clients 34 recovered and 3 were evicted.^M
[ 3652.712709] Lustre: soaked-MDT0002: Connection restored to 192.168.1.108@o2ib (at 192.168.1.108@o2ib)^M
[ 3652.726408] Lustre: Skipped 6 previous similar messages^M
[ 3659.981726] LustreError: 3722:0:(mdt_lvb.c:163:mdt_lvbo_fill()) soaked-MDT0002: expected 968 actual 344.^M
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="201291" author="niu" created="Fri, 7 Jul 2017 03:38:02 +0000"  >&lt;p&gt;Thank you, Cliff. Seems there is something wrong with the DNE recovery, I see there is a OI scrub thread hung for waiting io completion on soak-8, but not sure if it&apos;s related to the recovery failure problem:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[19587.652446] OI_scrub        D ffff666666663838     0 28932      2 0x00000080
[19587.660994]  ffff8807d14c7890 0000000000000046 ffff88082c3f9f60 ffff8807d14c7fd8
[19587.669968]  ffff8803d0049898 ffff8807d14c7fd8 ffff88082c3f9f60 ffff88042e056c40
[19587.679285]  ffff88042c8a1f78 7fffffffffffffff ffff88039a765000 ffffffff8168a8a0
[19587.688282] Call Trace:
[19587.691650]  [&amp;lt;ffffffff8168a8a0&amp;gt;] ? bit_wait+0x50/0x50
[19587.698046]  [&amp;lt;ffffffff8168c849&amp;gt;] schedule+0x29/0x70
[19587.704230]  [&amp;lt;ffffffff8168a289&amp;gt;] schedule_timeout+0x239/0x2c0
[19587.711411]  [&amp;lt;ffffffffa0272f19&amp;gt;] ? dm_old_request_fn+0xc9/0x1f0 [dm_mod]
[19587.719649]  [&amp;lt;ffffffff810eb0dc&amp;gt;] ? ktime_get_ts64+0x4c/0xf0
[19587.726616]  [&amp;lt;ffffffff8168a8a0&amp;gt;] ? bit_wait+0x50/0x50
[19587.732999]  [&amp;lt;ffffffff8168bdee&amp;gt;] io_schedule_timeout+0xae/0x130
[19587.740348]  [&amp;lt;ffffffff813252b1&amp;gt;] ? vsnprintf+0x441/0x6a0
[19587.747378]  [&amp;lt;ffffffff8168be88&amp;gt;] ? io_schedule+0x18/0x20
[19587.754050]  [&amp;lt;ffffffffa1167b3c&amp;gt;] ? ldiskfs_map_blocks+0x5c/0x5e0 [ldiskfs]
[19587.762459]  [&amp;lt;ffffffff8168a3d5&amp;gt;] ? __wait_on_bit+0x65/0x90
[19587.769313]  [&amp;lt;ffffffff8168a8a0&amp;gt;] ? bit_wait+0x50/0x50
[19587.775671]  [&amp;lt;ffffffff8168a481&amp;gt;] ? out_of_line_wait_on_bit+0x81/0xb0
[19587.783486]  [&amp;lt;ffffffff810b1be0&amp;gt;] ? wake_bit_function+0x40/0x40
[19587.790716]  [&amp;lt;ffffffff8123341a&amp;gt;] ? __wait_on_buffer+0x2a/0x30
[19587.797859]  [&amp;lt;ffffffffa116833c&amp;gt;] ? ldiskfs_bread+0x7c/0xc0 [ldiskfs]
[19587.806016]  [&amp;lt;ffffffffa122f0a0&amp;gt;] ? iam_node_read+0x60/0xe0 [osd_ldiskfs]
[19587.814213]  [&amp;lt;ffffffffa122a6c7&amp;gt;] ? fid_is_on_ost+0x1c7/0x420 [osd_ldiskfs]
[19587.822600]  [&amp;lt;ffffffffa122a9c4&amp;gt;] ? osd_oi_lookup+0xa4/0x160 [osd_ldiskfs]
[19587.830894]  [&amp;lt;ffffffffa0bc43b3&amp;gt;] ? libcfs_log_goto+0x23/0x30 [libcfs]
[19587.838792]  [&amp;lt;ffffffffa123de09&amp;gt;] ? osd_scrub_get_fid+0x1f9/0x290 [osd_ldiskfs]
[19587.847565]  [&amp;lt;ffffffffa1242a62&amp;gt;] ? osd_scrub_exec+0x62/0x5b0 [osd_ldiskfs]
[19587.855967]  [&amp;lt;ffffffffa11631fe&amp;gt;] ? ldiskfs_read_inode_bitmap+0x1fe/0x5b0 [ldiskfs]
[19587.865156]  [&amp;lt;ffffffffa1244225&amp;gt;] ? osd_inode_iteration+0x475/0xc70 [osd_ldiskfs]
[19587.874494]  [&amp;lt;ffffffffa1242a00&amp;gt;] ? osd_ios_ROOT_scan+0x300/0x300 [osd_ldiskfs]
[19587.883303]  [&amp;lt;ffffffffa123eab0&amp;gt;] ? osd_preload_next+0xa0/0xa0 [osd_ldiskfs]
[19587.891811]  [&amp;lt;ffffffffa1245360&amp;gt;] ? osd_scrub_main+0x940/0xf00 [osd_ldiskfs]
[19587.900318]  [&amp;lt;ffffffff810c54e0&amp;gt;] ? wake_up_state+0x20/0x20
[19587.907177]  [&amp;lt;ffffffffa1244a20&amp;gt;] ? osd_inode_iteration+0xc70/0xc70 [osd_ldiskfs]
[19587.916170]  [&amp;lt;ffffffff810b0a4f&amp;gt;] ? kthread+0xcf/0xe0
[19587.922453]  [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread_create_on_node+0x140/0x140
[19587.930372]  [&amp;lt;ffffffff81697798&amp;gt;] ? ret_from_fork+0x58/0x90
[19587.937217]  [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread_create_on_node+0x140/0x140
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Lai, could you take a look at the logs and stack traces to see if there is anything suspicious? Thank you.&lt;/p&gt;</comment>
                            <comment id="201333" author="laisiyao" created="Fri, 7 Jul 2017 11:44:52 +0000"  >&lt;p&gt;This is found in soak-10.2.log:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;soak-10.2.log:00010000:02000400:9.0:1499351737.374320:0:3602:0:(ldlm_lib.c:2080:target_recovery_overseer()) soaked-MDT0002: recovery is timed out, evict stale exports
soak-10.2.log:00010000:00080000:9.0:1499351737.427959:0:3602:0:(ldlm_lib.c:2044:target_recovery_overseer()) soaked-MDT0002: there are still update replay (0x1120005fc96)in the queue.
soak-10.2.log:00010000:00000040:9.0:1499351737.427965:0:3602:0:(ldlm_lib.c:2504:target_recovery_thread()) 1: request replay stage - 1 clients from t1176821431446
soak-10.2.log:00010000:00080000:9.0:1499351737.427983:0:3602:0:(ldlm_lib.c:2044:target_recovery_overseer()) soaked-MDT0002: there are still update replay (0x1120005fc96)in the queue.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;It looks like the DNE recovery did not finish, which may cause the system to hang or other failures; I&apos;ll check how this happened.&lt;/p&gt;</comment>
                            <comment id="201394" author="cliffw" created="Fri, 7 Jul 2017 18:57:12 +0000"  >&lt;p&gt;I have created &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9748&quot; title=&quot;DNE recovery hangs, blocks Lustre recovery&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9748&quot;&gt;&lt;del&gt;LU-9748&lt;/del&gt;&lt;/a&gt; for the DNE recovery issue&lt;/p&gt;</comment>
                            <comment id="201555" author="cliffw" created="Mon, 10 Jul 2017 17:07:20 +0000"  >&lt;p&gt;Hit this again on lustre-master, build 3606&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[ 4931.461826] LustreError: 3604:0:(recover.c:157:ptlrpc_replay_next()) ASSERTION( !list_empty(&amp;amp;req-&amp;gt;rq_cli.cr_unreplied_list) ) failed:
[ 4931.475358] LustreError: 3604:0:(recover.c:157:ptlrpc_replay_next()) LBUG
[ 4931.482951] Pid: 3604, comm: ptlrpcd_rcv
[ 4931.487341]
[ 4931.487341] Call Trace:
[ 4931.491756]  [&amp;lt;ffffffffa07807ee&amp;gt;] libcfs_call_trace+0x4e/0x60 [libcfs]
[ 4931.499055]  [&amp;lt;ffffffffa078087c&amp;gt;] lbug_with_loc+0x4c/0xb0 [libcfs]
[ 4931.506024]  [&amp;lt;ffffffffa0b19817&amp;gt;] ptlrpc_replay_next+0x447/0x450 [ptlrpc]
[ 4931.513639]  [&amp;lt;ffffffffa0b3cde2&amp;gt;] ptlrpc_import_recovery_state_machine+0x1d2/0xbc0 [ptlrpc]
[ 4931.522999]  [&amp;lt;ffffffffa0b11f1f&amp;gt;] ptlrpc_replay_interpret+0x17f/0x7d0 [ptlrpc]
[ 4931.531098]  [&amp;lt;ffffffffa0b15c7c&amp;gt;] ptlrpc_check_set.part.23+0x42c/0x1da0 [ptlrpc]
[ 4931.539393]  [&amp;lt;ffffffffa0b1764b&amp;gt;] ptlrpc_check_set+0x5b/0xe0 [ptlrpc]
[ 4931.546623]  [&amp;lt;ffffffffa0b432bb&amp;gt;] ptlrpcd_check+0x4db/0x5d0 [ptlrpc]
[ 4931.553754]  [&amp;lt;ffffffffa0b4366b&amp;gt;] ptlrpcd+0x2bb/0x560 [ptlrpc]
[ 4931.560279]  [&amp;lt;ffffffff810c5080&amp;gt;] ? default_wake_function+0x0/0x20
[ 4931.567215]  [&amp;lt;ffffffffa0b433b0&amp;gt;] ? ptlrpcd+0x0/0x560 [ptlrpc]
[ 4931.573730]  [&amp;lt;ffffffff810b06ff&amp;gt;] kthread+0xcf/0xe0
[ 4931.579187]  [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread+0x0/0xe0
[ 4931.584740]  [&amp;lt;ffffffff81696a58&amp;gt;] ret_from_fork+0x58/0x90
[ 4931.590778]  [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread+0x0/0xe0
[ 4931.596326]
[ 4931.597997] Kernel panic - not syncing: LBUG
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="201556" author="cliffw" created="Mon, 10 Jul 2017 17:08:43 +0000"  >&lt;p&gt;There is a crash dump available on soak-31&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[ 4874.102156] Lustre: 3604:0:(client.c:2114:ptlrpc_expire_one_request()) Skipped 3 previous similar messages
[ 4928.198801] Lustre: soaked-OST0002-osc-ffff88105a2ec000: Connection restored to 192.168.1.104@o2ib (at 192.168.1.104@o2ib)
[ 4929.712442] Lustre: soaked-OST0014-osc-ffff88105a2ec000: Connection restored to 192.168.1.104@o2ib (at 192.168.1.104@o2ib)
[ 4929.724803] Lustre: Skipped 1 previous similar message
[ 4931.461826] LustreError: 3604:0:(recover.c:157:ptlrpc_replay_next()) ASSERTION( !list_empty(&amp;amp;req-&amp;gt;rq_cli.cr_unreplied_list) ) failed:
[ 4931.475358] LustreError: 3604:0:(recover.c:157:ptlrpc_replay_next()) LBUG
[ 4931.482951] Pid: 3604, comm: ptlrpcd_rcv
[ 4931.487341]
Call Trace:
[ 4931.491756]  [&amp;lt;ffffffffa07807ee&amp;gt;] libcfs_call_trace+0x4e/0x60 [libcfs]
[ 4931.499055]  [&amp;lt;ffffffffa078087c&amp;gt;] lbug_with_loc+0x4c/0xb0 [libcfs]
[ 4931.506024]  [&amp;lt;ffffffffa0b19817&amp;gt;] ptlrpc_replay_next+0x447/0x450 [ptlrpc]
[ 4931.513639]  [&amp;lt;ffffffffa0b3cde2&amp;gt;] ptlrpc_import_recovery_state_machine+0x1d2/0xbc0 [ptlrpc]
[ 4931.522999]  [&amp;lt;ffffffffa0b11f1f&amp;gt;] ptlrpc_replay_interpret+0x17f/0x7d0 [ptlrpc]
[ 4931.531098]  [&amp;lt;ffffffffa0b15c7c&amp;gt;] ptlrpc_check_set.part.23+0x42c/0x1da0 [ptlrpc]
[ 4931.539393]  [&amp;lt;ffffffffa0b1764b&amp;gt;] ptlrpc_check_set+0x5b/0xe0 [ptlrpc]
[ 4931.546623]  [&amp;lt;ffffffffa0b432bb&amp;gt;] ptlrpcd_check+0x4db/0x5d0 [ptlrpc]
[ 4931.553754]  [&amp;lt;ffffffffa0b4366b&amp;gt;] ptlrpcd+0x2bb/0x560 [ptlrpc]
[ 4931.560279]  [&amp;lt;ffffffff810c5080&amp;gt;] ? default_wake_function+0x0/0x20
[ 4931.567215]  [&amp;lt;ffffffffa0b433b0&amp;gt;] ? ptlrpcd+0x0/0x560 [ptlrpc]
[ 4931.573730]  [&amp;lt;ffffffff810b06ff&amp;gt;] kthread+0xcf/0xe0
[ 4931.579187]  [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread+0x0/0xe0
[ 4931.584740]  [&amp;lt;ffffffff81696a58&amp;gt;] ret_from_fork+0x58/0x90
[ 4931.590778]  [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread+0x0/0xe0
[ 4931.596326]
[ 4931.597997] Kernel panic - not syncing: LBUG
[ 4931.602769] CPU: 5 PID: 3604 Comm: ptlrpcd_rcv Tainted: G          IOE  ------------   3.10.0-514.10.2.el7.x86_64 #1
[ 4931.614514] Hardware name: Intel Corporation S2600WTT/S2600WTT, BIOS SE5C610.86B.01.01.0008.021120151325 02/11/2015
[ 4931.626160]  ffffffffa079ee8b 00000000f7bba2b7 ffff880841747b30 ffffffff816864ef
[ 4931.634457]  ffff880841747bb0 ffffffff8167f8f6 ffffffff00000008 ffff880841747bc0
[ 4931.642744]  ffff880841747b60 00000000f7bba2b7 00000000f7bba2b7 ffff88085ed4f838
[ 4931.651035] Call Trace:
[ 4931.653764]  [&amp;lt;ffffffff816864ef&amp;gt;] dump_stack+0x19/0x1b
[ 4931.659507]  [&amp;lt;ffffffff8167f8f6&amp;gt;] panic+0xe3/0x1f2
[ 4931.664856]  [&amp;lt;ffffffffa0780894&amp;gt;] lbug_with_loc+0x64/0xb0 [libcfs]
[ 4931.671781]  [&amp;lt;ffffffffa0b19817&amp;gt;] ptlrpc_replay_next+0x447/0x450 [ptlrpc]
[ 4931.679388]  [&amp;lt;ffffffffa0b3cde2&amp;gt;] ptlrpc_import_recovery_state_machine+0x1d2/0xbc0 [ptlrpc]
[ 4931.688742]  [&amp;lt;ffffffffa0b11f1f&amp;gt;] ptlrpc_replay_interpret+0x17f/0x7d0 [ptlrpc]
[ 4931.696832]  [&amp;lt;ffffffffa0b15c7c&amp;gt;] ptlrpc_check_set.part.23+0x42c/0x1da0 [ptlrpc]
[ 4931.705118]  [&amp;lt;ffffffffa0b1764b&amp;gt;] ptlrpc_check_set+0x5b/0xe0 [ptlrpc]
[ 4931.712339]  [&amp;lt;ffffffffa0b432bb&amp;gt;] ptlrpcd_check+0x4db/0x5d0 [ptlrpc]
[ 4931.719465]  [&amp;lt;ffffffffa0b4366b&amp;gt;] ptlrpcd+0x2bb/0x560 [ptlrpc]
[ 4931.725981]  [&amp;lt;ffffffff810c5080&amp;gt;] ? wake_up_state+0x20/0x20
[ 4931.732224]  [&amp;lt;ffffffffa0b433b0&amp;gt;] ? ptlrpcd_check+0x5d0/0x5d0 [ptlrpc]
[ 4931.739517]  [&amp;lt;ffffffff810b06ff&amp;gt;] kthread+0xcf/0xe0
[ 4931.744959]  [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread_create_on_node+0x140/0x140
[ 4931.752243]  [&amp;lt;ffffffff81696a58&amp;gt;] ret_from_fork+0x58/0x90
[ 4931.758267]  [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread_create_on_node+0x140/0x140
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="201621" author="cliffw" created="Tue, 11 Jul 2017 01:22:16 +0000"  >&lt;p&gt;Now running lustre-b2_10 build #3&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Jul 11 01:13:20 soak-17 kernel: LustreError: 2937:0:(recover.c:157:ptlrpc_replay_next()) ASSERTION( !list_empty(&amp;amp;req-&amp;gt;rq_cli.cr_unreplied_list) ) failed:
Jul 11 01:13:20 soak-17 kernel: LustreError: 2937:0:(recover.c:157:ptlrpc_replay_next()) LBUG
Jul 11 01:13:20 soak-17 kernel: Pid: 2937, comm: ptlrpcd_rcv
Jul 11 01:13:20 soak-17 kernel:
Call Trace:
Jul 11 01:13:20 soak-17 kernel: [&amp;lt;ffffffffa08447ee&amp;gt;] libcfs_call_trace+0x4e/0x60 [libcfs]
Jul 11 01:13:20 soak-17 kernel: [&amp;lt;ffffffffa084487c&amp;gt;] lbug_with_loc+0x4c/0xb0 [libcfs]
Jul 11 01:13:20 soak-17 kernel: [&amp;lt;ffffffffa0b76817&amp;gt;] ptlrpc_replay_next+0x447/0x450 [ptlrpc]
Jul 11 01:13:20 soak-17 kernel: [&amp;lt;ffffffffa0b99de2&amp;gt;] ptlrpc_import_recovery_state_machine+0x1d2/0xbc0 [ptlrpc]
Jul 11 01:13:20 soak-17 kernel: [&amp;lt;ffffffffa0b6ef1f&amp;gt;] ptlrpc_replay_interpret+0x17f/0x7d0 [ptlrpc]
Jul 11 01:13:20 soak-17 kernel: [&amp;lt;ffffffffa0b72c7c&amp;gt;] ptlrpc_check_set.part.23+0x42c/0x1da0 [ptlrpc]
Jul 11 01:13:20 soak-17 kernel: [&amp;lt;ffffffffa0b7464b&amp;gt;] ptlrpc_check_set+0x5b/0xe0 [ptlrpc]
Jul 11 01:13:20 soak-17 kernel: [&amp;lt;ffffffffa0ba02bb&amp;gt;] ptlrpcd_check+0x4db/0x5d0 [ptlrpc]
Jul 11 01:13:20 soak-17 kernel: [&amp;lt;ffffffffa0ba066b&amp;gt;] ptlrpcd+0x2bb/0x560 [ptlrpc]
Jul 11 01:13:20 soak-17 kernel: [&amp;lt;ffffffff810c5080&amp;gt;] ? default_wake_function+0x0/0x20
Jul 11 01:13:20 soak-17 kernel: [&amp;lt;ffffffffa0ba03b0&amp;gt;] ? ptlrpcd+0x0/0x560 [ptlrpc]
Jul 11 01:13:20 soak-17 kernel: [&amp;lt;ffffffff810b06ff&amp;gt;] kthread+0xcf/0xe0
Jul 11 01:13:20 soak-17 kernel: [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread+0x0/0xe0
Jul 11 01:13:20 soak-17 kernel: [&amp;lt;ffffffff81696a58&amp;gt;] ret_from_fork+0x58/0x90
Jul 11 01:13:20 soak-17 kernel: [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread+0x0/0xe0
Jul 11 01:13:20 soak-17 kernel:
Jul 11 01:13:20 soak-17 kernel: Kernel panic - not syncing: LBUG
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Crash dump is available on soak-17&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[ 3947.210017] perf: interrupt took too long (3225 &amp;gt; 3192), lowering kernel.perf_event_max_sample_rate to 62000
[ 4161.246530] Lustre: 2953:0:(client.c:2114:ptlrpc_expire_one_request()) @@@ Request sent has timed out for sent delay: [sent 1499735243/real 0]  req@ffff880397b67200 x1572582391402544/t0(0) o101-&amp;gt;soaked-OST0008-osc-ffff8808276d0000@192.168.1.104@o2ib:28/4 lens 328/400 e 0 to 1 dl 1499735256 ref 2 fl Rpc:X/0/ffffffff rc 0/-1
[ 4161.282464] Lustre: soaked-OST0008-osc-ffff8808276d0000: Connection to soaked-OST0008 (at 192.168.1.104@o2ib) was lost; in progress operations using this service will wait for recovery to complete
[ 4162.030527] Lustre: soaked-OST000e-osc-ffff8808276d0000: Connection to soaked-OST000e (at 192.168.1.104@o2ib) was lost; in progress operations using this service will wait for recovery to complete
[ 4169.926508] Lustre: soaked-OST0014-osc-ffff8808276d0000: Connection to soaked-OST0014 (at 192.168.1.104@o2ib) was lost; in progress operations using this service will wait for recovery to complete
[ 4194.948696] Lustre: 2944:0:(client.c:2114:ptlrpc_expire_one_request()) @@@ Request sent has failed due to network error: [sent 1499735290/real 1499735290]  req@ffff880334b43c00 x1572582391418656/t0(0) o400-&amp;gt;soaked-OST0002-osc-ffff8808276d0000@192.168.1.104@o2ib:28/4 lens 224/224 e 0 to 1 dl 1499735303 ref 1 fl Rpc:eXN/0/ffffffff rc 0/-1
[ 4194.986416] Lustre: 2944:0:(client.c:2114:ptlrpc_expire_one_request()) Skipped 7 previous similar messages
[ 4194.998649] Lustre: soaked-OST0002-osc-ffff8808276d0000: Connection to soaked-OST0002 (at 192.168.1.104@o2ib) was lost; in progress operations using this service will wait for recovery to complete
[ 4245.020397] Lustre: 2937:0:(client.c:2114:ptlrpc_expire_one_request()) @@@ Request sent has failed due to network error: [sent 1499735340/real 1499735340]  req@ffff880334b42100 x1572582391449120/t0(0) o8-&amp;gt;soaked-OST0002-osc-ffff8808276d0000@192.168.1.104@o2ib:28/4 lens 520/544 e 0 to 1 dl 1499735351 ref 1 fl Rpc:eXN/0/ffffffff rc 0/-1
[ 4245.057630] Lustre: 2937:0:(client.c:2114:ptlrpc_expire_one_request()) Skipped 4 previous similar messages
[ 4320.019906] Lustre: 2937:0:(client.c:2114:ptlrpc_expire_one_request()) @@@ Request sent has failed due to network error: [sent 1499735415/real 1499735415]  req@ffff880335d55a00 x1572582391496080/t0(0) o8-&amp;gt;soaked-OST0008-osc-ffff8808276d0000@192.168.1.104@o2ib:28/4 lens 520/544 e 0 to 1 dl 1499735436 ref 1 fl Rpc:eXN/0/ffffffff rc 0/-1
[ 4320.057297] Lustre: 2937:0:(client.c:2114:ptlrpc_expire_one_request()) Skipped 4 previous similar messages
[ 4451.018399] Lustre: 2937:0:(client.c:2114:ptlrpc_expire_one_request()) @@@ Request sent has timed out for sent delay: [sent 1499735515/real 0]  req@ffff880335d53c00 x1572582391561888/t0(0) o8-&amp;gt;soaked-OST0014-osc-ffff8808276d0000@192.168.1.104@o2ib:28/4 lens 520/544 e 0 to 1 dl 1499735546 ref 2 fl Rpc:XN/0/ffffffff rc 0/-1
[ 4451.054303] Lustre: 2937:0:(client.c:2114:ptlrpc_expire_one_request()) Skipped 7 previous similar messages
[ 4504.566715] LustreError: 2937:0:(recover.c:157:ptlrpc_replay_next()) ASSERTION( !list_empty(&amp;amp;req-&amp;gt;rq_cli.cr_unreplied_list) ) failed:
[ 4504.587823] LustreError: 2937:0:(recover.c:157:ptlrpc_replay_next()) LBUG
[ 4504.598875] Pid: 2937, comm: ptlrpcd_rcv
[ 4504.606014]
Call Trace:
[ 4504.615457]  [&amp;lt;ffffffffa08447ee&amp;gt;] libcfs_call_trace+0x4e/0x60 [libcfs]
[ 4504.624991]  [&amp;lt;ffffffffa084487c&amp;gt;] lbug_with_loc+0x4c/0xb0 [libcfs]
[ 4504.634021]  [&amp;lt;ffffffffa0b76817&amp;gt;] ptlrpc_replay_next+0x447/0x450 [ptlrpc]
[ 4504.643628]  [&amp;lt;ffffffffa0b99de2&amp;gt;] ptlrpc_import_recovery_state_machine+0x1d2/0xbc0 [ptlrpc]
[ 4504.654851]  [&amp;lt;ffffffffa0b6ef1f&amp;gt;] ptlrpc_replay_interpret+0x17f/0x7d0 [ptlrpc]
[ 4504.664682]  [&amp;lt;ffffffffa0b72c7c&amp;gt;] ptlrpc_check_set.part.23+0x42c/0x1da0 [ptlrpc]
[ 4504.674633]  [&amp;lt;ffffffffa0b7464b&amp;gt;] ptlrpc_check_set+0x5b/0xe0 [ptlrpc]
[ 4504.683405]  [&amp;lt;ffffffffa0ba02bb&amp;gt;] ptlrpcd_check+0x4db/0x5d0 [ptlrpc]
[ 4504.692018]  [&amp;lt;ffffffffa0ba066b&amp;gt;] ptlrpcd+0x2bb/0x560 [ptlrpc]
[ 4504.699993]  [&amp;lt;ffffffff810c5080&amp;gt;] ? default_wake_function+0x0/0x20
[ 4504.708298]  [&amp;lt;ffffffffa0ba03b0&amp;gt;] ? ptlrpcd+0x0/0x560 [ptlrpc]
[ 4504.716207]  [&amp;lt;ffffffff810b06ff&amp;gt;] kthread+0xcf/0xe0
[ 4504.723002]  [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread+0x0/0xe0
[ 4504.729868]  [&amp;lt;ffffffff81696a58&amp;gt;] ret_from_fork+0x58/0x90
[ 4504.737213]  [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread+0x0/0xe0
[ 4504.744039]
[ 4504.746963] Kernel panic - not syncing: LBUG
[ 4504.752969] CPU: 14 PID: 2937 Comm: ptlrpcd_rcv Tainted: G           OE  ------------   3.10.0-514.10.2.el7.x86_64 #1
[ 4504.766051] Hardware name: Intel Corporation S2600GZ ........../S2600GZ, BIOS SE5C600.86B.01.08.0003.022620131521 02/26/2013
[ 4504.779814]  ffffffffa0862e8b 0000000057e50b83 ffff88082ab1fb30 ffffffff816864ef
[ 4504.789340]  ffff88082ab1fbb0 ffffffff8167f8f6 ffffffff00000008 ffff88082ab1fbc0
[ 4504.798846]  ffff88082ab1fb60 0000000057e50b83 0000000057e50b83 ffff88082d98f838
[ 4504.808330] Call Trace:
[ 4504.812212]  [&amp;lt;ffffffff816864ef&amp;gt;] dump_stack+0x19/0x1b
[ 4504.819084]  [&amp;lt;ffffffff8167f8f6&amp;gt;] panic+0xe3/0x1f2
[ 4504.825543]  [&amp;lt;ffffffffa0844894&amp;gt;] lbug_with_loc+0x64/0xb0 [libcfs]
[ 4504.833554]  [&amp;lt;ffffffffa0b76817&amp;gt;] ptlrpc_replay_next+0x447/0x450 [ptlrpc]
[ 4504.842225]  [&amp;lt;ffffffffa0b99de2&amp;gt;] ptlrpc_import_recovery_state_machine+0x1d2/0xbc0 [ptlrpc]
[ 4504.852623]  [&amp;lt;ffffffffa0b6ef1f&amp;gt;] ptlrpc_replay_interpret+0x17f/0x7d0 [ptlrpc]
[ 4504.861745]  [&amp;lt;ffffffffa0b72c7c&amp;gt;] ptlrpc_check_set.part.23+0x42c/0x1da0 [ptlrpc]
[ 4504.871040]  [&amp;lt;ffffffffa0b7464b&amp;gt;] ptlrpc_check_set+0x5b/0xe0 [ptlrpc]
[ 4504.879243]  [&amp;lt;ffffffffa0ba02bb&amp;gt;] ptlrpcd_check+0x4db/0x5d0 [ptlrpc]
[ 4504.887319]  [&amp;lt;ffffffffa0ba066b&amp;gt;] ptlrpcd+0x2bb/0x560 [ptlrpc]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="201622" author="cliffw" created="Tue, 11 Jul 2017 01:26:41 +0000"  >&lt;p&gt;Attached lustre dump from all MDS&lt;/p&gt;</comment>
                            <comment id="201624" author="niu" created="Tue, 11 Jul 2017 01:35:11 +0000"  >&lt;p&gt;Cliff, the fix hasn&apos;t been landed yet, so the crash is expected.&lt;/p&gt;</comment>
                            <comment id="201638" author="pjones" created="Tue, 11 Jul 2017 04:47:52 +0000"  >&lt;p&gt;Niu&lt;/p&gt;

&lt;p&gt;the test also included a reconfiguration on soak which it was hoped might make this race less likely as the disk faliure had coincided with this race being hit far more frenquently (see DCO-7264), Given that this has not worked would it be enough to just land this one fix or did earlier testing suggest further fixes would be needed too?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="201646" author="niu" created="Tue, 11 Jul 2017 06:55:26 +0000"  >&lt;p&gt;Peter, I think the fix from &lt;a href=&quot;https://review.whamcloud.com/#/c/27920/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/27920/&lt;/a&gt; should be enough.&lt;/p&gt;</comment>
                            <comment id="201700" author="gerrit" created="Tue, 11 Jul 2017 17:20:20 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/27920/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/27920/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9274&quot; title=&quot;LBUG: (recover.c:157:ptlrpc_replay_next()) ASSERTION( !list_empty(&amp;amp;req-&amp;gt;rq_cli.cr_unreplied_list) ) failed:&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9274&quot;&gt;&lt;del&gt;LU-9274&lt;/del&gt;&lt;/a&gt; ptlrpc: add replay request into unreplied list&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 7d29d3167684b13a612c8c1bff860019a218115c&lt;/p&gt;
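&lt;p&gt;For readers following along, below is a minimal, self-contained userspace model of what the landed patch does (plain C with simplified stand-in names; it is not the actual ptlrpc code, and where exactly the real patch re-links the request is simplified here): the assertion in ptlrpc_replay_next() requires every request picked for replay to still be linked into the import&apos;s unreplied list, so the fix re-links a replay request that has already been removed from that list.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;/*
 * Illustrative model only -- NOT the real ptlrpc code.  The list
 * helpers mimic the kernel&apos;s intrusive lists; struct request and the
 * race below are simplified stand-ins.
 */
#include &amp;lt;assert.h&amp;gt;
#include &amp;lt;stdio.h&amp;gt;

struct list_node { struct list_node *prev, *next; };

static void list_init(struct list_node *n) { n-&amp;gt;prev = n-&amp;gt;next = n; }
static int list_empty(const struct list_node *n) { return n-&amp;gt;next == n; }

static void list_add_tail(struct list_node *n, struct list_node *head)
{
        n-&amp;gt;prev = head-&amp;gt;prev;
        n-&amp;gt;next = head;
        head-&amp;gt;prev-&amp;gt;next = n;
        head-&amp;gt;prev = n;
}

static void list_del_init(struct list_node *n)
{
        n-&amp;gt;prev-&amp;gt;next = n-&amp;gt;next;
        n-&amp;gt;next-&amp;gt;prev = n-&amp;gt;prev;
        list_init(n);
}

struct request { struct list_node unreplied_link; };

int main(void)
{
        struct list_node unreplied;   /* per-import unreplied list */
        struct request req;

        list_init(&amp;amp;unreplied);
        list_init(&amp;amp;req.unreplied_link);
        list_add_tail(&amp;amp;req.unreplied_link, &amp;amp;unreplied); /* send path */

        /* Race modelled here: the request gets unlinked before recovery
         * selects it for replay. */
        list_del_init(&amp;amp;req.unreplied_link);

        /* The fix: when queueing the request for replay, put it back on
         * the unreplied list if it is no longer linked. */
        if (list_empty(&amp;amp;req.unreplied_link))
                list_add_tail(&amp;amp;req.unreplied_link, &amp;amp;unreplied);

        /* Invariant asserted by ptlrpc_replay_next(), modelled here: */
        assert(!list_empty(&amp;amp;req.unreplied_link));
        printf(&quot;replay request is back on the unreplied list\n&quot;);
        return 0;
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>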
                            <comment id="202460" author="niu" created="Tue, 18 Jul 2017 13:39:35 +0000"  >&lt;p&gt;Patch landed for 2.10.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                                        </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="47156">LU-9748</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="27446" name="soak-10.2.log.gz" size="9791752" author="cliffw" created="Thu, 6 Jul 2017 14:42:57 +0000"/>
                            <attachment id="27457" name="soak-10.debug.log.gz" size="6813711" author="cliffw" created="Thu, 6 Jul 2017 16:04:46 +0000"/>
                            <attachment id="27469" name="soak-10.dump.txt.gz" size="3965065" author="cliffw" created="Thu, 6 Jul 2017 18:43:36 +0000"/>
                            <attachment id="27444" name="soak-10.log.gz" size="192167" author="cliffw" created="Thu, 6 Jul 2017 14:26:50 +0000"/>
                            <attachment id="27568" name="soak-10.lustre.log.txt.gz" size="10846173" author="cliffw" created="Tue, 11 Jul 2017 01:26:04 +0000"/>
                            <attachment id="27450" name="soak-10.stack.txt" size="1290806" author="cliffw" created="Thu, 6 Jul 2017 15:37:01 +0000"/>
                            <attachment id="27445" name="soak-10.stacks.txt" size="1122917" author="cliffw" created="Thu, 6 Jul 2017 14:26:51 +0000"/>
                            <attachment id="27467" name="soak-10.stacks.txt.gz" size="81885" author="cliffw" created="Thu, 6 Jul 2017 18:43:26 +0000"/>
                            <attachment id="27458" name="soak-11.debug.log.gz" size="16007480" author="cliffw" created="Thu, 6 Jul 2017 16:05:07 +0000"/>
                            <attachment id="27463" name="soak-11.dump.txt.gz" size="6089684" author="cliffw" created="Thu, 6 Jul 2017 18:43:40 +0000"/>
                            <attachment id="27569" name="soak-11.lustre.log.txt.gz" size="11920856" author="cliffw" created="Tue, 11 Jul 2017 01:26:21 +0000"/>
                            <attachment id="27451" name="soak-11.stacks.txt" size="1307557" author="cliffw" created="Thu, 6 Jul 2017 15:37:06 +0000"/>
                            <attachment id="27462" name="soak-11.stacks.txt.gz" size="111403" author="cliffw" created="Thu, 6 Jul 2017 18:43:25 +0000"/>
                            <attachment id="27410" name="soak-16.lustre-log.1498910471.2905.gz" size="71747" author="cliffw" created="Mon, 3 Jul 2017 14:41:34 +0000"/>
                            <attachment id="27108" name="soak-16.syslog.txt" size="4355768" author="cliffw" created="Fri, 23 Jun 2017 14:35:49 +0000"/>
                            <attachment id="27206" name="soak-29.console.txt.gz" size="10889122" author="cliffw" created="Wed, 28 Jun 2017 14:51:47 +0000"/>
                            <attachment id="27455" name="soak-8.debug.log.gz" size="6884684" author="cliffw" created="Thu, 6 Jul 2017 16:03:36 +0000"/>
                            <attachment id="27464" name="soak-8.dump.txt.gz" size="6137708" author="cliffw" created="Thu, 6 Jul 2017 18:43:39 +0000"/>
                            <attachment id="27566" name="soak-8.lustre.log.txt.gz" size="11631653" author="cliffw" created="Tue, 11 Jul 2017 01:25:29 +0000"/>
                            <attachment id="27448" name="soak-8.stacks.txt" size="1431196" author="cliffw" created="Thu, 6 Jul 2017 15:37:05 +0000"/>
                            <attachment id="27465" name="soak-8.stacks.txt.gz" size="118388" author="cliffw" created="Thu, 6 Jul 2017 18:43:25 +0000"/>
                            <attachment id="27456" name="soak-9.debug.log.gz" size="16160668" author="cliffw" created="Thu, 6 Jul 2017 16:03:59 +0000"/>
                            <attachment id="27468" name="soak-9.dump.txt.gz" size="4454963" author="cliffw" created="Thu, 6 Jul 2017 18:43:37 +0000"/>
                            <attachment id="27567" name="soak-9.lustre.log.txt.gz" size="11465053" author="cliffw" created="Tue, 11 Jul 2017 01:25:46 +0000"/>
                            <attachment id="27449" name="soak-9.stacks.txt" size="1416025" author="cliffw" created="Thu, 6 Jul 2017 15:37:01 +0000"/>
                            <attachment id="27466" name="soak-9.stacks.txt.gz" size="116500" author="cliffw" created="Thu, 6 Jul 2017 18:43:26 +0000"/>
                            <attachment id="26064" name="vmcore-dmesg.txt" size="137942" author="cliffw" created="Thu, 30 Mar 2017 15:10:31 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzz8rj:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                </customfields>
    </item>
</channel>
</rss>