<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:20:47 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-8815] MDS fails to complete recovery</title>
                <link>https://jira.whamcloud.com/browse/LU-8815</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;After previous failover, MDT0003 is failed back to lola-11:&lt;br/&gt;
Nov  8 22:10:14 lola-10 kernel: Lustre: server umount soaked-MDT0003 complete&lt;/p&gt;

&lt;p&gt;MDS failover fault induced, lola-11 fails MDT0003 to lola-10:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;2016-11-08 22:24:25,959:fsmgmt.fsmgmt:INFO     Failing over soaked-MDT0003 ...
2016-11-08 22:24:25,959:fsmgmt.fsmgmt:INFO     Mounting soaked-MDT0003 on lola-10 ...
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Recovery starts on MDS: Nov  8 22:25:29 lola-10 kernel: Lustre: soaked-MDT0003: Imperative Recovery enabled, recovery window shrunk from 300-900 down to 150-450&lt;/p&gt;

&lt;p&gt;Task blocked on lola-10:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Nov  8 22:29:17 lola-10 kernel: INFO: task mdt_out01_007:6305 blocked &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; more than 120 seconds.
Nov  8 22:29:17 lola-10 kernel:      Tainted: P           -- ------------    2.6.32-573.26.1.el6_lustre.x86_64 #1
Nov  8 22:29:17 lola-10 kernel: &lt;span class=&quot;code-quote&quot;&gt;&quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot;&lt;/span&gt; disables &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; message.
Nov  8 22:29:17 lola-10 kernel: mdt_out01_007 D 0000000000000018     0  6305      2 0x00000080
Nov  8 22:29:17 lola-10 kernel: ffff8808104b7b40 0000000000000046 0000000000000000 ffff8808104b7d00
Nov  8 22:29:17 lola-10 kernel: ffff8804337ea800 ffff880824300118 000014206fe1f7c1 ffff880824300118
Nov  8 22:29:17 lola-10 kernel: ffff8808104b7b40 00000001014d3a26 ffff880834add068 ffff8808104b7fd8
Nov  8 22:29:17 lola-10 kernel: Call Trace:
Nov  8 22:29:17 lola-10 kernel: [&amp;lt;ffffffffa07bca3d&amp;gt;] lu_object_find_at+0x3d/0xe0 [obdclass]
Nov  8 22:29:17 lola-10 kernel: [&amp;lt;ffffffffa09dd442&amp;gt;] ? __req_capsule_get+0x162/0x6e0 [ptlrpc]
Nov  8 22:29:17 lola-10 kernel: [&amp;lt;ffffffff81067650&amp;gt;] ? default_wake_function+0x0/0x20
Nov  8 22:29:17 lola-10 kernel: [&amp;lt;ffffffffa09b2af0&amp;gt;] ? lustre_swab_object_update_reply+0x0/0xc0 [ptlrpc]
Nov  8 22:29:17 lola-10 kernel: [&amp;lt;ffffffffa07bd7cc&amp;gt;] dt_locate_at+0x1c/0xa0 [obdclass]
Nov  8 22:29:17 lola-10 kernel: [&amp;lt;ffffffffa0a24147&amp;gt;] out_handle+0x1067/0x18d0 [ptlrpc]
Nov  8 22:29:17 lola-10 kernel: [&amp;lt;ffffffff8105e9b6&amp;gt;] ? enqueue_task+0x66/0x80
Nov  8 22:29:17 lola-10 kernel: [&amp;lt;ffffffff8105ab8d&amp;gt;] ? check_preempt_curr+0x6d/0x90
Nov  8 22:29:17 lola-10 kernel: [&amp;lt;ffffffffa067ac8a&amp;gt;] ? lc_watchdog_touch+0x7a/0x190 [libcfs]
Nov  8 22:29:17 lola-10 kernel: [&amp;lt;ffffffff8153afce&amp;gt;] ? mutex_lock+0x1e/0x50
Nov  8 22:29:17 lola-10 kernel: [&amp;lt;ffffffffa0a142ca&amp;gt;] ? req_can_reconstruct+0x6a/0x120 [ptlrpc]
Nov  8 22:29:17 lola-10 kernel: [&amp;lt;ffffffffa0a1b4bc&amp;gt;] tgt_request_handle+0x8ec/0x1440 [ptlrpc]
Nov  8 22:29:17 lola-10 kernel: [&amp;lt;ffffffffa09c7bc1&amp;gt;] ptlrpc_main+0xd31/0x1800 [ptlrpc]
Nov  8 22:29:17 lola-10 kernel: [&amp;lt;ffffffff81539b0e&amp;gt;] ? thread_return+0x4e/0x7d0
Nov  8 22:29:17 lola-10 kernel: [&amp;lt;ffffffffa09c6e90&amp;gt;] ? ptlrpc_main+0x0/0x1800 [ptlrpc]
Nov  8 22:29:17 lola-10 kernel: [&amp;lt;ffffffff810a138e&amp;gt;] kthread+0x9e/0xc0
Nov  8 22:29:17 lola-10 kernel: [&amp;lt;ffffffff8100c28a&amp;gt;] child_rip+0xa/0x20
Nov  8 22:29:17 lola-10 kernel: [&amp;lt;ffffffff810a12f0&amp;gt;] ? kthread+0x0/0xc0
Nov  8 22:29:18 lola-10 kernel: [&amp;lt;ffffffff8100c280&amp;gt;] ? child_rip+0x0/0x20
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Second blocked task:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Nov  8 22:29:32 lola-10 kernel: Call Trace:
Nov  8 22:29:32 lola-10 kernel: [&amp;lt;ffffffffa07bca3d&amp;gt;] lu_object_find_at+0x3d/0xe0 [obdclass]
Nov  8 22:29:32 lola-10 kernel: [&amp;lt;ffffffffa09dd442&amp;gt;] ? __req_capsule_get+0x162/0x6e0 [ptlrpc]
Nov  8 22:29:32 lola-10 kernel: [&amp;lt;ffffffff81067650&amp;gt;] ? default_wake_function+0x0/0x20
Nov  8 22:29:32 lola-10 kernel: [&amp;lt;ffffffffa09b2af0&amp;gt;] ? lustre_swab_object_update_reply+0x0/0xc0 [ptlrpc]
Nov  8 22:29:32 lola-10 kernel: [&amp;lt;ffffffffa07bd7cc&amp;gt;] dt_locate_at+0x1c/0xa0 [obdclass]
Nov  8 22:29:32 lola-10 kernel: [&amp;lt;ffffffffa0a24147&amp;gt;] out_handle+0x1067/0x18d0 [ptlrpc]
Nov  8 22:29:32 lola-10 kernel: [&amp;lt;ffffffff8105e9b6&amp;gt;] ? enqueue_task+0x66/0x80
Nov  8 22:29:32 lola-10 kernel: [&amp;lt;ffffffff8105ab8d&amp;gt;] ? check_preempt_curr+0x6d/0x90
Nov  8 22:29:32 lola-10 kernel: [&amp;lt;ffffffffa067ac8a&amp;gt;] ? lc_watchdog_touch+0x7a/0x190 [libcfs]
Nov  8 22:29:32 lola-10 kernel: [&amp;lt;ffffffff8153afce&amp;gt;] ? mutex_lock+0x1e/0x50
Nov  8 22:29:32 lola-10 kernel: [&amp;lt;ffffffffa0a142ca&amp;gt;] ? req_can_reconstruct+0x6a/0x120 [ptlrpc]
Nov  8 22:29:32 lola-10 kernel: [&amp;lt;ffffffffa0a1b4bc&amp;gt;] tgt_request_handle+0x8ec/0x1440 [ptlrpc]
Nov  8 22:29:32 lola-10 kernel: [&amp;lt;ffffffffa09c7bc1&amp;gt;] ptlrpc_main+0xd31/0x1800 [ptlrpc]
Nov  8 22:29:32 lola-10 kernel: [&amp;lt;ffffffff81539b0e&amp;gt;] ? thread_return+0x4e/0x7d0
Nov  8 22:29:32 lola-10 kernel: [&amp;lt;ffffffffa09c6e90&amp;gt;] ? ptlrpc_main+0x0/0x1800 [ptlrpc]
Nov  8 22:29:32 lola-10 kernel: [&amp;lt;ffffffff810a138e&amp;gt;] kthread+0x9e/0xc0
Nov  8 22:29:32 lola-10 kernel: [&amp;lt;ffffffff8100c28a&amp;gt;] child_rip+0xa/0x20
Nov  8 22:29:32 lola-10 kernel: [&amp;lt;ffffffff810a12f0&amp;gt;] ? kthread+0x0/0xc0
Nov  8 22:29:32 lola-10 kernel: [&amp;lt;ffffffff8100c280&amp;gt;] ? child_rip+0x0/0x20
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Recovery timer completes, but recovery does not:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;mdt.soaked-MDT0003.recovery_status=
status: RECOVERING
recovery_start: 1478672734
time_remaining: 0
connected_clients: 21/21
req_replay_clients: 4
lock_repay_clients: 5
completed_clients: 16
evicted_clients: 0
replayed_requests: 7
queued_requests: 4
next_transno: 77309522798
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;System finally notices: &lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Nov  9 08:22:35 lola-10 kernel: Lustre: soaked-MDT0003: Recovery already passed deadline 588:42, It is most likely due to DNE recovery is failed or stuck, please wait a few more minutes or abort the recovery.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Attached lustre-log dumps from lola-10&lt;/p&gt;</description>
                <environment>Soak test cluster, tip of lustre-master lustre: 2.8.59_79_gb8811a0</environment>
        <key id="41437">LU-8815</key>
            <summary>MDS fails to complete recovery</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="6" iconUrl="https://jira.whamcloud.com/images/icons/statuses/closed.png" description="The issue is considered finished, the resolution is correct. Issues which are closed can be reopened.">Closed</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="3">Duplicate</resolution>
                                        <assignee username="yong.fan">nasf</assignee>
                                    <reporter username="cliffw">Cliff White</reporter>
                        <labels>
                            <label>soak</label>
                    </labels>
                <created>Wed, 9 Nov 2016 17:54:42 +0000</created>
                <updated>Fri, 30 Jun 2017 01:51:09 +0000</updated>
                            <resolved>Fri, 30 Jun 2017 01:51:09 +0000</resolved>
                                    <version>Lustre 2.9.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>9</watches>
                                                                            <comments>
                            <comment id="172969" author="cliffw" created="Wed, 9 Nov 2016 17:56:17 +0000"  >&lt;p&gt;Prior to this issue, soak test had 4 successful MDS failovers, 8 successful MDS restarts.&lt;/p&gt;</comment>
                            <comment id="172971" author="di.wang" created="Wed, 9 Nov 2016 18:08:04 +0000"  >&lt;p&gt;The recovery stuck here&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;tgt_recover_3 S 0000000000000008     0  9320      2 0x00000080
 ffff8803539037f0 0000000000000046 0000000000000000 ffff8803c5a25520
 ffff88041d3fa040 ffff8803c5a25520 0000345d232c77d9 ffff8807ba8ae1c0
 ffff8807fec20000 00000001036a5b96 ffff8803c5a25ad8 ffff880353903fd8
Call Trace:
 [&amp;lt;ffffffff8153a9b2&amp;gt;] schedule_timeout+0x192/0x2e0
 [&amp;lt;ffffffff81089fa0&amp;gt;] ? process_timeout+0x0/0x10
 [&amp;lt;ffffffffa09abbb1&amp;gt;] ptlrpc_set_wait+0x321/0x960 [ptlrpc]
 [&amp;lt;ffffffffa09a0ce0&amp;gt;] ? ptlrpc_interrupted_set+0x0/0x120 [ptlrpc]
 [&amp;lt;ffffffff81067650&amp;gt;] ? default_wake_function+0x0/0x20
 [&amp;lt;ffffffffa09b80a5&amp;gt;] ? lustre_msg_set_jobid+0xf5/0x130 [ptlrpc]
 [&amp;lt;ffffffffa09ac271&amp;gt;] ptlrpc_queue_wait+0x81/0x220 [ptlrpc]
 [&amp;lt;ffffffffa12b6cc2&amp;gt;] osp_remote_sync+0xf2/0x1e0 [osp]
 [&amp;lt;ffffffffa129b1c7&amp;gt;] osp_attr_get+0x447/0x710 [osp]
 [&amp;lt;ffffffff811791ea&amp;gt;] ? kmem_cache_alloc+0x18a/0x190
 [&amp;lt;ffffffffa129bd25&amp;gt;] osp_object_init+0x1f5/0x360 [osp]
 [&amp;lt;ffffffffa07bb508&amp;gt;] lu_object_alloc+0xd8/0x320 [obdclass]
 [&amp;lt;ffffffffa07bc8f1&amp;gt;] lu_object_find_try+0x151/0x260 [obdclass]
 [&amp;lt;ffffffffa07bcab1&amp;gt;] lu_object_find_at+0xb1/0xe0 [obdclass]
 [&amp;lt;ffffffffa07bbac5&amp;gt;] ? lu_object_put+0x135/0x3b0 [obdclass]
 [&amp;lt;ffffffffa07bd7cc&amp;gt;] dt_locate_at+0x1c/0xa0 [obdclass]
 [&amp;lt;ffffffffa0a3457a&amp;gt;] update_recovery_exec+0xfa/0x1ce0 [ptlrpc]
 [&amp;lt;ffffffffa0a37da1&amp;gt;] distribute_txn_replay_handle+0x271/0xcf0 [ptlrpc]
 [&amp;lt;ffffffffa0979402&amp;gt;] target_recovery_thread+0xa12/0x1dd0 [ptlrpc]
 [&amp;lt;ffffffffa09789f0&amp;gt;] ? target_recovery_thread+0x0/0x1dd0 [ptlrpc]
 [&amp;lt;ffffffff810a138e&amp;gt;] kthread+0x9e/0xc0
 [&amp;lt;ffffffff8100c28a&amp;gt;] child_rip+0xa/0x20
 [&amp;lt;ffffffff810a12f0&amp;gt;] ? kthread+0x0/0xc0
 [&amp;lt;ffffffff8100c280&amp;gt;] ? child_rip+0x0/0x20
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;And on the remote MDT, most thread stuck at&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;mdt_out01_012 D 0000000000000009     0 19980      2 0x00000080
 ffff8803facd3b40 0000000000000046 0000000000000000 ffff8803facd3d00
 ffff8804337ea800 ffff880824e00118 0000213d70f62b10 ffff880824e00118
 ffff8803facd3b40 00000001022959fb ffff8803f16c1ad8 ffff8803facd3fd8
Call Trace:
 [&amp;lt;ffffffffa07bca3d&amp;gt;] lu_object_find_at+0x3d/0xe0 [obdclass]
 [&amp;lt;ffffffffa09dd442&amp;gt;] ? __req_capsule_get+0x162/0x6e0 [ptlrpc]
 [&amp;lt;ffffffff81067650&amp;gt;] ? default_wake_function+0x0/0x20
 [&amp;lt;ffffffffa09b2af0&amp;gt;] ? lustre_swab_object_update_reply+0x0/0xc0 [ptlrpc]
 [&amp;lt;ffffffffa07bd7cc&amp;gt;] dt_locate_at+0x1c/0xa0 [obdclass]
 [&amp;lt;ffffffffa0a24147&amp;gt;] out_handle+0x1067/0x18d0 [ptlrpc]
 [&amp;lt;ffffffff8105e9b6&amp;gt;] ? enqueue_task+0x66/0x80
 [&amp;lt;ffffffff8105ab8d&amp;gt;] ? check_preempt_curr+0x6d/0x90
 [&amp;lt;ffffffffa067ac8a&amp;gt;] ? lc_watchdog_touch+0x7a/0x190 [libcfs]
 [&amp;lt;ffffffff8153afce&amp;gt;] ? mutex_lock+0x1e/0x50
 [&amp;lt;ffffffffa0a142ca&amp;gt;] ? req_can_reconstruct+0x6a/0x120 [ptlrpc]
 [&amp;lt;ffffffffa0a1b4bc&amp;gt;] tgt_request_handle+0x8ec/0x1440 [ptlrpc]
 [&amp;lt;ffffffffa09c7bc1&amp;gt;] ptlrpc_main+0xd31/0x1800 [ptlrpc]
 [&amp;lt;ffffffff81539b0e&amp;gt;] ? thread_return+0x4e/0x7d0
 [&amp;lt;ffffffffa09c6e90&amp;gt;] ? ptlrpc_main+0x0/0x1800 [ptlrpc]
 [&amp;lt;ffffffff810a138e&amp;gt;] kthread+0x9e/0xc0
 [&amp;lt;ffffffff8100c28a&amp;gt;] child_rip+0xa/0x20
 [&amp;lt;ffffffff810a12f0&amp;gt;] ? kthread+0x0/0xc0
 [&amp;lt;ffffffff8100c280&amp;gt;] ? child_rip+0x0/0x20
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Just glance the trace a bit, no obvious evidence why it stuck here. &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/sad.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;</comment>
                            <comment id="172972" author="cliffw" created="Wed, 9 Nov 2016 18:17:37 +0000"  >&lt;p&gt;The abort_recovery command never completes:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;LustreError: 72214:0:(mdt_handler.c:5942:mdt_iocontrol()) soaked-MDT0003: Aborting recovery &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; device
LustreError: 72214:0:(ldlm_lib.c:2596:target_stop_recovery_thread()) soaked-MDT0003: Aborting recovery
LustreError: 6078:0:(ldlm_lib.c:2816:target_queue_recovery_request()) @@@ dropping resent queued req  req@ffff88
03f78f03c0 x1550360654345360/t0(77309522798) o36-&amp;gt;3d670dc0-f7e5-756a-f555-c09f70a29db2@192.168.1.117@o2ib100:-1/
-1 lens 768/0 e 0 to 0 dl 1478715043 ref 2 fl Interpret:/6/ffffffff rc 0/-1
LustreError: 6078:0:(ldlm_lib.c:2816:target_queue_recovery_request()) Skipped 169 previous similar messages
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="181590" author="cliffw" created="Fri, 20 Jan 2017 16:09:58 +0000"  >&lt;p&gt;Hit this again testing 2.9.0 GA &lt;/p&gt;</comment>
                            <comment id="181599" author="di.wang" created="Fri, 20 Jan 2017 17:11:53 +0000"  >&lt;p&gt;Any more information here? console or stack trace. Thanks.&lt;/p&gt;</comment>
                            <comment id="181600" author="cliffw" created="Fri, 20 Jan 2017 17:30:55 +0000"  >&lt;p&gt;Currently, system cannot recover. &lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Jan 20 08:59:41 lola-8 kernel: Pid: 6236, comm: OI_scrub Tainted: P           --L------------    2.6.32-573.26.1.el6_lustre.x86_64 #1 Intel Corporation S2600GZ ........../S2600GZ
Jan 20 08:59:41 lola-8 kernel: RIP: 0010:[&amp;lt;ffffffffa10d6e98&amp;gt;]  [&amp;lt;ffffffffa10d6e98&amp;gt;] osd_inode_iteration+0x498/0xd80 [osd_ldiskfs]
Jan 20 08:59:41 lola-8 kernel: RSP: 0018:ffff8804173d7d10  EFLAGS: 00000202
Jan 20 08:59:41 lola-8 kernel: RAX: 0000000000000004 RBX: ffff8804173d7e00 RCX: ffff8804173d7dc0
Jan 20 08:59:41 lola-8 kernel: RDX: ffff8804173d7da0 RSI: ffff8808296fc000 RDI: ffff88082d394000
Jan 20 08:59:41 lola-8 kernel: RBP: ffffffff8100bc0e R08: 0000000000000001 R09: 0000000000000004
Jan 20 08:59:41 lola-8 kernel: R10: ffff880417371b80 R11: ffff880417371b90 R12: ffff8804173d7e00
Jan 20 08:59:41 lola-8 kernel: R13: ffff8808296fd000 R14: ffff8808296fd0e0 R15: 0000000000000001
Jan 20 08:59:41 lola-8 kernel: FS:  0000000000000000(0000) GS:ffff880038680000(0000) knlGS:0000000000000000
Jan 20 08:59:41 lola-8 kernel: CS:  0010 DS: 0018 ES: 0018 CR0: 000000008005003b
Jan 20 08:59:41 lola-8 kernel: CR2: 00000039036acd90 CR3: 0000000001a8d000 CR4: 00000000000407e0
Jan 20 08:59:41 lola-8 kernel: DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
Jan 20 08:59:41 lola-8 kernel: DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Jan 20 08:59:41 lola-8 kernel: &lt;span class=&quot;code-object&quot;&gt;Process&lt;/span&gt; OI_scrub (pid: 6236, threadinfo ffff8804173d4000, task ffff880435f34040)
Jan 20 08:59:41 lola-8 kernel: Stack:
Jan 20 08:59:41 lola-8 kernel: 0000000000015a00 ffff8808331d4c00 ffff8808000005a6 ffff8804173d7e10
Jan 20 08:59:41 lola-8 kernel: &amp;lt;d&amp;gt; ffff8804173d7dd0 ffffffff81539b0e ffff880435f34040 ffff880435f34040
Jan 20 08:59:41 lola-8 kernel: &amp;lt;d&amp;gt; ffffffffa10d1e20 ffffffffa10d3be0 ffff88082d394000 ffff8808345e0000
Jan 20 08:59:41 lola-8 kernel: Call Trace:
Jan 20 08:59:41 lola-8 kernel: [&amp;lt;ffffffff81539b0e&amp;gt;] ? thread_return+0x4e/0x7d0
Jan 20 08:59:41 lola-8 kernel: [&amp;lt;ffffffffa10d1e20&amp;gt;] ? osd_scrub_exec+0x0/0x1dc0 [osd_ldiskfs]
Jan 20 08:59:41 lola-8 kernel: [&amp;lt;ffffffffa10d3be0&amp;gt;] ? osd_scrub_next+0x0/0x4b0 [osd_ldiskfs]
Jan 20 08:59:41 lola-8 kernel: [&amp;lt;ffffffffa10d8905&amp;gt;] ? osd_scrub_main+0x885/0xec0 [osd_ldiskfs]
Jan 20 08:59:41 lola-8 kernel: [&amp;lt;ffffffff81067650&amp;gt;] ? default_wake_function+0x0/0x20
Jan 20 08:59:41 lola-8 kernel: [&amp;lt;ffffffffa10d8080&amp;gt;] ? osd_scrub_main+0x0/0xec0 [osd_ldiskfs]
Jan 20 08:59:41 lola-8 kernel: [&amp;lt;ffffffff810a138e&amp;gt;] ? kthread+0x9e/0xc0
Jan 20 08:59:41 lola-8 kernel: [&amp;lt;ffffffff8100c28a&amp;gt;] ? child_rip+0xa/0x20
Jan 20 08:59:41 lola-8 kernel: [&amp;lt;ffffffff810a12f0&amp;gt;] ? kthread+0x0/0xc0
Jan 20 08:59:41 lola-8 kernel: [&amp;lt;ffffffff8100c280&amp;gt;] ? child_rip+0x0/0x20
Jan 20 08:59:41 lola-8 kernel: Code: 00 02 05 00 00 48 c7 05 a3 59 02 00 00 00 00 00 c7 05 91 59 02 00 01 00 00 00 e8 d4 ad 7b ff e9 d2 fb ff ff 0f 1f 80 00 00 00 00 &amp;lt;83&amp;gt; f8 05 0f 84 67 03 00 00 83 f8 06 0f 84 a6 04 00 00 41 89 c1 
Jan 20 08:59:41 lola-8 kernel: Call Trace:
Jan 20 08:59:41 lola-8 kernel: [&amp;lt;ffffffff81539b0e&amp;gt;] ? thread_return+0x4e/0x7d0
Jan 20 08:59:41 lola-8 kernel: [&amp;lt;ffffffffa10d1e20&amp;gt;] ? osd_scrub_exec+0x0/0x1dc0 [osd_ldiskfs]
Jan 20 08:59:41 lola-8 kernel: [&amp;lt;ffffffffa10d3be0&amp;gt;] ? osd_scrub_next+0x0/0x4b0 [osd_ldiskfs]
Jan 20 08:59:41 lola-8 kernel: [&amp;lt;ffffffffa10d8905&amp;gt;] ? osd_scrub_main+0x885/0xec0 [osd_ldiskfs]
Jan 20 08:59:41 lola-8 kernel: [&amp;lt;ffffffff81067650&amp;gt;] ? default_wake_function+0x0/0x20
Jan 20 08:59:41 lola-8 kernel: [&amp;lt;ffffffffa10d8080&amp;gt;] ? osd_scrub_main+0x0/0xec0 [osd_ldiskfs]
Jan 20 08:59:41 lola-8 kernel: [&amp;lt;ffffffff810a138e&amp;gt;] ? kthread+0x9e/0xc0
Jan 20 08:59:41 lola-8 kernel: [&amp;lt;ffffffff8100c28a&amp;gt;] ? child_rip+0xa/0x20
Jan 20 08:59:41 lola-8 kernel: [&amp;lt;ffffffff810a12f0&amp;gt;] ? kthread+0x0/0xc0
Jan 20 08:59:41 lola-8 kernel: [&amp;lt;ffffffff8100c280&amp;gt;] ? child_rip+0x0/0x20
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="182361" author="yong.fan" created="Fri, 27 Jan 2017 09:44:53 +0000"  >&lt;p&gt;According to the bug description, it seems that one MDT was trying to recover/replay some cross-MDTs RPC, that triggered OUT_ATTR_GET RPC. Unfortunately, such OUT RPC was blocked on remote MDT inside lu_object_find_at(). Usually, the lu_object_find_at() will be blocked if the target object is in-RAM but marked as dying. If someone hold the reference on the dying object, then others will be blocked until all the reference have been released. So we need to find out what the target object is, and who hold the reference on such dying object.&lt;/p&gt;

&lt;p&gt;Do we have any clue about the first question: what the target object is?&lt;br/&gt;
Di, is it possible an update_log file on remote MDT?&lt;/p&gt;

&lt;p&gt;As for the stack trace for OI scrub in above comment, that seems not related. Because OI scrub operates the inode directly, NOT reference the up layer object.&lt;/p&gt;</comment>
                            <comment id="183459" author="gerrit" created="Sat, 4 Feb 2017 11:37:10 +0000"  >&lt;p&gt;Fan Yong (fan.yong@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/25260&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/25260&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8815&quot; title=&quot;MDS fails to complete recovery&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8815&quot;&gt;&lt;del&gt;LU-8815&lt;/del&gt;&lt;/a&gt; obdclass: debug information for lu_object_find_at&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: e8c1940e2e7b0afa0b54d139c70359425bf92d40&lt;/p&gt;</comment>
                            <comment id="183460" author="yong.fan" created="Sat, 4 Feb 2017 11:40:10 +0000"  >&lt;p&gt;Cliff, would you please to verify the debug patch &lt;a href=&quot;https://review.whamcloud.com/#/c/25260/?&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/25260/?&lt;/a&gt; That can tell us which FID caused the blocking. Thanks!&lt;/p&gt;</comment>
                            <comment id="184261" author="yong.fan" created="Fri, 10 Feb 2017 00:38:05 +0000"  >&lt;p&gt;+1 on master:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/b54d8c9c-eee8-11e6-9756-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/b54d8c9c-eee8-11e6-9756-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
<comment id="187107" author="yong.fan" created="Mon, 6 Mar 2017 09:08:14 +0000"  >&lt;p&gt;Cliff, do you have any chance to reproduce the issue with the patch &lt;a href=&quot;https://review.whamcloud.com/#/c/25260/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/25260/&lt;/a&gt; applied? Thanks!&lt;/p&gt;</comment>
                            <comment id="187192" author="cliffw" created="Mon, 6 Mar 2017 17:49:35 +0000"  >&lt;p&gt;We are moving soak hardware this week, will add that to the list.&lt;/p&gt;</comment>
                            <comment id="190840" author="yong.fan" created="Wed, 5 Apr 2017 11:48:35 +0000"  >&lt;p&gt;Any update with the patch applied? Thanks!&lt;/p&gt;</comment>
                            <comment id="190862" author="cliffw" created="Wed, 5 Apr 2017 14:32:37 +0000"  >&lt;p&gt;We have been busy with the move, but have not seen any recovery issues. I should be able to try the patch this week, is there a build?&lt;/p&gt;</comment>
                            <comment id="190870" author="yong.fan" created="Wed, 5 Apr 2017 15:02:02 +0000"  >&lt;p&gt;&lt;a href=&quot;https://review.whamcloud.com/#/c/25260/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/25260/&lt;/a&gt;&lt;br/&gt;
set 4 build status:&lt;br/&gt;
Overall Build Status: SUCCESS&lt;br/&gt;
Builder CentOS 6.7 x86_64 (BUILD) succeeded (build successful) - &lt;a href=&quot;http://build.lustre.org/builders/CentOS%206.7%20x86_64%20%28BUILD%29/builds/7691&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.lustre.org/builders/CentOS%206.7%20x86_64%20%28BUILD%29/builds/7691&lt;/a&gt;&lt;br/&gt;
Builder CentOS 6.8 x86_64 (BUILD) succeeded (build successful) - &lt;a href=&quot;http://build.lustre.org/builders/CentOS%206.8%20x86_64%20%28BUILD%29/builds/3827&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.lustre.org/builders/CentOS%206.8%20x86_64%20%28BUILD%29/builds/3827&lt;/a&gt;&lt;br/&gt;
Builder CentOS 7.2 x86_64 (BUILD) succeeded (build successful) - &lt;a href=&quot;http://build.lustre.org/builders/CentOS%207.2%20x86_64%20%28BUILD%29/builds/7677&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.lustre.org/builders/CentOS%207.2%20x86_64%20%28BUILD%29/builds/7677&lt;/a&gt;&lt;br/&gt;
Builder Ubuntu 14.04 x86_64 (BUILD) succeeded (build successful) - &lt;a href=&quot;http://build.lustre.org/builders/Ubuntu%2014.04%20x86_64%20%28BUILD%29/builds/7637&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.lustre.org/builders/Ubuntu%2014.04%20x86_64%20%28BUILD%29/builds/7637&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="190888" author="cliffw" created="Wed, 5 Apr 2017 16:51:05 +0000"  >&lt;p&gt;We have also moved to RHEL 7 with new soak, can you re-trigger your build? Jenkins no longer has the bits. I get RPMS from Jenkins. &lt;/p&gt;</comment>
                            <comment id="192651" author="yong.fan" created="Wed, 19 Apr 2017 08:43:39 +0000"  >&lt;p&gt;New build:&lt;br/&gt;
&lt;a href=&quot;https://build.hpdd.intel.com/job/lustre-reviews/46754/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://build.hpdd.intel.com/job/lustre-reviews/46754/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="195720" author="adilger" created="Fri, 12 May 2017 18:28:01 +0000"  >&lt;p&gt;It may be that this is fixed with patch &lt;a href=&quot;https://review.whamcloud.com/26965&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/26965&lt;/a&gt; &quot;&lt;tt&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9049&quot; title=&quot;DNE MDT Never completes recovery&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9049&quot;&gt;&lt;del&gt;LU-9049&lt;/del&gt;&lt;/a&gt; obdclass: unhash object when destroying object&lt;/tt&gt;&quot;.&lt;/p&gt;</comment>
                            <comment id="196482" author="cliffw" created="Fri, 19 May 2017 16:54:08 +0000"  >&lt;p&gt;We have not seen a recovery failure since that time. However, we&apos;ve had other hard bugs that reduced runtime in some cases. &lt;/p&gt;</comment>
                            <comment id="196568" author="pjones" created="Sun, 21 May 2017 12:55:55 +0000"  >&lt;p&gt;Ok I will remove the fix version for the time being and we can continue to monitor&lt;/p&gt;</comment>
                            <comment id="200667" author="yong.fan" created="Fri, 30 Jun 2017 01:51:09 +0000"  >&lt;p&gt;Another instance of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9049&quot; title=&quot;DNE MDT Never completes recovery&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9049&quot;&gt;&lt;del&gt;LU-9049&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="43326">LU-9049</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="24038" name="lola-10.syslog.txt.gz" size="109077" author="cliffw" created="Wed, 9 Nov 2016 17:54:42 +0000"/>
                            <attachment id="24039" name="lustre-log.1478652100.6053.txt.gz" size="12758571" author="cliffw" created="Wed, 9 Nov 2016 17:54:42 +0000"/>
                            <attachment id="24040" name="lustre-log.1478705828.64556.txt.gz" size="4654" author="cliffw" created="Wed, 9 Nov 2016 17:54:42 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzyv1j:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>