<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:34:00 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
<language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-10321] MDS - umount hangs during failback </title>
                <link>https://jira.whamcloud.com/browse/LU-10321</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;MDT 2 (soak-10) fails over to soak-11, with errors&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Dec  2 07:27:00 soak-11 kernel: LustreError: 2976:0:(llog_osd.c:960:llog_osd_next_block()) soaked-MDT0003-osp-MDT0002: missed desired record? 2 &amp;gt; 1
Dec  2 07:27:00 soak-11 kernel: LustreError: 2976:0:(lod_dev.c:419:lod_sub_recovery_thread()) soaked-MDT0003-osp-MDT0002 getting update log failed: rc = -2
Dec  2 07:27:00 soak-11 kernel: LustreError: 2976:0:(lod_dev.c:419:lod_sub_recovery_thread()) Skipped 3 previous similar messages
Dec  2 07:27:01 soak-11 kernel: LustreError: 2381:0:(mdt_open.c:1167:mdt_cross_open()) soaked-MDT0002: [0x280002b4c:0xa44:0x0] doesn&apos;t exist!: rc = -14
Dec  2 07:27:02 soak-11 kernel: Lustre: 2977:0:(ldlm_lib.c:2059:target_recovery_overseer()) recovery is aborted, evict exports in recovery
Dec  2 07:27:02 soak-11 kernel: Lustre: 2977:0:(ldlm_lib.c:2059:target_recovery_overseer()) Skipped 2 previous similar messages
Dec  2 07:27:02 soak-11 kernel: Lustre: soaked-MDT0002: disconnecting 31 stale clients
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Soak attempts a umount, which hangs:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;2017-12-02 07:27:16,430:fsmgmt.fsmgmt:INFO     Unmounting soaked-MDT0002 on soak-11 ...
soak-11
Dec  2 07:30:16 soak-11 kernel: INFO: task umount:3039 blocked for more than 120 seconds.
Dec  2 07:30:16 soak-11 kernel: &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
Dec  2 07:30:16 soak-11 kernel: umount          D ffff8803c81f4008     0  3039   3037 0x00000080
Dec  2 07:30:16 soak-11 kernel: ffff8803ce3afa30 0000000000000086 ffff88081fa50000 ffff8803ce3affd8
Dec  2 07:30:16 soak-11 kernel: ffff8803ce3affd8 ffff8803ce3affd8 ffff88081fa50000 ffff8803c81f4000
Dec  2 07:30:16 soak-11 kernel: ffff8803c81f4004 ffff88081fa50000 00000000ffffffff ffff8803c81f4008
Dec  2 07:30:16 soak-11 kernel: Call Trace:
Dec  2 07:30:16 soak-11 kernel: [&amp;lt;ffffffff816aa489&amp;gt;] schedule_preempt_disabled+0x29/0x70
Dec  2 07:30:16 soak-11 kernel: [&amp;lt;ffffffff816a83b7&amp;gt;] __mutex_lock_slowpath+0xc7/0x1d0
Dec  2 07:30:16 soak-11 kernel: [&amp;lt;ffffffff816a77cf&amp;gt;] mutex_lock+0x1f/0x2f
Dec  2 07:30:16 soak-11 kernel: [&amp;lt;ffffffffc14560c7&amp;gt;] lfsck_stop+0x167/0x4e0 [lfsck]
Dec  2 07:30:16 soak-11 kernel: [&amp;lt;ffffffff810c4832&amp;gt;] ? default_wake_function+0x12/0x20
Dec  2 07:30:16 soak-11 kernel: [&amp;lt;ffffffff811e0593&amp;gt;] ? __kmalloc+0x1e3/0x230
Dec  2 07:30:16 soak-11 kernel: [&amp;lt;ffffffffc1625aa6&amp;gt;] mdd_iocontrol+0x96/0x16a0 [mdd]
Dec  2 07:30:17 soak-11 kernel: [&amp;lt;ffffffffc0ec9619&amp;gt;] ? lprocfs_counter_add+0xf9/0x160 [obdclass]
Dec  2 07:30:17 soak-11 kernel: [&amp;lt;ffffffffc1500fc1&amp;gt;] mdt_device_fini+0x71/0x920 [mdt]
Dec  2 07:30:17 soak-11 kernel: [&amp;lt;ffffffffc0ed6911&amp;gt;] class_cleanup+0x971/0xcd0 [obdclass]
Dec  2 07:30:17 soak-11 kernel: [&amp;lt;ffffffffc0ed8cad&amp;gt;] class_process_config+0x19cd/0x23b0 [obdclass]
Dec  2 07:30:17 soak-11 kernel: [&amp;lt;ffffffffc0dc6bc7&amp;gt;] ? libcfs_debug_msg+0x57/0x80 [libcfs]
Dec  2 07:30:17 soak-11 kernel: [&amp;lt;ffffffffc0ed9856&amp;gt;] class_manual_cleanup+0x1c6/0x710 [obdclass]
Dec  2 07:30:17 soak-11 kernel: [&amp;lt;ffffffffc0f07fee&amp;gt;] server_put_super+0x8de/0xcd0 [obdclass]
Dec  2 07:30:17 soak-11 kernel: [&amp;lt;ffffffff81203692&amp;gt;] generic_shutdown_super+0x72/0x100
Dec  2 07:30:17 soak-11 kernel: [&amp;lt;ffffffff81203a62&amp;gt;] kill_anon_super+0x12/0x20
Dec  2 07:30:17 soak-11 kernel: [&amp;lt;ffffffffc0edc152&amp;gt;] lustre_kill_super+0x32/0x50 [obdclass]
Dec  2 07:30:17 soak-11 kernel: [&amp;lt;ffffffff81203e19&amp;gt;] deactivate_locked_super+0x49/0x60
Dec  2 07:30:17 soak-11 kernel: [&amp;lt;ffffffff81204586&amp;gt;] deactivate_super+0x46/0x60
Dec  2 07:30:17 soak-11 kernel: [&amp;lt;ffffffff812217cf&amp;gt;] cleanup_mnt+0x3f/0x80
Dec  2 07:30:18 soak-11 kernel: [&amp;lt;ffffffff81221862&amp;gt;] __cleanup_mnt+0x12/0x20
Dec  2 07:30:18 soak-11 kernel: [&amp;lt;ffffffff810ad275&amp;gt;] task_work_run+0xc5/0xf0
Dec  2 07:30:18 soak-11 kernel: [&amp;lt;ffffffff8102ab62&amp;gt;] do_notify_resume+0x92/0xb0
Dec  2 07:30:18 soak-11 kernel: [&amp;lt;ffffffff816b533d&amp;gt;] int_signal+0x12/0x17
Dec  2 07:30:19 soak-11 kernel: LustreError: 11-0: soaked-OST0016-osc-MDT0002: operation ost_connect to node 192.168.1.106@o2ib failed: rc = -114
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This wedges soak: no further faults are attempted and jobs stop scheduling.&lt;br/&gt;
This happened over the weekend. Dumped Lustre logs, forced a crash dump. &lt;br/&gt;
Logs, crash info attached.&lt;br/&gt;
Full crash dump is available on Spirit. &lt;/p&gt;</description>
                <environment>Soak stress cluster, MLNX networking stack.</environment>
        <key id="49541">LU-10321</key>
            <summary>MDS - umount hangs during failback </summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="yong.fan">nasf</assignee>
                                    <reporter username="cliffw">Cliff White</reporter>
                        <labels>
                            <label>soak</label>
                    </labels>
                <created>Mon, 4 Dec 2017 17:54:10 +0000</created>
                <updated>Fri, 2 Feb 2018 18:56:50 +0000</updated>
                            <resolved>Thu, 4 Jan 2018 16:36:21 +0000</resolved>
                                    <version>Lustre 2.10.2</version>
                                    <fixVersion>Lustre 2.11.0</fixVersion>
                    <fixVersion>Lustre 2.10.4</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>7</watches>
                                                                            <comments>
                            <comment id="215263" author="jhammond" created="Mon, 4 Dec 2017 21:35:37 +0000"  >&lt;p&gt;Maybe a lfsck start/recovery dealdock.&lt;/p&gt;

&lt;p&gt;umount is waiting in &lt;tt&gt;lfsck_stop()&lt;/tt&gt; on &lt;tt&gt;li_mutex&lt;/tt&gt;. &lt;tt&gt;tgt_recover_2&lt;/tt&gt; is in &lt;tt&gt;lfsck_start()&lt;/tt&gt; holding &lt;tt&gt;li_mutex&lt;/tt&gt;. It is possibly waiting on &lt;tt&gt;lfsck_layout&lt;/tt&gt; (pid 3013), which is sending notifications but is stuck waiting for import recovery.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00100000:10000000:17.0:1512199622.506187:0:3013:0:(lfsck_engine.c:1576:lfsck_assistant_engine()) soaked-MDT0002-osd: lfsck_layout LFSCK assistant thread start
00000100:00080000:17.0:1512199622.506589:0:3013:0:(client.c:1563:ptlrpc_send_new_req()) @@@ req from PID 0 waiting for recovery: (FULL != CONNECTING)  req@ffff8803ee66b900 x1585651536798656/t0(0) o1101-&amp;gt;soaked-OST0016-osc-MDT0002@192.168.1.106@o2ib:28/4 lens 320/224 e 0 to 0 dl 0 ref 1 fl Rpc:W/0/ffffffff rc 0/-1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
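
&lt;p&gt;A minimal userspace sketch of this wait chain (pthreads stand-ins, illustration only; the names mirror the backtraces below, not the actual Lustre code):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;/* illustration only: models the wait chain, not the real Lustre code */
#include &amp;lt;pthread.h&amp;gt;
#include &amp;lt;unistd.h&amp;gt;

static pthread_mutex_t li_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t import_lock = PTHREAD_MUTEX_INITIALIZER;

static void *lfsck_layout(void *arg)   /* pid 3013 */
{
        /* the notify RPC can complete only after import recovery,
         * which never finishes in this scenario */
        pthread_mutex_lock(&amp;amp;import_lock);
        return NULL;
}

static void *tgt_recover(void *arg)    /* pid 2977 */
{
        pthread_t assistant;

        pthread_mutex_lock(&amp;amp;li_mutex);    /* lfsck_start() takes li_mutex */
        pthread_create(&amp;amp;assistant, NULL, lfsck_layout, NULL);
        pthread_join(assistant, NULL);      /* waits on the stuck assistant */
        pthread_mutex_unlock(&amp;amp;li_mutex);  /* never reached */
        return NULL;
}

int main(void)                         /* umount, pid 3039 */
{
        pthread_t recovery;

        pthread_mutex_lock(&amp;amp;import_lock); /* recovery never completes */
        pthread_create(&amp;amp;recovery, NULL, tgt_recover, NULL);
        sleep(1);
        pthread_mutex_lock(&amp;amp;li_mutex);    /* lfsck_stop(): blocks in D state */
        return 0;
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;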

&lt;p&gt;Relevant backtraces:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lfsck_namespace S ffff8803c81f4000     0  3012      2 0x00000080^M
 [&amp;lt;----------------&amp;gt;] schedule+0x29/0x70^M
 [&amp;lt;----------------&amp;gt;] lfsck_assistant_engine+0x118d/0x20c0 [lfsck]^M
 [&amp;lt;----------------&amp;gt;] ? dequeue_task_fair+0x41e/0x660^M
 [&amp;lt;----------------&amp;gt;] ? __schedule+0x39d/0x8b0^M
 [&amp;lt;----------------&amp;gt;] ? wake_up_state+0x20/0x20^M
 [&amp;lt;----------------&amp;gt;] ? lfsck_master_engine+0x12f0/0x12f0 [lfsck]^M
 [&amp;lt;----------------&amp;gt;] kthread+0xcf/0xe0^M
 [&amp;lt;----------------&amp;gt;] ? insert_kthread_work+0x40/0x40^M
 [&amp;lt;----------------&amp;gt;] ret_from_fork+0x58/0x90^M
 [&amp;lt;----------------&amp;gt;] ? insert_kthread_work+0x40/0x40^M

lfsck           S ffff8803f6dc9800     0  3010      2 0x00000080
 [&amp;lt;----------------&amp;gt;] schedule+0x29/0x70
 [&amp;lt;----------------&amp;gt;] lfsck_start_assistant+0x1d5/0x320 [lfsck]
 [&amp;lt;----------------&amp;gt;] ? dt_read+0x14/0x50 [obdclass]
 [&amp;lt;----------------&amp;gt;] ? lfsck_layout_load_bitmap+0x4bb/0x710 [lfsck]
 [&amp;lt;----------------&amp;gt;] ? wake_up_state+0x20/0x20
 [&amp;lt;----------------&amp;gt;] lfsck_layout_master_prep+0xfe/0x250 [lfsck]
 [&amp;lt;----------------&amp;gt;] lfsck_master_engine+0x176/0x12f0 [lfsck]
 [&amp;lt;----------------&amp;gt;] ? __switch_to+0xd7/0x510
 [&amp;lt;----------------&amp;gt;] ? __schedule+0x39d/0x8b0
 [&amp;lt;----------------&amp;gt;] ? lfsck_master_oit_engine+0x1150/0x1150 [lfsck]
 [&amp;lt;----------------&amp;gt;] kthread+0xcf/0xe0
 [&amp;lt;----------------&amp;gt;] ? insert_kthread_work+0x40/0x40
 [&amp;lt;----------------&amp;gt;] ret_from_fork+0x58/0x90
 [&amp;lt;----------------&amp;gt;] ? insert_kthread_work+0x40/0x40

lfsck_layout    S 000000000000f908     0  3013      2 0x00000080
 [&amp;lt;----------------&amp;gt;] schedule+0x29/0x70
 [&amp;lt;----------------&amp;gt;] schedule_timeout+0x174/0x2c0
 [&amp;lt;----------------&amp;gt;] ? internal_add_timer+0x70/0x70
 [&amp;lt;----------------&amp;gt;] ? ptlrpc_init_rq_pool+0x110/0x110 [ptlrpc]
 [&amp;lt;----------------&amp;gt;] ptlrpc_set_wait+0x4c0/0x910 [ptlrpc]
 [&amp;lt;----------------&amp;gt;] ? wake_up_state+0x20/0x20
 [&amp;lt;----------------&amp;gt;] lfsck_assistant_notify_others+0x110f/0x1390 [lfsck]
 [&amp;lt;----------------&amp;gt;] ? libcfs_debug_msg+0x57/0x80 [libcfs]
 [&amp;lt;----------------&amp;gt;] ? lfsck_master_engine+0x12f0/0x12f0 [lfsck]
 [&amp;lt;----------------&amp;gt;] lfsck_assistant_engine+0x140/0x20c0 [lfsck]
 [&amp;lt;----------------&amp;gt;] ? check_preempt_curr+0x78/0xa0
 [&amp;lt;----------------&amp;gt;] ? dequeue_task_fair+0x41e/0x660
 [&amp;lt;----------------&amp;gt;] ? __switch_to+0xd7/0x510
 [&amp;lt;----------------&amp;gt;] ? __schedule+0x39d/0x8b0
 [&amp;lt;----------------&amp;gt;] ? lfsck_master_engine+0x12f0/0x12f0 [lfsck]
 [&amp;lt;----------------&amp;gt;] kthread+0xcf/0xe0
 [&amp;lt;----------------&amp;gt;] ? insert_kthread_work+0x40/0x40
 [&amp;lt;----------------&amp;gt;] ret_from_fork+0x58/0x90
 [&amp;lt;----------------&amp;gt;] ? insert_kthread_work+0x40/0x40

tgt_recover_2   S ffff8808067d9fa0     0  2977      2 0x00000080
 [&amp;lt;----------------&amp;gt;] schedule+0x29/0x70
 [&amp;lt;----------------&amp;gt;] lfsck_start+0xf25/0x17c0 [lfsck]
 [&amp;lt;----------------&amp;gt;] ? llog_key_init+0x20/0xd0 [obdclass]
 [&amp;lt;----------------&amp;gt;] ? wake_up_state+0x20/0x20
 [&amp;lt;----------------&amp;gt;] mdd_iocontrol+0x3d3/0x16a0 [mdd]
 [&amp;lt;----------------&amp;gt;] ? osp_key_init+0x3b/0xd0 [osp]
 [&amp;lt;----------------&amp;gt;] mdt_postrecov+0x148/0x1d0 [mdt]
 [&amp;lt;----------------&amp;gt;] mdt_obd_postrecov+0x8d/0xb0 [mdt]
 [&amp;lt;----------------&amp;gt;] target_recovery_thread+0xaf2/0x1360 [ptlrpc]
 [&amp;lt;----------------&amp;gt;] ? __schedule+0x39d/0x8b0
 [&amp;lt;----------------&amp;gt;] ? replay_request_or_update.isra.21+0x8c0/0x8c0 [ptlrpc]
 [&amp;lt;----------------&amp;gt;] kthread+0xcf/0xe0
 [&amp;lt;----------------&amp;gt;] ? insert_kthread_work+0x40/0x40
 [&amp;lt;----------------&amp;gt;] ret_from_fork+0x58/0x90
 [&amp;lt;----------------&amp;gt;] ? insert_kthread_work+0x40/0x40

umount          D ffff8803c81f4008     0  3039   3037 0x00000080
 [&amp;lt;----------------&amp;gt;] schedule_preempt_disabled+0x29/0x70
 [&amp;lt;----------------&amp;gt;] __mutex_lock_slowpath+0xc7/0x1d0
 [&amp;lt;----------------&amp;gt;] mutex_lock+0x1f/0x2f
 [&amp;lt;----------------&amp;gt;] lfsck_stop+0x167/0x4e0 [lfsck]
 [&amp;lt;----------------&amp;gt;] ? default_wake_function+0x12/0x20
 [&amp;lt;----------------&amp;gt;] ? __kmalloc+0x1e3/0x230
 [&amp;lt;----------------&amp;gt;] mdd_iocontrol+0x96/0x16a0 [mdd]
 [&amp;lt;----------------&amp;gt;] ? lprocfs_counter_add+0xf9/0x160 [obdclass]
 [&amp;lt;----------------&amp;gt;] mdt_device_fini+0x71/0x920 [mdt]
 [&amp;lt;----------------&amp;gt;] class_cleanup+0x971/0xcd0 [obdclass]
 [&amp;lt;----------------&amp;gt;] class_process_config+0x19cd/0x23b0 [obdclass]
 [&amp;lt;----------------&amp;gt;] ? libcfs_debug_msg+0x57/0x80 [libcfs]
 [&amp;lt;----------------&amp;gt;] class_manual_cleanup+0x1c6/0x710 [obdclass]
 [&amp;lt;----------------&amp;gt;] server_put_super+0x8de/0xcd0 [obdclass]
 [&amp;lt;----------------&amp;gt;] generic_shutdown_super+0x72/0x100
 [&amp;lt;----------------&amp;gt;] kill_anon_super+0x12/0x20
 [&amp;lt;----------------&amp;gt;] lustre_kill_super+0x32/0x50 [obdclass]
 [&amp;lt;----------------&amp;gt;] deactivate_locked_super+0x49/0x60
 [&amp;lt;----------------&amp;gt;] deactivate_super+0x46/0x60
 [&amp;lt;----------------&amp;gt;] cleanup_mnt+0x3f/0x80
 [&amp;lt;----------------&amp;gt;] __cleanup_mnt+0x12/0x20
 [&amp;lt;----------------&amp;gt;] task_work_run+0xc5/0xf0
 [&amp;lt;----------------&amp;gt;] do_notify_resume+0x92/0xb0
 [&amp;lt;----------------&amp;gt;] int_signal+0x12/0x17
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="215264" author="jhammond" created="Mon, 4 Dec 2017 21:36:11 +0000"  >&lt;p&gt;Fan Yong, does this look familiar?&lt;/p&gt;</comment>
                            <comment id="215312" author="yong.fan" created="Tue, 5 Dec 2017 15:30:26 +0000"  >&lt;p&gt;According to current implementation (master), the LFSCK notify RPC is interruptible, so would you please to  try &quot;kill -9 3013&quot; to check whether the layout LFSCK assistant engine can be interrupted? If yes, then the lfsck_start can go ahead (exit with failure), and then the lfsck_stop will not be blocked.&lt;/p&gt;</comment>
                            <comment id="215395" author="cliffw" created="Tue, 5 Dec 2017 23:17:47 +0000"  >&lt;p&gt;Okay, so you want lfsck_layout process killed if this happens again?&lt;/p&gt;</comment>
                            <comment id="215409" author="yong.fan" created="Wed, 6 Dec 2017 01:46:17 +0000"  >&lt;p&gt;Yes, please kill the blocked (on recovery) LFSCK assistant thread, such as lfsck_layout or lfsck_namespace. It is expected to work, otherwise, we need to find out the solution.&lt;/p&gt;</comment>
                            <comment id="215445" author="cliffw" created="Wed, 6 Dec 2017 16:12:19 +0000"  >&lt;p&gt;It does not work.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[root@soak-8 ~]# ps -ef |grep lfsck
root       6695   6693  0 04:35 ?        00:00:00 bash -c  ??lctl lfsck_start -M soaked-MDT0000 -s 1000 -t all -A ??if [[ $? != 0 ]]; then ???lctl lfsck_start -M soaked-MDT0000 -s 1000 -t namespace,layout -A ???if [[ $? != 0 ]]; then ????lctl lfsck_start -M soaked-MDT0000 -s 1000 -t namespace -A ???fi ??fi
root       6713   6695  0 04:35 ?        00:00:00 lctl lfsck_start -M soaked-MDT0000 -s 1000 -t all -A
root       6714      2  0 04:35 ?        00:00:00 [lfsck]
root       6716      2  0 04:35 ?        00:00:00 [lfsck_layout]
root      53709  53676  0 16:09 pts/0    00:00:00 lctl lfsck_stop -M soaked-MDT0000
root      53748  53717  0 16:10 pts/1    00:00:00 grep --color=auto lfsck
[root@soak-8 ~]# kill -9 6716
[root@soak-8 ~]# kill -9 6716
[root@soak-8 ~]# ps -ef |grep lfsck
root       6695   6693  0 04:35 ?        00:00:00 bash -c  ??lctl lfsck_start -M soaked-MDT0000 -s 1000 -t all -A ??if [[ $? != 0 ]]; then ???lctl lfsck_start -M soaked-MDT0000 -s 1000 -t namespace,layout -A ???if [[ $? != 0 ]]; then ????lctl lfsck_start -M soaked-MDT0000 -s 1000 -t namespace -A ???fi ??fi
root       6713   6695  0 04:35 ?        00:00:00 lctl lfsck_start -M soaked-MDT0000 -s 1000 -t all -A
root       6714      2  0 04:35 ?        00:00:00 [lfsck]
root       6716      2  0 04:35 ?        00:00:00 [lfsck_layout]
root      53709  53676  0 16:09 pts/0    00:00:00 lctl lfsck_stop -M soaked-MDT0000
root      53754  53717  0 16:11 pts/1    00:00:00 grep --color=auto lfsck
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The process is unkillable and soak-8 is wedged. I can leave it like this if you want to log in.&lt;/p&gt;</comment>
                            <comment id="215448" author="yong.fan" created="Wed, 6 Dec 2017 16:50:57 +0000"  >&lt;p&gt;Just login soak-8, but seems reboot.&lt;br/&gt;
This situation is more complex than &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6684&quot; title=&quot;lctl lfsck_stop hangs&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6684&quot;&gt;&lt;del&gt;LU-6684&lt;/del&gt;&lt;/a&gt;. According to current (master) implementation, when lfsck_stop, it will send SIGINT signal to the LFSCK engines, then even if the LFSCK engines are blocked by some RPCs, it still can be waken up and exit. But in this case, the lfsck_stop was blocked on the mutex, then it has no chance to send the SIGINT single.&lt;/p&gt;</comment>
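
&lt;p&gt;The ordering can be shown with a small userspace analogue (illustration only, not the actual &lt;tt&gt;lfsck_stop()&lt;/tt&gt; source): the stop path takes the mutex before it sends the signal, so an engine that holds the mutex and needs the signal in order to make progress blocks the stop forever:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;/* illustration only; not the actual lfsck_stop() source */
#include &amp;lt;pthread.h&amp;gt;
#include &amp;lt;signal.h&amp;gt;
#include &amp;lt;unistd.h&amp;gt;

static pthread_mutex_t li_mutex = PTHREAD_MUTEX_INITIALIZER;

static void handler(int sig) { (void)sig; }  /* lets pause() return */

static void *engine(void *arg)
{
        pthread_mutex_lock(&amp;amp;li_mutex);  /* engine holds li_mutex ... */
        pause();                      /* ... and only a signal wakes it */
        pthread_mutex_unlock(&amp;amp;li_mutex);
        return NULL;
}

int main(void)
{
        pthread_t t;

        signal(SIGINT, handler);
        pthread_create(&amp;amp;t, NULL, engine, NULL);
        sleep(1);
        /* lfsck_stop-like ordering: take the mutex first, signal second */
        pthread_mutex_lock(&amp;amp;li_mutex);  /* hangs: engine holds it, so ... */
        pthread_kill(t, SIGINT);      /* ... the wakeup is never sent */
        pthread_mutex_unlock(&amp;amp;li_mutex);
        pthread_join(t, NULL);
        return 0;
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>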
                            <comment id="215449" author="cliffw" created="Wed, 6 Dec 2017 16:51:58 +0000"  >&lt;p&gt;Sorry, I&apos;ll let you know when the next hang happens. I should have dumped stacks before the reboot.&lt;/p&gt;</comment>
                            <comment id="215505" author="cliffw" created="Wed, 6 Dec 2017 23:00:16 +0000"  >&lt;p&gt;I think i have re-produced the hang, I&apos;ll leave it like this until you have a look. &lt;br/&gt;
Sigh, it finally exited. Trying again.&lt;/p&gt;</comment>
                            <comment id="215523" author="gerrit" created="Thu, 7 Dec 2017 07:58:09 +0000"  >&lt;p&gt;Fan Yong (fan.yong@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/30420&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/30420&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10321&quot; title=&quot;MDS - umount hangs during failback &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10321&quot;&gt;&lt;del&gt;LU-10321&lt;/del&gt;&lt;/a&gt; lfsck: allow to stop the in-starting lfsck&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 7817928a62c8a212f5c2b040587042c193bd5c1a&lt;/p&gt;</comment>
                            <comment id="215525" author="gerrit" created="Thu, 7 Dec 2017 08:06:20 +0000"  >&lt;p&gt;Fan Yong (fan.yong@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/30422&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/30422&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10321&quot; title=&quot;MDS - umount hangs during failback &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10321&quot;&gt;&lt;del&gt;LU-10321&lt;/del&gt;&lt;/a&gt; lfsck: allow to stop the in-starting lfsck&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_10&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 162e2f5c04b32904053ac31795450eb9da521c00&lt;/p&gt;</comment>
                            <comment id="215594" author="adilger" created="Thu, 7 Dec 2017 19:13:39 +0000"  >&lt;p&gt;I think there is already an option &lt;tt&gt;no scrub&lt;/tt&gt; to prevent Oi scrub from starting. &lt;/p&gt;

&lt;p&gt;However, it would also be useful if it was possible to force LFSCK to abort the running scan at startup, in case there is a problem like this at a customer. Is &#8220;&lt;tt&gt;no scrub&lt;/tt&gt;&#8221; along with &#8220;&lt;tt&gt;lctl lfsck_stop &#8212;reset&lt;/tt&gt;&#8221; enough to kill the running LFSCK, and erase the saved state?&lt;/p&gt;</comment>
                            <comment id="215633" author="yong.fan" created="Fri, 8 Dec 2017 01:28:59 +0000"  >&lt;p&gt;For this case, it is NOT the LFSCK auto-resumed from the last check-point, instead, it was started manually (or by scripts) after MDT failover. So no related with mount option.&lt;/p&gt;</comment>
                            <comment id="215665" author="cliffw" created="Fri, 8 Dec 2017 15:54:37 +0000"  >&lt;p&gt;Hit the problem again  last night. System completed a failback, MDTs are all mounted in the proper place, lfsck is wedged. Will leave the system if you wish to have a look. &lt;/p&gt;</comment>
                            <comment id="215834" author="cliffw" created="Fri, 8 Dec 2017 20:24:57 +0000"  >&lt;p&gt;Dumped lustre logs from all MDS, output in /scratch/results/soak/soak-X-hung.lfsck.Dec08.txt&lt;br/&gt;
Dumped stacks on all MDS, output in console logs. Crash-dumped all MDS, results on Spirit. &lt;br/&gt;
Restarting. &lt;/p&gt;</comment>
                            <comment id="215950" author="cliffw" created="Mon, 11 Dec 2017 17:37:47 +0000"  >&lt;p&gt;Had a simular hang on soak-3 and soak-6 (OSS) during umount. Dumped lustre-logs, attached. Also dumped stack traces and crash dumped both nodes, output available on spirit. &lt;/p&gt;</comment>
                            <comment id="216041" author="yong.fan" created="Tue, 12 Dec 2017 12:16:26 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=cliffw&quot; class=&quot;user-hover&quot; rel=&quot;cliffw&quot;&gt;cliffw&lt;/a&gt;, as you said on Skype, there are multiple issues during current Spirit tests. The original hung happened on the MDT because of the blocked uninterrupted LFSCK. The patch 30420 is used for resolving such trouble. But it does not means the Spirit will not hung after applying the patch, because some other issues may also block the system. For the new hung, it happened on the OST side, different from the original LFSCK hung. So would you please to check whether the original LFSCK hung issue resolved or not? Thanks!&lt;/p&gt;</comment>
                            <comment id="216085" author="cliffw" created="Tue, 12 Dec 2017 18:42:06 +0000"  >&lt;p&gt;We have not seen lfsck hangs with MDT failover after the patch. The two hangs occurred with OST failover. After the most recent hang, i was able to reboot/remount the system and then I could start and stop lfsck without the hang. &lt;/p&gt;</comment>
                            <comment id="216136" author="yong.fan" created="Wed, 13 Dec 2017 05:41:48 +0000"  >&lt;p&gt;According to the logs, three were new LFSCK started just after lfsck_stop during the MDT umount. Then nobody will stop the new triggered LFSCK as to the MDT cannot umount. I will make patch to resolve the race condition.&lt;/p&gt;</comment>
                            <comment id="216137" author="gerrit" created="Wed, 13 Dec 2017 05:42:07 +0000"  >&lt;p&gt;Fan Yong (fan.yong@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/30513&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/30513&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10321&quot; title=&quot;MDS - umount hangs during failback &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10321&quot;&gt;&lt;del&gt;LU-10321&lt;/del&gt;&lt;/a&gt; lfsck: not start lfsck during umount&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 6263064b369dd4fddbb0dfa9ab49013a0d791629&lt;/p&gt;</comment>
                            <comment id="216138" author="gerrit" created="Wed, 13 Dec 2017 05:52:27 +0000"  >&lt;p&gt;Fan Yong (fan.yong@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/30514&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/30514&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10321&quot; title=&quot;MDS - umount hangs during failback &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10321&quot;&gt;&lt;del&gt;LU-10321&lt;/del&gt;&lt;/a&gt; lfsck: not start lfsck during umount&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_10&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 2022d417ddaf663dc7addb5389acade0390996e5&lt;/p&gt;</comment>
                            <comment id="216964" author="cliffw" created="Thu, 21 Dec 2017 17:34:15 +0000"  >&lt;p&gt;We just hit this again on master, so we need a version of the patch for master. &lt;/p&gt;</comment>
                            <comment id="216967" author="cliffw" created="Thu, 21 Dec 2017 17:58:24 +0000"  >&lt;p&gt;Okay, we have an IB build so we will test this.&lt;/p&gt;</comment>
                            <comment id="217186" author="cliffw" created="Tue, 26 Dec 2017 16:23:00 +0000"  >&lt;p&gt;We are still hitting this issue, with the patch. Soak-10 just hit it, crash dump vmcore-dmesg and lustre-log attached&lt;/p&gt;</comment>
                            <comment id="217425" author="gerrit" created="Thu, 4 Jan 2018 02:48:08 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/30420/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/30420/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10321&quot; title=&quot;MDS - umount hangs during failback &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10321&quot;&gt;&lt;del&gt;LU-10321&lt;/del&gt;&lt;/a&gt; lfsck: allow to stop the in-starting lfsck&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 9c9a05fee6c0fce557dfa578ff7116b905d4e00a&lt;/p&gt;</comment>
                            <comment id="217426" author="gerrit" created="Thu, 4 Jan 2018 02:48:14 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/30513/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/30513/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10321&quot; title=&quot;MDS - umount hangs during failback &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10321&quot;&gt;&lt;del&gt;LU-10321&lt;/del&gt;&lt;/a&gt; lfsck: not start lfsck during umount&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 0cabe06be1fcc6c3ff690889b1e86bbe4690b854&lt;/p&gt;</comment>
                            <comment id="217494" author="yong.fan" created="Thu, 4 Jan 2018 16:36:21 +0000"  >&lt;p&gt;Patches have been landed to master, other LFSCK start trouble will be handled via &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10419&quot; title=&quot;LFSCK fails to start, hangs systems. &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10419&quot;&gt;&lt;del&gt;LU-10419&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="219880" author="gerrit" created="Fri, 2 Feb 2018 18:11:44 +0000"  >&lt;p&gt;John L. Hammond (john.hammond@intel.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/30422/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/30422/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10321&quot; title=&quot;MDS - umount hangs during failback &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10321&quot;&gt;&lt;del&gt;LU-10321&lt;/del&gt;&lt;/a&gt; lfsck: allow to stop the in-starting lfsck&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_10&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 147b2b4177f9f8d3c4407cd74bb1b97a5c5d7f53&lt;/p&gt;</comment>
                            <comment id="219881" author="gerrit" created="Fri, 2 Feb 2018 18:12:01 +0000"  >&lt;p&gt;John L. Hammond (john.hammond@intel.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/30514/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/30514/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10321&quot; title=&quot;MDS - umount hangs during failback &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10321&quot;&gt;&lt;del&gt;LU-10321&lt;/del&gt;&lt;/a&gt; lfsck: not start lfsck during umount&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_10&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: e82d8ad284fd37bbc4863723aa42c927d456fae6&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="30488">LU-6684</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="29058" name="soak-10.umount.hang.txt.gz" size="9540747" author="cliffw" created="Tue, 26 Dec 2017 16:23:20 +0000"/>
                            <attachment id="28830" name="soak-11.stacks.txt.gz" size="171251" author="cliffw" created="Mon, 4 Dec 2017 17:52:31 +0000"/>
                            <attachment id="28829" name="soak-11.umount.hang.txt" size="93323612" author="cliffw" created="Mon, 4 Dec 2017 17:54:02 +0000"/>
                            <attachment id="28902" name="soak-3.umount.hang.txt.gz" size="7162805" author="cliffw" created="Mon, 11 Dec 2017 17:38:07 +0000"/>
                            <attachment id="28903" name="soak-6.umount.hang.txt.gz" size="230490" author="cliffw" created="Mon, 11 Dec 2017 17:38:01 +0000"/>
                            <attachment id="29059" name="vmcore-dmesg.txt" size="1045638" author="cliffw" created="Tue, 26 Dec 2017 16:23:13 +0000"/>
                            <attachment id="28828" name="vmcore-dmesg.txt" size="1045495" author="cliffw" created="Mon, 4 Dec 2017 17:52:32 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzzonr:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>