<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:02:42 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-13608] MDT stuck in WAITING, abort_recov stuck too</title>
                <link>https://jira.whamcloud.com/browse/LU-13608</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;A network switch went down, taking a number of compute nodes with it. &lt;br/&gt;
A bunch of compute nodes got stuck, which also caused some OSSes to hit an&lt;br/&gt;
LBUG (&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12906&quot; title=&quot;LBUG ASSERTION( rspt-&amp;gt;rspt_cpt == cpt ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12906&quot;&gt;&lt;del&gt;LU-12906&lt;/del&gt;&lt;/a&gt;) and crash.&lt;/p&gt;

&lt;p&gt;The MDS went into soft lockups before crashing. When it came back, 3 out of 4 MDTs&lt;br/&gt;
mounted and recovered, but one MDT went into WAITING and stayed there.&lt;br/&gt;
lctl abort_recov had no effect on its status, so the MDS was rebooted.&lt;/p&gt;

&lt;p&gt;When the MDT again went into the WAITING state, a pre-emptive abort_recov was issued&lt;br/&gt;
before it could time out, but it did not help and the MDT continued to try to recover.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2020-05-08 18:52:15 [ 1640.880984] Pid: 15922, comm: mdt02_020 3.10.0-1062.4.1.el7_lustre.x86_64 #1 SMP Mon Oct 28 01:39:05 UTC 2019
2020-05-08 18:52:15 [ 1640.892505] Call Trace:
2020-05-08 18:52:15 [ 1640.895708]  [&amp;lt;ffffffffc169bdc0&amp;gt;] ptlrpc_set_wait+0x480/0x790 [ptlrpc]
2020-05-08 18:52:15 [ 1640.903498]  [&amp;lt;ffffffffc169c153&amp;gt;] ptlrpc_queue_wait+0x83/0x230 [ptlrpc]
2020-05-08 18:52:15 [ 1640.911383]  [&amp;lt;ffffffffc1a9eaf3&amp;gt;] osp_remote_sync+0xd3/0x200 [osp]
2020-05-08 18:52:15 [ 1640.918760]  [&amp;lt;ffffffffc1a84c63&amp;gt;] osp_attr_get+0x463/0x730 [osp]
2020-05-08 18:52:15 [ 1640.925917]  [&amp;lt;ffffffffc1a818cd&amp;gt;] osp_object_init+0x16d/0x2d0 [osp]
2020-05-08 18:52:15 [ 1640.933361]  [&amp;lt;ffffffffc141c59b&amp;gt;] lu_object_start.isra.35+0x8b/0x120 [obdclass]
2020-05-08 18:52:15 [ 1640.941977]  [&amp;lt;ffffffffc1420471&amp;gt;] lu_object_find_at+0x1e1/0xa60 [obdclass]
2020-05-08 18:52:15 [ 1640.950100]  [&amp;lt;ffffffffc1420d06&amp;gt;] lu_object_find+0x16/0x20 [obdclass]
2020-05-08 18:52:15 [ 1640.957737]  [&amp;lt;ffffffffc194a01b&amp;gt;] mdt_object_find+0x4b/0x170 [mdt]
2020-05-08 18:52:15 [ 1640.965074]  [&amp;lt;ffffffffc194cc38&amp;gt;] mdt_getattr_name_lock+0x848/0x1c30 [mdt]
2020-05-08 18:52:15 [ 1640.973194]  [&amp;lt;ffffffffc1954d25&amp;gt;] mdt_intent_getattr+0x2b5/0x480 [mdt]
2020-05-08 18:52:15 [ 1640.980928]  [&amp;lt;ffffffffc1951bb5&amp;gt;] mdt_intent_policy+0x435/0xd80 [mdt]
2020-05-08 18:52:15 [ 1640.988552]  [&amp;lt;ffffffffc1659d56&amp;gt;] ldlm_lock_enqueue+0x356/0xa20 [ptlrpc]
2020-05-08 18:52:15 [ 1640.996483]  [&amp;lt;ffffffffc1682366&amp;gt;] ldlm_handle_enqueue0+0xa56/0x15f0 [ptlrpc]
2020-05-08 18:52:15 [ 1641.004811]  [&amp;lt;ffffffffc170ab02&amp;gt;] tgt_enqueue+0x62/0x210 [ptlrpc]
2020-05-08 18:52:15 [ 1641.012076]  [&amp;lt;ffffffffc17112ea&amp;gt;] tgt_request_handle+0xaea/0x1580 [ptlrpc]
2020-05-08 18:52:15 [ 1641.020213]  [&amp;lt;ffffffffc16b629b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
2020-05-08 18:52:15 [ 1641.029217]  [&amp;lt;ffffffffc16b9bfc&amp;gt;] ptlrpc_main+0xb2c/0x1460 [ptlrpc]

2020-05-08 18:52:15 [ 1641.056176] Pid: 14912, comm: mdt03_006 3.10.0-1062.4.1.el7_lustre.x86_64 #1 SMP Mon Oct 28 01:39:05 UTC 2019
2020-05-08 18:52:15 [ 1641.067667] Call Trace:
2020-05-08 18:52:15 [ 1641.070833]  [&amp;lt;ffffffffc1672b96&amp;gt;] ldlm_completion_ast+0x4e6/0x860 [ptlrpc]
2020-05-08 18:52:15 [ 1641.078951]  [&amp;lt;ffffffffc167492f&amp;gt;] ldlm_cli_enqueue_fini+0x96f/0xdf0 [ptlrpc]
2020-05-08 18:52:15 [ 1641.087272]  [&amp;lt;ffffffffc167751e&amp;gt;] ldlm_cli_enqueue+0x40e/0x920 [ptlrpc]
2020-05-08 18:52:15 [ 1641.095108]  [&amp;lt;ffffffffc1a997f2&amp;gt;] osp_md_object_lock+0x162/0x2d0 [osp]
2020-05-08 18:52:15 [ 1641.102832]  [&amp;lt;ffffffffc10cb193&amp;gt;] lod_object_lock+0xf3/0x7b0 [lod]
2020-05-08 18:52:15 [ 1641.110179]  [&amp;lt;ffffffffc1a2eeee&amp;gt;] mdd_object_lock+0x3e/0xe0 [mdd]
2020-05-08 18:52:15 [ 1641.117429]  [&amp;lt;ffffffffc194a341&amp;gt;] mdt_remote_object_lock_try+0x1e1/0x750 [mdt]
2020-05-08 18:52:15 [ 1641.125926]  [&amp;lt;ffffffffc194a8da&amp;gt;] mdt_remote_object_lock+0x2a/0x30 [mdt]
2020-05-08 18:52:15 [ 1641.133847]  [&amp;lt;ffffffffc195f2ae&amp;gt;] mdt_rename_lock+0xbe/0x4b0 [mdt]
2020-05-08 18:52:15 [ 1641.141189]  [&amp;lt;ffffffffc1961605&amp;gt;] mdt_reint_rename+0x2c5/0x2b90 [mdt]
2020-05-08 18:52:15 [ 1641.148819]  [&amp;lt;ffffffffc196a693&amp;gt;] mdt_reint_rec+0x83/0x210 [mdt]
2020-05-08 18:52:15 [ 1641.155965]  [&amp;lt;ffffffffc19471b3&amp;gt;] mdt_reint_internal+0x6e3/0xaf0 [mdt]
2020-05-08 18:52:15 [ 1641.163689]  [&amp;lt;ffffffffc1952567&amp;gt;] mdt_reint+0x67/0x140 [mdt]
2020-05-08 18:52:15 [ 1641.170469]  [&amp;lt;ffffffffc17112ea&amp;gt;] tgt_request_handle+0xaea/0x1580 [ptlrpc]
2020-05-08 18:52:15 [ 1641.178601]  [&amp;lt;ffffffffc16b629b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
2020-05-08 18:52:15 [ 1641.187639]  [&amp;lt;ffffffffc16b9bfc&amp;gt;] ptlrpc_main+0xb2c/0x1460 [ptlrpc]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment>Servers: &lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;Centos 7.7 &lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;Kernel 3.10.0-1062.4.1.el7_lustre.x86_64 &lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;Lustre 2.12.3 &lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;OFED-internal-4.7-1.0.0</environment>
        <key id="59362">LU-13608</key>
            <summary>MDT stuck in WAITING, abort_recov stuck too</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="hongchao.zhang">Hongchao Zhang</assignee>
                                    <reporter username="hongchao.zhang">Hongchao Zhang</reporter>
                        <labels>
                    </labels>
                <created>Thu, 28 May 2020 12:00:13 +0000</created>
                <updated>Tue, 10 Jan 2023 19:17:45 +0000</updated>
                            <resolved>Fri, 23 Oct 2020 04:33:17 +0000</resolved>
                                    <version>Lustre 2.12.3</version>
                                    <fixVersion>Lustre 2.14.0</fixVersion>
                    <fixVersion>Lustre 2.12.6</fixVersion>
                    <fixVersion>Lustre 2.12.7</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>8</watches>
                                                                            <comments>
                            <comment id="271395" author="bzzz" created="Thu, 28 May 2020 12:02:53 +0000"  >&lt;p&gt;what version did you use? any logs?&lt;/p&gt;</comment>
                            <comment id="271402" author="gerrit" created="Thu, 28 May 2020 12:30:50 +0000"  >&lt;p&gt;Hongchao Zhang (hongchao@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/38746&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/38746&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13608&quot; title=&quot;MDT stuck in WAITING, abort_recov stuck too&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13608&quot;&gt;&lt;del&gt;LU-13608&lt;/del&gt;&lt;/a&gt; tgt: abort recovery while reading update llog&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 8c4b7bf0bf5321a937d39ee1f5e7e907b2c20903&lt;/p&gt;</comment>
                            <comment id="274250" author="aboyko" created="Thu, 2 Jul 2020 07:19:30 +0000"  >&lt;p&gt;I&apos;ve seen a similar MDS hang, and I want to add my thoughts&lt;br/&gt;
In my case, MDT02 recovery did not finish because lod_sub_recovery_thread() tried to read update log files which did not exist.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;PID: 113444  TASK: ffff8ed602b4a140  CPU: 0   COMMAND: &quot;lod0002_rec0000&quot;
 #0 [ffff8ed5197635c8] __schedule at ffffffff9416d0bc
 #1 [ffff8ed519763658] schedule at ffffffff9416d6e9
 #2 [ffff8ed519763668] schedule_timeout at ffffffff9416b01e
 #3 [ffff8ed519763710] ptlrpc_set_wait at ffffffffc10d42f8 [ptlrpc]
 #4 [ffff8ed5197637b8] ptlrpc_queue_wait at ffffffffc10d4653 [ptlrpc]
 #5 [ffff8ed5197637d8] osp_remote_sync at ffffffffc168a783 [osp]
 #6 [ffff8ed519763828] osp_attr_get at ffffffffc16704d7 [osp]
 #7 [ffff8ed5197638a8] osp_object_init at ffffffffc166d00d [osp]
 #8 [ffff8ed5197638e0] lu_object_start at ffffffffc0dff50b [obdclass]
 #9 [ffff8ed519763938] lu_object_find_at at ffffffffc0e033f1 [obdclass]
#10 [ffff8ed519763a00] dt_locate_at at ffffffffc0e049bd [obdclass]
#11 [ffff8ed519763a20] llog_osd_open at ffffffffc0dc517f [obdclass]
#12 [ffff8ed519763a98] llog_open at ffffffffc0db1f8a [obdclass]
#13 [ffff8ed519763ae0] llog_cat_id2handle at ffffffffc0dbad45 [obdclass]
#14 [ffff8ed519763b50] llog_cat_process_common at ffffffffc0dbb1b9 [obdclass]
#15 [ffff8ed519763ba8] llog_cat_process_cb at ffffffffc0dbc191 [obdclass]
#16 [ffff8ed519763bf8] llog_process_thread at ffffffffc0db586f [obdclass]
#17 [ffff8ed519763d08] llog_process_or_fork at ffffffffc0db6b3c [obdclass]
#18 [ffff8ed519763d70] llog_cat_process_or_fork at ffffffffc0db89e9 [obdclass]
#19 [ffff8ed519763de8] llog_cat_process at ffffffffc0db8b9e [obdclass]
#20 [ffff8ed519763e08] lod_sub_recovery_thread at ffffffffc142b526 [lod]
#21 [ffff8ed519763ea8] kthread at ffffffff93ac2016
#22 [ffff8ed519763f50] ret_from_fork_nospec_begin at ffffffff9417abdd
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;MDT02 sent a request to MDT0; at MDT0, osd_fid_lookup ran and triggered an OI scrub because the file did not exist. MDT0 replied with EINPROGRESS, and MDT02 resent the request. Then osd_fid_lookup triggered the scrub again, and so on.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00080000:02000400:11.0:1593417562.046666:0:224060:0:(osd_handler.c:1221:osd_fid_lookup()) fs1-MDT0000: trigger OI scrub by RPC for the [0x200032c9a:0x1:0x0] with flags 0x4a, rc = 0
00100000:10000000:17.0:1593417562.411181:0:2919:0:(osd_scrub.c:536:osd_scrub_post()) fs1-MDT0000: OI scrub post with result = 1
00100000:10000000:17.0:1593417562.411207:0:2919:0:(scrub.c:239:scrub_file_store()) fs1-MDT0000: store scrub file: rc = 0
00100000:10000000:17.0:1593417562.411208:0:2919:0:(osd_scrub.c:1322:osd_scrub_main()) fs1-MDT0000: OI scrub: stop, pos = 3041940257: rc = 1
00100000:10000000:17.0:1593417570.677557:0:2950:0:(osd_scrub.c:459:osd_scrub_prep()) fs1-MDT0000: OI scrub prep, flags = 0x4e
00100000:10000000:17.0:1593417570.677574:0:2950:0:(scrub.c:132:scrub_file_reset()) fs1-MDT0000: reset OI scrub file, old flags = 0x0, add flags = 0x0
00100000:10000000:17.0:1593417570.677600:0:2950:0:(scrub.c:239:scrub_file_store()) fs1-MDT0000: store scrub file: rc = 0
00100000:10000000:17.0:1593417570.677603:0:2950:0:(osd_scrub.c:1307:osd_scrub_main()) fs1-MDT0000: OI scrub start, flags = 0x4e, pos = 12
00080000:02000400:7.0:1593417570.677625:0:225112:0:(osd_handler.c:1221:osd_fid_lookup()) fs1-MDT0000: trigger OI scrub by RPC for the [0x200032c9b:0x2:0x0] with flags 0x4a, rc = 0
00100000:10000000:17.0:1593417571.031439:0:2950:0:(osd_scrub.c:536:osd_scrub_post()) fs1-MDT0000: OI scrub post with result = 1
00100000:10000000:17.0:1593417571.031464:0:2950:0:(scrub.c:239:scrub_file_store()) fs1-MDT0000: store scrub file: rc = 0
00100000:10000000:17.0:1593417571.031465:0:2950:0:(osd_scrub.c:1322:osd_scrub_main()) fs1-MDT0000: OI scrub: stop, pos = 3041940257: rc = 1
00100000:10000000:17.0:1593417573.046732:0:2951:0:(osd_scrub.c:459:osd_scrub_prep()) fs1-MDT0000: OI scrub prep, flags = 0x4e
00100000:10000000:17.0:1593417573.046735:0:2951:0:(scrub.c:132:scrub_file_reset()) fs1-MDT0000: reset OI scrub file, old flags = 0x0, add flags = 0x0
00100000:10000000:17.0:1593417573.046744:0:2951:0:(scrub.c:239:scrub_file_store()) fs1-MDT0000: store scrub file: rc = 0
00100000:10000000:17.0:1593417573.046747:0:2951:0:(osd_scrub.c:1307:osd_scrub_main()) fs1-MDT0000: OI scrub start, flags = 0x4e, pos = 12
00080000:02000400:8.0:1593417573.046823:0:224060:0:(osd_handler.c:1221:osd_fid_lookup()) fs1-MDT0000: trigger OI scrub by RPC for the [0x200032c9a:0x1:0x0] with flags 0x4a, rc = 0
00100000:10000000:17.0:1593417573.403695:0:2951:0:(osd_scrub.c:536:osd_scrub_post()) fs1-MDT0000: OI scrub post with result = 1
00100000:10000000:17.0:1593417573.403720:0:2951:0:(scrub.c:239:scrub_file_store()) fs1-MDT0000: store scrub file: rc = 0
00100000:10000000:17.0:1593417573.403721:0:2951:0:(osd_scrub.c:1322:osd_scrub_main()) fs1-MDT0000: OI scrub: stop, pos = 3041940257: rc = 1
00100000:10000000:17.0:1593417581.677557:0:3081:0:(osd_scrub.c:459:osd_scrub_prep()) fs1-MDT0000: OI scrub prep, flags = 0x4e
00100000:10000000:17.0:1593417581.677574:0:3081:0:(scrub.c:132:scrub_file_reset()) fs1-MDT0000: reset OI scrub file, old flags = 0x0, add flags = 0x0
00100000:10000000:17.0:1593417581.677600:0:3081:0:(scrub.c:239:scrub_file_store()) fs1-MDT0000: store scrub file: rc = 0
00100000:10000000:17.0:1593417581.677603:0:3081:0:(osd_scrub.c:1307:osd_scrub_main()) fs1-MDT0000: OI scrub start, flags = 0x4e, pos = 12
00080000:02000400:7.0:1593417581.677628:0:225112:0:(osd_handler.c:1221:osd_fid_lookup()) fs1-MDT0000: trigger OI scrub by RPC for the [0x200032c9b:0x2:0x0] with flags 0x4a, rc = 0
00100000:10000000:17.0:1593417582.027254:0:3081:0:(osd_scrub.c:536:osd_scrub_post()) fs1-MDT0000: OI scrub post with result = 1
00100000:10000000:17.0:1593417582.027278:0:3081:0:(scrub.c:239:scrub_file_store()) fs1-MDT0000: store scrub file: rc = 0
00100000:10000000:17.0:1593417582.027279:0:3081:0:(osd_scrub.c:1322:osd_scrub_main()) fs1-MDT0000: OI scrub: stop, pos = 3041940257: rc = 1
00100000:10000000:17.0:1593417584.046608:0:3083:0:(osd_scrub.c:459:osd_scrub_prep()) fs1-MDT0000: OI scrub prep, flags = 0x4e
00100000:10000000:17.0:1593417584.046611:0:3083:0:(scrub.c:132:scrub_file_reset()) fs1-MDT0000: reset OI scrub file, old flags = 0x0, add flags = 0x0
00100000:10000000:17.0:1593417584.046622:0:3083:0:(scrub.c:239:scrub_file_store()) fs1-MDT0000: store scrub file: rc = 0
00100000:10000000:17.0:1593417584.046624:0:3083:0:(osd_scrub.c:1307:osd_scrub_main()) fs1-MDT0000: OI scrub start, flags = 0x4e, pos = 12
00080000:02000400:11.0:1593417584.046686:0:224060:0:(osd_handler.c:1221:osd_fid_lookup()) fs1-MDT0000: trigger OI scrub by RPC for the [0x200032c9a:0x1:0x0] with flags 0x4a, rc = 0
 &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;To me it looks strange that OI scrub does not return an error after a full scan and a repeated FID.&lt;/p&gt;

&lt;p&gt;&#160;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13608&quot; title=&quot;MDT stuck in WAITING, abort_recov stuck too&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13608&quot;&gt;&lt;del&gt;LU-13608&lt;/del&gt;&lt;/a&gt;&#160;tgt: abort recovery while reading update llog would help the recovery process at the MDT, but I&apos;m not sure about lod_sub_recovery_thread().&#160; It&#160;looped at the ptlrpc layer, repeating the request on EINPROGRESS.&lt;/p&gt;</comment>
                            <comment id="274302" author="aboyko" created="Thu, 2 Jul 2020 16:44:22 +0000"  >&lt;p&gt;Here is the test to reproduce update log deletion. The fix doesn&apos;t help. I checked abort_recovery, etc.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh
index 414d95e..b2f02f0 100755
--- a/lustre/tests/sanity.sh
+++ b/lustre/tests/sanity.sh
@@ -24177,6 +24177,29 @@ test_902() {
 }
 run_test 902 &quot;test short write doesn&apos;t hang lustre&quot;

+test_903() {
+       [ $MDSCOUNT -lt 2 ] &amp;amp;&amp;amp; skip &quot;needs &amp;gt;= 2 MDTs&quot;
+
+       local timeout
+
+       timeout=$(do_facet mds2 &quot;$LCTL get_param -n mdt.$FSNAME-MDT0001.recovery_time_hard&quot;)
+       for idx in $(seq $MDSCOUNT); do
+               stop mds${idx}
+       done
+
+       do_facet mds1 &quot;mkdir -p /tmp/test_903 &amp;amp;&amp;amp; mount -t ldiskfs -o loop $(mdsdevname 1) /tmp/test_903 &amp;amp;&amp;amp;
+               rm -f /tmp/test_903/update_log_dir/* &amp;amp;&amp;amp; umount /tmp/test_903 &amp;amp;&amp;amp; rm -rf /tmp/test_903&quot;
+
+       for idx in $(seq $MDSCOUNT); do
+               start mds${idx} $(mdsdevname $idx) $MDS_MOUNT_OPTS ||
+                       error &quot;mount mds$idx failed&quot;
+       done
+
+       wait_recovery_complete mds2 $timeout
+
+}
+run_test 903 &quot;don&apos;t hang MDS recovery when failed to get update log&quot;
+
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Maybe&#160;rq_no_retry_einprogress = 1 ?&#160;@Alex Zhuravlev&#160;&#160;&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lctl get_param mdt.*.recovery_status
mdt.lustre-MDT0000.recovery_status=
status: COMPLETE
recovery_start: 1593460707
recovery_duration: 246400
completed_clients: 1/2
replayed_requests: 0
last_transno: 4294967310
VBR: DISABLED
IR: DISABLED
mdt.lustre-MDT0001.recovery_status=
status: WAITING
non-ready MDTs:  0000
recovery_start: 1593707106
time_waited: 1429

root     16048  0.0  0.0  11232   708 pts/2    D+   12:30   0:00 lctl --device 14 abort_recovery

 sudo cat /proc/16048/stack

[&amp;lt;ffffffffc0e9474d&amp;gt;] target_stop_recovery_thread.part.20+0x3d/0xd0 [ptlrpc]
[&amp;lt;ffffffffc0e947f8&amp;gt;] target_stop_recovery_thread+0x18/0x20 [ptlrpc]
[&amp;lt;ffffffffc11bcf68&amp;gt;] mdt_iocontrol+0x558/0xb00 [mdt]
[&amp;lt;ffffffffc0a17eeb&amp;gt;] class_handle_ioctl+0x16bb/0x1cc0 [obdclass]
[&amp;lt;ffffffffc0a18565&amp;gt;] obd_class_ioctl+0x75/0x170 [obdclass]
[&amp;lt;ffffffff9d256490&amp;gt;] do_vfs_ioctl+0x3a0/0x5a0
[&amp;lt;ffffffff9d256731&amp;gt;] SyS_ioctl+0xa1/0xc0
[&amp;lt;ffffffff9d776ddb&amp;gt;] system_call_fastpath+0x22/0x27
[&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="274419" author="gerrit" created="Sat, 4 Jul 2020 03:04:23 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/38746/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/38746/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13608&quot; title=&quot;MDT stuck in WAITING, abort_recov stuck too&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13608&quot;&gt;&lt;del&gt;LU-13608&lt;/del&gt;&lt;/a&gt; tgt: abort recovery while reading update llog&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 0496cdf20451f07befebd1cb8a770544ec0f57df&lt;/p&gt;</comment>
                            <comment id="274483" author="gerrit" created="Mon, 6 Jul 2020 00:34:46 +0000"  >&lt;p&gt;Li Dongyang (dongyangli@ddn.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/39284&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/39284&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13608&quot; title=&quot;MDT stuck in WAITING, abort_recov stuck too&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13608&quot;&gt;&lt;del&gt;LU-13608&lt;/del&gt;&lt;/a&gt; tgt: abort recovery while reading update llog&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: be2b297c3edd91d7caab97408bf7f539612a47e6&lt;/p&gt;</comment>
                            <comment id="275937" author="pjones" created="Tue, 21 Jul 2020 23:19:52 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=hongchao.zhang&quot; class=&quot;user-hover&quot; rel=&quot;hongchao.zhang&quot;&gt;hongchao.zhang&lt;/a&gt; is the issue reported by &lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=aboyko&quot; class=&quot;user-hover&quot; rel=&quot;aboyko&quot;&gt;aboyko&lt;/a&gt; a different issue?&lt;/p&gt;</comment>
                            <comment id="275964" author="hongchao.zhang" created="Wed, 22 Jul 2020 11:59:18 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=pjones&quot; class=&quot;user-hover&quot; rel=&quot;pjones&quot;&gt;pjones&lt;/a&gt;&lt;br/&gt;
Hi, this could be another case of this issue, I will create a new patch to fix it.&lt;/p&gt;</comment>
                            <comment id="275965" author="aboyko" created="Wed, 22 Jul 2020 12:27:18 +0000"  >&lt;p&gt;HongChao Zhang, could you please reproduce the original issue and show that abort recovery fixes it?&#160; It&apos;s better to have a regression test for it or some reproducer.&lt;/p&gt;</comment>
                            <comment id="276375" author="gerrit" created="Thu, 30 Jul 2020 12:13:27 +0000"  >&lt;p&gt;Alexander Boyko (alexander.boyko@hpe.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/39538&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/39538&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13608&quot; title=&quot;MDT stuck in WAITING, abort_recov stuck too&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13608&quot;&gt;&lt;del&gt;LU-13608&lt;/del&gt;&lt;/a&gt; out: don&apos;t return einprogress error&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 48cdb839e17fa99a2cbfe442b4ad86c6a6746e2b&lt;/p&gt;</comment>
                            <comment id="278453" author="gerrit" created="Tue, 1 Sep 2020 03:44:40 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/39538/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/39538/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13608&quot; title=&quot;MDT stuck in WAITING, abort_recov stuck too&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13608&quot;&gt;&lt;del&gt;LU-13608&lt;/del&gt;&lt;/a&gt; out: don&apos;t return einprogress error&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 865aa3f692bccdd9cf7ff6cafeee350e06bb8d76&lt;/p&gt;</comment>
                            <comment id="281491" author="gerrit" created="Tue, 6 Oct 2020 00:57:50 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/39284/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/39284/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13608&quot; title=&quot;MDT stuck in WAITING, abort_recov stuck too&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13608&quot;&gt;&lt;del&gt;LU-13608&lt;/del&gt;&lt;/a&gt; tgt: abort recovery while reading update llog&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 4142f05811b019df453ce52f6b690ec81fa5897f&lt;/p&gt;</comment>
                            <comment id="289104" author="gerrit" created="Fri, 8 Jan 2021 23:16:06 +0000"  >&lt;p&gt;Andreas Dilger (adilger@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/41183&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/41183&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13608&quot; title=&quot;MDT stuck in WAITING, abort_recov stuck too&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13608&quot;&gt;&lt;del&gt;LU-13608&lt;/del&gt;&lt;/a&gt; out: don&apos;t return einprogress error&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 2c6b286c1e596b850cabe0b185c1552b0133496d&lt;/p&gt;</comment>
                            <comment id="290745" author="adilger" created="Fri, 29 Jan 2021 22:49:41 +0000"  >&lt;p&gt;Alexander Boyko (alexander.boyko@hpe.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/39539&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/39539&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13608&quot; title=&quot;MDT stuck in WAITING, abort_recov stuck too&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13608&quot;&gt;&lt;del&gt;LU-13608&lt;/del&gt;&lt;/a&gt; tests: check MDS recovery hang&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 2329127abf2ace5333a10352b77202c55f8da0aa&lt;/p&gt;</comment>
                            <comment id="292107" author="gerrit" created="Tue, 16 Feb 2021 21:59:50 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/41183/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/41183/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13608&quot; title=&quot;MDT stuck in WAITING, abort_recov stuck too&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13608&quot;&gt;&lt;del&gt;LU-13608&lt;/del&gt;&lt;/a&gt; out: don&apos;t return einprogress error&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 7817acc39ee1d6859c2737f75619748dc8e37f95&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                                        </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="62297">LU-14318</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="61557">LU-14119</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i011gf:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>