<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:46:42 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary, append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-11761] blocked MDT mount and high cpu usage from lodXXXX_recYYYY threads</title>
                <link>https://jira.whamcloud.com/browse/LU-11761</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Another issue found while testing 2.12.0 RC2: the MDT mounts never seem to complete, and the following threads take 100% CPU:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
 20953 root 20 0 0 0 0 R 100.0 0.0 27:00.33 lod0002_rec0001
 20954 root 20 0 0 0 0 R 100.0 0.0 27:00.34 lod0002_rec0003
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This is on &lt;tt&gt;fir-md1-s1&lt;/tt&gt;, which handles MDT0 and MDT2 on this test system.&lt;/p&gt;

&lt;p&gt;A sysrq-t dump (&lt;tt&gt;echo t &amp;gt; /proc/sysrq-trigger&lt;/tt&gt;) shows:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Dec 11 09:50:13 fir-md1-s1 kernel: lod0002_rec0001 R  running task        0 20953      2 0x00000080
Dec 11 09:50:13 fir-md1-s1 kernel: Call Trace:
Dec 11 09:50:13 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0d3cc5d&amp;gt;] ? keys_fini+0x2d/0x1d0 [obdclass]
Dec 11 09:50:13 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0d3ce2b&amp;gt;] lu_context_fini+0x2b/0xa0 [obdclass]
Dec 11 09:50:13 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0d3d0da&amp;gt;] lu_env_init+0x1a/0x30 [obdclass]
Dec 11 09:50:13 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0f19b68&amp;gt;] ptlrpc_set_wait+0x7d8/0x8d0 [ptlrpc]
Dec 11 09:50:13 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0d515e5&amp;gt;] ? lustre_get_jobid+0x185/0x2e0 [obdclass]
Dec 11 09:50:13 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0d09f3c&amp;gt;] ? obd_get_request_slot+0x3c/0x280 [obdclass]
Dec 11 09:50:13 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0f19ce3&amp;gt;] ptlrpc_queue_wait+0x83/0x230 [ptlrpc]
Dec 11 09:50:13 fir-md1-s1 kernel:  [&amp;lt;ffffffffc088e334&amp;gt;] fld_client_rpc+0x104/0x540 [fld]
Dec 11 09:50:13 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0892f5f&amp;gt;] fld_server_lookup+0x15f/0x320 [fld]
Dec 11 09:50:13 fir-md1-s1 kernel:  [&amp;lt;ffffffffc1684587&amp;gt;] lod_fld_lookup+0x327/0x510 [lod]
Dec 11 09:50:13 fir-md1-s1 kernel:  [&amp;lt;ffffffffc16997dd&amp;gt;] lod_object_init+0x7d/0x3c0 [lod]
Dec 11 09:50:13 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0d3dfd5&amp;gt;] lu_object_alloc+0xe5/0x320 [obdclass]
Dec 11 09:50:13 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0d3e2e6&amp;gt;] lu_object_find_at+0x76/0x280 [obdclass]
Dec 11 09:50:13 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0d3f78d&amp;gt;] dt_locate_at+0x1d/0xb0 [obdclass]
Dec 11 09:50:13 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0d02b4c&amp;gt;] llog_osd_open+0xfc/0xf30 [obdclass]
Dec 11 09:50:13 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0d3e789&amp;gt;] ? lu_object_put+0x279/0x3d0 [obdclass]
Dec 11 09:50:13 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0ceff20&amp;gt;] llog_open+0x140/0x3d0 [obdclass]
Dec 11 09:50:13 fir-md1-s1 kernel:  [&amp;lt;ffffffffc16bdeed&amp;gt;] lod_sub_prep_llog+0x14d/0x783 [lod]
Dec 11 09:50:13 fir-md1-s1 kernel:  [&amp;lt;ffffffffc16837ab&amp;gt;] lod_sub_recovery_thread+0x1cb/0xc80 [lod]
Dec 11 09:50:13 fir-md1-s1 kernel:  [&amp;lt;ffffffffc16835e0&amp;gt;] ? lod_obd_get_info+0x9d0/0x9d0 [lod]
Dec 11 09:50:13 fir-md1-s1 kernel:  [&amp;lt;ffffffffa7ac1c31&amp;gt;] kthread+0xd1/0xe0
Dec 11 09:50:13 fir-md1-s1 kernel:  [&amp;lt;ffffffffa7ac1b60&amp;gt;] ? insert_kthread_work+0x40/0x40
Dec 11 09:50:13 fir-md1-s1 kernel:  [&amp;lt;ffffffffa8174c24&amp;gt;] ret_from_fork_nospec_begin+0xe/0x21
Dec 11 09:50:13 fir-md1-s1 kernel:  [&amp;lt;ffffffffa7ac1b60&amp;gt;] ? insert_kthread_work+0x40/0x40
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;and&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Dec 11 09:44:24 fir-md1-s1 kernel: lod0002_rec0003 R  running task        0 20954      2 0x00000080
Dec 11 09:44:24 fir-md1-s1 kernel: Call Trace:
Dec 11 09:44:24 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0d3cfa3&amp;gt;] ? lu_context_init+0xd3/0x1f0 [obdclass]
Dec 11 09:44:24 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0d3ceba&amp;gt;] ? lu_env_fini+0x1a/0x30 [obdclass]
Dec 11 09:44:24 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0f19b68&amp;gt;] ? ptlrpc_set_wait+0x7d8/0x8d0 [ptlrpc]
Dec 11 09:44:24 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0d515e5&amp;gt;] ? lustre_get_jobid+0x185/0x2e0 [obdclass]
Dec 11 09:44:24 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0d09f3c&amp;gt;] ? obd_get_request_slot+0x3c/0x280 [obdclass]
Dec 11 09:44:24 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0f19ce3&amp;gt;] ? ptlrpc_queue_wait+0x83/0x230 [ptlrpc]
Dec 11 09:44:24 fir-md1-s1 kernel:  [&amp;lt;ffffffffc088e421&amp;gt;] ? fld_client_rpc+0x1f1/0x540 [fld]
Dec 11 09:44:24 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0892f5f&amp;gt;] ? fld_server_lookup+0x15f/0x320 [fld]
Dec 11 09:44:24 fir-md1-s1 kernel:  [&amp;lt;ffffffffc1684587&amp;gt;] ? lod_fld_lookup+0x327/0x510 [lod]
Dec 11 09:44:24 fir-md1-s1 kernel:  [&amp;lt;ffffffffc16997dd&amp;gt;] ? lod_object_init+0x7d/0x3c0 [lod]
Dec 11 09:44:24 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0d3dfd5&amp;gt;] ? lu_object_alloc+0xe5/0x320 [obdclass]
Dec 11 09:44:24 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0d3e2e6&amp;gt;] ? lu_object_find_at+0x76/0x280 [obdclass]
Dec 11 09:44:24 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0d3f78d&amp;gt;] ? dt_locate_at+0x1d/0xb0 [obdclass]
Dec 11 09:44:24 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0d02b4c&amp;gt;] ? llog_osd_open+0xfc/0xf30 [obdclass]
Dec 11 09:44:24 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0d3e789&amp;gt;] ? lu_object_put+0x279/0x3d0 [obdclass]
Dec 11 09:44:24 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0ceff20&amp;gt;] ? llog_open+0x140/0x3d0 [obdclass]
Dec 11 09:44:24 fir-md1-s1 kernel:  [&amp;lt;ffffffffc16bdeed&amp;gt;] ? lod_sub_prep_llog+0x14d/0x783 [lod]
Dec 11 09:44:24 fir-md1-s1 kernel:  [&amp;lt;ffffffffc16837ab&amp;gt;] ? lod_sub_recovery_thread+0x1cb/0xc80 [lod]
Dec 11 09:44:24 fir-md1-s1 kernel:  [&amp;lt;ffffffffc16835e0&amp;gt;] ? lod_obd_get_info+0x9d0/0x9d0 [lod]
Dec 11 09:44:24 fir-md1-s1 kernel:  [&amp;lt;ffffffffa7ac1c31&amp;gt;] ? kthread+0xd1/0xe0
Dec 11 09:44:24 fir-md1-s1 kernel:  [&amp;lt;ffffffffa7ac1b60&amp;gt;] ? insert_kthread_work+0x40/0x40
Dec 11 09:44:24 fir-md1-s1 kernel:  [&amp;lt;ffffffffa8174c24&amp;gt;] ? ret_from_fork_nospec_begin+0xe/0x21
Dec 11 09:44:24 fir-md1-s1 kernel:  [&amp;lt;ffffffffa7ac1b60&amp;gt;] ? insert_kthread_work+0x40/0x40
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Mount commands are stuck, even when using &lt;tt&gt;-o abort_recov&lt;/tt&gt;.&lt;/p&gt;

&lt;p&gt;I took a crash dump just in case you&apos;re interested.&lt;/p&gt;

&lt;p&gt;I believe this is a regression relative to earlier 2.11.x versions.&lt;/p&gt;

&lt;p&gt;HTH,&lt;br/&gt;
Stephane&lt;/p&gt;</description>
                <environment>CentOS 7.6 3.10.0-957.1.3.el7_lustre.x86_64 Lustre 2.12.0 RC2</environment>
        <key id="54258">LU-11761</key>
            <summary>blocked MDT mount and high cpu usage from lodXXXX_recYYYY threads</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="hongchao.zhang">Hongchao Zhang</assignee>
                                    <reporter username="sthiell">Stephane Thiell</reporter>
                        <labels>
                    </labels>
                <created>Tue, 11 Dec 2018 18:06:28 +0000</created>
                <updated>Thu, 9 Dec 2021 08:39:34 +0000</updated>
                            <resolved>Wed, 17 Jul 2019 21:23:09 +0000</resolved>
                                    <version>Lustre 2.12.0</version>
                                    <fixVersion>Lustre 2.13.0</fixVersion>
                    <fixVersion>Lustre 2.12.3</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>7</watches>
                                                                            <comments>
                            <comment id="238403" author="pjones" created="Tue, 11 Dec 2018 18:29:09 +0000"  >&lt;p&gt;Lai&lt;/p&gt;

&lt;p&gt;Could you please make an assessment of this issue? Does this seem to be fallout from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11753&quot; title=&quot;MDS BUG on lfs migrate [osd_it_ea_rec]&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11753&quot;&gt;&lt;del&gt;LU-11753&lt;/del&gt;&lt;/a&gt;?&lt;/p&gt;

&lt;p&gt;Stephane&lt;/p&gt;

&lt;p&gt;Could you please attach the debug logs to the Jira ticket and&#160;upload the crash dump to the Whamcloud ftp site?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="238408" author="sthiell" created="Tue, 11 Dec 2018 18:49:46 +0000"  >&lt;p&gt;Sure, I uploaded the vmcore and corresponding debuginfo rpms to ftp.whamcloud.com:&lt;/p&gt;

&lt;p&gt;*&#160;vmcore as &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11761&quot; title=&quot;blocked MDT mount and high cpu usage from lodXXXX_recYYYY threads&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11761&quot;&gt;&lt;del&gt;LU-11761&lt;/del&gt;&lt;/a&gt;-vmcore-fir-md1-s1-2018-12-11-10-05-46&lt;/p&gt;

&lt;p&gt;debuginfo rpms as:&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;lustre-debuginfo-2.12.0_RC2-1.el7.x86_64.rpm&lt;/li&gt;
	&lt;li&gt;kernel-debuginfo-3.10.0-957.1.3.el7_lustre.x86_64.rpm.1 (please disregard kernel-debuginfo-3.10.0-957.1.3.el7_lustre.x86_64.rpm)&lt;/li&gt;
	&lt;li&gt;kernel-debuginfo-common-x86_64-3.10.0-957.1.3.el7_lustre.x86_64.rpm&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;And yes, this could be fallout from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11753&quot; title=&quot;MDS BUG on lfs migrate [osd_it_ea_rec]&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11753&quot;&gt;&lt;del&gt;LU-11753&lt;/del&gt;&lt;/a&gt;, as I started having this issue after the problematic &lt;tt&gt;lfs migrate -m ...&lt;/tt&gt;&lt;/p&gt;</comment>
                            <comment id="238435" author="laisiyao" created="Wed, 12 Dec 2018 02:22:19 +0000"  >&lt;p&gt;Peter, this looks to be caused by corrupt update log, which may be generated by &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11753&quot; title=&quot;MDS BUG on lfs migrate [osd_it_ea_rec]&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11753&quot;&gt;&lt;del&gt;LU-11753&lt;/del&gt;&lt;/a&gt;. I&apos;ll check logs for details.&lt;/p&gt;

&lt;p&gt;Stephane, did you use multiple MDTs before? The update log is only used on DNE systems.&lt;/p&gt;</comment>
                            <comment id="238667" author="laisiyao" created="Mon, 17 Dec 2018 03:44:25 +0000"  >&lt;p&gt;This will be fixed in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10888&quot; title=&quot;&amp;#39;lctl abort_recovery&amp;#39; allow aborting recovery between MDTs&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10888&quot;&gt;&lt;del&gt;LU-10888&lt;/del&gt;&lt;/a&gt;, but it&apos;s not finished yet.&lt;/p&gt;</comment>
                            <comment id="246441" author="sthiell" created="Sun, 28 Apr 2019 18:02:34 +0000"  >&lt;p&gt;Hit this again when mounting MDTs with &lt;tt&gt;-o abort_recov&lt;/tt&gt;&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Apr 28 10:56:16 fir-md1-s1 kernel: Lustre: fir-MDT0002: Imperative Recovery enabled, recovery window shrunk from 300-900 down to 150-900
Apr 28 10:56:17 fir-md1-s1 kernel: Lustre: fir-MDD0002: changelog on
Apr 28 10:56:17 fir-md1-s1 kernel: Lustre: fir-MDT0002: in recovery but waiting for the first client to connect
Apr 28 10:56:17 fir-md1-s1 kernel: LustreError: 39987:0:(mdt_handler.c:6447:mdt_iocontrol()) fir-MDT0002: Aborting recovery for device
Apr 28 10:56:17 fir-md1-s1 kernel: LustreError: 39987:0:(ldlm_lib.c:2605:target_stop_recovery_thread()) fir-MDT0002: Aborting recovery
Apr 28 10:56:17 fir-md1-s1 kernel: Lustre: Skipped 2 previous similar messages
Apr 28 10:56:17 fir-md1-s1 kernel: Lustre: 40232:0:(ldlm_lib.c:2058:target_recovery_overseer()) recovery is aborted, evict exports in recovery
Apr 28 10:56:17 fir-md1-s1 kernel: Lustre: 40232:0:(ldlm_lib.c:2058:target_recovery_overseer()) Skipped 2 previous similar messages
Apr 28 10:56:23 fir-md1-s1 kernel: LustreError: 23568:0:(ldlm_lockd.c:2322:ldlm_cancel_handler()) ldlm_cancel from 10.9.101.9@o2ib4 arrived at 1556474183 with bad export cookie 3999205256457834672
Apr 28 10:56:23 fir-md1-s1 kernel: LustreError: 23568:0:(ldlm_lockd.c:2322:ldlm_cancel_handler()) Skipped 2 previous similar messages
Apr 28 10:58:55 fir-md1-s1 kernel: LustreError: 137-5: fir-MDT0000_UUID: not available for connect from 10.9.105.54@o2ib4 (no target). If you are running an HA pair check that the target is mounted on the other 
Apr 28 10:58:55 fir-md1-s1 kernel: LustreError: Skipped 16004 previous similar messages
Apr 28 11:00:08 fir-md1-s1 kernel: INFO: task mount.lustre:39987 blocked for more than 120 seconds.
Apr 28 11:00:08 fir-md1-s1 kernel: &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
Apr 28 11:00:08 fir-md1-s1 kernel: mount.lustre    D ffff8b625635b0c0     0 39987  39986 0x00000082
Apr 28 11:00:08 fir-md1-s1 kernel: Call Trace:
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffff9bb67c49&amp;gt;] schedule+0x29/0x70
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffff9bb65721&amp;gt;] schedule_timeout+0x221/0x2d0
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffff9bb67ffd&amp;gt;] wait_for_completion+0xfd/0x140
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffff9b4d67b0&amp;gt;] ? wake_up_state+0x20/0x20
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0d6936d&amp;gt;] target_stop_recovery_thread.part.18+0x3d/0xd0 [ptlrpc]
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0d69418&amp;gt;] target_stop_recovery_thread+0x18/0x20 [ptlrpc]
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffffc146e868&amp;gt;] mdt_iocontrol+0x558/0xb00 [mdt]
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0b9a766&amp;gt;] server_start_targets+0x1c66/0x2a20 [obdclass]
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0b6eb10&amp;gt;] ? lustre_start_mgc+0x260/0x2510 [obdclass]
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0b67ea0&amp;gt;] ? class_config_dump_handler+0x7e0/0x7e0 [obdclass]
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0b9c5ec&amp;gt;] server_fill_super+0x10cc/0x1890 [obdclass]
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0b71828&amp;gt;] lustre_fill_super+0x328/0x950 [obdclass]
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0b71500&amp;gt;] ? lustre_common_put_super+0x270/0x270 [obdclass]
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffff9b6452cf&amp;gt;] mount_nodev+0x4f/0xb0
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0b69908&amp;gt;] lustre_mount+0x38/0x60 [obdclass]
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffff9b645e4e&amp;gt;] mount_fs+0x3e/0x1b0
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffff9b6639e7&amp;gt;] vfs_kern_mount+0x67/0x110
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffff9b66600f&amp;gt;] do_mount+0x1ef/0xce0
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffff9b665d45&amp;gt;] ? copy_mount_options+0xc5/0x170
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffff9b666e43&amp;gt;] SyS_mount+0x83/0xd0
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffff9bb74ddb&amp;gt;] system_call_fastpath+0x22/0x27
Apr 28 11:00:08 fir-md1-s1 kernel: INFO: task mount.lustre:40007 blocked for more than 120 seconds.
Apr 28 11:00:08 fir-md1-s1 kernel: &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
Apr 28 11:00:08 fir-md1-s1 kernel: mount.lustre    D ffff8b625635e180     0 40007  40006 0x00000082
Apr 28 11:00:08 fir-md1-s1 kernel: Call Trace:
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffff9bb68b69&amp;gt;] schedule_preempt_disabled+0x29/0x70
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffff9bb66ab7&amp;gt;] __mutex_lock_slowpath+0xc7/0x1d0
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffff9bb65e9f&amp;gt;] mutex_lock+0x1f/0x2f
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffffc104f398&amp;gt;] mgc_set_info_async+0xa98/0x15f0 [mgc]
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0b73bec&amp;gt;] ? keys_fill+0xfc/0x180 [obdclass]
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0b98e1a&amp;gt;] server_start_targets+0x31a/0x2a20 [obdclass]
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0b6eb10&amp;gt;] ? lustre_start_mgc+0x260/0x2510 [obdclass]
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0b69f90&amp;gt;] ? do_lcfg+0x2f0/0x500 [obdclass]
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0b9c5ec&amp;gt;] server_fill_super+0x10cc/0x1890 [obdclass]
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0b71828&amp;gt;] lustre_fill_super+0x328/0x950 [obdclass]
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0b71500&amp;gt;] ? lustre_common_put_super+0x270/0x270 [obdclass]
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffff9b6452cf&amp;gt;] mount_nodev+0x4f/0xb0
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0b69908&amp;gt;] lustre_mount+0x38/0x60 [obdclass]
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffff9b645e4e&amp;gt;] mount_fs+0x3e/0x1b0
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffff9b6639e7&amp;gt;] vfs_kern_mount+0x67/0x110
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffff9b66600f&amp;gt;] do_mount+0x1ef/0xce0
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffff9b63e2aa&amp;gt;] ? __check_object_size+0x1ca/0x250
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffff9b61c74c&amp;gt;] ? kmem_cache_alloc_trace+0x3c/0x200
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffff9b666e43&amp;gt;] SyS_mount+0x83/0xd0
Apr 28 11:00:08 fir-md1-s1 kernel:  [&amp;lt;ffffffff9bb74ddb&amp;gt;] system_call_fastpath+0x22/0x27
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;top:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;   PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM     TIME+ COMMAND                                                                                                                                      
 40230 root      20   0       0      0      0 R 100.0  0.0   4:14.53 lod0002_rec0001  
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="246453" author="hongchao.zhang" created="Mon, 29 Apr 2019 10:46:16 +0000"  >&lt;p&gt;In fld_client_rpc, the FLD_QUERY request will be retried in a loop if the connection to MDT0000 is not completed.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;int fld_client_rpc(struct obd_export *exp,
                   struct lu_seq_range *range, u32 fld_op,
                   struct ptlrpc_request **reqp)
{
            ...
            if (rc != 0) {
                if (imp-&amp;gt;imp_state != LUSTRE_IMP_CLOSED &amp;amp;&amp;amp;
                    !imp-&amp;gt;imp_deactive &amp;amp;&amp;amp;
                    imp-&amp;gt;imp_connect_flags_orig &amp;amp; OBD_CONNECT_MDS_MDS &amp;amp;&amp;amp;
                    OCD_HAS_FLAG(&amp;amp;imp-&amp;gt;imp_connect_data, LIGHTWEIGHT) &amp;amp;&amp;amp;
                    rc != -ENOTSUPP) {
                        /*
                         * Since LWP is not replayable, it will keep
                         * trying unless umount happens or the remote
                         * target does not support the operation;
                         * otherwise it would cause unnecessary
                         * failure of the application.
                         */
                        ptlrpc_req_finished(req);
                        rc = 0;
                        goto again;
                }
                GOTO(out_req, rc);
                ...
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;In this case, the connection to MDT0000 is actually lost, so the kernel thread &quot;lodxxxx_recxxxx&quot; is stuck in &lt;tt&gt;fld_client_rpc&lt;/tt&gt;.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Apr 28 10:58:55 fir-md1-s1 kernel: LustreError: 137-5: fir-MDT0000_UUID: not available for connect from 10.9.105.54@o2ib4 (no target). If you are running an HA pair check that the target is mounted on the other 
Apr 28 10:58:55 fir-md1-s1 kernel: LustreError: Skipped 16004 previous similar messages
&lt;/pre&gt;
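&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;A minimal sketch of one way out of this loop, in the spirit of the patch that eventually landed (&quot;fld: let&apos;s caller to retry FLD_QUERY&quot;): bound the retries inside &lt;tt&gt;fld_client_rpc&lt;/tt&gt; and hand &lt;tt&gt;-EAGAIN&lt;/tt&gt; back to the caller, which can check for umount or an aborted recovery before retrying. The &lt;tt&gt;retries&lt;/tt&gt; counter and &lt;tt&gt;FLD_QUERY_MAX_RETRIES&lt;/tt&gt; bound below are illustrative assumptions, not the actual change:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;/*
 * Illustrative sketch only -- not the landed patch itself.
 * Rather than looping forever inside fld_client_rpc(), bound
 * the in-function retries and return -EAGAIN, so the caller
 * (for example the lod recovery thread) can check for umount
 * or an aborted recovery before deciding to retry.
 */
#define FLD_QUERY_MAX_RETRIES 10 /* hypothetical bound */

            if (rc != 0) {
                if (imp-&amp;gt;imp_state != LUSTRE_IMP_CLOSED &amp;amp;&amp;amp;
                    !imp-&amp;gt;imp_deactive &amp;amp;&amp;amp;
                    imp-&amp;gt;imp_connect_flags_orig &amp;amp; OBD_CONNECT_MDS_MDS &amp;amp;&amp;amp;
                    OCD_HAS_FLAG(&amp;amp;imp-&amp;gt;imp_connect_data, LIGHTWEIGHT) &amp;amp;&amp;amp;
                    rc != -ENOTSUPP) {
                        ptlrpc_req_finished(req);
                        /* retries is a hypothetical local counter */
                        if (++retries &amp;lt; FLD_QUERY_MAX_RETRIES)
                                goto again;
                        /* give up here and let the caller retry */
                        rc = -EAGAIN;
                }
                GOTO(out_req, rc);
            }
&lt;/pre&gt;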
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="247749" author="gerrit" created="Mon, 27 May 2019 06:16:47 +0000"  >&lt;p&gt;Hongchao Zhang (hongchao@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/34962&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/34962&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11761&quot; title=&quot;blocked MDT mount and high cpu usage from lodXXXX_recYYYY threads&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11761&quot;&gt;&lt;del&gt;LU-11761&lt;/del&gt;&lt;/a&gt; fld: limit FLD_QUERY retry between MDTs&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: f2c831def8ac010b7a913633141deb2b5384e06c&lt;/p&gt;</comment>
                            <comment id="249638" author="sthiell" created="Fri, 21 Jun 2019 03:27:12 +0000"  >&lt;p&gt;Thanks Hongchao! FYI I&apos;ll use your current patch in case of emergency (eg. MDT restart) on our system, and rebuild when it&apos;s fully ready. We need to be able to abort the MDT recovery or better, the MDT recovery needs to be able to timeout, which is not always the case today as described in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12360&quot; title=&quot;Can&amp;#39;t restart filesystem (2.12) even with abort_recov&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12360&quot;&gt;LU-12360&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="251529" author="gerrit" created="Wed, 17 Jul 2019 06:21:27 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/34962/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/34962/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11761&quot; title=&quot;blocked MDT mount and high cpu usage from lodXXXX_recYYYY threads&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11761&quot;&gt;&lt;del&gt;LU-11761&lt;/del&gt;&lt;/a&gt; fld: let&apos;s caller to retry FLD_QUERY&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: e3f6111dfd1c6f2266d0beef67e5a7514a6965d0&lt;/p&gt;</comment>
                            <comment id="251580" author="pjones" created="Wed, 17 Jul 2019 21:23:09 +0000"  >&lt;p&gt;Landed for 2.13&lt;/p&gt;</comment>
                            <comment id="252366" author="gerrit" created="Thu, 1 Aug 2019 00:48:54 +0000"  >&lt;p&gt;Minh Diep (mdiep@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/35661&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/35661&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11761&quot; title=&quot;blocked MDT mount and high cpu usage from lodXXXX_recYYYY threads&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11761&quot;&gt;&lt;del&gt;LU-11761&lt;/del&gt;&lt;/a&gt; fld: let&apos;s caller to retry FLD_QUERY&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: fe3d5156ac95e58c5fc62ab503dcff3619ccfbe0&lt;/p&gt;</comment>
                            <comment id="254119" author="sthiell" created="Wed, 4 Sep 2019 17:58:24 +0000"  >&lt;p&gt;Is there any chance to land this one in 2.12.3?&lt;/p&gt;</comment>
                            <comment id="254120" author="pjones" created="Wed, 4 Sep 2019 18:00:43 +0000"  >&lt;p&gt;Yes it is quite likely this will end up in 2.12.3&lt;/p&gt;</comment>
                            <comment id="254165" author="sthiell" created="Thu, 5 Sep 2019 13:28:49 +0000"  >&lt;p&gt;Great! With this patch on top of current b2_12 (so quite a lot of changes for us, not only this patch), the recovery worked on all 4 MDTs:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;fir-md1-s2: Sep 05 05:59:04 fir-md1-s2 kernel: Lustre: fir-MDT0001: Recovery over after 5:26, of 1378 clients 1378 recovered and 0 were evicted.
fir-md1-s2: Sep 05 05:59:11 fir-md1-s2 kernel: Lustre: fir-MDT0003: Recovery over after 0:32, of 1378 clients 1378 recovered and 0 were evicted.
fir-md1-s1: Sep 05 05:59:07 fir-md1-s1 kernel: Lustre: fir-MDT0002: Recovery over after 5:26, of 1379 clients 1379 recovered and 0 were evicted.
fir-md1-s1: Sep 05 05:59:25 fir-md1-s1 kernel: Lustre: fir-MDT0000: Recovery over after 0:43, of 1379 clients 1379 recovered and 0 were evicted.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;One minor thing: two MDTs took more time to mount, but after a ~300s timeout they succeeded:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[ 4563.995907] LustreError: 39383:0:(ldlm_request.c:147:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1567688019, 300s ago), entering recovery for MGS@10.0.10.51@o2ib7 ns: MGC10.0.10.51@o2ib7 lock: ffff8fb0c2240480/0x5731634ee5f506c6 lrc: 4/1,0 mode: --/CR res: [0x726966:0x2:0x0].0x0 rrc: 3 type: PLN flags: 0x1000000000000 nid: local remote: 0x98816ce1399443f3 expref: -99 pid: 39383 timeout: 0 lvb_type: 0
[ 4564.033287] LustreError: 40296:0:(ldlm_resource.c:1147:ldlm_resource_complain()) MGC10.0.10.51@o2ib7: namespace resource [0x726966:0x2:0x0].0x0 (ffff8fb0c1334600) refcount nonzero (2) after lock cleanup; forcing cleanup.
[ 4564.052847] Lustre: MGC10.0.10.51@o2ib7: Connection restored to 10.0.10.51@o2ib7 (at 10.0.10.51@o2ib7)
[ 4564.062155] Lustre: Skipped 1 previous similar message
[ 4564.070150] Lustre: fir-MDT0003: Imperative Recovery enabled, recovery window shrunk from 300-900 down to 150-900
[ 4564.323008] Lustre: fir-MDD0003: changelog on
[ 4564.332139] Lustre: fir-MDT0003: in recovery but waiting for the first client to connect
[ 4564.429981] Lustre: fir-MDT0003: Will be in recovery for at least 2:30, or until 1378 clients reconnect
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;In case you want to investigate this timeout issue, the kernel logs of the successful start of 2.12.2_119 on our two MDSes are attached as:&lt;br/&gt;
 &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/33454/33454_fir-md1-s1_2.12.2_119_SRCC.log&quot; title=&quot;fir-md1-s1_2.12.2_119_SRCC.log attached to LU-11761&quot;&gt;fir-md1-s1_2.12.2_119_SRCC.log&lt;/a&gt;&lt;/span&gt; and &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/33455/33455_fir-md1-s2_2.12.2_119_SRCC.log&quot; title=&quot;fir-md1-s2_2.12.2_119_SRCC.log attached to LU-11761&quot;&gt;fir-md1-s2_2.12.2_119_SRCC.log&lt;/a&gt;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;But no 100% loaded lod* threads anymore. Not even a single eviction. This is awesome. Thanks all!&lt;/p&gt;</comment>
                            <comment id="254587" author="gerrit" created="Thu, 12 Sep 2019 03:51:09 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/35661/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/35661/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11761&quot; title=&quot;blocked MDT mount and high cpu usage from lodXXXX_recYYYY threads&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11761&quot;&gt;&lt;del&gt;LU-11761&lt;/del&gt;&lt;/a&gt; fld: let&apos;s caller to retry FLD_QUERY&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 80eb31f5367f5b879b544a61a938a8acd7de9cf5&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="49771">LU-10401</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="27471">LU-5871</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="55788">LU-12360</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="58843">LU-13468</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="33454" name="fir-md1-s1_2.12.2_119_SRCC.log" size="29301" author="sthiell" created="Thu, 5 Sep 2019 13:24:39 +0000"/>
                            <attachment id="33455" name="fir-md1-s2_2.12.2_119_SRCC.log" size="21679" author="sthiell" created="Thu, 5 Sep 2019 13:24:47 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i007sv:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                </customfields>
    </item>
</channel>
</rss>