<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:56:56 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-12935] MDT deadlock on 2.12.3 with DoM; is it missing async_discard feature?</title>
                <link>https://jira.whamcloud.com/browse/LU-12935</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Currently we cannot make MDT0 work again on Fir (2.12.3) due to these backtraces and lock timeout:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Nov 04 18:30:17 fir-md1-s1 kernel: Pid: 32408, comm: mdt01_024 3.10.0-957.27.2.el7_lustre.pl1.x86_64 #1 SMP Mon Aug 5 15:28:37 PDT 2019
Nov 04 18:30:17 fir-md1-s1 kernel: Call Trace:
Nov 04 18:30:17 fir-md1-s1 kernel:  [&amp;lt;ffffffffc10ccac0&amp;gt;] ldlm_completion_ast+0x430/0x860 [ptlrpc]
Nov 04 18:30:17 fir-md1-s1 kernel:  [&amp;lt;ffffffffc10cd5e1&amp;gt;] ldlm_cli_enqueue_local+0x231/0x830 [ptlrpc]
Nov 04 18:30:17 fir-md1-s1 kernel:  [&amp;lt;ffffffffc15d850b&amp;gt;] mdt_object_local_lock+0x50b/0xb20 [mdt]
Nov 04 18:30:17 fir-md1-s1 kernel:  [&amp;lt;ffffffffc15d8b90&amp;gt;] mdt_object_lock_internal+0x70/0x360 [mdt]
Nov 04 18:30:17 fir-md1-s1 kernel:  [&amp;lt;ffffffffc15d8ea0&amp;gt;] mdt_object_lock+0x20/0x30 [mdt]
Nov 04 18:30:17 fir-md1-s1 kernel:  [&amp;lt;ffffffffc1617c4b&amp;gt;] mdt_brw_enqueue+0x44b/0x760 [mdt]
Nov 04 18:30:17 fir-md1-s1 kernel:  [&amp;lt;ffffffffc15c64bf&amp;gt;] mdt_intent_brw+0x1f/0x30 [mdt]
Nov 04 18:30:17 fir-md1-s1 kernel:  [&amp;lt;ffffffffc15debb5&amp;gt;] mdt_intent_policy+0x435/0xd80 [mdt]
Nov 04 18:30:17 fir-md1-s1 kernel:  [&amp;lt;ffffffffc10b3d46&amp;gt;] ldlm_lock_enqueue+0x356/0xa20 [ptlrpc]
Nov 04 18:30:17 fir-md1-s1 kernel:  [&amp;lt;ffffffffc10dc336&amp;gt;] ldlm_handle_enqueue0+0xa56/0x15f0 [ptlrpc]
Nov 04 18:30:17 fir-md1-s1 kernel:  [&amp;lt;ffffffffc1164a12&amp;gt;] tgt_enqueue+0x62/0x210 [ptlrpc]
Nov 04 18:30:17 fir-md1-s1 kernel:  [&amp;lt;ffffffffc116936a&amp;gt;] tgt_request_handle+0xaea/0x1580 [ptlrpc]
Nov 04 18:30:17 fir-md1-s1 kernel:  [&amp;lt;ffffffffc111024b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
Nov 04 18:30:17 fir-md1-s1 kernel:  [&amp;lt;ffffffffc1113bac&amp;gt;] ptlrpc_main+0xb2c/0x1460 [ptlrpc]
Nov 04 18:30:17 fir-md1-s1 kernel:  [&amp;lt;ffffffffbe8c2e81&amp;gt;] kthread+0xd1/0xe0
Nov 04 18:30:17 fir-md1-s1 kernel:  [&amp;lt;ffffffffbef77c24&amp;gt;] ret_from_fork_nospec_begin+0xe/0x21
Nov 04 18:30:17 fir-md1-s1 kernel:  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
Nov 04 18:30:17 fir-md1-s1 kernel: LNet: Service thread pid 32415 was inactive for 201.19s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one.
Nov 04 18:30:17 fir-md1-s1 kernel: LNet: Skipped 1 previous similar message
Nov 04 18:31:56 fir-md1-s1 kernel: LustreError: 32601:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1572920815, 300s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0000_UUID lock: ffffa10e0b407bc0/0x675682d854098c0 lrc: 3/0,1 mode: --/PW res: [0x200034eb7:0x1:0x0].0x0 bits 0x40/0x0 rrc: 912 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 32601 timeout: 0 lvb_type: 0
Nov 04 18:31:56 fir-md1-s1 kernel: LustreError: 32601:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 224 previous similar messages
Nov 04 18:34:25 fir-md1-s1 kernel: LustreError: 32404:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1572920965, 300s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0000_UUID lock: ffffa12da9254ec0/0x675682d8540d5dd lrc: 3/0,1 mode: --/PW res: [0x200034eb7:0x1:0x0].0x0 bits 0x40/0x0 rrc: 913 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 32404 timeout: 0 lvb_type: 0
Nov 04 18:34:25 fir-md1-s1 kernel: LustreError: 32404:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 161 previous similar messages
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This looks similar to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11358&quot; title=&quot;racer test 1 hangs in locking with DNE&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11358&quot;&gt;LU-11358&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;I also just noticed another thing, on our new 2.12.3 clients, I can&apos;t find the async_discard import flag:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@sh-117-13 ~]# lctl get_param mdc.fir-*.import | grep connect_flags
    connect_flags: [ write_grant, server_lock, version, acl, xattr, create_on_write, truncate_lock, inode_bit_locks, getattr_by_fid, no_oh_for_devices, max_byte_per_rpc, early_lock_cancel, adaptive_timeouts, lru_resize, alt_checksum_algorithm, fid_is_enabled, version_recovery, pools, large_ea, full20, layout_lock, 64bithash, jobstats, umask, einprogress, grant_param, lvb_type, short_io, flock_deadlock, disp_stripe, open_by_fid, lfsck, multi_mod_rpcs, dir_stripe, subtree, bulk_mbits, second_flags, file_secctx, dir_migrate, unknown, flr, lock_convert, archive_id_array, selinux_policy, lsom, unknown2_0x4000 ]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Is it &lt;tt&gt;unknown2_0x4000&lt;/tt&gt; ? Or is it missing??&lt;/p&gt;


&lt;p&gt;Please confirm that the async_discard patch is missing from 2.12.3? If that&apos;s the case, we&apos;ll need to perform a full downgrade or emergency patching of the cluster and Lustre servers.&lt;/p&gt;

&lt;p&gt;Attaching logs from MDS fir-md1-s1 which serves MDT0 and that we cannot make operational again at the moment. We even tried with abort_recovery with no luck.&lt;/p&gt;</description>
                <environment>CentOS 7.6</environment>
        <key id="57308">LU-12935</key>
            <summary>MDT deadlock on 2.12.3 with DoM; is it missing async_discard feature?</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="tappro">Mikhail Pershin</assignee>
                                    <reporter username="sthiell">Stephane Thiell</reporter>
                        <labels>
                    </labels>
                <created>Tue, 5 Nov 2019 02:43:35 +0000</created>
                <updated>Mon, 24 Aug 2020 17:45:10 +0000</updated>
                            <resolved>Thu, 12 Dec 2019 23:35:11 +0000</resolved>
                                    <version>Lustre 2.12.3</version>
                                    <fixVersion>Lustre 2.12.4</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>7</watches>
                                                                            <comments>
                            <comment id="257683" author="sthiell" created="Tue, 5 Nov 2019 02:56:52 +0000"  >&lt;p&gt;Apparently the patch is included in 2.12.3 as &lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;commit e5810126b3fb488a3fed37e085e3ca4ae585324c
Author: Mikhail Pershin &amp;lt;mpershin@whamcloud.com&amp;gt;
Date:   Wed Oct 31 16:28:29 2018 +0300

    LU-11359 mdt: fix mdt_dom_discard_data() timeouts
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Which includes:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c
index 8894a38..13eaca4 100644
--- a/lustre/obdclass/lprocfs_status.c
+++ b/lustre/obdclass/lprocfs_status.c
@@ -785,6 +785,7 @@ static const char *obd_connect_names[] = {
        &quot;unknown&quot;,              /* 0x200 */
        &quot;selinux_policy&quot;,       /* 0x400 */
        &quot;lsom&quot;,                 /* 0x800 */
+       &quot;async_discard&quot;,        /* 0x4000 */
        NULL
 }; 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;So why is the Lustre client version available on the Whamcloud repository not recognizing the async_discard import flag? &lt;/p&gt;</comment>
                            <comment id="257684" author="sthiell" created="Tue, 5 Nov 2019 03:04:27 +0000"  >&lt;p&gt;Note: the 2.12.3 version I built is not able to recognize the connect flag either, and shows as &lt;tt&gt;unknown2_0x4000&lt;/tt&gt;. Prior to 2.12.3, with our patched 2.12.0, the flag was recognized as &quot;async_discard&quot;.  If the clients don&apos;t use the new feature, that would explain the MDT deadlock after we finally upgraded 1,000+ clients.&lt;/p&gt;</comment>
                            <comment id="257686" author="pjones" created="Tue, 5 Nov 2019 03:08:29 +0000"  >&lt;p&gt;Mike should look at this when he comes online but we will need to see what can be done to restore production in the meantime&lt;/p&gt;</comment>
                            <comment id="257687" author="sthiell" created="Tue, 5 Nov 2019 03:19:06 +0000"  >&lt;p&gt;Thanks! I&apos;ve tried an abort_recovery on MDT0 (we have four MDTs), but this didn&apos;t work. I&apos;ve restarted all MDTs without luck either. MDTs 1 to 3 are still working and some I/Os are still going on the cluster apparently, but the namespace is not accessible due to MDT0 being down/deadlocked. I think the next step will be to try an abort_recovery on all four MDTs but that will likely fail all jobs.&lt;/p&gt;</comment>
                            <comment id="257688" author="adilger" created="Tue, 5 Nov 2019 03:23:17 +0000"  >&lt;p&gt;As you posted in your recent comment, &lt;tt&gt;0x4000&lt;/tt&gt; is the &lt;tt&gt;unknown2_0x4000&lt;/tt&gt; flag, which should be interpreted as &lt;tt&gt;async_discard&lt;/tt&gt;.  The fact that it is being printed in the &lt;tt&gt;imports&lt;/tt&gt; file shows that both the client and server understand the &lt;tt&gt;OBD_CONNECT2_ASYNC_DISCARD&lt;/tt&gt; flag, though it is not being printed correctly.  It appears that this is because the &lt;tt&gt;obd_connect_names[]&lt;/tt&gt; array is &quot;positional&quot; and missing the lower-numbered flags &quot;&lt;tt&gt;pcc&lt;/tt&gt;&quot; and &quot;&lt;tt&gt;plain_layout&lt;/tt&gt;&quot; because those features were not backported to b2_12:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
        &lt;span class=&quot;code-quote&quot;&gt;&quot;archive_id_array&quot;&lt;/span&gt;,     &lt;span class=&quot;code-comment&quot;&gt;/* 0x100 */&lt;/span&gt;
        &lt;span class=&quot;code-quote&quot;&gt;&quot;unknown&quot;&lt;/span&gt;,              &lt;span class=&quot;code-comment&quot;&gt;/* 0x200 */&lt;/span&gt;
        &lt;span class=&quot;code-quote&quot;&gt;&quot;selinux_policy&quot;&lt;/span&gt;,       &lt;span class=&quot;code-comment&quot;&gt;/* 0x400 */&lt;/span&gt;
        &lt;span class=&quot;code-quote&quot;&gt;&quot;lsom&quot;&lt;/span&gt;,                 &lt;span class=&quot;code-comment&quot;&gt;/* 0x800 */&lt;/span&gt;
        &lt;span class=&quot;code-quote&quot;&gt;&quot;async_discard&quot;&lt;/span&gt;,        &lt;span class=&quot;code-comment&quot;&gt;/* 0x4000 */&lt;/span&gt;
        NULL
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I will make a patch so that &quot;&lt;tt&gt;async_discard&lt;/tt&gt;&quot; is printed properly, but that doesn&apos;t seem to be the root cause of this issue.&lt;/p&gt;</comment>
                            <comment id="257689" author="sthiell" created="Tue, 5 Nov 2019 03:34:02 +0000"  >&lt;p&gt;Thanks Andreas! And previous to the async_discard patch, we were still able to put the system back online.&lt;/p&gt;

&lt;p&gt;So I&apos;m attaching a debug log from the MDS as  &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/33770/33770_fir-md1-s1-dk.log.gz&quot; title=&quot;fir-md1-s1-dk.log.gz attached to LU-12935&quot;&gt;fir-md1-s1-dk.log.gz&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt; . Looks like there are some lfsck errors, it&apos;s perhaps something to look at, like this one:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00100000:10000000:15.0:1572924337.144468:0:34815:0:(lfsck_layout.c:374:lfsck_layout_verify_header_v1v3()) Unsupported LOV EA pattern 256 for the file [0x200029888:0x36c5:0x0] in the component 1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt; We started lfsck in dry-run mode this morning. This might be related.. perhaps. I know that the OI_Scrub had completed successfully on all MDTs.&lt;br/&gt;
Let me know what else would be helpful?&lt;/p&gt;</comment>
                            <comment id="257690" author="adilger" created="Tue, 5 Nov 2019 03:46:22 +0000"  >&lt;p&gt;The &lt;tt&gt;lfsck_layout_verify_header_v1v3()&lt;/tt&gt; message relates to &lt;tt&gt;LOV_PATTERN_MDT = 0x100&lt;/tt&gt;.  That is not yet implemented in (&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11081&quot; title=&quot;LFSCK support for DoM file&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11081&quot;&gt;LU-11081&lt;/a&gt;), but should only prevent DoM files from being repaired by LFSCK.&lt;/p&gt;

&lt;p&gt;I&apos;m just looking at the debug log to see if there is anything I can see that might avoid this issue.&lt;/p&gt;</comment>
                            <comment id="257692" author="sthiell" created="Tue, 5 Nov 2019 04:15:29 +0000"  >&lt;p&gt;Thanks Andreas, I will follow &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11081&quot; title=&quot;LFSCK support for DoM file&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11081&quot;&gt;LU-11081&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;Earlier today we took a crash dump (sysrq-triggered) and I&apos;m attaching the output of foreach bt as  &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/33771/33771_crash-sysrq-fir-md1-s1-foreach-bt.log&quot; title=&quot;crash-sysrq-fir-md1-s1-foreach-bt.log attached to LU-12935&quot;&gt;crash-sysrq-fir-md1-s1-foreach-bt.log&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;. We can see many mdt threads blocked in &lt;tt&gt;mdt_intent_brw()&lt;/tt&gt;. The lfsck thread is also doing things in ldiskfs but I doubt this is related to lfsck as we started to mount MDT0 using &lt;tt&gt;-o skip_lfsck&lt;/tt&gt; and it was the same.&lt;/p&gt;</comment>
                            <comment id="257693" author="sthiell" created="Tue, 5 Nov 2019 04:23:39 +0000"  >&lt;p&gt;Attaching debug logs for a local client &lt;tt&gt;fir-rbh01&lt;/tt&gt; as  &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/33772/33772_fir-rbh01.dk.log&quot; title=&quot;fir-rbh01.dk.log attached to LU-12935&quot;&gt;fir-rbh01.dk.log&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt; that we restarted too, so the log is complete. Two kind of errors I noticed while gathering this:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000100:00020000:36.0:1572920752.390569:0:84001:0:(layout.c:2113:__req_capsule_get()) @@@ Wrong buffer for field `obd_quotactl&apos; (1 of 1) in format `MDS_QUOTACTL&apos;: 0 vs. 112 (server)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;And:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000100:00000400:3.0:1572926475.221568:0:111431:0:(client.c:2133:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1572925719/real 1572925719]  req@ffff8b4664fdb600 x1649317797341696/t0(0) o48-&amp;gt;fir-MDT0000-mdc-ffff8b66b2630000@10.0.10.51@o2ib7:12/10 lens 336/336 e 0 to 1 dl 1572926475 ref 2 fl Rpc:X/2/ffffffff rc -11/-1
00000100:00000400:69.0:1572926475.221569:0:111377:0:(client.c:2133:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1572925719/real 1572925719]  req@ffff8b569d4a2400 x1649317797312576/t0(0) o41-&amp;gt;fir-MDT0000-mdc-ffff8b66b2630000@10.0.10.51@o2ib7:12/10 lens 224/368 e 0 to 1 dl 1572926475 ref 2 fl Rpc:X/2/ffffffff rc -11/-1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="257694" author="adilger" created="Tue, 5 Nov 2019 04:23:43 +0000"  >&lt;p&gt;It looks like the problem is related to the file &quot;&lt;span class=&quot;error&quot;&gt;&amp;#91;0x200034eb7:0x1:0x0&amp;#93;&lt;/span&gt;&quot;, which you may be able to use with &quot;&lt;tt&gt;lfs fid2path&lt;/tt&gt;&quot; to determine what its pathname is.  The lock reference count is going up and down on this file, but stays around 870 or so, likely indicating that either there is a lock refcount leak, or the clients continue getting references on the file and never actually release them.&lt;/p&gt;

&lt;p&gt;It seems like the MDS threads are eventually completing their operations on this file:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00010000:00020000:6.0F:1572922849.748921:0:32760:0:(ldlm_lockd.c:1348:ldlm_handle_enqueue0()) ### lock on destroyed export ffffa11d012fc400 ns: mdt-fir-MDT0000_UUID lock: ffffa12d9633d340/0x675682d8540993e lrc: 3/0,0 mode: PW/PW res: [0x200034eb7:0x1:0x0].0x0 bits 0x40/0x0 rrc: 899 type: IBT flags: 0x50200400000020 nid: 10.9.117.25@o2ib4 remote: 0x9240a829e5985bb6 expref: 283 pid: 32760 timeout: 0 lvb_type: 0
00000400:02000400:6.0:1572922849.748967:0:32760:0:(watchdog.c:397:lcw_update_time()) Service thread pid 32760 completed after 2033.77s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources).
:
:
00010000:00020000:38.0F:1572922849.932606:0:32409:0:(ldlm_lockd.c:1348:ldlm_handle_enqueue0()) ### lock on destroyed export ffffa12df7e7f800 ns: mdt-fir-MDT0000_UUID lock: ffffa12d7b6cd580/0x675682d854099d1 lrc: 3/0,0 mode: PW/PW res: [0x200034eb7:0x1:0x0].0x0 bits 0x40/0x0 rrc: 882 type: IBT flags: 0x50200400000020 nid: 10.9.117.39@o2ib4 remote: 0x6ac7aeec8dbc3b60 expref: 39 pid: 32409 timeout: 0 lvb_type: 0
00000400:02000400:38.0:1572922849.932678:0:32409:0:(watchdog.c:397:lcw_update_time()) Service thread pid 32409 completed after 2033.95s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources).
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;If LFSCK is still running, you could consider to stop it (via &quot;&lt;tt&gt;lctl lfsck_stop&lt;/tt&gt;&quot;), as that may be driving up the load on the MDS?&lt;/p&gt;</comment>
                            <comment id="257696" author="sthiell" created="Tue, 5 Nov 2019 04:55:53 +0000"  >&lt;p&gt;Thanks! We are looking into it. I&apos;ve got the path after a stop/start of MDT0 but I still can&apos;t stat it or get its striping info. The corresponding user runs a custom code (aeronautics), and has 4 jobs of 240 tasks each. Not sure if all of them are accessing this file and how big the file is at this point, but it is almost certain it is using DoM.&lt;/p&gt;

&lt;p&gt;Re: LFSCK, it has completed and just for information I&apos;m attaching the results as  &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/33773/33773_fir-md1-s1_lfsck-results.log&quot; title=&quot;fir-md1-s1_lfsck-results.log attached to LU-12935&quot;&gt;fir-md1-s1_lfsck-results.log&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt; &lt;/p&gt;
</comment>
                            <comment id="257697" author="sthiell" created="Tue, 5 Nov 2019 04:58:42 +0000"  >&lt;p&gt;Andreas, the file is empty!&lt;/p&gt;

&lt;p&gt;stat:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;  File: &#8216;/fir/..../DEFAULT.PKG&#8217;
  Size: 0         	Blocks: 0          IO Block: 4194304 regular empty file
Device: e64e03a8h/3863872424d	Inode: 144118891059085313  Links: 1
Access: (0644/-rw-r--r--)  Uid: (297171/    jbho)   Gid: (28669/ cfarhat)
Access: 2019-11-04 20:36:57.000000000 -0800
Modify: 2019-11-04 20:36:57.000000000 -0800
Change: 2019-11-04 20:36:57.000000000 -0800
 Birth: -
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Striping info:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;.../DEFAULT.PKG
  lcm_layout_gen:    4
  lcm_mirror_count:  1
  lcm_entry_count:   4
    lcme_id:             1
    lcme_mirror_id:      0
    lcme_flags:          init
    lcme_extent.e_start: 0
    lcme_extent.e_end:   131072
      lmm_stripe_count:  0
      lmm_stripe_size:   131072
      lmm_pattern:       mdt
      lmm_layout_gen:    0
      lmm_stripe_offset: 0

    lcme_id:             2
    lcme_mirror_id:      0
    lcme_flags:          0
    lcme_extent.e_start: 131072
    lcme_extent.e_end:   134217728
      lmm_stripe_count:  1
      lmm_stripe_size:   4194304
      lmm_pattern:       raid0
      lmm_layout_gen:    0
      lmm_stripe_offset: -1

    lcme_id:             3
    lcme_mirror_id:      0
    lcme_flags:          0
    lcme_extent.e_start: 134217728
    lcme_extent.e_end:   137438953472
      lmm_stripe_count:  2
      lmm_stripe_size:   4194304
      lmm_pattern:       raid0
      lmm_layout_gen:    0
      lmm_stripe_offset: -1

    lcme_id:             4
    lcme_mirror_id:      0
    lcme_flags:          0
    lcme_extent.e_start: 137438953472
    lcme_extent.e_end:   EOF
      lmm_stripe_count:  4
      lmm_stripe_size:   4194304
      lmm_pattern:       raid0
      lmm_layout_gen:    0
      lmm_stripe_offset: -1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="257698" author="sthiell" created="Tue, 5 Nov 2019 05:09:32 +0000"  >&lt;p&gt;Andreas, at this point, the filesystem seems back online again. I did several stop/start of MDT0 while I had a stat and lfs getstripe running, but it took several attempts to get the result.  But then the file was not empty anymore and contained 186 bytes of text, then after a few minutes was deleted.&lt;/p&gt;

&lt;p&gt;MDS logs:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Nov 04 20:53:24 fir-md1-s1 kernel: Lustre: fir-MDT0000: recovery is timed out, evict stale exports
Nov 04 20:53:24 fir-md1-s1 kernel: Lustre: fir-MDT0000: disconnecting 1 stale clients
Nov 04 20:53:24 fir-md1-s1 kernel: Lustre: 40926:0:(ldlm_lib.c:1765:extend_recovery_timer()) fir-MDT0000: extended recovery timer reaching hard limit: 900, extend: 1
Nov 04 20:53:24 fir-md1-s1 kernel: Lustre: 40926:0:(ldlm_lib.c:1765:extend_recovery_timer()) Skipped 77 previous similar messages
Nov 04 20:53:24 fir-md1-s1 kernel: Lustre: fir-MDT0000: Recovery over after 2:30, of 1280 clients 1279 recovered and 1 was evicted.
Nov 04 20:55:54 fir-md1-s1 kernel: LustreError: 21591:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 150s: evicting client at 10.9.117.37@o2ib4  ns: mdt-fir-MDT0000_UUID lock: ffffa13e2631e300/0x675682d85feceb8 lrc: 3/0,0 mode: PW/PW res: [0x200035e2b:0x1:0x0].0x0 bits 0x40/0x0 rrc: 888 type: IBT flags: 0x60000400000020 nid: 10.9.117.37@o2ib4 remote: 0x4d1afbc8d54ee1c expref: 21 pid: 40926 timeout: 19031 lvb_type: 0
Nov 04 20:55:55 fir-md1-s1 kernel: LustreError: 41055:0:(service.c:2128:ptlrpc_server_handle_request()) @@@ Dropping timed-out request from 12345-10.9.108.53@o2ib4: deadline 100:15s ago
                                     req@ffffa13e32f00000 x1649335780509632/t0(0) o38-&amp;gt;&amp;lt;?&amp;gt;@&amp;lt;?&amp;gt;:0/0 lens 520/0 e 0 to 0 dl 1572929740 ref 1 fl Interpret:/0/ffffffff rc 0/-1
Nov 04 20:55:55 fir-md1-s1 kernel: Lustre: 41128:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (100:7s); client may timeout.  req@ffffa13e285d1b00 x1649334750879840/t0(0) o38-&amp;gt;&amp;lt;?&amp;gt;@&amp;lt;?&amp;gt;:0/0 lens 520/0 e 0 to 0 dl 1572929748 ref 1 fl Interpret:/0/ffffffff rc 0/-1
Nov 04 20:55:55 fir-md1-s1 kernel: Lustre: 41128:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 104 previous similar messages
Nov 04 20:55:55 fir-md1-s1 kernel: LustreError: 41055:0:(service.c:2128:ptlrpc_server_handle_request()) Skipped 110 previous similar messages
Nov 04 20:56:58 fir-md1-s1 kernel: Lustre: fir-MDT0000: Client 9b6df508-4fd4-62e6-a19b-42f88c25e71f (at 10.8.26.4@o2ib6) reconnecting
Nov 04 20:56:58 fir-md1-s1 kernel: Lustre: Skipped 39 previous similar messages
Nov 04 21:00:35 fir-md1-s1 kernel: LustreError: 137-5: fir-MDT0003_UUID: not available for connect from 10.8.11.28@o2ib6 (no target). If you are running an HA pair check that the target is mounted on the other server.
Nov 04 21:00:35 fir-md1-s1 kernel: LustreError: Skipped 667 previous similar messages
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="257700" author="sthiell" created="Tue, 5 Nov 2019 05:12:24 +0000"  >&lt;p&gt;Updated MDS dk logs attached as  &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/33774/33774_fir-md1-s1-dk2.log.gz&quot; title=&quot;fir-md1-s1-dk2.log.gz attached to LU-12935&quot;&gt;fir-md1-s1-dk2.log.gz&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt; and filesystem is still running.&lt;/p&gt;</comment>
                            <comment id="257701" author="sthiell" created="Tue, 5 Nov 2019 05:18:55 +0000"  >&lt;p&gt;The file in question, DEFAULT.PKG, is actually used by the AERO-F software to restart a simulation  (&lt;a href=&quot;https://frg.bitbucket.io/aero-f/index.html#Restart&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://frg.bitbucket.io/aero-f/index.html#Restart&lt;/a&gt;)&lt;/p&gt;</comment>
                            <comment id="257784" author="tappro" created="Tue, 5 Nov 2019 21:42:57 +0000"  >&lt;p&gt;Stephane, a similar stack trace was seen several times in a couple of bugs like &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11358&quot; title=&quot;racer test 1 hangs in locking with DNE&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11358&quot;&gt;LU-11358&lt;/a&gt; you&apos;ve mentioned, but in those cases there was also another thread which was the blocker, and I don&apos;t see any other thread in logs which would block all these mdt_brw_intent() threads.&lt;br/&gt;
In fact DoM is vulnerable to such access patterns like we have here - multiple writers to single file, because DoM lock covers whole DoM region but not stripes, so only one writer at the time is allowed. That can become bottleneck and may cause lock timeouts on high load. So general recommendation in that case - use ordinary file with OST stripes instead.&lt;br/&gt;
Meanwhile that shouldn&apos;t cause such server hung as you&apos;ve experienced, so I will investigate that more. &lt;/p&gt;</comment>
                            <comment id="257787" author="sthiell" created="Tue, 5 Nov 2019 21:59:17 +0000"  >&lt;p&gt;Hi Mike,&lt;/p&gt;

&lt;p&gt;Thanks for looking at this!&lt;/p&gt;

&lt;p&gt;You say the whole DoM region, but in our case, the DoM size is only 128 KB. It&apos;s less than the size of an OST stripe (4MB), so I&apos;m not sure I understand why DoM wouldn&apos;t be as efficient. For larger DoM sizes, I understand.&lt;/p&gt;

&lt;p&gt;And we do have arrays of SSDs for the MDT storage. So disk I/Os for DoM should not be a problem.&lt;/p&gt;

&lt;p&gt;Our main problem here is that it blocked MDT0 and thus the filesystem/namespace. I was hoping you could find a blocking thread maybe from  &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/33771/33771_crash-sysrq-fir-md1-s1-foreach-bt.log&quot; title=&quot;crash-sysrq-fir-md1-s1-foreach-bt.log attached to LU-12935&quot;&gt;crash-sysrq-fir-md1-s1-foreach-bt.log&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt; . Do you want the associated crash dump? Let me know, I can upload it to your FTP.&lt;/p&gt;</comment>
                            <comment id="257808" author="sthiell" created="Wed, 6 Nov 2019 05:57:21 +0000"  >&lt;p&gt;A similar situation happened again after the user relaunched his jobs, but the traces on the MDS are a bit different I think, at least the first ones. Looks like the filesystem is blocked again.&lt;/p&gt;

&lt;p&gt;One of these traces:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Nov 05 20:58:18 fir-md1-s1 kernel: NMI watchdog: BUG: soft lockup - CPU#38 stuck for 22s! [mdt_io02_034:41734]
...
Nov 05 20:58:18 fir-md1-s1 kernel: CPU: 38 PID: 41734 Comm: mdt_io02_034 Kdump: loaded Tainted: G           OEL ------------   3.10.0-957.27.2.el7_lustre.pl1.x86_64 #1
Nov 05 20:58:18 fir-md1-s1 kernel: Hardware name: Dell Inc. PowerEdge R6415/065PKD, BIOS 1.10.6 08/15/2019
Nov 05 20:58:18 fir-md1-s1 kernel: task: ffffa11e2a1e4100 ti: ffffa13cd3a70000 task.ti: ffffa13cd3a70000
Nov 05 20:58:18 fir-md1-s1 kernel: RIP: 0010:[&amp;lt;ffffffffbe913536&amp;gt;]  [&amp;lt;ffffffffbe913536&amp;gt;] native_queued_spin_lock_slowpath+0x126/0x200
Nov 05 20:58:18 fir-md1-s1 kernel: RSP: 0018:ffffa13cd3a73800  EFLAGS: 00000246
Nov 05 20:58:18 fir-md1-s1 kernel: RAX: 0000000000000000 RBX: ffffa130a9500be0 RCX: 0000000001310000
Nov 05 20:58:18 fir-md1-s1 kernel: RDX: ffffa12e3f8db780 RSI: 0000000001710101 RDI: ffffa13e3710f480
Nov 05 20:58:18 fir-md1-s1 kernel: RBP: ffffa13cd3a73800 R08: ffffa12e3f85b780 R09: 0000000000000000
Nov 05 20:58:18 fir-md1-s1 kernel: R10: ffffa12e3f85f140 R11: ffffda91d59da200 R12: 0000000000000000
Nov 05 20:58:18 fir-md1-s1 kernel: R13: ffffa13cd3a737a0 R14: ffffa130a9500948 R15: 0000000000000000
Nov 05 20:58:18 fir-md1-s1 kernel: FS:  00007f38ccb1d700(0000) GS:ffffa12e3f840000(0000) knlGS:0000000000000000
Nov 05 20:58:18 fir-md1-s1 kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
Nov 05 20:58:18 fir-md1-s1 kernel: CR2: 000000000124f178 CR3: 000000364fa10000 CR4: 00000000003407e0
Nov 05 20:58:18 fir-md1-s1 kernel: Call Trace:
Nov 05 20:58:18 fir-md1-s1 kernel:  [&amp;lt;ffffffffbef5f2cb&amp;gt;] queued_spin_lock_slowpath+0xb/0xf
Nov 05 20:58:18 fir-md1-s1 kernel:  [&amp;lt;ffffffffbef6d7a0&amp;gt;] _raw_spin_lock+0x20/0x30
Nov 05 20:58:18 fir-md1-s1 kernel:  [&amp;lt;ffffffffc13e2c07&amp;gt;] ldiskfs_es_lru_add+0x57/0x90 [ldiskfs]
Nov 05 20:58:18 fir-md1-s1 kernel:  [&amp;lt;ffffffffc13ad6a5&amp;gt;] ldiskfs_ext_map_blocks+0x7b5/0xf60 [ldiskfs]
Nov 05 20:58:18 fir-md1-s1 kernel:  [&amp;lt;ffffffffbe902372&amp;gt;] ? ktime_get_ts64+0x52/0xf0
Nov 05 20:58:18 fir-md1-s1 kernel:  [&amp;lt;ffffffffbe903612&amp;gt;] ? ktime_get+0x52/0xe0
Nov 05 20:58:18 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0bab14b&amp;gt;] ? kiblnd_post_tx_locked+0x7bb/0xa50 [ko2iblnd]
Nov 05 20:58:18 fir-md1-s1 kernel:  [&amp;lt;ffffffffc13e9728&amp;gt;] ldiskfs_map_blocks+0x98/0x700 [ldiskfs]
Nov 05 20:58:18 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0b40203&amp;gt;] ? cfs_hash_bd_lookup_intent+0x63/0x170 [libcfs]
Nov 05 20:58:18 fir-md1-s1 kernel:  [&amp;lt;ffffffffbe902372&amp;gt;] ? ktime_get_ts64+0x52/0xf0
Nov 05 20:58:18 fir-md1-s1 kernel:  [&amp;lt;ffffffffc14bab63&amp;gt;] osd_ldiskfs_map_inode_pages+0x143/0x420 [osd_ldiskfs]
Nov 05 20:58:18 fir-md1-s1 kernel:  [&amp;lt;ffffffffc14bc996&amp;gt;] osd_write_prep+0x2b6/0x360 [osd_ldiskfs]
Nov 05 20:58:18 fir-md1-s1 kernel:  [&amp;lt;ffffffffc1614c3b&amp;gt;] mdt_obd_preprw+0x65b/0x10a0 [mdt]
Nov 05 20:58:18 fir-md1-s1 kernel:  [&amp;lt;ffffffffc116d1bc&amp;gt;] tgt_brw_write+0xc7c/0x1cf0 [ptlrpc]
Nov 05 20:58:18 fir-md1-s1 kernel:  [&amp;lt;ffffffffbe8e59c8&amp;gt;] ? load_balance+0x178/0x9a0
Nov 05 20:58:18 fir-md1-s1 kernel:  [&amp;lt;ffffffffbe8e143c&amp;gt;] ? update_curr+0x14c/0x1e0
Nov 05 20:58:18 fir-md1-s1 kernel:  [&amp;lt;ffffffffbe8dca58&amp;gt;] ? __enqueue_entity+0x78/0x80
Nov 05 20:58:18 fir-md1-s1 kernel:  [&amp;lt;ffffffffbe8e367f&amp;gt;] ? enqueue_entity+0x2ef/0xbe0
Nov 05 20:58:18 fir-md1-s1 kernel:  [&amp;lt;ffffffffc1159a7d&amp;gt;] ? tgt_lookup_reply+0x2d/0x190 [ptlrpc]
Nov 05 20:58:18 fir-md1-s1 kernel:  [&amp;lt;ffffffffc116936a&amp;gt;] tgt_request_handle+0xaea/0x1580 [ptlrpc]
Nov 05 20:58:18 fir-md1-s1 kernel:  [&amp;lt;ffffffffc1144da1&amp;gt;] ? ptlrpc_nrs_req_get_nolock0+0xd1/0x170 [ptlrpc]
Nov 05 20:58:18 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0b34bde&amp;gt;] ? ktime_get_real_seconds+0xe/0x10 [libcfs]
Nov 05 20:58:18 fir-md1-s1 kernel:  [&amp;lt;ffffffffc111024b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
Nov 05 20:58:18 fir-md1-s1 kernel:  [&amp;lt;ffffffffc110b805&amp;gt;] ? ptlrpc_wait_event+0xa5/0x360 [ptlrpc]
Nov 05 20:58:18 fir-md1-s1 kernel:  [&amp;lt;ffffffffbe8cfeb4&amp;gt;] ? __wake_up+0x44/0x50
Nov 05 20:58:18 fir-md1-s1 kernel:  [&amp;lt;ffffffffc1113bac&amp;gt;] ptlrpc_main+0xb2c/0x1460 [ptlrpc]
Nov 05 20:58:18 fir-md1-s1 kernel:  [&amp;lt;ffffffffc1113080&amp;gt;] ? ptlrpc_register_service+0xf80/0xf80 [ptlrpc]
Nov 05 20:58:18 fir-md1-s1 kernel:  [&amp;lt;ffffffffbe8c2e81&amp;gt;] kthread+0xd1/0xe0
Nov 05 20:58:18 fir-md1-s1 kernel:  [&amp;lt;ffffffffbe8c2db0&amp;gt;] ? insert_kthread_work+0x40/0x40
Nov 05 20:58:18 fir-md1-s1 kernel:  [&amp;lt;ffffffffbef77c24&amp;gt;] ret_from_fork_nospec_begin+0xe/0x21
Nov 05 20:58:18 fir-md1-s1 kernel:  [&amp;lt;ffffffffbe8c2db0&amp;gt;] ? insert_kthread_work+0x40/0x40
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Attaching MDS logs as  &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/33779/33779_fir-md1-s1_20191105.log&quot; title=&quot;fir-md1-s1_20191105.log attached to LU-12935&quot;&gt;fir-md1-s1_20191105.log&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt; &lt;/p&gt;</comment>
                            <comment id="257811" author="tappro" created="Wed, 6 Nov 2019 06:14:53 +0000"  >&lt;p&gt;Stephane, yes, I was checking exactly that sysrq file but found no good candidates. As for DoM file efficiency, it is not about its size but how many processes are accessing it at the same time. I think with file DEFAULT.PKG many processes are trying to write to the file beginning so DOM region becomes a bottleneck and OST stripes are not so useful in that case, each process needs access to DOM region first each time? &lt;/p&gt;</comment>
                            <comment id="257813" author="sthiell" created="Wed, 6 Nov 2019 06:29:38 +0000"  >&lt;p&gt;Ah yes, I see, thanks. And I just found 250+ files like these that may be used by these aero-f jobs:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;-rw-r--r-- 1 jbho cfarhat  145736 Nov  5 21:27 /scratch/users/jbho/aerof_simulations/maewing/DEFAULT.D2W115
-rw-r--r-- 1 jbho cfarhat  149248 Nov  5 21:27 /scratch/users/jbho/aerof_simulations/maewing/DEFAULT.D2W116
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;So they are small files that use the full 128KB DoM region, but they are all on a single MDT, and accessed by 4 x 240 tasks (potentially).&lt;/p&gt;

&lt;p&gt;I&apos;ll make changes so that these won&apos;t be using DoM anymore.&lt;/p&gt;</comment>
                            <comment id="257814" author="tappro" created="Wed, 6 Nov 2019 06:44:03 +0000"  >&lt;p&gt;Stephane, there is one ticket for DoM improvements which can improve such access patterns, &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10664&quot; title=&quot;DoM: make DoM lock enqueue non-blocking&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10664&quot;&gt;&lt;del&gt;LU-10664&lt;/del&gt;&lt;/a&gt;, but it has no patch at the moment. &lt;/p&gt;</comment>
                            <comment id="257840" author="sthiell" created="Wed, 6 Nov 2019 18:06:49 +0000"  >&lt;p&gt;Thanks Mike. We&apos;re currently discussing changing our default striping to avoid further issues like these and perhaps only use DoM in specific cases.&lt;/p&gt;

&lt;p&gt;FYI, yesterday, after I killed the suspected jobs, I still had to do a stop/start of MDT0 to resume filesystem operations (it didn&apos;t recover by itself).&lt;/p&gt;</comment>
                            <comment id="258029" author="sthiell" created="Fri, 8 Nov 2019 19:52:09 +0000"  >&lt;p&gt;To avoid further issues for now, we have removed the default DoM striping from all directories on this filesystem (only kept a PFL striping). New files won&apos;t use DoM anymore. We&apos;ll see if that helps.&lt;/p&gt;</comment>
                            <comment id="258869" author="adilger" created="Tue, 26 Nov 2019 23:59:35 +0000"  >&lt;p&gt;Stephane, it would make sense to get an strace (or equivalent Lustre &quot;{{lctl set_param debug=+vfstrace +dlmtrace}}&quot;) from these jobs to see just how many times they write to the same file.&lt;/p&gt;

&lt;p&gt;Mike, since it is possible to migrate DoM components to OSTs (either with full-file copy in 2.12 or via FLR mirror in 2.13 patch &lt;a href=&quot;https://review.whamcloud.com/35359&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/35359&lt;/a&gt; &quot;&lt;tt&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11421&quot; title=&quot;DoM: manual migration OST-MDT, MDT-MDT&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11421&quot;&gt;&lt;del&gt;LU-11421&lt;/del&gt;&lt;/a&gt; dom: manual OST-to-DOM migration via mirroring&lt;/tt&gt;&quot;), have you thought about automatically migrating files with high write lock contention from DoM to a regular OST object?  Since the amount of data to be moved is very small (under 150KB in this case), the migration should be very fast, and it would allow extent locks to be used on the file.&lt;/p&gt;

&lt;p&gt;That said, I have no idea how hard this would be, and only makes sense if there are multiple writers repeatedly contending on the same DoM file component (which I suspect is rare in most cases).  Even here, it may be that if the clients are only writing to the same file a handful of times that the extra migration step would make the performance worse rather than better. If they write to the same file hundreds of times then it might be worthwhile to implement.&lt;/p&gt;

&lt;p&gt;Even in IO-500 &lt;tt&gt;ior-hard-write&lt;/tt&gt; the chunk size is 47008 bytes, so at most 2-3 ranks would be contending on a 64KB or 128KB DoM component, and we never had problems with this in our testing.&lt;/p&gt;</comment>
                            <comment id="258886" author="gerrit" created="Wed, 27 Nov 2019 05:13:04 +0000"  >&lt;p&gt;Andreas Dilger (adilger@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/36881&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/36881&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12935&quot; title=&quot;MDT deadlock on 2.12.3 with DoM; is it missing async_discard feature?&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12935&quot;&gt;&lt;del&gt;LU-12935&lt;/del&gt;&lt;/a&gt; obdclass: fix import connect flag printing&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: b2afbb33175af02608d9d4127370dc874542148b&lt;/p&gt;</comment>
                            <comment id="259774" author="gerrit" created="Thu, 12 Dec 2019 23:06:30 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/36881/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/36881/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12935&quot; title=&quot;MDT deadlock on 2.12.3 with DoM; is it missing async_discard feature?&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12935&quot;&gt;&lt;del&gt;LU-12935&lt;/del&gt;&lt;/a&gt; obdclass: fix import connect flag printing&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: b66efa7d500f94d755f318d43804f0f3eb883835&lt;/p&gt;</comment>
                            <comment id="259779" author="pjones" created="Thu, 12 Dec 2019 23:35:11 +0000"  >&lt;p&gt;Landed for 2.12.4 - does not affect master&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="53400">LU-11421</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="50817">LU-10664</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                                        </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="33771" name="crash-sysrq-fir-md1-s1-foreach-bt.log" size="1420586" author="sthiell" created="Tue, 5 Nov 2019 04:13:46 +0000"/>
                            <attachment id="33769" name="fir-md1-s1-MDT0.log" size="443824" author="sthiell" created="Tue, 5 Nov 2019 02:42:30 +0000"/>
                            <attachment id="33770" name="fir-md1-s1-dk.log.gz" size="3461438" author="sthiell" created="Tue, 5 Nov 2019 03:32:13 +0000"/>
                            <attachment id="33774" name="fir-md1-s1-dk2.log.gz" size="404227" author="sthiell" created="Tue, 5 Nov 2019 05:11:38 +0000"/>
                            <attachment id="33779" name="fir-md1-s1_20191105.log" size="775714" author="sthiell" created="Wed, 6 Nov 2019 05:57:16 +0000"/>
                            <attachment id="33773" name="fir-md1-s1_lfsck-results.log" size="3326" author="sthiell" created="Tue, 5 Nov 2019 04:42:05 +0000"/>
                            <attachment id="33772" name="fir-rbh01.dk.log" size="666577" author="sthiell" created="Tue, 5 Nov 2019 04:23:28 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00ozj:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10020"><![CDATA[1]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>