<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:48:29 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-11964] Heavy load and soft lockups on MDS with DOM</title>
                <link>https://jira.whamcloud.com/browse/LU-11964</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;On Fir, our new scratch for Sherlock running 2.12, the filesystem hung today and then operations resumed. Fir is using DNE, PFL and DOM. The MDS hosting mdt&lt;span class=&quot;error&quot;&gt;&amp;#91;0,2&amp;#93;&lt;/span&gt; seems to have been under a heavy load. For example:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[481509.742936] Call Trace:
[481509.745520]  [&amp;lt;ffffffffc0f73303&amp;gt;] ldlm_process_inodebits_lock+0x93/0x3d0 [ptlrpc]
[481509.753123]  [&amp;lt;ffffffffc0f61fc0&amp;gt;] ? ldlm_export_lock_object+0x10/0x10 [ptlrpc]
[481509.760464]  [&amp;lt;ffffffffc0f73270&amp;gt;] ? ldlm_inodebits_compat_queue+0x440/0x440 [ptlrpc]
[481509.768321]  [&amp;lt;ffffffffc0f4160e&amp;gt;] ldlm_reprocess_queue+0x1be/0x3f0 [ptlrpc]
[481509.775403]  [&amp;lt;ffffffffc0f426cb&amp;gt;] __ldlm_reprocess_all+0x10b/0x380 [ptlrpc]
[481509.782482]  [&amp;lt;ffffffffc0f434a2&amp;gt;] ldlm_cancel_lock_for_export.isra.26+0x1c2/0x390 [ptlrpc]
[481509.790863]  [&amp;lt;ffffffffc0f43851&amp;gt;] ldlm_export_cancel_blocked_locks+0x121/0x200 [ptlrpc]
[481509.798987]  [&amp;lt;ffffffffc0f6c6e2&amp;gt;] ldlm_bl_thread_main+0x112/0x700 [ptlrpc]
[481509.805950]  [&amp;lt;ffffffff9d6d67b0&amp;gt;] ? wake_up_state+0x20/0x20
[481509.811650]  [&amp;lt;ffffffffc0f6c5d0&amp;gt;] ? ldlm_handle_bl_callback+0x530/0x530 [ptlrpc]
[481509.819130]  [&amp;lt;ffffffff9d6c1c31&amp;gt;] kthread+0xd1/0xe0
[481509.824095]  [&amp;lt;ffffffff9d6c1b60&amp;gt;] ? insert_kthread_work+0x40/0x40
[481509.830277]  [&amp;lt;ffffffff9dd74c24&amp;gt;] ret_from_fork_nospec_begin+0xe/0x21
[481509.836799]  [&amp;lt;ffffffff9d6c1b60&amp;gt;] ? insert_kthread_work+0x40/0x40
[481509.842979] Code: 74 0e 4c 89 f2 48 89 de 4c 89 ff e8 d3 a2 fc ff 49 8b 87 10 02 00 00 4c 8d a8 f0 fd ff ff 4d 39 fd 74 2b 49 83 bd a8 00 00 00 00 &amp;lt;74&amp;gt; 0e 4c 89 f2 48 89 de 4c 89 ef e8 a8 a2 fc ff 4d 8b ad 10 02 
[481516.050304] INFO: rcu_sched self-detected stall on CPU
[481516.051302] INFO: rcu_sched self-detected stall on CPU
[481516.051303]  {
[481516.051303]  15
[481516.051304] } 
[481516.051314]  (t=60001 jiffies g=115591743 c=115591742 q=6728165)
[481516.051315] Task dump for CPU 14:
[481516.051316] ldlm_bl_01      R
[481516.051316]   running task    
[481516.051318]     0 54053      2 0x00000088
[481516.051318] Call Trace:
[481516.051362]  [&amp;lt;ffffffffc0f43851&amp;gt;] ? ldlm_export_cancel_blocked_locks+0x121/0x200 [ptlrpc]
[481516.051394]  [&amp;lt;ffffffffc0f6c6e2&amp;gt;] ? ldlm_bl_thread_main+0x112/0x700 [ptlrpc]
[481516.051397]  [&amp;lt;ffffffff9d6d67b0&amp;gt;] ? wake_up_state+0x20/0x20
[481516.051426]  [&amp;lt;ffffffffc0f6c5d0&amp;gt;] ? ldlm_handle_bl_callback+0x530/0x530 [ptlrpc]
[481516.051427]  [&amp;lt;ffffffff9d6c1c31&amp;gt;] ? kthread+0xd1/0xe0
[481516.051429]  [&amp;lt;ffffffff9d6c1b60&amp;gt;] ? insert_kthread_work+0x40/0x40
[481516.051431]  [&amp;lt;ffffffff9dd74c24&amp;gt;] ? ret_from_fork_nospec_begin+0xe/0x21
[481516.051432]  [&amp;lt;ffffffff9d6c1b60&amp;gt;] ? insert_kthread_work+0x40/0x40
[481516.051432] Task dump for CPU 15:
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;filesystem is still running but some threads are at 100% on the MDS&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;   PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM     TIME+ COMMAND                                                                                                                                      
 54053 root      20   0       0      0      0 R 100.0  0.0   2:02.22 ldlm_bl_01                                                                                                                                   
 55962 root      20   0       0      0      0 R 100.0  0.0   2:01.79 ldlm_bl_03                                                                                                                                   
 56043 root      20   0       0      0      0 R 100.0  0.0   3:20.41 ldlm_bl_04                                                                                                                                   
 63820 root      20   0       0      0      0 R 100.0  0.0  10:13.65 mdt03_033  
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@fir-md1-s1 ~]# lctl get_param ldlm.*.*mdt*.*
ldlm.namespaces.mdt-fir-MDT0000_UUID.contended_locks=32
ldlm.namespaces.mdt-fir-MDT0000_UUID.contention_seconds=2
ldlm.namespaces.mdt-fir-MDT0000_UUID.ctime_age_limit=10
ldlm.namespaces.mdt-fir-MDT0000_UUID.dirty_age_limit=10
ldlm.namespaces.mdt-fir-MDT0000_UUID.early_lock_cancel=0
ldlm.namespaces.mdt-fir-MDT0000_UUID.lock_count=146645
ldlm.namespaces.mdt-fir-MDT0000_UUID.lock_timeouts=8
ldlm.namespaces.mdt-fir-MDT0000_UUID.lock_unused_count=0
ldlm.namespaces.mdt-fir-MDT0000_UUID.lru_max_age=3900000
ldlm.namespaces.mdt-fir-MDT0000_UUID.lru_size=4800
ldlm.namespaces.mdt-fir-MDT0000_UUID.max_nolock_bytes=0
ldlm.namespaces.mdt-fir-MDT0000_UUID.max_parallel_ast=1024
ldlm.namespaces.mdt-fir-MDT0000_UUID.resource_count=62361
ldlm.namespaces.mdt-fir-MDT0002_UUID.contended_locks=32
ldlm.namespaces.mdt-fir-MDT0002_UUID.contention_seconds=2
ldlm.namespaces.mdt-fir-MDT0002_UUID.ctime_age_limit=10
ldlm.namespaces.mdt-fir-MDT0002_UUID.dirty_age_limit=10
ldlm.namespaces.mdt-fir-MDT0002_UUID.early_lock_cancel=0
ldlm.namespaces.mdt-fir-MDT0002_UUID.lock_count=6472231
ldlm.namespaces.mdt-fir-MDT0002_UUID.lock_timeouts=166760
ldlm.namespaces.mdt-fir-MDT0002_UUID.lock_unused_count=0
ldlm.namespaces.mdt-fir-MDT0002_UUID.lru_max_age=3900000
ldlm.namespaces.mdt-fir-MDT0002_UUID.lru_size=4800
ldlm.namespaces.mdt-fir-MDT0002_UUID.max_nolock_bytes=0
ldlm.namespaces.mdt-fir-MDT0002_UUID.max_parallel_ast=1024
ldlm.namespaces.mdt-fir-MDT0002_UUID.resource_count=359599
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;MDS resumed operations by itself, the last logs were:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[482208.719700] LustreError: 63820:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1549999197, 300s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0002_UUID lock: ffff8995f523f740/0x1a4b7ac741771fee lrc: 3/0,1 mode: --/PW res: [0x2c00033c7:0x2b6e:0x0].0x0 bits 0x40/0x0 rrc: 39993 type: IBT flags: 0x40010080000000 nid: local remote: 0x0 expref: -99 pid: 63820 timeout: 0 lvb_type: 0
[482361.725697] LNet: Service thread pid 63820 completed after 1225.53s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources).
[482666.099950] Lustre: fir-MDT0002: Client 0ccdc4e2-9749-c9a5-afb4-85874ce74d6c (at 10.0.10.3@o2ib7) reconnecting
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;I&apos;m attaching the full syslog output.&lt;/p&gt;

&lt;p&gt;Now everything looks good&#160;even though this MDS is still quite loaded in terms of CPU.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
 Stephane&lt;/p&gt;</description>
                <environment>CentOS 7.6, Linux Kernel 3.10.0-957.1.3.el7_lustre.x86_64, Lustre 2.12 clients (Sherlock cluster)</environment>
        <key id="54869">LU-11964</key>
            <summary>Heavy load and soft lockups on MDS with DOM</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="tappro">Mikhail Pershin</assignee>
                                    <reporter username="sthiell">Stephane Thiell</reporter>
                        <labels>
                    </labels>
                <created>Tue, 12 Feb 2019 20:15:24 +0000</created>
                <updated>Mon, 6 May 2019 18:57:08 +0000</updated>
                            <resolved>Thu, 21 Mar 2019 04:49:38 +0000</resolved>
                                    <version>Lustre 2.12.0</version>
                                    <fixVersion>Lustre 2.13.0</fixVersion>
                    <fixVersion>Lustre 2.12.1</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>8</watches>
                                                                            <comments>
                            <comment id="241800" author="pfarrell" created="Tue, 12 Feb 2019 20:18:56 +0000"  >&lt;p&gt;Given that the system is back operational, I don&apos;t have any specific thoughts on this one, but I wanted to point out that at least the log in the highlighted error message is a DOM lock:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[482208.719700] LustreError: 63820:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1549999197, 300s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0002_UUID lock: ffff8995f523f740/0x1a4b7ac741771fee lrc: 3/0,1 mode: --/PW res: [0x2c00033c7:0x2b6e:0x0].0x0 bits 0x40/0x0 rrc: 39993 type: IBT flags: 0x40010080000000 nid: local remote: 0x0 expref: -99 pid: 63820 timeout: 0 lvb_type: 0

#define MDS_INODELOCK_DOM 0x000040 / Data for data-on-mdt files /
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="241859" author="pjones" created="Wed, 13 Feb 2019 13:11:04 +0000"  >&lt;p&gt;Mike is investigating&lt;/p&gt;</comment>
                            <comment id="241881" author="sthiell" created="Wed, 13 Feb 2019 17:29:32 +0000"  >&lt;p&gt;The filesystem has worked okay all night, load on MDS was &amp;lt; 1 most of the time but this morning the issue is starting again and Fir just became unavailable, ls hangs on all clients.&lt;/p&gt;

&lt;p&gt;Attaching recent syslog fir-md1-s1-20190213.log&lt;/p&gt;

&lt;p&gt;Stephane&lt;br/&gt;
&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="241882" author="sthiell" created="Wed, 13 Feb 2019 17:30:15 +0000"  >&lt;p&gt;Some more MDS logs:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[63732.498926] LustreError: 21799:0:(ldlm_request.c:147:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1550078451, 300s ago), entering recovery for fir-MDT0000_UUID@10.0.10.51@o2ib7 ns: fir-MDT0000-osp-MDT0002 lock: ffff9cd383330fc0/0x6bcd99f6a98cc9dd lrc: 4/0,1 mode: --/EX res: [0x200000004:0x1:0x0].0x0 bits 0x2/0x0 rrc: 8 type: IBT flags: 0x1000001000000 nid: local remote: 0x6bcd99f6a98cc9e4 expref: -99 pid: 21799 timeout: 0 lvb_type: 0
[63732.539003] Lustre: fir-MDT0000: Received new LWP connection from 0@lo, removing former export from same NID
[63732.548833] Lustre: Skipped 54 previous similar messages
[63743.973928] LustreError: 21793:0:(ldlm_request.c:147:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1550078462, 300s ago), entering recovery for fir-MDT0000_UUID@10.0.10.51@o2ib7 ns: fir-MDT0000-osp-MDT0002 lock: ffff9cb3fb39af40/0x6bcd99f6a98e41ad lrc: 4/0,1 mode: --/EX res: [0x200000004:0x1:0x0].0x0 bits 0x2/0x0 rrc: 8 type: IBT flags: 0x1000001000000 nid: local remote: 0x6bcd99f6a98e41b4 expref: -99 pid: 21793 timeout: 0 lvb_type: 0
[63763.594393] LustreError: 20534:0:(ldlm_request.c:147:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1550078482, 300s ago), entering recovery for fir-MDT0000_UUID@10.0.10.51@o2ib7 ns: fir-MDT0000-osp-MDT0002 lock: ffff9cd4a1032640/0x6bcd99f6a990a394 lrc: 4/0,1 mode: --/EX res: [0x200000004:0x1:0x0].0x0 bits 0x2/0x0 rrc: 8 type: IBT flags: 0x1000001000000 nid: local remote: 0x6bcd99f6a990a39b expref: -99 pid: 20534 timeout: 0 lvb_type: 0
[63775.626656] LustreError: 20525:0:(ldlm_request.c:147:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1550078494, 300s ago), entering recovery for fir-MDT0000_UUID@10.0.10.51@o2ib7 ns: fir-MDT0000-osp-MDT0002 lock: ffff9c9eb2fae780/0x6bcd99f6a992073d lrc: 4/0,1 mode: --/EX res: [0x200000004:0x1:0x0].0x0 bits 0x2/0x0 rrc: 8 type: IBT flags: 0x1000001000000 nid: local remote: 0x6bcd99f6a9920744 expref: -99 pid: 20525 timeout: 0 lvb_type: 0
[63782.188811] LustreError: 21356:0:(ldlm_request.c:147:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1550078501, 300s ago), entering recovery for fir-MDT0000_UUID@10.0.10.51@o2ib7 ns: fir-MDT0000-osp-MDT0002 lock: ffff9cc441acf500/0x6bcd99f6a992bcb8 lrc: 4/0,1 mode: --/EX res: [0x200000004:0x1:0x0].0x0 bits 0x2/0x0 rrc: 8 type: IBT flags: 0x1000001000000 nid: local remote: 0x6bcd99f6a992bcbf expref: -99 pid: 21356 timeout: 0 lvb_type: 0
[63835.457912] LNet: Service thread pid 21733 completed after 1359.93s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources).
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="241883" author="sthiell" created="Wed, 13 Feb 2019 17:34:59 +0000"  >&lt;p&gt;MDS completely hung. It looks like this task ldlm_bl_04 is stuck (100% cpu):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Feb 13 09:34:47 fir-md1-s1 kernel: ldlm_bl_04      R  running task        0 22866      2 0x00000080
Feb 13 09:34:47 fir-md1-s1 kernel: Call Trace:
Feb 13 09:34:47 fir-md1-s1 kernel:  [&amp;lt;ffffffffc121c4a2&amp;gt;] ? ldlm_cancel_lock_for_export.isra.26+0x1c2/0x390 [ptlrpc]
Feb 13 09:34:47 fir-md1-s1 kernel:  [&amp;lt;ffffffffc121c6af&amp;gt;] ? ldlm_cancel_locks_for_export_cb+0x3f/0x50 [ptlrpc]
Feb 13 09:34:47 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0c8bf20&amp;gt;] ? cfs_hash_for_each_relax+0x250/0x450 [libcfs]
Feb 13 09:34:47 fir-md1-s1 kernel:  [&amp;lt;ffffffffc121c670&amp;gt;] ? ldlm_cancel_lock_for_export.isra.26+0x390/0x390 [ptlrpc]
Feb 13 09:34:47 fir-md1-s1 kernel:  [&amp;lt;ffffffffc121c670&amp;gt;] ? ldlm_cancel_lock_for_export.isra.26+0x390/0x390 [ptlrpc]
Feb 13 09:34:47 fir-md1-s1 kernel:  [&amp;lt;ffffffffc0c8f480&amp;gt;] ? cfs_hash_for_each_empty+0x80/0x1d0 [libcfs]
Feb 13 09:34:47 fir-md1-s1 kernel:  [&amp;lt;ffffffffc121c9da&amp;gt;] ? ldlm_export_cancel_locks+0xaa/0x180 [ptlrpc]
Feb 13 09:34:47 fir-md1-s1 kernel:  [&amp;lt;ffffffffc1245a98&amp;gt;] ? ldlm_bl_thread_main+0x4c8/0x700 [ptlrpc]
Feb 13 09:34:47 fir-md1-s1 kernel:  [&amp;lt;ffffffff8e8d67b0&amp;gt;] ? wake_up_state+0x20/0x20
Feb 13 09:34:47 fir-md1-s1 kernel:  [&amp;lt;ffffffffc12455d0&amp;gt;] ? ldlm_handle_bl_callback+0x530/0x530 [ptlrpc]
Feb 13 09:34:47 fir-md1-s1 kernel:  [&amp;lt;ffffffff8e8c1c31&amp;gt;] ? kthread+0xd1/0xe0
Feb 13 09:34:47 fir-md1-s1 kernel:  [&amp;lt;ffffffff8e8c1b60&amp;gt;] ? insert_kthread_work+0x40/0x40
Feb 13 09:34:47 fir-md1-s1 kernel:  [&amp;lt;ffffffff8ef74c24&amp;gt;] ? ret_from_fork_nospec_begin+0xe/0x21
Feb 13 09:34:47 fir-md1-s1 kernel:  [&amp;lt;ffffffff8e8c1b60&amp;gt;] ? insert_kthread_work+0x40/0x40
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="241884" author="sthiell" created="Wed, 13 Feb 2019 17:36:29 +0000"  >&lt;p&gt;More info from sysrq trigger:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[64313.592483] runnable tasks:
[64313.595284]             task   PID         tree-key  switches  prio     wait-time             sum-exec        sum-sleep
[64313.606057] ----------------------------------------------------------------------------------------------------------
[64313.616842]      watchdog/47   246       -11.986915     16075     0         0.000000     95639.639642         0.000000 3 /
[64313.628000]     migration/47   247         0.000000       913     0         0.000000     83228.989645         0.000000 3 /
[64313.639163]     ksoftirqd/47   248   4251760.878673    114814   120         0.000000      1203.050164         0.000000 3 /
[64313.650327]    kworker/47:0H   250      8659.359946        10   100         0.000000         0.036600         0.000000 3 /
[64313.661503]     kworker/47:1  7283   4251787.201879     30064   120         0.000000       549.488018         0.000000 3 /
[64313.672688]    kworker/47:1H 19995   4251760.757048     82621   100         0.000000       213.505028         0.000000 3 /
[64313.683874]   mdt_rdpg03_000 20543   4064574.657540   3191348   120         0.000000    112475.429302         0.000000 3 /
[64313.695046]      osp-pre-1-0 20674   4251787.449629     12736   120         0.000000       157.173744         0.000000 3 /
[64313.706214]      osp-pre-9-0 20698   4251788.079886     13694   120         0.000000       233.975089         0.000000 3 /
[64313.717374]     osp-syn-19-0 20703   4251760.054468      5006   120         0.000000       110.608448         0.000000 3 /
[64313.728537]     osp-pre-25-0 20708   4251783.612212     13624   120         0.000000       211.588054         0.000000 3 /
[64313.739702]     osp-pre-45-0 20738   4251784.278457     13594   120         0.000000       214.823662         0.000000 3 /
[64313.750925]     osp-pre-15-0 20748   4251782.601412     13234   120         0.000000       207.394803         0.000000 3 /
[64313.762089]     osp-syn-34-0 20771   4251728.575052      4825   120         0.000000       106.819346         0.000000 3 /
[64313.773255]     osp-syn-46-2 20877   4251171.886625     44052   120         0.000000       980.511629         0.000000 3 /
[64313.784412]     osp-pre-14-2 20884   4251786.767116     17270   120         0.000000       357.818315         0.000000 3 /
[64313.795575]     osp-pre-43-2 20896   4251787.273264     17260   120         0.000000       315.967873         0.000000 3 /
[64313.806739]      osp-pre-4-2 20920   4251783.607955     16036   120         0.000000       316.253987         0.000000 3 /
[64313.817902]     osp-pre-20-2 20926   4251782.616211     16995   120         0.000000       313.137617         0.000000 3 /
[64313.829063]      osp-syn-7-2 20927   4251196.855904     42988   120         0.000000       924.183430         0.000000 3 /
[64313.840228]     osp-pre-45-2 20932   4251785.956673     16630   120         0.000000      1336.366368         0.000000 3 /
[64313.851391]     osp-pre-37-2 20954   4251787.782090     17270   120         0.000000       378.525667         0.000000 3 /
[64313.862553]     osp-pre-28-2 20962   4251785.503143     17263   120         0.000000       335.792043         0.000000 3 /
[64313.873722]    ldlm_cn03_005 21312   4251550.738085   1814427   120         0.000000     47099.867196         0.000000 3 /
[64313.884914]     mdt_io03_006 21315    116408.475938        63   120         0.000000         2.260830         0.000000 3 /
[64313.896124]     mdt_io03_010 21354    116408.606556        29   120         0.000000         1.139542         0.000000 3 /
[64313.907305]     mdt_io03_011 21488    116408.512297        12   120         0.000000         0.708143         0.000000 3 /
[64313.918588]     mdt_io03_018 21514    116408.461231        42   120         0.000000         1.479570         0.000000 3 /
[64313.929817]     mdt_io03_020 21521    116408.450269        16   120         0.000000         0.744343         0.000000 3 /
[64313.940987]     mdt_io03_036 21600    116408.446933        19   120         0.000000         0.470144         0.000000 3 /
[64313.952152]     mdt_io03_037 21605    116408.465779        21   120         0.000000         0.927188         0.000000 3 /
[64313.963318]     mdt_io03_044 21627    116408.472372        24   120         0.000000         0.748221         0.000000 3 /
[64313.974484]     mdt_io03_064 21687    116408.452994    128412   120         0.000000      3456.987955         0.000000 3 /
[64313.985672]     mdt_io03_068 21873    116408.480647        15   120         0.000000         0.374268         0.000000 3 /
[64313.996848]     mdt_io03_072 21957    116408.403701        17   120         0.000000         0.551459         0.000000 3 /
[64314.008012]     mdt_io03_078 22000    116050.478057         5   120         0.000000         0.243903         0.000000 3 /
[64314.019182]    ldlm_cn03_016 23003   4251550.732314    713873   120         0.000000     38845.360323         0.000000 3 /
[64314.030342]    ldlm_cn03_018 23710   4251550.766949    873662   120         0.000000     32322.127535         0.000000 3 /
[64314.041507]    ldlm_cn03_022 23747   4111097.485098    351440   120         0.000000     22733.206703         0.000000 3 /
[64314.052693]     kworker/47:0 30277   1963575.007667         6   120         0.000000         0.035638         0.000000 3 /
[64314.064075]    ldlm_cn03_024 38398   4111097.485529        71   120         0.000000      1576.538901         0.000000 3 /
[64314.075392]      ll_mgs_0022 38442   4067071.551411       165   120         0.000000        30.771242         0.000000 3 /
[64314.086676]             sshd 39058   4251787.901819       316   120         0.000000        18.184093         0.000000 3 /

[64314.100068] INFO: rcu_sched detected stalls on CPUs/tasks: { 19 23} (detected by 23, t=6 jiffies, g=5418270, c=5418269, q=717307)
[64314.111847] Task dump for CPU 19:
[64314.115166] ldlm_bl_04      R  running task        0 22866      2 0x00000080
[64314.122263] Call Trace:
[64314.124757]  [&amp;lt;ffffffffc121c4a2&amp;gt;] ? ldlm_cancel_lock_for_export.isra.26+0x1c2/0x390 [ptlrpc]
[64314.133220]  [&amp;lt;ffffffffc121c6af&amp;gt;] ? ldlm_cancel_locks_for_export_cb+0x3f/0x50 [ptlrpc]
[64314.141140]  [&amp;lt;ffffffffc0c8bf20&amp;gt;] ? cfs_hash_for_each_relax+0x250/0x450 [libcfs]
[64314.148559]  [&amp;lt;ffffffffc121c670&amp;gt;] ? ldlm_cancel_lock_for_export.isra.26+0x390/0x390 [ptlrpc]
[64314.157019]  [&amp;lt;ffffffffc121c670&amp;gt;] ? ldlm_cancel_lock_for_export.isra.26+0x390/0x390 [ptlrpc]
[64314.165457]  [&amp;lt;ffffffffc0c8f480&amp;gt;] ? cfs_hash_for_each_empty+0x80/0x1d0 [libcfs]
[64314.172792]  [&amp;lt;ffffffffc121c9da&amp;gt;] ? ldlm_export_cancel_locks+0xaa/0x180 [ptlrpc]
[64314.180215]  [&amp;lt;ffffffffc1245a98&amp;gt;] ? ldlm_bl_thread_main+0x4c8/0x700 [ptlrpc]
[64314.187261]  [&amp;lt;ffffffff8e8d67b0&amp;gt;] ? wake_up_state+0x20/0x20
[64314.192857]  [&amp;lt;ffffffffc12455d0&amp;gt;] ? ldlm_handle_bl_callback+0x530/0x530 [ptlrpc]
[64314.200251]  [&amp;lt;ffffffff8e8c1c31&amp;gt;] ? kthread+0xd1/0xe0
[64314.205303]  [&amp;lt;ffffffff8e8c1b60&amp;gt;] ? insert_kthread_work+0x40/0x40
[64314.211398]  [&amp;lt;ffffffff8ef74c24&amp;gt;] ? ret_from_fork_nospec_begin+0xe/0x21
[64314.218085]  [&amp;lt;ffffffff8e8c1b60&amp;gt;] ? insert_kthread_work+0x40/0x40
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="241885" author="pfarrell" created="Wed, 13 Feb 2019 17:37:58 +0000"  >&lt;p&gt;Stephane,&lt;/p&gt;

&lt;p&gt;Let&apos;s get those MDT lock stats:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt; lctl get_param ldlm.*.*mdt*.*&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="241886" author="sthiell" created="Wed, 13 Feb 2019 17:40:32 +0000"  >&lt;p&gt;We started to fail over MDT0002 to the other server fir-md1-s2, it is currently in progress. But this is the output:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@fir-md1-s1 ~]# lctl get_param ldlm.*.*mdt*.*
ldlm.namespaces.mdt-fir-MDT0000_UUID.contended_locks=32
ldlm.namespaces.mdt-fir-MDT0000_UUID.contention_seconds=2
ldlm.namespaces.mdt-fir-MDT0000_UUID.ctime_age_limit=10
ldlm.namespaces.mdt-fir-MDT0000_UUID.dirty_age_limit=10
ldlm.namespaces.mdt-fir-MDT0000_UUID.early_lock_cancel=0
ldlm.namespaces.mdt-fir-MDT0000_UUID.lock_count=124320
ldlm.namespaces.mdt-fir-MDT0000_UUID.lock_timeouts=3
ldlm.namespaces.mdt-fir-MDT0000_UUID.lock_unused_count=0
ldlm.namespaces.mdt-fir-MDT0000_UUID.lru_max_age=3900000
ldlm.namespaces.mdt-fir-MDT0000_UUID.lru_size=4800
ldlm.namespaces.mdt-fir-MDT0000_UUID.max_nolock_bytes=0
ldlm.namespaces.mdt-fir-MDT0000_UUID.max_parallel_ast=1024
ldlm.namespaces.mdt-fir-MDT0000_UUID.resource_count=70472
ldlm.namespaces.mdt-fir-MDT0002_UUID.contended_locks=32
ldlm.namespaces.mdt-fir-MDT0002_UUID.contention_seconds=2
ldlm.namespaces.mdt-fir-MDT0002_UUID.ctime_age_limit=10
ldlm.namespaces.mdt-fir-MDT0002_UUID.dirty_age_limit=10
ldlm.namespaces.mdt-fir-MDT0002_UUID.early_lock_cancel=0
ldlm.namespaces.mdt-fir-MDT0002_UUID.lock_count=1725369
ldlm.namespaces.mdt-fir-MDT0002_UUID.lock_timeouts=1732375
ldlm.namespaces.mdt-fir-MDT0002_UUID.lock_unused_count=0
ldlm.namespaces.mdt-fir-MDT0002_UUID.lru_max_age=3900000
ldlm.namespaces.mdt-fir-MDT0002_UUID.lru_size=4800
ldlm.namespaces.mdt-fir-MDT0002_UUID.max_nolock_bytes=0
ldlm.namespaces.mdt-fir-MDT0002_UUID.max_parallel_ast=1024
ldlm.namespaces.mdt-fir-MDT0002_UUID.resource_count=77
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="241887" author="pfarrell" created="Wed, 13 Feb 2019 17:42:07 +0000"  >&lt;p&gt;Could you also turn on some debug for just a moment, then disable it?&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lctl clear; lctl set_param debug=+dlmtrace; lctl set_param debug=+info sleep 0.2; lctl set_param debug=-dlmtrace; lctl set_param debug=-info&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Then dump the logs.&lt;/p&gt;



&lt;p&gt;That should tell us a lot about what the system is spinning on.&lt;/p&gt;</comment>
                            <comment id="241888" author="pfarrell" created="Wed, 13 Feb 2019 17:43:19 +0000"  >&lt;p&gt;Hm.&#160; If you&apos;ve got the sort of problem I suspect, which is &quot;too many locks clogging up the machinery&quot;, failover may just kill your other MDT as it tries to bring them up.&lt;/p&gt;

&lt;p&gt;We&apos;ll see!&lt;/p&gt;</comment>
                            <comment id="241889" author="sthiell" created="Wed, 13 Feb 2019 17:47:27 +0000"  >&lt;p&gt;Thanks Patrick! Attached fir-md1-s1-dk.log but please note that MDT0002 is still trying tu unmount:&lt;/p&gt;

&lt;p&gt;syslog:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[64634.131555] Pid: 21813, comm: mdt01_059 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018
[64634.141297] Call Trace:
[64634.143768]  [&amp;lt;ffffffffc12340bd&amp;gt;] ldlm_completion_ast+0x63d/0x920 [ptlrpc]
[64634.150714]  [&amp;lt;ffffffffc1234dcc&amp;gt;] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc]
[64634.157899]  [&amp;lt;ffffffffc18e63d8&amp;gt;] mdt_object_local_lock+0x438/0xb20 [mdt]
[64634.164730]  [&amp;lt;ffffffffc18e6b30&amp;gt;] mdt_object_lock_internal+0x70/0x3e0 [mdt]
[64634.171730]  [&amp;lt;ffffffffc18e6efc&amp;gt;] mdt_reint_object_lock+0x2c/0x60 [mdt]
[64634.178388]  [&amp;lt;ffffffffc18fa2b9&amp;gt;] mdt_object_lock_save+0x29/0x50 [mdt]
[64634.184958]  [&amp;lt;ffffffffc18fb0f6&amp;gt;] mdt_reint_rename_internal.isra.46+0xa06/0x2760 [mdt]
[64634.192931]  [&amp;lt;ffffffffc1904e5b&amp;gt;] mdt_reint_rename_or_migrate.isra.51+0x19b/0x860 [mdt]
[64634.200984]  [&amp;lt;ffffffffc1905553&amp;gt;] mdt_reint_rename+0x13/0x20 [mdt]
[64634.207322]  [&amp;lt;ffffffffc19055e3&amp;gt;] mdt_reint_rec+0x83/0x210 [mdt]
[64634.213402]  [&amp;lt;ffffffffc18e2133&amp;gt;] mdt_reint_internal+0x6e3/0xaf0 [mdt]
[64634.220004]  [&amp;lt;ffffffffc18ed497&amp;gt;] mdt_reint+0x67/0x140 [mdt]
[64634.225734]  [&amp;lt;ffffffffc12d135a&amp;gt;] tgt_request_handle+0xaea/0x1580 [ptlrpc]
[64634.232703]  [&amp;lt;ffffffffc127592b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
[64634.240476]  [&amp;lt;ffffffffc127925c&amp;gt;] ptlrpc_main+0xafc/0x1fc0 [ptlrpc]
[64634.246812]  [&amp;lt;ffffffff8e8c1c31&amp;gt;] kthread+0xd1/0xe0
[64634.251727]  [&amp;lt;ffffffff8ef74c24&amp;gt;] ret_from_fork_nospec_begin+0xe/0x21
[64634.258214]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
[64634.263238] LustreError: dumping log to /tmp/lustre-log.1550079653.21813
[64645.362733] LustreError: 0-0: Forced cleanup waiting for fir-MDT0000-osp-MDT0002 namespace with 1 resources in use, (rc=-110)
[64670.374306] LustreError: 0-0: Forced cleanup waiting for fir-MDT0000-osp-MDT0002 namespace with 1 resources in use, (rc=-110)
[64720.386464] LustreError: 0-0: Forced cleanup waiting for fir-MDT0000-osp-MDT0002 namespace with 1 resources in use, (rc=-110)
[64720.397763] LustreError: Skipped 1 previous similar message
[64736.371824] Lustre: fir-MDT0002: Not available for connect from 10.9.106.29@o2ib4 (stopping)
[64736.380266] Lustre: Skipped 3144 previous similar messages
[64795.404191] LustreError: 0-0: Forced cleanup waiting for fir-MDT0000-osp-MDT0002 namespace with 1 resources in use, (rc=-110)
[64795.415491] LustreError: Skipped 2 previous similar messages
[64945.423663] LustreError: 0-0: Forced cleanup waiting for fir-MDT0000-osp-MDT0002 namespace with 1 resources in use, (rc=-110)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;So not sure it&apos;ll be helpful. I think we&apos;ll dump and restart this MDS.&lt;/p&gt;</comment>
                            <comment id="241890" author="sthiell" created="Wed, 13 Feb 2019 17:51:09 +0000"  >&lt;p&gt;Ah, never mind. MDT2 stopped successfully on fir-md1-s1 and is now mounted on the failover partner fir-md1-s2.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[2302565.854897] Lustre: fir-MDT0002: Recovery over after 0:29, of 1344 clients 1344 recovered and 0 were evicted.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt; Load on fir-md1-s1 has dropped and filesystem is back online. We&apos;ll see how it goes...&lt;/p&gt;</comment>
                            <comment id="241891" author="sthiell" created="Wed, 13 Feb 2019 17:52:34 +0000"  >&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;$ clush -w@mds lctl get_param ldlm.*.*mdt*.lock_count
fir-md1-s2: ldlm.namespaces.mdt-fir-MDT0001_UUID.lock_count=220834
fir-md1-s2: ldlm.namespaces.mdt-fir-MDT0002_UUID.lock_count=143296
fir-md1-s2: ldlm.namespaces.mdt-fir-MDT0003_UUID.lock_count=34274
fir-md1-s1: ldlm.namespaces.mdt-fir-MDT0000_UUID.lock_count=133837
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="241892" author="pfarrell" created="Wed, 13 Feb 2019 17:53:20 +0000"  >&lt;p&gt;Interesting:&lt;br/&gt;
 &quot;00010000:00010000:5.0:1550079902.033681:0:22866:0:(ldlm_lock.c:197:ldlm_lock_put()) ### final lock_put on destroyed lock, freeing it. ns: mdt-fir-MDT0002_UUID lock: ffff9c9f27590240/0x6bcd99f6a7f88060 lrc: 0/0,0 mode: --/PR res: &lt;span class=&quot;error&quot;&gt;&amp;#91;0x2c00016d5:0x9c:0x0&amp;#93;&lt;/span&gt;.0x0 bits 0x40/0x0 rrc: 265624 type: IBT flags: 0x44801400010020 nid: 10.9.105.71@o2ib4 remote: 0xf3592e10ff0a3e0d expref: 265633 pid: 20526 timeout: 62419 lvb_type: 0&lt;br/&gt;
 00010000:00000040:5.0:1550079902.034443:0:22866:0:(ldlm_resource.c:1534:ldlm_resource_getref()) getref res: ffff9c9ceeeef8c0 count: 265623&quot;&lt;/p&gt;

&lt;p&gt;I wondered if we had something like this.&#160; There are 265 thousand locks on a single resource&lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/warning.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;.&#160; That&apos;s going to gum up the works hugely whenever anything has to address that resource.&lt;/p&gt;

&lt;p&gt;That&apos;s a DOM lock, too.&#160; Was every client and process in your system trying to read the same file?&#160; How many clients do you have?&lt;/p&gt;</comment>
                            <comment id="241893" author="pfarrell" created="Wed, 13 Feb 2019 17:55:43 +0000"  >&lt;p&gt;At least in the DK log sample here, all the locks (only 526, but still) are from the same node.&lt;/p&gt;

&lt;p&gt;I assume&#160;10.9.105.71@o2ib4 is a client?&lt;/p&gt;</comment>
                            <comment id="241894" author="sthiell" created="Wed, 13 Feb 2019 18:01:26 +0000"  >&lt;p&gt;Yes, 10.9.105.71@o2ib4 is one of our Sherlock client (sh-105-71) and currently has multiple jobs running. We&apos;re investigating any time correlation. Is there a way to find the original file from the lock message above?&lt;/p&gt;</comment>
                            <comment id="241895" author="pfarrell" created="Wed, 13 Feb 2019 18:06:02 +0000"  >&lt;p&gt;Yes - The resource ID should be the user visible FID, since it&apos;s the MDS:&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;0x2c00016d5:0x9c:0x0&amp;#93;&lt;/span&gt;&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lfs fid2path /path/to/lustre/root/&#160;[0x2c00016d5:0x9c:0x0] &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="241896" author="sthiell" created="Wed, 13 Feb 2019 18:09:29 +0000"  >&lt;p&gt;Ok thanks! I tried that but unfortunately that returns &quot;No such file or directory&quot;.&lt;/p&gt;

&lt;p&gt;It looks like the lock count keeps increasing on MDT0002:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;$ clush -w@mds lctl get_param ldlm.*.*mdt*.lock_count
fir-md1-s1: ldlm.namespaces.mdt-fir-MDT0000_UUID.lock_count=249247
fir-md1-s2: ldlm.namespaces.mdt-fir-MDT0001_UUID.lock_count=344886
fir-md1-s2: ldlm.namespaces.mdt-fir-MDT0002_UUID.lock_count=891698
fir-md1-s2: ldlm.namespaces.mdt-fir-MDT0003_UUID.lock_count=48500
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="241897" author="pfarrell" created="Wed, 13 Feb 2019 18:15:02 +0000"  >&lt;p&gt;Let&apos;s take this to Skype (just in case you&apos;re not seeing that)&lt;/p&gt;</comment>
                            <comment id="241902" author="pfarrell" created="Wed, 13 Feb 2019 18:55:20 +0000"  >&lt;p&gt;OK, I&apos;ve found the proximate cause of lock accumulation.&#160; More to come.&lt;/p&gt;

&lt;p&gt;There&apos;s one &quot;normal&quot; open lock (PR) with a DOM bit on the resource, and hundreds of thousands (literally) of DOM only PR locks.&lt;/p&gt;

&lt;p&gt;We match the normal lock:&lt;br/&gt;
 00010000:00010000:18.0:1550082656.372818:0:311279:0:(ldlm_lock.c:1476:ldlm_lock_match()) ### matched (0 0) ns: fir-MDT0002-mdc-ffff9bcb944b9000 lock: ffff9bc299f1a400/0x318ede82da06317f lrc: 2/1,0 mode: PR/PR res: &lt;span class=&quot;error&quot;&gt;&amp;#91;0x2c0003c05:0xe11c:0x0&amp;#93;&lt;/span&gt;.0x0 bits 0x53/0x0 rrc: 299247 type: IBT flags: 0x60000000000 nid: local remote: 0xfbab35471d1f08c6 expref: -99 pid: 311279 timeout: 0 lvb_type: 3&lt;/p&gt;

&lt;p&gt;The KMS ignore bit is set on the &quot;normal&quot; MDC lock:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt; #define LDLM_FL_KMS_IGNORE 0x0000040000000000ULL
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160;  LOCK: 0x60000000000&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;The other bit (the &apos;2&apos; in &apos;4|2 == 6&apos;) is LVB_READY.&lt;/p&gt;

&lt;p&gt;If kms_ignore is set, we do not match the DOM lock (mdc_enqueue_send)&#160; :&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;                if (!matched || ldlm_is_kms_ignore(matched))
                        goto no_match;&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;Then we request another lock, which is what&apos;s going on here.&#160; But because this open lock matches and is then rejected (rather than being rejected in the match code), we do not match the later locks, so we request one for every read.&lt;/p&gt;

&lt;p&gt;I&apos;ll dig in a bit more, but I see a few potential fixes.&lt;/p&gt;</comment>
                            <comment id="241903" author="pfarrell" created="Wed, 13 Feb 2019 19:40:23 +0000"  >&lt;p&gt;Mike,&lt;br/&gt;
Interesting use of kms_ignore in the DOM code.&lt;/p&gt;

&lt;p&gt;It looks like it&apos;s used to say &quot;the DOM bit has been cancelled&quot;.&#160; (It would be good to add a comment in the KMS_IGNORE definition reflecting this.)&lt;/p&gt;


&lt;p&gt;So either we need to sort out why it got set with the DOM bit still present and make that not happen, or we should we move the &quot;ldlm_is_kms_ignore&quot; check in to ldlm_lock_match, so we look for another matching lock and don&apos;t keep adding locks like this.&lt;/p&gt;

&lt;p&gt;Probably both!&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;If the direction we go is the &quot;how did this get set like this&quot; direction, here&apos;s a thought:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;static int mdc_object_ast_clear(struct ldlm_lock *lock, void *data)
{
        ENTRY;        if ((lock-&amp;gt;l_ast_data == NULL &amp;amp;&amp;amp; !ldlm_is_kms_ignore(lock)) ||
            (lock-&amp;gt;l_ast_data == data)) {
                lock-&amp;gt;l_ast_data = NULL;
                ldlm_set_kms_ignore(lock);
        }
        RETURN(LDLM_ITER_CONTINUE);
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;The equivalent osc code doesn&apos;t do this, and the LDLM lock is still reachable.&#160; So perhaps the MDC object was pruned but the lock wasn&apos;t...&#160; Not 100% sure that can happen.&lt;/p&gt;</comment>
                            <comment id="241904" author="pfarrell" created="Wed, 13 Feb 2019 19:44:49 +0000"  >&lt;p&gt;Attachments are a -1 dk log snippet from the client generating the lock requests (sh-106-60.dk.log.gz) and a short MDS log with dlmtrace set (fir-md1-s2-dk.log) (From Sebastien)&lt;/p&gt;</comment>
                            <comment id="241926" author="sthiell" created="Thu, 14 Feb 2019 00:19:06 +0000"  >&lt;p&gt;Appreciated all the help so far. Quick update as things are getting worse and worse and /fir is barely usable. Using sysrq to dump the tasks, we found many jobs on clients blocked on vfs calls (vfs_mkdir, vfs_unlink) and Lustre was constantly trying to reconnect to the MDT (either MDT 0 or MDT 2).&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Feb 13 15:04:26 sh-106-65 kernel: Lustre: 215025:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1550098465/real 1550098465]  req@ffff8c5a3db1d400 x1624748573740256/t0(0) o36-&amp;gt;fir-MDT0002-mdc-ffff8c4e357f0800@10.0.10.52@o2ib7:12/10 lens 512/448 e 24 to 1 dl 1550099066 ref 2 fl Rpc:X/2/ffffffff rc -11/-1
Feb 13 15:04:26 sh-106-65 kernel: Lustre: fir-MDT0002-mdc-ffff8c4e357f0800: Connection to fir-MDT0002 (at 10.0.10.52@o2ib7) was lost; in progress operations using this service will wait for recovery to complete
Feb 13 15:04:26 sh-106-65 kernel: Lustre: fir-MDT0002-mdc-ffff8c4e357f0800: Connection restored to 10.0.10.52@o2ib7 (at 10.0.10.52@o2ib7)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt; Lock count on MDT 0 and 2 were &amp;gt; 2 million and increasing.&lt;/p&gt;

&lt;p&gt;I just restarted MDT 2 and hopefully clients have resumed I/O operations. Earlier today, we had disabled DOM on MDT 2 as a test, but it didn&apos;t help; the filesystem already has 163M files with an mdt stripe, so I&apos;m not sure it can help. It would be great to have a patch ASAP to fix this high lock count. Thanks!&lt;/p&gt;</comment>
                            <comment id="241928" author="sthiell" created="Thu, 14 Feb 2019 00:40:38 +0000"  >&lt;p&gt;After 34 minutes of uptime, MDT 2&apos;s lock_count is already at 1,732,474, doh.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;fir-md1-s2: ldlm.namespaces.mdt-fir-MDT0002_UUID.lock_count=1732474
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="241934" author="gerrit" created="Thu, 14 Feb 2019 02:34:06 +0000"  >&lt;p&gt;Patrick Farrell (pfarrell@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/34255&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/34255&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11964&quot; title=&quot;Heavy load and soft lockups on MDS with DOM&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11964&quot;&gt;&lt;del&gt;LU-11964&lt;/del&gt;&lt;/a&gt; ldlm: Don&apos;t match kms_ignore DOM locks&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 0ebed6aac0e2dee45c4e7bba213416eefd73788a&lt;/p&gt;</comment>
                            <comment id="241936" author="pfarrell" created="Thu, 14 Feb 2019 02:38:09 +0000"  >&lt;p&gt;Copied from email:&lt;/p&gt;

&lt;p&gt;&quot;Just working on what I hope will be a fix.&#160; In the meantime, you might be able to limp along (better than with MDT restarts)&#160;by running this periodically on clients:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lctl set_param ldlm.namespaces.*mdc*.lru_size=clear&#160; &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Perhaps once a minute...?&#160; The basic problem is that you&apos;re getting one lock in a weird state, and then you stop trying to match existing locks, so each read creates a new lock.&#160; I suspect you could probably clean out that first lock and the subsequent ones, avoiding the accumulation.&#160; (The &quot;weird&quot; state appears to be relatively rare -&#160; though common enough to cause trouble - so I think this should work.)&quot;&lt;/p&gt;</comment>
                            <comment id="241937" author="pfarrell" created="Thu, 14 Feb 2019 02:41:22 +0000"  >&lt;p&gt;Recent patch is passing some simple testing locally, but hasn&apos;t been through our suite yet.&#160; I&apos;m pretty confident it&apos;s sound, but if you can stand to wait, it would be nice to get it some test exposure and an opinion from Mike.&lt;/p&gt;</comment>
                            <comment id="241940" author="sthiell" created="Thu, 14 Feb 2019 02:54:22 +0000"  >&lt;p&gt;Hi Patrick,&lt;/p&gt;

&lt;p&gt;I just tried &lt;tt&gt;lctl set_param ldlm.namespaces.fir-MDT0002-*.lru_size=clear&lt;/tt&gt; on all clients and the lock count for MDT 2 went from 10 million to 1.6 million so I guess it&apos;s working partially, which is good.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@fir-md1-s2 ~]# lctl get_param ldlm.*.*mdt*.lock_count
ldlm.namespaces.mdt-fir-MDT0002_UUID.lock_count=10091472

... lru_size=clear on all clients

[root@fir-md1-s2 ~]# lctl get_param ldlm.*.*mdt*.lock_count
ldlm.namespaces.mdt-fir-MDT0002_UUID.lock_count=1673221
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I&apos;m gonna do this until the patch is ready (I prefer to wait for the patch to pass the tests, yes &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;)&lt;/p&gt;

&lt;p&gt;Thanks!&lt;/p&gt;</comment>
                            <comment id="241944" author="sthiell" created="Thu, 14 Feb 2019 03:43:47 +0000"  >&lt;p&gt;Running lru_size=clear on all clients every 2 minutes for all Fir&apos;s MDTs and the lock count has dropped!&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;$ clush -w@mds lctl get_param ldlm.*.*mdt*.lock_count
fir-md1-s2: ldlm.namespaces.mdt-fir-MDT0002_UUID.lock_count=171420
fir-md1-s1: ldlm.namespaces.mdt-fir-MDT0000_UUID.lock_count=11548
fir-md1-s1: ldlm.namespaces.mdt-fir-MDT0001_UUID.lock_count=10044
fir-md1-s1: ldlm.namespaces.mdt-fir-MDT0003_UUID.lock_count=20004
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="241945" author="pfarrell" created="Thu, 14 Feb 2019 03:46:22 +0000"  >&lt;p&gt;Glad it&apos;s helping.&#160; Especially glad because the patch isn&apos;t right and I&apos;m going to need Mike to take a look.&lt;/p&gt;</comment>
                            <comment id="241957" author="tappro" created="Thu, 14 Feb 2019 09:15:23 +0000"  >&lt;p&gt;Stephane,&lt;/p&gt;

&lt;p&gt;for now let&apos;s disable the &apos;dom_lock&apos; parameter on MDT: &lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
lctl set_param mdt.*.dom_lock=0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;that disables DOM lock combining with other lock bits and switches to a simpler locking scheme for DOM files.&lt;/p&gt;

&lt;p&gt;also you&apos;ve mentioned that DOM was disabled on MDT2, how exactly that was done?&lt;/p&gt;</comment>
                            <comment id="241975" author="sthiell" created="Thu, 14 Feb 2019 16:14:29 +0000"  >&lt;p&gt;Hi Mike!&lt;/p&gt;

&lt;p&gt;So with a 2-min recurrent lru_size=clear on all clients, this morning I found the lock counts like that (looking good!):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;fir-md1-s2: ldlm.namespaces.mdt-fir-MDT0002_UUID.lock_count=31336
fir-md1-s1: ldlm.namespaces.mdt-fir-MDT0000_UUID.lock_count=8233
fir-md1-s1: ldlm.namespaces.mdt-fir-MDT0001_UUID.lock_count=15402
fir-md1-s1: ldlm.namespaces.mdt-fir-MDT0003_UUID.lock_count=12403
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;I stopped lru_size=clear and set dom_lock=0:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;$ clush -w@mds lctl get_param mdt.fir-*.dom_lock
fir-md1-s2: mdt.fir-MDT0002.dom_lock=never
fir-md1-s1: mdt.fir-MDT0000.dom_lock=never
fir-md1-s1: mdt.fir-MDT0001.dom_lock=never
fir-md1-s1: mdt.fir-MDT0003.dom_lock=never
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;But then the lock count was still&#160; increasing. Now on MDT1 even more than MDT2! After ~5 minutes, we have:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;fir-md1-s2: ldlm.namespaces.mdt-fir-MDT0002_UUID.lock_count=148463
fir-md1-s1: ldlm.namespaces.mdt-fir-MDT0000_UUID.lock_count=58011
fir-md1-s1: ldlm.namespaces.mdt-fir-MDT0001_UUID.lock_count=203623
fir-md1-s1: ldlm.namespaces.mdt-fir-MDT0003_UUID.lock_count=31219
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Re: MDT2, it is correct that we tried to disable DOM yesterday by using:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lctl conf_param fir-MDT0002.lod.dom_stripesize=0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;However, shortly after that, we noticed that new files were still having an mdt stripe of 128K (as our default striping is configured), and seeing no improvement in lock count, we decided to put it back to 128K (131072). So DOM is still enabled on all four MDTs.&lt;/p&gt;

&lt;p&gt;I have re-enabled the recurrent lru_size=clear for now. Let me know what I can try next to help resolve this. Thanks.&lt;br/&gt;
 &#160;&lt;/p&gt;</comment>
                            <comment id="241978" author="tappro" created="Thu, 14 Feb 2019 16:49:18 +0000"  >&lt;p&gt;Stephane, I am preparing quick patch for the problem noticed by Patrick, meanwhile could you say what does that client do? I see in logs there is quite big files with PFL layout and size about 400Gb, is that known what is going on there, i.e. access pattern? That would be helpful to know, so I can try to reproduce that locally and make sure patch will solve that&lt;/p&gt;</comment>
                            <comment id="241984" author="sthiell" created="Thu, 14 Feb 2019 17:56:23 +0000"  >&lt;p&gt;Ok, sounds good! We have more than 1,300 active clients on this cluster, so this is a tough question! &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&#160;We currently have 4,912 jobs running on Sherlock with all kinds of jobs (using small or large files). So I&apos;m not sure where to start. If there is a way to list the NIDs of clients holding the most MDT locks, that would really be helpful for job correlation, but I&apos;m not sure that&apos;s currently possible.&lt;/p&gt;</comment>
                            <comment id="241986" author="pfarrell" created="Thu, 14 Feb 2019 18:04:03 +0000"  >&lt;p&gt;Stephane,&lt;/p&gt;

&lt;p&gt;No way to do exactly that, but you could just look at total lock counts on the client side, and look for large #s.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lctl get_param ldlm.namespaces.*mdc*.lock_count &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="241988" author="sthiell" created="Thu, 14 Feb 2019 18:31:08 +0000"  >&lt;p&gt;Great!&#160;&lt;/p&gt;

&lt;p&gt;See attached file sherlocks.txt. The top one is a login node, with many users logged in, but really not many files open in /fir (&amp;lt; 10) when I check with lsof.&#160; There are several file descriptors open on the same file (a shell script), maybe that&apos;s a hint. Also I&apos;ve noticed a few python processes doing a lot of newfstatat even on the same file again and again but nothing super unusual.&lt;/p&gt;</comment>
                            <comment id="241991" author="tappro" created="Thu, 14 Feb 2019 19:10:56 +0000"  >&lt;p&gt;Stephane, that can be exactly the case, could you check does file exist with FID &lt;span class=&quot;error&quot;&gt;&amp;#91;0x2c0003c05:0xe11c:0x0&amp;#93;&lt;/span&gt; and its name?&lt;/p&gt;</comment>
                            <comment id="241993" author="sthiell" created="Thu, 14 Feb 2019 19:13:26 +0000"  >&lt;p&gt;Yes, this FID does exist and is relatively large:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@sh-06-33 ~]# lfs fid2path /fir 0x2c0003c05:0xe11c:0x0
/fir/users/sinitsk/qchem/scratch/ccsdt_star_energies/qm9_005226_RI-CCSDT_aug-cc-pVTZ/pagefile.0
[root@sh-06-33 ~]# lfs getstripe /fir/users/sinitsk/qchem/scratch/ccsdt_star_energies/qm9_005226_RI-CCSDT_aug-cc-pVTZ/pagefile.0
/fir/users/sinitsk/qchem/scratch/ccsdt_star_energies/qm9_005226_RI-CCSDT_aug-cc-pVTZ/pagefile.0
  lcm_layout_gen:    8
  lcm_mirror_count:  1
  lcm_entry_count:   6
    lcme_id:             1
    lcme_mirror_id:      0
    lcme_flags:          init
    lcme_extent.e_start: 0
    lcme_extent.e_end:   131072
      lmm_stripe_count:  0
      lmm_stripe_size:   131072
      lmm_pattern:       mdt
      lmm_layout_gen:    0
      lmm_stripe_offset: 0

    lcme_id:             2
    lcme_mirror_id:      0
    lcme_flags:          init
    lcme_extent.e_start: 131072
    lcme_extent.e_end:   16777216
      lmm_stripe_count:  1
      lmm_stripe_size:   4194304
      lmm_pattern:       raid0
      lmm_layout_gen:    0
      lmm_stripe_offset: 0
      lmm_objects:
      - 0: { l_ost_idx: 0, l_fid: [0x6c0000400:0xfe22d:0x0] }

    lcme_id:             3
    lcme_mirror_id:      0
    lcme_flags:          init
    lcme_extent.e_start: 16777216
    lcme_extent.e_end:   1073741824
      lmm_stripe_count:  2
      lmm_stripe_size:   4194304
      lmm_pattern:       raid0
      lmm_layout_gen:    0
      lmm_stripe_offset: 33
      lmm_objects:
      - 0: { l_ost_idx: 33, l_fid: [0x980000402:0xfe105:0x0] }
      - 1: { l_ost_idx: 24, l_fid: [0xb80000402:0xfe241:0x0] }

    lcme_id:             4
    lcme_mirror_id:      0
    lcme_flags:          init
    lcme_extent.e_start: 1073741824
    lcme_extent.e_end:   34359738368
      lmm_stripe_count:  4
      lmm_stripe_size:   4194304
      lmm_pattern:       raid0
      lmm_layout_gen:    0
      lmm_stripe_offset: 21
      lmm_objects:
      - 0: { l_ost_idx: 21, l_fid: [0x740000400:0xfe1da:0x0] }
      - 1: { l_ost_idx: 16, l_fid: [0x880000400:0xfe124:0x0] }
      - 2: { l_ost_idx: 7, l_fid: [0xb00000402:0xfe282:0x0] }
      - 3: { l_ost_idx: 45, l_fid: [0xa40000402:0xfe044:0x0] }

    lcme_id:             5
    lcme_mirror_id:      0
    lcme_flags:          init
    lcme_extent.e_start: 34359738368
    lcme_extent.e_end:   274877906944
      lmm_stripe_count:  8
      lmm_stripe_size:   4194304
      lmm_pattern:       raid0
      lmm_layout_gen:    0
      lmm_stripe_offset: 40
      lmm_objects:
      - 0: { l_ost_idx: 40, l_fid: [0xa00000402:0xfe11d:0x0] }
      - 1: { l_ost_idx: 4, l_fid: [0x8c0000402:0xfe00a:0x0] }
      - 2: { l_ost_idx: 29, l_fid: [0xcc0000402:0xfe1c8:0x0] }
      - 3: { l_ost_idx: 28, l_fid: [0xd80000402:0xfe255:0x0] }
      - 4: { l_ost_idx: 23, l_fid: [0x940000402:0xfe0d9:0x0] }
      - 5: { l_ost_idx: 20, l_fid: [0x9c0000402:0xfe0f0:0x0] }
      - 6: { l_ost_idx: 1, l_fid: [0xb40000402:0xfe42c:0x0] }
      - 7: { l_ost_idx: 47, l_fid: [0xa80000402:0xfe161:0x0] }

    lcme_id:             6
    lcme_mirror_id:      0
    lcme_flags:          init
    lcme_extent.e_start: 274877906944
    lcme_extent.e_end:   EOF
      lmm_stripe_count:  16
      lmm_stripe_size:   4194304
      lmm_pattern:       raid0
      lmm_layout_gen:    0
      lmm_stripe_offset: 14
      lmm_objects:
      - 0: { l_ost_idx: 14, l_fid: [0x400000400:0xfe44a:0x0] }
      - 1: { l_ost_idx: 9, l_fid: [0x540000400:0xfe672:0x0] }
      - 2: { l_ost_idx: 43, l_fid: [0x700000400:0xfe53b:0x0] }
      - 3: { l_ost_idx: 42, l_fid: [0x500000400:0xfe5bb:0x0] }
      - 4: { l_ost_idx: 2, l_fid: [0x5c0000400:0xfe59d:0x0] }
      - 5: { l_ost_idx: 25, l_fid: [0x640000400:0xfe5ea:0x0] }
      - 6: { l_ost_idx: 32, l_fid: [0x900000402:0xfe450:0x0] }
      - 7: { l_ost_idx: 17, l_fid: [0x680000400:0xfe67b:0x0] }
      - 8: { l_ost_idx: 18, l_fid: [0x780000400:0xfe490:0x0] }
      - 9: { l_ost_idx: 5, l_fid: [0x800000400:0xfe763:0x0] }
      - 10: { l_ost_idx: 41, l_fid: [0x840000400:0xfe41b:0x0] }
      - 11: { l_ost_idx: 44, l_fid: [0x7c0000400:0xfe60e:0x0] }
      - 12: { l_ost_idx: 38, l_fid: [0xe40000402:0xfe64f:0x0] }
      - 13: { l_ost_idx: 6, l_fid: [0xc40000402:0xfe4ba:0x0] }
      - 14: { l_ost_idx: 27, l_fid: [0xd00000402:0xfe681:0x0] }
      - 15: { l_ost_idx: 30, l_fid: [0xdc0000402:0xfe592:0x0] }

[root@sh-06-33 ~]# ls -lh /fir/users/sinitsk/qchem/scratch/ccsdt_star_energies/qm9_005226_RI-CCSDT_aug-cc-pVTZ/pagefile.0
-rw-r--r-- 1 sinitsk pande 376G Feb 13 10:42 /fir/users/sinitsk/qchem/scratch/ccsdt_star_energies/qm9_005226_RI-CCSDT_aug-cc-pVTZ/pagefile.0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="241996" author="tappro" created="Thu, 14 Feb 2019 19:59:04 +0000"  >&lt;p&gt;Stephane, could you please check if it is being used in any python scripts you&apos;ve mentioned before? Or any other? If that is not difficult to find out.&lt;/p&gt;</comment>
                            <comment id="241999" author="sthiell" created="Thu, 14 Feb 2019 20:19:50 +0000"  >&lt;p&gt;Mike, I checked and this file is currently open on at least 9 different compute nodes, but not by a python program, but qcprog.exe_s (Qchem software &lt;a href=&quot;http://www.q-chem.com/doc_for_web/readme_par.shtml&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://www.q-chem.com/doc_for_web/readme_par.shtml&lt;/a&gt;). The program is doing a lot of read, fadvise and lseek:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[pid 62309] &amp;lt;... read resumed&amp;gt; &quot;\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0&quot;..., 2944) = 2944
[pid 62309] read(16, &quot;BhUr\357pE\300\272\366\336\245\206qE\300\312WD2ZrE\300.w\223\333esE\300&quot;..., 524288) = 524288
[pid 62309] fadvise64(16, 87782214528, 524288, POSIX_FADV_DONTNEED) = 0
[pid 62310] fadvise64(16, 84060294016, 524288, POSIX_FADV_WILLNEED) = 0
[pid 62310] lseek(16, 84060291072, SEEK_SET &amp;lt;unfinished ...&amp;gt;
[pid 62310] &amp;lt;... lseek resumed&amp;gt; )       = 84060291072
[pid 62310] read(16,  &amp;lt;unfinished ...&amp;gt;
[pid 62310] &amp;lt;... read resumed&amp;gt; &quot;\316t@\260\311\250\377&amp;gt;X\0\275I\2247\32?9e\212\26h$\22?P@a\360\27t\346\276&quot;..., 2944) = 2944
[pid 62310] read(16, &quot;\311j0.\216qH\300R\261W\17\34\202H\300\245T\314G\207\207H\300B\275\324\220\374\221H\300&quot;..., 524288) = 524288
[pid 62310] fadvise64(16, 84060294016, 524288, POSIX_FADV_DONTNEED) = 0
[pid 62304] fadvise64(16, 36313235456, 524288, POSIX_FADV_WILLNEED) = 0
[pid 62304] lseek(16, 36313235456, SEEK_SET &amp;lt;unfinished ...&amp;gt;
[pid 62304] &amp;lt;... lseek resumed&amp;gt; )       = 36313235456
[pid 62304] read(16,  &amp;lt;unfinished ...&amp;gt;
[pid 62304] &amp;lt;... read resumed&amp;gt; &quot;*|L\37\362*E\300\36T\342%\204+E\300\376\263\237\366R,E\300\214x\352\261\375,E\300&quot;..., 524288) = 524288
[pid 62304] fadvise64(16, 36313235456, 524288, POSIX_FADV_DONTNEED) = 0
[pid 62315] fadvise64(16, 88445404160, 524288, POSIX_FADV_WILLNEED) = 0
[pid 62315] lseek(16, 88445403136, SEEK_SET &amp;lt;unfinished ...&amp;gt;
[pid 62315] &amp;lt;... lseek resumed&amp;gt; )       = 88445403136
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="242000" author="sthiell" created="Thu, 14 Feb 2019 20:21:52 +0000"  >&lt;p&gt;Ah, actually fd 16 is not that file but another one.... Sorry.&lt;/p&gt;</comment>
                            <comment id="242006" author="tappro" created="Thu, 14 Feb 2019 20:49:18 +0000"  >&lt;p&gt;Stephane, thanks a lot, I think I am able to reproduce that behavior with quite simple test.  It shows growing lock count without patch and no lock count increase with patch.  I will push that patch to the gerrit as soon as local tests will end.&lt;/p&gt;</comment>
                            <comment id="242017" author="sthiell" created="Thu, 14 Feb 2019 22:46:32 +0000"  >&lt;p&gt;That&apos;s great news, I&apos;m so glad you were able to reproduce the issue locally. We can stay with the lru_size=clear workaround for the time being until the patch is ready.&lt;/p&gt;</comment>
                            <comment id="242019" author="gerrit" created="Thu, 14 Feb 2019 23:46:37 +0000"  >&lt;p&gt;Mike Pershin (mpershin@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/34261&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/34261&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11964&quot; title=&quot;Heavy load and soft lockups on MDS with DOM&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11964&quot;&gt;&lt;del&gt;LU-11964&lt;/del&gt;&lt;/a&gt; mdc: prevent glimpse lock count grow&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 2f98ba5cf94352be6028bd9ddf93fe1f5623b347&lt;/p&gt;</comment>
                            <comment id="242295" author="sthiell" created="Tue, 19 Feb 2019 23:32:07 +0000"  >&lt;p&gt;Quick update: we&apos;re still using the lru_size=clear workaround and the servers have been stable. However, it would be nice to try a patch at some point. It looks like the patch didn&apos;t pass the DOM tests yet? In any case, let me know when it is ready. Of course, patching clients takes a lot more of effort. Is the patch client + server?&lt;/p&gt;

&lt;p&gt;Thanks again,&lt;/p&gt;

&lt;p&gt;Stephane&lt;/p&gt;</comment>
                            <comment id="242298" author="pfarrell" created="Tue, 19 Feb 2019 23:53:16 +0000"  >&lt;p&gt;Stephane,&lt;/p&gt;

&lt;p&gt;Patch is just client, and it has passed Maloo testing (including DOM).&#160; I think it&apos;s ready to try.&lt;/p&gt;</comment>
                            <comment id="242299" author="sthiell" created="Tue, 19 Feb 2019 23:59:26 +0000"  >&lt;p&gt;Ah great! Thanks Patrick, we&apos;ll work on it.&lt;/p&gt;</comment>
                            <comment id="242414" author="sthiell" created="Thu, 21 Feb 2019 04:32:53 +0000"  >&lt;p&gt;We have started the deployment of clients with 2.12.0 + the patch above on Sherlock. It will take some time to get all nodes up to date (at least 2 weeks), but we&apos;ll progressively remove the lru_size=clear workaround on the updated nodes. Thanks!&lt;/p&gt;</comment>
                            <comment id="242776" author="sthiell" created="Tue, 26 Feb 2019 05:47:42 +0000"  >&lt;p&gt;At this point we have upgraded more than 1300 clients with the patch (more than 90% of the cluster) and I just tried to remove the 2-min lru_size=clear workaround on those.&lt;/p&gt;

&lt;p&gt;At first the lock counters increased a lot but I think they have stabilized below 1M per MDT:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;$ clush -w@mds -L lctl get_param ldlm.*.*mdt*.lock_count 
fir-md1-s1: ldlm.namespaces.mdt-fir-MDT0000_UUID.lock_count=253634
fir-md1-s1: ldlm.namespaces.mdt-fir-MDT0002_UUID.lock_count=241738
fir-md1-s2: ldlm.namespaces.mdt-fir-MDT0001_UUID.lock_count=664659
fir-md1-s2: ldlm.namespaces.mdt-fir-MDT0003_UUID.lock_count=370532
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;$ clush -w@mds -L lctl get_param ldlm.*.*mdt*.lock_count 
fir-md1-s1: ldlm.namespaces.mdt-fir-MDT0000_UUID.lock_count=252612
fir-md1-s1: ldlm.namespaces.mdt-fir-MDT0002_UUID.lock_count=239413
fir-md1-s2: ldlm.namespaces.mdt-fir-MDT0001_UUID.lock_count=660803
fir-md1-s2: ldlm.namespaces.mdt-fir-MDT0003_UUID.lock_count=371448
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I&apos;ll confirm/update tomorrow.&lt;/p&gt;</comment>
                            <comment id="242910" author="sthiell" created="Wed, 27 Feb 2019 04:44:42 +0000"  >&lt;p&gt;As of tonight, the situation is stable, and we didn&apos;t have any new server crash (yay). So I guess the patch is definitively working. I can&apos;t thank you enough Patrick and Mike! The lock counters are still relatively high but there are ~6,300 jobs running at the moment and probably most of them using /fir so I assume this is normal:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;$ clush -w@mds -L lctl get_param ldlm.*.*mdt*.lock_count 
fir-md1-s1: ldlm.namespaces.mdt-fir-MDT0000_UUID.lock_count=439815
fir-md1-s1: ldlm.namespaces.mdt-fir-MDT0002_UUID.lock_count=241967
fir-md1-s2: ldlm.namespaces.mdt-fir-MDT0001_UUID.lock_count=1138179
fir-md1-s2: ldlm.namespaces.mdt-fir-MDT0003_UUID.lock_count=623797
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@fir-md1-s2 ~]# lctl get_param ldlm.namespaces.mdt-fir-MDT0001_UUID.*
ldlm.namespaces.mdt-fir-MDT0001_UUID.contended_locks=32
ldlm.namespaces.mdt-fir-MDT0001_UUID.contention_seconds=2
ldlm.namespaces.mdt-fir-MDT0001_UUID.ctime_age_limit=10
ldlm.namespaces.mdt-fir-MDT0001_UUID.dirty_age_limit=10
ldlm.namespaces.mdt-fir-MDT0001_UUID.early_lock_cancel=0
ldlm.namespaces.mdt-fir-MDT0001_UUID.lock_count=1126644
ldlm.namespaces.mdt-fir-MDT0001_UUID.lock_timeouts=44
ldlm.namespaces.mdt-fir-MDT0001_UUID.lock_unused_count=0
ldlm.namespaces.mdt-fir-MDT0001_UUID.lru_max_age=3900000
ldlm.namespaces.mdt-fir-MDT0001_UUID.lru_size=4800
ldlm.namespaces.mdt-fir-MDT0001_UUID.max_nolock_bytes=0
ldlm.namespaces.mdt-fir-MDT0001_UUID.max_parallel_ast=1024
ldlm.namespaces.mdt-fir-MDT0001_UUID.resource_count=354248
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="242968" author="pjones" created="Wed, 27 Feb 2019 18:15:00 +0000"  >&lt;p&gt;That is good news Stephane. Is the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11964&quot; title=&quot;Heavy load and soft lockups on MDS with DOM&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11964&quot;&gt;&lt;del&gt;LU-11964&lt;/del&gt;&lt;/a&gt; patch the only one applied on top of the vanilla 2.12.0 - or is there anything else?&lt;/p&gt;</comment>
                            <comment id="242969" author="sthiell" created="Wed, 27 Feb 2019 18:44:27 +0000"  >&lt;p&gt;Hi Peter, yes definitively a very good news!&lt;/p&gt;

&lt;p&gt;I have applied the following patch on top of 2.12.0 (and only on the clients), this should be &lt;a href=&quot;https://review.whamcloud.com/#/c/34261/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/34261/&lt;/a&gt; Patch Set 3:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;commit 7b8a3b32f4dc35dcb81ff1c9fd4da3e7c4184d26
Author: Mikhail Pershin &amp;lt;mpershin@whamcloud.com&amp;gt;
Date:   Fri Feb 15 00:51:00 2019 +0300

    LU-11964 mdc: prevent glimpse lock count grow
    
    DOM locks matching tries to ignore locks with
    LDLM_FL_KMS_IGNORE flag during ldlm_lock_match() but
    checks that after ldlm_lock_match() call. Therefore if
    there is any lock with such flag in queue then all other
    locks after it are ignored and new lock is created causing
    big amount of locks on single resource in some access
    patterns.
    Patch move that check inside lock_matches() function, it is
    done by negative check for that flag, i.e. if flag is
    set in lock_matches() parameters then locks with such flag
    are not matched. Corresponding test was added in sanity-dom.sh
    
    Test-Parameters: testlist=sanity-dom
    Signed-off-by: Mikhail Pershin &amp;lt;mpershin@whamcloud.com&amp;gt;
    Change-Id: Ic45ca10f0e603e79a3a00e4fde13a5fae15ea5fc
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;This is the only patch we use right now. We have also disabled the&#160;&#160;idling connection feature, see &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11976&quot; title=&quot;req wrong generation leading to I/O errors on 2.12 clients&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11976&quot;&gt;&lt;del&gt;LU-11976&lt;/del&gt;&lt;/a&gt;. I think the combination of the patch + disabling the idling connection feature have made 2.12 stable for us.&lt;/p&gt;

&lt;p&gt;Thanks!&lt;/p&gt;</comment>
                            <comment id="244352" author="gerrit" created="Thu, 21 Mar 2019 03:42:06 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/34261/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/34261/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11964&quot; title=&quot;Heavy load and soft lockups on MDS with DOM&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11964&quot;&gt;&lt;del&gt;LU-11964&lt;/del&gt;&lt;/a&gt; mdc: prevent glimpse lock count grow&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: b915221b6d0f3457fd9dd202a9d14c5f8385bf47&lt;/p&gt;</comment>
                            <comment id="244378" author="pjones" created="Thu, 21 Mar 2019 04:49:38 +0000"  >&lt;p&gt;Landed for 2.13&lt;/p&gt;</comment>
                            <comment id="244465" author="sthiell" created="Thu, 21 Mar 2019 22:31:52 +0000"  >&lt;p&gt;I noticed that this patch is not in b2_12 yet, but it should be added.&lt;/p&gt;</comment>
                            <comment id="244467" author="pjones" created="Thu, 21 Mar 2019 23:28:55 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=sthiell&quot; class=&quot;user-hover&quot; rel=&quot;sthiell&quot;&gt;sthiell&lt;/a&gt; that&apos;s what the LTS12 label means &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;</comment>
                            <comment id="244468" author="sthiell" created="Fri, 22 Mar 2019 00:27:54 +0000"  >&lt;p&gt;Whoops! Great! &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/biggrin.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;</comment>
                            <comment id="244679" author="gerrit" created="Tue, 26 Mar 2019 17:21:43 +0000"  >&lt;p&gt;Minh Diep (mdiep@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/34504&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/34504&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11964&quot; title=&quot;Heavy load and soft lockups on MDS with DOM&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11964&quot;&gt;&lt;del&gt;LU-11964&lt;/del&gt;&lt;/a&gt; mdc: prevent glimpse lock count grow&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 6422dc451867239cb97a6ab864498077220dcbbb&lt;/p&gt;</comment>
                            <comment id="244984" author="gerrit" created="Mon, 1 Apr 2019 06:20:28 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/34504/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/34504/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11964&quot; title=&quot;Heavy load and soft lockups on MDS with DOM&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11964&quot;&gt;&lt;del&gt;LU-11964&lt;/del&gt;&lt;/a&gt; mdc: prevent glimpse lock count grow&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 82dad29511b32c844c849ac4ec3ec0214077f125&lt;/p&gt;</comment>
                            <comment id="246754" author="gerrit" created="Mon, 6 May 2019 18:34:13 +0000"  >&lt;p&gt;(Incorrect ticket, comment removed...)&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="54918">LU-11976</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="54837">LU-11951</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="55041">LU-12037</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="31993" name="fir-md1-s1-20190213.log" size="83700" author="sthiell" created="Wed, 13 Feb 2019 17:29:39 +0000"/>
                            <attachment id="31995" name="fir-md1-s1-dk.log" size="1327866" author="sthiell" created="Wed, 13 Feb 2019 17:46:15 +0000"/>
                            <attachment id="31981" name="fir-md1-s1.syslog-20190212.log.gz" size="309817" author="sthiell" created="Tue, 12 Feb 2019 20:15:18 +0000"/>
                            <attachment id="31997" name="fir-md1-s2-dk.log" size="7497544" author="pfarrell" created="Wed, 13 Feb 2019 19:42:18 +0000"/>
                            <attachment id="31994" name="lustre-log.1550078704.21794" size="39974" author="sthiell" created="Wed, 13 Feb 2019 17:31:13 +0000"/>
                            <attachment id="31996" name="sh-106-60.dk.log.gz" size="4305267" author="pfarrell" created="Wed, 13 Feb 2019 19:42:11 +0000"/>
                            <attachment id="32003" name="sherlocks.txt" size="17069" author="sthiell" created="Thu, 14 Feb 2019 18:30:38 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00bjr:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10021"><![CDATA[2]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>