<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:01:48 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-13500] Client gets evicted - nfsd non-standard errorno -108</title>
                <link>https://jira.whamcloud.com/browse/LU-13500</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Client is getting evicted by MDT as soon as nfsd service started on the client. &lt;/p&gt;

&lt;p&gt;client (golf1) kernel version : 3.10.0-1062.1.1.el7_lustre.x86_64 &lt;br/&gt;
client (golf1) lustre version :  lustre-2.12.3-1.el7.x86_64&lt;/p&gt;

&lt;p&gt;mds (gmds1) kernel version : 3.10.0-1062.1.1.el7_lustre.x86_64&lt;br/&gt;
mds (gmds1) lustre version : lustre-2.12.3-1.el7.x86_64&lt;/p&gt;

&lt;p&gt;oss (goss1-goss6) kernel version : 3.10.0-1062.1.1.el7_lustre.x86_64&lt;br/&gt;
oss (goss1-goss6) lustre version : lustre-2.12.3-1.el7.x86_64&lt;/p&gt;

&lt;p&gt;/etc/exports on golf1 :&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;/user_data       10.25.0.0/16(fsid=123456789,rw,anonuid=0,insecure,no_subtree_check,insecure_locks,async)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Apr 30 14:03:07 golf1 kernel: LustreError: 11-0: golf-MDT0000-mdc-ffff973bf6409800: operation ldlm_enqueue to node 10.25.22.90@tcp failed: rc = -107
Apr 30 14:03:07 golf1 kernel: Lustre: golf-MDT0000-mdc-ffff973bf6409800: Connection to golf-MDT0000 (at 10.25.22.90@tcp) was lost; in progress operations using this service will wait for recovery to complete
Apr 30 14:03:07 golf1 kernel: LustreError: Skipped 8 previous similar messages
Apr 30 14:03:07 golf1 kernel: LustreError: 167-0: golf-MDT0000-mdc-ffff973bf6409800: This client was evicted by golf-MDT0000; in progress operations using this service will fail.
Apr 30 14:03:07 golf1 kernel: LustreError: 25491:0:(file.c:4339:ll_inode_revalidate_fini()) golf: revalidate FID [0x20004884e:0x16:0x0] error: rc = -5
Apr 30 14:03:07 golf1 kernel: ------------[ cut here ]------------
Apr 30 14:03:07 golf1 kernel: WARNING: CPU: 26 PID: 25600 at fs/nfsd/nfsproc.c:805 nfserrno+0x58/0x70 [nfsd]
Apr 30 14:03:07 golf1 kernel: LustreError: 25579:0:(file.c:216:ll_close_inode_openhandle()) golf-clilmv-ffff973bf6409800: inode [0x20004884e:0x15:0x0] mdc close failed: rc = -108
Apr 30 14:03:07 golf1 kernel: ------------[ cut here ]------------
Apr 30 14:03:07 golf1 kernel: ------------[ cut here ]------------
Apr 30 14:03:07 golf1 kernel: nfsd: non-standard errno: -108
Apr 30 14:03:07 golf1 kernel: ------------[ cut here ]------------
Apr 30 14:03:07 golf1 kernel: WARNING: CPU: 54 PID: 25602 at fs/nfsd/nfsproc.c:805 nfserrno+0x58/0x70 [nfsd]
Apr 30 14:03:07 golf1 kernel: LustreError: 25579:0:(file.c:216:ll_close_inode_openhandle()) Skipped 2 previous similar messages
Apr 30 14:03:07 golf1 kernel: WARNING: CPU: 24 PID: 25601 at fs/nfsd/nfsproc.c:805 nfserrno+0x58/0x70 [nfsd]
Apr 30 14:03:07 golf1 kernel: WARNING: CPU: 9 PID: 25505 at fs/nfsd/nfsproc.c:805 nfserrno+0x58/0x70 [nfsd]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="59006">LU-13500</key>
            <summary>Client gets evicted - nfsd non-standard errorno -108</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="green">Oleg Drokin</assignee>
                                    <reporter username="cmcl">Campbell Mcleay</reporter>
                        <labels>
                    </labels>
                <created>Fri, 1 May 2020 13:19:43 +0000</created>
                <updated>Wed, 3 Nov 2021 17:43:32 +0000</updated>
                                                                                <due></due>
                            <votes>0</votes>
                                    <watches>8</watches>
                                                                            <comments>
                            <comment id="269125" author="green" created="Fri, 1 May 2020 19:25:05 +0000"  >&lt;p&gt;This only tells us that the client reexporting Lustre as NFS was evicted, but not why. Can you please provide MDT side log that lists the reason for eviction?&lt;/p&gt;</comment>
                            <comment id="269160" author="cmcl" created="Sat, 2 May 2020 05:27:54 +0000"  >&lt;p&gt;Hi,&lt;br/&gt;
Following is logs from MDT server.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Apr 30 14:03:05 gmds1 kernel: LustreError: 14058:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1588235285, 300s ago); not entering recovery in server code, just going back to sleep ns: mdt-golf-MDT0000_UUID lock: ffffa0b090a560c0/0xa9eccc8009893f78 lrc: 3/1,0 mode: --/PR res: [0x20004845a:0x1c19e:0x0].0x0 bits 0x13/0x40 rrc: 33 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 14058 timeout: 0 lvb_type: 0
Apr 30 14:03:05 gmds1 kernel: LustreError: 14058:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 5 previous similar messages
Apr 30 14:03:07 gmds1 kernel: LustreError: 8523:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 149s: evicting client at 10.25.22.97@tcp  ns: mdt-golf-MDT0000_UUID lock: ffffa0a39e5d1680/0xa9eccc800d78ab44 lrc: 3/0,0 mode: CW/CW res: [0x20004884e:0x1c:0x0].0x0 bits 0x5/0x0 rrc: 20 type: IBT flags: 0x60200400000020 nid: 10.25.22.97@tcp remote: 0xda4d2a0ced4730c9 expref: 14511 pid: 8126 timeout: 261303 lvb_type: 0
Apr 30 14:03:07 gmds1 kernel: LustreError: 18657:0:(ldlm_lockd.c:1348:ldlm_handle_enqueue0()) ### lock on destroyed export ffffa0bb7a50fc00 ns: mdt-golf-MDT0000_UUID lock: ffffa07d7513c380/0xa9eccc800d78c204 lrc: 3/0,0 mode: PR/PR res: [0x20004884e:0x16:0x0].0x0 bits 0x13/0x0 rrc: 10 type: IBT flags: 0x50200000000000 nid: 10.25.22.97@tcp remote: 0xda4d2a0ced4734d5 expref: 14509 pid: 18657 timeout: 0 lvb_type: 0
Apr 30 14:03:07 gmds1 kernel: Lustre: golf-MDT0000: Connection restored to  (at 10.25.22.97@tcp)
Apr 30 14:03:07 gmds1 kernel: Lustre: Skipped 1 previous similar message
Apr 30 14:03:07 gmds1 kernel: LustreError: 18657:0:(ldlm_lockd.c:1348:ldlm_handle_enqueue0()) Skipped 31 previous similar messages
Apr 30 14:03:08 gmds1 kernel: LustreError: 17659:0:(ldlm_lockd.c:1348:ldlm_handle_enqueue0()) ### lock on destroyed export ffffa0a3eb454800 ns: mdt-golf-MDT0000_UUID lock: ffffa093c8c60240/0xa9eccc7eb0bd313c lrc: 3/0,0 mode: CW/CW res: [0x20004845a:0x1c19e:0x0].0x0 bits 0xd/0x0 rrc: 34 type: IBT flags: 0x50200400000020 nid: 10.25.22.97@tcp remote: 0x6cc2fba1e568f302 expref: 9 pid: 17659 timeout: 0 lvb_type: 0
Apr 30 14:03:08 gmds1 kernel: LustreError: 17659:0:(ldlm_lockd.c:1348:ldlm_handle_enqueue0()) Skipped 6 previous similar messages
Apr 30 14:03:08 gmds1 kernel: Lustre: 17659:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (326:10967s); client may timeout.  req@ffffa0a10ad09b00 x1665373519819424/t1069384853821(0) o101-&amp;gt;62a78c6c-22f2-ded7-adb4-f014db8f51f3@10.25.22.97@tcp:581/0 lens 648/560 e 5 to 0 dl 1588224621 ref 1 fl Complete:/0/0 rc -107/-107
Apr 30 14:03:08 gmds1 kernel: Lustre: 17659:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 21 previous similar messages
Apr 30 14:03:08 gmds1 kernel: LNet: Service thread pid 17659 completed after 11292.30s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources).
Apr 30 14:03:08 gmds1 kernel: LNet: Skipped 22 previous similar messages
Apr 30 14:05:38 gmds1 kernel: LustreError: 18656:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1588235438, 300s ago); not entering recovery in server code, just going back to sleep ns: mdt-golf-MDT0000_UUID lock: ffffa08c5d0186c0/0xa9eccc800d787abb lrc: 3/0,1 mode: --/CW res: [0x20004845a:0x1c19e:0x0].0x0 bits 0x5/0x8 rrc: 33 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 18656 timeout: 0 lvb_type: 0
Apr 30 14:05:38 gmds1 kernel: LustreError: 18656:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 1 previous similar message
Apr 30 14:05:38 gmds1 kernel: LustreError: 8523:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 150s: evicting client at 10.25.22.97@tcp  ns: mdt-golf-MDT0000_UUID lock: ffffa07bf5846e40/0xa9eccc801136d991 lrc: 3/0,0 mode: CW/CW res: [0x200048755:0xb4ea:0x0].0x0 bits 0xd/0x0 rrc: 5 type: IBT flags: 0x60200400000020 nid: 10.25.22.97@tcp remote: 0xda4d2a0ced475b16 expref: 65 pid: 17361 timeout: 261454 lvb_type: 0
Apr 30 14:05:38 gmds1 kernel: LustreError: 17658:0:(ldlm_lockd.c:1348:ldlm_handle_enqueue0()) ### lock on destroyed export ffffa0ad91ae8400 ns: mdt-golf-MDT0000_UUID lock: ffffa08429e50900/0xa9eccc801136dae8 lrc: 3/0,0 mode: PR/PR res: [0x20004884e:0x1b:0x0].0x0 bits 0x13/0x0 rrc: 11 type: IBT flags: 0x50200400000020 nid: 10.25.22.97@tcp remote: 0xda4d2a0ced475bbe expref: 59 pid: 17658 timeout: 0 lvb_type: 0
Apr 30 14:05:38 gmds1 kernel: Lustre: 15351:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (600:5657s); client may timeout.  req@ffffa0a0d9c6ba80 x1665320672855920/t0(0) o101-&amp;gt;2550d87b-664b-5ce0-ccf3-fa246e43162f@10.25.22.98@tcp:1/0 lens 584/592 e 24 to 0 dl 1588230081 ref 1 fl Complete:/0/0 rc 0/0
Apr 30 14:05:38 gmds1 kernel: Lustre: 15351:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 2 previous similar messages
Apr 30 14:05:38 gmds1 kernel: LNet: Service thread pid 15351 completed after 6257.04s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources).
Apr 30 14:05:38 gmds1 kernel: LNet: Skipped 2 previous similar messages
Apr 30 14:05:38 gmds1 kernel: LustreError: 17658:0:(ldlm_lockd.c:1348:ldlm_handle_enqueue0()) Skipped 18 previous similar messages
Apr 30 14:07:18 gmds1 kernel: LustreError: 8523:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 100s: evicting client at 10.25.22.98@tcp  ns: mdt-golf-MDT0000_UUID lock: ffffa08f3ed36e40/0xa9eccc7f66124799 lrc: 3/0,0 mode: PR/PR res: [0x20004845a:0x1c19e:0x0].0x0 bits 0x13/0x0 rrc: 33 type: IBT flags: 0x60200400000020 nid: 10.25.22.98@tcp remote: 0xc9202f42b18ed159 expref: 726586 pid: 15351 timeout: 261554 lvb_type: 0
Apr 30 14:07:28 gmds1 kernel: LustreError: 23173:0:(ldlm_lockd.c:1348:ldlm_handle_enqueue0()) ### lock on destroyed export ffffa0b019f7d800 ns: mdt-golf-MDT0000_UUID lock: ffffa08eefbb8900/0xa9eccc7ff25f4466 lrc: 3/0,0 mode: CW/CW res: [0x20004845a:0x1c19e:0x0].0x0 bits 0xd/0x0 rrc: 29 type: IBT flags: 0x50200400000020 nid: 10.25.22.97@tcp remote: 0x1b7af78dd9b57628 expref: 21 pid: 23173 timeout: 0 lvb_type: 0
Apr 30 14:07:28 gmds1 kernel: LustreError: 23173:0:(ldlm_lockd.c:1348:ldlm_handle_enqueue0()) Skipped 7 previous similar messages
Apr 30 14:07:28 gmds1 kernel: Lustre: 10894:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (716:710s); client may timeout.  req@ffffa0ac5d72f080 x1665374426896992/t1069387277830(0) o101-&amp;gt;3f4f5fc2-0361-6277-7d3a-55b6e0508d3a@10.25.22.97@tcp:528/0 lens 648/560 e 0 to 0 dl 1588235138 ref 1 fl Complete:/0/0 rc -107/-107
Apr 30 14:07:28 gmds1 kernel: Lustre: 10894:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 4 previous similar messages
Apr 30 14:07:28 gmds1 kernel: LNet: Service thread pid 9608 completed after 1426.69s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources).
Apr 30 14:07:28 gmds1 kernel: LNet: Skipped 3 previous similar messages
Apr 30 14:08:07 gmds1 kernel: LustreError: 13771:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1588235587, 300s ago); not entering recovery in server code, just going back to sleep ns: mdt-golf-MDT0000_UUID lock: ffffa09aebe38b40/0xa9eccc801131c48d lrc: 3/0,1 mode: --/CW res: [0x20004845a:0x1c19e:0x0].0x0 bits 0x5/0x8 rrc: 18 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 13771 timeout: 0 lvb_type: 0
Apr 30 14:08:07 gmds1 kernel: LustreError: 13771:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 1 previous similar message
Apr 30 14:08:08 gmds1 kernel: LustreError: 8523:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 149s: evicting client at 10.25.22.97@tcp  ns: mdt-golf-MDT0000_UUID lock: ffffa07bf5845a00/0xa9eccc801551d004 lrc: 3/0,0 mode: CW/CW res: [0x20004884e:0x18:0x0].0x0 bits 0x5/0x0 rrc: 6 type: IBT flags: 0x60200400000020 nid: 10.25.22.97@tcp remote: 0xda4d2a0ced477f58 expref: 59 pid: 8117 timeout: 261604 lvb_type: 0
Apr 30 14:08:08 gmds1 kernel: LustreError: 17661:0:(ldlm_lockd.c:1348:ldlm_handle_enqueue0()) ### lock on destroyed export ffffa0b0ee0c0400 ns: mdt-golf-MDT0000_UUID lock: ffffa099568e2640/0xa9eccc801551cf24 lrc: 1/0,0 mode: --/PR res: [0x200048755:0xb4ea:0x0].0x0 bits 0x13/0x0 rrc: 6 type: IBT flags: 0x54a01400000020 nid: 10.25.22.97@tcp remote: 0xda4d2a0ced477f3c expref: 53 pid: 17661 timeout: 0 lvb_type: 0
Apr 30 14:08:08 gmds1 kernel: Lustre: golf-MDT0000: Connection restored to  (at 10.25.22.97@tcp)
Apr 30 14:08:08 gmds1 kernel: Lustre: Skipped 2 previous similar messages
Apr 30 14:08:08 gmds1 kernel: LustreError: 17661:0:(ldlm_lockd.c:1348:ldlm_handle_enqueue0()) Skipped 9 previous similar messages
Apr 30 14:09:06 gmds1 dhclient[7002]: DHCPREQUEST on bond0 to 10.25.20.10 port 67 (xid=0x14eb9def)
Apr 30 14:09:06 gmds1 dhclient[7002]: DHCPACK from 10.25.20.10 (xid=0x14eb9def)
Apr 30 14:09:08 gmds1 dhclient[7002]: bound to 10.25.22.90 -- renewal in 18565 seconds.
Apr 30 14:10:01 gmds1 systemd: Created slice User Slice of root.
Apr 30 14:10:01 gmds1 systemd: Started Session 974 of user root.
Apr 30 14:10:01 gmds1 systemd: Removed slice User Slice of root.
Apr 30 14:13:08 gmds1 kernel: LustreError: 18651:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1588235888, 300s ago); not entering recovery in server code, just going back to sleep ns: mdt-golf-MDT0000_UUID lock: ffffa0a3d52d8900/0xa9eccc80189ae7ef lrc: 3/1,0 mode: --/PR res: [0x20004884e:0x15:0x0].0x0 bits 0x13/0x0 rrc: 32 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 18651 timeout: 0 lvb_type: 0
Apr 30 14:13:08 gmds1 kernel: LustreError: 18651:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 30 previous similar messages
Apr 30 14:13:21 gmds1 kernel: LNet: Service thread pid 17661 was inactive for 312.41s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging
 purposes:
Apr 30 14:13:21 gmds1 kernel: LNet: Skipped 4 previous similar messages
Apr 30 14:13:21 gmds1 kernel: Pid: 17661, comm: mdt01_070 3.10.0-1062.1.1.el7_lustre.x86_64 #1 SMP Mon Oct 21 20:19:09 UTC 2019
Apr 30 14:13:21 gmds1 kernel: Call Trace:
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc154ab96&amp;gt;] ldlm_completion_ast+0x4e6/0x860 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc154b601&amp;gt;] ldlm_cli_enqueue_local+0x231/0x830 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc137c50b&amp;gt;] mdt_object_local_lock+0x50b/0xb20 [mdt]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc137cb90&amp;gt;] mdt_object_lock_internal+0x70/0x360 [mdt]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc137cf37&amp;gt;] mdt_object_lock_try+0x27/0xb0 [mdt]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc137e677&amp;gt;] mdt_getattr_name_lock+0x1287/0x1c30 [mdt]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc1385d25&amp;gt;] mdt_intent_getattr+0x2b5/0x480 [mdt]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc1382bb5&amp;gt;] mdt_intent_policy+0x435/0xd80 [mdt]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc1531d56&amp;gt;] ldlm_lock_enqueue+0x356/0xa20 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc155a366&amp;gt;] ldlm_handle_enqueue0+0xa56/0x15f0 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc15e2b02&amp;gt;] tgt_enqueue+0x62/0x210 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc15e92ea&amp;gt;] tgt_request_handle+0xaea/0x1580 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc158e29b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc1591bfc&amp;gt;] ptlrpc_main+0xb2c/0x1460 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffb98c50d1&amp;gt;] kthread+0xd1/0xe0
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffb9f8cd37&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
Apr 30 14:13:21 gmds1 kernel: LustreError: dumping log to /tmp/lustre-log.1588236201.17661
Apr 30 14:13:21 gmds1 kernel: Pid: 9332, comm: mdt01_005 3.10.0-1062.1.1.el7_lustre.x86_64 #1 SMP Mon Oct 21 20:19:09 UTC 2019
Apr 30 14:13:21 gmds1 kernel: Call Trace:
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc154ab96&amp;gt;] ldlm_completion_ast+0x4e6/0x860 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc154b601&amp;gt;] ldlm_cli_enqueue_local+0x231/0x830 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc137c50b&amp;gt;] mdt_object_local_lock+0x50b/0xb20 [mdt]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc137cb90&amp;gt;] mdt_object_lock_internal+0x70/0x360 [mdt]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc137cf37&amp;gt;] mdt_object_lock_try+0x27/0xb0 [mdt]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc137e677&amp;gt;] mdt_getattr_name_lock+0x1287/0x1c30 [mdt]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc1385d25&amp;gt;] mdt_intent_getattr+0x2b5/0x480 [mdt]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc1382bb5&amp;gt;] mdt_intent_policy+0x435/0xd80 [mdt]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc1531d56&amp;gt;] ldlm_lock_enqueue+0x356/0xa20 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc155a366&amp;gt;] ldlm_handle_enqueue0+0xa56/0x15f0 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc15e2b02&amp;gt;] tgt_enqueue+0x62/0x210 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc15e92ea&amp;gt;] tgt_request_handle+0xaea/0x1580 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc158e29b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc1591bfc&amp;gt;] ptlrpc_main+0xb2c/0x1460 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffb98c50d1&amp;gt;] kthread+0xd1/0xe0
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffb9f8cd37&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
Apr 30 14:13:21 gmds1 kernel: Pid: 56713, comm: mdt01_034 3.10.0-1062.1.1.el7_lustre.x86_64 #1 SMP Mon Oct 21 20:19:09 UTC 2019
Apr 30 14:13:21 gmds1 kernel: Call Trace:
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc154ab96&amp;gt;] ldlm_completion_ast+0x4e6/0x860 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc154b601&amp;gt;] ldlm_cli_enqueue_local+0x231/0x830 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc137c50b&amp;gt;] mdt_object_local_lock+0x50b/0xb20 [mdt]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc137cb90&amp;gt;] mdt_object_lock_internal+0x70/0x360 [mdt]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc137cf37&amp;gt;] mdt_object_lock_try+0x27/0xb0 [mdt]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc137e677&amp;gt;] mdt_getattr_name_lock+0x1287/0x1c30 [mdt]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc1385d25&amp;gt;] mdt_intent_getattr+0x2b5/0x480 [mdt]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc1382bb5&amp;gt;] mdt_intent_policy+0x435/0xd80 [mdt]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc1531d56&amp;gt;] ldlm_lock_enqueue+0x356/0xa20 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc155a366&amp;gt;] ldlm_handle_enqueue0+0xa56/0x15f0 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc15e2b02&amp;gt;] tgt_enqueue+0x62/0x210 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc15e92ea&amp;gt;] tgt_request_handle+0xaea/0x1580 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc158e29b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc1591bfc&amp;gt;] ptlrpc_main+0xb2c/0x1460 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffb98c50d1&amp;gt;] kthread+0xd1/0xe0
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffb9f8cd37&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
Apr 30 14:13:21 gmds1 kernel: Pid: 17361, comm: mdt01_055 3.10.0-1062.1.1.el7_lustre.x86_64 #1 SMP Mon Oct 21 20:19:09 UTC 2019
Apr 30 14:13:21 gmds1 kernel: Call Trace:
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc154ab96&amp;gt;] ldlm_completion_ast+0x4e6/0x860 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc154b601&amp;gt;] ldlm_cli_enqueue_local+0x231/0x830 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc137c50b&amp;gt;] mdt_object_local_lock+0x50b/0xb20 [mdt]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc137cb90&amp;gt;] mdt_object_lock_internal+0x70/0x360 [mdt]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc137cf37&amp;gt;] mdt_object_lock_try+0x27/0xb0 [mdt]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc137e677&amp;gt;] mdt_getattr_name_lock+0x1287/0x1c30 [mdt]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc1385d25&amp;gt;] mdt_intent_getattr+0x2b5/0x480 [mdt]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc1382bb5&amp;gt;] mdt_intent_policy+0x435/0xd80 [mdt]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc1531d56&amp;gt;] ldlm_lock_enqueue+0x356/0xa20 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc155a366&amp;gt;] ldlm_handle_enqueue0+0xa56/0x15f0 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc15e2b02&amp;gt;] tgt_enqueue+0x62/0x210 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc15e92ea&amp;gt;] tgt_request_handle+0xaea/0x1580 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc158e29b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc1591bfc&amp;gt;] ptlrpc_main+0xb2c/0x1460 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffb98c50d1&amp;gt;] kthread+0xd1/0xe0
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffb9f8cd37&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
Apr 30 14:13:21 gmds1 kernel: Pid: 14058, comm: mdt01_048 3.10.0-1062.1.1.el7_lustre.x86_64 #1 SMP Mon Oct 21 20:19:09 UTC 2019
Apr 30 14:13:21 gmds1 kernel: Call Trace:
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc154ab96&amp;gt;] ldlm_completion_ast+0x4e6/0x860 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc154b601&amp;gt;] ldlm_cli_enqueue_local+0x231/0x830 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc137c50b&amp;gt;] mdt_object_local_lock+0x50b/0xb20 [mdt]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc137cb90&amp;gt;] mdt_object_lock_internal+0x70/0x360 [mdt]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc137cf37&amp;gt;] mdt_object_lock_try+0x27/0xb0 [mdt]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc137e677&amp;gt;] mdt_getattr_name_lock+0x1287/0x1c30 [mdt]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc1385d25&amp;gt;] mdt_intent_getattr+0x2b5/0x480 [mdt]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc1382bb5&amp;gt;] mdt_intent_policy+0x435/0xd80 [mdt]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc1531d56&amp;gt;] ldlm_lock_enqueue+0x356/0xa20 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc155a366&amp;gt;] ldlm_handle_enqueue0+0xa56/0x15f0 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc15e2b02&amp;gt;] tgt_enqueue+0x62/0x210 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc15e92ea&amp;gt;] tgt_request_handle+0xaea/0x1580 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc158e29b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffc1591bfc&amp;gt;] ptlrpc_main+0xb2c/0x1460 [ptlrpc]
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffb98c50d1&amp;gt;] kthread+0xd1/0xe0
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffb9f8cd37&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
Apr 30 14:13:21 gmds1 kernel: [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
Apr 30 14:13:21 gmds1 kernel: LNet: Service thread pid 18647 was inactive for 312.41s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one.
Apr 30 14:13:21 gmds1 kernel: LNet: Skipped 1 previous similar message
Apr 30 14:18:03 gmds1 kernel: Lustre: 17829:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn&apos;t add any time (5/5), not sending early reply#012  req@ffffa0b122e29f80 x1665385136351328/t0(0) o101-&amp;gt;872face3-fe73-b814-5f50-65108dca215d@10.25.22.97@tcp:368/0 lens 576/3264 e 16 to 0 dl 1588236488 ref 2 fl Interpret:/0/0 rc 0/0
Apr 30 14:18:03 gmds1 kernel: Lustre: 17829:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 6 previous similar messages
Apr 30 14:18:09 gmds1 kernel: Lustre: golf-MDT0000: Client 872face3-fe73-b814-5f50-65108dca215d (at 10.25.22.97@tcp) reconnecting
Apr 30 14:18:09 gmds1 kernel: Lustre: golf-MDT0000: Connection restored to  (at 10.25.22.97@tcp)
Apr 30 14:18:18 gmds1 kernel: LNet: Service thread pid 13997 completed after 609.69s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources).
Apr 30 14:18:18 gmds1 kernel: LNet: Skipped 7 previous similar messages

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="269199" author="green" created="Mon, 4 May 2020 04:01:32 +0000"  >&lt;p&gt;Thank you, hm, this looks like a lock timeout.&lt;/p&gt;

&lt;p&gt;Can you please collect lustre debug logs on the MDS and nfs-reexporting client when this is happening?&lt;/p&gt;

&lt;p&gt;Stop all other activity, then run this on both nodes:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lctl set_param debug=-1
lctl set_param debug_mb=10240 # this requires 10G of RAM, if you don&apos;t have that much - reduce this number some
lctl clear
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;then start the nfsd on the client and wait until eviction, then run this right away on both nodes:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lctl dk &amp;gt;/tmp/lustre.log
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;collect the lustre.log from both nodes and make them available for download somehow.&lt;/p&gt;</comment>
                            <comment id="269202" author="cmcl" created="Mon, 4 May 2020 08:31:13 +0000"  >&lt;p&gt;Hi,&lt;/p&gt;

&lt;p&gt;I have collected logs and uploaded to our ftp server.&lt;br/&gt;
Could you please connect to our ftp.dneg.com server using filezilla and download logs&lt;/p&gt;

&lt;p&gt;Host= ftp.dneg.com&lt;br/&gt;
Port = 2222&lt;br/&gt;
Server Type = SFTP-SSH File Transfer Protocol&lt;br/&gt;
Logon Type = Normal&lt;br/&gt;
Login Name = lustre_log&lt;br/&gt;
Password = L@8Ki07g&lt;/p&gt;

&lt;p&gt;Once connected go into From_Dneg folder.&lt;/p&gt;

&lt;p&gt;Please let us know if you are able to download logs from ftp.dneg.com ftp server .&lt;/p&gt;
</comment>
                            <comment id="269314" author="green" created="Tue, 5 May 2020 15:53:10 +0000"  >&lt;p&gt;I am (somewhat slowly) getting these files so access is ok, thanks.&lt;br/&gt;
Can you please include server side mds log excerpt that lists the evicted log handle?&lt;/p&gt;</comment>
                            <comment id="269393" author="cmcl" created="Wed, 6 May 2020 07:46:05 +0000"  >&lt;p&gt;Thanks Oleg for looking into it. &lt;/p&gt;

&lt;p&gt;I checked the mds and client logs for 4th May, it didn&apos;t evict the client at that time even if mount was not accessible. &lt;/p&gt;

&lt;p&gt;Have reproduced this issue again today when client got evicted. Today&apos;s logs are uploaded again on ftp under /from_dneg/06May2020/&lt;/p&gt;

&lt;p&gt;mds logs evicting client&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;May  6 11:52:48 gmds1 kernel: LNet: Service thread pid 17655 was inactive for 200.31s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debug
ging purposes:
May  6 11:52:48 gmds1 kernel: LNet: Skipped 3 previous similar messages
May  6 11:52:48 gmds1 kernel: Pid: 17655, comm: mdt01_064 3.10.0-1062.1.1.el7_lustre.x86_64 #1 SMP Mon Oct 21 20:19:09 UTC 2019
May  6 11:52:48 gmds1 kernel: Call Trace:
May  6 11:52:48 gmds1 kernel: [&amp;lt;ffffffffc154aae0&amp;gt;] ldlm_completion_ast+0x430/0x860 [ptlrpc]
May  6 11:52:48 gmds1 kernel: [&amp;lt;ffffffffc154b601&amp;gt;] ldlm_cli_enqueue_local+0x231/0x830 [ptlrpc]
May  6 11:52:48 gmds1 kernel: [&amp;lt;ffffffffc137c50b&amp;gt;] mdt_object_local_lock+0x50b/0xb20 [mdt]
May  6 11:52:48 gmds1 kernel: [&amp;lt;ffffffffc137cb90&amp;gt;] mdt_object_lock_internal+0x70/0x360 [mdt]
May  6 11:52:48 gmds1 kernel: [&amp;lt;ffffffffc137cf37&amp;gt;] mdt_object_lock_try+0x27/0xb0 [mdt]
May  6 11:52:48 gmds1 kernel: [&amp;lt;ffffffffc13a3128&amp;gt;] mdt_object_open_lock+0x688/0xac0 [mdt]
May  6 11:52:48 gmds1 kernel: [&amp;lt;ffffffffc13a7967&amp;gt;] mdt_reint_open+0x15b7/0x3240 [mdt]
May  6 11:52:48 gmds1 kernel: [&amp;lt;ffffffffc139b693&amp;gt;] mdt_reint_rec+0x83/0x210 [mdt]
May  6 11:52:48 gmds1 kernel: [&amp;lt;ffffffffc13781b3&amp;gt;] mdt_reint_internal+0x6e3/0xaf0 [mdt]
May  6 11:52:48 gmds1 kernel: [&amp;lt;ffffffffc1384a92&amp;gt;] mdt_intent_open+0x82/0x3a0 [mdt]
May  6 11:52:48 gmds1 kernel: [&amp;lt;ffffffffc1382bb5&amp;gt;] mdt_intent_policy+0x435/0xd80 [mdt]
May  6 11:52:48 gmds1 kernel: [&amp;lt;ffffffffc1531d56&amp;gt;] ldlm_lock_enqueue+0x356/0xa20 [ptlrpc]
May  6 11:52:48 gmds1 kernel: [&amp;lt;ffffffffc155a366&amp;gt;] ldlm_handle_enqueue0+0xa56/0x15f0 [ptlrpc]
May  6 11:52:48 gmds1 kernel: [&amp;lt;ffffffffc15e2b02&amp;gt;] tgt_enqueue+0x62/0x210 [ptlrpc]
May  6 11:52:48 gmds1 kernel: [&amp;lt;ffffffffc15e92ea&amp;gt;] tgt_request_handle+0xaea/0x1580 [ptlrpc]
May  6 11:52:48 gmds1 kernel: [&amp;lt;ffffffffc158e29b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
May  6 11:52:48 gmds1 kernel: [&amp;lt;ffffffffc1591bfc&amp;gt;] ptlrpc_main+0xb2c/0x1460 [ptlrpc]
May  6 11:52:48 gmds1 kernel: [&amp;lt;ffffffffb98c50d1&amp;gt;] kthread+0xd1/0xe0
May  6 11:52:48 gmds1 kernel: [&amp;lt;ffffffffb9f8cd37&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
May  6 11:52:48 gmds1 kernel: [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
May  6 11:52:48 gmds1 kernel: LustreError: dumping log to /tmp/lustre-log.1588746168.17655
May  6 11:52:53 gmds1 su: (to root) vch on pts/3
May  6 11:53:07 gmds1 kernel: LustreError: 8523:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 99s: evicting client at 10.25.22.97@tcp  ns: mdt-golf-MDT0000_UUID lock: ffffa0afb7a4b600/0xa9ecccc11a19b5be lrc: 3/0,0 mode: CW/CW res: [0x20004884e:0x1a:0x0].0x0 bits 0xd/0x0 rrc: 10 type: IBT flags: 0x60200400000020 nid: 10.25.22.97@tcp remote: 0x608b1ed02e6a5ec5 expref: 6373 pid: 45464 timeout: 771903 lvb_type: 0
May  6 11:53:11 gmds1 kernel: LustreError: 9940:0:(ldlm_lockd.c:1348:ldlm_handle_enqueue0()) ### lock on destroyed export ffffa09ee05db000 ns: mdt-golf-MDT0000_UUID lock: ffffa0aa636f1b00/0xa9ecccc11c5e9407 lrc: 3/0,0 mode: PR/PR res: [0x20004884e:0x15:0x0].0x0 bits 0x13/0x0 rrc: 4 type: IBT flags: 0x50200000000000 nid: 10.25.22.97@tcp remote: 0x608b1ed02e6a6149 expref: 6372 pid: 9940 timeout: 0 lvb_type: 0
May  6 11:53:11 gmds1 kernel: Lustre: golf-MDT0000: Connection restored to  (at 10.25.22.97@tcp)
May  6 11:53:15 gmds1 kernel: Lustre: Skipped 1 previous similar message
May  6 11:53:15 gmds1 kernel: LustreError: 9940:0:(ldlm_lockd.c:1348:ldlm_handle_enqueue0()) Skipped 5 previous similar messages
May  6 11:53:15 gmds1 kernel: Lustre: 8191:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (2402:79721s); client may timeout.  req@ffffa0baae312880 x1665743648877760/t0(0) o101-&amp;gt;0da7ba62-284f-8d7c-7923-989bf54c33d1@10.25.22.97@tcp:459/0 lens 576/592 e 15 to 0 dl 1588666466 ref 1 fl Complete:/0/0 rc -107/-107
May  6 11:53:15 gmds1 kernel: Lustre: 8191:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 4 previous similar messages
May  6 11:53:15 gmds1 kernel: LNet: Service thread pid 8191 completed after 82123.35s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources).
May  6 11:53:15 gmds1 kernel: LNet: Skipped 3 previous similar messages
May  6 11:53:15 gmds1 kernel: LustreError: 9940:0:(ldlm_lockd.c:1348:ldlm_handle_enqueue0()) Skipped 5 previous similar messages
May  6 11:53:15 gmds1 kernel: Lustre: 8191:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (2402:79721s); client may timeout.  req@ffffa0baae312880 x1665743648877760/t0(0) o101-&amp;gt;0da7ba62-284f-8d7c-7923-989bf54c33d1@10.25.22.97@tcp:459/0 lens 576/592 e 15 to 0 dl 1588666466 ref 1 fl Complete:/0/0 rc -107/-107
May  6 11:53:15 gmds1 kernel: Lustre: 8191:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 4 previous similar messages
May  6 11:53:15 gmds1 kernel: LNet: Service thread pid 8191 completed after 82123.35s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources).
May  6 11:53:15 gmds1 kernel: LNet: Skipped 3 previous similar messages
May  6 11:54:49 gmds1 kernel: LustreError: 8523:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 99s: evicting client at 10.25.22.97@tcp  ns: mdt-golf-MDT0000_UUID lock: ffffa0aac1395c40/0xa9ecccc11e3833ee lrc: 3/0,0 mode: CW/CW res: [0x20004884e:0x17:0x0].0x0 bits 0x5/0x0 rrc: 68 type: IBT flags: 0x60200400000020 nid: 10.25.22.97@tcp remote: 0x608b1ed02e6b8ad7 expref: 101 pid: 45266 timeout: 772005 lvb_type: 0
May  6 11:54:49 gmds1 kernel: LustreError: 46556:0:(ldlm_lockd.c:1348:ldlm_handle_enqueue0()) ### lock on destroyed export ffffa0abc5bbdc00 ns: mdt-golf-MDT0000_UUID lock: ffffa09e4ffa4a40/0xa9ecccc11e3834a4 lrc: 3/0,0 mode: PR/PR res: [0x20004884e:0x17:0x0].0x0 bits 0x1b/0x0 rrc: 64 type: IBT flags: 0x50200400000020 nid: 10.25.22.97@tcp remote: 0x608b1ed02e6b8af3 expref: 67 pid: 46556 timeout: 0 lvb_type: 0
May  6 11:54:49 gmds1 kernel: LustreError: 46556:0:(ldlm_lockd.c:1348:ldlm_handle_enqueue0()) Skipped 40 previous similar messages
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="269805" author="cmcl" created="Mon, 11 May 2020 06:15:14 +0000"  >&lt;p&gt;Any update with this please. &lt;/p&gt;</comment>
                            <comment id="269807" author="green" created="Mon, 11 May 2020 08:19:26 +0000"  >&lt;p&gt;I got the logs and they have some holes in them due to a lot of ongoing activity so the understanding is still limited.&lt;/p&gt;

&lt;p&gt;Basically what I pieced together is:&lt;br/&gt;
as the lock BL ast appears for an OPEN lock, we are sending a CLOSE RPC to the MDS. then there&apos;s a gap until server evicts us 100 seconds later:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000080:00000001:42.0:1588746088.342151:0:2834:0:(file.c:245:ll_md_real_close()) Process entered
00000080:00000001:42.0:1588746187.766589:0:2834:0:(file.c:264:ll_md_real_close()) Process leaving (rc=0 : 0 : 0)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;no interim state from this process got into the logs unfortunately. But on the server messages log there are messages of long processing of messages, so I wonder if the message is delayed somewhere?&lt;br/&gt;
Also you did not include the lustre-log.1588746168.17655 the second time around so all that state until the timeout hit is lost and the first mentioning of the lock in question is the eviction message.&lt;/p&gt;

&lt;p&gt;Also the messages about processing taking 80000+ seconds is highly unusual.&lt;/p&gt;

&lt;p&gt;Can you please describe what&apos;s the workload like on the fs? any non-nfs access in shared namespace with nfs access? what do the nfs clients roughly do?&lt;/p&gt;

&lt;p&gt;I&apos;d really like to get the full logs but with the sheer amount of activity going on that appears to be not very practical, does the nfs reexport node has local (and fast) storage by any chance so we can try running debug daemon there? Absent that&lt;/p&gt;

&lt;p&gt;can you rerun the reproducer but instead of setting debug to -1 set it to: &quot;inode super info warning other ha dlmtrace error emerg rpctrace vfstrace console&quot;.&lt;br/&gt;
If any of the /tmp/lustre.log files are generated, we need them too.&lt;br/&gt;
Additionally please compress them (xz is best it seems though is sort of slowish) as otherwise downloading is really slow from your location for me. Please include dmesg excerpt that shows the first eviction.&lt;/p&gt;

&lt;p&gt;Additionally I tried to reproduce this on my test system applying what I was able to gather from existing logs and it seems to be working fine (but I am on a current b2_12), would not be a bad idea if you are able to update to 2.12.4 for this as well, I see there are some potentially relevant changes (e.g. the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4398&quot; title=&quot;mdt_object_open_lock() may not flush conflicting handles&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4398&quot;&gt;&lt;del&gt;LU-4398&lt;/del&gt;&lt;/a&gt; patch) &lt;/p&gt;</comment>
                            <comment id="270183" author="cmcl" created="Thu, 14 May 2020 06:27:45 +0000"  >&lt;p&gt;lustre-log.1588746168.17655.xz is uploaded under same folder, the original lustre-log.1588746168.17655 is uploaded too. &lt;/p&gt;

&lt;p&gt;The namespace is accessed for data backups using rsyncs locally. Time machine backups over afp protocol. The namespace gets mounted on other server over nfs to copy data using rsync. &lt;/p&gt;

&lt;p&gt;In parallel we are exploring if we can get a downtime to upgrade it to 2.12.4. &lt;/p&gt;</comment>
                            <comment id="276793" author="cmcl" created="Thu, 6 Aug 2020 07:16:50 +0000"  >&lt;p&gt;We have upgraded golf1,2,gmds1,goss1 to goss6 to 2.12.4&lt;/p&gt;</comment>
                            <comment id="276806" author="pjones" created="Thu, 6 Aug 2020 13:08:35 +0000"  >&lt;p&gt;ok. Please let us know whether this issue is still impacting you.&lt;/p&gt;</comment>
                            <comment id="278384" author="cmcl" created="Mon, 31 Aug 2020 14:03:52 +0000"  >&lt;p&gt;We have upgraded hotel servers as well to the Lustre version 2.12.4 . But, We are facing the issues with evicting clients in the 2.12.4 version as well.The NFS reexport is enabled in the hotel1 only in which we are having issues with client evictions.&lt;/p&gt;

&lt;p&gt;We Have uploaded the logs to our ftp server from MDS and NFS client servers to check this client evictions and the associated NFS re-export issues. &lt;/p&gt;

&lt;p&gt;Log location : /from_dneg/31Aug2020&lt;/p&gt;

&lt;p&gt;Adding the details of the hotel servers&lt;/p&gt;

&lt;p&gt;client (hotel1) kernel version : 3.10.0-1062.9.1.el7_lustre.x86_64&lt;br/&gt;
client (hotel1) lustre version : lustre-2.12.4-1.el7.x86_64&lt;/p&gt;

&lt;p&gt;mds (hmds1) kernel version : 3.10.0-1062.9.1.el7_lustre.x86_64&lt;br/&gt;
mds (hmds1) lustre version : lustre-2.12.4-1.el7.x86_64&lt;/p&gt;

&lt;p&gt;oss (hoss1-hoss6) kernel version : 3.10.0-1062.9.1.el7_lustre.x86_64&lt;br/&gt;
oss (hoss1-hoss6) lustre version : lustre-2.12.4-1.el7.x86_64&lt;/p&gt;

&lt;p&gt;/etc/exports on hotel1 :&lt;br/&gt;
/user_data       10.27.0.0/16(fsid=123456789,rw,anonuid=0,insecure,no_subtree_check,insecure_locks,async)&lt;/p&gt;</comment>
                            <comment id="278649" author="cmcl" created="Wed, 2 Sep 2020 16:39:36 +0000"  >&lt;p&gt;Hello Team,&lt;/p&gt;

&lt;p&gt;Can we have an update on this ? Also please let us know if we need any additional info to check this further.&lt;/p&gt;</comment>
                            <comment id="278779" author="cmcl" created="Thu, 3 Sep 2020 18:09:29 +0000"  >&lt;p&gt;Really appreciate if we get an update on this at the earliest .&lt;/p&gt;</comment>
                            <comment id="278796" author="green" created="Thu, 3 Sep 2020 21:27:52 +0000"  >&lt;p&gt;It looks like your logs cover different regions.&lt;/p&gt;

&lt;p&gt;The dmesgs covering the eviction are at 17:00 but the messages at 17:07 to 17:10 so we only know there&apos;s an eviction but nothing to corellate back to the logs&lt;/p&gt;</comment>
                            <comment id="278936" author="cmcl" created="Sat, 5 Sep 2020 07:56:29 +0000"  >&lt;p&gt;Yes, the eviction happened at aug 29 17:00 IST . The dmesg and messages logs having the same time but not really sure why the Lustre logs don&apos;t cover the logs at 17:00 . The evictions keep happening. &lt;br/&gt;
I had generated the logs again and uploaded to the FTP server . Please have a look on these new evictions logs as well to investigate this further .&lt;/p&gt;

&lt;p&gt;Eviction time : Aug 31 at 19:23 and 19:35&lt;br/&gt;
Log location :  /from_dneg/05Sept2020&lt;/p&gt;</comment>
                            <comment id="278951" author="green" created="Sun, 6 Sep 2020 18:08:30 +0000"  >&lt;p&gt;Well, this still seems to be very disconnected between the two sets of logs. the debug logs cover from 1598883056.697070 to 1598883239.198867 (these are seconds since unix epoch), I am not exactly sure what your timezone is but it&apos;s XX:10:56 to XX:13:59 which seems to be far away from the eviction event.&lt;/p&gt;

&lt;p&gt;Did it take you a long time between the eviction and stopping the log? The log data overflows super fast so it&apos;s important to stop the lustre logging very quickly after the eviction was recorded.&lt;/p&gt;</comment>
                            <comment id="278975" author="cmcl" created="Mon, 7 Sep 2020 16:00:52 +0000"  >&lt;p&gt;Both hmds1 and hotel1 running with the same timezone (Asia/Calcutta (IST, +0530)).&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;hmds1 /tmp # timedatectl 
      Local time: Mon 2020-09-07 18:23:47 IST
  Universal time: Mon 2020-09-07 12:53:47 UTC
        RTC time: Mon 2020-09-07 12:53:48
       Time zone: Asia/Calcutta (IST, +0530)
     NTP enabled: yes
NTP synchronized: no
 RTC in local TZ: no
      DST active: n/a
hmds1 /tmp # 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&lt;span class=&quot;image-wrap&quot; style=&quot;&quot;&gt;&lt;a id=&quot;35918_thumb&quot; href=&quot;https://jira.whamcloud.com/secure/attachment/35918/35918_hmds1-timezone.png&quot; title=&quot;hmds1-timezone.png&quot; file-preview-type=&quot;image&quot; file-preview-id=&quot;35918&quot; file-preview-title=&quot;hmds1-timezone.png&quot;&gt;&lt;img src=&quot;https://jira.whamcloud.com/secure/thumbnail/35918/_thumb_35918.png&quot; style=&quot;border: 0px solid black&quot; role=&quot;presentation&quot;/&gt;&lt;/a&gt;&lt;/span&gt;&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;hotel1 /tmp # timedatectl 
      Local time: Mon 2020-09-07 18:24:16 IST
  Universal time: Mon 2020-09-07 12:54:16 UTC
        RTC time: Mon 2020-09-07 12:54:01
       Time zone: Asia/Calcutta (IST, +0530)
     NTP enabled: yes
NTP synchronized: no
 RTC in local TZ: no
      DST active: n/a
hotel1 /tmp # 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&lt;span class=&quot;image-wrap&quot; style=&quot;&quot;&gt;&lt;a id=&quot;35919_thumb&quot; href=&quot;https://jira.whamcloud.com/secure/attachment/35919/35919_hotel1-timezone.png&quot; title=&quot;hotel1-timezone.png&quot; file-preview-type=&quot;image&quot; file-preview-id=&quot;35919&quot; file-preview-title=&quot;hotel1-timezone.png&quot;&gt;&lt;img src=&quot;https://jira.whamcloud.com/secure/thumbnail/35919/_thumb_35919.png&quot; style=&quot;border: 0px solid black&quot; role=&quot;presentation&quot;/&gt;&lt;/a&gt;&lt;/span&gt; &lt;br/&gt;
 I have collected the lustre debug logs right after (approx 19:45 IST) the reported evictions which are at &quot;Mon Aug 31 19:23:03 2020&quot; and &quot;&quot;Mon Aug 31 19:35:55 2020&quot; .&#160; The debug logs writing finished at 19:50 IST (approx) on both hmds1 and hotel servers.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;hmds1 /tmp # ls -lrth lustre-hmds1-31082020.log 
-rw-rw-rw- 1 root root 9.6G Aug 31 19:50 lustre-hmds1-31082020.log
hmds1 /tmp # 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&lt;span class=&quot;image-wrap&quot; style=&quot;&quot;&gt;&lt;a id=&quot;35920_thumb&quot; href=&quot;https://jira.whamcloud.com/secure/attachment/35920/35920_hmds1-log-write.png&quot; title=&quot;hmds1-log-write.png&quot; file-preview-type=&quot;image&quot; file-preview-id=&quot;35920&quot; file-preview-title=&quot;hmds1-log-write.png&quot;&gt;&lt;img src=&quot;https://jira.whamcloud.com/secure/thumbnail/35920/_thumb_35920.png&quot; style=&quot;border: 0px solid black&quot; role=&quot;presentation&quot;/&gt;&lt;/a&gt;&lt;/span&gt;&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;hotel1 /tmp # ls -lrth lustre-hotel1-31082020.log 
-rw-rw-rw- 1 root root 9.6G Aug 31 19:51 lustre-hotel1-31082020.log
hotel1 /tmp # 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&lt;span class=&quot;image-wrap&quot; style=&quot;&quot;&gt;&lt;a id=&quot;35921_thumb&quot; href=&quot;https://jira.whamcloud.com/secure/attachment/35921/35921_hotel1-logs-write.png&quot; title=&quot;hotel1-logs-write.png&quot; file-preview-type=&quot;image&quot; file-preview-id=&quot;35921&quot; file-preview-title=&quot;hotel1-logs-write.png&quot;&gt;&lt;img src=&quot;https://jira.whamcloud.com/secure/thumbnail/35921/_thumb_35921.png&quot; style=&quot;border: 0px solid black&quot; role=&quot;presentation&quot;/&gt;&lt;/a&gt;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;As checked the lustre debug logs, there are some difference in the hmds1 lustre debug log write timings. hotel1 seems to be having the logs of eviction at 19:35 IST.&lt;/p&gt;

&lt;p&gt;I am not sure why hmds1 is not captured the logs of evictions.&#160; Why this difference is coming in hmds1 ? Is there any other settings ?&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;hmds1-lustre-debug logs  - Started at Monday, 31 August 2020 19:40:56.697 GMT+05:30 (1598883056.697070) and finished at Monday, 31 August 2020 19:43:59.198 GMT+05:30 (1598883239.198867)
hotel1-lustre-debug logs -  Started at Monday, 31 August 2020 19:26:45.925 GMT+05:30 (1598882205.925774) and finished at Monday, 31 August 2020 19:44:30.984 GMT+05:30 (1598883270.984189)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;The connection restorations details also available in the logs which are happening quickly but the evictions are making more trouble with the backup jobs.&lt;/p&gt;</comment>
                            <comment id="279198" author="cmcl" created="Thu, 10 Sep 2020 02:25:01 +0000"  >&lt;p&gt;Any update on this please  ?&lt;/p&gt;</comment>
                            <comment id="279853" author="cmcl" created="Thu, 17 Sep 2020 16:29:00 +0000"  >&lt;p&gt;Hello Team,&lt;/p&gt;

&lt;p&gt;Any news here ?&lt;/p&gt;</comment>
                            <comment id="280341" author="green" created="Wed, 23 Sep 2020 05:21:26 +0000"  >&lt;p&gt;Ok, so did not realize you had a fraction of hour timezone.&lt;/p&gt;

&lt;p&gt;I think everything checks out then. The debug logs at your debug_mb setting only allow ~3 minutes of detailed logs and you generated them 5 and 15 minutes away from the time eviction happened.&lt;/p&gt;

&lt;p&gt;It sounds like you might want to set dump_on_eviction to 1 on your clients to cause the client to dump current copy of the log the moment it realizes it&apos;s evicted and also dump_on_timeout on the server to do the same when it encounters lock timeout and evicts a client.&lt;/p&gt;</comment>
                            <comment id="280544" author="cmcl" created="Thu, 24 Sep 2020 20:51:54 +0000"  >&lt;p&gt;OK, We set these parameters on both MDS and client currently to the get the proper logs after evictions .&lt;/p&gt;

&lt;p&gt;On both client and MDS&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lctl set_param debug=-1
lctl set_param debug_mb=10240 
lctl clear
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;On Client &lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lctl set_param dump_on_eviction=1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;On MDS&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lctl set_param dump_on_timeout=1 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="280915" author="cmcl" created="Tue, 29 Sep 2020 00:34:23 +0000"  >&lt;p&gt;Have uploaded the log dump files to FTP server which is generated on MDS and client right after the eviction. Hope that would help to investigate more on it.&lt;/p&gt;

&lt;p&gt;Evictions happened at &quot;Sun Sep 27 22:57:23 2020 IST&quot;&lt;br/&gt;
Log Location: /from_dneg/29Sept2020&lt;br/&gt;
hmds1 (MDS) log file :lustre-log.1601227748.5992.xz&lt;br/&gt;
hotel (Client) log file : lustre-log.1601227749.1886.xz&lt;/p&gt;</comment>
                            <comment id="280989" author="cmcl" created="Tue, 29 Sep 2020 18:16:40 +0000"  >&lt;p&gt;We had eviction today as well  which was happened on  &lt;b&gt;Tue Sep 29 09:20:16 2020 IST&lt;/b&gt; and &lt;b&gt;Tue Sep 29 09:29:16 2020 IST&lt;/b&gt; , Uploaded the logs for the same as well to investigate further. &lt;/p&gt;

&lt;p&gt;Evictions at : &lt;b&gt;Tue Sep 29 09:20:16 2020 IST&lt;/b&gt; and &lt;b&gt;Tue Sep 29 09:29:16 2020 IST&lt;/b&gt;&lt;br/&gt;
Log Location: /from_dneg/30Sept2020&lt;/p&gt;</comment>
                            <comment id="281516" author="cmcl" created="Tue, 6 Oct 2020 08:39:45 +0000"  >&lt;p&gt;Any updates here please ? &lt;/p&gt;</comment>
                            <comment id="281612" author="green" created="Wed, 7 Oct 2020 04:57:48 +0000"  >&lt;p&gt;it seems you have way too much stuff going on your nodes even for 10G of ram logs, so I guess we really need to reduce debug level and hope it all fits in and the debug level catches all the interesting bits.&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
lctl set_param debug=&lt;span class=&quot;code-quote&quot;&gt;&quot;+rpctrace +dlmtrace +neterror +net&quot;&lt;/span&gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;please replace the debug=-1 with the above and otherwise repeat the exercise and upload the logs.&lt;/p&gt;

&lt;p&gt;Sorry it&apos;s taking this long with the iterative process.&lt;/p&gt;</comment>
                            <comment id="281613" author="cmcl" created="Wed, 7 Oct 2020 05:14:45 +0000"  >&lt;p&gt;OK,  Do I need to stop all the IO/NFS connections on it to apply  this change or apply it directly ? &lt;/p&gt;

&lt;p&gt;So this will be the final config on both MDS and client or any other changes ?  Please confirm &lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lctl set_param debug=&quot;+rpctrace +dlmtrace +neterror +net&quot;
lctl set_param debug_mb=10240 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="281614" author="green" created="Wed, 7 Oct 2020 05:22:31 +0000"  >&lt;p&gt;you can apply it directly.&lt;/p&gt;

&lt;p&gt;basically you run it like before, just changing the lctl set_param debug=-1 line to this one, but the rest of it stays the same&lt;/p&gt;</comment>
                            <comment id="281725" author="cmcl" created="Thu, 8 Oct 2020 06:40:30 +0000"  >&lt;p&gt;Have uploaded eviction logs after above changes. Please have a look on this&lt;/p&gt;

&lt;p&gt;Eviction at : Oct  7 19:22 IST&lt;br/&gt;
Log location : /from_dneg/08Oct2020&lt;/p&gt;</comment>
                            <comment id="281787" author="green" created="Thu, 8 Oct 2020 18:29:15 +0000"  >&lt;p&gt;I got the logs and the are still full debug and then it dawned on me that since you are not restarting the nodes - the instruction I gave you would not disable excess logic. Sorry about that.&lt;/p&gt;

&lt;p&gt;So to manually set the desired debug level and override all the bits you currently have you need to do this instead:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
lctl set_param debug=&lt;span class=&quot;code-quote&quot;&gt;&quot;vfstrace info inode warning error dentry emerg console rpctrace dlmtrace neterror net&quot;&lt;/span&gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="282002" author="cmcl" created="Mon, 12 Oct 2020 04:16:08 +0000"  >&lt;p&gt;OK , Have set these parameters on both hmds1 and client.  However the &quot;get_param&quot; still showing blank for the debug level, Hope that won&apos;t be a problem.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lctl set_param debug=&quot;vfstrace info inode warning error dentry emerg console rpctrace dlmtrace neterror net&quot;
lctl set_param debug_mb=10240
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="282004" author="green" created="Mon, 12 Oct 2020 04:24:09 +0000"  >&lt;p&gt;&quot;still&quot; showing blank? it was not supposed to show blank at any time past or present. are there any errors in dmesg?&lt;/p&gt;</comment>
                            <comment id="282077" author="cmcl" created="Mon, 12 Oct 2020 18:59:19 +0000"  >&lt;p&gt;Yes, Still it is blank and no errors in dmesg as well  &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/sad.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;</comment>
                            <comment id="282078" author="green" created="Mon, 12 Oct 2020 20:04:38 +0000"  >&lt;p&gt;I mean was it is supposed to be something like this:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lctl get_param debug
debug=
super ioctl neterror warning dlmtrace error emerg ha rpctrace vfstrace config console lfsck

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;does &quot;cat /sys/kernel/debug/lnet/debug&quot; output anything?&lt;/p&gt;</comment>
                            <comment id="282088" author="cmcl" created="Tue, 13 Oct 2020 01:17:35 +0000"  >&lt;p&gt;OK, it&apos;s giving the output for debug on both and hope all good with that. &lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;hmds1 /tmp # lctl get_param debug
debug=
inode info neterror net warning dentry dlmtrace error emerg rpctrace vfstrace console
hmds1 /tmp # cat /sys/kernel/debug/lnet/debug
inode info neterror net warning dentry dlmtrace error emerg rpctrace vfstrace console
hmds1 /tmp # hotel1 /tmp # lctl get_param debug

hotel1 /tmp # lctl get_param debug
debug=
inode info neterror net warning dentry dlmtrace error emerg rpctrace vfstrace console
hotel1 /tmp # cat /sys/kernel/debug/lnet/debug
inode info neterror net warning dentry dlmtrace error emerg rpctrace vfstrace console
hotel1 /tmp # 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="282190" author="cmcl" created="Wed, 14 Oct 2020 07:16:49 +0000"  >&lt;p&gt;There were couple of evictions happened after the change in the debug level and have uploaded the latest logs . Please have a look on this&lt;/p&gt;

&lt;p&gt;Evictions at : Oct 14  04:11 and 04:28 IST&lt;br/&gt;
Location : /from_dneg/14Oct2020&lt;/p&gt;</comment>
                            <comment id="282287" author="cmcl" created="Thu, 15 Oct 2020 02:09:26 +0000"  >&lt;p&gt;Whether the latest logs contains sufficient data ?&lt;/p&gt;</comment>
                            <comment id="282558" author="cmcl" created="Mon, 19 Oct 2020 04:57:55 +0000"  >&lt;p&gt;Any updates here ?&lt;/p&gt;</comment>
                            <comment id="282679" author="green" created="Tue, 20 Oct 2020 08:36:38 +0000"  >&lt;p&gt;the 1602629930.5992/1602629931.16759 set of logs seems to be promising.&lt;/p&gt;

&lt;p&gt;I see there&apos;s an attempt to bunch some other locks that all happen to be open locks that are sort of expensive in the sense that they cause an extra RPC to close the file descriptor first this is not ideal but not fatal.&lt;/p&gt;

&lt;p&gt;The real trouble is eventually we run out of &quot;slots&quot; for the close RPC and nothing is waking us up until eviction 100 seconds later:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000020:00100000:11.0:1602629830.723116:0:37307:0:(genops.c:2356:obd_get_mod_rpc_slot()) hotel-MDT0000-mdc-ffff89b0ec6ee800: sleeping for a modify RPC slot opc 35, max 7
00000100:00000040:11.0:1602629930.420531:0:37307:0:(lustre_net.h:2498:ptlrpc_rqphase_move()) @@@ move req &quot;New&quot; -&amp;gt; &quot;Rpc&quot;  req@ffff89b2d64a0d80 x1685199064046272/t0(0) o35-&amp;gt;hotel-MDT0000-mdc-ffff89b0ec6ee800@10.27.22.90@tcp:23/10 lens 392/624 e 0 to 0 dl 0 ref 2 fl New:/0/ffffffff rc 0/-1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The reason seems to be some RPCs from this client being stuck on the server and so we run out of slots.&lt;/p&gt;

&lt;p&gt;checking server side for one of such requests it looks like &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13692&quot; title=&quot;MDS slow/hung threads at mdt_object_local_lock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13692&quot;&gt;&lt;del&gt;LU-13692&lt;/del&gt;&lt;/a&gt; might be what you are currently hitting. NASA seems to be having a good success with the patch there now.&lt;/p&gt;</comment>
                            <comment id="282723" author="cmcl" created="Tue, 20 Oct 2020 15:53:23 +0000"  >&lt;p&gt;Thanks for the update and just gone thorough that case . Could you please explain which patch need to apply here in our system ?  What exactly the patch is doing ? How we can implement the patch ? It would be good if you can share some docs on for that too.&lt;/p&gt;</comment>
                            <comment id="282725" author="pjones" created="Tue, 20 Oct 2020 16:04:09 +0000"  >&lt;p&gt;Campbell&lt;/p&gt;

&lt;p&gt;The fix Oleg mentioned is one that we are hoping to include in our upcoming 2.12.6 release. Would you prefer to wait until that so that you are running something well-tested or are things dire enough that you need to attempt a hotfix ASAP?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="282795" author="cmcl" created="Wed, 21 Oct 2020 03:08:35 +0000"  >&lt;p&gt;Hi Peter, &lt;/p&gt;

&lt;p&gt;When we are planning to release 2.12.6 version with the fix ? Whether there will be more delay in releasing this minor version ? &lt;/p&gt;</comment>
                            <comment id="282800" author="cmcl" created="Wed, 21 Oct 2020 04:06:18 +0000"  >&lt;p&gt;Meantime, Could you please provide the patch as well with the documentation to apply it ?  We will also try it out in our servers to see if that fixing the eviction issue. &lt;/p&gt;</comment>
                            <comment id="283064" author="cmcl" created="Fri, 23 Oct 2020 00:26:15 +0000"  >&lt;p&gt;Hi Peter/Oleg,&lt;/p&gt;

&lt;p&gt;As requested earlier, We would like to apply the patch first in our servers .Could you please share the patches for the same ASAP ?&lt;/p&gt;</comment>
                            <comment id="283091" author="pjones" created="Fri, 23 Oct 2020 04:05:35 +0000"  >&lt;p&gt;The patch is at &lt;a href=&quot;https://review.whamcloud.com/#/c/39598/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/39598/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="283241" author="cmcl" created="Mon, 26 Oct 2020 15:35:42 +0000"  >&lt;p&gt;Hi Peter,&lt;/p&gt;

&lt;p&gt;Can you please tell me what needs to be done here to apply the patch? I downloaded ldlm_lock_new-83331795f2550e9e0bf15c3d3832f3d3.c, created a patch file from a diff between it and the version of ldm_lock.c in the 2.12.4 src rpm, and it patched successfully but failed to build. I was able to build rpms from the 142fe73 tree, but they were version 2.13.56, and the rpms were kmod-lustre-client and lustre-client - so is this patch for the client packages? We use the server packages on the affected client, so have the lustre kernel and lustre rpm on our client. Should I build the 2.13.56 rpms against a generic kernel and use that?&lt;/p&gt;

&lt;p&gt;Thanks,&lt;/p&gt;

&lt;p&gt;Campbell&lt;/p&gt;</comment>
                            <comment id="283344" author="green" created="Tue, 27 Oct 2020 06:57:39 +0000"  >&lt;p&gt;you can get already built RPMs from this patch: &lt;a href=&quot;https://review.whamcloud.com/40412&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/40412&lt;/a&gt; there&apos;s a build link in the comments.&lt;/p&gt;

&lt;p&gt;The patch only affects MDS so technically that&apos;s the only node you need to update with it (This is tip of the b2_12 tree so it has all the other fixes too though)&lt;/p&gt;</comment>
                            <comment id="283354" author="cmcl" created="Tue, 27 Oct 2020 10:11:16 +0000"  >&lt;p&gt;Hi Oleg,&lt;/p&gt;

&lt;p&gt;Apologies, I have not been able to find the link to the RPMs. I&apos;ve found a kernel image (kernel-centos7-x86_64) and a source tree (ac40c31.tar.gz), but could not find a link to any rpms - can you please provide more detail? Sorry, I&apos;m clearly missing something obvious.&lt;/p&gt;

&lt;p&gt;Kind regards,&lt;/p&gt;

&lt;p&gt;Campbell&lt;/p&gt;</comment>
                            <comment id="283365" author="pjones" created="Tue, 27 Oct 2020 12:23:39 +0000"  >&lt;p&gt;Campbell&lt;/p&gt;

&lt;p&gt;The build link from the comments Oleg was referring to is &lt;a href=&quot;https://build.whamcloud.com/job/lustre-reviews/77274/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://build.whamcloud.com/job/lustre-reviews/77274/&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Hopefully it is intuitive enough how to drill in to get the distro/arch version relevant for you but let us know if not.&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="283369" author="cmcl" created="Tue, 27 Oct 2020 12:48:22 +0000"  >&lt;p&gt;Hi Peter, &lt;/p&gt;

&lt;p&gt;Thanks, I found them at &lt;a href=&quot;https://build.whamcloud.com/job/lustre-reviews/arch=x86_64,build_type=server,distro=el7.7,ib_stack=inkernel/lastSuccessfulBuild/artifact/artifacts/RPMS/x86_64/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://build.whamcloud.com/job/lustre-reviews/arch=x86_64,build_type=server,distro=el7.7,ib_stack=inkernel/lastSuccessfulBuild/artifact/artifacts/RPMS/x86_64/&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Regards,&lt;/p&gt;

&lt;p&gt;Campbell&lt;/p&gt;</comment>
                            <comment id="283765" author="cmcl" created="Fri, 30 Oct 2020 09:57:08 +0000"  >&lt;p&gt;Since upgrading to 2.13.56 on the MDS, we&apos;ve found a strange issue whereby we cannot create files over NFS to the lustre client on the lustre filesystem. We can remove files, and chown and chmod. Permissions make no difference. I can&apos;t see anything in the logs. Any ideas why this might happen? Do we need to update packages on anything else besides the MDS perhaps?&lt;/p&gt;</comment>
                            <comment id="283771" author="pjones" created="Fri, 30 Oct 2020 12:04:58 +0000"  >&lt;p&gt;Wait - 2.13.56?! I hope that this is a misunderstanding as that is a 2.14 development tag and not something suitable for running in production.&lt;/p&gt;</comment>
                            <comment id="283772" author="pjones" created="Fri, 30 Oct 2020 12:09:52 +0000"  >&lt;p&gt;Ah I see - you navigated to the most recent development build rather than the specific one highlighted above. The link I expected you to navigate to was &lt;a href=&quot;https://build.whamcloud.com/job/lustre-reviews/77274/arch=x86_64,build_type=server,distro=el7.7,ib_stack=inkernel/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://build.whamcloud.com/job/lustre-reviews/77274/arch=x86_64,build_type=server,distro=el7.7,ib_stack=inkernel/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="283773" author="cmcl" created="Fri, 30 Oct 2020 13:10:01 +0000"  >&lt;p&gt;ok, thanks Peter. Just to make sure: I should be installing version 2.12.5_62_ga40c31-1?&lt;/p&gt;</comment>
                            <comment id="283780" author="pjones" created="Fri, 30 Oct 2020 14:04:07 +0000"  >&lt;p&gt;Correct&lt;/p&gt;</comment>
                            <comment id="283944" author="cmcl" created="Sun, 1 Nov 2020 16:59:46 +0000"  >&lt;p&gt;HI Peter / Oleg,&lt;/p&gt;

&lt;p&gt;We installed the version &quot;2.12.5_62_ga40c31-1&quot; , But unfortunately that didn&apos;t help us to fix the  eviction issue. We do still having client evictions and have uploaded the latest logs to investigate further on this. &lt;/p&gt;

&lt;p&gt;Eviction time : Sun Nov  1 02:12:09 2020 IST&lt;br/&gt;
Log location :/from_dneg/01Nov2020/&lt;/p&gt;</comment>
                            <comment id="283966" author="cmcl" created="Mon, 2 Nov 2020 12:22:37 +0000"  >&lt;p&gt;Any thoughts here ?&lt;/p&gt;</comment>
                            <comment id="284005" author="green" created="Mon, 2 Nov 2020 18:17:37 +0000"  >&lt;p&gt;the traces are remarkably similar, but this time as the client is forming early-cancel-requests we can trace them all on the server and the last one sent is handled on the server real fast, the reply is formed and sent, but somehow it is never received on the client (or if it is, there are no traces of it)&lt;/p&gt;

&lt;p&gt;client side:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000100:00100000:31.0:1604176830.175778:0:31048:0:(client.c:1630:ptlrpc_send_new_req()) Sending RPC pname:cluuid:pid:xid:nid:opc ldlm_bl_16:fba2d39b-78a9-1d04-926c-0efdbbdf3a0e:31048:1682064348899136:10.27.22.90@tcp:35
00000100:00000200:31.0:1604176830.175799:0:31048:0:(niobuf.c:884:ptl_send_rpc()) Setup reply buffer: 1024 bytes, xid 1682064348899136, portal 10
00000100:00000040:31.0:1604176830.175802:0:31048:0:(niobuf.c:905:ptl_send_rpc()) @@@ send flg=0  req@ffff8cfc9e310480 x1682064348899136/t0(0) o35-&amp;gt;hotel-MDT0000-mdc-ffff8ced28a0a800@10.27.22.90@tcp:23/10 lens 392/624 e 0 to 0 dl 1604176837 ref 3 fl Rpc:/0/ffffffff rc 0/-1
00000100:00000040:31.0:1604176830.175809:0:31048:0:(niobuf.c:57:ptl_send_buf()) peer_id 12345-10.27.22.90@tcp
00000100:00000200:31.0:1604176830.175819:0:31048:0:(niobuf.c:85:ptl_send_buf()) Sending 392 bytes to portal 23, xid 1682064348899136, offset 0
00000400:00000200:31.0:1604176830.175824:0:31048:0:(lib-move.c:4684:LNetPut()) LNetPut -&amp;gt; 12345-10.27.22.90@tcp
00000400:00000200:31.0:1604176830.175827:0:31048:0:(lib-move.c:2478:lnet_handle_send_case_locked()) Source ANY to MR:  10.27.22.90@tcp local destination
00000400:00000200:31.0:1604176830.175831:0:31048:0:(lib-move.c:1515:lnet_get_best_ni()) compare ni 10.27.22.97@tcp [c:246, d:21, s:1134232350] with best_ni not seleced [c:-2147483648, d:-1, s:0]
00000400:00000200:31.0:1604176830.175834:0:31048:0:(lib-move.c:1558:lnet_get_best_ni()) selected best_ni 10.27.22.97@tcp
00000400:00000200:31.0:1604176830.175836:0:31048:0:(lib-move.c:1790:lnet_select_peer_ni()) 10.27.22.97@tcp ni_is_pref = 0
00000400:00000200:31.0:1604176830.175839:0:31048:0:(lib-move.c:1846:lnet_select_peer_ni()) sd_best_lpni = 10.27.22.90@tcp
00000400:00000200:31.0:1604176830.175872:0:31048:0:(lib-move.c:1752:lnet_handle_send()) TRACE: 10.27.22.97@tcp(10.27.22.97@tcp:&amp;lt;?&amp;gt;) -&amp;gt; 10.27.22.90@tcp(10.27.22.90@tcp:10.27.22.90@tcp) : PUT try# 0
00000800:00000200:31.0:1604176830.175876:0:31048:0:(socklnd_cb.c:1003:ksocknal_send()) sending 392 bytes in 1 frags to 12345-10.27.22.90@tcp
00000800:00000200:31.0:1604176830.175880:0:31048:0:(socklnd.c:195:ksocknal_find_peer_locked()) got peer_ni [ffff8cfbf622dd00] -&amp;gt; 12345-10.27.22.90@tcp (5)
00000800:00000200:31.0:1604176830.175884:0:31048:0:(socklnd_cb.c:764:ksocknal_queue_tx_locked()) Sending to 12345-10.27.22.90@tcp ip 10.27.22.90:988
00000800:00000200:31.0:1604176830.175886:0:31048:0:(socklnd_cb.c:783:ksocknal_queue_tx_locked()) Packet ffff8cf4a6168200 type 1, nob 488 niov 2 nkiov 0
00000100:00100000:31.0:1604176830.175889:0:31048:0:(client.c:2354:ptlrpc_set_wait()) set ffff8cffcd521800 going to sleep for 6 seconds
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;no activity from this thread from this moment on highlighting it&apos;s still waiting for reply&lt;/p&gt;

&lt;p&gt;server side:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000100:00100000:19.0:1604176830.175965:0:37439:0:(service.c:2140:ptlrpc_server_handle_request()) Handling RPC pname:cluuid+ref:pid:xid:nid:opc mdt_rdpg01_019:fba2d39b-78a9-1d04-926c-0efdbbdf3a0e+4826808:31048:x1682064348899136:12345-10.27.22.97@tcp:35
00000100:00000200:19.0:1604176830.175966:0:37439:0:(service.c:2145:ptlrpc_server_handle_request()) got req 1682064348899136
00010000:00000040:19.0:1604176830.176000:0:37439:0:(ldlm_lib.c:2944:target_committed_to_req()) last_committed 734953002189, transno 734953004718, xid 1682064348899136
00010000:00000200:19.0:1604176830.176002:0:37439:0:(ldlm_lib.c:2996:target_send_reply_msg()) @@@ sending reply  req@ffff9f0783b79f80 x1682064348899136/t734953004718(0) o35-&amp;gt;fba2d39b-78a9-1d04-926c-0efdbbdf3a0e@10.27.22.97@tcp:401/0 lens 392/456 e 0 to 0 dl 1604176836 ref 1 fl Interpret:/0/0 rc 0/0
00000100:00000200:19.0:1604176830.176010:0:37439:0:(niobuf.c:85:ptl_send_buf()) Sending 424 bytes to portal 10, xid 1682064348899136, offset 192
00000800:00000200:19.0:1604176830.176028:0:37439:0:(socklnd_cb.c:783:ksocknal_queue_tx_locked()) Packet ffff9ee5e4e3a000 type 1, nob 520 niov 2 nkiov 0
00000100:00000040:19.0:1604176830.176034:0:37439:0:(lustre_net.h:2498:ptlrpc_rqphase_move()) @@@ move req &quot;Interpret&quot; -&amp;gt; &quot;Complete&quot;  req@ffff9f0783b79f80 x1682064348899136/t734953004718(0) o35-&amp;gt;fba2d39b-78a9-1d04-926c-0efdbbdf3a0e@10.27.22.97@tcp:401/0 lens 392/424 e 0 to 0 dl 1604176836 ref 1 fl Interpret:/0/0 rc 0/0
00000100:00100000:19.0:1604176830.176039:0:37439:0:(service.c:2190:ptlrpc_server_handle_request()) Handled RPC pname:cluuid+ref:pid:xid:nid:opc mdt_rdpg01_019:fba2d39b-78a9-1d04-926c-0efdbbdf3a0e+4826812:31048:x1682064348899136:12345-10.27.22.97@tcp:35 Request processed in 74us (98us total) trans 734953004718 rc 0/0
00000100:00100000:19.0:1604176830.176042:0:37439:0:(nrs_fifo.c:241:nrs_fifo_req_stop()) NRS stop fifo request from 12345-10.27.22.97@tcp, seq: 56446120
00000100:00000040:19.0:1604176830.176043:0:37439:0:(service.c:1057:ptlrpc_server_finish_active_request()) RPC PUTting export ffff9ed637f2c000 : new rpc_count 5
00000020:00000040:19.0:1604176830.176044:0:37439:0:(genops.c:1018:class_export_put()) PUTting export ffff9ed637f2c000 : new refcount 4826811
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Now if we switch back to the client we actually can see the reply come from the server - the next message received from MDS is for xid 1682064348976448 &lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000400:00000200:32.0:1604176830.231054:0:4117:0:(lib-move.c:4190:lnet_parse()) TRACE: 10.27.22.97@tcp(10.27.22.97@tcp) &amp;lt;- 10.27.22.90@tcp : PUT - for me
00000400:00000200:32.0:1604176830.231058:0:4117:0:(lib-ptl.c:571:lnet_ptl_match_md()) Request from 12345-10.27.22.90@tcp of length 592 into portal 10 MB=0x5f9d420029540
00000400:00000200:32.0:1604176830.231061:0:4117:0:(lib-ptl.c:200:lnet_try_match_md()) Incoming put index a from 12345-10.27.22.90@tcp of length 592/592 into md 0x32bcc8121 [1] + 192
00000400:00000200:32.0:1604176830.231064:0:4117:0:(lib-msg.c:828:lnet_is_health_check()) health check = 0, status = 0, hstatus = 0
00000100:00000200:32.0:1604176830.231066:0:4117:0:(events.c:93:reply_in_callback()) @@@ type 2, status 0  req@ffff8cf2d0afc800 x1682064348976448/t0(0) o101-&amp;gt;hotel-MDT0000-mdc-ffff8ced28a0a800@10.27.22.90@tcp:12/10 lens 656/960 e 0 to 0 dl 1604176838 ref 2 fl Rpc:/0/ffffffff rc 0/-1
00000100:00000200:32.0:1604176830.231066:0:4117:0:(events.c:93:reply_in_callback()) @@@ type 2, status 0  req@ffff8cf2d0afc800 x1682064348976448/t0(0) o101-&amp;gt;hotel-MDT0000-mdc-ffff8ced28a0a800@10.27.22.90@tcp:12/10 lens 656/960 e 0 to 0 dl 1604176838 ref 2 fl Rpc:/0/ffffffff rc 0/-1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;But while it was sent a small fraction of a second before the other one - we don&apos;t see the message being sent in the client logs (we do see it in the server logs) - which I guess might explain why we never see the other one in logs either - it was just dropped as log bucket was full (we have log buckets divider per cpu and priority so some might be dropped before the others leaving such gaps seemingly at random when you have high level of activity).&lt;br/&gt;
It is still kind of strange we never see any more logs from the thread that sent the ELC request since if it did get the reply it is supposed to be sending more requests to the server. In fact checking server logs we do see more of requests that thread sent that were again timely handled on the server until that activity stops, we just don&apos;t know why did it stop.&lt;/p&gt;

&lt;p&gt;Do you have any storage on your nfs reexport node (like real fast local storage)?&lt;br/&gt;
I wonder if you would be able to run in debug daemon mode there (just on the lustre client since we appear to be getting good logs on the server).&lt;/p&gt;

&lt;p&gt;Basically you set the debug level as before, but you run this on the client:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
lctl debug_daemon start /some/path/on/real/local/storage/lustre.log 40000 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;this will create up to 40G file at the place.&lt;br/&gt;
and once the condition is reproduced you then do&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
lctl debug_daemon stop
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;you don&apos;t do the dump on eviction setting in this mode operation so it&apos;s important&lt;br/&gt;
to still do the debug daemon stop somewhat quickly after the eviction happens.&lt;/p&gt;

&lt;p&gt;Do you think you would be able to do something like this?&lt;/p&gt;</comment>
                            <comment id="284120" author="cmcl" created="Tue, 3 Nov 2020 13:15:13 +0000"  >&lt;p&gt;OK, I have disabled the dump on eviction on client and trying to start the debug daemon but it seems there is some limit.  But not sure whether 20GB is enough for this to capture it ?   Please suggest&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;hotel1 /tmp # lctl set_param dump_on_eviction=0
dump_on_eviction=0
hotel1 /tmp # lctl debug_daemon start /tmp/lustre-dump40g.log 40000 
size 40000 invalid, must be in the range 10-20480 MB
hotel1 /tmp # 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="284190" author="green" created="Tue, 3 Nov 2020 23:14:37 +0000"  >&lt;p&gt;well then please use 20480 instead of the 40000&lt;/p&gt;

&lt;p&gt;We won&apos;t know if it&apos;s enough or not until we look inside, here&apos;s hoping it is enough but if you can reproduce with less activity that certainly should help.&lt;/p&gt;</comment>
                            <comment id="284204" author="cmcl" created="Wed, 4 Nov 2020 04:25:48 +0000"  >&lt;p&gt;I have uploaded the latest logs after starting the debug_daemon  to investigate further on this. &lt;/p&gt;

&lt;p&gt;Log Location : /from_dneg/04Nov2020/&lt;br/&gt;
Eviction time : Wed Nov  4 02:12:55 2020 IST&lt;/p&gt;</comment>
                            <comment id="284364" author="cmcl" created="Thu, 5 Nov 2020 15:48:47 +0000"  >&lt;p&gt;Those logs are helping out us to look more on this ?&lt;/p&gt;</comment>
                            <comment id="284439" author="green" created="Fri, 6 Nov 2020 06:41:08 +0000"  >&lt;p&gt;please include server side dmesg -it helps me see evicted logs without poring through huge debug logs.&lt;/p&gt;

&lt;p&gt;anyway I see you included a number of server side logs, but all of them were dumped an hour before the client log begins?&lt;/p&gt;</comment>
                            <comment id="284443" author="green" created="Fri, 6 Nov 2020 07:28:56 +0000"  >&lt;p&gt;I also noticed that the client log only covers 15 minutes of time - this is enough to go back for a 100 second eviction, but you really need to be able to stop client side logging pretty quickly after server side eviction hits.&lt;/p&gt;</comment>
                            <comment id="284466" author="cmcl" created="Fri, 6 Nov 2020 11:09:03 +0000"  >&lt;p&gt;Unfortunately  it took a while for me to stop the debug daemon after the eviction happened . I think I had stopped the service after 1-2 hr (approx) of the eviction since it is happening randomly.  I could see the evictions happened at &quot;Wed Nov 4 02:12:55 2020 IST&quot;, But it dumped other files as well , Just added the created time details below. &lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;-rw------- 1 root  root  7.8G Nov  4 02:13 lustre-log.1604436175.5291
-rw------- 1 root  root  602M Nov  4 02:14 lustre-log.1604436274.18605
-rw------- 1 root  root  4.5G Nov  4 02:20 lustre-log.1604436600.19412
-rw------- 1 root  root  5.3G Nov  4 02:31 lustre-log.1604437281.19418
-rw------- 1 root  root  5.0G Nov  4 02:41 lustre-log.1604437888.22490
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Client debug log seems to be stopped writing once its reached the file size threshold?, Added the time details as well for that too. &lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;-rw-------  1 root     root      21G Nov  4 03:38 lustre-dump-hotel1.log
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I have uploaded the dmesg of the server and client as requested. If it is making more trouble to dig more into the log file then I can upload another set of new evictions logs but this time will try to stop the service right after it reported, Let me know if that is really required here. &lt;/p&gt;

&lt;p&gt;Log location  :  /from_dneg/04Nov2020/hmds1-logs&lt;br/&gt;
Filename : hmds1-dmesg04112020&lt;/p&gt;</comment>
                            <comment id="284506" author="green" created="Fri, 6 Nov 2020 17:18:02 +0000"  >&lt;p&gt;basically for the client log to contain enough information you must stop it within ~10 minutes of the eviction happening.&lt;/p&gt;

&lt;p&gt;the max size of 20GB is not used to stop writing but basically makes the file to act as a circular buffer so once the size is reached writing continues from offset zero overwriting old records.&lt;/p&gt;

&lt;p&gt;So please try to catch the eviction and stop the client logging right after reporting (since there&apos;s a very characteristic eviction record in the logs on the server side you might be able to script something up to catch the event and stop the writing client-side)&lt;/p&gt;</comment>
                            <comment id="284766" author="cmcl" created="Mon, 9 Nov 2020 23:11:06 +0000"  >&lt;p&gt;OK, Understood. So have uploaded latest set of logs to check further . This time have stopped logging at client side immediately after seeing the evictions.&lt;/p&gt;

&lt;p&gt;Log Location : /from_dneg/10Nov2020/&lt;br/&gt;
 Eviction Time :Tue Nov 10 02:27:42 2020&lt;/p&gt;

&lt;p&gt;There are couple of files got generated at server end , Adding the file creation details as well&lt;/p&gt;

&lt;p&gt;hmds1&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;-rw------- 1 root root 7.8G Nov 10 02:27 lustre-log.1604955458.5291
-rw------- 1 root root 3.5G Nov 10 02:29 lustre-log.1604955558.5335
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Hotel1- Eviction happened at &lt;b&gt;Nov 10 02:27:42 2020&lt;/b&gt; and had stopped client logging writing at &lt;b&gt;Nov 10 02:29&lt;/b&gt;&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;-rw------- 1 root     root      21G Nov 10 02:29 lustre-dump-20G-10112020.log
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="284904" author="gerrit" created="Wed, 11 Nov 2020 05:30:51 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/40602&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/40602&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13500&quot; title=&quot;Client gets evicted - nfsd non-standard errorno -108&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13500&quot;&gt;LU-13500&lt;/a&gt; ldlm: Do not LRU-cancel &quot;expensive&quot; locks for in bl-ast&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 6afaa40254cb6c0431ea05a72fc0fd076ec0bb81&lt;/p&gt;</comment>
                            <comment id="284905" author="gerrit" created="Wed, 11 Nov 2020 05:41:03 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/40603&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/40603&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13500&quot; title=&quot;Client gets evicted - nfsd non-standard errorno -108&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13500&quot;&gt;LU-13500&lt;/a&gt; ldlm: Do not LRU-cancel &quot;expensive&quot; locks for in bl-ast&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 3b5eb8eb50885dd690307d4ba4178ee55a5bf970&lt;/p&gt;</comment>
                            <comment id="284906" author="green" created="Wed, 11 Nov 2020 06:34:55 +0000"  >&lt;p&gt;Thank you. this is a good log but it&apos;s very big to sift through all of it fast.&lt;/p&gt;

&lt;p&gt;Meanwhile some suspicions I had from the previous logs seem to be confirming - the &quot;proactive lru lock cancelling&quot; we do when preparing to send a cancel anyway seems to be a bit overzealous and does some pretty heavy processing.&lt;/p&gt;

&lt;p&gt;I am going to post a patch that should help here while I am digging out some other suspicious stuff I also see. &lt;a href=&quot;https://review.whamcloud.com/#/c/40603/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/40603/&lt;/a&gt;&lt;br/&gt;
This patch you need to apply on the clients (nfs exporting ones at least) (or just use our build from the builders: &lt;a href=&quot;https://build.whamcloud.com/job/lustre-reviews/77636/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://build.whamcloud.com/job/lustre-reviews/77636/&lt;/a&gt; - select suitable arch and distro type in the long list)&lt;/p&gt;</comment>
                            <comment id="284924" author="cmcl" created="Wed, 11 Nov 2020 12:44:48 +0000"  >&lt;p&gt;Looks like we are seeing some more stuffs here to rectify.&#160; Should we try this patch now or we will wait to dig more on the undergoing case&#160; ?&lt;/p&gt;</comment>
                            <comment id="284975" author="green" created="Wed, 11 Nov 2020 21:22:20 +0000"  >&lt;p&gt;I think it makes sense to try the patch now - if it helps - great.&lt;/p&gt;

&lt;p&gt;Whenever any additional patches would be needed is not yet clear and if yes and the condition would necessitate reversal of a previous patch to reproduce - that could be decided on later.&lt;/p&gt;</comment>
                            <comment id="285127" author="cmcl" created="Fri, 13 Nov 2020 11:04:35 +0000"  >&lt;p&gt;Oleg,&lt;/p&gt;

&lt;p&gt;We have applied the provided patch but unfortunately that seems to be not helping much to reduce the evictions &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/sad.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;. Your thoughts on this please.&lt;/p&gt;</comment>
                            <comment id="285199" author="cmcl" created="Sat, 14 Nov 2020 21:24:43 +0000"  >&lt;p&gt;nfs exports stopped working. Couldn&apos;t see anything in the client log.&lt;br/&gt;
A restart of nfs did not fix the issue. So we had to downgrade it back to 2.12.4. &lt;/p&gt;</comment>
                            <comment id="285213" author="cmcl" created="Mon, 16 Nov 2020 10:47:02 +0000"  >&lt;p&gt;After downgrading, back to 2.12.4, the lustre filesystem was hanging on the client (the other client we have was fine). It would work intermittently but mostly not (the other client was fine during this time). Is this due to lock recovery? I tried mounting with &apos;abort_recov&apos;, but same issue. It eventually resolved itself however. &lt;/p&gt;

&lt;p&gt;There&apos;s still a large number of stack traces in the MDS log, e.g.:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
Nov 16 02:58:40 hmds1 kernel: LNet: Service thread pid 16267 was inactive &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 278.13s. The thread might be hung, or it might only be slow and will res
ume later. Dumping the stack trace &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; debugging purposes:
Nov 16 02:58:40 hmds1 kernel: LNet: Skipped 4 previous similar messages
Nov 16 02:58:40 hmds1 kernel: Pid: 16267, comm: mdt01_051 3.10.0-1062.18.1.el7_lustre.x86_64 #1 SMP Mon Jun 8 13:47:48 UTC 2020
Nov 16 02:58:40 hmds1 kernel: Call Trace:
Nov 16 02:58:40 hmds1 kernel: [&amp;lt;ffffffffc12b8070&amp;gt;] ldlm_completion_ast+0x430/0x860 [ptlrpc]
Nov 16 02:58:40 hmds1 kernel: [&amp;lt;ffffffffc12ba0a1&amp;gt;] ldlm_cli_enqueue_local+0x231/0x830 [ptlrpc]
Nov 16 02:58:40 hmds1 kernel: [&amp;lt;ffffffffc15a817b&amp;gt;] mdt_rename_lock+0x24b/0x4b0 [mdt]
Nov 16 02:58:40 hmds1 kernel: [&amp;lt;ffffffffc15aa350&amp;gt;] mdt_reint_rename+0x2c0/0x2900 [mdt]
Nov 16 02:58:40 hmds1 kernel: [&amp;lt;ffffffffc15b31b3&amp;gt;] mdt_reint_rec+0x83/0x210 [mdt]
Nov 16 02:58:40 hmds1 kernel: [&amp;lt;ffffffffc158f383&amp;gt;] mdt_reint_internal+0x6e3/0xaf0 [mdt]
Nov 16 02:58:40 hmds1 kernel: [&amp;lt;ffffffffc159b0f7&amp;gt;] mdt_reint+0x67/0x140 [mdt]
Nov 16 02:58:40 hmds1 kernel: [&amp;lt;ffffffffc1356e8a&amp;gt;] tgt_request_handle+0xada/0x1570 [ptlrpc]
Nov 16 02:58:40 hmds1 kernel: [&amp;lt;ffffffffc12fb83b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
Nov 16 02:58:40 hmds1 kernel: [&amp;lt;ffffffffc12ff1a4&amp;gt;] ptlrpc_main+0xb34/0x1470 [ptlrpc]
Nov 16 02:58:40 hmds1 kernel: [&amp;lt;ffffffff8dac6321&amp;gt;] kthread+0xd1/0xe0
Nov 16 02:58:40 hmds1 kernel: [&amp;lt;ffffffff8e18ed37&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
Nov 16 02:58:40 hmds1 kernel: [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
Nov 16 02:58:40 hmds1 kernel: LustreError: dumping log to /tmp/lustre-log.1605475720.16267
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;We&apos;ve uploaded a log (5.1GB) from the MDS&lt;/p&gt;</comment>
                            <comment id="285445" author="cmcl" created="Wed, 18 Nov 2020 16:37:27 +0000"  >&lt;p&gt;Any findings here please ?&lt;/p&gt;</comment>
                            <comment id="285796" author="cmcl" created="Mon, 23 Nov 2020 15:11:23 +0000"  >&lt;p&gt;Hello Team,&lt;/p&gt;

&lt;p&gt;It would be really good if you can provide a solution for the same at the earliest as this is really affecting the production backup which is really putting us in a panic situations. Requesting you to check this on priority please.&#160;&lt;/p&gt;</comment>
                            <comment id="285845" author="green" created="Tue, 24 Nov 2020 02:37:06 +0000"  >&lt;p&gt;Sorry that I have no immediate answers to this, I am still thinking about the whole thing.&lt;/p&gt;</comment>
                            <comment id="285862" author="green" created="Tue, 24 Nov 2020 08:40:16 +0000"  >&lt;p&gt;So I wonder if reducing amount of mdt locks would at least help you temporarily since the problem is clearly related to parallel cancels.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lctl set_param ldlm.namespaces.*MDT*mdc*.lru_size=100
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;can you please run this on your nfs export node(s)? The setting is not permanent and would reset on node reboot/lustre client remount.&lt;/p&gt;</comment>
                            <comment id="285969" author="cmcl" created="Wed, 25 Nov 2020 11:24:35 +0000"  >&lt;p&gt;Hi Oleg, &lt;/p&gt;

&lt;p&gt;Just some feedback on this setting: since the change has been made, there have been no stack traces, and the backups that run on that node have not been finishing early (so far). We&apos;ll see what happens when they finish, but looking much better so far!&lt;/p&gt;

&lt;p&gt;Thanks,&lt;/p&gt;

&lt;p&gt;Campbell&lt;/p&gt;</comment>
                            <comment id="289014" author="cmcl" created="Fri, 8 Jan 2021 09:00:02 +0000"  >&lt;p&gt;HI Oleg,&lt;/p&gt;

&lt;p&gt;We have modified the config for&#160;limit the number of nfsd threads a single client can use on NFS gateways which helped to avoid the further client evictions. We are not seeing any stack traces and the backups are running smoothly so far.&#160;&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;/sys/module/sunrpc/parameters/svc_rpc_per_connection_limit = 1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Also we are having the below lru size configs are in place too&#160;on the clients&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lctl set_param -P ldlm.namespaces.*.lru_size=10000
lctl set_param -P ldlm.namespaces.*.lru_max_age=600000
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Please check and suggest . &lt;/p&gt;</comment>
                            <comment id="290836" author="cmcl" created="Mon, 1 Feb 2021 09:16:41 +0000"  >&lt;p&gt;Any updates here ?&lt;/p&gt;</comment>
                            <comment id="290880" author="green" created="Mon, 1 Feb 2021 20:02:55 +0000"  >&lt;p&gt;these ldlm settings look fine to me if they work for you. You can shorten the max age if you want depending on your workload.&lt;/p&gt;

&lt;p&gt;I can&apos;t comment on the &quot;svc_rpc_per_connection_limit&quot; parameter as I am not familiar with that part of the NFS stack.&lt;/p&gt;</comment>
                            <comment id="304805" author="cmcl" created="Thu, 17 Jun 2021 15:59:40 +0000"  >&lt;p&gt;Hi Oleg Drokin,&lt;/p&gt;

&lt;p&gt;As we monitored the system with the below suggested change, we noticed that the metadata server (hmds1) is getting rebooted randomly and both clients are being evicted from the cluster, which is again causing issues. Could you please look into this further and advise?&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lctl set_param ldlm.namespaces.*MDT*mdc*.lru_size=100
lctl set_param -P ldlm.namespaces.*.lru_size=10000
lctl set_param -P ldlm.namespaces.*.lru_max_age=600000
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="305188" author="green" created="Tue, 22 Jun 2021 14:16:28 +0000"  >&lt;p&gt;is there any more information about the MDS crashes/reboots? Is there an oops/lbug/assert (what&apos;s the stacktrace)? a HA-induced reboot due to high load/whatever? something else?&lt;/p&gt;</comment>
                            <comment id="305510" author="cmcl" created="Fri, 25 Jun 2021 10:32:31 +0000"  >&lt;p&gt;I have uploaded the crash logs for couple of reboot instances to check this further.  &lt;/p&gt;

&lt;p&gt;Uploaded file name:   &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/39297/39297_vmcore-dmesg-2021-06-17.txt&quot; title=&quot;vmcore-dmesg-2021-06-17.txt attached to LU-13500&quot;&gt;vmcore-dmesg-2021-06-17.txt&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;   - Restarted on 17th June&lt;br/&gt;
                                     &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/39296/39296_vmcore-dmesg-2021-05-11.txt&quot; title=&quot;vmcore-dmesg-2021-05-11.txt attached to LU-13500&quot;&gt;vmcore-dmesg-2021-05-11.txt&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;   - Restarted on 11th May&lt;/p&gt;

&lt;p&gt;Last restart instance &lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;reboot   system boot  3.10.0-1062.18.1 Thu Jun 17 16:34 - 14:58 (7+22:23)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Load usage seems to be normal during that time&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;
03:50:01 PM         2      1990      8.66      8.37      9.16         2
04:00:01 PM        11      1981      6.24      8.02      8.75         1
04:10:01 PM        15      1981      9.57     10.08      9.54         3
04:20:01 PM         2      1982     11.88     13.18     11.28         1
Average:           10      1983     10.40     10.50     10.42         2

04:35:02 PM       LINUX RESTART

04:40:01 PM   runq-sz  plist-sz   ldavg-1   ldavg-5  ldavg-15   blocked
04:50:01 PM         2      1917     14.27     12.43      7.79         2
05:00:01 PM         5      1939     13.30     13.36     10.71         3
05:10:01 PM        11      1931     11.81     12.92     11.84         3
05:20:01 PM        20      1924     17.28     14.74     13.02         2
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;


&lt;p&gt;During most of the restarts, we could always see the message &quot;blk_update_request: critical medium error&quot;.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[463108.636890] blk_update_request: critical medium error, dev nvme3n1, sector 5325696536
[622470.873927] md: md0: data-check done.
[955488.123023] md: data-check of RAID array md0
[1028448.215074] ------------[ cut here ]------------
[1028448.215083] WARNING: CPU: 7 PID: 0 at lib/percpu-refcount.c:155 percpu_ref_switch_to_atomic_rcu+0x136/0x150
[1028448.215088] percpu ref (no_op) &amp;lt;= 0 (0) after switching to atomic
[1028448.215089] Modules linked in:
[1028448.215091]  vfat fat osp(OE) mdd(OE) lod(OE) mdt(OE) lfsck(OE) mgs(OE) mgc(OE) mpt3sas mpt2sas raid_class scsi_transport_sas mptctl mptbase osd_ldiskfs(OE) ldiskfs(OE) lquota(OE) fid(OE) fld(OE) ksocklnd(OE) ptlrpc(OE) mbcache jbd2 obdclass(OE) lnet(OE) libcfs(OE) dell_rbu nfsv3 nfs fscache binfmt_misc bonding iTCO_wdt iTCO_vendor_support mxm_wmi dcdbas sb_edac intel_powerclamp coretemp intel_rapl iosf_mbi kvm_intel kvm irqbypass raid10 sg lpc_ich mei_me mei ipmi_si ipmi_devintf ipmi_msghandler wmi acpi_power_meter pcc_cpufreq nfsd auth_rpcgss nfs_acl lockd grace ip_tables xfs sd_mod crc_t10dif crct10dif_generic 8021q garp mrp stp llc mgag200 i2c_algo_bit drm_kms_helper bnx2x syscopyarea sysfillrect sysimgblt fb_sys_fops ttm crct10dif_pclmul crct10dif_common crc32_pclmul crc32c_intel ghash_clmulni_intel
[1028448.215136]  ahci drm aesni_intel lrw gf128mul libahci glue_helper ablk_helper cryptd scsi_transport_iscsi libata megaraid_sas nvme ptp nvme_core pps_core drm_panel_orientation_quirks mdio libcrc32c dm_multipath sunrpc dm_mirror dm_region_hash dm_log dm_mod [last unloaded: usb_storage]
[1028448.215163] CPU: 7 PID: 0 Comm: swapper/7 Kdump: loaded Tainted: G           OE  ------------   3.10.0-1062.18.1.el7_lustre.x86_64 #1
[1028448.215164] Hardware name: Dell Inc. PowerEdge R730/072T6D, BIOS 2.11.0 11/02/2019
[1028448.215165] Call Trace:
[1028448.215167]  &amp;lt;IRQ&amp;gt;  [&amp;lt;ffffffff8cf7b416&amp;gt;] dump_stack+0x19/0x1b
[1028448.215178]  [&amp;lt;ffffffff8c89bab8&amp;gt;] __warn+0xd8/0x100
[1028448.215180]  [&amp;lt;ffffffff8c89bb3f&amp;gt;] warn_slowpath_fmt+0x5f/0x80
[1028448.215185]  [&amp;lt;ffffffff8c958b6d&amp;gt;] ? rcu_advance_cbs+0xcd/0xe0
[1028448.215187]  [&amp;lt;ffffffff8cd9b740&amp;gt;] ? null_show+0x20/0x20
[1028448.215189]  [&amp;lt;ffffffff8cb86cd6&amp;gt;] percpu_ref_switch_to_atomic_rcu+0x136/0x150
[1028448.215192]  [&amp;lt;ffffffff8c95a4f8&amp;gt;] rcu_process_callbacks+0x1d8/0x570
[1028448.215195]  [&amp;lt;ffffffff8c8a5435&amp;gt;] __do_softirq+0xf5/0x280
[1028448.215200]  [&amp;lt;ffffffff8cf9242c&amp;gt;] call_softirq+0x1c/0x30
[1028448.215204]  [&amp;lt;ffffffff8c82f715&amp;gt;] do_softirq+0x65/0xa0
[1028448.215206]  [&amp;lt;ffffffff8c8a57b5&amp;gt;] irq_exit+0x105/0x110
[1028448.215209]  [&amp;lt;ffffffff8cf939d8&amp;gt;] smp_apic_timer_interrupt+0x48/0x60
[1028448.215211]  [&amp;lt;ffffffff8cf8fefa&amp;gt;] apic_timer_interrupt+0x16a/0x170
[1028448.215213]  &amp;lt;EOI&amp;gt;  [&amp;lt;ffffffff8cdc2027&amp;gt;] ? cpuidle_enter_state+0x57/0xd0
[1028448.215218]  [&amp;lt;ffffffff8cdc217e&amp;gt;] cpuidle_idle_call+0xde/0x230
[1028448.215221]  [&amp;lt;ffffffff8c837c6e&amp;gt;] arch_cpu_idle+0xe/0xc0
[1028448.215226]  [&amp;lt;ffffffff8c9017da&amp;gt;] cpu_startup_entry+0x14a/0x1e0
[1028448.215231]  [&amp;lt;ffffffff8c85a0c7&amp;gt;] start_secondary+0x1f7/0x270
[1028448.215234]  [&amp;lt;ffffffff8c8000d5&amp;gt;] start_cpu+0x5/0x14
[1028448.215236] ---[ end trace 4441139b69972efc ]---
[1136636.662352] blk_update_request: critical medium error, dev nvme3n1, sector 10406963584
[1136636.710304] blk_update_request: critical medium error, dev nvme3n1, sector 10406967424
[1165058.648561] md: md0: data-check done.
[1560227.473353] md: data-check of RAID array md0
[1745005.678257] blk_update_request: critical medium error, dev nvme3n1, sector 9927251728
[1779948.753709] md: md0: data-check done.
[1852184.120501] blk_update_request: critical medium error, dev nvme3n1, sector 9334756896
[1852184.120532] md/raid10:md0: nvme3n1: rescheduling sector 18668989512
[1852184.120644] blk_update_request: critical medium error, dev nvme3n1, sector 9334757280
[1852184.120673] md/raid10:md0: nvme3n1: rescheduling sector 18668990280
[1852184.120952] blk_update_request: critical medium error, dev nvme3n1, sector 9334756896
[1852184.121138] md/raid10:md0: read error corrected (8 sectors at 262144 on nvme3n1)
[1852184.121148] md/raid10:md0: nvme2n1: redirecting sector 18668989512 to another mirror
[1852184.121526] blk_update_request: critical medium error, dev nvme3n1, sector 9334757280
[1852184.121709] md/raid10:md0: read error corrected (8 sectors at 262144 on nvme3n1)
[1852184.121720] md/raid10:md0: nvme2n1: redirecting sector 18668990280 to another mirror
[2164966.710539] md: data-check of RAID array md0
[2183165.845838] blk_update_request: critical medium error, dev nvme3n1, sector 917466240
[2183165.846403] blk_update_request: critical medium error, dev nvme3n1, sector 917466280
[2431150.697903] md: md0: data-check done.
[2769705.550545] md: data-check of RAID array md0
[2969839.376579] blk_update_request: critical medium error, dev nvme3n1, sector 10812151448
[2993948.791186] md: md0: data-check done.
[3170956.989559] ------------[ cut here ]------------
[3170956.989584] kernel BUG at /tmp/rpmbuild-lustre-jenkins-KWaNslf3/BUILD/lustre-2.12.5_62_gac40c31/ldiskfs/htree_lock.c:429!
[3170956.989617] invalid opcode: 0000 [#1] SMP 
[3170956.989634] Modules linked in: vfat fat osp(OE) mdd(OE) lod(OE) mdt(OE) lfsck(OE) mgs(OE) mgc(OE) mpt3sas mpt2sas raid_class scsi_transport_sas mptctl mptbase osd_ldiskfs(OE) ldiskfs(OE) lquota(OE) fid(OE) fld(OE) ksocklnd(OE) ptlrpc(OE) mbcache jbd2 obdclass(OE) lnet(OE) libcfs(OE) dell_rbu nfsv3 nfs fscache binfmt_misc bonding iTCO_wdt iTCO_vendor_support mxm_wmi dcdbas sb_edac intel_powerclamp coretemp intel_rapl iosf_mbi kvm_intel kvm irqbypass raid10 sg lpc_ich mei_me mei ipmi_si ipmi_devintf ipmi_msghandler wmi acpi_power_meter pcc_cpufreq nfsd auth_rpcgss nfs_acl lockd grace ip_tables xfs sd_mod crc_t10dif crct10dif_generic 8021q garp mrp stp llc mgag200 i2c_algo_bit drm_kms_helper bnx2x syscopyarea sysfillrect sysimgblt fb_sys_fops ttm crct10dif_pclmul crct10dif_common crc32_pclmul crc32c_intel
[3170956.989935]  ghash_clmulni_intel ahci drm aesni_intel lrw gf128mul libahci glue_helper ablk_helper cryptd scsi_transport_iscsi libata megaraid_sas nvme ptp nvme_core pps_core drm_panel_orientation_quirks mdio libcrc32c dm_multipath sunrpc dm_mirror dm_region_hash dm_log dm_mod [last unloaded: usb_storage]
[3170956.990047] CPU: 28 PID: 17088 Comm: mdt00_017 Kdump: loaded Tainted: G        W  OE  ------------   3.10.0-1062.18.1.el7_lustre.x86_64 #1
[3170956.990085] Hardware name: Dell Inc. PowerEdge R730/072T6D, BIOS 2.11.0 11/02/2019
[3170956.990109] task: ffff9de935f041c0 ti: ffff9de74522c000 task.ti: ffff9de74522c000
[3170956.990132] RIP: 0010:[&amp;lt;ffffffffc0a86284&amp;gt;]  [&amp;lt;ffffffffc0a86284&amp;gt;] htree_node_unlock+0x4b4/0x4c0 [ldiskfs]
[3170956.990179] RSP: 0018:ffff9de74522f8b0  EFLAGS: 00010246
[3170956.990197] RAX: ffff9de933d98600 RBX: 0000000000000001 RCX: ffff9e0936a5b290
[3170956.990219] RDX: 00000000000000c8 RSI: 0000000000000001 RDI: 0000000000000000
[3170956.990241] RBP: ffff9de74522f928 R08: ffff9dcca9a01d00 R09: ffff9dda59559500
[3170956.990263] R10: 0000000000000000 R11: ffff9dcd260b9258 R12: ffff9e0936a5b2d8
[3170956.990285] R13: 0000000000000000 R14: ffff9de7ffea8a00 R15: ffff9dcd260b91e0
[3170956.990308] FS:  0000000000000000(0000) GS:ffff9de93f980000(0000) knlGS:0000000000000000
[3170956.990332] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[3170956.991222] CR2: 00007fdb46c25000 CR3: 0000003ff39b8000 CR4: 00000000003607e0
[3170956.992087] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[3170956.992943] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[3170956.993790] Call Trace:
[3170956.994639]  [&amp;lt;ffffffffc0a8646a&amp;gt;] htree_node_release_all+0x5a/0x80 [ldiskfs]
[3170956.995505]  [&amp;lt;ffffffffc0a864b2&amp;gt;] htree_unlock+0x22/0x70 [ldiskfs]
[3170956.996363]  [&amp;lt;ffffffffc0ad7b1e&amp;gt;] osd_index_ea_delete+0x30e/0xb10 [osd_ldiskfs]
[3170956.997244]  [&amp;lt;ffffffffc138efa8&amp;gt;] lod_sub_delete+0x1c8/0x460 [lod]
[3170956.998285]  [&amp;lt;ffffffffc1369c24&amp;gt;] lod_delete+0x24/0x30 [lod]
[3170956.999350]  [&amp;lt;ffffffffc13de7c4&amp;gt;] __mdd_index_delete_only+0x194/0x250 [mdd]
[3170957.000391]  [&amp;lt;ffffffffc13e1136&amp;gt;] __mdd_index_delete+0x46/0x290 [mdd]
[3170957.001480]  [&amp;lt;ffffffffc13eee78&amp;gt;] mdd_unlink+0x5f8/0xaa0 [mdd]
[3170957.002467]  [&amp;lt;ffffffffc12b0b69&amp;gt;] mdo_unlink+0x46/0x48 [mdt]
[3170957.003319]  [&amp;lt;ffffffffc1274b1d&amp;gt;] mdt_reint_unlink+0xbed/0x14b0 [mdt]
[3170957.004156]  [&amp;lt;ffffffffc12791b3&amp;gt;] mdt_reint_rec+0x83/0x210 [mdt]
[3170957.004990]  [&amp;lt;ffffffffc1255383&amp;gt;] mdt_reint_internal+0x6e3/0xaf0 [mdt]
[3170957.005829]  [&amp;lt;ffffffffc12610f7&amp;gt;] mdt_reint+0x67/0x140 [mdt]
[3170957.006704]  [&amp;lt;ffffffffc0f86e8a&amp;gt;] tgt_request_handle+0xada/0x1570 [ptlrpc]
[3170957.007724]  [&amp;lt;ffffffffc0f605d1&amp;gt;] ? ptlrpc_nrs_req_get_nolock0+0xd1/0x170 [ptlrpc]
[3170957.008755]  [&amp;lt;ffffffffc082cbde&amp;gt;] ? ktime_get_real_seconds+0xe/0x10 [libcfs]
[3170957.009747]  [&amp;lt;ffffffffc0f2b83b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
[3170957.010752]  [&amp;lt;ffffffffc0f28655&amp;gt;] ? ptlrpc_wait_event+0xa5/0x360 [ptlrpc]
[3170957.011741]  [&amp;lt;ffffffff8c8d3a33&amp;gt;] ? __wake_up+0x13/0x20
[3170957.012740]  [&amp;lt;ffffffffc0f2f1a4&amp;gt;] ptlrpc_main+0xb34/0x1470 [ptlrpc]
[3170957.013630]  [&amp;lt;ffffffffc0f2e670&amp;gt;] ? ptlrpc_register_service+0xf80/0xf80 [ptlrpc]
[3170957.014404]  [&amp;lt;ffffffff8c8c6321&amp;gt;] kthread+0xd1/0xe0
[3170957.015169]  [&amp;lt;ffffffff8c8c6250&amp;gt;] ? insert_kthread_work+0x40/0x40
[3170957.015910]  [&amp;lt;ffffffff8cf8ed37&amp;gt;] ret_from_fork_nospec_begin+0x21/0x21
[3170957.016633]  [&amp;lt;ffffffff8c8c6250&amp;gt;] ? insert_kthread_work+0x40/0x40
[3170957.017336] Code: 0f 0b 48 8b 45 90 8b 55 8c f3 90 0f a3 10 19 c9 85 c9 75 f5 f0 0f ab 10 19 c9 85 c9 0f 84 a4 fb ff ff eb e5 0f 1f 00 0f 0b 0f 0b &amp;lt;0f&amp;gt; 0b 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 55 89 f0 48 
[3170957.018817] RIP  [&amp;lt;ffffffffc0a86284&amp;gt;] htree_node_unlock+0x4b4/0x4c0 [ldiskfs]
[3170957.019524]  RSP &amp;lt;ffff9de74522f8b0&amp;gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="305781" author="green" created="Tue, 29 Jun 2021 13:38:51 +0000"  >&lt;p&gt;The last one looks like &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13054&quot; title=&quot;MDS kernel BUG at ldiskfs/htree_lock.c:429!&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13054&quot;&gt;&lt;del&gt;LU-13054&lt;/del&gt;&lt;/a&gt; to me and this patch seems to be missing from the tree you are running (git hash ac40c31)&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="316966" author="dneg" created="Fri, 29 Oct 2021 08:09:24 +0000"  >&lt;p&gt;Hi Oleg,&lt;/p&gt;

&lt;p&gt;Is this patch (for the reboot issue) included in the 2.12.6 version, or do we need to apply it manually in 2.12.6?&#160;&lt;/p&gt;
                            <comment id="316995" author="pjones" created="Fri, 29 Oct 2021 14:29:48 +0000"  >&lt;p&gt;The fix is queued up for the upcoming 2.12.8 release but, in the meantime, you could apply the patch - see &lt;a href=&quot;https://review.whamcloud.com/#/c/44121/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/44121/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="317364" author="dneg" created="Wed, 3 Nov 2021 16:49:12 +0000"  >&lt;p&gt;Hi Peter,&lt;/p&gt;

&lt;p&gt;Thanks for the update,&#160; We have applied above patch on hotel and monitoring it further.&#160;&lt;/p&gt;

&lt;p&gt;When are we planning to release version 2.12.8?&#160;&lt;/p&gt;
                            <comment id="317373" author="pjones" created="Wed, 3 Nov 2021 17:43:32 +0000"  >&lt;p&gt;The timeline will be affected by third party software releases so is a little unclear but seems very likely to be before the year is out.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="59623">LU-13692</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="57570">LU-13054</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="35920" name="hmds1-log-write.png" size="7180" author="cmcl" created="Mon, 7 Sep 2020 15:50:21 +0000"/>
                            <attachment id="35918" name="hmds1-timezone.png" size="22110" author="cmcl" created="Mon, 7 Sep 2020 15:49:41 +0000"/>
                            <attachment id="35921" name="hotel1-logs-write.png" size="9047" author="cmcl" created="Mon, 7 Sep 2020 15:50:35 +0000"/>
                            <attachment id="35919" name="hotel1-timezone.png" size="22092" author="cmcl" created="Mon, 7 Sep 2020 15:50:02 +0000"/>
                            <attachment id="39296" name="vmcore-dmesg-2021-05-11.txt" size="160022" author="cmcl" created="Fri, 25 Jun 2021 10:29:39 +0000"/>
                            <attachment id="39297" name="vmcore-dmesg-2021-06-17.txt" size="153175" author="cmcl" created="Fri, 25 Jun 2021 10:29:43 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00z9r:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>