<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:47:16 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-11826] Cannot send after transport endpoint shutdown</title>
                <link>https://jira.whamcloud.com/browse/LU-11826</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;When running multiple rm&apos;s of files, we get the following error in the shell:&lt;/p&gt;

&lt;p&gt;/bin/rm: cannot remove &apos;&amp;lt;/some/file/path&amp;gt;&#8217;: Cannot send after transport endpoint shutdown&lt;/p&gt;

&lt;p&gt;These coincide with the following error in /var/log/messages:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 Dec 24 11:13:09 foxtrot2 kernel: LustreError: 11-0: foxtrot-MDT0000-mdc-ffff883ff6b12800: operation mds_close to node 10.21.22.10@tcp failed: rc = -107Dec 24 11:13:09 foxtrot2 kernel: Lustre: foxtrot-MDT0000-mdc-ffff883ff6b12800: Connection to foxtrot-MDT0000 (at 10.21.22.10@tcp) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to completeDec 24 11:13:09 foxtrot2 kernel: LustreError: 167-0: foxtrot-MDT0000-mdc-ffff883ff6b12800: This client was evicted by foxtrot-MDT0000; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will fail.Dec 24 11:13:09 foxtrot2 kernel: LustreError: 3598:0:(mdc_locks.c:1211:mdc_intent_getattr_async_interpret()) ldlm_cli_enqueue_fini: -5Dec 24 11:13:09 foxtrot2 kernel: LustreError: 3598:0:(mdc_locks.c:1211:mdc_intent_getattr_async_interpret()) Skipped 37 previous similar messagesDec 24 11:13:09 foxtrot2 kernel: LustreError: Skipped 50 previous similar messagesDec 24 11:13:09 foxtrot2 kernel: LustreError: 39322:0:(llite_lib.c:1512:ll_md_setattr()) md_setattr fails: rc = -5Dec 24 11:13:09 foxtrot2 kernel: LustreError: 38248:0:(file.c:172:ll_close_inode_openhandle()) foxtrot-clilmv-ffff883ff6b12800: inode [0x200030875:0x5d11:0x0] mdc close failed: rc = -107Dec 24 11:13:09 foxtrot2 kernel: LustreError: 38248:0:(file.c:172:ll_close_inode_openhandle()) Skipped 743 previous similar messagesDec 24 11:13:09 foxtrot2 kernel: LustreError: 41760:0:(vvp_io.c:1474:vvp_io_init()) foxtrot: refresh file layout [0x2000302ba:0x103db:0x0] error -108.Dec 24 11:13:09 foxtrot2 kernel: LustreError: 41760:0:(vvp_io.c:1474:vvp_io_init()) Skipped 310070 previous similar messagesDec 24 11:13:09 foxtrot2 kernel: LustreError: 44300:0:(mdc_request.c:1329:mdc_read_page()) foxtrot-MDT0000-mdc-ffff883ff6b12800: [0x20002cfcf:0x5a20:0x0] lock enqueue fails: rc = -108Dec 24 11:13:09 foxtrot2 
kernel: LustreError: 39322:0:(llite_lib.c:1512:ll_md_setattr()) Skipped 5 previous similar messagesDec 24 11:13:09 foxtrot2 kernel: LustreError: 12816:0:(vvp_io.c:1474:vvp_io_init()) foxtrot: refresh file layout [0x200030766:0x18539:0x0] error -108.Dec 24 11:13:09 foxtrot2 kernel: LustreError: 39252:0:(vvp_io.c:1474:vvp_io_init()) foxtrot: refresh file layout [0x2000302ba:0x10403:0x0] error -108.Dec 24 11:13:09 foxtrot2 kernel: LustreError: 39252:0:(vvp_io.c:1474:vvp_io_init()) Skipped 143616 previous similar messagesDec 24 11:13:09 foxtrot2 kernel: LustreError: 44302:0:(file.c:172:ll_close_inode_openhandle()) foxtrot-clilmv-ffff883ff6b12800: inode [0x20000070c:0x2ea9:0x0] mdc close failed: rc = -108Dec 24 11:13:09 foxtrot2 kernel: LustreError: 44302:0:(file.c:172:ll_close_inode_openhandle()) Skipped 815 previous similar messagesDec 24 11:13:09 foxtrot2 kernel: LustreError: 12816:0:(vvp_io.c:1474:vvp_io_init()) Skipped 2986 previous similar messagesDec 24 11:13:10 foxtrot2 kernel: Lustre: foxtrot-MDT0000-mdc-ffff883ff6b12800: Connection restored to 10.21.22.10@tcp (at 10.21.22.10@tcp)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment>CentOS 7.4</environment>
        <key id="54398">LU-11826</key>
            <summary>Cannot send after transport endpoint shutdown</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="green">Oleg Drokin</assignee>
                                    <reporter username="cmcl">Campbell Mcleay</reporter>
                        <labels>
                    </labels>
                <created>Mon, 24 Dec 2018 11:27:21 +0000</created>
                <updated>Thu, 28 Feb 2019 13:15:41 +0000</updated>
                            <resolved>Sat, 23 Feb 2019 04:28:28 +0000</resolved>
                                    <version>Lustre 2.10.6</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>4</watches>
                                                                            <comments>
                            <comment id="239104" author="cmcl" created="Mon, 24 Dec 2018 11:29:25 +0000"  >&lt;p&gt;Just a note to say that the files it complains about when it says it can&apos;t remove them are not protected by read-only ACLs or file modes, and do exist on the filesystem.&lt;/p&gt;</comment>
                            <comment id="239106" author="pjones" created="Mon, 24 Dec 2018 13:07:45 +0000"  >&lt;p&gt;Emoly&lt;/p&gt;

&lt;p&gt;What do you suggest here?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="239108" author="green" created="Mon, 24 Dec 2018 17:58:11 +0000"  >&lt;p&gt;From the client excerpt provided it looks like the client was evicted by the servers. Can you please provide mds-side logs from that same time?&lt;/p&gt;</comment>
                            <comment id="239249" author="cmcl" created="Wed, 2 Jan 2019 10:59:27 +0000"  >&lt;p&gt;Hi Oleg,&lt;/p&gt;

&lt;p&gt;Sorry for the delay in replying. There appear to be errors in the MDS logs at that time too:&lt;/p&gt;

&lt;p&gt;Dec 24 11:13:09 fmds1 kernel: LustreError: 17227:0:(ldlm_lockd.c:682:ldlm_handle_ast_error()) ### client (nid 10.21.22.31@tcp) failed to reply to blocking AST (req@ffff88236d22d700 x1615613399909376 status 0 rc -110), evict it ns: mdt-foxtrot-MDT0000_UUID lock: ffff8820a62b8a00/0x13866bfbe99ef389 lrc: 4/0,0 mode: PR/PR res: &lt;span class=&quot;error&quot;&gt;&amp;#91;0x20002c7e2:0x4352:0x0&amp;#93;&lt;/span&gt;.0x0 bits 0x1b rrc: 2 type: IBT flags: 0x60200400000020 nid: 10.21.22.31@tcp remote: 0xbf690c3f5f1d9519 expref: 169058 pid: 87340 timeout: 9220366206 lvb_type: 0&lt;br/&gt;
Dec 24 11:13:09 fmds1 kernel: LustreError: 138-a: foxtrot-MDT0000: A client on nid 10.21.22.31@tcp was evicted due to a lock blocking callback time out: rc -110&lt;br/&gt;
Dec 24 11:13:09 fmds1 kernel: LustreError: 19974:0:(ldlm_lockd.c:1398:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8820f2829800 ns: mdt-foxtrot-MDT0000_UUID lock: ffff88243a769c00/0x13866bfbe99f2569 lrc: 3/0,0 mode: PR/PR res: &lt;span class=&quot;error&quot;&gt;&amp;#91;0x2000226c3:0xeea1:0x0&amp;#93;&lt;/span&gt;.0x0 bits 0x1b rrc: 1 type: IBT flags: 0x50200000000000 nid: 10.21.22.31@tcp remote: 0xbf690c3f5f1d993a expref: 168919 pid: 19974 timeout: 0 lvb_type: 0&lt;br/&gt;
Dec 24 11:13:09 fmds1 kernel: LustreError: 20543:0:(ldlm_lib.c:3180:target_bulk_io()) @@@ bulk READ failed: rc &lt;del&gt;107 req@ffff88236c2f8c00 x1620532433667344/t0(0) o37&lt;/del&gt;&amp;gt;5bfdc7b1-5f4a-3415-a424-ef332bafeb5b@10.21.22.31@tcp:-1/-1 lens 568/440 e 0 to 0 dl 1545650004 ref 1 fl Interpret:/0/0 rc 0/0&lt;br/&gt;
Dec 24 11:13:09 fmds1 kernel: Lustre: foxtrot-MDT0000: Connection restored to (at 10.21.22.31@tcp)&lt;br/&gt;
Dec 24 11:13:09 fmds1 kernel: LustreError: 19974:0:(ldlm_lockd.c:1398:ldlm_handle_enqueue0()) Skipped 30 previous similar messages&lt;br/&gt;
Dec 24 11:13:11 fmds1 kernel: LustreError: 98802:0:(client.c:1164:ptlrpc_import_delay_req()) @@@ IMP_CLOSED req@ffff88231848e900 x1615613399983344/t0(0) o104-&amp;gt;foxtrot-MDT0000@10.21.22.31@tcp:15/16 lens 296/224 e 0 to 0 dl 0 ref 1 fl Rpc:/0/ffffffff rc 0/-1&lt;br/&gt;
Dec 24 11:13:11 fmds1 kernel: LustreError: 98802:0:(client.c:1164:ptlrpc_import_delay_req()) Skipped 1 previous similar message&lt;br/&gt;
Dec 24 11:13:11 fmds1 kernel: LustreError: 98802:0:(ldlm_lockd.c:682:ldlm_handle_ast_error()) ### client (nid 10.21.22.31@tcp) failed to reply to blocking AST (req@ffff88231848e900 x1615613399983344 status 0 rc -5), evict it ns: mdt-foxtrot-MDT0000_UUID lock: ffff882550cebc00/0x13866bfbe6684d33 lrc: 4/0,0 mode: PR/PR res: &lt;span class=&quot;error&quot;&gt;&amp;#91;0x200030873:0x1e93f:0x0&amp;#93;&lt;/span&gt;.0x0 bits 0x2 rrc: 4 type: IBT flags: 0x60000400000020 nid: 10.21.22.31@tcp remote: 0xbf690c3f5eec06fe expref: 98520 pid: 20548 timeout: 9220368047 lvb_type: 0&lt;br/&gt;
Dec 24 11:13:11 fmds1 kernel: LustreError: 138-a: foxtrot-MDT0000: A client on nid 10.21.22.31@tcp was evicted due to a lock blocking callback time out: rc -5&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;Regards,&lt;/p&gt;

&lt;p&gt;Campbell&lt;/p&gt;</comment>
                            <comment id="239542" author="cmcl" created="Tue, 8 Jan 2019 16:26:42 +0000"  >&lt;p&gt;Hi Oleg,&lt;/p&gt;

&lt;p&gt;Any thoughts?&lt;/p&gt;

&lt;p&gt;Regards,&lt;/p&gt;

&lt;p&gt;Campbell&lt;/p&gt;</comment>
                            <comment id="239740" author="cmcl" created="Thu, 10 Jan 2019 11:51:37 +0000"  >&lt;p&gt;Can anyone help with this? Thanks&lt;/p&gt;</comment>
                            <comment id="239782" author="green" created="Thu, 10 Jan 2019 23:05:56 +0000"  >&lt;p&gt;so server side messages tell us that&lt;/p&gt;

&lt;p&gt;1. Client failed to reply to a blocking ast (a special message that lustre server sends to ask a client to release some lock due to a conflict). - the timeout for these low overhead messages is pretty small at 5-7 seconds so if there is a momentary lapse in connectivity for example, this might happen.&lt;br/&gt;
2. server evicted the client not to hold up the rest of the nodes waiting for a potentially dead client&lt;br/&gt;
3. the client turned out to be not as dead as it seemed and reconnected.&lt;br/&gt;
4. other blocking AST that we had scheduled for this client was interrupted due to eviction and the client was evicted once more.&lt;/p&gt;

&lt;p&gt;the part #4 does not sound entirely correct, but it did not change anything in the general flow of things since the original eviction is what has messed everything up and it most likely happened due to some hiccup on the network.&lt;/p&gt;

&lt;p&gt;To add: if you see this every time you do large rm&apos;s it&apos;s possible you are just overstretching some bit of the network infrastructure? you can collect some lustre debug logs and we can see which part exactly, I guess.&lt;/p&gt;</comment>
                            <comment id="239807" author="cmcl" created="Fri, 11 Jan 2019 11:36:23 +0000"  >&lt;p&gt;Hi Oleg,&lt;/p&gt;

&lt;p&gt;I&apos;ll prepare a bunch of rm jobs and turn on debugging. This may take a few days for me to get done.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;/p&gt;

&lt;p&gt;Campbell&lt;/p&gt;</comment>
                            <comment id="240010" author="cmcl" created="Tue, 15 Jan 2019 18:49:46 +0000"  >&lt;p&gt;Hi Oleg,&lt;/p&gt;

&lt;p&gt;Can you please let me know which of the following debug options I should enable (from :/proc/sys/lnet/subsystem_debug):&lt;/p&gt;

&lt;p&gt;undefined mdc mds osc ost class log llite rpc mgmt lnet lnd pinger filter echo ldlm lov lquota osd lfsck snapshot lmv sec gss mgc mgs fid fld&lt;/p&gt;

&lt;p&gt;Presumably, it is just:&lt;/p&gt;

&lt;p&gt;echo &quot;&amp;lt;subsystem1&amp;gt; &amp;lt;subsystem2&amp;gt; &amp;lt;subsystem3&amp;gt;&quot; &amp;gt;&#160;/proc/sys/lnet/debug&lt;/p&gt;

&lt;p&gt;Should I just do this on the MDS or on the clients as well? Same subsystems options on both?&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;Thanks,&lt;/p&gt;

&lt;p&gt;Campbell&lt;/p&gt;</comment>
                            <comment id="240060" author="green" created="Wed, 16 Jan 2019 05:39:49 +0000"  >&lt;p&gt;you could also do &quot;+...&quot; notation to just add things to what you have there.&lt;br/&gt;
Don&apos;t change anything in subsystem_debug, it&apos;s already fully enabled, you only need to adjust the debug setting&lt;/p&gt;

&lt;p&gt;In your case you are best advised to increase the debug log size to like 1G (note this will consume 1G of RAM)&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lctl set_param debug_mb=1024
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;and then add dlmtrace info and rpctrace with&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lctl set_param debug=&quot;+dlmtrace +info +rpctrace&quot;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Run these two commands on all the clients you expect the problem might hit and then also on the MDS.&lt;/p&gt;

&lt;p&gt;once the problem hits, run &quot;lctl dk &amp;gt;/tmp/${HOSTNAME}-lustre.log&quot; on all nodes affected, collect the files and upload them somewhere.&lt;/p&gt;

&lt;p&gt;Thanks.&lt;/p&gt;</comment>
                            <comment id="240205" author="cmcl" created="Thu, 17 Jan 2019 10:09:39 +0000"  >&lt;p&gt;Hi Oleg,&lt;/p&gt;

&lt;p&gt;There was a disconnection at around 22:27 last night, about 5 hours after I started the rm&apos;s. Unfortunately, I was not able to dump the log until I got in this morning, and the timestamp at the beginning of the client logs is after this time, however, the MDS debug log has that period covered. I&apos;ll restart the rm&apos;s and see if I can capture the error. Should I increase the debug_mb value on the clients? In the meantime, I can send you the MDS log, which compressed, is about 22MB. Is there somewhere I can send it?&lt;/p&gt;

&lt;p&gt;Regards,&lt;/p&gt;

&lt;p&gt;Campbell&lt;/p&gt;</comment>
                            <comment id="240211" author="pjones" created="Thu, 17 Jan 2019 11:20:48 +0000"  >&lt;p&gt;Sometimes the diagnostic data collected as part of Lustre troubleshooting is too large to be attached to a JIRA ticket. For these cases, Whamcloud provides an anonymous write-only FTP upload service. In order to use this service, you&apos;ll need an FTP client (e.g. ncftp, ftp, etc.) and a JIRA issue. Use the &apos;uploads&apos; directory and create a new subdirectory using your Jira issue as a name.&lt;/p&gt;

&lt;p&gt;In the following example, there are three debug logs in a single directory and the JIRA issue &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4242&quot; title=&quot;mdt_open.c:1685:mdt_reint_open()) LBUG&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4242&quot;&gt;&lt;del&gt;LU-4242&lt;/del&gt;&lt;/a&gt; has been created. After completing the upload, please update the relevant issue with a note mentioning the upload, so that our engineers know where to find your logs.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
$ ls -lh
total 333M
rw-rr- 1 mjmac mjmac 98M Feb 23 17:36 mds-debug
rw-rr- 1 mjmac mjmac 118M Feb 23 17:37 oss-00-debug
rw-rr- 1 mjmac mjmac 118M Feb 23 17:37 oss-01-debug
$ ncftp ftp.whamcloud.com
NcFTP 3.2.2 (Sep 04, 2008) by Mike Gleason (http:&lt;span class=&quot;code-comment&quot;&gt;//www.NcFTP.com/contact/).
&lt;/span&gt;Connecting to 99.96.190.235...
(vsFTPd 2.2.2)
Logging in...
Login successful.
Logged in to ftp.whamcloud.com.
ncftp / &amp;gt; cd uploads
Directory successfully changed.
ncftp /uploads &amp;gt; mkdir LU-4242
ncftp /uploads &amp;gt; cd LU-4242
Directory successfully changed.
ncftp /uploads/LU-4242 &amp;gt; put *
mds-debug: 97.66 MB 11.22 MB/s
oss-00-debug: 117.19 MB 11.16 MB/s
oss-01-debug: 117.48 MB 11.18 MB/s
ncftp /uploads/LU-4242 &amp;gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Please note that this is a WRITE-ONLY FTP service, so you will not be able to see (with ls) the files or directories you&apos;ve created, nor will you (or anyone other than Whamcloud staff) be able to see or read them.&lt;/p&gt;</comment>
                            <comment id="240234" author="cmcl" created="Thu, 17 Jan 2019 15:28:33 +0000"  >&lt;p&gt;Hi,&lt;/p&gt;

&lt;p&gt;I&apos;ve uploaded the MDS log (uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11826&quot; title=&quot;Cannot send after transport endpoint shutdown&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11826&quot;&gt;&lt;del&gt;LU-11826&lt;/del&gt;&lt;/a&gt;/fmds1-lustre.log.gz) to ftp.whamcloud.com. The time of the disconnection was&#160;1547661781 in unix time (16th Jan 22:27:22). I&apos;m running another rm in the hope I can capture some useful logs on the clients as well&lt;/p&gt;

&lt;p&gt;Regards,&lt;/p&gt;

&lt;p&gt;Campbell&lt;/p&gt;</comment>
                            <comment id="240448" author="cmcl" created="Mon, 21 Jan 2019 13:45:37 +0000"  >&lt;p&gt;Hi,&lt;/p&gt;

&lt;p&gt;Did you find anything of interest in the log I sent? Or do you need the client log? I&apos;m finding it difficult to capture the log on the client as it may take a few days to trigger it and I can&apos;t watch it all the time. Even with 4GB of memory assigned to the debug log size, it only seems to capture 5 minutes worth (though the log is 3GB in size)&lt;/p&gt;

&lt;p&gt;Regards,&lt;/p&gt;

&lt;p&gt;Campbell&lt;/p&gt;</comment>
                            <comment id="240501" author="green" created="Tue, 22 Jan 2019 04:14:25 +0000"  >&lt;p&gt;it&apos;s weird that your logs literally start with an eviction: &lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00010000:00020000:51.1F:1545412061.748721:0:0:0:(ldlm_lockd.c:333:waiting_locks_
callback()) ### lock callback timer expired after 0s: evicting client at 10.21.2
2.32@tcp  ns: mdt-foxtrot-MDT0000_UUID lock: ffff88268bb39400/0x13866bc8bdb565ba
 lrc: 4/0,0 mode: PR/PR res: [0x20002a5ed:0x16aa8:0x0].0x0 bits 0x1b rrc: 3 type
: IBT flags: 0x60200400000020 nid: 10.21.22.32@tcp remote: 0xc849fe3bc093e756 ex
pref: 63841 pid: 20017 timeout: 0 lvb_type: 0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;it makes no sense to run days of logs on the clients waiting for the eviction, but you can set &quot;dump logs on eviction&quot; flag and so every time a client is evicted it would dump some logs, the first one would be containing all the history.&lt;br/&gt;
you can do this by setting dump_on_eviction=1 with lctl set_param on every client and on the mdt.&lt;/p&gt;

&lt;p&gt;Also I don&apos;t see any evictions at 1547661781 timestamp in the logs, the preceding evictions are at 1545663407 which clearly is too long ago and the next evictions are at  1547669576 which is 2+ hours away from your specified time (timezone calculation problem?) and does not match the eviction profile we saw in the initial report (ast timeout). In fact at no time we see &quot;failed to reply to blocking AST&quot; message in the mdt log.&lt;/p&gt;

&lt;p&gt;Unfortunately the locks you gathered when you did already became kind of fragmented, so it would be really great if you can capture something from clients and servers using the dump on eviction method above (Lustre has three tier logging so even when logs start with a particular timestamp that does not mean there are no gaps in between those entries)&lt;/p&gt;</comment>
                            <comment id="240502" author="green" created="Tue, 22 Jan 2019 04:17:37 +0000"  >&lt;p&gt;I just noticed that you only capture 5 minutes of log in 3G, that might be enough since a normal lock timeout is like 200s and ast timeout is only about 7s.&lt;/p&gt;</comment>
                            <comment id="240517" author="cmcl" created="Tue, 22 Jan 2019 11:44:56 +0000"  >&lt;p&gt;Thank you very much Oleg, that&apos;s a very handy parameter. I have set this on the clients and the MDS and resumed the deletes. Might take a little while for it to trigger, will send the results through if it disconnects again.&lt;/p&gt;</comment>
                            <comment id="240542" author="cmcl" created="Tue, 22 Jan 2019 16:36:03 +0000"  >&lt;p&gt;Hi Oleg,&lt;/p&gt;

&lt;p&gt;Two of the three clients lost connection with the MDS. Have dumped the logs out and ftp&apos;ed them up. File names are:&lt;/p&gt;

&lt;p&gt;fmds1-lustre.log.220119.gz&lt;/p&gt;

&lt;p&gt;foxtrot2-lustre.log.220119.gz&lt;/p&gt;

&lt;p&gt;foxtrot3-lustre.log.220119.gz&lt;/p&gt;

&lt;p&gt;Regards,&lt;/p&gt;

&lt;p&gt;Campbell&lt;/p&gt;</comment>
                            <comment id="240802" author="cmcl" created="Mon, 28 Jan 2019 14:43:36 +0000"  >&lt;p&gt;Hi Oleg,&lt;/p&gt;

&lt;p&gt;Did you get a chance to take a look at those logs, and if so, were you able to determine what the issue was?&#160;&lt;/p&gt;

&lt;p&gt;Kind regards,&lt;/p&gt;

&lt;p&gt;Campbell&lt;/p&gt;</comment>
                            <comment id="240837" author="green" created="Tue, 29 Jan 2019 01:13:15 +0000"  >&lt;p&gt;hm... How were these logs produced? I don&apos;t see any evictions in there?&lt;/p&gt;

&lt;p&gt;I guess I failed to mention that when you have the dump on eviction setup, there would be /tmp/lustre-log-TIMESTAMP files dropped for every event that are all binary that you need to pass through &quot;lctl df&quot; to turn into text. It&apos;s those files that we need. Hopefully they are still there?&lt;/p&gt;

&lt;p&gt;the lctl df step is not necessary, we can run that ourselves.&lt;/p&gt;</comment>
                            <comment id="240858" author="cmcl" created="Tue, 29 Jan 2019 11:58:20 +0000"  >&lt;p&gt;Hi Oleg,&lt;/p&gt;

&lt;p&gt;Have uploaded the logs parsed via lctl df as well.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;/p&gt;

&lt;p&gt;Campbell&lt;/p&gt;</comment>
                            <comment id="240881" author="green" created="Tue, 29 Jan 2019 17:31:45 +0000"  >&lt;p&gt;Thank you for the logs, these ones are helpful, though I wonder what version do you really run, since some of the messages in there don&apos;t appear to match my copy of 2.10.6&lt;/p&gt;

&lt;p&gt;Anyway the issue you are hitting appears to be &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10945&quot; title=&quot;Race between sending bl ast and lock cancel&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10945&quot;&gt;&lt;del&gt;LU-10945&lt;/del&gt;&lt;/a&gt;, the telltale message in your logs is this:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00010000:00010000:27.0:1548163854.268356:0:4761:0:(ldlm_lockd.c:1685:ldlm_request_cancel()) ### server cancels blocked lock after 1548163854s ns: mdt-foxtrot-MDT0000_UUID lock: ffff8822c3821200/0x13866dfbc82358f1 lrc: 4/0,0 mode: PR/PR res: [0x20002f6d0:0x15746:0x0].0x0 bits 0x1b rrc: 5 type: IBT flags: 0x40200000000020 nid: 10.21.22.32@tcp remote: 0xc849fe70b80233f8 expref: 220164 pid: 7423 timeout: 0 lvb_type: 0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The patch &lt;a href=&quot;https://review.whamcloud.com/#/c/32133/3&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/32133/3&lt;/a&gt; should help you except it does not apply to the b2_10 tree, I&apos;ll make a port.&lt;/p&gt;

&lt;p&gt;Do you have ability to self-build Lustre with the patch (only MDS and OSSes would need the patched code)?&lt;/p&gt;</comment>
                            <comment id="240957" author="cmcl" created="Wed, 30 Jan 2019 11:48:15 +0000"  >&lt;p&gt;Hi Oleg,&lt;/p&gt;

&lt;p&gt;Thanks, we&apos;ll try the patch, so let&apos;s go ahead with that.&lt;/p&gt;

&lt;p&gt;Kind regards,&lt;/p&gt;

&lt;p&gt;Campbell&lt;/p&gt;</comment>
                            <comment id="240968" author="green" created="Wed, 30 Jan 2019 15:02:48 +0000"  >&lt;p&gt;The ported patch is here: &lt;a href=&quot;https://review.whamcloud.com/#/c/34131/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/34131/&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;I hoped it would be done testing by now, but apparently we have some test system slowness where results take awhile to become available.&lt;/p&gt;</comment>
                            <comment id="241174" author="cmcl" created="Fri, 1 Feb 2019 15:28:58 +0000"  >&lt;p&gt;Hi Oleg,&lt;/p&gt;

&lt;p&gt;Have built the server packages from that source tree you linked. Upon installation, I got a lot of warnings about&#160;llite_lloop.ko needing various unknown symbols, but I read somewhere that this package is obsolete - need I worry about this?&lt;/p&gt;

&lt;p&gt;Will start some deletes and see how it goes.&lt;/p&gt;

&lt;p&gt;Regards,&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;Campbell&lt;/p&gt;</comment>
                            <comment id="241181" author="green" created="Fri, 1 Feb 2019 16:57:07 +0000"  >&lt;p&gt;Yes, llite_lloop.ko is not really used nowadays so you should not worry too much about it. Please let me know how it goes, also if the problems persist, please collect the logs like before.&lt;/p&gt;</comment>
                            <comment id="242108" author="pjones" created="Sat, 16 Feb 2019 16:34:18 +0000"  >&lt;p&gt;How are things shaping up with the patch &lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=cmcl&quot; class=&quot;user-hover&quot; rel=&quot;cmcl&quot;&gt;cmcl&lt;/a&gt;? Ok to consider this ticket closed?&lt;/p&gt;</comment>
                            <comment id="242577" author="pjones" created="Sat, 23 Feb 2019 04:28:28 +0000"  >&lt;p&gt;Disappointing to not hear explicit feedback on the effectiveness of the patch but I suppose no news is good news...&lt;/p&gt;</comment>
                            <comment id="243007" author="cmcl" created="Thu, 28 Feb 2019 10:36:49 +0000"  >&lt;p&gt;Hi Peter,&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;My apologies, I missed your last message. We did extensive testing with parallel deletes and there were no &apos;transport endpoint shutdown&apos; messages. We&apos;re still getting evictions from OSTs but that is a separate issue. So I think we can consider the patch a success in fixing that issue.&lt;/p&gt;

&lt;p&gt;Kind regards,&lt;/p&gt;


&lt;p&gt;Campbell&lt;/p&gt;</comment>
                            <comment id="243018" author="pjones" created="Thu, 28 Feb 2019 13:15:41 +0000"  >&lt;p&gt;Ah good - thanks for confirming! We have included this fix in 2.10.7 but I am loathe to include fixes that we don&apos;t know serve a purpose.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="51953">LU-10945</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i008o7:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>