<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:55:32 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-12774] Lustre client OST stuck in &quot;Evicted&quot; state</title>
                <link>https://jira.whamcloud.com/browse/LU-12774</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;We had one of our lustre clients (this one is acting as an NFS gateway).&#160; It got evicted from an OST and seems to be stuck in that state and never recovers.&#160; Rebooting seems to be required to get it back in operation.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;We were getting these out of the kernel log:&lt;/p&gt;

&lt;p&gt;kernel: [ 7226.864597] LustreError: 11-0: lustre-OST0090-osc-ffff88103aeb4000: operation ost_read to node &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;mailto:10.11.200.13@o2ib&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;10.11.200.13@o2ib&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/mail_small.gif&quot; height=&quot;12&quot; width=&quot;13&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt; failed: rc = -107&lt;br/&gt;
 kernel: [ 7226.864606] Lustre: lustre-OST0090-osc-ffff88103aeb4000: Connection to lustre-OST0090 (at &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;mailto:10.11.200.13@o2ib&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;10.11.200.13@o2ib&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/mail_small.gif&quot; height=&quot;12&quot; width=&quot;13&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;) was lost; in progress operations using this service will wait for recovery to complete&lt;br/&gt;
 kernel: [ 7226.864772] LustreError: 167-0: lustre-OST0090-osc-ffff88103aeb4000: This client was evicted by lustre-OST0090; in progress operations using this service will fail.&lt;/p&gt;

&lt;p&gt;kernel: [ 7226.877968] LustreError: 6866:0:(ldlm_resource.c:1101:ldlm_resource_complain()) lustre-OST0090-osc-ffff88103aeb4000: namespace resource &lt;span class=&quot;error&quot;&gt;&amp;#91;0x2680000400:0x2b70b4a:0x0&amp;#93;&lt;/span&gt;.0x0 (ffff88203c9&lt;br/&gt;
 5f880) refcount nonzero (1) after lock cleanup; forcing cleanup.&lt;br/&gt;
 kernel: [ 7226.877972] LustreError: 6866:0:(ldlm_resource.c:1683:ldlm_resource_dump()) &amp;#8212; Resource: &lt;span class=&quot;error&quot;&gt;&amp;#91;0x2680000400:0x2b70b4a:0x0&amp;#93;&lt;/span&gt;.0x0 (ffff88203c95f880) refcount = 2&lt;br/&gt;
 lstgwbal837 kernel: [ 7226.877973] LustreError: 6866:0:(ldlm_resource.c:1686:ldlm_resource_dump()) Granted locks (in reverse order):&lt;br/&gt;
 lstgwbal837 kernel: [ 7226.877978] LustreError: 6866:0:(ldlm_resource.c:1689:ldlm_resource_dump()) ### ### ns: lustre-OST0090-osc-ffff88103aeb4000 lock: ffff88100e7a1c00/0x80fbdb776558f43 lrc: 4/0,1 mode: PW/PW res: &lt;span class=&quot;error&quot;&gt;&amp;#91;0x2680000400:0x2b70b4a:0x0&amp;#93;&lt;/span&gt;.0x0 rrc: 3 type: EXT &lt;span class=&quot;error&quot;&gt;&amp;#91;0-&amp;gt;18446744073709551615&amp;#93;&lt;/span&gt; (req 36864-&amp;gt;40959) flags: 0x526400020000 nid: local remote: 0xc69e1cc8fc9b178e expref: -99 pid: 5106 timeout: 0 lvb_type: 1&lt;/p&gt;

&lt;p&gt;kernel: [ 7460.227838] LustreError: 5106:0:(osc_cache.c:952:osc_extent_wait()) extent ffff8807ba25aa98@&lt;/p&gt;
{[6 -&amp;gt; 9/1023], [3|1|-|active|wiuY|ffff8816a6279180], [40960|4|+|-|ffff88100e7a1c00|1024|
 (null)]}
&lt;p&gt; lustre-OST0090-osc-ffff88103aeb4000: wait ext to 0 timedout, recovery in progress?&lt;br/&gt;
 kernel: [ 7460.227846] LustreError: 5106:0:(osc_cache.c:952:osc_extent_wait()) ### extent: ffff8807ba25aa98 ns: lustre-OST0090-osc-ffff88103aeb4000 lock: ffff88100e7a1c00/0x80fbdb776558f43 lrc: 4/0,1 mode: PW/PW res: &lt;span class=&quot;error&quot;&gt;&amp;#91;0x2680000400:0x2b70b4a:0x0&amp;#93;&lt;/span&gt;.0x0 rrc: 3 type: EXT &lt;span class=&quot;error&quot;&gt;&amp;#91;0-&amp;gt;18446744073709551615&amp;#93;&lt;/span&gt; (req 36864-&amp;gt;40959) flags: 0x426400020000 nid: local remote: 0xc69e1cc8fc9b178e expref: -99 pid: 5106 timeout: 0 lvb_type: 1&lt;br/&gt;
 kernel: [ 7460.227848] LustreError: 5106:0:(osc_cache.c:952:osc_extent_wait()) Skipped 1 previous similar message&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;We aren&apos;t sure what&apos;s going on there, but it looked to us like after getting evicted it tried to clean up locks and was failing to clean one up, which was preventing it from trying to recover?&lt;/p&gt;

&lt;p&gt;We are currently running the 2.10.8 client and server.&#160; Any help would be appreciated!&lt;/p&gt;

&lt;p&gt;Thanks!&lt;/p&gt;</description>
                <environment></environment>
        <key id="56929">LU-12774</key>
            <summary>Lustre client OST stuck in &quot;Evicted&quot; state</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="wc-triage">WC Triage</assignee>
                                    <reporter username="mcmult">Tim McMullan</reporter>
                        <labels>
                    </labels>
                <created>Tue, 17 Sep 2019 16:08:30 +0000</created>
                <updated>Fri, 15 Nov 2019 15:13:22 +0000</updated>
                                            <version>Lustre 2.10.8</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="255277" author="mcmult" created="Mon, 23 Sep 2019 19:06:13 +0000"  >&lt;p&gt;We just hit a similar state, but we weren&apos;t actively evicted from a client.&#160;&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;We were only getting these:&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;Mon Sep 23 13:41:20 2019&amp;#93;&lt;/span&gt; LustreError: 7973:0:(osc_cache.c:952:osc_extent_wait()) extent ffff881972951e88@{&lt;span class=&quot;error&quot;&gt;&amp;#91;29846 -&amp;gt; 29856/30719&amp;#93;&lt;/span&gt;, &lt;span class=&quot;error&quot;&gt;&amp;#91;4|1|-|active|wiuY|ffff8816e4114990&amp;#93;&lt;/span&gt;, &lt;span class=&quot;error&quot;&gt;&amp;#91;69632|11|+|+|ffff88190cb0d800|1024| (null)&amp;#93;&lt;/span&gt;} lustre-OST008f-osc-ffff88103de9c000: wait ext to 0 timedout, recovery in progress?&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;Mon Sep 23 13:41:20 2019&amp;#93;&lt;/span&gt; LustreError: 7973:0:(osc_cache.c:952:osc_extent_wait()) ### extent: ffff881972951e88 ns: lustre-OST008f-osc-ffff88103de9c000 lock: ffff88190cb0d800/0x390d8f3984002de2 lrc: 4/0,1 mode: PW/PW res: &lt;span class=&quot;error&quot;&gt;&amp;#91;0x35b5871:0x0:0x0&amp;#93;&lt;/span&gt;.0x0 rrc: 2 type: EXT &lt;span class=&quot;error&quot;&gt;&amp;#91;0-&amp;gt;18446744073709551615&amp;#93;&lt;/span&gt; (req 0-&amp;gt;49151) flags: 0x20000020000 nid: local remote: 0x24410d5ca118deb2 expref: -99 pid: 7973 timeout: 0 lvb_type: 1&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;that pid showed this backtrace:&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa118648c&amp;gt;&amp;#93;&lt;/span&gt; osc_extent_wait+0x49c/0x7c0 &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa118cf64&amp;gt;&amp;#93;&lt;/span&gt; osc_cache_wait_range+0x314/0x970 &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa117909b&amp;gt;&amp;#93;&lt;/span&gt; osc_io_fsync_end+0x7b/0x80 &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0cd959e&amp;gt;&amp;#93;&lt;/span&gt; cl_io_end+0x4e/0x130 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa104abeb&amp;gt;&amp;#93;&lt;/span&gt; lov_io_end_wrapper+0xcb/0xd0 &lt;span class=&quot;error&quot;&gt;&amp;#91;lov&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa104ae87&amp;gt;&amp;#93;&lt;/span&gt; lov_io_fsync_end+0x77/0x1a0 &lt;span class=&quot;error&quot;&gt;&amp;#91;lov&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0cd959e&amp;gt;&amp;#93;&lt;/span&gt; cl_io_end+0x4e/0x130 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0cdb940&amp;gt;&amp;#93;&lt;/span&gt; cl_io_loop+0x110/0xc30 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa10af45a&amp;gt;&amp;#93;&lt;/span&gt; cl_sync_file_range+0x28a/0x310 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa10cfc03&amp;gt;&amp;#93;&lt;/span&gt; ll_writepages+0x73/0x1d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81197717&amp;gt;&amp;#93;&lt;/span&gt; __filemap_fdatawrite_range+0xa7/0xe0&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8119784c&amp;gt;&amp;#93;&lt;/span&gt; filemap_write_and_wait_range+0x3c/0x80&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa10af56b&amp;gt;&amp;#93;&lt;/span&gt; ll_fsync+0x8b/0x5d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa131cc84&amp;gt;&amp;#93;&lt;/span&gt; nfsd_commit+0x94/0xb0 &lt;span class=&quot;error&quot;&gt;&amp;#91;nfsd&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa13253f3&amp;gt;&amp;#93;&lt;/span&gt; nfsd3_proc_commit+0x83/0xd0 &lt;span class=&quot;error&quot;&gt;&amp;#91;nfsd&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa1317e0c&amp;gt;&amp;#93;&lt;/span&gt; nfsd_dispatch+0xcc/0x270 &lt;span class=&quot;error&quot;&gt;&amp;#91;nfsd&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa125152f&amp;gt;&amp;#93;&lt;/span&gt; svc_process_common+0x42f/0x6c0 &lt;span class=&quot;error&quot;&gt;&amp;#91;sunrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa12518bd&amp;gt;&amp;#93;&lt;/span&gt; svc_process+0xfd/0x1c0 &lt;span class=&quot;error&quot;&gt;&amp;#91;sunrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa131787a&amp;gt;&amp;#93;&lt;/span&gt; nfsd+0xea/0x160 &lt;span class=&quot;error&quot;&gt;&amp;#91;nfsd&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff810a0e29&amp;gt;&amp;#93;&lt;/span&gt; kthread+0xc9/0xe0&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8161e1df&amp;gt;&amp;#93;&lt;/span&gt; ret_from_fork+0x3f/0x80&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff810a0d60&amp;gt;&amp;#93;&lt;/span&gt; kthread+0x0/0xe0&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffffffffff&amp;gt;&amp;#93;&lt;/span&gt; 0xffffffffffffffff&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;We are still trying to track down what exactly is getting us in this state.&#160;&#160;&lt;/p&gt;</comment>
                            <comment id="255718" author="makia" created="Tue, 1 Oct 2019 15:18:04 +0000"  >&lt;p&gt;A quick question, was this system upgraded to 2.10.8, and if so from what? Have there been any further occurrences of this since reported?&lt;/p&gt;</comment>
                            <comment id="255719" author="mcmult" created="Tue, 1 Oct 2019 15:36:19 +0000"  >&lt;p&gt;We have had this happen since reporting the issue multiple times.&#160; At this point, we&apos;ve moved the service off to a different host (still on the old client actually) and haven&apos;t seen it on the other, older client host.&lt;/p&gt;

&lt;p&gt;The client was initially 2.10.0, but was upgraded to 2.10.8 which is when we started seeing problems.&lt;/p&gt;

&lt;p&gt;The servers started off on 2.7 quite a while ago, the system was upgraded to 2.10.0, then to 2.10.4, then 2.10.8.&#160; All of that is relatively ancient history at this point, the client is the only thing to change recently.&lt;/p&gt;</comment>
                            <comment id="255722" author="makia" created="Tue, 1 Oct 2019 15:48:36 +0000"  >&lt;p&gt;Thanks for this information, another question just to get a clearer picture. The above that happened multiple times, this was all on the same host, or different hosts? You migrated to a &quot;new&quot; host (older version) but all previous failures were on the same 2.10.8 host?&lt;/p&gt;</comment>
                            <comment id="255723" author="mcmult" created="Tue, 1 Oct 2019 15:52:59 +0000"  >&lt;p&gt;sure thing, thank you for looking! Yes, all occurrences have been from the same host so far.&lt;/p&gt;</comment>
                            <comment id="255727" author="makia" created="Tue, 1 Oct 2019 16:08:52 +0000"  >&lt;p&gt;At this point, I have 2 running theories. The first is that because this has been seen on the same host, something is preventing recovery from fully completing, which then keeps popping back up. You may need to look at aborting recovery on mount if you haven&apos;t already. The expectation here is that it will &quot;ignore&quot; the previous states and move back on with life. Have you attempted to abort recovery on this host to see what happens?&lt;/p&gt;

&lt;p&gt;The second theory is a bit more worrisome, and that this is some kind of issue introduced between 2.10.0 and 2.10.8 that is causing NFS services to evict. I don&apos;t think we&apos;re quite here yet, but writing it just in case.&lt;/p&gt;</comment>
                            <comment id="255729" author="mcmult" created="Tue, 1 Oct 2019 16:29:22 +0000"  >&lt;p&gt;So, slightly more interesting bit: when we were digging around on the system while it was broken it looked like the client was trying to flush all its locks, but got stuck trying to flush a single lock and looked like the client never actually tried to enter recovery.&lt;/p&gt;

&lt;p&gt;A little more information on what we have seen so far: An issue has cropped up 3 times, all have included the &quot;osc_cache.c:952:osc_extent_wait()&quot; Lustre error.&#160; The first and third time, we were actually evicted by the server and we think never tried to enter recovery.&#160; The second occurrence was a little different... It didn&apos;t look like we got evicted, we did get several of the &quot;osc_cache.c:952:osc_extent_wait()&quot; Lustre errors, but every nfsd process was locked up.&#160; The trace for every nfsd process looked like the trace I included in my first comment.&lt;/p&gt;

&lt;p&gt;We haven&apos;t yet tried to abort recovery on mount, though I was thinking about trying it if we saw the error again.&lt;/p&gt;</comment>
                            <comment id="255730" author="makia" created="Tue, 1 Oct 2019 16:32:58 +0000"  >&lt;p&gt;OK, this lends more to the first theory and needing to abort recovery. There had been some testing seen where a client wasn&apos;t fully releasing locks which was causing recovery to fail. By aborting recovery, it gets past that state and allows the system to basically quiet down and move on.&lt;/p&gt;</comment>
                            <comment id="255735" author="mcmult" created="Tue, 1 Oct 2019 17:14:24 +0000"  >&lt;p&gt;I&apos;ll give that a try if we start seeing errors again, but if that solves the immediate issue I&apos;m a little concerned that we got there in the first place&lt;/p&gt;</comment>
                            <comment id="255736" author="makia" created="Tue, 1 Oct 2019 17:24:54 +0000"  >&lt;p&gt;I will need to dig through some Jira to see if there are reported issues relating to this. Because this appears to be an issue related to unreleased locks and the client is specifically re-exporting via NFS we are likely seeing something in that path not quite finishing. This could be an untimely crash of this specific host or some transaction between the NFS server and its clients not quite completing. We would need to trace back to the beginning of this issue (and less the later evictions) to understand if there was some kind of event at that time that we can narrow down to; crashes/reboots, networking errors/hiccups, that kind of thing. This definitely shouldn&apos;t be seen under &quot;normal&quot; conditions.&lt;/p&gt;</comment>
                            <comment id="256256" author="mcmult" created="Fri, 11 Oct 2019 21:00:17 +0000"  >&lt;p&gt;we had a repro on a different lustre system:&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;2019-10-11T16:12:14.325450-04:00 lstgwbal850 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;3460541.106751&amp;#93;&lt;/span&gt; LustreError: 67287:0:(ldlm_resource.c:1683:ldlm_resource_dump()) &amp;#8212; Resource: &lt;span class=&quot;error&quot;&gt;&amp;#91;0x1240000400:0xd5d2c5:0x0&amp;#93;&lt;/span&gt;.0x0 (ffff880cd3510400) refcount = 2&lt;/p&gt;

&lt;p&gt;2019-10-11T16:12:14.325451-04:00 lstgwbal850 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;3460541.106753&amp;#93;&lt;/span&gt; LustreError: 67287:0:(ldlm_resource.c:1686:ldlm_resource_dump()) Granted locks (in reverse order):&lt;/p&gt;

&lt;p&gt;2019-10-11T16:12:14.325452-04:00 lstgwbal850 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;3460541.106758&amp;#93;&lt;/span&gt; LustreError: 67287:0:(ldlm_resource.c:1689:ldlm_resource_dump()) ### ### ns: lustre2-OST003d-osc-ffff8817d77a6800 lock: ffff880196a92600/0x42b2f97bf1c27aaf lrc: 4/0,1 mode: PW/PW res: &lt;span class=&quot;error&quot;&gt;&amp;#91;0x1240000400:0xd5d2c5:0x0&amp;#93;&lt;/span&gt;.0x0 rrc: 3 type: EXT &lt;span class=&quot;error&quot;&gt;&amp;#91;0-&amp;gt;18446744073709551615&amp;#93;&lt;/span&gt; (req 0-&amp;gt;4095) flags: 0x526400000000 nid: local remote: 0x686ffd0d6ebfa2f7 expref: -99 pid: 4105 timeout: 0 lvb_type: 1&lt;/p&gt;

&lt;p&gt;2019-10-11T16:17:15.373431-04:00 lstgwbal850 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;3460842.129241&amp;#93;&lt;/span&gt; LustreError: 4101:0:(osc_cache.c:952:osc_extent_wait()) extent ffff88156af6f408@{&lt;span class=&quot;error&quot;&gt;&amp;#91;2388 -&amp;gt; 2388/3071&amp;#93;&lt;/span&gt;, &lt;span class=&quot;error&quot;&gt;&amp;#91;3|1|-|active|wiuY|ffff880dc0ec3180&amp;#93;&lt;/span&gt;, &lt;span class=&quot;error&quot;&gt;&amp;#91;28672|1|+|-|ffff880196a92600|1024|&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; (null)&amp;#93;&lt;/span&gt;} lustre2-OST003d-osc-ffff8817d77a6800: wait ext to 0 timedout, recovery in progress?&lt;/p&gt;

&lt;p&gt;2019-10-11T16:17:15.373434-04:00 lstgwbal850 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;3460842.129248&amp;#93;&lt;/span&gt; LustreError: 4101:0:(osc_cache.c:952:osc_extent_wait()) ### extent: ffff88156af6f408 ns: lustre2-OST003d-osc-ffff8817d77a6800 lock: ffff880196a92600/0x42b2f97bf1c27aaf lrc: 4/0,1 mode: PW/PW res: &lt;span class=&quot;error&quot;&gt;&amp;#91;0x1240000400:0xd5d2c5:0x0&amp;#93;&lt;/span&gt;.0x0 rrc: 3 type: EXT &lt;span class=&quot;error&quot;&gt;&amp;#91;0-&amp;gt;18446744073709551615&amp;#93;&lt;/span&gt; (req 0-&amp;gt;4095) flags: 0x426400000000 nid: local remote: 0x686ffd0d6ebfa2f7 expref: -99 pid: 4105 timeout: 0 lvb_type: 1&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;Attempting to abort recovery hasn&apos;t seemed to help :/&lt;/p&gt;</comment>
                            <comment id="256258" author="mcmult" created="Fri, 11 Oct 2019 21:19:31 +0000"  >&lt;p&gt;Our current (tentative) plan going forward is to build latest 2.12 client for our system and install on the problem box.&#160; If it recurs with the 2.12 client, we plan to start&#160;bisecting the 2.10 client, install it on our nfs server, and see where (roughly) we still see issues and the issues stop.&lt;/p&gt;</comment>

&lt;p&gt;The thinking here is that our 2.10.0 client seems to run clean, so perhaps we can at least find the bug client side.&lt;/p&gt;</comment>
                            <comment id="256274" author="makia" created="Sat, 12 Oct 2019 01:28:36 +0000"  >&lt;p&gt;I&apos;d say this is a good next step to see what happens. Thanks for the update.&lt;/p&gt;</comment>
                            <comment id="257744" author="mcmult" created="Tue, 5 Nov 2019 16:18:54 +0000"  >&lt;p&gt;We&apos;ve had a repro of this on the lustre 2.12.3 client:&lt;/p&gt;</comment>

&lt;p&gt;LustreError: 4730:0:(osc_cache.c:955:osc_extent_wait()) extent ffff881fe90ce9b8@{&lt;span class=&quot;error&quot;&gt;&amp;#91;4153 -&amp;gt; 4178/5119&amp;#93;&lt;/span&gt;, &lt;span class=&quot;error&quot;&gt;&amp;#91;3|1|-|active|wiuY|ffff881ff61c3b50&amp;#93;&lt;/span&gt;, &lt;span class=&quot;error&quot;&gt;&amp;#91;131072|26|+|-|ffff881f7dd55b80|1024| (null)&amp;#93;&lt;/span&gt;} lustre-OST0084-osc-ffff88103b209000: wait ext to 0 timedout, recovery in progress?&lt;br/&gt;
LustreError: 4730:0:(osc_cache.c:955:osc_extent_wait()) ### extent: ffff881fe90ce9b8 ns: lustre-OST0084-osc-ffff88103b209000 lock: ffff881f7dd55b80/0xa275852dbeedc19a lrc: 4/0,1 mode: PW/PW res: &lt;span class=&quot;error&quot;&gt;&amp;#91;0x39b9f46:0x0:0x0&amp;#93;&lt;/span&gt;.0x0 rrc: 2 type: EXT &lt;span class=&quot;error&quot;&gt;&amp;#91;0-&amp;gt;18446744073709551615&amp;#93;&lt;/span&gt; (req 16777216-&amp;gt;17125375) flags: 0x20000020000 nid: local remote: 0x7517e8b592c8f8bb expref: -99 pid: 4729 timeout: 0 lvb_type: 1&lt;/p&gt;

&lt;p&gt;The node didn&apos;t get evicted, but we had enough of these to hang up the nfs server last night.&lt;/p&gt;</comment>
                            <comment id="258385" author="makia" created="Fri, 15 Nov 2019 15:13:22 +0000"  >&lt;p&gt;You had mentioned that if the issue reoccurred you were going to test a 2.10.0 client; were you able to do that? On next occurrence of this, it would also be good to get a full set of lustre logs attached to see what fully is happening.&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00mvj:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>