<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:33:46 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-3421] (ost_handler.c:1762:ost_blocking_ast()) Error -2 syncing data on lock cancel causes first ENOSPC client issues then MDS server locks up</title>
                <link>https://jira.whamcloud.com/browse/LU-3421</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Several times during our test shot we would encounter a situation where OSTs would report ENOSPC even tho there was enough space and inodes available. In time the MDS would become pinned and it would have to be rebooted to get a working file system again. I have attached the console logs for the MDS and OSS.&lt;/p&gt;</description>
                <environment>RHEL6.4 running with Lustre 2.4-rc2 and cray clients running [2.4.0-rc1 SLES11 SP1 / 2.4.0-rc2 SLES11 SP2] clients</environment>
        <key id="19231">LU-3421</key>
            <summary>(ost_handler.c:1762:ost_blocking_ast()) Error -2 syncing data on lock cancel causes first ENOSPC client issues then MDS server locks up</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="bobijam">Zhenyu Xu</assignee>
                                    <reporter username="simmonsja">James A Simmons</reporter>
                        <labels>
                            <label>llnl</label>
                    </labels>
                <created>Thu, 30 May 2013 16:50:26 +0000</created>
                <updated>Thu, 18 Sep 2014 21:00:51 +0000</updated>
                            <resolved>Tue, 3 Sep 2013 13:11:54 +0000</resolved>
                                    <version>Lustre 2.4.0</version>
                                    <fixVersion>Lustre 2.4.1</fixVersion>
                    <fixVersion>Lustre 2.5.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>7</watches>
                                                                            <comments>
                            <comment id="59673" author="pjones" created="Thu, 30 May 2013 18:02:59 +0000"  >&lt;p&gt;Bobijam&lt;/p&gt;

&lt;p&gt;Could you please look into this one?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="59674" author="simmonsja" created="Thu, 30 May 2013 18:07:21 +0000"  >&lt;p&gt;backtrace from the vmcore which will be uploaded soon:&lt;/p&gt;

&lt;p&gt;PID: 4814   TASK: ffff880e23333500  CPU: 15  COMMAND: &quot;mdt03_140&quot;&lt;br/&gt;
 #0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8800607e7e90&amp;#93;&lt;/span&gt; crash_nmi_callback at ffffffff8102d316&lt;br/&gt;
 #1 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8800607e7ea0&amp;#93;&lt;/span&gt; notifier_call_chain at ffffffff81513a85&lt;br/&gt;
 #2 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8800607e7ee0&amp;#93;&lt;/span&gt; atomic_notifier_call_chain at ffffffff81513aea&lt;br/&gt;
 #3 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8800607e7ef0&amp;#93;&lt;/span&gt; notify_die at ffffffff8109cc1e&lt;br/&gt;
 #4 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8800607e7f20&amp;#93;&lt;/span&gt; do_nmi at ffffffff8151174b&lt;br/&gt;
 #5 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8800607e7f50&amp;#93;&lt;/span&gt; nmi at ffffffff81511010&lt;br/&gt;
    &lt;span class=&quot;error&quot;&gt;&amp;#91;exception RIP: _spin_lock+30&amp;#93;&lt;/span&gt;&lt;br/&gt;
    RIP: ffffffff8151087e  RSP: ffff880e23335c40  RFLAGS: 00000293&lt;br/&gt;
    RAX: 00000000000014a6  RBX: ffff88099a82c200  RCX: ffff880e23335cf8&lt;br/&gt;
    RDX: 00000000000014a3  RSI: ffff880e23335cf0  RDI: ffff880ac63c35d8&lt;br/&gt;
    RBP: ffff880e23335c40   R8: 0000000000000000   R9: ffff8800b6c1dc00&lt;br/&gt;
    R10: 0000000000000009  R11: ffffffffa08dd5a0  R12: ffff88099a82c200&lt;br/&gt;
    R13: ffff880ac63c35c0  R14: ffff880e23335cf8  R15: 0000000000000000&lt;br/&gt;
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018&lt;br/&gt;
&amp;#8212; &amp;lt;NMI exception stack&amp;gt; &amp;#8212;&lt;br/&gt;
 #6 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880e23335c40&amp;#93;&lt;/span&gt; _spin_lock at ffffffff8151087e&lt;br/&gt;
 #7 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880e23335c48&amp;#93;&lt;/span&gt; lock_res_and_lock at ffffffffa07e3070 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #8 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880e23335c68&amp;#93;&lt;/span&gt; ldlm_lock_enqueue at ffffffffa07e859d &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #9 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880e23335cc8&amp;#93;&lt;/span&gt; ldlm_handle_enqueue0 at ffffffffa080f1bf &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
#10 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880e23335d38&amp;#93;&lt;/span&gt; mdt_enqueue at ffffffffa0e6e3c6 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
#11 [ffff8#12 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880e23335da8&amp;#93;&lt;/span&gt; mds_regular_handle at ffffffffa0eae165 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
#13 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880e23335db8&amp;#93;&lt;/span&gt; ptlrpc_server_handle_request at ffffffffa0841388 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
#14 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880e23335eb8&amp;#93;&lt;/span&gt; ptlrpc_main at ffffffffa084271e &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
#15 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880e23335f48&amp;#93;&lt;/span&gt; kernel_thread at ffffffff8100c0ca80e23335d58]&lt;/p&gt;</comment>
                            <comment id="59693" author="adilger" created="Thu, 30 May 2013 20:47:28 +0000"  >&lt;p&gt;It looks like there are a couple of different bugs being hit here:&lt;/p&gt;
&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;the OSTs are apparently reporting -ENOSPC when there is space available on them.  Is this possibly a case where the OST is reporting ENOSPC during precreate when there are free inodes, but there are no free blocks?  What is the &lt;tt&gt;lfs df&lt;/tt&gt; and &lt;tt&gt;lfs df -i&lt;/tt&gt; output from the filesystem?  If this state hits again, please also collect the output from &lt;tt&gt;lctl get_param osp.*.sync_* osp.*.create_count&lt;/tt&gt; on the MDS&lt;/li&gt;
	&lt;li&gt;the MDS goes into a busy loop trying to create objects on the OSTs, rather than returning -ENOSPC to the client.  While it is good to ensure that the OSTs have been tried for creates, it doesn&apos;t make sense to try precreate so often if the OST is already out of space.&lt;/li&gt;
&lt;/ul&gt;


&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;May 29 08:18:40 widow-mds1 kernel: [76013.652071] Lustre: routed1-OST0015-osc-MDT0000: slow creates, last=[0x100150000:0x22041:0x0], next=[0x100150000:0x22041:0x0], reserved=0, syn_changes=2061, syn_rpc_in_progress=1, status=-19
May 29 08:19:32 widow-mds1 kernel: [76065.440057] Lustre: routed1-OST00f0-osc-MDT0000: slow creates, last=[0x100f00000:0x25e93:0x0], next=[0x100f00000:0x25e93:0x0], reserved=0, syn_changes=0, syn_rpc_in_progress=1, status=-19
May 29 08:23:37 widow-mds1 kernel: [76310.325026] LustreError: 6065:0:(osp_precreate.c:484:osp_precreate_send()) routed1-OST002a-osc-MDT0000: can&apos;t precreate: rc = -28
May 29 08:23:37 widow-mds1 kernel: [76310.367372] LustreError: 6065:0:(osp_precreate.c:484:osp_precreate_send()) Skipped 70657 previous similar messages
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;There was previously debugging code in osp_precreate_reserve() because I was worried about exactly this kind of situation, but my recent patch &lt;a href=&quot;http://review.whamcloud.com/6219&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/6219&lt;/a&gt; (commit dc2bcafd2a0b) removed it before the 2.4.0 release because it would have LBUG&apos;d in this case:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;@@ -1062,17 +1061,6 @@ &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; osp_precreate_reserve(&lt;span class=&quot;code-keyword&quot;&gt;const&lt;/span&gt; struct lu_env *env, struct osp_device *d)
        &lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; ((rc = d-&amp;gt;opd_pre_status) == 0 || rc == -ENOSPC ||
                rc == -ENODEV || rc == -EAGAIN) {
 
-#&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; LUSTRE_VERSION_CODE &amp;lt; OBD_OCD_VERSION(2, 3, 90, 0)
-               /*
-                * to address Andreas&apos;s concern on possible busy-loop
-                * between &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; thread and osp_precreate_send()
-                */
-               &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (unlikely(count++ == 1000)) {
-                       osp_precreate_timeout_condition(d);
-                       LBUG();
-               }
-#endif
-
                /*
                 * increase number of precreations
                 */
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;It probably makes sense to reinstate this if it loops only, say, twice for -ENOSPC, since there are other OSTs that could be used and it doesn&apos;t make sense to block the MDS thread for such a long time.&lt;/p&gt;

&lt;p&gt;It seems to me that the update of &lt;tt&gt;osp_pre_status&lt;/tt&gt; at the start of &lt;tt&gt;osp_pre_update_status()&lt;/tt&gt; is racy.  If &lt;tt&gt;rc == 0&lt;/tt&gt; (i.e. the statfs succeeded) then &lt;tt&gt;osp_pre_status = 0&lt;/tt&gt;, even though it is set to &lt;tt&gt;-ENOSPC&lt;/tt&gt; again later on.  It would be better to have something like:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (rc != 0) {
                d-&amp;gt;opd_pre_status = rc;
                &lt;span class=&quot;code-keyword&quot;&gt;goto&lt;/span&gt; out;
        }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Otherwise, &lt;tt&gt;opd_pre_status&lt;/tt&gt; can be changed to 0 and break &lt;tt&gt;osp_precreate_ready_condition()&lt;/tt&gt; checking of -ENOSPC.&lt;/p&gt;

&lt;p&gt;It also seems like the -ENOSPC hysteresis that is so well described in the comment in &lt;tt&gt;osp_pre_update_status()&lt;/tt&gt; is not actually implemented.  When the free space is &amp;lt; 0.1% &lt;tt&gt;opd_pre_status = -ENOSPC&lt;/tt&gt;, but it is immediately cleared if free space is &amp;gt;= 0.1%.  It also seems that there is a longstanding bug in the code, with min(used blocks vs. 1 GByte).  It seems something like the following is needed:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;         * On very large disk, say 16TB 0.1% will be 16 GB. We don&apos;t want to
         * lose that amount of space so in those cases we report no space left
         * &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; their is less than 1 GB left, and clear it at 2GB. 
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (likely(msfs-&amp;gt;os_type != 0)) { &lt;span class=&quot;code-comment&quot;&gt;/* msfs contains valid data */&lt;/span&gt;
                used = min((msfs-&amp;gt;os_blocks - msfs-&amp;gt;os_bfree) &amp;gt;&amp;gt; 10,
                           1ULL &amp;lt;&amp;lt; (30 - msfs-&amp;gt;os_bsize);
                &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (d-&amp;gt;opd_pre_status == 0 &amp;amp;&amp;amp;
                    msfs-&amp;gt;os_ffree &amp;lt; 32 || msfs-&amp;gt;os_bavail &amp;lt; used) {
                        d-&amp;gt;opd_pre_status = -ENOSPC;
                        :
                        :
                } &lt;span class=&quot;code-keyword&quot;&gt;else&lt;/span&gt; &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (d-&amp;gt;opd_pre_status == -ENOSPC &amp;amp;&amp;amp;
                           msgs-&amp;gt;os_ffree &amp;gt; 64 &amp;amp;&amp;amp; msfs-&amp;gt;os_bavail &amp;gt; used * 2) {
                        d-&amp;gt;opd_pre_status = 0;
                        :
                        :
                }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The &lt;tt&gt;opd_pre_status = 0&lt;/tt&gt; state should only be set if there is enough space, not only if the STATFS RPC succeeded.&lt;/p&gt;</comment>
                            <comment id="59695" author="adilger" created="Thu, 30 May 2013 20:51:43 +0000"  >&lt;p&gt;James, I expect the stack trace you pointed out is not a primary cause of this problem.  That looks just to be an MDS thread that is blocked waiting for a DLM lock that is held by another thread.  My guess is that there is something wrong in the precreate code at ENOSPC time that is keeping the MDS threads blocked for too long a time, and this is caused by either really being out of space on the OSTs (either inodes or blocks is the same, since there is no point allocating objects on an OST with no space) or maybe false ENOSPC caused by a problem with the space grants or similar.&lt;/p&gt;

&lt;p&gt;It &lt;em&gt;does&lt;/em&gt; point out that we have something wrong with the usage of the DLM locks, or similar, because we should never be blocked for such a long time holding a spinlock.&lt;/p&gt;</comment>
                            <comment id="59703" author="simmonsja" created="Thu, 30 May 2013 22:20:33 +0000"  >&lt;p&gt;I uploaded the vmcore to ftp.whamcloud.com/uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3421&quot; title=&quot;(ost_handler.c:1762:ost_blocking_ast()) Error -2 syncing data on lock cancel causes first ENOSPC client issues then MDS server locks up&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3421&quot;&gt;&lt;del&gt;LU-3421&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="59711" author="simmonsja" created="Thu, 30 May 2013 23:36:42 +0000"  >&lt;p&gt;Andreas the lfs df -i output was pretty much what you seen below. The lfs df results were basically the same. In the last test shot we saw issues with 4 ost becoming very full but this was not the case this time.&lt;/p&gt;

&lt;p&gt;/lustre/routed1-OST0130_UUID     3872000       86598     3785402   2% /lustre/routed1&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:304&amp;#93;&lt;/span&gt;&lt;br/&gt;
/lustre/routed1-OST00ac_UUID     3872000       86584     3785416   2% /lustre/routed1&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:172&amp;#93;&lt;/span&gt;&lt;br/&gt;
/lustre/routed1-OST0047_UUID     3872000       86539     3785461   2% /lustre/routed1&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:71&amp;#93;&lt;/span&gt;&lt;br/&gt;
/lustre/routed1-OST01c2_UUID     3872000       85067     3786933   2% /lustre/routed1&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:450&amp;#93;&lt;/span&gt;&lt;br/&gt;
UUID                              Inodes       IUsed       IFree IUse% Mounted on&lt;/p&gt;

&lt;p&gt;Talking with Oleg he didn&apos;t believe it was due to grants. Each OST is 250GB in size. I also expected the real issue is happening on the OSS rather than the MDS but the admins only gathered a vmcore from the MDS.&lt;/p&gt;</comment>
                            <comment id="59721" author="bzzz" created="Fri, 31 May 2013 08:04:06 +0000"  >&lt;p&gt;I agree that osp_pre_update_status() should not be setting the status to 0 first. though I think that check to limit time we spent in osp_precreate_reserve() does not depend on the status, it&apos;s a hardlimit. and in the worst case when osp_precreate_ready_condition() missed -ENOSPC due to the race we still should leave osp_pre_update_status() up on expiration? notice the expiration point is not reset in the loop.&lt;/p&gt;
</comment>
                            <comment id="59722" author="bobijam" created="Fri, 31 May 2013 08:45:39 +0000"  >&lt;p&gt;James, &lt;/p&gt;

&lt;p&gt;Do you have OSS debug log? Want to check the reason why it reports ENOSPC while it seems there are plenty.&lt;/p&gt;</comment>
                            <comment id="59736" author="simmonsja" created="Fri, 31 May 2013 11:49:37 +0000"  >&lt;p&gt;The only OSS logs I have are like the one I posted here.&lt;/p&gt;</comment>
                            <comment id="59919" author="adilger" created="Mon, 3 Jun 2013 22:56:22 +0000"  >&lt;p&gt;James, how many clients were mounting this filesystem?  If each OST is 250GB, and each client gets a 32MB grant, that means 32 clients/GB of free space, so 8000 clients would essentially pin all of the available space on each client.  I see something a bit strange in the code that might be causing a problem here:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;&lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; ofd_statfs(...)
{
        osfs-&amp;gt;os_bavail -= min_t(obd_size, osfs-&amp;gt;os_bavail,
                                 ((ofd-&amp;gt;ofd_tot_dirty + ofd-&amp;gt;ofd_tot_pending +
                                   osfs-&amp;gt;os_bsize - 1) &amp;gt;&amp;gt; ofd-&amp;gt;ofd_blockbits));
        :
        :
        /* The QoS code on the MDS does not care about space reserved &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt;
         * precreate, so take it out. */
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (exp_connect_flags(exp) &amp;amp; OBD_CONNECT_MDS) {
                struct filter_export_data *fed;

                fed = &amp;amp;obd-&amp;gt;obd_self_export-&amp;gt;exp_filter_data;
                osfs-&amp;gt;os_bavail -= min_t(obd_size, osfs-&amp;gt;os_bavail,
                                         fed-&amp;gt;fed_grant &amp;gt;&amp;gt; ofd-&amp;gt;ofd_blockbits);
        }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This is &lt;b&gt;subtracting&lt;/b&gt; the granted space from the available space returned to the MDS, but I &lt;em&gt;think&lt;/em&gt; it should be &lt;b&gt;adding&lt;/b&gt; the granted space back into os_bavail so that the MDS does not consider the grant space as &quot;used&quot;.  Otherwise, if the clients have reserved a lot of space on the OSTs they may not actually get to use it because the MDS will never allocate space there.&lt;/p&gt;

&lt;p&gt;A secondary issue is that there is no coordination between the space granted to a specific client and the objects that the MDS allocates to that client, which would become more important as the free space is running out.  There &lt;em&gt;could&lt;/em&gt; be some kind of (IMHO complex) coordination here between the MDS and clients/OSTs, but I think it would be easier if we just got the grant shrinking code to work again, as there is no guarantee that (a) clients doing IO will have any grant at all, and (b) the clients have grant on the OSTs for which they have been asked to write on.  Returning unused grant to the OSTs as the free space shrinks is the best way to ensure that there is some left for the clients actually doing IO.&lt;/p&gt;</comment>
                            <comment id="59944" author="johann" created="Tue, 4 Jun 2013 08:32:58 +0000"  >&lt;blockquote&gt;
&lt;p&gt;This is subtracting the granted space from the available space returned to the MDS,&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;This is actually only subtracting the space reserved for pre-creation for which we use the self export.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt; but I think it should be adding the granted space back into os_bavail so that the MDS does not consider the grant space as &quot;used&quot;. Otherwise, if the clients have reserved a lot of space on the OSTs they may not actually get to use it because the MDS will never allocate space there.&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;I think there is nothing to add back, since only tot_dirty and tot_pending are taken into account here. Please note that tot_granted is not withdrawn anywhere.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;A secondary issue is that there is no coordination between the space granted to a specific client and the objects that the MDS allocates to that client, which would become more important as the free space is running out.&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;I would agree if we were taking tot_granted out in statfs reply, however i don&apos;t think this is the case.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;There could be some kind of (IMHO complex) coordination here between the MDS and clients/OSTs, but I think it would be easier if we just got the grant shrinking code to work again, as there is no guarantee that (a) clients doing IO will have any grant at all, and (b) the clients have grant on the OSTs for which they have been asked to write on. Returning unused grant to the OSTs as the free space shrinks is the best way to ensure that there is some left for the clients actually doing IO.&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;I am all for resurrecting grant shrinking, although i haven&apos;t had the opportunity to do it yet by lack of time. In fact, we might as well just disconnect (and therefore release grants) from OSTs when idle and when the replay list is empty. We could then reconnect on demand. IMHO, such a scheme would have other benefits: less clients to wait for during recovery and less PING requests.&lt;/p&gt;

&lt;p&gt;As for the original problem, it seems that precreate requests fails with ENOSPC on the OST:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;May 29 09:00:06 widow-oss11a4 kernel: [79146.403713] LustreError: 25344:0:(ofd_obd.c:1338:ofd_create()) routed1-OST016b: unable to precreate: rc = -28
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;James, could you please dump the grant state on the OST by running &quot;lctl get_param obdfilter.*.tot* obdfilter.*.grant*&quot;?&lt;/p&gt;</comment>
                            <comment id="59973" author="johann" created="Tue, 4 Jun 2013 15:27:25 +0000"  >&lt;p&gt;While running some tests locally, i found out that the space reserved for precreation always decreases, eventually reaches 0 and stays there. It seems that we exit from ofd_grant() at line 641:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt; 637         &lt;span class=&quot;code-comment&quot;&gt;/* align grant on block size */&lt;/span&gt;
 638         grant &amp;amp;= ~((1ULL &amp;lt;&amp;lt; ofd-&amp;gt;ofd_blockbits) - 1);
 639 
 640         &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (!grant)
 641                 RETURN(0);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I think there are two issues:&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;ofd_grant_create() is not aggressive enough in reserving space for precreation and ends up requesting an amount of grant space smaller than a block&lt;/li&gt;
	&lt;li&gt;the rounding in ofd_grant() turns the &amp;lt;4KB allocation into 0.&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;I will provide a patch.&lt;/p&gt;</comment>
                            <comment id="60085" author="simmonsja" created="Thu, 6 Jun 2013 12:51:06 +0000"  >&lt;p&gt;I tested your patch at small scale and it works fine. I like to keep this ticket open until our next test shot to ensure this addresses the problem. Andreas point out other issues as well.&lt;/p&gt;</comment>
                            <comment id="60086" author="pjones" created="Thu, 6 Jun 2013 12:55:27 +0000"  >&lt;p&gt;ok James. Do you have a timeframe for your next testshot yet?&lt;/p&gt;</comment>
                            <comment id="60178" author="simmonsja" created="Fri, 7 Jun 2013 16:08:21 +0000"  >&lt;p&gt;Looks like the end of July for our next test shot. I will see if I can duplicate it at a smaller scale in the mean time.&lt;/p&gt;</comment>
                            <comment id="65567" author="pjones" created="Tue, 3 Sep 2013 03:46:39 +0000"  >&lt;p&gt;James&lt;/p&gt;

&lt;p&gt;This patch landed has landed both for 2.4.1 and 2.5. Has the issue reproduced in any test runs featuring this patch? If not, then perhaps we can close the ticket and reopen if it ever does reappear...&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="65594" author="simmonsja" created="Tue, 3 Sep 2013 13:08:47 +0000"  >&lt;p&gt;I haven&apos;t seen this bug in some time. You can close it.&lt;/p&gt;</comment>
                            <comment id="65595" author="pjones" created="Tue, 3 Sep 2013 13:11:55 +0000"  >&lt;p&gt;Thanks James!&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="15990">LU-1947</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="12966" name="mds-kern.log" size="2720666" author="simmonsja" created="Thu, 30 May 2013 16:50:26 +0000"/>
                            <attachment id="12967" name="oss11a4-kern.log" size="49688" author="simmonsja" created="Thu, 30 May 2013 16:50:26 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvsbb:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>8486</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>