<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:06:37 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-7173] ldlm_lock_destroy_internal() LBUG encountered during 2.8 large scale testing</title>
                <link>https://jira.whamcloud.com/browse/LU-7173</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;While running a simulated user workload, our MDS crashed due to the following:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;&amp;lt;3&amp;gt;[ 5913.638287] LustreError: 15340:0:(ldlm_lock.c:371:ldlm_lock_destroy_internal()) ### lock still on resource ns: mdt-atlas1-MDT0000_UUID lock: ffff883fd37b3700/0xb9db2cef5d1e4139 lrc: 3/0,0 mode: CR/CR res: [0x200252cc7:0x3:0x0].0 bits 0x8 rrc: 2 type: IBT flags: 0x50000000000000 nid: 9310@gni100 remote: 0xd3f59509a44985ef expref: 60 pid: 15340 timeout: 0 lvb_type: 3
&amp;lt;0&amp;gt;[ 5913.674996] LustreError: 15340:0:(ldlm_lock.c:372:ldlm_lock_destroy_internal()) LBUG
&amp;lt;4&amp;gt;[ 5913.683915] Pid: 15340, comm: mdt03_052
&amp;lt;4&amp;gt;[ 5913.688327] 
&amp;lt;4&amp;gt;[ 5913.688327] Call Trace:
&amp;lt;4&amp;gt;[ 5913.692965]  [&amp;lt;ffffffffa0430895&amp;gt;] libcfs_debug_dumpstack+0x55/0x80 [libcfs]
&amp;lt;4&amp;gt;[ 5913.700886]  [&amp;lt;ffffffffa0430e97&amp;gt;] lbug_with_loc+0x47/0xb0 [libcfs]
&amp;lt;4&amp;gt;[ 5913.707968]  [&amp;lt;ffffffffa0707871&amp;gt;] ldlm_lock_destroy_internal+0x251/0x2c0 [ptlrpc]
&amp;lt;4&amp;gt;[ 5913.716573]  [&amp;lt;ffffffffa07092b5&amp;gt;] ldlm_lock_destroy+0x35/0x130 [ptlrpc]
&amp;lt;4&amp;gt;[ 5913.724118]  [&amp;lt;ffffffffa070a311&amp;gt;] ldlm_lock_enqueue+0x161/0x980 [ptlrpc]
&amp;lt;4&amp;gt;[ 5913.731760]  [&amp;lt;ffffffffa0733e9b&amp;gt;] ldlm_handle_enqueue0+0x51b/0x10c0 [ptlrpc]
&amp;lt;4&amp;gt;[ 5913.739797]  [&amp;lt;ffffffffa0d6f1a6&amp;gt;] mdt_enqueue+0x46/0xe0 [mdt]
&amp;lt;4&amp;gt;[ 5913.746362]  [&amp;lt;ffffffffa0d7401a&amp;gt;] mdt_handle_common+0x52a/0x1470 [mdt]
&amp;lt;4&amp;gt;[ 5913.753804]  [&amp;lt;ffffffffa0db0615&amp;gt;] mds_regular_handle+0x15/0x20 [mdt]
&amp;lt;4&amp;gt;[ 5913.761064]  [&amp;lt;ffffffffa0762f55&amp;gt;] ptlrpc_server_handle_request+0x385/0xc00 [ptlrpc]
&amp;lt;4&amp;gt;[ 5913.769866]  [&amp;lt;ffffffffa0442785&amp;gt;] ? lc_watchdog_touch+0x65/0x170 [libcfs]
&amp;lt;4&amp;gt;[ 5913.777632]  [&amp;lt;ffffffffa075b929&amp;gt;] ? ptlrpc_wait_event+0xa9/0x2d0 [ptlrpc]
&amp;lt;4&amp;gt;[ 5913.785377]  [&amp;lt;ffffffffa07656dd&amp;gt;] ptlrpc_main+0xaed/0x1930 [ptlrpc]
&amp;lt;4&amp;gt;[ 5913.792557]  [&amp;lt;ffffffffa0764bf0&amp;gt;] ? ptlrpc_main+0x0/0x1930 [ptlrpc]
&amp;lt;4&amp;gt;[ 5913.799689]  [&amp;lt;ffffffff8109e78e&amp;gt;] kthread+0x9e/0xc0
&amp;lt;4&amp;gt;[ 5913.805270]  [&amp;lt;ffffffff8100c28a&amp;gt;] child_rip+0xa/0x20
&amp;lt;4&amp;gt;[ 5913.810950]  [&amp;lt;ffffffff8109e6f0&amp;gt;] ? kthread+0x0/0xc0
&amp;lt;4&amp;gt;[ 5913.816622]  [&amp;lt;ffffffff8100c280&amp;gt;] ? child_rip+0x0/0x20
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment>Clients running Lustre 2.7.57 plus patches against a Lustre 2.5.4 server back end.</environment>
        <key id="32168">LU-7173</key>
            <summary>ldlm_lock_destroy_internal() LBUG encountered during 2.8 large scale testing</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="bobijam">Zhenyu Xu</assignee>
                                    <reporter username="simmonsja">James A Simmons</reporter>
                        <labels>
                            <label>patch</label>
                    </labels>
                <created>Wed, 16 Sep 2015 18:17:42 +0000</created>
                <updated>Fri, 2 Dec 2022 18:16:16 +0000</updated>
                            <resolved>Mon, 1 Feb 2016 12:39:57 +0000</resolved>
                                    <version>Lustre 2.8.0</version>
                                    <fixVersion>Lustre 2.8.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>15</watches>
                                                                            <comments>
                            <comment id="127533" author="pjones" created="Wed, 16 Sep 2015 18:44:08 +0000"  >&lt;p&gt;Bobijam&lt;/p&gt;

&lt;p&gt;Could you please look into this issue?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="127534" author="adilger" created="Wed, 16 Sep 2015 18:45:05 +0000"  >&lt;p&gt;Have you run this similar workload with 2.5.4 clients without problem?  This is a server-side bug and it &lt;em&gt;shouldn&apos;t&lt;/em&gt; be possible for the client to induce this kind of failure regardless of what it is doing.  The only thing I can think of is that the 2.7.57 client is causing the MDS to follow some error handling path that isn&apos;t exercised by 2.5 clients.&lt;/p&gt;</comment>
                            <comment id="127563" author="simmonsja" created="Wed, 16 Sep 2015 21:35:15 +0000"  >&lt;p&gt;Never seen this problem with the cray 2.5 clients or the lustre 2.7 clients we run with. This only happened with the pre-2.8 clients.&lt;/p&gt;</comment>
                            <comment id="127565" author="simmonsja" created="Wed, 16 Sep 2015 21:44:22 +0000"  >&lt;p&gt;Ugh. Your ftp server died on me so I uploaded the vmcore files and dmesgs at &lt;a href=&quot;http://www.infradead.org/~jsimmons/2015-09-15.tgz&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://www.infradead.org/~jsimmons/2015-09-15.tgz&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="127751" author="bobijam" created="Fri, 18 Sep 2015 06:04:12 +0000"  >&lt;p&gt;git commit 5517eab06eb99e4ecb66be251a10e70c37547610 added layout lock handling, esp. &lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;diff --git a/lustre/ldlm/ldlm_inodebits.c b/lustre/ldlm/ldlm_inodebits.c
index e10a654..b68dd59 100644
--- a/lustre/ldlm/ldlm_inodebits.c
+++ b/lustre/ldlm/ldlm_inodebits.c
@@ -190,8 +190,12 @@ &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; ldlm_process_inodebits_lock(struct ldlm_lock *lock, __u64 *flags,
         LASSERT(cfs_list_empty(&amp;amp;res-&amp;gt;lr_converting));
         check_res_locked(res);
 
-        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (!first_enq) {
-                LASSERT(work_list != NULL);
+       &lt;span class=&quot;code-comment&quot;&gt;/* (*flags &amp;amp; LDLM_FL_BLOCK_NOWAIT) is &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; layout lock right now. */&lt;/span&gt;
+        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (!first_enq || (*flags &amp;amp; LDLM_FL_BLOCK_NOWAIT)) {
+               *err = ELDLM_LOCK_ABORTED;
+               &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (*flags &amp;amp; LDLM_FL_BLOCK_NOWAIT)
+                       *err = ELDLM_LOCK_WOULDBLOCK;
+
                 rc = ldlm_inodebits_compat_queue(&amp;amp;res-&amp;gt;lr_granted, lock, NULL);
                 &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (!rc)
                         RETURN(LDLM_ITER_STOP);
@@ -201,6 +205,8 @@ &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; ldlm_process_inodebits_lock(struct ldlm_lock *lock, __u64 *flags,
 
                 ldlm_resource_unlink_lock(lock);
                 ldlm_grant_lock(lock, work_list);
+
+               *err = ELDLM_OK;
                 RETURN(LDLM_ITER_CONTINUE);
         }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;and in ldlm_lock_enqueue() &lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;        &lt;span class=&quot;code-comment&quot;&gt;/* policies are not executed on the client or during replay */&lt;/span&gt;
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; ((*flags &amp;amp; (LDLM_FL_HAS_INTENT|LDLM_FL_REPLAY)) == LDLM_FL_HAS_INTENT
            &amp;amp;&amp;amp; !local &amp;amp;&amp;amp; ns-&amp;gt;ns_policy) {
                rc = ns-&amp;gt;ns_policy(ns, lockp, cookie, lock-&amp;gt;l_req_mode, *flags,
                                   NULL);
                &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (rc == ELDLM_LOCK_REPLACED) {
                     ....
                } &lt;span class=&quot;code-keyword&quot;&gt;else&lt;/span&gt; &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (rc != ELDLM_OK ||
                           (rc == ELDLM_OK &amp;amp;&amp;amp; (*flags &amp;amp; LDLM_FL_INTENT_ONLY))) {
                        ldlm_lock_destroy(lock);
                        RETURN(rc);
                }
        }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I suspect that some intent_only lock has been granted, while ldlm_lock_destroy() takes a non-granted lock as its precondition.&lt;/p&gt;
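
&lt;p&gt;For reference, the check that fires (a rough sketch paraphrased from the console message above, not the verbatim source): a granted lock is still linked on its resource via l_res_link, so destroying it trips the assertion.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;/* sketch, paraphrased -- not the verbatim source */
void ldlm_lock_destroy_internal(struct ldlm_lock *lock)
{
        check_res_locked(lock-&amp;gt;l_resource);

        /* a granted lock is still on the resource lists via l_res_link,
         * so this emptiness check fails and we LBUG */
        if (!cfs_list_empty(&amp;amp;lock-&amp;gt;l_res_link)) {
                LDLM_ERROR(lock, &quot;lock still on resource&quot;);
                LBUG();
        }
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>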
                            <comment id="127761" author="bobijam" created="Fri, 18 Sep 2015 07:45:16 +0000"  >&lt;p&gt;two of the vmcore-dmesg.txt shows that problematic locks are granted (mode: CR/CR) LAYOUT locks (lvb_type: 3 -&amp;gt;LVB_T_LAYOUT)&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;&amp;lt;3&amp;gt;[ 5913.638287] LustreError: 15340:0:(ldlm_lock.c:371:ldlm_lock_destroy_internal()) ### lock still on resource ns: mdt-atlas1-MDT0000_UUID lock: ffff883fd37b3700/0xb9db2cef5d1e4139 lrc: 3/0,0 mode: CR/CR res: [0x200252cc7:0x3:0x0].0 bits 0x8 rrc: 2 type: IBT flags: 0x50000000000000 nid: 9310@gni100 remote: 0xd3f59509a44985ef expref: 60 pid: 15340 timeout: 0 lvb_type: 3
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;&amp;lt;3&amp;gt;[  969.928837] LustreError: 16016:0:(ldlm_lock.c:371:ldlm_lock_destroy_internal()) ### lock still on resource ns: mdt-atlas1-MDT0000_UUID lock: ffff881fbaae99c0/0xcc3ee5780a4e2eea lrc: 3/0,0 mode: CR/CR res: [0x20024d1ef:0x2e:0x0].0 bits 0x8 rrc: 1 type: IBT flags: 0x50000000000000 nid: 91@gni100 remote: 0xb81bd76a7d6191aa expref: 57 pid: 16016 timeout: 0 lvb_type: 3
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="127762" author="bobijam" created="Fri, 18 Sep 2015 07:51:07 +0000"  >&lt;p&gt;Hi James,&lt;/p&gt;

&lt;p&gt;Is it easy to reproduce?&lt;/p&gt;</comment>
                            <comment id="127773" author="simmonsja" created="Fri, 18 Sep 2015 13:55:05 +0000"  >&lt;p&gt;Not so easy to reproduce. We saw it only during our test shot on Titan running the 2.8 clients with a 2.5.4 server back end. It only showed up when we ran our test harness. After the first couple of runs the problem stopped showing up &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/sad.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt; I have never seen it at smaller scales.&lt;/p&gt;</comment>
                            <comment id="127947" author="gerrit" created="Mon, 21 Sep 2015 06:00:11 +0000"  >&lt;p&gt;Bobi Jam (bobijam@hotmail.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/16497&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/16497&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7173&quot; title=&quot;ldlm_lock_destroy_internal() LBUG encountered during 2.8 large scale testing&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7173&quot;&gt;&lt;del&gt;LU-7173&lt;/del&gt;&lt;/a&gt; ldlm: do not grant intent-only lock&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 9702522581eaf7ad309e8c91931d571b9cad0b7b&lt;/p&gt;</comment>
                            <comment id="128006" author="simmonsja" created="Mon, 21 Sep 2015 21:55:54 +0000"  >&lt;p&gt;The patch looks good but it will not be until Oct 13th that we get it test it out.&lt;/p&gt;</comment>
                            <comment id="129761" author="simmonsja" created="Wed, 7 Oct 2015 22:02:11 +0000"  >&lt;p&gt;We have been a bunch of testing of this patch at various scales before our Titan test shot and we are seeing very large meta data performance improvement with this patch.&lt;/p&gt;

&lt;p&gt;Results with patch:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Directory creation : 37714.322  11084.963  26771.564   9055.694
Directory stat     : 58904.478  26689.126  48141.812   9343.208
Directory removal  : 34290.774  12088.699  22193.915   6303.121
File creation      :  4937.247     43.594   4020.814   1064.732
File stat          :   796.591    124.390    476.913    149.635
File read          : 30091.505  19214.287  26420.152   3238.929
File removal       :  4577.256   1845.895   4072.557    591.356
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Results before patch:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Directory creation :  5657.386   4416.307   5086.521    376.908
Directory stat     : 74770.631  72558.713  73347.269    544.856
Directory removal  :  7990.972   6751.102   7465.554    382.254
File creation      :  6867.935   6227.613   6558.801    201.736
File read          : 28801.818  27562.517  28219.511    412.373
File removal       :  5889.433   4705.258   5566.932    314.018
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;&lt;/p&gt;</comment>
                            <comment id="129794" author="adilger" created="Thu, 8 Oct 2015 07:16:18 +0000"  >&lt;p&gt;Well, there is an improvement for directory creation and removal performance with the patch, but the directory stat and file creation and removal performance is down and the performance results are much more variable between runs.&lt;/p&gt;</comment>
                            <comment id="130275" author="vitaly_fertman" created="Tue, 13 Oct 2015 19:19:43 +0000"  >&lt;p&gt;who said it is about intent_only? is there a recovery? is it DNE?&lt;br/&gt;
I have a similar dump and this is intent_getattr resend, due to some reason getattr created a new lock and the previous update lock is to be destroyed due to LOCK_REPLACED.&lt;/p&gt;</comment>
                            <comment id="130276" author="simmonsja" created="Tue, 13 Oct 2015 19:37:12 +0000"  >&lt;p&gt;I had a discussion about this patch with Oleg. The patch only is suppose to touch the recovery path but we saw this LBUG during an application run. We will see if this patch resolves our problem on our Oct 27 test shot.&lt;/p&gt;</comment>
                            <comment id="130458" author="green" created="Thu, 15 Oct 2015 00:03:27 +0000"  >&lt;p&gt;I had a lengthly discussion with Vitaly about what appears to be the same bug they are hitting (much closer to master, though).&lt;/p&gt;

&lt;p&gt;Since there&apos;s no recovery at play here, we believe the intent_only flag might be a red herring and instead the problem is the other one:&lt;/p&gt;

&lt;p&gt;ldlm_lock_enqueue:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;                rc = ns-&amp;gt;ns_policy(ns, lockp, cookie, lock-&amp;gt;l_req_mode, *flags,
                                   NULL);
                &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (rc == ELDLM_LOCK_REPLACED) {
...
                } &lt;span class=&quot;code-keyword&quot;&gt;else&lt;/span&gt; &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (rc != ELDLM_OK ||
                           (rc == ELDLM_OK &amp;amp;&amp;amp; (*flags &amp;amp; LDLM_FL_INTENT_ONLY))) {
                        ldlm_lock_destroy(lock);
                        RETURN(rc);
                }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;This is correctly identified by Bobijam, but we believe the case here is that of rc != ELDLM_OK, which is supported by crashdump data from this other affected site (for the record, rc is -2, or -ENOENT).&lt;br/&gt;
The other bit of info we gathered from there is that the request is a RESEND (but not a replay).&lt;/p&gt;

&lt;p&gt;So imagine we have this situation - an intent request comes in (getattr in this case), it is handled, and then a reply is sent (along with a lock), but the reply is then lost.&lt;br/&gt;
The client resends the request, and the mdt fixup code finds the old lock and substitutes it in.&lt;br/&gt;
Now the policy function fails and returns an rc that is not OK. This prompts us to call ldlm_lock_destroy() on the already-granted lock and trigger this assertion.&lt;/p&gt;
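
&lt;p&gt;Roughly, the fixup step looks like this (a sketch of the idea only; the name and hash lookup follow mdt_intent_fixup_resent(), but this is not the exact source):&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;/* sketch: on a RESEND, find the lock already granted for the lost
 * reply by its remote (client-side) handle and substitute it for
 * the freshly created lock */
static void fixup_resent_sketch(struct ptlrpc_request *req,
                                struct ldlm_lock **lockp)
{
        struct obd_export *exp = req-&amp;gt;rq_export;
        struct ldlm_request *dlmreq;
        struct ldlm_lock *old_lock;

        if (!(lustre_msg_get_flags(req-&amp;gt;rq_reqmsg) &amp;amp; MSG_RESENT))
                return;

        dlmreq = req_capsule_client_get(&amp;amp;req-&amp;gt;rq_pill, &amp;amp;RMF_DLM_REQ);
        old_lock = cfs_hash_lookup(exp-&amp;gt;exp_lock_hash,
                                   (void *)&amp;amp;dlmreq-&amp;gt;lock_handle[0]);
        if (old_lock != NULL) {
                LDLM_LOCK_PUT(*lockp);
                *lockp = old_lock;      /* already granted */
        }
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;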

&lt;p&gt;The mystery was - what might cause a successful request to fail on retry, esp. with ENOENT with a lock in place, no less.&lt;br/&gt;
We have noticed that the request at the crash point did not contain reply buffers, which means it failed very early in processing, in body_unpack to be precise. That function does a lookup and can return -2 if the object is not there.&lt;br/&gt;
The request is also a getattr_by_fid.&lt;/p&gt;

&lt;p&gt;So it appears the sequence of events is roughly:&lt;br/&gt;
a file is opened;&lt;br/&gt;
the file is unlinked;&lt;br/&gt;
the open-handle holder does fstat and causes a getattr_by_fid request;&lt;br/&gt;
the reply is lost;&lt;br/&gt;
the file is closed.  // How this could happen while the getattr had not finished is not very clear, to be sure. Perhaps it was not fstat but some other way to get at the file, and it raced with the unlink?&lt;br/&gt;
getattr_by_fid is resent - the object is no longer there, and mdt_unpack_req_pack_rep-&amp;gt;mdt_body_unpack-&amp;gt;mdt_object_init-&amp;gt;...-&amp;gt;osd_object_init() returns -ENOENT.&lt;/p&gt;

&lt;p&gt;I would like to verify this theory on the ORNL crashdump, but unfortunately it does not contain the kernel vmlinux and lustre modules with symbols, so I cannot.&lt;br/&gt;
Can we have that additional data please?&lt;/p&gt;</comment>
                            <comment id="130459" author="green" created="Thu, 15 Oct 2015 00:08:28 +0000"  >&lt;p&gt;The idea for a fix is to additionally check if the failed request is a resend (with a granted lock) and in that case we need to &quot;soft cancel&quot; it - i.e. to send a blocking ast.&lt;/p&gt;

&lt;p&gt;This is needed because it&apos;s possible the original reply made it to the client while we are working here, so it knows about the lock, and if we cancel it outright there&apos;s a bit of state inconsistency.&lt;br/&gt;
If the client does not know about the lock, it&apos;ll respond with an error and the lock would be canceled immediately.&lt;br/&gt;
In any case there&apos;s no need to wait for the lock to be canceled here; we can then proceed with returning the error to the client.&lt;/p&gt;
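
&lt;p&gt;In rough pseudo-code, the idea (a sketch of the approach, not a patch):&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;/* sketch of the proposed &quot;soft cancel&quot;: rather than calling
 * ldlm_lock_destroy() on a granted lock (which LBUGs), notify the
 * client with a blocking AST so it cancels the lock itself */
static void soft_cancel_sketch(struct ldlm_lock *lock)
{
        struct ldlm_lock_desc desc;

        ldlm_lock2desc(lock, &amp;amp;desc);
        /* if the client never saw the lost reply it returns an error
         * and the lock is canceled right away */
        lock-&amp;gt;l_blocking_ast(lock, &amp;amp;desc, lock-&amp;gt;l_ast_data,
                             LDLM_CB_BLOCKING);
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>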
                            <comment id="130467" author="green" created="Thu, 15 Oct 2015 03:25:36 +0000"  >&lt;p&gt;So, I got the symbos information from James and the picture got a bit clouded.&lt;/p&gt;

&lt;p&gt;What I see that is similar: it&apos;s a resend, there is rc -2 returned from the policy function, and there are no reply buffers created, hinting at a very early failure.&lt;br/&gt;
Difference: the request is an ldlm layout intent.&lt;/p&gt;

&lt;p&gt;Now the really strange part:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;crash&amp;gt; p *(( struct mdt_thread_info *)0xffff8840296cb000)-&amp;gt;mti_pill
$9 = {
  rc_req = 0xffff884045bd3000, 
  rc_fmt = 0xffffffffa080c9a0 &amp;lt;RQF_MDS_REINT_SETXATTR&amp;gt;, 
  rc_loc = RCL_SERVER, 
  rc_area = {{4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295}, {4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295}}
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;So somehow this request got the wrong request format assigned, which is really weird.&lt;br/&gt;
Both crashes have this same wrong request format, so it&apos;s not purely a coincidence.&lt;/p&gt;
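
&lt;p&gt;For context, a sketch of why rc_fmt matters (illustrative, assuming the usual req_capsule usage; not the mdt source): the capsule format drives every buffer lookup, so a stale RQF_MDS_REINT_SETXATTR on a layout intent makes field offsets disagree with the wire message.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;/* sketch: the handler is expected to pin the format that matches
 * the intent before any field is fetched through the capsule */
req_capsule_set(&amp;amp;req-&amp;gt;rq_pill, &amp;amp;RQF_LDLM_INTENT_LAYOUT);
dlmreq = req_capsule_client_get(&amp;amp;req-&amp;gt;rq_pill, &amp;amp;RMF_DLM_REQ);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>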
                            <comment id="130670" author="green" created="Fri, 16 Oct 2015 18:16:59 +0000"  >&lt;p&gt;So I think it would be an interesting experiment to perform a forced resend for an IT_LAYOUT intent and see if it happens to fail every time. First with 2.5.4 on both ends (easier to organize) and then 2.5.4 on server and master on client.&lt;/p&gt;

&lt;p&gt;I envision a manual test to be something like:&lt;br/&gt;
dd if=/dev/urandom of=/mnt/lustre/file1 bs=1024&lt;br/&gt;
press ^Z to pause that process&lt;br/&gt;
cp /etc/passwd /mnt/lustre/file2&lt;br/&gt;
lfs swap_layouts /mnt/lustre/file1 /mnt/lustre/file2&lt;br/&gt;
This will invalidate the layout lock, so the next write activity from the paused dd would trigger an intent lock.&lt;/p&gt;

&lt;p&gt;Then set OBD_FAIL_LDLM_REPLY (0x30c, or really 0x8000030c to make it one-shot) as the fail_loc on the MDS.&lt;br/&gt;
Unpause the dd (with fg) and make sure the fail_loc did trigger on the MDS (this fail_loc does not work properly in master, so we need to make sure it works in 2.5.4).&lt;/p&gt;
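
&lt;p&gt;(For reference, the fail_loc encoding, as a sketch of the libcfs convention; the call site shown is illustrative, not the exact one.)&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;/* the 0x80000000 bit is the one-shot flag, so fail_loc 0x8000030c
 * arms fail site 0x30c to fire exactly once */
#define OBD_FAIL_LDLM_REPLY     0x30c
#define CFS_FAIL_ONCE           0x80000000

/* somewhere in the server reply path, roughly: */
if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_REPLY))
        RETURN(0);      /* drop the reply; the client must resend */
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;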

&lt;p&gt;If it crashes, then we know it happens every time such a resend occurs, and it should be easy to figure out afterwards; if not, we&apos;ll need some more investigation.&lt;/p&gt;

&lt;p&gt;In any case I do not have time for this right now as I am departing for a conference tomorrow.&lt;/p&gt;

&lt;p&gt;James, if you have time to perform an experiment like this, that would be great as a first step, as I think Bobijam is busy with another conference in the next few days.&lt;/p&gt;</comment>
                            <comment id="130675" author="simmonsja" created="Fri, 16 Oct 2015 19:04:11 +0000"  >&lt;p&gt;I just tried your test using a 2.8 client &amp;lt;-&amp;gt; 2.8 server. No crash happened. I will work the admin to setup a 2.8 client to test against our 2.5 server setup. I did see time outs on the clients and MDS side.&lt;/p&gt;</comment>
                            <comment id="130687" author="vitaly_fertman" created="Fri, 16 Oct 2015 20:54:55 +0000"  >&lt;p&gt;due to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5604&quot; title=&quot;Lots of FAIL_ID checking are lost&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5604&quot;&gt;&lt;del&gt;LU-5604&lt;/del&gt;&lt;/a&gt;, OBD_FAIL_LDLM_REPLY does not work, you have to use OBD_FAIL_MDS_LDLM_REPLY_NET&lt;/p&gt;</comment>
                            <comment id="135413" author="gerrit" created="Mon, 7 Dec 2015 18:42:01 +0000"  >&lt;p&gt;Vitaly Fertman (vitaly.fertman@seagate.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/17501&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/17501&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7173&quot; title=&quot;ldlm_lock_destroy_internal() LBUG encountered during 2.8 large scale testing&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7173&quot;&gt;&lt;del&gt;LU-7173&lt;/del&gt;&lt;/a&gt; mdt: intent vs unlink race&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: c38c254ccad0e1a4b3a5eab99a6d3e09fd6f021f&lt;/p&gt;</comment>
                            <comment id="137019" author="gerrit" created="Mon, 21 Dec 2015 12:41:33 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;http://review.whamcloud.com/17501/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/17501/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7173&quot; title=&quot;ldlm_lock_destroy_internal() LBUG encountered during 2.8 large scale testing&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7173&quot;&gt;&lt;del&gt;LU-7173&lt;/del&gt;&lt;/a&gt; mdt: intent vs unlink race&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 9b520c36ddff687c51bea026d78380eb85981971&lt;/p&gt;</comment>
                            <comment id="139333" author="standan" created="Tue, 19 Jan 2016 22:58:08 +0000"  >&lt;p&gt;Another instance found for  interop : 2.7.1 Server/EL6.7 Client&lt;br/&gt;
Server: 2.7.1, b2_7_fe/34&lt;br/&gt;
Client: master, build# 3303, RHEL 6.7&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/1dbf619c-bb05-11e5-9137-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/1dbf619c-bb05-11e5-9137-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="139348" author="standan" created="Tue, 19 Jan 2016 23:31:17 +0000"  >&lt;p&gt;Another instance found for interop : 2.5.5 Server/EL6.7 Client&lt;br/&gt;
Server: 2.5.5, b2_5_fe/62&lt;br/&gt;
Client: master, build# 3303, RHEL 6.7&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/0ac277da-bb25-11e5-861c-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/0ac277da-bb25-11e5-861c-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="139374" author="standan" created="Wed, 20 Jan 2016 00:32:12 +0000"  >&lt;p&gt;Another instance found for interop : 2.5.5 Server/EL7 Client&lt;br/&gt;
Server: 2.5.5, b2_5_fe/62&lt;br/&gt;
Client: master, build# 3303, RHEL 7&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/76574e5a-bb0a-11e5-87b4-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/76574e5a-bb0a-11e5-87b4-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="140648" author="pjones" created="Mon, 1 Feb 2016 12:39:57 +0000"  >&lt;p&gt;It looks like this fix already landed to master for 2.8&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                                        </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="28982">LU-6334</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="33545">LU-7535</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                    <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                        </customfieldvalues>
                    </customfield>
                    <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzxntz:</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                    <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                    <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                </customfields>
    </item>
</channel>
</rss>