<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:52:35 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
<language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-5565] (osd_handler.c:1959:osd_attr_set()) ASSERTION( dt_object_exists(dt) &amp;&amp; !dt_object_remote(dt) ) failed</title>
                <link>https://jira.whamcloud.com/browse/LU-5565</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;I see this running racer with MDSCOUNT=4. I simplified it a bit.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;# export MDSCOUNT=4
# export MOUNT_2=y
# llmount.sh
...
# cd /mnt/lustre
# &lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; &lt;span class=&quot;code-keyword&quot;&gt;true&lt;/span&gt;; &lt;span class=&quot;code-keyword&quot;&gt;do&lt;/span&gt; lfs mkdir -c4 d0; sys_open d0/f0 cw; rm -rf d0; done &amp;amp;
# cd /mnt/lustre2
# &lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; &lt;span class=&quot;code-keyword&quot;&gt;true&lt;/span&gt;; &lt;span class=&quot;code-keyword&quot;&gt;do&lt;/span&gt; setfattr -n user.0 -v 0 d0; done
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;u login: [ 6081.349963] LustreError: 5860:0:(osd_handler.c:1959:osd_attr_set()) ASSERTION( dt_object_exists(dt) &amp;amp;&amp;amp; !dt_object_remote(dt) ) failed:
[ 6081.354491] LustreError: 5860:0:(osd_handler.c:1959:osd_attr_set()) LBUG
[ 6081.356572] Pid: 5860, comm: ll_ost_out01_00
[ 6081.357962]
[ 6081.357964] Call Trace:
[ 6081.359351]  [&amp;lt;ffffffffa02be8c5&amp;gt;] libcfs_debug_dumpstack+0x55/0x80 [libcfs]
[ 6081.361546]  [&amp;lt;ffffffffa02beec7&amp;gt;] lbug_with_loc+0x47/0xb0 [libcfs]
[ 6081.363579]  [&amp;lt;ffffffffa0b2d351&amp;gt;] osd_attr_set+0x1f1/0x540 [osd_ldiskfs]
[ 6081.365686]  [&amp;lt;ffffffffa0b1f07f&amp;gt;] ? osd_object_write_lock+0x9f/0x130 [osd_ldiskfs]
[ 6081.367802]  [&amp;lt;ffffffffa06fb838&amp;gt;] out_tx_attr_set_exec+0x208/0x370 [ptlrpc]
[ 6081.369162]  [&amp;lt;ffffffffa06f758a&amp;gt;] out_tx_end+0xda/0x5c0 [ptlrpc]
[ 6081.370361]  [&amp;lt;ffffffffa06ff0b9&amp;gt;] out_handle+0x5e9/0xdf0 [ptlrpc]
[ 6081.371583]  [&amp;lt;ffffffffa069207c&amp;gt;] ? lustre_msg_get_opc+0x9c/0x110 [ptlrpc]
[ 6081.372926]  [&amp;lt;ffffffffa06f461e&amp;gt;] tgt_request_handle+0x71e/0xb10 [ptlrpc]
[ 6081.374258]  [&amp;lt;ffffffffa06a4a67&amp;gt;] ptlrpc_main+0xe27/0x1980 [ptlrpc]
[ 6081.375509]  [&amp;lt;ffffffffa06a3c40&amp;gt;] ? ptlrpc_main+0x0/0x1980 [ptlrpc]
[ 6081.376702]  [&amp;lt;ffffffff8109eab6&amp;gt;] kthread+0x96/0xa0
[ 6081.377643]  [&amp;lt;ffffffff8100c30a&amp;gt;] child_rip+0xa/0x20
[ 6081.378592]  [&amp;lt;ffffffff81554710&amp;gt;] ? _spin_unlock_irq+0x30/0x40
[ 6081.379716]  [&amp;lt;ffffffff8100bb10&amp;gt;] ? restore_args+0x0/0x30
[ 6081.380747]  [&amp;lt;ffffffff8109ea20&amp;gt;] ? kthread+0x0/0xa0
[ 6081.381702]  [&amp;lt;ffffffff8100c300&amp;gt;] ? child_rip+0x0/0x20
[ 6081.382681]
[ 6081.383251] Kernel panic - not syncing: LBUG
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Here are some stack traces from a different occurrence.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;PID: 9780   TASK: ffff8801d69708c0  CPU: 0   COMMAND: &quot;setfattr&quot;
 #0 [ffff8801d6bcf6f8] schedule at ffffffff81551457
 #1 [ffff8801d6bcf7c0] schedule_timeout at ffffffff81552373
 #2 [ffff8801d6bcf890] ptlrpc_set_wait at ffffffffa0688c4a [ptlrpc]
 #3 [ffff8801d6bcf930] ptlrpc_queue_wait at ffffffffa0689217 [ptlrpc]
 #4 [ffff8801d6bcf950] mdc_xattr_common at ffffffffa0912d53 [mdc]
 #5 [ffff8801d6bcf9e0] mdc_setxattr at ffffffffa0913146 [mdc]
 #6 [ffff8801d6bcfa30] lmv_setxattr at ffffffffa08ca50b [lmv]
 #7 [ffff8801d6bcfac0] ll_setxattr_common at ffffffffa0e746f0 [lustre]
 #8 [ffff8801d6bcfb80] ll_setxattr at ffffffffa0e75054 [lustre]
 #9 [ffff8801d6bcfd10] __vfs_setxattr_noperm at ffffffff811caeee
#10 [ffff8801d6bcfd70] vfs_setxattr at ffffffff811cb0c4
#11 [ffff8801d6bcfdc0] setxattr at ffffffff811cb1a0
#12 [ffff8801d6bcff10] sys_setxattr at ffffffff811cb488
#13 [ffff8801d6bcff80] system_call_fastpath at ffffffff8100b072
    RIP: 000000377fee53c9  RSP: 00007fff08cfd300  RFLAGS: 00010206
    RAX: 00000000000000bc  RBX: ffffffff8100b072  RCX: 00000000014f0100
    RDX: 00000000014f0000  RSI: 00007fff08cff703  RDI: 00007fff08cff70f
    RBP: 00007fff08cff703   R8: 0000000000000000   R9: 0000000000000000
    R10: 0000000000000001  R11: 0000000000000246  R12: 0000000000000000
    R13: 0000000000000000  R14: 00007fff08cfd580  R15: 00000000014f0000
    ORIG_RAX: 00000000000000bc  CS: 0033  SS: 002b

PID: 3693   TASK: ffff880202176500  CPU: 0   COMMAND: &quot;mdt00_000&quot;
 #0 [ffff880202179790] schedule at ffffffff81551457
 #1 [ffff880202179858] schedule_timeout at ffffffff81552373
 #2 [ffff880202179928] ptlrpc_set_wait at ffffffffa0688c4a [ptlrpc]
 #3 [ffff8802021799c8] ptlrpc_queue_wait at ffffffffa0689217 [ptlrpc]
 #4 [ffff8802021799e8] out_remote_sync at ffffffffa07001b1 [ptlrpc]
 #5 [ffff880202179a38] osp_trans_trigger at ffffffffa0d9e4aa [osp]
 #6 [ffff880202179a78] osp_trans_start at ffffffffa0d9f914 [osp]
 #7 [ffff880202179aa8] lod_trans_start at ffffffffa0d27c11 [lod]
 #8 [ffff880202179ae8] mdd_trans_start at ffffffffa0c1c067 [mdd]
 #9 [ffff880202179af8] mdd_unlink at ffffffffa0c0175a [mdd]
#10 [ffff880202179bb8] mdo_unlink at ffffffffa0c6a688 [mdt]
#11 [ffff880202179bc8] mdt_reint_unlink at ffffffffa0c74468 [mdt]
#12 [ffff880202179c78] mdt_reint_rec at ffffffffa0c6a421 [mdt]
#13 [ffff880202179c98] mdt_reint_internal at ffffffffa0c4fc63 [mdt]
#14 [ffff880202179cd8] mdt_reint at ffffffffa0c504cb [mdt]
#15 [ffff880202179d18] tgt_request_handle at ffffffffa06f461e [ptlrpc]
#16 [ffff880202179d78] ptlrpc_main at ffffffffa06a4a67 [ptlrpc]
#17 [ffff880202179eb8] kthread at ffffffff8109eab6
#18 [ffff880202179f48] kernel_thread at ffffffff8100c30a

PID: 4642   TASK: ffff8801e59707c0  CPU: 1   COMMAND: &quot;mdt00_004&quot;
 #0 [ffff8801e856f7c0] schedule at ffffffff81551457
 #1 [ffff8801e856f888] schedule_timeout at ffffffff81552373
 #2 [ffff8801e856f958] ptlrpc_set_wait at ffffffffa0688c4a [ptlrpc]
 #3 [ffff8801e856f9f8] ptlrpc_queue_wait at ffffffffa0689217 [ptlrpc]
 #4 [ffff8801e856fa18] out_remote_sync at ffffffffa07001b1 [ptlrpc]
 #5 [ffff8801e856fa68] osp_trans_trigger at ffffffffa0d9e4aa [osp]
 #6 [ffff8801e856faa8] osp_trans_start at ffffffffa0d9f914 [osp]
 #7 [ffff8801e856fad8] lod_trans_start at ffffffffa0d27c11 [lod]
 #8 [ffff8801e856fb18] mdd_trans_start at ffffffffa0c1c067 [mdd]
 #9 [ffff8801e856fb28] mdd_attr_set at ffffffffa0c14985 [mdd]
#10 [ffff8801e856fba8] mo_attr_set at ffffffffa0c74bc8 [mdt]
#11 [ffff8801e856fbb8] mdt_reint_setxattr at ffffffffa0c75740 [mdt]
#12 [ffff8801e856fc78] mdt_reint_rec at ffffffffa0c6a421 [mdt]
#13 [ffff8801e856fc98] mdt_reint_internal at ffffffffa0c4fc63 [mdt]
#14 [ffff8801e856fcd8] mdt_reint at ffffffffa0c504cb [mdt]
#15 [ffff8801e856fd18] tgt_request_handle at ffffffffa06f461e [ptlrpc]
#16 [ffff8801e856fd78] ptlrpc_main at ffffffffa06a4a67 [ptlrpc]
#17 [ffff8801e856feb8] kthread at ffffffff8109eab6
#18 [ffff8801e856ff48] kernel_thread at ffffffff8100c30a

PID: 4225   TASK: ffff8801ed6c87c0  CPU: 1   COMMAND: &quot;ll_ost_out00_00&quot;
 #0 [ffff8801ed6cb998] machine_kexec at ffffffff81039950
 #1 [ffff8801ed6cb9f8] crash_kexec at ffffffff810d4372
 #2 [ffff8801ed6cbac8] panic at ffffffff81550d83
 #3 [ffff8801ed6cbb48] lbug_with_loc at ffffffffa02bef1b [libcfs]
 #4 [ffff8801ed6cbb68] osd_attr_set at ffffffffa0b2d351 [osd_ldiskfs]
 #5 [ffff8801ed6cbbc8] out_tx_attr_set_exec at ffffffffa06fb838 [ptlrpc]
 #6 [ffff8801ed6cbc08] out_tx_end at ffffffffa06f758a [ptlrpc]
 #7 [ffff8801ed6cbc58] out_handle at ffffffffa06ff0b9 [ptlrpc]
 #8 [ffff8801ed6cbd18] tgt_request_handle at ffffffffa06f461e [ptlrpc]
 #9 [ffff8801ed6cbd78] ptlrpc_main at ffffffffa06a4a67 [ptlrpc]
#10 [ffff8801ed6cbeb8] kthread at ffffffff8109eab6
#11 [ffff8801ed6cbf48] kernel_thread at ffffffff8100c30a
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I&apos;m not sure why the handler is ll_ost_out00_00 but I checked and that task is handling a OUT_UPDATE RPC from lustre-MDT0000 to lustre-MDT0002.&lt;/p&gt;

&lt;p&gt;Note that the setattr is for the ctime on d0. Should this ctime update really go to each stripe? If so then shouldn&apos;t we lock the stripes as we do from mdt_attr_set()?&lt;/p&gt;</description>
                <environment></environment>
        <key id="26238">LU-5565</key>
            <summary>(osd_handler.c:1959:osd_attr_set()) ASSERTION( dt_object_exists(dt) &amp;&amp; !dt_object_remote(dt) ) failed</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="jhammond">John Hammond</assignee>
                                    <reporter username="jhammond">John Hammond</reporter>
                        <labels>
                            <label>MB</label>
                            <label>dne2</label>
                            <label>mdt</label>
                            <label>out</label>
                    </labels>
                <created>Fri, 29 Aug 2014 17:53:30 +0000</created>
                <updated>Wed, 3 Feb 2016 19:19:56 +0000</updated>
                            <resolved>Wed, 3 Feb 2016 19:19:56 +0000</resolved>
                                    <version>Lustre 2.7.0</version>
                                    <fixVersion>Lustre 2.7.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>10</watches>
                                                                            <comments>
                            <comment id="97038" author="adilger" created="Wed, 22 Oct 2014 17:57:59 +0000"  >&lt;p&gt;I hit this same LASSERT running racer on my single-node test system (2x1GB MDT, 3x4GB OST) as part of acceptance-small.sh using v2_6_54_0-16-g0024956:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;LustreError: 1492:0:(osd_handler.c:1963:osd_attr_set()) ASSERTION( dt_object_exists(dt) &amp;amp;&amp;amp; !dt_object_remote(dt) ) failed: 
Pid: 1492, comm: ll_ost_out00_00
libcfs_debug_dumpstack+0x55/0x80 [libcfs]
lbug_with_loc+0x47/0xb0 [libcfs]
osd_attr_set+0x197/0x4e0 [osd_ldiskfs]
out_tx_attr_set_exec+0x260/0x3f0 [ptlrpc]
out_tx_end+0xda/0x5c0 [ptlrpc]
out_handle+0x7c0/0xe50 [ptlrpc]
tgt_request_handle+0x71e/0xb10 [ptlrpc]
ptlrpc_main+0xe64/0x1990 [ptlrpc]
kthread+0x96/0xa0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="97061" author="adilger" created="Wed, 22 Oct 2014 21:50:18 +0000"  >&lt;p&gt;I&apos;ve pushed a cleanup/debug patch &lt;a href=&quot;http://review.whamcloud.com/12398&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/12398&lt;/a&gt; to at least split this LASSERT() into two separate ones, so that it is easier to see which one is failing.  This is not expected to fix the problem here.&lt;/p&gt;</comment>
                            <comment id="97315" author="jhammond" created="Thu, 23 Oct 2014 19:30:00 +0000"  >&lt;p&gt;dt_object_exists() is failing.&lt;/p&gt;</comment>
                            <comment id="97637" author="adilger" created="Mon, 27 Oct 2014 21:25:07 +0000"  >&lt;p&gt;I hit this problem again, with the same stack in out_handle() as my previous occurrence.&lt;/p&gt;</comment>
                            <comment id="98529" author="jhammond" created="Thu, 6 Nov 2014 17:07:33 +0000"  >&lt;p&gt;Can we replace this with error handling?&lt;/p&gt;</comment>
                            <comment id="98531" author="bzzz" created="Thu, 6 Nov 2014 17:12:53 +0000"  >&lt;p&gt;yes, in out_tx_attr_set_exec() once the lock is taken we should check whether the object still exists.&lt;/p&gt;</comment>
                            <comment id="98601" author="jhammond" created="Thu, 6 Nov 2014 21:19:00 +0000"  >&lt;p&gt;Please see &lt;a href=&quot;http://review.whamcloud.com/12608&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/12608&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="98673" author="di.wang" created="Fri, 7 Nov 2014 18:01:35 +0000"  >&lt;p&gt;&quot;Note that the setattr is for the ctime on d0. Should this ctime update really go to each stripe? If so then shouldn&apos;t we lock the stripes as we do from mdt_attr_set()?&quot;&lt;/p&gt;

&lt;p&gt;John, I think you are right, we should lock the object both for setting xattr and attr.  And ctime does not need to go every stripe, but&lt;/p&gt;

&lt;p&gt;1. lod_attr_set needs to be fixed, i.e. set_attr time only goes to master object. &lt;br/&gt;
2. lmv_merge_attr needs to be fixed, i.e. it needs to take count in master object time as well, right now it only merge the time attributes from all of the sub-stripes. &lt;/p&gt;</comment>
                            <comment id="98676" author="di.wang" created="Fri, 7 Nov 2014 18:05:03 +0000"  >&lt;p&gt;Can we replace this with error handling?&lt;/p&gt;

&lt;p&gt;Please do it in OSD, then local object and remote object are getting equal treatment.&lt;/p&gt;</comment>
                            <comment id="103444" author="di.wang" created="Wed, 14 Jan 2015 06:05:28 +0000"  >&lt;p&gt;The real reason of this LBUG is because of the race between close(unlink orphan) and setxattr(or setattr). Because close is not protected by ldlm lock.  In current implementation(mdd_close()), we use a local lock(mdd_write_lock) to protect the unlink orphan process in the MDD layer, which is fine if the directory is a local directory.  But if the directory is a striped directory, the local lock might not be enough, because the unlink orphan (triggered by close) will delete the stripe on the other MDT as well. If other threads do setxattr(or setattr) at the same time on this striped directory. it causes this LBUG.&lt;br/&gt;
There are a few options to fix this problem,&lt;/p&gt;

&lt;p&gt;1. protect close process with ldlm lock. &lt;br/&gt;
2. &quot;Open&quot; the file before setxattr (or setattr), of course only on the server side in MDD layer, then &quot;close&quot; the file after setxattr (or setattr), which might trigger unlink orphan as well.&lt;/p&gt;

&lt;p&gt;Either of them might need some changes (not tiny), probably not a good choice for 2.7. Any other suggestions?&lt;/p&gt;
</comment>
                            <comment id="103474" author="jhammond" created="Wed, 14 Jan 2015 15:21:57 +0000"  >&lt;p&gt;Isn&apos;t there a (already broken) rule against taking LDLM locks in close. From mdc_close()&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;        /* To avoid a livelock (bug 7034), we need to send CLOSE RPCs to a                
         * portal whose threads are not taking any DLM locks and are therefore            
         * always progressing */
        req-&amp;gt;rq_request_portal = MDS_READPAGE_PORTAL;
        ptlrpc_at_set_req_timeout(req);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Why not fix this by handling the race?&lt;/p&gt;</comment>
                            <comment id="103477" author="bzzz" created="Wed, 14 Jan 2015 15:39:25 +0000"  >&lt;p&gt;using LDLM lock has additional benefits as an object can be accessed by another node and LDLM is the natural mechanism to control access.&lt;/p&gt;</comment>
                            <comment id="103505" author="adilger" created="Wed, 14 Jan 2015 18:41:49 +0000"  >&lt;p&gt;Couldn&apos;t we just return -ENOENT in this case?  The setxattr() call is path-based, and not file descriptor based, so there is no expectation that it is free of races. If the object is deleted then it is perfectly fine to return an error to the caller. &lt;/p&gt;</comment>
                            <comment id="103521" author="di.wang" created="Wed, 14 Jan 2015 20:25:54 +0000"  >&lt;p&gt;I had thought the same thing, then I saw &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6027&quot; title=&quot;Issues with EAs of orphan files and EAs with empty values&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6027&quot;&gt;&lt;del&gt;LU-6027&lt;/del&gt;&lt;/a&gt;, and discussed with Li Wei. The problem is if the application open the file and hold the file handle, do setxattr, should we expect succeeds or failure? And it does succeed on ext4?  I do not know if posix require this or not?  And also if we return -ENOENT here, Li Wei needs to change his new &quot;racer&quot; program and disable sanityn.sh 82.&lt;/p&gt;</comment>
                            <comment id="103522" author="jhammond" created="Wed, 14 Jan 2015 20:45:00 +0000"  >&lt;p&gt;&amp;gt; I had thought the same thing, then I saw &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6027&quot; title=&quot;Issues with EAs of orphan files and EAs with empty values&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6027&quot;&gt;&lt;del&gt;LU-6027&lt;/del&gt;&lt;/a&gt;, and discussed with Li Wei. The problem is if the application open the file and hold the file handle, do setxattr, should we expect succeeds or failure? And it does succeed on ext4? I do not know if posix require this or not? And also if we return -ENOENT here, Li Wei needs to change his new &quot;racer&quot; program and disable sanityn.sh 82.&lt;/p&gt;

&lt;p&gt;setxattr() or fsetxattr()?&lt;/p&gt;</comment>
                            <comment id="103538" author="di.wang" created="Wed, 14 Jan 2015 22:26:21 +0000"  >&lt;p&gt;I meant Li Wei used this file handle thing to set xattr in his new &quot;racer&quot; test program. I am not sure which one he used. But if I returned ENOENT here, it will definitely cause sanityn.sh 82 failed. &lt;/p&gt;</comment>
                            <comment id="103539" author="jhammond" created="Wed, 14 Jan 2015 22:37:09 +0000"  >&lt;p&gt;We should be clear about that before we add LDLM locks to close because of it.&lt;/p&gt;

&lt;p&gt;Note that &lt;a href=&quot;http://review.whamcloud.com/#/c/12608/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/12608/&lt;/a&gt; does &lt;b&gt;not&lt;/b&gt; cause -ENOENT to be returned under the following situation.&lt;/p&gt;
&lt;ol&gt;
	&lt;li&gt;lfs mkdir -c4 d0&lt;/li&gt;
	&lt;li&gt;fd = open(&quot;d0&quot;, O_RDONLY)&lt;/li&gt;
	&lt;li&gt;rmdir(&quot;d0&quot;)&lt;/li&gt;
	&lt;li&gt;fsetxattr(fd, &quot;user.foo&quot;, &quot;bar&quot;, 3, 0)&lt;/li&gt;
&lt;/ol&gt;
</comment>
                            <comment id="103568" author="di.wang" created="Thu, 15 Jan 2015 08:25:29 +0000"  >&lt;p&gt;Yes, right now, it does not return ENOENT, because of the file is still open, and we allow this in current implementation.  So is it bad we just return ENOENT for fsetxattr in this case? &lt;/p&gt;</comment>
                            <comment id="103594" author="jhammond" created="Thu, 15 Jan 2015 14:45:32 +0000"  >&lt;p&gt;&amp;gt; Yes, right now, it does not return ENOENT, because of the file is still open, and we allow this in current implementation. So is it bad we just return ENOENT for fsetxattr in this case?&lt;/p&gt;

&lt;p&gt;Why would we do that?&lt;/p&gt;</comment>
                            <comment id="103625" author="di.wang" created="Thu, 15 Jan 2015 18:11:38 +0000"  >&lt;p&gt;I mean If it return ENOENT for unlinked object during setattr or setxattr, then we do not need worry about orphans.   &lt;/p&gt;</comment>
                            <comment id="103651" author="jhammond" created="Thu, 15 Jan 2015 19:54:47 +0000"  >&lt;p&gt;Why not use &lt;a href=&quot;http://review.whamcloud.com/#/c/12608/?&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/12608/?&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="103664" author="di.wang" created="Thu, 15 Jan 2015 21:10:32 +0000"  >&lt;p&gt;It is different problem. I thought we are talking about about resolving the race, and &lt;a href=&quot;http://review.whamcloud.com/#/c/12608&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/12608&lt;/a&gt; is to avoid panic, which actually is covering the problem somehow. IMHO&lt;/p&gt;</comment>
                            <comment id="103666" author="jhammond" created="Thu, 15 Jan 2015 21:27:53 +0000"  >&lt;p&gt;It is the same problem. &lt;a href=&quot;http://review.whamcloud.com/#/c/12608&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/12608&lt;/a&gt; handles the race.&lt;/p&gt;</comment>
                            <comment id="103679" author="di.wang" created="Thu, 15 Jan 2015 21:59:36 +0000"  >&lt;p&gt;Hmm, I thought we suppose to use ldlm lock to protect the race like this, or just not get into setattr or set xattr, once the object is being unlinked. &lt;/p&gt;

&lt;p&gt;And also I thought the object should exist once the modification getting into OSD/OSP layer, because it suppose to do sanity check and hold ldlm lock in MDT layer.  Though there are some exception for now, like close.  I thought we should resolve this in MDT layer, instead of &quot;covering&quot; it in OSD/OSP.  &lt;/p&gt;</comment>
                            <comment id="103714" author="gerrit" created="Fri, 16 Jan 2015 03:26:36 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;http://review.whamcloud.com/12608/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/12608/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5565&quot; title=&quot;(osd_handler.c:1959:osd_attr_set()) ASSERTION( dt_object_exists(dt) &amp;amp;&amp;amp; !dt_object_remote(dt) ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5565&quot;&gt;&lt;del&gt;LU-5565&lt;/del&gt;&lt;/a&gt; osd-ldiskfs: handle non-existing objects&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 30b0b011dfd53faf8f903401fa0539a8d625b0e5&lt;/p&gt;</comment>
                            <comment id="103745" author="jhammond" created="Fri, 16 Jan 2015 14:35:13 +0000"  >&lt;p&gt;Note that osd-zfs still needs to be fixed.&lt;/p&gt;</comment>
                            <comment id="103757" author="jlevi" created="Fri, 16 Jan 2015 17:00:25 +0000"  >&lt;p&gt;Patch landed to Master.&lt;/p&gt;</comment>
                            <comment id="104334" author="gerrit" created="Thu, 22 Jan 2015 16:02:13 +0000"  >&lt;p&gt;John L. Hammond (john.hammond@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/13496&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/13496&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5565&quot; title=&quot;(osd_handler.c:1959:osd_attr_set()) ASSERTION( dt_object_exists(dt) &amp;amp;&amp;amp; !dt_object_remote(dt) ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5565&quot;&gt;&lt;del&gt;LU-5565&lt;/del&gt;&lt;/a&gt; osd-ldiskfs: handle nonexisting objects&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 0b4fd760b34b1ded5e8fee6db07a5a0f0291389d&lt;/p&gt;</comment>
                            <comment id="140339" author="yong.fan" created="Thu, 28 Jan 2016 13:27:54 +0000"  >&lt;p&gt;We hit the same trouble with ZFS backend on lola. We need enhance the patch &lt;a href=&quot;http://review.whamcloud.com/#/c/12608/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/12608/&lt;/a&gt; for ZFS case.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Jan 27 05:49:12 lola-5 kernel: LustreError: 22690:0:(osd_object.c:925:osd_attr_set()) ASSERTION( dt_object_exists(dt) ) failed:
Jan 27 05:49:12 lola-5 kernel: LustreError: 22690:0:(osd_object.c:925:osd_attr_set()) LBUG
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="140342" author="bzzz" created="Thu, 28 Jan 2016 14:05:28 +0000"  >&lt;p&gt;this is done in &lt;a href=&quot;http://review.whamcloud.com/#/c/18024/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/18024/&lt;/a&gt; and &lt;a href=&quot;http://review.whamcloud.com/#/c/18155/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/18155/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="140786" author="heckes" created="Tue, 2 Feb 2016 12:43:53 +0000"  >&lt;p&gt;For build &apos;20160201&apos; (master) executed during soaktest (see: &lt;a href=&quot;https://wiki.hpdd.intel.com/display/Releases/Soak+Testing+on+Lola#SoakTestingonLola-20160201&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://wiki.hpdd.intel.com/display/Releases/Soak+Testing+on+Lola#SoakTestingonLola-20160201&lt;/a&gt;)&lt;br/&gt;
the error happened again and persists even after reboot of the node so that the affected &lt;br/&gt;
OSS node and OSTs become unusable.&lt;br/&gt;
The MDTs have been formated with &lt;em&gt;ldiskfs&lt;/em&gt;, OSTs with &lt;em&gt;zfs&lt;/em&gt;&lt;/p&gt;

&lt;p&gt;Sequence of events:&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;Feb  2 01:32:18 2016: lfsck started on mds-0 (&lt;tt&gt;lola-8&lt;/tt&gt; using the command {{lctl lfsck_start -M soaked-MDT0000 -A }}&lt;br/&gt;
OSS nodes &lt;tt&gt;lola-2&lt;/tt&gt; crashed with LBUG (See see error message after time stamp in console log  &apos;Feb  2 01:30:01&apos;)&lt;/li&gt;
	&lt;li&gt;Feb  2 01:50:01 2016: lola-2 rebooted; remounted OSTs node crashed with same LBUG&lt;br/&gt;
(see error message after &apos;Feb  2 01:50:01 lola-2 TIME: Time stamp for console&apos; in console log)&lt;/li&gt;
	&lt;li&gt;Several tries to simply reboot the node and remount again&lt;/li&gt;
	&lt;li&gt;Feb  2 02:13:35 PST 2016: Execution of &lt;tt&gt;lctl lfsck_stop -M soaked-MDT0000 -A }} end successfull. No OSTs mounted on {{lola-2&lt;/tt&gt;&lt;/li&gt;
	&lt;li&gt;Feb  2 02:15:01 2016: Mounted OSTs on &lt;tt&gt;lola-2&lt;/tt&gt;. Node crashed almost immediately with same LBUG again. Forced creation of debug log with &apos;lfsck&apos; enable and buffer size 512M&lt;br/&gt;
(see attached file: lustre-log-lu-5565)&lt;/li&gt;
	&lt;li&gt;Feb  2 02:31 2016: Rebooted and re-mounted OSTs one more with &apos;panic_on_lbug&apos;, lfsck enabled, debug buffer 512M&lt;br/&gt;
Explicitly forced stack trace.&lt;br/&gt;
Also debug logs were written after LBUG appear again (almost immediately). See:&lt;br/&gt;
                      lustre-log.1454409077.7931&lt;br/&gt;
                      lustre-log.1454409090.7841&lt;/li&gt;
	&lt;li&gt;OSS node can&apos;t be used as FS resource anymore&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;Attached files:&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;&lt;tt&gt;lola-2&lt;/tt&gt; message and console log&lt;/li&gt;
	&lt;li&gt;debug-log after &lt;tt&gt;lfsck&lt;/tt&gt; stopped and remount of OSTs&lt;/li&gt;
	&lt;li&gt;lustre-log.1454409077.7931, lustre-log.1454409090.7841 - Debug logs of server in current status&lt;br/&gt;
when mounting the OSTs&lt;/li&gt;
&lt;/ul&gt;
</comment>
                            <comment id="141037" author="pjones" created="Wed, 3 Feb 2016 19:19:56 +0000"  >&lt;p&gt;Please open a new ticket to track any further work that is needed in this area for 2.8&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                            <outwardlinks description="duplicates">
                                        <issuelink>
            <issuekey id="28105">LU-6100</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="17588">LU-2821</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="20268" name="console-lola-2.log.bz2" size="164121" author="heckes" created="Tue, 2 Feb 2016 12:53:56 +0000"/>
                            <attachment id="20270" name="lustre-log-lu-5565.bz2" size="2570" author="heckes" created="Tue, 2 Feb 2016 12:53:56 +0000"/>
                            <attachment id="20269" name="lustre-log.1454409077.7931.bz2" size="271240" author="heckes" created="Tue, 2 Feb 2016 12:53:56 +0000"/>
                            <attachment id="20272" name="lustre-log.1454409090.7841.bz2" size="275798" author="heckes" created="Tue, 2 Feb 2016 12:54:24 +0000"/>
                            <attachment id="20271" name="messages-lola-2.log.bz2" size="46145" author="heckes" created="Tue, 2 Feb 2016 12:53:56 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzwuzb:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>15517</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>