<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:25:55 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-2523] ll_update_inode()) ASSERTION( lu_fid_eq(&amp;lli-&gt;lli_fid, &amp;body-&gt;fid1) ) failed: Trying to change FID</title>
                <link>https://jira.whamcloud.com/browse/LU-2523</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;After landing &lt;a href=&quot;http://review.whamcloud.com/#change,4478&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#change,4478&lt;/a&gt; I started to get racer crashes in lfs setstripe:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[63022.322673] LustreError: 20985:0:(llite_lib.c:1765:ll_update_inode()) ASSERTION( lu_fid_eq(&amp;amp;lli-&amp;gt;lli_fid, &amp;amp;body-&amp;gt;fid1) ) failed: Trying to change FID [0x200000401:0x83b3:0x0] to the [0x200000400:0x8063:0x0], inode 144115205255757923/33554436(ffff88007ab86b08)
[63022.323770] LustreError: 20985:0:(llite_lib.c:1765:ll_update_inode()) LBUG
[63022.324067] Pid: 20985, comm: lfs
[63022.324320] 
[63022.324321] Call Trace:
[63022.324880]  [&amp;lt;ffffffffa041d915&amp;gt;] libcfs_debug_dumpstack+0x55/0x80 [libcfs]
[63022.325186]  [&amp;lt;ffffffffa041df27&amp;gt;] lbug_with_loc+0x47/0xb0 [libcfs]
[63022.325481]  [&amp;lt;ffffffffa0e1feb1&amp;gt;] ll_update_inode+0xe11/0xe50 [lustre]
[63022.325778]  [&amp;lt;ffffffffa0e20072&amp;gt;] ll_prep_inode+0x182/0xc00 [lustre]
[63022.326071]  [&amp;lt;ffffffffa0e0f503&amp;gt;] ll_intent_file_open+0x563/0x840 [lustre]
[63022.326372]  [&amp;lt;ffffffffa0e36850&amp;gt;] ? ll_md_blocking_ast+0x0/0x730 [lustre]
[63022.326667]  [&amp;lt;ffffffffa0e0f868&amp;gt;] ll_lov_setstripe_ea_info+0x88/0x2d0 [lustre]
[63022.327180]  [&amp;lt;ffffffffa0e12252&amp;gt;] ll_lov_setstripe+0x92/0x5a0 [lustre]
[63022.327470]  [&amp;lt;ffffffffa0e13fae&amp;gt;] ll_file_ioctl+0xc9e/0x1230 [lustre]
[63022.327773]  [&amp;lt;ffffffff81043444&amp;gt;] ? __do_page_fault+0x204/0x490
[63022.328041]  [&amp;lt;ffffffff8118e112&amp;gt;] vfs_ioctl+0x22/0xa0
[63022.328297]  [&amp;lt;ffffffff81188225&amp;gt;] ? putname+0x35/0x50
[63022.328563]  [&amp;lt;ffffffff8118ea9e&amp;gt;] do_vfs_ioctl+0x3ee/0x5e0
[63022.328823]  [&amp;lt;ffffffff8118ed11&amp;gt;] sys_ioctl+0x81/0xa0
[63022.329081]  [&amp;lt;ffffffff8100b0f2&amp;gt;] system_call_fastpath+0x16/0x1b
[63022.329349] 
[63022.338857] Kernel panic - not syncing: LBUG
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;crashdump is in /exports/crashdumps/192.168.10.218-2012-12-24-01:46:51/&lt;/p&gt;</description>
                <environment></environment>
        <key id="17020">LU-2523</key>
            <summary>ll_update_inode()) ASSERTION( lu_fid_eq(&amp;lli-&gt;lli_fid, &amp;body-&gt;fid1) ) failed: Trying to change FID</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="bfaccini">Bruno Faccini</assignee>
                                    <reporter username="green">Oleg Drokin</reporter>
                        <labels>
                            <label>HB</label>
                            <label>mn1</label>
                    </labels>
                <created>Mon, 24 Dec 2012 01:51:41 +0000</created>
                <updated>Tue, 26 Jan 2016 12:58:35 +0000</updated>
                            <resolved>Wed, 13 Mar 2013 08:45:59 +0000</resolved>
                                    <version>Lustre 2.4.0</version>
                                    <fixVersion>Lustre 2.4.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>10</watches>
                                                                            <comments>
                            <comment id="49633" author="adilger" created="Mon, 24 Dec 2012 02:04:04 +0000"  >&lt;p&gt;Haven&apos;t looked into the details yet, but it might be that the LOV EA from userspace is being used directly to set the striping on the inode, but there is a race between getting the layout and setting the layout, and the user data has an old FID in it. &lt;/p&gt;

&lt;p&gt;Probably not too hard to fix once the details are worked out. &lt;/p&gt;</comment>
                            <comment id="49641" author="bfaccini" created="Mon, 24 Dec 2012 09:42:13 +0000"  >&lt;p&gt;I wanted to have a look but I don&apos;t know on which system is the crash-dump repository ??&lt;/p&gt;</comment>
                            <comment id="49756" author="green" created="Fri, 28 Dec 2012 13:37:07 +0000"  >&lt;p&gt;Bruno: this is on my home testbox, ping me on skype for login details&lt;/p&gt;</comment>
                            <comment id="49874" author="bfaccini" created="Thu, 3 Jan 2013 10:49:22 +0000"  >&lt;p&gt;Assigning to me, and thank&apos;s Oleg I will ping you on skype soon.&lt;/p&gt;</comment>
                            <comment id="49953" author="bfaccini" created="Fri, 4 Jan 2013 10:56:04 +0000"  >&lt;p&gt;Ok, thank&apos;s Oleg I am now connected to your testbox and have access to the crash-dump.&lt;/p&gt;

&lt;p&gt;But, since I did not find associated Kernel&apos;s, I decided to try using the one from the latest Maloo/Jenkins build (#11433) for this Jira, and doing so, crash tool reported me that the &quot;vmcore&quot; was not a SMP one !! Could this be the problem causing the crashes since as far as I know UMP Kernels willingly miss some locking/protection stuff, or more likely it is a crash-tool weirdness ...&lt;/p&gt;

&lt;p&gt;To confirm this or reproduce the crash I will provision a node with the build for this JIRA and run racer on it. Just to be sure we speak about the same test, it is the one that installs in /usr/lib64/lustre/tests/racer* ??&lt;/p&gt;

&lt;p&gt;Last, and you may not do this automatically because of the space required, but since the vmlinux/NameList file is not saved in the same place, how can we retrieve or rebuild it ??&lt;/p&gt;</comment>
                            <comment id="49954" author="bfaccini" created="Fri, 4 Jan 2013 11:24:37 +0000"  >&lt;p&gt;Ok thanks again Oleg, after you pointed me to the right vmlinux, crash becomes happy and no longer reports the vmcore as being non-SMP, I guess it would have been too easy !!&lt;/p&gt;

&lt;p&gt;So, diving into the crash-dump now ...&lt;/p&gt;

</comment>
                            <comment id="50707" author="adilger" created="Thu, 17 Jan 2013 13:54:35 +0000"  >&lt;p&gt;Bruno, any update on this?&lt;/p&gt;</comment>
                            <comment id="50805" author="bfaccini" created="Fri, 18 Jan 2013 10:24:38 +0000"  >&lt;p&gt;Sorry Andreas, you are right I am late on this.&lt;/p&gt;

&lt;p&gt;Finally I got a few time to concentrate and dig into this crash-dump, and I 1st started from the upper-end of the panic stack where I found the Assert condition was hopefully still there.&lt;/p&gt;

&lt;p&gt;Trying from the lower-end, I found that the only fetched user-datas for the ioctl/setstripe looks like :&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;crash&amp;gt; p/x *(struct lov_user_md_v1 *)0xffff880042c35d78
$29 = {
  lmm_magic = 0xbd10bd0, 
  lmm_pattern = 0x0, 
  lmm_object_id = 0x0, 
  lmm_object_seq = 0x0, 
  lmm_stripe_size = 0x0, 
  lmm_stripe_count = 0x1, 
  u = {
    lum_stripe_offset = 0xffff, 
    lum_layout_gen = 0xffff
  }, 
  lmm_objects = 0xffff880042c35d98
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;So the problem seem to come from out-dated FID in the Client/local inode vs the one coming from MDT ...&lt;br/&gt;
And the concerned inode is for a symlink ...&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;crash&amp;gt; bt
PID: 20985  TASK: ffff88002ea52240  CPU: 3   COMMAND: &quot;lfs&quot;
 #0 [ffff880042c35920] machine_kexec at ffffffff8103201b
 #1 [ffff880042c35980] crash_kexec at ffffffff810b8ba2
 #2 [ffff880042c35a50] panic at ffffffff814f75eb
 #3 [ffff880042c35ad0] lbug_with_loc at ffffffffa041df7b [libcfs]
 #4 [ffff880042c35af0] ll_update_inode at ffffffffa0e1feb1 [lustre]
 #5 [ffff880042c35b90] ll_prep_inode at ffffffffa0e20072 [lustre]
 #6 [ffff880042c35c50] ll_intent_file_open at ffffffffa0e0f503 [lustre]
 #7 [ffff880042c35ce0] ll_lov_setstripe_ea_info at ffffffffa0e0f868 [lustre]
 #8 [ffff880042c35d60] ll_lov_setstripe at ffffffffa0e12252 [lustre]
 #9 [ffff880042c35df0] ll_file_ioctl at ffffffffa0e13fae [lustre]
#10 [ffff880042c35e60] vfs_ioctl at ffffffff8118e112
#11 [ffff880042c35ea0] do_vfs_ioctl at ffffffff8118ea9e
#12 [ffff880042c35f30] sys_ioctl at ffffffff8118ed11
#13 [ffff880042c35f80] system_call_fastpath at ffffffff8100b0f2
    RIP: 00007fc341ecb257  RSP: 00007fff16f8b618  RFLAGS: 00010206
    RAX: 0000000000000010  RBX: ffffffff8100b0f2  RCX: 0000000000000000
    RDX: 00007fff16f8b6e0  RSI: 000000004008669a  RDI: 0000000000000003
    RBP: 0000000000000000   R8: 00000000ffffffff   R9: 0000000000000001
    R10: 00007fff16f8b3a0  R11: 0000000000000206  R12: 00007fff16f8b6e0
    R13: 00007fff16f8d9fa  R14: 0000000000000003  R15: 0000000000000000
    ORIG_RAX: 0000000000000010  CS: 0033  SS: 002b
crash&amp;gt; files
PID: 20985  TASK: ffff88002ea52240  CPU: 3   COMMAND: &quot;lfs&quot;
ROOT:     CWD: home/green/git/lustre-release/lustre/tests/racer
 FD       FILE            DENTRY           INODE       TYPE PATH
  0 ffff880026128f08 ffff8800b8092f18 ffff8800b9f04cf0 CHR  dev/null
  1 ffff88008524cf08 ffff88009d96bf18 ffff88004757acf0 FIFO 
  2 ffff8800305f1f08 ffff8800b8092f18 ffff8800b9f04cf0 CHR  dev/null
  3 ffff880052a11f08 ffff88000650af18 ffff88007ab86b08 LNK  mnt/lustre2/racer/16
crash&amp;gt; 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;So could it be possible that we are not safe against underlying file/inode change ??&lt;/p&gt;

&lt;p&gt;Oleg, running with the &quot;lfs setstripe&quot; addon in file_create.sh script of &quot;racer&quot;, is it possible that we hit a collision with a concurrent file_symlink.sh ?? I know it is old, but do you remember what kind of work-load (multiple Clients, multiple &quot;racer&quot; occurrences, multiple directories, ...) you ran at time of crash ??&lt;/p&gt;</comment>
                            <comment id="50810" author="bfaccini" created="Fri, 18 Jan 2013 11:24:38 +0000"  >&lt;p&gt;Having a look in the related source code and more current structures in the crash-dump, is it me or the problem/race would better have occurred somewhere between md_intent_lock() and ll_prep_inode()/md_get_lustre_md() calls in ll_intent_file_open() ??&lt;/p&gt;</comment>
                            <comment id="51099" author="bfaccini" created="Thu, 24 Jan 2013 09:27:48 +0000"  >&lt;p&gt;I have been able to reproduce (concerned inode is also for a symlink!) on Toro, still after introducing &quot;lfs setstripe&quot; addon in file_create.sh script of &quot;racer&quot;, but again the low-level of Lustre debug-trace does not help to understand the sequence of actions leading to the crash/LBUG.&lt;/p&gt;

&lt;p&gt;Trying to reproduce now with full debug-trace enabled and all-pages (including User/process address-space) to be dumped. And it just crashed again now !!!&lt;/p&gt;

&lt;p&gt;Waiting for node reboot to investigate new crash datas.&lt;/p&gt;</comment>
                            <comment id="51801" author="jhammond" created="Tue, 5 Feb 2013 12:48:25 +0000"  >&lt;p&gt;Here is a simplified reproducer:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;
llmount.sh
cd /mnt/lustre
touch file1

In a single process do:
  struct lov_user_md_v3 *lum;
  /* Initialize lum */
  fd2 = open(&quot;file2&quot;, O_RDWR|O_CREAT|O_LOV_DELAY_CREATE, 0666);
  rename(&quot;file1&quot;, &quot;file2&quot;);
  ioctl(fd2, LL_IOC_LOV_SETSTRIPE, lum);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="51807" author="jhammond" created="Tue, 5 Feb 2013 14:05:16 +0000"  >&lt;p&gt;If I remove the mdd_is_dead_obj() checks from mdd_open_sanity_check() and mdd_cd_sanity_check() then this goes away and I pass racer. Does removing these checks sound scary?&lt;/p&gt;

&lt;p&gt;For a real fix, as long as setstripe is piggybacked on ll_intent_file_open(), I think that there needs to be a way of doing real open by fid without the unlinked checks, and without falling back to open by path as is happening here.&lt;/p&gt;</comment>
                            <comment id="51885" author="bfaccini" created="Wed, 6 Feb 2013 13:40:47 +0000"  >&lt;p&gt;Yes, I agree, open by path seems the problem. I confirmed this from the full trace I got.&lt;/p&gt;

&lt;p&gt;Will try to see how to change things in ll_lov_setstripe_ea_info()/ll_intent_file_open() sequence. I may need help/comments !!...&lt;/p&gt;</comment>
                            <comment id="51911" author="jhammond" created="Wed, 6 Feb 2013 15:40:14 +0000"  >&lt;p&gt;Three comments:&lt;/p&gt;

&lt;ol&gt;
	&lt;li&gt;In the stack traces and crash dumps most of the data available is about the new inode not the old, since the assertion is so far down in ll_update_inode().&lt;/li&gt;
	&lt;li&gt;In order to fix this you have to first decide what the outcome of the ioctl() call should be in the simplified reproducer: -ENOENT or 0, with the second being the superior choice in my mind. If you agree then you have to circumvent/disable the &quot;dead&quot; object checks in mdd_open_sanity_check() and mdd_cd_sanity_check().&lt;/li&gt;
	&lt;li&gt;mdd_is_dead_obj() is a terrible name since the object is not dead, just unlinked.&lt;/li&gt;
&lt;/ol&gt;
</comment>
                            <comment id="51978" author="bfaccini" created="Thu, 7 Feb 2013 11:20:02 +0000"  >&lt;p&gt;John,&lt;/p&gt;

&lt;p&gt;Since I am not really familiar with this code, I wanted to just have a look if something could not be done on client-side too.&lt;/p&gt;

&lt;p&gt;You were ahead of me in the involved source code reading ! So, I agree with you that avoiding the &quot;dead&quot; check in mdd_open_sanity_check() appears as a quick fix for this particular case, if there are no other/hidden implications.&lt;/p&gt;

&lt;p&gt;Having a look to the DEAD_OBJ flag usage/test in all the code, looks like there may be other candidates like mdd_cd_sanity_check() as you already pointed out, and the POSIX-related comment in mdd_readpage() makes me wonder on how to address all cases. I know that dealing with opened+unlinked files has always been tricky for filesystems software, so do you have an idea ?&lt;/p&gt;

&lt;p&gt;Anyway I will start with a patch like you proposed as to &quot;remove the mdd_is_dead_obj() checks from mdd_open_sanity_check() and mdd_cd_sanity_check()&quot;. But I will try to add extra checks to it (like the object being already/currently referenced, ...) in order to fit the specific condition it triggered there. Will add you as reviewer for having your feeling about it.&lt;/p&gt;

</comment>
                            <comment id="52032" author="jhammond" created="Fri, 8 Feb 2013 09:15:58 +0000"  >&lt;p&gt;OK good. Using racer, I noticed that the issue reproduced much more quickly if I commented out the dd command from file_create.sh. So it may be worthwhile to test with and without that line.&lt;/p&gt;

&lt;p&gt;Also, running with the above discussed changes to mdd_open_sanity_check() and mdd_cd_sanity_check(), I hit the following:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;LustreError: 31309:0:(lod_lov.c:782:lod_load_striping()) ASSERTION( lo-&amp;gt;ldo_stripe[i] ) failed: stripe 0 is NULL
LustreError: 31309:0:(lod_lov.c:782:lod_load_striping()) LBUG
Pid: 31309, comm: mdt00_003

crash&amp;gt; bt -l
PID: 31309  TASK: ffff88015023a040  CPU: 1   COMMAND: &quot;mdt00_003&quot;
 #0 [ffff8801728dd828] machine_kexec at ffffffff81031f7b
    /usr/src/debug/kernel-2.6.32-279.19.1.el6/linux-2.6.32-279.19.1.el6_lustre_gcov.x86_64/arch/x86/kernel/machine_kexec_64.c: 336
 #1 [ffff8801728dd888] crash_kexec at ffffffff810b8c22
    /usr/src/debug/kernel-2.6.32-279.19.1.el6/linux-2.6.32-279.19.1.el6_lustre_gcov.x86_64/kernel/kexec.c: 1106
 #2 [ffff8801728dd958] panic at ffffffff814eae18
    /usr/src/debug/kernel-2.6.32-279.19.1.el6/linux-2.6.32-279.19.1.el6_lustre_gcov.x86_64/kernel/panic.c: 103
 #3 [ffff8801728dd9d8] lbug_with_loc at ffffffffa0bcdeeb [libcfs]
    /root/lustre-release/libcfs/libcfs/linux/linux-debug.c: 188
 #4 [ffff8801728dd9f8] lod_load_striping at ffffffffa087e95f [lod]
    /root/lustre-release/lustre/include/lu_object.h: 868
 #5 [ffff8801728dda38] lod_declare_attr_set at ffffffffa088af8b [lod]
    /root/lustre-release/lustre/lod/lod_object.c: 300
 #6 [ffff8801728dda88] mdd_rename at ffffffffa0266528 [mdd]
    /root/lustre-release/lustre/mdd/mdd_dir.c: 2087
 #7 [ffff8801728ddba8] mdt_reint_rename at ffffffffa07b9627 [mdt]
    /root/lustre-release/lustre/mdt/mdt_reint.c: 1270
 #8 [ffff8801728ddcc8] mdt_reint_rec at ffffffffa07b56c1 [mdt]
    /root/lustre-release/libcfs/include/libcfs/libcfs_debug.h: 211
 #9 [ffff8801728ddce8] mdt_reint_internal at ffffffffa07aed23 [mdt]
    /root/lustre-release/libcfs/include/libcfs/libcfs_debug.h: 211
#10 [ffff8801728ddd28] mdt_reint at ffffffffa07af054 [mdt]
    /root/lustre-release/lustre/mdt/mdt_handler.c: 1818
#11 [ffff8801728ddd48] mdt_handle_common at ffffffffa079ffc8 [mdt]
    /root/lustre-release/lustre/mdt/mdt_handler.c: 2981
#12 [ffff8801728ddd98] mds_regular_handle at ffffffffa07d7605 [mdt]
    /root/lustre-release/lustre/mdt/mdt_mds.c: 354
#13 [ffff8801728ddda8] ptlrpc_server_handle_request at ffffffffa0fbbc7c [ptlrpc]
    /root/lustre-release/lustre/include/lustre_net.h: 2771
#14 [ffff8801728ddea8] ptlrpc_main at ffffffffa0fbd1c6 [ptlrpc]
    /root/lustre-release/lustre/ptlrpc/service.c: 2487
#15 [ffff8801728ddf48] kernel_thread at ffffffff8100c0ca
    /usr/src/debug///////kernel-2.6.32-279.19.1.el6/linux-2.6.32-279.19.1.el6_lustre_gcov.x86_64/arch/x86/kernel/entry_64.S: 1213
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;I can&apos;t say now if this was caused by my changes or not.&lt;/p&gt;</comment>
                            <comment id="52067" author="jhammond" created="Fri, 8 Feb 2013 16:16:12 +0000"  >&lt;p&gt;To my great relief, the lod_load_striping() LBUG is reproducible on master. I&apos;ll create a separate issue.&lt;/p&gt;</comment>
                            <comment id="52077" author="jhammond" created="Fri, 8 Feb 2013 18:11:02 +0000"  >&lt;p&gt;Please see &lt;a href=&quot;http://review.whamcloud.com/5314&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5314&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="52078" author="jhammond" created="Fri, 8 Feb 2013 18:13:51 +0000"  >&lt;p&gt;Should ll_intent_file_open() be unconditionally setting MDS_OPEN_BY_FID in it_flags?&lt;/p&gt;</comment>
                            <comment id="52079" author="adilger" created="Fri, 8 Feb 2013 20:08:36 +0000"  >&lt;p&gt;John, Bruno,&lt;br/&gt;
rather than working everything out from first principles, please feel free to ask others for guidance if needed.  I&apos;ve added Jinshan and Fan Yong to the CC list, since they are probably familiar with this code and can provide some input on what is supposed to be happening here.&lt;/p&gt;

&lt;p&gt;It would also be great if any patch you write would include comments to explain things so that the next person who has to look at the code has a bit of help.&lt;/p&gt;</comment>
                            <comment id="52109" author="bfaccini" created="Mon, 11 Feb 2013 05:29:41 +0000"  >&lt;p&gt;John, seems that autotest sanityn.test_30 failed with the patch. Due to the nature of test_30 it is highly suspect that change is involved. But may be it is a particular case with currently exec()&apos;ed/binary files, then unlinked and re-accessed via its /proc/&amp;lt;PID&amp;gt;/exe method. I do not have access to &quot;#bug #11110&quot; cited in reference in this test, so I miss the original reason of this test, but may be it remembers something to you or somebody in CC ?&lt;/p&gt;

&lt;p&gt;Andreas, thank&apos;s for adding Jinshan and Fan Yong to help. BTW, this is actually one of the problem I have when I need to request for reviewers, I miss the knowledge of who has been involved with what and I don&apos;t know how to quickly find it.&lt;/p&gt;</comment>
                            <comment id="52162" author="jhammond" created="Mon, 11 Feb 2013 15:49:34 +0000"  >&lt;p&gt;It looks like the original reason for this test was to require that Lustre imitate NFS&apos;s ESTALE semantics for exec. This seems like a bogus requirement. The kernel and ext4 seem to have no problems with executing an open unlinked file through /proc/PID/exe.&lt;/p&gt;</comment>
                            <comment id="52189" author="jay" created="Mon, 11 Feb 2013 22:18:09 +0000"  >&lt;p&gt;I&apos;m looking at this issue.&lt;/p&gt;</comment>
                            <comment id="52192" author="jay" created="Mon, 11 Feb 2013 23:52:24 +0000"  >&lt;p&gt;You guys are absolutely right for the root cause of problem. However, I do have a different idea about the fix. The code below may have some problem:&lt;/p&gt;

&lt;p&gt;From mdt_reint_open():&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;                result = mdt_open_by_fid_lock(info, ldlm_rep, lhc);
                &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; ((result != -ENOENT &amp;amp;&amp;amp; !(create_flags &amp;amp; MDS_OPEN_CREAT)) &amp;amp;&amp;amp;
                     result != -EREMOTE)
                        GOTO(out, result);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Shouldn&apos;t it be: if (!(result == -ENOENT &amp;amp;&amp;amp; create_flags &amp;amp; MDS_OPEN_CREAT) || result == -EREMOTE)? That means if there is no file existing and no intention to create, or the file is on the other MDT, it will exit immediately; actually (result == -EREMOTE) is redundant here.&lt;/p&gt;

&lt;p&gt;Can you please productize your reproduce case and add it to sanity.sh?&lt;/p&gt;</comment>
                            <comment id="52211" author="bfaccini" created="Tue, 12 Feb 2013 09:33:28 +0000"  >&lt;p&gt;Seems to me that the reason sanityn/test_30 fails with John patch is because we no longer return ENOENT from mdd_open_sanity_check() if mdd_is_dead_obj() is true, thus we do not switch to open-by-name in mdt_reint_open() and we keep using/returning the same/original FID. This cause the FID control in mdc_finish_intent_lock() to be ok and thus ESTALE is not returned.&lt;/p&gt;

&lt;p&gt;If we want to still behave the same way for test_30 case and fix problem for this ticket, I was wondering if there is a way to pass the M_CHECK_STALE mode bit to the MDS/Server side, thus we can use it to keep return ENOENT ??&lt;/p&gt;</comment>
                            <comment id="52231" author="jhammond" created="Tue, 12 Feb 2013 14:32:06 +0000"  >&lt;p&gt;In mdt_reint_open(), if mdt_open_by_fid_lock() returns -EREMOTE then I believe that we should not return early. So how can the &apos;result == -EREMOTE&apos; test be redundant? Do you mean the following:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;if (result == -ENOENT) {
        if (!(create_flags &amp;amp; MDS_OPEN_CREAT))
                GOTO(out, result);
} else if (result != -EREMOTE) {
        GOTO(out, result);
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Or more concisely:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;if (result == -ENOENT ? !(create_flags &amp;amp; MDS_OPEN_CREAT) : result != -EREMOTE)
        GOTO(out, result);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;(I like this approach because every time I use the ternary operator, Andreas rejects my patch for insufficient parentheses.)&lt;/p&gt;</comment>
                            <comment id="52262" author="bfaccini" created="Wed, 13 Feb 2013 07:10:15 +0000"  >&lt;p&gt;I am currently building a test version including the change you just discussed, but just for my highlight, the expected outcome is to return ENOENT for this JIRA/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2523&quot; title=&quot;ll_update_inode()) ASSERTION( lu_fid_eq(&amp;amp;lli-&amp;gt;lli_fid, &amp;amp;body-&amp;gt;fid1) ) failed: Trying to change FID&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2523&quot;&gt;&lt;del&gt;LU-2523&lt;/del&gt;&lt;/a&gt; problem/scenario, isn&apos;t-it ?? So finally, we choose to have racer to fail when running with &quot;lfs setstripe&quot; add-on ??&lt;/p&gt;</comment>
                            <comment id="52270" author="jhammond" created="Wed, 13 Feb 2013 09:59:26 +0000"  >&lt;p&gt;Please see &lt;a href=&quot;http://review.whamcloud.com/5417&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5417&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;I have tried to follow Jinshan&apos;s suggestion as closely as possible, and have not changed the behavior of mdd_open_sanity_check() or mdd_cd_sanity_check(). (Although I believe we should, doing so would be better addressed in a separate issue.)&lt;/p&gt;</comment>
                            <comment id="52281" author="jhammond" created="Wed, 13 Feb 2013 12:27:56 +0000"  >&lt;p&gt;Jinshan please tell me what&apos;s wrong with my reasoning about -EREMOTE in mdt_reint_open(). Even for a remote child the server needs to return a lock.&lt;/p&gt;

&lt;p&gt;Moreover if I use the condition you suggest then I can trigger an LBUG as follows:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;### To avoid LU-2789, in lustre/tests/file_create.sh comment out the 
### lfs setstripe lines and replace dd with touch.
# MDSCOUNT=2 MOUNT_2=yes llmount.sh
# (cd /mnt/lustre; while true; do lfs mkdir -i1 racer/$((RANDOM % 16)); done) 2&amp;gt;/dev/null &amp;amp;
# sh ./lustre/tests/racer.sh

LustreError: 10096:0:(mdt_handler.c:3784:mdt_intent_reint()) ASSERTION( lustre_handle_is_used(&amp;amp;lhc-&amp;gt;mlh_reg_lh) ) failed:
LustreError: 10096:0:(mdt_handler.c:3784:mdt_intent_reint()) LBUG
Pid: 10096, comm: mdt00_005

Call Trace:
 [&amp;lt;ffffffffa0e31895&amp;gt;] libcfs_debug_dumpstack+0x55/0x80 [libcfs]
 [&amp;lt;ffffffffa0e31e97&amp;gt;] lbug_with_loc+0x47/0xb0 [libcfs]
 [&amp;lt;ffffffffa0a4258e&amp;gt;] mdt_intent_reint+0x4ae/0x4f0 [mdt]
 [&amp;lt;ffffffffa0a3de9e&amp;gt;] mdt_intent_policy+0x3ae/0x750 [mdt]
 [&amp;lt;ffffffffa05d4351&amp;gt;] ldlm_lock_enqueue+0x361/0x8d0 [ptlrpc]
 [&amp;lt;ffffffffa05fa447&amp;gt;] ldlm_handle_enqueue0+0x4f7/0x1080 [ptlrpc]
 [&amp;lt;ffffffffa0a3e376&amp;gt;] mdt_enqueue+0x46/0x110 [mdt]
 [&amp;lt;ffffffffa0a32fb8&amp;gt;] mdt_handle_common+0x628/0x1620 [mdt]
 [&amp;lt;ffffffffa0a6a5b5&amp;gt;] mds_regular_handle+0x15/0x20 [mdt]
 [&amp;lt;ffffffffa062c00c&amp;gt;] ptlrpc_server_handle_request+0x41c/0xdf0 [ptlrpc]
 [&amp;lt;ffffffffa0e325de&amp;gt;] ? cfs_timer_arm+0xe/0x10 [libcfs]
 [&amp;lt;ffffffffa0623739&amp;gt;] ? ptlrpc_wait_event+0xa9/0x290 [ptlrpc]
 [&amp;lt;ffffffff81052223&amp;gt;] ? __wake_up+0x53/0x70
 [&amp;lt;ffffffffa062d556&amp;gt;] ptlrpc_main+0xb76/0x1870 [ptlrpc]
 [&amp;lt;ffffffffa062c9e0&amp;gt;] ? ptlrpc_main+0x0/0x1870 [ptlrpc]
 [&amp;lt;ffffffff8100c0ca&amp;gt;] child_rip+0xa/0x20
 [&amp;lt;ffffffffa062c9e0&amp;gt;] ? ptlrpc_main+0x0/0x1870 [ptlrpc]
 [&amp;lt;ffffffffa062c9e0&amp;gt;] ? ptlrpc_main+0x0/0x1870 [ptlrpc]
 [&amp;lt;ffffffff8100c0c0&amp;gt;] ? child_rip+0x0/0x20
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;This usually happens within a few seconds of starting racer. Whereas with my condition I survive this modified racer.&lt;/p&gt;</comment>
                            <comment id="52334" author="adilger" created="Wed, 13 Feb 2013 20:29:24 +0000"  >&lt;p&gt;I&apos;ve submitted &lt;a href=&quot;http://review.whamcloud.com/5424&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5424&lt;/a&gt; to disable the setstripe in racer/file_create.sh, unless &quot;&lt;tt&gt;RACER_SETSTRIPE=true&lt;/tt&gt;&quot; is set in the environment.  That will allow adding racer to the review test list, but also allow testing these patches as needed via &quot;&lt;tt&gt;Test-Parameters: envdefinitions=RACER_SETSTRIPE=true&lt;/tt&gt;&quot;.&lt;/p&gt;

&lt;p&gt;Please delete the exception for RACER_SETSTRIPE from file_create.sh when this patch and &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2789&quot; title=&quot;lod_load_striping()) ASSERTION( lo-&amp;gt;ldo_stripenr == 0 ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2789&quot;&gt;&lt;del&gt;LU-2789&lt;/del&gt;&lt;/a&gt; are landed.&lt;/p&gt;</comment>
                            <comment id="53901" author="jlevi" created="Wed, 13 Mar 2013 08:45:59 +0000"  >&lt;p&gt;Change/5417 landed to master.&lt;br/&gt;
Change/5314 will be abandoned and a new enhancement ticket created for that work.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="18506">LU-3215</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="17520">LU-2789</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="18790">LU-3311</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="34148">LU-7678</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="17558">LU-2808</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzve7b:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>5940</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>