<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:32:52 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-3318] mdc_set_lock_data() ASSERTION( old_inode-&gt;i_state &amp; I_FREEING ) </title>
                <link>https://jira.whamcloud.com/browse/LU-3318</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:184:lov_init_sub()) header@c000000c6a188980[0x0, 28, [0x100f00000:0x131a05a:0x0] hash]{ 
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:184:lov_init_sub()) ....lovsub@c000000c6a188a18[0]
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:184:lov_init_sub()) ....osc@c000000d6758cad0id: 0x0:20029530 idx: 240 gen: 0 kms_valid: 1 kms 1618944 
rc: 0 force_sync: 0 min_xid: 0 size: 1618944 mtime: 1368036151 atime: 1368036151 ctime: 1368036150 blocks: 2048
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:184:lov_init_sub()) } header@c000000c6a188980
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:184:lov_init_sub()) stripe 0 is already owned.
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:185:lov_init_sub()) header@c000000f4fd1c8f8[0x0, 45, [0x5ae58ebb30:0x27:0x0] hash]{ 
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:185:lov_init_sub()) ....vvp@c000000f4fd1c990(+ 0 0) inode: c000000c6a513178 6549798167695589415/1524993723 100666 1 0 c000000f4fd1c990 [0x5ae58ebb30:0x27:0x0]
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:185:lov_init_sub()) ....lov@c000000c562bdfd8stripes: 2, valid, lsm{c000000c56b65b00 0x0BD10BD0 1 2 0}: 
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:185:lov_init_sub()) header@c000000c6a188980[0x0, 28, [0x100f00000:0x131a05a:0x0] hash]{ 
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:185:lov_init_sub()) ....lovsub@c000000c6a188a18[0]
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:185:lov_init_sub()) ....osc@c000000d6758cad0id: 0x0:20029530 idx: 240 gen: 0 kms_valid: 1 kms 1618944 rc: 0 force_sync: 0 min_xid: 0 size: 1618944 mtime: 1368036151 atime: 1368036151 ctime: 1368036150 blocks: 2048
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:185:lov_init_sub()) } header@c000000c6a188980
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:185:lov_init_sub()) header@c000000c6a188a70[0x0, 18, [0x1006b0000:0x130771a:0x0] hash]{ 
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:185:lov_init_sub()) ....lovsub@c000000c6a188b08[1]
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:185:lov_init_sub()) ....osc@c000000d6758cbf0id: 0x0:19953434 idx: 107 gen: 0 kms_valid: 1 kms 1048576 rc: 0 force_sync: 0 min_xid: 0 size: 1048576 mtime: 1368036151 atime: 1368036151 ctime: 1368036151 blocks: 2048
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:185:lov_init_sub()) } header@c000000c6a188a70
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:185:lov_init_sub()) 
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:185:lov_init_sub()) } header@c000000f4fd1c8f8
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:185:lov_init_sub()) owned.
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:186:lov_init_sub()) header@c000000f4fd1c7f0[0x0, 1, [0x5ae58ebb30:0x27:0x0]]
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:186:lov_init_sub()) try to own.
2013-05-08 11:02:31 LustreError: 7256:0:(lcommon_cl.c:1201:cl_file_inode_init()) Failure to initialize cl object [0x5ae58ebb30:0x27:0x0]: -5
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:184:lov_init_sub()) header@c000000c6a188980[0x0, 28, [0x100f00000:0x131a05a:0x0] hash]{ 
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:184:lov_init_sub()) ....lovsub@c000000c6a188a18[0]
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:184:lov_init_sub()) ....osc@c000000d6758cad0id: 0x0:20029530 idx: 240 gen: 0 kms_valid: 1 kms 1618944 rc: 0 force_sync: 0 min_xid: 0 size: 1618944 mtime: 1368036151 atime: 1368036151 ctime: 1368036150 blocks: 2048
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:184:lov_init_sub()) } header@c000000c6a188980
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:184:lov_init_sub()) stripe 0 is already owned.
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:185:lov_init_sub()) header@c000000f4fd1c8f8[0x0, 45, [0x5ae58ebb30:0x27:0x0] hash]{ 
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:185:lov_init_sub()) ....vvp@c000000f4fd1c990(+ 0 0) inode: c000000c6a513178 6549798167695589415/1524993723 100666 1 0 c000000f4fd1c990 [0x5ae58ebb30:0x27:0x0]
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:185:lov_init_sub()) ....lov@c000000c562bdfd8stripes: 2, valid, lsm{c000000c56b65b00 0x0BD10BD0 1 2 0}: 
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:185:lov_init_sub()) header@c000000c6a188980[0x0, 28, [0x100f00000:0x131a05a:0x0] hash]{ 
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:185:lov_init_sub()) ....lovsub@c000000c6a188a18[0]
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:185:lov_init_sub()) ....osc@c000000d6758cad0id: 0x0:20029530 idx: 240 gen: 0 kms_valid: 1 kms 1618944 rc: 0 force_sync: 0 min_xid: 0 size: 1618944 mtime: 1368036151 atime: 1368036151 ctime: 1368036150 blocks: 2048
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:185:lov_init_sub()) } header@c000000c6a188980
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:185:lov_init_sub()) header@c000000c6a188a70[0x0, 18, [0x1006b0000:0x130771a:0x0] hash]{ 
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:185:lov_init_sub()) ....lovsub@c000000c6a188b08[1]
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:185:lov_init_sub()) ....osc@c000000d6758cbf0id: 0x0:19953434 idx: 107 gen: 0 kms_valid: 1 kms 1048576 rc: 0 force_sync: 0 min_xid: 0 size: 1048576 mtime: 1368036151 atime: 1368036151 ctime: 1368036151 blocks: 2048
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:185:lov_init_sub()) } header@c000000c6a188a70
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:185:lov_init_sub()) 
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:185:lov_init_sub()) } header@c000000f4fd1c8f8
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:185:lov_init_sub()) owned.
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:186:lov_init_sub()) header@c000000f4fd1c6e8[0x0, 1, [0x5ae58ebb30:0x27:0x0]]
2013-05-08 11:02:31 LustreError: 7256:0:(lov_object.c:186:lov_init_sub()) try to own.
2013-05-08 11:02:31 LustreError: 7256:0:(llite_lib.c:2197:ll_prep_inode()) new_inode -fatal: rc -5
2013-05-08 11:08:51 LustreError: 45085:0:(mdc_locks.c:143:mdc_set_lock_data()) ASSERTION( old_inode-&amp;gt;i_state &amp;amp; I_FREEING ) failed: Found existing inode c000000d336808f8/6549372160627028526/1524894535 state 0 in lock: setting data to c000000c6abfdb78/252641838/1524894535
2013-05-08 11:08:51 LustreError: 45085:0:(mdc_locks.c:143:mdc_set_lock_data()) LBUG
2013-05-08 11:08:51 Call Trace:
2013-05-08 11:08:51 [c000000ddf04f1d0] [c000000000012f04] .show_stack+0x74/0x1c0 (unreliable)
2013-05-08 11:08:51 [c000000ddf04f280] [d00000000a9e0cb8] .libcfs_debug_dumpstack+0xd8/0x150 [libcfs]
2013-05-08 11:08:51 [c000000ddf04f330] [d00000000a9e1480] .lbug_with_loc+0x50/0xc0 [libcfs]
2013-05-08 11:08:51 [c000000ddf04f3c0] [d00000000c24fd2c] .mdc_set_lock_data+0x33c/0x340 [mdc]
2013-05-08 11:08:51 [c000000ddf04f480] [d00000000c4a8674] .ll_lookup_it_finish+0xb34/0x1700 [lustre]
2013-05-08 11:08:51 [c000000ddf04f5d0] [d00000000c4a96d8] .ll_lookup_it+0x498/0xfb0 [lustre]
2013-05-08 11:08:51 [c000000ddf04f750] [d00000000c4aa4b4] .ll_lookup_nd+0x2c4/0x580 [lustre]
2013-05-08 11:08:51 [c000000ddf04f810] [c0000000001d0614] .do_lookup+0x254/0x2d0
2013-05-08 11:08:51 [c000000ddf04f8e0] [c0000000001d3348] .__link_path_walk+0x1f8/0x15a0
2013-05-08 11:08:51 [c000000ddf04fa10] [c0000000001d4aa8] .path_walk+0x98/0x180
2013-05-08 11:08:51 [c000000ddf04fab0] [c0000000001d4d9c] .do_path_lookup+0x7c/0xf0
2013-05-08 11:08:51 [c000000ddf04fb40] [c0000000001d5b80] .user_path_at+0x60/0xb0
2013-05-08 11:08:51 [c000000ddf04fc90] [c0000000001c9034] .vfs_fstatat+0x44/0xb0
2013-05-08 11:08:51 [c000000ddf04fd30] [c0000000001c9274] .SyS_stat64+0x24/0x60
2013-05-08 11:08:51 [c000000ddf04fe30] [c000000000008564] syscall_exit+0x0/0x40
2013-05-08 11:08:51 Kernel panic - not syncing: LBUG
2013-05-08 11:08:51 Call Trace:
2013-05-08 11:08:51 [c000000ddf04f1f0] [c000000000012f04] .show_stack+0x74/0x1c0 (unreliable)
2013-05-08 11:08:51 [c000000ddf04f2a0] [c0000000005c4f34] .panic+0xc4/0x1f8
2013-05-08 11:08:51 [c000000ddf04f330] [d00000000a9e14e0] .lbug_with_loc+0xb0/0xc0 [libcfs]
2013-05-08 11:08:51 [c000000ddf04f3c0] [d00000000c24fd2c] .mdc_set_lock_data+0x33c/0x340 [mdc]
2013-05-08 11:08:51 [c000000ddf04f480] [d00000000c4a8674] .ll_lookup_it_finish+0xb34/0x1700 [lustre]
2013-05-08 11:08:51 [c000000ddf04f5d0] [d00000000c4a96d8] .ll_lookup_it+0x498/0xfb0 [lustre]
2013-05-08 11:08:51 [c000000ddf04f750] [d00000000c4aa4b4] .ll_lookup_nd+0x2c4/0x580 [lustre]
2013-05-08 11:08:51 [c000000ddf04f810] [c0000000001d0614] .do_lookup+0x254/0x2d0
2013-05-08 11:08:51 [c000000ddf04f8e0] [c0000000001d3348] .__link_path_walk+0x1f8/0x15a0
2013-05-08 11:08:51 [c000000ddf04fa10] [c0000000001d4aa8] .path_walk+0x98/0x180
2013-05-08 11:08:51 [c000000ddf04fab0] [c0000000001d4d9c] .do_path_lookup+0x7c/0xf0
2013-05-08 11:08:51 [c000000ddf04fb40] [c0000000001d5b80] .user_path_at+0x60/0xb0
2013-05-08 11:08:51 [c000000ddf04fc90] [c0000000001c9034] .vfs_fstatat+0x44/0xb0
2013-05-08 11:08:51 [c000000ddf04fd30] [c0000000001c9274] .SyS_stat64+0x24/0x60
2013-05-08 11:08:51 [c000000ddf04fe30] [c000000000008564] syscall_exit+0x0/0x40
2013-05-08 11:08:51 May  8 11:08:51 rzuseqlac2 kernel: LustreError: 45085:0:(mdc_locks.c:143:mdc_set_lock_data()) ASSERTION( old_inode-&amp;gt;i_state &amp;amp; I_FREEING ) failed: Found existing inode c000000d336808f8/6549372160627028526/1524894535 state 0 in lock: setting data to c000000c6abfdb78/252641838/1524894535
2013-05-08 11:08:51 May  8 11:08:51 rzuseqlac2 kernel: LustreError: 45085:0:(mdc_locks.c:143:mdc_set_lock_data()) LBUG
2013-05-08 11:08:51 May  8 11:08:51 rzuseqlac2 kernel: Kernel panic - not syncing: LBUG
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;


&lt;p&gt;We have two crash dumps available for analysis.&lt;/p&gt;

&lt;p&gt;LLNL-bug-id: BG-165&lt;/p&gt;</description>
                <environment>2.3.64 ppc64 client&lt;br/&gt;
2.1.4 x86_64 server</environment>
        <key id="18881">LU-3318</key>
            <summary>mdc_set_lock_data() ASSERTION( old_inode-&gt;i_state &amp; I_FREEING ) </summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="di.wang">Di Wang</assignee>
                                    <reporter username="nedbass">Ned Bass</reporter>
                        <labels>
                    </labels>
                <created>Fri, 10 May 2013 20:59:49 +0000</created>
                <updated>Thu, 7 Nov 2013 22:22:24 +0000</updated>
                            <resolved>Fri, 31 May 2013 21:19:45 +0000</resolved>
                                    <version>Lustre 2.4.0</version>
                                    <fixVersion>Lustre 2.4.0</fixVersion>
                    <fixVersion>Lustre 2.5.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>12</watches>
                                                                            <comments>
                            <comment id="58203" author="jay" created="Fri, 10 May 2013 21:33:35 +0000"  >&lt;p&gt;Is this another occurrence of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3190&quot; title=&quot;Interop 2.3.0&amp;lt;-&amp;gt;2.4 Failed on lustre-rsync-test test 3b: ASSERTION( lio-&amp;gt;lis_lsm != ((void *)0) ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3190&quot;&gt;&lt;del&gt;LU-3190&lt;/del&gt;&lt;/a&gt;?&lt;/p&gt;</comment>
                            <comment id="58207" author="pjones" created="Fri, 10 May 2013 22:32:48 +0000"  >&lt;p&gt;Di &lt;/p&gt;

&lt;p&gt;Can you comment whether this is the same issue as &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3190&quot; title=&quot;Interop 2.3.0&amp;lt;-&amp;gt;2.4 Failed on lustre-rsync-test test 3b: ASSERTION( lio-&amp;gt;lis_lsm != ((void *)0) ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3190&quot;&gt;&lt;del&gt;LU-3190&lt;/del&gt;&lt;/a&gt;?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="58211" author="nedbass" created="Fri, 10 May 2013 23:16:51 +0000"  >&lt;p&gt;These crashes occurred shortly after updating the PPC client to Lustre 2.3.64.  Prior to this we were running 2.3.58 on the client and never hit this bug.&lt;/p&gt;</comment>
                            <comment id="58246" author="di.wang" created="Sun, 12 May 2013 05:32:11 +0000"  >&lt;p&gt;Oh, server version is 2.1.4, which does not have OI cache yet. So it should not be 3190, IMHO.&lt;/p&gt;</comment>
                            <comment id="58250" author="di.wang" created="Sun, 12 May 2013 18:14:59 +0000"  >&lt;p&gt;Is it easy to reproduce the problem? Could you upload crash dump somewhere? Hmm, the inode number looks unreal.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2013-05-08 11:08:51 LustreError: 45085:0:(mdc_locks.c:143:mdc_set_lock_data()) ASSERTION( old_inode-&amp;gt;i_state &amp;amp; I_FREEING ) failed: Found existing inode c000000d336808f8/6549372160627028526/1524894535 state 0 in lock: setting data to c000000c6abfdb78/252641838/1524894535
2013-05-08 11:08:51 LustreError: 45085:0:(mdc_locks.c:143:mdc_set_lock_data()) LBUG
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I do not know whether those CLIO error message are harmful. Jinshan, Could you please comment.&lt;/p&gt;</comment>
                            <comment id="58294" author="nedbass" created="Mon, 13 May 2013 17:27:51 +0000"  >&lt;p&gt;Di,  see &lt;a href=&quot;ftp://ftp.whamcloud.com/uploads/LU-3318.vmcore.tar.bz2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;ftp://ftp.whamcloud.com/uploads/LU-3318.vmcore.tar.bz2&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;No, we don&apos;t know how to easily reproduce the problem.&lt;/p&gt;</comment>
                            <comment id="58303" author="adilger" created="Mon, 13 May 2013 18:46:53 +0000"  >&lt;p&gt;Ned, is this on a filesystem that was upgraded from 1.8 (or earlier) originally?&lt;/p&gt;

&lt;p&gt;Di, it looks like the lov_mds_md.lmm_oi is getting an IDIF FID &quot;&lt;span class=&quot;error&quot;&gt;&amp;#91;0x100f00000:0x131a05a:0x0&amp;#93;&lt;/span&gt;&quot;, which doesn&apos;t make sense.  Maybe it is for old 1.8 lov_mds_md that used &quot;seq = 0&quot; instead of filling in the generation at that point?  I thought we stopped using ostid_to_fid() for lmm_oi?&lt;/p&gt;</comment>
                            <comment id="58306" author="nedbass" created="Mon, 13 May 2013 19:00:24 +0000"  >&lt;p&gt;Yes, the filesystem was upgraded from 1.8 or earlier.&lt;/p&gt;</comment>
                            <comment id="58396" author="di.wang" created="Mon, 13 May 2013 20:46:30 +0000"  >&lt;p&gt;Andreas, that is ost_idx, not generation. &lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;/* convert an OST objid into an IDIF FID SEQ number */
static inline obd_seq fid_idif_seq(obd_id id, __u32 ost_idx)
{       
        return FID_SEQ_IDIF | (ost_idx &amp;lt;&amp;lt; 16) | ((id &amp;gt;&amp;gt; 32) &amp;amp; 0xffff);
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;so ost_idx is 0xf0 in this case. oh, we only stopped using ostid_to_fid in the metadata stack of the client. Inside CLIO, it still uses ostid_to_fid there.&lt;/p&gt;
</comment>
                            <comment id="58404" author="di.wang" created="Mon, 13 May 2013 22:13:53 +0000"  >&lt;p&gt;Sigh, it is a ppc64 crash dump, I do not have environment. Ned, could you please extract lustre debug log from it and upload somewhere. Thanks.&lt;/p&gt;</comment>
                            <comment id="58408" author="nedbass" created="Mon, 13 May 2013 22:56:19 +0000"  >&lt;p&gt;Debug log from client &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/12800/12800_LU-3318.lustre_debug_log.txt.gz&quot; title=&quot;LU-3318.lustre_debug_log.txt.gz attached to LU-3318&quot;&gt;LU-3318.lustre_debug_log.txt.gz&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;&lt;/p&gt;</comment>
                            <comment id="58420" author="di.wang" created="Tue, 14 May 2013 03:32:31 +0000"  >&lt;p&gt;Hmm, I checked the debug log, &lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000080:00020000:32.0:1367969153.115866:3776:32225:0:(llite_lib.c:2197:ll_prep_inode()) new_inode -fatal: rc -5
00020000:00020000:32.0:1367969153.123510:6448:32225:0:(lov_object.c:184:lov_init_sub()) header@c000000a3fd98200[0x0, 4, [0x100ed0000:0x130ad85:0x0] hash]{ 
00020000:00020000:32.0:1367969153.123522:6448:32225:0:(lov_object.c:184:lov_init_sub()) ....lovsub@c000000a3fd98298[0]
00020000:00020000:32.0:1367969153.123530:6448:32225:0:(lov_object.c:184:lov_init_sub()) ....osc@c000000a354367d0id: 0x0:19967365 idx: 237 gen: 0 kms_valid: 1 kms 2472 rc: 0 force_sync: 0 min_xid: 0 size: 2472 mtime: 1367966890 atime: 1367966860 ctime: 1367966890 blocks: 8
00020000:00020000:32.0:1367969153.123544:6448:32225:0:(lov_object.c:184:lov_init_sub()) } header@c000000a3fd98200
00020000:00020000:32.0:1367969153.123553:6096:32225:0:(lov_object.c:184:lov_init_sub()) stripe 0 is already owned.
00020000:00020000:32.0:1367969153.123563:6448:32225:0:(lov_object.c:185:lov_init_sub()) header@c000000a3fc27eb8[0x0, 3, [0x5ae187c950:0x1e332:0x0] hash]{ 
00020000:00020000:32.0:1367969153.156639:6448:32225:0:(lov_object.c:185:lov_init_sub()) ....vvp@c000000a3fc27f50(- 0 0) inode: c000000a555c1178 6548664631873889074/1524729801 100666 1 0 c000000a3fc27f50 [0x5ae187c950:0x1e332:0x0]
00020000:00020000:32.0:1367969153.156650:6752:32225:0:(lov_object.c:185:lov_init_sub()) ....lov@c000000a96bb9758stripes: 2, valid, lsm{c000000a2875fd80 0x0BD10BD0 1 2 0}: 
00020000:00020000:32.0:1367969153.156658:6944:32225:0:(lov_object.c:185:lov_init_sub()) header@c000000a3fd98200[0x0, 4, [0x100ed0000:0x130ad85:0x0] hash]{ 
00020000:00020000:32.0:1367969153.156664:6944:32225:0:(lov_object.c:185:lov_init_sub()) ....lovsub@c000000a3fd98298[0]
00020000:00020000:32.0:1367969153.156689:6944:32225:0:(lov_object.c:185:lov_init_sub()) ....osc@c000000a354367d0id: 0x0:19967365 idx: 237 gen: 0 kms_valid: 1 kms 2472 rc: 0 force_sync: 0 min_xid: 0 size: 2472 mtime: 1367966890 atime: 1367966860 ctime: 1367966890 blocks: 8
00020000:00020000:32.0:1367969153.156711:6944:32225:0:(lov_object.c:185:lov_init_sub()) } header@c000000a3fd98200
00020000:00020000:32.0:1367969153.156749:6944:32225:0:(lov_object.c:185:lov_init_sub()) header@c000000a3fd982f0[0x0, 2, [0x1010e0000:0x1330db1:0x0] hash]{ 
00020000:00020000:32.0:1367969153.167572:6944:32225:0:(lov_object.c:185:lov_init_sub()) ....lovsub@c000000a3fd98388[1]
00020000:00020000:32.0:1367969153.167579:6944:32225:0:(lov_object.c:185:lov_init_sub()) ....osc@c000000a354368f0id: 0x0:20123057 idx: 270 gen: 0 kms_valid: 1 kms 0 rc: 0 force_sync: 0 min_xid: 0 size: 0 mtime: 1367966864 atime: 1367966864 ctime: 1367966864 blocks: 0
00020000:00020000:32.0:1367969153.167588:6944:32225:0:(lov_object.c:185:lov_init_sub()) } header@c000000a3fd982f0
00020000:00020000:32.0:1367969153.167592:6448:32225:0:(lov_object.c:185:lov_init_sub()) 
00020000:00020000:32.0:1367969153.167596:6448:32225:0:(lov_object.c:185:lov_init_sub()) } header@c000000a3fc27eb8
00020000:00020000:32.0:1367969153.167600:6096:32225:0:(lov_object.c:185:lov_init_sub()) owned.
00020000:00020000:32.0:1367969153.167605:6256:32225:0:(lov_object.c:186:lov_init_sub()) header@c0000005ee567990[0x0, 1, [0x5ae187c950:0x1e332:0x0]]
00020000:00020000:32.0:1367969153.167610:6096:32225:0:(lov_object.c:186:lov_init_sub()) try to own.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Hmm it seems different lov objects have the same FID,&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;
header@c000000a3fc27eb8[0x0, 3, [0x5ae187c950:0x1e332:0x0] hash]

header@c0000005ee567990[0x0, 1, [0x5ae187c950:0x1e332:0x0]]

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;So maybe CLIO cache or LU cache has some problem. Jinshan, please comment. Note: it is 2.1.4 server with 2.3.64 ppc client.&lt;/p&gt;</comment>
                            <comment id="58421" author="adilger" created="Tue, 14 May 2013 04:02:28 +0000"  >&lt;p&gt;It&apos;s also possible that the LOV objects actually DO have the same FID, since there has never been any checking of this field (it has only ever been used for debug messages).  This could easily happen in case of a backup and restore of 1.8 inodes, and then creating new files under 1.8.  &lt;/p&gt;

&lt;p&gt;Are there any messages that print the actual inode numbers instead of the LOV EA FID?  That would allow us to check via &lt;tt&gt;lfs getstripe -v&lt;/tt&gt; if the LOV EA FIDs are actually the same. &lt;/p&gt;

&lt;p&gt;At that point it should be easy to &lt;tt&gt;lfs_migrate&lt;/tt&gt; one of the files to a new one. This would preferably be done on the file where the LOV EA FID does not match the inode number (i.e. &lt;tt&gt;ls -i&lt;/tt&gt;). &lt;/p&gt;</comment>
                            <comment id="58424" author="jay" created="Tue, 14 May 2013 04:54:38 +0000"  >&lt;p&gt;Hi Ned, were you running 32bit applications in PPC64?&lt;/p&gt;</comment>
                            <comment id="58425" author="nedbass" created="Tue, 14 May 2013 05:45:01 +0000"  >&lt;p&gt;Hi Jinshan, I think it&apos;s unlikely that 32-bit applications were running, but I&apos;ll investigate that possibility.&lt;/p&gt;</comment>
                            <comment id="58426" author="di.wang" created="Tue, 14 May 2013 06:05:18 +0000"  >&lt;p&gt;Andreas, for LOV objects, I actually mean top objects being built in CLIO stack, whose FID is actually got from lookup, for igif, it should be got from ino directly, instead of lmm_oi, if that is what you mean. But I just find this IGIF seq number&lt;span class=&quot;error&quot;&gt;&amp;#91;0x5ae187c950:0x1e332:0x0&amp;#93;&lt;/span&gt; is even bigger than normal seq 0x200000400ULL, so clearly something wrong here.&lt;/p&gt;</comment>
                            <comment id="58428" author="di.wang" created="Tue, 14 May 2013 06:52:26 +0000"  >&lt;p&gt;Hmm, According to the debug message, &lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00020000:00020000:9.0:1367969152.885287:6448:32223:0:(lov_object.c:185:lov_init_sub()) header@c000000a3fc27048[0x0, 3, [0x5ae187c950:0x1e322:0x0] hash]{ 
00020000:00020000:9.0:1367969152.885298:6448:32223:0:(lov_object.c:185:lov_init_sub()) ....vvp@c000000a3fc270e0(- 0 0) inode: c000000a35e8bbf8 6548664631873889058/1524729801 100666 1 0 c000000a3fc270e0 [0x5ae187c950:0x1e322:0x0]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;inode ino is 6548664631873889058, which is indeed built by &lt;span class=&quot;error&quot;&gt;&amp;#91;0x5ae187c950:0x1e322:0x0&amp;#93;&lt;/span&gt;. &lt;/p&gt;

&lt;p&gt;But 0x5ae187c950 seems too big for me, (0x5ae187c950 - 0x20000400) = 380B FID sequences has been used. Even if the system is keeping umount/mount, it will take 380B/1000 = 380M amount/mount to consume 380 Billion sequence, which seems unlikely to me.&lt;/p&gt;

&lt;p&gt;Ned, how long the system has been upgraded to 2.1.4? Could you please try this &quot;lfs fid2path 0x5ae187c950:0x1e322:0x0&quot;&lt;/p&gt;</comment>
                            <comment id="58430" author="nedbass" created="Tue, 14 May 2013 07:45:12 +0000"  >&lt;p&gt;Di, &lt;/p&gt;

&lt;blockquote&gt;&lt;p&gt;Ned, how long the system has been upgraded to 2.1.4?&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;I believe we upgraded it in March 2012.&lt;/p&gt;

&lt;blockquote&gt;&lt;p&gt;Could you please try this &quot;lfs fid2path 0x5ae187c950:0x1e322:0x0&quot;&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;It returns no such file or directory.&lt;/p&gt;</comment>
                            <comment id="58431" author="nedbass" created="Tue, 14 May 2013 07:51:06 +0000"  >&lt;p&gt;For reference, recently created files also have &quot;big&quot; sequence numbers:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# rzuseqlac2 /root &amp;gt; touch /p/lscratchrza/bass6/file1
# rzuseqlac2 /root &amp;gt; lfs path2fid !$
lfs path2fid /p/lscratchrza/bass6/file1
[0x5ae60e2460:0x55c7:0x0]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="58460" author="jay" created="Tue, 14 May 2013 16:22:46 +0000"  >&lt;p&gt;For the duplicate inode problem at the client site, the only possible cause I can discover now is that 32bit applications were running.&lt;/p&gt;</comment>
                            <comment id="58462" author="adilger" created="Tue, 14 May 2013 16:41:25 +0000"  >&lt;p&gt;Ned, do you have any x86 clients that can access this filesystem?  Do they return something different for &lt;tt&gt;lfs path2fid /p/lscratchrza/bass6/file1&lt;/tt&gt;?&lt;/p&gt;

&lt;p&gt;Di, is there a file under /proc/fs/lustre/fld/ on the MDS that can dump the allocated SEQ numbers?  I agree that 0x5ae60e2460 seems completely unreasonable for a sequence number.&lt;/p&gt;</comment>
                            <comment id="58467" author="nedbass" created="Tue, 14 May 2013 17:12:02 +0000"  >&lt;p&gt;Andreas, &lt;tt&gt;lfs path2fid /p/lscratchrza/bass6/file1&lt;/tt&gt; on  an x86_64 client returns the same fid.&lt;/p&gt;</comment>
                            <comment id="58470" author="di.wang" created="Tue, 14 May 2013 17:40:21 +0000"  >&lt;p&gt;Andreas, there are no such stuff in 2.1 lustre. &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/sad.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;</comment>
                            <comment id="58479" author="marc@llnl.gov" created="Tue, 14 May 2013 18:20:36 +0000"  >&lt;p&gt;This file system hardware is 7+ years old.  It may have been reformatted once about 5 years ago.  Large object numbers would not surprise me.  It probably started out running lustre 1.6.&lt;/p&gt;</comment>
                            <comment id="58480" author="di.wang" created="Tue, 14 May 2013 18:23:14 +0000"  >&lt;p&gt;There is bug &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1632&quot; title=&quot;FID sequence numbers not working properly with filesystems formatted using 1.8?&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1632&quot;&gt;&lt;del&gt;LU-1632&lt;/del&gt;&lt;/a&gt; might cause FID sequence being used excessively if you did not erase the config log when upgrading from 1.8 to 2.1, i.e. there are no LMV layer on the client side. But the fix has been landed to 2.3.58. Probably the client with pre-2.3.58 has been run too long time (about a year? though the seq number still seems too big for me). &lt;/p&gt;

&lt;p&gt;Ned, could you please check whether there are lmv layer on the client side? and also confirm if you erased the config log during upgrading from 1.8 to 2.1.  And also are you able to find out some old files, which are created near after the upgrading and near before updating client to 2.3.58,  to see how is their FID? Thanks!&lt;/p&gt;</comment>
                            <comment id="58488" author="adilger" created="Tue, 14 May 2013 18:47:22 +0000"  >&lt;p&gt;Di, you previously wrote:&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;oh, we only stopped using ostid_to_fid in the metadata stack of the client. Inside CLIO, it still uses ostid_to_fid there.&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;Is there a patch needed to fix that problem?  Could it lead to this inconsistency (or maybe the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3327&quot; title=&quot;kernel:LustreError: 4543:0:(mdc_locks.c:143:mdc_set_lock_data()) LBUG [reproducible in my environment]&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3327&quot;&gt;&lt;del&gt;LU-3327&lt;/del&gt;&lt;/a&gt;)?&lt;/p&gt;</comment>
                            <comment id="58490" author="nedbass" created="Tue, 14 May 2013 18:58:32 +0000"  >&lt;blockquote&gt;&lt;p&gt;Ned, could you please check whether there are lmv layer on the client side?&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;No, there is no lmv layer on the client.&lt;/p&gt;

&lt;blockquote&gt;&lt;p&gt;and also confirm if you erased the config log during upgrading from 1.8 to 2.1&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;It looks like we did not:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedHeader panelHeader&quot; style=&quot;border-bottom-width: 1px;&quot;&gt;&lt;b&gt;llog_reader CONFIGS/lsa-MDT0000&lt;/b&gt;&lt;/div&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Header size : 8192
Time : Thu Sep 29 09:45:18 2011
Number of records: 3486
Target uuid : config_uuid 
-----------------------
#01 (224)marker   1 (flags=0x01, v1.8.5.0) lsa-mdtlov      &apos;lov setup&apos; Thu Sep 29 09:45:18 2011-
#02 (112)attach    0:lsa-mdtlov  1:lov  2:lsa-mdtlov_UUID  
#03 (168)lov_setup 0:lsa-mdtlov  1:(struct lov_desc)
                uuid=lsa-mdtlov_UUID  stripe:cnt=1 size=1048576 offset=18446744073709551615 pattern=0x1
#04 (224)marker   1 (flags=0x02, v1.8.5.0) lsa-mdtlov      &apos;lov setup&apos; Thu Sep 29 09:45:18 2011-
#05 (224)marker   2 (flags=0x01, v1.8.5.0) lsa-MDT0000     &apos;add mdt&apos; Thu Sep 29 09:45:18 2011-
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;blockquote&gt;&lt;p&gt;And also are you able to find out some old files, which are created near after the upgrading and near before updating client to 2.3.58, to see how is their FID?&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;I&apos;ll look, but I&apos;m not very hopeful of finding any.  Do the files needs to have been created on a pre-2.3.58 client?  We have 2.1 clients using this filesystem too, and no way to tell which clients created which files.&lt;/p&gt;</comment>
                            <comment id="58492" author="di.wang" created="Tue, 14 May 2013 19:21:37 +0000"  >&lt;p&gt;Andreas: Yes, the conversion of lmm_oi to FID has been separated from ostid_to_fid, and the patch &lt;a href=&quot;http://review.whamcloud.com/#change,6044&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#change,6044&lt;/a&gt;, and also since lmm_oi is only for debugging(or output for getstripe)purpose right now. so we might remove that finally. And for CLIO object, which is created by FID directly, i.e. the FID we got from name lookup, which is nothing related with lmm_oi, IMHO. So it seems to me lmm_oi is not the problem here. But I will check.&lt;/p&gt;

&lt;p&gt;Hmm, this indeed seems similar with &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3327&quot; title=&quot;kernel:LustreError: 4543:0:(mdc_locks.c:143:mdc_set_lock_data()) LBUG [reproducible in my environment]&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3327&quot;&gt;&lt;del&gt;LU-3327&lt;/del&gt;&lt;/a&gt;, but according to the message on 3327&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;May 12 22:51:39 l2 kernel: LustreError: 4543:0:(mdc_locks.c:143:mdc_set_lock_data()) ASSERTION( old_inode-&amp;gt;i_state &amp;amp; I_FREEING ) failed: Found existing inode ffff88084365e178/144115205255786850/33554436 state 0 in lock: setting data to ffff8808689361b8/144115205255786850/33554436
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;So it is same lock, same inode i_ino/generation, but different inodes(pointer are different).&lt;/p&gt;

&lt;p&gt;And this ticket&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2013-05-08 11:08:51 LustreError: 45085:0:(mdc_locks.c:143:mdc_set_lock_data()) ASSERTION( old_inode-&amp;gt;i_state &amp;amp; I_FREEING ) failed: Found existing inode c000000d336808f8/6549372160627028526/1524894535 state 0 in lock: setting data to c000000c6abfdb78/252641838/1524894535
2013-05-08 11:08:51 LustreError: 45085:0:(mdc_locks.c:143:mdc_set_lock_data()) LBUG
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;it is same lock, generation, but different i_ino and inodes.  quite interesting.&lt;/p&gt;



</comment>
                            <comment id="58493" author="di.wang" created="Tue, 14 May 2013 19:23:53 +0000"  >&lt;p&gt;Ned: yes, if you can find the files created just before and after you upgrade client to 2.3.58, and tell me their FIDs, that would be great. Since I want to understand whether this giant sequence is caused by &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1632&quot; title=&quot;FID sequence numbers not working properly with filesystems formatted using 1.8?&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1632&quot;&gt;&lt;del&gt;LU-1632&lt;/del&gt;&lt;/a&gt; or there are still other problem here.&lt;/p&gt;</comment>
                            <comment id="58504" author="di.wang" created="Tue, 14 May 2013 20:45:59 +0000"  >&lt;p&gt;According to the error message here&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2013-05-08 11:08:51 LustreError: 45085:0:(mdc_locks.c:143:mdc_set_lock_data()) ASSERTION( old_inode-&amp;gt;i_state &amp;amp; I_FREEING ) failed: Found existing inode c000000d336808f8/6549372160627028526/1524894535 state 0 in lock: setting data to c000000c6abfdb78/252641838/1524894535
2013-05-08 11:08:51 LustreError: 45085:0:(mdc_locks.c:143:mdc_set_lock_data()) LBUG
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The old inode is 252641838, which definitely cannot be obtained from fid_flatten(&lt;span class=&quot;error&quot;&gt;&amp;#91;0x5ae40b47b0:0x1b634:0x0&amp;#93;&lt;/span&gt;), so it must have used fid_flatten32 to form the ino from the FID. But later it uses fid_flatten to form the ino for the same object, which causes the LBUG. So clearly someone is calling the 32-bit system call for one object, but later calls the 64-bit system call again for this object. &lt;/p&gt;</comment>
                            <comment id="58507" author="di.wang" created="Tue, 14 May 2013 20:59:46 +0000"  >&lt;p&gt;Another interesting problem is that according to the comment from Ned (14/May/13 7:51 AM) &lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# rzuseqlac2 /root &amp;gt; touch /p/lscratchrza/bass6/file1
# rzuseqlac2 /root &amp;gt; lfs path2fid !$
lfs path2fid /p/lscratchrza/bass6/file1
[0x5ae60e2460:0x55c7:0x0]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;And when the ticket is created&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt; header@c000000f4fd1c6e8[0x0, 1, [0x5ae58ebb30:0x27:0x0]]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;the sequence is around 0x5ae58ebb30, Ned, I am not sure if you can still find this file (lfs fid2path 0x5ae58ebb30:0x27:0x0)and find out its creation time. If that file is being created around that time, then it consume about (0x5ae60e2460 - 0x5ae58ebb30) = 8M sequence in 4 days, which still seems too much for me, except I miss sth here.&lt;/p&gt;

&lt;p&gt;Ned, how many clients does this cluster have? And currently all clients are running with 2.3.58 right?&lt;/p&gt;
</comment>
                            <comment id="58511" author="nedbass" created="Tue, 14 May 2013 21:15:36 +0000"  >&lt;p&gt;Ah apparently our 2.1 x86 clients don&apos;t have the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1632&quot; title=&quot;FID sequence numbers not working properly with filesystems formatted using 1.8?&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1632&quot;&gt;&lt;del&gt;LU-1632&lt;/del&gt;&lt;/a&gt; patch:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;$ touch a b c d
$ for x in a b c d ; do lfs path2fid $x ; done
[0x5b05195040:0x1:0x0]
[0x5b05197750:0x1:0x0]
[0x5b05199e60:0x1:0x0]
[0x5b0519c188:0x1:0x0]
$ pwd
/p/lscratchrza/bass6
$ rpm -q lustre
lustre-2.1.2-4chaos_2.6.32_220.23.1.1chaos.ch5.x86_64.x86_64
$ arch
x86_64
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="58512" author="marc@llnl.gov" created="Tue, 14 May 2013 21:21:24 +0000"  >&lt;p&gt;We didn&apos;t upgrade all of our clients to 2.1.4 because we paused the rollout due to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3029&quot; title=&quot;Directory listings are unreliable&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3029&quot;&gt;&lt;del&gt;LU-3029&lt;/del&gt;&lt;/a&gt;.  Also, this file system (lscratchrza) has ppc 32-bit clients mounting it right now (rzdawndev a BG/P system), and there would be 32-bit applications writing to the file system.  These clients are not the same clients that generated this ASSERTION, those are ppc 64-bit clients.&lt;/p&gt;</comment>
                            <comment id="58515" author="di.wang" created="Tue, 14 May 2013 21:46:50 +0000"  >&lt;p&gt;Is that possible those 32 bits application running on these ppc 64-bit clients? And also, is it possible to umount clients and erase the config log with tunefs, and remount clients, so lmv layer can be created on client side, which will slow down the sequence consume speed.   &lt;/p&gt;</comment>
                            <comment id="58524" author="di.wang" created="Wed, 15 May 2013 01:08:42 +0000"  >&lt;p&gt;Just tried to figure out the FID from these two inodes in the error message. The FID should be &lt;span class=&quot;error&quot;&gt;&amp;#91;0x5ae40b47b0: 0x1b22e: 0&amp;#93;&lt;/span&gt;. &lt;/p&gt;

&lt;p&gt;So fid_flatten( &lt;span class=&quot;error&quot;&gt;&amp;#91;0x5ae40b47b0: 0x1b22e: 0&amp;#93;&lt;/span&gt;) = 6549372160627028526 and fid_flatten32(&lt;span class=&quot;error&quot;&gt;&amp;#91;0x5ae40b47b0: 0x1b22e: 0&amp;#93;&lt;/span&gt;) = 252641838&lt;/p&gt;

&lt;p&gt;which just match the error message and confirm this guess&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2013-05-08 11:08:51 LustreError: 45085:0:(mdc_locks.c:143:mdc_set_lock_data()) ASSERTION( old_inode-&amp;gt;i_state &amp;amp; I_FREEING ) failed: Found existing inode c000000d336808f8/6549372160627028526/1524894535 state 0 in lock: setting data to c000000c6abfdb78/252641838/1524894535
2013-05-08 11:08:51 LustreError: 45085:0:(mdc_locks.c:143:mdc_set_lock_data()) LBUG
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;But not sure how to fix this. &lt;/p&gt;</comment>
                            <comment id="58525" author="nedbass" created="Wed, 15 May 2013 01:18:09 +0000"  >&lt;blockquote&gt;&lt;p&gt;Is that possible those 32 bits application running on these ppc 64-bit clients? &lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;We confirmed that it is possible to build and run 32-bit apps on the ppc64 client that hit this bug.&lt;/p&gt;</comment>
                            <comment id="58537" author="jay" created="Wed, 15 May 2013 03:48:32 +0000"  >&lt;p&gt;We should always use `unsigned long&apos; in kernel space. The only concern for 32bit compatibility is when we&apos;re filling in the user buffer. So in this case for ll_iget(), we should call cl_fid_build_ino() with api32 = 0.&lt;/p&gt;</comment>
                            <comment id="58538" author="di.wang" created="Wed, 15 May 2013 04:29:54 +0000"  >&lt;p&gt;The problem is that if some 32bit user(like NFS ?) will use ino directly? if we cut 64 bit inode ino to 32 bit and return it to the user, which might not be able to get back the inode with this 32 bit ino?&lt;/p&gt;</comment>
                            <comment id="58558" author="adilger" created="Wed, 15 May 2013 11:03:46 +0000"  >&lt;p&gt;This was discussed in the past, and the assumption was made that no applications were going to access both 32-bit and 64-bit inode numbers at the same time.  The inode number should not be stored or used internally, but rather the FID should always be used for internal access, and the inode number should always be regenerated in the context of the user thread that is doing the stat() or whatever needs it.&lt;/p&gt;</comment>
                            <comment id="58579" author="thermeon" created="Wed, 15 May 2013 16:03:35 +0000"  >&lt;p&gt;I can confirm that &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3327&quot; title=&quot;kernel:LustreError: 4543:0:(mdc_locks.c:143:mdc_set_lock_data()) LBUG [reproducible in my environment]&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3327&quot;&gt;&lt;del&gt;LU-3327&lt;/del&gt;&lt;/a&gt; involves a 32bit application on a 64bit client.&lt;/p&gt;</comment>
                            <comment id="58581" author="jay" created="Wed, 15 May 2013 16:35:11 +0000"  >&lt;blockquote&gt;
&lt;p&gt;The problem is that if some 32bit user(like NFS ?) will use ino directly? if we cut 64 bit inode ino to 32 bit and return it to the user, which might not be able to get back the inode with this 32 bit ino?&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;I thought we pack FID into NFS file handle. I can&apos;t think of an use case of using ino directly - but apparently I&apos;m not an expert about this &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/wink.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;
</comment>
                            <comment id="58596" author="adilger" created="Wed, 15 May 2013 17:49:29 +0000"  >&lt;p&gt;Jinshan,&lt;br/&gt;
this isn&apos;t necessarily a problem with the NFS file handle, but rather some part of the internal code is using the 32-bit inode number AND the 64-bit inode number internally for some comparison, when it shouldn&apos;t be doing this.&lt;/p&gt;</comment>
                            <comment id="58603" author="jay" created="Wed, 15 May 2013 18:31:44 +0000"  >&lt;p&gt;Hi Andreas, why is 32-bit inode number still needed by internal code?&lt;/p&gt;</comment>
                            <comment id="58606" author="di.wang" created="Wed, 15 May 2013 18:56:52 +0000"  >&lt;p&gt;Hmm, how about in prep_inode, we always try to locate inode(ilookup) by ino created by fid_flatten32 first; if it cannot find any inodes associated with the ino, then it will prepare the inode in the normal way.  Since 32 bit applications might be a rare thing these days, we probably can add a flag in super block, and then set the flag if there are 32 bit applications running, then only do the fid_flatten32 check if this flag is being set.  Though this method might make it a bit complicated, and also slow down the client metadata performance if a 32 bit application has been running.&lt;/p&gt;
</comment>
                            <comment id="58639" author="adilger" created="Thu, 16 May 2013 05:59:26 +0000"  >&lt;p&gt;My preference would be that we use 64-bit inode numbers internally all the time, and only expose 32-bit inode numbers to the application.  It looks like this is a defect introduced by &lt;a href=&quot;http://review.whamcloud.com/5711&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5711&lt;/a&gt; from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2904&quot; title=&quot;parallel-scale-nfsv3: FAIL: setup nfs failed!&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2904&quot;&gt;&lt;del&gt;LU-2904&lt;/del&gt;&lt;/a&gt;, since this is what introduced the use of ll_need_32bit_api(sbi) to ll_iget().  I suspect it &lt;em&gt;might&lt;/em&gt; be OK to replace this with (sbi-&amp;gt;ll_flags &amp;amp; LL_SBI_32BIT_API) so that the cfs_curproc_is_32bit() part is skipped in this case.&lt;/p&gt;</comment>
                            <comment id="58719" author="di.wang" created="Thu, 16 May 2013 22:36:15 +0000"  >&lt;p&gt;yes, I agree removing cfs_curproc_is_32bit should fix this problem. &lt;a href=&quot;http://review.whamcloud.com/#change,6371&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#change,6371&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="58731" author="yong.fan" created="Fri, 17 May 2013 03:06:49 +0000"  >&lt;p&gt;Mixing 64-bit ino# internally and 32-bit ino# externally for 32-bit applications may cause unexpected behavior, because we do not know how the 32-bit ino# will be used, and cannot prevent it from being passed back down to llite for use. So for a given Lustre client, use either all 32-bit ino# (on a 32-bit platform or via &quot;-o 32bitapi&quot;) or all 64-bit ino#, NOT mixed.&lt;/p&gt;</comment>
                            <comment id="58738" author="adilger" created="Fri, 17 May 2013 06:42:06 +0000"  >&lt;p&gt;Most applications do not use or care about the inode number. However, 32-bit applications on a 64-bit client will definitely get an error from stat() if the curproc check is removed.  That is MUCH more likely to cause a problem for users than a very obscure case of someone mixing 32-bit and 64-bit applications that use the inode number on the same client and expecting them to work well together. &lt;/p&gt;

&lt;p&gt;This has been in all of the 2.x releases without any complaints, so I&apos;d rather leave this alone. &lt;/p&gt;</comment>
                            <comment id="58767" author="nedbass" created="Fri, 17 May 2013 15:44:52 +0000"  >&lt;p&gt;Can we detect this case and return an error rather than asserting on it?  Allowing a misbehaved application to crash the OS seems unacceptable.&lt;/p&gt;</comment>
                            <comment id="58794" author="nedbass" created="Fri, 17 May 2013 18:50:49 +0000"  >&lt;p&gt;I&apos;m having trouble understanding what exactly we think applications are doing to trigger this bug. For example, nasf said:&lt;/p&gt;

&lt;blockquote&gt;&lt;p&gt;we do not know how the 32-bit ino# will be used, and cannot prevent it to be passed down back to llite for using. &lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;How would an application pass an inode number back to llite?  I&apos;m not aware of any library or system calls that allow files to be accessed by inode number.&lt;/p&gt;

&lt;p&gt;Do we think it has to be the same process mixing 32-bit and 64-bit system calls, or is it problematic just to mix 32-bit and 64-bit processes that access the same files on the same client?&lt;/p&gt;</comment>
                            <comment id="58797" author="thermeon" created="Fri, 17 May 2013 19:06:53 +0000"  >&lt;p&gt;Ned, fwiw, my related case &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3327&quot; title=&quot;kernel:LustreError: 4543:0:(mdc_locks.c:143:mdc_set_lock_data()) LBUG [reproducible in my environment]&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3327&quot;&gt;&lt;del&gt;LU-3327&lt;/del&gt;&lt;/a&gt; is definitely just a 32bit app on a 64bit client.&lt;/p&gt;</comment>
                            <comment id="58805" author="nedbass" created="Fri, 17 May 2013 20:58:04 +0000"  >&lt;p&gt;I managed to reproduce the bug by running this repeatedly, where &lt;tt&gt;stat32&lt;/tt&gt; is a program which I compiled with &lt;tt&gt;-m32&lt;/tt&gt; that just calls &lt;tt&gt;stat()&lt;/tt&gt; on its argument.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;echo 3 &amp;gt; /proc/sys/vm/drop_caches                                                  
cat $FILE &amp;gt; /dev/null                                                                
sleep 1                                                                            
./stat32 $FILE
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I eventually hit the ASSERTION running &lt;tt&gt;cat&lt;/tt&gt;.  The behavior is non-deterministic.  Sometimes &lt;tt&gt;stat32&lt;/tt&gt; gets -EIO back from stat() with this console error:&lt;/p&gt;

&lt;p&gt;&lt;tt&gt;LustreError: 59173:0:(llite_lib.c:2197:ll_prep_inode()) new_inode -fatal: rc -5&lt;/tt&gt;&lt;/p&gt;

&lt;p&gt;Other times there are no failures.&lt;/p&gt;</comment>
                            <comment id="58807" author="nedbass" created="Fri, 17 May 2013 21:05:40 +0000"  >&lt;p&gt;Also, I looked more closely at the original crash dump to see what the application was doing.  The user seems to be building a C++ application using the Boost.Jam build tool and the IBM xlC compilers.  Here is the process hierarchy for the process that crashed:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;crash&amp;gt; ps -p 25993                                                              
PID: 0      TASK: c000000000e903f0  CPU: 0   COMMAND: &quot;swapper&quot;                 
 PID: 1      TASK: c000000f59aa87f0  CPU: 45  COMMAND: &quot;init&quot;                   
  PID: 3900   TASK: c000000f43edbb30  CPU: 32  COMMAND: &quot;ksshd&quot;                 
   PID: 48487  TASK: c000000dc5d10900  CPU: 33  COMMAND: &quot;ksshd&quot;                
    PID: 48807  TASK: c000000d4eed2a70  CPU: 38  COMMAND: &quot;ksshd&quot;               
     PID: 48810  TASK: c000000385466480  CPU: 11  COMMAND: &quot;tcsh&quot;               
      PID: 5628   TASK: c0000004c12fe520  CPU: 12  COMMAND: &quot;xjam1&quot;             
       PID: 5925   TASK: c000000f3667a810  CPU: 36  COMMAND: &quot;bjam&quot;             
        PID: 22403  TASK: c0000004c0594e60  CPU: 17  COMMAND: &quot;bgxlC&quot;           
         PID: 25989  TASK: c0000003853634c0  CPU: 36  COMMAND: &quot;ipa&quot;            
          PID: 25993  TASK: c000000f36458ff0  CPU: 12  COMMAND: &quot;xlCcode&quot;  
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The file with the duplicate FID was a .cpp source file.  I&apos;m not sure which part of the tool chain is 32-bit, but this isn&apos;t exactly an obscure use case.&lt;/p&gt;</comment>
                            <comment id="58822" author="di.wang" created="Sat, 18 May 2013 01:09:36 +0000"  >&lt;p&gt;Hmm, since there are not enough debug log here, here is what I think it happens&lt;/p&gt;

&lt;p&gt;1. Some 32 bit application(thread) access the file, which will prepare inode with 32 bit ino(because of &lt;a href=&quot;http://review.whamcloud.com/#change,5711&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#change,5711&lt;/a&gt;), and also get back LOOKUP|UPDATE|LAYOUT lock.&lt;br/&gt;
2. Then LOOKUP/UPDATE lock is being revoked by the server, but client still open this file, so LAYOUT lock is left on the client side. (with 32bit inode attached to it).&lt;br/&gt;
3. Some other normal threads(non-32 bit flag on the thread) lookup the file, and enqueue the lock, getback the fid, then build 64bit ino, and when it will locate the same lock by FID on cache, but the lock is already attached with the inode in 32bit ino, which cause the LBUG. &lt;/p&gt;

&lt;p&gt;Probably .cpp file is being processed by some 32-bit pre-processing tool before building or during building? Just wild guess. Anyway, the 32-bit ino must be built because the file has been accessed by one thread with TIF_32BIT flag. I will update the patch  &lt;a href=&quot;http://review.whamcloud.com/#change,6371&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#change,6371&lt;/a&gt; according to Andreas&apos;s comment. &lt;/p&gt;</comment>
                            <comment id="59797" author="jlevi" created="Fri, 31 May 2013 20:54:24 +0000"  >&lt;p&gt;Now that Change, 6371 has landed to master can this ticket be closed? Or is there additional patches to land?&lt;/p&gt;</comment>
                            <comment id="59800" author="nedbass" created="Fri, 31 May 2013 21:02:29 +0000"  >&lt;p&gt;Yes, this can be closed.&lt;/p&gt;</comment>
                            <comment id="59806" author="pjones" created="Fri, 31 May 2013 21:19:45 +0000"  >&lt;p&gt;Thanks Ned!&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="15225">LU-1632</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="17757">LU-2904</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="18912">LU-3327</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="12800" name="LU-3318.lustre_debug_log.txt.gz" size="2145128" author="nedbass" created="Mon, 13 May 2013 22:56:19 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvqrb:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>8220</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>