<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:04:13 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-6895] sanity-lfsck test 4 hung: bad entry in directory: rec_len is smaller than minimal - inode=3925999616</title>
                <link>https://jira.whamcloud.com/browse/LU-6895</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;sanity-lfsck test 4 hung as follows:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Starting client: shadow-23vm5:  -o user_xattr,flock shadow-23vm8@tcp:/lustre /mnt/lustre
CMD: shadow-23vm5 mkdir -p /mnt/lustre
CMD: shadow-23vm5 mount -t lustre -o user_xattr,flock shadow-23vm8@tcp:/lustre /mnt/lustre
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;On client shadow-23vm5:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[ 9794.882548] Lustre: DEBUG MARKER: mkdir -p /mnt/lustre
[ 9794.888889] Lustre: DEBUG MARKER: mount -t lustre -o user_xattr,flock shadow-23vm8@tcp:/lustre /mnt/lustre
[ 9794.910641] LustreError: 11-0: lustre-MDT0000-mdc-ffff880071b43400: operation mds_connect to node 10.1.5.29@tcp failed: rc = -30
[ 9799.909475] LustreError: 11-0: lustre-MDT0000-mdc-ffff880071b43400: operation mds_connect to node 10.1.5.29@tcp failed: rc = -30
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;On MDS shadow-23vm8:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[ 9799.279381] Lustre: DEBUG MARKER: /usr/sbin/lctl get_param -n mdd.lustre-MDT0000.lfsck_namespace
[ 9799.343571] LustreError: 9974:0:(tgt_lastrcvd.c:929:tgt_client_new()) lustre-MDT0000: unable to set MULTI RPCS incompatibility flag
[ 9801.154230] LustreError: 10003:0:(qsd_entry.c:291:qsd_update_index()) lustre-MDT0000: failed to update global index copy for id 0, rc:-30
[ 9801.154261] Lustre: 10003:0:(qsd_reint.c:487:qsd_reint_main()) lustre-MDT0000: reint global for [0x200000006:0x1010000:0x0] failed. -30
[ 9804.341804] LustreError: 9974:0:(tgt_lastrcvd.c:952:tgt_client_new()) lustre-MDT0000: Failed to write client lcd at idx 0, rc -30
[ 9809.342270] LustreError: 9974:0:(tgt_lastrcvd.c:952:tgt_client_new()) lustre-MDT0000: Failed to write client lcd at idx 0, rc -30
[ 9814.342330] LustreError: 9974:0:(tgt_lastrcvd.c:952:tgt_client_new()) lustre-MDT0000: Failed to write client lcd at idx 0, rc -30
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Maloo report: &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/2642815a-2f37-11e5-bc70-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/2642815a-2f37-11e5-bc70-5254006e85c2&lt;/a&gt;&lt;/p&gt;</description>
                <environment>Lustre Tag: 2.7.56&lt;br/&gt;
Distro/Arch: SLES11SP3/x86_64 (both client and server)</environment>
        <key id="31187">LU-6895</key>
            <summary>sanity-lfsck test 4 hung: bad entry in directory: rec_len is smaller than minimal - inode=3925999616</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="yong.fan">nasf</assignee>
                                    <reporter username="yujian">Jian Yu</reporter>
                        <labels>
                    </labels>
                <created>Wed, 22 Jul 2015 20:35:03 +0000</created>
                <updated>Wed, 13 Jan 2016 05:51:01 +0000</updated>
                            <resolved>Fri, 18 Dec 2015 14:07:22 +0000</resolved>
                                    <version>Lustre 2.8.0</version>
                                    <fixVersion>Lustre 2.8.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>14</watches>
                                                                            <comments>
                            <comment id="122028" author="green" created="Thu, 23 Jul 2015 17:21:27 +0000"  >&lt;p&gt;In MDS logs we can see:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[ 9798.706830] LDISKFS-fs error (device dm-0): htree_dirblock_to_tree:939: inode #25142: block 7408: comm lfsck: bad entry in directory: rec_len is smaller than minimal - offset=0(0), inode=3925999616, rec_len=1, name_len=0
[ 9798.706838] Aborting journal on device dm-0-8.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Now the big question is - how come we have this corruption all of a sudden in this presumably pretty fresh filesystem?&lt;/p&gt;</comment>
                            <comment id="122232" author="bzzz" created="Mon, 27 Jul 2015 04:59:27 +0000"  >&lt;p&gt;I&apos;ve been running sanity-lfsck on SLES for 24+ hours, but still can&apos;t reproduce the issue.&lt;/p&gt;</comment>
                            <comment id="122283" author="yujian" created="Mon, 27 Jul 2015 17:02:43 +0000"  >&lt;p&gt;For master branch, the latest occurrences are:&lt;br/&gt;
2015-07-19 &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/2642815a-2f37-11e5-bc70-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/2642815a-2f37-11e5-bc70-5254006e85c2&lt;/a&gt; (2.7.56)&lt;br/&gt;
2015-07-10 &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/b064bb40-26d7-11e5-b3d7-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/b064bb40-26d7-11e5-b3d7-5254006e85c2&lt;/a&gt; (2.7.56)&lt;br/&gt;
2015-07-07 &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/c86b0010-256e-11e5-b033-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/c86b0010-256e-11e5-b033-5254006e85c2&lt;/a&gt; (2.7.56)&lt;br/&gt;
2015-05-21 &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/60586868-00bc-11e5-9650-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/60586868-00bc-11e5-9650-5254006e85c2&lt;/a&gt; (2.7.54) &lt;/p&gt;

&lt;p&gt;Here is a for-test-only patch trying to reproduce the failure on the latest master branch: &lt;a href=&quot;http://review.whamcloud.com/15737&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/15737&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="122384" author="yujian" created="Tue, 28 Jul 2015 08:11:02 +0000"  >&lt;p&gt;More failure instances on master branch against RHEL 7.1 distro:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sessions/5b78dad6-34d7-11e5-a27f-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sessions/5b78dad6-34d7-11e5-a27f-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="124024" author="bzzz" created="Thu, 13 Aug 2015 09:38:24 +0000"  >&lt;p&gt;is there a way to access disk image left after a failed test?&lt;/p&gt;</comment>
                            <comment id="124087" author="yujian" created="Thu, 13 Aug 2015 18:55:50 +0000"  >&lt;p&gt;Hi Alex, I think no for autotest because the test nodes were re-provisioned for testing other patches.&lt;br/&gt;
I&apos;ll try to manually reproduce the failure on eagle cluster.&lt;/p&gt;</comment>
                            <comment id="124207" author="yujian" created="Sat, 15 Aug 2015 01:32:48 +0000"  >&lt;p&gt;Failed to provision RHEL 7.1 server node (TEI-3793) and SLES11 SP3 server node (not supported) on eagle cluster. There is no free node on onyx and shadow cluster.&lt;/p&gt;</comment>
                            <comment id="124686" author="gerrit" created="Thu, 20 Aug 2015 13:57:18 +0000"  >&lt;p&gt;Fan Yong (fan.yong@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/16035&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/16035&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6895&quot; title=&quot;sanity-lfsck test 4 hung: bad entry in directory: rec_len is smaller than minimal - inode=3925999616&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6895&quot;&gt;&lt;del&gt;LU-6895&lt;/del&gt;&lt;/a&gt; osd-ldiskfs: handle REMOTE_PARENT_DIR properly&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: b50a0450dd24dd3e59537296d82885bebd8a665f&lt;/p&gt;</comment>
                            <comment id="124687" author="yong.fan" created="Thu, 20 Aug 2015 14:06:19 +0000"  >&lt;p&gt;Hope the above patch can fix the issue of &quot;LustreError: 108955:0:(osd_handler.c:5497:osd_dirent_check_repair()) ASSERTION( !(lma-&amp;gt;lma_compat &amp;amp; LMAC_NOT_IN_OI) ) failed:&quot;&lt;/p&gt;</comment>
                            <comment id="124706" author="gerrit" created="Thu, 20 Aug 2015 17:17:42 +0000"  >&lt;p&gt;Yang Sheng (yang.sheng@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/16036&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/16036&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6895&quot; title=&quot;sanity-lfsck test 4 hung: bad entry in directory: rec_len is smaller than minimal - inode=3925999616&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6895&quot;&gt;&lt;del&gt;LU-6895&lt;/del&gt;&lt;/a&gt; debug: debug patch&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 60c7612c6b63b414167b7353f55f72184607c340&lt;/p&gt;</comment>
                            <comment id="125058" author="adilger" created="Tue, 25 Aug 2015 17:55:25 +0000"  >&lt;p&gt;The patch here may also relate to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6974&quot; title=&quot;RHEL 7.1 lustre-initialization-1: MDS crashed while lustre mount&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6974&quot;&gt;&lt;del&gt;LU-6974&lt;/del&gt;&lt;/a&gt;, but not sure if this patch will resolve the issue.&lt;/p&gt;</comment>
                            <comment id="125144" author="yong.fan" created="Wed, 26 Aug 2015 03:18:26 +0000"  >&lt;p&gt;The patch &lt;a href=&quot;http://review.whamcloud.com/16035&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/16035&lt;/a&gt; (or &lt;a href=&quot;http://review.whamcloud.com/16044&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/16044&lt;/a&gt;) only avoids the LBUG of &quot;osd_dirent_check_repair()) ASSERTION( !(lma-&amp;gt;lma_compat &amp;amp; LMAC_NOT_IN_OI) ) failed:&quot;. I do not think it will resolve the issue of MDS crash in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6974&quot; title=&quot;RHEL 7.1 lustre-initialization-1: MDS crashed while lustre mount&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6974&quot;&gt;&lt;del&gt;LU-6974&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="125463" author="yujian" created="Fri, 28 Aug 2015 00:10:28 +0000"  >&lt;p&gt;In addition, the original failure of this ticket still needs to be resolved after landing nasf&apos;s patches.&lt;/p&gt;</comment>
                            <comment id="125565" author="yong.fan" created="Fri, 28 Aug 2015 17:47:37 +0000"  >&lt;p&gt;I have checked all the failure instances, and found that they all failed at the same child entry: inode=3925999616&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;https://testing.hpdd.intel.com/test_logs/279ce69e-2f37-11e5-bc70-5254006e85c2/show_text
[ 9798.706830] LDISKFS-fs error (device dm-0): htree_dirblock_to_tree:939: inode #25142: block 7408: comm lfsck: bad entry in directory: rec_len is smaller than minimal - offset=0(0), inode=3925999616, rec_len=1, name_len=0
https://testing.hpdd.intel.com/test_logs/b22e5a94-26d7-11e5-b3d7-5254006e85c2/show_text
[11790.484265] LDISKFS-fs error (device dm-0): htree_dirblock_to_tree:1270: inode #25135: block 8434: comm lfsck: bad entry in directory: rec_len is smaller than minimal - offset=0(0), inode=3925999616, rec_len=1, name_len=0
https://testing.hpdd.intel.com/test_logs/c8e97dc8-256e-11e5-b033-5254006e85c2/show_text
[12907.839356] LDISKFS-fs error (device dm-0): htree_dirblock_to_tree:939: inode #25136: block 7408: comm lfsck: bad entry in directory: rec_len is smaller than minimal - offset=0(0), inode=3925999616, rec_len=1, name_len=0
https://testing.hpdd.intel.com/test_logs/61b52aca-00bc-11e5-9650-5254006e85c2/show_text
[20101.639390] LDISKFS-fs error (device dm-0): htree_dirblock_to_tree:954: inode #25139: block 7407: comm lfsck: bad entry in directory: rec_len is smaller than minimal - offset=0(0), inode=3925999616, rec_len=1, name_len=0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;printf %x 3925999616&lt;br/&gt;
ea020000&lt;/p&gt;

&lt;p&gt;It looks quite like:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;#define LDISKFS_XATTR_MAGIC             0xEA020000
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I cannot say that the inode# is just the LDISKFS_XATTR_MAGIC, because I cannot establish any relationship between them. But since all the failure instances all failed at the same child entry, we have to give more attention.&lt;/p&gt;</comment>
                            <comment id="125583" author="adilger" created="Fri, 28 Aug 2015 19:40:30 +0000"  >&lt;p&gt;So it seems that the directory block and an external inode block are getting mixed up:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;struct ext4_dir_entry_2 {
        __le32  inode;                  &lt;span class=&quot;code-comment&quot;&gt;/* Inode number */&lt;/span&gt;
        __le16  rec_len;                &lt;span class=&quot;code-comment&quot;&gt;/* Directory entry length */&lt;/span&gt;
        __u8    name_len;               &lt;span class=&quot;code-comment&quot;&gt;/* Name length */&lt;/span&gt;
        __u8    file_type;
        &lt;span class=&quot;code-object&quot;&gt;char&lt;/span&gt;    name[EXT4_NAME_LEN];    &lt;span class=&quot;code-comment&quot;&gt;/* File name */&lt;/span&gt;
};
struct ext4_xattr_header {
        __le32  h_magic;        &lt;span class=&quot;code-comment&quot;&gt;/* magic number &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; identification */&lt;/span&gt;
        __le32  h_refcount;     &lt;span class=&quot;code-comment&quot;&gt;/* reference count */&lt;/span&gt;
        __le32  h_blocks;       &lt;span class=&quot;code-comment&quot;&gt;/* number of disk blocks used */&lt;/span&gt;
        __le32  h_hash;         &lt;span class=&quot;code-comment&quot;&gt;/* hash value of all attributes */&lt;/span&gt;
        __le32  h_checksum;     &lt;span class=&quot;code-comment&quot;&gt;/* crc32c(uuid+id+xattrblock) */&lt;/span&gt;
                                &lt;span class=&quot;code-comment&quot;&gt;/* id = inum &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; refcount=1, blknum otherwise */&lt;/span&gt;
        __u32   h_reserved[3];  &lt;span class=&quot;code-comment&quot;&gt;/* zero right now */&lt;/span&gt;
};
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;The &lt;tt&gt;h_refcount=0x00000001&lt;/tt&gt; only is used for external xattr blocks and matches &lt;tt&gt;rec_len=0x0001, name_len=0x00&lt;/tt&gt;.&lt;/p&gt;</comment>
                            <comment id="127350" author="gerrit" created="Tue, 15 Sep 2015 16:33:28 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;http://review.whamcloud.com/16035/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/16035/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6895&quot; title=&quot;sanity-lfsck test 4 hung: bad entry in directory: rec_len is smaller than minimal - inode=3925999616&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6895&quot;&gt;&lt;del&gt;LU-6895&lt;/del&gt;&lt;/a&gt; osd-ldiskfs: handle REMOTE_PARENT_DIR properly&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 6bf1d78ed87dc25cef0e8a48f1ef251f9bb017a8&lt;/p&gt;</comment>
                            <comment id="127459" author="gerrit" created="Wed, 16 Sep 2015 09:27:38 +0000"  >&lt;p&gt;Fan Yong (fan.yong@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/16439&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/16439&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6895&quot; title=&quot;sanity-lfsck test 4 hung: bad entry in directory: rec_len is smaller than minimal - inode=3925999616&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6895&quot;&gt;&lt;del&gt;LU-6895&lt;/del&gt;&lt;/a&gt; scrub: not trigger scrub if inode removed by race&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 01e7291500e05688e6d79d18df0b89f08eaa4796&lt;/p&gt;</comment>
                            <comment id="127460" author="gerrit" created="Wed, 16 Sep 2015 09:27:39 +0000"  >&lt;p&gt;Fan Yong (fan.yong@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/16440&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/16440&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6895&quot; title=&quot;sanity-lfsck test 4 hung: bad entry in directory: rec_len is smaller than minimal - inode=3925999616&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6895&quot;&gt;&lt;del&gt;LU-6895&lt;/del&gt;&lt;/a&gt; lfsck: not destroy directory when fix FID-in-dirent&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 79d3bca8df59010ad686c02ce319499fe81401c6&lt;/p&gt;</comment>
                            <comment id="127621" author="gerrit" created="Thu, 17 Sep 2015 13:47:10 +0000"  >&lt;p&gt;Alex Zhuravlev (alexey.zhuravlev@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/16468&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/16468&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6895&quot; title=&quot;sanity-lfsck test 4 hung: bad entry in directory: rec_len is smaller than minimal - inode=3925999616&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6895&quot;&gt;&lt;del&gt;LU-6895&lt;/del&gt;&lt;/a&gt; tests: debug patch&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 5de0f2d029766c345abd4d6c0e089c94881cbc88&lt;/p&gt;</comment>
                            <comment id="129040" author="adilger" created="Thu, 1 Oct 2015 19:00:58 +0000"  >&lt;p&gt;Need to add a test for this before closing this bug. Best would be to add a full e2fsck and LFSCK run in conf-sanity test_32, and fix that test to also run on ZFS. &lt;/p&gt;</comment>
                            <comment id="129041" author="bzzz" created="Thu, 1 Oct 2015 19:06:16 +0000"  >&lt;p&gt;Andreas, I don&apos;t think we have a fix for the issue. AFAIU, Fan Yong&apos;s patch can potentially help in case of 2nd and subsequent dir entries, but it&apos;s always offset 0, i.e. the 1st entry is corrupted.&lt;/p&gt;</comment>
                            <comment id="129389" author="pjones" created="Mon, 5 Oct 2015 22:14:34 +0000"  >&lt;p&gt;Does this issue affect both ZFS and ldiskfs or just the latter?&lt;/p&gt;</comment>
                            <comment id="129390" author="yujian" created="Mon, 5 Oct 2015 22:21:10 +0000"  >&lt;p&gt;Hi Peter,&lt;/p&gt;

&lt;p&gt;It only affects ldiskfs because OI Scrub is not implemented for ZFS.&lt;/p&gt;</comment>
                            <comment id="129391" author="pjones" created="Mon, 5 Oct 2015 22:22:31 +0000"  >&lt;p&gt;Ah ok - thanks!&lt;/p&gt;</comment>
                            <comment id="129912" author="jamesanunez" created="Fri, 9 Oct 2015 01:59:05 +0000"  >&lt;blockquote&gt;
&lt;p&gt;Need to add a test for this before closing this bug. Best would be to add a full e2fsck and LFSCK run in conf-sanity test_32, and fix that test to also run on ZFS.&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;Adding LFSCK and e2fsck is being tracked under &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7212&quot; title=&quot;Add LFSCK check in conf-sanity 32&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7212&quot;&gt;LU-7212&lt;/a&gt;. &lt;/p&gt;</comment>
                            <comment id="130332" author="gerrit" created="Wed, 14 Oct 2015 04:39:12 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;http://review.whamcloud.com/16440/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/16440/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6895&quot; title=&quot;sanity-lfsck test 4 hung: bad entry in directory: rec_len is smaller than minimal - inode=3925999616&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6895&quot;&gt;&lt;del&gt;LU-6895&lt;/del&gt;&lt;/a&gt; lfsck: not destroy directory when fix FID-in-dirent&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: eebc3da214dfcbc01ba637f0925bfe8635b26138&lt;/p&gt;</comment>
                            <comment id="130333" author="gerrit" created="Wed, 14 Oct 2015 04:47:03 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;http://review.whamcloud.com/16439/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/16439/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6895&quot; title=&quot;sanity-lfsck test 4 hung: bad entry in directory: rec_len is smaller than minimal - inode=3925999616&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6895&quot;&gt;&lt;del&gt;LU-6895&lt;/del&gt;&lt;/a&gt; scrub: not trigger scrub if inode removed by race&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: d8612e307f28cc81df5d93e45130b9f04837ee27&lt;/p&gt;</comment>
                            <comment id="130371" author="jgmitter" created="Wed, 14 Oct 2015 14:22:00 +0000"  >&lt;p&gt;Landed for 2.8&lt;/p&gt;</comment>
                            <comment id="130381" author="gerrit" created="Wed, 14 Oct 2015 15:16:46 +0000"  >&lt;p&gt;Fan Yong (fan.yong@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/16821&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/16821&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6895&quot; title=&quot;sanity-lfsck test 4 hung: bad entry in directory: rec_len is smaller than minimal - inode=3925999616&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6895&quot;&gt;&lt;del&gt;LU-6895&lt;/del&gt;&lt;/a&gt; lfsck: conflict lu_dirent_attrs members&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 720a9fbeeb4edcb4a34022880401fc4291693b23&lt;/p&gt;</comment>
                            <comment id="131810" author="gerrit" created="Wed, 28 Oct 2015 13:49:09 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;http://review.whamcloud.com/16821/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/16821/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6895&quot; title=&quot;sanity-lfsck test 4 hung: bad entry in directory: rec_len is smaller than minimal - inode=3925999616&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6895&quot;&gt;&lt;del&gt;LU-6895&lt;/del&gt;&lt;/a&gt; lfsck: conflict lu_dirent_attrs members&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 44247c22b8a93360f2734ded593da6e570083c5b&lt;/p&gt;</comment>
                            <comment id="131842" author="jgmitter" created="Wed, 28 Oct 2015 15:22:56 +0000"  >&lt;p&gt;Latest patch has landed that caused the reopen.&lt;/p&gt;

&lt;p&gt;Fixed for 2.8&lt;/p&gt;</comment>
                            <comment id="133546" author="gerrit" created="Sat, 14 Nov 2015 15:41:50 +0000"  >&lt;p&gt;Fan Yong (fan.yong@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/17197&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/17197&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6895&quot; title=&quot;sanity-lfsck test 4 hung: bad entry in directory: rec_len is smaller than minimal - inode=3925999616&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6895&quot;&gt;&lt;del&gt;LU-6895&lt;/del&gt;&lt;/a&gt; ldiskfs: debug patch for kernel-3.x dir corruption&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 9bc645bb858bf593145efc0906a1c974fa96e1e4&lt;/p&gt;</comment>
                            <comment id="134839" author="yong.fan" created="Tue, 1 Dec 2015 11:17:09 +0000"  >&lt;p&gt;We need more work on this bug.&lt;/p&gt;</comment>
                            <comment id="134841" author="gerrit" created="Tue, 1 Dec 2015 11:17:43 +0000"  >&lt;p&gt;Fan Yong (fan.yong@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/17403&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/17403&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6895&quot; title=&quot;sanity-lfsck test 4 hung: bad entry in directory: rec_len is smaller than minimal - inode=3925999616&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6895&quot;&gt;&lt;del&gt;LU-6895&lt;/del&gt;&lt;/a&gt; lfsck: avoid accessing bad OI during lfsck start&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: fd012d15cac12d9ef39f279abe0c4ea40284fe29&lt;/p&gt;</comment>
                            <comment id="135834" author="yong.fan" created="Thu, 10 Dec 2015 15:45:21 +0000"  >&lt;p&gt;For a new formatted Lustre system, it does not guarantee that all the on-disk inodes/blocks are initialized. Instead, they are marked as free in related inode/block bitmaps. When the inode is allocated to some object, it will be initialized at that time. Such processing accelerates the format. But it may cause trouble for MDT file-level backup/restore. For example:&lt;/p&gt;

&lt;p&gt;The sysadmin backup the MDT via server side file level tar, then reformat the MDT device, and then restore the MDT from the backup. Assume some object_A, its FID is the FID_A, and it was mapped to the inode_B before the backup, such OI mapping was recorded in the OI file. After the restore, another inode_C is assigned to the object_A, but before OI scrub rebuilding related OI mapping, the stale mapping &quot;FID_A =&amp;gt; inode_B&quot; is still in the OI file, and can be found by OI lookup. Generally, even if someone found the stale OI mapping, it is not trouble, because the OSD will verify whether FID-in-LMA for the inode_B matches the FID_A or not. But if the inode_B is NOT allocated after the restore, and because we did not initialize inode_B during reformat, then the FID-in-LMA for the inode_B is still kept as the same before the backup, means&lt;br/&gt;
it matches the FID_A, then the OSD will think that the inode_B is still assigned to the object_A after the restore. That is wrong. Further, although the inode_B is not allocated, but some of the blocks that were assigned to it may have been allocated to others. Then accessing the blocks via the inode_B may access some invalid data, and may trigger some assertion, such as this issue.&lt;/p&gt;

&lt;p&gt;So the key issues are two:&lt;/p&gt;

&lt;p&gt;1) Some FID based operation may access stale OI mapping after MDT file-level backup/restore.&lt;/p&gt;

&lt;p&gt;2) The OSD-ldiskfs may get some non-allocated inode with the given ino#/generation.&lt;/p&gt;

&lt;p&gt;So as long as we can resolve one of them, then the trouble in the ticket can be resolved.&lt;/p&gt;

&lt;p&gt;The solutions for 1):&lt;/p&gt;

&lt;p&gt;1.1) Avoid FID based operation before OI scrub done. That is not easy, because fid2path cannot be covered by some name based operation.&lt;/p&gt;

&lt;p&gt;1.2) Remove the OI files after MDT file-level backup/restore. It is more direct solution. Another benefit is that even if OI scrub rebuilt the OI files, it only guarantees that all the FIDs&apos; OI mappings have been refreshed. But it does not clean the stale FIDs&apos; OI mappings. Because the OI scrub only does inode-table based scanning, not OI files scanning. Removing the OI files can resolve related trouble completely.&lt;/p&gt;

&lt;p&gt;The solutions for 2):&lt;/p&gt;

&lt;p&gt;2.1) New ldiskfs patch to make ldiskfs_iget() to return &quot;-ENOENT&quot; for the case of loading non-allocated inode by checking the inode bitmap.&lt;/p&gt;

&lt;p&gt;2.2) Check the inode&apos;s validity inside the OSD via the related inode bitmap.&lt;/p&gt;

&lt;p&gt;Generally, fewer ldiskfs patches is better. It will save a lot of effort when upgrading the kernel. So 2.1) is not the best solution. As for 2.2), it is not good to access the inode bitmap directly in the OSD unless we have to.&lt;/p&gt;

&lt;p&gt;Relatively, solution 1.2) is more efficient and beneficial. I will make a patch for that.&lt;/p&gt;</comment>
                            <comment id="135837" author="bzzz" created="Thu, 10 Dec 2015 15:56:03 +0000"  >&lt;p&gt;if an inode isn&apos;t allocated, then the corresponding bit in the bitmap should be 0 ?&lt;/p&gt;</comment>
                            <comment id="135840" author="yong.fan" created="Thu, 10 Dec 2015 16:04:07 +0000"  >&lt;p&gt;Yes, the inode table bitmap marks the inode as free after the restore, but the inode itself is not in initialised status because it has ever been used before the backup. But the ldiskfs_iget() does not check the inode table bitmap.&lt;/p&gt;</comment>
                            <comment id="135842" author="bzzz" created="Thu, 10 Dec 2015 16:10:43 +0000"  >&lt;p&gt;well, we&apos;ve got osd_iget() which can check the bitmaps (at least in case of broken OI) ?&lt;/p&gt;</comment>
                            <comment id="135843" author="bzzz" created="Thu, 10 Dec 2015 16:12:36 +0000"  >&lt;p&gt;also, I don&apos;t understand one thing - after restore we know that OI is stale, right? then how is it possible that someone is able to use it before it&apos;s rebuilt ?&lt;/p&gt;</comment>
                            <comment id="135846" author="yong.fan" created="Thu, 10 Dec 2015 16:15:36 +0000"  >&lt;blockquote&gt;
&lt;p&gt;well, we&apos;ve got osd_iget() which can check the bitmaps (at least in case of broken OI) ?&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;I have made some patch to check the inode table bitmap when ldiskfs_iget() returned the crashed directory inode. It shows that the inode table bitmaps marks the returned inode as free.&lt;/p&gt;</comment>
                            <comment id="135849" author="yong.fan" created="Thu, 10 Dec 2015 16:18:15 +0000"  >&lt;blockquote&gt;
&lt;p&gt;also, I don&apos;t understand one thing - after restore we know that OI is stale, right? then how is it possible that someone is able to use it before it&apos;s rebuilt ?&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;It is the LFSCK locating the .lustre/lost+found/MDTxxxx object. Originally, such locating is directly FID based, so osd_fid_lookup() will find the old OI mapping in the OI file; then ldiskfs_iget() returns the old inode, and the subsequent osd_check_lma() finds that the FID-in-LMA matches the given one, so it regards it as a valid inode. But in fact, it is a free inode.&lt;/p&gt;</comment>
                            <comment id="135851" author="bzzz" created="Thu, 10 Dec 2015 16:21:31 +0000"  >&lt;p&gt;I find it wrong that we let anyone use stale OI. it&apos;s just matter of time to hit a similar issue again in the future.&lt;/p&gt;</comment>
                            <comment id="135854" author="yong.fan" created="Thu, 10 Dec 2015 16:45:39 +0000"  >&lt;p&gt;It is not that case, because originally, after the file-level backup/restore, the OI scrub will rebuild the OI files. But such rebuilding is based on the original OI files, which means that during the rebuilding, the OI files may contain some valid OI mappings and some stale OI mappings. From others&apos; view, it is not easy to know which one is valid and which is invalid. So osd_fid_lookup() will make verification via checking the FID-in-LMA. Unfortunately, if the target inode has been neither allocated nor initialized, then such verification will be cheated. That is why I made a patch to remove the OI files completely for the file-level backup/restore case. That can avoid such trouble at the root.&lt;/p&gt;</comment>
                            <comment id="135856" author="bzzz" created="Thu, 10 Dec 2015 16:49:24 +0000"  >&lt;p&gt;iirc, we used FS&apos;s UUID (which is unique) to recognize backup-restore case which invalidates most of OI entries..&lt;/p&gt;</comment>
                            <comment id="135992" author="yong.fan" created="Fri, 11 Dec 2015 00:46:45 +0000"  >&lt;p&gt;Yes, that is what we did. When initialising the OSD, it will check whether the FS UUID that is stored in the OI_scrub file matches the current FS UUID or not. If not, then it is quite possible that the system has been restored. At that time, almost all of the OI mappings in the OI files are invalid. But as the OI scrub runs, more and more OI mappings are updated in the OI files, so the OI files will contain both valid and invalid OI mappings.&lt;/p&gt;

&lt;p&gt;The new patch will remove the OI files completely when it detects the MDT file-level backup/restore. That will guarantee that during the OI scrub, all the OI mappings in the OI files are valid.&lt;/p&gt;</comment>
                            <comment id="136819" author="gerrit" created="Fri, 18 Dec 2015 05:27:46 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;http://review.whamcloud.com/17403/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/17403/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6895&quot; title=&quot;sanity-lfsck test 4 hung: bad entry in directory: rec_len is smaller than minimal - inode=3925999616&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6895&quot;&gt;&lt;del&gt;LU-6895&lt;/del&gt;&lt;/a&gt; lfsck: drop bad OI files after MDT file-level restore&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 4c45115bdf2c5145ca3779c38a06e90024f013f0&lt;/p&gt;</comment>
                            <comment id="136845" author="jgmitter" created="Fri, 18 Dec 2015 14:07:22 +0000"  >&lt;p&gt;Landed for 2.8&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10120">
                    <name>Blocker</name>
                                            <outwardlinks description="is blocking">
                                        <issuelink>
            <issuekey id="31067">LU-6843</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="33740">LU-7572</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                                        </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="32154">LU-7169</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="32698">LU-7315</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzxisn:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>