<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:26:49 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-2627] /bin/ls gets Input/output error</title>
                <link>https://jira.whamcloud.com/browse/LU-2627</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Doing an ls gives the following error&lt;br/&gt;
ls: reading directory d4_stats/: Input/output error&lt;/p&gt;

&lt;p&gt;client error:&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;5237686.818045&amp;#93;&lt;/span&gt; LustreError: 77522:0:(dir.c:648:ll_readdir()) error reading dir &lt;span class=&quot;error&quot;&gt;&amp;#91;0x4488b6ced74:0x1edb5:0x0&amp;#93;&lt;/span&gt; at 0: rc -5&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;5237686.849844&amp;#93;&lt;/span&gt; LustreError: 77522:0:(dir.c:648:ll_readdir()) Skipped 51 previous similar messages&lt;/p&gt;

&lt;p&gt;MDT Error:&lt;br/&gt;
Jan 16 11:18:37 nbp1-mds kernel: Lustre: 15390:0:(mdd_object.c:2412:__mdd_readpage()) build page failed: -5!&lt;/p&gt;

&lt;p&gt;Please advise on debug flags to use to gather logs.&lt;/p&gt;</description>
                <environment></environment>
        <key id="17194">LU-2627</key>
            <summary>/bin/ls gets Input/output error</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="cliffw">Cliff White</assignee>
                                    <reporter username="mhanafi">Mahmoud Hanafi</reporter>
                        <labels>
                    </labels>
                <created>Wed, 16 Jan 2013 14:24:11 +0000</created>
                <updated>Wed, 3 Jul 2013 21:02:54 +0000</updated>
                            <resolved>Thu, 21 Mar 2013 20:23:50 +0000</resolved>
                                    <version>Lustre 2.1.3</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>12</watches>
                                                                            <comments>
                            <comment id="50565" author="mhanafi" created="Wed, 16 Jan 2013 14:29:09 +0000"  >&lt;p&gt;MDS has logged these messages&lt;br/&gt;
Jan 15 15:23:11 nbp1-mds kernel: LDISKFS-fs warning (device dm-2): dx_probe: dx entry: limit != root limit&lt;br/&gt;
Jan 15 15:23:11 nbp1-mds kernel: LDISKFS-fs warning (device dm-2): dx_probe: Corrupt dir inode 158428053, running e2fsck is recommended.&lt;br/&gt;
Jan 15 15:23:11 nbp1-mds kernel: LDISKFS-fs warning (device dm-2): dx_probe: dx entry: limit != root limit&lt;br/&gt;
Jan 15 15:23:11 nbp1-mds kernel: LDISKFS-fs warning (device dm-2): dx_probe: Corrupt dir inode 158428056, running e2fsck is recommended.&lt;br/&gt;
Jan 15 15:23:45 nbp1-mds kernel: LDISKFS-fs warning (device dm-2): dx_probe: dx entry: limit != root limit&lt;br/&gt;
Jan 15 15:23:45 nbp1-mds kernel: LDISKFS-fs warning (device dm-2): dx_probe: Corrupt dir inode 158427968, running e2fsck is recommended.&lt;br/&gt;
Jan 15 15:23:45 nbp1-mds kernel: LDISKFS-fs warning (device dm-2): dx_probe: dx entry: limit != root limit&lt;br/&gt;
Jan 15 15:23:45 nbp1-mds kernel: LDISKFS-fs warning (device dm-2): dx_probe: Corrupt dir inode 158428043, running e2fsck is recommended.&lt;br/&gt;
Jan 16 03:28:55 nbp1-mds kernel: LDISKFS-fs warning (device dm-2): dx_probe: Unrecognised inode hash code 5 for directory #37273881&lt;br/&gt;
Jan 16 03:28:55 nbp1-mds kernel: LDISKFS-fs warning (device dm-2): dx_probe: Corrupt dir inode 37273881, running e2fsck is recommended.&lt;br/&gt;
Jan 16 07:24:28 nbp1-mds kernel: LDISKFS-fs warning (device dm-2): dx_probe: Unrecognised inode hash code 4 for directory #39331752&lt;br/&gt;
Jan 16 07:24:28 nbp1-mds kernel: LDISKFS-fs warning (device dm-2): dx_probe: Corrupt dir inode 39331752, running e2fsck is recommended.&lt;br/&gt;
Jan 16 07:25:11 nbp1-mds kernel: LDISKFS-fs warning (device dm-2): dx_probe: Unrecognised inode hash code 5 for directory #37273881&lt;br/&gt;
Jan 16 07:25:11 nbp1-mds kernel: LDISKFS-fs warning (device dm-2): dx_probe: Corrupt dir inode 37273881, running e2fsck is recommended.&lt;br/&gt;
Jan 16 09:59:07 nbp1-mds kernel: LDISKFS-fs warning (device dm-2): dx_probe: Unrecognised inode hash code 107 for directory #15731753&lt;br/&gt;
Jan 16 09:59:07 nbp1-mds kernel: LDISKFS-fs warning (device dm-2): dx_probe: Corrupt dir inode 15731753, running e2fsck is recommended.&lt;br/&gt;
Jan 16 11:13:11 nbp1-mds kernel: LDISKFS-fs warning (device dm-2): dx_probe: Unrecognised inode hash code 10 for directory #37272081&lt;br/&gt;
Jan 16 11:13:11 nbp1-mds kernel: LDISKFS-fs warning (device dm-2): dx_probe: Corrupt dir inode 37272081, running e2fsck is recommended.&lt;/p&gt;
</comment>
                            <comment id="50567" author="mhanafi" created="Wed, 16 Jan 2013 14:29:48 +0000"  >&lt;p&gt;Can we unmount and just run e2fsck on the MDT device?&lt;/p&gt;</comment>
                            <comment id="50585" author="cliffw" created="Wed, 16 Jan 2013 16:48:51 +0000"  >&lt;p&gt;Yes, you should umount and fsck the MDT. You do not have to umount clients, however clients may block while the MDT is down.&lt;/p&gt;</comment>
                            <comment id="50587" author="mhanafi" created="Wed, 16 Jan 2013 16:58:40 +0000"  >&lt;p&gt;Can you please provide the exact options to use for the fsck command?&lt;/p&gt;</comment>
                            <comment id="50588" author="cliffw" created="Wed, 16 Jan 2013 17:05:28 +0000"  >&lt;p&gt;First, check all your logs and see if you are having hardware failures. Is there any error logging in your disk hardware? The device under dm-2 may have an issue.&lt;br/&gt;
For fsck, first run&lt;/p&gt;
&lt;ol&gt;
	&lt;li&gt;fsck -fn &amp;lt;yourMDTdevice&amp;gt;&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;This is a read-only pass, and should give you an idea of what is going on. &lt;br/&gt;
Then, you can run &lt;/p&gt;
&lt;ol&gt;
	&lt;li&gt;fsck -fy &amp;lt;yourMDTdevice&amp;gt;&lt;br/&gt;
to repair. &lt;/li&gt;
&lt;/ol&gt;
</comment>
                            <comment id="50589" author="mhanafi" created="Wed, 16 Jan 2013 17:12:07 +0000"  >&lt;p&gt;Read only pass has lots of errors like this&lt;br/&gt;
Error while reading over extent tree in inode 14726136: Corrupt extent header&lt;/p&gt;</comment>
                            <comment id="50590" author="cliffw" created="Wed, 16 Jan 2013 17:14:38 +0000"  >&lt;p&gt;Can you post the full output?&lt;/p&gt;</comment>
                            <comment id="50591" author="cliffw" created="Wed, 16 Jan 2013 17:15:12 +0000"  >&lt;p&gt;And are you using e2fsprogs from Whamcloud? Please indicate the version of e2fsprogs you have installed.&lt;/p&gt;</comment>
                            <comment id="50592" author="cliffw" created="Wed, 16 Jan 2013 17:16:33 +0000"  >&lt;p&gt;Also, is there any indication of a hardware issue with the disk?&lt;/p&gt;</comment>
                            <comment id="50593" author="mhanafi" created="Wed, 16 Jan 2013 17:18:29 +0000"  >&lt;p&gt;long list of this &lt;/p&gt;

&lt;p&gt;nbp1-MDT0000 has been mounted 110 times without being checked, check forced.&lt;br/&gt;
Pass 1: Checking inodes, blocks, and sizes&lt;br/&gt;
Error while reading over extent tree in inode 8502968: Corrupt extent header&lt;br/&gt;
Clear inode? no&lt;/p&gt;

&lt;p&gt;Error while reading over extent tree in inode 8503011: Corrupt extent header&lt;br/&gt;
Clear inode? no&lt;/p&gt;

&lt;p&gt;Error while reading over extent tree in inode 8503034: Corrupt extent header&lt;br/&gt;
Clear inode? no&lt;/p&gt;

&lt;p&gt;Error while reading over extent tree in inode 8503327: Corrupt extent header&lt;br/&gt;
Clear inode? no&lt;/p&gt;

&lt;p&gt;Error while reading over extent tree in inode 8503340: Corrupt extent header&lt;br/&gt;
Clear inode? no&lt;/p&gt;

&lt;p&gt;Error while reading over extent tree in inode 8503345: Corrupt extent header&lt;br/&gt;
Clear inode? no&lt;/p&gt;

&lt;p&gt;Error while reading over extent tree in inode 8503781: Corrupt extent header&lt;br/&gt;
Clear inode? no&lt;/p&gt;

&lt;p&gt;Error while reading over extent tree in inode 8503785: Corrupt extent header&lt;br/&gt;
Clear inode? no&lt;/p&gt;

&lt;p&gt;Error while reading over extent tree in inode 8503787: Corrupt extent header&lt;br/&gt;
Clear inode? no&lt;/p&gt;

&lt;p&gt;Error while reading over extent tree in inode 8503801: Corrupt extent header&lt;br/&gt;
Clear inode? no&lt;/p&gt;

&lt;p&gt;Error while reading over extent tree in inode 8503805: Corrupt extent header&lt;br/&gt;
Clear inode? no&lt;/p&gt;

&lt;p&gt;Error while reading over extent tree in inode 8503808: Corrupt extent header&lt;br/&gt;
Clear inode? no&lt;/p&gt;

&lt;p&gt;Error while reading over extent tree in inode 8503810: Corrupt extent header&lt;br/&gt;
Clear inode? no&lt;/p&gt;

&lt;p&gt;Error while reading over extent tree in inode 8503956: Corrupt extent header&lt;br/&gt;
Clear inode? no&lt;/p&gt;

&lt;p&gt;Error while reading over extent tree in inode 8503961: Corrupt extent header&lt;br/&gt;
Clear inode? no&lt;/p&gt;

&lt;p&gt;Error while reading over extent tree in inode 8504005: Corrupt extent header&lt;br/&gt;
Clear inode? no&lt;/p&gt;

&lt;p&gt;Error while reading over extent tree in inode 8510949: Corrupt extent header&lt;br/&gt;
Clear inode? no&lt;/p&gt;

&lt;p&gt;Error while reading over extent tree in inode 8541695: Corrupt extent header&lt;br/&gt;
Clear inode? no&lt;/p&gt;

&lt;p&gt;I just stopped it for now.&lt;/p&gt;</comment>
                            <comment id="50594" author="mhanafi" created="Wed, 16 Jan 2013 17:24:42 +0000"  >&lt;p&gt;FYI- this is 1.8 upgraded to 2.x filesystem.&lt;br/&gt;
we have the following&lt;/p&gt;

&lt;p&gt;e2fsprogs-1.41.90.wc4-7.el6.x86_64&lt;br/&gt;
lustre-ldiskfs-3.3.0-1nasS_2.6.32_279.2.1.el6.20120824.x86_64.lustre213.x86_64&lt;/p&gt;</comment>
                            <comment id="50595" author="cliffw" created="Wed, 16 Jan 2013 17:25:27 +0000"  >&lt;p&gt;That is rather bad. You need to verify that your disk hardware is healthy, you may be seeing a disk failure. Do you have a backup? &lt;/p&gt;</comment>
                            <comment id="50596" author="cliffw" created="Wed, 16 Jan 2013 17:36:21 +0000"  >&lt;p&gt;Can you give us your kernel version, and the version on all Lustre RPMS? You compile your own Lustre?&lt;/p&gt;</comment>
                            <comment id="50597" author="mhanafi" created="Wed, 16 Jan 2013 17:37:12 +0000"  >&lt;p&gt;Hardware is healthy. We don&apos;t have backups. But I am able to remount the mdt. &lt;br/&gt;
but got these errors&lt;br/&gt;
Jan 16 14:24:10 nbp1-mds kernel: Lustre: nbp1-MDT0000: disconnecting 1 stale clients&lt;br/&gt;
Jan 16 14:24:10 nbp1-mds kernel: LustreError: 86971:0:(mdt_handler.c:2792:mdt_recovery()) operation 35 on unconnected MDS from 12345-10.151.53.248@o2ib&lt;br/&gt;
Jan 16 14:24:10 nbp1-mds kernel: LustreError: 86971:0:(mdt_handler.c:2792:mdt_recovery()) Skipped 2802 previous similar messages&lt;br/&gt;
Jan 16 14:24:10 nbp1-mds kernel: Lustre: 87267:0:(ldlm_lib.c:946:target_handle_connect()) nbp1-MDT0000: connection from 14c5fa4f-a16b-c855-4b79-5ba10c67a331@10.151.53.248@o2ib recovering/t491111894205 exp (null) cur 1358375050 last 0&lt;br/&gt;
Jan 16 14:24:10 nbp1-mds kernel: Lustre: 87267:0:(ldlm_lib.c:946:target_handle_connect()) Skipped 4169 previous similar messages&lt;br/&gt;
Jan 16 14:24:10 nbp1-mds kernel: Lustre: nbp1-MDT0000: Client f0dbfa3e-028e-c2ce-22e3-c3f171e89ebf (at 10.151.26.25@o2ib) reconnecting, waiting for 12090 clients in recovery for 3:12&lt;br/&gt;
Jan 16 14:24:10 nbp1-mds kernel: Lustre: nbp1-MDT0000: Denying connection for new client 10.151.53.248@o2ib (at 14c5fa4f-a16b-c855-4b79-5ba10c67a331), waiting for 745 clients in recovery for 3:12&lt;br/&gt;
Jan 16 14:24:10 nbp1-mds kernel: Lustre: Skipped 47 previous similar messages&lt;br/&gt;
Jan 16 14:24:12 nbp1-mds kernel: Lustre: nbp1-MDT0000: Client dd76159a-cc95-ee10-7333-4317d790b9fb (at 10.151.27.24@o2ib) reconnecting, waiting for 12090 clients in recovery for 3:12&lt;br/&gt;
Jan 16 14:24:13 nbp1-mds kernel: Lustre: nbp1-MDT0000: sending delayed replies to recovered clients&lt;br/&gt;
Jan 16 14:24:15 nbp1-mds kernel: Lustre: MDS mdd_obd-nbp1-MDT0000: nbp1-OST000a_UUID now active, resetting orphans&lt;br/&gt;
Jan 16 14:24:15 nbp1-mds kernel: Lustre: Skipped 91 previous similar messages&lt;br/&gt;
Jan 16 14:24:15 nbp1-mds kernel: Lustre: MDS mdd_obd-nbp1-MDT0000: nbp1-OST0002_UUID now active, resetting orphans&lt;br/&gt;
Jan 16 14:24:15 nbp1-mds kernel: Lustre: Skipped 1 previous similar message&lt;br/&gt;
Jan 16 14:24:15 nbp1-mds kernel: Lustre: 86698:0:(mdd_orphans.c:371:orph_key_test_and_del()) Found orphan! Delete it&lt;br/&gt;
Jan 16 14:24:15 nbp1-mds kernel: Lustre: 86698:0:(mdd_orphans.c:371:orph_key_test_and_del()) Skipped 2579 previous similar messages&lt;br/&gt;
Jan 16 14:24:16 nbp1-mds kernel: Lustre: MDS mdd_obd-nbp1-MDT0000: nbp1-OST0025_UUID now active, resetting orphans&lt;br/&gt;
Jan 16 14:24:16 nbp1-mds kernel: Lustre: Skipped 22 previous similar messages&lt;br/&gt;
Jan 16 14:24:18 nbp1-mds kernel: Lustre: 86698:0:(mdd_orphans.c:371:orph_key_test_and_del()) Found orphan! Delete it&lt;br/&gt;
Jan 16 14:24:18 nbp1-mds kernel: Lustre: 86698:0:(mdd_orphans.c:371:orph_key_test_and_del()) Skipped 1 previous similar message&lt;br/&gt;
Jan 16 14:24:22 nbp1-mds kernel: Lustre: 86698:0:(mdd_orphans.c:371:orph_key_test_and_del()) Found orphan! Delete it&lt;br/&gt;
Jan 16 14:24:22 nbp1-mds kernel: Lustre: 86698:0:(mdd_orphans.c:371:orph_key_test_and_del()) Skipped 18 previous similar messages&lt;br/&gt;
Jan 16 14:24:30 nbp1-mds kernel: Lustre: 86698:0:(mdd_orphans.c:371:orph_key_test_and_del()) Found orphan! Delete it&lt;br/&gt;
Jan 16 14:24:30 nbp1-mds kernel: Lustre: 86698:0:(mdd_orphans.c:371:orph_key_test_and_del()) Skipped 156 previous similar messages&lt;br/&gt;
Jan 16 14:24:45 nbp1-mds kernel: LDISKFS-fs warning (device dm-2): dx_probe: Unrecognised inode hash code 10 for directory #37272081&lt;br/&gt;
Jan 16 14:24:45 nbp1-mds kernel: LDISKFS-fs warning (device dm-2): dx_probe: Corrupt dir inode 37272081, running e2fsck is recommended.&lt;br/&gt;
Jan 16 14:24:45 nbp1-mds kernel: Lustre: 87669:0:(mdd_object.c:2412:__mdd_readpage()) build page failed: -5!&lt;br/&gt;
Jan 16 14:24:45 nbp1-mds kernel: Lustre: 87669:0:(mdd_object.c:2412:__mdd_readpage()) Skipped 1 previous similar message&lt;br/&gt;
Jan 16 14:24:46 nbp1-mds kernel: Lustre: 86698:0:(mdd_orphans.c:371:orph_key_test_and_del()) Found orphan! Delete it&lt;br/&gt;
Jan 16 14:24:46 nbp1-mds kernel: Lustre: 86698:0:(mdd_orphans.c:371:orph_key_test_and_del()) Skipped 1476 previous similar messages&lt;/p&gt;</comment>
                            <comment id="50598" author="mhanafi" created="Wed, 16 Jan 2013 17:43:58 +0000"  >&lt;p&gt;Here is list of server lustre rpms&lt;br/&gt;
lustre-debuginfo-2.1.3-1nasS_2.6.32_279.2.1.el6.20120824.x86_64.lustre213.x86_64&lt;br/&gt;
lustre-2.1.3-1nasS_2.6.32_279.2.1.el6.20120824.x86_64.lustre213.x86_64&lt;br/&gt;
lustre-systemtap-0.9.3-2.noarch&lt;br/&gt;
lustre-modules-2.1.3-1nasS_2.6.32_279.2.1.el6.20120824.x86_64.lustre213.x86_64&lt;br/&gt;
lustre-ldiskfs-debuginfo-3.3.0-1nasS_2.6.32_279.2.1.el6.20120824.x86_64.lustre213.x86_64&lt;br/&gt;
lustre-ldiskfs-3.3.0-1nasS_2.6.32_279.2.1.el6.20120824.x86_64.lustre213.x86_64&lt;br/&gt;
lustre-source-2.1.3-1nasS_2.6.32_279.2.1.el6.20120824.x86_64.lustre213.x86_64&lt;br/&gt;
lustre-tools-0.7.10-2.noarch&lt;br/&gt;
lustre-tests-2.1.3-1nasS_2.6.32_279.2.1.el6.20120824.x86_64.lustre213.x86_64&lt;/p&gt;

&lt;p&gt;Linux nbp1-mds 2.6.32-279.2.1.el6.20120824.x86_64.lustre213 #1 SMP Mon Aug 27 15:02:12 EDT 2012 x86_64 x86_64 x86_64 GNU/Linux&lt;/p&gt;

&lt;p&gt;our source tree is out on github&lt;/p&gt;</comment>
                            <comment id="50599" author="mhanafi" created="Wed, 16 Jan 2013 17:47:07 +0000"  >&lt;p&gt;should I remount the mdt for now so the clients can recover or hold off.&lt;/p&gt;
</comment>
                            <comment id="50601" author="cliffw" created="Wed, 16 Jan 2013 17:48:44 +0000"  >&lt;p&gt;The only actual errors are the LDISKFS-fs warnings. The rest are mostly standard restart. (errors should start with LustreError) &lt;br/&gt;
It would have been better to have run fsck -fy prior to the remount.&lt;br/&gt;
We are concerned that the issues reported by fsck -fn will likely result in lost files if you run fsck -fy (the inodes with errors may be removed).  You need to verify the health of the hardware prior to attempting any filesystem repair. &lt;br/&gt;
If the hardware is healthy, fsck -fy will likely make the filesystem operable, but you will probably have to identify and restore the affected files.&lt;/p&gt;</comment>
                            <comment id="50602" author="mhanafi" created="Wed, 16 Jan 2013 17:52:11 +0000"  >&lt;p&gt;Should we upgrade our e2fsprogs and try? The hardware is definitely healthy.&lt;/p&gt;</comment>
                            <comment id="50603" author="jaylan" created="Wed, 16 Jan 2013 17:53:45 +0000"  >&lt;p&gt;The git source for the server can be found at&lt;br/&gt;
&lt;a href=&quot;https://github.com/jlan/lustre-nas&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/jlan/lustre-nas&lt;/a&gt;&lt;br/&gt;
with branch nas-2.1.3, tag 2.1.3-1nasS.&lt;/p&gt;</comment>
                            <comment id="50604" author="mhanafi" created="Wed, 16 Jan 2013 18:01:28 +0000"  >&lt;p&gt;Do you have any documentation of identifying and restoring the affected files?&lt;/p&gt;</comment>
                            <comment id="50605" author="cliffw" created="Wed, 16 Jan 2013 18:24:05 +0000"  >&lt;p&gt;Yes, you should upgrade to the latest e2fsprogs from &lt;a href=&quot;http://downloads.whamcloud.com/public/e2fsprogs/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://downloads.whamcloud.com/public/e2fsprogs/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="50607" author="mhanafi" created="Wed, 16 Jan 2013 18:35:44 +0000"  >&lt;p&gt;After upgrading to the new e2fsck, most of the errors are&lt;/p&gt;

&lt;p&gt;Fast symlink 21070423 has EXTENT_FL set.  Clear? no&lt;/p&gt;

&lt;p&gt;and &lt;/p&gt;

&lt;p&gt;Inode 11172447 symlink missing NUL terminator.  Fix? no&lt;/p&gt;

&lt;p&gt;is that good or bad?&lt;/p&gt;</comment>
                            <comment id="50608" author="cliffw" created="Wed, 16 Jan 2013 19:09:30 +0000"  >&lt;p&gt;Not horrible. We think you may lose some symlinks. I would go ahead and say &apos;y&apos;&lt;/p&gt;</comment>
                            <comment id="50609" author="mhanafi" created="Wed, 16 Jan 2013 19:15:46 +0000"  >&lt;p&gt;hmmm, e2fsck -vn hit a SIGSEGV once it started checking directory structures&lt;/p&gt;

&lt;p&gt;nbp1-mds ~/newrpms # e2fsck -vn /dev/mapper/nbp1-vg-mdt1 &amp;gt; initcheck.out&lt;br/&gt;
e2fsck 1.42.3.wc3 (15-Aug-2012)&lt;br/&gt;
Signal (11) SIGSEGV si_code=SEGV_MAPERR fault addr=0x8&lt;br/&gt;
e2fsck&lt;span class=&quot;error&quot;&gt;&amp;#91;0x42d1dd&amp;#93;&lt;/span&gt;&lt;br/&gt;
/lib64/libc.so.6&lt;span class=&quot;error&quot;&gt;&amp;#91;0x3a42432900&amp;#93;&lt;/span&gt;&lt;br/&gt;
e2fsck&lt;span class=&quot;error&quot;&gt;&amp;#91;0x41160a&amp;#93;&lt;/span&gt;&lt;br/&gt;
e2fsck(e2fsck_process_bad_inode+0x693)&lt;span class=&quot;error&quot;&gt;&amp;#91;0x419b13&amp;#93;&lt;/span&gt;&lt;br/&gt;
e2fsck&lt;span class=&quot;error&quot;&gt;&amp;#91;0x419cfb&amp;#93;&lt;/span&gt;&lt;br/&gt;
/lib64/libext2fs.so.2(ext2fs_dblist_iterate2+0x87)&lt;span class=&quot;error&quot;&gt;&amp;#91;0x7fffed8c9447&amp;#93;&lt;/span&gt;&lt;br/&gt;
e2fsck(e2fsck_pass2+0x10b)&lt;span class=&quot;error&quot;&gt;&amp;#91;0x418ceb&amp;#93;&lt;/span&gt;&lt;br/&gt;
e2fsck(e2fsck_run+0x4f)&lt;span class=&quot;error&quot;&gt;&amp;#91;0x40eb6f&amp;#93;&lt;/span&gt;&lt;br/&gt;
e2fsck(main+0xbd2)&lt;span class=&quot;error&quot;&gt;&amp;#91;0x40cbf2&amp;#93;&lt;/span&gt;&lt;br/&gt;
/lib64/libc.so.6(__libc_start_main+0xfd)&lt;span class=&quot;error&quot;&gt;&amp;#91;0x3a4241ecdd&amp;#93;&lt;/span&gt;&lt;br/&gt;
e2fsck&lt;span class=&quot;error&quot;&gt;&amp;#91;0x409c49&amp;#93;&lt;/span&gt;&lt;/p&gt;</comment>
                            <comment id="50610" author="adilger" created="Wed, 16 Jan 2013 19:22:38 +0000"  >&lt;p&gt;How did the extents feature get enabled on the MDT filesystem?  This is not a standard formatting option, and not something that we test locally.  It is likely the root cause of the problems that you are seeing.&lt;/p&gt;</comment>
                            <comment id="50612" author="mhanafi" created="Wed, 16 Jan 2013 19:25:15 +0000"  >&lt;p&gt;here is what we have. Should we remove &quot;extent&quot; Option?&lt;br/&gt;
nbp1-mds ~/newrpms # tune2fs -l /dev/mapper/nbp1-vg-mdt1&lt;br/&gt;
tune2fs 1.42.3.wc3 (15-Aug-2012)&lt;br/&gt;
Filesystem volume name:   nbp1-MDT0000&lt;br/&gt;
Last mounted on:          /&lt;br/&gt;
Filesystem UUID:          aa5a51d9-5858-4bad-b6b6-668298ae0a7e&lt;br/&gt;
Filesystem magic number:  0xEF53&lt;br/&gt;
Filesystem revision #:    1 (dynamic)&lt;br/&gt;
Filesystem features:      has_journal ext_attr resize_inode dir_index filetype extent flex_bg dirdata sparse_super large_file huge_file uninit_bg dir_nlink extra_isize&lt;br/&gt;
Filesystem flags:         signed_directory_hash &lt;br/&gt;
Default mount options:    (none)&lt;br/&gt;
Filesystem state:         clean&lt;br/&gt;
Errors behavior:          Continue&lt;br/&gt;
Filesystem OS type:       Linux&lt;br/&gt;
Inode count:              268435456&lt;br/&gt;
Block count:              268435456&lt;br/&gt;
Reserved block count:     0&lt;br/&gt;
Free blocks:              224938984&lt;br/&gt;
Free inodes:              204578188&lt;br/&gt;
First block:              0&lt;br/&gt;
Block size:               4096&lt;br/&gt;
Fragment size:            4096&lt;br/&gt;
Reserved GDT blocks:      960&lt;br/&gt;
Blocks per group:         32768&lt;br/&gt;
Fragments per group:      32768&lt;br/&gt;
Inodes per group:         32768&lt;br/&gt;
Inode blocks per group:   4096&lt;br/&gt;
Flex block group size:    16&lt;br/&gt;
Filesystem created:       Wed Jun  8 19:54:48 2011&lt;br/&gt;
Last mount time:          Wed Jan 16 14:17:10 2013&lt;br/&gt;
Last write time:          Wed Jan 16 15:30:31 2013&lt;br/&gt;
Mount count:              112&lt;br/&gt;
Maximum mount count:      20&lt;br/&gt;
Last checked:             Wed Jun  8 19:54:48 2011&lt;br/&gt;
Check interval:           15552000 (6 months)&lt;br/&gt;
Next check after:         Mon Dec  5 18:54:48 2011&lt;br/&gt;
Lifetime writes:          7121 GB&lt;br/&gt;
Reserved blocks uid:      0 (user root)&lt;br/&gt;
Reserved blocks gid:      0 (group root)&lt;br/&gt;
First inode:              11&lt;br/&gt;
Inode size:               512&lt;br/&gt;
Required extra isize:     28&lt;br/&gt;
Desired extra isize:      28&lt;br/&gt;
Journal inode:            8&lt;br/&gt;
Default directory hash:   half_md4&lt;br/&gt;
Directory Hash Seed:      9bb09704-2b6c-4030-afe1-fcb7935216aa&lt;br/&gt;
Journal backup:           inode blocks&lt;/p&gt;</comment>
                            <comment id="50613" author="cliffw" created="Wed, 16 Jan 2013 19:26:36 +0000"  >&lt;p&gt;Can you give us the output of &apos;tune2fs -l &amp;lt;device&amp;gt;&apos; and tunefs.lustre -print &amp;lt;device&amp;gt; ?&lt;/p&gt;</comment>
                            <comment id="50614" author="mhanafi" created="Wed, 16 Jan 2013 19:28:45 +0000"  >&lt;p&gt;nbp1-mds ~/newrpms # tune2fs -l /dev/mapper/nbp1-vg-mdt1&lt;br/&gt;
tune2fs 1.42.3.wc3 (15-Aug-2012)&lt;br/&gt;
Filesystem volume name:   nbp1-MDT0000&lt;br/&gt;
Last mounted on:          /&lt;br/&gt;
Filesystem UUID:          aa5a51d9-5858-4bad-b6b6-668298ae0a7e&lt;br/&gt;
Filesystem magic number:  0xEF53&lt;br/&gt;
Filesystem revision #:    1 (dynamic)&lt;br/&gt;
Filesystem features:      has_journal ext_attr resize_inode dir_index filetype extent flex_bg dirdata sparse_super large_file huge_file uninit_bg dir_nlink extra_isize&lt;br/&gt;
Filesystem flags:         signed_directory_hash &lt;br/&gt;
Default mount options:    (none)&lt;br/&gt;
Filesystem state:         clean&lt;br/&gt;
Errors behavior:          Continue&lt;br/&gt;
Filesystem OS type:       Linux&lt;br/&gt;
Inode count:              268435456&lt;br/&gt;
Block count:              268435456&lt;br/&gt;
Reserved block count:     0&lt;br/&gt;
Free blocks:              224938984&lt;br/&gt;
Free inodes:              204578188&lt;br/&gt;
First block:              0&lt;br/&gt;
Block size:               4096&lt;br/&gt;
Fragment size:            4096&lt;br/&gt;
Reserved GDT blocks:      960&lt;br/&gt;
Blocks per group:         32768&lt;br/&gt;
Fragments per group:      32768&lt;br/&gt;
Inodes per group:         32768&lt;br/&gt;
Inode blocks per group:   4096&lt;br/&gt;
Flex block group size:    16&lt;br/&gt;
Filesystem created:       Wed Jun  8 19:54:48 2011&lt;br/&gt;
Last mount time:          Wed Jan 16 14:17:10 2013&lt;br/&gt;
Last write time:          Wed Jan 16 15:30:31 2013&lt;br/&gt;
Mount count:              112&lt;br/&gt;
Maximum mount count:      20&lt;br/&gt;
Last checked:             Wed Jun  8 19:54:48 2011&lt;br/&gt;
Check interval:           15552000 (6 months)&lt;br/&gt;
Next check after:         Mon Dec  5 18:54:48 2011&lt;br/&gt;
Lifetime writes:          7121 GB&lt;br/&gt;
Reserved blocks uid:      0 (user root)&lt;br/&gt;
Reserved blocks gid:      0 (group root)&lt;br/&gt;
First inode:              11&lt;br/&gt;
Inode size:               512&lt;br/&gt;
Required extra isize:     28&lt;br/&gt;
Desired extra isize:      28&lt;br/&gt;
Journal inode:            8&lt;br/&gt;
Default directory hash:   half_md4&lt;br/&gt;
Directory Hash Seed:      9bb09704-2b6c-4030-afe1-fcb7935216aa&lt;br/&gt;
Journal backup:           inode blocks&lt;br/&gt;
nbp1-mds ~/newrpms # tunefs.lustre -print /dev/mapper/nbp1-vg-mdt1&lt;br/&gt;
checking for existing Lustre data: found CONFIGS/mountdata&lt;br/&gt;
Reading CONFIGS/mountdata&lt;/p&gt;

&lt;p&gt;   Read previous values:&lt;br/&gt;
Target:     nbp1-MDT0000&lt;br/&gt;
Index:      0&lt;br/&gt;
Lustre FS:  nbp1&lt;br/&gt;
Mount type: ldiskfs&lt;br/&gt;
Flags:      0x401&lt;br/&gt;
              (MDT )&lt;br/&gt;
Persistent mount opts: iopen_nopriv,user_xattr,errors=remount-ro&lt;br/&gt;
Parameters: mgsnode=10.151.26.26@o2ib lov.stripesize=1048576 lov.stripecount=4 mdd.quota_type=u mdt.identity_info=/usr/sbin/l_getidentity&lt;/p&gt;


&lt;p&gt;   Permanent disk data:&lt;br/&gt;
Target:     nbp1-MDT0000&lt;br/&gt;
Index:      0&lt;br/&gt;
Lustre FS:  nbp1&lt;br/&gt;
Mount type: ldiskfs&lt;br/&gt;
Flags:      0x441&lt;br/&gt;
              (MDT update )&lt;br/&gt;
Persistent mount opts: iopen_nopriv,user_xattr,errors=remount-ro&lt;br/&gt;
Parameters: mgsnode=10.151.26.26@o2ib lov.stripesize=1048576 lov.stripecount=4 mdd.quota_type=u mdt.identity_info=/usr/sbin/l_getidentity rint&lt;/p&gt;

&lt;p&gt;Writing CONFIGS/mountdata&lt;br/&gt;
nbp1-mds ~/newrpms # &lt;/p&gt;</comment>
                            <comment id="50615" author="cliffw" created="Wed, 16 Jan 2013 19:29:34 +0000"  >&lt;p&gt;sorry, looks like our replies crossed. I see you already supplied the tune2fs&lt;/p&gt;</comment>
                            <comment id="50616" author="adilger" created="Wed, 16 Jan 2013 19:39:31 +0000"  >&lt;p&gt;Can you please run e2fsck under gdb with &quot;-n&quot; option and paste the resulting stack trace here?  I can&apos;t see enough of where the problem is above.&lt;/p&gt;

&lt;p&gt;If you have a spare SATA disk I would recommend making a full backup of the MDT device with &quot;dd&quot;, since this would go relatively quickly (maybe at 100MB/s, so a few hours for the full backup).  This may be important in case running the real e2fsck doesn&apos;t go well (depending on what corruption is being seen).&lt;/p&gt;
</comment>
                            <comment id="50617" author="mhanafi" created="Wed, 16 Jan 2013 20:22:43 +0000"  >&lt;p&gt;Program received signal SIGSEGV, Segmentation fault.&lt;br/&gt;
check_symlink (ctx=0x645060, pctx=0x0, ino=8503956, inode=0x7fffffffe180, buf=&amp;lt;value optimized out&amp;gt;) at pass1.c:190&lt;br/&gt;
190                     if (ext2fs_extent_open2(ctx-&amp;gt;fs, pctx-&amp;gt;ino, inode, &amp;amp;handle))&lt;br/&gt;
Missing separate debuginfos, use: debuginfo-install db4-4.7.25-16.el6.x86_64 glibc-2.12-1.47.el6.x86_64 libblkid-2.17.2-12.4.el6.x86_64 libuuid-2.17.2-12.4.el6.x86_64&lt;br/&gt;
(gdb) bt&lt;br/&gt;
#0  check_symlink (ctx=0x645060, pctx=0x0, ino=8503956, inode=0x7fffffffe180, buf=&amp;lt;value optimized out&amp;gt;) at pass1.c:190&lt;br/&gt;
#1  0x0000000000419b13 in e2fsck_process_bad_inode (ctx=0x645060, dir=&amp;lt;value optimized out&amp;gt;, ino=8503956, buf=0x66eda0 &quot;&quot;) at pass2.c:1402&lt;br/&gt;
#2  0x0000000000419cfb in check_dir_block (fs=0x645510, db=0x7fffe11d2a68, priv_data=0x7fffffffe570) at pass2.c:1044&lt;br/&gt;
#3  0x00007fffed8c9447 in ext2fs_dblist_iterate2 (dblist=0x649100, func=0x419b70 &amp;lt;check_dir_block&amp;gt;, priv_data=0x7fffffffe570) at dblist.c:239&lt;br/&gt;
#4  0x0000000000418ceb in e2fsck_pass2 (ctx=0x645060) at pass2.c:148&lt;br/&gt;
#5  0x000000000040eb6f in e2fsck_run (ctx=0x645060) at e2fsck.c:226&lt;br/&gt;
#6  0x000000000040cbf2 in main (argc=&amp;lt;value optimized out&amp;gt;, argv=&amp;lt;value optimized out&amp;gt;) at unix.c:1852&lt;/p&gt;
</comment>
                            <comment id="50623" author="mhanafi" created="Wed, 16 Jan 2013 21:33:54 +0000"  >&lt;p&gt;dd is going to take 20 hours! I have created a snapshot of the volume so we can run the fsck on it.&lt;/p&gt;
</comment>
                            <comment id="50625" author="bobijam" created="Wed, 16 Jan 2013 22:49:30 +0000"  >&lt;p&gt;the latest e2fsck has a glitch, and I uploaded a patch for it (&lt;a href=&quot;http://review.whamcloud.com/5045&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5045&lt;/a&gt;)&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedHeader panelHeader&quot; style=&quot;border-bottom-width: 1px;&quot;&gt;&lt;b&gt;commit message&lt;/b&gt;&lt;/div&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;
LU-2627 e2fsck: check_symlink() SIGSEGV

Since e2fsck_pass1_check_symlink()-&amp;gt;
check_symlink(ctx, NULL, ino, inode, buf), we should use &apos;ino&apos; instead
of &apos;pctx-&amp;gt;ino&apos; in check_symlink().
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;this is just for the SIGSEGV issue.&lt;/p&gt;</comment>
                            <comment id="50626" author="bobijam" created="Wed, 16 Jan 2013 23:30:10 +0000"  >&lt;p&gt;As Andreas suggested, please run the patched e2fsck with &quot;-n&quot; under gdb and paste the resulting stack trace so that we can diagnose what the problem could be.&lt;/p&gt;

&lt;p&gt;Running e2fsck with &apos;-n&apos; won&apos;t change the disk device.&lt;/p&gt;</comment>
                            <comment id="50635" author="mhanafi" created="Thu, 17 Jan 2013 01:55:58 +0000"  >&lt;p&gt;Looks like the patch got us past the SIGSEGV. But looks like the fsck will remove all symlinks!&lt;/p&gt;

&lt;p&gt;It is calling out what looks like all symlinks as invalid.&lt;/p&gt;
</comment>
                            <comment id="50666" author="cliffw" created="Thu, 17 Jan 2013 10:27:28 +0000"  >&lt;p&gt;This is why we urge a backup before you fsck -y. Will the snapshot allow you to restore the symlinks?&lt;/p&gt;</comment>
                            <comment id="50672" author="mhanafi" created="Thu, 17 Jan 2013 10:58:27 +0000"  >&lt;p&gt;It is not clear to me why it is removing all the symlinks. Is it because of the extent option? How would we restore the symlinks from the dd backup?&lt;/p&gt;</comment>
                            <comment id="50677" author="mhanafi" created="Thu, 17 Jan 2013 11:31:56 +0000"  >&lt;p&gt;here is the summary of the test fsck.&lt;/p&gt;

&lt;p&gt;nbp1-MDT0000: ********** WARNING: Filesystem still has errors **********&lt;/p&gt;


&lt;p&gt;    63829418 inodes used (23.78%, out of 268435456)&lt;br/&gt;
        3242 non-contiguous files (0.0%)&lt;br/&gt;
       35652 non-contiguous directories (0.1%)&lt;/p&gt;
&lt;p&gt;# of inodes with ind/dind/tind blocks: 0/0/0&lt;br/&gt;
             Extent depth histogram: 63096846/16544/13&lt;br/&gt;
    43499835 blocks used (16.20%, out of 268435456)&lt;br/&gt;
           0 bad blocks&lt;br/&gt;
       21886 large files&lt;/p&gt;


&lt;p&gt;    62047854 regular files&lt;br/&gt;
      773172 directories&lt;br/&gt;
           0 character device files&lt;br/&gt;
           0 block device files&lt;br/&gt;
          20 fifos&lt;br/&gt;
        6652 links&lt;br/&gt;
      612271 symbolic links (349798 fast symbolic links)&lt;br/&gt;
          65 sockets&lt;br/&gt;
------------&lt;br/&gt;
    63836054 files&lt;/p&gt;

&lt;p&gt;I can upload the full output.&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@pladmin4:~/mhanafi&amp;#93;&lt;/span&gt;$ grep invalid fck.out | wc -l&lt;br/&gt;
396020&lt;/p&gt;

</comment>
                            <comment id="50682" author="cliffw" created="Thu, 17 Jan 2013 11:55:08 +0000"  >&lt;p&gt;We are not certain that the symlinks would be deleted; in a case such as this it is always desirable to have a backup, if possible.&lt;/p&gt;</comment>
                            <comment id="50692" author="bobijam" created="Thu, 17 Jan 2013 12:46:43 +0000"  >&lt;p&gt;please compress and upload fck.out.&lt;/p&gt;

&lt;p&gt;I want to check whether those invalid symlink files are the long symlinks that are missing the NUL terminator. Something like:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedHeader panelHeader&quot; style=&quot;border-bottom-width: 1px;&quot;&gt;&lt;b&gt;an example&lt;/b&gt;&lt;/div&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Pass 1: Checking inodes, blocks, and sizes
Inode 121351 symlink missing NUL terminator.  Fix? no
...
...
Pass 2: Checking directory structure
Symlink /path/to/long/symlink/file (inode #121351) is invalid.		
Clear? no
...
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;If it&apos;s this case, latest e2fsck should be capable of fixing them. (like &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1540&quot; title=&quot;e2fsck remove too many symlinks&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1540&quot;&gt;&lt;del&gt;LU-1540&lt;/del&gt;&lt;/a&gt; indicates)&lt;/p&gt;</comment>
                            <comment id="50695" author="mhanafi" created="Thu, 17 Jan 2013 12:59:25 +0000"  >&lt;p&gt;file is uploaded&lt;/p&gt;</comment>
                            <comment id="50705" author="adilger" created="Thu, 17 Jan 2013 13:49:31 +0000"  >&lt;p&gt;Filed &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2634&quot; title=&quot;short symlinks on MDT with &amp;quot;extents&amp;quot; have EXT4_EXTENTS_FL set&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2634&quot;&gt;&lt;del&gt;LU-2634&lt;/del&gt;&lt;/a&gt; for tracking issue with EXT4_EXTENTS_FL set on symlinks for MDT with &quot;extents&quot; feature enabled.&lt;/p&gt;</comment>
                            <comment id="50713" author="adilger" created="Thu, 17 Jan 2013 14:23:00 +0000"  >&lt;p&gt;Bobijam, I think that the problem is with e2fsck rejecting short symlinks with the EXT4_EXTENTS_FL set.  The &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1540&quot; title=&quot;e2fsck remove too many symlinks&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1540&quot;&gt;&lt;del&gt;LU-1540&lt;/del&gt;&lt;/a&gt; NUL termination problem appears that it would be fixed correctly with the current e2fsck.  This EXT4_EXTENTS_FL appears to be a bug in the osd-ldiskfs code, if &quot;extents&quot; is enabled, for which I&apos;ve filed &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2634&quot; title=&quot;short symlinks on MDT with &amp;quot;extents&amp;quot; have EXT4_EXTENTS_FL set&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2634&quot;&gt;&lt;del&gt;LU-2634&lt;/del&gt;&lt;/a&gt;.  Since we never format the MDT with &quot;extents&quot;, we have never seen such a problem in our testing.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Inode 9482890 symlink missing NUL terminator.  Fix? no
Inode 9482897 symlink missing NUL terminator.  Fix? no
Fast symlink 9482914 has EXTENT_FL set.  Clear? no
Fast symlink 9482917 has EXTENT_FL set.  Clear? no
Fast symlink 9482921 has EXTENT_FL set.  Clear? no
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;It makes sense to change e2fsck to accept such inodes and just clear the EXT4_EXTENTS_FL instead of considering it corrupted.  That will allow recovering the filesystem without the need to restore the symlinks (which would just get EXT4_EXTENTS_FL set again, until &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2634&quot; title=&quot;short symlinks on MDT with &amp;quot;extents&amp;quot; have EXT4_EXTENTS_FL set&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2634&quot;&gt;&lt;del&gt;LU-2634&lt;/del&gt;&lt;/a&gt; is fixed).&lt;/p&gt;</comment>
                            <comment id="50714" author="mhanafi" created="Thu, 17 Jan 2013 14:26:32 +0000"  >&lt;p&gt;This was a 1.8.x filesystem that was upgraded. So I think the extent option is leftover from the 1.8.x format.&lt;/p&gt;</comment>
                            <comment id="50733" author="adilger" created="Thu, 17 Jan 2013 18:00:03 +0000"  >&lt;p&gt;Looking at the e2fsck code, it appears that it will correctly remove just the EXTENT_FL flag, rather than clear the whole inode:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;                &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (extent_fs &amp;amp;&amp;amp; (inode-&amp;gt;i_flags &amp;amp; EXT4_EXTENTS_FL) &amp;amp;&amp;amp;
                    LINUX_S_ISLNK(inode-&amp;gt;i_mode) &amp;amp;&amp;amp;
                    !ext2fs_inode_has_valid_blocks2(fs, inode) &amp;amp;&amp;amp;
                    fix_problem(ctx, PR_1_FAST_SYMLINK_EXTENT_FL, &amp;amp;pctx)) {
                        inode-&amp;gt;i_flags &amp;amp;= ~EXT4_EXTENTS_FL;
                        e2fsck_write_inode(ctx, ino, inode, &lt;span class=&quot;code-quote&quot;&gt;&quot;pass1&quot;&lt;/span&gt;);
                }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;so the only confusion is that the PR_1_FAST_SYMLINK_EXTENT_FL problem code is asking &quot;Clear&quot;, which might be confusing to some (including myself) as asking whether the &lt;em&gt;inode&lt;/em&gt; should be cleared instead of the &lt;em&gt;flag&lt;/em&gt; being cleared.  I will submit a patch to fix this.&lt;/p&gt;

&lt;p&gt;The later errors:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Symlink /ROOT/pheimbac/ecco/2013-01-seaice-adjoint/MITgcm_latest/mysetups/arctic210x192x50/build_forw/timeave_cumulate.F (inode #68169598) is invalid.
Clear? no
Symlink /ROOT/pheimbac/ecco/2013-01-seaice-adjoint/MITgcm_latest/mysetups/arctic210x192x50/build_forw/cal_compdates.F (inode #68169136) is invalid.
Clear? no
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;should not be hit if the earlier checks to clear EXT4_EXTENTS_FL had been allowed to clear this flag from the short symlinks.&lt;/p&gt;

&lt;p&gt;There are some further errors, much later in the log.  There are ~20 of the following errors in Pass 2:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Pass 2: Checking directory structure
Second entry &apos;IE_t040101_000000.log&apos; (inode=18364943) in directory inode 1837308
5 should be &apos;..&apos;
Fix? no
Entry &apos;..&apos; in /ROOT/xjia/Saturn/run_IdealizedSW_notilt_1e275_newgrid2_highorder/
RESULTS/run_all/IE (18373085) is duplicate &apos;..&apos; entry.
Fix? no
Entry &apos;..&apos; in /ROOT/xjia/Saturn/run_IdealizedSW_notilt_1e275_newgrid2_highorder/
RESULTS/run_all/IE (18373085) is duplicate &apos;..&apos; entry.
Fix? no
Entry &apos;..&apos; in /ROOT/xjia/Saturn/run_IdealizedSW_notilt_1e275_newgrid2_highorder/
RESULTS/run_all/IE (18373085) is a link to directory /ROOT/xjia/Saturn/run_Ideal
izedSW_notilt_1e275_newgrid2_highorder/RESULTS/run_all (13221653).
Clear? no
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;that appear a bit unusual, but are not fatally broken.  There are ~20 matching errors for the unfixed &quot;..&quot; entries later in Pass 3:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Pass 3: Checking directory connectivity
&apos;..&apos; in /ROOT/xjia/Saturn/run_IdealizedSW_notilt_1e275_newgrid2_highorder/RESULTS/run_all/IE (18373085) is &amp;lt;The NULL inode&amp;gt; (0), should be /ROOT/xjia/Saturn/run_IdealizedSW_notilt_1e275_newgrid2_highorder/RESULTS/run_all (13221653).
Fix? no
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;and a few minor errors in Pass 3A:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Pass 3A: Optimizing directories
Duplicate entry &apos;c_t_f.x&apos; in /ROOT/aiannett/NCC/Testing/Back-Face-Step (77623393) found.  Clear? no
Entry &apos;c_t_f.x&apos; in /ROOT/aiannett/NCC/Testing/Back-Face-Step (77623393) has a non-unique filename.
Rename to c_t_f.~0? no
Duplicate entry &apos;b1b2b3.x&apos; in /ROOT/aiannett/NCC/Testing/Back-Face-Step (77623393) found.  Clear? no
Entry &apos;b1b2b3.x&apos; in /ROOT/aiannett/NCC/Testing/Back-Face-Step (77623393) has a non-unique filename.
Rename to b1b2b3~0? no
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;It appears that the entries that would be &quot;fixed&quot; in Pass 2 will likely appear in lost+found once they are fixed, and if you want to recover those files you could mount the MDT locally with &lt;tt&gt;mount -t ldiskfs&lt;/tt&gt; and rename them from &lt;tt&gt;.../lost+found/#inode&lt;/tt&gt; to the path given for each inode number.&lt;/p&gt;

&lt;p&gt;I think you could go ahead with running &lt;tt&gt;e2fsck -fy&lt;/tt&gt; on the snapshot, mount the snapshot MDT filesystem locally as ldiskfs to verify a handful of the symlinks are still intact, and check lost+found for the ~20 or so inodes that would need to be fixed (you could even write a short script to rename them if downtime is critical).  If that works OK, then when you take the real MDT filesystem offline for repair, please make another snapshot at that time, run the &lt;tt&gt;e2fsck -fy&lt;/tt&gt; on the real MDT, mount as ldiskfs and repair the files in lost+found before unmounting and remounting it again as lustre.&lt;/p&gt;

&lt;p&gt;In order to get the number of messages in the e2fsck log to a manageable number, I filtered out all of the duplicate messages:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;egrep -v &quot;^$|^Fast symlink .* EXTENT_FL|^Inode .* missing NUL terminator|^Clear&quot; e2fsck.log &amp;gt; e2fsck-filtered.log
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I had also filtered out &quot;&lt;tt&gt;^Symlink.*is invalid&lt;/tt&gt;&quot; messages, but I don&apos;t think you should hit them during the repairing e2fsck run.&lt;/p&gt;</comment>
                            <comment id="50738" author="adilger" created="Thu, 17 Jan 2013 18:35:34 +0000"  >&lt;p&gt;I also see in your MDT feature list that there is the &quot;dirdata&quot; feature enabled, but this is definitely NOT a feature that would have been enabled with a filesystem formatted with 1.8.  Also, the &quot;..&quot; corruption is definitely not random.&lt;/p&gt;

&lt;p&gt;Did you perhaps run the Xyratex &quot;upgrade&quot; tool on the MDT filesystem?&lt;/p&gt;

&lt;p&gt;I believe that this would be the root cause of the &quot;..&quot; corruption.  My understanding is that it was deleting the &quot;..&quot; entry to add the FID, and then re-inserting it into the directory, but ext4/e2fsck require that the &quot;..&quot; entry immediately follow the &quot;.&quot; entry at the start.&lt;/p&gt;</comment>
                            <comment id="50739" author="mhanafi" created="Thu, 17 Jan 2013 18:48:20 +0000"  >&lt;p&gt;We did not use the xyratex upgrade tool. But we added that dirdata option at some point. Should we remove that option?&lt;/p&gt;</comment>
                            <comment id="50745" author="mhanafi" created="Thu, 17 Jan 2013 20:00:54 +0000"  >&lt;p&gt;Uploading the output of the fsck run on the snapshot. Please review before we run on the real mdt device.&lt;/p&gt;</comment>
                            <comment id="50761" author="adilger" created="Thu, 17 Jan 2013 22:31:28 +0000"  >&lt;p&gt;Looking at the test e2fsck log, one new directory is getting yet a different error related to the &quot;.&quot; entry:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Directory entry for &apos;.&apos; in /ROOT/msekula/fun/camrad (13208388) is big.
Split? yes
Missing &apos;..&apos; in directory inode 13208388.
Fix? yes
Setting filetype for entry &apos;..&apos; in /ROOT/msekula/fun/camrad (13208388) to 2.
Entry &apos;..&apos; in /ROOT/msekula/fun/camrad (13208388) is duplicate &apos;..&apos; entry.
Fix? yes
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I suspect that there is some code in e2fsck or in ldiskfs that is not handling the dirdata field correctly.  It likely relates to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2638&quot; title=&quot;corruption of MDT &amp;quot;..&amp;quot; entry in some ldiskfs directories&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2638&quot;&gt;&lt;del&gt;LU-2638&lt;/del&gt;&lt;/a&gt;.  There are several files moved to &lt;tt&gt;lost+found&lt;/tt&gt; as I suspected, but it looks like the majority of symlinks are fine.&lt;/p&gt;

&lt;p&gt;It doesn&apos;t seem that a large number of directories will be repaired, so I think it makes sense to go ahead and fix the real MDT at this point.  The only other thing you might want to check before doing the final is if you run &quot;&lt;tt&gt;e2fsck -fy&lt;/tt&gt;&quot; on the snapshot a second time that it passes cleanly without any repairs.  About 30 directories will be moved to lost+found, but they can be moved back to their correct location, and nothing should be lost.&lt;/p&gt;

&lt;p&gt;The next question is to figure out what has caused this problem.  When did you upgrade to 2.1?  Were these directories existing before the upgrade from 1.8, or were they created afterward?  How large are the directories (number of entries = &quot;find ${directory} -print | wc -l&quot;, size of directory = &quot;ls -ld ${directory}&quot;)?  Do you know if the directories were renamed after they were created?  How long has it been since you last ran e2fsck?  Have you run it since the upgrade?&lt;/p&gt;</comment>
                            <comment id="50762" author="mhanafi" created="Thu, 17 Jan 2013 23:49:08 +0000"  >&lt;p&gt;It has been a very long time since we have run e2fsck and that was during the 1.8.x code. We have never run e2fsck since moving to 2.1. &lt;/p&gt;

&lt;p&gt;Should we remove the dirdata options? &lt;/p&gt;

&lt;p&gt;I will check the date and size of the directories. We may want to just archive these and restore them after the fsck or tar/delete/untar them.&lt;/p&gt;
</comment>
                            <comment id="50766" author="adilger" created="Fri, 18 Jan 2013 00:42:36 +0000"  >&lt;p&gt;The &quot;dirdata&quot; option is enabled by default for 2.x filesystems, but I don&apos;t think it is necessarily advisable to disable it at this time.  It does appear at first glance that running e2fsck after removing the dirdata feature would handle this correctly and clear the extra dirdata flag in each dirent, but we haven&apos;t tested this at all, and it would also cause the MDS to become considerably slower.&lt;/p&gt;

&lt;p&gt;So far I don&apos;t see any indication besides the mixup with &quot;..&quot; entries that there is anything seriously wrong with these directories.  The bytes at the start of the directory are used for &quot;.&quot;, &quot;..&quot;, and the htree index on directories over 4kB in size, and not user data.   e2fsck should regenerate all of the needed information from redundant information elsewhere, except being able to move the entry from lost+found back to the proper place in the tree.&lt;/p&gt;</comment>
                            <comment id="50910" author="cliffw" created="Mon, 21 Jan 2013 12:55:54 +0000"  >&lt;p&gt;What is your current state? What help can we give you?&lt;/p&gt;</comment>
                            <comment id="50916" author="mhanafi" created="Mon, 21 Jan 2013 13:44:40 +0000"  >&lt;p&gt;At this point we have been able to run fsck on the mdt and have recovered from the errors.&lt;/p&gt;
</comment>
                            <comment id="50917" author="cliffw" created="Mon, 21 Jan 2013 13:49:29 +0000"  >&lt;p&gt;Is the issue closed, or is there some other help we can give you?&lt;/p&gt;</comment>
                            <comment id="52062" author="mhanafi" created="Fri, 8 Feb 2013 14:47:42 +0000"  >&lt;p&gt;We seem to have hit this issue again on the same filesystem.&lt;/p&gt;

&lt;p&gt;pfe1 ~ # ls -l /nobackupp1/xmeng/run_sc_anisopi/run06_dipole_semiimpl_nohyp_taug&lt;br/&gt;
r_60000ss/SC&lt;br/&gt;
ls: reading directory /nobackupp1/xmeng/run_sc_anisopi/run06_dipole_semiimpl_noh&lt;br/&gt;
yp_taugr_60000ss/SC: Input/output error&lt;br/&gt;
total 0&lt;/p&gt;

&lt;p&gt;from the mdt&lt;br/&gt;
Feb  8 06:50:58 nbp1-mds kernel: LDISKFS-fs warning (device dm-4): dx_probe: Unrecognised inode hash code 18 for directory #17309149&lt;br/&gt;
Feb  8 06:50:58 nbp1-mds kernel: LDISKFS-fs warning (device dm-4): dx_probe: Corrupt dir inode 17309149, running e2fsck is recommended.&lt;br/&gt;
Feb  8 06:51:57 nbp1-mds kernel: LDISKFS-fs warning (device dm-4): dx_probe: Unrecognised inode hash code 8 for directory #17309159&lt;br/&gt;
Feb  8 06:51:57 nbp1-mds kernel: LDISKFS-fs warning (device dm-4): dx_probe: Corrupt dir inode 17309159, running e2fsck is recommended.&lt;br/&gt;
Feb  8 08:35:12 nbp1-mds kernel: LDISKFS-fs warning (device dm-4): dx_probe: Unrecognised inode hash code 15 for directory #130557236&lt;br/&gt;
Feb  8 08:35:12 nbp1-mds kernel: LDISKFS-fs warning (device dm-4): dx_probe: Corrupt dir inode 130557236, running e2fsck is recommended.&lt;br/&gt;
Feb  8 11:45:38 nbp1-mds kernel: LDISKFS-fs warning (device dm-4): dx_probe: Unrecognised inode hash code 3 for directory #157287952&lt;br/&gt;
Feb  8 11:45:39 nbp1-mds kernel: LDISKFS-fs warning (device dm-4): dx_probe: Corrupt dir inode 157287952, running e2fsck is recommended.&lt;br/&gt;
Feb  8 11:46:07 nbp1-mds kernel: LDISKFS-fs warning (device dm-4): dx_probe: Unrecognised inode hash code 4 for directory #157331367&lt;br/&gt;
Feb  8 11:46:07 nbp1-mds kernel: LDISKFS-fs warning (device dm-4): dx_probe: Corrupt dir inode 157331367, running e2fsck is recommended.&lt;/p&gt;</comment>
                            <comment id="52073" author="adilger" created="Fri, 8 Feb 2013 17:16:05 +0000"  >&lt;p&gt;This problem will persist for large 1.8 directories that are renamed until a version of the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2638&quot; title=&quot;corruption of MDT &amp;quot;..&amp;quot; entry in some ldiskfs directories&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2638&quot;&gt;&lt;del&gt;LU-2638&lt;/del&gt;&lt;/a&gt; patch &lt;a href=&quot;http://review.whamcloud.com/5179&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5179&lt;/a&gt; is applied.  For the short term, until this patch is applied, it is possible to disable the dirdata feature on the unmounted MDT filesystem:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;tune2fs -O ^dirdata /dev/mdtdev
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;though this will have some negative performance impact for all newly-created files when doing name lookups and &quot;ls -l&quot;.&lt;/p&gt;</comment>
                            <comment id="52076" author="mhanafi" created="Fri, 8 Feb 2013 17:35:30 +0000"  >&lt;p&gt;uploading fsck output for review before we run it for real.&lt;/p&gt;
</comment>
                            <comment id="52213" author="johann" created="Tue, 12 Feb 2013 10:46:07 +0000"  >&lt;p&gt;There is nothing new in the fsck output compared to last time. I think you should go ahead and run fsck.&lt;/p&gt;</comment>
                            <comment id="54605" author="pjones" created="Thu, 21 Mar 2013 20:23:50 +0000"  >&lt;p&gt;As per NASA ok to close ticket&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="19583">LU-3519</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="17208">LU-2634</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="12248" name="fsck.2.8.2012.nbp1.out.gz" size="1711937" author="mhanafi" created="Fri, 8 Feb 2013 17:36:33 +0000"/>
                            <attachment id="12175" name="mdtsnap.fsck.out.gz" size="1151247" author="mhanafi" created="Thu, 17 Jan 2013 20:01:37 +0000"/>
                            <attachment id="12170" name="nbp1FSCK.out.gz" size="4783823" author="mhanafi" created="Thu, 17 Jan 2013 12:58:14 +0000"/>
                    </attachments>
                <subtasks>
                            <subtask id="17215">LU-2638</subtask>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvfgf:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>6149</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>