<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:45:09 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-11584] kernel BUG at ldiskfs.h:1907!</title>
                <link>https://jira.whamcloud.com/browse/LU-11584</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;server keeps crashing with the following error. &lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
[  981.957669] Lustre: nbp13-OST0008: trigger OI scrub by RPC &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; the [0x100080000:0x217edd:0x0] with flags 0x4a, rc = 0
[  981.989579] Lustre: Skipped 11 previous similar messages
[ 1045.404615] ------------[ cut here ]------------
[ 1045.418484] kernel BUG at /tmp/rpmbuild-lustre-jlan-ItUrr9b3/BUILD/lustre-2.10.5/ldiskfs/ldiskfs.h:1907!
[ 1045.446989] invalid opcode: 0000 [#1] SMP 
[ 1045.459302] Modules linked in: ofd(OE) ost(OE) osp(OE) mdd(OE) lod(OE) mdt(OE) lfsck(OE) mgs(OE) mgc(OE) osd_ldiskfs(OE) ldiskfs(OE) lquota(OE) lustre(OE) lmv(OE) mdc(OE) lov(OE) fid(OE) fld(OE) dm_service_time ko2iblnd(OE) ptlrpc(OE) obdclass(OE) lnet(OE) libcfs(OE) lpfc ib_iser(OE) libiscsi scsi_transport_iscsi crct10dif_generic scsi_transport_fc scsi_tgt rdma_ucm(OE) rdma_cm(OE) iw_cm(OE) bonding ib_ipoib(OE) ib_cm(OE) ib_uverbs(OE) ib_umad(OE) sunrpc dm_mirror dm_region_hash dm_log mlx5_ib(OE) ib_core(OE) intel_powerclamp coretemp intel_rapl iosf_mbi kvm_intel kvm irqbypass crct10dif_pclmul crc32_pclmul mgag200 ghash_clmulni_intel i2c_algo_bit ttm dm_multipath aesni_intel drm_kms_helper lrw syscopyarea gf128mul sysfillrect sysimgblt glue_helper fb_sys_fops ablk_helper mlx5_core(OE) mlxfw(OE) tg3 ses cryptd mlx_compat(OE) drm ptp ipmi_si enclosure mei_me i2c_core pps_core hpwdt hpilo ipmi_devintf lpc_ich dm_mod mfd_core mei shpchp pcspkr wmi ipmi_msghandler acpi_power_meter binfmt_misc tcp_bic ip_tables virtio_scsi virtio_ring virtio xfs libcrc32c ext4 mbcache jbd2 sd_mod crc_t10dif crct10dif_common sg usb_storage smartpqi(E) crc32c_intel scsi_transport_sas [last unloaded: pps_core]
[ 1045.776428] CPU: 5 PID: 11348 Comm: lfsck Tainted: G           OE  ------------   3.10.0-693.21.1.el7.20180508.x86_64.lustre2105 #1
[ 1045.811992] Hardware name: HPE ProLiant DL380 Gen10/ProLiant DL380 Gen10, BIOS U30 06/15/2018
[ 1045.837624] task: ffff882ddca23f40 ti: ffff882bd280c000 task.ti: ffff882bd280c000
[ 1045.860117] RIP: 0010:[&amp;lt;ffffffffa10fbd04&amp;gt;]  [&amp;lt;ffffffffa10fbd04&amp;gt;] ldiskfs_rec_len_to_disk.part.9+0x4/0x10 [ldiskfs]
[ 1045.891259] RSP: 0018:ffff882bd280f980  EFLAGS: 00010207
[ 1045.907218] RAX: 0000000000000000 RBX: ffff882bd280fb58 RCX: ffff882bd280f994
[ 1045.928666] RDX: 00000000ffffffac RSI: ffffffffffffff81 RDI: 00000000ffffff81
[ 1045.950113] RBP: ffff882bd280f980 R08: 00000000ffffff81 R09: ffffffffa10fded0
[ 1045.971560] R10: ffff88303f803b00 R11: 0000000000ffffff R12: 000000000000003c
[ 1045.993006] R13: ffff881e2eae7708 R14: ffff881e2eae7690 R15: 0000000000000000
[ 1046.014452] FS:  0000000000000000(0000) GS:ffff882f7ef40000(0000) knlGS:0000000000000000
[ 1046.038775] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1046.056039] CR2: 00007ffff20df034 CR3: 0000002ef4268000 CR4: 00000000003607e0
[ 1046.077485] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 1046.098932] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 1046.120378] Call Trace:
[ 1046.127717]  [&amp;lt;ffffffffa10fe245&amp;gt;] htree_inlinedir_to_tree+0x445/0x450 [ldiskfs]
[ 1046.149690]  [&amp;lt;ffffffff8123002e&amp;gt;] ? __generic_file_splice_read+0x4ee/0x5e0
[ 1046.170356]  [&amp;lt;ffffffff81234cdd&amp;gt;] ? __getblk+0x2d/0x2e0
[ 1046.186052]  [&amp;lt;ffffffff81234c4c&amp;gt;] ? __find_get_block+0xbc/0x120
[ 1046.203841]  [&amp;lt;ffffffff81234cdd&amp;gt;] ? __getblk+0x2d/0x2e0
[ 1046.219541]  [&amp;lt;ffffffffa10cdfa0&amp;gt;] ? __ldiskfs_get_inode_loc+0x110/0x3e0 [ldiskfs]
[ 1046.242039]  [&amp;lt;ffffffffa10c89ef&amp;gt;] ? ldiskfs_xattr_find_entry+0x9f/0x130 [ldiskfs]
[ 1046.264536]  [&amp;lt;ffffffffa10c0277&amp;gt;] ldiskfs_htree_fill_tree+0x137/0x2f0 [ldiskfs]
[ 1046.286507]  [&amp;lt;ffffffff811df826&amp;gt;] ? kmem_cache_alloc_trace+0x1d6/0x200
[ 1046.306126]  [&amp;lt;ffffffffa10ae5ec&amp;gt;] ldiskfs_readdir+0x61c/0x850 [ldiskfs]
[ 1046.326012]  [&amp;lt;ffffffffa1147640&amp;gt;] ? osd_declare_ref_del+0x130/0x130 [osd_ldiskfs]
[ 1046.348507]  [&amp;lt;ffffffff812256b2&amp;gt;] ? generic_getxattr+0x52/0x70
[ 1046.366036]  [&amp;lt;ffffffffa1145cde&amp;gt;] osd_ldiskfs_it_fill+0xbe/0x260 [osd_ldiskfs]
[ 1046.387747]  [&amp;lt;ffffffffa1145eb7&amp;gt;] osd_it_ea_load+0x37/0x100 [osd_ldiskfs]
[ 1046.408158]  [&amp;lt;ffffffffa122808c&amp;gt;] lfsck_open_dir+0x11c/0x3a0 [lfsck]
[ 1046.427257]  [&amp;lt;ffffffffa1228cb2&amp;gt;] lfsck_master_oit_engine+0x9a2/0x1190 [lfsck]
[ 1046.448969]  [&amp;lt;ffffffff816946f7&amp;gt;] ? __schedule+0x477/0xa30
[ 1046.465453]  [&amp;lt;ffffffffa1229d96&amp;gt;] lfsck_master_engine+0x8f6/0x1360 [lfsck]
[ 1046.486120]  [&amp;lt;ffffffff810c4d40&amp;gt;] ? wake_up_state+0x20/0x20
[ 1046.502865]  [&amp;lt;ffffffffa12294a0&amp;gt;] ? lfsck_master_oit_engine+0x1190/0x1190 [lfsck]
[ 1046.525360]  [&amp;lt;ffffffff810b1131&amp;gt;] kthread+0xd1/0xe0
[ 1046.540011]  [&amp;lt;ffffffff810b1060&amp;gt;] ? insert_kthread_work+0x40/0x40
[ 1046.558323]  [&amp;lt;ffffffff816a14dd&amp;gt;] ret_from_fork+0x5d/0xb0
[ 1046.574540]  [&amp;lt;ffffffff810b1060&amp;gt;] ? insert_kthread_work+0x40/0x40
[ 1046.592852] Code: 44 04 02 48 8d 44 03 c8 48 01 c7 e8 b7 f6 22 e0 48 83 c4 10 5b 41 5c 41 5d 41 5e 41 5f 5d c3 0f 0b 0f 0b 0f 1f 40 00 55 48 89 e5 &amp;lt;0f&amp;gt; 0b 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 55 85 f6 48 
[ 1046.650192] RIP  [&amp;lt;ffffffffa10fbd04&amp;gt;] ldiskfs_rec_len_to_disk.part.9+0x4/0x10 [ldiskfs]

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="53850">LU-11584</key>
            <summary>kernel BUG at ldiskfs.h:1907!</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="bzzz">Alex Zhuravlev</assignee>
                                    <reporter username="mhanafi">Mahmoud Hanafi</reporter>
                        <labels>
                    </labels>
                <created>Mon, 29 Oct 2018 22:53:31 +0000</created>
                <updated>Sat, 8 Feb 2020 13:21:04 +0000</updated>
                            <resolved>Mon, 25 Nov 2019 20:20:55 +0000</resolved>
                                    <version>Lustre 2.10.5</version>
                                    <fixVersion>Lustre 2.13.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>8</watches>
                                                                            <comments>
                            <comment id="235842" author="mhanafi" created="Mon, 29 Oct 2018 23:24:08 +0000"  >&lt;p&gt;some logs before crash&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
[  373.561429] Lustre: 9389:0:(osd_handler.c:7051:osd_mount()) MGS-osd: device /dev/mapper/nbp13_1-MGS0 was upgraded from Lustre-1.x without enabling the dirdata feature. If you &lt;span class=&quot;code-keyword&quot;&gt;do&lt;/span&gt; not want to downgrade to Lustre-1.x again, you can enable it via &lt;span class=&quot;code-quote&quot;&gt;&apos;tune2fs -O dirdata device&apos;&lt;/span&gt;
[  374.897846] Lustre: 9489:0:(osd_handler.c:371:osd_get_lma()) dm-1: unsupported incompat LMA feature(s) 0xffffffe1 &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; fid = [0x0:0x20af:0x2], ino = 153397641
[  401.375821] Lustre: nbp13-OST0004: Will be in recovery &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; at least 5:00, or until 25 clients reconnect
[  473.539046] Lustre: nbp13-MDT0000: Will be in recovery &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; at least 5:00, or until 24 clients reconnect
[  473.567385] Lustre: Skipped 3 previous similar messages
[  478.625631] Lustre: nbp13-OST0005: Will be in recovery &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; at least 5:00, or until 25 clients reconnect
[  519.958976] LNet: 4020:0:(o2iblnd_cb.c:3192:kiblnd_check_conns()) Timed out tx &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 10.151.26.154@o2ib: 96 seconds
[  519.989838] LNet: 4020:0:(o2iblnd_cb.c:3192:kiblnd_check_conns()) Skipped 5 previous similar messages
[  530.053761] Lustre: 7860:0:(client.c:2114:ptlrpc_expire_one_request()) @@@ Request sent has timed out &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; slow reply: [sent 1540855068/real 1540855068]  req@ffff882da135b600 x1615703345988272/t0(0) o8-&amp;gt;nbp13-OST0004-osc-MDT0000@0@lo:28/4 lens 520/544 e 0 to 1 dl 1540855223 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1
[  667.563723] LustreError: 10029:0:(ofd_dev.c:1784:ofd_create_hdl()) nbp13-OST0008: unable to precreate: rc = -115
[  667.566809] Lustre: 10692:0:(osd_handler.c:759:osd_check_lma()) nbp13-MDT0000: unsupported incompat LMA feature(s) 0xffffffe1 &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; fid = [0x200001db5:0x19764:0x0], ino = 162675645
[  667.642235] LustreError: 9617:0:(osp_precreate.c:657:osp_precreate_send()) nbp13-OST0008-osc-MDT0000: precreate fid [0x100080000:0x217edc:0x0] &amp;lt; local used fid [0x100080000:0x217edc:0x0]: rc = -116
[  667.695067] LustreError: 9617:0:(osp_precreate.c:1289:osp_precreate_thread()) nbp13-OST0008-osc-MDT0000: cannot precreate objects: rc = -116
[  677.552789] LustreError: 10453:0:(ofd_dev.c:1784:ofd_create_hdl()) nbp13-OST0008: unable to precreate: rc = -115
[  677.583422] LustreError: 9617:0:(osp_precreate.c:657:osp_precreate_send()) nbp13-OST0008-osc-MDT0000: precreate fid [0x100080000:0x217edc:0x0] &amp;lt; local used fid [0x100080000:0x217edc:0x0]: rc = -116
[  677.636261] LustreError: 9617:0:(osp_precreate.c:1289:osp_precreate_thread()) nbp13-OST0008-osc-MDT0000: cannot precreate objects: rc = -116
[  687.545335] Lustre: nbp13-OST0008: trigger OI scrub by RPC &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; the [0x100080000:0x217edd:0x0] with flags 0x4a, rc = 0
[  687.577251] LustreError: 10029:0:(ofd_dev.c:1784:ofd_create_hdl()) nbp13-OST0008: unable to precreate: rc = -115
[  687.607875] LustreError: 9617:0:(osp_precreate.c:657:osp_precreate_send()) nbp13-OST0008-osc-MDT0000: precreate fid [0x100080000:0x217edc:0x0] &amp;lt; local used fid [0x100080000:0x217edc:0x0]: rc = -116
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="235843" author="pjones" created="Mon, 29 Oct 2018 23:51:47 +0000"  >&lt;p&gt;Dongyang is looking into this&lt;/p&gt;</comment>
                            <comment id="235844" author="mhanafi" created="Mon, 29 Oct 2018 23:52:53 +0000"  >&lt;p&gt;got the crash dump also. if they need to pull something from it&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
crash&amp;gt; bt
PID: 10665  TASK: ffff882f0e410fd0  CPU: 5   COMMAND: &lt;span class=&quot;code-quote&quot;&gt;&quot;lfsck&quot;&lt;/span&gt;
 #0 [ffff882909ccf630] machine_kexec at ffffffff8105b64b
 #1 [ffff882909ccf690] __crash_kexec at ffffffff81105342
 #2 [ffff882909ccf760] crash_kexec at ffffffff81105430
 #3 [ffff882909ccf778] oops_end at ffffffff81699778
 #4 [ffff882909ccf7a0] die at ffffffff8102e8ab
 #5 [ffff882909ccf7d0] do_trap at ffffffff81698ec0
 #6 [ffff882909ccf820] do_invalid_op at ffffffff8102b124
 #7 [ffff882909ccf8d0] invalid_op at ffffffff816a487e
    [exception RIP: ldiskfs_rec_len_to_disk+4]
    RIP: ffffffffa1167d04  RSP: ffff882909ccf980  RFLAGS: 00010207
    RAX: 0000000000000000  RBX: ffff882909ccfb58  RCX: ffff882909ccf994
    RDX: 00000000ffffffac  RSI: ffffffffffffff81  RDI: 00000000ffffff81
    RBP: ffff882909ccf980   R8: 00000000ffffff81   R9: ffffffffa1169ed0
    R10: ffff88303f803b00  R11: 0000000000ffffff  R12: 000000000000003c
    R13: ffff882387ee3388  R14: ffff882387ee3310  R15: 0000000000000000
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
 #8 [ffff882909ccf988] htree_inlinedir_to_tree at ffffffffa116a245 [ldiskfs]
 #9 [ffff882909ccfb28] ldiskfs_htree_fill_tree at ffffffffa112c277 [ldiskfs]
#10 [ffff882909ccfbf0] ldiskfs_readdir at ffffffffa111a5ec [ldiskfs]
#11 [ffff882909ccfca0] osd_ldiskfs_it_fill at ffffffffa11b1cde [osd_ldiskfs]
#12 [ffff882909ccfce8] osd_it_ea_load at ffffffffa11b1eb7 [osd_ldiskfs]
#13 [ffff882909ccfd10] lfsck_open_dir at ffffffffa123f08c [lfsck]
#14 [ffff882909ccfd50] lfsck_master_oit_engine at ffffffffa123fcb2 [lfsck]
#15 [ffff882909ccfdf0] lfsck_master_engine at ffffffffa1240d96 [lfsck]
#16 [ffff882909ccfec8] kthread at ffffffff810b1131
#17 [ffff882909ccff50] ret_from_fork at ffffffff816a14dd
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="235845" author="pjones" created="Mon, 29 Oct 2018 23:53:02 +0000"  >&lt;p&gt;Could you please supply version of Lustre details?&lt;/p&gt;</comment>
                            <comment id="235847" author="dongyang" created="Mon, 29 Oct 2018 23:57:29 +0000"  >&lt;p&gt;I can see inline_data is enabled for the OST:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
htree_inlinedir_to_tree+0x445/0x450 [ldiskfs]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;currently we don&apos;t support inline_data on the targets, and mkfs.lustre should not enabled them.&lt;/p&gt;

&lt;p&gt;How was the OST created?&lt;/p&gt;</comment>
                            <comment id="235848" author="jaylan" created="Mon, 29 Oct 2018 23:59:59 +0000"  >&lt;p&gt;I have these LU patches on top of 2.10.5:&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10055&quot; title=&quot;mdt_fill_lvbo() message spew on MDS console&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10055&quot;&gt;&lt;del&gt;LU-10055&lt;/del&gt;&lt;/a&gt; mdt: use max_mdsize in reply for layout intent&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11187&quot; title=&quot;MMP updated sometimes failes T10PI checks&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11187&quot;&gt;&lt;del&gt;LU-11187&lt;/del&gt;&lt;/a&gt; ldiskfs: don&apos;t mark mmp buffer head dirty&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9230&quot; title=&quot;soft lockup on v2.9 Lustre clients (ldlm?)&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9230&quot;&gt;&lt;del&gt;LU-9230&lt;/del&gt;&lt;/a&gt; ldlm: speed up preparation for list of lock cancel&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10830&quot; title=&quot;lfs setstripe not correctly setting umask permissions&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10830&quot;&gt;&lt;del&gt;LU-10830&lt;/del&gt;&lt;/a&gt; utils: fix create mode for lfs setstripe&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10003&quot; title=&quot;lnetctl error &amp;quot;cannot add network: invalid argument&amp;quot;&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10003&quot;&gt;LU-10003&lt;/a&gt; lnet: clarify lctl deprecation message&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10003&quot; title=&quot;lnetctl error &amp;quot;cannot add network: invalid argument&amp;quot;&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10003&quot;&gt;LU-10003&lt;/a&gt; lnet: deprecate lctl net commands&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9810&quot; title=&quot;Melanox OFED 4.1 support&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9810&quot;&gt;&lt;del&gt;LU-9810&lt;/del&gt;&lt;/a&gt; lnd: use less CQ entries for each connection&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9810&quot; title=&quot;Melanox OFED 4.1 support&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9810&quot;&gt;&lt;del&gt;LU-9810&lt;/del&gt;&lt;/a&gt; lnet: fix build with M-OFED 4.1&lt;/p&gt;
</comment>
                            <comment id="235850" author="mhanafi" created="Tue, 30 Oct 2018 00:03:20 +0000"  >&lt;p&gt;Normal ldiskfs format operation.&lt;/p&gt;

&lt;p&gt;Here is typical lustre.csv like&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
service432-ib1,&lt;span class=&quot;code-quote&quot;&gt;&quot;options lnet networks=o2ib(ib1)&quot;&lt;/span&gt;,/dev/mapper/nbp13_1-OST22,/mnt/lustre/nbp13_1-OST22,ost,nbp13,&lt;span class=&quot;code-quote&quot;&gt;&quot;10.151.26.183@o2ib:10.151.26.185@o2ib&quot;&lt;/span&gt;,22,,&lt;span class=&quot;code-quote&quot;&gt;&quot;-m 0 -i 10485760 -G 64 -t ext4 -E packed_meta_blocks=1&quot;&lt;/span&gt;,&lt;span class=&quot;code-quote&quot;&gt;&quot;acl,errors=panic,user_xattr,max_sectors_kb=0&quot;&lt;/span&gt;,10.151.26.185@o2ib:10.151.26.183@o2ib
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;


&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
nbp13_1-MGS0: Filesystem features:      has_journal ext_attr resize_inode dir_index filetype needs_recovery flex_bg sparse_super large_file huge_file uninit_bg dir_nlink quota
nbp13_1-MDT0000: Filesystem features:      has_journal ext_attr resize_inode dir_index filetype needs_recovery flex_bg dirdata inline_data sparse_super large_file huge_file uninit_bg dir_nlink extra_isize quota
nbp13_1-OST0003: Filesystem features:      has_journal ext_attr dir_index filetype needs_recovery extent 64bit flex_bg sparse_super large_file huge_file uninit_bg dir_nlink extra_isize quota
nbp13_1-OST0005: Filesystem features:      has_journal ext_attr dir_index filetype needs_recovery extent 64bit flex_bg sparse_super large_file huge_file uninit_bg dir_nlink extra_isize quota
nbp13_1-OST0006: Filesystem features:      has_journal ext_attr dir_index filetype needs_recovery extent 64bit flex_bg sparse_super large_file huge_file uninit_bg dir_nlink extra_isize quota
nbp13_1-OST0008: Filesystem features:      has_journal ext_attr dir_index filetype needs_recovery extent 64bit flex_bg sparse_super large_file huge_file uninit_bg dir_nlink extra_isize quota
nbp13_1-OST000A: Filesystem features:      has_journal ext_attr dir_index filetype needs_recovery extent 64bit flex_bg sparse_super large_file huge_file uninit_bg dir_nlink extra_isize quota
nbp13_1-OST0000: Filesystem features:      has_journal ext_attr dir_index filetype needs_recovery extent 64bit flex_bg sparse_super large_file huge_file uninit_bg dir_nlink extra_isize quota
nbp13_1-OST0001: Filesystem features:      has_journal ext_attr dir_index filetype needs_recovery extent 64bit flex_bg sparse_super large_file huge_file uninit_bg dir_nlink extra_isize quota
nbp13_1-OST0002: Filesystem features:      has_journal ext_attr dir_index filetype needs_recovery extent 64bit flex_bg sparse_super large_file huge_file uninit_bg dir_nlink extra_isize quota
nbp13_1-OST0004: Filesystem features:      has_journal ext_attr dir_index filetype needs_recovery extent 64bit flex_bg sparse_super large_file huge_file uninit_bg dir_nlink extra_isize quota
nbp13_1-OST0007: Filesystem features:      has_journal ext_attr dir_index filetype needs_recovery extent 64bit flex_bg sparse_super large_file huge_file uninit_bg dir_nlink extra_isize quota
nbp13_1-OST0009: Filesystem features:      has_journal ext_attr dir_index filetype needs_recovery extent 64bit flex_bg sparse_super large_file huge_file uninit_bg dir_nlink extra_isize quota
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="235852" author="dongyang" created="Tue, 30 Oct 2018 00:09:03 +0000"  >&lt;p&gt;Just saw your updated comment, Looks like&#160;nbp13_1-MDT0000 has inline_data enabled.&lt;/p&gt;

&lt;p&gt;If it was created with e2fsprogs-1.44.3.wc1 then mke2fs will stop and give an error saying dirdata and inline_data can not be enabled at the same time,&lt;/p&gt;

&lt;p&gt;If it was created with the earlier version of e2fsprogs it doesn&apos;t even know about inline_data feature.&lt;/p&gt;

&lt;p&gt;Was inline_data enabled by tune2fs some point after the target was created?&lt;/p&gt;</comment>
                            <comment id="235853" author="mhanafi" created="Tue, 30 Oct 2018 00:16:23 +0000"  >&lt;p&gt;FYI we had hardware issue on this filesystem on friday and we had to run fsck all targets. It had found/fix issues. This could be a side effect of that.&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/31356/31356_dumpe2fs.out&quot; title=&quot;dumpe2fs.out attached to LU-11584&quot;&gt;dumpe2fs.out&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;&lt;/p&gt;</comment>
                            <comment id="235854" author="mhanafi" created="Tue, 30 Oct 2018 00:19:07 +0000"  >&lt;p&gt;it was created with e2fsprogs-1.42.13.wc6-7.el7.x86_64. Then&#160; e2fsprogs-1.44.3.wc1 fsck was ran it this weekend.&lt;/p&gt;

&lt;p&gt;during fsck had issue with quota file so I disabled and renabled it.&lt;/p&gt;

&lt;p&gt;tune2fs -O^quota&lt;/p&gt;

&lt;p&gt;tune2fs -Oquota&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="235855" author="mhanafi" created="Tue, 30 Oct 2018 00:41:49 +0000"  >&lt;p&gt;Should I remove the inline_data feature?&lt;/p&gt;</comment>
                            <comment id="235856" author="dongyang" created="Tue, 30 Oct 2018 00:53:06 +0000"  >&lt;p&gt;Do we still have the output of the e2fsck?&lt;/p&gt;

&lt;p&gt;I think there is&#160; bug in the e2fsck,&lt;/p&gt;

&lt;p&gt;which a corrupted inode flag made e2fsck set the inline_data feature in the superblock.&lt;/p&gt;

&lt;p&gt;if that&apos;s the case then we need to clear the inline_data feature bit and rerun the e2fsck with a patch to fix the inode.&lt;/p&gt;</comment>
                            <comment id="235857" author="mhanafi" created="Tue, 30 Oct 2018 00:58:38 +0000"  >&lt;p&gt;Don&apos;t have the fsck output.&lt;/p&gt;

&lt;p&gt;i can run&lt;/p&gt;

&lt;p&gt;tune2fs -O^inline_data&lt;/p&gt;

&lt;p&gt;what do you mean &apos;e2fsck with a patch&apos;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="235859" author="adilger" created="Tue, 30 Oct 2018 01:15:43 +0000"  >&lt;p&gt;Yes, the &lt;tt&gt;inline_data&lt;/tt&gt; feature is not currently supported with Lustre.&lt;/p&gt;

&lt;p&gt;As you wrote, &quot;&lt;tt&gt;tune2fs -O ^inline_data&lt;/tt&gt;&quot; will disable the feature, but e2fsck will automatically enable the feature if it finds an inode with the &lt;tt&gt;EXT4_INLINE_DATA_FL&lt;/tt&gt; set.   If there is only a handful of inodes with this flag set, you could run &lt;tt&gt;e2fsck -f /dev/XXX&lt;/tt&gt; (note no &apos;&lt;tt&gt;y&lt;/tt&gt;&apos; option) and then when it asks to enable the &lt;tt&gt;inline_data&lt;/tt&gt; feature answer &apos;&lt;tt&gt;n&lt;/tt&gt;&apos; and &apos;&lt;tt&gt;y&lt;/tt&gt;&apos; to clearing the inode.  This would erase the whole inode, but it is also likely that these inodes just contain garbage anyway.  &lt;/p&gt;

&lt;p&gt;If these are critical files, instead of e2fsck clearing the whole inode, it is also possible to run &lt;tt&gt;e2fsck -fn /dev/XXX&lt;/tt&gt; after disabling the &lt;tt&gt;inline_data&lt;/tt&gt; feature to get a list of inodes affected by this issue, and then use &lt;tt&gt;debugfs -w /dev/XXX&lt;/tt&gt; on the unmounted filesystem, and then &lt;tt&gt;stat &amp;lt;inum&amp;gt;|/ROOT/path/to/inode&lt;/tt&gt; to print the flags on each inode and &lt;tt&gt;set_inode_field &amp;lt;inum&amp;gt;|/ROOT/path/to/inode&lt;/tt&gt; to clear the &lt;tt&gt;EXT4_INLINE_DATA_FL = 0x10000000&lt;/tt&gt; flag.  Unfortunately, there is no debugfs interface to just clear a single flag from an inode, so the existing value is needed to know what to set.&lt;/p&gt;</comment>
                            <comment id="235862" author="dongyang" created="Tue, 30 Oct 2018 01:32:49 +0000"  >&lt;p&gt;I agree with Andreas, Just want to mention that&#160;&quot;tune2fs -O ^inline_data&quot; won&apos;t work&lt;/p&gt;

&lt;p&gt;to disable inline_data, we need to &quot;debugfs -w /dev/XXX&quot; and then &quot;feature -inline_data&quot;&lt;/p&gt;

&lt;p&gt;the patch I mentioned is to make e2fsck clear the inode rather than enabling inline_data feature,&lt;/p&gt;

&lt;p&gt;e2fsck currently trusts the inode flag if it has inline_data flag set, however for us that inode is highly like to contain garbage.&lt;/p&gt;

&lt;p&gt;You can disable inline_data and clear the inode or clear&#160;EXT4_INLINE_DATA_FL flag for the inode like Andreas said above, without the patch. The patch is just to prevent this from happening again.&lt;/p&gt;

&lt;p&gt;DY&lt;/p&gt;</comment>
                            <comment id="235863" author="mhanafi" created="Tue, 30 Oct 2018 01:37:32 +0000"  >&lt;p&gt;tune2fs -O^inline_data /dev/mapper/nbp13_1-MDT0&lt;br/&gt;
 tune2fs 1.44.3.wc1 (23-July-2018)&lt;br/&gt;
 Clearing filesystem feature &apos;inline_data&apos; not supported.&lt;/p&gt;

&lt;p&gt;&#160;1. I will run the debugfs command &lt;/p&gt;

&lt;p&gt;2. run fsck -fn to get list of files.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="235864" author="mhanafi" created="Tue, 30 Oct 2018 01:51:28 +0000"  >&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 root@nbp13-srv1 ~]# e2fsck -fn /dev/mapper/nbp13_1-MDT0 | tee /tmp/fsck.out
e2fsck 1.44.3.wc1 (23-July-2018)
Pass 1: Checking inodes, blocks, and sizes
Inode 140572827 has inline data, but superblock is missing INLINE_DATA feature
Clear? noInode 140572827 has INLINE_DATA_FL flag on filesystem without inline data support.
Clear? noInode 140572828 has inline data, but superblock is missing INLINE_DATA feature
Clear? noInode 140572828 has INLINE_DATA_FL flag on filesystem without inline data support.
Clear? noPass 2: Checking directory structure
Pass 3: Checking directory connectivity
&lt;span class=&quot;code-quote&quot;&gt;&apos;..&apos;&lt;/span&gt; in /ROOT/pkolano (140572827) is &amp;lt;The NULL inode&amp;gt; (0), should be /ROOT (140569473).
Fix? noUnconnected directory inode 140572828 (/ROOT/pkolano/tmp)
Connect to /lost+found? noUnconnected directory inode 140572829 (/ROOT/pkolano/tmp/64.3)
Connect to /lost+found? no&lt;span class=&quot;code-quote&quot;&gt;&apos;..&apos;&lt;/span&gt; in ... (140572829) is /ROOT/pkolano/tmp (140572828), should be &amp;lt;The NULL inode&amp;gt; (0).
Fix? noUnconnected directory inode 140572894 (/ROOT/pkolano/tmp/64.2)
Connect to /lost+found? no&lt;span class=&quot;code-quote&quot;&gt;&apos;..&apos;&lt;/span&gt; in ... (140572894) is /ROOT/pkolano/tmp (140572828), should be &amp;lt;The NULL inode&amp;gt; (0).
Fix? noPass 4: Checking reference counts
Inode 140569473 ref count is 9, should be 8.  Fix? noInode 140572827 ref count is 3, should be 1.  Fix? noInode 140572828 ref count is 4, should be 2.  Fix? noInode 140572829 ref count is 2, should be 1.  Fix? noInode 140572894 ref count is 2, should be 1.  Fix? noPass 5: Checking group summary informationnbp13-MDT0000: ********** WARNING: Filesystem still has errors **********nbp13-MDT0000: 28251917/317769600 files (0.1% non-contiguous), 83952122/3106406400 blocks
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;


&lt;p&gt;both 2 inode can be delete&lt;br/&gt;
debugfs:  ncheck 140572828&lt;br/&gt;
Inode	Pathname&lt;br/&gt;
140572828	/ROOT/pkolano/tmp&lt;br/&gt;
debugfs:  ncheck 140572827&lt;br/&gt;
Inode	Pathname&lt;br/&gt;
140572827	/ROOT/pkolano&lt;/p&gt;</comment>
                            <comment id="235867" author="adilger" created="Tue, 30 Oct 2018 02:15:49 +0000"  >&lt;p&gt;You &lt;em&gt;should&lt;/em&gt; be able to disable the &lt;tt&gt;inline_data&lt;/tt&gt; feature via &quot;&lt;tt&gt;debugfs -w &apos;feature ^inline_data&apos; /dev/XXX&lt;/tt&gt;&quot; to bypass the tune2fs checks.&lt;/p&gt;</comment>
                            <comment id="235868" author="mhanafi" created="Tue, 30 Oct 2018 02:21:47 +0000"  >&lt;p&gt;I did disable the feature via debugfs. How do i clear the INLINE_DATA_FL from the inodes?&lt;/p&gt;</comment>
                            <comment id="235877" author="mhanafi" created="Tue, 30 Oct 2018 04:23:08 +0000"  >&lt;p&gt;I got past the two inodes. and mounted the filesystem. I see these errors.&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
[17342.023159] LustreError: 26378:0:(ofd_dev.c:1784:ofd_create_hdl()) nbp13-OST0008: unable to precreate: rc = -115
[17342.053760] LustreError: 26378:0:(ofd_dev.c:1784:ofd_create_hdl()) Skipped 59 previous similar messages
[17342.082037] LustreError: 25151:0:(osp_precreate.c:657:osp_precreate_send()) nbp13-OST0008-osc-MDT0000: precreate fid [0x100080000:0x217edc:0x0] &amp;lt; local used fid [0x100080000:0x217edc:0x0]: rc = -116
[17342.135124] LustreError: 25151:0:(osp_precreate.c:657:osp_precreate_send()) Skipped 59 previous similar messages
[17342.165732] LustreError: 25151:0:(osp_precreate.c:1289:osp_precreate_thread()) nbp13-OST0008-osc-MDT0000: cannot precreate objects: rc = -116

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="235893" author="adilger" created="Tue, 30 Oct 2018 09:52:53 +0000"  >&lt;p&gt;This looks like it only affects creating files on the one OST0008, the rest of the filesystem should be usable at this point, including reading data on the affected OSTs.  If there are multiple OSTs similarly affected then that could be problematic over time, but not immediately except for reduced performance.  It should be possible to restart use of the OSTs by deleting the file &lt;tt&gt;lov_objids&lt;/tt&gt; and &lt;tt&gt;lov_objseq&lt;/tt&gt; on the MDT.&lt;/p&gt;</comment>
                            <comment id="235900" author="mhanafi" created="Tue, 30 Oct 2018 10:51:12 +0000"  >&lt;p&gt;I umounted the MDT and remount using ldiskfs. Removed the 2 files and remounted using lustre. Still seeing the errors. Do I need to remount all OSTs?&lt;/p&gt;</comment>
                            <comment id="235940" author="mhanafi" created="Tue, 30 Oct 2018 19:27:08 +0000"  >&lt;p&gt;This filesystem is having additional issues.&lt;/p&gt;

&lt;p&gt;ls -l is hanging on some dir and some dir owner and group are showing up as &quot;?&quot;&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;

 tpfe2 /nobackupp13/spocops/git/sector/spoc/code/dist/logs # ls
metrics-dump-0.txt  metrics-dump-0.txt.old  tmq.wrapper.log  tmq.wrapper.log.1  tmq.wrapper.log.2  worker.wrapper.log
tpfe2 /nobackupp13/spocops/git/sector/spoc/code/dist/logs # ls -l
ls: cannot access &lt;span class=&quot;code-quote&quot;&gt;&apos;tmq.wrapper.log.1&apos;&lt;/span&gt;: No such file or directory
ls: cannot access &lt;span class=&quot;code-quote&quot;&gt;&apos;metrics-dump-0.txt&apos;&lt;/span&gt;: No such file or directory
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="235941" author="adilger" created="Tue, 30 Oct 2018 19:41:44 +0000"  >&lt;p&gt;This typically indicates that the OST objects for those files are missing.  OI Scrub on the OSTs should have already moved any objects from the OST&apos;s local &lt;tt&gt;lost+found&lt;/tt&gt; directory back into the right place, but it wouldn&apos;t hurt to take a look (you could run &quot;&lt;tt&gt;debugfs -c -R &apos;ls -l lost+found&apos; /dev/XXXX&lt;/tt&gt;&quot; on the respective OSTs, there should only be &quot;.&quot; and &quot;..&quot; and a few empty directory blocks reported).&lt;/p&gt;

&lt;p&gt;Other than that, if the OST objects are lost due to hardware corruption, then there isn&apos;t much that can be done for those files beyond deleting them (with &quot;&lt;tt&gt;unlink&lt;/tt&gt;&quot; instead of &quot;&lt;tt&gt;rm&lt;/tt&gt;&quot;) and restoring them from backup.&lt;/p&gt;</comment>
                            <comment id="235942" author="mhanafi" created="Tue, 30 Oct 2018 19:52:19 +0000"  >&lt;p&gt;How do we clear the&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;17342.082037&amp;#93;&lt;/span&gt; LustreError: 25151:0:(osp_precreate.c:657:osp_precreate_send()) nbp13-OST0008-osc-MDT0000: precreate fid &lt;span class=&quot;error&quot;&gt;&amp;#91;0x100080000:0x217edc:0x0&amp;#93;&lt;/span&gt; &amp;lt; local used fid &lt;span class=&quot;error&quot;&gt;&amp;#91;0x100080000:0x217edc:0x0&amp;#93;&lt;/span&gt;: rc = -116&lt;/p&gt;

&lt;p&gt;issue.&lt;/p&gt;

&lt;p&gt;delete lov_objids and lov_objseq didn&apos;t work&lt;/p&gt;</comment>
                            <comment id="235943" author="mhanafi" created="Tue, 30 Oct 2018 19:59:49 +0000"  >&lt;p&gt;There are files listed in lost+found but looks like empty directory blocks.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
debugfs: ls -l
 11 40700 (2) 0 0 139264 7-Aug-2018 21:08 .
 2 40755 (2) 0 0 4096 7-Aug-2018 21:09 ..
 0 0 (1) 0 0 0 #75852
 0 0 (1) 0 0 0 #113934
 0 0 (1) 0 0 0 #184111
 0 0 (1) 0 0 0 #266679
 0 0 (1) 0 0 0 #331827
 0 0 (1) 0 0 0 #385401
 0 0 (1) 0 0 0 #444954
 0 0 (1) 0 0 0 #496838
 0 0 (1) 0 0 0 #567511
 0 0 (1) 0 0 0 #605846
 0 0 (1) 0 0 0 #649369
 0 0 (1) 0 0 0 #687206
 0 0 (1) 0 0 0 #732707
 0 0 (1) 0 0 0 #769520
 0 0 (1) 0 0 0 #815218
 0 0 (1) 0 0 0 #875528
 0 0 (1) 0 0 0 #915005
 0 0 (1) 0 0 0 #955684
 0 0 (1) 0 0 0 #993221
 0 0 (1) 0 0 0 #1028775
 0 0 (1) 0 0 0 #1073199
 0 0 (1) 0 0 0 #1111095
 0 0 (1) 0 0 0 #1148688
 0 0 (1) 0 0 0 #1191718
 0 0 (1) 0 0 0 #1230579
 0 0 (1) 0 0 0 #1273743
 0 0 (1) 0 0 0 #1312334
 0 0 (1) 0 0 0 #1353029
 0 0 (1) 0 0 0 #1431710
 0 0 (1) 0 0 0 #1472117
 0 0 (1) 0 0 0 #1524449
 0 0 (1) 0 0 0 #1605063
 0 0 (1) 0 0 0 #1666014
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="235950" author="bzzz" created="Tue, 30 Oct 2018 20:58:18 +0000"  >&lt;p&gt;can you please try to mount again with full debug enabled and attach logs from MDS and that OST?&lt;/p&gt;</comment>
                            <comment id="235952" author="mhanafi" created="Tue, 30 Oct 2018 21:02:08 +0000"  >&lt;p&gt;Do you want me to remount the OST and MDT or just the MDT?&lt;/p&gt;</comment>
                            <comment id="235953" author="bzzz" created="Tue, 30 Oct 2018 21:08:57 +0000"  >&lt;p&gt;ideally - both, please: MDS, then OST.&lt;/p&gt;</comment>
                            <comment id="235955" author="mhanafi" created="Tue, 30 Oct 2018 21:17:39 +0000"  >&lt;p&gt;Both ost and mdt are on the same host&lt;/p&gt;

&lt;p&gt; &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/31379/31379_nbp13.debug.gz&quot; title=&quot;nbp13.debug.gz attached to LU-11584&quot;&gt;nbp13.debug.gz&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt; &lt;/p&gt;</comment>
                            <comment id="235956" author="mhanafi" created="Tue, 30 Oct 2018 21:24:42 +0000"  >&lt;p&gt;Filesystem level issues are:&lt;/p&gt;

&lt;p&gt;1.  files with ? for user and gid&lt;br/&gt;
2. directories where ls -l hangs.&lt;br/&gt;
3. No such file or directory&lt;br/&gt;
4. unsupported incompat LMA feature(s) 0xffffffe1 (&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11583&quot; class=&quot;external-link&quot; rel=&quot;nofollow&quot;&gt;https://jira.whamcloud.com/browse/LU-11583&lt;/a&gt;) I tried setfattr -x trusted.lma /mnt/XXX/ROOT/path/to/file it didn&apos;t work.&lt;/p&gt;

&lt;p&gt;How do we find and clear all these?&lt;/p&gt;</comment>
                            <comment id="235958" author="bzzz" created="Tue, 30 Oct 2018 21:29:21 +0000"  >&lt;p&gt;thanks, it will take some time to study the logs. can you please also check OI scrub status:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;lctl get_param osd*.*OST*.oi_scrub&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="235960" author="mhanafi" created="Tue, 30 Oct 2018 21:34:35 +0000"  >&lt;p&gt;Attaching oi_scrub.out&lt;/p&gt;

&lt;p&gt; &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/31380/31380_oi_scrub.out&quot; title=&quot;oi_scrub.out attached to LU-11584&quot;&gt;oi_scrub.out&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt; &lt;/p&gt;</comment>
                            <comment id="235967" author="adilger" created="Tue, 30 Oct 2018 22:21:38 +0000"  >&lt;p&gt;It looks like OST0008 is currently running an OI Scrub triggered by the object precreate from the MDS:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;1540934094.152104:0:1589:0:(ofd_dev.c:1588:ofd_create_hdl()) ofd_create(0x0:2195196)
1540934094.152114:0:1589:0:(ofd_dev.c:1750:ofd_create_hdl()) nbp13-OST0008: reserve 32 objects in group 0x0 at 2195165
1540934094.152122:0:1589:0:(osd_handler.c:1003:osd_fid_lookup()) Process entered
1540934094.165749:0:1589:0:(osd_handler.c:728:osd_check_lma()) Process entered
1540934094.165750:0:1589:0:(osd_handler.c:793:osd_check_lma()) Process leaving (rc=-78)                &amp;lt;************  -78 = -EREMCHG
1540934094.165757:0:1589:0:(osd_scrub.c:2654:osd_scrub_start()) Process entered
540934094.165790:0:1589:0:(osd_scrub.c:2661:osd_scrub_start()) Process leaving (rc=0 : 0 : 0)
1540934094.165791:0:1589:0:(osd_handler.c:1139:osd_fid_lookup()) nbp13-OST0008: trigger OI scrub by RPC for the [0x100080000:0x217edd:0x0] with flags 0x4a, rc = 0
1540934094.213780:0:1589:0:(ofd_dev.c:446:ofd_object_free()) object free, fid = [0x100080000:0x217edd:0x0]
1540934094.213783:0:1589:0:(ofd_objects.c:253:ofd_precreate_objects()) Process leaving via out (rc=-115)
1540934094.213785:0:1589:0:(ofd_objects.c:402:ofd_precreate_objects()) created 0/32 objects: -115
1540934094.213785:0:1589:0:(ofd_objects.c:405:ofd_precreate_objects()) Process leaving (rc=-115)
1540934094.213786:0:1589:0:(ofd_dev.c:1784:ofd_create_hdl()) nbp13-OST0008: unable to precreate: rc = -115
1540934094.272318:0:11192:0:(osp_precreate.c:657:osp_precreate_send()) nbp13-OST0008-osc-MDT0000: precreate fid [0x100080000:0x217edc:0x0] &amp;lt; local used fid [0x100080000:0x217edc:0x0]: rc = -116
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Based on the speed of the scrub of the other OSTs, this process should only take about 15s and should have completed already for OST0008, but it looks like it is either stuck or restarting the scrub repeatedly due to some inconsistency it is finding with the OST objects.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;osd-ldiskfs.nbp13-OST0008.oi_scrub=
name: OI_scrub
magic: 0x4c5fd252
oi_files: 64
status: scanning
flags: auto
param:
time_since_last_completed: 9 seconds
time_since_latest_start: 8 seconds
time_since_last_checkpoint: 8 seconds
latest_start_position: 12
last_checkpoint_position: 11
first_failure_position: N/A
checked: 1170405
updated: 0
failed: 0
prior_updated: 0
noscrub: 0
igif: 0
success_count: 11061
run_time: 8 seconds
average_speed: 146300 objects/sec
real-time_speed: 155205 objects/sec
current_position: 1457233
lf_scanned: 0
lf_repaired: 0
lf_failed: 0
inodes_per_group: 16
current_iit_group: 91077
current_iit_base: 1457233
current_iit_offset: 1
scrub_in_prior: no
scrub_full_speed: yes
partial_scan: no
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;As for resolving the outstanding issues:&lt;br/&gt;
1. &lt;b&gt;files with ? are missing OST objects:&lt;/b&gt;  Unless there is some expectation that these files can be recovered by some other means, they should probably be deleted.  This could either be done by manually scanning the filesystem with e.g. &quot;find&quot; or by running a full layout LFSCK.  However, until the OI Scrub issue on OST0008 is resolved then the full LFSCK will likely also not complete.&lt;br/&gt;
2. &lt;b&gt;directories where &quot;ls -l&quot; hangs:&lt;/b&gt; may be caused by the IO Scrub ongoing on OST0008.  You could check if &lt;tt&gt;lfs getstripe&lt;/tt&gt; on hanging files include only files on OST0008&lt;br/&gt;
3. &lt;b&gt;no such file or directory:&lt;/b&gt; is the same cause as #1 - OST objects are missing and &lt;tt&gt;stat()&lt;/tt&gt; on those objects returns &lt;tt&gt;-ENOENT&lt;/tt&gt;&lt;br/&gt;
4. &lt;b&gt;unsupported incompat LMA feature(s) 0xffffffe1:&lt;/b&gt; did you do the &lt;tt&gt;setfattr -x trusted.lma&lt;/tt&gt; on the ldiskfs-mounted MDT filesystem and the correct file?  that should have removed the LMA xattr to clear the flag.  According to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11583&quot; title=&quot;unsupported incompat LMA feature(s) 0xffffffe1 &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11583&quot;&gt;LU-11583&lt;/a&gt; you deleted that file already?&lt;/p&gt;</comment>
                            <comment id="235968" author="mhanafi" created="Tue, 30 Oct 2018 22:33:21 +0000"  >&lt;p&gt;So... we need to resolve the nbp13-OST0008 issue first. The oi scrub keep restart due to the same fid.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
[  766.323537] Lustre: nbp13-OST0008: trigger OI scrub by RPC &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; the [0x100080000:0x217edd:0x0] with flags 0x4a, rc = 0
[  766.355463] Lustre: Skipped 3 previous similar messages
[  766.371175] LustreError: 8836:0:(ofd_dev.c:1784:ofd_create_hdl()) nbp13-OST0008: unable to precreate: rc = -115
[  766.401518] LustreError: 8836:0:(ofd_dev.c:1784:ofd_create_hdl()) Skipped 3 previous similar messages
[  766.401539] LustreError: 8115:0:(osp_precreate.c:657:osp_precreate_send()) nbp13-OST0008-osc-MDT0000: precreate fid [0x100080000:0x217edc:0x0] &amp;lt; local used fid [0x100080000:0x217edc:0x0]: rc = -116
[  766.401540] LustreError: 8115:0:(osp_precreate.c:657:osp_precreate_send()) Skipped 3 previous similar messages
[  766.401543] LustreError: 8115:0:(osp_precreate.c:1289:osp_precreate_thread()) nbp13-OST0008-osc-MDT0000: cannot precreate objects: rc = -116
[  766.401544] LustreError: 8115:0:(osp_precreate.c:1289:osp_precreate_thread()) Skipped 3 previous similar messages
[  836.271099] Lustre: nbp13-OST0008: trigger OI scrub by RPC &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; the [0x100080000:0x217edd:0x0] with flags 0x4a, rc = 0
[  836.303036] Lustre: Skipped 6 previous similar messages
[  836.318743] LustreError: 8836:0:(ofd_dev.c:1784:ofd_create_hdl()) nbp13-OST0008: unable to precreate: rc = -115
[  836.349088] LustreError: 8836:0:(ofd_dev.c:1784:ofd_create_hdl()) Skipped 6 previous similar messages
[  836.349107] LustreError: 8115:0:(osp_precreate.c:657:osp_precreate_send()) nbp13-OST0008-osc-MDT0000: precreate fid [0x100080000:0x217edc:0x0] &amp;lt; local used fid [0x100080000:0x217edc:0x0]: rc = -116
[  836.349108] LustreError: 8115:0:(osp_precreate.c:657:osp_precreate_send()) Skipped 6 previous similar messages
[  836.349111] LustreError: 8115:0:(osp_precreate.c:1289:osp_precreate_thread()) nbp13-OST0008-osc-MDT0000: cannot precreate objects: rc = -116
[  836.349112] LustreError: 8115:0:(osp_precreate.c:1289:osp_precreate_thread()) Skipped 6 previous similar messages
[  867.763998] LNet: 3774:0:(o2iblnd_cb.c:3192:kiblnd_check_conns()) Timed out tx &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 10.151.26.144@o2ib: 36 seconds
[  867.794860] LNet: 3774:0:(o2iblnd_cb.c:3192:kiblnd_check_conns()) Skipped 4 previous similar messages
[  966.173700] Lustre: nbp13-OST0008: trigger OI scrub by RPC &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; the [0x100080000:0x217edd:0x0] with flags 0x4a, rc = 0
[  966.205625] Lustre: Skipped 12 previous similar messages
[  966.221594] LustreError: 8837:0:(ofd_dev.c:1784:ofd_create_hdl()) nbp13-OST0008: unable to precreate: rc = -115
[  966.251939] LustreError: 8837:0:(ofd_dev.c:1784:ofd_create_hdl()) Skipped 12 previous similar messages
[  966.251958] LustreError: 8115:0:(osp_precreate.c:657:osp_precreate_send()) nbp13-OST0008-osc-MDT0000: precreate fid [0x100080000:0x217edc:0x0] &amp;lt; local used fid [0x100080000:0x217edc:0x0]: rc = -116
[  966.251960] LustreError: 8115:0:(osp_precreate.c:657:osp_precreate_send()) Skipped 12 previous similar messages
[  966.251962] LustreError: 8115:0:(osp_precreate.c:1289:osp_precreate_thread()) nbp13-OST0008-osc-MDT0000: cannot precreate objects: rc = -116
[  966.251963] LustreError: 8115:0:(osp_precreate.c:1289:osp_precreate_thread()) Skipped 12 previous similar messages
[ 1225.994890] Lustre: nbp13-OST0008: trigger OI scrub by RPC &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; the [0x100080000:0x217edd:0x0] with flags 0x4a, rc = 0
[ 1226.026820] Lustre: Skipped 25 previous similar messages
[ 1226.042790] LustreError: 8837:0:(ofd_dev.c:1784:ofd_create_hdl()) nbp13-OST0008: unable to precreate: rc = -115
[ 1226.073134] LustreError: 8837:0:(ofd_dev.c:1784:ofd_create_hdl()) Skipped 25 previous similar messages
[ 1226.073159] LustreError: 8115:0:(osp_precreate.c:657:osp_precreate_send()) nbp13-OST0008-osc-MDT0000: precreate fid [0x100080000:0x217edc:0x0] &amp;lt; local used fid [0x100080000:0x217edc:0x0]: rc = -116
[ 1226.073161] LustreError: 8115:0:(osp_precreate.c:657:osp_precreate_send()) Skipped 25 previous similar messages
[ 1226.073164] LustreError: 8115:0:(osp_precreate.c:1289:osp_precreate_thread()) nbp13-OST0008-osc-MDT0000: cannot precreate objects: rc = -116
[ 1226.073165] LustreError: 8115:0:(osp_precreate.c:1289:osp_precreate_thread()) Skipped 25 previous similar messages
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;How do i find this inode?&lt;/p&gt;</comment>
                            <comment id="235969" author="mhanafi" created="Tue, 30 Oct 2018 22:34:47 +0000"  >&lt;p&gt;for #4. yes I deleted one of the files. but there are more, which do contain user data.&lt;/p&gt;</comment>
                            <comment id="235974" author="mhanafi" created="Tue, 30 Oct 2018 23:45:12 +0000"  >&lt;p&gt;I located &lt;span class=&quot;error&quot;&gt;&amp;#91;0x100080000:0x217edd:0x0&amp;#93;&lt;/span&gt;  on the OST&lt;br/&gt;
it is just a empty inode.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
# debugfs -c -R &lt;span class=&quot;code-quote&quot;&gt;&quot;stat &amp;lt;2195165&amp;gt;&quot;&lt;/span&gt; /dev/mapper/nbp13_1-OST8
debugfs 1.44.3.wc1 (23-July-2018)
/dev/mapper/nbp13_1-OST8: catastrophic mode - not reading inode or group bitmaps
Inode: 2195165   Type: bad type    Mode:  0000   Flags: 0x0
Generation: 0    Version: 0x00000000
User:     0   Group:     0   Size: 0
File ACL: 0
Links: 0   Blockcount: 0
Fragment:  Address: 0    &lt;span class=&quot;code-object&quot;&gt;Number&lt;/span&gt;: 0    Size: 0
ctime: 0x00000000 -- Wed Dec 31 16:00:00 1969
atime: 0x00000000 -- Wed Dec 31 16:00:00 1969
mtime: 0x00000000 -- Wed Dec 31 16:00:00 1969
Size of extra inode fields: 0
BLOCKS:
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="235995" author="mhanafi" created="Wed, 31 Oct 2018 02:52:18 +0000"  >&lt;p&gt;Any updates?&lt;/p&gt;</comment>
                            <comment id="235998" author="mhanafi" created="Wed, 31 Oct 2018 03:52:32 +0000"  >&lt;p&gt;Some info;&lt;br/&gt;
 files with ? for uid and gid are the ones that get called out on the mdt as
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 unsupported incompat LMA feature(s) 0x70687320 &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; fid = [0x0:0x2bae:0x2], ino = 100026353&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="236000" author="mhanafi" created="Wed, 31 Oct 2018 04:33:03 +0000"  >&lt;p&gt;Here is an example of a inode that ls will hang.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;

 [18623.900347] Lustre: 31365:0:(osd_handler.c:371:osd_get_lma()) dm-1: unsupported incompat LMA feature(s) 0x73746960 &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; fid = [0x0:0x13af:0x2], ino = 236893545
[18623.942973] Lustre: 31365:0:(osd_handler.c:371:osd_get_lma()) Skipped 138971 previous similar messages
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
nbp15-srv1 ~ # debugfs -c -R &lt;span class=&quot;code-quote&quot;&gt;&apos;stat &amp;lt;236893545&amp;gt; &apos;&lt;/span&gt; /dev/mapper/nbp15_1-MDT0
debugfs 1.44.3.wc1 (23-July-2018)
/dev/mapper/nbp15_1-MDT0: catastrophic mode - not reading inode or group bitmaps
Inode: 236893545   Type: regular    Mode:  0640   Flags: 0x0
Generation: 20109448    Version: 0x00000003:10887bb2
User: 522602360   Group:  1179   Project:     0   Size: 0
File ACL: 0
Links: 1   Blockcount: 0
Fragment:  Address: 0    &lt;span class=&quot;code-object&quot;&gt;Number&lt;/span&gt;: 0    Size: 0
 ctime: 0x5bbf9603:00000000 -- Thu Oct 11 11:27:15 2018
 atime: 0x5b9814ba:00000000 -- Tue Sep 11 12:17:14 2018
 mtime: 0x565df0af:00000000 -- Tue Dec  1 11:10:39 2015
crtime: 0x5b99a181:a7491e2c -- Wed Sep 12 16:30:09 2018
Size of extra inode fields: 32
Extended attributes:
  trusted.lma (24) = 6c 6c 63 2e 66 69 74 73 00 00 00 00 00 00 00 00 af 13 00 00 02 00 00 00 
  lma: fid=[0:0x13af:0x2] compat=2e636c6c incompat=73746966
  trusted.link (80)
  trusted.lov (128)
BLOCKS: &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;We need a way to find and clear these errors.&lt;/p&gt;</comment>
                            <comment id="236001" author="adilger" created="Wed, 31 Oct 2018 05:07:37 +0000"  >&lt;p&gt;The information in the &quot;&lt;tt&gt;lma&lt;/tt&gt;&quot; xattr looks to be total garbage.  The &lt;tt&gt;compat=2e636c6c&lt;/tt&gt; and &lt;tt&gt;incompat=73746966&lt;/tt&gt; flags are full of unknown values - only a small number of values are defined.  It looks like the &lt;tt&gt;trusted.fid&lt;/tt&gt; has been clobbered by ASCII text, which includes &quot;&lt;tt&gt;6c 6c 63 2e 66 69 74 73 == llc.fit&lt;/tt&gt;&quot;, &quot;&lt;tt&gt;2e636c6c = .cll&lt;/tt&gt;&quot;, and &quot;&lt;tt&gt;73746966 = stif&lt;/tt&gt;&quot; (or the reverse, depending on byte ordering).  One option is clearing the &quot;&lt;tt&gt;lma&lt;/tt&gt;&quot; xattr, in case the &quot;&lt;tt&gt;lov&lt;/tt&gt;&quot; xattr still contains a valid &lt;tt&gt;LOV_MAGIC&lt;/tt&gt; value and a valid layout.  The &quot;&lt;tt&gt;trusted.lma&lt;/tt&gt;&quot; xattr can be rebuilt by OI Scrub if needed.&lt;/p&gt;

&lt;p&gt;To delete the &lt;tt&gt;trusted.lma&lt;/tt&gt; xattr, the MDT needs to be mounted as type ldiskfs, since the MDS blocks direct access/modification to this xattr.  Then &quot;setfattr -x trusted.lma /path/to/file&quot; to delete the xattr.&lt;/p&gt;</comment>
                            <comment id="236002" author="mhanafi" created="Wed, 31 Oct 2018 05:13:45 +0000"  >&lt;p&gt;But looks like there are 1000&apos;s of these inodes. How can we easily find them?&lt;br/&gt;
What about the nbp13-OST8 issues. We have a second filesystem with the same issue.&lt;/p&gt;

</comment>
                            <comment id="236004" author="adilger" created="Wed, 31 Oct 2018 05:14:59 +0000"  >&lt;blockquote&gt;
&lt;p&gt;I located &lt;span class=&quot;error&quot;&gt;&amp;#91;0x100080000:0x217edd:0x0&amp;#93;&lt;/span&gt; on the OST, it is just a empty inode.&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;Is this the object &lt;tt&gt;O/0/d29/2195165&lt;/tt&gt; or how did you map this FID to that inode number?  If it is, then that would imply directory corruption on the OST, since the directory entry shouldn&apos;t be pointing at an unused inode.  Ah, to clarify, the &lt;tt&gt;0x217edd&lt;/tt&gt; part of the FID does not map directly to the inode number, it is just the OID part of the FID, an arbitrary sequential number.   If &lt;tt&gt;O/0/d29/2195165&lt;/tt&gt; exists on OST0008, what does &quot;&lt;tt&gt;stat&lt;/tt&gt;&quot; report for it?&lt;/p&gt;</comment>
                            <comment id="236005" author="mhanafi" created="Wed, 31 Oct 2018 05:20:58 +0000"  >&lt;p&gt;RE: &lt;span class=&quot;error&quot;&gt;&amp;#91;0x100080000:0x217edd:0x0&amp;#93;&lt;/span&gt;, OK  I did that mapping incorrectly. Is there a way to find out what that object inode is?&lt;/p&gt;

&lt;p&gt;are you saying &lt;span class=&quot;error&quot;&gt;&amp;#91;0x100080000:0x217edd:0x0&amp;#93;&lt;/span&gt; -&amp;gt; maps to  O/0/d29/2195165 &lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
debugfs:  stat O/0/d29/2195165
Inode: 1762634   Type: regular    Mode:  07666   Flags: 0x80000
Generation: 3301012751    Version: 0x00000000:00000000
User:     0   Group:     0   Project:     0   Size: 0
File ACL: 0
Links: 2   Blockcount: 0
Fragment:  Address: 0    &lt;span class=&quot;code-object&quot;&gt;Number&lt;/span&gt;: 0    Size: 0
 ctime: 0x00000000:00000000 -- Wed Dec 31 16:00:00 1969
 atime: 0x00000000:00000000 -- Wed Dec 31 16:00:00 1969
 mtime: 0x00000000:00000000 -- Wed Dec 31 16:00:00 1969
crtime: 0x5bd254c2:a90f833c -- Thu Oct 25 16:41:54 2018
Size of extra inode fields: 32
Extended attributes:
  trusted.lma (24) = 08 00 00 00 00 00 00 00 00 00 08 00 01 00 00 00 9d 7e 21 00 00 00 00 00 
  lma: fid=[0x100080000:0x217e9d:0x0] compat=8 incompat=0
EXTENTS:
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="236007" author="adilger" created="Wed, 31 Oct 2018 05:29:48 +0000"  >&lt;p&gt;Correct.  The &lt;tt&gt;0x100080000&lt;/tt&gt; part of the FID identifies it as an OST FID (&lt;tt&gt;0x1&lt;/tt&gt; part) on OST0008.  The second part is the Object ID, which (in decimal) is the filename, and modulo 32 is the subdirectory.&lt;/p&gt;</comment>
                            <comment id="236008" author="adilger" created="Wed, 31 Oct 2018 05:31:57 +0000"  >&lt;p&gt;So it looks like there is a hard link to this object, likely from &lt;tt&gt;O/0/d29/2195101&lt;/tt&gt;, which is probably the correct object for that inode due to the FID in the lma xattr, and &lt;tt&gt;O/0/d29/2195165&lt;/tt&gt; should be removed.&lt;/p&gt;</comment>
                            <comment id="236009" author="mhanafi" created="Wed, 31 Oct 2018 05:35:49 +0000"  >&lt;p&gt;should i delete it via ldiskfs mount or debugfs -w mi ?&lt;/p&gt;

&lt;p&gt;how should we scan for bad lma xattr?&lt;/p&gt;</comment>
                            <comment id="236012" author="adilger" created="Wed, 31 Oct 2018 05:50:01 +0000"  >&lt;p&gt;The OST object should be deleted via ldiskfs.&lt;/p&gt;

&lt;p&gt;As for the bad lma xattr, I don&apos;t think that LFSCK can fix that problem right now, since the incompat flag is specifically intended to block old Lustre versions that don&apos;t understand particular feature flags from modifying the inode.  For finding the objects, probably the easiest way is to run a namespace walk to find inodes that show errors when accessed.  It may be that &quot;&lt;tt&gt;lfs find &amp;lt;mountpoint&amp;gt;&lt;/tt&gt;&quot; might be enough to generate an error message for a file with the bad LMA.  Unfortunately, we can&apos;t use e.g. &quot;&lt;tt&gt;lfs fid2path&lt;/tt&gt;&quot; on the FIDs reported in the error message since they are not valid FIDs.&lt;/p&gt;</comment>
                            <comment id="236013" author="mhanafi" created="Wed, 31 Oct 2018 05:55:00 +0000"  >&lt;p&gt;ls -l will find these but it will hang on some.&lt;br/&gt;
 I have a modified version of lester backend scan tool or can e2scan be used to get the bad inodes&lt;/p&gt;
</comment>
                            <comment id="236016" author="adilger" created="Wed, 31 Oct 2018 06:09:52 +0000"  >&lt;p&gt;Probably Lester would be fastest.  If it is already able to decode the LMA (which is probably yes, since that is how it finds the FID) it shouldn&apos;t be too hard to check the &lt;tt&gt;compat&lt;/tt&gt; and &lt;tt&gt;incompat&lt;/tt&gt; flags at the same time.  The current known compat and incompat flags are:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
&lt;span class=&quot;code-keyword&quot;&gt;enum&lt;/span&gt; lma_compat {
        LMAC_HSM         = 0x00000001,
&lt;span class=&quot;code-comment&quot;&gt;/*      LMAC_SOM         = 0x00000002, obsolete since 2.8.0 */&lt;/span&gt;
        LMAC_NOT_IN_OI   = 0x00000004, &lt;span class=&quot;code-comment&quot;&gt;/* the object does NOT need OI mapping */&lt;/span&gt;
        LMAC_FID_ON_OST  = 0x00000008, /* For OST-object, its OI mapping is
                                       * under /O/&amp;lt;seq&amp;gt;/d&amp;lt;x&amp;gt;. */
        LMAC_STRIPE_INFO = 0x00000010, &lt;span class=&quot;code-comment&quot;&gt;/* stripe info in the LMA EA. */&lt;/span&gt;
        LMAC_COMP_INFO   = 0x00000020, &lt;span class=&quot;code-comment&quot;&gt;/* Component info in the LMA EA. */&lt;/span&gt;
        LMAC_IDX_BACKUP  = 0x00000040, &lt;span class=&quot;code-comment&quot;&gt;/* Has index backup. */&lt;/span&gt;
};

/**
 * Masks &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; all features that should be supported by a Lustre version to
 * access a specific file.
 * This information is stored in lustre_mdt_attrs::lma_incompat.
 */
&lt;span class=&quot;code-keyword&quot;&gt;enum&lt;/span&gt; lma_incompat {
        LMAI_RELEASED           = 0x00000001, &lt;span class=&quot;code-comment&quot;&gt;/* file is released */&lt;/span&gt;
        LMAI_AGENT              = 0x00000002, &lt;span class=&quot;code-comment&quot;&gt;/* agent inode */&lt;/span&gt;
        LMAI_REMOTE_PARENT      = 0x00000004, /* the parent of the object
                                                 is on the remote MDT */
        LMAI_STRIPED            = 0x00000008, &lt;span class=&quot;code-comment&quot;&gt;/* striped directory inode */&lt;/span&gt;
        LMAI_ORPHAN             = 0x00000010, &lt;span class=&quot;code-comment&quot;&gt;/* inode is orphan */&lt;/span&gt;
        LMA_INCOMPAT_SUPP       = (LMAI_AGENT | LMAI_REMOTE_PARENT | \
                                   LMAI_STRIPED | LMAI_ORPHAN)
};
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="236017" author="mhanafi" created="Wed, 31 Oct 2018 06:12:48 +0000"  >&lt;p&gt;how should I delete O/0/d29/2195101&lt;/p&gt;</comment>
                            <comment id="236020" author="adilger" created="Wed, 31 Oct 2018 06:20:01 +0000"  >&lt;p&gt;Strictly speaking, if &lt;tt&gt;O/0/d29/2195101&lt;/tt&gt; exists, then if the &lt;tt&gt;O/0/d29/2195165&lt;/tt&gt; link is deleted it should be OK again.  That said, this object has no data and has not been used by an MDT inode yet (or it would report a &quot;&lt;tt&gt;parent&lt;/tt&gt;&quot; FID as well), so there is probably no huge risk to delete it as well, but I also don&apos;t think it is totally necessary.&lt;/p&gt;</comment>
                            <comment id="236022" author="mhanafi" created="Wed, 31 Oct 2018 06:23:21 +0000"  >&lt;p&gt;any chance we can get a tool to scan the MDT for the bad lma? I think that is our only chance; find or ls -l just hangs &lt;/p&gt;</comment>
                            <comment id="236023" author="adilger" created="Wed, 31 Oct 2018 07:38:25 +0000"  >&lt;p&gt;Do you have an idea of how many bad objects exist in the filesystem? Have you been able to access file data for some files, but only some relatively small fraction (e.g. 1% or 5%) of the files are exhibiting the bad lma problem?  Is this problem only happening on the MDT or also on the OST? &lt;/p&gt;

&lt;p&gt;The &quot;right&quot; tool for this would be to modify LFSCK to be able to detect an &quot;obviously&quot; corrupt LMA and erase and rebuild it, for some definition of &quot;obviously correct&quot;, while preserving the original meaning of the &lt;tt&gt;incompat&lt;/tt&gt; flag.  However, that is not something that should be rushed, as we would need to test it fairly well to ensure it does not quickly and automatically do the wrong thing for the filesystem and cause more problems.&lt;/p&gt;

&lt;p&gt;Have you tried using something like &quot;&lt;tt&gt;lfs find -uid 0 /mnt/XXX&lt;/tt&gt;&quot; to scan the mounted filesystem?  It does not try to instantiate the file inodes on the client (to avoid cache pollution), but rather just fetches the inode attributes to the client and returns them to userspace.  However, it does need to access the directory inodes, so there would still be some chance of the client hanging.&lt;/p&gt;</comment>
                            <comment id="236024" author="mhanafi" created="Wed, 31 Oct 2018 07:58:36 +0000"  >&lt;p&gt;Don&#8217;t know for sure how many I am guessing +5000 or more.   If you run lfs find it will not find any. you need to do at least a ls -l. Just like a ls won&#8217;t work.&lt;/p&gt;</comment>
                            <comment id="236025" author="mhanafi" created="Wed, 31 Oct 2018 08:07:05 +0000"  >&lt;p&gt;If we can clear at lma xattr can we not read all the bad xattr mounted as ldiskfs &lt;/p&gt;</comment>
                            <comment id="236026" author="adilger" created="Wed, 31 Oct 2018 08:25:39 +0000"  >&lt;p&gt;If it is mounted as ldiskfs, then there would need to be a userspace tool written to decode the lma xattr from disk, since it is a binary structure. The debugfs utility decodes this for us for debugging purposes.&lt;/p&gt;</comment>
                            <comment id="236027" author="adilger" created="Wed, 31 Oct 2018 08:32:00 +0000"  >&lt;p&gt;Alex is investigating a change to LFSCK to rewrite the LMA and clear the bad flags and incorrect FID. &lt;/p&gt;

&lt;p&gt;In the meantime, if it is possible I would suggest to make a device-level backup of the MDT filesystem in case there are any problems. This should be possible in a few hours if there is a suitable device available to hold it. &lt;/p&gt;</comment>
                            <comment id="236047" author="bzzz" created="Wed, 31 Oct 2018 15:52:04 +0000"  >&lt;p&gt;yes, I&apos;ve been working on a patch for OI scrub to fix wrong names in /O/.. which seem to be the blocking point.&lt;br/&gt;
also, I&apos;ve got a test simulating the problem - essentially a single extra hardlink in /O/.. exposes the problem with endless precreate.&lt;/p&gt;</comment>
                            <comment id="236048" author="bzzz" created="Wed, 31 Oct 2018 16:00:17 +0000"  >&lt;p&gt;as for duplicated hardlinks (have to you tried to remove O/0/d29/2195165 manually?) I think you can use the following command on a directly mounted OST filesystem:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;find O -type f ! -links 1&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;as that is the object index and it&apos;s not supposed to have hardlinks at all. This way you can estimate how many objects may need recovery.&lt;/p&gt;</comment>
                            <comment id="236066" author="adilger" created="Wed, 31 Oct 2018 18:28:33 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=mhanafi&quot; class=&quot;user-hover&quot; rel=&quot;mhanafi&quot;&gt;mhanafi&lt;/a&gt; were you able to clear the bad (hard-linked) inode(s) on OST0008 to get beyond the precreate problem?&lt;/p&gt;

&lt;p&gt;For the LMA issue, Alex is still working on a patch.  It would be useful to also dump the &quot;trusted.lov&quot; xattr on one of the inodes that have the LMA error to see if it still contains a valid layout. This would need to be done via debugfs. If the LOV does not contain a valid layout then it needs to be removed as well. &lt;/p&gt;

&lt;p&gt;My understanding is that beyond the files impacted by the LMA issue, the filesystem should be usable at this point. Peter was mentioning that there were several filesystems affected at this time?  Are they all hitting the same problems?  How did multiple filesystems become corrupted at the same time? &lt;/p&gt;</comment>
                            <comment id="236067" author="mhanafi" created="Wed, 31 Oct 2018 18:48:08 +0000"  >&lt;p&gt;Yes i deleted the O/0/d29/2195165 and that got us past that.&lt;/p&gt;

&lt;p&gt;We had 5 filesystems on similar raid backends. They experienced the same issue during firmware update. During firmware update the RAID t10pi setting got turned off; this caused errors on the host side.&lt;/p&gt;

&lt;p&gt;I was able to do a backend scan of all MDT inodes and dump out the lma_compat and lma_incompat. It looks like they are zero except for the bad inodes.&lt;/p&gt;

&lt;p&gt;On nbp10 I ran setfattr -x trusted.lma on the list of bad inodes. And mounted via lustre and start lfsck. it is still running.&lt;/p&gt;

&lt;p&gt;This is an example of a bad inode. I&apos;ll try to get you some more examples.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
nbp15-srv1 ~ # debugfs -c -R &lt;span class=&quot;code-quote&quot;&gt;&apos;stat &amp;lt;236893545&amp;gt; &apos;&lt;/span&gt; /dev/mapper/nbp15_1-MDT0
debugfs 1.44.3.wc1 (23-July-2018)
/dev/mapper/nbp15_1-MDT0: catastrophic mode - not reading inode or group bitmaps
Inode: 236893545   Type: regular    Mode:  0640   Flags: 0x0
Generation: 20109448    Version: 0x00000003:10887bb2
User: 522602360   Group:  1179   Project:     0   Size: 0
File ACL: 0
Links: 1   Blockcount: 0
Fragment:  Address: 0    &lt;span class=&quot;code-object&quot;&gt;Number&lt;/span&gt;: 0    Size: 0
 ctime: 0x5bbf9603:00000000 -- Thu Oct 11 11:27:15 2018
 atime: 0x5b9814ba:00000000 -- Tue Sep 11 12:17:14 2018
 mtime: 0x565df0af:00000000 -- Tue Dec  1 11:10:39 2015
crtime: 0x5b99a181:a7491e2c -- Wed Sep 12 16:30:09 2018
Size of extra inode fields: 32
Extended attributes:
  trusted.lma (24) = 6c 6c 63 2e 66 69 74 73 00 00 00 00 00 00 00 00 af 13 00 00 02 00 00 00 
  lma: fid=[0:0x13af:0x2] compat=2e636c6c incompat=73746966
  trusted.link (80)
  trusted.lov (128)
BLOCKS: 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;How do I figure what file this is&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
Lustre: nbp10-MDT0000: trigger OI scrub by RPC &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; the [0x2000033ce:0x2f:0x0] with flags 0x4a, rc = 0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;fid2path is hanging&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
tpfe2 ~ # lfs fid2path /nobackupp10 0x2000033ce:0x2f:0x0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;The filesystems may be usable for the most part, but we have taken them offline to make sure all issues are resolved before releasing them back to the users.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="236070" author="mhanafi" created="Wed, 31 Oct 2018 18:55:03 +0000"  >&lt;p&gt;nbp10 current lfsck status:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
name: lfsck_namespace
magic: 0xa0621a0b
version: 2
status: scanning-phase1
flags: inconsistent,incomplete
param: all_targets,orphan,create_ostobj,create_mdtobj
last_completed_time: N/A
time_since_last_completed: N/A
latest_start_time: 1541009812
time_since_latest_start: 2281 seconds
last_checkpoint_time: 1541012058
time_since_last_checkpoint: 35 seconds
latest_start_position: 77, N/A, N/A
last_checkpoint_position: 116707678, N/A, N/A
first_failure_position: 81903790, [0x200003393:0x149:0x0], 0x21ece37a
checked_phase1: 72944086
checked_phase2: 0
updated_phase1: 1020
updated_phase2: 0
failed_phase1: 38
failed_phase2: 0
directories: 615982
dirent_repaired: 217
linkea_repaired: 802
nlinks_repaired: 0
multiple_linked_checked: 35712
multiple_linked_repaired: 0
unknown_inconsistency: 0
unmatched_pairs_repaired: 0
dangling_repaired: 1
multiple_referenced_repaired: 0
bad_file_type_repaired: 0
lost_dirent_repaired: 0
local_lost_found_scanned: 0
local_lost_found_moved: 0
local_lost_found_skipped: 0
local_lost_found_failed: 0
striped_dirs_scanned: 0
striped_dirs_repaired: 0
striped_dirs_failed: 0
striped_dirs_disabled: 0
striped_dirs_skipped: 0
striped_shards_scanned: 0
striped_shards_repaired: 0
striped_shards_failed: 0
striped_shards_skipped: 0
name_hash_repaired: 0
linkea_overflow_cleared: 0
success_count: 0
run_time_phase1: 4136 seconds
run_time_phase2: 0 seconds
average_speed_phase1: 17636 items/sec
average_speed_phase2: N/A
average_speed_total: 17636 items/sec
real_time_speed_phase1: 274 items/sec
real_time_speed_phase2: N/A
current_position: 116925445, N/A, N/A
 &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
tpfe2 ~ # lfs fid2path /nobackupp10 0x200003393:0x149:0x0
/nobackupp10/hhashimo/data/GDM/data
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;This the directory where ls -l is hanging.&lt;/p&gt;</comment>
                            <comment id="236076" author="mhanafi" created="Wed, 31 Oct 2018 19:36:59 +0000"  >&lt;p&gt;more example of bad inode&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;

 debugfs:  stat cmst_file
Inode: 229115909   Type: regular    Mode:  0664   Flags: 0x0
Generation: 2422773946    Version: 0x00000001:00000028
User: 10376   Group:  1987   Project:     0   Size: 0
File ACL: 2239771779
Links: 1   Blockcount: 8
Fragment:  Address: 0    &lt;span class=&quot;code-object&quot;&gt;Number&lt;/span&gt;: 0    Size: 0
 ctime: 0x5b75bb84:00000000 -- Thu Aug 16 10:59:32 2018
 atime: 0x5bd267f4:00000000 -- Thu Oct 25 18:03:48 2018
 mtime: 0x5b75bb73:3e55a3d4 -- Thu Aug 16 10:59:15 2018
crtime: 0x5b75bb73:3e55a3d4 -- Thu Aug 16 10:59:15 2018
Size of extra inode fields: 32
Extended attributes:
  trusted.lma (24) = 73 74 5f 66 69 6c 65 00 00 00 00 00 00 00 00 00 06 04 00 00 02 00 00 00 
  lma: fid=[0:0x406:0x2] compat=665f7473 incompat=656c69
  trusted.link (51)
  system.posix_acl_access (28) = 00 00 00 00 00 00 00 00 01 00 00 00 01 00 06 00 02 00 06 00 fb 28 00 00 04 00 04 00 
  trusted.lov (1688)
BLOCKS:
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="236077" author="adilger" created="Wed, 31 Oct 2018 19:57:12 +0000"  >&lt;p&gt;It is possible that fid2path is hanging because LFSCK is still running and rebuilding the OI files, so it is getting a return code if &quot;&lt;tt&gt;-EINPROGRESS&lt;/tt&gt;&quot; for which the client will wait indefinitely until the MDS completes LFSCK and locates the respective FID it returns an error.  That said, given the FID is corrupt in the LMA, then it is possible that the requested FID will no longer exist. &lt;/p&gt;

&lt;p&gt;Typically, LFSCK will trust the FID stored in the inode LMA over a fid in the directory entry, since the chance of the LMA FID xattr being corrupted without actually corrupting the xattr structure itself (which are stored within a few bytes of each other) was considered to be extremely unlikely, though I guess we may have to reconsider this assumption. I&apos;d need to check the LFSCK code to see if it does a validity check on the dirent FID vs. the LMA FID and excludes one if it is not valid.&lt;/p&gt;

&lt;p&gt;For the files where you removed the LMA xattr, are those files now accessible?  &lt;/p&gt;

&lt;p&gt;It is water under the bridge at this point, but in the future I&apos;d suggest a staged rollout of changes like this so that any issues seen during the upgrade are contained to a single filesystem. &lt;/p&gt;</comment>
                            <comment id="236079" author="mhanafi" created="Wed, 31 Oct 2018 20:38:11 +0000"  >&lt;p&gt;after removing the lma xattr, an ls -l will hang and trigger a oi_scrub.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="236116" author="mhanafi" created="Thu, 1 Nov 2018 06:43:25 +0000"  >
&lt;p&gt;More examples:&lt;/p&gt;

&lt;p&gt;ls -l output&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 -????????? ? ? ? ? ? PrfToolParametersTest.class&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
debugfs:  stat PrfToolParametersTest.class
Inode: 168298272   Type: regular    Mode:  0640   Flags: 0x0
Generation: 1296031430    Version: 0x00000003:3e688f5d
User: 30757   Group: 41548   Project:     0   Size: 0
File ACL: 0
Links: 1   Blockcount: 0
Fragment:  Address: 0    &lt;span class=&quot;code-object&quot;&gt;Number&lt;/span&gt;: 0    Size: 0
 ctime: 0x5bd11d80:79fa43d0 -- Wed Oct 24 18:33:52 2018
 atime: 0x5bd11d80:79fa43d0 -- Wed Oct 24 18:33:52 2018
 mtime: 0x5bd11d80:79fa43d0 -- Wed Oct 24 18:33:52 2018
crtime: 0x5bd11d80:79fa43d0 -- Wed Oct 24 18:33:52 2018
Size of extra inode fields: 32
Extended attributes:
  trusted.lma (24) = 00 00 00 00 00 00 00 00 01 21 00 00 02 00 00 00 b3 6f 00 00 00 00 00 00 
  lma: fid=[0x200002101:0x6fb3:0x0] compat=0 incompat=0
  trusted.lov (448)
  trusted.link (69)
BLOCKS:

 &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;ls -l output&#160;&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
ls: cannot access &lt;span class=&quot;code-quote&quot;&gt;&apos;./spocops/git/sector/spoc/code/commissioning-tools/build/src/main/matlab/write_LsqParameters.m&apos;&lt;/span&gt;: No such file or directory
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
debugfs:  stat /ROOT/./spocops/git/sector/spoc/code/commissioning-tools/build/src/main/matlab/write_LsqParameters.m
Inode: 168297875   Type: regular    Mode:  0640   Flags: 0x0
Generation: 1296029279    Version: 0x00000003:3e6662b8
User: 30757   Group: 41548   Project:     0   Size: 0
File ACL: 0
Links: 1   Blockcount: 0
Fragment:  Address: 0    &lt;span class=&quot;code-object&quot;&gt;Number&lt;/span&gt;: 0    Size: 0
 ctime: 0x5bd11cf9:00000000 -- Wed Oct 24 18:31:37 2018
 atime: 0x5bd11d80:00000000 -- Wed Oct 24 18:33:52 2018
 mtime: 0x5bd11cf9:00000000 -- Wed Oct 24 18:31:37 2018
crtime: 0x5bd11cf9:212fdac8 -- Wed Oct 24 18:31:37 2018
Size of extra inode fields: 32
Extended attributes:
  trusted.lma (24) = 00 00 00 00 00 00 00 00 01 21 00 00 02 00 00 00 3c 67 00 00 00 00 00 00 
  lma: fid=[0x200002101:0x673c:0x0] compat=0 incompat=0
  trusted.lov (448)
  trusted.link (63)
BLOCKS:
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
</comment>
                            <comment id="236118" author="adilger" created="Thu, 1 Nov 2018 07:02:59 +0000"  >&lt;p&gt;Here files look like the LMA xattr is valid. Can you check &quot;&lt;tt&gt;lfs getstripe&lt;/tt&gt;&quot; for the files to get the objects, then on the respective OSTs you can use &quot;&lt;tt&gt;objid=NNNN; debugfs -c -R &quot;stat O/0/d$((objid % 32))/$objid&quot; /dev/XXX&lt;/tt&gt;&quot; to see if the object is missing or maybe broken (wrong parent FID)?&lt;/p&gt;</comment>
                            <comment id="236125" author="mhanafi" created="Thu, 1 Nov 2018 07:21:20 +0000"  >&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
lfs getstripe write_LsqParameters.m
 write_LsqParameters.m
 lcm_layout_gen: 4
 lcm_entry_count: 4
 lcme_id: 1
 lcme_flags: init
 lcme_extent.e_start: 0
 lcme_extent.e_end: 8388608
 lmm_stripe_count: 1
 lmm_stripe_size: 1048576
 lmm_pattern: 1
 lmm_layout_gen: 0
 lmm_stripe_offset: 21
 lmm_objects:

0: { l_ost_idx: 21, l_fid: [0x100150000:0x215567:0x0] }

lcme_id: 2
 lcme_flags: 0
 lcme_extent.e_start: 8388608
 lcme_extent.e_end: 17179869184
 lmm_stripe_count: 4
 lmm_stripe_size: 1048576
 lmm_pattern: 1
 lmm_layout_gen: 65535
 lmm_stripe_offset: -1
 lcme_id: 3
 lcme_flags: 0
 lcme_extent.e_start: 17179869184
 lcme_extent.e_end: 68719476736
 lmm_stripe_count: 8
 lmm_stripe_size: 1048576
 lmm_pattern: 1
 lmm_layout_gen: 65535
 lmm_stripe_offset: -1
 lcme_id: 4
 lcme_flags: 0
 lcme_extent.e_start: 68719476736
 lcme_extent.e_end: EOF
 lmm_stripe_count: 16
 lmm_stripe_size: 1048576
 lmm_pattern: 1
 lmm_layout_gen: 65535
 lmm_stripe_offset: -1

=========================================
nbp13-srv2 ~ # objid=`printf &lt;span class=&quot;code-quote&quot;&gt;&quot;%i\n&quot;&lt;/span&gt; 0x215567
nbp13-srv2 ~ # debugfs -c -R &lt;span class=&quot;code-quote&quot;&gt;&quot;stat O/0/d$((objid % 32))/$objid&quot;&lt;/span&gt; /dev/mapper/nbp13_1-OST21
 debugfs 1.44.3.wc1 (23-July-2018)
 /dev/mapper/nbp13_1-OST21: catastrophic mode - not reading inode or group bitmaps
 O/0/d7/2184551: File not found by ext2_lookup&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 lfs getstripe PrfToolParametersTest.class
PrfToolParametersTest.class
  lcm_layout_gen:  4
  lcm_entry_count: 4
    lcme_id:             1
    lcme_flags:          init
    lcme_extent.e_start: 0
    lcme_extent.e_end:   8388608
      lmm_stripe_count:  1
      lmm_stripe_size:   1048576
      lmm_pattern:       1
      lmm_layout_gen:    0
      lmm_stripe_offset: 13
      lmm_objects:
      - 0: { l_ost_idx: 13, l_fid: [0x1000d0000:0x2156d1:0x0] }

    lcme_id:             2
    lcme_flags:          0
    lcme_extent.e_start: 8388608
    lcme_extent.e_end:   17179869184
      lmm_stripe_count:  4
      lmm_stripe_size:   1048576
      lmm_pattern:       1
      lmm_layout_gen:    65535
      lmm_stripe_offset: -1
    lcme_id:             3
    lcme_flags:          0
    lcme_extent.e_start: 17179869184
    lcme_extent.e_end:   68719476736
      lmm_stripe_count:  8
      lmm_stripe_size:   1048576
      lmm_pattern:       1
      lmm_layout_gen:    65535
      lmm_stripe_offset: -1
    lcme_id:             4
    lcme_flags:          0
    lcme_extent.e_start: 68719476736
    lcme_extent.e_end:   EOF
      lmm_stripe_count:  16
      lmm_stripe_size:   1048576
      lmm_pattern:       1
      lmm_layout_gen:    65535
      lmm_stripe_offset: -1
=================================
objid=`printf &lt;span class=&quot;code-quote&quot;&gt;&quot;%i\n&quot;&lt;/span&gt; 0x2156d1`
 debugfs -c -R &lt;span class=&quot;code-quote&quot;&gt;&quot;stat O/0/d$((objid % 32))/$objid&quot;&lt;/span&gt; /dev/mapper/nbp13_1-OST13 
debugfs 1.44.3.wc1 (23-July-2018)
/dev/mapper/nbp13_1-OST13: catastrophic mode - not reading inode or group bitmaps
O/0/d17/2184913: File not found by ext2_lookup 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;so missing objects.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="236132" author="gerrit" created="Thu, 1 Nov 2018 10:02:22 +0000"  >&lt;p&gt;Andreas Dilger (adilger@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/33546&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/33546&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11584&quot; title=&quot;kernel BUG at ldiskfs.h:1907!&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11584&quot;&gt;&lt;del&gt;LU-11584&lt;/del&gt;&lt;/a&gt; osd-ldiskfs: fix lost+found object replace&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 8f98c54f9d7207c2a3f10f06cb913359cbf65a6d&lt;/p&gt;</comment>
                            <comment id="236135" author="gerrit" created="Thu, 1 Nov 2018 10:54:21 +0000"  >&lt;p&gt;Alex Zhuravlev (bzzz@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/33547&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/33547&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11584&quot; title=&quot;kernel BUG at ldiskfs.h:1907!&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11584&quot;&gt;&lt;del&gt;LU-11584&lt;/del&gt;&lt;/a&gt; osd: OI scrub to remove corrupted LMA&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 14674945f442e1dccdf2ed4cfd81eb2a2a55e1cf&lt;/p&gt;</comment>
                            <comment id="236152" author="mhanafi" created="Thu, 1 Nov 2018 16:24:47 +0000"  >&lt;p&gt;are the patches ready for us to try?&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="236153" author="bzzz" created="Thu, 1 Nov 2018 16:30:28 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=mhanafi&quot; class=&quot;user-hover&quot; rel=&quot;mhanafi&quot;&gt;mhanafi&lt;/a&gt; not yet, still in testing..&lt;/p&gt;</comment>
                            <comment id="236241" author="jaylan" created="Fri, 2 Nov 2018 16:24:57 +0000"  >&lt;p&gt;File lustre/include/uapi/linux/lustre/lustre_user.h does not exist in 2.10.5.&lt;br/&gt;
It looks like the file resides at /lustre/include/lustre/lustre_user.h in b2_10. Is it safe for me to apply the change to that file or should I wait for your back port?&lt;/p&gt;</comment>
                            <comment id="236242" author="bzzz" created="Fri, 2 Nov 2018 16:29:05 +0000"  >&lt;p&gt;I&apos;m making a port right now.&lt;/p&gt;</comment>
                            <comment id="236247" author="gerrit" created="Fri, 2 Nov 2018 18:32:19 +0000"  >&lt;p&gt;Alex Zhuravlev (bzzz@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/33560&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/33560&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11584&quot; title=&quot;kernel BUG at ldiskfs.h:1907!&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11584&quot;&gt;&lt;del&gt;LU-11584&lt;/del&gt;&lt;/a&gt; osd: OI scrub to ignore object with broken LMA&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_10&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 98b64203037c12c9df93b8d17e07370d05372b9c&lt;/p&gt;</comment>
                            <comment id="236248" author="bzzz" created="Fri, 2 Nov 2018 18:35:47 +0000"  >&lt;p&gt;first of all, you need to apply the patch and rebuild Lustre. the packages need to be installed on MDS, and OSS in case there is similar (but unseen) corruption there.&lt;/p&gt;

&lt;p&gt;Then it makes sense to estimate amount of recovery needed (i.e. identify files with broken LMA).&lt;/p&gt;

&lt;p&gt;Do the follow steps (on the MDS):&lt;/p&gt;

&lt;p&gt;1) mount MDS with OI scrub disabled:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt; mount -t lustre -o user_xattr,noscrub &amp;lt;mdt device&amp;gt; &amp;lt;mdt mountpoint&amp;gt;&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;2) set debug level for subsequent analysis:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lctl set_param debug=+lfsck&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;3) start LFSCK in read-only mode:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt; lctl lfsck_start -M nbp13_1-MDT0000 -t namespace -r --dryrun&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;4) wait for LFSCK completion checking status:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lctl get_param -n  mdd.*.lfsck_namespace | egrep &quot;^status|inconsistent&quot;&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;5) grab and post last &lt;tt&gt;lfsck_namespace&lt;/tt&gt; status&lt;br/&gt;
6) attach Lustre kernel debug log from the MDS as well:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lctl dk | gzip -9 &amp;gt; /tmp/debug-lfsck-nbp_1-MDT0000.log.gz
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Thanks&lt;/p&gt;</comment>
                            <comment id="236259" author="adilger" created="Fri, 2 Nov 2018 20:11:56 +0000"  >&lt;p&gt;I&apos;ve reviewed the patch and the backport.  The LFSCK/scrub testing failed on the master version of the patch due to a known (unrelated) intermittent test error.  The testing on the backported patch was delayed because we just added ARM builds to all patches and this was misconfigured for b2_10, but that has been resolved.  The testing on the backported patch is expected to complete in about an hour, and no problems are expected.&lt;/p&gt;

&lt;p&gt;It should be noted that this patch to LFSCK is intended to repair the specific LMA corruption that is seen on this system, and is not intended for long-term inclusion in your production release.  There is no expectation of problems in the short term, but the fix bypasses specific consistency checks in the code that should be restored before the system is upgraded, and a different patch will be landed for long-term production use.&lt;/p&gt;

&lt;p&gt;The above procedure is running LFSCK in &quot;dry run&quot; mode, so no fixes will be made to the filesystem, only a report of the number of files that will be repaired.  If the dry run is successful and the number of files being repaired is consistent with expectations, I&apos;d recommend to run in fixing mode (remove &quot;&lt;tt&gt;--dryrun&lt;/tt&gt;&quot; option) on the &quot;test&quot; filesystem and/or MDT backup image to ensure it fixes the problem.  Please attach logs to the ticket when LFSCK is finished, or if you have problems.&lt;/p&gt;</comment>
                            <comment id="236261" author="jaylan" created="Fri, 2 Nov 2018 21:11:09 +0000"  >&lt;p&gt;Thanks for the update, Andreas and Alex~&lt;/p&gt;</comment>
                            <comment id="236282" author="mhanafi" created="Sat, 3 Nov 2018 17:48:47 +0000"  >&lt;p&gt;We haven&apos;t ran the new code but here is one more example: Is this bad lma on the OST object?&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
[325981.396812] Lustre: Skipped 3 previous similar messages
[326747.450553] Lustre: nbp13-OST0001: trigger OI scrub by RPC &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; the [0x100010000:0x2155af:0x0] with flags 0x4a, rc = 0
[326747.482740] Lustre: Skipped 3 previous similar messages
[327512.978588] Lustre: nbp13-OST0001: trigger OI scrub by RPC &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; the [0x100010000:0x2155af:0x0] with flags 0x4a, rc = 0
[327513.010762] Lustre: Skipped 3 previous similar messages
[328279.688198] Lustre: nbp13-OST0001: trigger OI scrub by RPC &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; the [0x100010000:0x2155af:0x0] with flags 0x4a, rc = 0
[328279.720378] Lustre: Skipped 3 previous similar messages
nbp13-srv1 ~ # objid=`printf &lt;span class=&quot;code-quote&quot;&gt;&quot;%i&quot;&lt;/span&gt; 0x2155af`
nbp13-srv1 ~ # debugfs -c -R &lt;span class=&quot;code-quote&quot;&gt;&quot;stat O/0/d$((objid % 32))/$objid&quot;&lt;/span&gt; /dev/mapper/nbp13_1-OST1
debugfs 1.44.3.wc1 (23-July-2018)
/dev/mapper/nbp13_1-OST1: catastrophic mode - not reading inode or group bitmaps
Inode: 1673602   Type: regular    Mode:  0666   Flags: 0x80000
Generation: 2828099384    Version: 0x00000003:005e7593
User: 30757   Group: 41548   Project:     0   Size: 2180
File ACL: 0
Links: 2   Blockcount: 8
Fragment:  Address: 0    &lt;span class=&quot;code-object&quot;&gt;Number&lt;/span&gt;: 0    Size: 0
 ctime: 0x5bd11ce7:00000000 -- Wed Oct 24 18:31:19 2018
 atime: 0x5bd11ce8:00000000 -- Wed Oct 24 18:31:20 2018
 mtime: 0x5bd11ce7:00000000 -- Wed Oct 24 18:31:19 2018
crtime: 0x5bd11c77:03872348 -- Wed Oct 24 18:29:27 2018
Size of extra inode fields: 32
Extended attributes:
  trusted.lma (24) = 08 00 00 00 00 00 00 00 00 00 01 00 01 00 00 00 ae 55 21 00 00 00 00 00 
  lma: fid=[0x100010000:0x2155ae:0x0] compat=8 incompat=0
  trusted.fid (44)
  fid: parent=[0x200002101:0x66b8:0x0] stripe=0 stripe_size=1048576 stripe_count=1 component_id=1 component_start=0 component_end=8388608
EXTENTS:
(0):3426781548



tpfe2 ~ # lfs fid2path /nobackupp13 0x200002101:0x66b8:0x0
/nobackupp13/quarantine/spocops/git/sector/spoc/code/dist/dist/classes/java/main/gov/nasa/tess/dv/outputs/DvAbstractTargetTableData$Builder.class

tpfe2 ~ # ls -l /nobackupp13/quarantine/spocops/git/sector/spoc/code/dist/dist/classes/java/main/gov/nasa/tess/dv/outputs/DvAbstractTargetTableData
ls: cannot access &lt;span class=&quot;code-quote&quot;&gt;&apos;/nobackupp13/quarantine/spocops/git/sector/spoc/code/dist/dist/classes/java/main/gov/nasa/tess/dv/outputs/DvAbstractTargetTableData&apos;&lt;/span&gt;: No such file or directory
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="236285" author="mhanafi" created="Sat, 3 Nov 2018 19:36:44 +0000"  >&lt;p&gt;I ran lfsck on nbp15 which has the same issues as 13. We are planning on reformatting it.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/31399/31399_debug-lfsck-nbp15-MDT0000.gz&quot; title=&quot;debug-lfsck-nbp15-MDT0000.gz attached to LU-11584&quot;&gt;debug-lfsck-nbp15-MDT0000.gz&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;&lt;/p&gt;</comment>
                            <comment id="236287" author="adilger" created="Sat, 3 Nov 2018 20:09:48 +0000"  >&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;nbp13-srv1 ~ # objid=`printf &quot;%i&quot; 0x2155af`
nbp13-srv1 ~ # debugfs -c -R &quot;stat O/0/d$((objid % 32))/$objid&quot; /dev/mapper/nbp13_1-OST1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;FYI, if you have the hex value for the object ID, you could directly use:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;debugfs -c -R &quot;stat O/0/d$((0x2155af % 32))/$((0x2155af))&quot; /dev/mapper/nbp13_1-OST1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;In any case, what is strange is that this is object ID being looked up is &lt;tt&gt;0x2155af&lt;/tt&gt;, but the object that is found reports itself to be &lt;tt&gt;0x2155ae&lt;/tt&gt;:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Extended attributes:
  lma: fid=[0x100010000:0x2155ae:0x0] compat=8 incompat=0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Based on the fid2path output, it looks like this object is actually &lt;tt&gt;0x2155ae&lt;/tt&gt;, so it should be renamed from &quot;&lt;tt&gt;/O/0/d15/2184623&lt;/tt&gt;&quot; to &quot;&lt;tt&gt;/O/0/d14/2184622&lt;/tt&gt;&quot;.  It isn&apos;t clear why OI Scrub is not repairing this automatically.&lt;/p&gt;</comment>
                            <comment id="236288" author="adilger" created="Sat, 3 Nov 2018 20:57:20 +0000"  >&lt;p&gt;I did see something interesting in the debug log... One of the files that LFSCK complained about was:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;osd_handler.c:6401:osd_dirent_check_repair()) nbp15-MDT0000: the target inode does not recognize the dirent, dir = 237857984/19940587,  name = kplr011027624-2012004120508_llc.fits, ino = 237860402, [0x2000013af:0x8f13:0x0]: rc = -61
osd_handler.c:6401:osd_dirent_check_repair()) nbp15-MDT0000: the target inode does not recognize the dirent, dir = 238340766/19942571,  name = kplr005385471-2009259160929_llc.fits, ino = 238345690, [0x2000013ae:0x8f1c:0x0]: rc = -61
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The filenames both end in &quot;&lt;tt&gt;llc.fits&lt;/tt&gt;&quot; which is the same ASCII string that was corrupting the LMA FID.  This is returning &quot;&lt;tt&gt;-61 = -ENODATA&lt;/tt&gt;&quot; which Alex&apos;s patch is supposed to do when it finds a corrupted LMA FID, but it doesn&apos;t look like it repaired them:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
        rc = osd_get_lma(info, inode, dentry, &amp;amp;info-&amp;gt;oti_ost_attrs);
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (rc == -ENODATA || !fid_is_sane(&amp;amp;lma-&amp;gt;lma_self_fid))
                lma = NULL;
        :
        :
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (!fid_is_zero(fid)) {
                rc = osd_verify_ent_by_linkea(env, inode, pfid, ent-&amp;gt;oied_name,
                                              ent-&amp;gt;oied_namelen);
                &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (rc == -ENOENT ||
                    (rc == -ENODATA &amp;amp;&amp;amp;
                     !(dev-&amp;gt;od_scrub.os_scrub.os_file.sf_flags &amp;amp; SF_UPGRADE))) {
                        /*
                         * linkEA does not recognize the dirent entry,
                         * it may because the dirent entry corruption
                         * and points to other&apos;s inode.
                         */
                        CDEBUG(D_LFSCK, &lt;span class=&quot;code-quote&quot;&gt;&quot;%s: the target inode does not &quot;&lt;/span&gt;
                               &lt;span class=&quot;code-quote&quot;&gt;&quot;recognize the dirent, dir = %lu/%u, &quot;&lt;/span&gt;
                               &lt;span class=&quot;code-quote&quot;&gt;&quot; name = %.*s, ino = %llu, &quot;&lt;/span&gt;
                               DFID&lt;span class=&quot;code-quote&quot;&gt;&quot;: rc = %d\n&quot;&lt;/span&gt;, devname, dir-&amp;gt;i_ino,
                               dir-&amp;gt;i_generation, ent-&amp;gt;oied_namelen,
                               ent-&amp;gt;oied_name, ent-&amp;gt;oied_ino, PFID(fid), rc);
                        *attr |= LUDA_UNKNOWN;

                        GOTO(out, rc = 0);
                }

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;I&apos;d suspect that this is because the linkEA (&quot;&lt;tt&gt;link&lt;/tt&gt;&quot; xattr which is also stored in the inode) is also missing?  It looks like we need to set the &lt;tt&gt;SF_UPGRADE&lt;/tt&gt; flag (maybe renamed to &quot;&lt;tt&gt;SF_REBUILD_LMA&lt;/tt&gt;&quot;) if the LMA has been removed (rc = -ENODATA) so that we fall through to the LMA repair code further down?  We can&apos;t check for the &lt;tt&gt;LMAC_INIT_FID&lt;/tt&gt; flag, since it is stored in the LMA itself, which is missing here.&lt;/p&gt;</comment>
                            <comment id="236289" author="mhanafi" created="Sat, 3 Nov 2018 21:51:26 +0000"  >&lt;p&gt;Here is the nbp13 lfsck runs.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;

 nbp13-srv1 ~ # lctl get_param -n  mdd.*.lfsck_namespace
name: lfsck_namespace
magic: 0xa0621a0b
version: 2
status: completed
flags: inconsistent
param: dryrun
last_completed_time: 1541281433
time_since_last_completed: 341 seconds
latest_start_time: 1541281072
time_since_latest_start: 702 seconds
last_checkpoint_time: 1541281433
time_since_last_checkpoint: 341 seconds
latest_start_position: 77, N/A, N/A
last_checkpoint_position: 317719759, N/A, N/A
first_failure_position: 153388517, [0x2000020af:0x39d9:0x0], 0x753a410c57f07b3
checked_phase1: 30987846
checked_phase2: 111
inconsistent_phase1: 2
inconsistent_phase2: 3
failed_phase1: 21
failed_phase2: 3
directories: 2709152
dirent_inconsistent: 0
linkea_inconsistent: 2
nlinks_inconsistent: 0
multiple_linked_checked: 5
multiple_linked_inconsistent: 0
unknown_inconsistency: 0
unmatched_pairs_inconsistent: 0
dangling_inconsistent: 0
multiple_referenced_inconsistent: 3
bad_file_type_inconsistent: 0
lost_dirent_inconsistent: 0
local_lost_found_scanned: 3
local_lost_found_moved: 3
local_lost_found_skipped: 0
local_lost_found_failed: 0
striped_dirs_scanned: 0
striped_dirs_inconsistent: 0
striped_dirs_failed: 0
striped_dirs_disabled: 0
striped_dirs_skipped: 0
striped_shards_scanned: 0
striped_shards_inconsistent: 0
striped_shards_failed: 0
striped_shards_skipped: 0
name_hash_inconsistent: 0
linkea_overflow_inconsistent: 0
success_count: 3
run_time_phase1: 362 seconds
run_time_phase2: 0 seconds
average_speed_phase1: 85601 items/sec
average_speed_phase2: 111 objs/sec
average_speed_total: 85366 items/sec
real_time_speed_phase1: N/A
real_time_speed_phase2: N/A
current_position: N/A
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt; &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/31400/31400_nbp13.lfsck.debug.out2.gz&quot; title=&quot;nbp13.lfsck.debug.out2.gz attached to LU-11584&quot;&gt;nbp13.lfsck.debug.out2.gz&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;  &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/31401/31401_nbp13.lfsck.debug.out1.gz&quot; title=&quot;nbp13.lfsck.debug.out1.gz attached to LU-11584&quot;&gt;nbp13.lfsck.debug.out1.gz&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt; &lt;/p&gt;</comment>
                            <comment id="236307" author="gerrit" created="Mon, 5 Nov 2018 06:00:17 +0000"  >&lt;p&gt;Li Dongyang (dongyangli@ddn.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/33576&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/33576&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11584&quot; title=&quot;kernel BUG at ldiskfs.h:1907!&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11584&quot;&gt;&lt;del&gt;LU-11584&lt;/del&gt;&lt;/a&gt; e2fsck: check xattr &apos;system.data&apos; before setting inline_data feature&lt;br/&gt;
Project: tools/e2fsprogs&lt;br/&gt;
Branch: master-lustre&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 64b71635ffa84a01946199e3cd31b1ee9fd9a15f&lt;/p&gt;</comment>
                            <comment id="236347" author="mhanafi" created="Mon, 5 Nov 2018 20:00:00 +0000"  >&lt;p&gt;Any comments on the output of nbp13.lfsck?&lt;/p&gt;</comment>
                            <comment id="236348" author="bzzz" created="Mon, 5 Nov 2018 20:02:56 +0000"  >&lt;p&gt;I&apos;m modifying the test to simulate additional broken LinkEA, going to report results ASAP.&lt;/p&gt;</comment>
                            <comment id="236349" author="bzzz" created="Mon, 5 Nov 2018 20:05:55 +0000"  >&lt;p&gt;I still don&apos;t understand why the nbp13 log doesn&apos;t contain &quot;unsupported incompat LMA feature&quot; message.&lt;/p&gt;</comment>
                            <comment id="237156" author="mhanafi" created="Sun, 18 Nov 2018 08:01:02 +0000"  >&lt;p&gt;I was able to find all the inodes with bad LMA and delete them via ldiskfs. So what we have left are files that trigger OI scrub and that report &quot;?&quot; for size/uid/etc. The user has been able to recover all the affected files, so we just need a way to delete the files.&lt;/p&gt;

&lt;p&gt;If we delete the files via ldiskfs how can we make sure that the objects will be cleaned up.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="237366" author="adilger" created="Wed, 21 Nov 2018 23:57:29 +0000"  >&lt;p&gt;Mahmoud, the orphan OST objects can be cleaned up with LFSCK &lt;tt&gt;layout&lt;/tt&gt; checking.  The orphans are linked into the &lt;tt&gt;$MOUNT/.lustre/lost+found&lt;/tt&gt; directory if &quot;&lt;tt&gt;lctl lfsck_start -o -t layout&lt;/tt&gt;&quot; is used (the &quot;-o&quot; option can be used as part of a full LFSCK run as well).&lt;/p&gt;</comment>
                            <comment id="238105" author="mhanafi" created="Thu, 6 Dec 2018 18:35:56 +0000"  >&lt;p&gt;Open new prio1 case &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11737&quot; title=&quot;LustreError: 11060:0:(osd_handler.c:3985:osd_xattr_set()) ASSERTION( handle ) failed: &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11737&quot;&gt;&lt;del&gt;LU-11737&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;After deleting the quarantined files, we are hitting an LBUG.&lt;/p&gt;</comment>
                            <comment id="242879" author="gerrit" created="Wed, 27 Feb 2019 02:00:46 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/33546/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/33546/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11584&quot; title=&quot;kernel BUG at ldiskfs.h:1907!&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11584&quot;&gt;&lt;del&gt;LU-11584&lt;/del&gt;&lt;/a&gt; osd-ldiskfs: fix lost+found object replace&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 900352f2bc15906a8fba9cb889df4b166a53bade&lt;/p&gt;</comment>
                            <comment id="258792" author="jgmitter" created="Mon, 25 Nov 2019 20:20:55 +0000"  >&lt;p&gt;Patch landed to master.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10324">
                    <name>Cloners</name>
                                                                <inwardlinks description="is cloned by">
                                        <issuelink>
            <issuekey id="53868">LU-11589</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="47671">LU-9836</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="53865">LU-11588</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="53849">LU-11583</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="54208">LU-11737</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="31399" name="debug-lfsck-nbp15-MDT0000.gz" size="61197" author="mhanafi" created="Sat, 3 Nov 2018 19:36:38 +0000"/>
                            <attachment id="31356" name="dumpe2fs.out" size="37008" author="mhanafi" created="Tue, 30 Oct 2018 00:16:19 +0000"/>
                            <attachment id="31379" name="nbp13.debug.gz" size="25962147" author="mhanafi" created="Tue, 30 Oct 2018 21:18:13 +0000"/>
                            <attachment id="31401" name="nbp13.lfsck.debug.out1.gz" size="303637" author="mhanafi" created="Sat, 3 Nov 2018 21:50:57 +0000"/>
                            <attachment id="31400" name="nbp13.lfsck.debug.out2.gz" size="3964" author="mhanafi" created="Sat, 3 Nov 2018 21:50:57 +0000"/>
                            <attachment id="31380" name="oi_scrub.out" size="6176" author="mhanafi" created="Tue, 30 Oct 2018 21:34:31 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i005bb:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10020"><![CDATA[1]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>