<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:21:05 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
<language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-1948] ldiskfs - MDS goes read-only (SWL)</title>
                <link>https://jira.whamcloud.com/browse/LU-1948</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Sep 16 01:28:12 hyperion-rst6 kernel: LDISKFS-fs error (device md1): ldiskfs_add_entry:&lt;br/&gt;
Sep 16 01:28:12 hyperion-rst6 kernel: LDISKFS-fs error (device md1): ldiskfs_add_entry: bad entry in directory #88606751: rec_len is smaller than minimal - block=44369457offset=536(536), inode=88627711, rec_len=0, name_len=4&lt;br/&gt;
Sep 16 01:28:12 hyperion-rst6 kernel: Aborting journal on device md1-8.&lt;br/&gt;
Sep 16 01:28:12 hyperion-rst6 kernel: LDISKFS-fs error (device md1) in ldiskfs_reserve_inode_write: Journal has aborted&lt;br/&gt;
Sep 16 01:28:12 hyperion-rst6 kernel: LDISKFS-fs (md1): Remounting filesystem read-only&lt;br/&gt;
Sep 16 01:28:12 hyperion-rst6 kernel: LDISKFS-fs (md1): Remounting filesystem read-only&lt;br/&gt;
Sep 16 01:28:12 hyperion-rst6 kernel: LDISKFS-fs error (device md1) in ldiskfs_new_inode: Journal has aborted&lt;br/&gt;
Sep 16 01:28:12 hyperion-rst6 kernel: LDISKFS-fs error (device md1) in ldiskfs_delete_inode: Journal has aborted&lt;br/&gt;
Sep 16 01:28:12 hyperion-rst6 kernel: LustreError: 4489:0:(osd_io.c:1014:osd_ldiskfs_write_record()) journal_get_write_access() returned error -30&lt;br/&gt;
Sep 16 01:28:12 hyperion-rst6 kernel: LustreError: 4186:0:(osd_handler.c:894:osd_trans_stop()) Failure in transaction hook: -30&lt;/p&gt;

&lt;p&gt;Disk appeared to be quite messed up with fsck -fy. Ran the data capture script from lu-1015, results attached.&lt;/p&gt;
</description>
                <environment>SWL - Hyperion/LLNL RHEL6 servers and clients</environment>
        <key id="15991">LU-1948</key>
            <summary>ldiskfs - MDS goes read-only (SWL)</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="3">Duplicate</resolution>
                                        <assignee username="di.wang">Di Wang</assignee>
                                    <reporter username="cliffw">Cliff White</reporter>
                        <labels>
                    </labels>
                <created>Sun, 16 Sep 2012 11:41:43 +0000</created>
                <updated>Tue, 9 Oct 2012 00:55:11 +0000</updated>
                            <resolved>Mon, 1 Oct 2012 17:20:18 +0000</resolved>
                                    <version>Lustre 2.3.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>8</watches>
                                                                            <comments>
                            <comment id="45020" author="pjones" created="Mon, 17 Sep 2012 08:09:45 +0000"  >&lt;p&gt;Bobijam&lt;/p&gt;

&lt;p&gt;Is there anything that you can determine from this report?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="45064" author="di.wang" created="Mon, 17 Sep 2012 13:47:39 +0000"  >&lt;p&gt;Cliff, Could you tell me more about your test? on lu-1015, I saw most discussion is about OST fails and run fsck. &lt;br/&gt;
Did you do the same thing about MDS? run tests, uncleanup power off, fsck -p (found problem), then run fsck -fy, start MDS again, found these problem? What test did you run? Could you please post the script here?  And I assume you use e2fsprogs-1.42.3.wc3? &lt;/p&gt;
</comment>
                            <comment id="45066" author="cliffw" created="Mon, 17 Sep 2012 13:55:14 +0000"  >&lt;p&gt;The test being run is SWL, which is essentially a large mix of multiple tasks (ior,fdtree,mib,etc) In this case, the MDS went read-only in the&lt;br/&gt;
middle of the test. I halted the MDS and ran the data-dump script created for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1015&quot; title=&quot;ldiskfs corruption with large LUNs&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1015&quot;&gt;&lt;del&gt;LU-1015&lt;/del&gt;&lt;/a&gt;. e2fsprogs was 1.41.90-wc4-7, i will reinstall newer.&lt;/p&gt;</comment>
                            <comment id="45067" author="cliffw" created="Mon, 17 Sep 2012 14:02:06 +0000"  >&lt;p&gt;current script:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
#!/bin/bash
ostdev=$1
logdir=/home/white215/wham/lu1015/logs/
stamp=`date +%Y%m%d.%M%S`
dumpe2fs /dev/$ostdev &amp;gt; $logdir/$ostdev.$stamp.stats
debugfs -c -R &lt;span class=&quot;code-quote&quot;&gt;&quot;dump &amp;lt;8&amp;gt;&quot;&lt;/span&gt; /dev/$ostdev &amp;gt; $logdir/$ostdev.$stamp.journal
debugfs -c -R &lt;span class=&quot;code-quote&quot;&gt;&quot;logdump -a&quot;&lt;/span&gt; /dev/$ostdev &amp;gt; $logdir/$ostdev.$stamp.logdump
e2fsck -fp /dev/$ostdev 2&amp;gt;&amp;amp;1 | tee $logdir/$ostdev.$stamp.e2fsck
dumpe2fs /dev/$ostdev &amp;gt; $logdir/$ostdev.$stamp.stats.post
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="45073" author="di.wang" created="Mon, 17 Sep 2012 14:45:45 +0000"  >&lt;p&gt;Thanks, I see. Is the test started from a fresh reformatted MDS? or the MDS was just been fixed by e2fsck -fy before the run? If the MDS has not been changed yet, could you find out the directory name of 88606751? I guess you can just mount it as ldiskfs, and get it by find /mnt/mds -inum 88606751  Thanks.&lt;/p&gt;</comment>
                            <comment id="45099" author="cliffw" created="Mon, 17 Sep 2012 21:01:58 +0000"  >&lt;p&gt;MDS just went read-only, then crashed when I attempted a umount -f&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
2012-09-17 17:44:00 Lustre: 4130:0:(service.c:2101:ptlrpc_handle_rs()) All locks stolen from rs ffff880132a5d000 x1413385599017471.t4342523245 o0 NID 192.168.119.37@o2ib1
2012-09-17 17:44:00 Lustre: 4130:0:(service.c:2101:ptlrpc_handle_rs()) Skipped 402 previous similar messages
2012-09-17 17:44:00 LustreError: 5729:0:(mdt_recovery.c:611:mdt_steal_ack_locks()) Skipped 426 previous similar messages
2012-09-17 17:47:49 LDISKFS-fs error (device md1): ldiskfs_add_entry:
2012-09-17 17:47:49 LDISKFS-fs error (device md1): ldiskfs_add_entry: bad entry in directory #116917774: rec_len is smaller than minimal - block=58525219offset=312(312), inode=117441145, rec_len=0, name_len=4
2012-09-17 17:47:49 Aborting journal on device md1-8.  
2012-09-17 17:47:49 LDISKFS-fs error (device md1) in ldiskfs_reserve_inode_write: Journal has aborted
2012-09-17 17:47:49 LDISKFS-fs error (device md1) in ldiskfs_reserve_inode_write: Journal has aborted
2012-09-17 17:47:49 LDISKFS-fs error (device md1) in ldiskfs_reserve_inode_write: Journal has aborted
2012-09-17 17:47:49 LDISKFS-fs error (device md1) in ldiskfs_reserve_inode_write: Journal has aborted
2012-09-17 17:47:49 LDISKFS-fs (md1):
2012-09-17 17:47:49 LustreError: 5731:0:(osd_io.c:1014:osd_ldiskfs_write_record()) journal_get_write_access() returned error -30
2012-09-17 17:47:49 LDISKFS-fs (md1):
2012-09-17 17:47:49 LDISKFS-fs (md1):
2012-09-17 17:47:49 LDISKFS-fs (md1): Remounting filesystem read-onlyRemounting filesystem read-onlyRemounting filesystem read-only
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Kernel BUG&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
2012-09-17 17:49:57 LDISKFS-fs error (device md1): ldiskfs_mb_release_inode_pa: pa free mismatch: [pa ffff88031438e748] [phy 190752] [logic 608] [len 16] [free 13] [error 0] [inode 7340126] [freed 16]
2012-09-17 17:49:57 LDISKFS-fs error (device md1): ldiskfs_mb_release_inode_pa: free 16, pa_free 13
2012-09-17 17:49:57 ------------[ cut here ]------------
2012-09-17 17:49:57 kernel BUG at /&lt;span class=&quot;code-keyword&quot;&gt;var&lt;/span&gt;/lib/jenkins/workspace/lustre-b2_3/arch/x86_64/build_type/server/distro/el6/ib_stack/inkernel/BUILD/BUILD/lustre-ldiskfs-3.3.0/ldiskfs/mballoc.c:3784!
2012-09-17 17:49:57 invalid opcode: 0000 [#1] SMP
2012-09-17 17:49:57 last sysfs file: /sys/devices/system/cpu/cpu15/cache/index2/shared_cpu_map
2012-09-17 17:49:57 CPU 14
2012-09-17 17:49:57 Modules linked in: cmm(U) osd_ldiskfs(U) mdt(U) mdd(U) mds(U) fsfilt_ldiskfs(U) exportfs mgs(U) mgc(U) lustre(U) lquota(U) lov(U) osc(U) mdc(U) fid(U) fld(U) ptlrpc(U) obdclass(U) lvfs(U) ldiskfs(U) mbcache jbd2 zfs(P)(U) zcommon(P)(U) znvpair(P)(U) zavl(P)(U) zunicode(P)(U) spl(U) zlib_deflate ko2iblnd(U) lnet(U) sha512_generic sha256_generic libcfs(U) cpufreq_ondemand acpi_cpufreq freq_table mperf ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm ib_addr ib_sa mlx4_ib ib_mad ib_core dm_mirror dm_region_hash dm_log dm_mod vhost_net macvtap macvlan tun kvm raid0 sg sr_mod cdrom sd_mod crc_t10dif dcdbas serio_raw ata_generic pata_acpi ata_piix iTCO_wdt iTCO_vendor_support mptsas mptscsih mptbase scsi_transport_sas i7core_edac edac_core ipv6 nfs lockd fscache nfs_acl auth_rpcgss sunrpc mlx4_en mlx4_core bnx2 [last unloaded: scsi_wait_scan]
2012-09-17 17:49:57
2012-09-17 17:49:57 Pid: 10207, comm: umount Tainted: P           ---------------    2.6.32-279.5.1.el6_lustre.x86_64 #1 Dell Inc. PowerEdge R610/0K399H
2012-09-17 17:49:57 RIP: 0010:[&amp;lt;ffffffffa07214b6&amp;gt;]  [&amp;lt;ffffffffa07214b6&amp;gt;] ldiskfs_mb_release_inode_pa+0x346/0x360 [ldiskfs]
2012-09-17 17:49:57 RSP: 0018:ffff88026e7152b8  EFLAGS: 00010216
2012-09-17 17:49:57 RAX: 000000000000000d RBX: 0000000000000010 RCX: ffff88032a805800
2012-09-17 17:49:57 RDX: 0000000000000000 RSI: 0000000000000046 RDI: ffff880316f3a2c0
2012-09-17 17:49:57 RBP: ffff88026e715368 R08: 0000000000000000 R09: 0000000000000000
2012-09-17 17:49:57 R10: 00000000001 R11: 0000000000000000 R12: ffff88025db65898
2012-09-17 17:49:57 R13: ffff88023f45d3a0 R14: 0000000000002931 R15: ffff88031438e748
2012-09-17 17:49:57 FS:  00002aaaab690740(0000) GS:ffff8801b58e0000(0000) knlGS:0000000000000000
2012-09-17 17:49:57 CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
2012-09-17 17:49:57 CR2: 00002aaaaace3387 CR3: 00000002b6ee5000 CR4: 00000000000006e0
2012-09-17 17:49:57 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
2012-09-17 17:49:57 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
2012-09-17 17:49:57 &lt;span class=&quot;code-object&quot;&gt;Process&lt;/span&gt; umount (pid: 10207, threadinfo ffff88026e714000, task ffff88007fa10080)
2012-09-17 17:49:57 Stack:
2012-09-17 17:49:57  ffff880200000010 000000000000000d ffff880200000000 000000000070005e
2012-09-17 17:49:57 &amp;lt;d&amp;gt; 0000000000000010 000000000000000b ffff88026e7152f8 ffffffff811ae146
2012-09-17 17:49:57 &amp;lt;d&amp;gt; ffff88032a805800 ffff88032a3b8c00 ffff88031438e748 000000000002c000
2012-09-17 17:49:57 Call Trace:

2012-09-17 17:49:57  [&amp;lt;ffffffff811ae146&amp;gt;] ? __wait_on_buffer+0x26/0x30
2012-09-17 17:49:57  [&amp;lt;ffffffffa072701e&amp;gt;] ldiskfs_discard_preallocations+0x1fe/0x490 [ldiskfs]
2012-09-17 17:49:57  [&amp;lt;ffffffffa0711077&amp;gt;] ldiskfs_release_file+0xb7/0xd0 [ldiskfs]
2012-09-17 17:49:57  [&amp;lt;ffffffff8117ca65&amp;gt;] __fput+0xf5/0x210
2012-09-17 17:49:57  [&amp;lt;ffffffff8117cba5&amp;gt;] fput+0x25/0x30
2012-09-17 17:49:57  [&amp;lt;ffffffff811785cd&amp;gt;] filp_close+0x5d/0x90
2012-09-17 17:49:57  [&amp;lt;ffffffffa07a1dbe&amp;gt;] llog_lvfs_close+0x2e/0x130 [obdclass]
2012-09-17 17:49:57  [&amp;lt;ffffffffa079afff&amp;gt;] llog_close+0x5f/0x1b0 [obdclass]
2012-09-17 17:49:57  [&amp;lt;ffffffffa079e92d&amp;gt;] llog_cat_put+0x5d/0x160 [obdclass]
2012-09-17 17:49:57  [&amp;lt;ffffffffa07a76bc&amp;gt;] llog_obd_origin_cleanup+0x1bc/0x460 [obdclass]
2012-09-17 17:49:57  [&amp;lt;ffffffffa07a7366&amp;gt;] __llog_ctxt_put+0xc6/0x260 [obdclass]
2012-09-17 17:49:57  [&amp;lt;ffffffffa07a7a0c&amp;gt;] llog_cleanup+0xac/0x490 [obdclass]
2012-09-17 17:49:57  [&amp;lt;ffffffffa0b7b6ef&amp;gt;] osc_llog_finish+0x7f/0x250 [osc]
2012-09-17 17:49:57  [&amp;lt;ffffffffa07a6438&amp;gt;] obd_llog_finish+0x88/0x1a0 [obdclass]
2012-09-17 17:49:57  [&amp;lt;ffffffffa0b76dca&amp;gt;] osc_precleanup+0x2ea/0x3b0 [osc]
2012-09-17 17:49:57  [&amp;lt;ffffffffa07d00c7&amp;gt;] class_cleanup+0x1f7/0xdc0 [obdclass]
2012-09-17 17:49:58  [&amp;lt;ffffffffa07b12b6&amp;gt;] ? class_name2dev+0x56/0xe0 [obdclass]
2012-09-17 17:49:58  [&amp;lt;ffffffffa07d243b&amp;gt;] class_process_config+0x102b/0x1c30 [obdclass]
2012-09-17 17:49:58  [&amp;lt;ffffffffa0398be0&amp;gt;] ? cfs_alloc+0x30/0x60 [libcfs]
2012-09-17 17:49:58  [&amp;lt;ffffffffa07cbe43&amp;gt;] ? lustre_cfg_new+0x353/0x7e0 [obdclass]
2012-09-17 17:49:58  [&amp;lt;ffffffffa07d31b9&amp;gt;] class_manual_cleanup+0x179/0x6f0 [obdclass]
2012-09-17 17:49:58  [&amp;lt;ffffffffa0c09743&amp;gt;] lov_putref+0x393/0xb30 [lov]
2012-09-17 17:49:58  [&amp;lt;ffffffff814ff18e&amp;gt;] ? mutex_lock+0x1e/0x50
2012-09-17 17:49:58  [&amp;lt;ffffffffa0c1335f&amp;gt;] lov_disconnect+0x19f/0x4c0 [lov]
2012-09-17 17:49:58  [&amp;lt;ffffffffa0eb1c72&amp;gt;] mds_lov_clean+0x262/0x6f0 [mds]
2012-09-17 17:49:58  [&amp;lt;ffffffffa0eb22c6&amp;gt;] mds_precleanup+0x1c6/0x4b0 [mds]
2012-09-17 17:49:58  [&amp;lt;ffffffffa07d00c7&amp;gt;] class_cleanup+0x1f7/0xdc0 [obdclass]
2012-09-17 17:49:58  [&amp;lt;ffffffff81271fe5&amp;gt;] ? _atomic_dec_and_lock+0x55/0x80
2012-09-17 17:49:58  [&amp;lt;ffffffffa07e9827&amp;gt;] ? lu_object_put+0x157/0x290 [obdclass]
2012-09-17 17:49:58  [&amp;lt;ffffffffa0eee8c4&amp;gt;] mdd_fini_obd+0x54/0x240 [mdd]
2012-09-17 17:49:58  [&amp;lt;ffffffffa0f0d777&amp;gt;] mdd_process_config+0x2d7/0xa20 [mdd]
2012-09-17 17:49:58  [&amp;lt;ffffffff814fdca0&amp;gt;] ? thread_return+0x4e/0x76e
2012-09-17 17:49:58  [&amp;lt;ffffffffa06a3b6b&amp;gt;] cmm_process_config+0x7b/0xd10 [cmm]
2012-09-17 17:49:58  [&amp;lt;ffffffffa0f6fd8a&amp;gt;] mdt_stack_fini+0x5da/0xd40 [mdt]
2012-09-17 17:49:58  [&amp;lt;ffffffffa06a2886&amp;gt;] ? cmm_init_capa_ctxt+0x46/0x140 [cmm]
2012-09-17 17:49:58  [&amp;lt;ffffffffa0f707c5&amp;gt;] mdt_device_fini+0x2d5/0x5f0 [mdt]
2012-09-17 17:49:58  [&amp;lt;ffffffffa07d0447&amp;gt;] class_cleanup+0x577/0xdc0 [obdclass]
2012-09-17 17:49:58  [&amp;lt;ffffffffa07b12b6&amp;gt;] ? class_name2dev+0x56/0xe0 [obdclass]
2012-09-17 17:49:58  [&amp;lt;ffffffffa07d243b&amp;gt;] class_process_config+0x102b/0x1c30 [obdclass]
2012-09-17 17:49:58  [&amp;lt;ffffffffa0398be0&amp;gt;] ? cfs_alloc+0x30/0x60 [libcfs]
2012-09-17 17:49:58  [&amp;lt;ffffffffa07cbe43&amp;gt;] ? lustre_cfg_new+0x353/0x7e0 [obdclass]
2012-09-17 17:49:58  [&amp;lt;ffffffffa07d31b9&amp;gt;] class_manual_cleanup+0x179/0x6f0 [obdclass]
2012-09-17 17:49:58  [&amp;lt;ffffffffa07b12b6&amp;gt;] ? class_name2dev+0x56/0xe0 [obdclass]
2012-09-17 17:49:58  [&amp;lt;ffffffffa07dd3c9&amp;gt;] server_put_super+0x6f9/0xcf0 [obdclass]
2012-09-17 17:49:58  [&amp;lt;ffffffff8117d34b&amp;gt;] generic_shutdown_super+0x5b/0xe0
2012-09-17 17:49:58  [&amp;lt;ffffffff8117d436&amp;gt;] kill_anon_super+0x16/0x60
2012-09-17 17:49:58  [&amp;lt;ffffffffa07d4df6&amp;gt;] lustre_kill_super+0x36/0x60 [obdclass]
2012-09-17 17:49:58  [&amp;lt;ffffffff8117e4b0&amp;gt;] deactivate_super+0x70/0x90
2012-09-17 17:49:58  [&amp;lt;ffffffff8119a4ff&amp;gt;] mntput_no_expire+0xbf/0x110
2012-09-17 17:49:58  [&amp;lt;ffffffff8119af9b&amp;gt;] sys_umount+0x7b/0x3a0
2012-09-17 17:49:58  [&amp;lt;ffffffff81082de1&amp;gt;] ? sigprocmask+0x71/0x110
2012-09-17 17:49:58  [&amp;lt;ffffffff8100b0f2&amp;gt;] system_call_fastpath+0x16/0x1b
2012-09-17 17:49:58 Code: 55 c8 e9 39 fe ff ff 31 db 41 83 7f 4c 00 0f 84 7e fd ff ff 0f 0b eb fe 0f 0b eb fe 0f 0b 0f 1f 80 00 00 00 00 eb f7 0f 0b eb fe &amp;lt;0f&amp;gt; 0b 0f 1f 84 00 00 00 00 00 eb f6 66 66 66 66 66 2e 0f 1f 84
2012-09-17 17:49:58 RIP  [&amp;lt;ffffffffa07214b6&amp;gt;] ldiskfs_mb_release_inode_pa+0x346/0x360 [ldiskfs]
2012-09-17 17:49:58  RSP &amp;lt;ffff88026e7152b8&amp;gt;
2012-09-17 17:49:58 Initializing cgroup subsys cpuset
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;am rebooting the node now, will run fsck script.&lt;/p&gt;</comment>
                            <comment id="45119" author="di.wang" created="Tue, 18 Sep 2012 03:10:10 +0000"  >&lt;p&gt;I add some debug patch based on b2_3 (&lt;a href=&quot;http://review.whamcloud.com/#change,4020&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#change,4020&lt;/a&gt;   only patch on el6 ldiskfs series). build &lt;a href=&quot;http://build.whamcloud.com/job/lustre-reviews/9250/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-reviews/9250/&lt;/a&gt;  Cliff, could you please test with this rpm.  And I suspect this problem might be related with ldiskfs htree, so Liang is looking at this problem as well.&lt;/p&gt;</comment>
                            <comment id="45171" author="cliffw" created="Tue, 18 Sep 2012 17:24:36 +0000"  >&lt;p&gt;Okay, the MDS just &apos;sploded. &lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
2012-09-18 14:13:37 LDISKFS-fs error (device md1): add_dirent:
2012-09-18 14:13:37 LDISKFS-fs error (device md1): ldiskfs_add_entry: bad entry in directory #78644697: rec_len is smaller than minimal - block=39336951offset=152(152), inode=78655820, rec_len=0, name_len=4
2012-09-18 14:13:37 Aborting journal on device md1-8.
2012-09-18 14:13:37 LDISKFS-fs error (device md1): ldiskfs_journal_start_sb: Detected aborted journal
2012-09-18 14:13:37 LDISKFS-fs (md1): Remounting filesystem read-only
2012-09-18 14:13:37 LDISKFS-fs (md1): Remounting filesystem read-only
2012-09-18 14:13:37 LDISKFS-fs error (device md1): add_dirent: 78644697: lck ffff8801304ad000 off=152(78655820),inode=32, rec_len=4, name_len=0
2012-09-18 14:13:37
2012-09-18 14:13:37 LustreError: 8052:0:(osd_io.c:1014:osd_ldiskfs_write_record()) journal_get_write_access() returned error -30
2012-09-18 14:13:37 LustreError: 8052:0:(osd_handler.c:894:osd_trans_stop()) Failure in transaction hook: -30
2012-09-18 14:13:37 LustreError: 8052:0:(osd_handler.c:899:osd_trans_stop()) Failure to stop transaction: -30
2012-09-18 14:13:37 78644697: name L0F7, lck 0000000000000004, name_len=568931904
2012-09-18 14:13:37
2012-09-18 14:13:37 ------------[ cut here ]------------
2012-09-18 14:13:37 kernel BUG at /&lt;span class=&quot;code-keyword&quot;&gt;var&lt;/span&gt;/lib/jenkins/workspace/lustre-reviews/arch/x86_64/build_type/server/distro/el6/ib_stack/inkernel/BUILD/BUILD/lustre-ldiskfs-3.3.0/ldiskfs/namei.c:1757!
2012-09-18 14:13:37 invalid opcode: 0000 [#1] SMP
2012-09-18 14:13:37 last sysfs file: /sys/devices/pci0000:00/0000:00:09.0/0000:05:00.0/infiniband_mad/umad0/port
2012-09-18 14:13:37 CPU 7
2012-09-18 14:13:37 Modules linked in: cmm(U) osd_ldiskfs(U) mdt(U) mdd(U) mds(U) fsfilt_ldiskfs(U) exportfs mgs(U) mgc(U) ldiskfs(U) mbcache jbd2 lustre(U) lquota(U) lov(U) osc(U) mdc(U) fid(U) fld(U) ptlrpc(U) obdclass(U) lvfs(U) zfs(P)(U) zcommon(P)(U) znvpair(P)(U) zavl(P)(U) zunicode(P)(U) spl(U) zlib_deflate ko2iblnd(U) lnet(U) sha512_generic sha256_generic libcfs(U) cpufreq_ondemand acpi_cpufreq freq_table mperf ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm ib_addr ib_sa mlx4_ib ib_mad ib_core dm_mirror dm_region_hash dm_log dm_mod vhost_net macvtap macvlan tun kvm raid0 sg sr_mod cdrom sd_mod crc_t10dif dcdbas serio_raw ata_generic pata_acpi ata_piix iTCO_wdt iTCO_vendor_support mptsas mptscsih mptbase scsi_transport_sas i7core_edac edac_core ipv6 nfs lockd fscache nfs_acl auth_rpcgss sunrpc mlx4_en mlx4_core bnx2 [last unloaded: scsi_wait_scan]
2012-09-18 14:13:37
2012-09-18 14:13:37 Pid: 4898, comm: mdt03_008 Tainted: P           ---------------    2.6.32-279.5.1.el6_lustre.gb4cc145.x86_64 #1 Dell Inc. PowerEdge R610/0K399H

2012-09-18 14:13:37 RIP: 0010:[&amp;lt;ffffffffa0db21d0&amp;gt;]  [&amp;lt;ffffffffa0db21d0&amp;gt;] add_dirent_to_buf+0x4c0/0x530 [ldiskfs]
2012-09-18 14:13:37 RSP: 0018:ffff880121e935f0  EFLAGS: 00010246
2012-09-18 14:13:37 RAX: ffff880181248000 RBX: ffff880268654078 RCX: 00000000000014b5
2012-09-18 14:13:37 RDX: ffff880268654098 RSI: 0000000000000046 RDI: ffff88016ff93c00
2012-09-18 14:13:37 RBP: ffff880121e936b0 R08: 0000000000000000 R09: 0000000000000000
2012-09-18 14:13:37 R10: 0000000000000001 R11: 0000000000000000 R12: ffff880130e16ad0
2012-09-18 14:13:37 R13: 0000000000000000 R14: 0000000000000004 R15: 0000000000000020
2012-09-18 14:13:37 FS:  00002aaaab47e700(0000) GS:ffff880028260000(0000) knlGS:0000000000000000
2012-09-18 14:13:37 CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
2012-09-18 14:13:37 CR2: 00000000006d3a80 CR3: 0000000001a85000 CR4: 00000000000006e0
2012-09-18 14:13:37 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
2012-09-18 14:13:37 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
2012-09-18 14:13:37 &lt;span class=&quot;code-object&quot;&gt;Process&lt;/span&gt; mdt03_008 (pid: 4898, threadinfo ffff880121e92000, task ffff88012df62080)
2012-09-18 14:13:37 LustreError: 4099:0:(fsfilt-ldiskfs.c:332:fsfilt_ldiskfs_start()) error starting handle &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; op 10 (114 credits): rc -30
2012-09-18 14:13:37 LustreError: 4099:0:(llog_server.c:414:llog_origin_handle_cancel()) fsfilt_start_log() failed: -30
2012-09-18 14:13:37 LustreError: 4099:0:(llog_server.c:453:llog_origin_handle_cancel()) Cancel 0 of 121 llog-records failed: -30
2012-09-18 14:13:37 Stack:
2012-09-18 14:13:37  ffff880121e93640 0000000002583bf7 0000000000001000 0000000000000000
2012-09-18 14:13:37 &amp;lt;d&amp;gt; ffff8801abdad5c0 ffff880200000004 ffff880104b03141 0000000000000004
2012-09-18 14:13:37 &amp;lt;d&amp;gt; 00000020e2f30800 ffff8801258d9e00 ffff8801064c3288 ffff880268654fe0
2012-09-18 14:13:37 Call Trace:
2012-09-18 14:13:37  [&amp;lt;ffffffffa0db5c5d&amp;gt;] ldiskfs_add_entry+0xcd/0x500 [ldiskfs]
2012-09-18 14:13:37  [&amp;lt;ffffffffa1001689&amp;gt;] __osd_ea_add_rec+0xb9/0x190 [osd_ldiskfs]
2012-09-18 14:13:37  [&amp;lt;ffffffffa100dbeb&amp;gt;] osd_index_ea_insert+0x21b/0x5e0 [osd_ldiskfs]
2012-09-18 14:13:37  [&amp;lt;ffffffffa0eee977&amp;gt;] __mdd_index_insert_only+0x147/0x150 [mdd]
2012-09-18 14:13:37  [&amp;lt;ffffffffa0eef9b1&amp;gt;] __mdd_index_insert+0x51/0x1f0 [mdd]
2012-09-18 14:13:37  [&amp;lt;ffffffffa0ef55e3&amp;gt;] mdd_create+0x19a3/0x20c0 [mdd]
2012-09-18 14:13:37  [&amp;lt;ffffffffa03aa5b1&amp;gt;] ? libcfs_debug_msg+0x41/0x50 [libcfs]
2012-09-18 14:13:37  [&amp;lt;ffffffffa06a4637&amp;gt;] cml_create+0x97/0x250 [cmm]
2012-09-18 14:13:37  [&amp;lt;ffffffffa0f8bb9f&amp;gt;] mdt_reint_open+0x108f/0x18a0 [mdt]
2012-09-18 14:13:37  [&amp;lt;ffffffffa0f75151&amp;gt;] mdt_reint_rec+0x41/0xe0 [mdt]
2012-09-18 14:13:37  [&amp;lt;ffffffffa0f6e9aa&amp;gt;] mdt_reint_internal+0x50a/0x810 [mdt]
2012-09-18 14:13:37  [&amp;lt;ffffffffa0f6ef7d&amp;gt;] mdt_intent_reint+0x1ed/0x500 [mdt]
2012-09-18 14:13:37  [&amp;lt;ffffffffa0f6b191&amp;gt;] mdt_intent_policy+0x371/0x6a0 [mdt]
2012-09-18 14:13:37  [&amp;lt;ffffffffa0859881&amp;gt;] ldlm_lock_enqueue+0x361/0x8f0 [ptlrpc]
2012-09-18 14:13:37  [&amp;lt;ffffffffa08819bf&amp;gt;] ldlm_handle_enqueue0+0x48f/0xf70 [ptlrpc]
2012-09-18 14:13:37  [&amp;lt;ffffffffa0f6b506&amp;gt;] mdt_enqueue+0x46/0x130 [mdt]
2012-09-18 14:13:37  [&amp;lt;ffffffffa0f62802&amp;gt;] mdt_handle_common+0x922/0x1740 [mdt]
2012-09-18 14:13:37  [&amp;lt;ffffffffa0f636f5&amp;gt;] mdt_regular_handle+0x15/0x20 [mdt]
2012-09-18 14:13:37  [&amp;lt;ffffffffa08b199d&amp;gt;] ptlrpc_server_handle_request+0x40d/0xea0 [ptlrpc]
2012-09-18 14:13:37  [&amp;lt;ffffffffa08a8f37&amp;gt;] ? ptlrpc_wait_event+0xa7/0x2a0 [ptlrpc]
2012-09-18 14:13:37  [&amp;lt;ffffffffa03aa5b1&amp;gt;] ? libcfs_debug_msg+0x41/0x50 [libcfs]
2012-09-18 14:13:37  [&amp;lt;ffffffff810533f3&amp;gt;] ? __wake_up+0x53/0x70
2012-09-18 14:13:37  [&amp;lt;ffffffffa08b2f89&amp;gt;] ptlrpc_main+0xb59/0x1860 [ptlrpc]
2012-09-18 14:13:37  [&amp;lt;ffffffffa08b2430&amp;gt;] ? ptlrpc_main+0x0/0x1860 [ptlrpc]
2012-09-18 14:13:37  [&amp;lt;ffffffff8100c14a&amp;gt;] child_rip+0xa/0x20
2012-09-18 14:13:37  [&amp;lt;ffffffffa08b2430&amp;gt;] ? ptlrpc_main+0x0/0x1860 [ptlrpc]
2012-09-18 14:13:37  [&amp;lt;ffffffffa08b2430&amp;gt;] ? ptlrpc_main+0x0/0x1860 [ptlrpc]
2012-09-18 14:13:37  [&amp;lt;ffffffff8100c140&amp;gt;] ? child_rip+0x0/0x20
2012-09-18 14:13:37 Code: 00 00 48 c7 c2 20 81 dd a0 4c 8b 45 a8 45 89 f1 48 c7 c6 e5 d2 dd a0 e8 9f 56 01 00 48 8b 55 c8 66 83 7a 04 00 0f 85 a9 fd ff ff &amp;lt;0f&amp;gt; 0b eb fe 49 8b
2012-09-18 14:13:37 LustreError: 4099:0:(fsfilt-ldiskfs.c:332:fsfilt_ldiskfs_start()) error starting handle &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; op 10 (114 credits): rc -30
2012-09-18 14:13:37 LustreError: 4099:0:(llog_server.c:414:llog_origin_handle_cancel()) fsfilt_start_log() failed: -30
2012-09-18 14:13:37 LustreError: 4099:0:(llog_server.c:453:llog_origin_handle_cancel()) Cancel 0 of 121 llog-records failed: -30
2012-09-18 14:13:37 4c 24 40 49 8b bc 24 08 01 00 00 31 c0 4c 8b
2012-09-18 14:13:37 RIP  [&amp;lt;ffffffffa0db21d0&amp;gt;] add_dirent_to_buf+0x4c0/0x530 [ldiskfs]
2012-09-18 14:13:37  RSP &amp;lt;ffff880121e935f0&amp;gt;
2012-09-18 14:13:38 Initializing cgroup subsys cpuset
2012-09-18 14:13:38 Initializing cgroup subsys cpu
2012-09-18 14:13:38 Linux version 2.6.32-279.5.1.el6_lustre.gb4cc145.x86
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;vmcore has been captured, is on brent.whamcloud.com ~/cliffw/lu1948/vmcore-rst6-c2.gz&lt;/p&gt;</comment>
                            <comment id="45180" author="di.wang" created="Tue, 18 Sep 2012 21:45:15 +0000"  >&lt;p&gt;According to test result the corrupt directory comes from fdtree test. &quot; 78644697: name L0F7, lck 0000000000000004, name_len=568931904&quot;. Hmm during fdtree test, each thread will only create files in its own directory, and only 10 files for each dir.  &lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt; 
 \\srun -N $NODES -n $PROCS --wait=7200 -o fdtree.%t.out \$SWL/IO/fdtree.bash -l 3 -d 10 &amp;gt;&amp;amp; tmp_file
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt; 

&lt;p&gt;So this problem should be unrelated with ldiskfs pdir patch.  Now I suspect it might be related with mballoc, because we saw this during the test&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt; 
&quot;2012-09-17 17:49:57 LDISKFS-fs error (device md1): ldiskfs_mb_release_inode_pa: pa free mismatch: [pa ffff88031438e748] [phy 190752] [logic 608] [len 16] [free 13] [error 0] [inode 7340126] [freed 16]
2012-09-17 17:49:57 LDISKFS-fs error (device md1): ldiskfs_mb_release_inode_pa: free 16, pa_free 13 &quot;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt; 

&lt;p&gt; though it happened after the fs turned to read-only.&lt;/p&gt;


</comment>
                            <comment id="45181" author="di.wang" created="Tue, 18 Sep 2012 21:47:42 +0000"  >&lt;p&gt;Add andreas to the ticket, in case there are some mballoc changes recently for ext4.&lt;/p&gt;</comment>
                            <comment id="45202" author="adilger" created="Wed, 19 Sep 2012 02:15:38 +0000"  >&lt;p&gt;There haven&apos;t been changes to mballoc, but there was a change to the symlink NUL termination recently. Does this workload create symlinks?&lt;/p&gt;</comment>
                            <comment id="45203" author="liwei" created="Wed, 19 Sep 2012 02:21:34 +0000"  >&lt;p&gt;Isn&apos;t osd_ldiskfs_write_record() writing one-byte off the buffer limit if write_NUL is true?&lt;/p&gt;</comment>
                            <comment id="45204" author="di.wang" created="Wed, 19 Sep 2012 02:22:04 +0000"  >&lt;p&gt;fdtree does not create symlinks, which only includes mkdir, create, dd, unlink, rmdir. But SWL includes 5 tests, fdtree, simul, IOR, mirIO, mdtest. Simul definitely include create symlinks here. &lt;/p&gt;</comment>
                            <comment id="45205" author="di.wang" created="Wed, 19 Sep 2012 02:32:08 +0000"  >&lt;p&gt;oh, if you mean &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1540&quot; title=&quot;e2fsck remove too many symlinks&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1540&quot;&gt;&lt;del&gt;LU-1540&lt;/del&gt;&lt;/a&gt;, which has been landed on 2_3, and already included in our test rpm here.&lt;/p&gt;</comment>
                            <comment id="45315" author="di.wang" created="Thu, 20 Sep 2012 23:43:45 +0000"  >&lt;p&gt;This problem should be a duplicate with 1976, and Fang yong already provide a fix there, close this one.&lt;/p&gt;</comment>
                            <comment id="45775" author="cliffw" created="Sat, 29 Sep 2012 21:39:13 +0000"  >&lt;p&gt;System crashed again w/liang&apos;s patch - dump taken&lt;br/&gt;
stack is a bit messed up&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;2012-09-27 21:56:35 LustreError: 5611:0:(osd_handler.c:2343:osd_object_ref_del()) ASSERTION( inode-&amp;gt;i_nlink &amp;gt; 0 ) failed:
2012-09-27 21:56:35 LustreError: 5611:0:(osd_handler.c:2343:osd_object_ref_del()) LBUG
2012-09-27 21:56:35 Pid: 5611, comm: mdt00_015
2012-09-27 21:56:35
2012-09-27 21:56:35 Sep 27 21:56:35 Call Trace:
2012-09-27 21:56:35 hyperion-rst6 ke [&amp;lt;ffffffffa0392905&amp;gt;] libcfs_debug_dumpstack+0x55/0x80 [libcfs]
2012-09-27 21:56:35 rnel: LustreErro [&amp;lt;ffffffffa0392f17&amp;gt;] lbug_with_loc+0x47/0xb0 [libcfs]
2012-09-27 21:56:35 r: 5611:0:(osd_h [&amp;lt;ffffffffa0a946a1&amp;gt;] osd_object_ref_del+0x1d1/0x210 [osd_ldiskfs]
2012-09-27 21:56:35 andler.c:2343:os [&amp;lt;ffffffffa0efa09d&amp;gt;] mdo_ref_del+0xad/0xb0 [mdd]
2012-09-27 21:56:35 d_object_ref_del [&amp;lt;ffffffffa0eff715&amp;gt;] mdd_unlink+0x815/0xdb0 [mdd]
2012-09-27 21:56:35 ()) ASSERTION( i [&amp;lt;ffffffffa09581e4&amp;gt;] ? lustre_msg_get_versions+0xa4/0x120 [ptlrpc]
2012-09-27 21:56:35 node-&amp;gt;i_nlink &amp;gt;  [&amp;lt;ffffffffa08bd037&amp;gt;] cml_unlink+0x97/0x200 [cmm]
2012-09-27 21:56:35 0 ) failed:
2012-09-27 21:56:35 Sep [&amp;lt;ffffffffa0f83ddf&amp;gt;] ? mdt_version_get_save+0x8f/0xd0 [mdt]
2012-09-27 21:56:35  27 21:56:35 hyp [&amp;lt;ffffffffa0f84454&amp;gt;] mdt_reint_unlink+0x634/0x9e0 [mdt]
2012-09-27 21:56:35 erion-rst6 kerne [&amp;lt;ffffffffa0f81151&amp;gt;] mdt_reint_rec+0x41/0xe0 [mdt]
2012-09-27 21:56:35 l: LustreError:  [&amp;lt;ffffffffa0f7a9aa&amp;gt;] mdt_reint_internal+0x50a/0x810 [mdt]
2012-09-27 21:56:35 5611:0:(osd_hand [&amp;lt;ffffffffa0f7acf4&amp;gt;] mdt_reint+0x44/0xe0 [mdt]
2012-09-27 21:56:35 ler.c:2343:osd_o [&amp;lt;ffffffffa0f6e802&amp;gt;] mdt_handle_common+0x922/0x1740 [mdt]
2012-09-27 21:56:35 bject_ref_del()) [&amp;lt;ffffffffa0f6f6f5&amp;gt;] mdt_regular_handle+0x15/0x20 [mdt]
2012-09-27 21:56:35  LBUG
2012-09-27 21:56:35  [&amp;lt;ffffffffa0966b3c&amp;gt;] ptlrpc_server_handle_request+0x41c/0xe00 [ptlrpc]
2012-09-27 21:56:35  [&amp;lt;ffffffffa039365e&amp;gt;] ? cfs_timer_arm+0xe/0x10 [libcfs]
2012-09-27 21:56:35  [&amp;lt;ffffffffa03a513f&amp;gt;] ? lc_watchdog_touch+0x6f/0x180 [libcfs]
2012-09-27 21:56:35  [&amp;lt;ffffffffa095df37&amp;gt;] ? ptlrpc_wait_event+0xa7/0x2a0 [ptlrpc]
2012-09-27 21:56:35  [&amp;lt;ffffffff810533f3&amp;gt;] ? __wake_up+0x53/0x70
2012-09-27 21:56:35  [&amp;lt;ffffffffa0968111&amp;gt;] ptlrpc_main+0xbf1/0x19e0 [ptlrpc]
2012-09-27 21:56:35  [&amp;lt;ffffffffa0967520&amp;gt;] ? ptlrpc_main+0x0/0x19e0 [ptlrpc]
2012-09-27 21:56:35  [&amp;lt;ffffffff8100c14a&amp;gt;] child_rip+0xa/0x20
2012-09-27 21:56:35  [&amp;lt;ffffffffa0967520&amp;gt;] ? ptlrpc_main+0x0/0x19e0 [ptlrpc]
2012-09-27 21:56:35  [&amp;lt;ffffffffa0967520&amp;gt;] ? ptlrpc_main+0x0/0x19e0 [ptlrpc]
2012-09-27 21:56:35  [&amp;lt;ffffffff8100c140&amp;gt;] ? child_rip+0x0/0x20
2012-09-27 21:56:35
2012-09-27 21:56:35 Kernel panic - not syncing: LBUG
2012-09-27 21:56:35 Pid: 5611, comm: mdt00_015 Tainted: P           ---------------    2.6.32-279.5.1.el6_lustre.x86_64 #1
2012-09-27 21:56:35 Sep 27 21:56:35 Call Trace:
2012-09-27 21:56:35 hyperion-rst6 ke [&amp;lt;ffffffff814fd58a&amp;gt;] ? panic+0xa0/0x168
2012-09-27 21:56:35 rnel: Kernel pan [&amp;lt;ffffffffa0392f6b&amp;gt;] ? lbug_with_loc+0x9b/0xb0 [libcfs]
2012-09-27 21:56:35 ic - not syncing [&amp;lt;ffffffffa0a946a1&amp;gt;] ? osd_object_ref_del+0x1d1/0x210 [osd_ldiskfs]
2012-09-27 21:56:35 : LBUG
2012-09-27 21:56:35  [&amp;lt;ffffffffa0efa09d&amp;gt;] ? mdo_ref_del+0xad/0xb0 [mdd]
2012-09-27 21:56:35  [&amp;lt;ffffffffa0eff715&amp;gt;] ? mdd_unlink+0x815/0xdb0 [mdd]
2012-09-27 21:56:35  [&amp;lt;ffffffffa09581e4&amp;gt;] ? lustre_msg_get_versions+0xa4/0x120 [ptlrpc]
2012-09-27 21:56:35  [&amp;lt;ffffffffa08bd037&amp;gt;] ? cml_unlink+0x97/0x200 [cmm]
2012-09-27 21:56:35  [&amp;lt;ffffffffa0f83ddf&amp;gt;] ? mdt_version_get_save+0x8f/0xd0 [mdt]
2012-09-27 21:56:35  [&amp;lt;ffffffffa0f84454&amp;gt;] ? mdt_reint_unlink+0x634/0x9e0 [mdt]
2012-09-27 21:56:35  [&amp;lt;ffffffffa0f81151&amp;gt;] ? mdt_reint_rec+0x41/0xe0 [mdt]
2012-09-27 21:56:35  [&amp;lt;ffffffffa0f7a9aa&amp;gt;] ? mdt_reint_internal+0x50a/0x810 [mdt]
2012-09-27 21:56:35  [&amp;lt;ffffffffa0f7acf4&amp;gt;] ? mdt_reint+0x44/0xe0 [mdt]
2012-09-27 21:56:35  [&amp;lt;ffffffffa0f6e802&amp;gt;] ? mdt_handle_common+0x922/0x1740 [mdt]
2012-09-27 21:56:36  [&amp;lt;ffffffffa0f6f6f5&amp;gt;] ? mdt_regular_handle+0x15/0x20 [mdt]
2012-09-27 21:56:36  [&amp;lt;ffffffffa0966b3c&amp;gt;] ? ptlrpc_server_handle_request+0x41c/0xe00 [ptlrpc]
2012-09-27 21:56:36  [&amp;lt;ffffffffa039365e&amp;gt;] ? cfs_timer_arm+0xe/0x10 [libcfs]
2012-09-27 21:56:36  [&amp;lt;ffffffffa03a513f&amp;gt;] ? lc_watchdog_touch+0x6f/0x180 [libcfs]
2012-09-27 21:56:36  [&amp;lt;ffffffffa095df37&amp;gt;] ? ptlrpc_wait_event+0xa7/0x2a0 [ptlrpc]
2012-09-27 21:56:36  [&amp;lt;ffffffff810533f3&amp;gt;] ? __wake_up+0x53/0x70
2012-09-27 21:56:36  [&amp;lt;ffffffffa0968111&amp;gt;] ? ptlrpc_main+0xbf1/0x19e0 [ptlrpc]
2012-09-27 21:56:36  [&amp;lt;ffffffffa0967520&amp;gt;] ? ptlrpc_main+0x0/0x19e0 [ptlrpc]
2012-09-27 21:56:36  [&amp;lt;ffffffff8100c14a&amp;gt;] ? child_rip+0xa/0x20
2012-09-27 21:56:36  [&amp;lt;ffffffffa0967520&amp;gt;] ? ptlrpc_main+0x0/0x19e0 [ptlrpc]
2012-09-27 21:56:36  [&amp;lt;ffffffffa0967520&amp;gt;] ? ptlrpc_main+0x0/0x19e0 [ptlrpc]
2012-09-27 21:56:36  [&amp;lt;ffffffff8100c140&amp;gt;] ? child_rip+0x0/0x20
2012-09-27 21:56:36 Initializing cgroup subsys cpuset
2012-09-27 21:56:36 Initializing cgroup subsys cpu
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="45777" author="liang" created="Sat, 29 Sep 2012 22:59:37 +0000"  >&lt;p&gt;sigh, I would say this is a different bug, I just found it, it&apos;s &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1951&quot; title=&quot;SWL: osd_handler.c:2343:osd_object_ref_del()) ASSERTION( inode-&amp;gt;i_nlink &amp;gt; 0 ) failed:&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1951&quot;&gt;&lt;del&gt;LU-1951&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="45821" author="pjones" created="Mon, 1 Oct 2012 17:20:18 +0000"  >&lt;p&gt;duplicate of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2041&quot; title=&quot;SWL ldiskfs_add_entry: bad entry in directory #127928380: rec_len is smaller than minimal&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2041&quot;&gt;&lt;del&gt;LU-2041&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="46235" author="cliffw" created="Tue, 9 Oct 2012 00:55:11 +0000"  >&lt;p&gt;Hit again in most recent SWL test &lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
Oct  8 20:55:58 hyperion-rst6 kernel: Lustre: 4204:0:(service.c:2105:ptlrpc_handle_rs()) All locks stolen from rs ffff88012a6c9000 x1415301393993750.t4460647409 o0 NID 192.168.116.125@o2ib1
Oct  8 20:58:24 hyperion-rst6 kernel: LDISKFS-fs error (device md1): ldiskfs_add_entry: bad entry in directory #40370421: rec_len is smaller than minimal - block=20251025offset=504(504), inode=40380706, rec_len=0, name_len=4
Oct  8 20:58:24 hyperion-rst6 kernel: Aborting journal on device md1-8.
Oct  8 20:58:24 hyperion-rst6 kernel: LDISKFS-fs error (device md1): ldiskfs_journal_start_sb:
Oct  8 20:58:24 hyperion-rst6 kernel: LDISKFS-fs error (device md1): ldiskfs_journal_start_sb: Detected aborted journal
Oct  8 20:58:24 hyperion-rst6 kernel: LDISKFS-fs (md1): Remounting filesystem read-only
Oct  8 20:58:24 hyperion-rst6 kernel: LDISKFS-fs (md1): Remounting filesystem read-only
Oct  8 20:58:24 hyperion-rst6 kernel: LDISKFS-fs error (device md1) in iam_txn_add: Journal has aborted
Oct  8 20:58:24 hyperion-rst6 kernel: LustreError: 4885:0:(osd_io.c:1014:osd_ldiskfs_write_record()) journal_get_write_access() returned error -30
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="11862" name="md1.tar.gz" size="870426" author="cliffw" created="Sun, 16 Sep 2012 11:41:44 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvgi7:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>6323</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>