<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:40:46 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-4222] Oops in mdt_dump_lmm+0x16/0x410 [mdt]</title>
                <link>https://jira.whamcloud.com/browse/LU-4222</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Nov  6 16:38:07 lustre-mds-0-0 kernel: BUG: unable to handle kernel NULL pointer dereference at 000000000000001c&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: IP: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0cfb246&amp;gt;&amp;#93;&lt;/span&gt; mdt_dump_lmm+0x16/0x410 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: PGD 0 &lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: Oops: 0000 &lt;a href=&quot;#1&quot; target=&quot;_blank&quot; rel=&quot;noopener&quot;&gt;1&lt;/a&gt; SMP &lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: last sysfs file: /sys/devices/pci0000:00/0000:00:09.0/0000:19:00.0/0000:1a:04.0/0000:1c:00.0/irq&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: CPU 4 &lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: Modules linked in: osp(U) lod(U) mdt(U) mgs(U) mgc(U) fsfilt_ldiskfs(U) osd_ldiskfs(U) lquota(U) mdd(U) lustre(U) lov(U) osc(U) mdc(U) fid(U) fld(U) ksocklnd(U) ptlrpc(U) obdclass(U) lnet(U) lvfs(U) sha512_generic sha256_generic crc32c_intel libcfs(U) ldiskfs(U) autofs4 sunrpc cpufreq_ondemand acpi_cpufreq freq_table mperf iptable_filter ip_tables ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm ib_addr ipv6 ib_sa ib_mad ib_core microcode serio_raw i2c_i801 i2c_core iTCO_wdt iTCO_vendor_support i7core_edac edac_core ioatdma raid10 myri10ge ses enclosure sg igb dca ptp pps_core sr_mod cdrom ext4 mbcache jbd2 sd_mod crc_t10dif usb_storage ahci mptsas mptscsih mptbase scsi_transport_sas dm_mirror dm_region_hash dm_log dm_mod &lt;span class=&quot;error&quot;&gt;&amp;#91;last unloaded: scsi_wait_scan&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: &lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: Pid: 4408, comm: mdt02_002 Not tainted 2.6.32-358.14.1.el6_lustre.g0a46394.x86_64 #1 SUN MICROSYSTEMS SUN FIRE X4170 SERVER          /ASSY,MOTHERBOARD,X4170&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: RIP: 0010:&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0cfb246&amp;gt;&amp;#93;&lt;/span&gt;  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0cfb246&amp;gt;&amp;#93;&lt;/span&gt; mdt_dump_lmm+0x16/0x410 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: RSP: 0018:ffff88066bf87a20  EFLAGS: 00010282&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: RAX: 0000000000000003 RBX: ffff88066bf7e000 RCX: ffffc9002118d6f0&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: RDX: ffff88066914bc00 RSI: 0000000000000000 RDI: 0000000000000040&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: RBP: ffff88066bf87a70 R08: 0000000000008001 R09: ffff88066bf7e510&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: R10: ffff88067451c49c R11: ffffffffa03b89b0 R12: ffff880669236070&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: R13: ffff8806793c77a0 R14: 0000000000000038 R15: ffff880669208a68&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: FS:  00007f00c33bf700(0000) GS:ffff88038ac00000(0000) knlGS:0000000000000000&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: CR2: 000000000000001c CR3: 00000006789ec000 CR4: 00000000000007e0&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: Process mdt02_002 (pid: 4408, threadinfo ffff88066bf86000, task ffff88066d264080)&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: Stack:&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: ffff88066bf7e000 ffff880677269000 ffff88066bf87a70 ffffffffa0ce1832&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: &amp;lt;d&amp;gt; ffff880669236070 ffff88066bf7e000 ffff880669236070 ffff8806793c77a0&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: &amp;lt;d&amp;gt; 0000000000000038 ffff880669208a68 ffff88066bf87b00 ffffffffa0cf4b0f&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: Call Trace:&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0ce1832&amp;gt;&amp;#93;&lt;/span&gt; ? mdt_pack_attr2body+0xe2/0x270 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0cf4b0f&amp;gt;&amp;#93;&lt;/span&gt; mdt_getattr_internal+0x56f/0x1210 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0cf661e&amp;gt;&amp;#93;&lt;/span&gt; mdt_getattr_name_lock+0xe6e/0x1980 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa06bd135&amp;gt;&amp;#93;&lt;/span&gt; ? lustre_msg_buf+0x55/0x60 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa06e5646&amp;gt;&amp;#93;&lt;/span&gt; ? __req_capsule_get+0x166/0x700 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa06bf3c4&amp;gt;&amp;#93;&lt;/span&gt; ? lustre_msg_get_flags+0x34/0xb0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0cf73cd&amp;gt;&amp;#93;&lt;/span&gt; mdt_intent_getattr+0x29d/0x490 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0ce3f3e&amp;gt;&amp;#93;&lt;/span&gt; mdt_intent_policy+0x39e/0x720 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0675831&amp;gt;&amp;#93;&lt;/span&gt; ldlm_lock_enqueue+0x361/0x8d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa069c1ef&amp;gt;&amp;#93;&lt;/span&gt; ldlm_handle_enqueue0+0x4ef/0x10b0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0ce43c6&amp;gt;&amp;#93;&lt;/span&gt; mdt_enqueue+0x46/0xe0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0ceaab7&amp;gt;&amp;#93;&lt;/span&gt; mdt_handle_common+0x647/0x16d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa06bebac&amp;gt;&amp;#93;&lt;/span&gt; ? lustre_msg_get_transno+0x8c/0x100 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0d243f5&amp;gt;&amp;#93;&lt;/span&gt; mds_regular_handle+0x15/0x20 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa06ce3c8&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_server_handle_request+0x398/0xc60 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa03e85de&amp;gt;&amp;#93;&lt;/span&gt; ? cfs_timer_arm+0xe/0x10 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa03f9d9f&amp;gt;&amp;#93;&lt;/span&gt; ? lc_watchdog_touch+0x6f/0x170 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa06c5729&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_wait_event+0xa9/0x290 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81055ad3&amp;gt;&amp;#93;&lt;/span&gt; ? __wake_up+0x53/0x70&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa06cf75e&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_main+0xace/0x1700 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa06cec90&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_main+0x0/0x1700 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c0ca&amp;gt;&amp;#93;&lt;/span&gt; child_rip+0xa/0x20&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa06cec90&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_main+0x0/0x1700 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa06cec90&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_main+0x0/0x1700 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c0c0&amp;gt;&amp;#93;&lt;/span&gt; ? child_rip+0x0/0x20&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: Code: 41 ab 9e ff 48 89 83 70 04 00 00 e9 2d ff ff ff 0f 1f 44 00 00 55 48 89 e5 41 57 41 56 41 55 41 54 53 48 83 ec 28 0f 1f 44 00 00 &amp;lt;44&amp;gt; 0f b7 66 1c 41 89 fe 41 89 fd 48 89 f3 41 81 e6 00 04 06 02 &lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: RIP  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0cfb246&amp;gt;&amp;#93;&lt;/span&gt; mdt_dump_lmm+0x16/0x410 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: RSP &amp;lt;ffff88066bf87a20&amp;gt;&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: CR2: 000000000000001c&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: --&lt;del&gt;[ end trace 0dadd51afe1c36b7 ]&lt;/del&gt;--&lt;br/&gt;
Nov  6 16:38:07 lustre-mds-0-0 kernel: Kernel panic - not syncing: Fatal exception&lt;/p&gt;

&lt;p&gt;We were trying to set up an active/active MDS/MDT configuration on a cluster with two MDSes and two MDTs. While trying to mount the 1.8.9 clients, we hit this panic on the MDS.&lt;/p&gt;

&lt;p&gt;Our goal was: start from a 1.8.x server -&amp;gt; upgrade to 2.4.1 -&amp;gt; back up and restore the single MDT to a new system with one MDT -&amp;gt; add another MDT on a different MDS as a remote MDT.&lt;/p&gt;

&lt;p&gt;The last step is to use tunefs.lustre to configure active/active HA on the MDSes.&lt;/p&gt;
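
&lt;p&gt;For reference, that step is an invocation along these lines (the device path and NIDs below are illustrative placeholders, not our actual values):&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# declare both MDS nodes as service nodes for the MDT device
tunefs.lustre --servicenode=mds-0-0@tcp --servicenode=mds-0-1@tcp /dev/mdt0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>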
                <environment></environment>
        <key id="21906">LU-4222</key>
            <summary>Oops in mdt_dump_lmm+0x16/0x410 [mdt]</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="di.wang">Di Wang</assignee>
                                    <reporter username="mdiep">Minh Diep</reporter>
                        <labels>
                            <label>mn4</label>
                            <label>sdsc</label>
                    </labels>
                <created>Thu, 7 Nov 2013 01:04:15 +0000</created>
                <updated>Mon, 20 Jan 2014 22:38:00 +0000</updated>
                            <resolved>Tue, 7 Jan 2014 16:30:47 +0000</resolved>
                                    <version>Lustre 2.4.1</version>
                                    <fixVersion>Lustre 2.6.0</fixVersion>
                    <fixVersion>Lustre 2.5.1</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>12</watches>
                                                                            <comments>
                            <comment id="70944" author="pjones" created="Thu, 7 Nov 2013 02:20:21 +0000"  >&lt;p&gt;Hi Bobijam&lt;/p&gt;

&lt;p&gt;Could you please look into this issue?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="72144" author="mdiep" created="Fri, 22 Nov 2013 15:57:59 +0000"  >&lt;p&gt;we hit this again&lt;/p&gt;

&lt;p&gt;Nov 21 11:51:30 lustre-mds-0-0 kernel: BUG: unable to handle kernel NULL pointer dereference at 000000000000001c&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: IP: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0cfa246&amp;gt;&amp;#93;&lt;/span&gt; mdt_dump_lmm+0x16/0x410 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: PGD 375648067 PUD 375600067 PMD 0&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: Oops: 0000 &lt;a href=&quot;#1&quot; target=&quot;_blank&quot; rel=&quot;noopener&quot;&gt;1&lt;/a&gt; SMP&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: last sysfs file: /sys/devices/pci0000:00/0000:00:1e.0/0000:1f:05.0/local_cpus&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: CPU 0&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: Modules linked in: nfs lockd fscache auth_rpcgss nfs_acl osp(U) lod(U) mdt(U) mgs(U) mgc(U) fsfilt_ldiskfs(U) osd_ldiskfs(U) ldiskfs(U) lquota(U) mdd(U) lustre(U) lov(U) osc(U) mdc(U) fid(U) fld(U) ksocklnd(U) ptlrpc(U) obdclass(U) lnet(U) lvfs(U) sha512_generic sha256_generic crc32c_intel libcfs(U) autofs4 sunrpc cpufreq_ondemand acpi_cpufreq freq_table mperf iptable_filter ip_tables ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm ib_addr ipv6 ib_sa ib_mad ib_core microcode serio_raw i2c_i801 i2c_core iTCO_wdt iTCO_vendor_support ioatdma i7core_edac edac_core raid10 myri10ge ses enclosure sg igb dca ptp pps_core sr_mod cdrom ext4 jbd2 mbcache sd_mod crc_t10dif usb_storage ahci mptsas mptscsih mptbase scsi_transport_sas dm_mirror dm_region_hash dm_log dm_mod &lt;span class=&quot;error&quot;&gt;&amp;#91;last unloaded: scsi_wait_scan&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel:&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: Pid: 26979, comm: mdt00_001 Not tainted 2.6.32-358.18.1.el6_lustre.x86_64 #1 SUN MICROSYSTEMS SUN FIRE X4170 SERVER          /ASSY,MOTHERBOARD,X4170&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: RIP: 0010:&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0cfa246&amp;gt;&amp;#93;&lt;/span&gt;  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0cfa246&amp;gt;&amp;#93;&lt;/span&gt; mdt_dump_lmm+0x16/0x410 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: RSP: 0018:ffff8806791e3a20  EFLAGS: 00010282&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: RAX: 0000000000000003 RBX: ffff88036f1ed000 RCX: ffffc9001e8bada0&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: RDX: ffff88036d465600 RSI: 0000000000000000 RDI: 0000000000000040&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: RBP: ffff8806791e3a70 R08: 00000000fffffffb R09: 00000000fffffffe&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: R10: 0000000000000000 R11: 0000000000000001 R12: ffff88063b3f8d08&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: R13: ffff88036b368ba0 R14: 0000000000000038 R15: ffff880370762a68&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: FS:  00007f177ce6a700(0000) GS:ffff880028200000(0000) knlGS:0000000000000000&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: CR2: 000000000000001c CR3: 0000000376129000 CR4: 00000000000007f0&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: Process mdt00_001 (pid: 26979, threadinfo ffff8806791e2000, task ffff8806791e1500)&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: Stack:&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: ffff88036f1ed000 ffff88036b295000 ffff8806791e3a70 ffffffffa0ce0812&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: &amp;lt;d&amp;gt; ffff8806000081ed ffff88036f1ed000 ffff88063b3f8d08 ffff88036b368ba0&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: &amp;lt;d&amp;gt; 0000000000000038 ffff880370762a68 ffff8806791e3b00 ffffffffa0cf3aef&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: Call Trace:&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0ce0812&amp;gt;&amp;#93;&lt;/span&gt; ? mdt_pack_attr2body+0xe2/0x270 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0cf3aef&amp;gt;&amp;#93;&lt;/span&gt; mdt_getattr_internal+0x56f/0x1210 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0371d98&amp;gt;&amp;#93;&lt;/span&gt; ? libcfs_log_return+0x28/0x40 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0cf55fe&amp;gt;&amp;#93;&lt;/span&gt; mdt_getattr_name_lock+0xe6e/0x1980 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa063c135&amp;gt;&amp;#93;&lt;/span&gt; ? lustre_msg_buf+0x55/0x60 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0664646&amp;gt;&amp;#93;&lt;/span&gt; ? __req_capsule_get+0x166/0x700 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0cf63ad&amp;gt;&amp;#93;&lt;/span&gt; mdt_intent_getattr+0x29d/0x490 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0ce2f1e&amp;gt;&amp;#93;&lt;/span&gt; mdt_intent_policy+0x39e/0x720 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05f4831&amp;gt;&amp;#93;&lt;/span&gt; ldlm_lock_enqueue+0x361/0x8d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa061b1ef&amp;gt;&amp;#93;&lt;/span&gt; ldlm_handle_enqueue0+0x4ef/0x10b0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0ce33a6&amp;gt;&amp;#93;&lt;/span&gt; mdt_enqueue+0x46/0xe0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0ce9a97&amp;gt;&amp;#93;&lt;/span&gt; mdt_handle_common+0x647/0x16d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0d233f5&amp;gt;&amp;#93;&lt;/span&gt; mds_regular_handle+0x15/0x20 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa064d3c8&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_server_handle_request+0x398/0xc60 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0378e05&amp;gt;&amp;#93;&lt;/span&gt; ? lc_watchdog_touch+0xd5/0x170 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0644729&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_wait_event+0xa9/0x290 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa03772d1&amp;gt;&amp;#93;&lt;/span&gt; ? libcfs_debug_msg+0x41/0x50 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81055ad3&amp;gt;&amp;#93;&lt;/span&gt; ? __wake_up+0x53/0x70&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa064e75e&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_main+0xace/0x1700 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa064dc90&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_main+0x0/0x1700 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c0ca&amp;gt;&amp;#93;&lt;/span&gt; child_rip+0xa/0x20&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa064dc90&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_main+0x0/0x1700 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa064dc90&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_main+0x0/0x1700 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c0c0&amp;gt;&amp;#93;&lt;/span&gt; ? child_rip+0x0/0x20&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: Code: 41 ab 96 ff 48 89 83 70 04 00 00 e9 2d ff ff ff 0f 1f 44 00 00 55 48 89 e5 41 57 41 56 41 55 41 54 53 48 83 ec 28 0f 1f 44 00 00 &amp;lt;44&amp;gt; 0f b7 66 1c 41 89 fe 41 89 fd 48 89 f3 41 81 e6 00 04 06 02&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: RIP  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0cfa246&amp;gt;&amp;#93;&lt;/span&gt; mdt_dump_lmm+0x16/0x410 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: RSP &amp;lt;ffff8806791e3a20&amp;gt;&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: CR2: 000000000000001c&lt;br/&gt;
Nov 21 11:51:30 lustre-mds-0-0 kernel: --&lt;del&gt;[ end trace 3314141bd29618c1 ]&lt;/del&gt;--&lt;/p&gt;</comment>
                            <comment id="72180" author="di.wang" created="Sat, 23 Nov 2013 05:52:04 +0000"  >&lt;p&gt;It seems ma_lmm is NULL, even MA_LOV is setup and ma_lmm_size &amp;gt; 0. I post a debug patch &lt;a href=&quot;http://review.whamcloud.com/8377&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/8377&lt;/a&gt; , Minh will try later, hope we can get more information.&lt;/p&gt;</comment>
                            <comment id="73313" author="nedbass" created="Wed, 11 Dec 2013 21:52:32 +0000"  >&lt;p&gt;We started hitting this same oops on a production filesystem.  MDS is stuck in a crash/reboot cycle.  It was upgraded from 2.1 last week.  Has there been any progress on this bug?&lt;/p&gt;</comment>
                            <comment id="73314" author="nedbass" created="Wed, 11 Dec 2013 22:07:17 +0000"  >&lt;p&gt;Our tree is here: &lt;a href=&quot;https://github.com/chaos/lustre/commits/2.4.0-19chaos&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/chaos/lustre/commits/2.4.0-19chaos&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The backtrace looks like this:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;mdt_getattr_internal
mdt_getattr_name_lock
mdt_intent_getattr
mdt_intent_policy
ldlm_lock_enqueue
ldlm_handle_enqueue0
mdt_enqueue
mdt_handle_common
mds_regular_handle
ptlrpc_server_handle_request
ptlrpc_main
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I can&apos;t provide crash dumps, but I can carry out debugging procedures and post results.&lt;/p&gt;</comment>
                            <comment id="73316" author="nedbass" created="Wed, 11 Dec 2013 22:17:14 +0000"  >&lt;p&gt;Interestingly, I just learned that two different filesystems started hitting this within minutes of each other.  Perhaps a common client is sending problematic requests to both MDSs.  On the other system, many of the NULL pointer derefs are preceded by a message:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;LustreError: 4367:0:(mdt_lvb.c:175:mdt_lvbo_fill()) ls6-MDT0000: expected 80 actual 0.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="73317" author="nedbass" created="Wed, 11 Dec 2013 23:09:47 +0000"  >&lt;p&gt;I started the MDS with the debug patch and got this:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;ASSERTION( ma-&amp;gt;ma_lmm_size != 0 &amp;amp;&amp;amp; ma-&amp;gt;ma_lmm != NULL ) failed: Invalid EA 80:(null) [0x1a89081cdcb:0x13b:0x0]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="73319" author="mdiep" created="Wed, 11 Dec 2013 23:28:05 +0000"  >&lt;p&gt;Hi Ned,&lt;/p&gt;

&lt;p&gt;Could you run&lt;br/&gt;
echo 0 &amp;gt; /proc/sys/lnet/panic_on_lbug&lt;br/&gt;
echo -1 &amp;gt; /proc/sys/lnet/debug&lt;br/&gt;
echo 50 &amp;gt; /proc/sys/lnet/debug_mb&lt;/p&gt;

&lt;p&gt;After it crashes, there should be a debug log in /tmp; please attach it.&lt;br/&gt;
If you can identify the client that causes it, please collect debug -1 on it too when it crashes.&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;</comment>
                            <comment id="73320" author="nedbass" created="Wed, 11 Dec 2013 23:55:00 +0000"  >&lt;p&gt;Hi Minh, sorry this system is classified so I can&apos;t provide debug logs.&lt;/p&gt;

&lt;p&gt;As a test, I modified &lt;tt&gt;mdt_getattr_internal()&lt;/tt&gt; to log an error and return &lt;tt&gt;EFAULT&lt;/tt&gt; if &lt;tt&gt;ma-&amp;gt;ma_lmm == NULL&lt;/tt&gt;.  In that case, it logged the error for the same FID twice.  I was able to use &lt;tt&gt;fid2path&lt;/tt&gt; to get the file name, and I was able to read the file and get its striping EA info.  But I don&apos;t know how safe it is to run with that patch.&lt;/p&gt;
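
&lt;p&gt;A minimal sketch of that guard (from memory; the exact placement inside &lt;tt&gt;mdt_getattr_internal()&lt;/tt&gt; may differ):&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;/* before the striping EA is packed/dumped: fail the request
 * instead of dereferencing a NULL ma_lmm */
if ((ma-&amp;gt;ma_valid &amp;amp; MA_LOV) &amp;amp;&amp;amp; ma-&amp;gt;ma_lmm == NULL) {
        CERROR(&quot;NULL ma_lmm for &quot;DFID&quot;, failing getattr\n&quot;,
               PFID(mdt_object_fid(o)));
        RETURN(-EFAULT);
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>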
                            <comment id="73321" author="di.wang" created="Wed, 11 Dec 2013 23:59:35 +0000"  >&lt;p&gt;Are there any 1.8 clients attached to this server? The reason of this LBUG, which happened in SDSC, is because some 1.8 clients send RPC with invalid format to 2.4 server, which 2.4 does not handle it well enough. &lt;/p&gt;</comment>
                            <comment id="73322" author="nedbass" created="Thu, 12 Dec 2013 00:05:33 +0000"  >&lt;p&gt;I don&apos;t think there&apos;s any 1.8 clients, but do you know of a way to determine that from the server side?&lt;/p&gt;</comment>
                            <comment id="73324" author="di.wang" created="Thu, 12 Dec 2013 00:11:48 +0000"  >&lt;p&gt;Hmm, checking debug log to find out which client the RPC is from? I probably can provide a patch to stop the LBUG, but we probably need find out why client send invalid RPC here.&lt;/p&gt;</comment>
                            <comment id="73325" author="nedbass" created="Thu, 12 Dec 2013 00:27:59 +0000"  >&lt;p&gt;Di, OK I&apos;m trying to get a debug log.&lt;/p&gt;</comment>
                            <comment id="73328" author="di.wang" created="Thu, 12 Dec 2013 00:46:09 +0000"  >&lt;p&gt;Here is a patch to check the RPC on MDT.&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/8550&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/8550&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;With this patch, it might not LBUG again, but we need to figure out why the client sends this RPC.&lt;/p&gt;</comment>
                            <comment id="73334" author="nedbass" created="Thu, 12 Dec 2013 03:07:16 +0000"  >&lt;p&gt;Hmm, it still crashes in the same place with patch 8550.  My only guess is that it changes the &lt;tt&gt;ma-&amp;gt;ma_lmm&lt;/tt&gt; pointer in &lt;tt&gt;mdt_attr_get_lov()&lt;/tt&gt;.&lt;/p&gt;</comment>
                            <comment id="73335" author="nedbass" created="Thu, 12 Dec 2013 03:27:22 +0000"  >&lt;p&gt;I moved the check from patch 8550 to just above the call to &lt;tt&gt;mdt_dump_lmm()&lt;/tt&gt;.  Now I get the debug message with the client uuid.  On both MDS the bad request came from the same x86_64 client cluster which is running lustre-2.4.0-19chaos.&lt;/p&gt;</comment>
                            <comment id="73336" author="nedbass" created="Thu, 12 Dec 2013 03:51:59 +0000"  >&lt;p&gt;I can reproduce the bad request by opening any file from some, but not all, nodes on that client cluster (graph).  The open on the client fails with EFAULT and logs&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;LustreError: ... ll_inode_revalidate_fini() ls6: revalidate FID [0x...] error: rc = -14
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Dropping caches or remounting the filesystem doesn&apos;t fix it.  Rebooting the client node does fix it.  That client cluster was updated today, and used to run Lustre 2.1.  We updated another client cluster (zin) to the same software levels today, but I don&apos;t see the problem from zin.  So it seems there is something funny going on with graph.&lt;/p&gt;</comment>
                            <comment id="73337" author="di.wang" created="Thu, 12 Dec 2013 05:36:20 +0000"  >&lt;p&gt;There is code snippet at the end of mdt_attr_get_complex&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;        int mdt_attr_get_complex(struct mdt_thread_info *info,
                         struct mdt_object *o, struct md_attr *ma)
        .....
        ma-&amp;gt;ma_need = need;
        CDEBUG(D_INODE, &quot;after getattr rc = %d, ma_valid = &quot;LPX64&quot; ma_lmm=%p\n&quot;,
               rc, ma-&amp;gt;ma_valid, ma-&amp;gt;ma_lmm);
        RETURN(rc);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Could you please tell me what you get when this -14 happens?&lt;/p&gt;</comment>
                            <comment id="73339" author="di.wang" created="Thu, 12 Dec 2013 06:02:19 +0000"  >&lt;blockquote&gt;
&lt;p&gt;I moved the check from patch 8550 to just above the call to mdt_dump_lmm(). &lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;How did you check there?  You cannot check buffer-&amp;gt;lb_buf anymore if you move the check above mdt_dump_lmm, because buffer-&amp;gt;lb_buf comes from thread_info, which will be reused by the following functions.&lt;/p&gt;</comment>
                            <comment id="73340" author="nedbass" created="Thu, 12 Dec 2013 06:15:51 +0000"  >&lt;p&gt;I checked &lt;tt&gt;ma-&amp;gt;ma_lmm&lt;/tt&gt;.&lt;/p&gt;

&lt;p&gt;I&apos;ll post the other debug information you requested tomorrow morning PST when I get back to the office.&lt;/p&gt;</comment>
                            <comment id="73394" author="nedbass" created="Thu, 12 Dec 2013 17:44:25 +0000"  >&lt;p&gt;Di, here is the CDEBUG message.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;mdt_attr_get_complex() after getattr rc = 0, ma_valid = 0x3 ma_lmm=(null)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="73398" author="di.wang" created="Thu, 12 Dec 2013 18:27:41 +0000"  >&lt;p&gt;Ned, please try this patch &lt;a href=&quot;http://review.whamcloud.com/#/c/8550/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/8550/&lt;/a&gt;, which will not retrieve lovEA, if client does not require, and also some other debug information, in case this does not resolve the issue.&lt;/p&gt;</comment>
                            <comment id="73402" author="nedbass" created="Thu, 12 Dec 2013 19:59:18 +0000"  >&lt;p&gt;Di, with that patch I get:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;mdt_getattr_internal() ls4-MDT0000: RPC from &amp;lt;UUID&amp;gt;: does not need LOVEA
mdt_attr_get_lov() [&amp;lt;FID&amp;gt;] retrieve lovEA with (null):0
mdt_attr_get_lov() [&amp;lt;FID&amp;gt;] got lovEA with (null):80
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This would have crashed, but I kept the check before &lt;tt&gt;mdt_dump_lmm()&lt;/tt&gt; to log an error and return &lt;tt&gt;EFAULT&lt;/tt&gt; if &lt;tt&gt;ma-&amp;gt;ma_lmm&lt;/tt&gt; is &lt;tt&gt;NULL&lt;/tt&gt;.&lt;/p&gt;</comment>
                            <comment id="73407" author="di.wang" created="Thu, 12 Dec 2013 21:12:32 +0000"  >&lt;p&gt;hmm, so the RPC does not need LOVEA, but somehow the server still trying to get it. hmm&lt;/p&gt;</comment>
                            <comment id="73408" author="nedbass" created="Thu, 12 Dec 2013 21:20:29 +0000"  >&lt;p&gt;My only guess is that &lt;tt&gt;mdt_getattr_name_lock()&lt;/tt&gt; sets MA_LOV in ma_need:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;                &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (try_layout) {
                        child_bits |= MDS_INODELOCK_LAYOUT;
                        /* &lt;span class=&quot;code-keyword&quot;&gt;try&lt;/span&gt; layout lock, it may fail to be granted due to
                         * contention at LOOKUP or UPDATE */
                        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (!mdt_object_lock_try(info, child, lhc, child_bits,
                                                 MDT_CROSS_LOCK)) {
                                child_bits &amp;amp;= ~MDS_INODELOCK_LAYOUT;
                                LASSERT(child_bits != 0);
                                rc = mdt_object_lock(info, child, lhc,
                                                child_bits, MDT_CROSS_LOCK);
                        } &lt;span class=&quot;code-keyword&quot;&gt;else&lt;/span&gt; {
                                ma_need |= MA_LOV;
        ...
        rc = mdt_getattr_internal(info, child, ma_need);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="73409" author="di.wang" created="Thu, 12 Dec 2013 21:23:56 +0000"  >&lt;p&gt;Hmm, it seems to me cl_max_md_size is not being set correctly in time in some cases, which client will reply to pack getattr RPC. I update the patch, but that requires to remount the client. As you said remount can fix the problem? So after remount you can not reproduce the problem anymore? Anyway can you try this patch? thanks.&lt;/p&gt;</comment>
                            <comment id="73416" author="nedbass" created="Thu, 12 Dec 2013 23:27:53 +0000"  >&lt;p&gt;After rebooting more clients, many did come up still having this problem.  So we should be able to verify pretty easily if the patch works.&lt;/p&gt;</comment>
                            <comment id="73418" author="nedbass" created="Fri, 13 Dec 2013 00:18:11 +0000"  >&lt;p&gt;Di, initial results are that the client-side patch does not fix the problem.&lt;/p&gt;</comment>
                            <comment id="73419" author="di.wang" created="Fri, 13 Dec 2013 00:29:45 +0000"  >&lt;p&gt;Hmm, I think there are two problems here&lt;br/&gt;
1. MDS should not retrieve LOVEA if there are no room in the request, this is easy to fix.&lt;br/&gt;
2. Why clients send getattr RPC without reserving the space for LOVEA, IMHO, if the cl_max_md_size is being set correctly, all of getattr RPC should have enough space for LOVEA. Hmm, could you check debug log on the client side to see where is the RPC from. is it from mdc_intent_getattr_pack ?&lt;/p&gt;</comment>
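
&lt;p&gt;For (1), the check I have in mind on the MDS side is roughly (illustrative only; the real fix is in the patch under review):&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;/* in mdt_getattr_internal(): skip fetching the LOV EA when the
 * reply buffer has no room for it, instead of oopsing later */
if (req_capsule_get_size(info-&amp;gt;mti_pill, &amp;amp;RMF_MDT_MD, RCL_SERVER) == 0)
        ma-&amp;gt;ma_need &amp;amp;= ~MA_LOV;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>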
                            <comment id="73420" author="nedbass" created="Fri, 13 Dec 2013 00:41:15 +0000"  >&lt;p&gt;Di, this client normally mounts 6 filesystems.  The problem seems more likely to happen if 5 or more of those filesystems are mounted.  I have yet to see it if I only mount 4 or fewer of the filesystems.  I&apos;m using about 100 client nodes to test so the evidence is pretty compelling.&lt;/p&gt;</comment>
                            <comment id="73421" author="nedbass" created="Fri, 13 Dec 2013 00:45:49 +0000"  >&lt;p&gt;My test looks something like this, where &amp;lt;hostlist&amp;gt; contains about 100 nodes (pdsh is a distributed remote command invoker):&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;
pdsh -w &amp;lt;hostlist&amp;gt; &apos;umount -a -t lustre ; for x in 1 2 3 4 5 6 ; do mount /p/lscratch$x ; done ; cat /p/lscratch5/bass6/x 2&amp;gt;&amp;amp;1&apos;

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;If I mount all six filesystems, I always get some subset of nodes on which the cat command fails with &apos;Bad address&apos;.&lt;/p&gt;

&lt;p&gt;If I mount only five, I usually but not always get failures.&lt;/p&gt;

&lt;p&gt;If I mount four or fewer, I never get failures.&lt;/p&gt;</comment>
                            <comment id="73422" author="di.wang" created="Fri, 13 Dec 2013 00:48:56 +0000"  >&lt;p&gt;Ned, Could you try this patch&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;diff --git a/lustre/mdc/mdc_locks.c b/lustre/mdc/mdc_locks.c
index 28c07b4..4541f1a 100644
--- a/lustre/mdc/mdc_locks.c
+++ b/lustre/mdc/mdc_locks.c
@@ -434,6 +434,10 @@ static struct ptlrpc_request *mdc_intent_getattr_pack(struct obd_export *exp,
         lit = req_capsule_client_get(&amp;amp;req-&amp;gt;rq_pill, &amp;amp;RMF_LDLM_INTENT);
         lit-&amp;gt;opc = (__u64)it-&amp;gt;it_op;
 
+       if (obddev-&amp;gt;u.cli.cl_max_mds_easize == 0) {
+               CERROR(&quot;%s: cl_max_mds_easize is zero!\n&quot;, obddev-&amp;gt;obd_name);
+               RETURN(ERR_PTR(-EINVAL));
+       }
         /* pack the intended request */
         mdc_getattr_pack(req, valid, it-&amp;gt;it_flags, op_data,
                          obddev-&amp;gt;u.cli.cl_max_mds_easize);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;to see whether you get the error message on the client side?&lt;/p&gt;</comment>
                            <comment id="73423" author="nedbass" created="Fri, 13 Dec 2013 01:06:52 +0000"  >&lt;p&gt;I&apos;ll try it.&lt;/p&gt;

&lt;p&gt;Regarding my earlier comment, I do sometimes see failures with 3 or 4 mounts, but usually on only one client out of 100.  So it seems to become more likely with more mounts.  Also, the order of mounts seems to matter: I don&apos;t get errors if I mount all six in reverse order :/  I can&apos;t figure out what the pattern is yet.&lt;/p&gt;</comment>
                            <comment id="73428" author="nedbass" created="Fri, 13 Dec 2013 01:29:28 +0000"  >&lt;p&gt;No, the error does not show up on the client when I reproduce the bug.&lt;/p&gt;</comment>
                            <comment id="73429" author="di.wang" created="Fri, 13 Dec 2013 01:33:11 +0000"  >&lt;p&gt;Hmm, this probably mean cl_max_md_size initialization is being delayed when there are multiple client being mounted at the same time, since this cl_max_md_size is being updated when OSC is connected. so we need initialize the cl_max_md_size synchronously during the mount process.  &lt;/p&gt;</comment>
                            <comment id="73430" author="di.wang" created="Fri, 13 Dec 2013 01:36:31 +0000"  >&lt;p&gt;Hmm, could you please try this plus 8550&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c
index cc592a7..f9ef390 100644
--- a/lustre/mdc/mdc_request.c
+++ b/lustre/mdc/mdc_request.c
@@ -189,14 +189,13 @@ static int mdc_getattr_common(struct obd_export *exp,
 
         CDEBUG(D_NET, &quot;mode: %o\n&quot;, body-&amp;gt;mode);
 
-        if (body-&amp;gt;eadatasize != 0) {
-                mdc_update_max_ea_from_body(exp, body);
-
-                eadata = req_capsule_server_sized_get(pill, &amp;amp;RMF_MDT_MD,
-                                                      body-&amp;gt;eadatasize);
-                if (eadata == NULL)
-                        RETURN(-EPROTO);
-        }
+       mdc_update_max_ea_from_body(exp, body);
+       if (body-&amp;gt;eadatasize != 0) {
+               eadata = req_capsule_server_sized_get(pill, &amp;amp;RMF_MDT_MD,
+                                                     body-&amp;gt;eadatasize);
+               if (eadata == NULL)
+                       RETURN(-EPROTO);
+       }
 
         if (body-&amp;gt;valid &amp;amp; OBD_MD_FLRMTPERM) {


&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="73431" author="di.wang" created="Fri, 13 Dec 2013 01:46:43 +0000"  >&lt;p&gt;Hmm, I compared lustre b2_4 code with lustre chaos 2.4, in mdc_intent_getattr_pack&lt;/p&gt;

&lt;p&gt;chaos lustre 2.4 &lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;        /* pack the intended request */
        mdc_getattr_pack(req, valid, it-&amp;gt;it_flags, op_data,
                         obddev-&amp;gt;u.cli.cl_default_mds_easize);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;my lustre b2_4 branch&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;
     /* pack the intended request */
        mdc_getattr_pack(req, valid, it-&amp;gt;it_flags, op_data,
                         obddev-&amp;gt;u.cli.cl_max_mds_easize);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Ned, could you please confirm that? Did someone change this code in chaos?&lt;/p&gt;</comment>
                            <comment id="73432" author="di.wang" created="Fri, 13 Dec 2013 01:51:37 +0000"  >&lt;p&gt;Oh, it seems from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3338&quot; title=&quot;IOC_MDC_GETFILESTRIPE can abuse vmalloc()&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3338&quot;&gt;&lt;del&gt;LU-3338&lt;/del&gt;&lt;/a&gt;. Ned, does this patch in your lustre source(lustre-2.4.0-19chaos)? &lt;/p&gt;</comment>
                            <comment id="73434" author="nedbass" created="Fri, 13 Dec 2013 02:10:57 +0000"  >&lt;p&gt;Yes, we are running with the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3338&quot; title=&quot;IOC_MDC_GETFILESTRIPE can abuse vmalloc()&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3338&quot;&gt;&lt;del&gt;LU-3338&lt;/del&gt;&lt;/a&gt; patch.&lt;/p&gt;</comment>
                            <comment id="73435" author="nedbass" created="Fri, 13 Dec 2013 02:25:14 +0000"  >&lt;p&gt;Initial results with your latest patch to &lt;tt&gt;mdc_getattr_common()&lt;/tt&gt; are good.  The problem is not happening so far with that patch.&lt;/p&gt;</comment>
                            <comment id="73436" author="di.wang" created="Fri, 13 Dec 2013 02:36:24 +0000"  >&lt;p&gt;Hmm, the patch try to get the max_md_size from MDS initially, which is MAX_MD_SIZE; (4 stripes), so initially, it will set the default_md_size to be 4 stripes, intead of one,&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;         cli-&amp;gt;cl_default_mds_easize =
                            min_t(__u32, body-&amp;gt;max_mdsize, PAGE_CACHE_SIZE);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I am not so sure this is the right way to fix this.&lt;/p&gt;</comment>
                            <comment id="73611" author="di.wang" created="Mon, 16 Dec 2013 19:38:22 +0000"  >&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/#/c/8550/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/8550/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="73648" author="di.wang" created="Tue, 17 Dec 2013 04:44:55 +0000"  >&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/8599&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/8599&lt;/a&gt; for master branch&lt;/p&gt;</comment>
                            <comment id="74482" author="pjones" created="Tue, 7 Jan 2014 16:30:47 +0000"  >&lt;p&gt;Fix landed for 2.6&lt;/p&gt;</comment>
                            <comment id="74551" author="james beal" created="Wed, 8 Jan 2014 10:54:23 +0000"  >&lt;p&gt;Is it too late for this to hit 2.5.1 ?&lt;/p&gt;</comment>
                            <comment id="74560" author="pjones" created="Wed, 8 Jan 2014 14:26:28 +0000"  >&lt;p&gt;No it is not too late so this is still possible.&lt;/p&gt;</comment>
                            <comment id="74563" author="james beal" created="Wed, 8 Jan 2014 14:59:29 +0000"  >&lt;p&gt;Thank you that would be appreciated...&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzw89j:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>11490</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>