<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:38:02 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-3916] MDS crash, RIP  :obdclass:lprocfs_exp_setup+0x449/0xd90</title>
                <link>https://jira.whamcloud.com/browse/LU-3916</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;NOAA ran into a kernel panic on an mds that appears to have something to do with the MGS procfs system:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Unable to handle kernel NULL pointer dereference at 0000000000000050 RIP: 
 [&amp;lt;ffffffff8af85f39&amp;gt;] :obdclass:lprocfs_exp_setup+0x449/0xd90
PGD 914f35067 PUD 914f36067 PMD 0 
Oops: 0000 [1] SMP 
last sysfs file: /&lt;span class=&quot;code-keyword&quot;&gt;class/&lt;/span&gt;infiniband_mad/umad0/port
CPU 21 
Modules linked in: mds(U) fsfilt_ldiskfs(U) mgs(U) mgc(U) ldiskfs(U) jbd2(U) crc16(U) lustre(U) lov(U) mdc(U) lquota(U) osc(U) ko2iblnd(U) ptlrpc(U) obdclass(U) lnet(U) lvfs(U) libcfs(U) autofs4(U) ipmi_devintf(U) ipmi_si(U) ipmi_msghandler(U) ib_iser(U) libiscsi2(U) scsi_transport_iscsi2(U) scsi_transport_iscsi(U) ib_srp(U) rds(U) ib_sdp(U) ib_ipoib(U) ipoib_helper(U) rdma_ucm(U) rdma_cm(U) ib_ucm(U) ib_uverbs(U) ib_umad(U) ib_cm(U) iw_cm(U) ib_addr(U) ipv6(U) xfrm_nalgo(U) crypto_api(U) ib_sa(U) dm_round_robin(U) dm_multipath(U) scsi_dh(U) video(U) backlight(U) sbs(U) power_meter(U) hwmon(U) i2c_ec(U) dell_wmi(U) wmi(U) button(U) battery(U) asus_acpi(U) acpi_memhotplug(U) ac(U) parport_pc(U) lp(U) parport(U) mlx4_ib(U) ib_mad(U) ib_core(U) mlx4_en(U) joydev(U) sg(U) i2c_i801(U) igb(U) i2c_core(U) tpm_tis(U) tpm(U) tpm_bios(U) 8021q(U) mlx4_core(U) pcspkr(U) dca(U) serio_raw(U) dm_raid45(U) dm_message(U) dm_region_hash(U) dm_mem_cache(U) dm_snapshot(U) dm_zero(U) dm_mirror(U) dm_log(U) dm_mod(U) qla2xxx(U) scsi_transport_fc(U) ahci(U) libata(U) shpchp(U) mptsas(U) mptscsih(U) mptbase(U) scsi_transport_sas(U) sd_mod(U) scsi_mod(U) ext3(U) jbd(U) uhci_hcd(U) ohci_hcd(U) ehci_hcd(U)
Pid: 11472, comm: ll_mgs_12 Tainted: G     ---- 2.6.18-348.1.1.el5_lustre.es52 #1
RIP: 0010:[&amp;lt;ffffffff8af85f39&amp;gt;]  [&amp;lt;ffffffff8af85f39&amp;gt;] :obdclass:lprocfs_exp_setup+0x449/0xd90
RSP: 0018:ffff8102082f1ad0  EFLAGS: 00010202
RAX: ffff81121bc82cc0 RBX: ffff8104af91d400 RCX: 0000000000000681
RDX: 0000000000000000 RSI: ffff81121bc82cc8 RDI: ffff81121bc82cc8
RBP: ffff81120c246140 R08: 0000000000000001 R09: 0000000000000000
R10: ffff81120c246140 R11: 0000000000000058 R12: ffff8104af91d400
R13: ffff8103879c8038 R14: ffff810384ffd5b0 R15: ffff8102082f1b5c
FS:  00002b68894c66e0(0000) GS:ffff81123fdda8c0(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 0000000000000050 CR3: 0000000914f34000 CR4: 00000000000006a0
&lt;span class=&quot;code-object&quot;&gt;Process&lt;/span&gt; ll_mgs_12 (pid: 11472, threadinfo ffff8102082f0000, task ffff8102fcd63860)
Stack:  0000000000000000 0000000000000000 ffffffff8afb3986 ffff81038a2055c0
 0000003000000020 ffff8102082f1bd0 ffff8102082f1b10 ffff81120c246148
 ffff81121bc82cc0 ffff8104af91d400 ffff81038485e128 ffff8102082f1ca0
Call Trace:
 [&amp;lt;ffffffff8b2f4a70&amp;gt;] :mgs:mgs_handle+0x0/0x16d0
 [&amp;lt;ffffffff8b2f9450&amp;gt;] :mgs:mgs_export_stats_init+0x20/0xe0
 [&amp;lt;ffffffff8b2f34de&amp;gt;] :mgs:mgs_reconnect+0x14e/0x1e0
 [&amp;lt;ffffffff8b03c307&amp;gt;] :ptlrpc:lustre_msg_add_op_flags+0x47/0x120
 [&amp;lt;ffffffff8b03cea5&amp;gt;] :ptlrpc:lustre_msg_get_conn_cnt+0x35/0xf0
 [&amp;lt;ffffffff8b006cf0&amp;gt;] :ptlrpc:target_handle_connect+0x24c0/0x2e80
 [&amp;lt;ffffffff8af27b00&amp;gt;] :lnet:lnet_match_blocked_msg+0x360/0x390
 [&amp;lt;ffffffff80158202&amp;gt;] __next_cpu+0x19/0x28
 [&amp;lt;ffffffff8b2f4f5e&amp;gt;] :mgs:mgs_handle+0x4ee/0x16d0
 [&amp;lt;ffffffff800471ee&amp;gt;] try_to_wake_up+0x472/0x484
 [&amp;lt;ffffffff8b046874&amp;gt;] :ptlrpc:ptlrpc_server_handle_request+0x984/0xe00
 [&amp;lt;ffffffff8b046fd5&amp;gt;] :ptlrpc:ptlrpc_wait_event+0x2e5/0x310
 [&amp;lt;ffffffff8008d7a6&amp;gt;] __wake_up_common+0x3e/0x68
 [&amp;lt;ffffffff8b047f16&amp;gt;] :ptlrpc:ptlrpc_main+0xf16/0x10e0
 [&amp;lt;ffffffff8005dfc1&amp;gt;] child_rip+0xa/0x11
 [&amp;lt;ffffffff8b047000&amp;gt;] :ptlrpc:ptlrpc_main+0x0/0x10e0
 [&amp;lt;ffffffff8005dfb7&amp;gt;] child_rip+0x0/0x11
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I ran gdb on obdclass and it looks like the panic is here:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;(gdb) list *(lprocfs_exp_setup+0x449)
0x2cf39 is in lprocfs_exp_setup (/vault/builds/workspace/Lustre_ES_1.5/build-area/BUILD/lustre-1.8.9/lustre/obdclass/lprocfs_status.c:1729).
1724                   atomic_read(&amp;amp;new_stat-&amp;gt;nid_exp_ref_count));
1725    
1726            /* we need to release old stats because lprocfs_exp_cleanup() hasn&apos;t
1727             * been and will never be called. */
1728            &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (exp-&amp;gt;exp_nid_stats != NULL) {
1729                    nidstat_putref(exp-&amp;gt;exp_nid_stats);
1730                    exp-&amp;gt;exp_nid_stats = NULL;
1731            }
1732    
1733            /* Return -EALREADY here so that we know that the /proc
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Is it possible that lprocfs_exp_setup was called twice by two separate threads? If so, it seems like this could happen if 1730 was executed and then 1729. &lt;/p&gt;

&lt;p&gt;I&apos;ve attached crash bt and log files. &lt;/p&gt;</description>
                <environment></environment>
        <key id="20878">LU-3916</key>
            <summary>MDS crash, RIP  :obdclass:lprocfs_exp_setup+0x449/0xd90</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="bobijam">Zhenyu Xu</assignee>
                                    <reporter username="kitwestneat">Kit Westneat</reporter>
                        <labels>
                    </labels>
                <created>Tue, 10 Sep 2013 13:08:14 +0000</created>
                <updated>Mon, 30 Sep 2013 14:31:49 +0000</updated>
                            <resolved>Mon, 30 Sep 2013 14:31:49 +0000</resolved>
                                    <version>Lustre 1.8.9</version>
                                    <fixVersion>Lustre 2.5.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>3</watches>
                                                                            <comments>
                            <comment id="66189" author="pjones" created="Tue, 10 Sep 2013 15:35:18 +0000"  >&lt;p&gt;Bobijam&lt;/p&gt;

&lt;p&gt;Could you please look into this one?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="66300" author="bobijam" created="Wed, 11 Sep 2013 03:16:35 +0000"  >&lt;p&gt;yes, the MGS handles the same client reconnection requests with two different threads.&lt;/p&gt;

&lt;p&gt;Lustre: 11475:0:(ldlm_lib.c:576:target_handle_reconnect()) MGS: b5a21c59-87ef-a4c5-b039-d953bd254eea reconnecting&lt;br/&gt;
Lustre: 11472:0:(ldlm_lib.c:576:target_handle_reconnect()) MGS: b5a21c59-87ef-a4c5-b039-d953bd254eea reconnecting&lt;br/&gt;
Lustre: 11472:0:(ldlm_lib.c:576:target_handle_reconnect()) Skipped 7 previous similar messages&lt;br/&gt;
Lustre: 11475:0:(ldlm_lib.c:576:target_handle_reconnect()) Skipped 7 previous similar messages&lt;/p&gt;</comment>
                            <comment id="66302" author="bobijam" created="Wed, 11 Sep 2013 04:28:40 +0000"  >&lt;p&gt;b1_8 patch tracking at &lt;a href=&quot;http://review.whamcloud.com/7605&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/7605&lt;/a&gt;&lt;br/&gt;
master patch tracking at &lt;a href=&quot;http://review.whamcloud.com/7606&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/7606&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="67941" author="pjones" created="Mon, 30 Sep 2013 14:31:49 +0000"  >&lt;p&gt;Landed for 2.5&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="13449" name="2013-09-04.foreach_bt.out" size="431592" author="kitwestneat" created="Tue, 10 Sep 2013 13:08:14 +0000"/>
                            <attachment id="13448" name="2013-09-04.log.out" size="239575" author="kitwestneat" created="Tue, 10 Sep 2013 13:08:14 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzw1mf:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>10344</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>