<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:09:37 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-14425] BUG: unable to handle kernel paging request at ffffffffffffffff</title>
                <link>https://jira.whamcloud.com/browse/LU-14425</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[10444.024266] BUG: unable to handle kernel paging request at ffffffffffffffff
[10444.034709] IP: 0xffffffffffffffff
[10444.038729] PGD 1e0c067 P4D 1e0c067 PUD 1e0e067 PMD 0 
[10444.046081] Oops: 0010 [#1] SMP PTI
[10444.050413] CPU: 1 PID: 2493 Comm: kgnilnd_sd_00 Tainted: P           O     4.12.14-197.7_5.0.96-cray_ari_c #1 SLE15 (unreleased)
[10444.069060] Hardware name: Cray Inc. Cascade/Cascade, BIOS 4.6.5 09/05/2019
[10444.078578] task: ffff880f898c31c0 task.stack: ffffc90007610000
[10444.087691] RIP: 0010:0xffffffffffffffff
[10444.092405] RSP: 0018:ffffc90007613ae0 EFLAGS: 00010286
[10444.099635] RAX: ffff880f87d11bc0 RBX: ffff88078e307000 RCX: 00000000ffffffff
[10444.109846] RDX: 0000000000000001 RSI: 0000000000000001 RDI: ffff88078e5e8f18
[10444.119712] RBP: ffffc90007613b90 R08: 0000000000000000 R09: 0000000000000000
[10444.130956] R10: 0000000000000000 R11: ffffea001e0c1a60 R12: ffff88078e5e8e58
[10444.142241] R13: 0000000000000001 R14: 0000000000000000 R15: ffffffffffffffff
[10444.152325] FS:  0000000000000000(0000) GS:ffff88085f840000(0000) knlGS:0000000000000000
[10444.164237] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[10444.172192] CR2: ffffffffffffffff CR3: 0000000001e0a001 CR4: 00000000001606e0
[10444.182560] Call Trace:
[10444.184378]  ? lnet_finalize+0x750/0x1130 [lnet]
[10444.190241]  ? _kgnilnd_schedule_conn+0x128/0x3a0 [kgnilnd]
[10444.197527]  kgnilnd_recv+0x5b6/0xcb0 [kgnilnd]
[10444.203490]  lnet_ni_recv+0x5a/0x2d0 [lnet]
[10444.209107]  lnet_recv_put+0x7f/0xb0 [lnet]
[10444.214445]  lnet_parse_local+0x642/0xdc0 [lnet]
[10444.220245]  ? gni_cq_get_event+0x47/0xb10 [kgni_ari]
[10444.227225]  lnet_parse+0xaf3/0x1120 [lnet]
[10444.231471]  ? kmem_cache_alloc+0x1cb/0x5e0
[10444.236043]  ? kgnilnd_check_fma_send_cq+0xdf5/0x1090 [kgnilnd]
[10444.244100]  kgnilnd_check_fma_rx+0x166d/0x1e90 [kgnilnd]
[10444.251562]  ? lock_timer_base+0x6b/0x90
[10444.256140]  kgnilnd_process_conns+0x527/0xe50 [kgnilnd]
[10444.263252]  ? kgnilnd_process_mapped_tx+0x574/0x810 [kgnilnd]
[10444.271873]  kgnilnd_scheduler+0x199/0x5b0 [kgnilnd]
[10444.278550]  ? wait_woken+0x80/0x80
[10444.282602]  kthread+0x121/0x140
[10444.285913]  ? kgnilnd_process_conns+0xe50/0xe50 [kgnilnd]
[10444.293607]  ? kthread_create_worker_on_cpu+0x50/0x50
[10444.300612]  ret_from_fork+0x3a/0x50
[10444.304848] Code:  Bad RIP value.
[10444.308187] Modules linked in: mgc(O) lustre(O) lmv(O) mdc(O) fid(O) lov(O) fld(O) osc(O) ptlrpc(O) obdclass(O) pm_api(O) xpmem(O) cmsr(O) bpmcdmod(O) x86_pkg_temp_thermal freemem(O) pcie_link_bw_monitor(O) ib_core(O) kdreg(O) kgnilnd(O) dvs(O) dvsipc(O) dvsipc_lnet(O) lnet(O) libcfs(O) dvsproc(O) gpcd_ari(O) ipogif_ari(O) kgni_ari(O) hwerr(PO) rca(O) heartbeat(O) simplex(PO) hss_os(PO) ghal_ari(O) craytrace(O)
[10444.369702] CR2: ffffffffffffffff
[10444.374673] ---[ end trace bd9de6e45567314b ]---
[10444.379464] RIP: 0010:0xffffffffffffffff
[10444.383581] RSP: 0018:ffffc90007613ae0 EFLAGS: 00010286
[10444.389141] RAX: ffff880f87d11bc0 RBX: ffff88078e307000 RCX: 00000000ffffffff
[10444.397849] RDX: 0000000000000001 RSI: 0000000000000001 RDI: ffff88078e5e8f18
[10444.406839] RBP: ffffc90007613b90 R08: 0000000000000000 R09: 0000000000000000
[10444.415731] R10: 0000000000000000 R11: ffffea001e0c1a60 R12: ffff88078e5e8e58
[10444.425544] R13: 0000000000000001 R14: 0000000000000000 R15: ffffffffffffffff
[10444.435234] FS:  0000000000000000(0000) GS:ffff88085f840000(0000) knlGS:0000000000000000
[10444.447830] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[10444.455364] CR2: ffffffffffffffff CR3: 0000000001e0a001 CR4: 00000000001606e0
[10444.465273] Kernel panic - not syncing: Fatal exception
[10445.505720] Shutting down cpus with NMI
[10445.520518] Kernel Offset: disabled
[10445.524528] ---[ end Kernel panic - not syncing: Fatal exception

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The crash appears here:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;static void
lnet_msg_detach_md(struct lnet_msg *msg, int status)
{
	struct lnet_libmd *md = msg-&amp;gt;msg_md;
	lnet_handler_t handler = NULL;
	int cpt = lnet_cpt_of_cookie(md-&amp;gt;md_lh.lh_cookie);
	int unlink;
 
	lnet_res_lock(cpt);
	while (md-&amp;gt;md_flags &amp;amp; LNET_MD_FLAG_HANDLING)
		/* An event handler is running - wait for it to
		 * complete to avoid races.
		 */
		lnet_md_wait_handling(md, cpt);
 
	/* Now it&apos;s safe to drop my caller&apos;s ref */
	md-&amp;gt;md_refcount--;
	LASSERT(md-&amp;gt;md_refcount &amp;gt;= 0);
 
	unlink = lnet_md_unlinkable(md);
	if (md-&amp;gt;md_handler) {
		if ((md-&amp;gt;md_flags &amp;amp; LNET_MD_FLAG_ABORTED) &amp;amp;&amp;amp; !status) {
			msg-&amp;gt;msg_ev.status   = -ETIMEDOUT;
			CDEBUG(D_NET, &quot;md 0x%p already unlinked\n&quot;, md);
		} else {
			msg-&amp;gt;msg_ev.status   = status;
		}
		msg-&amp;gt;msg_ev.unlinked = unlink;
		handler = md-&amp;gt;md_handler;
		if (!unlink)
			md-&amp;gt;md_flags |= LNET_MD_FLAG_HANDLING;
	}
 
	if (unlink || (md-&amp;gt;md_refcount == 0 &amp;amp;&amp;amp;
		       md-&amp;gt;md_threshold == LNET_MD_THRESH_INF))
		lnet_detach_rsp_tracker(md, cpt);
 
	msg-&amp;gt;msg_md = NULL;
	if (unlink)
		lnet_md_unlink(md);
 
	lnet_res_unlock(cpt);
 
	if (handler) {
		handler(&amp;amp;msg-&amp;gt;msg_ev);
		if (!unlink) {
			lnet_res_lock(cpt); /* # &amp;lt;&amp;lt;-------------------- crash is here. */
			md-&amp;gt;md_flags &amp;amp;= ~LNET_MD_FLAG_HANDLING;
			wake_up_var(md);
			lnet_res_unlock(cpt);
		}
	}
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Reverted &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10428&quot; title=&quot;LNet events should generated without resource lock held&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10428&quot;&gt;&lt;del&gt;LU-10428&lt;/del&gt;&lt;/a&gt; lnet: call event handlers without res_lock &lt;br/&gt;
and the crash seems to be gone&lt;/p&gt;</description>
                <environment></environment>
        <key id="62842">LU-14425</key>
            <summary>BUG: unable to handle kernel paging request at ffffffffffffffff</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="6">Not a Bug</resolution>
                                        <assignee username="wc-triage">WC Triage</assignee>
                                    <reporter username="stancheff">Shaun Tancheff</reporter>
                        <labels>
                    </labels>
                <created>Thu, 11 Feb 2021 15:16:19 +0000</created>
                <updated>Tue, 23 Mar 2021 21:49:01 +0000</updated>
                            <resolved>Tue, 23 Mar 2021 21:49:01 +0000</resolved>
                                    <version>Lustre 2.14.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="291744" author="adilger" created="Thu, 11 Feb 2021 15:34:17 +0000"  >&lt;p&gt;What version is this?&lt;/p&gt;</comment>
                            <comment id="291747" author="stancheff" created="Thu, 11 Feb 2021 15:40:29 +0000"  >&lt;p&gt;master from commit: 9cd651aead327ae4589b58dde5818b068c89b3e5&lt;br/&gt;
about 3 commits before 2.14.0-RC1&lt;/p&gt;</comment>
                            <comment id="291782" author="adilger" created="Thu, 11 Feb 2021 18:16:13 +0000"  >&lt;p&gt;Shaun, what kind of load was this under, and how often have you hit it?  Since 2.14.0 is almost out the door, unless this is a &quot;falls over continuously and is unusable&quot; kind of bug, then it may just miss 2.14.0 and be landed afterward. &lt;/p&gt;

&lt;p&gt;That said, I&apos;m not aware of any particular requirement for the original &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10428&quot; title=&quot;LNet events should generated without resource lock held&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10428&quot;&gt;&lt;del&gt;LU-10428&lt;/del&gt;&lt;/a&gt; patch, and if it is trading a theoretical bug for a very real crash, then I don&apos;t mind reverting it and landing the fix post-2.14.   Could you please push your revert patch to Gerrit so that it can start testing, and it will at least have a chance of being landed for 2.14.0. &lt;/p&gt;</comment>
                            <comment id="291783" author="adilger" created="Thu, 11 Feb 2021 18:19:18 +0000"  >&lt;p&gt;Oh, I didn&apos;t notice before - is this only affecting kgnilnd?  That would put it into the realm of &quot;not many people will use vanilla 2.14.0 in this config&quot;, since they will almost certainly be running a Cray-supplied client, so that pretty much takes it off the 2.14.0 candidate list, and a fix can be included later. &lt;/p&gt;</comment>
                            <comment id="291848" author="stancheff" created="Fri, 12 Feb 2021 07:46:30 +0000"  >&lt;p&gt;Load was I/O stress test with 40 client nodes.&lt;br/&gt;
It hit early and often; the system could not function and quickly brought down all the connected OSTs.&lt;/p&gt;</comment>

&lt;p&gt;I will be bringing the system up to 2.14.0 RC2 with the required cray additions (gnilnd fixes) and &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10428&quot; title=&quot;LNet events should generated without resource lock held&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10428&quot;&gt;&lt;del&gt;LU-10428&lt;/del&gt;&lt;/a&gt; reverted later today.&lt;/p&gt;</comment>
                            <comment id="291956" author="spitzcor" created="Sun, 14 Feb 2021 16:39:55 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=adilger&quot; class=&quot;user-hover&quot; rel=&quot;adilger&quot;&gt;adilger&lt;/a&gt;, while we have yet to experience this bug in an environment without gnilnd clients,  it doesn&apos;t seem like a defect with the patch from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10428&quot; title=&quot;LNet events should generated without resource lock held&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10428&quot;&gt;&lt;del&gt;LU-10428&lt;/del&gt;&lt;/a&gt; would be LND specific.  It seems like there is a risk that the bug would affect other architectures.  In any case, as Shaun says, we&apos;ll revert the patch with our gnilnd systems and continue testing. &lt;/p&gt;</comment>
                            <comment id="291957" author="adilger" created="Sun, 14 Feb 2021 18:58:07 +0000"  >&lt;p&gt;This is totally outside my area of expertise, but is there anything unusual in &lt;tt&gt;kgnilnd_recv()&lt;/tt&gt; that is not happening in other LNDs? &#160;Also, it seems a bit strange that it is crashing when accessing &quot;&lt;tt&gt;cpt&lt;/tt&gt;&quot;, since CPTs shouldn&apos;t really be changing except at startup, but I don&apos;t know this part of the code very well.&#160;&lt;/p&gt;</comment>
                            <comment id="291960" author="neilb" created="Sun, 14 Feb 2021 22:24:43 +0000"  >&lt;p&gt;I&apos;m not convinced that you&apos;ve identified the crash location correctly.&lt;/p&gt;

&lt;p&gt;I assume you found the location of the top address of the stack: lnet_finalize+0x750/0x1130&#160;&lt;/p&gt;

&lt;p&gt;This looks to be a return address that was pushed onto the stack when handler() was called.&lt;/p&gt;

&lt;p&gt;The &quot;?&quot; at the start of the line indicates that this address is &lt;b&gt;not&lt;/b&gt; part of the current call stack, it is just an old return address that happens to be on the stack.&lt;/p&gt;

&lt;p&gt;The place we should be looking is kgnilnd_recv+0x5b6/0xcb0&#160;&lt;/p&gt;

&lt;p&gt;If you can identify the line of code which this address corresponds to, that should help identify the root problem.&lt;/p&gt;</comment>
                            <comment id="295865" author="hornc" created="Tue, 23 Mar 2021 21:48:52 +0000"  >&lt;p&gt;This turned out to be a bug with DVS. We&apos;re porting DVS to work with some of these recent changes to the LNet API (namely &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13005&quot; title=&quot;Remove LNet event queuing subsystem (lib-eq)&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13005&quot;&gt;&lt;del&gt;LU-13005&lt;/del&gt;&lt;/a&gt;/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10428&quot; title=&quot;LNet events should generated without resource lock held&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10428&quot;&gt;&lt;del&gt;LU-10428&lt;/del&gt;&lt;/a&gt;) and we made a mistake. We&apos;re still working on that effort, but for now there&apos;s nothing to do here.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="49974">LU-10428</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i01mef:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                </customfields>
    </item>
</channel>
</rss>