<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:40:56 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-11100] Clients hangs in LNetMDUnlink</title>
                <link>https://jira.whamcloud.com/browse/LU-11100</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Clients hang in LNetMDUnlink. May be a dup of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11092&quot; title=&quot;NMI watchdog: BUG: soft lockup - CPU#12 stuck for 23s! [ptlrpcd_00_18:4222]&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11092&quot;&gt;LU-11092&lt;/a&gt; and &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10669&quot; title=&quot;Potential race condition when unlinking MD&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10669&quot;&gt;LU-10669&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
[166855.238376] CPU: 33 PID: 2938 Comm: ptlrpcd_01_02 Tainted: P        W  OEL  NX 4.4.90-92.45.1.20171031-nasa #1
[166855.238378] Hardware name: SGI.COM SUMMIT/S2600GZ, BIOS SE5C600.86B.02.02.0002.122320131210 12/23/2013
[166855.238381] task: ffff8807db820bc0 ti: ffff8807db824000 task.ti: ffff8807db824000
[166855.238383] RIP: 0010:[&amp;lt;ffffffff810cc0a1&amp;gt;]  [&amp;lt;ffffffff810cc0a1&amp;gt;] native_queued_spin_lock_slowpath+0x111/0x1a0
[166855.238392] RSP: 0018:ffff8807db827b98  EFLAGS: 00000246
[166855.238393] RAX: 0000000000000000 RBX: ffff880fe93574e0 RCX: 0000000000880000
[166855.238395] RDX: ffff88081e2567c0 RSI: 0000000000280001 RDI: ffff88101cdb6e00
[166855.238396] RBP: ffff8807db827b98 R08: ffff88101db567c0 R09: 0000000000000000
[166855.238398] R10: 0000000000000000 R11: ffff880ee98f8817 R12: 0000000000000008
[166855.238400] R13: 000000000a222d0f R14: 0000000000000001 R15: 0000000000000000
[166855.238402] FS:  0000000000000000(0000) GS:ffff88101db40000(0000) knlGS:0000000000000000
[166855.238403] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[166855.238405] CR2: 0000000000641038 CR3: 0000000001afe000 CR4: 00000000001406e0
[166855.238407] Stack:
[166855.238408]  ffff8807db827ba8 ffffffff8119162a ffff8807db827bb8 ffffffff8161e640
[166855.238411]  ffff8807db827be0 ffffffffa0a96683 ffffffffa1dc78e7 0000000000000001
[166855.238414]  000000002888b43d ffff8807db827cb8 ffffffffa0b254f5 ffffffffa1dc78d8
[166855.238417] Call Trace:
[166855.238431]  [&amp;lt;ffffffff8119162a&amp;gt;] queued_spin_lock_slowpath+0xb/0xf
[166855.238439]  [&amp;lt;ffffffff8161e640&amp;gt;] _raw_spin_lock+0x20/0x30
[166855.238467]  [&amp;lt;ffffffffa0a96683&amp;gt;] cfs_percpt_lock+0x53/0x100 [libcfs]
[166855.238510]  [&amp;lt;ffffffffa0b254f5&amp;gt;] LNetMDUnlink+0x65/0x150 [lnet]
[166855.238573]  [&amp;lt;ffffffffa1d5cc88&amp;gt;] ptlrpc_unregister_reply+0xf8/0x6f0 [ptlrpc]
[166855.238636]  [&amp;lt;ffffffffa1d616d8&amp;gt;] ptlrpc_expire_one_request+0xb8/0x430 [ptlrpc]
[166855.238674]  [&amp;lt;ffffffffa1d61aff&amp;gt;] ptlrpc_expired_set+0xaf/0x190 [ptlrpc]
[166855.238719]  [&amp;lt;ffffffffa1d8f998&amp;gt;] ptlrpcd+0x258/0x4e0 [ptlrpc]
[166855.238729]  [&amp;lt;ffffffff8109f276&amp;gt;] kthread+0xd6/0xf0
[166855.238735]  [&amp;lt;ffffffff8161ed3f&amp;gt;] ret_from_fork+0x3f/0x70
[166855.241341] DWARF2 unwinder stuck at ret_from_fork+0x3f/0x70
[166855.241342] 
[166855.241343] Leftover inexact backtrace:
                
[166855.241348]  [&amp;lt;ffffffff8109f1a0&amp;gt;] ? kthread_park+0x60/0x60
 &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;We will try to get a reproducer.&lt;/p&gt;</description>
                <environment>client sles12sp2 lustre 2.10.3&lt;br/&gt;
servers 2.7.3 and 2.10.3</environment>
        <key id="52597">LU-11100</key>
            <summary>Clients hangs in LNetMDUnlink</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="4">Incomplete</resolution>
                                        <assignee username="ashehata">Amir Shehata</assignee>
                                    <reporter username="mhanafi">Mahmoud Hanafi</reporter>
                        <labels>
                            <label>lnet</label>
                    </labels>
                <created>Mon, 25 Jun 2018 22:57:41 +0000</created>
                <updated>Thu, 3 Jun 2021 13:53:15 +0000</updated>
                            <resolved>Wed, 6 Jan 2021 12:41:37 +0000</resolved>
                                    <version>Lustre 2.10.3</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>12</watches>
                                                                            <comments>
                            <comment id="229734" author="jgmitter" created="Tue, 26 Jun 2018 14:41:30 +0000"  >&lt;p&gt;Hi Amir,&lt;/p&gt;

&lt;p&gt;Can you please have a look and see if you think it is a duplicate with a known fix or a new issue?&lt;/p&gt;

&lt;p&gt;Thanks.&lt;/p&gt;

&lt;p&gt;Joe&lt;/p&gt;</comment>
                            <comment id="229768" author="mhanafi" created="Wed, 27 Jun 2018 22:16:24 +0000"  >&lt;p&gt;I got this to reproduce using the following steps.&lt;/p&gt;
&lt;ol&gt;
	&lt;li&gt;Put some load on client.
	&lt;ol&gt;
		&lt;li&gt;stress --vm-bytes 8g --vm 5 --vm-keep --cpu 5&lt;/li&gt;
	&lt;/ol&gt;
	&lt;/li&gt;
	&lt;li&gt;Run lots of stats. I used recursive ls.
	&lt;ol&gt;
		&lt;li&gt;cd /lustrefs&lt;/li&gt;
		&lt;li&gt;for i in `ls -1 |tail -40`;do ls -lR ${i} &amp;gt; /dev/null 2&amp;gt;&amp;amp;1 &amp;amp; done&lt;/li&gt;
	&lt;/ol&gt;
	&lt;/li&gt;
	&lt;li&gt;Wait a few mins (1 to 2min)&lt;/li&gt;
	&lt;li&gt;Drop caches
	&lt;ol&gt;
		&lt;li&gt;echo 2 &amp;gt; /proc/sys/vm/drop_caches&lt;/li&gt;
		&lt;li&gt;May need to do this a few times&lt;/li&gt;
	&lt;/ol&gt;
	&lt;/li&gt;
	&lt;li&gt;kill off the ls commands
	&lt;ol&gt;
		&lt;li&gt;pkill ls&lt;/li&gt;
	&lt;/ol&gt;
	&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;&#160;You will see connection lost to some OSTs and all ptlrpcd* threads will go into %100 run state. Then you will get softlockup stack dumps.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="229769" author="ashehata" created="Thu, 28 Jun 2018 01:38:56 +0000"  >&lt;p&gt;Did this problem start happening with 2.10.4 only? Would we be able to try the reproducer on 2.10.3?&lt;/p&gt;

&lt;p&gt;&#160;Can we also get a crash dump with the system in that state?&lt;/p&gt;</comment>
                            <comment id="229797" author="mhanafi" created="Thu, 28 Jun 2018 21:53:43 +0000"  >&lt;p&gt;I am using 2.10.3 plus patches. This did started with 2.10.3 and sles12SP2&lt;/p&gt;

&lt;p&gt;I will get a crash dump and upload it.&lt;/p&gt;</comment>
                            <comment id="229798" author="ashehata" created="Thu, 28 Jun 2018 23:11:21 +0000"  >&lt;p&gt;I was looking through the tickets and I found a very similar problem on an internal ticket with the same stack trace that has been hit on 2.5ish (2.5+other patches). So this might not be new to 2.10.3. The crash dump might show us the state of the lock and if it&apos;s being held by another process. Would you be able to also include the kernel and lustre debug rpms so I can get a hold of the vmlinux and debug symbols. Will make it easier to look through the crash. thanks.&lt;/p&gt;</comment>
                            <comment id="229977" author="mhanafi" created="Thu, 5 Jul 2018 20:06:21 +0000"  >&lt;p&gt;Amir,&lt;/p&gt;

&lt;p&gt;Have you had a chance to look over the crash dumps?&lt;/p&gt;</comment>
                            <comment id="229986" author="ashehata" created="Thu, 5 Jul 2018 21:10:44 +0000"  >&lt;p&gt;I&apos;m having trouble opening the crash file:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
crash: ./usr/lib/debug/boot/vmlinux-4.4.126-94.22.2.20180514-nasa.debug: no text and data contentscrash: The namelist argument supplied in &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; &lt;span class=&quot;code-keyword&quot;&gt;case&lt;/span&gt; is a debuginfo file,
which must be accompanied by the kernel file from which it was derived.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;I&apos;m wondering if I need to run crash on sles. I&apos;ll try setting up a VM with sles installed.&lt;/p&gt;

&lt;p&gt;In the meantime, are you able to startup crash?&lt;/p&gt;


&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 crash  ../vmcore ./usr/lib/debug/boot/vmlinux-4.4.126-94.22.2.20180514-nasa.debug&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="229988" author="mhanafi" created="Thu, 5 Jul 2018 21:28:38 +0000"  >&lt;p&gt;Crash on sles is a little different. This is how I start it&lt;/p&gt;

&lt;p&gt;crash vmcore vmlinux-4.4.126-94.22.2.20180514-nasa.gz&lt;/p&gt;

&lt;p&gt;You may need a sles system.&lt;/p&gt;</comment>
                            <comment id="229989" author="ashehata" created="Thu, 5 Jul 2018 21:46:04 +0000"  >&lt;p&gt;I&apos;m setting up one right now. Will let you know how it goes.&lt;/p&gt;</comment>
                            <comment id="230025" author="ashehata" created="Fri, 6 Jul 2018 22:00:25 +0000"  >&lt;p&gt;Looking at the crash dump it&apos;s not immediately obvious to me where/if there is a deadlock. I see that the res_lock for both CPTs in the system are taken, but the rest of the stack traces don&apos;t show who has it locked. My examination of the code so far doesn&apos;t reveal a place where we&apos;re not unlocking.&lt;/p&gt;

&lt;p&gt;I&apos;ll continue examining the dump and the code. In the meantime, would we be able to determine when this hang started happening? Can we try 2.10.2 and if it happens there then 2.10.1?&lt;/p&gt;

&lt;p&gt;I&apos;m just trying to narrow down the set of changes that could&apos;ve caused this issue.&lt;/p&gt;</comment>
                            <comment id="230027" author="ashehata" created="Fri, 6 Jul 2018 23:22:24 +0000"  >&lt;p&gt;Also is it possible to get a link to the Lustre tree&#160; you&apos;re using, so I can make sure I&apos;m matching the stack trace with the code properly.&lt;/p&gt;</comment>
                            <comment id="230028" author="jaylan" created="Fri, 6 Jul 2018 23:40:23 +0000"  >&lt;p&gt;My 2.10.3 git repo is at&#160;git://github.com/jlan/lustre-nas&lt;br/&gt;
 &#160; &#160; It is open.&lt;/p&gt;

&lt;p&gt;If you need my 2.7.3-fe git repo, please give me your github ID so that I can grant you access. You can email me your github ID.&#160;&lt;/p&gt;

&lt;p&gt;Thanks!&lt;/p&gt;</comment>
                            <comment id="230255" author="ashehata" created="Fri, 13 Jul 2018 17:51:03 +0000"  >&lt;p&gt;The crash dump shows that the md/me lists are growing extensively, in excess of 5Million entries. This could explain why we&apos;re seeing soft lockups. In LNetMDUnlink() we call lnet_res_lh_lookup(), although this looks through the hash, but because we have so many entries, the hash can still collide growing the lists.&lt;/p&gt;

&lt;p&gt;The next step is to understand why the lists are growing. Would it be ok to provide a debug patch to confirm that this is the issue and to extract further information, if need be.&lt;/p&gt;</comment>
                            <comment id="230256" author="mhanafi" created="Fri, 13 Jul 2018 17:56:53 +0000"  >&lt;p&gt;Yes a debug patch will work.&lt;/p&gt;</comment>
                            <comment id="230313" author="ashehata" created="Mon, 16 Jul 2018 22:57:40 +0000"  >&lt;p&gt;Attached is a debug patch based on the github repo you pointed me to.&lt;/p&gt;

&lt;p&gt;This times the lnet_res_lh_lookup() and prints an error message to console if it took more than 5 seconds to finish&lt;/p&gt;

&lt;p&gt;When you reproduce the problem, if we can monitor dmesg and see if any of these messages pop up. They are tagged with &quot;NASA_DEBUG&quot;.&lt;/p&gt;

&lt;p&gt;I&apos;m still scrubbing through the core, printing out the length of each MD hash list to see if there is anyone that is of excessive length. It&apos;s taking a while.&lt;/p&gt;</comment>
                            <comment id="230404" author="ashehata" created="Wed, 18 Jul 2018 02:21:05 +0000"  >&lt;p&gt;Oleg has also pointed me to this ticket which seems to be related to the issue you&apos;re running into. Would you be able to apply the patch and see if it resolves the hang?&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11079&quot; class=&quot;external-link&quot; rel=&quot;nofollow&quot;&gt;https://jira.whamcloud.com/browse/LU-11079&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="230497" author="jaylan" created="Wed, 18 Jul 2018 19:58:54 +0000"  >&lt;p&gt;Hi Amir, &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11079&quot; title=&quot;Control concurrent statahead instances&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11079&quot;&gt;&lt;del&gt;LU-11079&lt;/del&gt;&lt;/a&gt; patch can not be applied to b2_10 cleanly. Is there other commit that I need to cherry-pick first?&lt;/p&gt;</comment>
                            <comment id="230596" author="ashehata" created="Thu, 19 Jul 2018 20:09:55 +0000"  >&lt;p&gt;Hi Jay, I attach the patch ported to the NASA repo.&lt;/p&gt;

&lt;p&gt;Let me know how it goes.&lt;/p&gt;</comment>
                            <comment id="230610" author="jaylan" created="Thu, 19 Jul 2018 22:51:24 +0000"  >&lt;p&gt;Thank you Amir.&lt;/p&gt;</comment>
                            <comment id="231180" author="mhanafi" created="Tue, 31 Jul 2018 19:24:30 +0000"  >&lt;p&gt;I tired the patch and was unable to reproduce the hang. We will try the patch on some production servers for additional testing.&lt;/p&gt;</comment>
                            <comment id="231329" author="jaylan" created="Thu, 2 Aug 2018 19:12:37 +0000"  >&lt;p&gt;Hi,&lt;/p&gt;

&lt;p&gt;Could you also back port the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11079&quot; title=&quot;Control concurrent statahead instances&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11079&quot;&gt;&lt;del&gt;LU-11079&lt;/del&gt;&lt;/a&gt; patch to b2_11? The patch from master branch caused merge conflicts, and the patch from b2_10 caused compilation errors.&#160;&lt;/p&gt;



&lt;p&gt;Thanks,&lt;br/&gt;
Jay&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="231426" author="jaylan" created="Fri, 3 Aug 2018 20:30:13 +0000"  >&lt;p&gt;Ah, I figured out the problem that caused the compilation errors. The nasa_LU_11079.patch in the attachments is not the same as the patch in the b2_10 reviews. Applying the patch from the attachments to nas-2.11.0 addressed the merge and compilation problem.&lt;/p&gt;

&lt;p&gt;A back port to b2_11 is not needed. Thanks.&lt;/p&gt;</comment>
                            <comment id="233065" author="jaylan" created="Wed, 5 Sep 2018 19:40:25 +0000"  >&lt;p&gt;Admir,&lt;/p&gt;

&lt;p&gt;We still hit this problem with &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11079&quot; title=&quot;Control concurrent statahead instances&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11079&quot;&gt;&lt;del&gt;LU-11079&lt;/del&gt;&lt;/a&gt; applied late last week.&lt;/p&gt;

&lt;p&gt;Jay&lt;/p&gt;</comment>
                            <comment id="233066" author="jwallior" created="Wed, 5 Sep 2018 19:49:48 +0000"  >&lt;p&gt;FYI: This sounds similar to an issue we hit: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11092&quot; title=&quot;NMI watchdog: BUG: soft lockup - CPU#12 stuck for 23s! [ptlrpcd_00_18:4222]&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11092&quot;&gt;LU-11092&lt;/a&gt;. We fixed it by changing the ldlm lru_* settings. &lt;/p&gt;</comment>
                            <comment id="233133" author="jaylan" created="Thu, 6 Sep 2018 18:40:27 +0000"  >&lt;p&gt;Hi Julien,&lt;/p&gt;

&lt;p&gt;Thank you for your info. I did not realize that Mahmoud has done the ldlm settings at our site. I was trying to find out if I miss any patch that I need to cherry-pick. Thanks.&lt;/p&gt;</comment>
                            <comment id="233190" author="mhanafi" created="Fri, 7 Sep 2018 23:00:49 +0000"  >&lt;p&gt;We tried the settings recommend in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11092&quot; title=&quot;NMI watchdog: BUG: soft lockup - CPU#12 stuck for 23s! [ptlrpcd_00_18:4222]&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11092&quot;&gt;LU-11092&lt;/a&gt;. But it is making things worse because we are hitting &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9230&quot; title=&quot;soft lockup on v2.9 Lustre clients (ldlm?)&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9230&quot;&gt;&lt;del&gt;LU-9230&lt;/del&gt;&lt;/a&gt; more frequently.&lt;/p&gt;</comment>
                            <comment id="233288" author="jaylan" created="Mon, 10 Sep 2018 18:23:07 +0000"  >&lt;p&gt;For documentation purpose, backporting &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9230&quot; title=&quot;soft lockup on v2.9 Lustre clients (ldlm?)&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9230&quot;&gt;&lt;del&gt;LU-9230&lt;/del&gt;&lt;/a&gt; to b2_10 is tracked in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11352&quot; title=&quot;backport of LU-9230 to 2.10.5&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11352&quot;&gt;&lt;del&gt;LU-11352&lt;/del&gt;&lt;/a&gt; &quot;backport of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9230&quot; title=&quot;soft lockup on v2.9 Lustre clients (ldlm?)&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9230&quot;&gt;&lt;del&gt;LU-9230&lt;/del&gt;&lt;/a&gt; to 2.10.5&quot;&lt;/p&gt;</comment>
                            <comment id="261664" author="mhanafi" created="Wed, 22 Jan 2020 20:25:52 +0000"  >&lt;p&gt;We are hitting this with lustre2.12.3. So the above patches didn&apos;t fix the issue.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
Jan 21 10:19:20 pfe24 kernel: [1579630760.775887] Modules linked in: iptable_nat(E) nf_nat_ipv4(E) nf_nat(E) binfmt_misc(E) fuse(E) mgc(OEN) lustre(OEN) lmv(OEN) mdc(OEN) fid(OEN) osc(OEN) rpcsec_gss_krb5(E) auth_rpcgss(E) lov(OEN) nfsv4(E) fld(OEN) dns_resolver(E) ko2iblnd(OEN) ptlrpc(OEN) obdclass(OEN) lnet(OEN) nfsv3(E) nfs_acl(E) nfs(E) lockd(E) grace(E) fscache(E) libcfs(OEN) rdma_ucm(OEX) ib_ucm(OEX) rdma_cm(OEX) iw_cm(OEX) configfs(E) ib_ipoib(OEX) ib_cm(OEX) ib_umad(OEX) bonding(E) iscsi_ibft(E) iscsi_boot_sysfs(E) nf_log_ipv6(E) nf_log_common(E) xt_LOG(E) nf_conntrack_ipv6(E) nf_defrag_ipv6(E) ip6table_filter(E) ip6_tables(E) xt_tcpudp(E) nf_conntrack_ipv4(E) nf_defrag_ipv4(E) xt_conntrack(E) iptable_filter(E) xt_CT(E) nf_conntrack(E) libcrc32c(E) iptable_raw(E) ip_tables(E) x_tables(E) mlx4_ib(OEX) ib_uverbs(OEX) ib_core(OEX)
Jan 21 10:19:20 pfe24 kernel: [1579630760.775887]  tcp_bic(EN) intel_rapl(E) x86_pkg_temp_thermal(E) intel_powerclamp(E) coretemp(E) kvm_intel(E) kvm(E) irqbypass(E) ipmi_ssif(E) crc32_pclmul(E) ghash_clmulni_intel(E) pcbc(E) mlx4_core(OEX) aesni_intel(E) iTCO_wdt(E) iTCO_vendor_support(E) aes_x86_64(E) crypto_simd(E) glue_helper(E) mlx_compat(OEX) devlink(E) cryptd(E) ipmi_si(E) ioatdma(E) igb(E) mei_me(E) mei(E) lpc_ich(E) wmi(E) ipmi_devintf(E) ipmi_msghandler(E) i2c_i801(E) shpchp(E) mfd_core(E) pcspkr(E) dca(E) button(E) acpi_cpufreq(E) sunrpc(E) ext4(E) crc16(E) jbd2(E) mbcache(E) sd_mod(E) csiostor(E) sr_mod(E) cdrom(E) mgag200(E) i2c_algo_bit(E) drm_kms_helper(E) syscopyarea(E) sysfillrect(E) sysimgblt(E) fb_sys_fops(E) ttm(E) isci(EX) ahci(E) cxgb4(E) drm(E) libahci(E) libsas(E) ptp(E) crc32c_intel(E) serio_raw(E) drm_panel_orientation_quirks(E)
Jan 21 10:19:20 pfe24 kernel: [1579630760.775887]  libata(E) scsi_transport_fc(E) pps_core(E) scsi_transport_sas(E) hwperf(OEX) numatools(OEX) xpmem(OEX) gru(E) sg(E) dm_multipath(E) dm_mod(E) scsi_dh_rdac(E) scsi_dh_emc(E) scsi_dh_alua(E) scsi_mod(E) autofs4(E)
Jan 21 10:19:20 pfe24 kernel: [1579630760.775887] Supported: No, Unreleased kernel
Jan 21 10:19:20 pfe24 kernel: [1579630760.775887] CPU: 10 PID: 4337 Comm: ptlrpcd_01_05 Tainted: G           OEL     4.12.14-95.40.1.20191112-nasa #1 SLE12-SP4 (unreleased)
Jan 21 10:19:20 pfe24 kernel: [1579630760.775887] Hardware name: SGI.COM C1104-RP7/X9DRW-3LN4F+/X9DRW-3TF+, BIOS 3.00 09/12/2013
Jan 21 10:19:20 pfe24 kernel: [1579630760.775887] task: ffff91f29dfeca00 task.stack: ffffa1c3cb810000
Jan 21 10:19:20 pfe24 kernel: [1579630760.775887] RIP: 0010:native_queued_spin_lock_slowpath+0xda/0x1d0
Jan 21 10:19:20 pfe24 kernel: [1579630760.775887] RSP: 0018:ffffa1c3cb813c30 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff10
Jan 21 10:19:20 pfe24 kernel: [1579630760.775887] RAX: 0000000000000000 RBX: 0000000000000001 RCX: 00000000002c0000
Jan 21 10:19:20 pfe24 kernel: [1579630760.775887] RDX: ffff91fadf2a3a00 RSI: ffff91fadf3e3a00 RDI: ffff91facfa7d040
Jan 21 10:19:20 pfe24 kernel: [1579630760.775887] RBP: ffffa1c3cb813d10 R08: 0000000000000000 R09: 0000000000000150
Jan 21 10:19:20 pfe24 kernel: [1579630760.775887] R10: 000000000000002d R11: ffff91f982e8d817 R12: 000000007068765d
Jan 21 10:19:20 pfe24 kernel: [1579630760.775887] R13: 000000001c1a1d97 R14: 0000000000000000 R15: ffff91f29dfeca00
Jan 21 10:19:20 pfe24 kernel: [1579630760.775887] FS:  0000000000000000(0000) GS:ffff91fadf280000(0000) knlGS:0000000000000000
Jan 21 10:19:20 pfe24 kernel: [1579630760.775887] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
Jan 21 10:19:20 pfe24 kernel: [1579630760.775887] CR2: 00002aaaaacc4000 CR3: 0000000d3700a001 CR4: 00000000000606e0
Jan 21 10:19:20 pfe24 kernel: [1579630760.775887] Call Trace:
Jan 21 10:19:20 pfe24 kernel: [1579630760.775887]  queued_spin_lock_slowpath+0x7/0xa
Jan 21 10:19:20 pfe24 kernel: [1579630760.775887]  LNetMDUnlink+0x65/0x150 [lnet]
Jan 21 10:19:20 pfe24 kernel: [1579630760.775887]  ptlrpc_unregister_reply+0xf2/0x6f0 [ptlrpc]
Jan 21 10:19:20 pfe24 kernel: [1579630760.775887]  ? ptlrpc_set_import_discon+0xf5/0x6e0 [ptlrpc]
Jan 21 10:19:20 pfe24 kernel: [1579630760.775887]  ptlrpc_expire_one_request+0xe4/0x4d0 [ptlrpc]
Jan 21 10:19:20 pfe24 kernel: [1579630760.775887]  ptlrpc_expired_set+0xa9/0x180 [ptlrpc]
Jan 21 10:19:20 pfe24 kernel: [1579630760.775887]  ptlrpcd+0x22e/0x4a0 [ptlrpc]
Jan 21 10:19:20 pfe24 kernel: [1579630760.775887]  ? wake_up_q+0x70/0x70
Jan 21 10:19:20 pfe24 kernel: [1579630760.775887]  kthread+0xff/0x140
Jan 21 10:19:20 pfe24 kernel: [1579630760.775887]  ? ptlrpcd_check+0x560/0x560 [ptlrpc]
Jan 21 10:19:20 pfe24 kernel: [1579630760.775887]  ? __kthread_parkme+0x70/0x70
Jan 21 10:19:20 pfe24 kernel: [1579630760.775887]  ret_from_fork+0x35/0x40

 &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="268279" author="mhanafi" created="Wed, 22 Apr 2020 18:26:05 +0000"  >&lt;p&gt;We hit this issue again today.&lt;/p&gt;</comment>
                            <comment id="268389" author="ashehata" created="Thu, 23 Apr 2020 16:54:40 +0000"  >&lt;p&gt;Is it possible to grab the stack traces of all the tasks when we hit this issue:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
echo t &amp;gt; /proc/sysrq-trigger &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;It would be useful to see who&apos;s holding the lock. Last I looked at the crash dump for this case, it looked like the MD/ME lists were growing, so I was suspecting that it takes a long time to go through them.&lt;/p&gt;</comment>
                            <comment id="288741" author="mhanafi" created="Wed, 6 Jan 2021 01:12:39 +0000"  >&lt;p&gt;This can be closed&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                            <outwardlinks description="duplicates">
                                        <issuelink>
            <issuekey id="52535">LU-11079</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                                        </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="30536" name="nasa_LU-11079.patch" size="3549" author="ashehata" created="Thu, 19 Jul 2018 20:09:13 +0000"/>
                            <attachment id="30515" name="nasa_debug.patch" size="2307" author="ashehata" created="Mon, 16 Jul 2018 22:57:04 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzzyhr:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10021"><![CDATA[2]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>