<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:46:27 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-11734] LNet crashes with 2.12.0-rc1: lnet_attach_rsp_tracker() ASSERTION(md-&gt;md_rspt_ptr == ((void *)0)) failed</title>
                <link>https://jira.whamcloud.com/browse/LU-11734</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;In my testing I see at bring up:&lt;/p&gt;

&lt;p&gt;2018-12-05T14:24:14.274614-05:00 ninja84.ccs.ornl.gov kernel: Lustre: Lustre: Build Version: 2.12.0_RC1_dirty&lt;/p&gt;

&lt;p&gt;2018-12-05T14:24:14.671611-05:00 ninja84.ccs.ornl.gov kernel: LNetError: 3759:0:(lib-move.c:4429:lnet_attach_rsp_tracker()) ASSERTION( md-&amp;gt;md_r&lt;/p&gt;

&lt;p&gt;spt_ptr == ((void *)0) ) failed:&lt;/p&gt;

&lt;p&gt;2018-12-05T14:24:14.671682-05:00 ninja84.ccs.ornl.gov kernel: LNetError: 3759:0:(lib-move.c:4429:lnet_attach_rsp_tracker()) LBUG&lt;/p&gt;

&lt;p&gt;2018-12-05T14:24:14.671716-05:00 ninja84.ccs.ornl.gov kernel: Pid: 3759, comm: monitor_thread 3.10.0-862.3.2.el7.x86_64 #1 SMP Tue May 15 18:22&lt;/p&gt;

&lt;p&gt;:15 EDT 2018&lt;/p&gt;

&lt;p&gt;2018-12-05T14:24:14.681179-05:00 ninja84.ccs.ornl.gov kernel: Call Trace:&lt;/p&gt;

&lt;p&gt;2018-12-05T14:24:14.685051-05:00 ninja84.ccs.ornl.gov kernel: Lustre: Echo OBD driver; &lt;a href=&quot;http://www.lustre.org/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://www.lustre.org/&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;2018-12-05T14:24:14.695841-05:00 ninja84.ccs.ornl.gov kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffc0c267cc&amp;gt;&amp;#93;&lt;/span&gt; libcfs_call_trace+0x8c/0xc0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;2018-12-05T14:24:14.697263-05:00 ninja84.ccs.ornl.gov kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffc0c2687c&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0x4c/0xa0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;2018-12-05T14:24:14.703501-05:00 ninja84.ccs.ornl.gov kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffc102c257&amp;gt;&amp;#93;&lt;/span&gt; lnet_attach_rsp_tracker.isra.29+0xd7/0xe0 &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;2018-12-05T14:24:14.711154-05:00 ninja84.ccs.ornl.gov kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffc1033ba1&amp;gt;&amp;#93;&lt;/span&gt; LNetGet+0x5d1/0xa90 &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;2018-12-05T14:24:14.716870-05:00 ninja84.ccs.ornl.gov kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffc103c0b9&amp;gt;&amp;#93;&lt;/span&gt; lnet_check_routers+0x399/0xbf0 &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;2018-12-05T14:24:14.723556-05:00 ninja84.ccs.ornl.gov kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffc10354bf&amp;gt;&amp;#93;&lt;/span&gt; lnet_monitor_thread+0x4ff/0x560 &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;2018-12-05T14:24:14.730325-05:00 ninja84.ccs.ornl.gov kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8dabb161&amp;gt;&amp;#93;&lt;/span&gt; kthread+0xd1/0xe0&lt;/p&gt;

&lt;p&gt;2018-12-05T14:24:14.735265-05:00 ninja84.ccs.ornl.gov kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8e120677&amp;gt;&amp;#93;&lt;/span&gt; ret_from_fork_nospec_end+0x0/0x39&lt;/p&gt;

&lt;p&gt;2018-12-05T14:24:14.741596-05:00 ninja84.ccs.ornl.gov kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffffffffff&amp;gt;&amp;#93;&lt;/span&gt; 0xffffffffffffffff&lt;/p&gt;</description>
                <environment>lustre 2.12.0-rc1</environment>
        <key id="54200">LU-11734</key>
            <summary>LNet crashes with 2.12.0-rc1: lnet_attach_rsp_tracker() ASSERTION(md-&gt;md_rspt_ptr == ((void *)0)) failed</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="ashehata">Amir Shehata</assignee>
                                    <reporter username="simmonsja">James A Simmons</reporter>
                        <labels>
                    </labels>
                <created>Wed, 5 Dec 2018 19:40:07 +0000</created>
                <updated>Sat, 15 Dec 2018 16:03:55 +0000</updated>
                            <resolved>Sat, 8 Dec 2018 13:17:42 +0000</resolved>
                                    <version>Lustre 2.12.0</version>
                                    <fixVersion>Lustre 2.12.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>7</watches>
                                                                            <comments>
                            <comment id="238023" author="adilger" created="Wed, 5 Dec 2018 19:51:26 +0000"  >&lt;p&gt;Amir, could you please take a look?&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;James, could you please provide a bit of basic info about your network config?&#160; LND type, routing, module parameters, OFED version.&lt;/p&gt;</comment>
                            <comment id="238024" author="simmonsja" created="Wed, 5 Dec 2018 19:53:10 +0000"  >&lt;p&gt;Basic IB to IB mlx4 using default OFED stack on x86 servers.&lt;/p&gt;</comment>
                            <comment id="238028" author="ashehata" created="Wed, 5 Dec 2018 20:27:26 +0000"  >&lt;p&gt;working on a patch. Should have it up soon.&lt;/p&gt;</comment>
                            <comment id="238044" author="gerrit" created="Wed, 5 Dec 2018 22:19:47 +0000"  >&lt;p&gt;Amir Shehata (ashehata@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/33794&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/33794&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11734&quot; title=&quot;LNet crashes with 2.12.0-rc1: lnet_attach_rsp_tracker() ASSERTION(md-&amp;gt;md_rspt_ptr == ((void *)0)) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11734&quot;&gt;&lt;del&gt;LU-11734&lt;/del&gt;&lt;/a&gt; lnet: handle multi-md usage&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 14bf25aa1c1723ce4a0a88341e6e62299994cb8a&lt;/p&gt;</comment>
                            <comment id="238055" author="adilger" created="Wed, 5 Dec 2018 22:58:56 +0000"  >&lt;p&gt;This looks related to the issues seen in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10669&quot; title=&quot;Potential race condition when unlinking MD&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10669&quot;&gt;LU-10669&lt;/a&gt;?&lt;/p&gt;</comment>
                            <comment id="238068" author="ashehata" created="Thu, 6 Dec 2018 00:57:54 +0000"  >&lt;p&gt;I don&apos;t think this particular patch is relevant to that ticket since &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10669&quot; title=&quot;Potential race condition when unlinking MD&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10669&quot;&gt;LU-10669&lt;/a&gt; was before health. But it could be related in the sense that an MD can be used multiple times, which could lead to a corner scenario causing the assert:&lt;/p&gt;

&lt;p&gt;lib-msg.c:346:lnet_msg_detach_md()) ASSERTION( md-&amp;gt;md_refcount &amp;gt;= 0&lt;/p&gt;</comment>
                            <comment id="238100" author="simmonsja" created="Thu, 6 Dec 2018 16:37:29 +0000"  >&lt;p&gt;Patch seems to resolve my bringup issues. I&apos;m doing further testing to make sure no other regressions exist.&lt;/p&gt;</comment>
                            <comment id="238200" author="sthiell" created="Sat, 8 Dec 2018 02:54:34 +0000"  >&lt;p&gt;Just wanted to say that we hit the same issue with 2.12.0_RC1, it&apos;s super unstable.  I&apos;ve used quite extensively 2.11.56_3_g4e42995 but didn&apos;t hit this issue before the upgrade to CentOS 7.6 + 2.12.0_RC1.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[Fri Dec  7 17:28:33 2018]CentOS Linux 7 (Core)^M
[Fri Dec  7 17:28:33 2018]Kernel 3.10.0-957.1.3.el7_lustre.x86_64 on an x86_64^M
[Fri Dec  7 17:28:33 2018]^M
[Fri Dec  7 17:28:33 2018]fir-io2-s1 login: [ 4678.271368] LNet: HW NUMA nodes: 4, HW CPU cores: 32, npartitions: 4^M
[Fri Dec  7 18:44:21 2018][ 4678.279024] alg: No test for adler32 (adler32-zlib)^M
[Fri Dec  7 18:44:22 2018][ 4679.080205] Lustre: Lustre: Build Version: 2.12.0_RC1^M
[Fri Dec  7 18:44:22 2018][ 4679.196318] LNet: Using FastReg for registration^M
[Fri Dec  7 18:44:22 2018][ 4679.328400] LNet: Added LNI 10.0.10.103@o2ib7 [8/256/0/180]^M
[Fri Dec  7 18:44:23 2018][ 4680.195333] LNet: 19041:0:(o2iblnd_cb.c:3370:kiblnd_check_conns()) Timed out tx for 10.0.10.202@o2ib7: 4679 seconds^M
[Fri Dec  7 18:45:23 2018][ 4740.334804] LNetError: 19070:0:(lib-move.c:4429:lnet_attach_rsp_tracker()) ASSERTION( md-&amp;gt;md_rspt_ptr == ((void *)0) ) failed: ^M
[Fri Dec  7 18:45:23 2018][ 4740.346280] LNetError: 19070:0:(lib-move.c:4429:lnet_attach_rsp_tracker()) LBUG^M
[Fri Dec  7 18:45:23 2018][ 4740.353596] Pid: 19070, comm: monitor_thread 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018^M
[Fri Dec  7 18:45:23 2018][ 4740.363800] Call Trace:^M
[Fri Dec  7 18:45:23 2018][ 4740.366262]  [&amp;lt;ffffffffc094c7cc&amp;gt;] libcfs_call_trace+0x8c/0xc0 [libcfs]^M
[Fri Dec  7 18:45:23 2018][ 4740.372832]  [&amp;lt;ffffffffc094c87c&amp;gt;] lbug_with_loc+0x4c/0xa0 [libcfs]^M
[Fri Dec  7 18:45:23 2018][ 4740.379068]  [&amp;lt;ffffffffc09e8667&amp;gt;] lnet_attach_rsp_tracker.isra.29+0xd7/0xe0 [lnet]^M
[Fri Dec  7 18:45:23 2018][ 4740.386685]  [&amp;lt;ffffffffc09effb1&amp;gt;] LNetGet+0x5d1/0xa90 [lnet]^M
[Fri Dec  7 18:45:23 2018][ 4740.392395]  [&amp;lt;ffffffffc09f8399&amp;gt;] lnet_check_routers+0x399/0xbf0 [lnet]^M
[Fri Dec  7 18:45:23 2018][ 4740.399054]  [&amp;lt;ffffffffc09f18cf&amp;gt;] lnet_monitor_thread+0x4ff/0x560 [lnet]^M
[Fri Dec  7 18:45:23 2018][ 4740.405811]  [&amp;lt;ffffffffb76c1c31&amp;gt;] kthread+0xd1/0xe0^M
[Fri Dec  7 18:45:23 2018][ 4740.410725]  [&amp;lt;ffffffffb7d74c24&amp;gt;] ret_from_fork_nospec_begin+0xe/0x21^M
[Fri Dec  7 18:45:23 2018][ 4740.417215]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff^M
[Fri Dec  7 18:45:23 2018][ 4740.422237] Kernel panic - not syncing: LBUG^M
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Config is CentOS 7.6 (3.10.0-957.1.3.el7_lustre.x86_64), Lustre 2.12.0 RC1, in-kernel OFED stack, AMD EPYC server (Dell R6415), mlx5 EDR.&lt;/p&gt;

&lt;p&gt;Perfectly working config was CentOS 7.5 (3.10.0-862.14.4.el7_lustre.x86_64), Lustre 2.11.56_3_g4e42995, in-kernel OFED stack, AMD EPYC server (Dell R6415), mlx5 EDR.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
Stephane&lt;/p&gt;
</comment>
                            <comment id="238201" author="sthiell" created="Sat, 8 Dec 2018 03:45:53 +0000"  >&lt;p&gt;I tried again with 2.12.0_RC1 + Amir&apos;s patch at &lt;a href=&quot;https://review.whamcloud.com/#/c/33794/1&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/33794/1&lt;/a&gt;&#160;, the OSS oopsed:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[  496.611635] LNet: HW NUMA nodes: 4, HW CPU cores: 32, npartitions: 4
[  496.619204] alg: No test for adler32 (adler32-zlib)
[  497.421277] Lustre: Lustre: Build Version: 2.12.0_RC1
[  497.541113] LNet: Using FastReg for registration
[  497.674010] LNet: Added LNI 10.0.10.104@o2ib7 [8/256/0/180]
[  498.540700] LNet: 120701:0:(o2iblnd_cb.c:3370:kiblnd_check_conns()) Timed out tx for 10.0.10.202@o2ib7: 497 seconds
[  499.695014] md: md5 stopped.
[  499.728157] async_tx: api initialized (async)
[  499.734214] xor: automatically using best checksumming function:
[  499.749692]    avx       :  9660.000 MB/sec
[  499.781700] raid6: sse2x1   gen()  6117 MB/s
[  499.802691] raid6: sse2x2   gen() 11347 MB/s
[  499.823694] raid6: sse2x4   gen() 12980 MB/s
[  499.844693] raid6: avx2x1   gen() 14261 MB/s
[  499.865692] raid6: avx2x2   gen() 18882 MB/s
[  499.886692] raid6: avx2x4   gen() 18851 MB/s
[  499.890963] raid6: using algorithm avx2x2 gen() (18882 MB/s)
[  499.896626] raid6: using avx2x2 recovery algorithm
[  499.917975] md/raid:md5: device dm-55 operational as raid disk 0
[  499.923990] md/raid:md5: device dm-11 operational as raid disk 9
[  499.930008] md/raid:md5: device dm-10 operational as raid disk 8
[  499.936020] md/raid:md5: device dm-1 operational as raid disk 7
[  499.941966] md/raid:md5: device dm-83 operational as raid disk 6
[  499.947996] md/raid:md5: device sdau operational as raid disk 5
[  499.953926] md/raid:md5: device dm-72 operational as raid disk 4
[  499.959939] md/raid:md5: device dm-61 operational as raid disk 3
[  499.965954] md/raid:md5: device dm-60 operational as raid disk 2
[  499.971970] md/raid:md5: device dm-116 operational as raid disk 1
[  499.979129] md/raid:md5: raid level 6 active with 10 out of 10 devices, algorithm 2
[  500.021173] md5: detected capacity change from 0 to 64011422924800
[  500.044705] md: md3 stopped.
[  500.077660] md/raid:md3: device dm-87 operational as raid disk 0
[  500.083732] md/raid:md3: device dm-86 operational as raid disk 9
[  500.089794] md/raid:md3: device dm-7 operational as raid disk 8
[  500.095771] md/raid:md3: device dm-81 operational as raid disk 7
[  500.101791] md/raid:md3: device dm-80 operational as raid disk 6
[  500.107802] md/raid:md3: device dm-69 operational as raid disk 5
[  500.113821] md/raid:md3: device dm-68 operational as raid disk 4
[  500.119838] md/raid:md3: device dm-111 operational as raid disk 3
[  500.125948] md/raid:md3: device dm-110 operational as raid disk 2
[  500.132043] md/raid:md3: device dm-88 operational as raid disk 1
[  500.138901] md/raid:md3: raid level 6 active with 10 out of 10 devices, algorithm 2
[  500.166972] md3: detected capacity change from 0 to 64011422924800
[  500.181231] md: md11 stopped.
[  500.205433] md/raid:md11: device dm-91 operational as raid disk 0
[  500.211589] md/raid:md11: device dm-48 operational as raid disk 9
[  500.217708] md/raid:md11: device dm-47 operational as raid disk 8
[  500.223824] md/raid:md11: device dm-35 operational as raid disk 7
[  500.229925] md/raid:md11: device dm-34 operational as raid disk 6
[  500.236037] md/raid:md11: device dm-104 operational as raid disk 5
[  500.242310] md/raid:md11: device dm-25 operational as raid disk 4
[  500.248439] md/raid:md11: device dm-100 operational as raid disk 3
[  500.254664] md/raid:md11: device dm-99 operational as raid disk 2
[  500.260789] md/raid:md11: device dm-118 operational as raid disk 1
[  500.267842] md/raid:md11: raid level 6 active with 10 out of 10 devices, algorithm 2
[  500.297329] md11: detected capacity change from 0 to 64011422924800
[  500.306119] md: md1 stopped.
[  500.341217] md/raid:md1: device dm-58 operational as raid disk 0
[  500.347236] md/raid:md1: device dm-6 operational as raid disk 9
[  500.353163] md/raid:md1: device dm-5 operational as raid disk 8
[  500.359089] md/raid:md1: device dm-76 operational as raid disk 7
[  500.365110] md/raid:md1: device dm-75 operational as raid disk 6
[  500.371134] md/raid:md1: device dm-64 operational as raid disk 5
[  500.377156] md/raid:md1: device dm-63 operational as raid disk 4
[  500.383168] md/raid:md1: device dm-109 operational as raid disk 3
[  500.389267] md/raid:md1: device dm-56 operational as raid disk 2
[  500.395287] md/raid:md1: device dm-67 operational as raid disk 1
[  500.401961] md/raid:md1: raid level 6 active with 10 out of 10 devices, algorithm 2
[  500.430931] md1: detected capacity change from 0 to 64011422924800
[  500.444576] md: md7 stopped.
[  500.480725] md/raid:md7: device dm-98 operational as raid disk 0
[  500.486749] md/raid:md7: device dm-40 operational as raid disk 9
[  500.492776] md/raid:md7: device dm-39 operational as raid disk 8
[  500.498794] md/raid:md7: device dm-27 operational as raid disk 7
[  500.504806] md/raid:md7: device dm-26 operational as raid disk 6
[  500.510829] md/raid:md7: device dm-19 operational as raid disk 5
[  500.516846] md/raid:md7: device dm-18 operational as raid disk 4
[  500.522857] md/raid:md7: device dm-117 operational as raid disk 3
[  500.528957] md/raid:md7: device dm-94 operational as raid disk 2
[  500.534972] md/raid:md7: device dm-21 operational as raid disk 1
[  500.541667] md/raid:md7: raid level 6 active with 10 out of 10 devices, algorithm 2
[  500.593292] md7: detected capacity change from 0 to 64011422924800
[  500.602526] md: md9 stopped.
[  500.645764] md/raid:md9: device dm-49 operational as raid disk 0
[  500.651782] md/raid:md9: device dm-44 operational as raid disk 9
[  500.657811] md/raid:md9: device dm-43 operational as raid disk 8
[  500.663832] md/raid:md9: device dm-107 operational as raid disk 7
[  500.669945] md/raid:md9: device dm-31 operational as raid disk 6
[  500.675959] md/raid:md9: device dm-22 operational as raid disk 5
[  500.681974] md/raid:md9: device dm-103 operational as raid disk 4
[  500.688074] md/raid:md9: device dm-97 operational as raid disk 3
[  500.694090] md/raid:md9: device dm-96 operational as raid disk 2
[  500.700104] md/raid:md9: device dm-50 operational as raid disk 1
[  500.708843] md/raid:md9: raid level 6 active with 10 out of 10 devices, algorithm 2
[  500.748835] md9: detected capacity change from 0 to 64011422924800
[  500.752863] LDISKFS-fs (md5): file extents enabled, maximum tree depth=5
[  500.841913] LDISKFS-fs (md3): file extents enabled, maximum tree depth=5
[  501.055692] LDISKFS-fs (md5): mounted filesystem with ordered data mode. Opts: errors=remount-ro,no_mbcache,nodelalloc
[  501.143861] LDISKFS-fs (md3): mounted filesystem with ordered data mode. Opts: errors=remount-ro,no_mbcache,nodelalloc
[  501.167914] LDISKFS-fs (md1): file extents enabled, maximum tree depth=5
[  501.182682] LDISKFS-fs (md11): file extents enabled, maximum tree depth=5
[  501.508363] LDISKFS-fs (md11): mounted filesystem with ordered data mode. Opts: errors=remount-ro,no_mbcache,nodelalloc
[  501.510261] LDISKFS-fs (md1): mounted filesystem with ordered data mode. Opts: errors=remount-ro,no_mbcache,nodelalloc
[  501.521916] LDISKFS-fs (md9): file extents enabled, maximum tree depth=5
[  501.554606] BUG: unable to handle kernel paging request at 000000000001b780
[  501.561618] IP: [&amp;lt;ffffffff901121d0&amp;gt;] native_queued_spin_lock_slowpath+0x110/0x200
[  501.569135] PGD 1032e6c067 PUD 1032e6d067 PMD 0 
[  501.573830] Oops: 0002 [#1] SMP 
[  501.577114] Modules linked in: ost(OE) mgc(OE) osd_ldiskfs(OE) lquota(OE) raid456 async_raid6_recov async_memcpy async_pq raid6_pq libcrc32c async_xor xor async_tx ldiskfs(OE) lustre(OE) lmv(OE) mdc(OE) osc(OE) lov(OE) fid(OE) fld(OE) ko2iblnd(OE) ptlrpc(OE) obdclass(OE) lnet(OE) libcfs(OE) rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfs lockd grace fscache mpt2sas mptctl mptbase dell_rbu vfat fat rpcrdma sunrpc dm_service_time ib_iser libiscsi scsi_transport_iscsi dcdbas amd64_edac_mod edac_mce_amd kvm_amd kvm irqbypass crc32_pclmul ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper ablk_helper cryptd pcspkr k10temp i2c_piix4 ccp ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm dm_multipath ses enclosure dm_mod ipmi_si mlx5_ib ipmi_devintf sg ipmi_msghandler ib_core acpi_power_meter ip_tables ext4 mbcache jbd2 sd_mod crc_t10dif crct10dif_generic i2c_algo_bit drm_kms_helper syscopyarea mlx5_core sysfillrect sysimgblt fb_sys_fops ttm ahci mlxfw devlink libahci crct10dif_pclmul tg3 crct10dif_common ptp drm libata megaraid_sas crc32c_intel drm_panel_orientation_quirks pps_core mpt3sas(OE) raid_class scsi_transport_sas
[  501.680694] CPU: 30 PID: 121538 Comm: mount.lustre Kdump: loaded Tainted: G           OE  ------------   3.10.0-957.1.3.el7_lustre.x86_64 #1
[  501.693284] Hardware name: Dell Inc. PowerEdge R6415/065PKD, BIOS 1.3.6 04/20/2018
[  501.700853] task: ffff8b02a92fe180 ti: ffff8b02b2e64000 task.ti: ffff8b02b2e64000
[  501.708330] RIP: 0010:[&amp;lt;ffffffff901121d0&amp;gt;]  [&amp;lt;ffffffff901121d0&amp;gt;] native_queued_spin_lock_slowpath+0x110/0x200
[  501.718272] RSP: 0018:ffff8b02b2e674b8  EFLAGS: 00010002
[  501.723583] RAX: 00000000000016e6 RBX: 0000000000000287 RCX: 0000000000f10000
[  501.730717] RDX: 000000000001b780 RSI: 00000000b7318240 RDI: ffff8b22b7318fc0
[  501.737850] RBP: ffff8b02b2e674b8 R08: ffff8b22bf7db780 R09: 0000000000000000
[  501.744982] R10: 0000000000000002 R11: 00000000000002a0 R12: 0000000000000000
[  501.752116] R13: ffff8b2256e55800 R14: ffff8b22b7318fc0 R15: 0000000000000002
[  501.759249] FS:  00007fc3f2cd5840(0000) GS:ffff8b22bf7c0000(0000) knlGS:0000000000000000
[  501.767335] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  501.773079] CR2: 000000000001b780 CR3: 0000001034c28000 CR4: 00000000003407e0
[  501.780213] Call Trace:
[  501.782671]  [&amp;lt;ffffffff9075bfcb&amp;gt;] queued_spin_lock_slowpath+0xb/0xf
[  501.788943]  [&amp;lt;ffffffff9076a507&amp;gt;] _raw_spin_lock_irqsave+0x37/0x40
[  501.795132]  [&amp;lt;ffffffffc06c1bab&amp;gt;] ib_fmr_pool_map_phys+0x4b/0x300 [ib_core]
[  501.802095]  [&amp;lt;ffffffffc0bfd6cc&amp;gt;] kiblnd_fmr_pool_map+0x15c/0x690 [ko2iblnd]
[  501.809148]  [&amp;lt;ffffffffc0c01965&amp;gt;] kiblnd_map_tx.isra.28+0x155/0x470 [ko2iblnd]
[  501.816373]  [&amp;lt;ffffffffc0c03216&amp;gt;] kiblnd_setup_rd_iov+0x2f6/0x400 [ko2iblnd]
[  501.820899] LDISKFS-fs (md7): file extents enabled, maximum tree depth=5
[  501.825866] LDISKFS-fs (md9): mounted filesystem with ordered data mode. Opts: errors=remount-ro,no_mbcache,nodelalloc
[  501.840808]  [&amp;lt;ffffffffc0c06a6a&amp;gt;] kiblnd_send+0x5da/0xa20 [ko2iblnd]
[  501.847176]  [&amp;lt;ffffffffc097a0e4&amp;gt;] lnet_ni_send+0x44/0xd0 [lnet]
[  501.853102]  [&amp;lt;ffffffffc0981d92&amp;gt;] lnet_send+0x82/0x1c0 [lnet]
[  501.858855]  [&amp;lt;ffffffffc098219c&amp;gt;] LNetPut+0x2cc/0xb50 [lnet]
[  501.864561]  [&amp;lt;ffffffffc0c935d6&amp;gt;] ptl_send_buf+0x146/0x530 [ptlrpc]
[  501.870855]  [&amp;lt;ffffffffc0c95f1d&amp;gt;] ptl_send_rpc+0x69d/0xe70 [ptlrpc]
[  501.877146]  [&amp;lt;ffffffffc0c8ab60&amp;gt;] ptlrpc_send_new_req+0x450/0xa60 [ptlrpc]
[  501.884045]  [&amp;lt;ffffffffc0c8f788&amp;gt;] ptlrpc_set_wait+0x3f8/0x8d0 [ptlrpc]
[  501.890600]  [&amp;lt;ffffffffc0a765e5&amp;gt;] ? lustre_get_jobid+0x185/0x2e0 [obdclass]
[  501.897592]  [&amp;lt;ffffffffc0c9a2e0&amp;gt;] ? lustre_msg_buf_v2+0x1b0/0x1b0 [ptlrpc]
[  501.904491]  [&amp;lt;ffffffffc0c9bbaa&amp;gt;] ? lustre_msg_set_jobid+0x9a/0x110 [ptlrpc]
[  501.911560]  [&amp;lt;ffffffffc0c8fce3&amp;gt;] ptlrpc_queue_wait+0x83/0x230 [ptlrpc]
[  501.918173]  [&amp;lt;ffffffffc0f24b44&amp;gt;] mgc_target_register+0x134/0x4c0 [mgc]
[  501.924785]  [&amp;lt;ffffffffc0f2862b&amp;gt;] mgc_set_info_async+0x37b/0x1610 [mgc]
[  501.931407]  [&amp;lt;ffffffffc08fef07&amp;gt;] ? libcfs_debug_msg+0x57/0x80 [libcfs]
[  501.938036]  [&amp;lt;ffffffffc0a85b7b&amp;gt;] server_start_targets+0x116b/0x2a20 [obdclass]
[  501.945362]  [&amp;lt;ffffffffc0a5ab10&amp;gt;] ? lustre_start_mgc+0x260/0x2510 [obdclass]
[  501.952421]  [&amp;lt;ffffffffc0a884fc&amp;gt;] server_fill_super+0x10cc/0x1890 [obdclass]
[  501.959487]  [&amp;lt;ffffffffc0a5d828&amp;gt;] lustre_fill_super+0x328/0x950 [obdclass]
[  501.966375]  [&amp;lt;ffffffffc0a5d500&amp;gt;] ? lustre_common_put_super+0x270/0x270 [obdclass]
[  501.973945]  [&amp;lt;ffffffff902452cf&amp;gt;] mount_nodev+0x4f/0xb0
[  501.979185]  [&amp;lt;ffffffffc0a55908&amp;gt;] lustre_mount+0x38/0x60 [obdclass]
[  501.985452]  [&amp;lt;ffffffff90245e4e&amp;gt;] mount_fs+0x3e/0x1b0
[  501.990507]  [&amp;lt;ffffffff902639e7&amp;gt;] vfs_kern_mount+0x67/0x110
[  501.996076]  [&amp;lt;ffffffff9026600f&amp;gt;] do_mount+0x1ef/0xce0
[  502.001218]  [&amp;lt;ffffffff9023e2aa&amp;gt;] ? __check_object_size+0x1ca/0x250
[  502.007485]  [&amp;lt;ffffffff9021c74c&amp;gt;] ? kmem_cache_alloc_trace+0x3c/0x200
[  502.013922]  [&amp;lt;ffffffff90266e43&amp;gt;] SyS_mount+0x83/0xd0
[  502.018977]  [&amp;lt;ffffffff90774ddb&amp;gt;] system_call_fastpath+0x22/0x27
[  502.024980] Code: 87 47 02 c1 e0 10 45 31 c9 85 c0 74 44 48 89 c2 c1 e8 13 48 c1 ea 0d 48 98 83 e2 30 48 81 c2 80 b7 01 00 48 03 14 c5 60 b9 d4 90 &amp;lt;4c&amp;gt; 89 02 41 8b 40 08 85 c0 75 0f 0f 1f 44 00 00 f3 90 41 8b 40 
[  502.045501] RIP  [&amp;lt;ffffffff901121d0&amp;gt;] native_queued_spin_lock_slowpath+0x110/0x200
[  502.053096]  RSP &amp;lt;ffff8b02b2e674b8&amp;gt;
[  502.056589] CR2: 000000000001b780
 &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="238202" author="sthiell" created="Sat, 8 Dec 2018 04:35:11 +0000"  >&lt;p&gt;Slightly different symptom on MGS/MDS 2.12.0_RC1 + Amir&apos;s patch, but definitively unusable:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;fir-md1-s1 login: [  497.904084] LNet: HW NUMA nodes: 4, HW CPU cores: 48, npartitions: 4
[  497.913010] alg: No test for adler32 (adler32-zlib)
[  498.713626] Lustre: Lustre: Build Version: 2.12.0_RC1
[  498.836281] LNet: Using FastReg for registration
[  498.968174] LNet: Added LNI 10.0.10.51@o2ib7 [8/256/0/180]
[  499.835260] LNet: 20661:0:(o2iblnd_cb.c:3370:kiblnd_check_conns()) Timed out tx for 10.0.10.202@o2ib7: 498 seconds
[  500.088147] LDISKFS-fs warning (device dm-1): ldiskfs_multi_mount_protect:321: MMP interval 42 higher than expected, please wait.
[  500.088147] 
[  542.110957] LDISKFS-fs (dm-1): recovery complete
[  542.115778] LDISKFS-fs (dm-1): mounted filesystem with ordered data mode. Opts: user_xattr,errors=remount-ro,no_mbcache,nodelalloc
[  542.437699] Lustre: MGS: Connection restored to ebe34e0e-4031-a65f-5f91-0d1e392e8147 (at 0@lo)
[  543.013105] LDISKFS-fs warning (device dm-2): ldiskfs_multi_mount_protect:321: MMP interval 42 higher than expected, please wait.
[  543.013105] 
[  543.014468] LDISKFS-fs warning (device dm-4): ldiskfs_multi_mount_protect:321: MMP interval 42 higher than expected, please wait.
[  543.014468] 
[  558.768986] Lustre: MGS: Connection restored to 73368e2a-0b82-5939-f05f-1ad2a79768da (at 10.9.101.59@o2ib4)
[  561.842373] Lustre: MGS: Connection restored to d9b25a46-8406-5014-40e9-a5748f9ad5c5 (at 10.0.10.105@o2ib7)
[  561.852127] Lustre: Skipped 1 previous similar message
[  567.115808] Lustre: MGS: Connection restored to a8053d09-bfcd-3aa9-c515-8645bbb097c5 (at 10.8.0.68@o2ib6)
[  571.051218] Lustre: MGS: Received new LWP connection from 10.9.101.59@o2ib4, removing former export from same NID
[  572.706429] Lustre: MGS: Connection restored to 38b4f0c2-db8c-d604-7e00-2c04b790499d (at 10.9.0.63@o2ib4)
[  572.716088] Lustre: Skipped 6 previous similar messages
[  573.921768] Lustre: MGS: Received new LWP connection from 10.0.10.105@o2ib7, removing former export from same NID
[  575.260500] Lustre: MGS: Received new LWP connection from 10.9.0.61@o2ib4, removing former export from same NID
[  578.088401] Lustre: MGS: Received new LWP connection from 10.9.101.59@o2ib4, removing former export from same NID
[  580.973963] Lustre: MGS: Connection restored to d9b25a46-8406-5014-40e9-a5748f9ad5c5 (at 10.0.10.105@o2ib7)
[  580.985864] Lustre: Skipped 6 previous similar messages
[  582.286596] Lustre: MGS: Received new LWP connection from 10.9.0.61@o2ib4, removing former export from same NID
[  582.296688] Lustre: Skipped 2 previous similar messages
[  583.358103] LustreError: 20965:0:(ldlm_lib.c:3248:target_bulk_io()) @@@ timeout on bulk READ after 6+0s  req@ffff8f086f7d0450 x1619041162737216/t0(0) o256-&amp;gt;d1fa3f21-b7f3-5a11-e34d-8236124081cd@10.9.0.1@o2ib4:489/0 lens 304/240 e 0 to 0 dl 1544243514 ref 1 fl Interpret:/0/0 rc 0/0
[  585.015471] LDISKFS-fs (dm-4): file extents enabled, maximum tree depth=5
[  585.067346] LDISKFS-fs (dm-2): file extents enabled, maximum tree depth=5
[  585.315122] LDISKFS-fs (dm-4): recovery complete
[  585.319941] LDISKFS-fs (dm-4): mounted filesystem with ordered data mode. Opts: user_xattr,errors=remount-ro,acl,no_mbcache,nodelalloc
[  585.354354] LDISKFS-fs (dm-2): recovery complete
[  585.358987] LDISKFS-fs (dm-2): mounted filesystem with ordered data mode. Opts: user_xattr,errors=remount-ro,acl,no_mbcache,nodelalloc
[  590.570227] NMI watchdog: Watchdog detected hard LOCKUP on cpu 22
[  590.576145] Modules linked in: mgs(OE) mgc(OE) osd_ldiskfs(OE) lquota(OE) ldiskfs(OE) lustre(OE) lmv(OE) mdc(OE) osc(OE) lov(OE) fid(OE) fld(OE) ko2iblnd(OE) ptlrpc(OE) obdclass(OE) lnet(OE) libcfs(OE) rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfs lockd grace fscache ib_ucm rpcrdma rdma_ucm ib_uverbs ib_iser ib_umad rdma_cm iw_cm libiscsi ib_ipoib scsi_transport_iscsi ib_cm mlx5_ib ib_core mpt2sas mptctl mptbase dell_rbu sunrpc vfat fat dm_round_robin amd64_edac_mod edac_mce_amd kvm_amd kvm irqbypass crc32_pclmul ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper ablk_helper cryptd dm_multipath dcdbas ses pcspkr dm_mod ipmi_si enclosure ipmi_devintf i2c_piix4 sg ccp k10temp ipmi_msghandler acpi_power_meter ip_tables ext4 mbcache jbd2 sd_mod crc_t10dif crct10dif_generic i2c_algo_bit drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm ahci mlx5_core crct10dif_pclmul crct10dif_common libahci mlxfw devlink drm tg3 crc32c_intel libata ptp megaraid_sas drm_panel_orientation_quirks pps_core mpt3sas(OE) raid_class scsi_transport_sas
[  590.671805] CPU: 22 PID: 20866 Comm: ll_mgs_0002 Kdump: loaded Tainted: G           OE  ------------   3.10.0-957.1.3.el7_lustre.x86_64 #1
[  590.684220] Hardware name: Dell Inc. PowerEdge R6415/065PKD, BIOS 1.3.6 04/20/2018
[  590.691786] Call Trace:
[  590.694241]  &amp;lt;NMI&amp;gt;  [&amp;lt;ffffffff87361e41&amp;gt;] dump_stack+0x19/0x1b
[  590.700025]  [&amp;lt;ffffffff86d494c5&amp;gt;] watchdog_overflow_callback+0x135/0x140
[  590.706726]  [&amp;lt;ffffffff86da1297&amp;gt;] __perf_event_overflow+0x57/0x100
[  590.712910]  [&amp;lt;ffffffff86daa904&amp;gt;] perf_event_overflow+0x14/0x20
[  590.718831]  [&amp;lt;ffffffff86c055f0&amp;gt;] x86_pmu_handle_irq+0x140/0x1a0
[  590.724838]  [&amp;lt;ffffffff86f77c08&amp;gt;] ? ioremap_page_range+0x2e8/0x480
[  590.731018]  [&amp;lt;ffffffff86dfb854&amp;gt;] ? vunmap_page_range+0x234/0x470
[  590.737106]  [&amp;lt;ffffffff86dfbaa1&amp;gt;] ? unmap_kernel_range_noflush+0x11/0x20
[  590.743811]  [&amp;lt;ffffffff8703d49e&amp;gt;] ? ghes_copy_tofrom_phys+0x10e/0x210
[  590.750254]  [&amp;lt;ffffffff8703d640&amp;gt;] ? ghes_read_estatus+0xa0/0x190
[  590.756263]  [&amp;lt;ffffffff8736b031&amp;gt;] perf_event_nmi_handler+0x31/0x50
[  590.762440]  [&amp;lt;ffffffff8736c8fc&amp;gt;] nmi_handle.isra.0+0x8c/0x150
[  590.768274]  [&amp;lt;ffffffff8736cb1d&amp;gt;] do_nmi+0x15d/0x460
[  590.773239]  [&amp;lt;ffffffff8736bd69&amp;gt;] end_repeat_nmi+0x1e/0x81
[  590.778731]  [&amp;lt;ffffffff86d121e2&amp;gt;] ? native_queued_spin_lock_slowpath+0x122/0x200
[  590.786127]  [&amp;lt;ffffffff86d121e2&amp;gt;] ? native_queued_spin_lock_slowpath+0x122/0x200
[  590.793521]  [&amp;lt;ffffffff86d121e2&amp;gt;] ? native_queued_spin_lock_slowpath+0x122/0x200
[  590.800910]  &amp;lt;EOE&amp;gt;  [&amp;lt;ffffffff8735bfcb&amp;gt;] queued_spin_lock_slowpath+0xb/0xf
[  590.807812]  [&amp;lt;ffffffff8736a507&amp;gt;] _raw_spin_lock_irqsave+0x37/0x40
[  590.814001]  [&amp;lt;ffffffffc090abab&amp;gt;] ib_fmr_pool_map_phys+0x4b/0x300 [ib_core]
[  590.820965]  [&amp;lt;ffffffffc0ec66cc&amp;gt;] kiblnd_fmr_pool_map+0x15c/0x690 [ko2iblnd]
[  590.828017]  [&amp;lt;ffffffffc0eca965&amp;gt;] kiblnd_map_tx.isra.28+0x155/0x470 [ko2iblnd]
[  590.835235]  [&amp;lt;ffffffffc0ecc216&amp;gt;] kiblnd_setup_rd_iov+0x2f6/0x400 [ko2iblnd]
[  590.842282]  [&amp;lt;ffffffffc0ecfa6a&amp;gt;] kiblnd_send+0x5da/0xa20 [ko2iblnd]
[  590.848645]  [&amp;lt;ffffffffc0c430e4&amp;gt;] lnet_ni_send+0x44/0xd0 [lnet]
[  590.854569]  [&amp;lt;ffffffffc0c4ad92&amp;gt;] lnet_send+0x82/0x1c0 [lnet]
[  590.860323]  [&amp;lt;ffffffffc0c4b19c&amp;gt;] LNetPut+0x2cc/0xb50 [lnet]
[  590.866029]  [&amp;lt;ffffffffc0f5c5d6&amp;gt;] ptl_send_buf+0x146/0x530 [ptlrpc]
[  590.872302]  [&amp;lt;ffffffffc0bc1bde&amp;gt;] ? ktime_get_real_seconds+0xe/0x10 [libcfs]
[  590.879378]  [&amp;lt;ffffffffc0f5f98b&amp;gt;] ptlrpc_send_reply+0x29b/0x840 [ptlrpc]
[  590.886109]  [&amp;lt;ffffffffc0f1e94e&amp;gt;] target_send_reply_msg+0x8e/0x170 [ptlrpc]
[  590.893092]  [&amp;lt;ffffffffc0f28e0e&amp;gt;] target_send_reply+0x30e/0x730 [ptlrpc]
[  590.899823]  [&amp;lt;ffffffffc0f66877&amp;gt;] ? lustre_msg_set_last_committed+0x27/0xa0 [ptlrpc]
[  590.907607]  [&amp;lt;ffffffffc0fccf07&amp;gt;] tgt_request_handle+0x697/0x1580 [ptlrpc]
[  590.914514]  [&amp;lt;ffffffffc0fa6a51&amp;gt;] ? ptlrpc_nrs_req_get_nolock0+0xd1/0x170 [ptlrpc]
[  590.922088]  [&amp;lt;ffffffffc0bc1bde&amp;gt;] ? ktime_get_real_seconds+0xe/0x10 [libcfs]
[  590.929161]  [&amp;lt;ffffffffc0f7192b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
[  590.936846]  [&amp;lt;ffffffffc0f6e7b5&amp;gt;] ? ptlrpc_wait_event+0xa5/0x360 [ptlrpc]
[  590.943633]  [&amp;lt;ffffffff86cd67c2&amp;gt;] ? default_wake_function+0x12/0x20
[  590.949898]  [&amp;lt;ffffffff86ccba9b&amp;gt;] ? __wake_up_common+0x5b/0x90
[  590.955757]  [&amp;lt;ffffffffc0f7525c&amp;gt;] ptlrpc_main+0xafc/0x1fc0 [ptlrpc]
[  590.962050]  [&amp;lt;ffffffffc0f74760&amp;gt;] ? ptlrpc_register_service+0xf80/0xf80 [ptlrpc]
[  590.969441]  [&amp;lt;ffffffff86cc1c31&amp;gt;] kthread+0xd1/0xe0
[  590.974320]  [&amp;lt;ffffffff86cc1b60&amp;gt;] ? insert_kthread_work+0x40/0x40
[  590.980413]  [&amp;lt;ffffffff87374c24&amp;gt;] ret_from_fork_nospec_begin+0xe/0x21
[  590.986850]  [&amp;lt;ffffffff86cc1b60&amp;gt;] ? insert_kthread_work+0x40/0x40
[  590.992945] Kernel panic - not syncing: Hard LOCKUP
[  590.997823] CPU: 22 PID: 20866 Comm: ll_mgs_0002 Kdump: loaded Tainted: G           OE  ------------   3.10.0-957.1.3.el7_lustre.x86_64 #1
[  591.010241] Hardware name: Dell Inc. PowerEdge R6415/065PKD, BIOS 1.3.6 04/20/2018
[  591.017807] Call Trace:
[  591.020262]  &amp;lt;NMI&amp;gt;  [&amp;lt;ffffffff87361e41&amp;gt;] dump_stack+0x19/0x1b
[  591.026035]  [&amp;lt;ffffffff8735b550&amp;gt;] panic+0xe8/0x21f
[  591.030826]  [&amp;lt;ffffffff87374c24&amp;gt;] ? ret_from_fork_nospec_begin+0xe/0x21
[  591.037442]  [&amp;lt;ffffffff86c9739f&amp;gt;] nmi_panic+0x3f/0x40
[  591.042492]  [&amp;lt;ffffffff86d494b1&amp;gt;] watchdog_overflow_callback+0x121/0x140
[  591.049193]  [&amp;lt;ffffffff86da1297&amp;gt;] __perf_event_overflow+0x57/0x100
[  591.055370]  [&amp;lt;ffffffff86daa904&amp;gt;] perf_event_overflow+0x14/0x20
[  591.061292]  [&amp;lt;ffffffff86c055f0&amp;gt;] x86_pmu_handle_irq+0x140/0x1a0
[  591.067296]  [&amp;lt;ffffffff86f77c08&amp;gt;] ? ioremap_page_range+0x2e8/0x480
[  591.073477]  [&amp;lt;ffffffff86dfb854&amp;gt;] ? vunmap_page_range+0x234/0x470
[  591.079570]  [&amp;lt;ffffffff86dfbaa1&amp;gt;] ? unmap_kernel_range_noflush+0x11/0x20
[  591.086268]  [&amp;lt;ffffffff8703d49e&amp;gt;] ? ghes_copy_tofrom_phys+0x10e/0x210
[  591.092709]  [&amp;lt;ffffffff8703d640&amp;gt;] ? ghes_read_estatus+0xa0/0x190
[  591.098714]  [&amp;lt;ffffffff8736b031&amp;gt;] perf_event_nmi_handler+0x31/0x50
[  591.104893]  [&amp;lt;ffffffff8736c8fc&amp;gt;] nmi_handle.isra.0+0x8c/0x150
[  591.110726]  [&amp;lt;ffffffff8736cb1d&amp;gt;] do_nmi+0x15d/0x460
[  591.115694]  [&amp;lt;ffffffff8736bd69&amp;gt;] end_repeat_nmi+0x1e/0x81
[  591.121181]  [&amp;lt;ffffffff86d121e2&amp;gt;] ? native_queued_spin_lock_slowpath+0x122/0x200
[  591.128572]  [&amp;lt;ffffffff86d121e2&amp;gt;] ? native_queued_spin_lock_slowpath+0x122/0x200
[  591.135964]  [&amp;lt;ffffffff86d121e2&amp;gt;] ? native_queued_spin_lock_slowpath+0x122/0x200
[  591.143355]  &amp;lt;EOE&amp;gt;  [&amp;lt;ffffffff8735bfcb&amp;gt;] queued_spin_lock_slowpath+0xb/0xf
[  591.150258]  [&amp;lt;ffffffff8736a507&amp;gt;] _raw_spin_lock_irqsave+0x37/0x40
[  591.156442]  [&amp;lt;ffffffffc090abab&amp;gt;] ib_fmr_pool_map_phys+0x4b/0x300 [ib_core]
[  591.163407]  [&amp;lt;ffffffffc0ec66cc&amp;gt;] kiblnd_fmr_pool_map+0x15c/0x690 [ko2iblnd]
[  591.170452]  [&amp;lt;ffffffffc0eca965&amp;gt;] kiblnd_map_tx.isra.28+0x155/0x470 [ko2iblnd]
[  591.177672]  [&amp;lt;ffffffffc0ecc216&amp;gt;] kiblnd_setup_rd_iov+0x2f6/0x400 [ko2iblnd]
[  591.184719]  [&amp;lt;ffffffffc0ecfa6a&amp;gt;] kiblnd_send+0x5da/0xa20 [ko2iblnd]
[  591.191077]  [&amp;lt;ffffffffc0c430e4&amp;gt;] lnet_ni_send+0x44/0xd0 [lnet]
[  591.197004]  [&amp;lt;ffffffffc0c4ad92&amp;gt;] lnet_send+0x82/0x1c0 [lnet]
[  591.202759]  [&amp;lt;ffffffffc0c4b19c&amp;gt;] LNetPut+0x2cc/0xb50 [lnet]
[  591.208448]  [&amp;lt;ffffffffc0f5c5d6&amp;gt;] ptl_send_buf+0x146/0x530 [ptlrpc]
[  591.214715]  [&amp;lt;ffffffffc0bc1bde&amp;gt;] ? ktime_get_real_seconds+0xe/0x10 [libcfs]
[  591.221786]  [&amp;lt;ffffffffc0f5f98b&amp;gt;] ptlrpc_send_reply+0x29b/0x840 [ptlrpc]
[  591.228509]  [&amp;lt;ffffffffc0f1e94e&amp;gt;] target_send_reply_msg+0x8e/0x170 [ptlrpc]
[  591.235494]  [&amp;lt;ffffffffc0f28e0e&amp;gt;] target_send_reply+0x30e/0x730 [ptlrpc]
[  591.242224]  [&amp;lt;ffffffffc0f66877&amp;gt;] ? lustre_msg_set_last_committed+0x27/0xa0 [ptlrpc]
[  591.249992]  [&amp;lt;ffffffffc0fccf07&amp;gt;] tgt_request_handle+0x697/0x1580 [ptlrpc]
[  591.256897]  [&amp;lt;ffffffffc0fa6a51&amp;gt;] ? ptlrpc_nrs_req_get_nolock0+0xd1/0x170 [ptlrpc]
[  591.264463]  [&amp;lt;ffffffffc0bc1bde&amp;gt;] ? ktime_get_real_seconds+0xe/0x10 [libcfs]
[  591.271534]  [&amp;lt;ffffffffc0f7192b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
[  591.279214]  [&amp;lt;ffffffffc0f6e7b5&amp;gt;] ? ptlrpc_wait_event+0xa5/0x360 [ptlrpc]
[  591.285996]  [&amp;lt;ffffffff86cd67c2&amp;gt;] ? default_wake_function+0x12/0x20
[  591.292263]  [&amp;lt;ffffffff86ccba9b&amp;gt;] ? __wake_up_common+0x5b/0x90
[  591.298124]  [&amp;lt;ffffffffc0f7525c&amp;gt;] ptlrpc_main+0xafc/0x1fc0 [ptlrpc]
[  591.304415]  [&amp;lt;ffffffffc0f74760&amp;gt;] ? ptlrpc_register_service+0xf80/0xf80 [ptlrpc]
[  591.311807]  [&amp;lt;ffffffff86cc1c31&amp;gt;] kthread+0xd1/0xe0
[  591.316685]  [&amp;lt;ffffffff86cc1b60&amp;gt;] ? insert_kthread_work+0x40/0x40
[  591.322780]  [&amp;lt;ffffffff87374c24&amp;gt;] ret_from_fork_nospec_begin+0xe/0x21
[  591.329219]  [&amp;lt;ffffffff86cc1b60&amp;gt;] ? insert_kthread_work+0x40/0x40
[    0.000000] Initializing cgroup subsys cpuset
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="238203" author="sthiell" created="Sat, 8 Dec 2018 04:42:27 +0000"  >&lt;p&gt;Perhaps I missed something, do I need to apply&#160;the patch from&#160;&lt;a href=&quot;https://review.whamcloud.com/#/c/31313/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/31313/&lt;/a&gt;&#160;too? Please clarify which patches need to be applied on top of 2.12.0_RC1 if you want me to try again. Thanks!&lt;/p&gt;</comment>
                            <comment id="238204" author="pjones" created="Sat, 8 Dec 2018 05:26:50 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=sthiell&quot; class=&quot;user-hover&quot; rel=&quot;sthiell&quot;&gt;sthiell&lt;/a&gt; just wait for RC2 - which should be available shortly...&lt;/p&gt;</comment>
                            <comment id="238206" author="gerrit" created="Sat, 8 Dec 2018 05:33:56 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/33794/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/33794/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11734&quot; title=&quot;LNet crashes with 2.12.0-rc1: lnet_attach_rsp_tracker() ASSERTION(md-&amp;gt;md_rspt_ptr == ((void *)0)) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11734&quot;&gt;&lt;del&gt;LU-11734&lt;/del&gt;&lt;/a&gt; lnet: handle multi-md usage&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 8c249097e62713baf51aec808489a86acf46748d&lt;/p&gt;</comment>
                            <comment id="238210" author="sthiell" created="Sat, 8 Dec 2018 05:51:24 +0000"  >&lt;p&gt;Thanks Peter!&lt;/p&gt;</comment>
                            <comment id="238215" author="pjones" created="Sat, 8 Dec 2018 13:17:42 +0000"  >&lt;p&gt;Landed for 2.12. &lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=sthiell&quot; class=&quot;user-hover&quot; rel=&quot;sthiell&quot;&gt;sthiell&lt;/a&gt; please open a new ticket if there are any problems with RC2&lt;/p&gt;</comment>
                            <comment id="238231" author="sthiell" created="Sun, 9 Dec 2018 01:51:44 +0000"  >&lt;p&gt;Sure, will do &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;

&lt;p&gt;I confirm this specific problem is fixed in RC2.&lt;/p&gt;</comment>
                            <comment id="238232" author="pjones" created="Sun, 9 Dec 2018 01:53:58 +0000"  >&lt;p&gt;Phew! &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="50828">LU-10669</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i007fz:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>