<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:37:26 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-3848] Compute node crashes due to error in Lustre : ASSERTION( me == md-&gt;md_me ) failed</title>
                <link>https://jira.whamcloud.com/browse/LU-3848</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Hi,&lt;/p&gt;

&lt;p&gt;At a customer site, a compute node running Lustre crashes with the following error:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2013-08-23 17:08:55 LustreError: 3049:0:(lib-move.c:185:lnet_match_md()) ASSERTION( me == md-&amp;gt;md_me ) failed:
2013-08-23 17:08:55 LustreError: 3049:0:(lib-move.c:185:lnet_match_md()) LBUG
2013-08-23 17:08:55 Pid: 3049, comm: kiblnd_sd_11
2013-08-23 17:08:55
2013-08-23 17:08:55 Call Trace:
2013-08-23 17:08:55 [&amp;lt;ffffffffa04c47f5&amp;gt;] libcfs_debug_dumpstack+0x55/0x80 [libcfs]
2013-08-23 17:08:55 [&amp;lt;ffffffffa04c4e07&amp;gt;] lbug_with_loc+0x47/0xb0 [libcfs]
2013-08-23 17:08:55 [&amp;lt;ffffffffa053c2bd&amp;gt;] lnet_match_md+0x35d/0x3a0 [lnet]
2013-08-23 17:08:55 [&amp;lt;ffffffffa053a8dd&amp;gt;] ? lnet_ni_recv+0xad/0x2f0 [lnet]
2013-08-23 17:08:55 [&amp;lt;ffffffffa0541482&amp;gt;] lnet_parse+0xac2/0x1b80 [lnet]
2013-08-23 17:08:55 [&amp;lt;ffffffff81042f33&amp;gt;] ? enqueue_task+0x43/0x90
2013-08-23 17:08:55 [&amp;lt;ffffffffa08d177b&amp;gt;] kiblnd_handle_rx+0x2cb/0x680 [ko2iblnd]
2013-08-23 17:08:55 [&amp;lt;ffffffff8103d349&amp;gt;] ? __wake_up_common+0x59/0x90
2013-08-23 17:08:55 [&amp;lt;ffffffffa08d2560&amp;gt;] kiblnd_rx_complete+0x2d0/0x440 [ko2iblnd]
2013-08-23 17:08:55 [&amp;lt;ffffffff81042cf3&amp;gt;] ? __wake_up+0x53/0x70
2013-08-23 17:08:55 [&amp;lt;ffffffffa08d2732&amp;gt;] kiblnd_complete+0x62/0xe0 [ko2iblnd]
2013-08-23 17:08:55 [&amp;lt;ffffffffa08d2ae9&amp;gt;] kiblnd_scheduler+0x339/0x7a0 [ko2iblnd]
2013-08-23 17:08:55 [&amp;lt;ffffffff81048df0&amp;gt;] ? default_wake_function+0x0/0x20
2013-08-23 17:08:55 [&amp;lt;ffffffffa08d27b0&amp;gt;] ? kiblnd_scheduler+0x0/0x7a0 [ko2iblnd]
2013-08-23 17:08:55 [&amp;lt;ffffffff8100412a&amp;gt;] child_rip+0xa/0x20
2013-08-23 17:08:55 [&amp;lt;ffffffffa08d27b0&amp;gt;] ? kiblnd_scheduler+0x0/0x7a0 [ko2iblnd]
2013-08-23 17:08:55 [&amp;lt;ffffffffa08d27b0&amp;gt;] ? kiblnd_scheduler+0x0/0x7a0 [ko2iblnd]
2013-08-23 17:08:55 [&amp;lt;ffffffff81004120&amp;gt;] ? child_rip+0x0/0x20
2013-08-23 17:08:55
2013-08-23 17:08:55 Kernel panic - not syncing: LBUG
2013-08-23 17:08:55 Pid: 3049, comm: kiblnd_sd_11 Tainted: G --------------- H 2.6.32-279.5.2.bl6.Bull.36.x86_64 #1
2013-08-23 17:08:55 Call Trace:
2013-08-23 17:08:55 [&amp;lt;ffffffff81495fe3&amp;gt;] ? panic+0xa0/0x168
2013-08-23 17:08:55 [&amp;lt;ffffffffa04c4e5b&amp;gt;] ? lbug_with_loc+0x9b/0xb0 [libcfs]
2013-08-23 17:08:55 [&amp;lt;ffffffffa053c2bd&amp;gt;] ? lnet_match_md+0x35d/0x3a0 [lnet]
2013-08-23 17:08:55 [&amp;lt;ffffffffa053a8dd&amp;gt;] ? lnet_ni_recv+0xad/0x2f0 [lnet]
2013-08-23 17:08:55 [&amp;lt;ffffffffa0541482&amp;gt;] ? lnet_parse+0xac2/0x1b80 [lnet]
2013-08-23 17:08:55 [&amp;lt;ffffffff81042f33&amp;gt;] ? enqueue_task+0x43/0x90
2013-08-23 17:08:55 [&amp;lt;ffffffffa08d177b&amp;gt;] ? kiblnd_handle_rx+0x2cb/0x680 [ko2iblnd]
2013-08-23 17:08:55 [&amp;lt;ffffffff8103d349&amp;gt;] ? __wake_up_common+0x59/0x90
2013-08-23 17:08:55 [&amp;lt;ffffffffa08d2560&amp;gt;] ? kiblnd_rx_complete+0x2d0/0x440 [ko2iblnd]
2013-08-23 17:08:55 [&amp;lt;ffffffff81042cf3&amp;gt;] ? __wake_up+0x53/0x70
2013-08-23 17:08:55 [&amp;lt;ffffffffa08d2732&amp;gt;] ? kiblnd_complete+0x62/0xe0 [ko2iblnd]
2013-08-23 17:08:55 [&amp;lt;ffffffffa08d2ae9&amp;gt;] ? kiblnd_scheduler+0x339/0x7a0 [ko2iblnd]
2013-08-23 17:08:55 [&amp;lt;ffffffff81048df0&amp;gt;] ? default_wake_function+0x0/0x20
2013-08-23 17:08:55 [&amp;lt;ffffffffa08d27b0&amp;gt;] ? kiblnd_scheduler+0x0/0x7a0 [ko2iblnd]
2013-08-23 17:08:55 [&amp;lt;ffffffff8100412a&amp;gt;] ? child_rip+0xa/0x20
2013-08-23 17:08:55 [&amp;lt;ffffffffa08d27b0&amp;gt;] ? kiblnd_scheduler+0x0/0x7a0 [ko2iblnd]
2013-08-23 17:08:55 [&amp;lt;ffffffffa08d27b0&amp;gt;] ? kiblnd_scheduler+0x0/0x7a0 [ko2iblnd]
2013-08-23 17:08:55 [&amp;lt;ffffffff81004120&amp;gt;] ? child_rip+0x0/0x20
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The messages visible in the syslog 2 hours before the failed assertion are:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2013-08-23 15:26:09 LustreError: 11-0: an error occurred while communicating with 10.200.200.66@o2ib. The obd_ping operation failed with -107
2013-08-23 15:26:09 Lustre: scratch-OST0018-osc-ffff88087a659000: Connection to scratch-OST0018 (at 10.200.200.66@o2ib) was lost; in progress operations using this service will wait for recovery to complete
2013-08-23 15:26:09 LustreError: 167-0: This client was evicted by scratch-OST0018; in progress operations using this service will fail.
2013-08-23 15:26:09 LustreError: 24191:0:(osc_lock.c:816:osc_ldlm_completion_ast()) lock@ffff880e871c7d98[2 3 0 1 1 00000000] R(1):[4, 18446744073709551615]@[0x100180000:0x2e225e:0x0] {
2013-08-23 15:26:09 LustreError: 24191:0:(osc_lock.c:816:osc_ldlm_completion_ast()) lovsub@ffff880c2c27c8a0: [0 ffff880f31e5f268 R(1):[4, 18446744073709551615]@[0x2006bc5f0:0x1ac23:0x0]]
2013-08-23 15:26:09 LustreError: 24191:0:(osc_lock.c:816:osc_ldlm_completion_ast()) osc@ffff88104e603d78: ffff881072b1f240 40120002 0x4d3a65b709264355 3 ffff881012f1ae88 size: 26325 mtime: 1377264244 atime: 1377264247 ctime: 1377264244 blocks: 56
2013-08-23 15:26:09 LustreError: 24191:0:(osc_lock.c:816:osc_ldlm_completion_ast()) } lock@ffff880e871c7d98
2013-08-23 15:26:09 LustreError: 24191:0:(osc_lock.c:816:osc_ldlm_completion_ast()) dlmlock returned -5
2013-08-23 15:26:09 LustreError: 24191:0:(osc_lock.c:816:osc_ldlm_completion_ast()) lock@ffff880e871c7078[2 2 0 1 1 00000000] R(1):[2, 3]@[0x100180000:0x2e225e:0x0] {
2013-08-23 15:26:09 LustreError: 24191:0:(osc_lock.c:816:osc_ldlm_completion_ast()) lovsub@ffff880c2c27c920: [0 ffff8801fca3e4d8 R(1):[0, 18446744073709551615]@[0x2006bc5f0:0x1ac23:0x0]]
2013-08-23 15:26:09 LustreError: 24191:0:(osc_lock.c:816:osc_ldlm_completion_ast()) osc@ffff88104e603368: ffff88077ea89d80 40120002 0x4d3a65b709264363 2 (null) size: 26325 mtime: 1377264244 atime: 1377264247 ctime: 1377264244 blocks: 56
2013-08-23 15:26:09 LustreError: 24191:0:(osc_lock.c:816:osc_ldlm_completion_ast()) } lock@ffff880e871c7078
2013-08-23 15:26:09 LustreError: 24191:0:(osc_lock.c:816:osc_ldlm_completion_ast()) dlmlock returned -5
2013-08-23 15:26:09 LustreError: 24191:0:(ldlm_resource.c:749:ldlm_resource_complain()) Namespace scratch-OST0018-osc-ffff88087a659000 resource refcount nonzero (2) after lock cleanup; forcing cleanup.
2013-08-23 15:26:09 LustreError: 24191:0:(ldlm_resource.c:755:ldlm_resource_complain()) Resource: ffff880e973c7480 (3023454/0/0/0) (rc: 2)
2013-08-23 15:26:09 Lustre: scratch-OST0018-osc-ffff88087a659000: Connection restored to scratch-OST0018 (at 10.200.200.66@o2ib)
2013-08-23 15:26:09 LustreError: 24071:0:(cl_lock.c:1413:cl_unuse_try()) lock@ffff880e871c7698[2 4 0 2 0 00000000] R(1):[4, 18446744073709551615]@[0x2006bc5f0:0x1ac23:0x0] {
2013-08-23 15:26:09 LustreError: 24071:0:(cl_lock.c:1413:cl_unuse_try()) vvp@ffff880dafcd7d30:
2013-08-23 15:26:09 LustreError: 24071:0:(cl_lock.c:1413:cl_unuse_try()) lov@ffff880f31e5f268: 1
2013-08-23 15:26:09 LustreError: 24071:0:(cl_lock.c:1413:cl_unuse_try()) 0 0: ---
2013-08-23 15:26:09 LustreError: 24071:0:(cl_lock.c:1413:cl_unuse_try())
2013-08-23 15:26:09 LustreError: 24071:0:(cl_lock.c:1413:cl_unuse_try()) } lock@ffff880e871c7698
2013-08-23 15:26:09 LustreError: 24071:0:(cl_lock.c:1413:cl_unuse_try()) unuse return -5
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;After investigation, no error was seen on the Infiniband network by the time of the Lustre node crash.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
Sebastien.&lt;/p&gt;</description>
                <environment></environment>
        <key id="20660">LU-3848</key>
            <summary>Compute node crashes due to error in Lustre : ASSERTION( me == md-&gt;md_me ) failed</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="bfaccini">Bruno Faccini</assignee>
                                    <reporter username="sebastien.buisson">Sebastien Buisson</reporter>
                        <labels>
                            <label>mn1</label>
                    </labels>
                <created>Wed, 28 Aug 2013 14:21:28 +0000</created>
                <updated>Wed, 24 Feb 2016 17:28:55 +0000</updated>
                            <resolved>Wed, 5 Mar 2014 15:21:19 +0000</resolved>
                                    <version>Lustre 2.1.3</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>10</watches>
                                                                            <comments>
                            <comment id="65267" author="bfaccini" created="Wed, 28 Aug 2013 16:33:26 +0000"  >&lt;p&gt;Hello Seb!&lt;br/&gt;
Is there a crash-dump available ?? Is it a one shoot ??&lt;/p&gt;</comment>
                            <comment id="65271" author="liang" created="Wed, 28 Aug 2013 16:47:20 +0000"  >&lt;p&gt;this could be an instance of bz11130 which exists ages (I believe there are a few more instances of it in BZ, but I&apos;m not authorized to access those tickets), I suspect it&apos;s a nasty memory corruption which is irrelevant to LNet.&lt;/p&gt;</comment>
                            <comment id="65300" author="kitwestneat" created="Wed, 28 Aug 2013 19:18:39 +0000"  >&lt;p&gt;FYI we were getting this in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3010&quot; title=&quot;client crashes on RHEL6 with Lustre 1.8.8&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3010&quot;&gt;&lt;del&gt;LU-3010&lt;/del&gt;&lt;/a&gt;. It ended up being an RDMA bug in that version of redhat.&lt;/p&gt;</comment>
                            <comment id="65319" author="pjones" created="Wed, 28 Aug 2013 22:18:02 +0000"  >&lt;p&gt;Kit&lt;/p&gt;

&lt;p&gt;Thanks for the tip. Could you please share the details of the RH version you experienced the bug in and what version contained the fix (or the RH ticket number if you know it)&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Bob&lt;/p&gt;

&lt;p&gt;Could you please oversee this issue?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="65324" author="kitwestneat" created="Wed, 28 Aug 2013 22:46:17 +0000"  >&lt;p&gt;We were running 2.6.32-279.19.1.el6.x86_64.rpm. I think it is fixed in the RH 6.4 line, unfortunately I am not sure what the bz or errata was. My understanding is that no 6.3 kernel release has the fix. &lt;/p&gt;</comment>
                            <comment id="65329" author="bogl" created="Wed, 28 Aug 2013 23:23:11 +0000"  >&lt;p&gt;Kit, thanks for those details.&lt;/p&gt;

&lt;p&gt;Sebastian, since the probable bug is in the upstream kernel you will need to move to 6.4 in order to get the fix.  We shifted support in lustre from 6.3 kernels to 6.4 kernels soon after the 2.1.5 lustre version. You will need a lustre build from b2_1 that matches the specific kernel version you move to or build your own from source. The most recent version we support on the b2_1 branch is RHEL/Centos 6.4, 2.6.32-358.11.1.el6&lt;/p&gt;</comment>
                            <comment id="65338" author="louveta" created="Thu, 29 Aug 2013 04:05:41 +0000"  >&lt;p&gt;Kit, Bob,&lt;/p&gt;

&lt;p&gt;Is this RDMA corruption has any relation with the patch lustre/kernel_patches/patches/ipoib-locking-fix.patch that was in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2846&quot; title=&quot;Kernel update [RHEL6.4 2.6.32-358.2.1.el6]&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2846&quot;&gt;&lt;del&gt;LU-2846&lt;/del&gt;&lt;/a&gt;, commit 0f1d64724480bdfa268d18153272c3484c90e578 ?&lt;/p&gt;</comment>
                            <comment id="65340" author="kitwestneat" created="Thu, 29 Aug 2013 06:25:47 +0000"  >&lt;p&gt;Hi Alexandre,&lt;/p&gt;

&lt;p&gt;I believe it was that patch. The customer is now running 2.6.32-279.32.1.el6, which is not public, but the changelog for it only has that patch. Strangely though, according to the RHEL errata the bug wasn&apos;t introduced until 279.22.. Perhaps there was another memory corruption bug that the patch fixed? In any case, this patch helped us with the LBUG in kiblnd. &lt;/p&gt;

&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;Kit&lt;/li&gt;
&lt;/ul&gt;
</comment>
                            <comment id="65369" author="bogl" created="Thu, 29 Aug 2013 15:06:26 +0000"  >&lt;p&gt;Going back over our change history, the patch being referred to was&lt;/p&gt;

&lt;p&gt;lustre/kernel_patches/patches/ipoib-locking-fix.patch&lt;/p&gt;

&lt;p&gt;It was added to our kernel patch series for application to the 6.4 kernel (2.6.32-358.2.1.el6) in &lt;a href=&quot;http://review.whamcloud.com/#/c/5952/3&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/5952/3&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;It was deleted in &lt;a href=&quot;http://review.whamcloud.com/#/c/6615/3&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/6615/3&lt;/a&gt; during our support upgrade to the later 6.4 version (2.6.32-358.11.1.el6) when that change landed in the upstream kernel.&lt;/p&gt;

&lt;p&gt;While it should work in principal we have never applied or tested that patch on a 6.3 kernel ourselves.  Or you can try to use the 2.6.32-279.32.1.el6 kernel that Kit says has the fix.  Either of those options will probably work but haven&apos;t been proofed or tested here.&lt;/p&gt;</comment>
                            <comment id="66487" author="sebastien.buisson" created="Thu, 12 Sep 2013 14:08:03 +0000"  >&lt;p&gt;Hi,&lt;/p&gt;

&lt;p&gt;The patch lustre/kernel_patches/patches/ipoib-locking-fix.patch has been installed on-site, and so far it improves the situation very well: Lustre is much more stable now.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
Sebastien.&lt;/p&gt;</comment>
                            <comment id="70964" author="sebastien.buisson" created="Thu, 7 Nov 2013 12:59:25 +0000"  >&lt;p&gt;Hi,&lt;/p&gt;

&lt;p&gt;Unfortunately I think we mixed different issues: the problem initially described here was hit at SARA customer, whereas my last comment was only true for MeteoFrance customer (please also note that this patch on IPoIB was removed as it was found to introduce a deadlock).&lt;/p&gt;

&lt;p&gt;So back to active as this issue has been hit 11 times during October.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
Sebastien.&lt;/p&gt;</comment>
                            <comment id="71430" author="bogl" created="Wed, 13 Nov 2013 16:08:32 +0000"  >&lt;p&gt;Very strongly suspect you are hitting this problem because you have removed the IPoIB patch.  It was directly cribbed from upstream linux and addressed precisely this sort of memory corruption in IB.  If it causes a deadlock for you the only remedy I can suggest is to move to a later kernel version that already incorporates the equivalent fix.  As the problem is in the upstream linux, not in lustre, there&apos;s not a lot we can do about it.&lt;/p&gt;</comment>
                            <comment id="71532" author="sebastien.buisson" created="Thu, 14 Nov 2013 15:32:38 +0000"  >&lt;p&gt;From &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3010&quot; title=&quot;client crashes on RHEL6 with Lustre 1.8.8&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3010&quot;&gt;&lt;del&gt;LU-3010&lt;/del&gt;&lt;/a&gt;, it seems that a modification in the RH6.4 kernels obsoletes the ipoib-locking-fix patch.&lt;br/&gt;
Can we have a pointer to this modification ?&lt;/p&gt;</comment>
                            <comment id="71544" author="kitwestneat" created="Thu, 14 Nov 2013 16:07:26 +0000"  >&lt;p&gt;Hi Sebastien,&lt;/p&gt;

&lt;p&gt;The issue in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3010&quot; title=&quot;client crashes on RHEL6 with Lustre 1.8.8&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3010&quot;&gt;&lt;del&gt;LU-3010&lt;/del&gt;&lt;/a&gt; was resolved by the ipoib patch, which RHEL included in the 6.4 kernel:&lt;br/&gt;
&lt;a href=&quot;https://access.redhat.com/site/documentation/en-US/Red_Hat_Enterprise_Linux/6/html/6.4_Technical_Notes/kernel.html#RHSA-2013-0911&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://access.redhat.com/site/documentation/en-US/Red_Hat_Enterprise_Linux/6/html/6.4_Technical_Notes/kernel.html#RHSA-2013-0911&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;(bz 928817)&lt;/p&gt;

&lt;p&gt;FWIW I don&apos;t think NOAA has seen any client deadlocks, but we did run into an OSS o2iblnd deadlock (&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3596&quot; title=&quot;deadlock in kiblnd&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3596&quot;&gt;&lt;del&gt;LU-3596&lt;/del&gt;&lt;/a&gt;). &lt;/p&gt;

&lt;p&gt;HTH,&lt;br/&gt;
Kit&lt;/p&gt;</comment>
                            <comment id="71545" author="bogl" created="Thu, 14 Nov 2013 16:15:40 +0000"  >&lt;p&gt;Kit,&lt;/p&gt;

&lt;p&gt;Thanks for that reference.  It was mentioned in the commit header of &lt;a href=&quot;http://review.whamcloud.com/5952&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5952&lt;/a&gt; that our ipoib patch came from upstream commit fa16ebed31f336e41970f3f0ea9e8279f6be2d27.  I would have had a hard time tracking down a better pointer.&lt;/p&gt;</comment>
                            <comment id="72525" author="sebastien.buisson" created="Fri, 29 Nov 2013 15:18:51 +0000"  >&lt;p&gt;Hi,&lt;/p&gt;

&lt;p&gt;This issue is not fixed by the &apos;ipoib&apos; patch. Precisely we already run an OFED stack that integrates a more complete version of this patch, and it does not help.&lt;/p&gt;

&lt;p&gt;In order to help investigations, we have uploaded a crash dump on Whamcloud&apos;s ftp, at &lt;a href=&quot;ftp://ftp.whamcloud.com/uploads/vmcore-tcn127-LBUG&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;ftp://ftp.whamcloud.com/uploads/vmcore-tcn127-LBUG&lt;/a&gt; .&lt;br/&gt;
Please tell us if you need anything else.&lt;/p&gt;

&lt;p&gt;Sebastien.&lt;/p&gt;</comment>
                            <comment id="72548" author="bfaccini" created="Sat, 30 Nov 2013 10:00:23 +0000"  >&lt;p&gt;Hello Sebastien,&lt;br/&gt;
Seems we are back to the beginning on this, and with your answer to my very 1st demand !!&lt;br/&gt;
BTW, since it appear to be the solo vmcore file (header shows 2.6.32-279.5.2.bl6.Bull.36.x86_64 Kernel version), can you also provide the associated vmlinux (or kernel-dbuginfo-* RPMs) and Lustre modules ?&lt;br/&gt;
Also, does this still occur running with 2.1.3?&lt;br/&gt;
Thanks again and in advance.&lt;/p&gt;</comment>
                            <comment id="72568" author="hugo_meiland" created="Mon, 2 Dec 2013 11:14:32 +0000"  >&lt;p&gt;added zip file to ftp with vmlinuz, systemmap and mlx4-ib.ko: /uploads/LU3848-vmlinuz-systemmap-mlx4_ib_module.zip&lt;/p&gt;</comment>
                            <comment id="72601" author="hugo_meiland" created="Mon, 2 Dec 2013 15:27:25 +0000"  >&lt;p&gt;new zip uploaded, containing vmlinux, systemmap and weak-updates&lt;/p&gt;</comment>
                            <comment id="72618" author="pjones" created="Mon, 2 Dec 2013 17:10:51 +0000"  >&lt;p&gt;Bruno&lt;/p&gt;

&lt;p&gt;Could you please take over assisting with this ticket?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="72645" author="bfaccini" created="Mon, 2 Dec 2013 20:35:01 +0000"  >&lt;p&gt;Hugo, can you indicate the name of the 2nd file because I can not list files on the upload!&lt;/p&gt;</comment>
                            <comment id="72687" author="bfaccini" created="Tue, 3 Dec 2013 08:06:46 +0000"  >&lt;p&gt;Humm based on the 1st name construct, I think I was able to guess it, is&apos;nt it &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3848&quot; title=&quot;Compute node crashes due to error in Lustre : ASSERTION( me == md-&amp;gt;md_me ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3848&quot;&gt;&lt;del&gt;LU-3848&lt;/del&gt;&lt;/a&gt;-vmlinux-systemmap-weak_updates.zip ?&lt;/p&gt;</comment>
                            <comment id="72688" author="hugo_meiland" created="Tue, 3 Dec 2013 08:30:54 +0000"  >&lt;p&gt;ack, this is the one... please let me know if you need additional info!&lt;br/&gt;
Hugo&lt;/p&gt;</comment>
                            <comment id="72895" author="bfaccini" created="Thu, 5 Dec 2013 14:16:20 +0000"  >&lt;p&gt;Thanks Hugo!&lt;br/&gt;
In the Kernel log of the crash-dump you uploaded, I see some OOM traces and also VTune driver being loaded, since there are no time-stamps can you check in syslog (or uplod it too ?) if it occurs around the time of the LBUG ??&lt;br/&gt;
Continuing crash-dump analysis.&lt;/p&gt;</comment>
                            <comment id="72912" author="bfaccini" created="Thu, 5 Dec 2013 18:46:40 +0000"  >&lt;p&gt;Argh, this is weird looks like the me-&amp;gt;me_md pointer, of the 1st ME linked on the_lnet.ln_portals&lt;span class=&quot;error&quot;&gt;&amp;#91;15&amp;#93;&lt;/span&gt;-&amp;gt; ptl_mlist (there are 1536 on the linked-list) being parsed in lnet_match_md(), is -3 bytes (!!) than the correct MD address ... :&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;crash&amp;gt; bt
PID: 3265   TASK: ffff881079e3c7b0  CPU: 0   COMMAND: &quot;kiblnd_sd_19&quot;
 #0 [ffff88107aed7a48] machine_kexec at ffffffff8102902b
 #1 [ffff88107aed7aa8] crash_kexec at ffffffff810a5292
 #2 [ffff88107aed7b78] panic at ffffffff81495fea
 #3 [ffff88107aed7bf8] lbug_with_loc at ffffffffa04fce5b [libcfs]
 #4 [ffff88107aed7c18] lnet_match_md at ffffffffa05742bd [lnet]
 #5 [ffff88107aed7cc8] lnet_parse at ffffffffa0579482 [lnet]
 #6 [ffff88107aed7db8] kiblnd_handle_rx at ffffffffa090977b [ko2iblnd]
 #7 [ffff88107aed7e08] kiblnd_rx_complete at ffffffffa090a560 [ko2iblnd]
 #8 [ffff88107aed7e58] kiblnd_complete at ffffffffa090a732 [ko2iblnd]
 #9 [ffff88107aed7e68] kiblnd_scheduler at ffffffffa090aae9 [ko2iblnd]
#10 [ffff88107aed7f48] kernel_thread at ffffffff8100412a
crash&amp;gt; 

crash&amp;gt; p/x the_lnet.ln_portals[15]
$6 = {
  ptl_mhash = 0x0, 
  ptl_mlist = {
    next = 0xffff88083674e3c0, 
    prev = 0xffff880866eaebc0
  }, 
  ptl_msgq = {
    next = 0xffff88107ce0d3d8, 
    prev = 0xffff88107ce0d3d8
  }, 
  ptl_ml_version = 0x35c7, 
  ptl_msgq_version = 0x1, 
  ptl_options = 0x5   &amp;lt;&amp;lt;&amp;lt;&amp;lt;&amp;lt;&amp;lt;&amp;lt;&amp;lt;----- LNET_PTL_LAZY | LNET_PTL_MATCH_WILDCARD
}
crash&amp;gt; p/x *(lnet_me_t *)0xffff88083674e3c0
$13 = {
  me_list = {
    next = 0xffff88032837b3c0, 
    prev = 0xffff88107ce0d3c8
  }, 
  me_lh = {
    lh_hash_chain = {
      next = 0xffff880865e2b450, 
      prev = 0xffff880413b34850
    }, 
    lh_cookie = 0x45d7862
  }, 
  me_match_id = {
    nid = 0xffffffffffffffff, 
    pid = 0xffffffff
  }, 
  me_portal = 0xf, 
  me_match_bits = 0x0, 
  me_ignore_bits = 0xffffffffffffffff, 
  me_unlink = 0x1, 
  me_md = 0xffff88034c96543d   &amp;lt;&amp;lt;&amp;lt;&amp;lt;&amp;lt;&amp;lt;------ VERY BAD!!
}
crash&amp;gt; 
crash&amp;gt; p/x ((lnet_me_t *)0xffff88083674e3c0)-&amp;gt;me_md
$18 = 0xffff88034c96543d
crash&amp;gt; p/x ((lnet_me_t *)0xffff88083674e3c0)-&amp;gt;me_md-&amp;gt;md_me
$19 = 0x83674e3c0000000
crash&amp;gt; 
crash&amp;gt; p/x ((struct lnet_libmd *)0xffff88034c96543d)-&amp;gt;md_me
$20 = 0x83674e3c0000000
crash&amp;gt; p/x ((struct lnet_libmd *)0xffff88034c965440)-&amp;gt;md_me
$21 = 0xffff88083674e3c0
crash&amp;gt; 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;But how can this happen ??!! When the only place where MD/me-&amp;gt;me_md is set is in LNetMDAttach() and this after being allocated from SLabs.&lt;/p&gt;</comment>
                            <comment id="72968" author="hugo_meiland" created="Fri, 6 Dec 2013 12:36:25 +0000"  >&lt;p&gt;no usefull info on drivers loading during crash, for completeness syslog uploaded as &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3848&quot; title=&quot;Compute node crashes due to error in Lustre : ASSERTION( me == md-&amp;gt;md_me ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3848&quot;&gt;&lt;del&gt;LU-3848&lt;/del&gt;&lt;/a&gt;-syslog.zip&lt;/p&gt;</comment>
                            <comment id="73039" author="liang" created="Sat, 7 Dec 2013 04:02:53 +0000"  >&lt;p&gt;Because this kind of memory corruption can be from different modules, how about creating dedicated slab cache for both lnet_me_t and lnet_libmd_t (&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4330&quot; title=&quot;LustreError: 46336:0:(events.c:433:ptlrpc_master_callback()) ASSERTION( callback == request_out_callback || callback == reply_in_callback || callback == client_bulk_callback || callback == request_in_callback || callback == reply_out_callback ... ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4330&quot;&gt;LU-4330&lt;/a&gt;)? If these are indeed defects in LNet, we still can reproduce, otherwise we should see different symptoms which might help to find out the real reason. &lt;/p&gt;</comment>
                            <comment id="73043" author="bfaccini" created="Sat, 7 Dec 2013 15:59:38 +0000"  >&lt;p&gt;Hugo: thanks for the log, and yes I agree that no specific activity/msg can be found in the Syslog at crash/LBUG time.&lt;/p&gt;

&lt;p&gt;Liang: thanks for your comment. &lt;br/&gt;
Since the same kind of corruption occured in a MD for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4330&quot; title=&quot;LustreError: 46336:0:(events.c:433:ptlrpc_master_callback()) ASSERTION( callback == request_out_callback || callback == reply_in_callback || callback == client_bulk_callback || callback == request_in_callback || callback == reply_out_callback ... ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4330&quot;&gt;LU-4330&lt;/a&gt; and in the ME for this ticket, both being allocated in Slab/kmem-cache size-128. So, as per your advice, I will try to develop a patch where MEs and small MDs (&amp;lt;128bytes) will be allocated in a specific kmem-cache. On the other hand and specifically for MDs, I may also try to use/define LNET_USE_LIB_FREELIST, this way we will use full-size and pre-allocated MDs &#8230;&lt;/p&gt;

&lt;p&gt;It is still a bit early to directly link this ticket to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4330&quot; title=&quot;LustreError: 46336:0:(events.c:433:ptlrpc_master_callback()) ASSERTION( callback == request_out_callback || callback == reply_in_callback || callback == client_bulk_callback || callback == request_in_callback || callback == reply_out_callback ... ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4330&quot;&gt;LU-4330&lt;/a&gt;, because even if it concern the same kind of corruption (-3), the same size-128 kmem-cache, a node of the same cluster/system, the offset in the affected Slab/structure is different.&lt;/p&gt;</comment>
                            <comment id="73165" author="liang" created="Tue, 10 Dec 2013 01:49:04 +0000"  >&lt;p&gt;btw, I just posted a patch to fix potential memory corruption: &lt;a href=&quot;http://review.whamcloud.com/#/c/8511/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/8511/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="75663" author="hugo_meiland" created="Mon, 27 Jan 2014 12:25:52 +0000"  >&lt;p&gt;Bruno, Liang: This weekend I was able to obtain kernel coredumps again with this error (after a stable period os 3-4 weeks). As mentioned earlier the lustre version has been upgraded to 2.1.6. The uploaded file is LU3848-tcn23-vmcore-vmlinux-systemmap-weak-updates.zip; please let me know if you need additional files...&lt;/p&gt;</comment>
                            <comment id="75747" author="bfaccini" created="Tue, 28 Jan 2014 09:32:23 +0000"  >&lt;p&gt;Hello Hugo, I will review the new crash-dump you uploaded to verify we still encounter the same kind of corruption.&lt;/p&gt;

&lt;p&gt;BTW, I have pushed a patch that should help to better identify the root cause of the MEs/MDs corruption in Slabs, for both &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4330&quot; title=&quot;LustreError: 46336:0:(events.c:433:ptlrpc_master_callback()) ASSERTION( callback == request_out_callback || callback == reply_in_callback || callback == client_bulk_callback || callback == request_in_callback || callback == reply_out_callback ... ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4330&quot;&gt;LU-4330&lt;/a&gt; and this ticket, it is available at &lt;a href=&quot;http://review.whamcloud.com/8819/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/8819/&lt;/a&gt;. It has already been successfully exposed to autotests, and you may want to give it a try. May be Sebastien could also validate it and try to integrate it in Bull distro?&lt;/p&gt;

&lt;p&gt;Since this patch extracts LNET MEs and small-MDs allocation from generic &amp;lt;size-128&amp;gt; kmem_cache, I strongly expect that you will no longer encounter &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3848&quot; title=&quot;Compute node crashes due to error in Lustre : ASSERTION( me == md-&amp;gt;md_me ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3848&quot;&gt;&lt;del&gt;LU-3848&lt;/del&gt;&lt;/a&gt;/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4330&quot; title=&quot;LustreError: 46336:0:(events.c:433:ptlrpc_master_callback()) ASSERTION( callback == request_out_callback || callback == reply_in_callback || callback == client_bulk_callback || callback == request_in_callback || callback == reply_out_callback ... ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4330&quot;&gt;LU-4330&lt;/a&gt; LBUGs (and may be suffer new issues with others Kernel codes using &amp;lt;size-128&amp;gt; &#8230;). But if not, and you are able to reproduce the LBUGs, then I will need to investigate further.&lt;/p&gt;</comment>
                            <comment id="75749" author="liang" created="Tue, 28 Jan 2014 09:46:01 +0000"  >&lt;p&gt;Hi Hugo, have you already applied the patch I mentioned previously? &lt;a href=&quot;http://review.whamcloud.com/#/c/8511/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/8511/&lt;/a&gt; , it did fix potential memory corruption issue, although not sure if it&apos;s the reason of your issue here. &lt;/p&gt;</comment>
                            <comment id="75852" author="hugo_meiland" created="Wed, 29 Jan 2014 15:10:32 +0000"  >&lt;p&gt;Hi Bruno and Liang; &lt;br/&gt;
@Bruno: Thanks for the patch, I&apos;ve asked Sebastien to prepare it for me to test at the client site. I do have plenty more of crashes available right now, but I&apos;m not sure if this would be a huge help at this time.&lt;br/&gt;
@Liang: Your patch has been integrated to our tree, but just after our upgrade to 2.1.6, so I&apos;m not running this patch yet. It will be in the next lustre package.&lt;br/&gt;
I&apos;ll try to give you feedback on these as soon as possible.&lt;/p&gt;</comment>
                            <comment id="75924" author="bfaccini" created="Thu, 30 Jan 2014 09:20:41 +0000"  >&lt;p&gt;Hello Hugo,&lt;br/&gt;
Thanks again for your help on this!&lt;br/&gt;
I just had a look into the recent crash-dump you uploaded in LU3848-tcn23-vmcore-vmlinux-systemmap-weak-updates.zip, and I can confirm this is the same scenario; the only small difference is that the me_md field corruption is -1 instead of -3 as before.&lt;/p&gt;</comment>
                            <comment id="76559" author="bfaccini" created="Sun, 9 Feb 2014 17:16:09 +0000"  >&lt;p&gt;I need to indicate that after successful auto-tests, I made local extensive testing of &lt;a href=&quot;http://review.whamcloud.com/8819/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/8819/&lt;/a&gt; and I can confirm it works as expected and with no MEs/MDs leak in Slabs.&lt;/p&gt;</comment>
                            <comment id="78458" author="hugo_meiland" created="Wed, 5 Mar 2014 15:02:36 +0000"  >&lt;p&gt;As an update: the fix has now been installed for a couple of weeks and no more of these LBUGs have been seen, so the fix looks good...&lt;/p&gt;</comment>
                            <comment id="78461" author="pjones" created="Wed, 5 Mar 2014 15:21:19 +0000"  >&lt;p&gt;Great! Thanks for letting us know Hugo&lt;/p&gt;</comment>
                            <comment id="143605" author="bfaccini" created="Wed, 24 Feb 2016 17:28:55 +0000"  >&lt;p&gt;Follow on (master patch, for any other version, ...) to be tracked in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4330&quot; title=&quot;LustreError: 46336:0:(events.c:433:ptlrpc_master_callback()) ASSERTION( callback == request_out_callback || callback == reply_in_callback || callback == client_bulk_callback || callback == request_in_callback || callback == reply_out_callback ... ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4330&quot;&gt;LU-4330&lt;/a&gt; now.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="22283">LU-4330</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvzhr:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9966</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>