<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:41:45 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-4330] LustreError: 46336:0:(events.c:433:ptlrpc_master_callback()) ASSERTION( callback == request_out_callback || callback == reply_in_callback || callback == client_bulk_callback || callback == request_in_callback || callback == reply_out_callback ... ) failed</title>
                <link>https://jira.whamcloud.com/browse/LU-4330</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Hi,&lt;/p&gt;

&lt;p&gt;More and more compute nodes of several different customer clusters are hitting an LBUG on this &apos;assertion failed&apos; issue:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2013-11-21 14:06:54 LustreError: 46336:0:(events.c:433:ptlrpc_master_callback()) ASSERTION( callback == request_out_callback || callback == reply_in_callback || callback == client_bulk_callback || callback == request_in_callback || callback == reply_out_callback || callback == server_bulk_callback ) failed:
2013-11-21 14:06:54 LustreError: 46336:0:(events.c:433:ptlrpc_master_callback()) LBUG
2013-11-21 14:06:54 Nov 21 14:06:54 Pid: 46336, comm: kiblnd_sd_00
2013-11-21 14:06:54 compute5666 kernel
2013-11-21 14:06:54 : LustreError: 4Call Trace:
2013-11-21 14:06:54 6336:0:(events.c [&amp;lt;ffffffffa041c7f5&amp;gt;] libcfs_debug_dumpstack+0x55/0x80 [libcfs]
2013-11-21 14:06:54 :433:ptlrpc_mast [&amp;lt;ffffffffa041ce07&amp;gt;] lbug_with_loc+0x47/0xb0 [libcfs]
2013-11-21 14:06:54 er_callback()) A [&amp;lt;ffffffffa06a426c&amp;gt;] ptlrpc_master_callback+0xcc/0xd0 [ptlrpc]
2013-11-21 14:06:54 SSERTION( callba [&amp;lt;ffffffffa048ebd2&amp;gt;] lnet_enq_event_locked+0x62/0xd0 [lnet]
2013-11-21 14:06:54 ck == request_ou [&amp;lt;ffffffffa048ecdb&amp;gt;] lnet_finalize+0x9b/0x2f0 [lnet]
2013-11-21 14:06:54 t_callback || ca [&amp;lt;ffffffffa083d073&amp;gt;] kiblnd_recv+0x103/0x570 [ko2iblnd]
2013-11-21 14:06:54 llback == reply_ [&amp;lt;ffffffffa04928dd&amp;gt;] lnet_ni_recv+0xad/0x2f0 [lnet]
2013-11-21 14:06:54 in_callback || c [&amp;lt;ffffffffa0492c06&amp;gt;] lnet_recv_put+0xe6/0x120 [lnet]
2013-11-21 14:06:54 allback == clien [&amp;lt;ffffffffa0499c33&amp;gt;] lnet_parse+0x1273/0x1b80 [lnet]
2013-11-21 14:06:54 t_bulk_callback [&amp;lt;ffffffff81042ca3&amp;gt;] ? enqueue_task+0x43/0x90
2013-11-21 14:06:54 || callback == r [&amp;lt;ffffffffa083d7ab&amp;gt;] kiblnd_handle_rx+0x2cb/0x680 [ko2iblnd]
2013-11-21 14:06:54 equest_in_callba [&amp;lt;ffffffffa083e590&amp;gt;] kiblnd_rx_complete+0x2d0/0x440 [ko2iblnd]
2013-11-21 14:06:54 ck || callback = [&amp;lt;ffffffff81042a63&amp;gt;] ? __wake_up+0x53/0x70
2013-11-21 14:06:54 = reply_out_call [&amp;lt;ffffffffa083e762&amp;gt;] kiblnd_complete+0x62/0xe0 [ko2iblnd]
2013-11-21 14:06:54 back || callback [&amp;lt;ffffffffa083eb19&amp;gt;] kiblnd_scheduler+0x339/0x7a0 [ko2iblnd]
2013-11-21 14:06:54 == server_bulk_ [&amp;lt;ffffffff8104a320&amp;gt;] ? default_wake_function+0x0/0x20
2013-11-21 14:06:54 callback ) faile [&amp;lt;ffffffffa083e7e0&amp;gt;] ? kiblnd_scheduler+0x0/0x7a0 [ko2iblnd]
2013-11-21 14:06:54 d:
2013-11-21 14:06:54 Nov 21 14:06 [&amp;lt;ffffffff8100412a&amp;gt;] child_rip+0xa/0x20
2013-11-21 14:06:54 :54 compute5666 ke [&amp;lt;ffffffffa083e7e0&amp;gt;] ? kiblnd_scheduler+0x0/0x7a0 [ko2iblnd]
2013-11-21 14:06:54 rnel: LustreErro [&amp;lt;ffffffffa083e7e0&amp;gt;] ? kiblnd_scheduler+0x0/0x7a0 [ko2iblnd]
2013-11-21 14:06:54 r: 46336:0:(even [&amp;lt;ffffffff81004120&amp;gt;] ? child_rip+0x0/0x20
2013-11-21 14:06:54 ts.c:433:ptlrpc_
2013-11-21 14:06:54 master_callback()) LBUG
2013-11-21 14:06:54 Nov 21 14:06:54 compute566Kernel panic - not syncing: LBUG
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;For information, systems are running with kernel boot parameter &apos;tolerant=1&apos; set.&lt;/p&gt;

&lt;p&gt;We have a crash dump that we will upload to ftp.&lt;/p&gt;

&lt;p&gt;Sebastien.&lt;/p&gt;</description>
                <environment></environment>
        <key id="22283">LU-4330</key>
            <summary>LustreError: 46336:0:(events.c:433:ptlrpc_master_callback()) ASSERTION( callback == request_out_callback || callback == reply_in_callback || callback == client_bulk_callback || callback == request_in_callback || callback == reply_out_callback ... ) failed</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="4" iconUrl="https://jira.whamcloud.com/images/icons/statuses/reopened.png" description="This issue was once resolved, but the resolution was deemed incorrect. From here issues are either marked assigned or resolved.">Reopened</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="bfaccini">Bruno Faccini</assignee>
                                    <reporter username="sebastien.buisson">Sebastien Buisson</reporter>
                        <labels>
                    </labels>
                <created>Mon, 2 Dec 2013 14:40:13 +0000</created>
                <updated>Thu, 14 Jun 2018 21:41:20 +0000</updated>
                                            <version>Lustre 2.1.6</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>13</watches>
                                                                            <comments>
                            <comment id="72597" author="bfaccini" created="Mon, 2 Dec 2013 14:51:56 +0000"  >&lt;p&gt;Hello Sebastien, &lt;br/&gt;
According to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1734&quot; title=&quot;ptlrpc_master_callback() LBUG&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1734&quot;&gt;&lt;del&gt;LU-1734&lt;/del&gt;&lt;/a&gt;/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3010&quot; title=&quot;client crashes on RHEL6 with Lustre 1.8.8&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3010&quot;&gt;&lt;del&gt;LU-3010&lt;/del&gt;&lt;/a&gt; already describing the same issue/LBUG, it has been identified to be caused by some IB-related memory corruption.&lt;br/&gt;
Thus I am wondering if the affected systems are running with ipoib-locking-fix.patch described in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3848&quot; title=&quot;Compute node crashes due to error in Lustre : ASSERTION( me == md-&amp;gt;md_me ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3848&quot;&gt;&lt;del&gt;LU-3848&lt;/del&gt;&lt;/a&gt; (where you recently noticed &quot;please also note that this patch on IPoIB was removed as it was found to introduce a deadlock&quot;) ?&lt;/p&gt;</comment>
                            <comment id="72690" author="hugo_meiland" created="Tue, 3 Dec 2013 12:02:46 +0000"  >&lt;p&gt;just uploaded vmcore + vmlinux and Lustre modules to Intel&apos;s FTP: /uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4330&quot; title=&quot;LustreError: 46336:0:(events.c:433:ptlrpc_master_callback()) ASSERTION( callback == request_out_callback || callback == reply_in_callback || callback == client_bulk_callback || callback == request_in_callback || callback == reply_out_callback ... ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4330&quot;&gt;LU-4330&lt;/a&gt;* &lt;/p&gt;</comment>
                            <comment id="72884" author="sebastien.buisson" created="Thu, 5 Dec 2013 08:21:37 +0000"  >&lt;p&gt;Hi Bruno,&lt;/p&gt;

&lt;p&gt;Just a quick comment to mention that we are not running with the ipoib-locking-fix.patch, as we already run an OFED stack that integrates a more complete version of this patch, and it does not help.&lt;/p&gt;

&lt;p&gt;Cheers,&lt;br/&gt;
Sebastien.&lt;/p&gt;</comment>
                            <comment id="72900" author="bfaccini" created="Thu, 5 Dec 2013 14:55:28 +0000"  >&lt;p&gt;Hugo, thanks for uploading the crash-dump, but I need the exact names to be able to get them.&lt;br/&gt;
Sebastien, thanks for the precision.&lt;/p&gt;</comment>
                            <comment id="72963" author="hugo_meiland" created="Fri, 6 Dec 2013 08:32:42 +0000"  >&lt;p&gt;filenames are:&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4330&quot; title=&quot;LustreError: 46336:0:(events.c:433:ptlrpc_master_callback()) ASSERTION( callback == request_out_callback || callback == reply_in_callback || callback == client_bulk_callback || callback == request_in_callback || callback == reply_out_callback ... ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4330&quot;&gt;LU-4330&lt;/a&gt;-vmcore-tcn91-LBUG&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4330&quot; title=&quot;LustreError: 46336:0:(events.c:433:ptlrpc_master_callback()) ASSERTION( callback == request_out_callback || callback == reply_in_callback || callback == client_bulk_callback || callback == request_in_callback || callback == reply_out_callback ... ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4330&quot;&gt;LU-4330&lt;/a&gt;-vmlinux-systemmap-weak_updates.zip&lt;br/&gt;
Let me know if you can find them, otherwise I&apos;ll upload again...&lt;br/&gt;
Hugo&lt;/p&gt;</comment>
                            <comment id="72997" author="bfaccini" created="Fri, 6 Dec 2013 17:01:48 +0000"  >&lt;p&gt;Thanks Hugo, I got them and was able to look at the crash-dump then!&lt;br/&gt;
BTW, analysis conclusion looks very similar of the one I just made for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3848&quot; title=&quot;Compute node crashes due to error in Lustre : ASSERTION( me == md-&amp;gt;md_me ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3848&quot;&gt;&lt;del&gt;LU-3848&lt;/del&gt;&lt;/a&gt; :&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;crash&amp;gt; p/x ((lnet_me_t *)0xffff88053cc10dc0)-&amp;gt;me_md-&amp;gt;md_user_ptr
$32 = 0xffff88087aa2c03d
crash&amp;gt; 
crash&amp;gt; p/x *(struct ptlrpc_cb_id *)0xffff88087aa2c03d
$8 = {
  cbid_fn = 0xffa07a2b10ffff88, 
  cbid_arg = 0x87aa2c000ffffff
}
crash&amp;gt; p/x *(struct ptlrpc_cb_id *)0xffff88087aa2c040
$9 = {
  cbid_fn = 0xffffffffa07a2b10, 
  cbid_arg = 0xffff88087aa2c000
}
crash&amp;gt; x/i 0xffffffffa07a2b10
   0xffffffffa07a2b10 &amp;lt;request_in_callback&amp;gt;:    push   %rbp
crash&amp;gt; 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;which means that current MD&apos;s md_user_ptr field/pointer seems to have been decremented by 3 !!! Just like the ME&apos;s md_me pointer in crash-dump for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3848&quot; title=&quot;Compute node crashes due to error in Lustre : ASSERTION( me == md-&amp;gt;md_me ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3848&quot;&gt;&lt;del&gt;LU-3848&lt;/del&gt;&lt;/a&gt; that occured on an other Client of same Cluster/Customer.&lt;br/&gt;
This is again in Slabs size-128, no OOM logs but VTune/Emon driver loaded.&lt;/p&gt;

&lt;p&gt;So, if we add &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3848&quot; title=&quot;Compute node crashes due to error in Lustre : ASSERTION( me == md-&amp;gt;md_me ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3848&quot;&gt;&lt;del&gt;LU-3848&lt;/del&gt;&lt;/a&gt; and this ticket&apos;s symptoms, it looks like we suffer a new situation like the one described in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3010&quot; title=&quot;client crashes on RHEL6 with Lustre 1.8.8&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3010&quot;&gt;&lt;del&gt;LU-3010&lt;/del&gt;&lt;/a&gt; !!!&lt;/p&gt;

</comment>
                            <comment id="73044" author="bfaccini" created="Sat, 7 Dec 2013 16:06:29 +0000"  >&lt;p&gt;Since the same kind of corruption occured in a ME for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3848&quot; title=&quot;Compute node crashes due to error in Lustre : ASSERTION( me == md-&amp;gt;md_me ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3848&quot;&gt;&lt;del&gt;LU-3848&lt;/del&gt;&lt;/a&gt; and in the MD for this ticket, both being allocated in Slab/kmem-cache size-128. &lt;br/&gt;
So, as per Liang&apos;s advice, I will try to develop a patch where MEs and small MDs (&amp;lt;128bytes) will be allocated in a specific kmem-cache. On the other hand and specifically for MDs, I may also try to use/define LNET_USE_LIB_FREELIST, this way we will use full-size and pre-allocated MDs &#8230;&lt;/p&gt;

&lt;p&gt;It is still a bit early to directly link this ticket to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3848&quot; title=&quot;Compute node crashes due to error in Lustre : ASSERTION( me == md-&amp;gt;md_me ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3848&quot;&gt;&lt;del&gt;LU-3848&lt;/del&gt;&lt;/a&gt;, because even if it concern the same kind of corruption (-3), the same size-128 kmem-cache, a node of the same cluster/system, the offset in the affected Slab/structure is different.&lt;/p&gt;</comment>
                            <comment id="74296" author="hugo_meiland" created="Fri, 3 Jan 2014 07:34:23 +0000"  >&lt;p&gt;Hi all, best wishes for 2014!!&lt;/p&gt;

&lt;p&gt;after upgrading the clients to 2.1.6, this crash has again occurred, this hopefully helps in finding the underlying problem...&lt;/p&gt;

&lt;p&gt;1388685619 2014 Jan  2 19:00:19 tcn19 kern emerg kernel LustreError: 3259:0:(events.c:433:ptlrpc_master_callback()) ASSERTION( callback == request&lt;br/&gt;
_out_callback || callback == reply_in_callback || callback == client_bulk_callback || callback == request_in_callback || callback == reply_out_cal&lt;br/&gt;
lback || callback == server_bulk_callback ) failed: &lt;br/&gt;
1388685619 2014 Jan  2 19:00:19 tcn19 kern emerg kernel LustreError: 3259:0:(events.c:433:ptlrpc_master_callback()) LBUG&lt;br/&gt;
1388685619 2014 Jan  2 19:00:19 tcn19 kern warning kernel Pid: 3259, comm: kiblnd_sd_04&lt;br/&gt;
1388685619 2014 Jan  2 19:00:19 tcn19 kern warning kernel &lt;br/&gt;
1388685619 2014 Jan  2 19:00:19 tcn19 kern warning kernel Call Trace:&lt;br/&gt;
1388685619 2014 Jan  2 19:00:19 tcn19 kern warning kernel &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05317f5&amp;gt;&amp;#93;&lt;/span&gt; libcfs_debug_dumpstack+0x55/0x80 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 1388685619 2014 Jan  2 19:00:19 tcn19 kern warning kernel &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0531e07&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0x47/0xb0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 1388685619 2014 Jan  2 19:00:19 tcn19 kern warning kernel &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa07a225c&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_master_callback+0xcc/0xd0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 1388685619 2014 Jan  2 19:00:19 tcn19 kern warning kernel &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05a3bd2&amp;gt;&amp;#93;&lt;/span&gt; lnet_enq_event_locked+0x62/0xd0 &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;br/&gt;
 1388685619 2014 Jan  2 19:00:19 tcn19 kern warning kernel &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05a3cdb&amp;gt;&amp;#93;&lt;/span&gt; lnet_finalize+0x9b/0x2f0 &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;br/&gt;
 1388685619 2014 Jan  2 19:00:19 tcn19 kern warning kernel &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa093b043&amp;gt;&amp;#93;&lt;/span&gt; kiblnd_recv+0x103/0x570 &lt;span class=&quot;error&quot;&gt;&amp;#91;ko2iblnd&amp;#93;&lt;/span&gt;&lt;br/&gt;
 1388685619 2014 Jan  2 19:00:19 tcn19 kern warning kernel &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05a78dd&amp;gt;&amp;#93;&lt;/span&gt; lnet_ni_recv+0xad/0x2f0 &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;br/&gt;
 1388685619 2014 Jan  2 19:00:19 tcn19 kern warning kernel &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05a7c06&amp;gt;&amp;#93;&lt;/span&gt; lnet_recv_put+0xe6/0x120 &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;br/&gt;
 1388685619 2014 Jan  2 19:00:19 tcn19 kern warning kernel &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05aec33&amp;gt;&amp;#93;&lt;/span&gt; lnet_parse+0x1273/0x1b80 &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;br/&gt;
 1388685619 2014 Jan  2 19:00:19 tcn19 kern warning kernel &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa093b77b&amp;gt;&amp;#93;&lt;/span&gt; kiblnd_handle_rx+0x2cb/0x680 &lt;span class=&quot;error&quot;&gt;&amp;#91;ko2iblnd&amp;#93;&lt;/span&gt;&lt;br/&gt;
 1388685619 2014 Jan  2 19:00:19 tcn19 kern warning kernel &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8103d349&amp;gt;&amp;#93;&lt;/span&gt; ? __wake_up_common+0x59/0x90&lt;br/&gt;
 1388685619 2014 Jan  2 19:00:19 tcn19 kern warning kernel &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa093c560&amp;gt;&amp;#93;&lt;/span&gt; kiblnd_rx_complete+0x2d0/0x440 &lt;span class=&quot;error&quot;&gt;&amp;#91;ko2iblnd&amp;#93;&lt;/span&gt;&lt;br/&gt;
 1388685619 2014 Jan  2 19:00:19 tcn19 kern warning kernel &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81042cf3&amp;gt;&amp;#93;&lt;/span&gt; ? __wake_up+0x53/0x70&lt;br/&gt;
 1388685619 2014 Jan  2 19:00:19 tcn19 kern warning kernel &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa093c732&amp;gt;&amp;#93;&lt;/span&gt; kiblnd_complete+0x62/0xe0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ko2iblnd&amp;#93;&lt;/span&gt;&lt;br/&gt;
 1388685619 2014 Jan  2 19:00:19 tcn19 kern warning kernel &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa093cae9&amp;gt;&amp;#93;&lt;/span&gt; kiblnd_scheduler+0x339/0x7a0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ko2iblnd&amp;#93;&lt;/span&gt;&lt;br/&gt;
 1388685619 2014 Jan  2 19:00:19 tcn19 kern warning kernel &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81048df0&amp;gt;&amp;#93;&lt;/span&gt; ? default_wake_function+0x0/0x20&lt;br/&gt;
 1388685619 2014 Jan  2 19:00:19 tcn19 kern warning kernel &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa093c7b0&amp;gt;&amp;#93;&lt;/span&gt; ? kiblnd_scheduler+0x0/0x7a0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ko2iblnd&amp;#93;&lt;/span&gt;&lt;br/&gt;
 1388685619 2014 Jan  2 19:00:19 tcn19 kern warning kernel &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100412a&amp;gt;&amp;#93;&lt;/span&gt; child_rip+0xa/0x20&lt;br/&gt;
 1388685620 2014 Jan  2 19:00:20 tcn19 kern warning kernel &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa093c7b0&amp;gt;&amp;#93;&lt;/span&gt; ? kiblnd_scheduler+0x0/0x7a0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ko2iblnd&amp;#93;&lt;/span&gt;&lt;br/&gt;
 1388685620 2014 Jan  2 19:00:20 tcn19 kern warning kernel &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa093c7b0&amp;gt;&amp;#93;&lt;/span&gt; ? kiblnd_scheduler+0x0/0x7a0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ko2iblnd&amp;#93;&lt;/span&gt;&lt;br/&gt;
 1388685620 2014 Jan  2 19:00:20 tcn19 kern warning kernel &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81004120&amp;gt;&amp;#93;&lt;/span&gt; ? child_rip+0x0/0x20&lt;/p&gt;
</comment>
                            <comment id="74382" author="bfaccini" created="Mon, 6 Jan 2014 10:07:41 +0000"  >&lt;p&gt;Hello Hugo, Can you also upload this new crash-dump (and required vmlinux/lustre-modules additional stuff) with 2.1.6, so I can double-check if we still suffer the same kind of corruption ??&lt;/p&gt;

&lt;p&gt;On the other hand and if still same issue as very strongly expected, I will try to come up with a patch to implement the work-around (move allocation of MEs and small MDs out of &amp;lt;size-128&amp;gt; SLABs kmem-cache) described before.&lt;/p&gt;
</comment>
                            <comment id="74474" author="hugo_meiland" created="Tue, 7 Jan 2014 11:01:07 +0000"  >&lt;p&gt;I&apos;m afraid no crash-dump on this one available, I will send one on the next occurrence...&lt;/p&gt;</comment>
                            <comment id="75664" author="hugo_meiland" created="Mon, 27 Jan 2014 12:27:19 +0000"  >&lt;p&gt;Bruno, Liang: This weekend I was able to obtain kernel coredumps again with this error (after a stable period of 3-4 weeks). As mentioned earlier the lustre version has been upgraded to 2.1.6. The uploaded file is LU4330-tcn82-vmcore-vmlinux-systemmap-weak-updates.zip; please let me know if you need additional files...&lt;/p&gt;</comment>
                            <comment id="75746" author="bfaccini" created="Tue, 28 Jan 2014 09:31:13 +0000"  >&lt;p&gt;Hello Hugo, I will review the new crash-dump you uploaded to verify we still encounter the same kind of corruption.&lt;/p&gt;

&lt;p&gt;BTW, I have pushed a patch that should help to better identify the root cause of the MEs/MDs corruption in Slabs, for both &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3848&quot; title=&quot;Compute node crashes due to error in Lustre : ASSERTION( me == md-&amp;gt;md_me ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3848&quot;&gt;&lt;del&gt;LU-3848&lt;/del&gt;&lt;/a&gt; and this ticket, it is available at &lt;a href=&quot;http://review.whamcloud.com/8819/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/8819/&lt;/a&gt;. It has already been successfully exposed to autotests, and you may want to give it a try. May be Sebastien could also validate it and try to integrate it in Bull distro?&lt;/p&gt;

&lt;p&gt;Since this patch extracts LNET MEs and small-MDs allocation from generic &amp;lt;size-128&amp;gt; kmem_cache, I strongly expect that you will no longer encounter &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3848&quot; title=&quot;Compute node crashes due to error in Lustre : ASSERTION( me == md-&amp;gt;md_me ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3848&quot;&gt;&lt;del&gt;LU-3848&lt;/del&gt;&lt;/a&gt;/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4330&quot; title=&quot;LustreError: 46336:0:(events.c:433:ptlrpc_master_callback()) ASSERTION( callback == request_out_callback || callback == reply_in_callback || callback == client_bulk_callback || callback == request_in_callback || callback == reply_out_callback ... ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4330&quot;&gt;LU-4330&lt;/a&gt; LBUGs (and may be suffer new issues with others Kernel codes using &amp;lt;size-128&amp;gt; &#8230;). But if not, and you are able to reproduce the LBUGs, then I will need to investigate further.&lt;/p&gt;
</comment>
                            <comment id="75853" author="hugo_meiland" created="Wed, 29 Jan 2014 15:12:01 +0000"  >&lt;p&gt;just to keep the info with &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3848&quot; title=&quot;Compute node crashes due to error in Lustre : ASSERTION( me == md-&amp;gt;md_me ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3848&quot;&gt;&lt;del&gt;LU-3848&lt;/del&gt;&lt;/a&gt; in sync:&lt;br/&gt;
Hi Bruno and Liang; &lt;br/&gt;
 @Bruno: Thanks for the patch, I&apos;ve asked Sebastien to prepare it for me to test at the client site. I do have plenty more of crashes available right now, but I&apos;m not sure if this would be a huge help at this time.&lt;br/&gt;
 @Liang: Your patch has been integrated to our tree, but just after our upgrade to 2.1.6, so I&apos;m not running this patch yet. It will be in the next lustre package.&lt;br/&gt;
 I&apos;ll try to give you feedback on these as soon as possible.&lt;/p&gt;</comment>
                            <comment id="75925" author="bfaccini" created="Thu, 30 Jan 2014 09:48:33 +0000"  >&lt;p&gt;Hello Hugo,&lt;br/&gt;
Thanks again for your help on this!&lt;br/&gt;
I just had a look into the recent crash-dump you uploaded in LU4330-tcn82-vmcore-vmlinux-systemmap-weak-updates.zip, and I can confirm this the same scenario, with the same/-3 corruption of MD&apos;s md_user_ptr field/pointer.&lt;/p&gt;</comment>
                            <comment id="76558" author="bfaccini" created="Sun, 9 Feb 2014 17:15:31 +0000"  >&lt;p&gt;I need to indicate that after successful auto-tests, I made local extensive testing of &lt;a href=&quot;http://review.whamcloud.com/8819/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/8819/&lt;/a&gt; and I can confirm it works as expected and with no MEs/MDs leak in Slabs.&lt;/p&gt;</comment>
                            <comment id="78457" author="hugo_meiland" created="Wed, 5 Mar 2014 15:01:18 +0000"  >&lt;p&gt;As an update: the fix has now been installed for a couple of weeks and no more of these LBUG&apos;s have been seen, so the fix looks good...&lt;/p&gt;</comment>
                            <comment id="78697" author="bfaccini" created="Fri, 7 Mar 2014 14:09:29 +0000"  >&lt;p&gt;Hello Hugo, thanks for your feedback!&lt;br/&gt;
The fact that both asserts for this ticket and &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3848&quot; title=&quot;Compute node crashes due to error in Lustre : ASSERTION( me == md-&amp;gt;md_me ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3848&quot;&gt;&lt;del&gt;LU-3848&lt;/del&gt;&lt;/a&gt; disappeared seems to confirm what I suspected from the beginning, that LNET/Lustre were not involved with the corruptions but only victims. BTW, did you notice any other sub-system (networking, ...) disfunction since ?&lt;/p&gt;

&lt;p&gt;On the other-hand I am now working on a more generic (not based on hard-coded 128 bytes length) patch version to address Liang&apos;s last input, and also answer to Isaac comments too. And also push a master version as Isaac requested.&lt;/p&gt;</comment>
                            <comment id="79658" author="hugo_meiland" created="Wed, 19 Mar 2014 13:57:09 +0000"  >&lt;p&gt;Hi Bruno, no disfunction seen, the fix seems to solve the issue completely; thanks! Hugo&lt;/p&gt;</comment>
                            <comment id="143435" author="gerrit" created="Tue, 23 Feb 2016 19:39:59 +0000"  >&lt;p&gt;Faccini Bruno (bruno.faccini@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/18586&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/18586&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4330&quot; title=&quot;LustreError: 46336:0:(events.c:433:ptlrpc_master_callback()) ASSERTION( callback == request_out_callback || callback == reply_in_callback || callback == client_bulk_callback || callback == request_in_callback || callback == reply_out_callback ... ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4330&quot;&gt;LU-4330&lt;/a&gt; lnet: Allocate MEs and small MDs in own kmem_caches&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 11e4b34d75505476f363c1cf4400755a7f30f766&lt;/p&gt;</comment>
                            <comment id="143630" author="adilger" created="Wed, 24 Feb 2016 18:31:39 +0000"  >&lt;p&gt;The scary thing is that there would continue to be random memory corruptions in the size-128 slab, but they will just be corrupting some other part of memory.&lt;/p&gt;

&lt;p&gt;If this problem can be found in a relatively short amount of testing time, then there are debugging patches available that could be applied to the kernel to make all kmalloc() calls actually map to vmalloc() internally and have vmalloc() always use a new memory address, and then when the memory is freed the page is unmapped and the address never used again.  If another thread is incorrectly accessing an unmapped address (use after free) it will fault and then the source of the corruption may be found.  Unfortunately, this impacts the performance and can only be used for debugging and not in production.&lt;/p&gt;

&lt;p&gt;Patches are available in &lt;a href=&quot;https://bugzilla.lustre.org/show_bug.cgi?id=22471&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://bugzilla.lustre.org/show_bug.cgi?id=22471&lt;/a&gt; but the would likely need to be updated for newer kernels.  They can definitely help find memory corruption problems that are otherwise very difficult to find.&lt;/p&gt;</comment>
                            <comment id="143639" author="adilger" created="Wed, 24 Feb 2016 19:23:11 +0000"  >&lt;p&gt;Bruno, if you update those patches for newer kernels, please submit the new patches into lustre/kernel_patches/patches so they are available for use in the future, since bugzilla may disappear at some point.&lt;/p&gt;</comment>
                            <comment id="147514" author="simmonsja" created="Thu, 31 Mar 2016 23:48:46 +0000"  >&lt;p&gt;I think I might know what the problem is. A recent patch for ko2iblnd in the upstream kernel landed that exposed a serious memory corruption.&lt;br/&gt;
The commit is 3d1477309806459d39e13d8c3206ba35d183c34a &quot;Replace sg++ with sg = sg_next(sg)&quot; The scatter gather list is from &lt;br/&gt;
tx-&amp;gt;tx_frags which is IBLND_MAX_RDMA_FRAGS in size. Since you write at an offset into the tx_frags that mean you really need  IBLND_MAX_RDMA_FRAGS + 1 in size for the frags. Currently the upstream clients will crash when you attempt to access scatter list entry IBLND_MAX_RDMA_FRAGS + 1.&lt;/p&gt;</comment>
                            <comment id="148833" author="bfaccini" created="Wed, 13 Apr 2016 22:28:19 +0000"  >&lt;p&gt;James,&lt;br/&gt;
Thanks for your input.&lt;br/&gt;
But well, I can not tell for this old case/crash, but on my side I have seen recent &amp;lt;size-128&amp;gt; Slabs corruption cases that are ext4/ldiskfs related (see &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7980&quot; title=&quot;Overrun in generic &amp;lt;size-128&amp;gt; kmem_cache Slabs causing OSS to crash&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7980&quot;&gt;&lt;del&gt;LU-7980&lt;/del&gt;&lt;/a&gt;).&lt;/p&gt;</comment>
                            <comment id="154021" author="gerrit" created="Tue, 31 May 2016 04:56:24 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;http://review.whamcloud.com/18586/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/18586/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4330&quot; title=&quot;LustreError: 46336:0:(events.c:433:ptlrpc_master_callback()) ASSERTION( callback == request_out_callback || callback == reply_in_callback || callback == client_bulk_callback || callback == request_in_callback || callback == reply_out_callback ... ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4330&quot;&gt;LU-4330&lt;/a&gt; lnet: Allocate MEs and small MDs in own kmem_caches&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 9d9bb678d6b3707623845e0ce67dd7fd07a12fe9&lt;/p&gt;</comment>
                            <comment id="154064" author="pjones" created="Tue, 31 May 2016 12:50:24 +0000"  >&lt;p&gt;Landed for 2.9&lt;/p&gt;</comment>
                            <comment id="155355" author="simmonsja" created="Fri, 10 Jun 2016 15:13:51 +0000"  >&lt;p&gt;Attempted to merge the patch for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4330&quot; title=&quot;LustreError: 46336:0:(events.c:433:ptlrpc_master_callback()) ASSERTION( callback == request_out_callback || callback == reply_in_callback || callback == client_bulk_callback || callback == request_in_callback || callback == reply_out_callback ... ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4330&quot;&gt;LU-4330&lt;/a&gt; upstream and it was rejected since according to Greg it will not fix the problem. See email&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;http://driverdev.linuxdriverproject.org/pipermail/driverdev-devel/2016-June/090787.html&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://driverdev.linuxdriverproject.org/pipermail/driverdev-devel/2016-June/090787.html&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Bruno can you reproduce this problem? Have you tried debugging with slub?&lt;/p&gt;</comment>
                            <comment id="157668" author="bfaccini" created="Tue, 5 Jul 2016 15:59:23 +0000"  >&lt;p&gt;James, sorry to be late to comment/answer you.&lt;br/&gt;
In fact, I should have re-phrased my commit msg before submitting patch upstream, seems that Greg thought that this patch has been created to fix the corruption issues, when it has been created to pursue with the corruption isolation, by first avoiding LNet MEs/small-MDs to be impacted. Then the idea came up that moving LNet MEs/small-MDs to a specific kmem_cache has been developed and seen as a possible performance enhancement.&lt;br/&gt;
BTW, Greg seems curious about getting some numbers/stats about this enhancement!&lt;/p&gt;</comment>
                            <comment id="161002" author="pjones" created="Fri, 5 Aug 2016 23:23:58 +0000"  >&lt;p&gt;Regardless of what upstream think of this fix, I think that we no longer need to track it for 2.9 so I will remove the fix version&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="20660">LU-3848</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="37956">LU-8362</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzwacn:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>11850</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>