<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:29:49 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-2967] list_del corruption - client crashes</title>
                <link>https://jira.whamcloud.com/browse/LU-2967</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;After multiple hours of SWL runs, multiple client crashes. &lt;br/&gt;
Example one &lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;2013-03-14 06:13:47 ------------[ cut here ]------------
2013-03-14 06:13:47 WARNING: at lib/list_debug.c:51 list_del+0x8d/0xa0() (Tainted: G        W  ---------------   )
2013-03-14 06:13:47 Hardware name: XS23-TY
2013-03-14 06:13:47 list_del corruption. next-&amp;gt;prev should be ffff8801aee8bc50, but was 0504000006000001
2013-03-14 06:13:47 Modules linked in: lmv(U) mgc(U) lustre(U) lov(U) osc(U) lquota(U) mdc(U) fid(U) fld(U) ko2iblnd(U) ptlrpc(U) obdclass(U) lnet(U) lvfs(U) libcfs(U) acpi_cpufreq freq_table mperf ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm ib_addr ib_sa mlx4_ib ib_mad ib_core dm_mirror dm_region_hash dm_log dm_mod vhost_net macvtap macvlan tun kvm dcdbas i2c_i801 i2c_core ahci iTCO_wdt iTCO_vendor_support ioatdma dca i7core_edac edac_core shpchp ipv6 nfs lockd fscache nfs_acl auth_rpcgss sunrpc mlx4_en mlx4_core e1000e [last unloaded: cpufreq_ondemand]
2013-03-14 06:13:47 Pid: 3160, comm: ipoib Tainted: G        W  ---------------    2.6.32-279.22.1.el6.x86_64 #1
2013-03-14 06:13:47 Call Trace:
2013-03-14 06:13:47  [&amp;lt;ffffffff8106a2a7&amp;gt;] ? warn_slowpath_common+0x87/0xc0
2013-03-14 06:13:47  [&amp;lt;ffffffff8106a396&amp;gt;] ? warn_slowpath_fmt+0x46/0x50
2013-03-14 06:13:47  [&amp;lt;ffffffff81279f0d&amp;gt;] ? list_del+0x8d/0xa0
2013-03-14 06:13:47  [&amp;lt;ffffffffa0347619&amp;gt;] ? ipoib_cm_tx_reap+0xc9/0x510 [ib_ipoib]
2013-03-14 06:13:47  [&amp;lt;ffffffffa0347550&amp;gt;] ? ipoib_cm_tx_reap+0x0/0x510 [ib_ipoib]
2013-03-14 06:13:47  [&amp;lt;ffffffff8108b370&amp;gt;] ? worker_thread+0x170/0x2a0
2013-03-14 06:13:47  [&amp;lt;ffffffff81090be0&amp;gt;] ? autoremove_wake_function+0x0/0x40
2013-03-14 06:13:47  [&amp;lt;ffffffff8108b200&amp;gt;] ? worker_thread+0x0/0x2a0
2013-03-14 06:13:47  [&amp;lt;ffffffff81090876&amp;gt;] ? kthread+0x96/0xa0
2013-03-14 06:13:47  [&amp;lt;ffffffff8100c0ca&amp;gt;] ? child_rip+0xa/0x20
2013-03-14 06:13:47  [&amp;lt;ffffffff810907e0&amp;gt;] ? kthread+0x0/0xa0
2013-03-14 06:13:47  [&amp;lt;ffffffff8100c0c0&amp;gt;] ? child_rip+0x0/0x20
2013-03-14 06:13:47 ---[ end trace e1288d85056fd00d ]---
2013-03-14 06:13:47 BUG: unable to handle kernel NULL pointer dereference at 0000000000000008
2013-03-14 06:13:47 IP: [&amp;lt;ffffffff81279e9b&amp;gt;] list_del+0x1b/0xa0
2013-03-14 06:13:47 PGD 174282067 PUD 145d8f067 PMD 0
2013-03-14 06:13:47 Oops: 0000 [#1] SMP
2013-03-14 06:13:47 last sysfs file: /sys/devices/pci0000:00/0000:00:01.0/0000:01:00.0/net/eth1/statistics/tx_errors
2013-03-14 06:13:47 CPU 2
2013-03-14 06:13:47 Modules linked in: lmv(U) mgc(U) lustre(U) lov(U) osc(U) lquota(U) mdc(U) fid(U) fld(U) ko2iblnd(U) ptlrpc(U) obdclass(U) lnet(U) lvfs(U) libcfs(U) acpi_cpufreq freq_table mperf ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm ib_addr ib_sa mlx4_ib ib_mad ib_core dm_mirror dm_region_hash dm_log dm_mod vhost_net macvtap macvlan tun kvm dcdbas i2c_i801 i2c_core ahci iTCO_wdt iTCO_vendor_support ioatdma dca i7core_edac edac_core shpchp ipv6 nfs lockd fscache nfs_acl auth_rpcgss sunrpc mlx4_en mlx4_core e1000e [last unloaded: cpufreq_ondemand]
2013-03-14 06:13:47
2013-03-14 06:13:47 Pid: 3160, comm: ipoib Tainted: G        W  ---------------    2.6.32-279.22.1.el6.x86_64 #1 Dell        XS23-TY     /XS23-TY
2013-03-14 06:13:47 RIP: 0010:[&amp;lt;ffffffff81279e9b&amp;gt;]  [&amp;lt;ffffffff81279e9b&amp;gt;] list_del+0x1b/0xa0
2013-03-14 06:13:47 RSP: 0018:ffff880339053db0  EFLAGS: 00010046
2013-03-14 06:13:47 RAX: 0000000000000000 RBX: ffff8801b082f8d0 RCX: 0000000000004aef
2013-03-14 06:13:47 RDX: 0000000000000246 RSI: ffff8801bb8444d0 RDI: ffff8801b082f8d0
2013-03-14 06:13:47 RBP: ffff880339053dc0 R08: ffff8801b082f8d0 R09: 0000000000000000
2013-03-14 06:13:47 R10: ffff8801c0065680 R11: 0000000000000000 R12: ffff8801ba034020
2013-03-14 06:13:47 R13: 0000000000000246 R14: ffff8801ba697e80 R15: ffff8801ba0346e0
2013-03-14 06:13:47 FS:  0000000000000000(0000) GS:ffff880028240000(0000) knlGS:0000000000000000
2013-03-14 06:13:47 CS:  0010 DS: 0018 ES: 0018 CR0: 000000008005003b
2013-03-14 06:13:47 CR2: 0000000000000008 CR3: 00000001a4639000 CR4: 00000000000006e0
2013-03-14 06:13:47 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
2013-03-14 06:13:47 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
2013-03-14 06:13:47 &lt;span class=&quot;code-object&quot;&gt;Process&lt;/span&gt; ipoib (pid: 3160, threadinfo ffff880339052000, task ffff880339256040)
2013-03-14 06:13:47 Stack:
2013-03-14 06:13:47  0000000109b77ac5 ffff8801b082f8c0 ffff880339053e30 ffffffffa0347619
2013-03-14 06:13:47 &amp;lt;d&amp;gt; ffff88033c1acaa0 ffff880339256040 ffff8801ba0352e8 ffff8801ba034340
2013-03-14 06:13:47 &amp;lt;d&amp;gt; ffff880339053e30 ffffffff00000002 ffffe8fe62609a40 ffffe8fe62609a40
2013-03-14 06:13:47 Call Trace:
2013-03-14 06:13:47  [&amp;lt;ffffffffa0347619&amp;gt;] ipoib_cm_tx_reap+0xc9/0x510 [ib_ipoib]
2013-03-14 06:13:47  [&amp;lt;ffffffffa0347550&amp;gt;] ? ipoib_cm_tx_reap+0x0/0x510 [ib_ipoib]
2013-03-14 06:13:47  [&amp;lt;ffffffff8108b370&amp;gt;] worker_thread+0x170/0x2a0
2013-03-14 06:13:47  [&amp;lt;ffffffff81090be0&amp;gt;] ? autoremove_wake_function+0x0/0x40
2013-03-14 06:13:47  [&amp;lt;ffffffff8108b200&amp;gt;] ? worker_thread+0x0/0x2a0
2013-03-14 06:13:47  [&amp;lt;ffffffff81090876&amp;gt;] kthread+0x96/0xa0
2013-03-14 06:13:47  [&amp;lt;ffffffff8100c0ca&amp;gt;] child_rip+0xa/0x20
2013-03-14 06:13:47  [&amp;lt;ffffffff810907e0&amp;gt;] ? kthread+0x0/0xa0
2013-03-14 06:13:47  [&amp;lt;ffffffff8100c0c0&amp;gt;] ? child_rip+0x0/0x20
2013-03-14 06:13:47 Code: 4c 8b ad e8 fe ff ff e9 db fd ff ff 90 90 90 90 55 48 89 e5 53 48 89 fb 48 83 ec 08 48 8b 47 08 4c 8b 00 4c 39 c7 75 39 48 8b 03 &amp;lt;4c&amp;gt; 8b 40 08 4c 39 c3 75 4c 48 8b 53 08 48 89 50 08 48 89 02 48
2013-03-14 06:13:47 RIP  [&amp;lt;ffffffff81279e9b&amp;gt;] list_del+0x1b/0xa0
2013-03-14 06:13:47  RSP &amp;lt;ffff880339053db0&amp;gt;
2013-03-14 06:13:47 CR2: 0000000000000008
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Second Example:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;2013-03-14 07:15:50 ------------[ cut here ]------------
2013-03-14 07:15:50 WARNING: at lib/list_debug.c:30 __list_add+0x8f/0xa0() (Tainted: G        W  ---------------   )
2013-03-14 07:15:50 Hardware name: XS23-TY
2013-03-14 07:15:50 list_add corruption. prev-&amp;gt;next should be next (ffff8801af5ed2d0), but was ffff88033b3addd0. (prev=ffff8801ba25f2e8).
2013-03-14 07:15:50 Modules linked in: lmv(U) mgc(U) lustre(U) lov(U) osc(U) lquota(U) mdc(U) fid(U) fld(U) ko2iblnd(U) ptlrpc(U) obdclass(U) lnet(U) lvfs(U) libcfs(U) acpi_cpufreq freq_table mperf ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm ib_addr ib_sa mlx4_ib ib_mad ib_core dm_mirror dm_region_hash dm_log dm_mod vhost_net macvtap macvlan tun kvm dcdbas iTCO_wdt iTCO_vendor_support i2c_i801 i2c_core ahci i7core_edac edac_core ioatdma dca shpchp ipv6 nfs lockd fscache nfs_acl auth_rpcgss sunrpc mlx4_en mlx4_core e1000e [last unloaded: cpufreq_ondemand]
2013-03-14 07:15:50 Pid: 4328, comm: kiblnd_sd_07 Tainted: G        W  ---------------    2.6.32-279.22.1.el6.x86_64 #1
2013-03-14 07:15:50 Call Trace:
2013-03-14 07:15:50  &amp;lt;IRQ&amp;gt;  [&amp;lt;ffffffff8106a2a7&amp;gt;] ? warn_slowpath_common+0x87/0xc0
2013-03-14 07:15:50  [&amp;lt;ffffffff8106a396&amp;gt;] ? warn_slowpath_fmt+0x46/0x50
2013-03-14 07:15:50  [&amp;lt;ffffffff81279faf&amp;gt;] ? __list_add+0x8f/0xa0
2013-03-14 07:15:50  [&amp;lt;ffffffffa033fb7e&amp;gt;] ? ipoib_cm_destroy_tx+0x6e/0xc0 [ib_ipoib]
2013-03-14 07:15:50  [&amp;lt;ffffffffa0337b39&amp;gt;] ? ipoib_neigh_dtor+0x89/0xf0 [ib_ipoib]
2013-03-14 07:15:50  [&amp;lt;ffffffffa0337bc8&amp;gt;] ? ipoib_neigh_reclaim+0x28/0x30 [ib_ipoib]
2013-03-14 07:15:50  [&amp;lt;ffffffff810de635&amp;gt;] ? __rcu_process_callbacks+0x135/0x350
2013-03-14 07:15:50  [&amp;lt;ffffffff81012a69&amp;gt;] ? read_tsc+0x9/0x20
2013-03-14 07:15:50  [&amp;lt;ffffffff810de87b&amp;gt;] ? rcu_process_callbacks+0x2b/0x50
2013-03-14 07:15:50  [&amp;lt;ffffffff81072ac1&amp;gt;] ? __do_softirq+0xc1/0x1e0
2013-03-14 07:15:50  [&amp;lt;ffffffff81095760&amp;gt;] ? hrtimer_interrupt+0x140/0x250
2013-03-14 07:15:50  [&amp;lt;ffffffff8100c1cc&amp;gt;] ? call_softirq+0x1c/0x30
2013-03-14 07:15:50  [&amp;lt;ffffffff8100de05&amp;gt;] ? do_softirq+0x65/0xa0
2013-03-14 07:15:50  [&amp;lt;ffffffff810728a5&amp;gt;] ? irq_exit+0x85/0x90
2013-03-14 07:15:50  [&amp;lt;ffffffff814f2360&amp;gt;] ? smp_apic_timer_interrupt+0x70/0x9b
2013-03-14 07:15:50  [&amp;lt;ffffffff8100bb93&amp;gt;] ? apic_timer_interrupt+0x13/0x20
2013-03-14 07:15:50  &amp;lt;EOI&amp;gt;  [&amp;lt;ffffffff814ec947&amp;gt;] ? _spin_unlock_irqrestore+0x17/0x20
2013-03-14 07:15:50  [&amp;lt;ffffffffa0322a46&amp;gt;] ? mlx4_ib_poll_cq+0x2c6/0x7f0 [mlx4_ib]
2013-03-14 07:15:50  [&amp;lt;ffffffffa07a4478&amp;gt;] ? kiblnd_scheduler+0xf8/0x760 [ko2iblnd]
2013-03-14 07:15:50  [&amp;lt;ffffffff8105fa40&amp;gt;] ? default_wake_function+0x0/0x20
2013-03-14 07:15:50  [&amp;lt;ffffffffa07a4380&amp;gt;] ? kiblnd_scheduler+0x0/0x760 [ko2iblnd]
2013-03-14 07:15:50  [&amp;lt;ffffffff8100c0ca&amp;gt;] ? child_rip+0xa/0x20
2013-03-14 07:15:50  [&amp;lt;ffffffffa07a4380&amp;gt;] ? kiblnd_scheduler+0x0/0x760 [ko2iblnd]
2013-03-14 07:15:50  [&amp;lt;ffffffffa07a4380&amp;gt;] ? kiblnd_scheduler+0x0/0x760 [ko2iblnd]
2013-03-14 07:15:50  [&amp;lt;ffffffff8100c0c0&amp;gt;] ? child_rip+0x0/0x20
2013-03-14 07:15:50 ---[ end trace ceec6f0d4be48403 ]---
2013-03-14 07:15:50 general protection fault: 0000 [#1] SMP
2013-03-14 07:15:50 last sysfs file: /sys/devices/virtual/dmi/id/sys_vendor
2013-03-14 07:15:50 CPU 0
2013-03-14 07:15:50 Modules linked in: lmv(U) mgc(U) lustre(U) lov(U) osc(U) lquota(U) mdc(U) fid(U) fld(U) ko2iblnd(U) ptlrpc(U) obdclass(U) lnet(U) lvfs(U) libcfs(U) acpi_cpufreq freq_table mperf ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm ib_addr ib_sa mlx4_ib ib_mad ib_core dm_mirror dm_region_hash dm_log dm_mod vhost_net macvtap macvlan tun kvm dcdbas iTCO_wdt iTCO_vendor_support i2c_i801 i2c_core ahci i7core_edac edac_core ioatdma dca shpchp ipv6 nfs lockd fscache nfs_acl auth_rpcgss sunrpc mlx4_en mlx4_core e1000e [last unloaded: cpufreq_ondemand]
2013-03-14 07:15:50
2013-03-14 07:15:50 Pid: 3208, comm: ipoib Tainted: G        W  ---------------    2.6.32-279.22.1.el6.x86_64 #1 Dell        XS23-TY     /XS23-TY
2013-03-14 07:15:50 RIP: 0010:[&amp;lt;ffffffff81279e9b&amp;gt;]  [&amp;lt;ffffffff81279e9b&amp;gt;] list_del+0x1b/0xa0
2013-03-14 07:15:50 RSP: 0018:ffff8801bba1ddb0  EFLAGS: 00010046
2013-03-14 07:15:50 RAX: dead000000100100 RBX: ffff8801af5ed2d0 RCX: 000000000000b9d4
2013-03-14 07:15:50 RDX: 0000000000000246 RSI: ffff8801bfe979d0 RDI: ffff8801af5ed2d0
2013-03-14 07:15:50 RBP: ffff8801bba1ddc0 R08: ffff8801af5ed2d0 R09: 0000000000000000
2013-03-14 07:15:50 R10: ffff8801c0065880 R11: 0000000000000000 R12: ffff8801ba25e020
2013-03-14 07:15:50 R13: 0000000000000246 R14: ffff8801ba021400 R15: ffff8801ba25e6e0
2013-03-14 07:15:50 FS:  0000000000000000(0000) GS:ffff880028200000(0000) knlGS:0000000000000000
2013-03-14 07:15:50 CS:  0010 DS: 0018 ES: 0018 CR0: 000000008005003b
2013-03-14 07:15:50 CR2: 00002aaab80041f8 CR3: 0000000175615000 CR4: 00000000000006f0
2013-03-14 07:15:50 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
2013-03-14 07:15:50 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
2013-03-14 07:15:50 &lt;span class=&quot;code-object&quot;&gt;Process&lt;/span&gt; ipoib (pid: 3208, threadinfo ffff8801bba1c000, task ffff8801bb536080)
2013-03-14 07:15:50 Stack:
2013-03-14 07:15:50  0000000109f05306 ffff8801af5ed2c0 ffff8801bba1de30 ffffffffa0340619
2013-03-14 07:15:50 &amp;lt;d&amp;gt; ffffffff81a8d020 ffff8801bb536080 ffff8801ba25f2e8 ffff8801ba25e340
2013-03-14 07:15:50 &amp;lt;d&amp;gt; 00000078bba1de30 0000000000000000 ffff8801bba1de10 ffffe8fe62609a40
2013-03-14 07:15:50 Call Trace:
2013-03-14 07:15:50  [&amp;lt;ffffffffa0340619&amp;gt;] ipoib_cm_tx_reap+0xc9/0x510 [ib_ipoib]
2013-03-14 07:15:50  [&amp;lt;ffffffffa0340550&amp;gt;] ? ipoib_cm_tx_reap+0x0/0x510 [ib_ipoib]
2013-03-14 07:15:50  [&amp;lt;ffffffff8108b370&amp;gt;] worker_thread+0x170/0x2a0
2013-03-14 07:15:50  [&amp;lt;ffffffff81090be0&amp;gt;] ? autoremove_wake_function+0x0/0x40
2013-03-14 07:15:50  [&amp;lt;ffffffff8108b200&amp;gt;] ? worker_thread+0x0/0x2a0
2013-03-14 07:15:50  [&amp;lt;ffffffff81090876&amp;gt;] kthread+0x96/0xa0
2013-03-14 07:15:50  [&amp;lt;ffffffff8100c0ca&amp;gt;] child_rip+0xa/0x20
2013-03-14 07:15:50  [&amp;lt;ffffffff810907e0&amp;gt;] ? kthread+0x0/0xa0
2013-03-14 07:15:50  [&amp;lt;ffffffff8100c0c0&amp;gt;] ? child_rip+0x0/0x20
2013-03-14 07:15:50 Code: 4c 8b ad e8 fe ff ff e9 db fd ff ff 90 90 90 90 55 48 89 e5 53 48 89 fb 48 83 ec 08 48 8b 47 08 4c 8b 00 4c 39 c7 75 39 48 8b 03 &amp;lt;4c&amp;gt; 8b 40 08 4c 39 c3 75 4c 48 8b 53 08 48 89 50 08 48 89 02 48
2013-03-14 07:15:50 RIP  [&amp;lt;ffffffff81279e9b&amp;gt;] list_del+0x1b/0xa0
2013-03-14 07:15:50  RSP &amp;lt;ffff8801bba1ddb0&amp;gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment>Hyperion/LLNL - SWL testing</environment>
        <key id="17901">LU-2967</key>
            <summary>list_del corruption - client crashes</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="ys">Yang Sheng</assignee>
                                    <reporter username="cliffw">Cliff White</reporter>
                        <labels>
                            <label>mq213</label>
                    </labels>
                <created>Thu, 14 Mar 2013 12:33:38 +0000</created>
                <updated>Wed, 19 Jun 2013 10:00:16 +0000</updated>
                            <resolved>Wed, 19 Jun 2013 09:59:49 +0000</resolved>
                                    <version>Lustre 2.1.5</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>10</watches>
                                                                            <comments>
                            <comment id="54073" author="green" created="Fri, 15 Mar 2013 00:35:12 +0000"  >&lt;p&gt;this is really a crash in o2ib driver itself, nothing to do with Lustre I suspect.&lt;/p&gt;</comment>
                            <comment id="54083" author="adilger" created="Fri, 15 Mar 2013 01:53:47 +0000"  >&lt;p&gt;Unless the problem is due to memory corruption (use after free) or similar, though if it was in ipoib for all of the clients this is definitely not even related to o2iblnd.&lt;/p&gt;

&lt;p&gt;Did we update OFED recently by any chance?  I recall seeing some patches for OFED, but I have no idea if this is relevant for 2.1.5.&lt;/p&gt;

&lt;p&gt;Maybe worthwhile to ask LLNL if there was some hiccup on the IB fabric and if they have seen this problem before?&lt;/p&gt;</comment>
                            <comment id="54506" author="mdiep" created="Wed, 20 Mar 2013 20:08:56 +0000"  >&lt;p&gt;Cliff, what is the last known good kernel that passed this? and what is it now?&lt;/p&gt;</comment>
                            <comment id="54507" author="green" created="Wed, 20 Mar 2013 20:25:32 +0000"  >&lt;p&gt;Looking at the changelog for 279.22.1.el6 that introduced this I see:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;BZ#880085&lt;br/&gt;
    Previously, the IP over Infiniband (IPoIB) driver maintained state information about neighbors on the network by attaching it to the core network&apos;s neighbor structure. However, due to a race condition between the freeing of the core network neighbor struct and the freeing of the IPoIB network struct, a use after free condition could happen, resulting in either a kernel oops or 4 or 8 bytes of kernel memory being zeroed when it was not supposed to be. These patches decouple the IPoIB neighbor struct from the core networking stack&apos;s neighbor struct so that there is no race between the freeing of one and the freeing of the other. &lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;So this must be it, the failure is in neighbor handling code, but I do not have enough permissions in RH bz to check the patch.&lt;br/&gt;
I think it&apos;s time to file a bug for RH.&lt;br/&gt;
We first hit it going from lnxrel=&quot;279.14.1.el6&quot; to lnxrel=&quot;279.22.1.el6&quot;&lt;/p&gt;</comment>
                            <comment id="54517" author="cliffw" created="Wed, 20 Mar 2013 22:10:56 +0000"  >&lt;p&gt;279.14.1 would be the last kernel that passed.&lt;/p&gt;</comment>
                            <comment id="54519" author="mdiep" created="Wed, 20 Mar 2013 22:29:54 +0000"  >&lt;p&gt;have you run the same test on master which has version 279.19.1?&lt;/p&gt;</comment>
                            <comment id="54520" author="mdiep" created="Wed, 20 Mar 2013 22:32:12 +0000"  >&lt;p&gt;the changes around that function in the ipoib_cm.c between 14.1 and 22.1 are&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@fat-amd-4 infiniband&amp;#93;&lt;/span&gt;# diff ulp/ipoib/ipoib_cm.c /root/kernel14/linux-2.6.32-279.14.1.el6/drivers/infiniband/ulp/ipoib/ipoib_cm.c &lt;br/&gt;
812c812,814&lt;br/&gt;
&amp;lt;                       ipoib_neigh_free(neigh);&lt;br/&gt;
&amp;#8212;&lt;br/&gt;
&amp;gt;                       if (neigh-&amp;gt;ah)&lt;br/&gt;
&amp;gt;                               ipoib_put_ah(neigh-&amp;gt;ah);&lt;br/&gt;
&amp;gt;                       ipoib_neigh_free(dev, neigh);&lt;br/&gt;
1229c1231,1233&lt;br/&gt;
&amp;lt;                       ipoib_neigh_free(neigh);&lt;br/&gt;
&amp;#8212;&lt;br/&gt;
&amp;gt;                       if (neigh-&amp;gt;ah)&lt;br/&gt;
&amp;gt;                               ipoib_put_ah(neigh-&amp;gt;ah);&lt;br/&gt;
&amp;gt;                       ipoib_neigh_free(dev, neigh);&lt;br/&gt;
1276c1280&lt;br/&gt;
&amp;lt;                         tx-&amp;gt;neigh-&amp;gt;daddr + 4);&lt;br/&gt;
&amp;#8212;&lt;br/&gt;
&amp;gt;                         tx-&amp;gt;neigh-&amp;gt;dgid.raw);&lt;br/&gt;
1301c1305&lt;br/&gt;
&amp;lt;               qpn = IPOIB_QPN(neigh-&amp;gt;daddr);&lt;br/&gt;
&amp;#8212;&lt;br/&gt;
&amp;gt;               qpn = IPOIB_QPN(neigh-&amp;gt;neighbour-&amp;gt;ha);&lt;br/&gt;
1317c1321,1323&lt;br/&gt;
&amp;lt;                               ipoib_neigh_free(neigh);&lt;br/&gt;
&amp;#8212;&lt;br/&gt;
&amp;gt;                               if (neigh-&amp;gt;ah)&lt;br/&gt;
&amp;gt;                                       ipoib_put_ah(neigh-&amp;gt;ah);&lt;br/&gt;
&amp;gt;                               ipoib_neigh_free(dev, neigh);&lt;/p&gt;</comment>
                            <comment id="54524" author="cliffw" created="Wed, 20 Mar 2013 23:16:37 +0000"  >&lt;p&gt;Yes, the test failing is SWL which is run routinely.&lt;/p&gt;</comment>
                            <comment id="54615" author="green" created="Thu, 21 Mar 2013 22:13:23 +0000"  >&lt;p&gt;RedHat bug (confirmed, with a reference to fix): &lt;a href=&quot;https://bugzilla.redhat.com/show_bug.cgi?id=913645&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://bugzilla.redhat.com/show_bug.cgi?id=913645&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="55253" author="pjones" created="Tue, 2 Apr 2013 05:13:19 +0000"  >&lt;p&gt;Yangsheng&lt;/p&gt;

&lt;p&gt;Please confirm when a kernel update exists which fixes this Red Hat bug&lt;/p&gt;

&lt;p&gt;thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="55259" author="ys" created="Tue, 2 Apr 2013 08:22:18 +0000"  >&lt;p&gt;The latest 2.6.32-358.2.1.el6 still does not include the fix (upstream fa16ebed31f336e41970f3f0ea9e8279f6be2d27).&lt;/p&gt;</comment>
                            <comment id="55575" author="green" created="Fri, 5 Apr 2013 03:32:46 +0000"  >&lt;p&gt;Change to pull in the upstream fix while RedHat waits for fix effectiveness confirmation master version is at &lt;a href=&quot;http://review.whamcloud.com/5952&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5952&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Also, I just realized that we are not really sure if master is good enough to withstand SWL run at this time, so I made a b2_1 patch too: &lt;a href=&quot;http://review.whamcloud.com/5953&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5953&lt;/a&gt; (it reverts back to the problematic commit that was used originally for this bugreport, but with the fix added on top).&lt;/p&gt;</comment>
                            <comment id="60854" author="ys" created="Wed, 19 Jun 2013 09:59:49 +0000"  >&lt;p&gt;The 2.6.32-358.11.1.el6 update already includes this fix (&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3461&quot; title=&quot;Kernel update [RHEL6.4 2.6.32-358.11.1.el6] &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3461&quot;&gt;&lt;del&gt;LU-3461&lt;/del&gt;&lt;/a&gt;), so closing this one.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="19390">LU-3461</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="16904">LU-2473</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvlef:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>7232</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>