<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:19:47 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-1799] Oops: Kernel access of bad area with IPv6 address</title>
                <link>https://jira.whamcloud.com/browse/LU-1799</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;IBM reported this kernel panic on their BGQ IO node when loading the ptlrpc module with an o2ib network.  The IB interface had an IPv4 and IPv6 address.  Removing the IPv6 address avoided the crash.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;{0}.1.0: Unable to handle kernel paging request for data at address 0x00000138
{0}.1.0: Faulting instruction address: 0xc0000000002cb518
{0}.1.0: Oops: Kernel access of bad area, sig: 11 [#1]
{0}.1.0: SMP NR_CPUS=68 Blue Gene/Q
{0}.1.0: Modules linked in: ko2iblnd(U) ptlrpc(+)(U) obdclass(U) lnet(U) lvfs(U) libcfs(U)
{0}.1.0: NIP: c0000000002cb518 LR: 80000000035f82a8 CTR: c0000000002cb4fc
{0}.1.0: REGS: c0000003e4c27290 TRAP: 0300   Not tainted  (2.6.32-220.23.3.bgq.el6_V1R1M2_0.ppc64)
{0}.1.0: MSR: 0000000080029000 &amp;lt;EE,ME,CE&amp;gt;  CR: 24228480  XER: 20000000
{0}.1.0: DEAR: 0000000000000138, ESR: 0000000000000000
{0}.1.0: TASK = c0000003ec085b20[2170] &apos;modprobe&apos; THREAD: c0000003e4c24000 CPU: 4
{0}.1.0: GPR00: 80000000035f82a8 c0000003e4c27510 c0000000006bc1c0 0000000000000000
{0}.1.0: GPR04: 0000000000000000 0000000000000000 c0000003ebdd0118 c000000002342408
{0}.1.0: GPR08: 0000000000000001 0000000000000000 0000000000008000 c0000000002cb4fc
{0}.1.0: GPR12: 8000000003612280 c000000000725900 0000000000000400 0000000000000730
{0}.1.0: GPR16: 0000000000000000 8000000000dd24b8 0000000000000000 0000000000000010
{0}.1.0: GPR20: 80000000007607c0 0000000000003590 8000000000754608 800000000361b050
{0}.1.0: GPR24: c0000003efa99400 c0000003e4c27610 c0000003e4c27600 c0000003e4c27620
{0}.1.0: GPR28: c0000003ebdd00c0 c0000003ebc60600 800000000362c568 0000000000000000
{0}.1.0: NIP [c0000000002cb518] .ib_alloc_pd+0x1c/0x6c
{0}.1.0: LR [80000000035f82a8] .kiblnd_dev_failover+0x228/0xc30 [ko2iblnd]
{0}.1.0: Call Trace:
{0}.1.0: [c0000003e4c27510] [c0000003e4c27670] 0xc0000003e4c27670 (unreliable)
{0}.1.0: [c0000003e4c27590] [80000000035f82a8] .kiblnd_dev_failover+0x228/0xc30 [ko2iblnd]
{0}.1.0: [c0000003e4c276f0] [80000000035f8e2c] .kiblnd_create_dev+0x17c/0x650 [ko2iblnd]
{0}.1.0: [c0000003e4c277e0] [80000000035fff58] .kiblnd_startup+0x3a8/0x6e0 [ko2iblnd]
{0}.1.0: [c0000003e4c278d0] [8000000000d962fc] .lnet_startup_lndnis+0x1bc/0xa50 [lnet]
{0}.1.0: [c0000003e4c27a00] [8000000000d96d24] .LNetNIInit+0x194/0x2b0 [lnet]
{0}.1.0: [c0000003e4c27ac0] [80000000031351e4] .ptlrpc_ni_init+0x84/0x260 [ptlrpc]
{0}.1.0: [c0000003e4c27b80] [8000000003135794] .ptlrpc_init_portals+0x34/0x1c0 [ptlrpc]
{0}.1.0: [c0000003e4c27c30] [80000000031825e8] .init_module+0x158/0x7dc8 [ptlrpc]
{0}.1.0: [c0000003e4c27cd0] [c000000000000e74] .do_one_initcall+0x88/0x1bc
{0}.1.0: [c0000003e4c27d80] [c00000000006c910] .SyS_init_module+0x11c/0x2b4
{0}.1.0: [c0000003e4c27e30] [c000000000000580] syscall_exit+0x0/0x2c
{0}.1.0: Instruction dump:
{0}.1.0: 38210080 e8010010 ebe1fff8 7c0803a6 4e800020 7c0802a6 fbe1fff8 38800000
{0}.1.0: 38a00000 7c7f1b78 f8010010 f821ff81 &amp;lt;e9230138&amp;gt; e8090000 f8410028 7c0903a6
{0}.1.0: Kernel panic - not syncing: Fatal exception
{0}.1.0: Call Trace:
{0}.1.0: [c0000003e4c26fc0] [c000000000008190] .show_stack+0x7c/0x184 (unreliable)
{0}.1.0: [c0000003e4c27070] [c000000000423614] .panic+0x80/0x1a8
{0}.1.0: [c0000003e4c27100] [c000000000018d58] .die+0x1a4/0x1bc
{0}.1.0: [c0000003e4c271a0] [c00000000001e9e0] .bad_page_fault+0xb8/0xd4
{0}.1.0: [c0000003e4c27220] [c000000000013e4c] storage_fault_common+0x48/0x4c
{0}.1.0: --- Exception: 300 at .ib_alloc_pd+0x1c/0x6c
{0}.1.0:     LR = .kiblnd_dev_failover+0x228/0xc30 [ko2iblnd]
{0}.1.0: [c0000003e4c27510] [c0000003e4c27670] 0xc0000003e4c27670 (unreliable)
{0}.1.0: [c0000003e4c27590] [80000000035f82a8] .kiblnd_dev_failover+0x228/0xc30 [ko2iblnd]
{0}.1.0: [c0000003e4c276f0] [80000000035f8e2c] .kiblnd_create_dev+0x17c/0x650 [ko2iblnd]
{0}.1.0: [c0000003e4c277e0] [80000000035fff58] .kiblnd_startup+0x3a8/0x6e0 [ko2iblnd]
{0}.1.0: [c0000003e4c278d0] [8000000000d962fc] .lnet_startup_lndnis+0x1bc/0xa50 [lnet]
{0}.1.0: [c0000003e4c27a00] [8000000000d96d24] .LNetNIInit+0x194/0x2b0 [lnet]
{0}.1.0: [c0000003e4c27ac0] [80000000031351e4] .ptlrpc_ni_init+0x84/0x260 [ptlrpc]
{0}.1.0: [c0000003e4c27b80] [8000000003135794] .ptlrpc_init_portals+0x34/0x1c0 [ptlrpc]
{0}.1.0: [c0000003e4c27c30] [80000000031825e8] .init_module+0x158/0x7dc8 [ptlrpc]
{0}.1.0: [c0000003e4c27cd0] [c000000000000e74] .do_one_initcall+0x88/0x1bc
{0}.1.0: [c0000003e4c27d80] [c00000000006c910] .SyS_init_module+0x11c/0x2b4
{0}.1.0: [c0000003e4c27e30] [c000000000000580] syscall_exit+0x0/0x2c
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment>&lt;a href=&quot;https://github.com/chaos/lustre/commits/orion-2_3_49_54_2-62chaos&quot;&gt;https://github.com/chaos/lustre/commits/orion-2_3_49_54_2-62chaos&lt;/a&gt;&lt;br/&gt;
BGQ ppc64</environment>
        <key id="15619">LU-1799</key>
            <summary>Oops: Kernel access of bad area with IPv6 address</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="ashehata">Amir Shehata</assignee>
                                    <reporter username="nedbass">Ned Bass</reporter>
                        <labels>
                            <label>llnl</label>
                    </labels>
                <created>Tue, 28 Aug 2012 16:54:47 +0000</created>
                <updated>Mon, 26 Jun 2017 18:29:47 +0000</updated>
                                            <version>Lustre 2.3.0</version>
                    <version>Lustre 2.4.0</version>
                                    <fixVersion>Lustre 2.4.0</fixVersion>
                                        <due></due>
                            <votes>1</votes>
                                    <watches>7</watches>
                                                                            <comments>
                            <comment id="43898" author="doug" created="Tue, 28 Aug 2012 17:10:47 +0000"  >&lt;p&gt;LNet currently does not support IPv6.  A presentation was done by Isaac Huang at this year&apos;s LUG talking about the large challenge of supporting IPv6.  No one has taken ownership of this task yet (I believe it was on the OpenSFS list of projects to fund).&lt;/p&gt;</comment>
                            <comment id="43899" author="ian" created="Tue, 28 Aug 2012 17:18:39 +0000"  >&lt;p&gt;Doug it seems reasonable that we don&apos;t support IPv6, but we also probably shouldn&apos;t kernel panic just because we see an IPv6 address.&lt;/p&gt;</comment>
                            <comment id="43900" author="nedbass" created="Tue, 28 Aug 2012 17:35:57 +0000"  >&lt;p&gt;Indeed the desired outcome from this issue is to fix the panic &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;  In fact when the interface had &lt;b&gt;only&lt;/b&gt; an IPv6 address, Lustre handled it gracefully:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;LNetError: 3588:0:(linux-tcpip.c:137:libcfs_ipif_query()) Can&apos;t get IP address for interface ib0    
LNetError: 3588:0:(o2iblnd.c:2569:kiblnd_create_dev()) Can&apos;t query IPoIB interface ib0: -99    
LNetError: 105-4: Error -100 starting up LNI o2ib 
LustreError: 3588:0:(events.c:737:ptlrpc_init_portals()) network initialisation failed
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I believe our IO nodes on the LLNL BGQ clusters have both IPv4 and IPv6 addresses and we didn&apos;t run into this, so I&apos;m not sure why it was a problem on IBM&apos;s system.&lt;/p&gt;</comment>
                            <comment id="43901" author="doug" created="Tue, 28 Aug 2012 17:54:05 +0000"  >&lt;p&gt;I wonder if this is an endian issue (since the IBM system is PPC64)?  Are any of the IO nodes on the LLNL BGQ cluster PPC?&lt;/p&gt;</comment>
                            <comment id="43902" author="nedbass" created="Tue, 28 Aug 2012 17:56:49 +0000"  >&lt;p&gt;Yes they are all PPC.&lt;/p&gt;</comment>
                            <comment id="43903" author="prakash" created="Tue, 28 Aug 2012 17:57:42 +0000"  >&lt;p&gt;Personally, I wouldn&apos;t describe those messages on the console as &quot;handled gracefully&quot;, but it&apos;s better than a panic. If the issue is IPv6 compatibility, stating that on the console would be nice:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;LNetError: ib0 configured for IPv6, which is not supported by LNet. Network initialization failed as a result.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="43905" author="nedbass" created="Tue, 28 Aug 2012 18:09:29 +0000"  >&lt;p&gt;Good point.  It was only the EADDRNOTAVAIL return value that suggested it might be an IPv6 issue, but that&apos;s not exactly user-friendly.&lt;/p&gt;</comment>
                            <comment id="43925" author="liang" created="Wed, 29 Aug 2012 03:09:37 +0000"  >&lt;p&gt;I suspect we got a NULL pointer (cmid-&amp;gt;device) here:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;kiblnd_dev_failover()-&amp;gt;ib_alloc_pd(cmid-&amp;gt;device);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;I&apos;m wondering how it could happen, because we did check the returned value for all calls, and we also specified both the address and AF_INET before calling rdma_bind_addr(), which can attach the cmid to a device:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;rdma_bind_addr()-&amp;gt;cma_acquire_dev()-&amp;gt;cma_acquire_dev()-&amp;gt;cma_attach_to_dev()
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="43961" author="nedbass" created="Wed, 29 Aug 2012 13:43:05 +0000"  >&lt;p&gt;Perhaps address scope is a factor. IBM&apos;s system had a site-scoped v6 address whereas the LLNL systems have only automatically derived link-local addresses.&lt;/p&gt;</comment>
                            <comment id="43994" author="doug" created="Thu, 30 Aug 2012 13:11:54 +0000"  >&lt;p&gt;When we do a kiblnd_create_dev(), it calls libcfs_ipif_query() to get the address which is used in the rdma_bind_addr().  If libcfs_ipif_query() were to get an IPv6 address (struct sockaddr_in6) rather than an IPv4 address (struct sockaddr_in), then we would misinterpret what was returned and pass a badly formed address to rdma_bind_addr().  I&apos;m wondering if that is what happened here.&lt;/p&gt;

&lt;p&gt;In kiblnd_create_dev(), we make use of the ioctl SIOCGIFADDR with the address type set to AF_INET.  In theory, this should only return an IPv4 address; we need to use AF_INET6 for IPv6.  There seems to be a lot of confusion in networking forums as to how SIOCGIFADDR works, which is made worse by the fact that the IPv6 struct, sockaddr_in6, does not fit into the generic sockaddr structure which is returned by SIOCGIFADDR.&lt;/p&gt;

&lt;p&gt;I will need to look at the specific source code implementation of SIOGIFADDR for the distro IBM is running.  &lt;/p&gt;

&lt;p&gt;Ned: what distro and version is IBM running?&lt;/p&gt;

&lt;p&gt;Liang: Another way we can do this is to use SIOCGIFCONF to get a list of all interfaces and addresses.  That way we can be smart in our error messages when we can see there is only an IPv6 address.  However, due to the structure size differences I mentioned above, some implementations do not return IPv6 addresses at all.&lt;/p&gt;</comment>
                            <comment id="43997" author="nedbass" created="Thu, 30 Aug 2012 13:36:21 +0000"  >&lt;blockquote&gt;&lt;p&gt;Ned: what distro and version is IBM running?&lt;/p&gt;&lt;/blockquote&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;seqio33-ib0@root:uname -a
Linux seqio33-ib0 2.6.32-220.23.3.bgq.el6_V1R1M2_0.ppc64 #1 SMP Sat Aug 25 18:22:10 CDT 2012 ppc64 ppc64 ppc64 GNU/Linux

seqio33-ib0@root:cat /etc/redhat-release 
Red Hat Enterprise Linux Server release 6.2 (Santiago)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="45755" author="doug" created="Fri, 28 Sep 2012 18:58:48 +0000"  >&lt;p&gt;Fix on master: &lt;a href=&quot;http://review.whamcloud.com/#change,3815&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#change,3815&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="46017" author="isaac" created="Thu, 4 Oct 2012 16:01:19 +0000"  >&lt;p&gt;Doug, was that a fix? I thought it was just a debugging patch. Perhaps I&apos;ve missed something.&lt;/p&gt;</comment>
                            <comment id="46027" author="doug" created="Thu, 4 Oct 2012 19:35:04 +0000"  >&lt;p&gt;Whoops!  My bad.  Doing bulk cleanup of Jira tickets and did not see &quot;debug patch&quot; in patch title.  &lt;/p&gt;

&lt;p&gt;Change 3815 is only a debug patch and not a fix.&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10490" key="com.atlassian.jira.plugin.system.customfieldtypes:datepicker">
                        <customfieldname>End date</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>Fri, 27 Jun 2014 16:54:47 +0000</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                <customfield id="customfield_10070" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Project</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10031"><![CDATA[Orion]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzv5mv:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>4447</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                        <customfield id="customfield_10493" key="com.atlassian.jira.plugin.system.customfieldtypes:datepicker">
                        <customfieldname>Start date</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>Tue, 28 Aug 2012 16:54:47 +0000</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                    </customfields>
    </item>
</channel>
</rss>