<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:57:29 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-6132] Unable to unload ib drivers with lustre loaded</title>
                <link>https://jira.whamcloud.com/browse/LU-6132</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Unloading IB drivers results in hung task message and driver unloading is stuck forever.&lt;/p&gt;

&lt;p&gt;Steps to reproduce:&lt;br/&gt;
1) Have a lustre mount to server&lt;br/&gt;
2) On server do /etc/init.d/openibd stop&lt;br/&gt;
3) openibd script is stuck&lt;br/&gt;
4) After 120 seconds, following message is seen in dmesg:&lt;/p&gt;

&lt;p&gt;LNetError: 131-3: Received notification of device removal&lt;br/&gt;
Please shutdown LNET to allow this to proceed&lt;br/&gt;
INFO: task modprobe:2837 blocked for more than 120 seconds.&lt;br/&gt;
      Not tainted 2.6.32_431.el6_lustre.x86_64 #1&lt;br/&gt;
&quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.&lt;br/&gt;
modprobe      D 0000000000000000     0  2837   2777 0x00000000&lt;br/&gt;
 ffff88011c649bf8 0000000000000082 00000000ffffffff 00000000ffffffff&lt;br/&gt;
 ffff88011c649c38 ffffffff81060b13 ffff88011c649c78 00000000811a591f&lt;br/&gt;
 ffff8800cc23f058 ffff88011c649fd8 000000000000fbc8 ffff8800cc23f058&lt;br/&gt;
Call Trace:&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81060b13&amp;gt;&amp;#93;&lt;/span&gt; ? perf_event_task_sched_out+0x33/0x70&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8105a570&amp;gt;&amp;#93;&lt;/span&gt; ? __dequeue_entity+0x30/0x50&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81528c25&amp;gt;&amp;#93;&lt;/span&gt; schedule_timeout+0x215/0x2e0&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81527d80&amp;gt;&amp;#93;&lt;/span&gt; ? thread_return+0x4e/0x76e&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff815288a3&amp;gt;&amp;#93;&lt;/span&gt; wait_for_common+0x123/0x180&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81065df0&amp;gt;&amp;#93;&lt;/span&gt; ? default_wake_function+0x0/0x20&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff810686da&amp;gt;&amp;#93;&lt;/span&gt; ? __cond_resched+0x2a/0x40&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff815289bd&amp;gt;&amp;#93;&lt;/span&gt; wait_for_completion+0x1d/0x20&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa03170be&amp;gt;&amp;#93;&lt;/span&gt; cma_remove_one+0x18e/0x210 &lt;span class=&quot;error&quot;&gt;&amp;#91;rdma_cm&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa021f5ff&amp;gt;&amp;#93;&lt;/span&gt; ib_unregister_device+0x4f/0x100 &lt;span class=&quot;error&quot;&gt;&amp;#91;ib_core&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0257aa6&amp;gt;&amp;#93;&lt;/span&gt; mlx4_ib_remove+0xc6/0x300 &lt;span class=&quot;error&quot;&gt;&amp;#91;mlx4_ib&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0167881&amp;gt;&amp;#93;&lt;/span&gt; mlx4_remove_device+0x71/0x90 &lt;span class=&quot;error&quot;&gt;&amp;#91;mlx4_core&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa01679b3&amp;gt;&amp;#93;&lt;/span&gt; mlx4_unregister_interface+0x43/0x80 &lt;span class=&quot;error&quot;&gt;&amp;#91;mlx4_core&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa026f891&amp;gt;&amp;#93;&lt;/span&gt; __exit_compat+0x15/0x69 &lt;span class=&quot;error&quot;&gt;&amp;#91;mlx4_ib&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff810b9454&amp;gt;&amp;#93;&lt;/span&gt; sys_delete_module+0x194/0x260&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8152d8ce&amp;gt;&amp;#93;&lt;/span&gt; ? do_page_fault+0x3e/0xa0&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100b072&amp;gt;&amp;#93;&lt;/span&gt; system_call_fastpath+0x16/0x1b&lt;/p&gt;

&lt;p&gt;The cause of this is that ko2iblnd does not handle device removal (should probably handle it the same as disconnected event).&lt;/p&gt;</description>
                <environment>RHEL 6.5 with MLNX_OFED 2.3 and ConnectX3/ConnectX3 Pro/ConnectIB  HW (but I&amp;#39;m guessing is reproducible with any OS and any OFED/upstream kernel).</environment>
        <key id="28242">LU-6132</key>
            <summary>Unable to unload ib drivers with lustre loaded</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="dmiter">Dmitry Eremin</assignee>
                                    <reporter username="yanb">Yan Burman</reporter>
                        <labels>
                            <label>llnl</label>
                    </labels>
                <created>Sun, 18 Jan 2015 15:11:11 +0000</created>
                <updated>Tue, 28 May 2019 15:46:04 +0000</updated>
                                            <version>Lustre 2.7.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>22</watches>
                                                                            <comments>
                            <comment id="103848" author="liang" created="Mon, 19 Jan 2015 13:50:25 +0000"  >&lt;p&gt;Release 2.7 will have Dynamic LNet Config (DLC) which allows user to dynamically add/remove LNet NI in userspace, after that you should be able to unload ib drivers. So far we don&#8217;t plan to handle this event and trigger NI removal in kernel space.&lt;/p&gt;</comment>
                            <comment id="103877" author="adilger" created="Mon, 19 Jan 2015 18:06:06 +0000"  >&lt;p&gt;Could you please explain why you would want to be able to unload the IB module while Lustre is up and running?&lt;/p&gt;</comment>
                            <comment id="103974" author="yanb" created="Tue, 20 Jan 2015 08:41:23 +0000"  >&lt;p&gt;One scenario would be reboot. Also, device removal event will be received upon PCI reset to device or unbind/hotplug.&lt;br/&gt;
From my experience on other projects, there are two changes that need to be done to support this:&lt;br/&gt;
1) Be able to differentiate between CM IDs of the listener and of connections&lt;br/&gt;
2) If CM ID is of a connection, do the same handling as disconnected event. In case of listener, just destroy the CM ID&lt;/p&gt;</comment>
                            <comment id="108138" author="adilger" created="Thu, 26 Feb 2015 19:01:10 +0000"  >&lt;p&gt;It seems to me that the correct solution here is during shutdown to clean up the LNET routes/modules before unconfiguring IB.  That should happen via the /etc/init.d/lnet script.  Not clear why that isn&apos;t happening?&lt;/p&gt;</comment>
                            <comment id="108400" author="yanb" created="Mon, 2 Mar 2015 14:47:58 +0000"  >&lt;p&gt;The problem would happen if the mlx4/mlx5 drivers are unloaded before LNET is cleaned up or if device is removed (which is easily simulated by unloading mlx{4,5}_* modules).&lt;br/&gt;
Fixing the script (assuming it&apos;s a script problem) will fix one scenario out of few. Handling the device removal event will be cleaner and handle other scenarios as well.&lt;br/&gt;
Handling CM ID of a connection should be similar if not identical to handling of disconnected event.&lt;br/&gt;
The only non-trivial (as far as I understood) part is identifying that the CM ID belongs to a listener, as it is not being saved currently from what I saw.&lt;br/&gt;
What do you think of this idea?&lt;/p&gt;</comment>
                            <comment id="108466" author="isaac" created="Mon, 2 Mar 2015 19:55:43 +0000"  >&lt;p&gt;I think the shutdown scripts should be fixed to honor the correct dependency - i.e. shutdown the IB users (e.g. LNet) before any attempts to shutdown any part of IB. As to LNet support of device removal, if there&apos;s a valid use case for that we should certainly support it. But I&apos;d tend to say incorrect shutdown order isn&apos;t a valid use case. If there&apos;s other scenarios where LNet would need to handle device removal, please point it out.&lt;/p&gt;</comment>
                            <comment id="108724" author="yanb" created="Wed, 4 Mar 2015 12:55:45 +0000"  >&lt;p&gt;Other use cases where you may get device removal event are:&lt;br/&gt;
1) Card/FW failure and reset issued on the card&lt;br/&gt;
2) VPI - changing in runtime between ethernet and IB port type&lt;br/&gt;
3) Unloading driver perhaps for maintenance&lt;br/&gt;
4) Hotplug of card (as well as VF in SRIOV case)&lt;/p&gt;
</comment>
                            <comment id="108756" author="isaac" created="Wed, 4 Mar 2015 17:52:36 +0000"  >&lt;p&gt;2-4 could be handled by removing the IB NI first with DLC, since they are all admin actions. Case 1 could probably be cleaned up with DLC NI shutdown after it has happened.&lt;/p&gt;</comment>
                            <comment id="121681" author="pichong" created="Mon, 20 Jul 2015 15:59:47 +0000"  >&lt;p&gt;This issue has been reported by one of our customers. It usually occurs when shutting down a Lustre client while Lustre file systems are still mounted.&lt;/p&gt;</comment>
                            <comment id="181141" author="dinatale2" created="Wed, 18 Jan 2017 15:59:53 +0000"  >&lt;p&gt;We&apos;ve encountered this problem as well on both lustre clients and lustre servers. Problem usually occurs, as Gregoire mentioned, when a lustre fs is mounted on a client or a lustre server has a target mounted during shut down.&lt;/p&gt;</comment>
                            <comment id="181157" author="dmiter" created="Wed, 18 Jan 2017 16:49:14 +0000"  >&lt;p&gt;This happens when you have Lustre share mounted but try to unload OFED drivers. The shutdown sequence should be the following:&lt;/p&gt;
&lt;ol&gt;
	&lt;li&gt;umount /mnt/lustre&lt;/li&gt;
	&lt;li&gt;lustre_rmmod&lt;/li&gt;
	&lt;li&gt;unload OFED drivers&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;We can make first 2 steps in our shutdown script but cannot guarantee that&#160;OFED drivers will not be unloaded first.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="181267" author="simmonsja" created="Wed, 18 Jan 2017 23:29:49 +0000"  >&lt;p&gt;It is possible to make lustre aware when the IB core is unloaded but I haven&apos;t had cycles to implement this. I guess in that case we would have to force evictions of clients if that happens.&lt;/p&gt;</comment>
                            <comment id="181521" author="adilger" created="Thu, 19 Jan 2017 22:29:52 +0000"  >&lt;p&gt;I don&apos;t see why the client would need to be evicted, per-se if the IB interface is stopped.  In theory, if the client has some other form of communication with the server (e.g. TCP or OPA) it could continue to work after the IB interface is stopped.  Handling that cleanly would definitely need some development work, and is best left until after the LNet Multi-Rail code is landed, since I suspect it will need to deal with that situation in any case.&lt;/p&gt;

&lt;p&gt;One simple option for handling this in the short term is adding an &lt;tt&gt;/sbin/umount.lustre&lt;/tt&gt; script which tries &lt;tt&gt;lustre_rmmod&lt;/tt&gt; to unload the modules, but fails silently if the modules are in use (i.e. another filesystem is mounted).  That would drop the LNet references and disconnect the client, allowing the IB modules to be unloaded.  However, this depends on the client unmount happening before the IB modules are cleaned up.  The other option is a systemd script (see patch &lt;a href=&quot;http://review.whamcloud.com/21457&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/21457&lt;/a&gt; &quot;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8384&quot; title=&quot;convert startup scripts to systemd unit files for EL7&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8384&quot;&gt;&lt;del&gt;LU-8384&lt;/del&gt;&lt;/a&gt; scripts: Add scripts to systemd for EL7&quot;).&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="34544">LU-7755</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="45827">LU-9439</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="32588">LU-8384</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="52438">LU-11066</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzx46v:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>17090</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10023"><![CDATA[4]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>