<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:28:47 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-9737] lnetctl net show command hung after add net  </title>
                <link>https://jira.whamcloud.com/browse/LU-9737</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Hi, &lt;/p&gt;

&lt;p&gt;I am testing multi-rail in 2.10.0-RC1 and added one network with the ib0,ib1 interfaces.&lt;br/&gt;
But lnetctl/lctl always hangs after adding one net.&lt;br/&gt;
Should I enable anything? Does anyone have a user guide for multi-rail configuration? Thanks.&lt;/p&gt;


&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;test steps&amp;#93;&lt;/span&gt;&lt;br/&gt;
1.modprobe lnet&lt;br/&gt;
2.lnetctl lnet configure&lt;br/&gt;
3.lnetctl net show&lt;br/&gt;
net:&lt;/p&gt;
&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;net: lo&lt;br/&gt;
      nid: 0@lo&lt;br/&gt;
      status: up&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;4.lnetctl net add --net o2ib0 --if ib0,ib1&lt;br/&gt;
5.lnetctl net show =&amp;gt;hung&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;kernel message&amp;#93;&lt;/span&gt;&lt;br/&gt;
[  434.309534] LNet: Added LNI 172.20.110.220@o2ib &lt;span class=&quot;error&quot;&gt;&amp;#91;8/256/0/180&amp;#93;&lt;/span&gt;&lt;br/&gt;
[  434.323740] LNet: Added LNI 172.20.110.221@o2ib &lt;span class=&quot;error&quot;&gt;&amp;#91;8/256/0/180&amp;#93;&lt;/span&gt;&lt;br/&gt;
[  726.028183] INFO: task lctl:12235 blocked for more than 120 seconds.&lt;br/&gt;
[  726.028248] &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.&lt;br/&gt;
[  726.028306] lctl            D ffffffffa0cdc7b0     0 12235  10719 0x00000084&lt;br/&gt;
[  726.028313]  ffff88086c75fd40 0000000000000086 ffff88086cb69f60 ffff88086c75ffd8&lt;br/&gt;
[  726.028317]  ffff88086c75ffd8 ffff88086c75ffd8 ffff88086cb69f60 ffffffffa0cdc7a8&lt;br/&gt;
[  726.028320]  ffffffffa0cdc7ac ffff88086cb69f60 00000000ffffffff ffffffffa0cdc7b0&lt;br/&gt;
[  726.028324] Call Trace:&lt;br/&gt;
[  726.028343]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8168c769&amp;gt;&amp;#93;&lt;/span&gt; schedule_preempt_disabled+0x29/0x70&lt;br/&gt;
[  726.028348]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8168a3c5&amp;gt;&amp;#93;&lt;/span&gt; __mutex_lock_slowpath+0xc5/0x1c0&lt;br/&gt;
[  726.028354]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8168982f&amp;gt;&amp;#93;&lt;/span&gt; mutex_lock+0x1f/0x2f&lt;br/&gt;
[  726.028372]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0c96fd6&amp;gt;&amp;#93;&lt;/span&gt; LNetNIInit+0x46/0xa40 &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;br/&gt;
[  726.028388]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0cb478f&amp;gt;&amp;#93;&lt;/span&gt; lnet_ioctl+0x4f/0x250 &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;br/&gt;
[  726.028404]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0c384ac&amp;gt;&amp;#93;&lt;/span&gt; libcfs_ioctl+0x2ac/0x4c0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
[  726.028415]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0c34517&amp;gt;&amp;#93;&lt;/span&gt; libcfs_psdev_ioctl+0x67/0xf0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
[  726.028422]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81211ed5&amp;gt;&amp;#93;&lt;/span&gt; do_vfs_ioctl+0x2d5/0x4b0&lt;br/&gt;
[  726.028427]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8121cb77&amp;gt;&amp;#93;&lt;/span&gt; ? __fd_install+0x47/0x60&lt;br/&gt;
[  726.028431]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81212151&amp;gt;&amp;#93;&lt;/span&gt; SyS_ioctl+0xa1/0xc0&lt;br/&gt;
[  726.028437]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff816965c9&amp;gt;&amp;#93;&lt;/span&gt; system_call_fastpath+0x16/0x1b&lt;br/&gt;
[  846.028188] INFO: task lctl:12235 blocked for more than 120 seconds.&lt;br/&gt;
[  846.028256] &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.&lt;/p&gt;</description>
                <environment>lustre:        2.10.0-RC1&lt;br/&gt;
lnet:           0.7.0</environment>
        <key id="47108">LU-9737</key>
            <summary>lnetctl net show command hung after add net  </summary>
                <type id="9" iconUrl="https://jira.whamcloud.com/images/icons/issuetypes/undefined.png">Question/Request</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="ashehata">Amir Shehata</assignee>
                                    <reporter username="sebg-crd-pm">sebg-crd-pm</reporter>
                        <labels>
                    </labels>
                <created>Wed, 5 Jul 2017 09:19:05 +0000</created>
                <updated>Fri, 14 Jul 2017 03:25:05 +0000</updated>
                                                                                <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="200999" author="pjones" created="Wed, 5 Jul 2017 12:53:39 +0000"  >&lt;p&gt;The Multi-Rail instructions are in the manual - &lt;a href=&quot;http://doc.lustre.org/lustre_manual.xhtml#lnetmr&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://doc.lustre.org/lustre_manual.xhtml#lnetmr&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="201003" author="pjones" created="Wed, 5 Jul 2017 13:22:48 +0000"  >&lt;p&gt;Amir&lt;/p&gt;

&lt;p&gt;Could you please assist with any follow on questions relating to the instructions in the manual?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="201041" author="ashehata" created="Wed, 5 Jul 2017 18:48:06 +0000"  >&lt;p&gt;I&apos;m unable to reproduce your problem locally.&lt;/p&gt;

&lt;p&gt;Is this reproducible 100% of the time? From the stack trace it appears that the ln_api_mutex is not being unlocked, causing a deadlock. But I don&apos;t see a problem in the code.&lt;/p&gt;

&lt;p&gt;How did you get 2.10-RC1? did you build it yourself? or did you download the RPMs from somewhere?&lt;/p&gt;

&lt;p&gt;Are there other users trying to run &quot;lctl&quot; commands at the same time when you encounter this problem?&lt;/p&gt;

&lt;p&gt;Do you have lustre up? or are you loading lnet by itself?&lt;/p&gt;

&lt;p&gt;Can you also paste the output of the following command:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;lnetctl -h
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
</comment>
                            <comment id="201092" author="sebg-crd-pm" created="Thu, 6 Jul 2017 02:05:53 +0000"  >&lt;p&gt;Hi,&lt;/p&gt;

&lt;p&gt;#Is this reproducible a 100% of the time? &lt;br/&gt;
=&amp;gt;Yes.&lt;br/&gt;
#How did you get 2.10-RC1? did you build it yourself? or did you download the RPMs from somewhere?&lt;br/&gt;
=&amp;gt;I got it  from &lt;a href=&quot;https://git.hpdd.intel.com/?p=fs/lustre-release.git&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://git.hpdd.intel.com/?p=fs/lustre-release.git&lt;/a&gt;,  and build it by myself.&lt;br/&gt;
#Is there other users trying to run &quot;lctl&quot; commands at the same time when you encounter this problem?&lt;br/&gt;
=&amp;gt;No, it only used by me.&lt;br/&gt;
#Do you have lustre up? or are you loading lnet by itself?&lt;br/&gt;
=&amp;gt;I have also tried to use &quot;lctl net up&quot; or &quot;lnet start&quot;. They all hit the same issue.&lt;br/&gt;
I got the following message when executing &quot;lnet status&quot;. Is it all right?&lt;br/&gt;
error: get_param: param_path &apos;health_check&apos;: No such file or directory&lt;br/&gt;
running&lt;br/&gt;
#Can you also paste the output of the following command:&lt;br/&gt;
=&amp;gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@mdsb1 ~&amp;#93;&lt;/span&gt;# lnetctl -h&lt;br/&gt;
Try interactive use without arguments or use one of:&lt;br/&gt;
&quot;lnet&quot;&lt;br/&gt;
&quot;route&quot;&lt;br/&gt;
&quot;net&quot;&lt;br/&gt;
&quot;routing&quot;&lt;br/&gt;
&quot;set&quot;&lt;br/&gt;
&quot;import&quot;&lt;br/&gt;
&quot;export&quot;&lt;br/&gt;
&quot;stats&quot;&lt;br/&gt;
&quot;peer_credits&quot;&lt;br/&gt;
&quot;help&quot;&lt;br/&gt;
&quot;exit&quot;&lt;br/&gt;
&quot;quit&quot;&lt;br/&gt;
as argument.&lt;/p&gt;


&lt;p&gt;Because I had installed Lustre 2.9 on these nodes before installing Lustre 2.10-RC1,&lt;br/&gt;
   in order to clarify the problem I will try installing Lustre 2.10-RC1 on a node with a clean OS.&lt;br/&gt;
   Or you can provide me a diagnostic script to collect anything you want to know. &lt;/p&gt;

&lt;p&gt;   Another question: if &quot;lnetctl net add --net o2ib0 --if ib0,ib1&quot; works fine on the MGS node,&lt;br/&gt;
how can I format the MDS/OSS with the mgsnode NID, or mount Lustre with the mgsnode NID? Thanks.&lt;br/&gt;
(Does it need to include both MGS ib0 and ib1 NIDs, like&lt;br/&gt;
&quot;mkfs.lustre --reformat --mdt --index=0 --mgsnode=172.20.110.220@o2ib:172.20.110.221@o2ib&quot; ?    &lt;br/&gt;
mount.lustre 172.20.110.220@o2ib:172.20.110.221@o2ib:/hpcfs /mnt/client ? )&lt;/p&gt;

</comment>
                            <comment id="201112" author="sebg-crd-pm" created="Thu, 6 Jul 2017 06:15:20 +0000"  >&lt;p&gt;Hi Amir Shehata,&lt;br/&gt;
I found there is no lnetctl in my Lustre build. The lnetctl file may have been installed by IEEL Lustre before.&lt;br/&gt;
I can set up lnetctl OK now after installing the yaml-devel package and rebuilding Lustre. Thanks. &lt;/p&gt;

&lt;p&gt;Another question: if &quot;lnetctl net add --net o2ib0 --if ib0,ib1&quot; works fine on the MGS node,&lt;br/&gt;
how can I format the MDS/OSS with the mgsnode NID, or mount Lustre with the mgsnode NID? Thanks.&lt;br/&gt;
(Does it need to include both MGS ib0 and ib1 NIDs, like&lt;br/&gt;
&quot;mkfs.lustre --reformat --mdt --index=0 --mgsnode=172.20.110.220@o2ib:172.20.110.221@o2ib ...&quot; ? &lt;br/&gt;
mount.lustre 172.20.110.220@o2ib:172.20.110.221@o2ib:/hpcfs /mnt/client ? )&lt;/p&gt;</comment>
                            <comment id="201504" author="olaf" created="Mon, 10 Jul 2017 12:13:04 +0000"  >&lt;p&gt;Hi Amir, this looks like a duplicate of&#160;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9729&quot; title=&quot;missing mutex unlock in  lnet_dyn_add_net()&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9729&quot;&gt;&lt;del&gt;LU-9729&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="201583" author="olaf" created="Mon, 10 Jul 2017 19:50:56 +0000"  >&lt;p&gt;Based on my analysis of the source code and the procedure that created the hang, it is almost certain that this is a duplicate of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9729&quot; title=&quot;missing mutex unlock in  lnet_dyn_add_net()&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9729&quot;&gt;&lt;del&gt;LU-9729&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;Please note that even if the fix for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9729&quot; title=&quot;missing mutex unlock in  lnet_dyn_add_net()&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9729&quot;&gt;&lt;del&gt;LU-9729&lt;/del&gt;&lt;/a&gt; is included, the real problem is using an older &lt;tt&gt;lnetctl&lt;/tt&gt;.&lt;/p&gt;

&lt;p&gt;To ensure that &lt;tt&gt;lnetctl&lt;/tt&gt; will be built, install the rpms for &lt;tt&gt;libyaml&lt;/tt&gt; and &lt;tt&gt;libyaml-devel&lt;/tt&gt; on the build machine. This is not a hard requirement at the moment, but it will be in the future, because you need an up-to-date &lt;tt&gt;lnetctl&lt;/tt&gt; to enable and configure new functionality like LNet Multi-Rail.&lt;/p&gt;

&lt;p&gt;Remember that to use &lt;tt&gt;lnetctl&lt;/tt&gt; the &lt;tt&gt;libyaml&lt;/tt&gt; rpm also has to be installed on all nodes.&lt;/p&gt;</comment>
                            <comment id="202102" author="sebg-crd-pm" created="Fri, 14 Jul 2017 03:25:05 +0000"  >&lt;p&gt;Thanks for your kind reminder.  =&amp;gt; Remember that to use lnetctl the libyaml rpm also has to be installed on all nodes.&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzzg3b:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                </customfields>
    </item>
</channel>
</rss>