<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:00:12 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-13311] lustre2.13 module load gets stuck at lnet</title>
                <link>https://jira.whamcloud.com/browse/LU-13311</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;We started testing 2.13.&lt;/p&gt;

&lt;p&gt;Module load gets stuck at&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 open(&lt;span class=&quot;code-quote&quot;&gt;&quot;/sys/module/ptlrpc/initstate&quot;&lt;/span&gt;, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
stat(&lt;span class=&quot;code-quote&quot;&gt;&quot;/sys/module/ptlrpc&quot;&lt;/span&gt;, 0x7fffffffc660) = -1 ENOENT (No such file or directory)
open(&lt;span class=&quot;code-quote&quot;&gt;&quot;/lib/modules/3.10.0-1062.12.1.el7_lustre2130.x86_64/extra/lustre/fs/ptlrpc.ko&quot;&lt;/span&gt;, O_RDONLY|O_CLOEXEC) = 7
read(7, &lt;span class=&quot;code-quote&quot;&gt;&quot;\177ELF\2\1&quot;&lt;/span&gt;, 6)               = 6
lseek(7, 0, SEEK_SET)                   = 0
fstat(7, {st_mode=S_IFREG|0744, st_size=4181088, ...}) = 0
mmap(NULL, 4181088, PROT_READ, MAP_PRIVATE, 7, 0) = 0x2aaaabca1000
finit_module(7, &lt;span class=&quot;code-quote&quot;&gt;&quot;at_max=600 at_min=275 &quot;&lt;/span&gt;, 0&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;See attached file for complete trace.&lt;br/&gt;
I also include our module.conf file.&lt;/p&gt;</description>
                <environment></environment>
        <key id="58222">LU-13311</key>
            <summary>lustre2.13 module load gets stuck at lnet</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="3">Duplicate</resolution>
                                        <assignee username="ashehata">Amir Shehata</assignee>
                                    <reporter username="mhanafi">Mahmoud Hanafi</reporter>
                        <labels>
                    </labels>
                <created>Fri, 28 Feb 2020 23:00:52 +0000</created>
                <updated>Wed, 6 Jan 2021 13:06:26 +0000</updated>
                            <resolved>Wed, 6 Jan 2021 13:06:26 +0000</resolved>
                                    <version>Lustre 2.13.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="264281" author="adilger" created="Sat, 29 Feb 2020 00:12:18 +0000"  >&lt;p&gt;This looks like it is getting stuck during loading of &lt;tt&gt;ptlrpc.ko&lt;/tt&gt;, which is when the network connections are initialized.  Could you please enable full debugging before starting to load &lt;tt&gt;ptlrpc.ko&lt;/tt&gt; and dump the debug logs after it gets stuck to see where it is having problems:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# modprobe libcfs
# lctl set_param debug=-1
# modprobe ptlrpc.ko &amp;amp; sleep 10
# lctl debug_kernel /tmp/debug.ptlrpc.log
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;and then attach the log here (maybe gzipped if very large).&lt;/p&gt;</comment>
                            <comment id="264282" author="mhanafi" created="Sat, 29 Feb 2020 00:26:30 +0000"  >&lt;p&gt;So I tracked down the issue to our module.conf setting for the routers.&lt;br/&gt;
After removing this line the module loads fine.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
options lnet routes=&lt;span class=&quot;code-quote&quot;&gt;&quot;o2ib233 10.151.26.[80-94]@o2ib; o2ib313 10.151.25.[167-171,195-198,202-205,222]@o2ib 10.151.26.[60,127,140-144,146-154]@o2ib; o2ib417 10.151.26.[230-239,244-249]@o2ib&quot;&lt;/span&gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;See uploaded debug file.&lt;/p&gt;</comment>
                            <comment id="264283" author="adilger" created="Sat, 29 Feb 2020 03:07:18 +0000"  >&lt;p&gt;I&apos;m not necessarily able to diagnose the LNet issue; that may have to be our LNet team.  That said, there looks to be something strange going on in the setup:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000800:00000010:7.0:1582936086.907926:0:9830:0:(o2iblnd.c:2381:kiblnd_create_tx_pool()) alloc &apos;(tx-&amp;gt;tx_sge)&apos;: 8224 at ffffa9c30d64b000 (tot 1972499764).
00000800:00000010:7.0:1582936086.907928:0:9830:0:(o2iblnd.c:2387:kiblnd_create_tx_pool()) alloc &apos;(tx-&amp;gt;tx_rd)&apos;: 3080 at ffff9add9603f000 (tot 1972502844).
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;It looks like it has allocated almost 2GB of memory for IB buffers (less a few MB for other things at the start).  That seems pretty high. Are you sure the IB parameters are correct?&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;options ko2iblnd ntx=251072 credits=125536 fmr_pool_size=62769 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;However, some suggestions to help debug:&lt;/p&gt;
&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;does the hang relate to any specific router?  If you remove one or two of the routers does it load OK?  I&apos;m assuming the routers are working for 2.10 clients?  In the logs it looks like it is parsing the &lt;tt&gt;o2ib233&lt;/tt&gt;, &lt;tt&gt;o2ib313&lt;/tt&gt;, and &lt;tt&gt;o2ib417&lt;/tt&gt; routes correctly.  Then it looks like it gets into peer discovery (or something) with the routers and never really finishes 20 seconds later when the debug log is dumped:
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000800:00000200:7.0:1582936102.918672:0:3029:0:(o2iblnd.c:1356:kiblnd_map_rx_descs()) rx 63: ffff9acdfe38a000 0xffe38a000(0xffe38a000)
00000800:00000200:7.0:1582936102.918674:0:3029:0:(o2iblnd.c:1356:kiblnd_map_rx_descs()) rx 64: ffff9acdfe38b000 0xffe38b000(0xffe38b000)
00000800:00000200:7.0:1582936102.918676:0:3029:0:(o2iblnd.c:1356:kiblnd_map_rx_descs()) rx 65: ffff9acdfc71e000 0xffc71e000(0xffc71e000)
00000800:00000010:7.0:1582936102.918678:0:3029:0:(o2iblnd.c:956:kiblnd_create_conn()) kfreed &apos;init_qp_attr&apos;: 112 at ffff9acdfb2e5d00 (tot 1973018084).
00000800:00000200:7.0:1582936102.918680:0:3029:0:(o2iblnd_cb.c:205:kiblnd_post_rx()) conn[ffff9acdd096a200] (67)++
00000800:00000200:7.0:1582936102.918682:0:3029:0:(o2iblnd_cb.c:239:kiblnd_post_rx()) conn[ffff9acdd096a200] (68)--
00000800:00000200:7.0:1582936102.918684:0:3029:0:(o2iblnd_cb.c:205:kiblnd_post_rx()) conn[ffff9acdd096a200] (67)++
00000800:00000200:7.0:1582936102.918685:0:3029:0:(o2iblnd_cb.c:239:kiblnd_post_rx()) conn[ffff9acdd096a200] (68)--
:
:
00000800:00000200:7.0:1582936102.918879:0:3029:0:(o2iblnd_cb.c:205:kiblnd_post_rx()) conn[ffff9acdd096a200] (67)++
00000800:00000200:7.0:1582936102.918881:0:3029:0:(o2iblnd_cb.c:239:kiblnd_post_rx()) conn[ffff9acdd096a200] (68)--
00000800:00000200:7.2:1582936102.919380:0:0:0:(o2iblnd_cb.c:3737:kiblnd_cq_completion()) conn[ffff9acdd096a200] (67)++
00000800:00000200:7.0:1582936102.919391:0:9865:0:(o2iblnd_cb.c:3859:kiblnd_scheduler()) conn[ffff9acdd096a200] (68)++
00000800:00000100:7.0:1582936102.919397:0:9865:0:(o2iblnd_cb.c:507:kiblnd_rx_complete()) Rx from 10.151.26.239@o2ib failed: 5
00000800:00000200:7.0:1582936102.919399:0:9865:0:(o2iblnd_cb.c:553:kiblnd_rx_complete()) rx ffff9acd505ec000 conn ffff9acdd096a200
:
:
00000800:00000200:7.0:1582936102.919845:0:9865:0:(o2iblnd_cb.c:158:kiblnd_drop_rx()) conn[ffff9acdd096a200] (3)--
00000800:00000200:1.0:1582936102.919845:0:9864:0:(o2iblnd_cb.c:3875:kiblnd_scheduler()) conn[ffff9acdd096a200] (3)--
00000800:00000200:7.0:1582936102.919849:0:9865:0:(o2iblnd_cb.c:3875:kiblnd_scheduler()) conn[ffff9acdd096a200] (1)--
00000800:00000200:1.0:1582936102.919857:0:9863:0:(o2iblnd_cb.c:3528:kiblnd_connd()) peer_ni[ffff9acdfd59b300] -&amp;gt; 10.151.26.239@o2ib (2)++
00000800:00000010:7.0:1582936102.920306:0:9863:0:(o2iblnd.c:1272:kiblnd_free_pages()) kfreed &apos;p&apos;: 536 at ffff9acd4b677c00 (tot 1973017412).
00000800:00000010:7.0:1582936102.920309:0:9863:0:(o2iblnd.c:1059:kiblnd_destroy_conn()) kfreed &apos;conn-&amp;gt;ibc_rxs&apos;: 6864 at ffff9acd505ec000 (tot 1973010548).
00000800:00000200:7.0:1582936102.920314:0:9863:0:(o2iblnd.c:1072:kiblnd_destroy_conn()) peer_ni[ffff9acdfd59b300] -&amp;gt; 10.151.26.239@o2ib (3)--
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;&lt;/li&gt;
&lt;/ul&gt;


&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;have you tried 2.12.4 yet?  It fixed several LNet issues that were in 2.12.3 (though I see you already have some of the tunables in &lt;tt&gt;lustre.conf&lt;/tt&gt; to disable &lt;tt&gt;lnet_transaction_timeout&lt;/tt&gt;).  Since this problem hits immediately, it should be easy to verify whether it is already fixed.&lt;/li&gt;
&lt;/ul&gt;
</comment>
                            <comment id="264284" author="mhanafi" created="Sat, 29 Feb 2020 03:28:12 +0000"  >&lt;p&gt;We have used the same ko2iblnd settings for a long time. We came up with those values for our large number of clients (over 13K). 2.12.3 servers work just fine. I made a mistake listing the affected version; it should be 2.13.&lt;/p&gt;

&lt;p&gt;So far we have only seen this issue with 2.13 servers. I think one of the routers listed is actually down. I&apos;ll remove that and try it.&lt;/p&gt;

&lt;p&gt;Our routers have 2 interfaces on each side, but they are not configured to use multirail. It may be an issue with multi-rail discovery.&lt;/p&gt;</comment>
                            <comment id="264293" author="pjones" created="Sat, 29 Feb 2020 13:47:54 +0000"  >&lt;p&gt;Amir&lt;/p&gt;

&lt;p&gt;Could you please advise&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="264474" author="mhanafi" created="Tue, 3 Mar 2020 21:14:31 +0000"  >&lt;p&gt;Module load hangs if a router that is down is listed. &lt;br/&gt;
If I remove 10.151.26.239@o2ib, module load works.&lt;br/&gt;
When it is listed in module.conf,&lt;br/&gt;
this is the output on the console. Why is it enumerating eth1?&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
[  224.782043] LNet: 8054:0:(config.c:1641:lnet_inet_enumerate()) lnet: Ignoring &lt;span class=&quot;code-keyword&quot;&gt;interface&lt;/span&gt; eth1: it&apos;s down
[  227.476974] LNet: 8054:0:(router.c:692:lnet_add_route()) Consider turning discovery on to enable full Multi-Rail routing functionality
[  227.521176] LNetError: 8092:0:(router.c:315:lnet_set_route_aliveness()) route to o2ib233 through 10.151.26.80@o2ib has gone from down to up
[  228.781636] LNet: 8089:0:(o2iblnd_cb.c:3397:kiblnd_check_conns()) Timed out tx &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 10.151.26.239@o2ib: 227 seconds
[  228.817288] LNetError: 2950:0:(peer.c:3724:lnet_peer_ni_add_to_recoveryq_locked()) lpni 10.151.26.239@o2ib added to recovery queue. Health = 999
[  229.781593] LNet: 8089:0:(o2iblnd_cb.c:3397:kiblnd_check_conns()) Timed out tx &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 10.151.26.239@o2ib: 228 seconds
[  262.779754] LNet: 8089:0:(o2iblnd_cb.c:3397:kiblnd_check_conns()) Timed out tx &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 10.151.26.239@o2ib: 0 seconds
[  292.778101] LNet: 8089:0:(o2iblnd_cb.c:3397:kiblnd_check_conns()) Timed out tx &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 10.151.26.239@o2ib: 0 seconds
[  302.777536] LNet: 8089:0:(o2iblnd_cb.c:3397:kiblnd_check_conns()) Timed out tx &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 10.151.26.239@o2ib: 10 seconds
[  322.776432] LNet: 8089:0:(o2iblnd_cb.c:3397:kiblnd_check_conns()) Timed out tx &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 10.151.26.239@o2ib: 0 seconds
[  353.774712] LNet: 8089:0:(o2iblnd_cb.c:3397:kiblnd_check_conns()) Timed out tx &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 10.151.26.239@o2ib: 1 seconds

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="264520" author="ashehata" created="Wed, 4 Mar 2020 05:31:16 +0000"  >&lt;p&gt;I believe this issue was fixed with the following patch:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
&#160; LU-13001 lnet: Wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; single discovery attempt of routers&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="288750" author="mhanafi" created="Wed, 6 Jan 2021 01:27:06 +0000"  >&lt;p&gt;We can close this&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="34367" name="debug.ptlrpc.log.gz" size="2615451" author="mhanafi" created="Sat, 29 Feb 2020 00:30:30 +0000"/>
                            <attachment id="34366" name="lustre.conf" size="985" author="mhanafi" created="Fri, 28 Feb 2020 23:00:21 +0000"/>
                            <attachment id="34365" name="module_load_issue.out" size="27583" author="mhanafi" created="Fri, 28 Feb 2020 23:00:21 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00ukn:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>