<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:48:14 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-11937] lnet.service randomly loads tcp NIDs</title>
                <link>https://jira.whamcloud.com/browse/LU-11937</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;On clients, we&apos;re using lnet.service with the following config:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@sh-112-12 ~]# cat /etc/lnet.conf 
net:
    - net type: o2ib4
      local NI(s):
        - nid:
          interfaces:
              0: ib0
route: 
    - net: o2ib1
      gateway: 10.9.0.[31-32]@o2ib4
    - net: o2ib5
      gateway: 10.9.0.[41-42]@o2ib4
    - net: o2ib7
      gateway: 10.9.0.[21-24]@o2ib4
[root@sh-112-12 ~]# lctl list_nids
10.10.112.12@tcp
10.9.112.12@o2ib4
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@sh-112-12 ~]# dmesg | grep -i lnet
[  397.762804] LNet: HW NUMA nodes: 2, HW CPU cores: 20, npartitions: 2
[  398.995449] LNet: 13837:0:(socklnd.c:2655:ksocknal_enumerate_interfaces()) Ignoring interface enp4s0f1 (down)
[  399.005708] LNet: Added LNI 10.10.112.12@tcp [8/256/0/180]
[  399.011316] LNet: Accept secure, port 988
[  399.060725] LNet: Using FastReg for registration
[  399.075936] LNet: Added LNI 10.9.112.12@o2ib4 [8/256/0/180]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;It is unclear at this point why a tcp NID gets configured at all.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;
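&lt;p&gt;In the meantime, the stray tcp NI can at least be removed at runtime with lnetctl (a minimal sketch, using the NIDs shown above):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@sh-112-12 ~]# lnetctl net del --net tcp
[root@sh-112-12 ~]# lctl list_nids
10.9.112.12@o2ib4
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;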

&lt;p&gt;client network config:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@sh-112-12 ~]# ip addr
1: lo: &amp;lt;LOOPBACK,UP,LOWER_UP&amp;gt; mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    inet 127.0.0.1/8 scope host lo
       valid_lft forever preferred_lft forever
    inet6 ::1/128 scope host 
       valid_lft forever preferred_lft forever
2: enp4s0f0: &amp;lt;BROADCAST,MULTICAST,UP,LOWER_UP&amp;gt; mtu 1500 qdisc mq state UP group default qlen 1000
    link/ether 0c:c4:7a:dc:96:ae brd ff:ff:ff:ff:ff:ff
    inet 10.10.112.12/16 brd 10.10.255.255 scope global enp4s0f0
       valid_lft forever preferred_lft forever
    inet6 fe80::ec4:7aff:fedc:96ae/64 scope link 
       valid_lft forever preferred_lft forever
3: enp4s0f1: &amp;lt;BROADCAST,MULTICAST&amp;gt; mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether 0c:c4:7a:dc:96:af brd ff:ff:ff:ff:ff:ff
4: ib0: &amp;lt;BROADCAST,MULTICAST,UP,LOWER_UP&amp;gt; mtu 2044 qdisc mq state UP group default qlen 256
    link/infiniband 20:00:10:8b:fe:80:00:00:00:00:00:00:24:8a:07:03:00:a0:9e:20 brd 00:ff:ff:ff:ff:12:40:1b:ff:ff:00:00:00:00:00:00:ff:ff:ff:ff
    inet 10.9.112.12/16 brd 10.9.255.255 scope global ib0
       valid_lft forever preferred_lft forever
    inet6 fe80::268a:703:a0:9e20/64 scope link 
       valid_lft forever preferred_lft forever
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;lnet.service origin:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@sh-112-12 ~]# rpm -qf /usr/lib/systemd/system/lnet.service 
lustre-client-2.12.0-1.el7.x86_64
[root@sh-112-12 ~]# rpm -q --info lustre-client
Name        : lustre-client
Version     : 2.12.0
Release     : 1.el7
Architecture: x86_64
Install Date: Wed 06 Feb 2019 10:13:52 AM PST
Group       : System Environment/Kernel
Size        : 2007381
License     : GPL
Signature   : (none)
Source RPM  : lustre-client-2.12.0-1.el7.src.rpm
Build Date  : Fri 21 Dec 2018 01:53:18 PM PST
Build Host  : trevis-307-el7-x8664-3.trevis.whamcloud.com
Relocations : (not relocatable)
URL         : https://wiki.whamcloud.com/
Summary     : Lustre File System
Description :
Userspace tools and files for the Lustre file system.
[root@sh-112-12 ~]# cat /usr/lib/systemd/system/lnet.service 
[Unit]
Description=lnet management

Requires=network-online.target
After=network-online.target openibd.service rdma.service

ConditionPathExists=!/proc/sys/lnet/

[Service]
Type=oneshot
RemainAfterExit=true
ExecStart=/sbin/modprobe lnet
ExecStart=/usr/sbin/lnetctl lnet configure
ExecStart=/usr/sbin/lnetctl import /etc/lnet.conf
ExecStop=/usr/sbin/lustre_rmmod ptlrpc
ExecStop=/usr/sbin/lnetctl lnet unconfigure
ExecStop=/usr/sbin/lustre_rmmod libcfs ldiskfs

[Install]
WantedBy=multi-user.target
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
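&lt;p&gt;Running the unit&apos;s ExecStart steps by hand should help narrow down which step brings up the tcp NI (a sketch of the same three commands, on a node where LNet is not yet loaded):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;modprobe lnet
lnetctl lnet configure
lctl list_nids                  # check whether a tcp NID already exists at this step
lnetctl import /etc/lnet.conf
lctl list_nids                  # expected: only 10.9.112.12@o2ib4
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;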
&lt;p&gt;This leads to many issues server-side with 2.12, as reported in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11888&quot; title=&quot;Unreachable client NID confusing Lustre 2.12&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11888&quot;&gt;LU-11888&lt;/a&gt; and &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11936&quot; title=&quot;High ldlm load, slow/unusable filesystem&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11936&quot;&gt;LU-11936&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;Thanks!&lt;br/&gt;
 Stephane&lt;/p&gt;</description>
                <environment>CentOS 7.6 (3.10.0-957.5.1.el7.x86_64), Lustre 2.12.0</environment>
        <key id="54801">LU-11937</key>
            <summary>lnet.service randomly loads tcp NIDs</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="6">Not a Bug</resolution>
                                        <assignee username="ashehata">Amir Shehata</assignee>
                                    <reporter username="sthiell">Stephane Thiell</reporter>
                        <labels>
                    </labels>
                <created>Wed, 6 Feb 2019 18:56:47 +0000</created>
                <updated>Tue, 12 Nov 2019 23:39:07 +0000</updated>
                            <resolved>Tue, 12 Nov 2019 23:39:06 +0000</resolved>
                                    <version>Lustre 2.12.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>4</watches>
                                                                            <comments>
                            <comment id="241487" author="pjones" created="Wed, 6 Feb 2019 19:01:34 +0000"  >&lt;p&gt;Related to other tickets Sonia is working on&lt;/p&gt;</comment>
                            <comment id="241489" author="sthiell" created="Wed, 6 Feb 2019 19:08:07 +0000"  >&lt;p&gt;Thanks, we&apos;re trying this drop-in file as a workaround on all clients:&lt;/p&gt;

&lt;p&gt;/etc/systemd/system/lnet.service.d/deps.conf&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[Unit]
After=dkms.service network.service

[Service]
# we don&apos;t want tcp nids
ExecStartPost=-/usr/sbin/lnetctl net del --net tcp
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
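&lt;p&gt;For the drop-in to take effect, systemd must reload its unit files (standard systemd handling, shown here as a sketch):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;systemctl daemon-reload
systemctl restart lnet.service
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Note the &lt;tt&gt;-&lt;/tt&gt; prefix on &lt;tt&gt;ExecStartPost&lt;/tt&gt;: systemd ignores that command&apos;s exit status, so the unit still starts when there is no tcp net to delete.&lt;/p&gt;</comment>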
                            <comment id="258094" author="sthiell" created="Mon, 11 Nov 2019 18:09:33 +0000"  >&lt;p&gt;Hello Peter,&lt;/p&gt;

&lt;p&gt;This problem is still there and caused some trouble for us last weekend. Apparently, despite our lnet.service workaround on the clients, a tcp NID was able to make its way to the Fir servers (2.12.3), which after a few hours caused an MDT deadlock.&lt;/p&gt;

&lt;p&gt;NOTE: For us, this is a major blocker for migrating Oak from 2.10 to 2.12, as multi-rail can cause this kind of issue, especially when storage and compute are separated and managed by different teams. A misconfigured client can cause such trouble on the server side.&#160;In our case, we don&apos;t want any tcp NIDs at all, but as far as I know there is no way to avoid them in 2.12. In 2.10, there is no risk of this situation arising on the servers.&lt;/p&gt;

&lt;p&gt;Today we tracked the problem down to the lnet.service script, which induces a race between &lt;tt&gt;lnet configure&lt;/tt&gt; and &lt;tt&gt;lnet import&lt;/tt&gt;:&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# cat /usr/lib/systemd/system/lnet.service
[Unit]
Description=lnet management

Requires=network-online.target
After=network-online.target openibd.service rdma.service opa.service

ConditionPathExists=!/proc/sys/lnet/

[Service]
Type=oneshot
RemainAfterExit=true
ExecStart=/sbin/modprobe lnet
ExecStart=/usr/sbin/lnetctl lnet configure
ExecStart=/usr/sbin/lnetctl import /etc/lnet.conf
ExecStop=/usr/sbin/lustre_rmmod ptlrpc
ExecStop=/usr/sbin/lnetctl lnet unconfigure
ExecStop=/usr/sbin/lustre_rmmod libcfs ldiskfs

[Install]
WantedBy=multi-user.target
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;


&lt;p&gt;Even with our workaround, which does work in most cases, some clients can come up with a tcp0 NID at &lt;tt&gt;lnet configure&lt;/tt&gt; time, and thus risk announcing themselves with a tcp NID when the filesystem mount runs at boot:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2019-11-05T22:10:26-08:00 sh-117-11 systemd: Starting SYSV: Lustre shine mounting script...
2019-11-05T22:10:26-08:00 sh-117-11 kernel: LNet: HW NUMA nodes: 2, HW CPU cores: 24, npartitions: 2
2019-11-05T22:10:27-08:00 sh-117-11 kernel: LNet: Added LNI 10.10.117.11@tcp [8/256/0/180]
2019-11-05T22:10:27-08:00 sh-117-11 kernel: LNet: Accept secure, port 988
2019-11-05T22:10:27-08:00 sh-117-11 kernel: LNet: Using FastReg for registration
2019-11-05T22:10:27-08:00 sh-117-11 kernel: LNet: Added LNI 10.9.117.11@o2ib4 [8/256/0/180]
2019-11-05T22:10:29-08:00 sh-117-11 kernel: LNet: Removed LNI 10.10.117.11@tcp
2019-11-05T22:10:43-08:00 sh-117-11 kernel: LNet: 8157:0:(o2iblnd_cb.c:3396:kiblnd_check_conns()) Timed out tx for 10.9.0.24@o2ib4: 38 seconds
2019-11-05T22:11:06-08:00 sh-117-11 kernel: LNet: 8157:0:(o2iblnd_cb.c:3396:kiblnd_check_conns()) Timed out tx for 10.9.0.24@o2ib4: 61 seconds
2019-11-05T22:11:08-08:00 sh-117-11 shine: Starting shine:  WARNING: Nothing to mount on sh-117-11 for `regal&apos;
2019-11-05T22:11:08-08:00 sh-117-11 shine: WARNING: Nothing was done for `regal&apos;.
2019-11-05T22:11:08-08:00 sh-117-11 shine: Mount of fir on /scratch failed
2019-11-05T22:11:08-08:00 sh-117-11 shine: &amp;gt;&amp;gt; mount.lustre: mount 10.0.10.51@o2ib7:10.0.10.52@o2ib7:/fir at /scratch failed: Input/output error
2019-11-05T22:11:08-08:00 sh-117-11 shine: Is the MGS running?
2019-11-05T22:11:08-08:00 sh-117-11 shine: [FAILED]
2019-11-05T22:11:08-08:00 sh-117-11 systemd: shine.service: control process exited, code=exited status=16
2019-11-05T22:11:08-08:00 sh-117-11 systemd: Failed to start SYSV: Lustre shine mounting script.
2019-11-05T22:11:08-08:00 sh-117-11 systemd: Unit shine.service entered failed state.
2019-11-05T22:11:08-08:00 sh-117-11 systemd: shine.service failed.
2019-11-05T22:11:33-08:00 sh-117-11 systemd: Starting SYSV: Lustre shine mounting script...
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Then the servers keep trying to contact these clients at the erroneous tcp NID, even though they have neither a tcp interface nor a route to it, and we end up with problems like these:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[Sat Nov  9 23:58:14 2019][461981.327905] LustreError: 80884:0:(ldlm_lockd.c:1348:ldlm_handle_enqueue0()) ### lock on destroyed export ffffa0fe1e7f7400 ns: mdt-fir-MDT0000_UUID lock: ffffa11943671200/0x675684f65f9baf7 lrc: 3/0,0 mode: CR/CR res: [0x200038966:0x418:0x0].0x0 bits 0x9/0x0 rrc: 2 type: IBT flags: 0x50200000000000 nid: 10.10.117.11@tcp remote: 0x541e831b11b117da expref: 250 pid: 80884 timeout: 0 lvb_type: 0^M
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;We also think that the backtraces below are due to MDT threads being stuck with tcp NIDs:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[Sun Nov 10 06:24:59 2019][485187.377753] Pid: 67098, comm: mdt03_047 3.10.0-957.27.2.el7_lustre.pl1.x86_64 #1 SMP Mon Aug 5 15:28:37 PDT 2019^M
[Sun Nov 10 06:25:00 2019][485187.388020] Call Trace:^M
[Sun Nov 10 06:25:00 2019][485187.390566]  [&amp;lt;ffffffffc10ccb75&amp;gt;] ldlm_completion_ast+0x4e5/0x860 [ptlrpc]^M
[Sun Nov 10 06:25:00 2019][485187.397597]  [&amp;lt;ffffffffc10cd5e1&amp;gt;] ldlm_cli_enqueue_local+0x231/0x830 [ptlrpc]^M
[Sun Nov 10 06:25:00 2019][485187.404884]  [&amp;lt;ffffffffc15d850b&amp;gt;] mdt_object_local_lock+0x50b/0xb20 [mdt]^M
[Sun Nov 10 06:25:00 2019][485187.411808]  [&amp;lt;ffffffffc15d8b90&amp;gt;] mdt_object_lock_internal+0x70/0x360 [mdt]^M
[Sun Nov 10 06:25:00 2019][485187.418892]  [&amp;lt;ffffffffc15da40d&amp;gt;] mdt_getattr_name_lock+0x101d/0x1c30 [mdt]^M
[Sun Nov 10 06:25:00 2019][485187.425989]  [&amp;lt;ffffffffc15e1d25&amp;gt;] mdt_intent_getattr+0x2b5/0x480 [mdt]^M
[Sun Nov 10 06:25:00 2019][485187.432638]  [&amp;lt;ffffffffc15debb5&amp;gt;] mdt_intent_policy+0x435/0xd80 [mdt]^M
[Sun Nov 10 06:25:00 2019][485187.439230]  [&amp;lt;ffffffffc10b3d46&amp;gt;] ldlm_lock_enqueue+0x356/0xa20 [ptlrpc]^M
[Sun Nov 10 06:25:00 2019][485187.446073]  [&amp;lt;ffffffffc10dc336&amp;gt;] ldlm_handle_enqueue0+0xa56/0x15f0 [ptlrpc]^M
[Sun Nov 10 06:25:00 2019][485187.453272]  [&amp;lt;ffffffffc1164a12&amp;gt;] tgt_enqueue+0x62/0x210 [ptlrpc]^M
[Sun Nov 10 06:25:00 2019][485187.459514]  [&amp;lt;ffffffffc116936a&amp;gt;] tgt_request_handle+0xaea/0x1580 [ptlrpc]^M
[Sun Nov 10 06:25:00 2019][485187.466549]  [&amp;lt;ffffffffc111024b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]^M
[Sun Nov 10 06:25:00 2019][485187.474357]  [&amp;lt;ffffffffc1113bac&amp;gt;] ptlrpc_main+0xb2c/0x1460 [ptlrpc]^M
[Sun Nov 10 06:25:00 2019][485187.480780]  [&amp;lt;ffffffffbe8c2e81&amp;gt;] kthread+0xd1/0xe0^M
[Sun Nov 10 06:25:00 2019][485187.485775]  [&amp;lt;ffffffffbef77c24&amp;gt;] ret_from_fork_nospec_begin+0xe/0x21^M
[Sun Nov 10 06:25:00 2019][485187.492327]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff^M
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Yesterday, MDT0 on Fir went down and was completely hung, after thousands of messages containing tcp NIDs and backtraces like the one above, all of it apparently due to 2 clients announcing themselves with a tcp NID.&lt;/p&gt;

&lt;p&gt;Please advise whether there is a way to completely disable multi-rail and avoid this situation. I would recommend increasing the severity of this issue, as it has caused a lot of trouble since 2.12, but I&apos;m glad we&apos;re finally making progress. Thanks much!&lt;br/&gt;
&#160;&lt;/p&gt;</comment>
                            <comment id="258189" author="sthiell" created="Tue, 12 Nov 2019 20:24:59 +0000"  >&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@fir-md1-s1 fir-MDT0000]# lnetctl peer show --nid 10.10.23.14@tcp
peer:
    - primary nid: 10.10.23.14@tcp
      Multi-Rail: True
      peer ni:
        - nid: 10.8.23.14@o2ib6
          state: NA
        - nid: 10.10.23.14@tcp
          state: NA
[root@fir-md1-s1 fir-MDT0000]# lctl ping 10.10.23.14@tcp
failed to ping 10.10.23.14@tcp: Input/output error
[root@fir-md1-s1 fir-MDT0000]# lctl ping 10.8.23.14@o2ib6
12345-0@lo
12345-10.8.23.14@o2ib6
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I was able to manually remove the TCP nid with this:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@fir-md1-s1 fir-MDT0000]# lnetctl peer del --prim_nid 10.10.23.14@tcp --nid 10.10.23.14@tcp
[root@fir-md1-s1 fir-MDT0000]# lnetctl peer show --nid 10.10.23.14@tcp
show:
    - peer:
          errno: -2
          descr: &quot;cannot get peer information: No such file or directory&quot;

[root@fir-md1-s1 fir-MDT0000]# lnetctl peer show --nid 10.8.23.14@o2ib6
show:
    - peer:
          errno: -2
          descr: &quot;cannot get peer information: No such file or directory&quot;

[root@fir-md1-s1 fir-MDT0000]# lnetctl peer show --nid 10.8.23.14@o2ib6
peer:
    - primary nid: 10.8.23.14@o2ib6
      Multi-Rail: True
      peer ni:
        - nid: 10.8.23.14@o2ib6
          state: NA
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="258192" author="ashehata" created="Tue, 12 Nov 2019 20:40:25 +0000"  >&lt;p&gt;If the TCP network is configured on the node, then it&apos;ll be propagated due to the discovery feature. There are three solutions: 1) remove the tcp nid from all the nodes if you don&apos;t need it. 2) Turn off discovery on all the nodes. 3) Explicitly configure the peers (but that would be a lot of config). I believe you can use ip2nets syntax though.&lt;/p&gt;</comment>
                            <comment id="258193" author="sthiell" created="Tue, 12 Nov 2019 20:45:43 +0000"  >&lt;p&gt;Amir, our lnet.conf on the clients is as follow:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@sh-101-01 ~]# cat /etc/lnet.conf 
global:
    - retry_count: 0
    - health_sensitivity: 0
    - transaction_timeout: 10
net:
    - net type: o2ib4
      local NI(s):
        - nid:
          interfaces:
              0: ib0
route: 
    - net: o2ib5
      gateway: 10.9.0.[41-42]@o2ib4
    - net: o2ib7
      gateway: 10.9.0.[21-24]@o2ib4
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;But when lnet is loaded, it does a &lt;tt&gt;lnet configure&lt;/tt&gt; before the import of that file, which I think might propagate a tcp NID in some rare cases.&lt;/p&gt;

&lt;p&gt;How can we be sure to disable discovery everywhere without any race condition? How do you do that? We really don&apos;t use multi-rail at all in our case. Thanks!!&lt;/p&gt;</comment>
                            <comment id="258195" author="ashehata" created="Tue, 12 Nov 2019 21:43:20 +0000"  >&lt;p&gt;Hi Stephane,&lt;/p&gt;

&lt;p&gt;&lt;tt&gt;lnetctl lnet configure&lt;/tt&gt; should not configure any networks. The default tcp network would only get configured if, somewhere, you&apos;re doing &lt;tt&gt;lctl net up&lt;/tt&gt;.&lt;/p&gt;

&lt;p&gt;To disable discovery you can add&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
options lnet  lnet_peer_discovery_disabled=1&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;on all the nodes.&lt;/p&gt;

&lt;p&gt;My hunch at the moment is that there are some nodes which are using&#160;&lt;tt&gt;lctl net up&lt;/tt&gt; or&#160;&lt;tt&gt;lnetctl lnet configure --all&lt;/tt&gt;. This would lead to the tcp network being loaded, especially if you don&apos;t have an &lt;tt&gt;options lnet networks&lt;/tt&gt; line in your&#160;&lt;tt&gt;modprobe.d/lnet.conf&lt;/tt&gt; file.&lt;/p&gt;
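&lt;p&gt;For reference, such a line would look like this (a sketch; the interface name is taken from the client config above):&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
# pin LNet to o2ib4 on ib0, so an implicit net up cannot create a default tcp net
options lnet networks=&quot;o2ib4(ib0)&quot;&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;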

&lt;p&gt;Would you be able to check that?&lt;/p&gt;</comment>
                            <comment id="258199" author="sthiell" created="Tue, 12 Nov 2019 22:37:17 +0000"  >&lt;p&gt;Hi Amir,&lt;/p&gt;

&lt;p&gt;Thanks for your help! This was useful. I confirm that &lt;tt&gt;lnetctl lnet configure&lt;/tt&gt; does not configure any networks, my bad!&lt;/p&gt;

&lt;p&gt;I guess we&apos;ve just figured out what was wrong in our setup, and you were very close: a service that mounts our Lustre filesystems was doing a &lt;tt&gt;modprobe lustre&lt;/tt&gt;, and in some (rare) cases the filesystem mount that followed ran at the same time as &lt;tt&gt;lnet.service&lt;/tt&gt;, leading to a tcp NID being propagated to the servers and causing the trouble I described on the server side.&#160;We have fixed the dependencies of the boot-time services on our clients, so hopefully this should not happen anymore!&lt;/p&gt;

&lt;p&gt;&lt;tt&gt;modprobe lustre&lt;/tt&gt; seems to do the same as &lt;tt&gt;lctl net up&lt;/tt&gt; and does configure a default tcp network if lnet is not configured yet.&lt;/p&gt;
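&lt;p&gt;Concretely, based on the behaviour above, the bad sequence should reproduce like this (a sketch, on a node where LNet is not configured yet):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;modprobe lustre   # implicitly brings LNet up, like &quot;lctl net up&quot;
lctl list_nids    # a default tcp NID appears, e.g. 10.10.112.12@tcp
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;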

&lt;p&gt;Sorry for the noise; after all, it looks like this was never a problem with &lt;tt&gt;lnet.service&lt;/tt&gt; itself.&lt;/p&gt;</comment>
                            <comment id="258201" author="pjones" created="Tue, 12 Nov 2019 23:19:57 +0000"  >&lt;p&gt;Good news - so can we consider this ticket resolved?&lt;/p&gt;</comment>
                            <comment id="258202" author="sthiell" created="Tue, 12 Nov 2019 23:24:51 +0000"  >&lt;p&gt;Yes, good news! We appreciated the help, thanks! Sorry it took us so much time to figure that out. We&apos;ve also added some &quot;rogue NID monitoring&quot; on the server side, just in case some clients continue to be misconfigured. We prefer to let the default lnet discovery enabled for now, but it&apos;s good to know that we do have the option to disable it we want to. I&apos;m ok to consider this ticket resolved at this point.&lt;/p&gt;</comment>
                            <comment id="258203" author="pjones" created="Tue, 12 Nov 2019 23:39:07 +0000"  >&lt;p&gt;ok - thanks&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00b4v:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>