Description
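When the IP pattern in the ip2nets parameter matches an address bound to one interface of the node, but a different interface is explicitly named in the same rule, LNet may activate the LND on the explicitly named interface, using that interface's IP address even though it does not match the pattern.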
This may be related to the following LU tickets:
https://jira.whamcloud.com/browse/LU-7563
https://jira.whamcloud.com/browse/LU-11859
The Note/example on page 69 of the Lustre manual (https://build.whamcloud.com/job/lustre-manual/lastSuccessfulBuild/artifact/lustre_manual.pdf) does not match what actually happens. The manual states:
If an interface is explicitly specified as well as a pattern, the interface matched using the IP pattern will be sanitized against the explicitly-defined interface. For example, tcp(eth0) 192.168.*.3 and there exists in the system eth0 == 192.158.19.3 and eth1 == 192.168.3.3, then the configuration will fail, because the pattern contradicts the interface specified. A clear warning will be displayed if inconsistent configuration is encountered.
Example showing the issue with the stock lnet.ko, using the interface addresses and ip2nets pattern from the manual's example above:
[root@lustre.test ~]# ip netns add test
[root@lustre.test ~]# ip -n test link add eth0 type dummy
[root@lustre.test ~]# ip -n test link add eth1 type dummy
[root@lustre.test ~]# ip -n test addr add 127.0.0.1/8 brd + dev lo
[root@lustre.test ~]# ip -n test addr add 192.158.19.3/24 brd + dev eth0
[root@lustre.test ~]# ip -n test addr add 192.168.3.3/24 brd + dev eth1
[root@lustre.test ~]# ip -n test link set up dev lo
[root@lustre.test ~]# ip -n test link set up dev eth0
[root@lustre.test ~]# ip -n test link set up dev eth1
[root@lustre.test ~]# ip netns exec test bash
[root@lustre.test ~]# ip -4 addr list
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
inet 127.0.0.1/8 brd 127.255.255.255 scope host lo
valid_lft forever preferred_lft forever
2: eth0: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN group default qlen 1000
inet 192.158.19.3/24 brd 192.158.19.255 scope global eth0
valid_lft forever preferred_lft forever
3: eth1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN group default qlen 1000
inet 192.168.3.3/24 brd 192.168.3.255 scope global eth1
valid_lft forever preferred_lft forever
[root@lustre.test ~]# modinfo -F version lnet
0.7.0
[root@lustre.test ~]# modinfo -F version lustre
2.15.2
[root@lustre.test ~]# cat /etc/modprobe.d/lnet.conf
options lnet "ip2nets=tcp(eth0) 192.168.*.3"
[root@lustre.test ~]# modprobe lnet
[root@lustre.test ~]# cat /sys/module/lnet/parameters/ip2nets
tcp(eth0) 192.168.*.3
[root@lustre.test ~]# lnetctl net show
show:
- net:
errno: -100
descr: "cannot get networks: Network is down"
[root@lustre.test ~]# lnetctl lnet configure --all
[root@lustre.test ~]# lnetctl net show
net:
- net type: lo
local NI(s):
- nid: 0@lo
status: up
- net type: tcp
local NI(s):
- nid: 192.158.19.3@tcp
status: up
interfaces:
0: eth0
[root@lustre.test ~]# dmesg
LNet: HW NUMA nodes: 2, HW CPU cores: 56, npartitions: 2
alg: No test for adler32 (adler32-zlib)
Key type ._llcrypt registered
Key type .llcrypt registered
LNet: Added LNI 192.158.19.3@tcp [8/256/0/180]
LNet: Accept secure, port 988
The same steps with the patched lnet.ko (the IP stack setup steps are not repeated here):
[root@lustre.test ~]# ip netns exec test bash
[root@lustre.test ~]# ip -4 addr list
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
inet 127.0.0.1/8 brd 127.255.255.255 scope host lo
valid_lft forever preferred_lft forever
2: eth0: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN group default qlen 1000
inet 192.158.19.3/24 brd 192.158.19.255 scope global eth0
valid_lft forever preferred_lft forever
3: eth1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN group default qlen 1000
inet 192.168.3.3/24 brd 192.168.3.255 scope global eth1
valid_lft forever preferred_lft forever
[root@lustre.test ~]# cat /etc/modprobe.d/lnet.conf
options lnet "ip2nets=tcp(eth0) 192.168.*.3"
[root@lustre.test ~]# modprobe lnet
[root@lustre.test ~]# cat /sys/module/lnet/parameters/ip2nets
tcp(eth0) 192.168.*.3
[root@lustre.test ~]# lnetctl net show
show:
- net:
errno: -100
descr: "cannot get networks: Network is down"
[root@lustre.test ~]# lnetctl lnet configure --all
configure:
- lnet:
errno: -22
descr: "LNet configure error: Invalid argument"
[root@lustre.test ~]# dmesg
LNet: HW NUMA nodes: 2, HW CPU cores: 56, npartitions: 2
alg: No test for adler32 (adler32-zlib)
Key type ._llcrypt registered
Key type .llcrypt registered
LNetError: 11a-a: ip2nets does not match any local IP interfaces
LNetError: 47753:0:(config.c:574:lnet_parse_networks()) networks string is undefined
[root@lustre.test ~]# modprobe -r lnet ; dmesg -C
[root@lustre.test ~]# modprobe lnet "ip2nets=tcp(eth0,eth10,eth1) 192.168.*.3"
[root@lustre.test ~]# cat /sys/module/lnet/parameters/ip2nets
tcp(eth0,eth10,eth1) 192.168.*.3
[root@lustre.test ~]# lnetctl net show
show:
- net:
errno: -100
descr: "cannot get networks: Network is down"
[root@lustre.test ~]# lnetctl lnet configure --all
[root@lustre.test ~]# lnetctl net show
net:
- net type: lo
local NI(s):
- nid: 0@lo
status: up
- net type: tcp
local NI(s):
- nid: 192.168.3.3@tcp
status: up
interfaces:
0: eth1
[root@lustre.test ~]# dmesg
LNet: HW NUMA nodes: 2, HW CPU cores: 56, npartitions: 2
alg: No test for adler32 (adler32-zlib)
Key type ._llcrypt registered
Key type .llcrypt registered
LNet: ip2nets matched tcp(eth1) for 192.168.*.3
LNet: Added LNI 192.168.3.3@tcp [8/256/0/180]
LNet: Accept secure, port 988
I wrote the patch against the 2.15.2 tag in the hope of making the code follow my interpretation of the documentation for the module parameter. I hope the place where I added the check is the correct one.