Details
-
Bug
-
Resolution: Fixed
-
Minor
-
Lustre 2.12.6
-
None
-
RedHat 8.3
kernel 4.18.0-240.10.1.el8_3.x86_64
lustre 2.12.6
-
3
-
9223372036854775807
Description
On a machine with 4 IB interfaces, I would like to create an LNet multi-rail configuration that takes into account the NUMA location of each interface, in order to get the highest LNet performance.
I have tried several LNet configurations, but none of them allows a local binding of each interface.
Here is the NUMA description of the machine. The IB devices ib0, ib1, ib2, and ib3 are located on NUMA nodes 1, 3, 5, and 7, respectively.
# numactl -H
available: 8 nodes (0-7)
node 0 cpus: 0 1 2 3 4 5 48 49 50 51 52 53
node 0 size: 63832 MB
node 0 free: 60103 MB
node 1 cpus: 6 7 8 9 10 11 54 55 56 57 58 59
node 1 size: 64268 MB
node 1 free: 39220 MB
node 2 cpus: 12 13 14 15 16 17 60 61 62 63 64 65
node 2 size: 64317 MB
node 2 free: 61323 MB
node 3 cpus: 18 19 20 21 22 23 66 67 68 69 70 71
node 3 size: 64281 MB
node 3 free: 61558 MB
node 4 cpus: 24 25 26 27 28 29 72 73 74 75 76 77
node 4 size: 64269 MB
node 4 free: 60741 MB
node 5 cpus: 30 31 32 33 34 35 78 79 80 81 82 83
node 5 size: 64305 MB
node 5 free: 62450 MB
node 6 cpus: 36 37 38 39 40 41 84 85 86 87 88 89
node 6 size: 64275 MB
node 6 free: 63133 MB
node 7 cpus: 42 43 44 45 46 47 90 91 92 93 94 95
node 7 size: 64337 MB
node 7 free: 62429 MB
node distances:
node 0 1 2 3 4 5 6 7
0: 10 12 12 12 32 32 32 32
1: 12 10 12 12 32 32 32 32
2: 12 12 10 12 32 32 32 32
3: 12 12 12 10 32 32 32 32
4: 32 32 32 32 10 12 12 12
5: 32 32 32 32 12 10 12 12
6: 32 32 32 32 12 12 10 12
7: 32 32 32 32 12 12 12 10
# grep . /sys/class/net/ib*/device/numa_node
/sys/class/net/ib0/device/numa_node:1
/sys/class/net/ib1/device/numa_node:3
/sys/class/net/ib2/device/numa_node:5
/sys/class/net/ib3/device/numa_node:7
By default, the libcfs module configures 8 CPTs, one per NUMA node:
# modprobe -v libcfs
insmod /lib/modules/4.18.0-240.10.1.el8_3.x86_64/weak-updates/lustre-client/net/libcfs.ko
# lctl get_param cpu_partition_table
cpu_partition_table=
0 : 0 1 2 3 4 5 48 49 50 51 52 53
1 : 6 7 8 9 10 11 54 55 56 57 58 59
2 : 12 13 14 15 16 17 60 61 62 63 64 65
3 : 18 19 20 21 22 23 66 67 68 69 70 71
4 : 24 25 26 27 28 29 72 73 74 75 76 77
5 : 30 31 32 33 34 35 78 79 80 81 82 83
6 : 36 37 38 39 40 41 84 85 86 87 88 89
7 : 42 43 44 45 46 47 90 91 92 93 94 95
With configuration 1, no LNet binding is specified, and we observe that each interface is bound to every CPT:
# modprobe -v lnet
insmod /lib/modules/4.18.0-240.10.1.el8_3.x86_64/weak-updates/lustre-client/net/lnet.ko networks=o2ib(ib0,ib1,ib2,ib3)
# lctl net up
LNET configured
# lnetctl net show
net:
- net type: lo
local NI(s):
- nid: 0@lo
status: up
- net type: o2ib
local NI(s):
- nid: 14.128.0.45@o2ib
status: up
interfaces:
0: ib0
- nid: 14.128.0.46@o2ib
status: up
interfaces:
0: ib1
- nid: 14.128.0.47@o2ib
status: up
interfaces:
0: ib2
- nid: 14.128.0.48@o2ib
status: up
interfaces:
0: ib3
# lnetctl net show --verbose | grep -E 'ib|CPT|dev'
dev cpt: 0
CPT: "[0,1,2,3,4,5,6,7]"
- net type: o2ib
- nid: 14.128.0.45@o2ib
0: ib0
dev cpt: 1
CPT: "[0,1,2,3,4,5,6,7]"
- nid: 14.128.0.46@o2ib
0: ib1
dev cpt: 3
CPT: "[0,1,2,3,4,5,6,7]"
- nid: 14.128.0.47@o2ib
0: ib2
dev cpt: 5
CPT: "[0,1,2,3,4,5,6,7]"
- nid: 14.128.0.48@o2ib
0: ib3
dev cpt: 7
CPT: "[0,1,2,3,4,5,6,7]"
With configuration 2, the LNet binding is specified as [1,3,5,7], and we observe that each interface is bound to CPTs 1, 3, 5, and 7. This is better, but still not optimal for performance, because each interface is still bound to CPTs on NUMA nodes other than its own.
# modprobe -v lnet
insmod /lib/modules/4.18.0-240.10.1.el8_3.x86_64/weak-updates/lustre-client/net/lnet.ko networks=o2ib(ib0,ib1,ib2,ib3)[1,3,5,7]
# lctl net up
LNET configured
# lnetctl net show
net:
- net type: lo
local NI(s):
- nid: 0@lo
status: up
- net type: o2ib
local NI(s):
- nid: 14.128.0.45@o2ib
status: up
interfaces:
0: ib0
- nid: 14.128.0.46@o2ib
status: up
interfaces:
0: ib1
- nid: 14.128.0.47@o2ib
status: up
interfaces:
0: ib2
- nid: 14.128.0.48@o2ib
status: up
interfaces:
0: ib3
# lnetctl net show --verbose | grep -E 'ib|CPT|dev'
dev cpt: 0
CPT: "[0,1,2,3,4,5,6,7]"
- net type: o2ib
- nid: 14.128.0.45@o2ib
0: ib0
dev cpt: 1
CPT: "[1,3,5,7]"
- nid: 14.128.0.46@o2ib
0: ib1
dev cpt: 3
CPT: "[1,3,5,7]"
- nid: 14.128.0.47@o2ib
0: ib2
dev cpt: 5
CPT: "[1,3,5,7]"
- nid: 14.128.0.48@o2ib
0: ib3
dev cpt: 7
CPT: "[1,3,5,7]"
Finally, with configuration 3, a fine-grained NUMA binding (one CPT per interface) is specified through an lnetctl YAML import, but it does not seem to be taken into account:
# modprobe -v lnet
insmod /lib/modules/4.18.0-240.10.1.el8_3.x86_64/weak-updates/lustre-client/net/lnet.ko networks=""
# lctl net up
LNET configured
# lnetctl net del --net tcp
# lnetctl net show
net:
- net type: lo
local NI(s):
- nid: 0@lo
status: up
# cat lnetctl.config.txt
net:
- net type: o2ib
local NI(s):
- nid: 14.128.0.45@o2ib
interfaces:
0: ib0
CPT: "[1]"
- nid: 14.128.0.46@o2ib
interfaces:
0: ib1
CPT: "[3]"
- nid: 14.128.0.47@o2ib
interfaces:
0: ib2
CPT: "[5]"
- nid: 14.128.0.48@o2ib
interfaces:
0: ib3
CPT: "[7]"
# lnetctl import lnetctl.config.txt
# echo $?
0
# lnetctl net show
net:
- net type: lo
local NI(s):
- nid: 0@lo
status: up
- net type: o2ib
local NI(s):
- nid: 14.128.0.45@o2ib
status: up
interfaces:
0: ib0
- nid: 14.128.0.46@o2ib
status: up
interfaces:
0: ib1
- nid: 14.128.0.47@o2ib
status: up
interfaces:
0: ib2
- nid: 14.128.0.48@o2ib
status: up
interfaces:
0: ib3
# lnetctl net show --verbose
net:
- net type: lo
local NI(s):
- nid: 0@lo
status: up
statistics:
send_count: 0
recv_count: 0
drop_count: 0
tunables:
peer_timeout: 0
peer_credits: 0
peer_buffer_credits: 0
credits: 0
dev cpt: 0
tcp bonding: 0
CPT: "[0,1,2,3,4,5,6,7]"
- net type: o2ib
local NI(s):
- nid: 14.128.0.45@o2ib
status: up
interfaces:
0: ib0
statistics:
send_count: 0
recv_count: 0
drop_count: 0
tunables:
peer_timeout: 180
peer_credits: 8
peer_buffer_credits: 0
credits: 256
peercredits_hiw: 4
map_on_demand: 0
concurrent_sends: 8
fmr_pool_size: 512
fmr_flush_trigger: 384
fmr_cache: 1
ntx: 512
conns_per_peer: 1
lnd tunables:
dev cpt: 1
tcp bonding: 0
CPT: "[0,1,2,3,4,5,6,7]"
- nid: 14.128.0.46@o2ib
status: up
interfaces:
0: ib1
statistics:
send_count: 0
recv_count: 0
drop_count: 0
tunables:
peer_timeout: 180
peer_credits: 8
peer_buffer_credits: 0
credits: 256
peercredits_hiw: 4
map_on_demand: 0
concurrent_sends: 8
fmr_pool_size: 512
fmr_flush_trigger: 384
fmr_cache: 1
ntx: 512
conns_per_peer: 1
lnd tunables:
dev cpt: 3
tcp bonding: 0
CPT: "[0,1,2,3,4,5,6,7]"
- nid: 14.128.0.47@o2ib
status: up
interfaces:
0: ib2
statistics:
send_count: 0
recv_count: 0
drop_count: 0
tunables:
peer_timeout: 180
peer_credits: 8
peer_buffer_credits: 0
credits: 256
peercredits_hiw: 4
map_on_demand: 0
concurrent_sends: 8
fmr_pool_size: 512
fmr_flush_trigger: 384
fmr_cache: 1
ntx: 512
conns_per_peer: 1
lnd tunables:
dev cpt: 5
tcp bonding: 0
CPT: "[0,1,2,3,4,5,6,7]"
- nid: 14.128.0.48@o2ib
status: up
interfaces:
0: ib3
statistics:
send_count: 0
recv_count: 0
drop_count: 0
tunables:
peer_timeout: 180
peer_credits: 8
peer_buffer_credits: 0
credits: 256
peercredits_hiw: 4
map_on_demand: 0
concurrent_sends: 8
fmr_pool_size: 512
fmr_flush_trigger: 384
fmr_cache: 1
ntx: 512
conns_per_peer: 1
lnd tunables:
dev cpt: 7
tcp bonding: 0
CPT: "[0,1,2,3,4,5,6,7]"
Why has the CPT specified for each interface of the multi-rail LNet configuration not been taken into account?