Details
-
Bug
-
Resolution: Fixed
-
Major
-
Lustre 2.14.0, Lustre 2.12.5
-
None
-
3
-
9223372036854775807
Description
Two threads trying to acquire lnet_net_lock(1):
crash_x86_64> p ((struct cfs_percpt_lock *)0xffff8808299ed880)->pcl_locks[1] $11 = (spinlock_t *) 0xffff88076447c640
*hornc@cflosbld09 200312170801 $ grep -B 4 -A 8 ffff88076447c640 bt.all
PID: 8971 TASK: ffff88076b3c4f00 CPU: 1 COMMAND: "kiblnd_sd_00_00"
[exception RIP: queued_spin_lock_slowpath+283]
RIP: ffffffff810a573b RSP: ffffc90000ea7a80 RFLAGS: 00000202
RAX: 0000000000300101 RBX: ffff8808299ed880 RCX: 0000000000000001
RDX: 0000000000000101 RSI: 0000000000000001 RDI: ffff88076447c640
RBP: ffffc90000ea7a80 R8: 0000000000000101 R9: ffffffffa05d1729
R10: 0000000000000000 R11: ffffffffa05ebb48 R12: 0000000000000008
R13: ffff88082be96600 R14: ffff88082be96600 R15: ffff88071da3e568
CS: 0010 SS: 0018
#0 [ffffc90000ea7a88] _raw_spin_lock at ffffffff816b5411
#1 [ffffc90000ea7a98] cfs_percpt_lock at ffffffffa053c546 [libcfs]
#2 [ffffc90000ea7ac0] lnet_handle_send at ffffffffa05a3bd2 [lnet]
#3 [ffffc90000ea7b28] lnet_select_pathway at ffffffffa05a6e18 [lnet]
--
PID: 8983 TASK: ffff8807167a3080 CPU: 11 COMMAND: "lnet_discovery"
[exception RIP: queued_spin_lock_slowpath+184]
RIP: ffffffff810a56d8 RSP: ffffc900286e3e48 RFLAGS: 00000202
RAX: 0000000000000000 RBX: ffff8808299ed880 RCX: ffff88085fae2a80
RDX: 0000000000300101 RSI: 0000000000000101 RDI: ffff88076447c640
RBP: ffffc900286e3e48 R8: 0000000000300000 R9: 0000000000000000
R10: ffffc900286e3e90 R11: 0000000000000f2f R12: 0000000000000008
R13: ffff880828f021d0 R14: 0000000000000240 R15: 000500280a0c0035
CS: 0010 SS: 0018
#0 [ffffc900286e3e50] _raw_spin_lock at ffffffff816b5411
#1 [ffffc900286e3e60] cfs_percpt_lock at ffffffffa053c546 [libcfs]
#2 [ffffc900286e3e88] lnet_peer_discovery at ffffffffa05bc031 [lnet]
#3 [ffffc900286e3f08] kthread at ffffffff8107e807
*hornc@cflosbld09 200312170801 $
A debug patch tells us which thread is holding this lock:
crash_x86_64> p *((struct cfs_percpt_lock *)0xffff8808299ed880)->pcl_locks_info[1]
$4 = {
pcl_fn_ptr = 0xffffffffa05a69b7 <lnet_select_pathway+103>,
pcl_thr_ptr = 0xffff88082cd6f040
}
crash_x86_64> task 0xffff88082cd6f040 | grep PID
PID: 8965 TASK: ffff88082cd6f040 CPU: 10 COMMAND: "kgnilnd_sd_06"
crash_x86_64> bt 8965
PID: 8965 TASK: ffff88082cd6f040 CPU: 10 COMMAND: "kgnilnd_sd_06"
[exception RIP: queued_spin_lock_slowpath+180]
RIP: ffffffff810a56d4 RSP: ffffc90000d73ad8 RFLAGS: 00000202
RAX: 0000000000000000 RBX: ffff880828f02100 RCX: ffff88085faa2a80
RDX: 00000000002c0101 RSI: 0000000000000101 RDI: ffff880828f02184
RBP: ffffc90000d73ad8 R8: 00000000002c0000 R9: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: ffff88071e12d568
R13: ffff880828f02100 R14: 0000000000000001 R15: ffffffffffffffff
CS: 0010 SS: 0018
#0 [ffffc90000d73ae0] _raw_spin_lock at ffffffff816b5411
#1 [ffffc90000d73af0] lnet_peer_gw_discovery at ffffffffa05bd329 [lnet]
#2 [ffffc90000d73b08] lnet_initiate_peer_discovery at ffffffffa05a31c3 [lnet]
#3 [ffffc90000d73b40] lnet_select_pathway at ffffffffa05a6a4c [lnet]
#4 [ffffc90000d73c40] lnet_send at ffffffffa05a7af5 [lnet]
#5 [ffffc90000d73c60] lnet_finalize at ffffffffa05999fa [lnet]
#6 [ffffc90000d73cc0] kgnilnd_tx_done at ffffffffa064cba4 [kgnilnd]
#7 [ffffc90000d73d20] kgnilnd_check_fma_send_cq at ffffffffa0655884 [kgnilnd]
#8 [ffffc90000d73e78] kgnilnd_scheduler at ffffffffa065a6ca [kgnilnd]
#9 [ffffc90000d73f08] kthread at ffffffff8107e807
#10 [ffffc90000d73f50] ret_from_fork at ffffffff8180024a
crash_x86_64>
This thread is trying to get lp_lock:
bool
lnet_peer_gw_discovery(struct lnet_peer *lp)
{
bool rc = false;
spin_lock(&lp->lp_lock); <<<< Thread here
The peer in question:
crash_x86_64> struct -o lnet_peer | grep lp_lock
[132] spinlock_t lp_lock;
crash_x86_64> eval ffff880828f02184 - 132
hexadecimal: ffff880828f02100
decimal: 18446612167360782592 (-131906348769024)
octal: 1777774200405074020400
binary: 1111111111111111100010000000100000101000111100000010000100000000
crash_x86_64>
From backtrace info in the dump we can see a few threads with this address in their stack.
One is a kgnilnd_sd thread:
PID: 8960 TASK: ffff8807644aa0c0 CPU: 13 COMMAND: "kgnilnd_sd_01"
This thread is finalizing a message. It is stuck acquiring an lp_lock in lnet_return_rx_credits_locked().
lnet_return_rx_credits_locked(struct lnet_msg *msg)
{
...
lp = rxpeerni->lpni_peer_net->lpn_peer;
/* give back peer router credits */
msg->msg_peerrtrcredit = 0;
spin_lock(&rxpeerni->lpni_lock);
spin_lock(&lp->lp_lock); <<<<<<<<<< Thread is here
Meanwhile, a kiblnd_sd thread is processing a discovery reply:
crash_x86_64> bt 8977
PID: 8977 TASK: ffff88076550afc0 CPU: 15 COMMAND: "kiblnd_sd_03_00"
[exception RIP: queued_spin_lock_slowpath+281]
RIP: ffffffff810a5739 RSP: ffffc90000f13ae8 RFLAGS: 00000202
RAX: 0000000000000101 RBX: ffff88082cfe3200 RCX: 0000000000000001
RDX: 0000000000000101 RSI: 0000000000000001 RDI: ffff88082cfe32a4
RBP: ffffc90000f13ae8 R8: 0000000000000101 R9: 000000000000000f
R10: 0000000000000001 R11: ffff8806e12b7eea R12: ffff880828f02100
R13: ffff880828cb2209 R14: ffff880828f02184 R15: ffff880828cb2200
CS: 0010 SS: 0018
#0 [ffffc90000f13af0] _raw_spin_lock at ffffffff816b5411
#1 [ffffc90000f13b00] lnet_peer_ni_clr_non_mr_pref_nid at ffffffffa05b81eb [lnet]
#2 [ffffc90000f13b20] lnet_peer_clr_non_mr_pref_nids at ffffffffa05b82db [lnet]
#3 [ffffc90000f13b40] lnet_discovery_event_handler at ffffffffa05be2cf [lnet]
It is holding lp_lock.
lnet_discovery_event_reply(struct lnet_peer *lp, struct lnet_event *ev)
{
...
spin_lock(&lp->lp_lock);
...
CDEBUG(D_NET, "peer %s(%p) is MR capable\n",
libcfs_nid2str(lp->lp_primary_nid), lp);
lp->lp_state |= LNET_PEER_MULTI_RAIL;
lnet_peer_clr_non_mr_pref_nids(lp);
void
lnet_peer_clr_non_mr_pref_nids(struct lnet_peer *lp)
{
struct lnet_peer_ni *lpni = NULL;
while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL)
lnet_peer_ni_clr_non_mr_pref_nid(lpni);
}
lnet_peer_ni_clr_non_mr_pref_nid(struct lnet_peer_ni *lpni)
{
...
spin_lock(&lpni->lpni_lock); <<<< Thread is here
Thread A: spin_lock(lp_lock)
Thread B: spin_lock(lpni_lock)
Thread A: spin_lock(lpni_lock)
Thread B: spin_lock(lp_lock)
Deadlock.
Since these two threads are deadlocked, this is preventing PID 8965 (which is holding lnet_net_lock(1)) from making progress. Thus causing further deadlocks with threads trying to acquire lnet_net_lock(1).
Same issue exists with lnet_post_routed_recv_locked()