Details
-
Bug
-
Resolution: Fixed
-
Minor
-
Lustre 2.15.0
-
None
-
3
-
9223372036854775807
Description
Gerrit Janitor testing with lockdep enabled reports a potential lock inversion problem on the client console log during runtests:
http://testing.linuxhacker.ru:3333/lustre-reports/22787/results.html
http://testing.linuxhacker.ru:3333/lustre-reports/22787/testresults/runtests-rhel8-zfs-rhel8.5_x86_64-rhel8.5_x86_64/
[ 272.552541]
[ 272.553139] ========================================================
[ 272.554212] WARNING: possible irq lock inversion dependency detected
[ 272.555211] 4.18.0rh8.5-debug #2 Tainted: G O --------- - -
[ 272.556374] --------------------------------------------------------
[ 272.557413] socknal_sd01_01/2126 just changed the state of lock:
[ 272.558402] ffffffffa0593070 (&ksocknal_data.ksnd_global_lock){++.?}-{2:2}, at: ksocknal_data_ready+0x2b/0x110 [ksocklnd]
[ 272.560260] but this lock took another, SOFTIRQ-unsafe lock in the past:
[ 272.561416] (&tcd->tcd_lock){+.+.}-{2:2}
[ 272.561419] and interrupts could create inverse lock ordering between them.
[ 272.564283]
[ 272.564283] other info that might help us debug this:
[ 272.565401] Possible interrupt unsafe locking scenario:
[ 272.565401]
[ 272.566542] CPU0 CPU1
[ 272.567437] ---- ----
[ 272.568273] lock(&tcd->tcd_lock);
[ 272.568819] local_irq_disable();
[ 272.570068] lock(&ksocknal_data.ksnd_global_lock);
[ 272.571353] lock(&tcd->tcd_lock);
[ 272.572443] <Interrupt>
[ 272.573004] lock(&ksocknal_data.ksnd_global_lock);
[ 272.573874]
[ 272.573874] *** DEADLOCK ***
[ 272.573874]
[ 272.574864] 5 locks held by socknal_sd01_01/2126:
[ 272.575750] #0: ffff888118da5918 (&keys[i]){+.+.}-{2:2}, at: cfs_percpt_lock+0xaa/0x170 [libcfs]
[ 272.577328] #1: ffff888118da5a18 (&keys[i]#2){+.+.}-{2:2}, at: cfs_percpt_lock+0xaa/0x170 [libcfs]
[ 272.579057] #2: ffffffff82f2e400 (rcu_read_lock){....}-{1:2}, at: netif_receive_skb_internal+0x4d/0x2d0
[ 272.581396] #3: ffffffff82f2e400 (rcu_read_lock){....}-{1:2}, at: ip_local_deliver_finish+0x35/0x210
[ 272.583011] #4: ffff88810a1b4420 (k-slock-AF_INET/1){+.-.}-{2:2}, at: tcp_v4_rcv+0x1609/0x1990
[ 272.584439]
[ 272.584439] the shortest dependencies between 2nd lock and 1st lock:
[ 272.585764] -> (&tcd->tcd_lock){+.+.}-{2:2} ops: 1335 {
[ 272.586649] HARDIRQ-ON-W at:
[ 272.587201] lock_acquire+0x16a/0x6f0
[ 272.588169] _raw_spin_lock+0x50/0xd0
[ 272.589097] cfs_trace_lock_tcd+0x2f/0xc0 [libcfs]
[ 272.590182] libcfs_debug_msg+0x13f/0xf70 [libcfs]
[ 272.591289] cfs_cpu_init+0x17b/0x3b0 [libcfs]
[ 272.592324] seq_client_rpc+0xbc/0x760 [fid]
[ 272.593302] do_one_initcall+0xbe/0x620
[ 272.594279] do_init_module+0xaa/0x350
[ 272.595190] load_module+0x19c9/0x1d50
[ 272.596181] __do_sys_finit_module+0x138/0x190
[ 272.597185] __x64_sys_finit_module+0x1e/0x30
[ 272.598138] do_syscall_64+0xd4/0x5a0
[ 272.599040] entry_SYSCALL_64_after_hwframe+0x6a/0xdf
[ 272.600301] SOFTIRQ-ON-W at:
[ 272.600895] lock_acquire+0x16a/0x6f0
[ 272.601875] _raw_spin_lock+0x50/0xd0
[ 272.602815] cfs_trace_lock_tcd+0x2f/0xc0 [libcfs]
[ 272.604134] libcfs_debug_msg+0x13f/0xf70 [libcfs]
[ 272.605273] cfs_cpu_init+0x17b/0x3b0 [libcfs]
[ 272.606380] seq_client_rpc+0xbc/0x760 [fid]
[ 272.607438] do_one_initcall+0xbe/0x620
[ 272.608545] do_init_module+0xaa/0x350
[ 272.609478] load_module+0x19c9/0x1d50
[ 272.610396] __do_sys_finit_module+0x138/0x190
[ 272.611412] __x64_sys_finit_module+0x1e/0x30
[ 272.612456] do_syscall_64+0xd4/0x5a0
[ 272.613387] entry_SYSCALL_64_after_hwframe+0x6a/0xdf
[ 272.614497] INITIAL USE at:
[ 272.615027] lock_acquire+0x16a/0x6f0
[ 272.615906] _raw_spin_lock+0x50/0xd0
[ 272.616941] cfs_trace_lock_tcd+0x2f/0xc0 [libcfs]
[ 272.618009] libcfs_debug_msg+0x13f/0xf70 [libcfs]
[ 272.619127] cfs_cpu_init+0x17b/0x3b0 [libcfs]
[ 272.620271] seq_client_rpc+0xbc/0x760 [fid]
[ 272.621196] do_one_initcall+0xbe/0x620
[ 272.622144] do_init_module+0xaa/0x350
[ 272.623073] load_module+0x19c9/0x1d50
[ 272.624015] __do_sys_finit_module+0x138/0x190
[ 272.625051] __x64_sys_finit_module+0x1e/0x30
[ 272.626041] do_syscall_64+0xd4/0x5a0
[ 272.626935] entry_SYSCALL_64_after_hwframe+0x6a/0xdf
[ 272.628429] }
[ 272.628820] ... key at: [<ffffffffa0436360>] __key.50710+0x0/0xfffffffffffdcca0 [libcfs]
[ 272.630665] ... acquired at:
[ 272.631366] __lock_acquire+0x67f/0xe40
[ 272.632312] lock_acquire+0x16a/0x6f0
[ 272.633164] _raw_spin_lock+0x50/0xd0
[ 272.633964] cfs_trace_lock_tcd+0x2f/0xc0 [libcfs]
[ 272.634965] libcfs_debug_msg+0x13f/0xf70 [libcfs]
[ 272.635974] ksocknal_find_peer_locked+0x261/0x270 [ksocklnd]
[ 272.637086] ksocknal_launch_packet+0x67/0x660 [ksocklnd]
[ 272.638097] ksocknal_send+0x15e/0x3d0 [ksocklnd]
[ 272.639004] lnet_ni_send+0x68/0x150 [lnet]
[ 272.639793] lnet_send+0xc5/0x270 [lnet]
[ 272.640594] LNetGet+0x50b/0xc60 [lnet]
[ 272.641361] lnet_send_ping+0x143/0x2b0 [lnet]
[ 272.642299] lnet_peer_send_ping+0xdc/0x320 [lnet]
[ 272.643382] lnet_peer_discovery+0x163f/0x2260 [lnet]
[ 272.644438] kthread+0x1ae/0x1e0
[ 272.645224] ret_from_fork+0x24/0x50
[ 272.646083]
[ 272.646335] -> (&ksocknal_data.ksnd_global_lock){++.?}-{2:2} ops: 4782 {
[ 272.647517] HARDIRQ-ON-W at:
[ 272.648125] lock_acquire+0x16a/0x6f0
[ 272.649214] _raw_write_lock_bh+0x60/0xe0
[ 272.651188] ksocknal_launch_packet+0xb4/0x660 [ksocklnd]
[ 272.653816] ksocknal_send+0x15e/0x3d0 [ksocklnd]
[ 272.655566] lnet_ni_send+0x68/0x150 [lnet]
[ 272.657191] lnet_send+0xc5/0x270 [lnet]
[ 272.659140] LNetGet+0x50b/0xc60 [lnet]
[ 272.661052] lnet_send_ping+0x143/0x2b0 [lnet]
[ 272.662410] lnet_peer_send_ping+0xdc/0x320 [lnet]
[ 272.663597] lnet_peer_discovery+0x163f/0x2260 [lnet]
[ 272.664725] kthread+0x1ae/0x1e0
[ 272.665555] ret_from_fork+0x24/0x50
[ 272.666573] HARDIRQ-ON-R at:
[ 272.667113] lock_acquire+0x16a/0x6f0
[ 272.667938] _raw_read_lock+0xd7/0x120
[ 272.668797] ksocknal_check_peer_timeouts+0x45/0xd70 [ksocklnd]
[ 272.670193] ksocknal_reaper+0x4b8/0x6c0 [ksocklnd]
[ 272.671270] kthread+0x1ae/0x1e0
[ 272.672099] ret_from_fork+0x24/0x50
[ 272.673034] IN-SOFTIRQ-R at:
[ 272.673560] lock_acquire+0x16a/0x6f0
[ 272.674375] _raw_read_lock+0x64/0x120
[ 272.675268] ksocknal_data_ready+0x2b/0x110 [ksocklnd]
[ 272.676414] tcp_data_ready+0x2f/0x1e0
[ 272.677336] tcp_data_queue+0x47a/0xb30
[ 272.678261] tcp_rcv_established+0x2d1/0xb90
[ 272.679280] tcp_v4_do_rcv+0x185/0x320
[ 272.680522] tcp_v4_rcv+0x169d/0x1990
[ 272.681552] ip_protocol_deliver_rcu+0x47/0x670
[ 272.682994] ip_local_deliver_finish+0xce/0x210
[ 272.684640] ip_local_deliver+0xcd/0x380
[ 272.685698] ip_rcv_finish+0x39e/0xbf0
[ 272.686942] ip_rcv+0x332/0x733
[ 272.688142] __netif_receive_skb_core+0x1503/0x1790
[ 272.689282] __netif_receive_skb+0x2b/0x90
[ 272.690932] netif_receive_skb_internal+0xd0/0x2d0
[ 272.692038] napi_gro_receive+0x25c/0x350
[ 272.693237] receive_buf+0x2aa/0x7f0
[ 272.694635] virtnet_poll+0x245/0x498
[ 272.695680] __napi_poll+0x41/0x3c0
[ 272.696725] net_rx_action+0x3cf/0x530
[ 272.697661] __do_softirq+0x139/0x879
[ 272.698689] irq_exit+0x28f/0x2d0
[ 272.699706] smp_apic_timer_interrupt+0x17c/0x530
[ 272.701015] apic_timer_interrupt+0xf/0x20
[ 272.702049] lock_is_held_type+0x61/0x230
[ 272.703010] rcu_read_lock_sched_held+0x6c/0xc0
[ 272.704188] lock_acquired+0x602/0x660
[ 272.705250] _raw_spin_lock+0x82/0xd0
[ 272.706196] cfs_percpt_lock+0xaa/0x170 [libcfs]
[ 272.707331] lnet_discovery_event_handler+0x25b/0x800 [lnet]
[ 272.708717] lnet_finalize+0xa28/0xe60 [lnet]
[ 272.709737] ksocknal_tx_done+0xc1/0x190 [ksocklnd]
[ 272.711087] ksocknal_scheduler+0x938/0x10e0 [ksocklnd]
[ 272.712299] kthread+0x1ae/0x1e0
[ 272.713143] ret_from_fork+0x24/0x50
[ 272.714040] SOFTIRQ-ON-R at:
[ 272.714634] lock_acquire+0x16a/0x6f0
[ 272.715459] _raw_read_lock+0xd7/0x120
[ 272.716504] ksocknal_check_peer_timeouts+0x45/0xd70 [ksocklnd]
[ 272.717729] ksocknal_reaper+0x4b8/0x6c0 [ksocklnd]
[ 272.718885] kthread+0x1ae/0x1e0
[ 272.719739] ret_from_fork+0x24/0x50
[ 272.720693] INITIAL USE at:
[ 272.721348] lock_acquire+0x16a/0x6f0
[ 272.722329] _raw_write_lock_bh+0x60/0xe0
[ 272.723327] ksocknal_launch_packet+0xb4/0x660 [ksocklnd]
[ 272.724560] ksocknal_send+0x15e/0x3d0 [ksocklnd]
[ 272.725650] lnet_ni_send+0x68/0x150 [lnet]
[ 272.726659] lnet_send+0xc5/0x270 [lnet]
[ 272.727623] LNetGet+0x50b/0xc60 [lnet]
[ 272.728748] lnet_send_ping+0x143/0x2b0 [lnet]
[ 272.729682] lnet_peer_send_ping+0xdc/0x320 [lnet]
[ 272.730662] lnet_peer_discovery+0x163f/0x2260 [lnet]
[ 272.731812] kthread+0x1ae/0x1e0
[ 272.732731] ret_from_fork+0x24/0x50
[ 272.733550] INITIAL READ USE at:
[ 272.734153] lock_acquire+0x16a/0x6f0
[ 272.735183] _raw_read_lock+0xd7/0x120
[ 272.736515] ksocknal_check_peer_timeouts+0x45/0xd70 [ksocklnd]
[ 272.737954] ksocknal_reaper+0x4b8/0x6c0 [ksocklnd]
[ 272.739108] kthread+0x1ae/0x1e0
[ 272.740205] ret_from_fork+0x24/0x50
[ 272.741169] }
[ 272.741498] ... key at: [<ffffffffa0596580>] __key.91535+0x0/0xfffffffffffeca80 [ksocklnd]
[ 272.742977] ... acquired at:
[ 272.743471] __lock_acquire+0xbbc/0xe40
[ 272.744322] lock_acquire+0x16a/0x6f0
[ 272.744995] _raw_read_lock+0x64/0x120
[ 272.745664] ksocknal_data_ready+0x2b/0x110 [ksocklnd]
[ 272.746577] tcp_data_ready+0x2f/0x1e0
[ 272.747298] tcp_data_queue+0x47a/0xb30
[ 272.747983] tcp_rcv_established+0x2d1/0xb90
[ 272.748799] tcp_v4_do_rcv+0x185/0x320
[ 272.749655] tcp_v4_rcv+0x169d/0x1990
[ 272.750325] ip_protocol_deliver_rcu+0x47/0x670
[ 272.751137] ip_local_deliver_finish+0xce/0x210
[ 272.751985] ip_local_deliver+0xcd/0x380
[ 272.752812] ip_rcv_finish+0x39e/0xbf0
[ 272.753494] ip_rcv+0x332/0x733
[ 272.754113] __netif_receive_skb_core+0x1503/0x1790
[ 272.755006] __netif_receive_skb+0x2b/0x90
[ 272.755683] netif_receive_skb_internal+0xd0/0x2d0
[ 272.756696] napi_gro_receive+0x25c/0x350
[ 272.757425] receive_buf+0x2aa/0x7f0
[ 272.758060] virtnet_poll+0x245/0x498
[ 272.758661] __napi_poll+0x41/0x3c0
[ 272.759391] net_rx_action+0x3cf/0x530
[ 272.760385] __do_softirq+0x139/0x879
[ 272.761221] irq_exit+0x28f/0x2d0
[ 272.761790] smp_apic_timer_interrupt+0x17c/0x530
[ 272.763096] apic_timer_interrupt+0xf/0x20
[ 272.763960] lock_is_held_type+0x61/0x230
[ 272.765203] rcu_read_lock_sched_held+0x6c/0xc0
[ 272.766986] lock_acquired+0x602/0x660
[ 272.768470] _raw_spin_lock+0x82/0xd0
[ 272.770022] cfs_percpt_lock+0xaa/0x170 [libcfs]
[ 272.771249] lnet_discovery_event_handler+0x25b/0x800 [lnet]
[ 272.772483] lnet_finalize+0xa28/0xe60 [lnet]
[ 272.773253] ksocknal_tx_done+0xc1/0x190 [ksocklnd]
[ 272.774400] ksocknal_scheduler+0x938/0x10e0 [ksocklnd]
[ 272.775678] kthread+0x1ae/0x1e0
[ 272.776341] ret_from_fork+0x24/0x50