Details
-
Bug
-
Resolution: Fixed
-
Minor
-
None
-
None
-
3
-
9223372036854775807
Description
Multiple Lustre client nodes are reporting the same warning in their syslog, raised from the lnet_discovery process:
[Wed Feb 14 12:21:10 2024] memcpy: detected field-spanning write (size 64) of single field "&lp->lp_data->pb_info" at lnet/lnet/peer.c:2456 (size 16) [Wed Feb 14 12:21:10 2024] WARNING: CPU: 175 PID: 151690 at lnet/lnet/peer.c:2456 lnet_peer_push_event+0xcb5/0xcd0 [lnet] [Wed Feb 14 12:21:10 2024] Modules linked in: mgc(OE) lustre(OE) ptlrpc_gss(OE) mdc(OE) fid(OE) lov(OE) osc(OE) lmv(OE) fld(OE) ko2iblnd(OE) ptlrpc(OE) obdclass(OE) lnet(OE) libcfs(OE) msr nvidia_uvm(OE) rpcsec_gss_krb5 auth_rpcgss nfsv4 nfs lockd grace fscache netfs nvidia_modeset(OE) gdrdrv(POE) uio_pci_generic uio nvme_fabrics intel_rapl_msr intel_rapl_common i10nm_edac nfit x86_pkg_temp_thermal intel_powerclamp coretemp rdma_ucm(OE) sch_fq_codel nvidia_peermem(POE) rdma_cm(OE) iw_cm(OE) kvm_intel kvm ib_ipoib(OE) crct10dif_pclmul crc32_pclmul ghash_clmulni_intel ib_cm(OE) ast aesni_intel drm_vram_helper mxm_wmi drm_ttm_helper ib_umad(OE) pmt_telemetry intel_th_gth ttm pmt_crashlog crypto_simd sunrpc binfmt_misc drm_kms_helper pmt_class ipmi_ssif cryptd mlx5_ib(OE) cec mei_me qat_4xxx rc_core raid0 i2c_algo_bit fb_sys_fops video intel_th_pci intel_qat syscopyarea isst_if_mbox_pci isst_if_mmio idxd sysfillrect rapl i2c_i801 ib_uverbs(OE) mei sysimgblt xhci_pci switchtec authenc idxd_bus [Wed Feb 14 12:21:10 2024] isst_if_common i2c_ismt xhci_pci_renesas intel_th intel_pmt i2c_smbus wmi acpi_ipmi ipmi_si ipmi_devintf pinctrl_emmitsburg ipmi_msghandler mac_hid nvidia(OE) ib_core(OE) efi_pstore drm ip_tables x_tables autofs4 virtiofs cuse overlay ice nvme i40e mlx5_core(OE) mlxdevm(OE) pci_hyperv_intf nvme_core mlxfw(OE) psample tls mlx_compat(OE) ixgbe xfrm_algo dca mdio [last unloaded: mst_pciconf] [Wed Feb 14 12:21:10 2024] CPU: 175 PID: 151690 Comm: lnet_discovery Tainted: P OE 5.15.0-94-generic #104-Ubuntu ........... 
[Wed Feb 14 12:21:10 2024] RIP: 0010:lnet_peer_push_event+0xcb5/0xcd0 [lnet] [Wed Feb 14 12:21:10 2024] Code: 00 00 41 83 e7 01 75 24 b9 10 00 00 00 48 c7 c2 00 66 89 c1 4c 89 f6 48 c7 c7 78 66 89 c1 c6 05 3b 0b 02 00 01 e8 fa 0e 49 ed <0f> 0b 49 8b bc 24 a0 00 00 00 e9 49 f8 ff ff 66 66 2e 0f 1f 84 00 [Wed Feb 14 12:21:10 2024] RSP: 0018:ff6360da86163b68 EFLAGS: 00010286 [Wed Feb 14 12:21:10 2024] RAX: 0000000000000000 RBX: ff40777104828409 RCX: 0000000000000027 [Wed Feb 14 12:21:10 2024] RDX: ff40786cbf1e0588 RSI: 0000000000000001 RDI: ff40786cbf1e0580 [Wed Feb 14 12:21:10 2024] RBP: ff6360da86163b98 R08: 0000000000000003 R09: ffffffffffe978a8 [Wed Feb 14 12:21:10 2024] R10: ff40786cbe0978b0 R11: 0000000000000001 R12: ff4076738db23c00 [Wed Feb 14 12:21:10 2024] R13: ff4076738db23c94 R14: 0000000000000040 R15: 0000000000000000 [Wed Feb 14 12:21:10 2024] FS: 0000000000000000(0000) GS:ff40786cbf1c0000(0000) knlGS:0000000000000000 [Wed Feb 14 12:21:10 2024] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [Wed Feb 14 12:21:10 2024] CR2: 00007f28658f5a10 CR3: 00000139ce810002 CR4: 0000000000771ee0 [Wed Feb 14 12:21:10 2024] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [Wed Feb 14 12:21:10 2024] DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400 [Wed Feb 14 12:21:10 2024] PKRU: 55555554 [Wed Feb 14 12:21:10 2024] Call Trace: [Wed Feb 14 12:21:10 2024] <TASK> [Wed Feb 14 12:21:10 2024] ? show_trace_log_lvl+0x1d6/0x2ea [Wed Feb 14 12:21:10 2024] ? show_trace_log_lvl+0x1d6/0x2ea [Wed Feb 14 12:21:10 2024] ? lnet_push_target_event_handler+0x46/0x110 [lnet] [Wed Feb 14 12:21:10 2024] ? show_regs.part.0+0x23/0x29 [Wed Feb 14 12:21:10 2024] ? show_regs.cold+0x8/0xd [Wed Feb 14 12:21:10 2024] ? lnet_peer_push_event+0xcb5/0xcd0 [lnet] [Wed Feb 14 12:21:10 2024] ? __warn+0x8c/0x100 [Wed Feb 14 12:21:10 2024] ? lnet_peer_push_event+0xcb5/0xcd0 [lnet] [Wed Feb 14 12:21:10 2024] ? report_bug+0xa4/0xd0 [Wed Feb 14 12:21:10 2024] ? 
handle_bug+0x39/0x90 [Wed Feb 14 12:21:10 2024] ? exc_invalid_op+0x19/0x70 [Wed Feb 14 12:21:10 2024] ? asm_exc_invalid_op+0x1b/0x20 [Wed Feb 14 12:21:10 2024] ? lnet_peer_push_event+0xcb5/0xcd0 [lnet] [Wed Feb 14 12:21:10 2024] ? lnet_peer_push_event+0xcb5/0xcd0 [lnet] [Wed Feb 14 12:21:10 2024] lnet_push_target_event_handler+0x46/0x110 [lnet] [Wed Feb 14 12:21:10 2024] lnet_finalize+0x5fd/0x12d0 [lnet] [Wed Feb 14 12:21:10 2024] ? lnet_me_unlink+0xbf/0xd0 [lnet] [Wed Feb 14 12:21:10 2024] ? lnet_swap_pinginfo+0x70/0x70 [lnet] [Wed Feb 14 12:21:10 2024] ? __cond_resched+0x1a/0x50 [Wed Feb 14 12:21:10 2024] ? lnet_copy_iov2kiov+0x15d/0x2e0 [lnet] [Wed Feb 14 12:21:10 2024] kiblnd_recv+0xcf/0x720 [ko2iblnd] [Wed Feb 14 12:21:10 2024] ? lnet_try_match_md+0x1e7/0x2f0 [lnet] [Wed Feb 14 12:21:10 2024] lnet_ni_recv+0x106/0x2e0 [lnet] [Wed Feb 14 12:21:10 2024] lnet_recv_put+0x8f/0xc0 [lnet] [Wed Feb 14 12:21:10 2024] lnet_recv_delayed_msg_list+0x160/0x280 [lnet] [Wed Feb 14 12:21:10 2024] LNetMDAttach+0x109/0x230 [lnet] [Wed Feb 14 12:21:10 2024] lnet_push_target_post+0xcf/0x250 [lnet] [Wed Feb 14 12:21:10 2024] ? lnet_swap_pinginfo+0x70/0x70 [lnet]
Looking at the relevant source code and data structures:
2307 /* 2308 * Handle inbound push. 2309 * Like any event handler, called with lnet_res_lock/CPT held. 2310 */ 2311 void lnet_peer_push_event(struct lnet_event *ev) 2312 { .................. 2452 if (lp->lp_state & LNET_PEER_DATA_PRESENT) { 2453 if (LNET_PING_BUFFER_SEQNO(pbuf) > 2454 LNET_PING_BUFFER_SEQNO(lp->lp_data) && 2455 pbuf->pb_info.pi_nnis <= lp->lp_data->pb_nnis) { 2456 memcpy(&lp->lp_data->pb_info, &pbuf->pb_info, 2457 LNET_PING_INFO_SIZE(pbuf->pb_info.pi_nnis)); 2458 CDEBUG(D_NET, "Ping/Push race from %s: %u vs %u\n", 2459 libcfs_nid2str(lp->lp_primary_nid), 2460 LNET_PING_BUFFER_SEQNO(pbuf), 2461 LNET_PING_BUFFER_SEQNO(lp->lp_data)); 2462 } 2463 goto out; 2464 } 2465 2466 /* 2467 * Allocate a buffer to copy the data. On a failure we drop 2468 * the Push and set FORCE_PING to force the discovery 2469 * thread to fix the problem by pinging the peer. 2470 */ 2471 lp->lp_data = lnet_ping_buffer_alloc(lp->lp_data_nnis, GFP_ATOMIC); 2472 if (!lp->lp_data) { 2473 lp->lp_state |= LNET_PEER_FORCE_PING; 2474 CDEBUG(D_NET, "Cannot allocate Push buffer for %s %u\n", 2475 libcfs_nid2str(lp->lp_primary_nid), 2476 LNET_PING_BUFFER_SEQNO(pbuf)); 2477 goto out; 2478 } 2479 2480 /* Success */ 2481 unsafe_memcpy(&lp->lp_data->pb_info, &pbuf->pb_info, 2482 LNET_PING_INFO_SIZE(pbuf->pb_info.pi_nnis), 2483 FLEXIBLE_OBJECT);"lnet/lnet/peer.c" line 2483 of 4233 --58%-- col 10-24
with
struct lnet_ping_info { __u32 pi_magic; __u32 pi_features; lnet_pid_t pi_pid; __u32 pi_nnis; struct lnet_ni_status pi_ni[]; } WIRE_ATTR; #define LNET_PING_INFO_SIZE(NNIDS) \ offsetof(struct lnet_ping_info, pi_ni[NNIDS])
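Here we can see that the same copy, with the same arguments, is performed with memcpy() at line 2456 and with unsafe_memcpy() at line 2481. In the memcpy() case, the fortified memcpy() compares the copy length (64 bytes in the trace above) against the declared size of the destination field "&lp->lp_data->pb_info" (16 bytes, because the flexible array member pi_ni[] contributes nothing to the structure's size); the compiler cannot determine the run-time size of the copy due to the variable size of the pi_ni[] field/array.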
So this definitely looks like a false positive, and it should be fixed by simply using unsafe_memcpy() at line 2456 as well, as is already done at line 2481.
I will prepare a patch and attach it to this ticket.