[LU-9928] conf-sanity: test_32d IP: [<ffffffffc0bc8083>] lnet_cpt_of_md+0x13/0x260 [lnet] Created: 29/Aug/17 Updated: 05/Aug/20 Resolved: 05/Aug/20 |
|
| Status: | Resolved |
| Project: | Lustre |
| Component/s: | None |
| Affects Version/s: | None |
| Fix Version/s: | None |
| Type: | Bug | Priority: | Minor |
| Reporter: | Maloo | Assignee: | Amir Shehata (Inactive) |
| Resolution: | Duplicate | Votes: | 0 |
| Labels: | None | ||
| Issue Links: |
|
||||||||
| Severity: | 3 | ||||||||
| Rank (Obsolete): | 9223372036854775807 | ||||||||
| Description |
|
This issue was created by maloo for ys <yang.sheng@intel.com> Please provide additional information about the failure here. This issue relates to the following test suite run: https://testing.hpdd.intel.com/test_sets/077a6fce-8ca2-11e7-b50a-5254006e85c2. 06:41:40:[ 7296.487291] BUG: unable to handle kernel paging request at 0000000059a50ce5 06:41:40:[ 7296.488008] IP: [<ffffffffc0bc8083>] lnet_cpt_of_md+0x13/0x260 [lnet] 06:41:40:[ 7296.488008] PGD 7a934067 PUD 0 06:41:40:[ 7296.488008] Oops: 0000 [#1] SMP 06:41:40:[ 7296.488008] Modules linked in: osd_zfs(OE) zfs(POE) obdecho(OE) osc(OE) ptlrpc_gss(OE) ofd(OE) ost(OE) lustre(OE) lmv(OE) mdc(OE) lov(OE) osp(OE) mdd(OE) lod(OE) mdt(OE) lfsck(OE) mgs(OE) mgc(OE) lquota(OE) fid(OE) fld(OE) ksocklnd(OE) ptlrpc(OE) obdclass(OE) lnet(OE) zunicode(POE) zavl(POE) icp(POE) zcommon(POE) znvpair(POE) spl(OE) libcfs(OE) dm_mod rpcsec_gss_krb5 nfsv4 dns_resolver nfs fscache rpcrdma ib_isert iscsi_target_mod ib_iser libiscsi scsi_transport_iscsi ib_srpt target_core_mod crc_t10dif crct10dif_generic ib_srp scsi_transport_srp scsi_tgt ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm ib_core iosf_mbi crc32_pclmul ghash_clmulni_intel aesni_intel ppdev lrw gf128mul glue_helper ablk_helper cryptd joydev pcspkr virtio_balloon nfsd i2c_piix4 parport_pc parport nfs_acl lockd grace auth_rpcgss sunrpc ip_tables ext4 mbcache jbd2 ata_generic pata_acpi cirrus drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm drm virtio_blk ata_piix libata 8139too crct10dif_pclmul crct10dif_common crc32c_intel serio_raw 8139cp virtio_pci virtio_ring virtio mii i2c_core floppy [last unloaded: zfs] 06:41:40:[ 7296.488008] CPU: 1 PID: 4474 Comm: lnet_discovery Tainted: P OE ------------ 3.10.0-693.1.1.el7_lustre.x86_64 #1 06:41:40:[ 7296.488008] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2007 06:41:40:[ 7296.488008] task: ffff88007a5b5ee0 ti: ffff88006a84c000 task.ti: ffff88006a84c000 06:41:40:[ 7296.488008] RIP: 
0010:[<ffffffffc0bc8083>] [<ffffffffc0bc8083>] lnet_cpt_of_md+0x13/0x260 [lnet] 06:41:40:[ 7296.488008] RSP: 0018:ffff88006a84fd10 EFLAGS: 00010202 06:41:40:[ 7296.488008] RAX: 0000000000000000 RBX: 000200000a02023b RCX: 0000000000000001 06:41:40:[ 7296.488008] RDX: 0000000000000001 RSI: 0000000063e679a8 RDI: 0000000059a50c98 06:41:40:[ 7296.488008] RBP: ffff88006a84fd18 R08: 000000000000ffff R09: 000000000000ffff 06:41:40:[ 7296.488008] R10: 20676e69646e6570 R11: 206567617373656d R12: ffff880063e67900 06:41:40:[ 7296.488008] R13: ffff880063e67910 R14: 0000000000000000 R15: 0000000000001030 06:41:40:[ 7296.488008] FS: 0000000000000000(0000) GS:ffff88007fd00000(0000) knlGS:0000000000000000 06:41:40:[ 7296.488008] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 06:41:40:[ 7296.488008] CR2: 0000000059a50ce5 CR3: 000000007a88e000 CR4: 00000000000406e0 06:41:40:[ 7296.488008] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 06:41:40:[ 7296.488008] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 06:41:40:[ 7296.488008] Stack: 06:41:40:[ 7296.488008] 000200000a02023b ffff88006a84fdc8 ffffffffc0bcf33d 0000000046946e10 06:41:40:[ 7296.488008] 0000000046946e10 0000000100000000 ffffffffc0c149e0 000200000a02023b 06:41:40:[ 7296.488008] 0000000000000000 000000000000ffff ffff880063e67910 ffff880063e67900 06:41:40:[ 7296.488008] Call Trace: 06:41:40:[ 7296.488008] [<ffffffffc0bcf33d>] lnet_select_pathway+0x5d/0x1300 [lnet] 06:41:40:[ 7296.488008] [<ffffffffc0bd2531>] lnet_send+0x51/0x180 [lnet] 06:41:40:[ 7296.488008] [<ffffffffc0be1d98>] lnet_peer_discovery_complete+0x178/0x320 [lnet] 06:41:40:[ 7296.488008] [<ffffffffc0be7768>] lnet_peer_discovery+0x588/0x1030 [lnet] 06:41:40:[ 7296.488008] [<ffffffff810b1910>] ? wake_up_atomic_t+0x30/0x30 06:41:40:[ 7296.488008] [<ffffffffc0be71e0>] ? lnet_peer_merge_data+0xde0/0xde0 [lnet] 06:41:40:[ 7296.488008] [<ffffffff810b098f>] kthread+0xcf/0xe0 06:41:40:[ 7296.488008] [<ffffffff810b08c0>] ? 
insert_kthread_work+0x40/0x40 06:41:40:[ 7296.488008] [<ffffffff816b4f18>] ret_from_fork+0x58/0x90 06:41:40:[ 7296.488008] [<ffffffff810b08c0>] ? insert_kthread_work+0x40/0x40 06:41:40:[ 7296.488008] Code: 48 c7 c7 c0 29 c0 c0 e8 8c e7 ac ff 66 90 66 2e 0f 1f 84 00 00 00 00 00 66 66 66 66 90 48 85 ff 0f 84 e2 01 00 00 55 48 89 e5 53 <f6> 47 4d 02 89 f3 74 5b 48 8b 77 68 48 83 fe ff 74 51 8b 0d 11 06:41:40:[ 7296.488008] RIP [<ffffffffc0bc8083>] lnet_cpt_of_md+0x13/0x260 [lnet] 06:41:40:[ 7296.488008] RSP <ffff88006a84fd10> 06:41:40:[ 7296.488008] CR2: 0000000059a50ce5 06:41:40:[ 0.000000] Initializing cgroup subsys cpuset 06:41:40:[ 0.000000] Initializing cgroup subsys cpu |
| Comments |
| Comment by Sebastien Buisson (Inactive) [ 30/Aug/17 ] |
|
+1 on master: |
| Comment by Bruno Faccini (Inactive) [ 30/Aug/17 ] |
|
+1 on latest master : looks similar to |
| Comment by Yang Sheng [ 30/Aug/17 ] |
|
It looks like msg_md was corrupted. crash> struct lnet_msg ffff880063e67910
struct lnet_msg {
msg_activelist = {
next = 0xffff88007b9e14c0,
prev = 0xffff88007b9e14c0
},
msg_list = {
next = 0xffff88005625d610,
prev = 0xffff88005625d410
},
msg_target = {
nid = 562950121325115,
pid = 0
},
msg_initiator = 3,
msg_from = 4144,
msg_type = 0,
msg_src_nid_param = 18446744073709551615,
msg_rtr_nid_param = 0,
msg_tx_committed = 0,
msg_tx_cpt = 0,
msg_rx_committed = 0,
msg_rx_cpt = 0,
msg_tx_delayed = 0,
msg_rx_delayed = 0,
msg_rx_ready_delay = 0,
msg_vmflush = 0,
msg_target_is_router = 0,
msg_routing = 0,
msg_ack = 0,
msg_sending = 1,
msg_receiving = 0,
msg_txcredit = 0,
msg_peertxcredit = 0,
msg_rtrcredit = 0,
msg_peerrtrcredit = 0,
msg_onactivelist = 0,
msg_rdma_get = 0,
msg_txpeer = 0x0,
msg_rxpeer = 0x0,
msg_private = 0x0,
msg_md = 0x59a50c98,
msg_txni = 0xffffffffc0c01020 <the_lnet+384>,
msg_rxni = 0xffffffffc0c01020 <the_lnet+384>,
msg_len = 0,
msg_wanted = 0,
msg_offset = 1676048808,
msg_niov = 4294936576,
msg_iov = 0xffff880063e679a8,
msg_kiov = 0x0,
msg_ev = {
target = {
nid = 142545401151929,
pid = 1
},
initiator = {
nid = 0,
pid = 0
},
source = {
nid = 18446744071586135488,
pid = 3237090016
},
sender = 18446612133228933888,
type = 0,
pt_index = 0,
match_bits = 1,
rlength = 914715264,
mlength = 4294936576,
md_handle = {
cookie = 18446612133228933184
},
md = {
start = 0xffff880058384c00,
length = 1,
threshold = 0,
max_size = 0,
options = 0,
user_ptr = 0xffff880063e67a30,
eq_handle = {
cookie = 18446612133990267440
},
bulk_handle = {
cookie = 8318832459525390336
}
},
hdr_data = 7307481174827298911,
msg_type = 115,
status = 0,
unlinked = 0,
offset = 0,
sequence = 0
},
msg_hdr = {
dest_nid = 0,
src_nid = 0,
dest_pid = 0,
src_pid = 0,
type = 4026532327,
payload_length = 33060,
msg = {
ack = {
dst_wmd = {
wh_interface_cookie = 1,
wh_object_cookie = 0
},
match_bits = 0,
mlength = 2171551168
},
put = {
ack_wmd = {
wh_interface_cookie = 1,
wh_object_cookie = 0
},
match_bits = 0,
hdr_data = 18446744071586135488,
ptl_index = 3234101248,
offset = 4294967295
},
get = {
return_wmd = {
wh_interface_cookie = 1,
wh_object_cookie = 0
},
match_bits = 0,
ptl_index = 2171551168,
src_offset = 4294967295,
sink_length = 3234101248
},
reply = {
dst_wmd = {
wh_interface_cookie = 1,
wh_object_cookie = 0
}
},
hello = {
incarnation = 1,
type = 0
}
}
}
}
But I don't know why it happened. Only lnet_msg_attach_md & lnet_msg_detach_md can touch this field. Could this be considered a memory overwrite? Thanks, |
| Comment by Amir Shehata (Inactive) [ 31/Aug/17 ] |
|
|
| Comment by Olaf Weber [ 31/Aug/17 ] |
|
I agree that we should fix |