Details
-
Bug
-
Resolution: Fixed
-
Major
-
Lustre 2.1.3
-
None
-
Lustre 2.1.3 Bull.2.308
-
3
-
6773
Description
While running tar on a login node, the node hit a general protection fault in osc_send_oap_rpc.
It looks like a race in which the loi_oap_pages structure passed to osc_send_oap_rpc gets poisoned:
general protection fault: 0000 [#1] SMP last sysfs file: /sys/devices/pci0000:80/0000:80:02.0/0000:83:00.0/host7/rport-7:0-0/target7:0:0/7:0:0:18/state CPU 3 Modules linked in: tcp_diag inet_diag iptable_filter ip_tables nfs fscache lmv(U) mgc(U) lustre(U) lov(U) osc(U) mdc(U) lquota(U) fid(U) fld(U) ko2iblnd(U) ptlrpc(U) obdclass(U) lnet(U) lvfs(U) libcfs(U) nfsd lockd nfs_acl auth_rpcgss exportfs ipmi_devintf ipmi_si ipmi_msghandler sunrpc rdma_ucm(U) ib_sdp(U) rdma_cm(U) iw_cm(U) ib_addr(U) ib_ipoib(U) ib_cm(U) ib_sa(U) ipv6 ib_uverbs(U) ib_umad(U) mlx4_ib(U) mlx4_core(U) ib_mthca(U) ib_mad(U) ib_core(U) dm_mirror dm_region_hash dm_log dm_round_robin scsi_dh_rdac dm_multipath dm_mod uinput usbhid hid sg lpfc scsi_transport_fc scsi_tgt sb_edac edac_core i2c_i801 i2c_core iTCO_wdt iTCO_vendor_support igb ioatdma dca ext4 mbcache jbd2 ehci_hcd sd_mod crc_t10dif ahci megaraid_sas [last unloaded: scsi_wait_scan] Pid: 30811, comm: tar Not tainted 2.6.32-220.23.1.bl6.Bull.28.8.x86_64 #1 Bull SAS bullx R/X9DRH-7TF/7F/iTF/iF RIP: 0010:[<ffffffffa09f8cb1>] [<ffffffffa09f8cb1>] osc_send_oap_rpc+0x61/0x1b40 [osc] RSP: 0018:ffff8805da6f3588 EFLAGS: 00010296 RAX: 5a5a5a5a5a5a5a42 RBX: ffff8804710c6a80 RCX: 5a5a5a5a5a5a5a5a RDX: ffff8804710c6a80 RSI: ffff881067b22648 RDI: ffff8804710c6aa8 RBP: ffff8805da6f36b8 R08: ffff8804710c6a98 R09: 00000000000057bf R10: 0000000000000d9d R11: b000000000000000 R12: ffff881067b22648 R13: ffff8804710c6a98 R14: ffff8805da6f3668 R15: ffff8804710c6a98 FS: 00007f77e18f17a0(0000) GS:ffff880028260000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00000032bcd9b4f8 CR3: 000000083eaac000 CR4: 00000000000406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff4ff0 DR7: 0000000000000400 0xffffffffa09f8c50 <osc_send_oap_rpc>: push %rbp 0xffffffffa09f8c51 <osc_send_oap_rpc+1>: mov %rsp,%rbp 0xffffffffa09f8c54 <osc_send_oap_rpc+4>: push %r15 0xffffffffa09f8c56 
<osc_send_oap_rpc+6>: push %r14 0xffffffffa09f8c58 <osc_send_oap_rpc+8>: push %r13 0xffffffffa09f8c5a <osc_send_oap_rpc+10>: push %r12 0xffffffffa09f8c5c <osc_send_oap_rpc+12>: push %rbx 0xffffffffa09f8c5d <osc_send_oap_rpc+13>: sub $0x108,%rsp 0xffffffffa09f8c64 <osc_send_oap_rpc+20>: nopl 0x0(%rax,%rax,1) 0xffffffffa09f8c69 <osc_send_oap_rpc+25>: testb $0x1,-0x4f27ec(%rip) # 0xffffffffa0506484 0xffffffffa09f8c70 <osc_send_oap_rpc+32>: lea -0x50(%rbp),%r14 0xffffffffa09f8c74 <osc_send_oap_rpc+36>: mov %rdi,-0x78(%rbp) 0xffffffffa09f8c78 <osc_send_oap_rpc+40>: mov %rsi,%r12 0xffffffffa09f8c7b <osc_send_oap_rpc+43>: mov %rdx,-0xc0(%rbp) 0xffffffffa09f8c82 <osc_send_oap_rpc+50>: mov %ecx,-0xac(%rbp) 0xffffffffa09f8c88 <osc_send_oap_rpc+56>: mov %r8,%r15 0xffffffffa09f8c8b <osc_send_oap_rpc+59>: mov %r14,-0x50(%rbp) 0xffffffffa09f8c8f <osc_send_oap_rpc+63>: mov %r14,-0x48(%rbp) 0xffffffffa09f8c93 <osc_send_oap_rpc+67>: je 0xffffffffa09f8ca2 0xffffffffa09f8c95 <osc_send_oap_rpc+69>: testb $0x8,-0x4f281c(%rip) # 0xffffffffa0506480 0xffffffffa09f8c9c <osc_send_oap_rpc+76>: jne 0xffffffffa09f9500 0xffffffffa09f8ca2 <osc_send_oap_rpc+82>: mov 0x10(%r15),%rcx <= %rcx comes from %r15, which itself comes from %r8 = 5th argument of osc_send_oap_rpc 0xffffffffa09f8ca6 <osc_send_oap_rpc+86>: lea 0x10(%r15),%rdi 0xffffffffa09f8caa <osc_send_oap_rpc+90>: lea -0x18(%rcx),%rax <= %rax now equals 5a5a5a5a5a5a5a42, i.e. %rcx - 0x18 0xffffffffa09f8cae <osc_send_oap_rpc+94>: cmp %rcx,%rdi 0xffffffffa09f8cb1 <osc_send_oap_rpc+97>: mov 0x18(%rax),%rdx <= Crashed here 0xffffffffa09f8cb5 <osc_send_oap_rpc+101>: je 0xffffffffa09f8d50 0xffffffffa09f8cbb <osc_send_oap_rpc+107>: sub $0x18,%rdx 0xffffffffa09f8cbf <osc_send_oap_rpc+111>: xor %ecx,%ecx 0xffffffffa09f8cc1 <osc_send_oap_rpc+113>: jmp 0xffffffffa09f8d10 0xffffffffa09f8cc3 <osc_send_oap_rpc+115>: nopl 0x0(%rax,%rax,1) static int osc_send_oap_rpc(const struct lu_env *env, struct client_obd *cli, struct lov_oinfo *loi, int cmd, 
struct loi_oap_pages *lop) %r15 = %r8 = lop crash> struct loi_oap_pages ffff8804710c6a98 struct loi_oap_pages { lop_pending = { next = 0x5a5a5a5a5a5a5a5a, prev = 0x5a5a5a5a5a5a5a5a }, lop_urgent = { next = 0x5a5a5a5a5a5a5a5a, prev = 0x5a5a5a5a5a5a5a5a }, lop_pending_group = { next = 0x5a5a5a5a5a5a5a5a, prev = 0x5a5a5a5a5a5a5a5a }, lop_num_pending = 1515870810 } static int osc_send_oap_rpc(const struct lu_env *env, struct client_obd *cli, struct lov_oinfo *loi, int cmd, struct loi_oap_pages *lop) { struct ptlrpc_request *req; obd_count page_count = 0; struct osc_async_page *oap = NULL, *tmp; struct osc_brw_async_args *aa; const struct obd_async_page_ops *ops; CFS_LIST_HEAD(rpc_list); int srvlock = 0, mem_tight = 0; struct cl_object *clob = NULL; obd_off starting_offset = OBD_OBJECT_EOF; unsigned int ending_offset; int starting_page_off = 0; ENTRY; /* ASYNC_HP pages first. At present, when the lock the pages is * to be canceled, the pages covered by the lock will be sent out * with ASYNC_HP. We have to send out them as soon as possible. */ cfs_list_for_each_entry_safe(oap, tmp, &lop->lop_urgent, oap_urgent_item) { ^^^^^^^^^^^^^^^ == LI_POISON if (oap->oap_async_flags & ASYNC_HP) cfs_list_move(&oap->oap_pending_item, &rpc_list); else if (!(oap->oap_brw_flags & OBD_BRW_SYNC)) /* only do this for writeback pages. */ cfs_list_move_tail(&oap->oap_pending_item, &rpc_list); if (++page_count >= cli->cl_max_pages_per_rpc) break; }
I have attached a file with the output of dmesg, ps, bt, bt -a and foreach bt, in case you need more information.
Attachments
Issue Links
- is duplicated by
-
LU-2853 osc send list corruption
- Closed