Details
- Type: Bug
- Resolution: Fixed
- Priority: Major
- Affects Version/s: Lustre 2.1.3
- Fix Version/s: None
- Environment: Lustre 2.1.3 Bull.2.308
- Severity: 3
- 6773
Description
While running tar on a login node, the node hit a general protection fault in osc_send_oap_rpc.
It looks like a race in which the loi_oap_pages structure passed to osc_send_oap_rpc gets poisoned:
general protection fault: 0000 [#1] SMP
last sysfs file: /sys/devices/pci0000:80/0000:80:02.0/0000:83:00.0/host7/rport-7:0-0/target7:0:0/7:0:0:18/state
CPU 3
Modules linked in: tcp_diag inet_diag iptable_filter ip_tables nfs fscache lmv(U) mgc(U) lustre(U) lov(U) osc(U) mdc(U) lquota(U) fid(U) fld(U) ko2iblnd(U) ptlrpc(U) obdclass(U) lnet(U) lvfs(U) libcfs(U) nfsd lockd nfs_acl auth_rpcgss exportfs ipmi_devintf ipmi_si ipmi_msghandler sunrpc rdma_ucm(U) ib_sdp(U) rdma_cm(U) iw_cm(U) ib_addr(U) ib_ipoib(U) ib_cm(U) ib_sa(U) ipv6 ib_uverbs(U) ib_umad(U) mlx4_ib(U) mlx4_core(U) ib_mthca(U) ib_mad(U) ib_core(U) dm_mirror dm_region_hash dm_log dm_round_robin scsi_dh_rdac dm_multipath dm_mod uinput usbhid hid sg lpfc scsi_transport_fc scsi_tgt sb_edac edac_core i2c_i801 i2c_core iTCO_wdt iTCO_vendor_support igb ioatdma dca ext4 mbcache jbd2 ehci_hcd sd_mod crc_t10dif ahci megaraid_sas [last unloaded: scsi_wait_scan]

Pid: 30811, comm: tar Not tainted 2.6.32-220.23.1.bl6.Bull.28.8.x86_64 #1 Bull SAS bullx R/X9DRH-7TF/7F/iTF/iF
RIP: 0010:[<ffffffffa09f8cb1>]  [<ffffffffa09f8cb1>] osc_send_oap_rpc+0x61/0x1b40 [osc]
RSP: 0018:ffff8805da6f3588  EFLAGS: 00010296
RAX: 5a5a5a5a5a5a5a42 RBX: ffff8804710c6a80 RCX: 5a5a5a5a5a5a5a5a
RDX: ffff8804710c6a80 RSI: ffff881067b22648 RDI: ffff8804710c6aa8
RBP: ffff8805da6f36b8 R08: ffff8804710c6a98 R09: 00000000000057bf
R10: 0000000000000d9d R11: b000000000000000 R12: ffff881067b22648
R13: ffff8804710c6a98 R14: ffff8805da6f3668 R15: ffff8804710c6a98
FS:  00007f77e18f17a0(0000) GS:ffff880028260000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 00000032bcd9b4f8 CR3: 000000083eaac000 CR4: 00000000000406e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff4ff0 DR7: 0000000000000400

0xffffffffa09f8c50 <osc_send_oap_rpc>:          push   %rbp
0xffffffffa09f8c51 <osc_send_oap_rpc+1>:        mov    %rsp,%rbp
0xffffffffa09f8c54 <osc_send_oap_rpc+4>:        push   %r15
0xffffffffa09f8c56 <osc_send_oap_rpc+6>:        push   %r14
0xffffffffa09f8c58 <osc_send_oap_rpc+8>:        push   %r13
0xffffffffa09f8c5a <osc_send_oap_rpc+10>:       push   %r12
0xffffffffa09f8c5c <osc_send_oap_rpc+12>:       push   %rbx
0xffffffffa09f8c5d <osc_send_oap_rpc+13>:       sub    $0x108,%rsp
0xffffffffa09f8c64 <osc_send_oap_rpc+20>:       nopl   0x0(%rax,%rax,1)
0xffffffffa09f8c69 <osc_send_oap_rpc+25>:       testb  $0x1,-0x4f27ec(%rip)        # 0xffffffffa0506484
0xffffffffa09f8c70 <osc_send_oap_rpc+32>:       lea    -0x50(%rbp),%r14
0xffffffffa09f8c74 <osc_send_oap_rpc+36>:       mov    %rdi,-0x78(%rbp)
0xffffffffa09f8c78 <osc_send_oap_rpc+40>:       mov    %rsi,%r12
0xffffffffa09f8c7b <osc_send_oap_rpc+43>:       mov    %rdx,-0xc0(%rbp)
0xffffffffa09f8c82 <osc_send_oap_rpc+50>:       mov    %ecx,-0xac(%rbp)
0xffffffffa09f8c88 <osc_send_oap_rpc+56>:       mov    %r8,%r15
0xffffffffa09f8c8b <osc_send_oap_rpc+59>:       mov    %r14,-0x50(%rbp)
0xffffffffa09f8c8f <osc_send_oap_rpc+63>:       mov    %r14,-0x48(%rbp)
0xffffffffa09f8c93 <osc_send_oap_rpc+67>:       je     0xffffffffa09f8ca2
0xffffffffa09f8c95 <osc_send_oap_rpc+69>:       testb  $0x8,-0x4f281c(%rip)        # 0xffffffffa0506480
0xffffffffa09f8c9c <osc_send_oap_rpc+76>:       jne    0xffffffffa09f9500
0xffffffffa09f8ca2 <osc_send_oap_rpc+82>:       mov    0x10(%r15),%rcx    <= %rcx comes from %r15, which itself comes from %r8 = 5th argument of osc_send_oap_rpc
0xffffffffa09f8ca6 <osc_send_oap_rpc+86>:       lea    0x10(%r15),%rdi
0xffffffffa09f8caa <osc_send_oap_rpc+90>:       lea    -0x18(%rcx),%rax   <= %rax is now 5a5a5a5a5a5a5a42, i.e. %rcx - 0x18
0xffffffffa09f8cae <osc_send_oap_rpc+94>:       cmp    %rcx,%rdi
0xffffffffa09f8cb1 <osc_send_oap_rpc+97>:       mov    0x18(%rax),%rdx    <= Crashed here
0xffffffffa09f8cb5 <osc_send_oap_rpc+101>:      je     0xffffffffa09f8d50
0xffffffffa09f8cbb <osc_send_oap_rpc+107>:      sub    $0x18,%rdx
0xffffffffa09f8cbf <osc_send_oap_rpc+111>:      xor    %ecx,%ecx
0xffffffffa09f8cc1 <osc_send_oap_rpc+113>:      jmp    0xffffffffa09f8d10
0xffffffffa09f8cc3 <osc_send_oap_rpc+115>:      nopl   0x0(%rax,%rax,1)

static int osc_send_oap_rpc(const struct lu_env *env, struct client_obd *cli,
                            struct lov_oinfo *loi, int cmd,
                            struct loi_oap_pages *lop)

%r15 = %r8 = lop

crash> struct loi_oap_pages ffff8804710c6a98
struct loi_oap_pages {
  lop_pending = {
    next = 0x5a5a5a5a5a5a5a5a,
    prev = 0x5a5a5a5a5a5a5a5a
  },
  lop_urgent = {
    next = 0x5a5a5a5a5a5a5a5a,
    prev = 0x5a5a5a5a5a5a5a5a
  },
  lop_pending_group = {
    next = 0x5a5a5a5a5a5a5a5a,
    prev = 0x5a5a5a5a5a5a5a5a
  },
  lop_num_pending = 1515870810
}

static int osc_send_oap_rpc(const struct lu_env *env, struct client_obd *cli,
                            struct lov_oinfo *loi, int cmd,
                            struct loi_oap_pages *lop)
{
        struct ptlrpc_request *req;
        obd_count page_count = 0;
        struct osc_async_page *oap = NULL, *tmp;
        struct osc_brw_async_args *aa;
        const struct obd_async_page_ops *ops;
        CFS_LIST_HEAD(rpc_list);
        int srvlock = 0, mem_tight = 0;
        struct cl_object *clob = NULL;
        obd_off starting_offset = OBD_OBJECT_EOF;
        unsigned int ending_offset;
        int starting_page_off = 0;
        ENTRY;

        /* ASYNC_HP pages first. At present, when the lock the pages is
         * to be canceled, the pages covered by the lock will be sent out
         * with ASYNC_HP. We have to send out them as soon as possible. */
        cfs_list_for_each_entry_safe(oap, tmp, &lop->lop_urgent, oap_urgent_item) {
                                                ^^^^^^^^^^^^^^^  == LI_POISON
                if (oap->oap_async_flags & ASYNC_HP)
                        cfs_list_move(&oap->oap_pending_item, &rpc_list);
                else if (!(oap->oap_brw_flags & OBD_BRW_SYNC))
                        /* only do this for writeback pages. */
                        cfs_list_move_tail(&oap->oap_pending_item, &rpc_list);

                if (++page_count >= cli->cl_max_pages_per_rpc)
                        break;
        }
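The register values are consistent with the list heads having been overwritten by the 0x5a free-poison pattern: lop_urgent.next reads back as 0x5a5a5a5a5a5a5a5a, and subtracting the 0x18-byte list-entry offset gives exactly the faulting RAX value. A minimal standalone check of that arithmetic (plain userspace C, not Lustre code; it only reproduces the numbers from the oops above):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* lop_urgent.next as loaded by "mov 0x10(%r15),%rcx" */
        uint64_t rcx = 0x5a5a5a5a5a5a5a5aULL;
        /* "lea -0x18(%rcx),%rax": the list-entry offset is subtracted */
        uint64_t rax = rcx - 0x18;

        printf("rcx = 0x%016llx\n", (unsigned long long)rcx);
        printf("rax = 0x%016llx\n", (unsigned long long)rax);
        /* rax prints as 0x5a5a5a5a5a5a5a42, matching RAX in the oops, so the
         * faulting "mov 0x18(%rax),%rdx" dereferences freed, poisoned memory. */
        return 0;
}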
I attach a file with the output of dmesg, ps, bt, bt -a and foreach bt, in case you need more information.
Attachments
Issue Links
- is duplicated by LU-2853 "osc send list corruption" (Closed)
Re-choosing/refreshing the lov_oinfo between the write and read steps and between the calls to osc_send_oap_rpc() in osc_check_rpcs() seems to work, but it may break the balancing algorithm, and it does not cover all cases either: the same problem is likely to occur at the end of the read step, where the lov_oinfo may also have been freed (again during a window where cl_loi_list_lock is released) while it is not linked on any of the client_obd cl_loi_[hp_ready,ready,write,read]_list's, before loi_list_maint() is called and the next while() loop iteration runs.
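To make the window concrete, here is a minimal standalone sketch of the pattern described above (plain userspace C with a pthread mutex, not the osc code; struct oinfo, ready_list and unlink_and_free() are made-up stand-ins for the lov_oinfo, the cl_loi_*_list's and the freeing path): the consumer picks an object and drops the lock while the object is still reachable from the shared list, so a concurrent path can unlink and free (poison) it before the consumer resumes.

#include <pthread.h>
#include <stdio.h>
#include <string.h>

/* Made-up stand-in for lov_oinfo / loi_oap_pages. */
struct oinfo {
        struct oinfo *next;
        long lop_num_pending;
};

static pthread_mutex_t loi_list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct oinfo an_oinfo = { NULL, 1 };
static struct oinfo *ready_list = &an_oinfo;    /* stand-in for a cl_loi_*_list */

/* Concurrent path: unlinks the object and "frees" it; the memset to 0x5a
 * mimics the free-time poisoning seen in the crash dump. */
static void unlink_and_free(struct oinfo *o)
{
        pthread_mutex_lock(&loi_list_lock);
        ready_list = o->next;
        memset(o, 0x5a, sizeof(*o));
        pthread_mutex_unlock(&loi_list_lock);
}

int main(void)
{
        pthread_mutex_lock(&loi_list_lock);
        struct oinfo *o = ready_list;           /* object is chosen... */
        pthread_mutex_unlock(&loi_list_lock);   /* ...and the lock is dropped */

        /* Window: "o" is still linked on the shared list while the lock is
         * released, so a concurrent path can unlink and free it. */
        unlink_and_free(o);

        pthread_mutex_lock(&loi_list_lock);
        /* The consumer resumes and still uses "o": every field now reads
         * back as the 0x5a pattern, as in the crash dump above. */
        printf("lop_num_pending = 0x%lx\n", o->lop_num_pending);
        pthread_mutex_unlock(&loi_list_lock);
        return 0;
}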
Thus, since the only routine used to maintain the cl_loi_[hp_ready,ready,write,read]_list's is loi_list_maint(), it may be possible to implement the first fix option I listed in my prior comment, i.e. unlink the lov_oinfo from the client_obd cl_loi_[hp_ready,ready,write,read]_list's when it is chosen in osc_next_loi(), and keep it unlinked during all of the page-list processing. This only requires removing the calls to loi_list_maint() (which would re-link the lov_oinfo onto the cl_loi_[hp_ready,ready,write,read]_list's!) in both osc_send_oap_rpc() and osc_check_rpcs().
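Continuing the same made-up stand-in sketch (not the actual patch at the review link below): the idea is to take the object off the shared list the moment it is chosen, keep it unlinked while all of its page lists are processed, and re-link it only once, under the lock, when processing is done, so no concurrent scan can reach it during the lock-release windows.

/* Continuation of the stand-in sketch above (same struct oinfo, ready_list
 * and loi_list_lock).  choose_and_unlink() plays the role of an
 * osc_next_loi() that unlinks the object on selection; relink_when_done()
 * plays the role of a single final loi_list_maint(). */
static struct oinfo *choose_and_unlink(void)
{
        pthread_mutex_lock(&loi_list_lock);
        struct oinfo *o = ready_list;
        if (o != NULL)
                ready_list = o->next;           /* unlink while locked */
        pthread_mutex_unlock(&loi_list_lock);
        /* From here on, "o" is not reachable from the shared list, so a
         * concurrent scan during any lock-release window cannot pick it up
         * and free it under the consumer's feet. */
        return o;
}

static void relink_when_done(struct oinfo *o)
{
        pthread_mutex_lock(&loi_list_lock);
        o->next = ready_list;                   /* re-link only at the end */
        ready_list = o;
        pthread_mutex_unlock(&loi_list_lock);
}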
The new/second patch implementing this approach is at http://review.whamcloud.com/6200.