Details
-
Bug
-
Resolution: Fixed
-
Major
-
Lustre 2.1.3
-
None
-
Lustre 2.1.3 Bull.2.308
-
3
-
6773
Description
While doing a tar on a login node, the login node encountered a general protection fault in osc_send_oap_rpc.
It looks like a race in which the loi_oap_pages structure passed to osc_send_oap_rpc gets poisoned (all of its fields are filled with the 0x5a pattern):
general protection fault: 0000 [#1] SMP
last sysfs file: /sys/devices/pci0000:80/0000:80:02.0/0000:83:00.0/host7/rport-7:0-0/target7:0:0/7:0:0:18/state
CPU 3
Modules linked in: tcp_diag inet_diag iptable_filter ip_tables nfs fscache lmv(U) mgc(U) lustre(U) lov(U) osc(U) mdc(U)
lquota(U) fid(U) fld(U) ko2iblnd(U) ptlrpc(U) obdclass(U) lnet(U) lvfs(U) libcfs(U) nfsd lockd nfs_acl auth_rpcgss exportfs
ipmi_devintf ipmi_si ipmi_msghandler sunrpc rdma_ucm(U) ib_sdp(U) rdma_cm(U) iw_cm(U) ib_addr(U) ib_ipoib(U) ib_cm(U)
ib_sa(U) ipv6 ib_uverbs(U) ib_umad(U) mlx4_ib(U) mlx4_core(U) ib_mthca(U) ib_mad(U) ib_core(U) dm_mirror dm_region_hash
dm_log dm_round_robin scsi_dh_rdac dm_multipath dm_mod uinput usbhid hid sg lpfc scsi_transport_fc scsi_tgt sb_edac
edac_core i2c_i801 i2c_core iTCO_wdt iTCO_vendor_support igb ioatdma dca ext4 mbcache jbd2 ehci_hcd sd_mod crc_t10dif
ahci megaraid_sas [last unloaded: scsi_wait_scan]
Pid: 30811, comm: tar Not tainted 2.6.32-220.23.1.bl6.Bull.28.8.x86_64 #1 Bull SAS bullx R/X9DRH-7TF/7F/iTF/iF
RIP: 0010:[<ffffffffa09f8cb1>] [<ffffffffa09f8cb1>] osc_send_oap_rpc+0x61/0x1b40 [osc]
RSP: 0018:ffff8805da6f3588 EFLAGS: 00010296
RAX: 5a5a5a5a5a5a5a42 RBX: ffff8804710c6a80 RCX: 5a5a5a5a5a5a5a5a
RDX: ffff8804710c6a80 RSI: ffff881067b22648 RDI: ffff8804710c6aa8
RBP: ffff8805da6f36b8 R08: ffff8804710c6a98 R09: 00000000000057bf
R10: 0000000000000d9d R11: b000000000000000 R12: ffff881067b22648
R13: ffff8804710c6a98 R14: ffff8805da6f3668 R15: ffff8804710c6a98
FS: 00007f77e18f17a0(0000) GS:ffff880028260000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 00000032bcd9b4f8 CR3: 000000083eaac000 CR4: 00000000000406e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff4ff0 DR7: 0000000000000400
0xffffffffa09f8c50 <osc_send_oap_rpc>: push %rbp
0xffffffffa09f8c51 <osc_send_oap_rpc+1>: mov %rsp,%rbp
0xffffffffa09f8c54 <osc_send_oap_rpc+4>: push %r15
0xffffffffa09f8c56 <osc_send_oap_rpc+6>: push %r14
0xffffffffa09f8c58 <osc_send_oap_rpc+8>: push %r13
0xffffffffa09f8c5a <osc_send_oap_rpc+10>: push %r12
0xffffffffa09f8c5c <osc_send_oap_rpc+12>: push %rbx
0xffffffffa09f8c5d <osc_send_oap_rpc+13>: sub $0x108,%rsp
0xffffffffa09f8c64 <osc_send_oap_rpc+20>: nopl 0x0(%rax,%rax,1)
0xffffffffa09f8c69 <osc_send_oap_rpc+25>: testb $0x1,-0x4f27ec(%rip) # 0xffffffffa0506484
0xffffffffa09f8c70 <osc_send_oap_rpc+32>: lea -0x50(%rbp),%r14
0xffffffffa09f8c74 <osc_send_oap_rpc+36>: mov %rdi,-0x78(%rbp)
0xffffffffa09f8c78 <osc_send_oap_rpc+40>: mov %rsi,%r12
0xffffffffa09f8c7b <osc_send_oap_rpc+43>: mov %rdx,-0xc0(%rbp)
0xffffffffa09f8c82 <osc_send_oap_rpc+50>: mov %ecx,-0xac(%rbp)
0xffffffffa09f8c88 <osc_send_oap_rpc+56>: mov %r8,%r15
0xffffffffa09f8c8b <osc_send_oap_rpc+59>: mov %r14,-0x50(%rbp)
0xffffffffa09f8c8f <osc_send_oap_rpc+63>: mov %r14,-0x48(%rbp)
0xffffffffa09f8c93 <osc_send_oap_rpc+67>: je 0xffffffffa09f8ca2
0xffffffffa09f8c95 <osc_send_oap_rpc+69>: testb $0x8,-0x4f281c(%rip) # 0xffffffffa0506480
0xffffffffa09f8c9c <osc_send_oap_rpc+76>: jne 0xffffffffa09f9500
0xffffffffa09f8ca2 <osc_send_oap_rpc+82>: mov 0x10(%r15),%rcx <= %rcx comes from %r15, which
itself comes from %r8 = 5th argument of osc_send_oap_rpc
0xffffffffa09f8ca6 <osc_send_oap_rpc+86>: lea 0x10(%r15),%rdi
0xffffffffa09f8caa <osc_send_oap_rpc+90>: lea -0x18(%rcx),%rax <= %rax now equals
0x5a5a5a5a5a5a5a42, i.e. %rcx - 0x18
0xffffffffa09f8cae <osc_send_oap_rpc+94>: cmp %rcx,%rdi
0xffffffffa09f8cb1 <osc_send_oap_rpc+97>: mov 0x18(%rax),%rdx <= Crashed here
0xffffffffa09f8cb5 <osc_send_oap_rpc+101>: je 0xffffffffa09f8d50
0xffffffffa09f8cbb <osc_send_oap_rpc+107>: sub $0x18,%rdx
0xffffffffa09f8cbf <osc_send_oap_rpc+111>: xor %ecx,%ecx
0xffffffffa09f8cc1 <osc_send_oap_rpc+113>: jmp 0xffffffffa09f8d10
0xffffffffa09f8cc3 <osc_send_oap_rpc+115>: nopl 0x0(%rax,%rax,1)
static int
osc_send_oap_rpc(const struct lu_env *env, struct client_obd *cli,
struct lov_oinfo *loi,
int cmd, struct loi_oap_pages *lop)
%r15 = %r8 = lop
crash> struct loi_oap_pages ffff8804710c6a98
struct loi_oap_pages {
lop_pending = {
next = 0x5a5a5a5a5a5a5a5a,
prev = 0x5a5a5a5a5a5a5a5a
},
lop_urgent = {
next = 0x5a5a5a5a5a5a5a5a,
prev = 0x5a5a5a5a5a5a5a5a
},
lop_pending_group = {
next = 0x5a5a5a5a5a5a5a5a,
prev = 0x5a5a5a5a5a5a5a5a
},
lop_num_pending = 1515870810
}
static int
osc_send_oap_rpc(const struct lu_env *env, struct client_obd *cli,
struct lov_oinfo *loi,
int cmd, struct loi_oap_pages *lop)
{
struct ptlrpc_request *req;
obd_count page_count = 0;
struct osc_async_page *oap = NULL, *tmp;
struct osc_brw_async_args *aa;
const struct obd_async_page_ops *ops;
CFS_LIST_HEAD(rpc_list);
int srvlock = 0, mem_tight = 0;
struct cl_object *clob = NULL;
obd_off starting_offset = OBD_OBJECT_EOF;
unsigned int ending_offset;
int starting_page_off = 0;
ENTRY;
/* ASYNC_HP pages first. At present, when the lock the pages is
* to be canceled, the pages covered by the lock will be sent out
* with ASYNC_HP. We have to send out them as soon as possible. */
cfs_list_for_each_entry_safe(oap, tmp, &lop->lop_urgent, oap_urgent_item) {
^^^^^^^^^^^^^^^
lop->lop_urgent.next == LI_POISON (0x5a5a5a5a5a5a5a5a)
if (oap->oap_async_flags & ASYNC_HP)
cfs_list_move(&oap->oap_pending_item, &rpc_list);
else if (!(oap->oap_brw_flags & OBD_BRW_SYNC))
/* only do this for writeback pages. */
cfs_list_move_tail(&oap->oap_pending_item, &rpc_list);
if (++page_count >= cli->cl_max_pages_per_rpc)
break;
}
I have attached a file with the output of dmesg, ps, bt, bt -a and foreach bt, in case you need more information.
Attachments
Issue Links
- is duplicated by
-
LU-2853 osc send list corruption
-
- Closed
-