Details

    • Type: Bug
    • Resolution: Fixed
    • Priority: Major
    • Fix Version: Lustre 2.1.6
    • Affects Version: Lustre 2.1.3
    • None
    • Environment: Lustre 2.1.3 Bull.2.308
    • Severity: 3
    • 6773

    Description

      While running a tar on a login node, the node hit a general protection fault in osc_send_oap_rpc.

      It looks like a race in which the loi_oap_pages structure passed to osc_send_oap_rpc gets poisoned:

      general protection fault: 0000 [#1] SMP 
      last sysfs file: /sys/devices/pci0000:80/0000:80:02.0/0000:83:00.0/host7/rport-7:0-0/target7:0:0/7:0:0:18/state
      CPU 3 
      Modules linked in: tcp_diag inet_diag iptable_filter ip_tables nfs fscache lmv(U) mgc(U) lustre(U) lov(U) osc(U) mdc(U)
      lquota(U) fid(U) fld(U) ko2iblnd(U) ptlrpc(U) obdclass(U) lnet(U) lvfs(U) libcfs(U) nfsd lockd nfs_acl auth_rpcgss exportfs
      ipmi_devintf ipmi_si ipmi_msghandler sunrpc rdma_ucm(U) ib_sdp(U) rdma_cm(U) iw_cm(U) ib_addr(U) ib_ipoib(U) ib_cm(U)
      ib_sa(U) ipv6 ib_uverbs(U) ib_umad(U) mlx4_ib(U) mlx4_core(U) ib_mthca(U) ib_mad(U) ib_core(U) dm_mirror dm_region_hash
      dm_log dm_round_robin scsi_dh_rdac dm_multipath dm_mod uinput usbhid hid sg lpfc scsi_transport_fc scsi_tgt sb_edac
      edac_core i2c_i801 i2c_core iTCO_wdt iTCO_vendor_support igb ioatdma dca ext4 mbcache jbd2 ehci_hcd sd_mod crc_t10dif
      ahci megaraid_sas [last unloaded: scsi_wait_scan]
      
      Pid: 30811, comm: tar Not tainted 2.6.32-220.23.1.bl6.Bull.28.8.x86_64 #1 Bull SAS bullx R/X9DRH-7TF/7F/iTF/iF
      RIP: 0010:[<ffffffffa09f8cb1>]  [<ffffffffa09f8cb1>] osc_send_oap_rpc+0x61/0x1b40 [osc]
      RSP: 0018:ffff8805da6f3588  EFLAGS: 00010296
      RAX: 5a5a5a5a5a5a5a42 RBX: ffff8804710c6a80 RCX: 5a5a5a5a5a5a5a5a
      RDX: ffff8804710c6a80 RSI: ffff881067b22648 RDI: ffff8804710c6aa8
      RBP: ffff8805da6f36b8 R08: ffff8804710c6a98 R09: 00000000000057bf
      R10: 0000000000000d9d R11: b000000000000000 R12: ffff881067b22648
      R13: ffff8804710c6a98 R14: ffff8805da6f3668 R15: ffff8804710c6a98
      FS:  00007f77e18f17a0(0000) GS:ffff880028260000(0000) knlGS:0000000000000000
      CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
      CR2: 00000032bcd9b4f8 CR3: 000000083eaac000 CR4: 00000000000406e0
      DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
      DR3: 0000000000000000 DR6: 00000000ffff4ff0 DR7: 0000000000000400
      
      
      
      
      
      0xffffffffa09f8c50 <osc_send_oap_rpc>:  push   %rbp
      0xffffffffa09f8c51 <osc_send_oap_rpc+1>:        mov    %rsp,%rbp
      0xffffffffa09f8c54 <osc_send_oap_rpc+4>:        push   %r15
      0xffffffffa09f8c56 <osc_send_oap_rpc+6>:        push   %r14
      0xffffffffa09f8c58 <osc_send_oap_rpc+8>:        push   %r13
      0xffffffffa09f8c5a <osc_send_oap_rpc+10>:       push   %r12
      0xffffffffa09f8c5c <osc_send_oap_rpc+12>:       push   %rbx
      0xffffffffa09f8c5d <osc_send_oap_rpc+13>:       sub    $0x108,%rsp
      0xffffffffa09f8c64 <osc_send_oap_rpc+20>:       nopl   0x0(%rax,%rax,1)
      0xffffffffa09f8c69 <osc_send_oap_rpc+25>:       testb  $0x1,-0x4f27ec(%rip)        # 0xffffffffa0506484
      0xffffffffa09f8c70 <osc_send_oap_rpc+32>:       lea    -0x50(%rbp),%r14
      0xffffffffa09f8c74 <osc_send_oap_rpc+36>:       mov    %rdi,-0x78(%rbp)
      0xffffffffa09f8c78 <osc_send_oap_rpc+40>:       mov    %rsi,%r12
      0xffffffffa09f8c7b <osc_send_oap_rpc+43>:       mov    %rdx,-0xc0(%rbp)
      0xffffffffa09f8c82 <osc_send_oap_rpc+50>:       mov    %ecx,-0xac(%rbp)
      0xffffffffa09f8c88 <osc_send_oap_rpc+56>:       mov    %r8,%r15
      0xffffffffa09f8c8b <osc_send_oap_rpc+59>:       mov    %r14,-0x50(%rbp)
      0xffffffffa09f8c8f <osc_send_oap_rpc+63>:       mov    %r14,-0x48(%rbp)
      0xffffffffa09f8c93 <osc_send_oap_rpc+67>:       je     0xffffffffa09f8ca2
      0xffffffffa09f8c95 <osc_send_oap_rpc+69>:       testb  $0x8,-0x4f281c(%rip)        # 0xffffffffa0506480
      0xffffffffa09f8c9c <osc_send_oap_rpc+76>:       jne    0xffffffffa09f9500
      0xffffffffa09f8ca2 <osc_send_oap_rpc+82>:       mov    0x10(%r15),%rcx         <= %rcx is loaded from %r15, which
      itself comes from %r8 = 5th argument of osc_send_oap_rpc
      0xffffffffa09f8ca6 <osc_send_oap_rpc+86>:       lea    0x10(%r15),%rdi
      0xffffffffa09f8caa <osc_send_oap_rpc+90>:       lea    -0x18(%rcx),%rax        <= %rax is now
      5a5a5a5a5a5a5a42, i.e. %rcx - 0x18
      0xffffffffa09f8cae <osc_send_oap_rpc+94>:       cmp    %rcx,%rdi
      
      0xffffffffa09f8cb1 <osc_send_oap_rpc+97>:       mov    0x18(%rax),%rdx           <= Crashed here
      
      0xffffffffa09f8cb5 <osc_send_oap_rpc+101>:      je     0xffffffffa09f8d50
      0xffffffffa09f8cbb <osc_send_oap_rpc+107>:      sub    $0x18,%rdx
      0xffffffffa09f8cbf <osc_send_oap_rpc+111>:      xor    %ecx,%ecx
      0xffffffffa09f8cc1 <osc_send_oap_rpc+113>:      jmp    0xffffffffa09f8d10
      0xffffffffa09f8cc3 <osc_send_oap_rpc+115>:      nopl   0x0(%rax,%rax,1)
      
      
      
      static int
      osc_send_oap_rpc(const struct lu_env *env, struct client_obd *cli,
                       struct lov_oinfo *loi,
                       int cmd, struct loi_oap_pages *lop)
      
      %r15 = %r8 = lop (per the x86-64 calling convention, the 5th argument is passed in %r8)
      
      crash> struct loi_oap_pages ffff8804710c6a98
      struct loi_oap_pages {
        lop_pending = {
          next = 0x5a5a5a5a5a5a5a5a, 
          prev = 0x5a5a5a5a5a5a5a5a
        }, 
        lop_urgent = {
          next = 0x5a5a5a5a5a5a5a5a, 
          prev = 0x5a5a5a5a5a5a5a5a
        }, 
        lop_pending_group = {
          next = 0x5a5a5a5a5a5a5a5a, 
          prev = 0x5a5a5a5a5a5a5a5a
        }, 
        lop_num_pending = 1515870810
      }
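
      Every field of the structure holds the libcfs 0x5a poison pattern (lop_num_pending = 1515870810 is just 0x5a5a5a5a in decimal), so lop points at memory that has already been freed and poisoned. The user-space sketch below is only illustrative: the struct layout is an assumption taken from the disassembly above (oap_urgent_item at offset 0x18, as implied by "lea -0x18(%rcx),%rax"), not from the Lustre headers. It shows how the list_entry()/container_of() arithmetic turns the poisoned next pointer into the address seen in %rax at the fault:

      #include <stdio.h>
      #include <stddef.h>

      /* Stand-in types just for the pointer arithmetic; the 0x18 offset of
       * oap_urgent_item is assumed from the disassembly, not taken from the
       * real Lustre struct osc_async_page. */
      struct list_head { struct list_head *next, *prev; };

      #define list_entry(ptr, type, member) \
              ((type *)((char *)(ptr) - offsetof(type, member)))

      struct oap_layout {                          /* hypothetical layout sketch */
              struct list_head oap_pending_item;   /* 0x00..0x0f */
              char             pad[8];             /* 0x10..0x17 */
              struct list_head oap_urgent_item;    /* 0x18..0x27 */
      };

      int main(void)
      {
              /* lop->lop_urgent.next as read into %rcx: pure 0x5a poison. */
              struct list_head *next = (struct list_head *)0x5a5a5a5a5a5a5a5aULL;

              /* list_entry() subtracts the member offset (0x18), which yields the
               * 5a5a5a5a5a5a5a42 seen in %rax; the loop then dereferences
               * oap->oap_urgent_item.next (mov 0x18(%rax),%rdx) through this
               * poisoned pointer and takes the GPF. */
              struct oap_layout *oap = list_entry(next, struct oap_layout,
                                                  oap_urgent_item);
              printf("oap        = %p\n", (void *)oap);

              /* The same poison as a 32-bit counter, matching lop_num_pending. */
              printf("0x5a5a5a5a = %u\n", 0x5a5a5a5aU);
              return 0;
      }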
      
      static int
      osc_send_oap_rpc(const struct lu_env *env, struct client_obd *cli,
                       struct lov_oinfo *loi,
                       int cmd, struct loi_oap_pages *lop)
      {
              struct ptlrpc_request *req;
              obd_count page_count = 0;
              struct osc_async_page *oap = NULL, *tmp;
              struct osc_brw_async_args *aa;
              const struct obd_async_page_ops *ops;
              CFS_LIST_HEAD(rpc_list);
              int srvlock = 0, mem_tight = 0;
              struct cl_object *clob = NULL;
              obd_off starting_offset = OBD_OBJECT_EOF;
              unsigned int ending_offset;
              int starting_page_off = 0;
              ENTRY;
      
        /* ASYNC_HP pages first. At present, when the lock covering the pages
         * is to be canceled, the pages covered by the lock will be sent out
         * with ASYNC_HP. We have to send them out as soon as possible. */
              cfs_list_for_each_entry_safe(oap, tmp, &lop->lop_urgent, oap_urgent_item) {
                                                      ^^^^^^^^^^^^^^^
                                                      == LI_POISON
      
                      if (oap->oap_async_flags & ASYNC_HP)
                              cfs_list_move(&oap->oap_pending_item, &rpc_list);
                      else if (!(oap->oap_brw_flags & OBD_BRW_SYNC))
                              /* only do this for writeback pages. */
                              cfs_list_move_tail(&oap->oap_pending_item, &rpc_list);
                      if (++page_count >= cli->cl_max_pages_per_rpc)
                              break;
              }
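
      For reference, cfs_list_for_each_entry_safe() is a thin wrapper around the usual kernel "safe" list iterator, which prefetches the next element before the loop body runs. The minimal user-space sketch below (hypothetical fake_oap type and hand-built list, not the Lustre structures) shows that iteration pattern on a healthy list; with a poisoned list head as in this crash, the very first "tmp" prefetch, i.e. the read of oap->oap_urgent_item.next corresponding to the mov 0x18(%rax),%rdx above, is already the faulting access.

      #include <stdio.h>
      #include <stddef.h>

      struct list_head { struct list_head *next, *prev; };

      #define list_entry(ptr, type, member) \
              ((type *)((char *)(ptr) - offsetof(type, member)))

      /* Same shape as the kernel's list_for_each_entry_safe(): "pos" is the
       * current entry and "n" is prefetched from pos->member.next BEFORE the
       * body runs, so a poisoned ->next is dereferenced on the very first step. */
      #define list_for_each_entry_safe(pos, n, head, member)                     \
              for (pos = list_entry((head)->next, __typeof__(*pos), member),     \
                   n = list_entry(pos->member.next, __typeof__(*pos), member);   \
                   &pos->member != (head);                                       \
                   pos = n, n = list_entry(n->member.next, __typeof__(*n), member))

      struct fake_oap {                            /* hypothetical element type */
              int              id;
              struct list_head oap_urgent_item;
      };

      int main(void)
      {
              struct list_head lop_urgent;
              struct fake_oap a = { 1, { NULL, NULL } };
              struct fake_oap b = { 2, { NULL, NULL } };
              struct fake_oap *oap, *tmp;

              /* Build a valid 2-element list: lop_urgent -> a -> b -> lop_urgent. */
              lop_urgent.next        = &a.oap_urgent_item;
              lop_urgent.prev        = &b.oap_urgent_item;
              a.oap_urgent_item.next = &b.oap_urgent_item;
              a.oap_urgent_item.prev = &lop_urgent;
              b.oap_urgent_item.next = &lop_urgent;
              b.oap_urgent_item.prev = &a.oap_urgent_item;

              /* On a healthy list this visits a then b. In the crash, the list
               * head itself lived in freed, 0x5a-poisoned memory, so the initial
               * "tmp" prefetch already read through a poisoned pointer. */
              list_for_each_entry_safe(oap, tmp, &lop_urgent, oap_urgent_item)
                      printf("visiting oap %d\n", oap->id);

              return 0;
      }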
      

      I am attaching a file with the output of dmesg, ps, bt, bt -a and foreach bt, in case you need more details.

      Attachments

        1. crash-dump-properties
          856 kB
          Sebastien Piechurski

        Issue Links

          Activity

            [LU-2797] GPF in osc_send_oap_rpc

            Hi Bruno,

            The workload was exercised for a whole night without a crash.
            We can consider the bug as fixed.

            Thanks for the work!

            spiechurski Sebastien Piechurski added a comment

            Hello Seb,
            I spent some time debugging this last crash to understand what kind of deadlock you encountered.
            As you stated, there were hung-process back-traces in the log, and they all had the same stack, like the following:

            INFO: task get_bench.bull:18887 blocked for more than 120 seconds.
            "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
            get_bench.bul D 0000000000000016     0 18887  18828 0x00000080
             ffff88084eaafcc8 0000000000000082 0000000000000001 5a5a5a5a5a5a5a5a
             5a5a5a5a5a5a5a5a ffff88084eaafd48 ffff88084eaafc48 ffffffffa04df92e
             ffff88085ceb43e0 ffff88084eaaffd8 000000000000db00 ffff88085ceb43e0
            Call Trace:
             [<ffffffffa04df92e>] ? cfs_free+0xe/0x10 [libcfs]
             [<ffffffff8117b997>] ? dput+0xc7/0x190
             [<ffffffff81487205>] rwsem_down_failed_common+0x95/0x1f0
             [<ffffffff81487383>] rwsem_down_write_failed+0x23/0x30
             [<ffffffff81262023>] call_rwsem_down_write_failed+0x13/0x20
             [<ffffffff81486832>] ? down_write+0x32/0x40
             [<ffffffffa0b2ee3b>] ll_setattr_raw+0x18b/0xf20 [lustre]
             [<ffffffffa0b2fc2f>] ll_setattr+0x5f/0x100 [lustre]
             [<ffffffff8117f4e8>] notify_change+0x168/0x340
             [<ffffffff81161343>] sys_fchmodat+0xc3/0x100
             [<ffffffff81161398>] sys_chmod+0x18/0x20
             [<ffffffff810030f2>] system_call_fastpath+0x16/0x1b
            

            and they were no longer present at the time of the crash-dump. This seems to indicate that, since they should have been un-interruptible at this point, the inode/ll_inode_info semaphore they were hung on eventually became available; so it was not really a dead-lock, and in any case the Lustre code used there is far from being impacted by either patch.

            BTW, in the crash-dump I also found that 11 "cat" processes had been hung for about 118s, all with the same stack-trace, like the following:

            PID: 17234  TASK: ffff880867c87080  CPU: 2   COMMAND: "cat"
             #0 [ffff88086473f138] schedule at ffffffff81484c05
             #1 [ffff88086473f200] io_schedule at ffffffff814853e3
             #2 [ffff88086473f220] sync_page at ffffffff810fd4ad
             #3 [ffff88086473f230] __wait_on_bit_lock at ffffffff81485c2a
             #4 [ffff88086473f280] __lock_page at ffffffff810fd447
             #5 [ffff88086473f2e0] vvp_page_own at ffffffffa0b6220a [lustre]
             #6 [ffff88086473f300] cl_page_own0 at ffffffffa05f80bb [obdclass]
             #7 [ffff88086473f350] cl_page_own at ffffffffa05f8330 [obdclass]
             #8 [ffff88086473f360] check_and_discard_cb at ffffffffa05fe9ff [obdclass]
             #9 [ffff88086473f3b0] cl_page_gang_lookup at ffffffffa05f9994 [obdclass]
            #10 [ffff88086473f460] cl_lock_page_out at ffffffffa05fb53b [obdclass]
            #11 [ffff88086473f4d0] osc_lock_flush at ffffffffa0a0177f [osc]
            #12 [ffff88086473f520] osc_lock_cancel at ffffffffa0a01819 [osc]
            #13 [ffff88086473f570] cl_lock_cancel0 at ffffffffa05fa1c5 [obdclass]
            #14 [ffff88086473f5a0] cl_lock_cancel at ffffffffa05fafeb [obdclass]
            #15 [ffff88086473f5c0] osc_ldlm_blocking_ast at ffffffffa0a02f7a [osc]
            #16 [ffff88086473f630] ldlm_cancel_callback at ffffffffa07031f0 [ptlrpc]
            #17 [ffff88086473f650] ldlm_cli_cancel_local at ffffffffa071d0cb [ptlrpc]
            #18 [ffff88086473f680] ldlm_cli_cancel_list_local at ffffffffa071f9bd [ptlrpc]
            #19 [ffff88086473f6e0] ldlm_cancel_resource_local at ffffffffa071fc62 [ptlrpc]
            #20 [ffff88086473f750] osc_destroy at ffffffffa09ec44f [osc]
            #21 [ffff88086473f800] lov_destroy at ffffffffa0a52cad [lov]
            #22 [ffff88086473f8f0] ll_objects_destroy at ffffffffa0b447b7 [lustre]
            #23 [ffff88086473fa20] ll_close_inode_openhandle at ffffffffa0b142a1 [lustre]
            #24 [ffff88086473faa0] ll_md_real_close at ffffffffa0b1514a [lustre]
            #25 [ffff88086473fad0] ll_md_close at ffffffffa0b15425 [lustre]
            #26 [ffff88086473fb80] ll_file_release at ffffffffa0b15a35 [lustre]
            #27 [ffff88086473fbd0] __fput at ffffffff81164a85
            #28 [ffff88086473fc20] fput at ffffffff81164bc5
            #29 [ffff88086473fc30] filp_close at ffffffff8116061d
            #30 [ffff88086473fc60] put_files_struct at ffffffff8105732f
            #31 [ffff88086473fcb0] exit_files at ffffffff810573f3
            #32 [ffff88086473fce0] do_exit at ffffffff81059475
            #33 [ffff88086473fd60] do_group_exit at ffffffff81059bc8
            #34 [ffff88086473fd90] get_signal_to_deliver at ffffffff8106eec6
            #35 [ffff88086473fe30] do_signal at ffffffff810022d5
            #36 [ffff88086473ff30] do_notify_resume at ffffffff81002af0
            #37 [ffff88086473ff50] int_signal at ffffffff810033c1
                RIP: 00000032bcad83f0  RSP: 00007fff228120a8  RFLAGS: 00000246
                RAX: fffffffffffffffc  RBX: 0000000000200000  RCX: ffffffffffffffff
                RDX: 0000000000200000  RSI: 00007fbf6c365000  RDI: 0000000000000003
                RBP: 00007fbf6c365000   R8: 00000000ffffffff   R9: 0000000000000000
                R10: 0000000000200fff  R11: 0000000000000246  R12: 0000000000000003
                R13: ffffffffffff8000  R14: 0000000000200000  R15: 00007fbf6c365000
                ORIG_RAX: 0000000000000000  CS: 0033  SS: 002b
            

            and also, hung for about 112s, the Lustre umount process that was launched as part of the shutdown:

            PID: 24287  TASK: ffff8810414317d0  CPU: 17  COMMAND: "umount"
             #0 [ffff88105cd07418] schedule at ffffffff81484c05
             #1 [ffff88105cd074e0] io_schedule at ffffffff814853e3
             #2 [ffff88105cd07500] sync_page at ffffffff810fd4ad
             #3 [ffff88105cd07510] __wait_on_bit_lock at ffffffff81485c2a
             #4 [ffff88105cd07560] __lock_page at ffffffff810fd447
             #5 [ffff88105cd075c0] vvp_page_own at ffffffffa0b6220a [lustre]
             #6 [ffff88105cd075e0] cl_page_own0 at ffffffffa05f80bb [obdclass]
             #7 [ffff88105cd07630] cl_page_own at ffffffffa05f8330 [obdclass]
             #8 [ffff88105cd07640] check_and_discard_cb at ffffffffa05fe9ff [obdclass]
             #9 [ffff88105cd07690] cl_page_gang_lookup at ffffffffa05f9994 [obdclass]
            #10 [ffff88105cd07740] cl_lock_page_out at ffffffffa05fb53b [obdclass]
            #11 [ffff88105cd077b0] osc_lock_flush at ffffffffa0a0177f [osc]
            #12 [ffff88105cd07800] osc_lock_cancel at ffffffffa0a01819 [osc]
            #13 [ffff88105cd07850] cl_lock_cancel0 at ffffffffa05fa1c5 [obdclass]
            #14 [ffff88105cd07880] cl_lock_cancel at ffffffffa05fafeb [obdclass]
            #15 [ffff88105cd078a0] osc_ldlm_blocking_ast at ffffffffa0a02f7a [osc]
            #16 [ffff88105cd07910] ldlm_cancel_callback at ffffffffa07031f0 [ptlrpc]
            #17 [ffff88105cd07930] ldlm_cli_cancel_local at ffffffffa071d0cb [ptlrpc]
            #18 [ffff88105cd07960] ldlm_cli_cancel at ffffffffa0720f78 [ptlrpc]
            #19 [ffff88105cd079a0] cleanup_resource at ffffffffa070ae18 [ptlrpc]
            #20 [ffff88105cd07a00] ldlm_resource_clean at ffffffffa070afda [ptlrpc]
            #21 [ffff88105cd07a20] cfs_hash_for_each_relax at ffffffffa04ef28f [libcfs]
            #22 [ffff88105cd07ab0] cfs_hash_for_each_nolock at ffffffffa04f0caf [libcfs]
            #23 [ffff88105cd07ae0] ldlm_namespace_cleanup at ffffffffa0707a69 [ptlrpc]
            #24 [ffff88105cd07b00] osc_import_event at ffffffffa09f8f02 [osc]
            #25 [ffff88105cd07b60] ptlrpc_invalidate_import at ffffffffa076c0f0 [ptlrpc]
            #26 [ffff88105cd07c20] ptlrpc_set_import_active at ffffffffa074175d [ptlrpc]
            #27 [ffff88105cd07c50] osc_iocontrol at ffffffffa09f45e0 [osc]
            #28 [ffff88105cd07d00] lov_iocontrol at ffffffffa0a5a82f [lov]
            #29 [ffff88105cd07ec0] ll_umount_begin at ffffffffa0b29520 [lustre]
            #30 [ffff88105cd07ef0] sys_umount at ffffffff8118322a
            #31 [ffff88105cd07f80] system_call_fastpath at ffffffff810030f2
                RIP: 00007f52e77837a7  RSP: 00007fffc0f25188  RFLAGS: 00010206
                RAX: 00000000000000a6  RBX: ffffffff810030f2  RCX: 0000000000000010
                RDX: 0000000000000000  RSI: 0000000000000001  RDI: 00007f52e8c75a00
                RBP: 00007f52e8c759e0   R8: 00007f52e8c75a20   R9: 0000000000000001
                R10: 00007fffc0f24fb0  R11: 0000000000000206  R12: 0000000000000000
                R13: 0000000000000000  R14: 0000000000000000  R15: 00007f52e8c75a60
                ORIG_RAX: 00000000000000a6  CS: 0033  SS: 002b
            

            These last stacks seem to indicate that the Lustre page-flushing mechanism was slow (hung?), but again the Lustre code used there is far from being impacted by either patch.

            On the other hand, do you have any news about exposing Change #6089 alone to the production work-load?

            bfaccini Bruno Faccini (Inactive) added a comment

            I am currently uploading the crash dump taken after the deadlock on the version with both patches applied.
            It is on the ftp server in uploads/LU-2797/deadlock-login0-2013-05-13-patches-6089+6200.tar.xz
            You will see the hung-process backtraces in the dmesg, but as the dump was taken after a failed reboot attempt, those processes had already been killed and disappeared.

            spiechurski Sebastien Piechurski added a comment

            Hello Seb,

            And thank you for the update.

            Yes, you are right, and maybe I did not state it explicitly enough, but #6089 and #6200 are 2 separate fixes and are not intended to be used together.

            About the other possible window for the race: it seems to exist, but in any case it appears less important than the first one, which you trigger and which #6089 fixes. So can you give #6089 alone a try?

            BTW, how did you manage to integrate both patches, since #6200 removes any use/declaration of lov_oinfo/loi while #6089 uses it extensively?

            And again, many thanks for your help on this.

            bfaccini Bruno Faccini (Inactive) added a comment - edited

            Hi Bruno,

            I think the package we tried included both #6089 and #6200.
            Looking into it in more detail, and given that #6200 was rejected, I understand that they were not supposed to be applied together anyway, am I right?

            There is a comment on #6089 saying that there is another window for the same kind of race. Will there be an update to that patch to close this window?

            spiechurski Sebastien Piechurski added a comment

            Hello Alex,

            Sorry about that, but do you have more information that could help me understand what is going wrong? Is there a crash-dump available, or any other data on this dead-lock? Have you or the on-site people already done some troubleshooting?

            Also, which patch/change did you integrate, #6089 or #6200?

            bfaccini Bruno Faccini (Inactive) added a comment

            Quick feedback from the site: with the proposed patch the system deadlocks almost immediately.

            louveta Alexandre Louvet (Inactive) added a comment

            We might be able to reproduce the crashing workload on another machine which is currently being installed.
            I am currently on vacation, but will try that next week when I'm back.

            spiechurski Sebastien Piechurski added a comment

            Hi Bruno,

            The bug only occurs on the login node, and this node is unique. So the people on site would like to wait for the 2 patches to be inspected and approved before testing them.

            Thanks,
            Sebastien.

            sebastien.buisson Sebastien Buisson (Inactive) added a comment

            Seb(s), Benjamin,

            Would it be possible for you to install and run builds with these 2 patches (on a few nodes/clients where the problem has already occurred) and expose them to the site workload that is likely to reproduce the crash?

            BTW, is the Meteo site already running in production?

            bfaccini Bruno Faccini (Inactive) added a comment

            People

              Assignee: bfaccini Bruno Faccini (Inactive)
              Reporter: spiechurski Sebastien Piechurski
              Votes: 0
              Watchers: 11
