[LU-2797] GPF in osc_send_oap_rpc Created: 12/Feb/13 Updated: 17/Jul/13 Resolved: 26/Jun/13 |
|
| Status: | Resolved |
| Project: | Lustre |
| Component/s: | None |
| Affects Version/s: | Lustre 2.1.3 |
| Fix Version/s: | Lustre 2.1.6 |
| Type: | Bug | Priority: | Major |
| Reporter: | Sebastien Piechurski | Assignee: | Bruno Faccini (Inactive) |
| Resolution: | Fixed | Votes: | 0 |
| Labels: | None |
| Environment: | Lustre 2.1.3 Bull.2.308 |
| Severity: | 3 |
| Epic: | client |
| Rank (Obsolete): | 6773 |
| Description |
|
While doing a tar on a login node, the node hit a general protection fault in osc_send_oap_rpc. It looks like a race in which the loi_oap_pages structure passed to osc_send_oap_rpc gets poisoned:
general protection fault: 0000 [#1] SMP
last sysfs file: /sys/devices/pci0000:80/0000:80:02.0/0000:83:00.0/host7/rport-7:0-0/target7:0:0/7:0:0:18/state
CPU 3
Modules linked in: tcp_diag inet_diag iptable_filter ip_tables nfs fscache lmv(U) mgc(U) lustre(U) lov(U) osc(U) mdc(U)
lquota(U) fid(U) fld(U) ko2iblnd(U) ptlrpc(U) obdclass(U) lnet(U) lvfs(U) libcfs(U) nfsd lockd nfs_acl auth_rpcgss exportfs
ipmi_devintf ipmi_si ipmi_msghandler sunrpc rdma_ucm(U) ib_sdp(U) rdma_cm(U) iw_cm(U) ib_addr(U) ib_ipoib(U) ib_cm(U)
ib_sa(U) ipv6 ib_uverbs(U) ib_umad(U) mlx4_ib(U) mlx4_core(U) ib_mthca(U) ib_mad(U) ib_core(U) dm_mirror dm_region_hash
dm_log dm_round_robin scsi_dh_rdac dm_multipath dm_mod uinput usbhid hid sg lpfc scsi_transport_fc scsi_tgt sb_edac
edac_core i2c_i801 i2c_core iTCO_wdt iTCO_vendor_support igb ioatdma dca ext4 mbcache jbd2 ehci_hcd sd_mod crc_t10dif
ahci megaraid_sas [last unloaded: scsi_wait_scan]
Pid: 30811, comm: tar Not tainted 2.6.32-220.23.1.bl6.Bull.28.8.x86_64 #1 Bull SAS bullx R/X9DRH-7TF/7F/iTF/iF
RIP: 0010:[<ffffffffa09f8cb1>] [<ffffffffa09f8cb1>] osc_send_oap_rpc+0x61/0x1b40 [osc]
RSP: 0018:ffff8805da6f3588 EFLAGS: 00010296
RAX: 5a5a5a5a5a5a5a42 RBX: ffff8804710c6a80 RCX: 5a5a5a5a5a5a5a5a
RDX: ffff8804710c6a80 RSI: ffff881067b22648 RDI: ffff8804710c6aa8
RBP: ffff8805da6f36b8 R08: ffff8804710c6a98 R09: 00000000000057bf
R10: 0000000000000d9d R11: b000000000000000 R12: ffff881067b22648
R13: ffff8804710c6a98 R14: ffff8805da6f3668 R15: ffff8804710c6a98
FS: 00007f77e18f17a0(0000) GS:ffff880028260000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 00000032bcd9b4f8 CR3: 000000083eaac000 CR4: 00000000000406e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff4ff0 DR7: 0000000000000400
0xffffffffa09f8c50 <osc_send_oap_rpc>: push %rbp
0xffffffffa09f8c51 <osc_send_oap_rpc+1>: mov %rsp,%rbp
0xffffffffa09f8c54 <osc_send_oap_rpc+4>: push %r15
0xffffffffa09f8c56 <osc_send_oap_rpc+6>: push %r14
0xffffffffa09f8c58 <osc_send_oap_rpc+8>: push %r13
0xffffffffa09f8c5a <osc_send_oap_rpc+10>: push %r12
0xffffffffa09f8c5c <osc_send_oap_rpc+12>: push %rbx
0xffffffffa09f8c5d <osc_send_oap_rpc+13>: sub $0x108,%rsp
0xffffffffa09f8c64 <osc_send_oap_rpc+20>: nopl 0x0(%rax,%rax,1)
0xffffffffa09f8c69 <osc_send_oap_rpc+25>: testb $0x1,-0x4f27ec(%rip) # 0xffffffffa0506484
0xffffffffa09f8c70 <osc_send_oap_rpc+32>: lea -0x50(%rbp),%r14
0xffffffffa09f8c74 <osc_send_oap_rpc+36>: mov %rdi,-0x78(%rbp)
0xffffffffa09f8c78 <osc_send_oap_rpc+40>: mov %rsi,%r12
0xffffffffa09f8c7b <osc_send_oap_rpc+43>: mov %rdx,-0xc0(%rbp)
0xffffffffa09f8c82 <osc_send_oap_rpc+50>: mov %ecx,-0xac(%rbp)
0xffffffffa09f8c88 <osc_send_oap_rpc+56>: mov %r8,%r15
0xffffffffa09f8c8b <osc_send_oap_rpc+59>: mov %r14,-0x50(%rbp)
0xffffffffa09f8c8f <osc_send_oap_rpc+63>: mov %r14,-0x48(%rbp)
0xffffffffa09f8c93 <osc_send_oap_rpc+67>: je 0xffffffffa09f8ca2
0xffffffffa09f8c95 <osc_send_oap_rpc+69>: testb $0x8,-0x4f281c(%rip) # 0xffffffffa0506480
0xffffffffa09f8c9c <osc_send_oap_rpc+76>: jne 0xffffffffa09f9500
0xffffffffa09f8ca2 <osc_send_oap_rpc+82>: mov 0x10(%r15),%rcx <= %rcx comes from %r15, which
itself comes from %r8 = 5th argument of osc_send_oap_rpc
0xffffffffa09f8ca6 <osc_send_oap_rpc+86>: lea 0x10(%r15),%rdi
0xffffffffa09f8caa <osc_send_oap_rpc+90>: lea -0x18(%rcx),%rax <= %rax is now
5a5a5a5a5a5a5a42, i.e. %rcx - 0x18
0xffffffffa09f8cae <osc_send_oap_rpc+94>: cmp %rcx,%rdi
0xffffffffa09f8cb1 <osc_send_oap_rpc+97>: mov 0x18(%rax),%rdx <= Crashed here
0xffffffffa09f8cb5 <osc_send_oap_rpc+101>: je 0xffffffffa09f8d50
0xffffffffa09f8cbb <osc_send_oap_rpc+107>: sub $0x18,%rdx
0xffffffffa09f8cbf <osc_send_oap_rpc+111>: xor %ecx,%ecx
0xffffffffa09f8cc1 <osc_send_oap_rpc+113>: jmp 0xffffffffa09f8d10
0xffffffffa09f8cc3 <osc_send_oap_rpc+115>: nopl 0x0(%rax,%rax,1)
static int
osc_send_oap_rpc(const struct lu_env *env, struct client_obd *cli,
struct lov_oinfo *loi,
int cmd, struct loi_oap_pages *lop)
%r15 = %r8 = lop
crash> struct loi_oap_pages ffff8804710c6a98
struct loi_oap_pages {
lop_pending = {
next = 0x5a5a5a5a5a5a5a5a,
prev = 0x5a5a5a5a5a5a5a5a
},
lop_urgent = {
next = 0x5a5a5a5a5a5a5a5a,
prev = 0x5a5a5a5a5a5a5a5a
},
lop_pending_group = {
next = 0x5a5a5a5a5a5a5a5a,
prev = 0x5a5a5a5a5a5a5a5a
},
lop_num_pending = 1515870810
}
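For reference, the 0x5a fill seen in every field above is the poison pattern Lustre writes over freed structures in debug builds (LI_POISON, as annotated further below), which is why the registers in the GPF all carry 0x5a-based values. A minimal, self-contained sketch (illustrative helper only, not Lustre code) of how such a poisoned pointer can be recognised while reading the crash:

#include <stdint.h>
#include <stdio.h>

/* The 0x5a fill byte repeated over a 64-bit word, as dumped above. */
#define POISON_WORD 0x5a5a5a5a5a5a5a5aULL

/* Heuristic: a "pointer" whose upper bytes are all 0x5a was read from freed,
 * poisoned memory rather than from a live list head. */
static int looks_poisoned(uint64_t ptr)
{
        return (ptr >> 16) == (POISON_WORD >> 16);
}

int main(void)
{
        uint64_t rcx = 0x5a5a5a5a5a5a5a5aULL;   /* lop_pending.next from the dump */
        uint64_t rax = rcx - 0x18;              /* what "lea -0x18(%rcx),%rax" computed */

        printf("rcx %#llx poisoned: %d\n", (unsigned long long)rcx, looks_poisoned(rcx));
        printf("rax %#llx poisoned: %d\n", (unsigned long long)rax, looks_poisoned(rax));
        return 0;
}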
static int
osc_send_oap_rpc(const struct lu_env *env, struct client_obd *cli,
struct lov_oinfo *loi,
int cmd, struct loi_oap_pages *lop)
{
struct ptlrpc_request *req;
obd_count page_count = 0;
struct osc_async_page *oap = NULL, *tmp;
struct osc_brw_async_args *aa;
const struct obd_async_page_ops *ops;
CFS_LIST_HEAD(rpc_list);
int srvlock = 0, mem_tight = 0;
struct cl_object *clob = NULL;
obd_off starting_offset = OBD_OBJECT_EOF;
unsigned int ending_offset;
int starting_page_off = 0;
ENTRY;
/* ASYNC_HP pages first. At present, when the lock the pages is
* to be canceled, the pages covered by the lock will be sent out
* with ASYNC_HP. We have to send out them as soon as possible. */
cfs_list_for_each_entry_safe(oap, tmp, &lop->lop_urgent, oap_urgent_item) {
^^^^^^^^^^^^^^^
== LI_POISON
if (oap->oap_async_flags & ASYNC_HP)
cfs_list_move(&oap->oap_pending_item, &rpc_list);
else if (!(oap->oap_brw_flags & OBD_BRW_SYNC))
/* only do this for writeback pages. */
cfs_list_move_tail(&oap->oap_pending_item, &rpc_list);
if (++page_count >= cli->cl_max_pages_per_rpc)
break;
}
I attach a file with the output of dmesg, ps, bt, bt -a and foreach bt, if you need more elements. |
| Comments |
| Comment by Peter Jones [ 12/Feb/13 ] |
|
Bruno, could you please help with this one? Thanks, Peter |
| Comment by Bruno Faccini (Inactive) [ 12/Feb/13 ] |
|
Hello Seb !! |
| Comment by Sebastien Piechurski [ 13/Feb/13 ] |
|
Hi Bruno ! Sorry for the bad formatting of the initial description ... Here is the list of Jiras corresponding to the included patches. Tell me if you need more information from the dump. Seb. |
| Comment by Bruno Faccini (Inactive) [ 18/Feb/13 ] |
|
Hello Seb, Concerning the crash-dump, we need to extract more information from it, and I would like you to retrieve and dump the 2nd parameter of osc_send_oap_rpc() (a struct client_obd *), which was saved in %r12 (R12: ffff881067b22648) at the time of the GPF. |
| Comment by Bruno Faccini (Inactive) [ 19/Feb/13 ] |
|
The problem looks like a race while parsing one of the client_obd's lists of lov_oinfo (cl_loi_hp_ready_list/cl_loi_ready_list/cl_loi_write_list/cl_loi_read_list). This may come from the fact that there are multiple "tar" processes running and competing to access these lists/structs for this same client_obd ... But this should not happen if list accesses/updates are safely protected by cl_loi_list_lock !! |
| Comment by Sebastien Piechurski [ 22/Feb/13 ] |
|
Here is the requested client_obd struct: struct client_obd {
cl_sem = {
count = 0,
wait_lock = {
raw_lock = {
slock = 0
}
},
wait_list = {
next = 0xffff881067b22658,
prev = 0xffff881067b22658
}
},
cl_target_uuid = {
uuid = "scratch-OST0000_UUID\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000"
},
cl_import = 0xffff88105d718800,
cl_conn_count = 1,
cl_default_mds_easize = 0,
cl_max_mds_easize = 48,
cl_max_mds_cookiesize = 32,
cl_sp_me = LUSTRE_SP_CLI,
cl_sp_to = LUSTRE_SP_OST,
cl_flvr_mgc = {
sf_rpc = 0,
sf_flags = 0,
u_rpc = {<No data fields>},
u_bulk = {
hash = {
hash_alg = 0 '\000'
}
}
},
cl_llcd_offset = 0x0,
cl_dirty = 33554432,
cl_dirty_max = 33554432,
cl_dirty_transit = 0,
cl_avail_grant = 30281728,
cl_lost_grant = 4096,
cl_cache_waiters = {
next = 0xffff880ddcc97738,
prev = 0xffff880ea95c7738
},
cl_next_shrink_grant = 4875484627,
cl_grant_shrink_list = {
next = 0xffff881067b22708,
prev = 0xffff881067b22708
},
cl_grant_sem = {
lock = {
raw_lock = {
slock = 0
}
},
count = 1,
wait_list = {
next = 0xffff881067b22720,
prev = 0xffff881067b22720
}
},
cl_grant_shrink_interval = 1200,
cl_loi_list_lock = {
lock = {
raw_lock = {
slock = 2091613343
}
}
},
cl_loi_ready_list = {
next = 0xffff880be329f648,
prev = 0xffff880907ea58c8
},
cl_loi_hp_ready_list = {
next = 0xffff881067b22748,
prev = 0xffff881067b22748
},
cl_loi_write_list = {
next = 0xffff8808b83aec68,
prev = 0xffff880d34b198e8
},
cl_loi_read_list = {
next = 0xffff881067b22768,
prev = 0xffff881067b22768
},
cl_r_in_flight = 0,
cl_w_in_flight = 4,
cl_pending_w_pages = 7334,
cl_pending_r_pages = 0,
cl_max_pages_per_rpc = 256,
cl_max_rpcs_in_flight = 8,
cl_read_rpc_hist = {
oh_lock = {
raw_lock = {
slock = 2887691294
}
},
oh_buckets = {191654, 46436, 23673, 14630, 6356, 5136, 4296, 8853, 3946, 968, 191, 52, 12, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
},
cl_write_rpc_hist = {
oh_lock = {
raw_lock = {
slock = 2215216137
}
},
oh_buckets = {82913, 96191, 49436, 39547, 21783, 15763, 16527, 19420, 10408, 5652, 2263, 1038, 347, 144, 34, 14, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
},
cl_read_page_hist = {
oh_lock = {
raw_lock = {
slock = 2887691294
}
},
oh_buckets = {72977, 13449, 9899, 2851, 5192, 2466, 3230, 4538, 191604, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
},
cl_write_page_hist = {
oh_lock = {
raw_lock = {
slock = 2215216137
}
},
oh_buckets = {8912, 4639, 9193, 6562, 3486, 3212, 4527, 4185, 316765, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
},
cl_read_offset_hist = {
oh_lock = {
raw_lock = {
slock = 2887691294
}
},
oh_buckets = {275181, 8501, 11733, 855, 676, 648, 1024, 2688, 4900, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
},
cl_write_offset_hist = {
oh_lock = {
raw_lock = {
slock = 2215216137
}
},
oh_buckets = {330039, 148, 272, 488, 1019, 1656, 3181, 4273, 20405, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
},
cl_destroy_in_flight = {
counter = 0
},
cl_destroy_waitq = {
lock = {
raw_lock = {
slock = 3409365814
}
},
task_list = {
next = 0xffff881067b22dd0,
prev = 0xffff881067b22dd0
}
},
cl_rpc_lock = 0x0,
cl_close_lock = 0x0,
cl_oscc = {
oscc_lock = {
raw_lock = {
slock = 1166034304
}
},
oscc_wait_create_list = {
next = 0xffff881067b22df8,
prev = 0xffff881067b22df8
},
oscc_obd = 0xffff881067b22138,
oscc_last_id = 1,
oscc_next_id = 2,
oscc_grow_count = 32,
oscc_max_grow_count = 20000,
oscc_oa = {
o_valid = 0,
o_oi = {
oi_id = 0,
oi_seq = 0
},
o_parent_seq = 0,
o_size = 0,
o_mtime = 0,
o_atime = 0,
o_ctime = 0,
o_blocks = 0,
o_grant = 0,
o_blksize = 0,
o_mode = 0,
o_uid = 0,
o_gid = 0,
o_flags = 0,
o_nlink = 0,
o_parent_oid = 0,
o_misc = 0,
o_ioepoch = 0,
o_stripe_idx = 0,
o_parent_ver = 0,
o_handle = {
cookie = 0
},
o_lcookie = {
lgc_lgl = {
lgl_oid = 0,
lgl_oseq = 0,
lgl_ogen = 0
},
lgc_subsys = 0,
lgc_index = 0,
lgc_padding = 0
},
o_uid_h = 0,
o_gid_h = 0,
o_padding_3 = 0,
o_padding_4 = 0,
o_padding_5 = 0,
o_padding_6 = 0
},
oscc_flags = 1,
oscc_waitq = {
lock = {
raw_lock = {
slock = 0
}
},
task_list = {
next = 0xffff881067b22f08,
prev = 0xffff881067b22f08
}
}
},
cl_mgc_sem = {
lock = {
raw_lock = {
slock = 0
}
},
count = 1,
wait_list = {
next = 0xffff881067b22f20,
prev = 0xffff881067b22f20
}
},
cl_mgc_vfsmnt = 0x0,
cl_mgc_configs_dir = 0x0,
cl_mgc_refcount = {
counter = 0
},
cl_mgc_mgsexp = 0x0,
cl_checksum = 1,
cl_supp_cksum_types = 3,
cl_cksum_type = OBD_CKSUM_ADLER,
cl_ar = {
ar_rc = 0,
ar_force_sync = 0,
ar_min_xid = 0
},
cl_qchk_stat = 1,
cl_seq = 0x0,
cl_resends = {
counter = 10
}
}
If needed, I can upload the whole dump. |
| Comment by Oleg Drokin [ 22/Feb/13 ] |
|
Shadow noticed there is a long-standing bug in osc_send_oap_rpc that can be traced back to the clio introduction and was made worse by the bug 18881 commit:
req = osc_build_req(env, cli, &rpc_list, page_count,
mem_tight ? (cmd | OBD_BRW_MEMALLOC) : cmd);
if (IS_ERR(req)) {
<== Here we must regain the obd_list_lock or we will work on an unprotected list; what's worse, we will return to the caller without the lock while the caller expects the lock to be held.
LASSERT(cfs_list_empty(&rpc_list));
loi_list_maint(cli, loi);
RETURN(PTR_ERR(req));
}
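For illustration, a minimal userspace sketch of the invariant described here (hypothetical names and a plain pthread mutex, not the real osc code): whoever drops the list lock around the blocking request-building step must make sure the lock is held again on every exit path, including the error one, because the caller continues under the assumption that it never lost the lock.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for osc_build_req(): the lock is dropped for long, sleepable work. */
static int build_req(int fail)
{
        int rc;

        pthread_mutex_unlock(&list_lock);
        /* ... build the request without holding the list lock ... */
        rc = fail ? -1 : 0;
        /* Re-take the lock before returning, on the success AND error paths,
         * because the caller keeps using the shared list right after the call. */
        pthread_mutex_lock(&list_lock);
        return rc;
}

int main(void)
{
        pthread_mutex_lock(&list_lock);
        if (build_req(1) < 0)
                printf("error path: list lock is held again, caller may continue\n");
        pthread_mutex_unlock(&list_lock);
        return 0;
}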
|
| Comment by Bruno Faccini (Inactive) [ 22/Feb/13 ] |
|
Hello Seb, |
| Comment by Bruno Faccini (Inactive) [ 22/Feb/13 ] |
|
Pushed patch for b2_1 at http://review.whamcloud.com/5516. |
| Comment by Alexey Lyashkov [ 23/Feb/13 ] |
|
Bruno, that fix is needed for all branches, including master. Xyratex has a dump with a different oap list corruption. The bug also exists from 2.1 (first report) up to the 2.3 release. |
| Comment by Bruno Faccini (Inactive) [ 25/Feb/13 ] |
|
Humm, it seems that due to the patch and the spin-lock/LBUG order, sanity/test_118f fails with a time-out, with the client's thread stack-trace dumps showing hangs on the new spin-lock (which means the test expects the LBUG to occur ??) !! So I am trying to revert the spin-lock/LBUG order, as commented by Alexey; this is patch #3. |
| Comment by Bruno Faccini (Inactive) [ 01/Mar/13 ] |
|
Humm too fast ..., the patch still causes sanity/test_118f (simulate/inject EINVAL via OBD_FAIL_OSC_BRW_PREP_REQ2) to fail with a timeout, with stacks like the following being printed to the console:
17:08:15:BUG: soft lockup - CPU#0 stuck for 60s! [multiop:18829]
17:08:16:CPU 0:
17:08:16:Modules linked in: loop llite_lloop(U) lustre(U) mgc(U) lov(U) osc(U) mdc(U) lmv(U) fid(U) fld(U) lquota(U) ptlrpc(U) obdclass(U) lvfs(U) ksocklnd(U) lnet(U) libcfs(U) nfs nfsd exportfs nfs_acl auth_rpcgss autofs4 hidp rfcomm l2cap bluetooth lockd sunrpc be2iscsi iscsi_tcp bnx2i cnic uio cxgb3i libcxgbi iw_cxgb3 cxgb3 8021q libiscsi_tcp ib_iser libiscsi2 scsi_transport_iscsi2 scsi_transport_iscsi ib_srp rds ib_sdp ib_ipoib ipoib_helper ipv6 xfrm_nalgo crypto_api rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm ib_addr ib_sa ib_mad ib_core dm_mirror dm_multipath scsi_dh video backlight sbs power_meter hwmon i2c_ec dell_wmi wmi button battery asus_acpi acpi_memhotplug ac parport_pc lp parport floppy i2c_piix4 i2c_core 8139too 8139cp tpm_tis pcspkr tpm tpm_bios serio_raw virtio_balloon mii dm_raid45 dm_message dm_region_hash dm_log dm_mod dm_mem_cache ata_piix libata sd_mod scsi_mod virtio_blk virtio_pci virtio_ring virtio ext3 jbd uhci_hcd ohci_hcd ehci_hcd
17:08:17:Pid: 18829, comm: multiop Tainted: G ---- 2.6.18-308.20.1.el5 #1
17:08:17:RIP: 0010:[<ffffffff80064bdc>] [<ffffffff80064bdc>] .text.lock.spinlock+0x2/0x30
17:08:17:RSP: 0018:ffff81005f9e5820 EFLAGS: 00000286
17:08:17:RAX: 0000000000000044 RBX: ffff81005f9e58c0 RCX: 0000000000000000
17:08:17:RDX: ffff81005f9e58e8 RSI: 0000000001762650 RDI: ffff81006ad22814
17:08:17:RBP: ffff81006249e7d0 R08: 5a5a5a5a5a5a5a5a R09: 5a5a5a5a5a5a5a5a
17:08:17:R10: 5a5a5a5a5a5a5a5a R11: 5a5a5a5a5a5a5a5a R12: ffffffff8893d9f3
17:08:17:R13: ffff81005d1b7280 R14: ffff81006a878380 R15: ffff81005d1b7280
17:08:17:FS: 00002af8a98ba0c0(0000) GS:ffffffff80432000(0000) knlGS:0000000000000000
17:08:17:CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
17:08:17:CR2: 00002af8a93519e0 CR3: 0000000064354000 CR4: 00000000000006e0
17:08:17:
17:08:17:Call Trace:
17:08:17: [<ffffffff88cac73c>] :osc:osc_send_oap_rpc+0x174c/0x1a80
17:08:17: [<ffffffff88cacc4e>] :osc:osc_check_rpcs+0x1de/0x450
17:08:17: [<ffffffff88caad11>] :osc:loi_list_maint+0xe1/0x110
17:08:17: [<ffffffff88cc4da0>] :osc:osc_io_submit+0x470/0x520
17:08:17: [<ffffffff88951263>] :obdclass:cl_io_submit_rw+0xd3/0x180
17:08:17: [<ffffffff88d3c5f7>] :lov:lov_io_submit+0x2a7/0xc60
17:08:17: [<ffffffff80063af9>] mutex_lock+0xd/0x1d
17:08:18: [<ffffffff88951263>] :obdclass:cl_io_submit_rw+0xd3/0x180
17:08:18: [<ffffffff88dfc6dd>] :lustre:ll_writepage+0x34d/0x460
17:08:18: [<ffffffff8001d0fb>] mpage_writepages+0x1bf/0x37d
17:08:18: [<ffffffff88dfc390>] :lustre:ll_writepage+0x0/0x460
17:08:18: [<ffffffff887f2edd>] :libcfs:libcfs_debug_msg+0x4d/0x60
17:08:18: [<ffffffff8005a9fb>] do_writepages+0x20/0x2f
17:08:18: [<ffffffff8004f8c0>] __filemap_fdatawrite_range+0x50/0x5b
17:08:18: [<ffffffff800c9888>] sync_page_range+0x3d/0xa0
17:08:18: [<ffffffff800c9b86>] generic_file_writev+0x8a/0xa3
17:08:18: [<ffffffff88e2438e>] :lustre:vvp_io_write_start+0x15e/0x210
17:08:18: [<ffffffff889513dc>] :obdclass:cl_io_start+0xcc/0x130
17:08:18: [<ffffffff88954adc>] :obdclass:cl_io_loop+0xbc/0x160
17:08:18: [<ffffffff88dd162a>] :lustre:ll_file_io_generic+0x4ba/0x500
17:08:18: [<ffffffff88dd182f>] :lustre:ll_file_writev+0x1bf/0x230
17:08:19: [<ffffffff88dddc9d>] :lustre:ll_file_write+0x15d/0x1d0
17:08:19: [<ffffffff80016b99>] vfs_write+0xce/0x174
17:08:19: [<ffffffff80017462>] sys_write+0x45/0x6e
17:08:19: [<ffffffff8005d28d>] tracesys+0xd5/0xe0
17:08:19:
Having a look at the source code with these new infos, it looks like locking cl_loi_list_lock is finally not needed, since it is already done upon errors in osc_build_req() !! |
| Comment by Sebastien Piechurski [ 05/Mar/13 ] |
|
Sorry Bruno, I just woke up from a long period without reading my open tickets. Thanks. |
| Comment by Oleg Drokin [ 05/Mar/13 ] |
|
Shadow, your analysis here is incorrect. The "missing lock" is actually there; it's taken in osc_build_req on the error exit path... Sigh, that's bad style for sure and deserves a comment. |
| Comment by Sebastien Piechurski [ 08/Mar/13 ] |
|
A tarball with all required files is currently uploading. |
| Comment by Bruno Faccini (Inactive) [ 11/Mar/13 ] |
|
Got it ! Thanks Seb. |
| Comment by Sebastien Piechurski [ 02/Apr/13 ] |
|
Hi Bruno, Do you have any news on this bug ? It seems to happen when multiple tar processes are run with output to the same file. Cheers, Seb. |
| Comment by Bruno Faccini (Inactive) [ 03/Apr/13 ] |
|
Yes, and sorry, you are right; I know that I am late on this ticket and will try to give you an update ASAP. |
| Comment by Bruno Faccini (Inactive) [ 05/Apr/13 ] |
|
Crash-dump analysis is quite painful since there are a lot of tar threads competing on the same OSC and all can be suspected, so I am in the process of reviewing them all. I also thought I could get some help from the Lustre logs/traces I extracted from the crash-dump, but no luck: they contain only D_HA traces of no interest. Since you indicate you have already had 3 crashes, would it be possible for you to enable at least D_CACHE+D_INODE+D_PAGE tracing on the (known ?) clients ?? |
| Comment by Sebastien Piechurski [ 05/Apr/13 ] |
|
Ok Bruno, I have set lnet.debug to "ioctl neterror warning error emerg ha config console inode cache page". We'll be waiting for the next crash to have the traces. Seb. |
| Comment by Benjamin Barthe (Inactive) [ 08/Apr/13 ] |
|
Hi guys, a new occurrence happened this weekend; I'm currently uploading the vmcore to the FTP. Benj. [EDIT] Upload done. |
| Comment by Sebastien Buisson (Inactive) [ 16/Apr/13 ] |
|
Hi, Did you have time to work on the crash dump supplied by Benjamin? There were 5 more occurrences this weekend at the customer's site. This is becoming more and more visible to the customer. Clearly this ticket's severity now has to be increased. Thanks, |
| Comment by Bruno Faccini (Inactive) [ 16/Apr/13 ] |
|
Agreed, this ticket is my top priority, and I have upgraded its priority to Major. |
| Comment by Bruno Faccini (Inactive) [ 17/Apr/13 ] |
|
Analysis of the new crash-dump (including the additional traces requested) is ongoing; it is not so easy due to the size of the osc_send_oap_rpc() routine and its tricky/optimized assembly code. Will update ASAP with more details. |
| Comment by Sebastien Buisson (Inactive) [ 17/Apr/13 ] |
|
Thanks Bruno, we are looking forward to hearing from you! Cheers, |
| Comment by Bruno Faccini (Inactive) [ 19/Apr/13 ] |
|
In the new crash-dump the scenario is a bit different: the panic'ing thread is parsing the page-lists/loi_oap_pages (the loi_read_lop, this is important, see later) of a lov_oinfo and, according to the register+memory content, this struct had been freed+poisoned and was even found re-allocated at the time of the crash-dump !! Thus, and I confirmed it by re-reading my debugging notes for the 1st crash-dump, my earlier comment "Problem looks like a race in parsing one of the client_obd's lists (cl_loi_hp_ready_list/cl_loi_ready_list/cl_loi_write_list/cl_loi_read_list) of lov_oinfo" is incomplete. A better description is that when osc_check_rpcs() chooses/finds a lov_oinfo (via osc_next_loi()), it may first call osc_send_oap_rpc() for the lov_oinfo's loi_write_lop page-lists. During this, cl_loi_list_lock can be released and possibly granted to somebody else, who can then free+poison the lov_oinfo while it is still linked on one of the client_obd cl_loi_[hp_ready,ready,write,read]_list's and can still be found via osc_next_loi(). Then, when cl_loi_list_lock is available again, osc_check_rpcs() can call osc_send_oap_rpc() for the lov_oinfo's loi_read_lop page-lists and trigger the problem. Possible fixes could be: (1) unlink the lov_oinfo from the client_obd cl_loi_[hp_ready,ready,write,read]_list's when it is chosen in osc_next_loi(), and keep it that way during all of this page-lists processing (but then all further calls to loi_list_maint() become useless ...); (2) revert the lock optimizations against a thread owning a critical lock for too long, which cause cl_loi_list_lock to be released in osc_build_req() (but this looks tricky and is highly suspected to have a big performance impact ...); (3) only allow one thread at a time into osc_check_rpcs() (this needs a flag plus a test-and-set/swap on it); or (4) re-choose the lov_oinfo (via osc_next_loi()) between the osc_send_oap_rpc() calls for write/read. I will give the last (simplest!) option a try and let you know how it goes.
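To make the suspected window concrete, here is a small self-contained userspace model of the sequence described above (all names are made up for illustration, this is not the actual osc code): a walker caches a pointer it found on a shared list, the list lock is dropped for a blocking step, another actor frees+poisons the object, and the walker must not trust its cached pointer afterwards. Option (4) above amounts to re-doing the lookup under the lock instead of reusing the cached pointer.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define POISON 0x5a  /* same fill byte as seen in the poisoned lov_oinfo */

/* Illustrative stand-in for lov_oinfo: just two dummy fields. */
struct oinfo {
        uint64_t write_lop;
        uint64_t read_lop;
};

static struct oinfo obj = { 1, 2 };
static struct oinfo *ready = &obj;      /* what an osc_next_loi()-like lookup returns */

static int is_poisoned(const struct oinfo *o)
{
        return ((const unsigned char *)o)[0] == POISON;
}

/* Simulates another thread freeing+poisoning the object while the
 * list lock is released inside the blocking write step. */
static void concurrent_free(void)
{
        memset(&obj, POISON, sizeof(obj));
        ready = NULL;                   /* no longer a valid candidate */
}

int main(void)
{
        /* Step 1: choose a candidate under the lock and cache the pointer. */
        struct oinfo *cached = ready;

        /* Step 2: the write step drops the lock; the object gets freed. */
        concurrent_free();

        /* Buggy pattern: reuse the cached pointer for the read step. */
        if (is_poisoned(cached))
                printf("cached pointer now points at poisoned memory\n");

        /* Safer pattern (option 4): re-choose under the lock instead. */
        if (ready == NULL)
                printf("re-lookup under the lock finds nothing left to process\n");
        return 0;
}

Of course this toy model ignores the RPC balancing done by loi_list_maint(), which is exactly what makes the real fix less obvious. |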
| Comment by Bruno Faccini (Inactive) [ 24/Apr/13 ] |
|
Patch is at http://review.whamcloud.com/6089. |
| Comment by Bruno Faccini (Inactive) [ 29/Apr/13 ] |
|
Re-choosing/refreshing the lov_oinfo between the write/read steps and calls of osc_send_oap_rpc() in osc_check_rpcs() seems to work, but it may break the balancing algorithm, and it also does not cover all cases, since the same problem is likely to also occur at the end of the read step, where the lov_oinfo may again have been freed (during a window where cl_loi_list_lock is released) and is unlinked from any client_obd cl_loi_[hp_ready,ready,write,read]_list's before loi_list_maint() is called and the next while() loop iteration starts. Since the only routine used to maintain the cl_loi_[hp_ready,ready,write,read]_list's is loi_list_maint(), it may be possible to implement the 1st fix option I listed in my prior comment, i.e. unlink the lov_oinfo from the client_obd cl_loi_[hp_ready,ready,write,read]_list's when it is chosen in osc_next_loi() and keep it that way during all of this page-lists processing; this only requires removing the calls to loi_list_maint() (which would re-link the lov_oinfo onto the cl_loi_[hp_ready,ready,write,read]_list's !!) in both osc_send_oap_rpc() and osc_check_rpcs(). The new/2nd patch/approach is at http://review.whamcloud.com/6200 .
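For illustration, a tiny userspace sketch of the "claim by unlinking" idea behind this 2nd approach (made-up names, not the patch itself): the chosen element is removed from the shared list under the lock before any blocking work, so concurrent list walkers cannot find it, and it is only re-linked (the loi_list_maint() step) once processing is complete.

#include <stdio.h>

struct item {
        int id;
        struct item *next;
};

static struct item *ready_list;         /* shared list, normally protected by a lock */

/* Claim the head of the list by unlinking it (done while holding the lock). */
static struct item *claim_next(void)
{
        struct item *it = ready_list;
        if (it != NULL)
                ready_list = it->next;  /* other walkers can no longer find 'it' */
        return it;
}

int main(void)
{
        struct item second = { 2, NULL };
        struct item first = { 1, &second };
        ready_list = &first;

        struct item *it = claim_next();
        /* The lock could now be dropped for blocking work; concurrent walkers
         * only see what is still linked, so they never pick up 'it'. */
        printf("claimed item %d; next candidate on the list is item %d\n",
               it->id, ready_list->id);
        /* Re-linking (the loi_list_maint() equivalent) is deliberately deferred
         * until the write+read processing of 'it' is finished. */
        return 0;
}

This only works because, as noted above, loi_list_maint() is the single place that re-links a lov_oinfo onto those lists. |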
| Comment by Bruno Faccini (Inactive) [ 29/Apr/13 ] |
|
Seb(s), Benjamin, Would it be possible for you to install and run builds with these 2 patches (on a few nodes/clients where the problem already occurred) and expose them to the site work-load likely to reproduce the crash ?? BTW, is the Meteo site already running in production ?? |
| Comment by Sebastien Buisson (Inactive) [ 30/Apr/13 ] |
|
Hi Bruno, The bug only occurs on the login node, and this node is unique. So people on site would like to wait for inspections and approvals of the 2 patches before testing them. Thanks, |
| Comment by Sebastien Piechurski [ 30/Apr/13 ] |
|
We might be able to reproduce the crashing workload on another machine which is currently under installation. |
| Comment by Alexandre Louvet [ 15/May/13 ] |
|
Quick feedback from the site: with the proposed patch the system deadlocks almost immediately. |
| Comment by Bruno Faccini (Inactive) [ 16/May/13 ] |
|
Hello Alex, Sorry about that, but do you have more info that may help me understand what is going wrong ?? Is there any crash-dump available, or any other info on this dead-lock ?? Did you or the on-site people do some troubleshooting already ?? Also, which patch/change did you integrate, #6089 or #6200 ?? |
| Comment by Sebastien Piechurski [ 17/May/13 ] |
|
Hi Bruno, I think the package we tried included both #6089 and #6200. There is a comment for #6089 saying that there is another window for the same kind of race. Will there be an update to this patch to close this window ? |
| Comment by Bruno Faccini (Inactive) [ 17/May/13 ] |
|
Hello Seb, And thank you for the update. Yes, you are right, and maybe I did not mention it explicitly enough, but #6089 and #6200 are 2 separate fixes and are not intended to be used together. About the other possible window for the race: it seems to exist, but in any case it appears less important than the 1st one, which is the one you trigger and which is fixed in #6089. So can you give a try to #6089 only ?? BTW, how did you manage to integrate both patches, since #6200 avoids any use/declaration of lov_oinfo/loi while #6089 uses it extensively ??!! And again, many thanks for your help on this. |
| Comment by Sebastien Piechurski [ 21/May/13 ] |
|
I am currently uploading the crash taken after the deadlock on the version with both patches applied. |
| Comment by Bruno Faccini (Inactive) [ 24/May/13 ] |
|
Hello Seb,
INFO: task get_bench.bull:18887 blocked for more than 120 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
get_bench.bul D 0000000000000016 0 18887 18828 0x00000080
 ffff88084eaafcc8 0000000000000082 0000000000000001 5a5a5a5a5a5a5a5a
 5a5a5a5a5a5a5a5a ffff88084eaafd48 ffff88084eaafc48 ffffffffa04df92e
 ffff88085ceb43e0 ffff88084eaaffd8 000000000000db00 ffff88085ceb43e0
Call Trace:
 [<ffffffffa04df92e>] ? cfs_free+0xe/0x10 [libcfs]
 [<ffffffff8117b997>] ? dput+0xc7/0x190
 [<ffffffff81487205>] rwsem_down_failed_common+0x95/0x1f0
 [<ffffffff81487383>] rwsem_down_write_failed+0x23/0x30
 [<ffffffff81262023>] call_rwsem_down_write_failed+0x13/0x20
 [<ffffffff81486832>] ? down_write+0x32/0x40
 [<ffffffffa0b2ee3b>] ll_setattr_raw+0x18b/0xf20 [lustre]
 [<ffffffffa0b2fc2f>] ll_setattr+0x5f/0x100 [lustre]
 [<ffffffff8117f4e8>] notify_change+0x168/0x340
 [<ffffffff81161343>] sys_fchmodat+0xc3/0x100
 [<ffffffff81161398>] sys_chmod+0x18/0x20
 [<ffffffff810030f2>] system_call_fastpath+0x16/0x1b
These tasks were no longer present at the time of the crash-dump, which seems to indicate that, since they should be un-interruptible at this point, the inode/ll_inode_info semaphore they were hung on became available; thus it was not really a dead-lock, and the Lustre code involved there is far from being impacted by either patch. BTW, in the crash-dump I also found 11 "cat" processes hung for about 118s, all with the same kind of stack-trace, like the following:
PID: 17234 TASK: ffff880867c87080 CPU: 2 COMMAND: "cat"
#0 [ffff88086473f138] schedule at ffffffff81484c05
#1 [ffff88086473f200] io_schedule at ffffffff814853e3
#2 [ffff88086473f220] sync_page at ffffffff810fd4ad
#3 [ffff88086473f230] __wait_on_bit_lock at ffffffff81485c2a
#4 [ffff88086473f280] __lock_page at ffffffff810fd447
#5 [ffff88086473f2e0] vvp_page_own at ffffffffa0b6220a [lustre]
#6 [ffff88086473f300] cl_page_own0 at ffffffffa05f80bb [obdclass]
#7 [ffff88086473f350] cl_page_own at ffffffffa05f8330 [obdclass]
#8 [ffff88086473f360] check_and_discard_cb at ffffffffa05fe9ff [obdclass]
#9 [ffff88086473f3b0] cl_page_gang_lookup at ffffffffa05f9994 [obdclass]
#10 [ffff88086473f460] cl_lock_page_out at ffffffffa05fb53b [obdclass]
#11 [ffff88086473f4d0] osc_lock_flush at ffffffffa0a0177f [osc]
#12 [ffff88086473f520] osc_lock_cancel at ffffffffa0a01819 [osc]
#13 [ffff88086473f570] cl_lock_cancel0 at ffffffffa05fa1c5 [obdclass]
#14 [ffff88086473f5a0] cl_lock_cancel at ffffffffa05fafeb [obdclass]
#15 [ffff88086473f5c0] osc_ldlm_blocking_ast at ffffffffa0a02f7a [osc]
#16 [ffff88086473f630] ldlm_cancel_callback at ffffffffa07031f0 [ptlrpc]
#17 [ffff88086473f650] ldlm_cli_cancel_local at ffffffffa071d0cb [ptlrpc]
#18 [ffff88086473f680] ldlm_cli_cancel_list_local at ffffffffa071f9bd [ptlrpc]
#19 [ffff88086473f6e0] ldlm_cancel_resource_local at ffffffffa071fc62 [ptlrpc]
#20 [ffff88086473f750] osc_destroy at ffffffffa09ec44f [osc]
#21 [ffff88086473f800] lov_destroy at ffffffffa0a52cad [lov]
#22 [ffff88086473f8f0] ll_objects_destroy at ffffffffa0b447b7 [lustre]
#23 [ffff88086473fa20] ll_close_inode_openhandle at ffffffffa0b142a1 [lustre]
#24 [ffff88086473faa0] ll_md_real_close at ffffffffa0b1514a [lustre]
#25 [ffff88086473fad0] ll_md_close at ffffffffa0b15425 [lustre]
#26 [ffff88086473fb80] ll_file_release at ffffffffa0b15a35 [lustre]
#27 [ffff88086473fbd0] __fput at ffffffff81164a85
#28 [ffff88086473fc20] fput at ffffffff81164bc5
#29 [ffff88086473fc30] filp_close at ffffffff8116061d
#30 [ffff88086473fc60] put_files_struct at ffffffff8105732f
#31 [ffff88086473fcb0] exit_files at ffffffff810573f3
#32 [ffff88086473fce0] do_exit at ffffffff81059475
#33 [ffff88086473fd60] do_group_exit at ffffffff81059bc8
#34 [ffff88086473fd90] get_signal_to_deliver at ffffffff8106eec6
#35 [ffff88086473fe30] do_signal at ffffffff810022d5
#36 [ffff88086473ff30] do_notify_resume at ffffffff81002af0
#37 [ffff88086473ff50] int_signal at ffffffff810033c1
RIP: 00000032bcad83f0 RSP: 00007fff228120a8 RFLAGS: 00000246
RAX: fffffffffffffffc RBX: 0000000000200000 RCX: ffffffffffffffff
RDX: 0000000000200000 RSI: 00007fbf6c365000 RDI: 0000000000000003
RBP: 00007fbf6c365000 R8: 00000000ffffffff R9: 0000000000000000
R10: 0000000000200fff R11: 0000000000000246 R12: 0000000000000003
R13: ffffffffffff8000 R14: 0000000000200000 R15: 00007fbf6c365000
ORIG_RAX: 0000000000000000 CS: 0033 SS: 002b
There is also, hung for about 112s, the Lustre umount process that was launched as part of the shutdown:
PID: 24287 TASK: ffff8810414317d0 CPU: 17 COMMAND: "umount"
#0 [ffff88105cd07418] schedule at ffffffff81484c05
#1 [ffff88105cd074e0] io_schedule at ffffffff814853e3
#2 [ffff88105cd07500] sync_page at ffffffff810fd4ad
#3 [ffff88105cd07510] __wait_on_bit_lock at ffffffff81485c2a
#4 [ffff88105cd07560] __lock_page at ffffffff810fd447
#5 [ffff88105cd075c0] vvp_page_own at ffffffffa0b6220a [lustre]
#6 [ffff88105cd075e0] cl_page_own0 at ffffffffa05f80bb [obdclass]
#7 [ffff88105cd07630] cl_page_own at ffffffffa05f8330 [obdclass]
#8 [ffff88105cd07640] check_and_discard_cb at ffffffffa05fe9ff [obdclass]
#9 [ffff88105cd07690] cl_page_gang_lookup at ffffffffa05f9994 [obdclass]
#10 [ffff88105cd07740] cl_lock_page_out at ffffffffa05fb53b [obdclass]
#11 [ffff88105cd077b0] osc_lock_flush at ffffffffa0a0177f [osc]
#12 [ffff88105cd07800] osc_lock_cancel at ffffffffa0a01819 [osc]
#13 [ffff88105cd07850] cl_lock_cancel0 at ffffffffa05fa1c5 [obdclass]
#14 [ffff88105cd07880] cl_lock_cancel at ffffffffa05fafeb [obdclass]
#15 [ffff88105cd078a0] osc_ldlm_blocking_ast at ffffffffa0a02f7a [osc]
#16 [ffff88105cd07910] ldlm_cancel_callback at ffffffffa07031f0 [ptlrpc]
#17 [ffff88105cd07930] ldlm_cli_cancel_local at ffffffffa071d0cb [ptlrpc]
#18 [ffff88105cd07960] ldlm_cli_cancel at ffffffffa0720f78 [ptlrpc]
#19 [ffff88105cd079a0] cleanup_resource at ffffffffa070ae18 [ptlrpc]
#20 [ffff88105cd07a00] ldlm_resource_clean at ffffffffa070afda [ptlrpc]
#21 [ffff88105cd07a20] cfs_hash_for_each_relax at ffffffffa04ef28f [libcfs]
#22 [ffff88105cd07ab0] cfs_hash_for_each_nolock at ffffffffa04f0caf [libcfs]
#23 [ffff88105cd07ae0] ldlm_namespace_cleanup at ffffffffa0707a69 [ptlrpc]
#24 [ffff88105cd07b00] osc_import_event at ffffffffa09f8f02 [osc]
#25 [ffff88105cd07b60] ptlrpc_invalidate_import at ffffffffa076c0f0 [ptlrpc]
#26 [ffff88105cd07c20] ptlrpc_set_import_active at ffffffffa074175d [ptlrpc]
#27 [ffff88105cd07c50] osc_iocontrol at ffffffffa09f45e0 [osc]
#28 [ffff88105cd07d00] lov_iocontrol at ffffffffa0a5a82f [lov]
#29 [ffff88105cd07ec0] ll_umount_begin at ffffffffa0b29520 [lustre]
#30 [ffff88105cd07ef0] sys_umount at ffffffff8118322a
#31 [ffff88105cd07f80] system_call_fastpath at ffffffff810030f2
RIP: 00007f52e77837a7 RSP: 00007fffc0f25188 RFLAGS: 00010206
RAX: 00000000000000a6 RBX: ffffffff810030f2 RCX: 0000000000000010
RDX: 0000000000000000 RSI: 0000000000000001 RDI: 00007f52e8c75a00
RBP: 00007f52e8c759e0 R8: 00007f52e8c75a20 R9: 0000000000000001
R10: 00007fffc0f24fb0 R11: 0000000000000206 R12: 0000000000000000
R13: 0000000000000000 R14: 0000000000000000 R15: 00007f52e8c75a60
ORIG_RAX: 00000000000000a6 CS: 0033 SS: 002b
These last stacks seem to indicate that the Lustre page-flushing mechanism was slow (hung ?), but again the Lustre code involved there is far from being impacted by either patch. On the other hand, do you have any news about exposing Change #6089 only to the production work-load ?? |
| Comment by Sebastien Piechurski [ 24/May/13 ] |
|
Hi Bruno, The workload was exercised during a whole night without a crash. Thanks for the work ! |