Details
-
Bug
-
Resolution: Fixed
-
Blocker
-
Lustre 2.5.0, Lustre 2.4.3
-
3
-
8769
Description
Running racer.sh on 2.4.50-79-gaed8203 with some local patches (LU-3072, LU-3348, LU-3233, LU-3448), I can reproduce the following oops in vvp_io_fault_iter_init():
00000100:00100000:0.0:1371755423.271136:0:30780:0:(client.c:1805:ptlrpc_check_set()) Completed RPC pname:cluuid:pid:xid:nid:opc 19:1868864a-63ce-938d-b909-2dada17703ae:30780:1438389041886196:0@lo:49 00000080:00020000:0.0:1371755423.271146:0:30780:0:(vvp_io.c:1241:vvp_io_init()) lustre: refresh file layout [0x2c0000401:0x3467:0x0] error -13. BUG: unable to handle kernel NULL pointer dereference at 00000000000000b0 IP: [<ffffffffa0ce2bbc>] vvp_io_fault_iter_init+0x4c/0xc0 [lustre] ... Pid: 30780, comm: 19 Tainted: P --------------- 2.6.32-279.19.1.el6_lustre_gcov.x86_64 #1 Bochs Bochs RIP: 0010:[<ffffffffa0ce2bbc>] [<ffffffffa0ce2bbc>] vvp_io_fault_iter_init+0x4c/0xc0 [lustre] RSP: 0018:ffff88014196daf8 EFLAGS: 00010292 RAX: 0000000000000000 RBX: ffff88016554b870 RCX: 0000000000000000 RDX: ffff880160625400 RSI: ffffffffa0d0f0a0 RDI: ffff8801647e4ca0 RBP: ffff88014196db18 R08: 0000000000000000 R09: ffff880164fe68c8 R10: 0000000000000003 R11: 0000000000000000 R12: ffff8801647e4c68 R13: ffff880147805738 R14: ffff88016554c610 R15: ffff88014196dbd8 FS: 00007f6d52008700(0000) GS:ffff880028200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000001eac45c CR3: 0000000141551000 CR4: 00000000000006f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process 19 (pid: 30780, threadinfo ffff88014196c000, task ffff88016547e040) Stack: ffff88016554b870 ffff880164fe68c8 ffff8801647e4c68 ffff88014196dbc8 <d> ffff88014196db48 ffffffffa04a529d ffff88016547e040 ffff880164fe68c8 <d> ffff8801647e4c68 ffff88016356b148 ffff88014196db78 ffffffffa04a9cec Call Trace: [<ffffffffa04a529d>] cl_io_iter_init+0x5d/0x110 [obdclass] [<ffffffffa04a9cec>] cl_io_loop+0x4c/0x1b0 [obdclass] [<ffffffffa0cc6552>] ll_fault+0x2c2/0x4d0 [lustre] [<ffffffff8113bd54>] __do_fault+0x54/0x510 [<ffffffff81128750>] ? 
__lru_cache_add+0x40/0x90 [<ffffffff8113c307>] handle_pte_fault+0xf7/0xb50 [<ffffffff81278cec>] ? __bitmap_weight+0x8c/0xb0 [<ffffffff8116ba07>] ? mem_cgroup_update_file_mapped+0x17/0x90 [<ffffffff8114536a>] ? page_remove_rmap+0x7a/0xa0 [<ffffffff8113cf9a>] handle_mm_fault+0x23a/0x310 [<ffffffff810432d9>] __do_page_fault+0x139/0x480 [<ffffffff81196b40>] ? mntput_no_expire+0x30/0x110 [<ffffffff811793e1>] ? __fput+0x1a1/0x210 [<ffffffff8113fcee>] ? remove_vma+0x6e/0x90 [<ffffffff814f0f5e>] do_page_fault+0x3e/0xa0 [<ffffffff814ee315>] page_fault+0x25/0x30 Code: 89 f3 49 89 fc e8 65 ff ff ff 48 8b 7b 08 49 89 c6 e8 69 71 ff ff 48 89 de 4c 89 e7 49 89 c5 e8 db 96 ff ff 48 8b 80 a8 00 00 00 <48> 8b 80 b0 00 00 00 48 8b 40 18 4c 3b 68 10 75 22 49 8b 85 80 RIP [<ffffffffa0ce2bbc>] vvp_io_fault_iter_init+0x4c/0xc0 [lustre] RSP <ffff88014196daf8> CR2: 00000000000000b0
The oops is in the assertion
LASSERT(inode == cl2ccc_io(env, ios)->cui_fd->fd_file->f_dentry->d_inode);
because the ccc_io has a NULL cui_fd member:
crash> p *((struct cl_io *)0xffff880164fe68c8)
$3 = {
ci_type = CIT_FAULT,
ci_state = CIS_ZERO,
ci_obj = 0xffff880142b48148,
ci_parent = 0x0,
ci_layers = {
next = 0xffff88016554b888,
prev = 0xffff88016554b888
},
...
},
ci_lockreq = CILR_MANDATORY,
u = {
...
ci_fault = {
ft_index = 3,
ft_nob = 0,
ft_writable = 0,
ft_executable = 4,
ft_mkwrite = 0,
ft_page = 0x0
},
...
},
...
ci_nob = 0,
ci_result = 0,
ci_continue = 0,
ci_no_srvlock = 0,
ci_need_restart = 0,
ci_ignore_layout = 0,
ci_verify_layout = 0,
ci_owned_nr = 0
}
crash> p *((struct ccc_io *)0xffff88016554b870)
$9 = {
cui_cl = {
cis_io = 0xffff880164fe68c8,
cis_obj = 0xffff880142b48148,
cis_iop = 0xffffffffa0ce8200,
cis_linkage = {
next = 0xffff880164fe68e0,
prev = 0xffff880164fe68e0
}
},
cui_link = {
cill_linkage = {
next = 0x0,
prev = 0x0
},
cill_descr = {
cld_obj = 0x0,
cld_start = 0,
cld_end = 0,
cld_gid = 0,
cld_mode = CLM_PHANTOM,
cld_enq_flags = 0
},
cill_lock = 0x0,
cill_fini = 0
},
cui_iov = 0x0,
cui_nrsegs = 0,
cui_tot_nrsegs = 0,
cui_iov_olen = 0,
cui_tot_count = 0,
u = {
setattr = {
cui_local_lock = SETATTR_NOLOCK
}
},
cui_glimpse = 0,
cui_layout_gen = 4294967294,
cui_fd = 0x0,
cui_iocb = 0x0
}
I'm not sure why MDS_GETXATTR is returning -EACCES, but from here it's easy to see that ll_fault_io_init() fails to handle the subsequent error from cl_io_init() and returns an io without cui_fd being set.