Details
-
Bug
-
Resolution: Won't Fix
-
Minor
-
None
-
Lustre 2.5.3
-
bullx scs 4 AE4 with Bull Lustre based on 2.5.3.90 from git.
-
3
-
9223372036854775807
Description
One of our customer had a crash which seems very similar to LU-2388. However, the patches related to this issue are already part of the code (as this was landed for 2.4 and we are running 2.5.3.90).
This happened on a client, where there were corrupted list detected just before the crash:
WARNING: at lib/list_debug.c:48 list_del+0x6e/0xa0() (Not tainted) list_del corruption. prev->next should be ffff88015ae4ba70, but was 5a5a5a5a5a5a5a5a Pid: 1117, comm: python Not tainted 2.6.32-573.3.1.el6.x86_64 #1 Call Trace: [<ffffffff81077491>] ? warn_slowpath_common+0x91/0xe0 [<ffffffff81077596>] ? warn_slowpath_fmt+0x46/0x60 [<ffffffff81089c10>] ? process_timeout+0x0/0x10 [<ffffffff812a381e>] ? list_del+0x6e/0xa0 [<ffffffff810a18a1>] ? remove_wait_queue+0x31/0x50 [<ffffffffa0ad80cc>] ? do_statahead_enter+0x3dc/0x1890 [lustre] [<ffffffffa072cfe3>] ? ldlm_lock_add_to_lru+0x43/0x120 [ptlrpc] [<ffffffffa0732cf8>] ? ldlm_lock_decref_internal+0x358/0xad0 [ptlrpc] [<ffffffffa0733eb1>] ? ldlm_lock_decref+0x41/0x90 [ptlrpc] [<ffffffff810672b0>] ? default_wake_function+0x0/0x20 [<ffffffffa0ac0c26>] ? ll_lookup_it+0x5e6/0xb00 [lustre] [<ffffffffa0756280>] ? ptlrpc_req_finished+0x10/0x20 [ptlrpc] [<ffffffffa0a84d74>] ? ll_get_acl+0x34/0xe0 [lustre] [<ffffffffa0ac11cc>] ? ll_lookup_nd+0x8c/0x3f0 [lustre] [<ffffffff811ad32e>] ? d_alloc+0x13e/0x1b0 [<ffffffff811a1865>] ? do_lookup+0x1a5/0x230 [<ffffffff811a24f4>] ? __link_path_walk+0x7a4/0x1000 [<ffffffffa061e36f>] ? cl_env_put+0x20f/0x370 [obdclass] [<ffffffff811a300a>] ? path_walk+0x6a/0xe0 [<ffffffff811a321b>] ? filename_lookup+0x6b/0xc0 [<ffffffff811a4347>] ? user_path_at+0x57/0xa0 [<ffffffff81290405>] ? _atomic_dec_and_lock+0x55/0x80 [<ffffffff811b45c0>] ? mntput_no_expire+0x30/0x110 [<ffffffff81197514>] ? cp_new_stat+0xe4/0x100 [<ffffffff81197750>] ? vfs_fstatat+0x50/0xa0 [<ffffffff811978cb>] ? vfs_stat+0x1b/0x20 [<ffffffff811978f4>] ? sys_newstat+0x24/0x50 [<ffffffff810e8ab7>] ? audit_syscall_entry+0x1d7/0x200 [<ffffffff810e88ae>] ? __audit_syscall_exit+0x25e/0x290 [<ffffffff8100b0d2>] ? system_call_fastpath+0x16/0x1b --- WARNING: at lib/list_debug.c:51 list_del+0x8d/0xa0() (Tainted: G W -- ------------ ) list_del corruption. next->prev should be ffff88015ae4ba70, but was 5a5a5a5a5a5a5a5a Pid: 1117, comm: python Tainted: G W -- ------------ 2.6.32-573.3.1.el6.x86_64 #1 Call Trace: [<ffffffff81077491>] ? warn_slowpath_common+0x91/0xe0 [<ffffffff81077596>] ? warn_slowpath_fmt+0x46/0x60 [<ffffffff81089c10>] ? process_timeout+0x0/0x10 [<ffffffff812a383d>] ? list_del+0x8d/0xa0 [<ffffffff810a18a1>] ? remove_wait_queue+0x31/0x50 [<ffffffffa0ad80cc>] ? do_statahead_enter+0x3dc/0x1890 [lustre] [<ffffffffa072cfe3>] ? ldlm_lock_add_to_lru+0x43/0x120 [ptlrpc] [<ffffffffa0732cf8>] ? ldlm_lock_decref_internal+0x358/0xad0 [ptlrpc] [<ffffffffa0733eb1>] ? ldlm_lock_decref+0x41/0x90 [ptlrpc] [<ffffffff810672b0>] ? default_wake_function+0x0/0x20 [<ffffffffa0ac0c26>] ? ll_lookup_it+0x5e6/0xb00 [lustre] [<ffffffffa0756280>] ? ptlrpc_req_finished+0x10/0x20 [ptlrpc] [<ffffffffa0a84d74>] ? ll_get_acl+0x34/0xe0 [lustre] [<ffffffffa0ac11cc>] ? ll_lookup_nd+0x8c/0x3f0 [lustre] [<ffffffff811ad32e>] ? d_alloc+0x13e/0x1b0 [<ffffffff811a1865>] ? do_lookup+0x1a5/0x230 [<ffffffff811a24f4>] ? __link_path_walk+0x7a4/0x1000 [<ffffffffa061e36f>] ? cl_env_put+0x20f/0x370 [obdclass] [<ffffffff811a300a>] ? path_walk+0x6a/0xe0 [<ffffffff811a321b>] ? filename_lookup+0x6b/0xc0 [<ffffffff811a4347>] ? user_path_at+0x57/0xa0 [<ffffffff81290405>] ? _atomic_dec_and_lock+0x55/0x80 [<ffffffff811b45c0>] ? mntput_no_expire+0x30/0x110 [<ffffffff81197514>] ? cp_new_stat+0xe4/0x100 [<ffffffff81197750>] ? vfs_fstatat+0x50/0xa0 [<ffffffff811978cb>] ? vfs_stat+0x1b/0x20 [<ffffffff811978f4>] ? sys_newstat+0x24/0x50 [<ffffffff810e8ab7>] ? audit_syscall_entry+0x1d7/0x200 [<ffffffff810e88ae>] ? __audit_syscall_exit+0x25e/0x290 [<ffffffff8100b0d2>] ? system_call_fastpath+0x16/0x1b ---[ end trace 28dac25152f759ab ]---
and finally triggers a GPF with the following stack:
crash> bt
PID: 1117 TASK: ffff880476261520 CPU: 27 COMMAND: "python"
#0 [ffff88015ae4b6a0] machine_kexec at ffffffff8103d1ab
#1 [ffff88015ae4b700] crash_kexec at ffffffff810cc4f2
#2 [ffff88015ae4b7d0] oops_end at ffffffff8153ca10
#3 [ffff88015ae4b800] die at ffffffff81010f5b
#4 [ffff88015ae4b830] do_general_protection at ffffffff8153c502
#5 [ffff88015ae4b860] general_protection at ffffffff8153bcd5
[exception RIP: ll_sai_unplug+38]
RIP: ffffffffa0ad5386 RSP: ffff88015ae4b918 RFLAGS: 00010246
RAX: 5a5a5a5a5a5a5a5a RBX: ffff8805adbc3800 RCX: 000000000000bf31
RDX: ffff8805adbc3858 RSI: ffff88061923f440 RDI: ffff8805adbc3800
RBP: ffff88015ae4b998 R8: 0000000000000000 R9: ffff8800000bdf00
R10: 0000000000000000 R11: 0000000000000198 R12: ffff88061923f440
R13: 0000000000000000 R14: ffff88015ae4bb00 R15: ffff8805adbc3858
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#6 [ffff88015ae4b9a0] do_statahead_enter at ffffffffa0ad7e6f [lustre]
#7 [ffff88015ae4bac0] ll_lookup_it at ffffffffa0ac0c26 [lustre]
#8 [ffff88015ae4bbb0] ll_lookup_nd at ffffffffa0ac11cc [lustre]
#9 [ffff88015ae4bbf0] do_lookup at ffffffff811a1865
#10 [ffff88015ae4bc50] __link_path_walk at ffffffff811a24f4
#11 [ffff88015ae4bd30] path_walk at ffffffff811a300a
#12 [ffff88015ae4bd70] filename_lookup at ffffffff811a321b
#13 [ffff88015ae4bdb0] user_path_at at ffffffff811a4347
#14 [ffff88015ae4be80] vfs_fstatat at ffffffff81197750
#15 [ffff88015ae4bee0] vfs_stat at ffffffff811978cb
#16 [ffff88015ae4bef0] sys_newstat at ffffffff811978f4
#17 [ffff88015ae4bf80] system_call_fastpath at ffffffff8100b0d2
RIP: 00000030db4dae35 RSP: 00007f184162cc40 RFLAGS: 00010206
RAX: 0000000000000004 RBX: ffffffff8100b0d2 RCX: 00007f1842232a30
RDX: 00007f184162ccb0 RSI: 00007f184162ccb0 RDI: 00007f183417e280
RBP: 00007f183417e280 R8: 00000000ffffffff R9: 7469502f455f7963
R10: 2f352e33372f6863 R11: 0000000000000246 R12: 0000000000000000
R13: 00007f184e3a6ae0 R14: 0000000002029910 R15: 0000000000000000
ORIG_RAX: 0000000000000004 CS: 0033 SS: 002b
The crash occurs when derefencing the i_sb field of sai->sai_inode:
static void
ll_sai_unplug(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
{
struct ptlrpc_thread *thread = &sai->sai_thread;
struct ll_sb_info *sbi = ll_i2sbi(sai->sai_inode); <=== HERE
int hit;
Both the sai and entry are poisoned:
crash> struct ll_sa_entry ffff88061923f440
struct ll_sa_entry {
se_link = {
next = 0x5a5a5a5a5a5a5a5a,
prev = 0x5a5a5a5a5a5a5a5a
},
se_list = {
next = 0x5a5a5a5a5a5a5a5a,
prev = 0x5a5a5a5a5a5a5a5a
},
se_hash = {
next = 0x5a5a5a5a5a5a5a5a,
prev = 0x5a5a5a5a5a5a5a5a
},
se_refcount = {
counter = 1515870810
},
se_index = 6510615555426900570,
se_handle = 6510615555426900570,
se_stat = 1515870810,
se_size = 1515870810,
se_minfo = 0x5a5a5a5a5a5a5a5a,
se_req = 0x5a5a5a5a5a5a5a5a,
se_inode = 0x5a5a5a5a5a5a5a5a,
se_qstr = {
hash = 1515870810,
len = 1515870810,
name = 0x5a5a5a5a5a5a5a5a <Address 0x5a5a5a5a5a5a5a5a out of bounds>
}
}
struct ll_statahead_info {
sai_inode = 0x5a5a5a5a5a5a5a5a,
sai_refcount = {
counter = 1515870810
},
sai_generation = 1515870810,
sai_max = 1515870810,
sai_sent = 6510615555426900570,
sai_replied = 6510615555426900570,
sai_index = 6510615555426900570,
sai_index_wait = 6510615555426900570,
sai_hit = 6510615555426900570,
sai_miss = 6510615555426900570,
sai_consecutive_miss = 1515870810,
sai_miss_hidden = 1515870810,
sai_skip_hidden = 1515870810,
sai_ls_all = 0,
sai_in_readpage = 1,
sai_agl_valid = 0,
sai_waitq = {
lock = {
raw_lock = {
slock = 1515936347
}
},
task_list = {
next = 0xffff8805adbc3860,
prev = 0xffff8805adbc3860
}
},
sai_thread = {
t_link = {
next = 0x5a5a5a5a5a5a5a5a,
prev = 0x5a5a5a5a5a5a5a5a
},
t_data = 0x5a5a5a5a5a5a5a5a,
t_flags = 1515870810,
t_id = 1515870810,
t_pid = 1515870810,
t_watchdog = 0x5a5a5a5a5a5a5a5a,
t_svcpt = 0x5a5a5a5a5a5a5a5a,
t_ctl_waitq = {
lock = {
raw_lock = {
slock = 1515870810
}
},
task_list = {
next = 0x5a5a5a5a5a5a5a5a,
prev = 0x5a5a5a5a5a5a5a5a
}
The call to ll_sai_unplug is there in do_statahead_enter:
1658 }
1659 }
1660
1661 ll_sai_unplug(sai, entry); <===
1662 RETURN(rc);
1663 }
1664
1665 /* I am the "lli_opendir_pid" owner, only me can set "lli_sai". */
1666 rc = is_first_dirent(dir, *dentryp);
1667 if (rc == LS_NONE_FIRST_DE)
1668 /* It is not "ls -{a}l" operation, no need statahead for it. */
1669 GOTO(out, rc = -EAGAIN);
There is no statahead process running:
crash> ps | grep ll_sa crash>
Confirmed by :
crash> struct ll_inode_info.u.d.d_opendir_pid 0xffff8807abf12540 u.d.d_opendir_pid = 0
The statahead info structure was also detached...
crash> struct ll_inode_info.u.d.d_sai 0xffff8807abf12540 u.d.d_sai = 0x0,
I guess there is a remaining race somewhere, but can't see where.
I am uploading a tarball with all sources, dump and required binaries for a more advanced analysis.
Attachments
Issue Links
- is related to
-
LU-2388 Oops in ll_sai_unplug
-
- Resolved
-