Details
-
Bug
-
Resolution: Won't Fix
-
Minor
-
None
-
Lustre 2.5.3
-
bullx scs 4 AE4 with Bull Lustre based on 2.5.3.90 from git.
-
3
-
9223372036854775807
Description
One of our customers had a crash which seems very similar to LU-2388. However, the patches related to that issue are already part of the code (they landed in 2.4 and we are running 2.5.3.90).
This happened on a client, where list corruption was detected just before the crash:
WARNING: at lib/list_debug.c:48 list_del+0x6e/0xa0() (Not tainted) list_del corruption. prev->next should be ffff88015ae4ba70, but was 5a5a5a5a5a5a5a5a Pid: 1117, comm: python Not tainted 2.6.32-573.3.1.el6.x86_64 #1 Call Trace: [<ffffffff81077491>] ? warn_slowpath_common+0x91/0xe0 [<ffffffff81077596>] ? warn_slowpath_fmt+0x46/0x60 [<ffffffff81089c10>] ? process_timeout+0x0/0x10 [<ffffffff812a381e>] ? list_del+0x6e/0xa0 [<ffffffff810a18a1>] ? remove_wait_queue+0x31/0x50 [<ffffffffa0ad80cc>] ? do_statahead_enter+0x3dc/0x1890 [lustre] [<ffffffffa072cfe3>] ? ldlm_lock_add_to_lru+0x43/0x120 [ptlrpc] [<ffffffffa0732cf8>] ? ldlm_lock_decref_internal+0x358/0xad0 [ptlrpc] [<ffffffffa0733eb1>] ? ldlm_lock_decref+0x41/0x90 [ptlrpc] [<ffffffff810672b0>] ? default_wake_function+0x0/0x20 [<ffffffffa0ac0c26>] ? ll_lookup_it+0x5e6/0xb00 [lustre] [<ffffffffa0756280>] ? ptlrpc_req_finished+0x10/0x20 [ptlrpc] [<ffffffffa0a84d74>] ? ll_get_acl+0x34/0xe0 [lustre] [<ffffffffa0ac11cc>] ? ll_lookup_nd+0x8c/0x3f0 [lustre] [<ffffffff811ad32e>] ? d_alloc+0x13e/0x1b0 [<ffffffff811a1865>] ? do_lookup+0x1a5/0x230 [<ffffffff811a24f4>] ? __link_path_walk+0x7a4/0x1000 [<ffffffffa061e36f>] ? cl_env_put+0x20f/0x370 [obdclass] [<ffffffff811a300a>] ? path_walk+0x6a/0xe0 [<ffffffff811a321b>] ? filename_lookup+0x6b/0xc0 [<ffffffff811a4347>] ? user_path_at+0x57/0xa0 [<ffffffff81290405>] ? _atomic_dec_and_lock+0x55/0x80 [<ffffffff811b45c0>] ? mntput_no_expire+0x30/0x110 [<ffffffff81197514>] ? cp_new_stat+0xe4/0x100 [<ffffffff81197750>] ? vfs_fstatat+0x50/0xa0 [<ffffffff811978cb>] ? vfs_stat+0x1b/0x20 [<ffffffff811978f4>] ? sys_newstat+0x24/0x50 [<ffffffff810e8ab7>] ? audit_syscall_entry+0x1d7/0x200 [<ffffffff810e88ae>] ? __audit_syscall_exit+0x25e/0x290 [<ffffffff8100b0d2>] ? system_call_fastpath+0x16/0x1b --- WARNING: at lib/list_debug.c:51 list_del+0x8d/0xa0() (Tainted: G W -- ------------ ) list_del corruption. 
next->prev should be ffff88015ae4ba70, but was 5a5a5a5a5a5a5a5a Pid: 1117, comm: python Tainted: G W -- ------------ 2.6.32-573.3.1.el6.x86_64 #1 Call Trace: [<ffffffff81077491>] ? warn_slowpath_common+0x91/0xe0 [<ffffffff81077596>] ? warn_slowpath_fmt+0x46/0x60 [<ffffffff81089c10>] ? process_timeout+0x0/0x10 [<ffffffff812a383d>] ? list_del+0x8d/0xa0 [<ffffffff810a18a1>] ? remove_wait_queue+0x31/0x50 [<ffffffffa0ad80cc>] ? do_statahead_enter+0x3dc/0x1890 [lustre] [<ffffffffa072cfe3>] ? ldlm_lock_add_to_lru+0x43/0x120 [ptlrpc] [<ffffffffa0732cf8>] ? ldlm_lock_decref_internal+0x358/0xad0 [ptlrpc] [<ffffffffa0733eb1>] ? ldlm_lock_decref+0x41/0x90 [ptlrpc] [<ffffffff810672b0>] ? default_wake_function+0x0/0x20 [<ffffffffa0ac0c26>] ? ll_lookup_it+0x5e6/0xb00 [lustre] [<ffffffffa0756280>] ? ptlrpc_req_finished+0x10/0x20 [ptlrpc] [<ffffffffa0a84d74>] ? ll_get_acl+0x34/0xe0 [lustre] [<ffffffffa0ac11cc>] ? ll_lookup_nd+0x8c/0x3f0 [lustre] [<ffffffff811ad32e>] ? d_alloc+0x13e/0x1b0 [<ffffffff811a1865>] ? do_lookup+0x1a5/0x230 [<ffffffff811a24f4>] ? __link_path_walk+0x7a4/0x1000 [<ffffffffa061e36f>] ? cl_env_put+0x20f/0x370 [obdclass] [<ffffffff811a300a>] ? path_walk+0x6a/0xe0 [<ffffffff811a321b>] ? filename_lookup+0x6b/0xc0 [<ffffffff811a4347>] ? user_path_at+0x57/0xa0 [<ffffffff81290405>] ? _atomic_dec_and_lock+0x55/0x80 [<ffffffff811b45c0>] ? mntput_no_expire+0x30/0x110 [<ffffffff81197514>] ? cp_new_stat+0xe4/0x100 [<ffffffff81197750>] ? vfs_fstatat+0x50/0xa0 [<ffffffff811978cb>] ? vfs_stat+0x1b/0x20 [<ffffffff811978f4>] ? sys_newstat+0x24/0x50 [<ffffffff810e8ab7>] ? audit_syscall_entry+0x1d7/0x200 [<ffffffff810e88ae>] ? __audit_syscall_exit+0x25e/0x290 [<ffffffff8100b0d2>] ? system_call_fastpath+0x16/0x1b ---[ end trace 28dac25152f759ab ]---
and finally triggers a GPF with the following stack:
crash> bt PID: 1117 TASK: ffff880476261520 CPU: 27 COMMAND: "python" #0 [ffff88015ae4b6a0] machine_kexec at ffffffff8103d1ab #1 [ffff88015ae4b700] crash_kexec at ffffffff810cc4f2 #2 [ffff88015ae4b7d0] oops_end at ffffffff8153ca10 #3 [ffff88015ae4b800] die at ffffffff81010f5b #4 [ffff88015ae4b830] do_general_protection at ffffffff8153c502 #5 [ffff88015ae4b860] general_protection at ffffffff8153bcd5 [exception RIP: ll_sai_unplug+38] RIP: ffffffffa0ad5386 RSP: ffff88015ae4b918 RFLAGS: 00010246 RAX: 5a5a5a5a5a5a5a5a RBX: ffff8805adbc3800 RCX: 000000000000bf31 RDX: ffff8805adbc3858 RSI: ffff88061923f440 RDI: ffff8805adbc3800 RBP: ffff88015ae4b998 R8: 0000000000000000 R9: ffff8800000bdf00 R10: 0000000000000000 R11: 0000000000000198 R12: ffff88061923f440 R13: 0000000000000000 R14: ffff88015ae4bb00 R15: ffff8805adbc3858 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #6 [ffff88015ae4b9a0] do_statahead_enter at ffffffffa0ad7e6f [lustre] #7 [ffff88015ae4bac0] ll_lookup_it at ffffffffa0ac0c26 [lustre] #8 [ffff88015ae4bbb0] ll_lookup_nd at ffffffffa0ac11cc [lustre] #9 [ffff88015ae4bbf0] do_lookup at ffffffff811a1865 #10 [ffff88015ae4bc50] __link_path_walk at ffffffff811a24f4 #11 [ffff88015ae4bd30] path_walk at ffffffff811a300a #12 [ffff88015ae4bd70] filename_lookup at ffffffff811a321b #13 [ffff88015ae4bdb0] user_path_at at ffffffff811a4347 #14 [ffff88015ae4be80] vfs_fstatat at ffffffff81197750 #15 [ffff88015ae4bee0] vfs_stat at ffffffff811978cb #16 [ffff88015ae4bef0] sys_newstat at ffffffff811978f4 #17 [ffff88015ae4bf80] system_call_fastpath at ffffffff8100b0d2 RIP: 00000030db4dae35 RSP: 00007f184162cc40 RFLAGS: 00010206 RAX: 0000000000000004 RBX: ffffffff8100b0d2 RCX: 00007f1842232a30 RDX: 00007f184162ccb0 RSI: 00007f184162ccb0 RDI: 00007f183417e280 RBP: 00007f183417e280 R8: 00000000ffffffff R9: 7469502f455f7963 R10: 2f352e33372f6863 R11: 0000000000000246 R12: 0000000000000000 R13: 00007f184e3a6ae0 R14: 0000000002029910 R15: 0000000000000000 ORIG_RAX: 0000000000000004 CS: 0033 
SS: 002b
The crash occurs when dereferencing the i_sb field of sai->sai_inode:
static void ll_sai_unplug(struct ll_statahead_info *sai, struct ll_sa_entry *entry) { struct ptlrpc_thread *thread = &sai->sai_thread; struct ll_sb_info *sbi = ll_i2sbi(sai->sai_inode); <=== HERE int hit;
Both the sai and entry are poisoned:
crash> struct ll_sa_entry ffff88061923f440 struct ll_sa_entry { se_link = { next = 0x5a5a5a5a5a5a5a5a, prev = 0x5a5a5a5a5a5a5a5a }, se_list = { next = 0x5a5a5a5a5a5a5a5a, prev = 0x5a5a5a5a5a5a5a5a }, se_hash = { next = 0x5a5a5a5a5a5a5a5a, prev = 0x5a5a5a5a5a5a5a5a }, se_refcount = { counter = 1515870810 }, se_index = 6510615555426900570, se_handle = 6510615555426900570, se_stat = 1515870810, se_size = 1515870810, se_minfo = 0x5a5a5a5a5a5a5a5a, se_req = 0x5a5a5a5a5a5a5a5a, se_inode = 0x5a5a5a5a5a5a5a5a, se_qstr = { hash = 1515870810, len = 1515870810, name = 0x5a5a5a5a5a5a5a5a <Address 0x5a5a5a5a5a5a5a5a out of bounds> } } struct ll_statahead_info { sai_inode = 0x5a5a5a5a5a5a5a5a, sai_refcount = { counter = 1515870810 }, sai_generation = 1515870810, sai_max = 1515870810, sai_sent = 6510615555426900570, sai_replied = 6510615555426900570, sai_index = 6510615555426900570, sai_index_wait = 6510615555426900570, sai_hit = 6510615555426900570, sai_miss = 6510615555426900570, sai_consecutive_miss = 1515870810, sai_miss_hidden = 1515870810, sai_skip_hidden = 1515870810, sai_ls_all = 0, sai_in_readpage = 1, sai_agl_valid = 0, sai_waitq = { lock = { raw_lock = { slock = 1515936347 } }, task_list = { next = 0xffff8805adbc3860, prev = 0xffff8805adbc3860 } }, sai_thread = { t_link = { next = 0x5a5a5a5a5a5a5a5a, prev = 0x5a5a5a5a5a5a5a5a }, t_data = 0x5a5a5a5a5a5a5a5a, t_flags = 1515870810, t_id = 1515870810, t_pid = 1515870810, t_watchdog = 0x5a5a5a5a5a5a5a5a, t_svcpt = 0x5a5a5a5a5a5a5a5a, t_ctl_waitq = { lock = { raw_lock = { slock = 1515870810 } }, task_list = { next = 0x5a5a5a5a5a5a5a5a, prev = 0x5a5a5a5a5a5a5a5a }
The call to ll_sai_unplug is there in do_statahead_enter:
1658 } 1659 } 1660 1661 ll_sai_unplug(sai, entry); <=== 1662 RETURN(rc); 1663 } 1664 1665 /* I am the "lli_opendir_pid" owner, only me can set "lli_sai". */ 1666 rc = is_first_dirent(dir, *dentryp); 1667 if (rc == LS_NONE_FIRST_DE) 1668 /* It is not "ls -{a}l" operation, no need statahead for it. */ 1669 GOTO(out, rc = -EAGAIN);
There is no statahead process running:
crash> ps | grep ll_sa crash>
Confirmed by:
crash> struct ll_inode_info.u.d.d_opendir_pid 0xffff8807abf12540 u.d.d_opendir_pid = 0
The statahead info structure was also detached...
crash> struct ll_inode_info.u.d.d_sai 0xffff8807abf12540 u.d.d_sai = 0x0,
I guess there is still a race somewhere, but I can't see where.
I am uploading a tarball with all sources, dump and required binaries for a more advanced analysis.
Attachments
Issue Links
- is related to
-
LU-2388 Oops in ll_sai_unplug
- Resolved