[LU-8312] (statahead.c:351:do_sa_entry_fini()) ASSERTION( !ll_sa_entry_unhashed(entry) ) Created: 21/Jun/16 Updated: 22/Jun/16 |
|
| Status: | Open |
| Project: | Lustre |
| Component/s: | None |
| Affects Version/s: | Lustre 2.5.3 |
| Fix Version/s: | None |
| Type: | Bug | Priority: | Minor |
| Reporter: | Bruno Travouillon (Inactive) | Assignee: | Lai Siyao |
| Resolution: | Unresolved | Votes: | 0 |
| Labels: | None | ||
| Environment: |
Lustre 2.5.3 w/ bull patches |
||
| Issue Links: |
|
||||||||
| Severity: | 3 | ||||||||
| Rank (Obsolete): | 9223372036854775807 | ||||||||
| Description |
|
A customer is reporting a LBUG on a Lustre client. After a look into the core dump, it appears that Lusre was trying to use a structure that has been destroyed. crash> sys
KERNEL: /dumps/lib/kernel-debuginfo/2.6.32-573.12.1.el6.Bull.88.x86_64/modules/vmlinux
DUMPFILE: vmcore [PARTIAL DUMP]
CPUS: 48 [OFFLINE: 24]
DATE: Tue May 31 10:26:49 2016
UPTIME: 76 days, 00:27:50
LOAD AVERAGE: 0.16, 0.90, 3.76
TASKS: 1132
NODENAME: taurusi5095
RELEASE: 2.6.32-573.12.1.el6.Bull.88.x86_64
VERSION: #1 SMP Thu Jan 7 01:45:02 CET 2016
MACHINE: x86_64 (2494 Mhz)
MEMORY: 63.8 GB
PANIC: "Kernel panic - not syncing: LBUG"
The LBUG is: <0>LustreError: 20671:0:(statahead.c:351:do_sa_entry_fini()) ASSERTION( !ll_sa_entry_unhashed(entry) ) failed: <0>LustreError: 20671:0:(statahead.c:351:do_sa_entry_fini()) LBUG crash> bt
PID: 20671 TASK: ffff8803bfc66ab0 CPU: 12 COMMAND: "java"
#0 [ffff880209777740] machine_kexec at ffffffff8103d30b
#1 [ffff8802097777a0] crash_kexec at ffffffff810cc4a2
#2 [ffff880209777870] panic at ffffffff81538dc9
#3 [ffff8802097778f0] lbug_with_loc at ffffffffa0561eeb [libcfs]
#4 [ffff880209777910] ll_sai_unplug at ffffffffa0b5f8f7 [lustre]
#5 [ffff8802097779a0] do_statahead_enter at ffffffffa0b61e6f [lustre]
#6 [ffff880209777ac0] ll_lookup_it at ffffffffa0b4ac26 [lustre]
#7 [ffff880209777bb0] ll_lookup_nd at ffffffffa0b4b1cc [lustre]
#8 [ffff880209777bf0] do_lookup at ffffffff811a2055
#9 [ffff880209777c50] __link_path_walk at ffffffff811a2bd3
#10 [ffff880209777d30] path_walk at ffffffff811a378a
#11 [ffff880209777d70] filename_lookup at ffffffff811a399b
#12 [ffff880209777db0] user_path_at at ffffffff811a4ac7
#13 [ffff880209777e80] vfs_fstatat at ffffffff81197e60
#14 [ffff880209777ee0] vfs_stat at ffffffff81197fdb
#15 [ffff880209777ef0] sys_newstat at ffffffff81198004
#16 [ffff880209777f80] system_call_fastpath at ffffffff8100b0d2
RIP: 000000371eedae35 RSP: 00002ae0d973ab70 RFLAGS: 00000216
RAX: 0000000000000004 RBX: ffffffff8100b0d2 RCX: 0000000001000000
RDX: 00002ae0d973ac90 RSI: 00002ae0d973ac90 RDI: 000000000250ce70
RBP: 00002ae0d973ad40 R8: 74756f2e656c6966 R9: 3132363736343634
R10: 3030305f30363131 R11: 0000000000000246 R12: 000000000250ce70
R13: 0000000000000000 R14: 0000000002327800 R15: 00000000023279f8
ORIG_RAX: 0000000000000004 CS: 0033 SS: 002b
ll_sai_unplug -> ll_sa_entry_fini -> do_sa_entry_fini 346 static inline void 347 do_sa_entry_fini(struct ll_statahead_info *sai, struct ll_sa_entry *entry) 348 { 349 struct ll_inode_info *lli = ll_i2info(sai->sai_inode); 350 351 LASSERT(!ll_sa_entry_unhashed(entry)); 352 LASSERT(!cfs_list_empty(&entry->se_link)); 353 354 ll_sa_entry_unhash(sai, entry); 355 356 spin_lock(&lli->lli_sa_lock); 357 entry->se_stat = SA_ENTRY_DEST; 358 cfs_list_del_init(&entry->se_link); 359 if (likely(!cfs_list_empty(&entry->se_list))) 360 cfs_list_del_init(&entry->se_list); 361 spin_unlock(&lli->lli_sa_lock); 362 363 ll_sa_entry_put(sai, entry); 364 } 365 366 /* 367 * Delete it from sai_entries_stated list when fini. 368 */ 369 static void 370 ll_sa_entry_fini(struct ll_statahead_info *sai, struct ll_sa_entry *entry) 371 { 372 struct ll_sa_entry *pos, *next; 373 374 if (entry) 375 do_sa_entry_fini(sai, entry); 376 377 /* drop old entry, only 'scanner' process does this, no need to lock */ 378 cfs_list_for_each_entry_safe(pos, next, &sai->sai_entries, se_link) { 379 if (!is_omitted_entry(sai, pos->se_index)) 380 break; 381 do_sa_entry_fini(sai, pos); 382 } 383 } LASSERT condition: 90 static inline int ll_sa_entry_unhashed(struct ll_sa_entry *entry) 91 { 92 return cfs_list_empty(&entry->se_hash); 93 } 1483 static void 1484 ll_sai_unplug(struct ll_statahead_info *sai, struct ll_sa_entry *entry) 1485 { 1486 struct ptlrpc_thread *thread = &sai->sai_thread; 1487 struct ll_sb_info *sbi = ll_i2sbi(sai->sai_inode); 1488 int hit; 1489 ENTRY; 1490 1491 if (entry != NULL && entry->se_stat == SA_ENTRY_SUCC) 1492 hit = 1; 1493 else 1494 hit = 0; 1495 1496 ll_sa_entry_fini(sai, entry); <==== HERE [...] crash> dis -rl ffffffffa0b61e6f [...] 0xffffffffa0b61e64 : mov %rbx,%rsi 0xffffffffa0b61e67 : mov %r12,%rdi 0xffffffffa0b61e6a : callq 0xffffffffa0b5f360 rbx => ll_statahead_info *sai r12 => ll_sa_entry *entry crash> dis -rl ffffffffa0b5f8f7 0xffffffffa0b5f360 : push %rbp 0xffffffffa0b5f361 : mov %rsp,%rbp 0xffffffffa0b5f364 : push %r15 0xffffffffa0b5f366 : push %r14 0xffffffffa0b5f368 : push %r13 0xffffffffa0b5f36a : push %r12 0xffffffffa0b5f36c : push %rbx [...] crash> bt -f PID: 20671 TASK: ffff8803bfc66ab0 CPU: 12 COMMAND: "java" #0 [ffff880209777740] machine_kexec at ffffffff8103d30b ffff880209777748: 00000000030a3000 ffff8800030a3000 ffff880209777758: 00000000030a2000 0000000000000000 ffff880209777768: 8800000000000000 ffff88085237ffff ffff880209777778: 0000000000000000 ffff8802097777a8 ffff880209777788: ffff88085237c000 ffff880875163460 ffff880209777798: ffff880209777868 ffffffff810cc4a2 #1 [ffff8802097777a0] crash_kexec at ffffffff810cc4a2 ffff8802097777a8: 0000000000000001 ffff880875163460 ffff8802097777b8: ffff88085237c000 ffff880875163440 ffff8802097777c8: ffff880209777868 0000000000000000 ffff8802097777d8: 0000000000000198 0000000000000000 ffff8802097777e8: ffff8800000be180 0000000000000000 ffff8802097777f8: 0000000000000001 0000000000001d20 ffff880209777808: 0000000000000000 0000000000000001 ffff880209777818: 0000000000000002 ffffffff81011035 ffff880209777828: ffffffff810cc52f 0000000000000010 ffff880209777838: 0000000000000046 ffff8802097777a8 ffff880209777848: 0000000000000018 0000000000000004 ffff880209777858: ffffffffa058435b ffff880875163440 ffff880209777868: ffff8802097778e8 ffffffff81538dc9 #2 [ffff880209777870] panic at ffffffff81538dc9 ffff880209777878: ffffffffa0590ac0 ffffffffa058435b ffff880209777888: ffffffff00000008 ffff8802097778f8 ffff880209777898: ffff8802097778a8 0000000000000000 ffff8802097778a8: 0000000000313533 0000000000000282 ffff8802097778b8: ffffffff81ab26e0 0000000000000000 ffff8802097778c8: 0000000000000000 00000000ffffffff ffff8802097778d8: ffffffffa0b9ea20 ffffffffa0b9ea20 ffff8802097778e8: ffff880209777908 ffffffffa0561eeb #3 [ffff8802097778f0] lbug_with_loc at ffffffffa0561eeb [libcfs] ffff8802097778f8: ffff880851f1fa40 ffff88101b4c5800 ffff880209777908: ffff880209777998 ffffffffa0b5f8f7 #4 [ffff880209777910] ll_sai_unplug at ffffffffa0b5f8f7 [lustre] ffff880209777918: ffff880209777948 ffffffffa07bdeb1 ffff880209777928: ffff880209777948 ffff880209777a08 ffff880209777938: ffff880209777958 0000000000000001 ffff880209777948: ffff880209777978 ffffffffa0afcd2f ffff880209777958: 8246520fbee09a58 ffff8804f3090168 ffff880209777968: ffff880209777a08 ffff880875163440 ffff880209777978: ffff88101b4c5800 0000000000000001 ffff880209777988: ffff881079e1b540 ffff881079e1b548 ffff880209777998: ffff880209777ab8 ffffffffa0b61e6f #5 [ffff8802097779a0] do_statahead_enter at ffffffffa0b61e6f [lustre] r12 => ffff88101b4c5800 rbx => ffff880875163440 struct ll_sa_entry { cfs_list_t se_link; cfs_list_t se_list; cfs_list_t se_hash; cfs_atomic_t se_refcount; __u64 se_index; __u64 se_handle; se_stat_t se_stat; int se_size; struct md_enqueue_info *se_minfo; struct ptlrpc_request *se_req; struct inode *se_inode; struct qstr se_qstr; } SIZE: 120 crash> struct ll_sa_entry ffff88101b4c5800 struct ll_sa_entry { se_link = { next = 0x5a5a5a5a5a5a5a5a, prev = 0x5a5a5a5a5a5a5a5a }, se_list = { next = 0x5a5a5a5a5a5a5a5a, prev = 0x5a5a5a5a5a5a5a5a }, se_hash = { next = 0x5a5a5a5a5a5a5a5a, prev = 0x5a5a5a5a5a5a5a5a }, se_refcount = { counter = 1515870810 }, se_index = 6510615555426900570, se_handle = 6510615555426900570, se_stat = 1515870810, se_size = 1515870810, se_minfo = 0x5a5a5a5a5a5a5a5a, se_req = 0x5a5a5a5a5a5a5a5a, se_inode = 0x5a5a5a5a5a5a5a5a, se_qstr = { hash = 1515870810, len = 1515870810, name = 0x5a5a5a5a5a5a5a5a } } The ll_sa_entry.se_hash list is empty. crash> x/a 0xffff88101b4c5800
0xffff88101b4c5800: 0x5a5a5a5a5a5a5a5a
crash> struct ll_sa_entry.se_hash ffff88101b4c5800
se_hash = {
next = 0x5a5a5a5a5a5a5a5a,
prev = 0x5a5a5a5a5a5a5a5a
}
Except the ASSERTION, there is nothing helpful in the debug log. I asked the customer to enable the D_READA facility if this is reproducible. However, they hit this LBUG once for now. Do you need additional traces or the vmcore to further analyze this issue? |
| Comments |
| Comment by Joseph Gmitter (Inactive) [ 21/Jun/16 ] |
|
Hi Lai, Can you please assist here? Looks to be related to Thanks. |
| Comment by Lai Siyao [ 22/Jun/16 ] |
|
Hmm, the statahead code is immature in 2.5, and quite a number of fixes were landed to 2.7. Considering the total LOC is big, there is no plan to port them back, so I'd suggest you turn statahead off on all lustre client: `echo 0 > /proc/fs/lustre/llite/*/statahead_max`. |