Details
- Type: Bug
- Resolution: Unresolved
- Priority: Minor
- Affects Version/s: Lustre 2.5.3
- Environment: Lustre 2.5.3 w/ bull patches
- Severity: 3
Description
A customer is reporting an LBUG on a Lustre client.
A look into the core dump shows that Lustre was trying to use a structure that had already been destroyed.
crash> sys
KERNEL: /dumps/lib/kernel-debuginfo/2.6.32-573.12.1.el6.Bull.88.x86_64/modules/vmlinux
DUMPFILE: vmcore [PARTIAL DUMP]
CPUS: 48 [OFFLINE: 24]
DATE: Tue May 31 10:26:49 2016
UPTIME: 76 days, 00:27:50
LOAD AVERAGE: 0.16, 0.90, 3.76
TASKS: 1132
NODENAME: taurusi5095
RELEASE: 2.6.32-573.12.1.el6.Bull.88.x86_64
VERSION: #1 SMP Thu Jan 7 01:45:02 CET 2016
MACHINE: x86_64 (2494 Mhz)
MEMORY: 63.8 GB
PANIC: "Kernel panic - not syncing: LBUG"
The LBUG is:
<0>LustreError: 20671:0:(statahead.c:351:do_sa_entry_fini()) ASSERTION( !ll_sa_entry_unhashed(entry) ) failed:
<0>LustreError: 20671:0:(statahead.c:351:do_sa_entry_fini()) LBUG
crash> bt
PID: 20671 TASK: ffff8803bfc66ab0 CPU: 12 COMMAND: "java"
#0 [ffff880209777740] machine_kexec at ffffffff8103d30b
#1 [ffff8802097777a0] crash_kexec at ffffffff810cc4a2
#2 [ffff880209777870] panic at ffffffff81538dc9
#3 [ffff8802097778f0] lbug_with_loc at ffffffffa0561eeb [libcfs]
#4 [ffff880209777910] ll_sai_unplug at ffffffffa0b5f8f7 [lustre]
#5 [ffff8802097779a0] do_statahead_enter at ffffffffa0b61e6f [lustre]
#6 [ffff880209777ac0] ll_lookup_it at ffffffffa0b4ac26 [lustre]
#7 [ffff880209777bb0] ll_lookup_nd at ffffffffa0b4b1cc [lustre]
#8 [ffff880209777bf0] do_lookup at ffffffff811a2055
#9 [ffff880209777c50] __link_path_walk at ffffffff811a2bd3
#10 [ffff880209777d30] path_walk at ffffffff811a378a
#11 [ffff880209777d70] filename_lookup at ffffffff811a399b
#12 [ffff880209777db0] user_path_at at ffffffff811a4ac7
#13 [ffff880209777e80] vfs_fstatat at ffffffff81197e60
#14 [ffff880209777ee0] vfs_stat at ffffffff81197fdb
#15 [ffff880209777ef0] sys_newstat at ffffffff81198004
#16 [ffff880209777f80] system_call_fastpath at ffffffff8100b0d2
RIP: 000000371eedae35 RSP: 00002ae0d973ab70 RFLAGS: 00000216
RAX: 0000000000000004 RBX: ffffffff8100b0d2 RCX: 0000000001000000
RDX: 00002ae0d973ac90 RSI: 00002ae0d973ac90 RDI: 000000000250ce70
RBP: 00002ae0d973ad40 R8: 74756f2e656c6966 R9: 3132363736343634
R10: 3030305f30363131 R11: 0000000000000246 R12: 000000000250ce70
R13: 0000000000000000 R14: 0000000002327800 R15: 00000000023279f8
ORIG_RAX: 0000000000000004 CS: 0033 SS: 002b
The call path is ll_sai_unplug() -> ll_sa_entry_fini() -> do_sa_entry_fini():
346 static inline void
347 do_sa_entry_fini(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
348 {
349         struct ll_inode_info *lli = ll_i2info(sai->sai_inode);
350
351         LASSERT(!ll_sa_entry_unhashed(entry));
352         LASSERT(!cfs_list_empty(&entry->se_link));
353
354         ll_sa_entry_unhash(sai, entry);
355
356         spin_lock(&lli->lli_sa_lock);
357         entry->se_stat = SA_ENTRY_DEST;
358         cfs_list_del_init(&entry->se_link);
359         if (likely(!cfs_list_empty(&entry->se_list)))
360                 cfs_list_del_init(&entry->se_list);
361         spin_unlock(&lli->lli_sa_lock);
362
363         ll_sa_entry_put(sai, entry);
364 }
365
366 /*
367  * Delete it from sai_entries_stated list when fini.
368  */
369 static void
370 ll_sa_entry_fini(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
371 {
372         struct ll_sa_entry *pos, *next;
373
374         if (entry)
375                 do_sa_entry_fini(sai, entry);
376
377         /* drop old entry, only 'scanner' process does this, no need to lock */
378         cfs_list_for_each_entry_safe(pos, next, &sai->sai_entries, se_link) {
379                 if (!is_omitted_entry(sai, pos->se_index))
380                         break;
381                 do_sa_entry_fini(sai, pos);
382         }
383 }
LASSERT condition:
90 static inline int ll_sa_entry_unhashed(struct ll_sa_entry *entry)
91 {
92         return cfs_list_empty(&entry->se_hash);
93 }
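For reference, ll_sa_entry_unhashed() is just the standard circular-list emptiness test: an entry counts as "unhashed" once its se_hash node points back to itself, i.e. once it has been unlinked from its hash bucket. Below is a minimal user-space sketch of that invariant with a hand-rolled list (a stand-in for cfs_list_t, not the Lustre code):

#include <stdio.h>

/* Simplified circular doubly-linked list node, standing in for cfs_list_t. */
struct list_node {
        struct list_node *next, *prev;
};

static void list_init(struct list_node *n)      /* node points to itself */
{
        n->next = n->prev = n;
}

static void list_add(struct list_node *n, struct list_node *head)
{
        n->next = head->next;
        n->prev = head;
        head->next->prev = n;
        head->next = n;
}

static void list_del_init(struct list_node *n)  /* unlink and reset to "empty" */
{
        n->prev->next = n->next;
        n->next->prev = n->prev;
        list_init(n);
}

static int list_empty(const struct list_node *n)
{
        return n->next == n;                    /* what ll_sa_entry_unhashed() tests */
}

int main(void)
{
        struct list_node bucket, se_hash;       /* hash bucket head and entry node */

        list_init(&bucket);
        list_init(&se_hash);

        list_add(&se_hash, &bucket);            /* "hashed": linked into a bucket */
        printf("hashed:   empty=%d\n", list_empty(&se_hash));   /* prints 0 */

        list_del_init(&se_hash);                /* "unhashed": points back to itself */
        printf("unhashed: empty=%d\n", list_empty(&se_hash));   /* prints 1 */

        return 0;
}

So the ASSERTION expects the entry to still be linked into its hash bucket when do_sa_entry_fini() runs.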
1483 static void
1484 ll_sai_unplug(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
1485 {
1486         struct ptlrpc_thread *thread = &sai->sai_thread;
1487         struct ll_sb_info    *sbi    = ll_i2sbi(sai->sai_inode);
1488         int hit;
1489         ENTRY;
1490
1491         if (entry != NULL && entry->se_stat == SA_ENTRY_SUCC)
1492                 hit = 1;
1493         else
1494                 hit = 0;
1495
1496         ll_sa_entry_fini(sai, entry);    <==== HERE
[...]
crash> dis -rl ffffffffa0b61e6f
[...]
0xffffffffa0b61e64 : mov %rbx,%rsi
0xffffffffa0b61e67 : mov %r12,%rdi
0xffffffffa0b61e6a : callq 0xffffffffa0b5f360

rbx => ll_statahead_info *sai
r12 => ll_sa_entry *entry

crash> dis -rl ffffffffa0b5f8f7
0xffffffffa0b5f360 : push %rbp
0xffffffffa0b5f361 : mov %rsp,%rbp
0xffffffffa0b5f364 : push %r15
0xffffffffa0b5f366 : push %r14
0xffffffffa0b5f368 : push %r13
0xffffffffa0b5f36a : push %r12
0xffffffffa0b5f36c : push %rbx
[...]

crash> bt -f
PID: 20671  TASK: ffff8803bfc66ab0  CPU: 12  COMMAND: "java"
 #0 [ffff880209777740] machine_kexec at ffffffff8103d30b
    ffff880209777748: 00000000030a3000 ffff8800030a3000
    ffff880209777758: 00000000030a2000 0000000000000000
    ffff880209777768: 8800000000000000 ffff88085237ffff
    ffff880209777778: 0000000000000000 ffff8802097777a8
    ffff880209777788: ffff88085237c000 ffff880875163460
    ffff880209777798: ffff880209777868 ffffffff810cc4a2
 #1 [ffff8802097777a0] crash_kexec at ffffffff810cc4a2
    ffff8802097777a8: 0000000000000001 ffff880875163460
    ffff8802097777b8: ffff88085237c000 ffff880875163440
    ffff8802097777c8: ffff880209777868 0000000000000000
    ffff8802097777d8: 0000000000000198 0000000000000000
    ffff8802097777e8: ffff8800000be180 0000000000000000
    ffff8802097777f8: 0000000000000001 0000000000001d20
    ffff880209777808: 0000000000000000 0000000000000001
    ffff880209777818: 0000000000000002 ffffffff81011035
    ffff880209777828: ffffffff810cc52f 0000000000000010
    ffff880209777838: 0000000000000046 ffff8802097777a8
    ffff880209777848: 0000000000000018 0000000000000004
    ffff880209777858: ffffffffa058435b ffff880875163440
    ffff880209777868: ffff8802097778e8 ffffffff81538dc9
 #2 [ffff880209777870] panic at ffffffff81538dc9
    ffff880209777878: ffffffffa0590ac0 ffffffffa058435b
    ffff880209777888: ffffffff00000008 ffff8802097778f8
    ffff880209777898: ffff8802097778a8 0000000000000000
    ffff8802097778a8: 0000000000313533 0000000000000282
    ffff8802097778b8: ffffffff81ab26e0 0000000000000000
    ffff8802097778c8: 0000000000000000 00000000ffffffff
    ffff8802097778d8: ffffffffa0b9ea20 ffffffffa0b9ea20
    ffff8802097778e8: ffff880209777908 ffffffffa0561eeb
 #3 [ffff8802097778f0] lbug_with_loc at ffffffffa0561eeb [libcfs]
    ffff8802097778f8: ffff880851f1fa40 ffff88101b4c5800
    ffff880209777908: ffff880209777998 ffffffffa0b5f8f7
 #4 [ffff880209777910] ll_sai_unplug at ffffffffa0b5f8f7 [lustre]
    ffff880209777918: ffff880209777948 ffffffffa07bdeb1
    ffff880209777928: ffff880209777948 ffff880209777a08
    ffff880209777938: ffff880209777958 0000000000000001
    ffff880209777948: ffff880209777978 ffffffffa0afcd2f
    ffff880209777958: 8246520fbee09a58 ffff8804f3090168
    ffff880209777968: ffff880209777a08 ffff880875163440
    ffff880209777978: ffff88101b4c5800 0000000000000001
    ffff880209777988: ffff881079e1b540 ffff881079e1b548
    ffff880209777998: ffff880209777ab8 ffffffffa0b61e6f
 #5 [ffff8802097779a0] do_statahead_enter at ffffffffa0b61e6f [lustre]

r12 => ffff88101b4c5800
rbx => ffff880875163440

struct ll_sa_entry {
    cfs_list_t se_link;
    cfs_list_t se_list;
    cfs_list_t se_hash;
    cfs_atomic_t se_refcount;
    __u64 se_index;
    __u64 se_handle;
    se_stat_t se_stat;
    int se_size;
    struct md_enqueue_info *se_minfo;
    struct ptlrpc_request *se_req;
    struct inode *se_inode;
    struct qstr se_qstr;
}
SIZE: 120

crash> struct ll_sa_entry ffff88101b4c5800
struct ll_sa_entry {
  se_link = {
    next = 0x5a5a5a5a5a5a5a5a,
    prev = 0x5a5a5a5a5a5a5a5a
  },
  se_list = {
    next = 0x5a5a5a5a5a5a5a5a,
    prev = 0x5a5a5a5a5a5a5a5a
  },
  se_hash = {
    next = 0x5a5a5a5a5a5a5a5a,
    prev = 0x5a5a5a5a5a5a5a5a
  },
  se_refcount = {
    counter = 1515870810
  },
  se_index = 6510615555426900570,
  se_handle = 6510615555426900570,
  se_stat = 1515870810,
  se_size = 1515870810,
  se_minfo = 0x5a5a5a5a5a5a5a5a,
  se_req = 0x5a5a5a5a5a5a5a5a,
  se_inode = 0x5a5a5a5a5a5a5a5a,
  se_qstr = {
    hash = 1515870810,
    len = 1515870810,
    name = 0x5a5a5a5a5a5a5a5a
  }
}
The ll_sa_entry.se_hash list, like the rest of the entry, now contains only the 0x5a poison pattern:
crash> x/a 0xffff88101b4c5800
0xffff88101b4c5800: 0x5a5a5a5a5a5a5a5a

crash> struct ll_sa_entry.se_hash ffff88101b4c5800
  se_hash = {
    next = 0x5a5a5a5a5a5a5a5a,
    prev = 0x5a5a5a5a5a5a5a5a
  }
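As a side note on the values above: 0x5a5a5a5a is 1515870810 in decimal, so every field of the entry holds the same repeating 0x5a byte. I assume this is the usual poison-on-free pattern written into an allocation when it is released with memory debugging enabled, which is consistent with the entry having already been freed when it was used. Below is a minimal user-space sketch of why a stale pointer to such an object reads back all-0x5a, using a hand-rolled refcount and poisoning (an illustration of the general pattern, not the actual ll_sa_entry_put() code):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Toy refcounted object; the fields loosely mirror the ones in the dump. */
struct toy_entry {
        int      refcount;
        uint64_t index;
        void    *inode;
};

/* Drop a reference; on the last put, poison the object with 0x5a.
 * In the kernel the memory would also be handed back to the allocator at
 * this point; the free() is kept out of this sketch only so that the
 * demonstration read in main() stays well-defined. */
static void toy_entry_put(struct toy_entry *e)
{
        if (--e->refcount == 0)
                memset(e, 0x5a, sizeof(*e));    /* poison on "free" */
}

int main(void)
{
        struct toy_entry *e = calloc(1, sizeof(*e));
        struct toy_entry *stale = e;            /* a second user keeps a raw pointer */

        e->refcount = 1;
        e->index    = 42;

        toy_entry_put(e);                       /* last reference dropped: object poisoned */

        /* Every field read through the stale pointer now comes back as the
         * poison pattern: 0x5a5a5a5a == 1515870810 for 32-bit fields and
         * 0x5a5a5a5a5a5a5a5a for 64-bit ones, exactly as in the vmcore. */
        printf("stale refcount = %d (0x%x)\n", stale->refcount, stale->refcount);
        printf("stale index    = 0x%llx\n", (unsigned long long)stale->index);

        free(e);
        return 0;
}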
Apart from the ASSERTION, there is nothing helpful in the debug log. I have asked the customer to enable the D_READA debug facility if the issue is reproducible; for now, however, they have only hit this LBUG once.
Do you need additional traces or the vmcore to further analyze this issue?
Attachments
Issue Links
- is related to LU-3270 ptlrpcd strnlen crash trying to log a message (Resolved)