[LU-8312] (statahead.c:351:do_sa_entry_fini()) ASSERTION( !ll_sa_entry_unhashed(entry) ) Created: 21/Jun/16  Updated: 22/Jun/16

Status: Open
Project: Lustre
Component/s: None
Affects Version/s: Lustre 2.5.3
Fix Version/s: None

Type: Bug Priority: Minor
Reporter: Bruno Travouillon (Inactive) Assignee: Lai Siyao
Resolution: Unresolved Votes: 0
Labels: None
Environment:

Lustre 2.5.3 w/ bull patches


Issue Links:
Related
is related to LU-3270 ptlrpcd strnlen crash trying to log a... Resolved
Severity: 3
Rank (Obsolete): 9223372036854775807

 Description   

A customer is reporting a LBUG on a Lustre client.

After a look into the core dump, it appears that Lusre was trying to use a structure that has been destroyed.

crash> sys
      KERNEL: /dumps/lib/kernel-debuginfo/2.6.32-573.12.1.el6.Bull.88.x86_64/modules/vmlinux
    DUMPFILE: vmcore  [PARTIAL DUMP]
        CPUS: 48 [OFFLINE: 24]
        DATE: Tue May 31 10:26:49 2016
      UPTIME: 76 days, 00:27:50
LOAD AVERAGE: 0.16, 0.90, 3.76
       TASKS: 1132
    NODENAME: taurusi5095
     RELEASE: 2.6.32-573.12.1.el6.Bull.88.x86_64
     VERSION: #1 SMP Thu Jan 7 01:45:02 CET 2016
     MACHINE: x86_64  (2494 Mhz)
      MEMORY: 63.8 GB
       PANIC: "Kernel panic - not syncing: LBUG"

The LBUG is:

<0>LustreError: 20671:0:(statahead.c:351:do_sa_entry_fini()) ASSERTION( !ll_sa_entry_unhashed(entry) ) failed:
<0>LustreError: 20671:0:(statahead.c:351:do_sa_entry_fini()) LBUG
crash> bt
PID: 20671  TASK: ffff8803bfc66ab0  CPU: 12  COMMAND: "java"
 #0 [ffff880209777740] machine_kexec at ffffffff8103d30b
 #1 [ffff8802097777a0] crash_kexec at ffffffff810cc4a2
 #2 [ffff880209777870] panic at ffffffff81538dc9
 #3 [ffff8802097778f0] lbug_with_loc at ffffffffa0561eeb [libcfs]
 #4 [ffff880209777910] ll_sai_unplug at ffffffffa0b5f8f7 [lustre]
 #5 [ffff8802097779a0] do_statahead_enter at ffffffffa0b61e6f [lustre]
 #6 [ffff880209777ac0] ll_lookup_it at ffffffffa0b4ac26 [lustre]
 #7 [ffff880209777bb0] ll_lookup_nd at ffffffffa0b4b1cc [lustre]
 #8 [ffff880209777bf0] do_lookup at ffffffff811a2055
 #9 [ffff880209777c50] __link_path_walk at ffffffff811a2bd3
#10 [ffff880209777d30] path_walk at ffffffff811a378a
#11 [ffff880209777d70] filename_lookup at ffffffff811a399b
#12 [ffff880209777db0] user_path_at at ffffffff811a4ac7
#13 [ffff880209777e80] vfs_fstatat at ffffffff81197e60
#14 [ffff880209777ee0] vfs_stat at ffffffff81197fdb
#15 [ffff880209777ef0] sys_newstat at ffffffff81198004
#16 [ffff880209777f80] system_call_fastpath at ffffffff8100b0d2
    RIP: 000000371eedae35  RSP: 00002ae0d973ab70  RFLAGS: 00000216
    RAX: 0000000000000004  RBX: ffffffff8100b0d2  RCX: 0000000001000000
    RDX: 00002ae0d973ac90  RSI: 00002ae0d973ac90  RDI: 000000000250ce70
    RBP: 00002ae0d973ad40   R8: 74756f2e656c6966   R9: 3132363736343634
    R10: 3030305f30363131  R11: 0000000000000246  R12: 000000000250ce70
    R13: 0000000000000000  R14: 0000000002327800  R15: 00000000023279f8
    ORIG_RAX: 0000000000000004  CS: 0033  SS: 002b

ll_sai_unplug -> ll_sa_entry_fini -> do_sa_entry_fini

 346 static inline void
 347 do_sa_entry_fini(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
 348 {
 349     struct ll_inode_info *lli = ll_i2info(sai->sai_inode);
 350
 351     LASSERT(!ll_sa_entry_unhashed(entry));
 352     LASSERT(!cfs_list_empty(&entry->se_link));
 353
 354     ll_sa_entry_unhash(sai, entry);
 355
 356     spin_lock(&lli->lli_sa_lock);
 357     entry->se_stat = SA_ENTRY_DEST;
 358     cfs_list_del_init(&entry->se_link);
 359     if (likely(!cfs_list_empty(&entry->se_list)))
 360         cfs_list_del_init(&entry->se_list);
 361     spin_unlock(&lli->lli_sa_lock);
 362
 363     ll_sa_entry_put(sai, entry);
 364 }
 365
 366 /*
 367  * Delete it from sai_entries_stated list when fini.
 368  */
 369 static void
 370 ll_sa_entry_fini(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
 371 {
 372     struct ll_sa_entry *pos, *next;
 373
 374     if (entry)
 375         do_sa_entry_fini(sai, entry);
 376
 377     /* drop old entry, only 'scanner' process does this, no need to lock */
 378     cfs_list_for_each_entry_safe(pos, next, &sai->sai_entries, se_link) {
 379         if (!is_omitted_entry(sai, pos->se_index))
 380             break;
 381         do_sa_entry_fini(sai, pos);
 382     }
 383 }

LASSERT condition:

  90 static inline int ll_sa_entry_unhashed(struct ll_sa_entry *entry)
  91 {
  92         return cfs_list_empty(&entry->se_hash);
  93 }
1483 static void
1484 ll_sai_unplug(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
1485 {
1486         struct ptlrpc_thread *thread = &sai->sai_thread;
1487         struct ll_sb_info    *sbi    = ll_i2sbi(sai->sai_inode);
1488         int                   hit;
1489         ENTRY;
1490
1491         if (entry != NULL && entry->se_stat == SA_ENTRY_SUCC)
1492                 hit = 1;
1493         else
1494                 hit = 0;
1495
1496         ll_sa_entry_fini(sai, entry);    <==== HERE
[...]
crash> dis -rl ffffffffa0b61e6f
[...]
0xffffffffa0b61e64 :    mov    %rbx,%rsi
0xffffffffa0b61e67 :    mov    %r12,%rdi
0xffffffffa0b61e6a :    callq  0xffffffffa0b5f360 

rbx =>  ll_statahead_info *sai
r12 =>  ll_sa_entry *entry

crash> dis -rl ffffffffa0b5f8f7
0xffffffffa0b5f360 :     push   %rbp
0xffffffffa0b5f361 :   mov    %rsp,%rbp
0xffffffffa0b5f364 :   push   %r15
0xffffffffa0b5f366 :   push   %r14
0xffffffffa0b5f368 :   push   %r13
0xffffffffa0b5f36a :  push   %r12
0xffffffffa0b5f36c :  push   %rbx
[...]

crash> bt -f
PID: 20671  TASK: ffff8803bfc66ab0  CPU: 12  COMMAND: "java"
 #0 [ffff880209777740] machine_kexec at ffffffff8103d30b
    ffff880209777748: 00000000030a3000 ffff8800030a3000
    ffff880209777758: 00000000030a2000 0000000000000000
    ffff880209777768: 8800000000000000 ffff88085237ffff
    ffff880209777778: 0000000000000000 ffff8802097777a8
    ffff880209777788: ffff88085237c000 ffff880875163460
    ffff880209777798: ffff880209777868 ffffffff810cc4a2
 #1 [ffff8802097777a0] crash_kexec at ffffffff810cc4a2
    ffff8802097777a8: 0000000000000001 ffff880875163460
    ffff8802097777b8: ffff88085237c000 ffff880875163440
    ffff8802097777c8: ffff880209777868 0000000000000000
    ffff8802097777d8: 0000000000000198 0000000000000000
    ffff8802097777e8: ffff8800000be180 0000000000000000
    ffff8802097777f8: 0000000000000001 0000000000001d20
    ffff880209777808: 0000000000000000 0000000000000001
    ffff880209777818: 0000000000000002 ffffffff81011035
    ffff880209777828: ffffffff810cc52f 0000000000000010
    ffff880209777838: 0000000000000046 ffff8802097777a8
    ffff880209777848: 0000000000000018 0000000000000004
    ffff880209777858: ffffffffa058435b ffff880875163440
    ffff880209777868: ffff8802097778e8 ffffffff81538dc9
 #2 [ffff880209777870] panic at ffffffff81538dc9
    ffff880209777878: ffffffffa0590ac0 ffffffffa058435b
    ffff880209777888: ffffffff00000008 ffff8802097778f8
    ffff880209777898: ffff8802097778a8 0000000000000000
    ffff8802097778a8: 0000000000313533 0000000000000282
    ffff8802097778b8: ffffffff81ab26e0 0000000000000000
    ffff8802097778c8: 0000000000000000 00000000ffffffff
    ffff8802097778d8: ffffffffa0b9ea20 ffffffffa0b9ea20
    ffff8802097778e8: ffff880209777908 ffffffffa0561eeb
 #3 [ffff8802097778f0] lbug_with_loc at ffffffffa0561eeb [libcfs]
    ffff8802097778f8: ffff880851f1fa40 ffff88101b4c5800
    ffff880209777908: ffff880209777998 ffffffffa0b5f8f7
 #4 [ffff880209777910] ll_sai_unplug at ffffffffa0b5f8f7 [lustre]
    ffff880209777918: ffff880209777948 ffffffffa07bdeb1
    ffff880209777928: ffff880209777948 ffff880209777a08
    ffff880209777938: ffff880209777958 0000000000000001
    ffff880209777948: ffff880209777978 ffffffffa0afcd2f
    ffff880209777958: 8246520fbee09a58 ffff8804f3090168
    ffff880209777968: ffff880209777a08 ffff880875163440
    ffff880209777978: ffff88101b4c5800 0000000000000001
    ffff880209777988: ffff881079e1b540 ffff881079e1b548
    ffff880209777998: ffff880209777ab8 ffffffffa0b61e6f
 #5 [ffff8802097779a0] do_statahead_enter at ffffffffa0b61e6f [lustre]

r12 => ffff88101b4c5800
rbx => ffff880875163440

struct ll_sa_entry {
    cfs_list_t se_link;
    cfs_list_t se_list;
    cfs_list_t se_hash;
    cfs_atomic_t se_refcount;
    __u64 se_index;
    __u64 se_handle;
    se_stat_t se_stat;
    int se_size;
    struct md_enqueue_info *se_minfo;
    struct ptlrpc_request *se_req;
    struct inode *se_inode;
    struct qstr se_qstr;
}
SIZE: 120
crash> struct ll_sa_entry ffff88101b4c5800
struct ll_sa_entry {
  se_link = {
    next = 0x5a5a5a5a5a5a5a5a,
    prev = 0x5a5a5a5a5a5a5a5a
  },
  se_list = {
    next = 0x5a5a5a5a5a5a5a5a,
    prev = 0x5a5a5a5a5a5a5a5a
  },
  se_hash = {
    next = 0x5a5a5a5a5a5a5a5a,
    prev = 0x5a5a5a5a5a5a5a5a
  },
  se_refcount = {
    counter = 1515870810
  },
  se_index = 6510615555426900570,
  se_handle = 6510615555426900570,
  se_stat = 1515870810,
  se_size = 1515870810,
  se_minfo = 0x5a5a5a5a5a5a5a5a,
  se_req = 0x5a5a5a5a5a5a5a5a,
  se_inode = 0x5a5a5a5a5a5a5a5a,
  se_qstr = {
    hash = 1515870810,
    len = 1515870810,
    name = 0x5a5a5a5a5a5a5a5a

  }
}

The ll_sa_entry.se_hash list is empty.

crash> x/a 0xffff88101b4c5800
0xffff88101b4c5800:     0x5a5a5a5a5a5a5a5a
crash> struct ll_sa_entry.se_hash ffff88101b4c5800
  se_hash = {
    next = 0x5a5a5a5a5a5a5a5a,
    prev = 0x5a5a5a5a5a5a5a5a
  }

Except the ASSERTION, there is nothing helpful in the debug log. I asked the customer to enable the D_READA facility if this is reproducible. However, they hit this LBUG once for now.

Do you need additional traces or the vmcore to further analyze this issue?



 Comments   
Comment by Joseph Gmitter (Inactive) [ 21/Jun/16 ]

Hi Lai,

Can you please assist here? Looks to be related to LU-3270

Thanks.
Joe

Comment by Lai Siyao [ 22/Jun/16 ]

Hmm, the statahead code is immature in 2.5, and quite a number of fixes were landed to 2.7. Considering the total LOC is big, there is no plan to port them back, so I'd suggest you turn statahead off on all lustre client: `echo 0 > /proc/fs/lustre/llite/*/statahead_max`.

Generated at Sat Feb 10 02:16:26 UTC 2024 using Jira 9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c.