Uploaded image for project: 'Lustre'
  1. Lustre
  2. LU-8312

(statahead.c:351:do_sa_entry_fini()) ASSERTION( !ll_sa_entry_unhashed(entry) )

    XMLWordPrintable

Details

    • Bug
    • Resolution: Unresolved
    • Minor
    • None
    • Lustre 2.5.3
    • None
    • Lustre 2.5.3 w/ bull patches
    • 3
    • 9223372036854775807

    Description

      A customer is reporting a LBUG on a Lustre client.

      After a look into the core dump, it appears that Lusre was trying to use a structure that has been destroyed.

      crash> sys
            KERNEL: /dumps/lib/kernel-debuginfo/2.6.32-573.12.1.el6.Bull.88.x86_64/modules/vmlinux
          DUMPFILE: vmcore  [PARTIAL DUMP]
              CPUS: 48 [OFFLINE: 24]
              DATE: Tue May 31 10:26:49 2016
            UPTIME: 76 days, 00:27:50
      LOAD AVERAGE: 0.16, 0.90, 3.76
             TASKS: 1132
          NODENAME: taurusi5095
           RELEASE: 2.6.32-573.12.1.el6.Bull.88.x86_64
           VERSION: #1 SMP Thu Jan 7 01:45:02 CET 2016
           MACHINE: x86_64  (2494 Mhz)
            MEMORY: 63.8 GB
             PANIC: "Kernel panic - not syncing: LBUG"
      

      The LBUG is:

      <0>LustreError: 20671:0:(statahead.c:351:do_sa_entry_fini()) ASSERTION( !ll_sa_entry_unhashed(entry) ) failed:
      <0>LustreError: 20671:0:(statahead.c:351:do_sa_entry_fini()) LBUG
      
      crash> bt
      PID: 20671  TASK: ffff8803bfc66ab0  CPU: 12  COMMAND: "java"
       #0 [ffff880209777740] machine_kexec at ffffffff8103d30b
       #1 [ffff8802097777a0] crash_kexec at ffffffff810cc4a2
       #2 [ffff880209777870] panic at ffffffff81538dc9
       #3 [ffff8802097778f0] lbug_with_loc at ffffffffa0561eeb [libcfs]
       #4 [ffff880209777910] ll_sai_unplug at ffffffffa0b5f8f7 [lustre]
       #5 [ffff8802097779a0] do_statahead_enter at ffffffffa0b61e6f [lustre]
       #6 [ffff880209777ac0] ll_lookup_it at ffffffffa0b4ac26 [lustre]
       #7 [ffff880209777bb0] ll_lookup_nd at ffffffffa0b4b1cc [lustre]
       #8 [ffff880209777bf0] do_lookup at ffffffff811a2055
       #9 [ffff880209777c50] __link_path_walk at ffffffff811a2bd3
      #10 [ffff880209777d30] path_walk at ffffffff811a378a
      #11 [ffff880209777d70] filename_lookup at ffffffff811a399b
      #12 [ffff880209777db0] user_path_at at ffffffff811a4ac7
      #13 [ffff880209777e80] vfs_fstatat at ffffffff81197e60
      #14 [ffff880209777ee0] vfs_stat at ffffffff81197fdb
      #15 [ffff880209777ef0] sys_newstat at ffffffff81198004
      #16 [ffff880209777f80] system_call_fastpath at ffffffff8100b0d2
          RIP: 000000371eedae35  RSP: 00002ae0d973ab70  RFLAGS: 00000216
          RAX: 0000000000000004  RBX: ffffffff8100b0d2  RCX: 0000000001000000
          RDX: 00002ae0d973ac90  RSI: 00002ae0d973ac90  RDI: 000000000250ce70
          RBP: 00002ae0d973ad40   R8: 74756f2e656c6966   R9: 3132363736343634
          R10: 3030305f30363131  R11: 0000000000000246  R12: 000000000250ce70
          R13: 0000000000000000  R14: 0000000002327800  R15: 00000000023279f8
          ORIG_RAX: 0000000000000004  CS: 0033  SS: 002b
      

      ll_sai_unplug -> ll_sa_entry_fini -> do_sa_entry_fini

       346 static inline void
       347 do_sa_entry_fini(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
       348 {
       349     struct ll_inode_info *lli = ll_i2info(sai->sai_inode);
       350
       351     LASSERT(!ll_sa_entry_unhashed(entry));
       352     LASSERT(!cfs_list_empty(&entry->se_link));
       353
       354     ll_sa_entry_unhash(sai, entry);
       355
       356     spin_lock(&lli->lli_sa_lock);
       357     entry->se_stat = SA_ENTRY_DEST;
       358     cfs_list_del_init(&entry->se_link);
       359     if (likely(!cfs_list_empty(&entry->se_list)))
       360         cfs_list_del_init(&entry->se_list);
       361     spin_unlock(&lli->lli_sa_lock);
       362
       363     ll_sa_entry_put(sai, entry);
       364 }
       365
       366 /*
       367  * Delete it from sai_entries_stated list when fini.
       368  */
       369 static void
       370 ll_sa_entry_fini(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
       371 {
       372     struct ll_sa_entry *pos, *next;
       373
       374     if (entry)
       375         do_sa_entry_fini(sai, entry);
       376
       377     /* drop old entry, only 'scanner' process does this, no need to lock */
       378     cfs_list_for_each_entry_safe(pos, next, &sai->sai_entries, se_link) {
       379         if (!is_omitted_entry(sai, pos->se_index))
       380             break;
       381         do_sa_entry_fini(sai, pos);
       382     }
       383 }
      

      LASSERT condition:

        90 static inline int ll_sa_entry_unhashed(struct ll_sa_entry *entry)
        91 {
        92         return cfs_list_empty(&entry->se_hash);
        93 }
      
      1483 static void
      1484 ll_sai_unplug(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
      1485 {
      1486         struct ptlrpc_thread *thread = &sai->sai_thread;
      1487         struct ll_sb_info    *sbi    = ll_i2sbi(sai->sai_inode);
      1488         int                   hit;
      1489         ENTRY;
      1490
      1491         if (entry != NULL && entry->se_stat == SA_ENTRY_SUCC)
      1492                 hit = 1;
      1493         else
      1494                 hit = 0;
      1495
      1496         ll_sa_entry_fini(sai, entry);    <==== HERE
      [...]
      
      crash> dis -rl ffffffffa0b61e6f
      [...]
      0xffffffffa0b61e64 :    mov    %rbx,%rsi
      0xffffffffa0b61e67 :    mov    %r12,%rdi
      0xffffffffa0b61e6a :    callq  0xffffffffa0b5f360 
      
      rbx =>  ll_statahead_info *sai
      r12 =>  ll_sa_entry *entry
      
      crash> dis -rl ffffffffa0b5f8f7
      0xffffffffa0b5f360 :     push   %rbp
      0xffffffffa0b5f361 :   mov    %rsp,%rbp
      0xffffffffa0b5f364 :   push   %r15
      0xffffffffa0b5f366 :   push   %r14
      0xffffffffa0b5f368 :   push   %r13
      0xffffffffa0b5f36a :  push   %r12
      0xffffffffa0b5f36c :  push   %rbx
      [...]
      
      crash> bt -f
      PID: 20671  TASK: ffff8803bfc66ab0  CPU: 12  COMMAND: "java"
       #0 [ffff880209777740] machine_kexec at ffffffff8103d30b
          ffff880209777748: 00000000030a3000 ffff8800030a3000
          ffff880209777758: 00000000030a2000 0000000000000000
          ffff880209777768: 8800000000000000 ffff88085237ffff
          ffff880209777778: 0000000000000000 ffff8802097777a8
          ffff880209777788: ffff88085237c000 ffff880875163460
          ffff880209777798: ffff880209777868 ffffffff810cc4a2
       #1 [ffff8802097777a0] crash_kexec at ffffffff810cc4a2
          ffff8802097777a8: 0000000000000001 ffff880875163460
          ffff8802097777b8: ffff88085237c000 ffff880875163440
          ffff8802097777c8: ffff880209777868 0000000000000000
          ffff8802097777d8: 0000000000000198 0000000000000000
          ffff8802097777e8: ffff8800000be180 0000000000000000
          ffff8802097777f8: 0000000000000001 0000000000001d20
          ffff880209777808: 0000000000000000 0000000000000001
          ffff880209777818: 0000000000000002 ffffffff81011035
          ffff880209777828: ffffffff810cc52f 0000000000000010
          ffff880209777838: 0000000000000046 ffff8802097777a8
          ffff880209777848: 0000000000000018 0000000000000004
          ffff880209777858: ffffffffa058435b ffff880875163440
          ffff880209777868: ffff8802097778e8 ffffffff81538dc9
       #2 [ffff880209777870] panic at ffffffff81538dc9
          ffff880209777878: ffffffffa0590ac0 ffffffffa058435b
          ffff880209777888: ffffffff00000008 ffff8802097778f8
          ffff880209777898: ffff8802097778a8 0000000000000000
          ffff8802097778a8: 0000000000313533 0000000000000282
          ffff8802097778b8: ffffffff81ab26e0 0000000000000000
          ffff8802097778c8: 0000000000000000 00000000ffffffff
          ffff8802097778d8: ffffffffa0b9ea20 ffffffffa0b9ea20
          ffff8802097778e8: ffff880209777908 ffffffffa0561eeb
       #3 [ffff8802097778f0] lbug_with_loc at ffffffffa0561eeb [libcfs]
          ffff8802097778f8: ffff880851f1fa40 ffff88101b4c5800
          ffff880209777908: ffff880209777998 ffffffffa0b5f8f7
       #4 [ffff880209777910] ll_sai_unplug at ffffffffa0b5f8f7 [lustre]
          ffff880209777918: ffff880209777948 ffffffffa07bdeb1
          ffff880209777928: ffff880209777948 ffff880209777a08
          ffff880209777938: ffff880209777958 0000000000000001
          ffff880209777948: ffff880209777978 ffffffffa0afcd2f
          ffff880209777958: 8246520fbee09a58 ffff8804f3090168
          ffff880209777968: ffff880209777a08 ffff880875163440
          ffff880209777978: ffff88101b4c5800 0000000000000001
          ffff880209777988: ffff881079e1b540 ffff881079e1b548
          ffff880209777998: ffff880209777ab8 ffffffffa0b61e6f
       #5 [ffff8802097779a0] do_statahead_enter at ffffffffa0b61e6f [lustre]
      
      r12 => ffff88101b4c5800
      rbx => ffff880875163440
      
      struct ll_sa_entry {
          cfs_list_t se_link;
          cfs_list_t se_list;
          cfs_list_t se_hash;
          cfs_atomic_t se_refcount;
          __u64 se_index;
          __u64 se_handle;
          se_stat_t se_stat;
          int se_size;
          struct md_enqueue_info *se_minfo;
          struct ptlrpc_request *se_req;
          struct inode *se_inode;
          struct qstr se_qstr;
      }
      SIZE: 120
      crash> struct ll_sa_entry ffff88101b4c5800
      struct ll_sa_entry {
        se_link = {
          next = 0x5a5a5a5a5a5a5a5a,
          prev = 0x5a5a5a5a5a5a5a5a
        },
        se_list = {
          next = 0x5a5a5a5a5a5a5a5a,
          prev = 0x5a5a5a5a5a5a5a5a
        },
        se_hash = {
          next = 0x5a5a5a5a5a5a5a5a,
          prev = 0x5a5a5a5a5a5a5a5a
        },
        se_refcount = {
          counter = 1515870810
        },
        se_index = 6510615555426900570,
        se_handle = 6510615555426900570,
        se_stat = 1515870810,
        se_size = 1515870810,
        se_minfo = 0x5a5a5a5a5a5a5a5a,
        se_req = 0x5a5a5a5a5a5a5a5a,
        se_inode = 0x5a5a5a5a5a5a5a5a,
        se_qstr = {
          hash = 1515870810,
          len = 1515870810,
          name = 0x5a5a5a5a5a5a5a5a
      
        }
      }
      

      The ll_sa_entry.se_hash list is empty.

      crash> x/a 0xffff88101b4c5800
      0xffff88101b4c5800:     0x5a5a5a5a5a5a5a5a
      crash> struct ll_sa_entry.se_hash ffff88101b4c5800
        se_hash = {
          next = 0x5a5a5a5a5a5a5a5a,
          prev = 0x5a5a5a5a5a5a5a5a
        }
      

      Except the ASSERTION, there is nothing helpful in the debug log. I asked the customer to enable the D_READA facility if this is reproducible. However, they hit this LBUG once for now.

      Do you need additional traces or the vmcore to further analyze this issue?

      Attachments

        Issue Links

          Activity

            People

              laisiyao Lai Siyao
              bruno.travouillon Bruno Travouillon (Inactive)
              Votes:
              0 Vote for this issue
              Watchers:
              5 Start watching this issue

              Dates

                Created:
                Updated: