LU-1278: Client Panic - Lustre 1.8.6 and RHEL 6.1


Details

    • Type: Bug
    • Resolution: Fixed
    • Priority: Critical
    • Affects Version/s: Lustre 1.8.6

    Description

      The customer reported two system panics while trying to gain system acceptance, and provided the attached tracebacks.

      In addition, they provided the following information:

      crash> bt
      PID: 20417 TASK: ffff8803fe75a040 CPU: 0 COMMAND: "flush-lustre-1"
      #0 [ffff880400ad1270] machine_kexec at ffffffff810310db
      #1 [ffff880400ad12d0] crash_kexec at ffffffff810b63b2
      #2 [ffff880400ad13a0] oops_end at ffffffff814dec50
      #3 [ffff880400ad13d0] no_context at ffffffff81040cdb
      #4 [ffff880400ad1420] __bad_area_nosemaphore at ffffffff81040f65
      #5 [ffff880400ad1470] bad_area_nosemaphore at ffffffff81041033
      #6 [ffff880400ad1480] __do_page_fault at ffffffff8104170d
      #7 [ffff880400ad15a0] do_page_fault at ffffffff814e0c3e
      #8 [ffff880400ad15d0] page_fault at ffffffff814ddfe5
      [exception RIP: lov_ap_refresh_count+22]
      RIP: ffffffffa07c3cc6 RSP: ffff880400ad1680 RFLAGS: 00010282
      RAX: 0000000000000000 RBX: ffff8803ebe15230 RCX: ffff8803e9668360
      RDX: ffff8803e9668360 RSI: 0000000000000002 RDI: 0000000000000000
      RBP: ffff880400ad1690 R8: 0000000000000002 R9: 0000000000000000
      R10: ffffffffa0806ee0 R11: 0000000000000003 R12: ffff8803e9668348
      R13: 0000000000000000 R14: ffff8803e9668350 R15: ffff8803fa7c6ad0
      ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
      #9 [ffff880400ad1698] osc_send_oap_rpc at ffffffffa074feb9 [osc]
      #10 [ffff880400ad17e8] osc_check_rpcs at ffffffffa0751121 [osc]
      #11 [ffff880400ad1878] osc_set_async_flags at ffffffffa0751545 [osc]
      #12 [ffff880400ad1938] lov_set_async_flags at ffffffffa07c6273 [lov]
      #13 [ffff880400ad19b8] ll_writepage at ffffffffa085b2b4 [lustre]
      #14 [ffff880400ad1a58] ll_writepage_26 at ffffffffa0876d6e [lustre]
      #15 [ffff880400ad1a68] __writepage at ffffffff81120e67
      #16 [ffff880400ad1a88] write_cache_pages at ffffffff811221f9
      #17 [ffff880400ad1bb8] generic_writepages at ffffffff811224f4
      #18 [ffff880400ad1bc8] do_writepages at ffffffff81122521
      #19 [ffff880400ad1bd8] writeback_single_inode at ffffffff8119bbbd
      #20 [ffff880400ad1c18] writeback_sb_inodes at ffffffff8119bfbe
      #21 [ffff880400ad1c78] writeback_inodes_wb at ffffffff8119c11b
      #22 [ffff880400ad1cd8] wb_writeback at ffffffff8119c4bb
      #23 [ffff880400ad1dd8] wb_do_writeback at ffffffff8119c7a9
      #24 [ffff880400ad1e68] bdi_writeback_task at ffffffff8119c8b3
      #25 [ffff880400ad1eb8] bdi_start_fn at ffffffff81130c16
      #26 [ffff880400ad1ee8] kthread at ffffffff8108ddf6
      #27 [ffff880400ad1f48] kernel_thread at ffffffff8100c1ca

      We crash in lov_ap_refresh_count, which was called by osc_send_oap_rpc.
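
      To see exactly what lov_ap_refresh_count+22 is touching, the exception
      RIP can be disassembled straight from the same dump. A sketch of the
      session (output omitted, since it depends on the exact build and on
      which debuginfo is loaded):

      crash> sym ffffffffa07c3cc6
      crash> dis -l lov_ap_refresh_count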

      The offending call is at osc/osc_request.c line 2564:

      /* ask the caller for the size of the io as the rpc leaves. */
      if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE))
              oap->oap_count =
                      ops->ap_refresh_count(oap->oap_caller_data, cmd); <--------------
      if (oap->oap_count <= 0) {
              CDEBUG(D_CACHE, "oap %p count %d, completing\n",
                     oap, oap->oap_count);

      At the arrow above, we pass oap->oap_caller_data to ap_refresh_count.
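
      For illustration only (this is not the landed fix), the kind of guard
      that would have turned this panic into a console error looks like the
      following; CERROR is the standard Lustre error macro, and the recovery
      action here is hypothetical:

      if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE)) {
              if (oap->oap_caller_data == NULL) {
                      /* hypothetical guard: refuse to hand the callback a
                       * NULL cookie; the right recovery (skip this page,
                       * abort the rpc) depends on the surrounding loop */
                      CERROR("oap %p: NULL oap_caller_data\n", oap);
                      continue;
              }
              oap->oap_count =
                      ops->ap_refresh_count(oap->oap_caller_data, cmd);
      }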

      from the dump:

      crash> print *(struct osc_async_page *)0xffff8803ebe15230
      $89 = {
        oap_magic = 8675309,
        oap_cmd = 2,
        oap_interrupted = 0,
        oap_pending_item = { next = 0xffff8803ebe15238, prev = 0xffff8803ebe15238 },
        oap_urgent_item = { next = 0xffff8803ebe15248, prev = 0xffff8803ebe15248 },
        oap_rpc_item = { next = 0xffff8803ebe15258, prev = 0xffff8803ebe15258 },
        oap_obj_off = 0,
        oap_page_off = 0,
        oap_async_flags = 3,
        oap_brw_page = { off = 0, pg = 0xffffea00297d5fb0, count = 0, flag = 1056 },
        oap_occ = {
          occ_oig_item = { next = 0x0, prev = 0x0 },
          occ_interrupted = 0xffffffffa074b490 <osc_occ_interrupted>,
          interrupted = 0
        },
        oap_oig = 0x0,
        oap_request = 0x0,
        oap_cli = 0xffff8803fa7c6ad0,
        oap_loi = 0xffff8803e9668300,
        oap_caller_ops = 0xffffffffa0806ee0,
        oap_caller_data = 0x0,
        ^^^^^^^^^^^^^^^^^^^^^^
        oap_page_list = { next = 0xffff8803eb850d70, prev = 0xffff8803eb850d70 },
        oap_ldlm_lock = 0xffff8803eb850c00,
        oap_lock = { raw_lock = { slock = 131074 } }
      }
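
      As an aside, oap_caller_ops = 0xffffffffa0806ee0 matches R10 in the
      exception frame. To confirm which callback table that is, it can be
      resolved and dumped in crash -- a sketch, assuming the 1.8.x struct
      name obd_async_page_ops and that the lov module's debuginfo is loaded:

      crash> sym ffffffffa0806ee0
      crash> print *(struct obd_async_page_ops *)0xffffffffa0806ee0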

      ops->ap_refresh_count(oap->oap_caller_data, cmd) resolves to the following:

      static int lov_ap_refresh_count(void *data, int cmd)
      {
              struct lov_async_page *lap = LAP_FROM_COOKIE(data);

              return lap->lap_caller_ops->ap_refresh_count(lap->lap_caller_data,
                                                           cmd);
      }

      #define LAP_FROM_COOKIE(c)                                          \
              (LASSERT(((struct lov_async_page *)(c))->lap_magic ==       \
                       LOV_AP_MAGIC),                                     \
               (struct lov_async_page *)(c))
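
      The failure mode is easy to reproduce in isolation. A minimal userland
      sketch of the same magic-check pattern -- every name below is an
      illustrative stand-in, not the real Lustre definition:

      #include <assert.h>
      #include <stddef.h>

      struct lov_async_page_demo {
              int lap_magic;          /* checked before the cast is trusted */
      };

      #define DEMO_MAGIC 0x4c4f56     /* stand-in for LOV_AP_MAGIC */

      /* Same shape as LAP_FROM_COOKIE: assert on the magic, then yield the
       * cast pointer.  If c is NULL, the ->lap_magic load inside the
       * assertion faults first -- just like the panic above. */
      #define DEMO_FROM_COOKIE(c)                                         \
              (assert(((struct lov_async_page_demo *)(c))->lap_magic ==   \
                      DEMO_MAGIC),                                        \
               (struct lov_async_page_demo *)(c))

      int main(void)
      {
              struct lov_async_page_demo ok = { .lap_magic = DEMO_MAGIC };

              (void)DEMO_FROM_COOKIE(&ok);    /* fine: magic matches */
              (void)DEMO_FROM_COOKIE(NULL);   /* segfaults on the NULL cookie */
              return 0;
      }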

      Sending this a NULL is probably not a great idea: with data == NULL, the
      LASSERT in LAP_FROM_COOKIE tries to read
      ((struct lov_async_page *)NULL)->lap_magic, and that read is the page
      fault at lov_ap_refresh_count+22 (note RDI = 0000000000000000, the first
      argument, in the exception frame above).

      As to how oap_caller_data = 0x0 came to be I have no idea.

      I was able to pull the Lustre debug ring out of the dump, but it doesn't
      seem to have anything useful; the last entry appears to be from a couple
      of days before the dump.

      Attachments

        1. panic1.txt
          5 kB
        2. panic2.txt
          5 kB


          People

            Assignee: Oleg Drokin (green)
            Reporter: Dennis Nelson (dnelson@ddn.com)
            Votes: 0
            Watchers: 2
