Uploaded image for project: 'Lustre'
  1. Lustre
  2. LU-15946

Unaligned I/O merged: ASSERTION( ... (ergo(i == 0, poff + pg->count == PAGE_SIZE) ... failed: i: 0/2 pg: 00000000295a3943 off: 0, count: 3928

    XMLWordPrintable

Details

    • Bug
    • Resolution: Unresolved
    • Major
    • None
    • None
    • None
    • 3
    • 9223372036854775807

    Description

      LBUG:

      console-20220406:2022-04-06T05:19:20.214620-05:00 c0-0c0s11n1 LustreError: 6221:0:(osc_request.c:1685:osc_brw_prep_request()) ASSERTION( page_count == 1 || (ergo(i == 0, poff + pg->count == PAGE_SIZE) && ergo(i > 0 && i < page_count - 1, poff == 0 && pg->count == PAGE_SIZE) && ergo(i == page_count - 1, poff == 0)) ) failed: i: 0/2 pg: 00000000295a3943 off: 0, count: 3928
      console-20220406:2022-04-06T05:19:20.214684-05:00 c0-0c0s11n1 LustreError: 6221:0:(osc_request.c:1685:osc_brw_prep_request()) LBUG
      console-20220406:2022-04-06T05:19:20.214702-05:00 c0-0c0s11n1 Pid: 6221, comm: ptlrpcd_00_13 5.3.18-59.34_7.0.4.6-cray_ari_c #1 SMP Wed Mar 2 05:45:36 UTC 2022 (d0a6bb5)
      console-20220406:2022-04-06T05:19:20.214726-05:00 c0-0c0s11n1 Call Trace TBD:
      console-20220406:2022-04-06T05:19:20.214733-05:00 c0-0c0s11n1 [<0>] libcfs_call_trace+0x74/0xc0 [libcfs]
      console-20220406:2022-04-06T05:19:20.214745-05:00 c0-0c0s11n1 [<0>] lbug_with_loc+0x43/0x90 [libcfs]
      console-20220406:2022-04-06T05:19:20.214751-05:00 c0-0c0s11n1 [<0>] osc_brw_prep_request+0x163e/0x1ab0 [osc]
      console-20220406:2022-04-06T05:19:20.214756-05:00 c0-0c0s11n1 [<0>] osc_build_rpc+0xa11/0x1130 [osc]
      console-20220406:2022-04-06T05:19:20.214761-05:00 c0-0c0s11n1 [<0>] osc_io_unplug0+0x1549/0x17e0 [osc]
      console-20220406:2022-04-06T05:19:20.214772-05:00 c0-0c0s11n1 [<0>] brw_queue_work+0x33/0xd0 [osc]
      console-20220406:2022-04-06T05:19:20.214777-05:00 c0-0c0s11n1 [<0>] work_interpreter+0x33/0x100 [ptlrpc]
      console-20220406:2022-04-06T05:19:20.214788-05:00 c0-0c0s11n1 [<0>] ptlrpc_check_set+0x54b/0x2060 [ptlrpc]
      console-20220406:2022-04-06T05:19:20.214810-05:00 c0-0c0s11n1 [<0>] ptlrpcd+0x820/0xa10 [ptlrpc]
      console-20220406:2022-04-06T05:19:20.214820-05:00 c0-0c0s11n1 [<0>] kthread+0x120/0x140
      console-20220406:2022-04-06T05:19:20.214841-05:00 c0-0c0s11n1 [<0>] ret_from_fork+0x3a/0x50
      console-20220406:2022-04-06T05:19:20.214852-05:00 c0-0c0s11n1 Kernel panic - not syncing: LBUG
      console-20220406:2022-04-06T05:19:20.214887-05:00 c0-0c0s11n1 CPU: 39 PID: 6221 Comm: ptlrpcd_00_13 Tainted: P           O       5.3.18-59.34_7.0.4.6-cray_ari_c #1 SLE15-SP3 (unreleased)
      console-20220406:2022-04-06T05:19:20.214918-05:00 c0-0c0s11n1 Hardware name: Cray Inc. Cascade/Cascade, BIOS 4.6.5 09/05/2019
      console-20220406:2022-04-06T05:19:20.214925-05:00 c0-0c0s11n1 Call Trace:
      console-20220406:2022-04-06T05:19:20.214931-05:00 c0-0c0s11n1  dump_stack+0x7a/0xa5
      console-20220406:2022-04-06T05:19:20.214936-05:00 c0-0c0s11n1  panic+0xfd/0x2c9
      console-20220406:2022-04-06T05:19:20.214941-05:00 c0-0c0s11n1  lbug_with_loc+0x89/0x90 [libcfs]
      console-20220406:2022-04-06T05:19:20.214946-05:00 c0-0c0s11n1  osc_brw_prep_request+0x163e/0x1ab0 [osc]
      console-20220406:2022-04-06T05:19:20.214951-05:00 c0-0c0s11n1  ? osc_req_attr_set+0x1ed/0x680 [osc]
      console-20220406:2022-04-06T05:19:20.214956-05:00 c0-0c0s11n1  osc_build_rpc+0xa11/0x1130 [osc]
      console-20220406:2022-04-06T05:19:20.214963-05:00 c0-0c0s11n1  osc_io_unplug0+0x1549/0x17e0 [osc]
      console-20220406:2022-04-06T05:19:20.214987-05:00 c0-0c0s11n1  ? trace_hardirqs_on+0x38/0xe0
      console-20220406:2022-04-06T05:19:20.214993-05:00 c0-0c0s11n1  ? __schedule+0x2f7/0x7c0
      console-20220406:2022-04-06T05:19:20.214998-05:00 c0-0c0s11n1  brw_queue_work+0x33/0xd0 [osc]
      console-20220406:2022-04-06T05:19:20.215003-05:00 c0-0c0s11n1  work_interpreter+0x33/0x100 [ptlrpc]
      console-20220406:2022-04-06T05:19:20.215008-05:00 c0-0c0s11n1  ptlrpc_check_set+0x54b/0x2060 [ptlrpc]
      console-20220406:2022-04-06T05:19:20.215012-05:00 c0-0c0s11n1  ptlrpcd+0x820/0xa10 [ptlrpc]
      console-20220406:2022-04-06T05:19:20.215031-05:00 c0-0c0s11n1  ? do_wait_intr_irq+0x90/0x90
      console-20220406:2022-04-06T05:19:20.215037-05:00 c0-0c0s11n1  kthread+0x120/0x140
      console-20220406:2022-04-06T05:19:20.215043-05:00 c0-0c0s11n1  ? ptlrpcd_ctl_init+0x180/0x180 [ptlrpc]
      console-20220406:2022-04-06T05:19:20.215048-05:00 c0-0c0s11n1  ? kthread_create_worker_on_cpu+0x70/0x70
      console-20220406:2022-04-06T05:19:20.215054-05:00 c0-0c0s11n1  ret_from_fork+0x3a/0x50
      console-20220406:2022-04-06T05:19:20.215059-05:00 c0-0c0s11n1 Shutting down cpus with NMI
      console-20220406:2022-04-06T05:19:20.215064-05:00 c0-0c0s11n1 Kernel Offset: disabled
      console-20220406:2022-04-06T05:19:20.215070-05:00 c0-0c0s11n1 ---[ end Kernel panic - not syncing: LBUG ]--- 
      

      The backtrace:

      crash_x86_64> bt -ll
      PID: 8139   TASK: ffff888f88b15000  CPU: 24  COMMAND: "ptlrpcd_00_08"
       #0 [ffffc90007d779c8] panic at ffffffff8107aab7
          /home/abuild/rpmbuild/BUILD/kernel-cray_ari_c-5.3.18/linux-5.3.18/linux-obj/../kernel/panic.c: 342
       #1 [ffffc90007d77a50] lbug_with_loc at ffffffffa0246529 [libcfs]
          /home/abuild/rpmbuild/BUILD/cray-lustre-2.15.0.3_rc2_cray_5_gad33231/libcfs/libcfs/debug.c: 476
       #2 [ffffc90007d77a70] osc_brw_prep_request at ffffffffa07ae4ee [osc]
          /home/abuild/rpmbuild/BUILD/cray-lustre-2.15.0.3_rc2_cray_5_gad33231/lustre/osc/osc_request.c: 1679
       #3 [ffffc90007d77ba0] osc_build_rpc at ffffffffa07b3631 [osc]
          /home/abuild/rpmbuild/BUILD/cray-lustre-2.15.0.3_rc2_cray_5_gad33231/lustre/osc/osc_request.c: 2618
       #4 [ffffc90007d77c60] osc_extent_finish at ffffffffa07cce99 [osc]
          /home/abuild/rpmbuild/BUILD/cray-lustre-2.15.0.3_rc2_cray_5_gad33231/lustre/osc/osc_cache.c: 2060
       #5 [ffffc90007d77d40] brw_queue_work at ffffffffa07a6433 [osc]
          /home/abuild/rpmbuild/BUILD/cray-lustre-2.15.0.3_rc2_cray_5_gad33231/libcfs/include/libcfs/libcfs_debug.h: 155
       #6 [ffffc90007d77d60] work_interpreter at ffffffffa0671df3 [ptlrpc]
          /usr/src/linux-5.3.18-59.34_7.0.4.6/include/linux/list.h: 135
       #7 [ffffc90007d77d80] ptlrpc_check_set at ffffffffa067a88b [ptlrpc]
          /home/abuild/rpmbuild/BUILD/cray-lustre-2.15.0.3_rc2_cray_5_gad33231/lustre/include/lustre_net.h: 1151
       #8 [ffffc90007d77e10] ptlrpcd at ffffffffa06a7ce0 [ptlrpc]
          /home/abuild/rpmbuild/BUILD/cray-lustre-2.15.0.3_rc2_cray_5_gad33231/lustre/ptlrpc/ptlrpcd.c: 361
       #9 [ffffc90007d77f08] kthread at ffffffff810a2400
          /home/abuild/rpmbuild/BUILD/kernel-cray_ari_c-5.3.18/linux-5.3.18/linux-obj/../kernel/kthread.c: 274
      #10 [ffffc90007d77f50] ret_from_fork at ffffffff8180021a
          /home/abuild/rpmbuild/BUILD/kernel-cray_ari_c-5.3.18/linux-5.3.18/linux-obj/../arch/x86/entry/entry_64.S: 360
      crash_x86_64> 
      

      The array pga which consists of brw_page entries is starting with a truncated page:

      crash_x86_64> bt -f
      PID: 8139   TASK: ffff888f88b15000  CPU: 24  COMMAND: "ptlrpcd_00_08"
       #0 [ffffc90007d779c8] panic at ffffffff8107aab7
          ffffc90007d779d0: ffffc90000000008 ffffc90007d77a58 
      ...
          ffffc90007d77b88: ffff888788dd28c0 ffff888788dd28d0 
          ffffc90007d77b98: ffffc90007d77c58 ffffffffa07b3631 
       #3 [ffffc90007d77ba0] osc_build_rpc at ffffffffa07b3631 [osc]
          ffffc90007d77ba8: 0000000000000000 0000000007d77bc8 
          ffffc90007d77bb8: 0000000000000000 ffff888789586af8 
      ...
      

      Examine the entries (there are 2):

      crash_x86_64> rd ffff888788dd28c0 8
      ffff888788dd28c0:  ffff8887814ae390 ffff8887814ae240   ..J.....@.J.....
      ffff888788dd28d0:  0000000000000000 0000000000000000   ................
      ...
      
      crash_x86_64> brw_page ffff8887814ae390
      struct brw_page {
        off = 0, 
        pg = 0xffffea00205343c0, 
        count = 3839,   <------ A short page
        flag = 682, 
        bp_off_diff = 0, 
        bp_count_diff = 0, 
        bp_padding = 0
      }
      
      crash_x86_64> brw_page ffff8887814ae240
      struct brw_page {
        off = 4096, 
        pg = 0xffffea000008a7c0, 
        count = 4096, 
        flag = 682, 
        bp_off_diff = 0, 
        bp_count_diff = 0, 
        bp_padding = 0
      }
      

      Where did this come from? Maybe the async_page will give us more information:

      crash_x86_64> whatis osc_async_page.oap_brw_page
      struct osc_async_page {
         [72] struct brw_page oap_brw_page;
      }
      
      crash_x86_64> eval ffff8887814ae390 -72
      hexadecimal: ffff8887814ae348  
      
      crash_x86_64> eval ffff8887814ae240 -72
      hexadecimal: ffff8887814ae1f8  
      
      crash_x86_64> struct osc_async_page ffff8887814ae348
      struct osc_async_page {
        oap_magic = 8675309, 
        oap_cmd = 2, 
      ...
        oap_obj_off = 0, 
        oap_page_off = 0, 
        oap_async_flags = (ASYNC_READY | ASYNC_URGENT | ASYNC_COUNT_STABLE), 
        oap_brw_page = {
          off = 0,                <----- Start of object
          pg = 0xffffea00205343c0, 
          count = 3839,     <----- Short page
          flag = 682, 
          bp_off_diff = 0, 
          bp_count_diff = 0, 
          bp_padding = 0
        }, 
      
      crash_x86_64> struct osc_async_page ffff8887814ae1f8
      struct osc_async_page {
        oap_magic = 8675309, 
        oap_cmd = 2, 
      ...
        oap_obj_off = 4096,   <--- 1 page into the object
        oap_page_off = 0, 
        oap_async_flags = (ASYNC_READY | ASYNC_URGENT | ASYNC_COUNT_STABLE), 
        oap_brw_page = {
          off = 4096, 
          pg = 0xffffea000008a7c0, 
          count = 4096,        <-------- full page at end of object
          flag = 682, 
          bp_off_diff = 0, 
          bp_count_diff = 0, 
          bp_padding = 0
        }, 
      ...
      

      It looks like non-contiguous aio/dio pages, checking the cl_page of each

      crash_x86_64> whatis osc_page.ops_oap
      struct osc_page {
         [24] struct osc_async_page ops_oap;
      }
      
      crash_x86_64> eval 0xffff8887814ae348 - 24
      hexadecimal: ffff8887814ae330  
      
      crash_x86_64> struct osc_page ffff8887814ae330
      struct osc_page {
        ops_cl = {
          cpl_page = 0xffff8887814ae2a0, 
          cpl_obj = 0xffff88878b4f4140, 
          cpl_ops = 0xffffffffa07d9f00 <osc_page_ops>
        }, 
        ops_oap = {
      
      crash_x86_64> struct cl_page 0xffff8887814ae2a0
      struct cl_page {
        cp_ref = {
          counter = 2
        }, 
        cp_lov_index = 0, 
        cp_osc_index = 0, 
        cp_obj = 0xffff888781d2b958, 
        cp_vmpage = 0xffffea00205343c0, 
        cp_inode = 0x0, 
        cp_batch = {
          next = 0xffff8887814ae178, 
          prev = 0xffff888787dc28d8
        }, 
        cp_layer_offset = "\000(@", 
        cp_layer_count = 3 '\003', 
        cp_state = CPS_PAGEOUT, 
        cp_type = CPT_TRANSIENT,     <-- DIO/AIO 
        cp_kmem_index = 0, 
        cp_unused1 = 0, 
        cp_owner = 0x0, 
        cp_reference = {<No data fields>}, 
        cp_obj_ref = {<No data fields>}, 
        cp_queue_ref = {<No data fields>}, 
        cp_sync_io = 0xffff888787dc2890
      }
      

      And the second page is also:

      crash_x86_64> struct osc_page  ffff8887814ae1e0  
      struct osc_page {
        ops_cl = {
          cpl_page = 0xffff8887814ae150, 
          cpl_obj = 0xffff88878b4f4140, 
          cpl_ops = 0xffffffffa07d9f00 <osc_page_ops>
        }, 
        ops_oap = {
      ...
      
      crash_x86_64> struct cl_page 0xffff8887814ae150
      struct cl_page {
        cp_ref = {
          counter = 2
        }, 
        cp_lov_index = 0, 
        cp_osc_index = 1, 
        cp_obj = 0xffff888781d2b958, 
        cp_vmpage = 0xffffea000008a7c0, 
        cp_inode = 0x0, 
        cp_batch = {
          next = 0xffff888787dc28d8, 
          prev = 0xffff8887814ae2c8
        }, 
        cp_layer_offset = "\000(@", 
        cp_layer_count = 3 '\003', 
        cp_state = CPS_PAGEOUT, 
        cp_type = CPT_TRANSIENT, 
        cp_kmem_index = 0, 
        cp_unused1 = 0, 
        cp_owner = 0x0, 
        cp_reference = {<No data fields>}, 
        cp_obj_ref = {<No data fields>}, 
        cp_queue_ref = {<No data fields>}, 
        cp_sync_io = 0xffff888787dc2890
      }
      crash_x86_64> 
      

      Attachments

        Activity

          People

            stancheff Shaun Tancheff
            stancheff Shaun Tancheff
            Votes:
            0 Vote for this issue
            Watchers:
            4 Start watching this issue

            Dates

              Created:
              Updated: