Uploaded image for project: 'Lustre'
  1. Lustre
  2. LU-17255

dt_record_write()) ASSERTION( dt->do_body_ops->dbo_write )

Details

    • Bug
    • Resolution: Unresolved
    • Minor
    • None
    • None
    • None
    • 3
    • 9223372036854775807

    Description

      [15148.368109] LustreError: 11399:0:(dt_object.c:532:dt_record_write()) ASSERTION( dt->do_body_ops->dbo_write ) failed: [0x240000401:0x2:0x0]
      [15148.368823] LustreError: 11399:0:(dt_object.c:532:dt_record_write()) LBUG
      [15148.369203] Pid: 11399, comm: tgt_recover_1 3.10.0-1160.49.1.el7_lustre.x86_64 #1 SMP Fri Jun 17 18:46:08 UTC 2022
      [15148.369204] Call Trace:
      [15148.369218] [<0>] libcfs_call_trace+0x90/0xf0 [libcfs]
      [15148.369222] [<0>] lbug_with_loc+0x4c/0xa0 [libcfs]
      [15148.369245] [<0>] dt_record_write+0xc2/0x130 [obdclass]
      [15148.369288] [<0>] out_tx_write_exec+0x9d/0x370 [ptlrpc]
      [15148.369319] [<0>] out_tx_end+0xe1/0x600 [ptlrpc]
      [15148.369349] [<0>] out_handle+0x1baf/0x2420 [ptlrpc]
      [15148.369378] [<0>] tgt_request_handle+0x883/0x19a0 [ptlrpc]
      [15148.369409] [<0>] handle_recovery_req+0x93/0x3c0 [ptlrpc]
      [15148.369438] [<0>] replay_request_or_update.isra.25+0x31c/0x9e0 [ptlrpc]
      [15148.369466] [<0>] target_recovery_thread+0x7bc/0x13d0 [ptlrpc]
      [15148.369469] [<0>] kthread+0xd1/0xe0
      [15148.369472] [<0>] ret_from_fork_nospec_begin+0x21/0x21
      [15148.369482] [<0>] 0xfffffffffffffffe
      [15148.369483] Kernel panic - not syncing: LBUG
      [15148.369831] CPU: 3 PID: 11399 Comm: tgt_recover_1 Kdump: loaded Tainted: G           OE  ------------   3.10.0-1160.49.1.el7_lustre.x86_64 #1
       

      Accidentally reproduced it on the latest master working on the different problem with following test:

      [root@vm2 tests]# git log HEAD --pretty=short -n1
      commit d7d16447a00be217246f1d73ecaf43a05a832442
      Author: Alex Zhuravlev <bzzz@whamcloud.com>    
      
          LU-17188 mdt: remove n from LDLM_DEBUG
      [root@vm2 tests]# git diff
      diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh
      index 85836e6..de6a1e7 100755
      --- a/lustre/tests/replay-dual.sh
      +++ b/lustre/tests/replay-dual.sh
      @@ -1322,6 +1322,26 @@ test_33() { # LU-15935
       }
       run_test 33 "Check for OBD_INCOMPAT_MULTI_RPCS in last_rcvd after abort_recovery"
       
      +test_34() {
      +       lfs mkdir -i 0 /mnt/lustre/dir
      +       for i in `seq 1 100`;
      +       do
      +               replay_barrier mds2
      +               lfs getdirstripe /mnt/lustre/dir
      +               dd if=/dev/urandom of=/mnt/lustre/dir/f$i bs=1M count=2
      +               dd if=/dev/urandom of=/mnt/lustre/dir/f2$i bs=1M count=2
      +               lfs getstripe -v /mnt/lustre/dir/f$i
      +               lfs getstripe -v /mnt/lustre/dir/f2$i
      +               lfs migrate -m $((i%2)) /mnt/lustre/dir #migrate to 0 or 1
      +               rm -f /mnt/lustre/dir/f$i
      +               rm -f /mnt/lustre/dir/f$i2
      +               facet_failover mds2
      +               lctl dk > logs$i
      +               ls -aslh /mnt/lustre/dir | grep "????" && break
      +       done
      +}
      +run_test 34 "bla bla"
      + 

      How to reproduce:

      [root@vm2 tests]# MDSCOUNT=2 bash ./llmount.sh
      ...
      [root@vm2 tests]# ONLY=34 bash ./replay-dual.sh 

       

      Attachments

        Issue Links

          Activity

            [LU-17255] dt_record_write()) ASSERTION( dt->do_body_ops->dbo_write )

            IMO, this is not a bug in Lustre, rather a problem in the testing infrastructure.

            bzzz Alex Zhuravlev added a comment - IMO, this is not a bug in Lustre, rather a problem in the testing infrastructure.

            that's because MDSCOUNT=2 is not set for replay-dual, thus few variables configuring MDS2 were set propely in t-f and then:
            do_facet mds2 /mnt/build/lustre/utils/lctl --device notransno is called with empty device.
            as a result MDS2 keeps reporting transactions committed while the underlying device were ignoring writes.

            bzzz Alex Zhuravlev added a comment - that's because MDSCOUNT=2 is not set for replay-dual, thus few variables configuring MDS2 were set propely in t-f and then: do_facet mds2 /mnt/build/lustre/utils/lctl --device notransno is called with empty device. as a result MDS2 keeps reporting transactions committed while the underlying device were ignoring writes.

            It seems do_body_ops was not initialized with "osd_body_ops":

            it is initialized when/if object exists

            bzzz Alex Zhuravlev added a comment - It seems do_body_ops was not initialized with "osd_body_ops": it is initialized when/if object exists

            what branch do you hit this with ?

            master

            [root@vm2 tests]# git log HEAD --pretty=short -n1
            commit d7d16447a00be217246f1d73ecaf43a05a832442
            Author: Alex Zhuravlev <bzzz@whamcloud.com>    
            
                LU-17188 mdt: remove n from LDLM_DEBUG 
            scherementsev Sergey Cheremencev added a comment - what branch do you hit this with ? master [root@vm2 tests]# git log HEAD --pretty=short -n1 commit d7d16447a00be217246f1d73ecaf43a05a832442 Author: Alex Zhuravlev <bzzz@whamcloud.com>    LU-17188 mdt: remove n from LDLM_DEBUG

            what branch do you hit this with ?

            bzzz Alex Zhuravlev added a comment - what branch do you hit this with ?

            From the crash dump:

            crash> dt_object ffff965171950500
            struct dt_object {
              do_lu = {
                lo_header = 0xffff964fcae9abd0, 
                lo_dev = 0xffff96516fd68000, 
                lo_ops = 0xffffffffc10ec3a0 <osd_lu_obj_ops>, 
                lo_linkage = {
                  next = 0xffff964fcae9ac08, 
                  prev = 0xffff965170784c90
                }, 
                lo_dev_ref = {<No data fields>}
              }, 
              do_ops = 0xffffffffc10ec280 <osd_obj_ops>, 
              do_body_ops = 0xffffffffc10ed4e0 <osd_body_ops_new>, 
              do_index_ops = 0x0
            } crash> p *((struct dt_object *)0xffff965171950500)->do_body_ops
            $4 = {
              dbo_read = 0x0, 
              dbo_declare_write = 0xffffffffc10d1fa0 <osd_declare_write>, 
              dbo_write = 0x0, 
              dbo_bufs_get = 0x0, 
              dbo_bufs_put = 0x0, 
              dbo_read_prep = 0x0, 
              dbo_write_prep = 0x0, 
              dbo_declare_write_commit = 0x0, 
              dbo_write_commit = 0x0, 
              dbo_fiemap_get = 0x0, 
              dbo_declare_punch = 0x0, 
              dbo_punch = 0x0, 
              dbo_ladvise = 0x0, 
              dbo_declare_fallocate = 0x0, 
              dbo_fallocate = 0x0, 
              dbo_lseek = 0x0
            }
             

            It seems do_body_ops was not initialized with "osd_body_ops":

                    obj->oo_dt.do_body_ops = &osd_body_ops_new;
                    if (result == 0 && obj->oo_inode != NULL) {
                            struct osd_thread_info *oti = osd_oti_get(env);
                            struct lustre_ost_attrs *loa = &oti->oti_ost_attrs;                osd_object_init0(obj);
                            if (unlikely(obj->oo_header))
                                    return 0;                result = osd_get_lma(oti, obj->oo_inode,
                                                 &oti->oti_obj_dentry, loa);
                            if (!result) {
                                    /*
                                     * Convert LMAI flags to lustre LMA flags
                                     * and cache it to oo_lma_flags
                                     */
                                    obj->oo_lma_flags =
                                            lma_to_lustre_flags(loa->loa_lma.lma_incompat);
                            } else if (result == -ENODATA) {
                                    result = 0; 
                            }
                    } 
            scherementsev Sergey Cheremencev added a comment - From the crash dump: crash> dt_object ffff965171950500 struct dt_object {   do_lu = {     lo_header = 0xffff964fcae9abd0,      lo_dev = 0xffff96516fd68000,      lo_ops = 0xffffffffc10ec3a0 <osd_lu_obj_ops>,      lo_linkage = {       next = 0xffff964fcae9ac08,        prev = 0xffff965170784c90     },      lo_dev_ref = {<No data fields>}   },    do_ops = 0xffffffffc10ec280 <osd_obj_ops>,    do_body_ops = 0xffffffffc10ed4e0 <osd_body_ops_new>,    do_index_ops = 0x0 } crash> p *((struct dt_object *)0xffff965171950500)->do_body_ops $4 = {   dbo_read = 0x0,    dbo_declare_write = 0xffffffffc10d1fa0 <osd_declare_write>,    dbo_write = 0x0,    dbo_bufs_get = 0x0,    dbo_bufs_put = 0x0,    dbo_read_prep = 0x0,    dbo_write_prep = 0x0,    dbo_declare_write_commit = 0x0,    dbo_write_commit = 0x0,    dbo_fiemap_get = 0x0,    dbo_declare_punch = 0x0,    dbo_punch = 0x0,    dbo_ladvise = 0x0,    dbo_declare_fallocate = 0x0,    dbo_fallocate = 0x0,    dbo_lseek = 0x0 } It seems do_body_ops was not initialized with "osd_body_ops":       obj->oo_dt.do_body_ops = &osd_body_ops_new;         if (result == 0 && obj->oo_inode != NULL) {                 struct osd_thread_info *oti = osd_oti_get(env);                 struct lustre_ost_attrs *loa = &oti->oti_ost_attrs;                osd_object_init0(obj);                 if (unlikely(obj->oo_header))                         return 0;                result = osd_get_lma(oti, obj->oo_inode,                                      &oti->oti_obj_dentry, loa);                 if (!result) {                         /*                          * Convert LMAI flags to lustre LMA flags                          * and cache it to oo_lma_flags                          */                         obj->oo_lma_flags =                                 lma_to_lustre_flags(loa->loa_lma.lma_incompat);                 } else if (result == -ENODATA) {                         result = 0;                  }         }

            People

              wc-triage WC Triage
              scherementsev Sergey Cheremencev
              Votes:
              0 Vote for this issue
              Watchers:
              3 Start watching this issue

              Dates

                Created:
                Updated: