Uploaded image for project: 'Lustre'
  1. Lustre
  2. LU-3850

mds-survey on a secondary MDT crashes

Details

    • Bug
    • Resolution: Fixed
    • Major
    • Lustre 2.6.0, Lustre 2.5.1
    • Lustre 2.4.0
    • None
    • 3
    • 9970

    Description

      When running mds-survey on multiple MDTs, the test_mkdir operation on a secondary MDT (i.e., any MDT other than MDT0000) crashes the system with the following LBUG:

      LustreError: 6743:0:(lod_dev.c:69:lod_fld_lookup()) ASSERTION( fid_is_sane(fid) ) failed: Invalid FID [0x0:0x0:0x0]
      LustreError: 6743:0:(lod_dev.c:69:lod_fld_lookup()) LBUG
      
      crash> bt
      PID: 6743   TASK: ffff8804b642a040  CPU: 1   COMMAND: "lctl"
       #0 [ffff880494b83650] machine_kexec at ffffffff8102c48b
       #1 [ffff880494b836b0] crash_kexec at ffffffff810abae2
       #2 [ffff880494b83780] panic at ffffffff81499d3d
       #3 [ffff880494b83800] lbug_with_loc at ffffffffa054deeb [libcfs]
       #4 [ffff880494b83820] lod_fld_lookup at ffffffffa0f01ac5 [lod]
       #5 [ffff880494b83880] lod_object_alloc at ffffffffa0f042e6 [lod]
       #6 [ffff880494b838c0] mdd_object_init at ffffffffa0cba412 [mdd]
       #7 [ffff880494b838f0] lu_object_alloc at ffffffffa067fc4d [obdclass]
       #8 [ffff880494b83950] lu_object_find_at at ffffffffa06807b5 [obdclass]
       #9 [ffff880494b83a10] echo_md_handler at ffffffffa076fc71 [obdecho]
      #10 [ffff880494b83af0] echo_client_iocontrol at ffffffffa0775257 [obdecho]
      #11 [ffff880494b83d90] class_handle_ioctl at ffffffffa063f4cf [obdclass]
      #12 [ffff880494b83e40] obd_class_ioctl at ffffffffa06272ab [obdclass]
      #13 [ffff880494b83e60] vfs_ioctl at ffffffff81181372
      #14 [ffff880494b83ea0] do_vfs_ioctl at ffffffff81181514
      #15 [ffff880494b83f30] sys_ioctl at ffffffff81181a91
      #16 [ffff880494b83f80] system_call_fastpath at ffffffff81003072
          RIP: 0000003a3fedf7b7  RSP: 00007fffd6d74ba0  RFLAGS: 00010246
          RAX: 0000000000000010  RBX: ffffffff81003072  RCX: 0000000000000384
          RDX: 00007fffd6d74c20  RSI: 00000000824066dd  RDI: 0000000000000003
          RBP: 0000000000000001   R8: 00007fffd6d76c20   R9: 0000000000000240
          R10: 0000000000000001  R11: 0000000000000246  R12: 00000000824066dd
          R13: 00007fffd6d74c20  R14: 00000000006796c0  R15: 0000000000000003
          ORIG_RAX: 0000000000000010  CS: 0033  SS: 002b
      

      Steps to reproduce, using the internal commands launched by mds-survey:

      [root@mo90 ~]# lctl dl
        0 UP osd-ldiskfs fs2-MDT0001-osd fs2-MDT0001-osd_UUID 11
        1 UP mgc MGC30.1.0.95@o2ib 3a9a8da6-eabc-340d-b7bd-ad6260f44767 5
        2 UP mds MDS MDS_uuid 3
        3 UP lod fs2-MDT0001-mdtlov fs2-MDT0001-mdtlov_UUID 4
        4 UP mdt fs2-MDT0001 fs2-MDT0001_UUID 7
        5 UP mdd fs2-MDD0001 fs2-MDD0001_UUID 4
        6 UP osp fs2-OST0003-osc-MDT0001 fs2-MDT0001-mdtlov_UUID 5
        7 UP osp fs2-OST0002-osc-MDT0001 fs2-MDT0001-mdtlov_UUID 5
        8 UP osp fs2-OST0001-osc-MDT0001 fs2-MDT0001-mdtlov_UUID 5
        9 UP osp fs2-OST0000-osc-MDT0001 fs2-MDT0001-mdtlov_UUID 5
       10 UP osp fs2-MDT0000-osp-MDT0001 fs2-MDT0001-mdtlov_UUID 5
       11 UP lwp fs2-MDT0000-lwp-MDT0001 fs2-MDT0000-lwp-MDT0001_UUID 5
      [root@mo90 ~]# modprobe obdecho
      [root@mo90 ~]# lctl << EOF
      > attach echo_client fs2-MDT0001_ecc fs2-MDT0001_ecc_UUID
      > setup fs2-MDT0001 mdd
      > EOF
      [root@mo90 ~]# lctl --device 12  test_mkdir /tests
      

      Looking at the code, the null FID comes from the mdd_device associated with the echo_device: its mdd_root_fid is all zeros, as shown in the dump below.

      crash> struct echo_device.ed_next 0xffff8804b70f0200
        ed_next = 0xffff880921903000
      crash> print &((struct mdd_device *)0xffff880921903000)->mdd_md_dev.md_lu_dev
      $4 = (struct lu_device *) 0xffff880921903000
      crash> struct mdd_device 0xffff880921903000
      struct mdd_device {
        mdd_md_dev = {
          md_lu_dev = {
            ld_ref = {
              counter = 18
            }, 
            ld_type = 0xffffffffa0cf3500, 
            ld_ops = 0xffffffffa0ce4a00, 
            ld_site = 0xffff8804abf14150, 
            ld_proc_entry = 0x0, 
            ld_obd = 0xffff8809219540b8, 
            ld_reference = {<No data fields>}, 
            ld_linkage = {
              next = 0xffff8804abf14030, 
              prev = 0xffff880921905030
            }
          }, 
          md_ops = 0xffffffffa0ce4a20, 
          md_upcall = {
            mu_upcall_sem = {
              count = 0, 
              wait_lock = {
                raw_lock = {
                  slock = 0
                }
              }, 
              wait_list = {
                next = 0x0, 
                prev = 0x0
              }
            }, 
            mu_upcall_dev = 0x0, 
            mu_upcall = 0
          }
        }, 
        mdd_child_exp = 0xffff88092195bc00, 
        mdd_child = 0xffff880921950000, 
        mdd_bottom = 0xffff8804abf14000, 
        mdd_root_fid = {
          f_seq = 0, 
          f_oid = 0, 
          f_ver = 0
        }, 
        mdd_local_root_fid = {
          f_seq = 8589934593, 
          f_oid = 13, 
          f_ver = 0
        }, 
        mdd_dt_conf = {
          ddp_max_name_len = 255, 
          ddp_max_nlink = 65000, 
          ddp_block_shift = 12, 
          ddp_mntopts = 3, 
          ddp_max_ea_size = 4096, 
          ddp_mnt = 0xffff8804abcde3c0, 
          ddp_mount_type = 1, 
          ddp_maxbytes = 17592186040320, 
          ddp_grant_reserved = 2, 
          ddp_inodespace = 28, 
          ddp_grant_frag = 24576
        }, 
        mdd_orphans = 0xffff8804949173f8, 
        mdd_proc_entry = 0xffff880921956d40, 
        mdd_cl = {
          mc_lock = {
            raw_lock = {
              slock = 0
            }
          }, 
          mc_flags = 0, 
          mc_mask = -526337, 
          mc_index = 0, 
          mc_starttime = 4295358216, 
          mc_user_lock = {
            raw_lock = {
              slock = 0
            }
          }, 
          mc_lastuser = 0
        }, 
        mdd_atime_diff = 60, 
        mdd_dot_lustre = 0x0, 
        mdd_dot_lustre_objs = {
          mdd_obf = 0x0
        }, 
        mdd_lfsck = {
          ml_mutex = {
            count = {
              counter = 1
            }, 
            wait_lock = {
              raw_lock = {
                slock = 0
              }
            }, 
            wait_list = {
              next = 0xffff880921903148, 
              prev = 0xffff880921903148
            }, 
            owner = 0x0
          }, 
          ml_lock = {
            raw_lock = {
              slock = 0
            }
          }, 
          ml_list_scan = {
            next = 0xffff880921903168, 
            prev = 0xffff880921903168
          }, 
          ml_list_dir = {
            next = 0xffff880921903178, 
            prev = 0xffff880921903178
          }, 
          ml_list_double_scan = {
            next = 0xffff880921903188, 
            prev = 0xffff880921903188
          }, 
          ml_list_idle = {
            next = 0xffff880494901500, 
            prev = 0xffff880494901500
          }, 
          ml_thread = {
            t_link = {
              next = 0x0, 
              prev = 0x0
            }, 
            t_data = 0x0, 
            t_flags = 0, 
            t_id = 0, 
            t_pid = 0, 
            t_watchdog = 0x0, 
            t_svcpt = 0x0, 
            t_ctl_waitq = {
              lock = {
                raw_lock = {
                  slock = 0
                }
              }, 
              task_list = {
                next = 0xffff8809219031e8, 
                prev = 0xffff8809219031e8
              }
            }, 
            t_env = 0x0, 
            t_name = "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000"
          }, 
          ml_time_last_checkpoint = 0, 
          ml_time_next_checkpoint = 0, 
          ml_bookmark_obj = 0xffff8804949015c0, 
          ml_bookmark_ram = {
            lb_magic = 538119197, 
            lb_version = 2, 
            lb_param = 0, 
            lb_speed_limit = 0, 
            lb_padding = 0, 
            lb_reserved = {0, 0, 0, 0, 0, 0}
          }, 
          ml_bookmark_disk = {
            lb_magic = 538119197, 
            lb_version = 2, 
            lb_param = 0, 
            lb_speed_limit = 0, 
            lb_padding = 0, 
            lb_reserved = {0, 0, 0, 0, 0, 0}
          }, 
          ml_pos_current = {
            lp_oit_cookie = 0, 
            lp_dir_parent = {
              f_seq = 0, 
              f_oid = 0, 
              f_ver = 0
            }, 
            lp_dir_cookie = 0
          }, 
          ml_obj_oit = 0xffff880494901680, 
          ml_obj_dir = 0x0, 
          ml_di_oit = 0x0, 
          ml_di_dir = 0x0, 
          ml_args_oit = 0, 
          ml_args_dir = 0, 
          ml_sleep_rate = 0, 
          ml_sleep_jif = 0, 
          ml_new_scanned = 0, 
          ml_paused = 0, 
          ml_oit_over = 0, 
          ml_drop_dryrun = 0, 
          ml_initialized = 1, 
          ml_current_oit_processed = 0
        }, 
        mdd_sync_permission = 1, 
        mdd_connects = 1, 
        mdd_los = 0xffff880494dccc40
      }
      

      Does mds-survey support several MDTs?

      Attachments

        Activity

          People

            di.wang Di Wang
            pichong Gregoire Pichon
            Votes:
            0 Vote for this issue
            Watchers:
            6 Start watching this issue

            Dates

              Created:
              Updated:
              Resolved: