Details
-
Bug
-
Resolution: Fixed
-
Major
-
Lustre 2.4.0
-
None
-
3
-
9970
Description
When running mds-survey on multiple MDTs, the test_mkdir operation on a secondary MDT (i.e. any MDT other than MDT0000) crashes the system with the following LBUG.
LustreError: 6743:0:(lod_dev.c:69:lod_fld_lookup()) ASSERTION( fid_is_sane(fid) ) failed: Invalid FID [0x0:0x0:0x0]
LustreError: 6743:0:(lod_dev.c:69:lod_fld_lookup()) LBUG
crash> bt
PID: 6743 TASK: ffff8804b642a040 CPU: 1 COMMAND: "lctl"
#0 [ffff880494b83650] machine_kexec at ffffffff8102c48b
#1 [ffff880494b836b0] crash_kexec at ffffffff810abae2
#2 [ffff880494b83780] panic at ffffffff81499d3d
#3 [ffff880494b83800] lbug_with_loc at ffffffffa054deeb [libcfs]
#4 [ffff880494b83820] lod_fld_lookup at ffffffffa0f01ac5 [lod]
#5 [ffff880494b83880] lod_object_alloc at ffffffffa0f042e6 [lod]
#6 [ffff880494b838c0] mdd_object_init at ffffffffa0cba412 [mdd]
#7 [ffff880494b838f0] lu_object_alloc at ffffffffa067fc4d [obdclass]
#8 [ffff880494b83950] lu_object_find_at at ffffffffa06807b5 [obdclass]
#9 [ffff880494b83a10] echo_md_handler at ffffffffa076fc71 [obdecho]
#10 [ffff880494b83af0] echo_client_iocontrol at ffffffffa0775257 [obdecho]
#11 [ffff880494b83d90] class_handle_ioctl at ffffffffa063f4cf [obdclass]
#12 [ffff880494b83e40] obd_class_ioctl at ffffffffa06272ab [obdclass]
#13 [ffff880494b83e60] vfs_ioctl at ffffffff81181372
#14 [ffff880494b83ea0] do_vfs_ioctl at ffffffff81181514
#15 [ffff880494b83f30] sys_ioctl at ffffffff81181a91
#16 [ffff880494b83f80] system_call_fastpath at ffffffff81003072
RIP: 0000003a3fedf7b7 RSP: 00007fffd6d74ba0 RFLAGS: 00010246
RAX: 0000000000000010 RBX: ffffffff81003072 RCX: 0000000000000384
RDX: 00007fffd6d74c20 RSI: 00000000824066dd RDI: 0000000000000003
RBP: 0000000000000001 R8: 00007fffd6d76c20 R9: 0000000000000240
R10: 0000000000000001 R11: 0000000000000246 R12: 00000000824066dd
R13: 00007fffd6d74c20 R14: 00000000006796c0 R15: 0000000000000003
ORIG_RAX: 0000000000000010 CS: 0033 SS: 002b
Steps to reproduce, using the same internal commands that mds-survey launches:
[root@mo90 ~]# lctl dl
0 UP osd-ldiskfs fs2-MDT0001-osd fs2-MDT0001-osd_UUID 11
1 UP mgc MGC30.1.0.95@o2ib 3a9a8da6-eabc-340d-b7bd-ad6260f44767 5
2 UP mds MDS MDS_uuid 3
3 UP lod fs2-MDT0001-mdtlov fs2-MDT0001-mdtlov_UUID 4
4 UP mdt fs2-MDT0001 fs2-MDT0001_UUID 7
5 UP mdd fs2-MDD0001 fs2-MDD0001_UUID 4
6 UP osp fs2-OST0003-osc-MDT0001 fs2-MDT0001-mdtlov_UUID 5
7 UP osp fs2-OST0002-osc-MDT0001 fs2-MDT0001-mdtlov_UUID 5
8 UP osp fs2-OST0001-osc-MDT0001 fs2-MDT0001-mdtlov_UUID 5
9 UP osp fs2-OST0000-osc-MDT0001 fs2-MDT0001-mdtlov_UUID 5
10 UP osp fs2-MDT0000-osp-MDT0001 fs2-MDT0001-mdtlov_UUID 5
11 UP lwp fs2-MDT0000-lwp-MDT0001 fs2-MDT0000-lwp-MDT0001_UUID 5
[root@mo90 ~]# modprobe obdecho
[root@mo90 ~]# lctl << EOF
> attach echo_client fs2-MDT0001_ecc fs2-MDT0001_ecc_UUID
> setup fs2-MDT0001 mdd
> EOF
[root@mo90 ~]# lctl --device 12 test_mkdir /tests
Looking at the code, the null FID comes from the mdd_device associated with the echo_device: in the dump below, its mdd_root_fid is all zeros (f_seq = 0, f_oid = 0, f_ver = 0).
crash> struct echo_device.ed_next 0xffff8804b70f0200 ed_next = 0xffff880921903000 crash> print &((struct mdd_device *)0xffff880921903000)->mdd_md_dev.md_lu_dev $4 = (struct lu_device *) 0xffff880921903000 crash> struct mdd_device 0xffff880921903000 struct mdd_device { mdd_md_dev = { md_lu_dev = { ld_ref = { counter = 18 }, ld_type = 0xffffffffa0cf3500, ld_ops = 0xffffffffa0ce4a00, ld_site = 0xffff8804abf14150, ld_proc_entry = 0x0, ld_obd = 0xffff8809219540b8, ld_reference = {<No data fields>}, ld_linkage = { next = 0xffff8804abf14030, prev = 0xffff880921905030 } }, md_ops = 0xffffffffa0ce4a20, md_upcall = { mu_upcall_sem = { count = 0, wait_lock = { raw_lock = { slock = 0 } }, wait_list = { next = 0x0, prev = 0x0 } }, mu_upcall_dev = 0x0, mu_upcall = 0 } }, mdd_child_exp = 0xffff88092195bc00, mdd_child = 0xffff880921950000, mdd_bottom = 0xffff8804abf14000, mdd_root_fid = { f_seq = 0, f_oid = 0, f_ver = 0 }, mdd_local_root_fid = { f_seq = 8589934593, f_oid = 13, f_ver = 0 }, mdd_dt_conf = { ddp_max_name_len = 255, ddp_max_nlink = 65000, ddp_block_shift = 12, ddp_mntopts = 3, ddp_max_ea_size = 4096, ddp_mnt = 0xffff8804abcde3c0, ddp_mount_type = 1, ddp_maxbytes = 17592186040320, ddp_grant_reserved = 2, ddp_inodespace = 28, ddp_grant_frag = 24576 }, mdd_orphans = 0xffff8804949173f8, mdd_proc_entry = 0xffff880921956d40, mdd_cl = { mc_lock = { raw_lock = { slock = 0 } }, mc_flags = 0, mc_mask = -526337, mc_index = 0, mc_starttime = 4295358216, mc_user_lock = { raw_lock = { slock = 0 } }, mc_lastuser = 0 }, mdd_atime_diff = 60, mdd_dot_lustre = 0x0, mdd_dot_lustre_objs = { mdd_obf = 0x0 }, mdd_lfsck = { ml_mutex = { count = { counter = 1 }, wait_lock = { raw_lock = { slock = 0 } }, wait_list = { next = 0xffff880921903148, prev = 0xffff880921903148 }, owner = 0x0 }, ml_lock = { raw_lock = { slock = 0 } }, ml_list_scan = { next = 0xffff880921903168, prev = 0xffff880921903168 }, ml_list_dir = { next = 0xffff880921903178, prev = 0xffff880921903178 }, ml_list_double_scan = { 
next = 0xffff880921903188, prev = 0xffff880921903188 }, ml_list_idle = { next = 0xffff880494901500, prev = 0xffff880494901500 }, ml_thread = { t_link = { next = 0x0, prev = 0x0 }, t_data = 0x0, t_flags = 0, t_id = 0, t_pid = 0, t_watchdog = 0x0, t_svcpt = 0x0, t_ctl_waitq = { lock = { raw_lock = { slock = 0 } }, task_list = { next = 0xffff8809219031e8, prev = 0xffff8809219031e8 } }, t_env = 0x0, t_name = "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" }, ml_time_last_checkpoint = 0, ml_time_next_checkpoint = 0, ml_bookmark_obj = 0xffff8804949015c0, ml_bookmark_ram = { lb_magic = 538119197, lb_version = 2, lb_param = 0, lb_speed_limit = 0, lb_padding = 0, lb_reserved = {0, 0, 0, 0, 0, 0} }, ml_bookmark_disk = { lb_magic = 538119197, lb_version = 2, lb_param = 0, lb_speed_limit = 0, lb_padding = 0, lb_reserved = {0, 0, 0, 0, 0, 0} }, ml_pos_current = { lp_oit_cookie = 0, lp_dir_parent = { f_seq = 0, f_oid = 0, f_ver = 0 }, lp_dir_cookie = 0 }, ml_obj_oit = 0xffff880494901680, ml_obj_dir = 0x0, ml_di_oit = 0x0, ml_di_dir = 0x0, ml_args_oit = 0, ml_args_dir = 0, ml_sleep_rate = 0, ml_sleep_jif = 0, ml_new_scanned = 0, ml_paused = 0, ml_oit_over = 0, ml_drop_dryrun = 0, ml_initialized = 1, ml_current_oit_processed = 0 }, mdd_sync_permission = 1, mdd_connects = 1, mdd_los = 0xffff880494dccc40 }
Does mds-survey support multiple MDTs?