Details
-
Bug
-
Resolution: Fixed
-
Major
-
Lustre 2.4.0
-
None
-
3
-
9970
Description
When running mds-survey on multiple MDTs, the test_mkdir operation on a secondary MDT (i.e., any MDT other than MDT0000) crashes the system with the following LBUG:
LustreError: 6743:0:(lod_dev.c:69:lod_fld_lookup()) ASSERTION( fid_is_sane(fid) ) failed: Invalid FID [0x0:0x0:0x0]
LustreError: 6743:0:(lod_dev.c:69:lod_fld_lookup()) LBUG
crash> bt
PID: 6743 TASK: ffff8804b642a040 CPU: 1 COMMAND: "lctl"
#0 [ffff880494b83650] machine_kexec at ffffffff8102c48b
#1 [ffff880494b836b0] crash_kexec at ffffffff810abae2
#2 [ffff880494b83780] panic at ffffffff81499d3d
#3 [ffff880494b83800] lbug_with_loc at ffffffffa054deeb [libcfs]
#4 [ffff880494b83820] lod_fld_lookup at ffffffffa0f01ac5 [lod]
#5 [ffff880494b83880] lod_object_alloc at ffffffffa0f042e6 [lod]
#6 [ffff880494b838c0] mdd_object_init at ffffffffa0cba412 [mdd]
#7 [ffff880494b838f0] lu_object_alloc at ffffffffa067fc4d [obdclass]
#8 [ffff880494b83950] lu_object_find_at at ffffffffa06807b5 [obdclass]
#9 [ffff880494b83a10] echo_md_handler at ffffffffa076fc71 [obdecho]
#10 [ffff880494b83af0] echo_client_iocontrol at ffffffffa0775257 [obdecho]
#11 [ffff880494b83d90] class_handle_ioctl at ffffffffa063f4cf [obdclass]
#12 [ffff880494b83e40] obd_class_ioctl at ffffffffa06272ab [obdclass]
#13 [ffff880494b83e60] vfs_ioctl at ffffffff81181372
#14 [ffff880494b83ea0] do_vfs_ioctl at ffffffff81181514
#15 [ffff880494b83f30] sys_ioctl at ffffffff81181a91
#16 [ffff880494b83f80] system_call_fastpath at ffffffff81003072
RIP: 0000003a3fedf7b7 RSP: 00007fffd6d74ba0 RFLAGS: 00010246
RAX: 0000000000000010 RBX: ffffffff81003072 RCX: 0000000000000384
RDX: 00007fffd6d74c20 RSI: 00000000824066dd RDI: 0000000000000003
RBP: 0000000000000001 R8: 00007fffd6d76c20 R9: 0000000000000240
R10: 0000000000000001 R11: 0000000000000246 R12: 00000000824066dd
R13: 00007fffd6d74c20 R14: 00000000006796c0 R15: 0000000000000003
ORIG_RAX: 0000000000000010 CS: 0033 SS: 002b
Steps to reproduce, using the internal commands launched by mds-survey:
[root@mo90 ~]# lctl dl
0 UP osd-ldiskfs fs2-MDT0001-osd fs2-MDT0001-osd_UUID 11
1 UP mgc MGC30.1.0.95@o2ib 3a9a8da6-eabc-340d-b7bd-ad6260f44767 5
2 UP mds MDS MDS_uuid 3
3 UP lod fs2-MDT0001-mdtlov fs2-MDT0001-mdtlov_UUID 4
4 UP mdt fs2-MDT0001 fs2-MDT0001_UUID 7
5 UP mdd fs2-MDD0001 fs2-MDD0001_UUID 4
6 UP osp fs2-OST0003-osc-MDT0001 fs2-MDT0001-mdtlov_UUID 5
7 UP osp fs2-OST0002-osc-MDT0001 fs2-MDT0001-mdtlov_UUID 5
8 UP osp fs2-OST0001-osc-MDT0001 fs2-MDT0001-mdtlov_UUID 5
9 UP osp fs2-OST0000-osc-MDT0001 fs2-MDT0001-mdtlov_UUID 5
10 UP osp fs2-MDT0000-osp-MDT0001 fs2-MDT0001-mdtlov_UUID 5
11 UP lwp fs2-MDT0000-lwp-MDT0001 fs2-MDT0000-lwp-MDT0001_UUID 5
[root@mo90 ~]# modprobe obdecho
[root@mo90 ~]# lctl << EOF
> attach echo_client fs2-MDT0001_ecc fs2-MDT0001_ecc_UUID
> setup fs2-MDT0001 mdd
> EOF
[root@mo90 ~]# lctl --device 12 test_mkdir /tests
Looking at the code, the null FID comes from the mdd_device associated with the echo_device (note that mdd_root_fid is all zeros in the dump below, matching the [0x0:0x0:0x0] in the assertion):
crash> struct echo_device.ed_next 0xffff8804b70f0200
ed_next = 0xffff880921903000
crash> print &((struct mdd_device *)0xffff880921903000)->mdd_md_dev.md_lu_dev
$4 = (struct lu_device *) 0xffff880921903000
crash> struct mdd_device 0xffff880921903000
struct mdd_device {
mdd_md_dev = {
md_lu_dev = {
ld_ref = {
counter = 18
},
ld_type = 0xffffffffa0cf3500,
ld_ops = 0xffffffffa0ce4a00,
ld_site = 0xffff8804abf14150,
ld_proc_entry = 0x0,
ld_obd = 0xffff8809219540b8,
ld_reference = {<No data fields>},
ld_linkage = {
next = 0xffff8804abf14030,
prev = 0xffff880921905030
}
},
md_ops = 0xffffffffa0ce4a20,
md_upcall = {
mu_upcall_sem = {
count = 0,
wait_lock = {
raw_lock = {
slock = 0
}
},
wait_list = {
next = 0x0,
prev = 0x0
}
},
mu_upcall_dev = 0x0,
mu_upcall = 0
}
},
mdd_child_exp = 0xffff88092195bc00,
mdd_child = 0xffff880921950000,
mdd_bottom = 0xffff8804abf14000,
mdd_root_fid = {
f_seq = 0,
f_oid = 0,
f_ver = 0
},
mdd_local_root_fid = {
f_seq = 8589934593,
f_oid = 13,
f_ver = 0
},
mdd_dt_conf = {
ddp_max_name_len = 255,
ddp_max_nlink = 65000,
ddp_block_shift = 12,
ddp_mntopts = 3,
ddp_max_ea_size = 4096,
ddp_mnt = 0xffff8804abcde3c0,
ddp_mount_type = 1,
ddp_maxbytes = 17592186040320,
ddp_grant_reserved = 2,
ddp_inodespace = 28,
ddp_grant_frag = 24576
},
mdd_orphans = 0xffff8804949173f8,
mdd_proc_entry = 0xffff880921956d40,
mdd_cl = {
mc_lock = {
raw_lock = {
slock = 0
}
},
mc_flags = 0,
mc_mask = -526337,
mc_index = 0,
mc_starttime = 4295358216,
mc_user_lock = {
raw_lock = {
slock = 0
}
},
mc_lastuser = 0
},
mdd_atime_diff = 60,
mdd_dot_lustre = 0x0,
mdd_dot_lustre_objs = {
mdd_obf = 0x0
},
mdd_lfsck = {
ml_mutex = {
count = {
counter = 1
},
wait_lock = {
raw_lock = {
slock = 0
}
},
wait_list = {
next = 0xffff880921903148,
prev = 0xffff880921903148
},
owner = 0x0
},
ml_lock = {
raw_lock = {
slock = 0
}
},
ml_list_scan = {
next = 0xffff880921903168,
prev = 0xffff880921903168
},
ml_list_dir = {
next = 0xffff880921903178,
prev = 0xffff880921903178
},
ml_list_double_scan = {
next = 0xffff880921903188,
prev = 0xffff880921903188
},
ml_list_idle = {
next = 0xffff880494901500,
prev = 0xffff880494901500
},
ml_thread = {
t_link = {
next = 0x0,
prev = 0x0
},
t_data = 0x0,
t_flags = 0,
t_id = 0,
t_pid = 0,
t_watchdog = 0x0,
t_svcpt = 0x0,
t_ctl_waitq = {
lock = {
raw_lock = {
slock = 0
}
},
task_list = {
next = 0xffff8809219031e8,
prev = 0xffff8809219031e8
}
},
t_env = 0x0,
t_name = "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000"
},
ml_time_last_checkpoint = 0,
ml_time_next_checkpoint = 0,
ml_bookmark_obj = 0xffff8804949015c0,
ml_bookmark_ram = {
lb_magic = 538119197,
lb_version = 2,
lb_param = 0,
lb_speed_limit = 0,
lb_padding = 0,
lb_reserved = {0, 0, 0, 0, 0, 0}
},
ml_bookmark_disk = {
lb_magic = 538119197,
lb_version = 2,
lb_param = 0,
lb_speed_limit = 0,
lb_padding = 0,
lb_reserved = {0, 0, 0, 0, 0, 0}
},
ml_pos_current = {
lp_oit_cookie = 0,
lp_dir_parent = {
f_seq = 0,
f_oid = 0,
f_ver = 0
},
lp_dir_cookie = 0
},
ml_obj_oit = 0xffff880494901680,
ml_obj_dir = 0x0,
ml_di_oit = 0x0,
ml_di_dir = 0x0,
ml_args_oit = 0,
ml_args_dir = 0,
ml_sleep_rate = 0,
ml_sleep_jif = 0,
ml_new_scanned = 0,
ml_paused = 0,
ml_oit_over = 0,
ml_drop_dryrun = 0,
ml_initialized = 1,
ml_current_oit_processed = 0
},
mdd_sync_permission = 1,
mdd_connects = 1,
mdd_los = 0xffff880494dccc40
}
Does mds-survey support several MDTs?