[LU-2090] mdt_req_handle() ASSERTION(h->mh_act != NULL) failed Created: 04/Oct/12 Updated: 07/Oct/15 Resolved: 07/Oct/15
| Status: | Closed |
| Project: | Lustre |
| Component/s: | None |
| Affects Version/s: | Lustre 2.1.2 |
| Fix Version/s: | None |
| Type: | Bug | Priority: | Critical |
| Reporter: | Ned Bass | Assignee: | Alex Zhuravlev |
| Resolution: | Cannot Reproduce | Votes: | 0 |
| Labels: | llnl |
| Environment: | |
| Attachments: | |
| Severity: | 3 |
| Rank (Obsolete): | 4365 |
| Description |
Production MDS got stuck in a crash/reboot loop. It hit the assertion in the summary this morning, then again on each reboot during recovery. We finally aborted recovery and the MDS stabilized. We have several crash dumps. Backtrace below; console log attached.

LustreError: 3411:0:(mdt_handler.c:2511:mdt_req_handle()) ASSERTION(h->mh_act != NULL) failed
LustreError: 3411:0:(mdt_handler.c:2511:mdt_req_handle()) LBUG

PID: 7364  TASK: ffff88079b637540  CPU: 14  COMMAND: "mdt_221"
 #0 [ffff88077e3abb98] machine_kexec at ffffffff8103216b
 #1 [ffff88077e3abbf8] crash_kexec at ffffffff810b8d12
 #2 [ffff88077e3abcc8] panic at ffffffff814ee999
 #3 [ffff88077e3abd48] lbug_with_loc at ffffffffa0456e1b [libcfs]
 #4 [ffff88077e3abd68] libcfs_assertion_failed at ffffffffa046042d [libcfs]
 #5 [ffff88077e3abd88] mdt_handle_common at ffffffffa0c162d9 [mdt]
 #6 [ffff88077e3abdd8] mdt_regular_handle at ffffffffa0c163f5 [mdt]
 #7 [ffff88077e3abde8] ptlrpc_main at ffffffffa0717d64 [ptlrpc]
 #8 [ffff88077e3abf48] kernel_thread at ffffffff8100c14a

LLNL-bugzilla-ID: 1836
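For readers without the source handy, here is a minimal userspace sketch of the dispatch pattern the backtrace implies: a handler table whose entry must carry a non-NULL action callback before it is invoked. The struct fields mirror the mdt_handler dump in the comments below; the table contents, helper names, and dispatch logic are illustrative, not the actual Lustre source.

#include <assert.h>
#include <stddef.h>

struct mdt_handler {
        const char *mh_name;      /* handler name, e.g. "CATINFO" */
        int         mh_fail_id;   /* fault-injection id */
        int         mh_opc;       /* RPC opcode this slot serves */
        int         mh_flags;
        int       (*mh_act)(void);/* action callback; NULL here => LBUG */
        void       *mh_fmt;
};

/* Hypothetical table: the CATINFO slot (opc 0x1fb) has a NULL action. */
static struct mdt_handler handlers[] = {
        { "CATINFO", 0x1309, 0x1fb, 0, NULL, NULL },
};

static int mdt_req_handle_sketch(struct mdt_handler *h)
{
        /* Equivalent of ASSERTION(h->mh_act != NULL): a NULL action
         * callback takes the whole server down instead of failing
         * just this one RPC. */
        assert(h->mh_act != NULL);
        return h->mh_act();
}

int main(void)
{
        return mdt_req_handle_sketch(&handlers[0]); /* aborts, like the LBUG */
}

Run as-is, this aborts inside mdt_req_handle_sketch(), the userspace analogue of the LBUG panic above.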
| Comments |
| Comment by Alex Zhuravlev [ 04/Oct/12 ] |
Could you print *m and *req from crash, please?
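For reference, the pointers can be recovered from the vmcore with the crash(8) utility, assuming the Lustre module debuginfo has been loaded with mod -S. The request address below is the one the dump in the next comment shows (rq_pill.rc_req points back at the request itself); the handler address has to be fished out of the raw frame data, and the exact stack offsets vary by build:

crash> bt -f 7364
crash> struct ptlrpc_request ffff88070dd9c800
crash> struct mdt_handler <address of h from the frame data>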
| Comment by Ned Bass [ 04/Oct/12 ] |
I believe I found the right pointers. I was able to validate req via req->rq_svc_thread->t_pid == 7364. Also, h->mh_fail_id == OBD_FAIL_LLOG_CATINFO_NET, which is consistent with the name "CATINFO".

ptlrpc_request: struct ptlrpc_request {
rq_type = 0x0,
rq_list = {
next = 0xffff88070dd9c808,
prev = 0xffff88070dd9c808
},
rq_timed_list = {
next = 0xffff88017ae504c0,
prev = 0xffff88017ae504c0
},
rq_history_list = {
next = 0xffff8801c4734878,
prev = 0xffff8806f68c1c28
},
rq_exp_list = {
next = 0x0,
prev = 0x0
},
rq_ops = 0x0,
rq_history_seq = 0xdda97211,
rq_at_index = 0x4c,
rq_status = 0x0,
rq_lock = {
raw_lock = {
slock = 0x10001
}
},
rq_intr = 0x0,
rq_replied = 0x0,
rq_err = 0x0,
rq_timedout = 0x0,
rq_resend = 0x0,
rq_restart = 0x0,
rq_replay = 0x0,
rq_no_resend = 0x0,
rq_waiting = 0x0,
rq_receiving_reply = 0x0,
rq_no_delay = 0x0,
rq_net_err = 0x0,
rq_wait_ctx = 0x0,
rq_early = 0x0,
rq_must_unlink = 0x0,
rq_fake = 0x0,
rq_memalloc = 0x0,
rq_packed_final = 0x0,
rq_hp = 0x0,
rq_at_linked = 0x1,
rq_reply_truncate = 0x0,
rq_committed = 0x0,
rq_invalid_rqset = 0x0,
rq_phase = 3955285507,
rq_next_phase = 0,
rq_refcount = {
counter = 0x1
},
rq_svc_thread = 0xffff88069183a1c0,
rq_request_portal = 0x0,
rq_reply_portal = 0x0,
rq_nob_received = 0x0,
rq_reqlen = 0xe0,
rq_reqmsg = 0xffff8801c4780600,
rq_replen = 0x0,
rq_repmsg = 0x0,
rq_transno = 0x0,
rq_xid = 0x505a44c0930ad,
rq_replay_list = {
next = 0x0,
prev = 0x0
},
rq_cli_ctx = 0x0,
rq_svc_ctx = 0xffffffffa07aff70,
rq_ctx_chain = {
next = 0x0,
prev = 0x0
},
rq_flvr = {
sf_rpc = 0x0,
sf_flags = 0x0,
u_rpc = {<No data fields>},
u_bulk = {
hash = {
hash_alg = 0x0
}
}
},
rq_sp_from = LUSTRE_SP_CLI,
rq_ctx_init = 0x0,
rq_ctx_fini = 0x0,
rq_bulk_read = 0x0,
rq_bulk_write = 0x0,
rq_auth_gss = 0x0,
rq_auth_remote = 0x0,
rq_auth_usr_root = 0x0,
rq_auth_usr_mdt = 0x0,
rq_auth_usr_ost = 0x0,
rq_pack_udesc = 0x0,
rq_pack_bulk = 0x0,
rq_no_reply = 0x0,
rq_pill_init = 0x1,
rq_auth_uid = 0xffffffff,
rq_auth_mapped_uid = 0xffffffff,
rq_user_desc = 0x0,
rq_reply_off = 0x0,
rq_reqbuf = 0xffff8801c4780600,
rq_reqbuf_len = 0x0,
rq_reqdata_len = 0xe0,
rq_repbuf = 0x0,
rq_repbuf_len = 0x0,
rq_repdata = 0x0,
rq_repdata_len = 0x0,
rq_clrbuf = 0x0,
rq_clrbuf_len = 0x0,
rq_clrdata_len = 0x0,
rq_req_swab_mask = 0x0,
rq_rep_swab_mask = 0x0,
rq_import_generation = 0x0,
rq_send_state = 0,
rq_early_count = 0x0,
rq_req_md_h = {
cookie = 0x0
},
rq_req_cbid = {
cbid_fn = 0,
cbid_arg = 0x0
},
rq_delay_limit = 0x0,
rq_queued_time = 0x0,
rq_arrival_time = {
tv_sec = 0x506dbaa0,
tv_usec = 0xb26ad
},
rq_reply_state = 0x0,
rq_rqbd = 0xffff8801c4734800,
rq_reply_md_h = {
cookie = 0x0
},
rq_reply_waitq = {
lock = {
raw_lock = {
slock = 0x0
}
},
task_list = {
next = 0x0,
prev = 0x0
}
},
rq_reply_cbid = {
cbid_fn = 0,
cbid_arg = 0x0
},
rq_self = 0x20000ac103cc8,
rq_peer = {
nid = 0x50005c0a8729b,
pid = 0x3039
},
rq_export = 0xffff8803af54c800,
rq_import = 0x0,
rq_replay_cb = 0,
rq_commit_cb = 0,
rq_cb_data = 0x0,
rq_bulk = 0x0,
rq_sent = 0x0,
rq_real_sent = 0x0,
rq_deadline = 0x506dbadd,
rq_reply_deadline = 0x0,
rq_bulk_deadline = 0x0,
rq_timeout = 0x0,
rq_set_chain = {
next = 0x0,
prev = 0x0
},
rq_set_waitq = {
lock = {
raw_lock = {
slock = 0x0
}
},
task_list = {
next = 0x0,
prev = 0x0
}
},
rq_set = 0x0,
rq_interpret_reply = 0,
rq_async_args = {
pointer_arg = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
space = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
},
rq_pool = 0x0,
rq_session = {
lc_tags = 0xa0000010,
lc_thread = 0xffff88069183a1c0,
lc_value = 0xffff88069f8ee380,
lc_state = LCS_ENTERED,
lc_remember = {
next = 0xffff880584afdb60,
prev = 0xffffffffa0625fb0
},
lc_version = 0x25,
lc_cookie = 0x5
},
rq_recov_session = {
lc_tags = 0x0,
lc_thread = 0x0,
lc_value = 0x0,
lc_state = 0,
lc_remember = {
next = 0x0,
prev = 0x0
},
lc_version = 0x0,
lc_cookie = 0x0
},
rq_pill = {
rc_req = 0xffff88070dd9c800,
rc_fmt = 0x0,
rc_loc = RCL_SERVER,
rc_area = {{0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff}, {0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff}}
}
}

mdt_handler: struct mdt_handler {
mh_name = 0xffffffffa0c4459b "CATINFO",
mh_fail_id = 0x1309,
mh_opc = 0x1fb,
mh_flags = 0x0,
mh_act = 0,
mh_fmt = 0x0
}
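Reading the dump: mh_act = 0 confirms the table slot for the CATINFO request (mh_opc 0x1fb, mh_fail_id 0x1309) has no action callback, so any client RPC that lands on this slot trips the assertion. Purely as an illustration of the design trade-off, here is a self-contained sketch of a defensive variant that rejects such a request with an error instead of LBUGging the whole MDS; this is not the actual Lustre code or a proposed patch.

#include <errno.h>
#include <stdio.h>
#include <stddef.h>

struct mdt_handler {
        const char *mh_name;
        int         mh_fail_id;
        int         mh_opc;
        int         mh_flags;
        int       (*mh_act)(void);
        void       *mh_fmt;
};

static int req_handle_defensive(struct mdt_handler *h)
{
        if (h->mh_act == NULL) {
                fprintf(stderr, "%s (opc 0x%x): no action handler, rejecting\n",
                        h->mh_name, h->mh_opc);
                return -EOPNOTSUPP; /* fail this one RPC, keep the MDS up */
        }
        return h->mh_act();
}

int main(void)
{
        /* Values copied from the dump above. */
        struct mdt_handler h = { "CATINFO", 0x1309, 0x1fb, 0, NULL, NULL };
        return -req_handle_defensive(&h);
}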
| Comment by Alex Zhuravlev [ 07/Oct/15 ] |
The issue doesn't seem to happen anymore, and I wasn't able to reproduce it.