[LU-2090] mdt_req_handle() ASSERTION(h->mh_act != NULL) failed Created: 04/Oct/12  Updated: 07/Oct/15  Resolved: 07/Oct/15

Status: Closed
Project: Lustre
Component/s: None
Affects Version/s: Lustre 2.1.2
Fix Version/s: None

Type: Bug Priority: Critical
Reporter: Ned Bass Assignee: Alex Zhuravlev
Resolution: Cannot Reproduce Votes: 0
Labels: llnl
Environment:

https://github.com/chaos/lustre/commits/2.1.2-3chaos


Attachments: Text File pigs-mds1-console-log.txt    
Severity: 3
Rank (Obsolete): 4365

 Description   

Production MDS got stuck in a crash/reboot loop. It hit the assertion named in the summary this morning, then hit it again on each reboot during recovery. Finally we aborted recovery and the MDS stabilized. We have several crash dumps.

Backtrace below and console log attached.

LustreError: 3411:0:(mdt_handler.c:2511:mdt_req_handle()) ASSERTION(h->mh_act != NULL) failed
LustreError: 3411:0:(mdt_handler.c:2511:mdt_req_handle()) LBUG

PID: 7364   TASK: ffff88079b637540  CPU: 14  COMMAND: "mdt_221"
 #0 [ffff88077e3abb98] machine_kexec at ffffffff8103216b
 #1 [ffff88077e3abbf8] crash_kexec at ffffffff810b8d12
 #2 [ffff88077e3abcc8] panic at ffffffff814ee999
 #3 [ffff88077e3abd48] lbug_with_loc at ffffffffa0456e1b [libcfs]
 #4 [ffff88077e3abd68] libcfs_assertion_failed at ffffffffa046042d [libcfs]
 #5 [ffff88077e3abd88] mdt_handle_common at ffffffffa0c162d9 [mdt]
 #6 [ffff88077e3abdd8] mdt_regular_handle at ffffffffa0c163f5 [mdt]
 #7 [ffff88077e3abde8] ptlrpc_main at ffffffffa0717d64 [ptlrpc]
 #8 [ffff88077e3abf48] kernel_thread at ffffffff8100c14a

LLNL-bugzilla-ID: 1836



 Comments   
Comment by Alex Zhuravlev [ 04/Oct/12 ]

could you print *m and *req from crash, please?

Comment by Ned Bass [ 04/Oct/12 ]

I believe I found the right pointers. I was able to validate req by req->rq_svc_thread->t_pid == 7364. Also h->mh_fail_id == OBD_FAIL_LLOG_CATINFO_NET which looks consistent for the name "CATINFO".

ptlrpc_request:

struct ptlrpc_request {
  rq_type = 0x0,
  rq_list = {
    next = 0xffff88070dd9c808,
    prev = 0xffff88070dd9c808
  },
  rq_timed_list = {
    next = 0xffff88017ae504c0,
    prev = 0xffff88017ae504c0
  },
  rq_history_list = {
    next = 0xffff8801c4734878,
    prev = 0xffff8806f68c1c28
  },
  rq_exp_list = {
    next = 0x0,
    prev = 0x0
  },
  rq_ops = 0x0,
  rq_history_seq = 0xdda97211,
  rq_at_index = 0x4c,
  rq_status = 0x0,
  rq_lock = {
    raw_lock = {
      slock = 0x10001
    }
  },
  rq_intr = 0x0,
  rq_replied = 0x0,
  rq_err = 0x0,
  rq_timedout = 0x0,
  rq_resend = 0x0,
  rq_restart = 0x0,
  rq_replay = 0x0,
  rq_no_resend = 0x0,
  rq_waiting = 0x0,
  rq_receiving_reply = 0x0,
  rq_no_delay = 0x0,
  rq_net_err = 0x0,
  rq_wait_ctx = 0x0,
  rq_early = 0x0,
  rq_must_unlink = 0x0,
  rq_fake = 0x0,
  rq_memalloc = 0x0,
  rq_packed_final = 0x0,
  rq_hp = 0x0,
  rq_at_linked = 0x1,
  rq_reply_truncate = 0x0,
  rq_committed = 0x0,
  rq_invalid_rqset = 0x0,
  rq_phase = 3955285507,
  rq_next_phase = 0,
  rq_refcount = {
    counter = 0x1
  },
  rq_svc_thread = 0xffff88069183a1c0,
  rq_request_portal = 0x0,
  rq_reply_portal = 0x0,
  rq_nob_received = 0x0,
  rq_reqlen = 0xe0,
  rq_reqmsg = 0xffff8801c4780600,
  rq_replen = 0x0,
  rq_repmsg = 0x0,
  rq_transno = 0x0,
  rq_xid = 0x505a44c0930ad,
  rq_replay_list = {
    next = 0x0,
    prev = 0x0
  },
  rq_cli_ctx = 0x0,
  rq_svc_ctx = 0xffffffffa07aff70,
  rq_ctx_chain = {
    next = 0x0,
    prev = 0x0
  },
  rq_flvr = {
    sf_rpc = 0x0,
    sf_flags = 0x0,
    u_rpc = {<No data fields>},
    u_bulk = {
      hash = {
        hash_alg = 0x0
      }
    }
  },
  rq_sp_from = LUSTRE_SP_CLI,
  rq_ctx_init = 0x0,
  rq_ctx_fini = 0x0,
  rq_bulk_read = 0x0,
  rq_bulk_write = 0x0,
  rq_auth_gss = 0x0,
  rq_auth_remote = 0x0,
  rq_auth_usr_root = 0x0,
  rq_auth_usr_mdt = 0x0,
  rq_auth_usr_ost = 0x0,
  rq_pack_udesc = 0x0,
  rq_pack_bulk = 0x0,
  rq_no_reply = 0x0,
  rq_pill_init = 0x1,
  rq_auth_uid = 0xffffffff,
  rq_auth_mapped_uid = 0xffffffff,
  rq_user_desc = 0x0,
  rq_reply_off = 0x0,
  rq_reqbuf = 0xffff8801c4780600,
  rq_reqbuf_len = 0x0,
  rq_reqdata_len = 0xe0,
  rq_repbuf = 0x0,
  rq_repbuf_len = 0x0,
  rq_repdata = 0x0,
  rq_repdata_len = 0x0,
  rq_clrbuf = 0x0,
  rq_clrbuf_len = 0x0,
  rq_clrdata_len = 0x0,
  rq_req_swab_mask = 0x0,
  rq_rep_swab_mask = 0x0,
  rq_import_generation = 0x0,
  rq_send_state = 0,
  rq_early_count = 0x0,
  rq_req_md_h = {
    cookie = 0x0
  },
  rq_req_cbid = {
    cbid_fn = 0,
    cbid_arg = 0x0
  },
  rq_delay_limit = 0x0,
  rq_queued_time = 0x0,
  rq_arrival_time = {
    tv_sec = 0x506dbaa0,
    tv_usec = 0xb26ad
  },
  rq_reply_state = 0x0,
  rq_rqbd = 0xffff8801c4734800,
  rq_reply_md_h = {
    cookie = 0x0
  },
  rq_reply_waitq = {
    lock = {
      raw_lock = {
        slock = 0x0
      }
    },
    task_list = {
      next = 0x0,
      prev = 0x0
    }
  },
  rq_reply_cbid = {
    cbid_fn = 0,
    cbid_arg = 0x0
  },
  rq_self = 0x20000ac103cc8,
  rq_peer = {
    nid = 0x50005c0a8729b,
    pid = 0x3039
  },
  rq_export = 0xffff8803af54c800,
  rq_import = 0x0,
  rq_replay_cb = 0,
  rq_commit_cb = 0,
  rq_cb_data = 0x0,
  rq_bulk = 0x0,
  rq_sent = 0x0,
  rq_real_sent = 0x0,
  rq_deadline = 0x506dbadd,
  rq_reply_deadline = 0x0,
  rq_bulk_deadline = 0x0,
  rq_timeout = 0x0,
  rq_set_chain = {
    next = 0x0,
    prev = 0x0
  },
  rq_set_waitq = {
    lock = {
      raw_lock = {
        slock = 0x0
      }
    },
    task_list = {
      next = 0x0,
      prev = 0x0
    }
  },
  rq_set = 0x0,
  rq_interpret_reply = 0,
  rq_async_args = {
    pointer_arg = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
    space = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
  },
  rq_pool = 0x0,
  rq_session = {
    lc_tags = 0xa0000010,
    lc_thread = 0xffff88069183a1c0,
    lc_value = 0xffff88069f8ee380,
    lc_state = LCS_ENTERED,
    lc_remember = {
      next = 0xffff880584afdb60,
      prev = 0xffffffffa0625fb0
    },
    lc_version = 0x25,
    lc_cookie = 0x5
  },
  rq_recov_session = {
    lc_tags = 0x0,
    lc_thread = 0x0,
    lc_value = 0x0,
    lc_state = 0,
    lc_remember = {
      next = 0x0,
      prev = 0x0
    },
    lc_version = 0x0,
    lc_cookie = 0x0
  },
  rq_pill = {
    rc_req = 0xffff88070dd9c800,
    rc_fmt = 0x0,
    rc_loc = RCL_SERVER,
    rc_area = {{0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff}, {0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff}}
  }
}

mdt_handler:

struct mdt_handler {
  mh_name = 0xffffffffa0c4459b "CATINFO", 
  mh_fail_id = 0x1309, 
  mh_opc = 0x1fb, 
  mh_flags = 0x0, 
  mh_act = 0, 
  mh_fmt = 0x0
}
Comment by Alex Zhuravlev [ 07/Oct/15 ]

The issue doesn't seem to happen anymore? I wasn't able to reproduce it.

Generated at Sat Feb 10 01:22:17 UTC 2024 using Jira 9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c.