[LU-12582] lustre 2.10.3 QoS nrs_tbf_rule_match LBUG Created: 24/Jul/19 Updated: 24/Jul/19 |
|
| Status: | Open |
| Project: | Lustre |
| Component/s: | None |
| Affects Version/s: | Lustre 2.10.3 |
| Fix Version/s: | None |
| Type: | Bug | Priority: | Critical |
| Reporter: | anhua | Assignee: | WC Triage |
| Resolution: | Unresolved | Votes: | 0 |
| Labels: | None | ||
| Environment: |
Lustre 2.10.3, CentOS 7.4, OS 3.10.0.693, OFED 4.2 |
||
| Epic/Theme: | QoS-TBF |
| Severity: | 1 |
| Rank (Obsolete): | 9223372036854775807 |
| Description |
|
Hi all. The kernel assertion 'LASSERT((tmp_rule->tr_flags & NTRS_STOPPING) == 0)' is triggered in the function nrs_tbf_rule_match(), declared as static struct nrs_tbf_rule *nrs_tbf_rule_match(struct nrs_tbf_head *head, struct nrs_tbf_client *cli), in lustre/ptlrpc/nrs_tbf.c. The console log at the time of the crash:
[1139547.817517] LustreError: 20086:0:(nrs_tbf.c:235:nrs_tbf_rule_match()) ASSERTION( (tmp_rule->tr_flags & 0x0000001) == 0 ) failed:
[1139547.817540] LustreError: 20086:0:(nrs_tbf.c:235:nrs_tbf_rule_match()) LBUG
[1139547.817544] Pid: 20086, comm: mdt00_068
[1139547.817547] Call Trace:
[1139547.817586] [<ffffffffc09d57ae>] libcfs_call_trace+0x4e/0x60 [libcfs]
[1139547.817602] [<ffffffffc09d583c>] lbug_with_loc+0x4c/0xb0 [libcfs]
[1139547.817700] [<ffffffffc0f804c5>] nrs_tbf_rule_match+0xc5/0xd0 [ptlrpc]
[1139547.817780] [<ffffffffc0f834ad>] nrs_tbf_res_get+0xad/0x4c0 [ptlrpc]
[1139547.817852] [<ffffffffc0f7621c>] nrs_resource_get+0x7c/0x100 [ptlrpc]
[1139547.817922] [<ffffffffc0f76790>] nrs_resource_get_safe+0x80/0xf0 [ptlrpc]
[1139547.817993] [<ffffffffc0f7a263>] ptlrpc_nrs_req_initialize+0x83/0x100 [ptlrpc]
[1139547.818059] [<ffffffffc0f48f31>] ptlrpc_main+0x1771/0x1e40 [ptlrpc]
[1139547.818125] [<ffffffffc0f477c0>] ? ptlrpc_main+0x0/0x1e40 [ptlrpc]
[1139547.818134] [<ffffffff810b252f>] kthread+0xcf/0xe0
[1139547.818141] [<ffffffff810b2460>] ? kthread+0x0/0xe0
[1139547.818149] [<ffffffff816b8798>] ret_from_fork+0x58/0x90
[1139547.818155] [<ffffffff810b2460>] ? kthread+0x0/0xe0
[1139547.818159]
[1139547.818162] Kernel panic - not syncing: LBUG
[1139547.818212] CPU: 16 PID: 20086 Comm: mdt00_068 Tainted: G OEL ------------ 3.10.0-693.11.6.el7_lustre.x86_64 #1
[1139547.818355] Call Trace:
[1139547.818385] [<ffffffff816a5e7d>] dump_stack+0x19/0x1b
[1139547.818433] [<ffffffff8169fd64>] panic+0xe8/0x20d
[1139547.818492] [<ffffffffc09d5854>] lbug_with_loc+0x64/0xb0 [libcfs]
[1139547.818611] [<ffffffffc0f804c5>] nrs_tbf_rule_match+0xc5/0xd0 [ptlrpc]
[1139547.818732] [<ffffffffc0f834ad>] nrs_tbf_res_get+0xad/0x4c0 [ptlrpc]
[1139547.818848] [<ffffffffc0f7621c>] nrs_resource_get+0x7c/0x100 [ptlrpc]
[1139547.818965] [<ffffffffc0f76790>] nrs_resource_get_safe+0x80/0xf0 [ptlrpc]
[1139547.819084] [<ffffffffc0f7a263>] ptlrpc_nrs_req_initialize+0x83/0x100 [ptlrpc]
[1139547.819203] [<ffffffffc0f48f31>] ptlrpc_main+0x1771/0x1e40 [ptlrpc]
[1139547.819316] [<ffffffffc0f477c0>] ? ptlrpc_register_service+0xe30/0xe30 [ptlrpc]
[1139547.819376] [<ffffffff810b252f>] kthread+0xcf/0xe0
[1139547.819419] [<ffffffff810b2460>] ? insert_kthread_work+0x40/0x40
[1139547.819470] [<ffffffff816b8798>] ret_from_fork+0x58/0x90
[1139547.819516] [<ffffffff810b2460>] ? insert_kthread_work+0x40/0x40
|
| Comments |
| Comment by anhua [ 24/Jul/19 ] |
|
Here is the relevant kernel state from the crash dump (vmcore) captured by kdump when the LASSERT was triggered.
crash> struct nrs_tbf_head ffff8852e0748b00
struct nrs_tbf_head {
th_res = {
res_parent = 0x0,
res_policy = 0xffff885eb9f08100
},
th_list = {
next = 0xffff885219687bd8,
prev = 0xffff884a160c58d8
},
th_rule_lock = {
{
rlock = {
raw_lock = {
val = {
counter = 1
}
}
}
}
},
th_rule_sequence = {
counter = 607
},
th_rule = 0xffff884a160c58c0,
th_timer = {
node = {
node = {
__rb_parent_color = 18446612488267270960,
rb_right = 0x0,
rb_left = 0x0
},
expires = {
tv64 = 1139545063897123
}
},
_softexpires = {
tv64 = 1139545063897123
},
function = 0xffffffffc0f80830 <nrs_tbf_timer_cb>,
base = 0xffff885ebf993960,
state = 0,
start_pid = 18992,
start_site = 0xffffffff810b65a2 <hrtimer_start+18>,
start_comm = "mdt00_045\000\000\000\000\000\000"
},
th_deadline = 1139545063897123,
th_sequence = 20564788676,
th_binheap = 0xffff884105ed4060,
th_cli_hash = 0xffff88b105ee2000,
th_type = "nid\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000",
th_ops = 0xffffffffc10485e0 <nrs_tbf_nid_ops>,
th_type_flag = 2,
th_purge_start = 0
}
crash> struct nrs_tbf_rule 0xffff884a160c58c0
struct nrs_tbf_rule {
tr_name = "default\000\000\000\000\000\000\000\000",
tr_head = 0xffff8852e0748b00,
tr_linkage = {
next = 0xffff8852e0748b10,
prev = 0xffff885219687bd8
},
tr_nids = {
next = 0xffff884a160c58e8,
prev = 0xffff884a160c58e8
},
tr_nids_str = 0xffff8854bffade60 "*",
tr_jobids = {
next = 0x0,
prev = 0x0
},
tr_jobids_str = 0x0,
tr_opcodes = 0x0,
tr_opcodes_str = 0x0,
tr_conds = {
next = 0x0,
prev = 0x0
},
tr_conds_str = 0x0,
tr_rpc_rate = 10000,
tr_nsecs = 100000,
tr_depth = 3,
tr_rule_lock = {
{
rlock = {
raw_lock = {
val = {
counter = 0
}
}
}
}
},
tr_cli_list = {
next = 0xffff88533b6938a0,
prev = 0xffff88533b6938a0
},
tr_flags = 2,
tr_ref = {
counter = 2
},
tr_generation = 0
}
rule_clients:
crash> struct nrs_tbf_rule 0xffff885219687bc0
struct nrs_tbf_rule {
tr_name = "rule_clients\000\000\000",
tr_head = 0xffff8852e0748b00,
tr_linkage = {
next = 0xffff884a160c58d8,
prev = 0xffff8852e0748b10
},
tr_nids = {
next = 0xffff885e1a280380,
prev = 0xffff885e1a280380
},
tr_nids_str = 0xffff885e0f762a40 "*******",
tr_jobids = {
next = 0x0,
prev = 0x0
},
tr_jobids_str = 0x0,
tr_opcodes = 0x0,
tr_opcodes_str = 0x0,
tr_conds = {
next = 0x0,
prev = 0x0
},
tr_conds_str = 0x0,
tr_rpc_rate = 3000,
tr_nsecs = 333333,
tr_depth = 3,
tr_rule_lock = {
{
rlock = {
raw_lock = {
val = {
counter = 0
}
}
}
}
},
tr_cli_list = {
next = 0xffff885219687c60,
prev = 0xffff885219687c60
},
tr_flags = 0,
tr_ref = {
counter = 1
},
tr_generation = 0
}
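an unknown and bad rule (note: its address, 0xffff8852e0748af8, equals the address of th_list in the nrs_tbf_head above, 0xffff8852e0748b10, minus the 0x18 offset of tr_linkage within struct nrs_tbf_rule; its tr_linkage values are identical to th_list, so this appears to be the list head itself interpreted as a rule rather than a real rule):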
crash> struct nrs_tbf_rule 0xffff8852e0748af8
struct nrs_tbf_rule {
tr_name = "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000",
tr_head = 0xffff885eb9f08100,
tr_linkage = {
next = 0xffff885219687bd8,
prev = 0xffff884a160c58d8
},
tr_nids = {
next = 0x25f00000001,
prev = 0xffff884a160c58c0
},
tr_nids_str = 0xffff8852e0748b30 "0\213t\340R\210\377\377",
tr_jobids = {
next = 0x0,
prev = 0x0
},
tr_jobids_str = 0x40c6902bd3823 <Address 0x40c6902bd3823 out of bounds>,
tr_opcodes = 0x40c6902bd3823,
tr_opcodes_str = 0xffffffffc0f80830 <nrs_tbf_timer_cb> "\017\037D",
tr_conds = {
next = 0xffff885ebf993960,
prev = 0x0
},
tr_conds_str = 0x4a30 <Address 0x4a30 out of bounds>,
tr_rpc_rate = 18446744071579592098,
tr_nsecs = 3760610349430367341,
tr_depth = 53,
tr_rule_lock = {
{
rlock = {
raw_lock = {
val = {
counter = 45955107
}
}
}
}
},
tr_cli_list = {
next = 0x4c9c1c5c4,
prev = 0xffff884105ed4060
},
tr_flags = 99491840,
tr_ref = {
counter = -30543
},
tr_generation = 6580590
}
------------------------ |