[LU-12582] lustre 2.10.3 QoS nrs_tbf_rule_match LBUG Created: 24/Jul/19  Updated: 24/Jul/19

Status: Open
Project: Lustre
Component/s: None
Affects Version/s: Lustre 2.10.3
Fix Version/s: None

Type: Bug Priority: Critical
Reporter: anhua Assignee: WC Triage
Resolution: Unresolved Votes: 0
Labels: None
Environment:

Lustre 2.10.3, CentOS 7.4, OS 3.10.0.693, OFED 4.2


Epic/Theme: QoS-TBF
Severity: 1
Rank (Obsolete): 9223372036854775807

 Description   

Hi all. The kernel assertion 'LASSERT((tmp_rule->tr_flags & NTRS_STOPPING) == 0)' is triggered in the function 'static struct nrs_tbf_rule *nrs_tbf_rule_match(struct nrs_tbf_head *head, struct nrs_tbf_client *cli)' in lustre/ptlrpc/nrs_tbf.c.
 
It seems that either the tmp_rule->tr_flags of some rule has the NTRS_STOPPING bit set, or the variable tmp_rule itself holds a bad value. However, I have never run an lctl command such as 'lctl set_param mds.MDS.mdt.nrs_tbf_rule="stop rule1"'.
 
Here is the call trace:

[1139547.817517] LustreError: 20086:0:(nrs_tbf.c:235:nrs_tbf_rule_match()) ASSERTION( (tmp_rule->tr_flags & 0x0000001) == 0 ) failed:
[1139547.817540] LustreError: 20086:0:(nrs_tbf.c:235:nrs_tbf_rule_match()) LBUG
[1139547.817544] Pid: 20086, comm: mdt00_068
[1139547.817547]
Call Trace:
[1139547.817586]  [<ffffffffc09d57ae>] libcfs_call_trace+0x4e/0x60 [libcfs]
[1139547.817602]  [<ffffffffc09d583c>] lbug_with_loc+0x4c/0xb0 [libcfs]
[1139547.817700]  [<ffffffffc0f804c5>] nrs_tbf_rule_match+0xc5/0xd0 [ptlrpc]
[1139547.817780]  [<ffffffffc0f834ad>] nrs_tbf_res_get+0xad/0x4c0 [ptlrpc]
[1139547.817852]  [<ffffffffc0f7621c>] nrs_resource_get+0x7c/0x100 [ptlrpc]
[1139547.817922]  [<ffffffffc0f76790>] nrs_resource_get_safe+0x80/0xf0 [ptlrpc]
[1139547.817993]  [<ffffffffc0f7a263>] ptlrpc_nrs_req_initialize+0x83/0x100 [ptlrpc]
[1139547.818059]  [<ffffffffc0f48f31>] ptlrpc_main+0x1771/0x1e40 [ptlrpc]
[1139547.818125]  [<ffffffffc0f477c0>] ? ptlrpc_main+0x0/0x1e40 [ptlrpc]
[1139547.818134]  [<ffffffff810b252f>] kthread+0xcf/0xe0
[1139547.818141]  [<ffffffff810b2460>] ? kthread+0x0/0xe0
[1139547.818149]  [<ffffffff816b8798>] ret_from_fork+0x58/0x90
[1139547.818155]  [<ffffffff810b2460>] ? kthread+0x0/0xe0
[1139547.818159]
[1139547.818162] Kernel panic - not syncing: LBUG
[1139547.818212] CPU: 16 PID: 20086 Comm: mdt00_068 Tainted: G           OEL ------------   3.10.0-693.11.6.el7_lustre.x86_64 #1
[1139547.818355] Call Trace:
[1139547.818385]  [<ffffffff816a5e7d>] dump_stack+0x19/0x1b
[1139547.818433]  [<ffffffff8169fd64>] panic+0xe8/0x20d
[1139547.818492]  [<ffffffffc09d5854>] lbug_with_loc+0x64/0xb0 [libcfs]
[1139547.818611]  [<ffffffffc0f804c5>] nrs_tbf_rule_match+0xc5/0xd0 [ptlrpc]
[1139547.818732]  [<ffffffffc0f834ad>] nrs_tbf_res_get+0xad/0x4c0 [ptlrpc]
[1139547.818848]  [<ffffffffc0f7621c>] nrs_resource_get+0x7c/0x100 [ptlrpc]
[1139547.818965]  [<ffffffffc0f76790>] nrs_resource_get_safe+0x80/0xf0 [ptlrpc]
[1139547.819084]  [<ffffffffc0f7a263>] ptlrpc_nrs_req_initialize+0x83/0x100 [ptlrpc]
[1139547.819203]  [<ffffffffc0f48f31>] ptlrpc_main+0x1771/0x1e40 [ptlrpc]
[1139547.819316]  [<ffffffffc0f477c0>] ? ptlrpc_register_service+0xe30/0xe30 [ptlrpc]
[1139547.819376]  [<ffffffff810b252f>] kthread+0xcf/0xe0
[1139547.819419]  [<ffffffff810b2460>] ? insert_kthread_work+0x40/0x40
[1139547.819470]  [<ffffffff816b8798>] ret_from_fork+0x58/0x90
[1139547.819516]  [<ffffffff810b2460>] ? insert_kthread_work+0x40/0x40

 



 Comments   
Comment by anhua [ 24/Jul/19 ]

Here is the kernel memory state (vmcore) captured by kdump when the LASSERT was triggered.
 
nrs_tbf_head *head:
---------------------

crash> struct nrs_tbf_head ffff8852e0748b00
struct nrs_tbf_head {
  th_res = {
    res_parent = 0x0,
    res_policy = 0xffff885eb9f08100
  },
  th_list = {
    next = 0xffff885219687bd8,
    prev = 0xffff884a160c58d8
  },
  th_rule_lock = {
    {
      rlock = {
        raw_lock = {
          val = {
            counter = 1
          }
        }
      }
    }
  },
  th_rule_sequence = {
    counter = 607
  },
  th_rule = 0xffff884a160c58c0,
  th_timer = {
    node = {
      node = {
        __rb_parent_color = 18446612488267270960,
        rb_right = 0x0,
        rb_left = 0x0
      },
      expires = {
        tv64 = 1139545063897123
      }
    },
    _softexpires = {
      tv64 = 1139545063897123
    },
    function = 0xffffffffc0f80830 <nrs_tbf_timer_cb>,
    base = 0xffff885ebf993960,
    state = 0,
    start_pid = 18992,
    start_site = 0xffffffff810b65a2 <hrtimer_start+18>,
    start_comm = "mdt00_045\000\000\000\000\000\000"
  },
  th_deadline = 1139545063897123,
  th_sequence = 20564788676,
  th_binheap = 0xffff884105ed4060,
  th_cli_hash = 0xffff88b105ee2000,
  th_type = "nid\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000",
  th_ops = 0xffffffffc10485e0 <nrs_tbf_nid_ops>,
  th_type_flag = 2,
  th_purge_start = 0
}

 
the linked list of "struct nrs_tbf_rule":
-----------------------
default rule:

crash> struct nrs_tbf_rule 0xffff884a160c58c0
struct nrs_tbf_rule {
  tr_name = "default\000\000\000\000\000\000\000\000",
  tr_head = 0xffff8852e0748b00,
  tr_linkage = {
    next = 0xffff8852e0748b10,
    prev = 0xffff885219687bd8
  },
  tr_nids = {
    next = 0xffff884a160c58e8,
    prev = 0xffff884a160c58e8
  },
  tr_nids_str = 0xffff8854bffade60 "*",
  tr_jobids = {
    next = 0x0,
    prev = 0x0
  },
  tr_jobids_str = 0x0,
  tr_opcodes = 0x0,
  tr_opcodes_str = 0x0,
  tr_conds = {
    next = 0x0,
    prev = 0x0
  },
  tr_conds_str = 0x0,
  tr_rpc_rate = 10000,
  tr_nsecs = 100000,
  tr_depth = 3,
  tr_rule_lock = {
    {
      rlock = {
        raw_lock = {
          val = {
            counter = 0
          }
        }
      }
    }
  },
  tr_cli_list = {
    next = 0xffff88533b6938a0,
    prev = 0xffff88533b6938a0
  },
  tr_flags = 2,
  tr_ref = {
    counter = 2
  },
  tr_generation = 0
}

rule_clients:
-------------------

crash> struct nrs_tbf_rule 0xffff885219687bc0
struct nrs_tbf_rule {
  tr_name = "rule_clients\000\000\000",
  tr_head = 0xffff8852e0748b00,
  tr_linkage = {
    next = 0xffff884a160c58d8,
    prev = 0xffff8852e0748b10
  },
  tr_nids = {
    next = 0xffff885e1a280380,
    prev = 0xffff885e1a280380
  },
  tr_nids_str = 0xffff885e0f762a40 "*******",
  tr_jobids = {
    next = 0x0,
    prev = 0x0
  },
  tr_jobids_str = 0x0,
  tr_opcodes = 0x0,
  tr_opcodes_str = 0x0,
  tr_conds = {
    next = 0x0,
    prev = 0x0
  },
  tr_conds_str = 0x0,
  tr_rpc_rate = 3000,
  tr_nsecs = 333333,
  tr_depth = 3,
  tr_rule_lock = {
    {
      rlock = {
        raw_lock = {
          val = {
            counter = 0
          }
        }
      }
    }
  },
  tr_cli_list = {
    next = 0xffff885219687c60,
    prev = 0xffff885219687c60
  },
  tr_flags = 0,
  tr_ref = {
    counter = 1
  },
  tr_generation = 0
}

An unknown and bad rule:
-------------------------

crash>  struct nrs_tbf_rule 0xffff8852e0748af8
struct nrs_tbf_rule {
  tr_name = "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000",
  tr_head = 0xffff885eb9f08100,
  tr_linkage = {
    next = 0xffff885219687bd8,
    prev = 0xffff884a160c58d8
  },
  tr_nids = {
    next = 0x25f00000001,
    prev = 0xffff884a160c58c0
  },
  tr_nids_str = 0xffff8852e0748b30 "0\213t\340R\210\377\377",
  tr_jobids = {
    next = 0x0,
    prev = 0x0
  },
  tr_jobids_str = 0x40c6902bd3823 <Address 0x40c6902bd3823 out of bounds>,
  tr_opcodes = 0x40c6902bd3823,
  tr_opcodes_str = 0xffffffffc0f80830 <nrs_tbf_timer_cb> "\017\037D",
  tr_conds = {
    next = 0xffff885ebf993960,
    prev = 0x0
  },
  tr_conds_str = 0x4a30 <Address 0x4a30 out of bounds>,
  tr_rpc_rate = 18446744071579592098,
  tr_nsecs = 3760610349430367341,
  tr_depth = 53,
  tr_rule_lock = {
    {
      rlock = {
        raw_lock = {
          val = {
            counter = 45955107
          }
        }
      }
    }
  },
  tr_cli_list = {
    next = 0x4c9c1c5c4,
    prev = 0xffff884105ed4060
  },
  tr_flags = 99491840,
  tr_ref = {
    counter = -30543
  },
  tr_generation = 6580590
}

------------------------
It seems that the assertion should not be able to trigger, since the "tr_flags" values of the three rules are 0, 2, and 99491840 (a bad value); the result of "tr_flags & 1" is 0 for each of them.
 

Generated at Sat Feb 10 02:53:52 UTC 2024 using Jira 9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c.