[2654930.082991] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2654930.089999] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2654930.097188] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2654930.103848] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2654930.111048] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2654930.117794] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2654930.124453] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2654930.131391] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2654930.138701] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2654930.145040] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2654930.152204] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2654930.160096] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2654930.166626] [] kthread+0xd1/0xe0 [2654930.171717] [] ret_from_fork_nospec_begin+0xe/0x21 [2654930.178382] [] 0xffffffffffffffff [2654930.183589] LustreError: dumping log to /tmp/lustre-log.1554810095.115613 [2654930.260843] Pid: 116295, comm: mdt00_109 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2654930.270840] Call Trace: [2654930.273476] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2654930.280588] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2654930.287987] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2654930.295011] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2654930.302200] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2654930.308852] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2654930.316044] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2654930.322788] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2654930.329457] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2654930.336402] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2654930.343699] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2654930.350037] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2654930.357172] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2654930.365073] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2654930.371590] [] kthread+0xd1/0xe0 [2654930.376676] [] ret_from_fork_nospec_begin+0xe/0x21 [2654930.383340] [] 0xffffffffffffffff [2654930.388555] Pid: 115866, comm: mdt00_046 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2654930.398554] Call Trace: [2654930.401185] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2654930.408295] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2654930.415669] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2654930.422689] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2654930.429881] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2654930.436539] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2654930.443745] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2654930.450492] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2654930.457164] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2654930.464091] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2654930.471379] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2654930.477714] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2654930.484838] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2654930.492741] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2654930.499258] [] kthread+0xd1/0xe0 [2654930.504363] [] ret_from_fork_nospec_begin+0xe/0x21 [2654930.511027] [] 0xffffffffffffffff [2654930.516207] Pid: 116242, comm: mdt02_084 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2654930.526234] Call Trace: [2654930.528870] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2654930.535972] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2654930.543348] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2654930.550351] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2654930.557565] [] mdt_object_lock+0x20/0x30 [mdt] [2654930.563884] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2654930.570380] [] mdt_intent_brw+0x1f/0x30 [mdt] [2654930.576604] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2654930.583277] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2654930.590225] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2654930.597521] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2654930.603876] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2654930.610998] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2654930.618889] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2654930.625416] [] kthread+0xd1/0xe0 [2654930.630518] [] ret_from_fork_nospec_begin+0xe/0x21 [2654930.637181] [] 0xffffffffffffffff [2654930.642395] Pid: 115822, comm: mdt01_046 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2654930.652412] Call Trace: [2654930.655043] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2654930.662144] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2654930.669512] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2654930.676529] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2654930.683708] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2654930.690400] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2654930.697582] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2654930.704357] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2654930.711015] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2654930.717939] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2654930.725221] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2654930.731547] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2654930.738653] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2654930.746542] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2654930.753041] [] kthread+0xd1/0xe0 [2654930.758127] [] ret_from_fork_nospec_begin+0xe/0x21 [2654930.764776] [] 0xffffffffffffffff [2654930.769962] LNet: Service thread pid 115408 was inactive for 200.89s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2654930.783083] LNet: Skipped 56 previous similar messages [2655029.139771] LNet: Service thread pid 116172 completed after 299.39s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2655029.156199] LNet: Skipped 79 previous similar messages [2655059.193460] Lustre: fir-MDT0001: Connection restored to (at 10.8.8.10@o2ib6) [2655059.200778] Lustre: Skipped 488 previous similar messages [2655230.068806] Pid: 116191, comm: mdt01_107 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2655230.078812] Call Trace: [2655230.081455] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2655230.088584] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2655230.095950] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2655230.102953] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2655230.110160] [] mdt_object_lock+0x20/0x30 [mdt] [2655230.116472] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2655230.122983] [] mdt_intent_brw+0x1f/0x30 [mdt] [2655230.129205] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2655230.135889] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2655230.142819] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2655230.150115] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2655230.156453] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2655230.163568] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2655230.171453] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2655230.177971] [] kthread+0xd1/0xe0 [2655230.183076] [] ret_from_fork_nospec_begin+0xe/0x21 [2655230.189724] [] 0xffffffffffffffff [2655230.194933] LustreError: dumping log to /tmp/lustre-log.1554810395.116191 [2655230.265851] Pid: 115784, comm: mdt01_041 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2655230.275848] Call Trace: [2655230.278492] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2655230.285621] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2655230.292989] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2655230.300007] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2655230.307186] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2655230.313857] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2655230.321050] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2655230.327810] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2655230.334482] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2655230.341434] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2655230.348717] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2655230.355052] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2655230.362181] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2655230.370089] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2655230.376605] [] kthread+0xd1/0xe0 [2655230.381694] [] ret_from_fork_nospec_begin+0xe/0x21 [2655230.388348] [] 0xffffffffffffffff [2655230.393564] Pid: 115959, comm: mdt01_081 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2655230.403578] Call Trace: [2655230.406234] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2655230.413345] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2655230.420737] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2655230.427740] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2655230.434929] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2655230.441588] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2655230.448765] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2655230.455499] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2655230.462172] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2655230.469121] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2655230.476404] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2655230.482731] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2655230.489852] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2655230.497741] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2655230.504270] [] kthread+0xd1/0xe0 [2655230.509355] [] ret_from_fork_nospec_begin+0xe/0x21 [2655230.516018] [] 0xffffffffffffffff [2655230.521208] Pid: 115621, comm: mdt01_022 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2655230.531222] Call Trace: [2655230.533854] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2655230.540981] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2655230.548362] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2655230.555366] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2655230.562542] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2655230.569206] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2655230.576383] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2655230.583150] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2655230.589809] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2655230.596747] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2655230.604037] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2655230.610367] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2655230.617473] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2655230.625376] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2655230.631875] [] kthread+0xd1/0xe0 [2655230.636970] [] ret_from_fork_nospec_begin+0xe/0x21 [2655230.643620] [] 0xffffffffffffffff [2655230.648812] Pid: 115871, comm: mdt01_061 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2655230.658822] Call Trace: [2655230.661454] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2655230.668554] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2655230.675930] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2655230.682933] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2655230.690109] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2655230.696765] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2655230.703955] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2655230.710701] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2655230.717357] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2655230.724281] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2655230.731591] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2655230.737925] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2655230.745044] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2655230.752950] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2655230.759481] [] kthread+0xd1/0xe0 [2655230.764564] [] ret_from_fork_nospec_begin+0xe/0x21 [2655230.771218] [] 0xffffffffffffffff [2655428.343098] Lustre: 115878:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ed017fa4e00 x1628647038065376/t0(0) o101->535a7c41-41b9-ec51-bbd6-e028c4ebcb2c@10.8.8.1@o2ib6:28/0 lens 576/3264 e 0 to 0 dl 1554810598 ref 2 fl Interpret:/0/0 rc 0/0 [2655428.372170] Lustre: 115878:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 150 previous similar messages [2655434.121001] Lustre: fir-MDT0001: Client eed99957-e395-8d59-f471-3be5bc5334d2 (at 10.8.27.31@o2ib6) reconnecting [2655434.131266] Lustre: Skipped 504 previous similar messages [2655448.695364] LustreError: dumping log to /tmp/lustre-log.1554810614.115636 [2655493.288852] LustreError: 116124:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554810568, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ecf78b598c0/0xbc3294614a09e1fa lrc: 3/1,0 mode: --/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 102 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 116124 timeout: 0 lvb_type: 0 [2655493.328867] LustreError: 116124:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 128 previous similar messages [2655578.145815] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 150s: evicting client at 10.8.27.25@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ec0ef2d5580/0xbc32946148b631b0 lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 102 type: IBT flags: 0x60200400000020 nid: 10.8.27.25@o2ib6 remote: 0x47916098feb76811 expref: 75 pid: 115750 timeout: 2655557 lvb_type: 0 [2655578.184291] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 6 previous similar messages [2655603.321096] LNet: Service thread pid 116124 was inactive for 200.03s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2655603.338295] LNet: Skipped 9 previous similar messages [2655603.343531] Pid: 116124, comm: mdt01_089 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2655603.353539] Call Trace: [2655603.356182] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2655603.363310] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2655603.370697] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2655603.377706] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2655603.384898] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2655603.391658] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2655603.398314] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2655603.405263] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2655603.412544] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2655603.418904] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2655603.426022] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2655603.433934] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2655603.440434] [] kthread+0xd1/0xe0 [2655603.445551] [] ret_from_fork_nospec_begin+0xe/0x21 [2655603.452204] [] 0xffffffffffffffff [2655603.457426] LustreError: dumping log to /tmp/lustre-log.1554810769.116124 [2655603.498438] Pid: 115887, comm: mdt01_063 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2655603.508439] Call Trace: [2655603.511091] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2655603.518227] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2655603.525611] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2655603.532615] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2655603.539790] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2655603.546448] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2655603.553648] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2655603.560392] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2655603.567065] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2655603.574000] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2655603.581299] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2655603.587664] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2655603.594790] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2655603.602679] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2655603.609186] [] kthread+0xd1/0xe0 [2655603.614273] [] ret_from_fork_nospec_begin+0xe/0x21 [2655603.620921] [] 0xffffffffffffffff [2655635.065457] Pid: 115835, comm: mdt02_050 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2655635.075476] Call Trace: [2655635.078115] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2655635.085255] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2655635.092661] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2655635.099664] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2655635.106850] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2655635.113615] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2655635.120300] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2655635.127259] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2655635.134566] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2655635.140916] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2655635.148064] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2655635.155969] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2655635.162486] [] kthread+0xd1/0xe0 [2655635.167576] [] ret_from_fork_nospec_begin+0xe/0x21 [2655635.174254] [] 0xffffffffffffffff [2655635.179463] LustreError: dumping log to /tmp/lustre-log.1554810800.115835 [2655635.192902] Pid: 115345, comm: mdt00_002 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2655635.202914] Call Trace: [2655635.205554] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2655635.212661] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2655635.220059] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2655635.227075] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2655635.234286] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2655635.240943] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2655635.248141] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2655635.254889] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2655635.261583] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2655635.268519] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2655635.275829] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2655635.282169] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2655635.289329] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2655635.297214] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2655635.303729] [] kthread+0xd1/0xe0 [2655635.308818] [] ret_from_fork_nospec_begin+0xe/0x21 [2655635.315496] [] 0xffffffffffffffff [2655667.790967] Lustre: fir-MDT0003: Connection restored to (at 10.9.101.11@o2ib4) [2655667.798452] Lustre: Skipped 522 previous similar messages [2655763.148193] LNet: Service thread pid 115715 completed after 514.93s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2655763.148976] LustreError: 116124:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ebce5435000 ns: mdt-fir-MDT0001_UUID lock: ffff8ecf78b598c0/0xbc3294614a09e1fa lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 100 type: IBT flags: 0x50200000000000 nid: 10.8.27.30@o2ib6 remote: 0xae41da2122025b53 expref: 2 pid: 116124 timeout: 0 lvb_type: 0 [2655763.148979] LustreError: 116124:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 13 previous similar messages [2655763.149032] Lustre: 116124:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (30:330s); client may timeout. req@ffff8ec655e98000 x1628647176648496/t0(0) o101->ceea938e-6a91-5e80-97b6-3160b0748724@10.8.27.30@o2ib6:28/0 lens 568/2296 e 0 to 0 dl 1554810598 ref 1 fl Complete:/0/0 rc -107/-107 [2655763.149033] Lustre: 116124:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 11 previous similar messages [2655763.249942] LNet: Skipped 96 previous similar messages [2655964.797318] Pid: 115887, comm: mdt01_063 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2655964.807318] Call Trace: [2655964.809961] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2655964.817100] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2655964.824510] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2655964.831520] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2655964.838718] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2655964.845380] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2655964.852586] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2655964.859344] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2655964.866030] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2655964.872966] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2655964.880263] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2655964.886609] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2655964.893738] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2655964.901641] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2655964.908182] [] kthread+0xd1/0xe0 [2655964.913294] [] ret_from_fork_nospec_begin+0xe/0x21 [2655964.919957] [] 0xffffffffffffffff [2655964.925161] LustreError: dumping log to /tmp/lustre-log.1554811130.115887 [2655965.008288] Pid: 115724, comm: mdt01_033 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2655965.018292] Call Trace: [2655965.020936] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2655965.028078] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2655965.035498] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2655965.042518] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2655965.049727] [] mdt_object_lock+0x20/0x30 [mdt] [2655965.056055] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2655965.062571] [] mdt_intent_brw+0x1f/0x30 [mdt] [2655965.068810] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2655965.075493] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2655965.082451] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2655965.089751] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2655965.096129] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2655965.103289] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2655965.111221] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2655965.117746] [] kthread+0xd1/0xe0 [2655965.122866] [] ret_from_fork_nospec_begin+0xe/0x21 [2655965.129515] [] 0xffffffffffffffff [2655965.134743] Pid: 116124, comm: mdt01_089 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2655965.144770] Call Trace: [2655965.147415] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2655965.154540] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2655965.161937] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2655965.168962] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2655965.176141] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2655965.182844] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2655965.190028] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2655965.196808] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2655965.203492] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2655965.210449] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2655965.217751] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2655965.224134] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2655965.231280] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2655965.239197] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2655965.245719] [] kthread+0xd1/0xe0 [2655965.250828] [] ret_from_fork_nospec_begin+0xe/0x21 [2655965.257480] [] 0xffffffffffffffff [2655965.262699] Pid: 115636, comm: mdt01_026 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2655965.272703] Call Trace: [2655965.275341] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2655965.282482] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2655965.289868] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2655965.296915] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2655965.304107] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2655965.310794] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2655965.317973] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2655965.324738] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2655965.331407] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2655965.338380] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2655965.345682] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2655965.352069] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2655965.359202] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2655965.367121] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2655965.373651] [] kthread+0xd1/0xe0 [2655965.378760] [] ret_from_fork_nospec_begin+0xe/0x21 [2655965.385412] [] 0xffffffffffffffff [2655965.390646] Pid: 115611, comm: mdt02_023 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2655965.400656] Call Trace: [2655965.403318] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2655965.410446] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2655965.417833] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2655965.424870] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2655965.432055] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2655965.438751] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2655965.445941] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2655965.452714] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2655965.459373] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2655965.466338] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2655965.473638] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2655965.480012] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2655965.487167] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2655965.495084] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2655965.501608] [] kthread+0xd1/0xe0 [2655965.506731] [] ret_from_fork_nospec_begin+0xe/0x21 [2655965.513383] [] 0xffffffffffffffff [2655965.518603] LNet: Service thread pid 115939 was inactive for 201.46s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2655965.531751] LNet: Skipped 112 previous similar messages [2656038.161643] Lustre: fir-MDT0001: Client ceea938e-6a91-5e80-97b6-3160b0748724 (at 10.8.27.30@o2ib6) reconnecting [2656038.171911] Lustre: Skipped 488 previous similar messages [2656099.198839] Lustre: 116359:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8eed359f2700 x1628546089211616/t0(0) o101->ac4b3f39-4aa4-91f4-89f9-e38a651f2f99@10.9.107.5@o2ib4:9/0 lens 576/3264 e 0 to 0 dl 1554811269 ref 2 fl Interpret:/0/0 rc 0/0 [2656099.227994] Lustre: 116359:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 82 previous similar messages [2656164.130584] LustreError: 115710:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554811239, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ed00c57c5c0/0xbc3294614ffabc2c lrc: 3/0,1 mode: --/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 114 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 115710 timeout: 0 lvb_type: 0 [2656164.170598] LustreError: 115710:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 96 previous similar messages [2656224.153247] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 150s: evicting client at 10.8.17.16@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ec3d03daac0/0xbc3294614ffabb5a lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 114 type: IBT flags: 0x60200400000020 nid: 10.8.17.16@o2ib6 remote: 0xef7ba93a1b1b47b7 expref: 45 pid: 115887 timeout: 2656203 lvb_type: 0 [2656224.191699] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 4 previous similar messages [2656274.560821] LNet: Service thread pid 115815 was inactive for 200.34s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2656274.578026] LNet: Skipped 8 previous similar messages [2656274.583257] Pid: 115815, comm: mdt00_032 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2656274.593277] Call Trace: [2656274.595932] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2656274.603045] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2656274.610427] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2656274.617433] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2656274.624623] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2656274.631283] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2656274.638490] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2656274.645252] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2656274.651924] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2656274.658875] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2656274.666172] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2656274.672520] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2656274.679654] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2656274.687550] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2656274.694070] [] kthread+0xd1/0xe0 [2656274.699177] [] ret_from_fork_nospec_begin+0xe/0x21 [2656274.705828] [] 0xffffffffffffffff [2656274.711044] LustreError: dumping log to /tmp/lustre-log.1554811440.115815 [2656274.783900] Pid: 116242, comm: mdt02_084 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2656274.793898] Call Trace: [2656274.796536] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2656274.803657] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2656274.811041] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2656274.818053] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2656274.825245] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2656274.831902] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2656274.839076] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2656274.845821] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2656274.852508] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2656274.859445] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2656274.866725] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2656274.873086] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2656274.880218] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2656274.888115] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2656274.894629] [] kthread+0xd1/0xe0 [2656274.899719] [] ret_from_fork_nospec_begin+0xe/0x21 [2656274.906382] [] 0xffffffffffffffff [2656274.911598] Pid: 115568, comm: mdt02_007 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2656274.921596] Call Trace: [2656274.924236] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2656274.931343] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2656274.938710] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2656274.945729] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2656274.952905] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2656274.959578] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2656274.966757] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2656274.973515] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2656274.980174] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2656274.987119] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2656274.994421] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2656275.000770] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2656275.007889] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2656275.015791] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2656275.022292] [] kthread+0xd1/0xe0 [2656275.027395] [] ret_from_fork_nospec_begin+0xe/0x21 [2656275.034046] [] 0xffffffffffffffff [2656275.039236] Pid: 115887, comm: mdt01_063 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2656275.049233] Call Trace: [2656275.051874] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2656275.058995] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2656275.066363] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2656275.073385] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2656275.080569] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2656275.087312] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2656275.093968] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2656275.100901] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2656275.108180] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2656275.114523] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2656275.121640] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2656275.129570] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2656275.136078] [] kthread+0xd1/0xe0 [2656275.141180] [] ret_from_fork_nospec_begin+0xe/0x21 [2656275.147832] [] 0xffffffffffffffff [2656275.153045] Pid: 115636, comm: mdt01_026 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2656275.163044] Call Trace: [2656275.165684] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2656275.172792] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2656275.180183] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2656275.187188] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2656275.194399] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2656275.201054] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2656275.208245] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2656275.214991] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2656275.221660] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2656275.228598] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2656275.235893] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2656275.242230] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2656275.249361] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2656275.257268] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2656275.263767] [] kthread+0xd1/0xe0 [2656275.268869] [] ret_from_fork_nospec_begin+0xe/0x21 [2656275.275533] [] 0xffffffffffffffff [2656287.836007] Lustre: fir-MDT0003: Connection restored to (at 10.9.101.11@o2ib4) [2656287.843497] Lustre: Skipped 491 previous similar messages [2656379.155142] LNet: Service thread pid 115917 completed after 304.93s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2656379.155155] LustreError: 115345:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8eb97a496000 ns: mdt-fir-MDT0001_UUID lock: ffff8eb30a375100/0xbc3294614ffaee91 lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 125 type: IBT flags: 0x50200400000020 nid: 10.8.17.16@o2ib6 remote: 0xef7ba93a1b1b4897 expref: 2 pid: 115345 timeout: 0 lvb_type: 0 [2656379.155158] LustreError: 115345:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 8 previous similar messages [2656379.155201] Lustre: 115345:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (154:151s); client may timeout. req@ffff8ebdcdbd0f00 x1628569898630576/t0(0) o101->e98ad6a7-830d-1725-2883-db155e5b9e43@10.8.17.16@o2ib6:9/0 lens 568/2296 e 0 to 0 dl 1554811393 ref 1 fl Complete:/0/0 rc -107/-107 [2656379.155203] Lustre: 115345:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 25 previous similar messages [2656379.256743] LNet: Skipped 48 previous similar messages [2656659.158107] Lustre: fir-MDT0001: Client 3d887935-8bfc-5320-4aa6-6c952f9fe8d7 (at 10.8.8.11@o2ib6) reconnecting [2656659.168281] Lustre: Skipped 495 previous similar messages [2656679.557508] Pid: 115575, comm: mdt01_006 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2656679.567524] Call Trace: [2656679.570167] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2656679.577296] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2656679.584679] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2656679.591699] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2656679.598901] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2656679.605589] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2656679.612798] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2656679.619549] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2656679.626232] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2656679.633174] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2656679.640469] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2656679.646850] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2656679.653974] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2656679.661898] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2656679.668431] [] kthread+0xd1/0xe0 [2656679.673546] [] ret_from_fork_nospec_begin+0xe/0x21 [2656679.680223] [] 0xffffffffffffffff [2656679.685441] LustreError: dumping log to /tmp/lustre-log.1554811845.115575 [2656679.790476] Pid: 115598, comm: mdt01_014 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2656679.800477] Call Trace: [2656679.803122] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2656679.810258] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2656679.817630] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2656679.824651] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2656679.831830] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2656679.838514] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2656679.845688] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2656679.852454] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2656679.859113] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2656679.866078] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2656679.873372] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2656679.879720] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2656679.886848] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2656679.894733] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2656679.901241] [] kthread+0xd1/0xe0 [2656679.906345] [] ret_from_fork_nospec_begin+0xe/0x21 [2656679.912995] [] 0xffffffffffffffff [2656679.918204] Pid: 116295, comm: mdt00_109 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2656679.928198] Call Trace: [2656679.930854] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2656679.937964] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2656679.945347] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2656679.952352] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2656679.959542] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2656679.966215] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2656679.973393] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2656679.980153] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2656679.986809] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2656679.993751] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2656680.001041] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2656680.007391] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2656680.014509] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2656680.022410] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2656680.028921] [] kthread+0xd1/0xe0 [2656680.034009] [] ret_from_fork_nospec_begin+0xe/0x21 [2656680.040654] [] 0xffffffffffffffff [2656680.045846] Pid: 115857, comm: mdt01_057 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2656680.055844] Call Trace: [2656680.058506] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2656680.065634] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2656680.073025] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2656680.080030] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2656680.087223] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2656680.093983] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2656680.100637] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2656680.107599] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2656680.114892] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2656680.121240] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2656680.128383] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2656680.136276] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2656680.142775] [] kthread+0xd1/0xe0 [2656680.147863] [] ret_from_fork_nospec_begin+0xe/0x21 [2656680.154511] [] 0xffffffffffffffff [2656680.159707] Pid: 115854, comm: mdt01_056 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2656680.169709] Call Trace: [2656680.172345] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2656680.179452] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2656680.186830] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2656680.193831] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2656680.201008] [] mdt_object_lock+0x20/0x30 [mdt] [2656680.207317] [] mdt_hsm_state_set+0xc9/0x830 [mdt] [2656680.213886] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2656680.221020] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2656680.228906] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2656680.235418] [] kthread+0xd1/0xe0 [2656680.240501] [] ret_from_fork_nospec_begin+0xe/0x21 [2656680.247150] [] 0xffffffffffffffff [2656680.252337] LNet: Service thread pid 115925 was inactive for 201.33s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2656680.265459] LNet: Skipped 38 previous similar messages [2656830.001253] Lustre: 115817:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ecc23889800 x1629293380799824/t0(0) o101->3d887935-8bfc-5320-4aa6-6c952f9fe8d7@10.8.8.11@o2ib6:20/0 lens 568/0 e 0 to 0 dl 1554812000 ref 2 fl Interpret:/0/ffffffff rc 0/-1 [2656830.030846] Lustre: 115817:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 202 previous similar messages [2656834.160300] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 29s: evicting client at 10.8.8.11@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8eda4d2bec00/0xbc32946156593fad lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 116 type: IBT flags: 0x60200400000020 nid: 10.8.8.11@o2ib6 remote: 0x5e338b5757ab80a8 expref: 94 pid: 115591 timeout: 2656813 lvb_type: 0 [2656834.198496] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 5 previous similar messages [2656894.763995] LustreError: 115407:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554811970, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ed98ce82f40/0xbc32946156593fde lrc: 3/0,1 mode: --/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 115 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 115407 timeout: 0 lvb_type: 0 [2656894.804006] LustreError: 115407:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 183 previous similar messages [2656897.791830] Lustre: fir-MDT0001: Connection restored to a4daaf47-6ec9-4753-388e-0d0b7a7f70d6 (at 10.8.27.25@o2ib6) [2656897.802357] Lustre: Skipped 495 previous similar messages [2657005.193290] LNet: Service thread pid 115695 was inactive for 200.14s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2657005.210516] LNet: Skipped 9 previous similar messages [2657005.215752] Pid: 115695, comm: mdt01_029 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2657005.225750] Call Trace: [2657005.228393] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2657005.235521] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2657005.242898] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2657005.249910] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2657005.257088] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2657005.263755] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2657005.270946] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2657005.277705] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2657005.284361] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2657005.291319] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2657005.298609] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2657005.304954] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2657005.312077] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2657005.319972] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2657005.326471] [] kthread+0xd1/0xe0 [2657005.331559] [] ret_from_fork_nospec_begin+0xe/0x21 [2657005.338225] [] 0xffffffffffffffff [2657005.343435] LustreError: dumping log to /tmp/lustre-log.1554812170.115695 [2657005.424283] Pid: 115965, comm: mdt01_083 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2657005.434286] Call Trace: [2657005.436944] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2657005.444063] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2657005.451461] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2657005.458476] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2657005.465651] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2657005.472312] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2657005.479509] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2657005.486253] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2657005.492924] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2657005.499861] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2657005.507141] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2657005.513485] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2657005.520620] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2657005.528530] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2657005.535031] [] kthread+0xd1/0xe0 [2657005.540125] [] ret_from_fork_nospec_begin+0xe/0x21 [2657005.546773] [] 0xffffffffffffffff [2657005.551994] Pid: 115857, comm: mdt01_057 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2657005.561995] Call Trace: [2657005.564634] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2657005.571750] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2657005.579117] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2657005.586120] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2657005.593294] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2657005.599953] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2657005.607144] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2657005.613904] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2657005.620576] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2657005.627526] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2657005.634812] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2657005.641145] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2657005.648269] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2657005.656157] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2657005.662655] [] kthread+0xd1/0xe0 [2657005.667743] [] ret_from_fork_nospec_begin+0xe/0x21 [2657005.674399] [] 0xffffffffffffffff [2657005.679589] Pid: 115948, comm: mdt00_073 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2657005.689587] Call Trace: [2657005.692225] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2657005.699354] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2657005.706727] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2657005.713730] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2657005.720904] [] mdt_object_lock+0x20/0x30 [mdt] [2657005.727234] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2657005.733714] [] mdt_intent_brw+0x1f/0x30 [mdt] [2657005.739966] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2657005.746634] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2657005.753602] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2657005.760884] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2657005.767221] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2657005.774337] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2657005.782255] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2657005.788759] [] kthread+0xd1/0xe0 [2657005.793874] [] ret_from_fork_nospec_begin+0xe/0x21 [2657005.800527] [] 0xffffffffffffffff [2657005.805729] Pid: 115608, comm: mdt02_021 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2657005.815747] Call Trace: [2657005.818379] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2657005.825504] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2657005.832891] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2657005.839908] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2657005.847101] [] mdt_object_lock+0x20/0x30 [mdt] [2657005.853426] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2657005.859921] [] mdt_intent_brw+0x1f/0x30 [mdt] [2657005.866151] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2657005.872807] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2657005.879765] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2657005.887047] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2657005.893397] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2657005.900508] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2657005.908409] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2657005.914911] [] kthread+0xd1/0xe0 [2657005.920013] [] ret_from_fork_nospec_begin+0xe/0x21 [2657005.926664] [] 0xffffffffffffffff [2657015.162685] LNet: Service thread pid 115948 completed after 210.11s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2657015.179103] LNet: Skipped 16 previous similar messages [2657016.384267] LustreError: 115618:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ebf4598a800 ns: mdt-fir-MDT0001_UUID lock: ffff8eb4b83d7980/0xbc329461569c0fcb lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 105 type: IBT flags: 0x50200400000020 nid: 10.8.27.26@o2ib6 remote: 0x3a52d8e5f90e67cd expref: 2 pid: 115618 timeout: 0 lvb_type: 0 [2657016.419422] LustreError: 115618:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 8 previous similar messages [2657016.430021] Lustre: 115959:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (30:182s); client may timeout. req@ffff8ecc23889800 x1629293380799824/t0(0) o101->3d887935-8bfc-5320-4aa6-6c952f9fe8d7@10.8.8.11@o2ib6:20/0 lens 568/2296 e 0 to 0 dl 1554812000 ref 1 fl Complete:/0/0 rc -107/-107 [2657016.459186] Lustre: 115959:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 13 previous similar messages [2657248.908092] LustreError: dumping log to /tmp/lustre-log.1554812414.115575 [2657249.420098] LustreError: dumping log to /tmp/lustre-log.1554812415.115408 [2657264.191990] Lustre: fir-MDT0001: Client a4daaf47-6ec9-4753-388e-0d0b7a7f70d6 (at 10.8.27.25@o2ib6) reconnecting [2657264.202255] Lustre: Skipped 466 previous similar messages [2657279.628448] LustreError: dumping log to /tmp/lustre-log.1554812445.115796 [2657309.324784] Pid: 115965, comm: mdt01_083 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2657309.334811] Call Trace: [2657309.337463] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2657309.344588] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2657309.351971] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2657309.358974] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2657309.366169] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2657309.372848] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2657309.380038] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2657309.386785] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2657309.393442] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2657309.400400] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2657309.407688] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2657309.414048] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2657309.421167] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2657309.429094] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2657309.435619] [] kthread+0xd1/0xe0 [2657309.440707] [] ret_from_fork_nospec_begin+0xe/0x21 [2657309.447375] [] 0xffffffffffffffff [2657309.452593] LustreError: dumping log to /tmp/lustre-log.1554812475.115965 [2657309.463193] Pid: 115764, comm: mdt02_042 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2657309.473220] Call Trace: [2657309.475873] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2657309.482993] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2657309.490385] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2657309.497388] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2657309.504566] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2657309.511214] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2657309.518390] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2657309.525148] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2657309.531806] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2657309.538755] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2657309.546060] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2657309.552398] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2657309.559539] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2657309.567425] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2657309.573936] [] kthread+0xd1/0xe0 [2657309.579023] [] ret_from_fork_nospec_begin+0xe/0x21 [2657309.585670] [] 0xffffffffffffffff [2657310.348792] Pid: 115948, comm: mdt00_073 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2657310.358791] Call Trace: [2657310.361437] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2657310.368573] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2657310.375959] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2657310.382961] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2657310.390152] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2657310.396810] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2657310.404003] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2657310.410746] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2657310.417417] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2657310.424352] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2657310.431650] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2657310.437995] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2657310.445140] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2657310.453032] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2657310.459531] [] kthread+0xd1/0xe0 [2657310.464621] [] ret_from_fork_nospec_begin+0xe/0x21 [2657310.471265] [] 0xffffffffffffffff [2657310.476461] LustreError: dumping log to /tmp/lustre-log.1554812476.115948 [2657310.484202] Pid: 116172, comm: mdt00_081 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2657310.494203] Call Trace: [2657310.496841] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2657310.503965] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2657310.511335] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2657310.518337] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2657310.525528] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2657310.532272] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2657310.538927] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2657310.545861] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2657310.553142] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2657310.559478] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2657310.566583] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2657310.574486] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2657310.580988] [] kthread+0xd1/0xe0 [2657310.586076] [] ret_from_fork_nospec_begin+0xe/0x21 [2657310.592724] [] 0xffffffffffffffff [2657341.581153] Pid: 115922, comm: mdt00_064 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2657341.591159] Call Trace: [2657341.593803] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2657341.600921] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2657341.608310] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2657341.615327] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2657341.622503] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2657341.629246] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2657341.635902] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2657341.642835] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2657341.650114] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2657341.656457] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2657341.663574] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2657341.671460] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2657341.677962] [] kthread+0xd1/0xe0 [2657341.683049] [] ret_from_fork_nospec_begin+0xe/0x21 [2657341.689698] [] 0xffffffffffffffff [2657341.694897] LustreError: dumping log to /tmp/lustre-log.1554812507.115922 [2657341.708428] LNet: Service thread pid 115928 was inactive for 200.47s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2657341.721576] LNet: Skipped 52 previous similar messages [2657439.167285] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 150s: evicting client at 10.8.8.2@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ec914e6e780/0xbc329461588be5bf lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 116 type: IBT flags: 0x60200400000020 nid: 10.8.8.2@o2ib6 remote: 0x91b6d60cb7d446f8 expref: 44 pid: 115925 timeout: 2657418 lvb_type: 0 [2657439.205384] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 7 previous similar messages [2657464.270576] Lustre: 115784:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ecee7cae300 x1628637792123424/t0(0) o101->e3c4cf5f-8e04-bccb-9d13-7eae1b83e1a1@10.8.27.29@o2ib6:24/0 lens 576/3264 e 0 to 0 dl 1554812634 ref 2 fl Interpret:/0/0 rc 0/0 [2657464.270580] Lustre: 115588:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8eb7bfa47800 x1628637792123408/t0(0) o101->e3c4cf5f-8e04-bccb-9d13-7eae1b83e1a1@10.8.27.29@o2ib6:24/0 lens 480/568 e 0 to 0 dl 1554812634 ref 2 fl Interpret:/0/0 rc 0/0 [2657464.270586] Lustre: 115588:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 165 previous similar messages [2657490.574869] LustreError: dumping log to /tmp/lustre-log.1554812656.115925 [2657501.218320] Lustre: fir-MDT0001: Connection restored to e3c4cf5f-8e04-bccb-9d13-7eae1b83e1a1 (at 10.8.27.29@o2ib6) [2657501.228841] Lustre: Skipped 498 previous similar messages [2657529.218313] LustreError: 115815:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554812604, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ebb982cd100/0xbc3294615c1af2cf lrc: 3/1,0 mode: --/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 112 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 115815 timeout: 0 lvb_type: 0 [2657529.258329] LustreError: 115815:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 93 previous similar messages [2657619.169798] LNet: Service thread pid 115750 completed after 570.65s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2657619.186230] LNet: Skipped 23 previous similar messages [2657637.646638] Lustre: 115873:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (588:1s); client may timeout. req@ffff8ede6087bc00 x1628649122690896/t0(0) o101->b7e890f7-ba97-78f8-0794-50613d838a98@10.9.107.7@o2ib4:24/0 lens 480/536 e 0 to 0 dl 1554812802 ref 1 fl Complete:/0/0 rc 0/0 [2657637.648232] LustreError: 115931:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ebf459c0800 ns: mdt-fir-MDT0001_UUID lock: ffff8ec2314b2d00/0xbc329461588e74af lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 103 type: IBT flags: 0x50200000000000 nid: 10.8.17.17@o2ib6 remote: 0xe31ee7d82674de87 expref: 2 pid: 115931 timeout: 0 lvb_type: 0 [2657637.648235] LustreError: 115931:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 7 previous similar messages [2657637.720699] Lustre: 115873:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 13 previous similar messages [2657838.226853] LNet: Service thread pid 116295 was inactive for 200.39s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2657838.244073] LNet: Skipped 9 previous similar messages [2657838.249307] Pid: 116295, comm: mdt00_109 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2657838.259320] Call Trace: [2657838.261956] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2657838.269078] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2657838.276461] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2657838.283481] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2657838.290676] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2657838.297347] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2657838.304526] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2657838.311286] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2657838.317942] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2657838.324876] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2657838.332177] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2657838.338525] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2657838.345664] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2657838.353564] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2657838.360078] [] kthread+0xd1/0xe0 [2657838.365183] [] ret_from_fork_nospec_begin+0xe/0x21 [2657838.371832] [] 0xffffffffffffffff [2657838.377039] LustreError: dumping log to /tmp/lustre-log.1554813003.116295 [2657838.476956] Pid: 115857, comm: mdt01_057 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2657838.486955] Call Trace: [2657838.489620] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2657838.496738] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2657838.504130] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2657838.511134] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2657838.518324] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2657838.524983] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2657838.532156] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2657838.538900] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2657838.545556] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2657838.552490] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2657838.559787] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2657838.566130] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2657838.573255] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2657838.581150] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2657838.587649] [] kthread+0xd1/0xe0 [2657838.592755] [] ret_from_fork_nospec_begin+0xe/0x21 [2657838.599404] [] 0xffffffffffffffff [2657838.604625] Pid: 115931, comm: mdt01_072 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2657838.614652] Call Trace: [2657838.617290] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2657838.624412] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2657838.631806] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2657838.638812] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2657838.646019] [] mdt_object_lock+0x20/0x30 [mdt] [2657838.652347] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2657838.658832] [] mdt_intent_brw+0x1f/0x30 [mdt] [2657838.665069] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2657838.671726] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2657838.678690] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2657838.685976] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2657838.692327] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2657838.699458] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2657838.707364] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2657838.713864] [] kthread+0xd1/0xe0 [2657838.718956] [] ret_from_fork_nospec_begin+0xe/0x21 [2657838.725608] [] 0xffffffffffffffff [2657838.730795] Pid: 115753, comm: mdt01_037 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2657838.740796] Call Trace: [2657838.743451] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2657838.750561] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2657838.757949] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2657838.764956] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2657838.772146] [] mdt_object_lock+0x20/0x30 [mdt] [2657838.778482] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2657838.784982] [] mdt_intent_brw+0x1f/0x30 [mdt] [2657838.791209] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2657838.797853] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2657838.804779] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2657838.812059] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2657838.818410] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2657838.825536] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2657838.833424] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2657838.839937] [] kthread+0xd1/0xe0 [2657838.845043] [] ret_from_fork_nospec_begin+0xe/0x21 [2657838.851708] [] 0xffffffffffffffff [2657838.856898] Pid: 115948, comm: mdt00_073 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2657838.866926] Call Trace: [2657838.869561] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2657838.876673] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2657838.884060] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2657838.891058] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2657838.898250] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2657838.904923] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2657838.912131] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2657838.918893] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2657838.925567] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2657838.932509] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2657838.939807] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2657838.946144] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2657838.953284] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2657838.961186] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2657838.967704] [] kthread+0xd1/0xe0 [2657838.972795] [] ret_from_fork_nospec_begin+0xe/0x21 [2657838.979480] [] 0xffffffffffffffff [2657868.935068] Lustre: fir-MDT0003: Client ecd69873-9a0a-0293-0ce7-d92dc18bf8d9 (at 10.9.101.11@o2ib4) reconnecting [2657868.945415] Lustre: Skipped 509 previous similar messages [2657869.459210] LustreError: dumping log to /tmp/lustre-log.1554813035.115587 [2657897.619545] LustreError: dumping log to /tmp/lustre-log.1554813063.115928 [2657900.179573] LustreError: dumping log to /tmp/lustre-log.1554813065.115764 [2658116.978931] Lustre: fir-MDT0003: Connection restored to (at 10.9.101.11@o2ib4) [2658116.986429] Lustre: Skipped 494 previous similar messages [2658155.175495] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 150s: evicting client at 10.8.17.15@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ebbdc4a2ac0/0xbc32946160f39d7b lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 116 type: IBT flags: 0x60200400000020 nid: 10.8.17.15@o2ib6 remote: 0x794f201b73451fab expref: 111 pid: 115584 timeout: 2658134 lvb_type: 0 [2658155.214035] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 6 previous similar messages [2658185.814845] Lustre: 116228:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8eb97a465a00 x1629029717870304/t0(0) o101->83661770-f8f4-e73b-ca15-a24428bd76a4@10.9.115.1@o2ib4:26/0 lens 576/3264 e 0 to 0 dl 1554813356 ref 2 fl Interpret:/0/0 rc 0/0 [2658185.844106] Lustre: 116228:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 136 previous similar messages [2658251.131591] LustreError: 115759:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554813326, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ed01691f980/0xbc329461625b84a1 lrc: 3/1,0 mode: --/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 104 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 115759 timeout: 0 lvb_type: 0 [2658251.171609] LustreError: 115759:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 116 previous similar messages [2658361.496843] Pid: 115930, comm: mdt02_073 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2658361.506838] Call Trace: [2658361.509475] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2658361.516595] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2658361.523979] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2658361.530998] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2658361.538182] [] mdt_object_lock+0x20/0x30 [mdt] [2658361.544494] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2658361.550996] [] mdt_intent_brw+0x1f/0x30 [mdt] [2658361.557217] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2658361.563891] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2658361.570842] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2658361.578125] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2658361.584491] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2658361.591621] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2658361.599525] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2658361.606043] [] kthread+0xd1/0xe0 [2658361.611148] [] ret_from_fork_nospec_begin+0xe/0x21 [2658361.617798] [] 0xffffffffffffffff [2658361.623000] LustreError: dumping log to /tmp/lustre-log.1554813527.115930 [2658361.722403] Pid: 115621, comm: mdt01_022 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2658361.732420] Call Trace: [2658361.735060] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2658361.742183] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2658361.749568] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2658361.756590] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2658361.763773] [] mdt_reint_object_lock+0x2c/0x60 [mdt] [2658361.770604] [] mdt_reint_striped_lock+0x8c/0x510 [mdt] [2658361.777629] [] mdt_reint_setattr+0x6c8/0x1340 [mdt] [2658361.784374] [] mdt_reint_rec+0x83/0x210 [mdt] [2658361.790611] [] mdt_reint_internal+0x6e3/0xaf0 [mdt] [2658361.797355] [] mdt_reint+0x67/0x140 [mdt] [2658361.803231] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2658361.810364] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2658361.818280] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2658361.824806] [] kthread+0xd1/0xe0 [2658361.829901] [] ret_from_fork_nospec_begin+0xe/0x21 [2658361.836547] [] 0xffffffffffffffff [2658361.841752] Pid: 115896, comm: mdt02_064 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2658361.851752] Call Trace: [2658361.854392] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2658361.861497] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2658361.868875] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2658361.875877] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2658361.883054] [] mdt_object_lock+0x20/0x30 [mdt] [2658361.889362] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2658361.895860] [] mdt_intent_brw+0x1f/0x30 [mdt] [2658361.902084] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2658361.908755] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2658361.915693] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2658361.922985] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2658361.929325] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2658361.936463] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2658361.944354] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2658361.950883] [] kthread+0xd1/0xe0 [2658361.955976] [] ret_from_fork_nospec_begin+0xe/0x21 [2658361.962638] [] 0xffffffffffffffff [2658361.967839] Pid: 115584, comm: mdt01_010 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2658361.977864] Call Trace: [2658361.980499] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2658361.987610] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2658361.994986] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2658362.002004] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2658362.009197] [] mdt_object_lock+0x20/0x30 [mdt] [2658362.015499] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2658362.021981] [] mdt_intent_brw+0x1f/0x30 [mdt] [2658362.028219] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2658362.034884] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2658362.041818] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2658362.049101] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2658362.055449] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2658362.062584] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2658362.070473] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2658362.076988] [] kthread+0xd1/0xe0 [2658362.082075] [] ret_from_fork_nospec_begin+0xe/0x21 [2658362.088740] [] 0xffffffffffffffff [2658362.093941] Pid: 115899, comm: mdt00_057 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2658362.103937] Call Trace: [2658362.106573] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2658362.113683] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2658362.121050] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2658362.128071] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2658362.135239] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2658362.141998] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2658362.148644] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2658362.155611] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2658362.162894] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2658362.169245] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2658362.176371] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2658362.184259] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2658362.190773] [] kthread+0xd1/0xe0 [2658362.195864] [] ret_from_fork_nospec_begin+0xe/0x21 [2658362.202525] [] 0xffffffffffffffff [2658362.207732] LNet: Service thread pid 115653 was inactive for 200.96s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2658362.220852] LNet: Skipped 50 previous similar messages [2658461.179077] LNet: Service thread pid 115837 completed after 300.01s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2658461.195515] LNet: Skipped 85 previous similar messages [2658471.136185] Lustre: fir-MDT0001: Client e3c4cf5f-8e04-bccb-9d13-7eae1b83e1a1 (at 10.8.27.29@o2ib6) reconnecting [2658471.146458] Lustre: Skipped 481 previous similar messages [2658511.514531] LustreError: dumping log to /tmp/lustre-log.1554813677.115759 [2658516.634588] LustreError: dumping log to /tmp/lustre-log.1554813682.115582 [2658661.532257] LNet: Service thread pid 115890 was inactive for 200.28s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2658661.549473] LNet: Skipped 9 previous similar messages [2658661.554707] Pid: 115890, comm: mdt01_064 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2658661.564723] Call Trace: [2658661.567364] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2658661.574477] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2658661.581868] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2658661.588888] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2658661.596067] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2658661.602724] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2658661.609899] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2658661.616643] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2658661.623313] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2658661.630250] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2658661.637542] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2658661.643880] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2658661.651011] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2658661.658916] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2658661.665418] [] kthread+0xd1/0xe0 [2658661.670507] [] ret_from_fork_nospec_begin+0xe/0x21 [2658661.677152] [] 0xffffffffffffffff [2658661.682346] LustreError: dumping log to /tmp/lustre-log.1554813827.115890 [2658661.721270] Pid: 115917, comm: mdt01_069 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2658661.731287] Call Trace: [2658661.733921] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2658661.741060] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2658661.748442] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2658661.755463] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2658661.762643] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2658661.769298] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2658661.776489] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2658661.783226] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2658661.789897] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2658661.796833] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2658661.804128] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2658661.810473] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2658661.817606] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2658661.825510] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2658661.832025] [] kthread+0xd1/0xe0 [2658661.837136] [] ret_from_fork_nospec_begin+0xe/0x21 [2658661.843792] [] 0xffffffffffffffff [2658661.849024] Pid: 116172, comm: mdt00_081 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2658661.859020] Call Trace: [2658661.861659] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2658661.868784] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2658661.876152] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2658661.883162] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2658661.890339] [] mdt_object_lock+0x20/0x30 [mdt] [2658661.896678] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2658661.903167] [] mdt_intent_brw+0x1f/0x30 [mdt] [2658661.909411] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2658661.916070] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2658661.923004] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2658661.930284] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2658661.936618] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2658661.943743] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2658661.951630] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2658661.958154] [] kthread+0xd1/0xe0 [2658661.963244] [] ret_from_fork_nospec_begin+0xe/0x21 [2658661.969915] [] 0xffffffffffffffff [2658662.044258] Pid: 115837, comm: mdt01_051 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2658662.054258] Call Trace: [2658662.056900] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2658662.064019] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2658662.071404] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2658662.078407] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2658662.085594] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2658662.092258] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2658662.099432] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2658662.106177] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2658662.112840] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2658662.119784] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2658662.127072] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2658662.133406] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2658662.140526] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2658662.148418] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2658662.154918] [] kthread+0xd1/0xe0 [2658662.160006] [] ret_from_fork_nospec_begin+0xe/0x21 [2658662.166677] [] 0xffffffffffffffff [2658662.171877] Pid: 115931, comm: mdt01_072 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2658662.181886] Call Trace: [2658662.184520] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2658662.191644] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2658662.199014] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2658662.206042] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2658662.213220] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2658662.219876] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2658662.227050] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2658662.233827] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2658662.240486] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2658662.247433] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2658662.254717] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2658662.261079] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2658662.268191] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2658662.276111] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2658662.282630] [] kthread+0xd1/0xe0 [2658662.287735] [] ret_from_fork_nospec_begin+0xe/0x21 [2658662.294382] [] 0xffffffffffffffff [2658666.140306] LustreError: dumping log to /tmp/lustre-log.1554813831.115833 [2658707.458546] LustreError: 115899:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ede60903400 ns: mdt-fir-MDT0001_UUID lock: ffff8eb50d24ca40/0xbc329461625c06d7 lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 92 type: IBT flags: 0x50200000000000 nid: 10.8.27.31@o2ib6 remote: 0x99d3453de1b42411 expref: 2 pid: 115899 timeout: 0 lvb_type: 0 [2658707.493601] LustreError: 115899:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 17 previous similar messages [2658707.504099] Lustre: 115899:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (154:393s); client may timeout. req@ffff8eb7bfbdec00 x1628649154144320/t0(0) o101->eed99957-e395-8d59-f471-3be5bc5334d2@10.8.27.31@o2ib6:26/0 lens 568/2296 e 0 to 0 dl 1554813480 ref 1 fl Complete:/0/0 rc -107/-107 [2658707.533449] Lustre: 115899:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 14 previous similar messages [2658737.024027] Lustre: fir-MDT0003: Connection restored to (at 10.9.101.11@o2ib4) [2658737.031516] Lustre: Skipped 501 previous similar messages [2658857.183497] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 149s: evicting client at 10.9.115.1@o2ib4 ns: mdt-fir-MDT0001_UUID lock: ffff8ec6c4e6a640/0xbc32946167415822 lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 127 type: IBT flags: 0x60200400000020 nid: 10.9.115.1@o2ib4 remote: 0x5c08f5c6afaaff3d expref: 87 pid: 116318 timeout: 2658836 lvb_type: 0 [2658857.221965] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 6 previous similar messages [2658882.270773] Lustre: 115877:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ee252ee4e00 x1628546089745504/t133435571564(0) o36->ac4b3f39-4aa4-91f4-89f9-e38a651f2f99@10.9.107.5@o2ib4:2/0 lens 488/3152 e 0 to 0 dl 1554814052 ref 2 fl Interpret:/0/0 rc 0/0 [2658882.300799] Lustre: 115877:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 128 previous similar messages [2658908.831077] LustreError: dumping log to /tmp/lustre-log.1554814074.116121 [2658947.233488] LustreError: 115931:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554814022, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ec6943321c0/0xbc329461689e69ce lrc: 3/0,1 mode: --/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 126 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 115931 timeout: 0 lvb_type: 0 [2658947.273507] LustreError: 115931:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 109 previous similar messages [2659078.040925] Lustre: fir-MDT0003: Client ecd69873-9a0a-0293-0ce7-d92dc18bf8d9 (at 10.9.101.11@o2ib4) reconnecting [2659078.051273] Lustre: Skipped 476 previous similar messages [2659355.804998] Lustre: fir-MDT0001: Connection restored to 899c9439-b799-8bae-19f5-3b7d8e9f714b (at 10.8.27.32@o2ib6) [2659355.815521] Lustre: Skipped 489 previous similar messages [2659375.780404] LNet: Service thread pid 115710 was inactive for 200.10s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2659375.797607] LNet: Skipped 4 previous similar messages [2659375.802845] Pid: 115710, comm: mdt00_028 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2659375.812860] Call Trace: [2659375.815502] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2659375.822617] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2659375.830024] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2659375.837029] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2659375.844234] [] mdt_object_lock+0x20/0x30 [mdt] [2659375.850548] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2659375.857046] [] mdt_intent_brw+0x1f/0x30 [mdt] [2659375.863273] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2659375.869953] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2659375.876879] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2659375.884212] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2659375.890563] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2659375.897709] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2659375.905601] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2659375.912116] [] kthread+0xd1/0xe0 [2659375.917208] [] ret_from_fork_nospec_begin+0xe/0x21 [2659375.923861] [] 0xffffffffffffffff [2659375.929063] LustreError: dumping log to /tmp/lustre-log.1554814541.115710 [2659376.047185] Pid: 115931, comm: mdt01_072 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2659376.057189] Call Trace: [2659376.059833] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2659376.066941] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2659376.074299] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2659376.081301] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2659376.088496] [] mdt_object_lock+0x20/0x30 [mdt] [2659376.094797] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2659376.101293] [] mdt_intent_brw+0x1f/0x30 [mdt] [2659376.107525] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2659376.114181] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2659376.121116] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2659376.128387] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2659376.134766] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2659376.141906] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2659376.149821] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2659376.156338] [] kthread+0xd1/0xe0 [2659376.161441] [] ret_from_fork_nospec_begin+0xe/0x21 [2659376.168104] [] 0xffffffffffffffff [2659376.173303] Pid: 115618, comm: mdt00_016 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2659376.183306] Call Trace: [2659376.185966] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2659376.193078] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2659376.200484] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2659376.207500] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2659376.214683] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2659376.221341] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2659376.228515] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2659376.235263] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2659376.241915] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2659376.248848] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2659376.256154] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2659376.262490] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2659376.269629] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2659376.277521] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2659376.284029] [] kthread+0xd1/0xe0 [2659376.289138] [] ret_from_fork_nospec_begin+0xe/0x21 [2659376.295787] [] 0xffffffffffffffff [2659376.301000] Pid: 115566, comm: mdt02_005 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2659376.311017] Call Trace: [2659376.313656] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2659376.320797] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2659376.328160] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2659376.335183] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2659376.342385] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2659376.349058] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2659376.356239] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2659376.363007] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2659376.369664] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2659376.376630] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2659376.383930] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2659376.390262] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2659376.397378] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2659376.405296] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2659376.411802] [] kthread+0xd1/0xe0 [2659376.416892] [] ret_from_fork_nospec_begin+0xe/0x21 [2659376.423545] [] 0xffffffffffffffff [2659376.428750] Pid: 115837, comm: mdt01_051 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2659376.438778] Call Trace: [2659376.441415] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2659376.448538] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2659376.455908] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2659376.462941] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2659376.470119] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2659376.476790] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2659376.483968] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2659376.490725] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2659376.497379] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2659376.504338] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2659376.511640] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2659376.517984] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2659376.525093] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2659376.532993] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2659376.539495] [] kthread+0xd1/0xe0 [2659376.544600] [] ret_from_fork_nospec_begin+0xe/0x21 [2659376.551252] [] 0xffffffffffffffff [2659376.556458] LNet: Service thread pid 115903 was inactive for 200.72s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2659376.569587] LNet: Skipped 48 previous similar messages [2659407.012741] LustreError: dumping log to /tmp/lustre-log.1554814572.116309 [2659407.524746] LustreError: dumping log to /tmp/lustre-log.1554814573.115856 [2659505.190848] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 150s: evicting client at 10.9.107.5@o2ib4 ns: mdt-fir-MDT0001_UUID lock: ffff8eed72b4f980/0xbc3294616b885384 lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 118 type: IBT flags: 0x60200400000020 nid: 10.9.107.5@o2ib4 remote: 0xb9c41693abaeeaf0 expref: 57 pid: 115963 timeout: 2659484 lvb_type: 0 [2659505.229301] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 4 previous similar messages [2659505.239598] LNet: Service thread pid 115913 completed after 329.56s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2659505.256035] LNet: Skipped 52 previous similar messages [2659535.782198] Lustre: 115584:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ecc0549e000 x1628647038582448/t0(0) o101->535a7c41-41b9-ec51-bbd6-e028c4ebcb2c@10.8.8.1@o2ib6:26/0 lens 568/0 e 0 to 0 dl 1554814706 ref 2 fl Interpret:/0/ffffffff rc 0/-1 [2659535.811709] Lustre: 115584:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 101 previous similar messages [2659555.417455] Lustre: fir-MDT0003: haven't heard from client e35cfe60-d59c-b262-2c44-e4afe1e51c06 (at 10.8.1.29@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ed723612000, cur 1554814721 expire 1554814571 last 1554814494 [2659556.006418] LustreError: dumping log to /tmp/lustre-log.1554814721.115871 [2659600.739926] LustreError: 115724:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554814676, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ed02e106540/0xbc3294616e89da23 lrc: 3/1,0 mode: --/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 117 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 115724 timeout: 0 lvb_type: 0 [2659600.779926] LustreError: 115724:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 89 previous similar messages [2659690.824378] Lustre: fir-MDT0001: Client fa7691ef-d46e-650c-947f-cea897c9625f (at 10.8.17.18@o2ib6) reconnecting [2659690.834654] Lustre: Skipped 470 previous similar messages [2659711.144184] Pid: 116206, comm: mdt00_088 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2659711.154182] Call Trace: [2659711.156828] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2659711.163946] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2659711.171350] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2659711.178359] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2659711.185558] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2659711.192207] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2659711.199384] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2659711.206126] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2659711.212783] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2659711.219733] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2659711.227031] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2659711.233374] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2659711.240508] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2659711.248410] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2659711.254930] [] kthread+0xd1/0xe0 [2659711.260018] [] ret_from_fork_nospec_begin+0xe/0x21 [2659711.266697] [] 0xffffffffffffffff [2659711.271896] LustreError: dumping log to /tmp/lustre-log.1554814876.116206 [2659711.312865] Pid: 115724, comm: mdt01_033 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2659711.322861] Call Trace: [2659711.325508] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2659711.332627] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2659711.340012] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2659711.347030] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2659711.354209] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2659711.360960] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2659711.367613] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2659711.374565] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2659711.381855] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2659711.388205] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2659711.395322] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2659711.403223] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2659711.409725] [] kthread+0xd1/0xe0 [2659711.414830] [] ret_from_fork_nospec_begin+0xe/0x21 [2659711.421479] [] 0xffffffffffffffff [2659860.137896] Pid: 115568, comm: mdt02_007 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2659860.147898] Call Trace: [2659860.150542] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2659860.157663] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2659860.165066] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2659860.172067] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2659860.179258] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2659860.185908] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2659860.193100] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2659860.199854] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2659860.206523] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2659860.213485] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2659860.220782] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2659860.227119] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2659860.234241] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2659860.242130] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2659860.248629] [] kthread+0xd1/0xe0 [2659860.253733] [] ret_from_fork_nospec_begin+0xe/0x21 [2659860.260398] [] 0xffffffffffffffff [2659860.265585] LustreError: dumping log to /tmp/lustre-log.1554815025.115568 [2659860.305062] Pid: 115658, comm: mdt02_031 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2659860.315065] Call Trace: [2659860.317708] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2659860.324844] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2659860.332230] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2659860.339232] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2659860.346439] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2659860.353188] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2659860.359866] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2659860.366800] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2659860.374129] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2659860.380476] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2659860.387621] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2659860.395521] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2659860.402041] [] kthread+0xd1/0xe0 [2659860.407129] [] ret_from_fork_nospec_begin+0xe/0x21 [2659860.413775] [] 0xffffffffffffffff [2659964.050590] Lustre: fir-MDT0001: Connection restored to ceea938e-6a91-5e80-97b6-3160b0748724 (at 10.8.27.30@o2ib6) [2659964.061115] Lustre: Skipped 477 previous similar messages [2660009.644619] LNet: Service thread pid 115587 was inactive for 200.62s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2660009.661818] LNet: Skipped 8 previous similar messages [2660009.667049] Pid: 115587, comm: mdt02_014 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2660009.677047] Call Trace: [2660009.679693] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2660009.686832] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2660009.694198] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2660009.701201] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2660009.708394] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2660009.715060] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2660009.722250] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2660009.729011] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2660009.735668] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2660009.742640] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2660009.749940] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2660009.756285] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2660009.763416] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2660009.771315] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2660009.777818] [] kthread+0xd1/0xe0 [2660009.782924] [] ret_from_fork_nospec_begin+0xe/0x21 [2660009.789592] [] 0xffffffffffffffff [2660009.794787] LustreError: dumping log to /tmp/lustre-log.1554815175.115587 [2660009.829166] LNet: Service thread pid 116301 was inactive for 200.85s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2660009.842293] LNet: Skipped 36 previous similar messages [2660015.196995] Lustre: 115860:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (340:499s); client may timeout. req@ffff8ef0093e6000 x1628546089830640/t133436203345(0) o36->ac4b3f39-4aa4-91f4-89f9-e38a651f2f99@10.9.107.5@o2ib4:21/0 lens 488/424 e 0 to 0 dl 1554814681 ref 1 fl Complete:/0/0 rc 0/0 [2660015.226602] Lustre: 115860:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 10 previous similar messages [2660040.363977] Pid: 115791, comm: mdt00_030 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2660040.373981] Call Trace: [2660040.376624] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2660040.383763] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2660040.391130] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2660040.398151] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2660040.405337] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2660040.411990] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2660040.419182] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2660040.425943] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2660040.432600] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2660040.439547] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2660040.446832] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2660040.453175] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2660040.460290] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2660040.468192] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2660040.474694] [] kthread+0xd1/0xe0 [2660040.479783] [] ret_from_fork_nospec_begin+0xe/0x21 [2660040.486429] [] 0xffffffffffffffff [2660040.491638] LustreError: dumping log to /tmp/lustre-log.1554815206.115791 [2660040.503397] Pid: 115948, comm: mdt00_073 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2660040.513395] Call Trace: [2660040.516043] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2660040.523158] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2660040.530541] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2660040.537574] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2660040.544779] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2660040.551525] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2660040.558181] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2660040.565123] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2660040.572404] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2660040.578765] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2660040.585881] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2660040.593766] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2660040.600266] [] kthread+0xd1/0xe0 [2660040.605355] [] ret_from_fork_nospec_begin+0xe/0x21 [2660040.612017] [] 0xffffffffffffffff [2660071.084327] Pid: 115348, comm: mdt01_002 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2660071.094336] Call Trace: [2660071.096976] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2660071.104095] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2660071.111482] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2660071.118483] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2660071.125660] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2660071.132402] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2660071.139071] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2660071.146007] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2660071.153319] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2660071.159665] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2660071.166813] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2660071.174705] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2660071.181221] [] kthread+0xd1/0xe0 [2660071.186309] [] ret_from_fork_nospec_begin+0xe/0x21 [2660071.192956] [] 0xffffffffffffffff [2660071.198149] LustreError: dumping log to /tmp/lustre-log.1554815236.115348 [2660071.596336] Pid: 115866, comm: mdt00_046 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2660071.606340] Call Trace: [2660071.608982] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2660071.616122] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2660071.623486] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2660071.630490] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2660071.637656] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2660071.644324] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2660071.651524] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2660071.658285] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2660071.664941] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2660071.671894] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2660071.679196] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2660071.685531] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2660071.692664] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2660071.700569] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2660071.707069] [] kthread+0xd1/0xe0 [2660071.712155] [] ret_from_fork_nospec_begin+0xe/0x21 [2660071.718835] [] 0xffffffffffffffff [2660071.724033] LustreError: dumping log to /tmp/lustre-log.1554815237.115866 [2660165.198395] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 150s: evicting client at 10.8.27.31@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ecb1da51440/0xbc3294616b8858cb lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 106 type: IBT flags: 0x60200400000020 nid: 10.8.27.31@o2ib6 remote: 0x99d3453de1b45622 expref: 56 pid: 116191 timeout: 2660144 lvb_type: 0 [2660165.236829] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 5 previous similar messages [2660165.247013] LNet: Service thread pid 115928 completed after 989.56s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2660165.247658] LustreError: 115759:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8eb47e6c7c00 ns: mdt-fir-MDT0001_UUID lock: ffff8ec6d27798c0/0xbc3294616b885d16 lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x1b/0x0 rrc: 92 type: IBT flags: 0x50200400000020 nid: 10.8.27.28@o2ib6 remote: 0x1df96237acfabe7a expref: 5 pid: 115759 timeout: 0 lvb_type: 0 [2660165.247660] LustreError: 115759:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 11 previous similar messages [2660165.247670] Lustre: 115869:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (30:959s); client may timeout. req@ffff8ec655e6ad00 x1628638020941392/t133436203359(0) o36->7d8ca85d-8b80-6a23-8fa9-83dca7eb7196@10.8.27.28@o2ib6:21/0 lens 488/424 e 0 to 0 dl 1554814371 ref 1 fl Complete:/0/0 rc 0/0 [2660165.338440] LNet: Skipped 49 previous similar messages [2660192.685707] Lustre: 115566:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ed3205aad00 x1628546090031984/t0(0) o55->ac4b3f39-4aa4-91f4-89f9-e38a651f2f99@10.9.107.5@o2ib4:23/0 lens 472/224 e 0 to 0 dl 1554815363 ref 2 fl Interpret:/0/0 rc 0/0 [2660192.714801] Lustre: 115566:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 16 previous similar messages [2660257.876421] LustreError: 115584:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554815333, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ecfe3943a80/0xbc32946174703a98 lrc: 3/0,1 mode: --/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 94 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 115584 timeout: 0 lvb_type: 0 [2660257.916359] LustreError: 115584:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 23 previous similar messages [2660291.876751] Lustre: fir-MDT0001: Client 4cd8fde3-ab19-6a6b-a7ee-5d70c4bd9893 (at 10.8.8.6@o2ib6) reconnecting [2660291.886842] Lustre: Skipped 422 previous similar messages [2660368.047699] Pid: 116309, comm: mdt02_102 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2660368.057703] Call Trace: [2660368.060353] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2660368.067490] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2660368.074857] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2660368.081876] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2660368.089055] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2660368.095741] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2660368.102938] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2660368.109697] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2660368.116370] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2660368.123320] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2660368.130626] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2660368.136989] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2660368.144121] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2660368.152027] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2660368.158544] [] kthread+0xd1/0xe0 [2660368.163649] [] ret_from_fork_nospec_begin+0xe/0x21 [2660368.170315] [] 0xffffffffffffffff [2660368.175524] LustreError: dumping log to /tmp/lustre-log.1554815533.116309 [2660368.249925] Pid: 116301, comm: mdt02_094 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2660368.259937] Call Trace: [2660368.262572] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2660368.269687] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2660368.277054] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2660368.284074] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2660368.291272] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2660368.297958] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2660368.305135] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2660368.311876] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2660368.318543] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2660368.325484] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2660368.332763] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2660368.339124] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2660368.346240] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2660368.354150] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2660368.360653] [] kthread+0xd1/0xe0 [2660368.365756] [] ret_from_fork_nospec_begin+0xe/0x21 [2660368.372404] [] 0xffffffffffffffff [2660368.377603] Pid: 115580, comm: mdt01_009 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2660368.387599] Call Trace: [2660368.390230] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2660368.397341] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2660368.404725] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2660368.411743] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2660368.418921] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2660368.425576] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2660368.432767] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2660368.439511] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2660368.446167] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2660368.453101] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2660368.460380] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2660368.466747] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2660368.473868] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2660368.481771] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2660368.488272] [] kthread+0xd1/0xe0 [2660368.493373] [] ret_from_fork_nospec_begin+0xe/0x21 [2660368.500039] [] 0xffffffffffffffff [2660368.505265] Pid: 116292, comm: mdt00_106 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2660368.515260] Call Trace: [2660368.517893] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2660368.525002] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2660368.532375] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2660368.539395] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2660368.546573] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2660368.553228] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2660368.560405] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2660368.567163] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2660368.573826] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2660368.580762] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2660368.588043] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2660368.594379] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2660368.601494] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2660368.609383] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2660368.615915] [] kthread+0xd1/0xe0 [2660368.621004] [] ret_from_fork_nospec_begin+0xe/0x21 [2660368.627652] [] 0xffffffffffffffff [2660368.632879] Pid: 115759, comm: mdt01_038 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2660368.642884] Call Trace: [2660368.645519] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2660368.652626] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2660368.659996] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2660368.666999] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2660368.674176] [] mdt_object_lock+0x20/0x30 [mdt] [2660368.680493] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2660368.686983] [] mdt_intent_brw+0x1f/0x30 [mdt] [2660368.693207] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2660368.699891] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2660368.706841] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2660368.714128] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2660368.720465] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2660368.727579] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2660368.735484] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2660368.741984] [] kthread+0xd1/0xe0 [2660368.747085] [] ret_from_fork_nospec_begin+0xe/0x21 [2660368.753758] [] 0xffffffffffffffff [2660523.697464] LustreError: dumping log to /tmp/lustre-log.1554815689.115962 [2660554.417831] LustreError: dumping log to /tmp/lustre-log.1554815719.115407 [2660565.218501] Lustre: fir-MDT0001: Connection restored to ceea938e-6a91-5e80-97b6-3160b0748724 (at 10.8.27.30@o2ib6) [2660565.229032] Lustre: Skipped 397 previous similar messages [2660703.923541] LNet: Service thread pid 115899 was inactive for 200.71s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2660703.940737] LNet: Skipped 9 previous similar messages [2660703.945969] Pid: 115899, comm: mdt00_057 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2660703.955987] Call Trace: [2660703.958629] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2660703.965762] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2660703.973159] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2660703.980189] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2660703.987398] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2660703.994144] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2660704.000802] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2660704.007734] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2660704.015044] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2660704.021394] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2660704.028542] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2660704.036440] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2660704.042964] [] kthread+0xd1/0xe0 [2660704.048086] [] ret_from_fork_nospec_begin+0xe/0x21 [2660704.054736] [] 0xffffffffffffffff [2660704.059967] LustreError: dumping log to /tmp/lustre-log.1554815869.115899 [2660704.095602] Pid: 115939, comm: mdt01_075 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2660704.105601] Call Trace: [2660704.108246] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2660704.115367] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2660704.122732] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2660704.129736] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2660704.136927] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2660704.143618] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2660704.150795] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2660704.157558] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2660704.164212] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2660704.171160] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2660704.178459] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2660704.184805] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2660704.191935] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2660704.199839] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2660704.206357] [] kthread+0xd1/0xe0 [2660704.211449] [] ret_from_fork_nospec_begin+0xe/0x21 [2660704.218152] [] 0xffffffffffffffff [2660727.475815] Pid: 115915, comm: mdt00_063 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2660727.485818] Call Trace: [2660727.488466] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2660727.495593] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2660727.502976] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2660727.509981] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2660727.517188] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2660727.523848] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2660727.531038] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2660727.537783] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2660727.544439] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2660727.551373] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2660727.558675] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2660727.565032] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2660727.572147] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2660727.580073] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2660727.586599] [] kthread+0xd1/0xe0 [2660727.591691] [] ret_from_fork_nospec_begin+0xe/0x21 [2660727.598345] [] 0xffffffffffffffff [2660727.603540] LustreError: dumping log to /tmp/lustre-log.1554815893.115915 [2660727.614249] Pid: 116242, comm: mdt02_084 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2660727.624296] Call Trace: [2660727.626940] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2660727.634054] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2660727.641426] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2660727.648431] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2660727.655636] [] mdt_object_lock+0x20/0x30 [mdt] [2660727.661953] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2660727.668449] [] mdt_intent_brw+0x1f/0x30 [mdt] [2660727.674674] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2660727.681355] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2660727.688290] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2660727.695586] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2660727.701923] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2660727.709058] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2660727.716952] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2660727.723465] [] kthread+0xd1/0xe0 [2660727.728555] [] ret_from_fork_nospec_begin+0xe/0x21 [2660727.735203] [] 0xffffffffffffffff [2660727.740400] Pid: 115871, comm: mdt01_061 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2660727.750399] Call Trace: [2660727.753036] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2660727.760170] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2660727.767539] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2660727.774542] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2660727.781717] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2660727.788388] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2660727.795567] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2660727.802311] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2660727.808966] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2660727.815915] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2660727.823198] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2660727.829541] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2660727.836655] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2660727.844560] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2660727.851060] [] kthread+0xd1/0xe0 [2660727.856177] [] ret_from_fork_nospec_begin+0xe/0x21 [2660727.862829] [] 0xffffffffffffffff [2660727.868020] LNet: Service thread pid 115573 was inactive for 200.66s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2660727.881157] LNet: Skipped 27 previous similar messages [2660734.643894] LustreError: dumping log to /tmp/lustre-log.1554815900.115965 [2660827.205965] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 150s: evicting client at 10.8.17.18@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ec3bbe54800/0xbc3294617470525b lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 91 type: IBT flags: 0x60200400000020 nid: 10.8.17.18@o2ib6 remote: 0x47fcf41a45904c76 expref: 77 pid: 115759 timeout: 2660806 lvb_type: 0 [2660827.244333] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 5 previous similar messages [2660827.254578] LNet: Service thread pid 116301 completed after 659.30s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2660827.271039] LNet: Skipped 12 previous similar messages [2660852.391246] Lustre: 115820:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ed412a70300 x1629291887235888/t0(0) o101->a3edbd4a-e6f7-1207-b04e-348b28096812@10.8.17.14@o2ib6:22/0 lens 568/0 e 0 to 0 dl 1554816022 ref 2 fl Interpret:/0/ffffffff rc 0/-1 [2660852.421815] Lustre: 115820:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 47 previous similar messages [2660877.493526] LustreError: dumping log to /tmp/lustre-log.1554816043.115856 [2660882.613580] LustreError: dumping log to /tmp/lustre-log.1554816048.115815 [2660893.328156] Lustre: fir-MDT0001: Client fa7691ef-d46e-650c-947f-cea897c9625f (at 10.8.17.18@o2ib6) reconnecting [2660893.338423] Lustre: Skipped 399 previous similar messages [2660917.255996] LustreError: 115928:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554815992, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ed03c6fca40/0xbc3294617a4b0f62 lrc: 3/1,0 mode: --/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x13/0x8 rrc: 90 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 115928 timeout: 0 lvb_type: 0 [2660917.295920] LustreError: 115928:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 39 previous similar messages [2660981.207975] LustreError: 116292:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ecc238c6c00 ns: mdt-fir-MDT0001_UUID lock: ffff8eb21543d340/0xbc329461747068b2 lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x1b/0x0 rrc: 85 type: IBT flags: 0x50200400000020 nid: 10.8.27.31@o2ib6 remote: 0x99d3453de1b4751d expref: 5 pid: 116292 timeout: 0 lvb_type: 0 [2660981.243043] LustreError: 116292:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 13 previous similar messages [2661027.511312] Pid: 116593, comm: mdt03_104 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2661027.521322] Call Trace: [2661027.523963] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2661027.531103] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2661027.538488] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2661027.545507] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2661027.552704] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2661027.559387] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2661027.566578] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2661027.573344] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2661027.580011] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2661027.586966] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2661027.594244] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2661027.600603] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2661027.607731] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2661027.615633] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2661027.622159] [] kthread+0xd1/0xe0 [2661027.627245] [] ret_from_fork_nospec_begin+0xe/0x21 [2661027.633904] [] 0xffffffffffffffff [2661027.639128] LustreError: dumping log to /tmp/lustre-log.1554816193.116593 [2661027.672168] Pid: 115750, comm: mdt01_036 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2661027.682162] Call Trace: [2661027.684826] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2661027.691946] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2661027.699336] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2661027.706342] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2661027.713532] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2661027.720181] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2661027.727364] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2661027.734126] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2661027.740803] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2661027.747741] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2661027.755037] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2661027.761375] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2661027.768514] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2661027.776401] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2661027.782917] [] kthread+0xd1/0xe0 [2661027.788005] [] ret_from_fork_nospec_begin+0xe/0x21 [2661027.794670] [] 0xffffffffffffffff [2661027.799900] Pid: 115684, comm: mdt02_034 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2661027.809901] Call Trace: [2661027.812531] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2661027.819658] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2661027.827040] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2661027.834061] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2661027.841240] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2661027.848012] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2661027.854663] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2661027.861614] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2661027.868914] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2661027.875246] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2661027.882355] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2661027.890256] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2661027.896759] [] kthread+0xd1/0xe0 [2661027.901867] [] ret_from_fork_nospec_begin+0xe/0x21 [2661027.908520] [] 0xffffffffffffffff [2661027.913698] Pid: 115928, comm: mdt01_071 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2661027.923725] Call Trace: [2661027.926362] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2661027.933509] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2661027.940897] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2661027.947917] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2661027.955095] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2661027.961783] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2661027.968961] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2661027.975738] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2661027.982397] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2661027.989367] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2661027.996641] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2661028.003014] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2661028.010128] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2661028.018044] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2661028.024550] [] kthread+0xd1/0xe0 [2661028.029651] [] ret_from_fork_nospec_begin+0xe/0x21 [2661028.036309] [] 0xffffffffffffffff [2661031.607356] Pid: 115838, comm: mdt00_039 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2661031.617357] Call Trace: [2661031.619997] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2661031.627110] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2661031.634501] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2661031.641507] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2661031.648697] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2661031.655372] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2661031.662563] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2661031.669323] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2661031.675982] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2661031.682916] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2661031.690212] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2661031.696574] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2661031.703707] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2661031.711611] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2661031.718127] [] kthread+0xd1/0xe0 [2661031.723216] [] ret_from_fork_nospec_begin+0xe/0x21 [2661031.729893] [] 0xffffffffffffffff [2661031.735093] LustreError: dumping log to /tmp/lustre-log.1554816197.115838 [2661167.273998] Lustre: fir-MDT0001: Connection restored to (at 10.8.8.3@o2ib6) [2661167.281229] Lustre: Skipped 401 previous similar messages [2661181.625083] LustreError: dumping log to /tmp/lustre-log.1554816347.116301 [2661281.211443] LustreError: 115617:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ec655e4b000 ns: mdt-fir-MDT0001_UUID lock: ffff8eced526d100/0xbc329461747072d0 lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x1b/0x0 rrc: 85 type: IBT flags: 0x50200400000020 nid: 10.8.27.32@o2ib6 remote: 0xa1738467d794efb9 expref: 5 pid: 115617 timeout: 0 lvb_type: 0 [2661281.246691] Lustre: 115573:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (154:600s); client may timeout. req@ffff8eb495986000 x1628647038709072/t0(0) o101->535a7c41-41b9-ec51-bbd6-e028c4ebcb2c@10.8.8.1@o2ib6:22/0 lens 576/1792 e 0 to 0 dl 1554815846 ref 1 fl Complete:/0/0 rc -107/-107 [2661281.275847] Lustre: 115573:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 16 previous similar messages [2661432.212985] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 149s: evicting client at 10.8.8.3@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ed6d04bd100/0xbc3294617e5d0642 lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 78 type: IBT flags: 0x60200400000020 nid: 10.8.8.3@o2ib6 remote: 0x59829cb70450d48d expref: 58 pid: 115658 timeout: 2661411 lvb_type: 0 [2661432.251004] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 3 previous similar messages [2661462.588344] Lustre: 115869:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ec369140f00 x1629295504922960/t0(0) o101->8c206ea7-4fa6-6560-2c3b-626d4cc9e42f@10.8.8.3@o2ib6:3/0 lens 576/3264 e 0 to 0 dl 1554816633 ref 2 fl Interpret:/0/0 rc 0/0 [2661462.617358] Lustre: 115869:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 34 previous similar messages [2661483.196592] LNet: Service thread pid 115972 was inactive for 200.74s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2661483.213798] LNet: Skipped 9 previous similar messages [2661483.219040] Pid: 115972, comm: mdt01_086 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2661483.229062] Call Trace: [2661483.231705] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2661483.238845] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2661483.246247] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2661483.253266] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2661483.260453] [] mdt_object_lock+0x20/0x30 [mdt] [2661483.266785] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2661483.273289] [] mdt_intent_brw+0x1f/0x30 [mdt] [2661483.279533] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2661483.286193] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2661483.293164] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2661483.300457] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2661483.306819] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2661483.313954] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2661483.321871] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2661483.328389] [] kthread+0xd1/0xe0 [2661483.333495] [] ret_from_fork_nospec_begin+0xe/0x21 [2661483.340170] [] 0xffffffffffffffff [2661483.345394] LustreError: dumping log to /tmp/lustre-log.1554816648.115972 [2661483.416558] Pid: 115834, comm: mdt01_050 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2661483.426558] Call Trace: [2661483.429198] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2661483.436324] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2661483.443705] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2661483.450708] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2661483.457902] [] mdt_object_lock+0x20/0x30 [mdt] [2661483.464213] [] mdt_hsm_state_set+0xc9/0x830 [mdt] [2661483.470795] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2661483.477925] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2661483.485841] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2661483.492344] [] kthread+0xd1/0xe0 [2661483.497431] [] ret_from_fork_nospec_begin+0xe/0x21 [2661483.504099] [] 0xffffffffffffffff [2661483.509312] Pid: 115856, comm: mdt00_044 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2661483.519324] Call Trace: [2661483.521956] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2661483.529065] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2661483.536450] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2661483.543469] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2661483.550663] [] mdt_object_lock+0x20/0x30 [mdt] [2661483.556988] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2661483.563473] [] mdt_intent_brw+0x1f/0x30 [mdt] [2661483.569709] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2661483.576369] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2661483.583326] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2661483.590641] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2661483.596995] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2661483.604136] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2661483.612040] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2661483.618553] [] kthread+0xd1/0xe0 [2661483.623645] [] ret_from_fork_nospec_begin+0xe/0x21 [2661483.630306] [] 0xffffffffffffffff [2661483.635507] Pid: 115778, comm: mdt01_039 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2661483.645521] Call Trace: [2661483.648153] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2661483.655261] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2661483.662637] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2661483.669638] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2661483.676831] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2661483.683592] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2661483.690264] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2661483.697206] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2661483.704505] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2661483.710856] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2661483.717991] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2661483.725911] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2661483.732413] [] kthread+0xd1/0xe0 [2661483.737515] [] ret_from_fork_nospec_begin+0xe/0x21 [2661483.744180] [] 0xffffffffffffffff [2661483.749371] Pid: 115925, comm: mdt01_070 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2661483.759386] Call Trace: [2661483.762018] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2661483.769131] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2661483.776519] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2661483.783531] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2661483.790716] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2661483.797372] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2661483.804578] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2661483.811324] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2661483.818008] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2661483.824959] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2661483.832271] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2661483.838626] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2661483.845770] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2661483.853671] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2661483.860186] [] kthread+0xd1/0xe0 [2661483.865278] [] ret_from_fork_nospec_begin+0xe/0x21 [2661483.871950] [] 0xffffffffffffffff [2661483.877159] LNet: Service thread pid 115890 was inactive for 201.14s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2661483.890285] LNet: Skipped 13 previous similar messages [2661496.268639] Lustre: fir-MDT0003: Client ecd69873-9a0a-0293-0ce7-d92dc18bf8d9 (at 10.9.101.11@o2ib4) reconnecting [2661496.278989] Lustre: Skipped 357 previous similar messages [2661527.509105] LustreError: 115590:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554816603, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ee022cd9200/0xbc3294617fbfea9a lrc: 3/1,0 mode: --/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 77 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 115590 timeout: 0 lvb_type: 0 [2661527.549045] LustreError: 115590:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 32 previous similar messages [2661612.215194] LNet: Service thread pid 116301 completed after 329.76s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2661612.231630] LNet: Skipped 41 previous similar messages [2661637.822375] LustreError: dumping log to /tmp/lustre-log.1554816803.115590 [2661669.054741] LustreError: dumping log to /tmp/lustre-log.1554816834.115967 [2661772.547391] Lustre: fir-MDT0001: Connection restored to ceea938e-6a91-5e80-97b6-3160b0748724 (at 10.8.27.30@o2ib6) [2661772.557915] Lustre: Skipped 354 previous similar messages [2661818.048463] Pid: 115621, comm: mdt01_022 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2661818.058463] Call Trace: [2661818.061107] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2661818.068235] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2661818.075610] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2661818.082621] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2661818.089796] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2661818.096541] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2661818.103218] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2661818.110159] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2661818.117473] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2661818.123823] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2661818.130938] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2661818.138859] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2661818.145367] [] kthread+0xd1/0xe0 [2661818.150456] [] ret_from_fork_nospec_begin+0xe/0x21 [2661818.157104] [] 0xffffffffffffffff [2661818.162296] LustreError: dumping log to /tmp/lustre-log.1554816983.115621 [2661818.200102] Pid: 116306, comm: mdt02_099 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2661818.210103] Call Trace: [2661818.212748] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2661818.219884] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2661818.227266] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2661818.234296] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2661818.241488] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2661818.248148] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2661818.255340] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2661818.262099] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2661818.268771] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2661818.275725] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2661818.283019] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2661818.289386] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2661818.296508] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2661818.304425] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2661818.310943] [] kthread+0xd1/0xe0 [2661818.316034] [] ret_from_fork_nospec_begin+0xe/0x21 [2661818.322698] [] 0xffffffffffffffff [2661848.768822] Pid: 115348, comm: mdt01_002 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2661848.778838] Call Trace: [2661848.781487] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2661848.788622] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2661848.795986] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2661848.802988] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2661848.810174] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2661848.816918] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2661848.823574] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2661848.830509] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2661848.837796] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2661848.844149] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2661848.851265] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2661848.859208] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2661848.865720] [] kthread+0xd1/0xe0 [2661848.870811] [] ret_from_fork_nospec_begin+0xe/0x21 [2661848.877465] [] 0xffffffffffffffff [2661848.882657] LustreError: dumping log to /tmp/lustre-log.1554817014.115348 [2661848.895386] Pid: 116309, comm: mdt02_102 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2661848.905382] Call Trace: [2661848.908026] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2661848.915147] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2661848.922524] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2661848.929524] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2661848.936718] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2661848.943388] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2661848.950569] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2661848.957327] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2661848.963983] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2661848.970932] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2661848.978223] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2661848.984582] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2661848.991702] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2661848.999601] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2661849.006128] [] kthread+0xd1/0xe0 [2661849.011232] [] ret_from_fork_nospec_begin+0xe/0x21 [2661849.017883] [] 0xffffffffffffffff [2661972.219437] LustreError: 115625:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ebda5941800 ns: mdt-fir-MDT0001_UUID lock: ffff8eb6ca8121c0/0xbc32946184600517 lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 66 type: IBT flags: 0x50200000000000 nid: 10.8.17.17@o2ib6 remote: 0xe31ee7d82675a3c2 expref: 90 pid: 115625 timeout: 0 lvb_type: 0 [2661972.254583] LustreError: 115625:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 11 previous similar messages [2661972.265167] Lustre: 115866:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (154:20s); client may timeout. req@ffff8eb39cc95a00 x1628647038857600/t0(0) o101->535a7c41-41b9-ec51-bbd6-e028c4ebcb2c@10.8.8.1@o2ib6:3/0 lens 568/2296 e 0 to 0 dl 1554817117 ref 1 fl Complete:/0/0 rc -107/-107 [2661972.294172] Lustre: 115866:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 10 previous similar messages [2662033.219966] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 30s: evicting client at 10.8.27.29@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ed002aaa640/0xbc32946184a01bf8 lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 97 type: IBT flags: 0x60200400000020 nid: 10.8.27.29@o2ib6 remote: 0xd991486401829fa9 expref: 76 pid: 115834 timeout: 2662012 lvb_type: 0 [2662033.258248] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 7 previous similar messages [2662097.332334] Lustre: fir-MDT0001: Client eed99957-e395-8d59-f471-3be5bc5334d2 (at 10.8.27.31@o2ib6) reconnecting [2662097.342616] Lustre: Skipped 355 previous similar messages [2662173.380599] LNet: Service thread pid 115658 was inactive for 200.03s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2662173.397793] LNet: Skipped 8 previous similar messages [2662173.403028] Pid: 115658, comm: mdt02_031 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2662173.413024] Call Trace: [2662173.415664] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2662173.422770] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2662173.430139] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2662173.437158] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2662173.444352] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2662173.451007] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2662173.458198] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2662173.464945] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2662173.471617] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2662173.478558] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2662173.485840] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2662173.492206] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2662173.499325] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2662173.507243] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2662173.513746] [] kthread+0xd1/0xe0 [2662173.518848] [] ret_from_fork_nospec_begin+0xe/0x21 [2662173.525499] [] 0xffffffffffffffff [2662173.530705] LustreError: dumping log to /tmp/lustre-log.1554817339.115658 [2662173.612490] Pid: 115778, comm: mdt01_039 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2662173.622489] Call Trace: [2662173.625126] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2662173.632230] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2662173.639638] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2662173.647654] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2662173.654833] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2662173.661527] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2662173.668709] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2662173.675453] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2662173.682141] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2662173.689075] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2662173.696386] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2662173.702726] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2662173.709871] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2662173.717766] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2662173.724285] [] kthread+0xd1/0xe0 [2662173.729377] [] ret_from_fork_nospec_begin+0xe/0x21 [2662173.736023] [] 0xffffffffffffffff [2662173.741221] Pid: 115887, comm: mdt01_063 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2662173.751248] Call Trace: [2662173.753882] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2662173.761008] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2662173.768378] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2662173.775394] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2662173.782596] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2662173.789271] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2662173.796464] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2662173.803225] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2662173.809880] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2662173.816821] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2662173.824120] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2662173.830463] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2662173.837596] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2662173.845501] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2662173.852033] [] kthread+0xd1/0xe0 [2662173.857116] [] ret_from_fork_nospec_begin+0xe/0x21 [2662173.863785] [] 0xffffffffffffffff [2662173.868974] Pid: 115584, comm: mdt01_010 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2662173.879008] Call Trace: [2662173.881640] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2662173.888754] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2662173.896125] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2662173.903161] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2662173.910337] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2662173.917041] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2662173.924222] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2662173.930994] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2662173.937647] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2662173.944593] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2662173.951869] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2662173.958228] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2662173.965338] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2662173.973255] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2662173.979758] [] kthread+0xd1/0xe0 [2662173.984868] [] ret_from_fork_nospec_begin+0xe/0x21 [2662173.991521] [] 0xffffffffffffffff [2662173.996713] Pid: 115837, comm: mdt01_051 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2662174.006707] Call Trace: [2662174.009345] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2662174.016454] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2662174.023829] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2662174.030833] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2662174.038041] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2662174.044786] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2662174.051461] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2662174.058383] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2662174.065695] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2662174.072026] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2662174.079163] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2662174.087055] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2662174.093568] [] kthread+0xd1/0xe0 [2662174.098649] [] ret_from_fork_nospec_begin+0xe/0x21 [2662174.105318] [] 0xffffffffffffffff [2662174.110500] LNet: Service thread pid 115890 was inactive for 200.63s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2662174.123640] LNet: Skipped 19 previous similar messages [2662205.124953] LustreError: dumping log to /tmp/lustre-log.1554817370.115573 [2662209.478006] Lustre: 115784:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (4/-6), not sending early reply req@ffff8ec40cc65400 x1628649154658640/t0(0) o101->eed99957-e395-8d59-f471-3be5bc5334d2@10.8.27.31@o2ib6:29/0 lens 576/3264 e 0 to 0 dl 1554817379 ref 2 fl Interpret:/0/0 rc 0/0 [2662209.507249] Lustre: 115784:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 48 previous similar messages [2662213.222417] LNet: Service thread pid 115902 completed after 239.88s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2662213.238847] LNet: Skipped 37 previous similar messages [2662235.845317] LustreError: dumping log to /tmp/lustre-log.1554817401.116301 [2662273.222754] LustreError: 115896:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554817348, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8eda23f00fc0/0xbc3294618686dd21 lrc: 3/0,1 mode: --/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 104 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 115896 timeout: 0 lvb_type: 0 [2662273.262770] LustreError: 115896:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 44 previous similar messages [2662376.338418] Lustre: fir-MDT0001: Connection restored to (at 10.8.8.1@o2ib6) [2662376.345648] Lustre: Skipped 361 previous similar messages [2662383.303023] LustreError: dumping log to /tmp/lustre-log.1554817548.115896 [2662384.839042] LustreError: dumping log to /tmp/lustre-log.1554817550.116148 [2662413.511385] LustreError: dumping log to /tmp/lustre-log.1554817579.116124 [2662416.071405] LustreError: dumping log to /tmp/lustre-log.1554817581.116254 [2662568.649209] Pid: 115822, comm: mdt01_046 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2662568.659216] Call Trace: [2662568.661862] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2662568.668990] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2662568.676389] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2662568.683400] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2662568.690577] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2662568.697337] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2662568.704010] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2662568.710960] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2662568.718241] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2662568.724602] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2662568.731747] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2662568.739642] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2662568.746172] [] kthread+0xd1/0xe0 [2662568.751263] [] ret_from_fork_nospec_begin+0xe/0x21 [2662568.757909] [] 0xffffffffffffffff [2662568.763107] LustreError: dumping log to /tmp/lustre-log.1554817734.115822 [2662568.801810] Pid: 115859, comm: mdt02_054 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2662568.811821] Call Trace: [2662568.814463] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2662568.821598] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2662568.828981] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2662568.836004] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2662568.843202] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2662568.849863] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2662568.857056] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2662568.863801] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2662568.870472] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2662568.877423] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2662568.884702] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2662568.891054] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2662568.898168] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2662568.906073] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2662568.912591] [] kthread+0xd1/0xe0 [2662568.917682] [] ret_from_fork_nospec_begin+0xe/0x21 [2662568.924343] [] 0xffffffffffffffff [2662581.605906] LustreError: 115408:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ecce7400c00 ns: mdt-fir-MDT0001_UUID lock: ffff8ec0c916ba80/0xbc32946184a0cce9 lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x1b/0x0 rrc: 92 type: IBT flags: 0x50200400000020 nid: 10.9.107.8@o2ib4 remote: 0x309944bb3b11e49e expref: 4 pid: 115408 timeout: 0 lvb_type: 0 [2662581.640955] LustreError: 115408:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 7 previous similar messages [2662581.651401] Lustre: 115408:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (246:362s); client may timeout. req@ffff8ed78dc84e00 x1628639969846288/t0(0) o101->dc990748-ca32-0960-545e-99af2316a63e@10.9.107.8@o2ib4:29/0 lens 576/1792 e 0 to 0 dl 1554817385 ref 1 fl Complete:/0/0 rc -107/-107 [2662581.680726] Lustre: 115408:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 10 previous similar messages [2662601.417591] Pid: 115887, comm: mdt01_063 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2662601.427595] Call Trace: [2662601.430238] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2662601.437373] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2662601.444735] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2662601.451739] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2662601.458920] [] mdt_object_lock+0x20/0x30 [mdt] [2662601.465224] [] mdt_hsm_state_set+0xc9/0x830 [mdt] [2662601.471792] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2662601.478909] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2662601.486811] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2662601.493313] [] kthread+0xd1/0xe0 [2662601.498416] [] ret_from_fork_nospec_begin+0xe/0x21 [2662601.505063] [] 0xffffffffffffffff [2662601.510257] LustreError: dumping log to /tmp/lustre-log.1554817767.115887 [2662601.522355] Pid: 116601, comm: mdt03_107 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2662601.532368] Call Trace: [2662601.535010] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2662601.542125] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2662601.549498] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2662601.556503] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2662601.563684] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2662601.570350] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2662601.577517] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2662601.584251] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2662601.590899] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2662601.597832] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2662601.605113] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2662601.611449] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2662601.618556] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2662601.626443] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2662601.632942] [] kthread+0xd1/0xe0 [2662601.638028] [] ret_from_fork_nospec_begin+0xe/0x21 [2662601.644677] [] 0xffffffffffffffff [2662601.649865] Pid: 115646, comm: mdt03_014 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2662601.659862] Call Trace: [2662601.662495] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2662601.669603] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2662601.676963] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2662601.683966] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2662601.691171] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2662601.697834] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2662601.705014] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2662601.711759] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2662601.718431] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2662601.725382] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2662601.732664] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2662601.739013] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2662601.746122] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2662601.754027] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2662601.760526] [] kthread+0xd1/0xe0 [2662601.765605] [] ret_from_fork_nospec_begin+0xe/0x21 [2662601.772252] [] 0xffffffffffffffff [2662615.657409] LustreError: 115348:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ec5e9bd6000 ns: mdt-fir-MDT0001_UUID lock: ffff8ecf7fb486c0/0xbc32946189c25b49 lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x1b/0x0 rrc: 71 type: IBT flags: 0x50200400000020 nid: 10.8.27.28@o2ib6 remote: 0x1df96237acfb0ce6 expref: 4 pid: 115348 timeout: 0 lvb_type: 0 [2662615.692464] LustreError: 115348:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 11 previous similar messages [2662615.702930] Lustre: 115348:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (30:35s); client may timeout. req@ffff8ec5e982a100 x1628638021341680/t0(0) o101->7d8ca85d-8b80-6a23-8fa9-83dca7eb7196@10.8.27.28@o2ib6:6/0 lens 576/1792 e 0 to 0 dl 1554817746 ref 1 fl Complete:/0/0 rc -107/-107 [2662615.732025] Lustre: 115348:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 2 previous similar messages [2662705.348557] Lustre: fir-MDT0003: Client ecd69873-9a0a-0293-0ce7-d92dc18bf8d9 (at 10.9.101.11@o2ib4) reconnecting [2662705.358933] Lustre: Skipped 345 previous similar messages [2662768.228563] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 150s: evicting client at 10.8.27.31@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ed0243dee40/0xbc3294618a5f6f86 lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 99 type: IBT flags: 0x60200400000020 nid: 10.8.27.31@o2ib6 remote: 0x99d3453de1b4dcb1 expref: 71 pid: 115967 timeout: 2662747 lvb_type: 0 [2662768.266929] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 6 previous similar messages [2662818.508172] LNet: Service thread pid 115878 was inactive for 200.06s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2662818.521300] LNet: Skipped 46 previous similar messages [2662818.526620] LustreError: dumping log to /tmp/lustre-log.1554817984.115878 [2662918.230545] LustreError: 115764:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ed28c58e000 ns: mdt-fir-MDT0001_UUID lock: ffff8eb7276c7080/0xbc3294618a5f8c74 lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 96 type: IBT flags: 0x50200400000020 nid: 10.8.27.31@o2ib6 remote: 0x99d3453de1b4dcb8 expref: 4 pid: 115764 timeout: 0 lvb_type: 0 [2662918.265603] LustreError: 115764:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 2 previous similar messages [2662918.276000] Lustre: 115764:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (154:146s); client may timeout. req@ffff8ed28c7c8600 x1628649154744320/t0(0) o101->eed99957-e395-8d59-f471-3be5bc5334d2@10.8.27.31@o2ib6:13/0 lens 480/536 e 0 to 0 dl 1554817937 ref 1 fl Complete:/0/0 rc -107/-107 [2662918.305364] LNet: Service thread pid 115764 completed after 299.87s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2662918.321781] LNet: Skipped 47 previous similar messages [2662947.533674] Lustre: 115778:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ec40cc17800 x1628574041422960/t0(0) o101->40dfa1b3-0e1e-4763-9ec6-4ac10d77215c@10.8.27.26@o2ib6:18/0 lens 576/3264 e 0 to 0 dl 1554818118 ref 2 fl Interpret:/0/0 rc 0/0 [2662947.562948] Lustre: 115778:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 82 previous similar messages [2662969.037917] LNet: Service thread pid 115715 was inactive for 200.75s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2662969.055114] LNet: Skipped 9 previous similar messages [2662969.060344] Pid: 115715, comm: mdt01_030 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2662969.070373] Call Trace: [2662969.073012] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2662969.080117] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2662969.087514] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2662969.094522] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2662969.101728] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2662969.108389] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2662969.115578] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2662969.122340] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2662969.129014] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2662969.135954] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2662969.143251] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2662969.149619] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2662969.156742] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2662969.164643] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2662969.171146] [] kthread+0xd1/0xe0 [2662969.176233] [] ret_from_fork_nospec_begin+0xe/0x21 [2662969.182894] [] 0xffffffffffffffff [2662969.188097] LustreError: dumping log to /tmp/lustre-log.1554818134.115715 [2662969.226499] Pid: 115759, comm: mdt01_038 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2662969.236494] Call Trace: [2662969.239138] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2662969.246250] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2662969.253648] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2662969.260656] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2662969.267845] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2662969.274504] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2662969.281696] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2662969.288442] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2662969.295112] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2662969.302054] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2662969.309352] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2662969.315689] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2662969.322812] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2662969.330702] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2662969.337214] [] kthread+0xd1/0xe0 [2662969.342305] [] ret_from_fork_nospec_begin+0xe/0x21 [2662969.348952] [] 0xffffffffffffffff [2662969.354159] Pid: 115917, comm: mdt01_069 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2662969.364172] Call Trace: [2662969.366804] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2662969.373903] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2662969.381286] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2662969.388305] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2662969.395486] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2662969.402152] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2662969.409333] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2662969.416090] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2662969.422750] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2662969.429690] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2662969.436972] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2662969.443332] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2662969.450441] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2662969.458361] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2662969.464862] [] kthread+0xd1/0xe0 [2662969.469948] [] ret_from_fork_nospec_begin+0xe/0x21 [2662969.476589] [] 0xffffffffffffffff [2662974.157998] Pid: 116310, comm: mdt02_103 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2662974.168007] Call Trace: [2662974.170653] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2662974.177777] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2662974.185160] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2662974.192169] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2662974.199349] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2662974.206004] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2662974.213180] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2662974.219924] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2662974.226588] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2662974.233531] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2662974.240840] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2662974.247187] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2662974.254320] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2662974.262217] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2662974.268742] [] kthread+0xd1/0xe0 [2662974.273828] [] ret_from_fork_nospec_begin+0xe/0x21 [2662974.280491] [] 0xffffffffffffffff [2662974.285692] LustreError: dumping log to /tmp/lustre-log.1554818139.116310 [2662974.293933] Pid: 115724, comm: mdt01_033 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2662974.303937] Call Trace: [2662974.306572] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2662974.313695] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2662974.321070] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2662974.328104] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2662974.335305] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2662974.342055] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2662974.348711] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2662974.355644] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2662974.362925] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2662974.369260] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2662974.376383] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2662974.384278] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2662974.390806] [] kthread+0xd1/0xe0 [2662974.395894] [] ret_from_fork_nospec_begin+0xe/0x21 [2662974.402557] [] 0xffffffffffffffff [2662984.367837] Lustre: fir-MDT0003: Connection restored to (at 10.9.101.11@o2ib4) [2662984.375325] Lustre: Skipped 353 previous similar messages [2663012.511431] LustreError: 115837:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554818088, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8eb91713f980/0xbc3294618d279b0b lrc: 3/1,0 mode: --/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x13/0x8 rrc: 96 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 115837 timeout: 0 lvb_type: 0 [2663012.551360] LustreError: 115837:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 59 previous similar messages [2663059.116190] LustreError: 116121:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ec34caff800 ns: mdt-fir-MDT0001_UUID lock: ffff8ed0243df740/0xbc3294618a5f8f53 lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 85 type: IBT flags: 0x50200400000020 nid: 10.8.27.26@o2ib6 remote: 0x3a52d8e5f90f5882 expref: 4 pid: 116121 timeout: 0 lvb_type: 0 [2663059.151284] Lustre: 116121:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (309:132s); client may timeout. req@ffff8ec2a84a0900 x1628574041384928/t0(0) o101->40dfa1b3-0e1e-4763-9ec6-4ac10d77215c@10.8.27.26@o2ib6:13/0 lens 480/536 e 0 to 0 dl 1554818092 ref 1 fl Complete:/0/0 rc -107/-107 [2663059.180557] Lustre: 116121:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 2 previous similar messages [2663260.369343] LustreError: dumping log to /tmp/lustre-log.1554818425.115866 [2663260.881341] LustreError: dumping log to /tmp/lustre-log.1554818426.115764 [2663308.007185] Lustre: fir-MDT0001: Client ceea938e-6a91-5e80-97b6-3160b0748724 (at 10.8.27.30@o2ib6) reconnecting [2663308.017445] Lustre: Skipped 361 previous similar messages [2663409.363065] Pid: 116593, comm: mdt03_104 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2663409.373068] Call Trace: [2663409.375712] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2663409.382841] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2663409.390207] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2663409.397227] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2663409.404402] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2663409.411061] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2663409.418252] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2663409.425005] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2663409.431661] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2663409.438615] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2663409.445929] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2663409.452296] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2663409.459437] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2663409.467350] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2663409.473869] [] kthread+0xd1/0xe0 [2663409.478973] [] ret_from_fork_nospec_begin+0xe/0x21 [2663409.485635] [] 0xffffffffffffffff [2663409.490840] LustreError: dumping log to /tmp/lustre-log.1554818575.116593 [2663409.525353] Pid: 115588, comm: mdt00_010 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2663409.535356] Call Trace: [2663409.537996] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2663409.545135] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2663409.552518] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2663409.559536] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2663409.566729] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2663409.573389] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2663409.580564] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2663409.587306] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2663409.593992] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2663409.600932] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2663409.608226] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2663409.614572] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2663409.621703] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2663409.629624] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2663409.636140] [] kthread+0xd1/0xe0 [2663409.641244] [] ret_from_fork_nospec_begin+0xe/0x21 [2663409.647910] [] 0xffffffffffffffff [2663415.507138] Pid: 116207, comm: mdt01_111 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2663415.517142] Call Trace: [2663415.519786] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2663415.526907] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2663415.534291] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2663415.541293] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2663415.548481] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2663415.555230] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2663415.561877] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2663415.568811] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2663415.576091] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2663415.582444] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2663415.589550] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2663415.597454] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2663415.603955] [] kthread+0xd1/0xe0 [2663415.609061] [] ret_from_fork_nospec_begin+0xe/0x21 [2663415.615726] [] 0xffffffffffffffff [2663415.620951] LustreError: dumping log to /tmp/lustre-log.1554818581.116207 [2663415.629173] Pid: 116301, comm: mdt02_094 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2663415.639197] Call Trace: [2663415.641850] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2663415.648962] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2663415.656351] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2663415.663357] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2663415.670561] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2663415.677214] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2663415.684391] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2663415.691132] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2663415.697814] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2663415.704750] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2663415.712046] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2663415.718382] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2663415.725513] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2663415.733411] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2663415.739911] [] kthread+0xd1/0xe0 [2663415.745015] [] ret_from_fork_nospec_begin+0xe/0x21 [2663415.751698] [] 0xffffffffffffffff [2663509.237244] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 150s: evicting client at 10.8.27.31@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ed071ffee40/0xbc3294618e66995f lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 96 type: IBT flags: 0x60200400000020 nid: 10.8.27.31@o2ib6 remote: 0x99d3453de1b4f0ca expref: 34 pid: 115406 timeout: 2663488 lvb_type: 0 [2663509.275591] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 3 previous similar messages [2663550.292716] Lustre: 115796:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-17), not sending early reply req@ffff8edb63362100 x1628650695461392/t0(0) o101->fa7691ef-d46e-650c-947f-cea897c9625f@10.8.17.18@o2ib6:20/0 lens 568/0 e 0 to 0 dl 1554818720 ref 2 fl Interpret:/0/ffffffff rc 0/-1 [2663550.322502] Lustre: 115796:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 44 previous similar messages [2663564.500871] Pid: 116148, comm: mdt01_097 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2663564.510882] Call Trace: [2663564.513527] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2663564.520654] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2663564.528022] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2663564.535024] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2663564.542191] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2663564.548935] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2663564.555591] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2663564.562525] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2663564.569821] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2663564.576158] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2663564.583291] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2663564.591179] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2663564.597703] [] kthread+0xd1/0xe0 [2663564.602790] [] ret_from_fork_nospec_begin+0xe/0x21 [2663564.609454] [] 0xffffffffffffffff [2663564.614648] LustreError: dumping log to /tmp/lustre-log.1554818730.116148 [2663564.651615] LNet: Service thread pid 115750 was inactive for 200.46s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2663564.664771] LNet: Skipped 61 previous similar messages [2663587.017189] Lustre: fir-MDT0001: Connection restored to (at 10.9.107.5@o2ib4) [2663587.024602] Lustre: Skipped 363 previous similar messages [2663662.239207] LNet: Service thread pid 115871 completed after 602.21s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2663662.255626] LNet: Skipped 42 previous similar messages [2663713.494583] LNet: Service thread pid 115710 was inactive for 200.26s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2663713.511779] LNet: Skipped 9 previous similar messages [2663713.517012] Pid: 115710, comm: mdt00_028 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2663713.527010] Call Trace: [2663713.529651] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2663713.536783] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2663713.544149] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2663713.551184] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2663713.558363] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2663713.565033] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2663713.572212] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2663713.578971] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2663713.585628] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2663713.592556] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2663713.599841] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2663713.606192] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2663713.613303] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2663713.621203] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2663713.627706] [] kthread+0xd1/0xe0 [2663713.632808] [] ret_from_fork_nospec_begin+0xe/0x21 [2663713.639457] [] 0xffffffffffffffff [2663713.644661] LustreError: dumping log to /tmp/lustre-log.1554818879.115710 [2663713.682085] Pid: 115837, comm: mdt01_051 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2663713.692086] Call Trace: [2663713.694726] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2663713.701849] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2663713.709234] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2663713.716239] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2663713.723428] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2663713.730089] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2663713.737285] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2663713.744063] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2663713.750725] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2663713.757665] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2663713.764959] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2663713.771298] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2663713.778413] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2663713.786315] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2663713.792820] [] kthread+0xd1/0xe0 [2663713.797914] [] ret_from_fork_nospec_begin+0xe/0x21 [2663713.804560] [] 0xffffffffffffffff [2663713.809805] Pid: 115822, comm: mdt01_046 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2663713.819807] Call Trace: [2663713.822445] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2663713.829572] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2663713.836956] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2663713.843961] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2663713.851150] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2663713.857807] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2663713.865000] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2663713.871745] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2663713.878428] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2663713.885352] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2663713.892631] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2663713.898967] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2663713.906097] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2663713.913987] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2663713.920506] [] kthread+0xd1/0xe0 [2663713.925592] [] ret_from_fork_nospec_begin+0xe/0x21 [2663713.932238] [] 0xffffffffffffffff [2663713.937418] Pid: 115890, comm: mdt01_064 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2663713.947449] Call Trace: [2663713.950080] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2663713.957192] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2663713.964559] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2663713.971594] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2663713.978772] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2663713.985440] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2663713.992621] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2663713.999363] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2663714.006020] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2663714.012968] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2663714.020251] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2663714.026601] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2663714.033709] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2663714.041611] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2663714.048114] [] kthread+0xd1/0xe0 [2663714.053216] [] ret_from_fork_nospec_begin+0xe/0x21 [2663714.059868] [] 0xffffffffffffffff [2663714.065076] Pid: 115917, comm: mdt01_069 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2663714.075078] Call Trace: [2663714.077717] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2663714.084841] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2663714.092210] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2663714.099230] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2663714.106409] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2663714.113176] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2663714.119834] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2663714.126780] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2663714.134062] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2663714.140404] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2663714.147528] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2663714.155433] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2663714.161934] [] kthread+0xd1/0xe0 [2663714.167037] [] ret_from_fork_nospec_begin+0xe/0x21 [2663714.173685] [] 0xffffffffffffffff [2663752.425045] LustreError: 115715:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554818827, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ecdc6f69b00/0xbc32946193de5edb lrc: 3/1,0 mode: --/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x13/0x8 rrc: 87 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 115715 timeout: 0 lvb_type: 0 [2663752.464972] LustreError: 115715:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 44 previous similar messages [2663842.241494] LustreError: 115791:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ec3690bf000 ns: mdt-fir-MDT0001_UUID lock: ffff8eb2f2d14ec0/0xbc3294618e66a694 lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x1b/0x0 rrc: 81 type: IBT flags: 0x50200400000020 nid: 10.8.17.17@o2ib6 remote: 0xe31ee7d826760f0e expref: 4 pid: 115791 timeout: 0 lvb_type: 0 [2663842.276558] LustreError: 115791:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 2 previous similar messages [2663842.286963] Lustre: 115791:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (774:8s); client may timeout. req@ffff8ebab3c6cb00 x1628649281609808/t0(0) o101->2c78cdad-2975-ca98-fb36-e7548576f834@10.8.17.17@o2ib6:5/0 lens 576/1792 e 0 to 0 dl 1554818999 ref 1 fl Complete:/0/0 rc -107/-107 [2663842.316050] Lustre: 115791:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 8 previous similar messages [2663914.460983] Lustre: fir-MDT0003: Client ecd69873-9a0a-0293-0ce7-d92dc18bf8d9 (at 10.9.101.11@o2ib4) reconnecting [2663914.471332] Lustre: Skipped 359 previous similar messages [2663993.243067] LustreError: 115715:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8eb67afacc00 ns: mdt-fir-MDT0001_UUID lock: ffff8ecf56801680/0xbc3294619587eaf4 lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 91 type: IBT flags: 0x50200000000000 nid: 10.9.107.8@o2ib4 remote: 0x309944bb3b124e07 expref: 82 pid: 115715 timeout: 0 lvb_type: 0 [2663993.278225] LustreError: 115715:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 11 previous similar messages [2664173.244975] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 150s: evicting client at 10.8.27.26@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8edfcc1a5580/0xbc32946196e5191e lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 89 type: IBT flags: 0x60200400000020 nid: 10.8.27.26@o2ib6 remote: 0x3a52d8e5f90fa4b7 expref: 55 pid: 115791 timeout: 2664152 lvb_type: 0 [2664173.283327] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 5 previous similar messages [2664174.414748] LustreError: 116207:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ec61c86ac00 ns: mdt-fir-MDT0001_UUID lock: ffff8ecbe22db600/0xbc32946196e52dd1 lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x1b/0x0 rrc: 83 type: IBT flags: 0x50200400000020 nid: 10.8.27.26@o2ib6 remote: 0x3a52d8e5f90fa597 expref: 7 pid: 116207 timeout: 0 lvb_type: 0 [2664174.450015] Lustre: 115948:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (30:150s); client may timeout. req@ffff8eb83fffcb00 x1628647349102064/t0(0) o101->eeceaf0a-f64b-44e4-4f28-fac655ebb0a4@10.8.17.19@o2ib6:9/0 lens 568/2296 e 0 to 0 dl 1554819189 ref 1 fl Complete:/0/0 rc -107/-107 [2664193.480113] Lustre: fir-MDT0003: Connection restored to (at 10.9.101.11@o2ib4) [2664193.487602] Lustre: Skipped 330 previous similar messages [2664199.836284] Lustre: 115714:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8eeab5360000 x1628546090852864/t0(0) o101->ac4b3f39-4aa4-91f4-89f9-e38a651f2f99@10.9.107.5@o2ib4:10/0 lens 480/568 e 0 to 0 dl 1554819370 ref 2 fl Interpret:/0/0 rc 0/0 [2664199.865458] Lustre: 115714:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 70 previous similar messages [2664206.140216] LustreError: 115625:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ef0093fcc00 ns: mdt-fir-MDT0001_UUID lock: ffff8ecd7b63a1c0/0xbc3294619887c02d lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x1b/0x0 rrc: 82 type: IBT flags: 0x50200400000020 nid: 10.8.8.1@o2ib6 remote: 0xbeca8e2774bd8afe expref: 4 pid: 115625 timeout: 0 lvb_type: 0 [2664206.175097] LustreError: 115625:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 3 previous similar messages [2664206.185521] Lustre: 115625:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (30:1s); client may timeout. req@ffff8ebab3df0600 x1628647039231728/t0(0) o101->535a7c41-41b9-ec51-bbd6-e028c4ebcb2c@10.8.8.1@o2ib6:10/0 lens 576/1792 e 0 to 0 dl 1554819370 ref 1 fl Complete:/0/0 rc -107/-107 [2664206.214460] Lustre: 115625:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 1 previous similar message [2664407.774720] LNet: Service thread pid 115621 was inactive for 200.65s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2664407.791950] LNet: Skipped 4 previous similar messages [2664407.797185] Pid: 115621, comm: mdt01_022 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2664407.807241] Call Trace: [2664407.809883] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2664407.817030] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2664407.824414] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2664407.831451] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2664407.838652] [] mdt_object_lock+0x20/0x30 [mdt] [2664407.844975] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2664407.851495] [] mdt_intent_brw+0x1f/0x30 [mdt] [2664407.857729] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2664407.864429] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2664407.871386] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2664407.878677] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2664407.885046] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2664407.892172] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2664407.900106] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2664407.906638] [] kthread+0xd1/0xe0 [2664407.911762] [] ret_from_fork_nospec_begin+0xe/0x21 [2664407.918421] [] 0xffffffffffffffff [2664407.923653] LustreError: dumping log to /tmp/lustre-log.1554819573.115621 [2664408.079985] Pid: 115948, comm: mdt00_073 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2664408.089984] Call Trace: [2664408.092628] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2664408.099785] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2664408.107181] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2664408.114236] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2664408.121440] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2664408.128193] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2664408.134874] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2664408.141821] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2664408.149145] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2664408.155519] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2664408.162642] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2664408.170559] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2664408.177076] [] kthread+0xd1/0xe0 [2664408.182191] [] ret_from_fork_nospec_begin+0xe/0x21 [2664408.188882] [] 0xffffffffffffffff [2664408.194108] Pid: 115690, comm: mdt03_016 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2664408.204140] Call Trace: [2664408.206788] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2664408.213919] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2664408.221333] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2664408.228346] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2664408.235562] [] mdt_object_lock+0x20/0x30 [mdt] [2664408.241890] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2664408.248426] [] mdt_intent_brw+0x1f/0x30 [mdt] [2664408.254660] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2664408.261338] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2664408.268308] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2664408.275611] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2664408.281983] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2664408.289126] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2664408.297048] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2664408.303591] [] kthread+0xd1/0xe0 [2664408.308707] [] ret_from_fork_nospec_begin+0xe/0x21 [2664408.315401] [] 0xffffffffffffffff [2664408.320611] Pid: 115850, comm: mdt02_052 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2664408.330625] Call Trace: [2664408.333289] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2664408.340410] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2664408.347804] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2664408.354822] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2664408.362026] [] mdt_object_lock+0x20/0x30 [mdt] [2664408.368349] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2664408.374871] [] mdt_intent_brw+0x1f/0x30 [mdt] [2664408.381112] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2664408.387801] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2664408.394767] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2664408.402072] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2664408.408427] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2664408.415584] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2664408.423491] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2664408.430029] [] kthread+0xd1/0xe0 [2664408.435130] [] ret_from_fork_nospec_begin+0xe/0x21 [2664408.441795] [] 0xffffffffffffffff [2664408.447029] Pid: 115865, comm: mdt02_056 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2664408.457036] Call Trace: [2664408.459672] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2664408.466817] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2664408.474206] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2664408.481235] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2664408.488430] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2664408.495109] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2664408.502303] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2664408.509061] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2664408.515736] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2664408.522701] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2664408.530001] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2664408.536347] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2664408.543480] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2664408.551377] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2664408.557895] [] kthread+0xd1/0xe0 [2664408.563003] [] ret_from_fork_nospec_begin+0xe/0x21 [2664408.569670] [] 0xffffffffffffffff [2664408.574883] LNet: Service thread pid 115653 was inactive for 201.44s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2664408.588019] LNet: Skipped 1 previous similar message [2664452.196248] LustreError: 115813:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554819527, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ebc012d9b00/0xbc3294619a40ae59 lrc: 3/1,0 mode: --/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 78 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 115813 timeout: 0 lvb_type: 0 [2664452.236161] LustreError: 115813:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 82 previous similar messages [2664517.125351] Lustre: fir-MDT0001: Client 535a7c41-41b9-ec51-bbd6-e028c4ebcb2c (at 10.8.8.1@o2ib6) reconnecting [2664517.135441] Lustre: Skipped 309 previous similar messages [2664536.249469] LNet: Service thread pid 115568 completed after 329.12s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2664536.265891] LNet: Skipped 33 previous similar messages [2664562.400542] LustreError: dumping log to /tmp/lustre-log.1554819727.116303 [2664593.632920] LustreError: dumping log to /tmp/lustre-log.1554819759.115871 [2664742.626728] Pid: 115942, comm: mdt01_076 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2664742.636737] Call Trace: [2664742.639377] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2664742.646496] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2664742.653863] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2664742.660866] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2664742.668043] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2664742.674786] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2664742.681474] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2664742.688410] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2664742.695707] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2664742.702051] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2664742.709191] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2664742.717080] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2664742.723596] [] kthread+0xd1/0xe0 [2664742.728683] [] ret_from_fork_nospec_begin+0xe/0x21 [2664742.735331] [] 0xffffffffffffffff [2664742.740551] LustreError: dumping log to /tmp/lustre-log.1554819908.115942 [2664742.772719] Pid: 116295, comm: mdt00_109 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2664742.782714] Call Trace: [2664742.785357] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2664742.792471] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2664742.799852] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2664742.806860] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2664742.814058] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2664742.820720] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2664742.827918] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2664742.834661] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2664742.841334] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2664742.848269] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2664742.855578] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2664742.861928] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2664742.869059] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2664742.876947] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2664742.883456] [] kthread+0xd1/0xe0 [2664742.888543] [] ret_from_fork_nospec_begin+0xe/0x21 [2664742.895207] [] 0xffffffffffffffff [2664779.491168] Pid: 115695, comm: mdt01_029 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2664779.501174] Call Trace: [2664779.503822] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2664779.510945] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2664779.518343] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2664779.525357] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2664779.532549] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2664779.539294] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2664779.545966] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2664779.552901] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2664779.560194] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2664779.566534] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2664779.573648] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2664779.581545] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2664779.588053] [] kthread+0xd1/0xe0 [2664779.593139] [] ret_from_fork_nospec_begin+0xe/0x21 [2664779.599804] [] 0xffffffffffffffff [2664779.605000] LustreError: dumping log to /tmp/lustre-log.1554819945.115695 [2664779.620353] Pid: 115959, comm: mdt01_081 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2664779.630391] Call Trace: [2664779.633024] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2664779.640140] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2664779.647521] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2664779.654527] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2664779.661716] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2664779.668376] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2664779.675567] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2664779.682313] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2664779.688985] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2664779.695920] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2664779.703230] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2664779.709569] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2664779.716690] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2664779.724581] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2664779.731094] [] kthread+0xd1/0xe0 [2664779.736185] [] ret_from_fork_nospec_begin+0xe/0x21 [2664779.742831] [] 0xffffffffffffffff [2664796.131224] Lustre: fir-MDT0001: Connection restored to 7d8ca85d-8b80-6a23-8fa9-83dca7eb7196 (at 10.8.27.28@o2ib6) [2664796.141745] Lustre: Skipped 339 previous similar messages [2664866.253204] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 150s: evicting client at 10.8.27.26@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ec74105da00/0xbc32946198d26b89 lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 75 type: IBT flags: 0x60200400000020 nid: 10.8.27.26@o2ib6 remote: 0x3a52d8e5f90fc595 expref: 54 pid: 115621 timeout: 2664845 lvb_type: 0 [2664866.291551] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 6 previous similar messages [2664901.530620] Lustre: 115348:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ec75a7fec00 x1628647039330656/t0(0) o101->535a7c41-41b9-ec51-bbd6-e028c4ebcb2c@10.8.8.1@o2ib6:22/0 lens 568/0 e 0 to 0 dl 1554820072 ref 2 fl Interpret:/0/ffffffff rc 0/-1 [2664901.560142] Lustre: 115348:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 60 previous similar messages [2664928.996939] Pid: 115601, comm: mdt01_015 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2664929.006942] Call Trace: [2664929.009583] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2664929.016697] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2664929.024091] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2664929.031092] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2664929.038267] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2664929.045011] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2664929.051699] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2664929.058636] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2664929.065917] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2664929.072268] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2664929.079375] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2664929.087277] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2664929.093778] [] kthread+0xd1/0xe0 [2664929.098866] [] ret_from_fork_nospec_begin+0xe/0x21 [2664929.105521] [] 0xffffffffffffffff [2664929.110715] LustreError: dumping log to /tmp/lustre-log.1554820094.115601 [2665076.256238] LustreError: 115915:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ebc902b6400 ns: mdt-fir-MDT0001_UUID lock: ffff8ec72f29c5c0/0xbc32946198d2715c lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x1b/0x0 rrc: 64 type: IBT flags: 0x50200400000020 nid: 10.8.17.17@o2ib6 remote: 0xe31ee7d826766c68 expref: 4 pid: 115915 timeout: 0 lvb_type: 0 [2665076.291313] LustreError: 115915:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 1 previous similar message [2665076.301630] Lustre: 115915:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (867:2s); client may timeout. req@ffff8eb9975ad100 x1628649281793392/t0(0) o101->2c78cdad-2975-ca98-fb36-e7548576f834@10.8.17.17@o2ib6:12/0 lens 576/1792 e 0 to 0 dl 1554820239 ref 1 fl Complete:/0/0 rc -107/-107 [2665076.330790] Lustre: 115915:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 9 previous similar messages [2665110.624454] LustreError: 116295:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ec1f0d28c00 ns: mdt-fir-MDT0001_UUID lock: ffff8eb6ce239f80/0xbc329461a0ba4a08 lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 57 type: IBT flags: 0x50200000000000 nid: 10.8.17.15@o2ib6 remote: 0x794f201b734663bf expref: 2 pid: 116295 timeout: 0 lvb_type: 0 [2665110.659510] LustreError: 116295:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 10 previous similar messages [2665110.670135] Lustre: 116295:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (30:2s); client may timeout. req@ffff8eb83fe98000 x1628593190717408/t0(0) o101->bf6bd63b-9206-2f81-2780-2b483791a8c1@10.8.17.15@o2ib6:14/0 lens 568/2296 e 0 to 0 dl 1554820274 ref 1 fl Complete:/0/0 rc -107/-107 [2665123.628067] Lustre: fir-MDT0003: Client ecd69873-9a0a-0293-0ce7-d92dc18bf8d9 (at 10.9.101.11@o2ib4) reconnecting [2665123.638422] Lustre: Skipped 335 previous similar messages [2665201.859144] LustreError: 115653:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554820277, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ebddfbae540/0xbc329461a103684e lrc: 3/0,1 mode: --/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 65 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 115653 timeout: 0 lvb_type: 0 [2665201.899067] LustreError: 115653:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 24 previous similar messages [2665311.977455] LNet: Service thread pid 115890 was inactive for 200.05s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2665311.994652] LNet: Skipped 9 previous similar messages [2665311.999884] Pid: 115890, comm: mdt01_064 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2665312.009902] Call Trace: [2665312.012546] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2665312.019690] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2665312.027078] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2665312.034111] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2665312.041324] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2665312.047981] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2665312.055157] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2665312.061901] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2665312.068579] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2665312.075524] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2665312.082829] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2665312.089182] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2665312.096322] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2665312.104220] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2665312.110727] [] kthread+0xd1/0xe0 [2665312.115836] [] ret_from_fork_nospec_begin+0xe/0x21 [2665312.122511] [] 0xffffffffffffffff [2665312.127710] LustreError: dumping log to /tmp/lustre-log.1554820477.115890 [2665312.222996] Pid: 116191, comm: mdt01_107 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2665312.232995] Call Trace: [2665312.235635] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2665312.242766] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2665312.250132] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2665312.257143] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2665312.264339] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2665312.271002] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2665312.278178] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2665312.284921] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2665312.291579] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2665312.298536] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2665312.305817] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2665312.312161] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2665312.319296] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2665312.327187] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2665312.333723] [] kthread+0xd1/0xe0 [2665312.338809] [] ret_from_fork_nospec_begin+0xe/0x21 [2665312.345457] [] 0xffffffffffffffff [2665312.350657] Pid: 115653, comm: mdt00_022 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2665312.360714] Call Trace: [2665312.363355] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2665312.370462] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2665312.377833] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2665312.384849] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2665312.392027] [] mdt_object_lock+0x20/0x30 [mdt] [2665312.398361] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2665312.404867] [] mdt_intent_brw+0x1f/0x30 [mdt] [2665312.411091] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2665312.417779] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2665312.424764] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2665312.432066] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2665312.438419] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2665312.445532] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2665312.453419] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2665312.459919] [] kthread+0xd1/0xe0 [2665312.465007] [] ret_from_fork_nospec_begin+0xe/0x21 [2665312.471655] [] 0xffffffffffffffff [2665312.476884] Pid: 116306, comm: mdt02_099 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2665312.486884] Call Trace: [2665312.489524] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2665312.496631] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2665312.504000] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2665312.511001] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2665312.518176] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2665312.524835] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2665312.532015] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2665312.538761] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2665312.545418] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2665312.552350] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2665312.559630] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2665312.565965] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2665312.573074] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2665312.580960] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2665312.587483] [] kthread+0xd1/0xe0 [2665312.592574] [] ret_from_fork_nospec_begin+0xe/0x21 [2665312.599236] [] 0xffffffffffffffff [2665312.604457] Pid: 115939, comm: mdt01_075 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2665312.614454] Call Trace: [2665312.617094] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2665312.624197] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2665312.631565] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2665312.638585] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2665312.645764] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2665312.652447] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2665312.659639] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2665312.666381] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2665312.673037] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2665312.680002] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2665312.687288] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2665312.693645] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2665312.700778] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2665312.708665] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2665312.715171] [] kthread+0xd1/0xe0 [2665312.720261] [] ret_from_fork_nospec_begin+0xe/0x21 [2665312.726908] [] 0xffffffffffffffff [2665312.732108] LNet: Service thread pid 116148 was inactive for 200.84s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2665312.745228] LNet: Skipped 20 previous similar messages [2665341.673802] LustreError: dumping log to /tmp/lustre-log.1554820507.116166 [2665402.647305] Lustre: fir-MDT0003: Connection restored to (at 10.9.101.11@o2ib4) [2665402.654798] Lustre: Skipped 320 previous similar messages [2665441.260197] LNet: Service thread pid 115817 completed after 329.39s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2665441.276638] LNet: Skipped 30 previous similar messages [2665492.203572] LustreError: dumping log to /tmp/lustre-log.1554820657.115865 [2665591.261747] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 150s: evicting client at 10.9.107.8@o2ib4 ns: mdt-fir-MDT0001_UUID lock: ffff8ecfbfb98fc0/0xbc329461a10367ec lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 62 type: IBT flags: 0x60200400000020 nid: 10.9.107.8@o2ib4 remote: 0x309944bb3b12c3b7 expref: 137 pid: 115817 timeout: 2665570 lvb_type: 0 [2665591.300203] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 7 previous similar messages [2665621.229099] Lustre: 115582:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8edf633fd700 x1628556958770832/t0(0) o101->4cd8fde3-ab19-6a6b-a7ee-5d70c4bd9893@10.8.8.6@o2ib6:21/0 lens 568/0 e 0 to 0 dl 1554820791 ref 2 fl Interpret:/0/ffffffff rc 0/-1 [2665621.258630] Lustre: 115582:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 55 previous similar messages [2665647.341407] Pid: 115878, comm: mdt01_062 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2665647.351412] Call Trace: [2665647.354055] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2665647.361201] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2665647.368583] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2665647.375589] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2665647.382775] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2665647.389438] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2665647.396616] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2665647.403357] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2665647.410021] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2665647.416972] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2665647.424252] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2665647.430598] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2665647.437737] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2665647.445632] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2665647.452132] [] kthread+0xd1/0xe0 [2665647.457219] [] ret_from_fork_nospec_begin+0xe/0x21 [2665647.463901] [] 0xffffffffffffffff [2665647.469109] LustreError: dumping log to /tmp/lustre-log.1554820812.115878 [2665647.508635] Pid: 115870, comm: mdt00_048 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2665647.518636] Call Trace: [2665647.521280] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2665647.528428] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2665647.535810] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2665647.542830] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2665647.550007] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2665647.556751] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2665647.563405] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2665647.570355] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2665647.577637] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2665647.583972] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2665647.591104] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2665647.598993] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2665647.605524] [] kthread+0xd1/0xe0 [2665647.610612] [] ret_from_fork_nospec_begin+0xe/0x21 [2665647.617261] [] 0xffffffffffffffff [2665726.037210] Lustre: fir-MDT0001: Client 7d8ca85d-8b80-6a23-8fa9-83dca7eb7196 (at 10.8.27.28@o2ib6) reconnecting [2665726.047474] Lustre: Skipped 280 previous similar messages [2665796.336161] Pid: 115621, comm: mdt01_022 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2665796.346172] Call Trace: [2665796.348810] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2665796.355982] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2665796.363406] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2665796.370420] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2665796.377629] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2665796.384299] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2665796.391522] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2665796.398276] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2665796.404968] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2665796.411917] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2665796.419225] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2665796.425582] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2665796.432739] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2665796.440660] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2665796.447175] [] kthread+0xd1/0xe0 [2665796.452292] [] ret_from_fork_nospec_begin+0xe/0x21 [2665796.458952] [] 0xffffffffffffffff [2665796.464193] LustreError: dumping log to /tmp/lustre-log.1554820961.115621 [2665796.497576] Pid: 115764, comm: mdt02_042 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2665796.507576] Call Trace: [2665796.510219] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2665796.517384] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2665796.524809] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2665796.531821] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2665796.539024] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2665796.545772] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2665796.552484] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2665796.559438] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2665796.566738] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2665796.573087] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2665796.580214] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2665796.588135] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2665796.594682] [] kthread+0xd1/0xe0 [2665796.599799] [] ret_from_fork_nospec_begin+0xe/0x21 [2665796.606471] [] 0xffffffffffffffff [2665835.227634] LustreError: 115948:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554820910, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ebba9ed3840/0xbc329461a6dcac4a lrc: 3/1,0 mode: --/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 62 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 115948 timeout: 0 lvb_type: 0 [2665835.267568] LustreError: 115948:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 13 previous similar messages [2665935.491833] Lustre: fir-MDT0003: haven't heard from client 83d4cdc2-b558-14f9-d9fc-b557eebe79ce (at 10.9.101.47@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8eb79896f800, cur 1554821101 expire 1554820951 last 1554820874 [2665935.513801] Lustre: Skipped 1 previous similar message [2665945.840935] LNet: Service thread pid 115948 was inactive for 200.61s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2665945.858129] LNet: Skipped 8 previous similar messages [2665945.863367] Pid: 115948, comm: mdt00_073 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2665945.873401] Call Trace: [2665945.876062] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2665945.883191] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2665945.890600] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2665945.897618] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2665945.904813] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2665945.911580] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2665945.918238] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2665945.925191] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2665945.932515] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2665945.938879] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2665945.946016] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2665945.953928] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2665945.960438] [] kthread+0xd1/0xe0 [2665945.965524] [] ret_from_fork_nospec_begin+0xe/0x21 [2665945.972190] [] 0xffffffffffffffff [2665945.977398] LustreError: dumping log to /tmp/lustre-log.1554821111.115948 [2665946.019061] LNet: Service thread pid 115784 was inactive for 200.78s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2665946.032183] LNet: Skipped 15 previous similar messages [2665947.493325] Lustre: fir-MDT0001: haven't heard from client 83d4cdc2-b558-14f9-d9fc-b557eebe79ce (at 10.9.101.47@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ee6b8bae400, cur 1554821113 expire 1554820963 last 1554820886 [2665947.515297] Lustre: Skipped 2 previous similar messages [2665951.266422] Lustre: 115871:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (185:654s); client may timeout. req@ffff8eccee638900 x1628638021805840/t0(0) o55->7d8ca85d-8b80-6a23-8fa9-83dca7eb7196@10.8.27.28@o2ib6:17/0 lens 472/192 e 0 to 0 dl 1554820462 ref 1 fl Complete:/0/0 rc -22/-22 [2665951.266467] LustreError: 115659:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8eca0b346400 ns: mdt-fir-MDT0001_UUID lock: ffff8ecdba659d40/0xbc329461a84761b4 lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 52 type: IBT flags: 0x50200000000000 nid: 10.9.107.5@o2ib4 remote: 0xb9c41693abb0cf86 expref: 2 pid: 115659 timeout: 0 lvb_type: 0 [2665951.330511] Lustre: 115871:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 7 previous similar messages [2666012.450474] Lustre: fir-MDT0001: Connection restored to 4cd8fde3-ab19-6a6b-a7ee-5d70c4bd9893 (at 10.8.8.6@o2ib6) [2666012.460827] Lustre: Skipped 282 previous similar messages [2666152.691370] Pid: 115753, comm: mdt01_037 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2666152.701379] Call Trace: [2666152.704027] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2666152.711159] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2666152.718558] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2666152.725560] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2666152.732746] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2666152.739402] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2666152.746577] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2666152.753323] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2666152.759985] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2666152.766927] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2666152.774215] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2666152.780560] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2666152.787684] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2666152.795581] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2666152.802088] [] kthread+0xd1/0xe0 [2666152.807176] [] ret_from_fork_nospec_begin+0xe/0x21 [2666152.813822] [] 0xffffffffffffffff [2666152.819016] LustreError: dumping log to /tmp/lustre-log.1554821318.115753 [2666152.872045] Pid: 115621, comm: mdt01_022 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2666152.882041] Call Trace: [2666152.884689] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2666152.891827] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2666152.899210] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2666152.906223] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2666152.913405] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2666152.920063] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2666152.927250] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2666152.933996] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2666152.940654] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2666152.947624] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2666152.954919] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2666152.961290] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2666152.968441] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2666152.976343] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2666152.982871] [] kthread+0xd1/0xe0 [2666152.987984] [] ret_from_fork_nospec_begin+0xe/0x21 [2666152.994631] [] 0xffffffffffffffff [2666152.999828] Pid: 115784, comm: mdt01_041 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2666153.009845] Call Trace: [2666153.012484] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2666153.019630] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2666153.027002] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2666153.034002] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2666153.041181] [] mdt_object_lock+0x20/0x30 [mdt] [2666153.047499] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2666153.054006] [] mdt_intent_brw+0x1f/0x30 [mdt] [2666153.060246] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2666153.066902] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2666153.073854] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2666153.081164] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2666153.087505] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2666153.094634] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2666153.102523] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2666153.109022] [] kthread+0xd1/0xe0 [2666153.114109] [] ret_from_fork_nospec_begin+0xe/0x21 [2666153.120757] [] 0xffffffffffffffff [2666153.125987] Pid: 115347, comm: mdt01_001 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2666153.135990] Call Trace: [2666153.138643] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2666153.145753] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2666153.153129] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2666153.160140] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2666153.167332] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2666153.174076] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2666153.180731] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2666153.187694] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2666153.194982] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2666153.201333] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2666153.208439] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2666153.216334] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2666153.222852] [] kthread+0xd1/0xe0 [2666153.227938] [] ret_from_fork_nospec_begin+0xe/0x21 [2666153.234604] [] 0xffffffffffffffff [2666153.239800] Pid: 115939, comm: mdt01_075 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2666153.249813] Call Trace: [2666153.252457] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2666153.259573] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2666153.266959] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2666153.273961] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2666153.281127] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2666153.287773] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2666153.294970] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2666153.301702] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2666153.308391] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2666153.315327] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2666153.322621] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2666153.328959] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2666153.336099] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2666153.344004] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2666153.350505] [] kthread+0xd1/0xe0 [2666153.355607] [] ret_from_fork_nospec_begin+0xe/0x21 [2666153.362270] [] 0xffffffffffffffff [2666181.875710] LustreError: dumping log to /tmp/lustre-log.1554821347.116172 [2666281.269890] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 150s: evicting client at 10.8.17.15@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ecfaedc8240/0xbc329461a8c05006 lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 56 type: IBT flags: 0x60200400000020 nid: 10.8.17.15@o2ib6 remote: 0x794f201b7346911c expref: 61 pid: 115346 timeout: 2666260 lvb_type: 0 [2666281.308238] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 6 previous similar messages [2666281.318395] LNet: Service thread pid 115948 completed after 329.22s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2666281.334829] LNet: Skipped 25 previous similar messages [2666310.941239] Lustre: 115876:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ebe0cc3dd00 x1628593190876736/t0(0) o101->bf6bd63b-9206-2f81-2780-2b483791a8c1@10.8.17.15@o2ib6:21/0 lens 576/3264 e 0 to 0 dl 1554821481 ref 2 fl Interpret:/0/0 rc 0/0 [2666310.970486] Lustre: 115876:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 26 previous similar messages [2666330.133190] Lustre: fir-MDT0001: Client ac4b3f39-4aa4-91f4-89f9-e38a651f2f99 (at 10.9.107.5@o2ib4) reconnecting [2666330.143466] Lustre: Skipped 291 previous similar messages [2666337.014542] LustreError: dumping log to /tmp/lustre-log.1554821502.116295 [2666486.007364] Pid: 116334, comm: mdt02_110 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2666486.017366] Call Trace: [2666486.020008] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2666486.027129] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2666486.034514] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2666486.041543] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2666486.048744] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2666486.055504] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2666486.062159] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2666486.069102] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2666486.076424] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2666486.082786] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2666486.089917] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2666486.097806] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2666486.104328] [] kthread+0xd1/0xe0 [2666486.109420] [] ret_from_fork_nospec_begin+0xe/0x21 [2666486.116067] [] 0xffffffffffffffff [2666486.121260] LustreError: dumping log to /tmp/lustre-log.1554821651.116334 [2666486.519361] Pid: 115591, comm: mdt00_011 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2666486.529372] Call Trace: [2666486.532024] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2666486.539153] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2666486.546544] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2666486.553582] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2666486.560768] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2666486.567429] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2666486.574634] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2666486.581380] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2666486.588074] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2666486.595041] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2666486.602363] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2666486.608719] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2666486.615845] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2666486.623773] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2666486.630308] [] kthread+0xd1/0xe0 [2666486.635427] [] ret_from_fork_nospec_begin+0xe/0x21 [2666486.642094] [] 0xffffffffffffffff [2666486.647332] LustreError: dumping log to /tmp/lustre-log.1554821652.115591 [2666525.207819] LustreError: 115839:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554821600, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ebcd077b600/0xbc329461ad247c27 lrc: 3/1,0 mode: --/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 54 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 115839 timeout: 0 lvb_type: 0 [2666525.248048] LustreError: 115839:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 24 previous similar messages [2666581.273778] Lustre: 115861:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (482:147s); client may timeout. req@ffff8ebc22296900 x1628546091255472/t133448936939(0) o36->ac4b3f39-4aa4-91f4-89f9-e38a651f2f99@10.9.107.5@o2ib4:7/0 lens 488/424 e 1 to 0 dl 1554821599 ref 1 fl Complete:/0/0 rc 0/0 [2666584.265828] LustreError: 116295:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ec40cd94400 ns: mdt-fir-MDT0001_UUID lock: ffff8ec67928bf00/0xbc329461aa6a235a lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 53 type: IBT flags: 0x50200000000000 nid: 10.8.17.15@o2ib6 remote: 0x794f201b734692a4 expref: 2 pid: 116295 timeout: 0 lvb_type: 0 [2666584.300875] LustreError: 116295:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 8 previous similar messages [2666584.311375] Lustre: 116295:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (154:293s); client may timeout. req@ffff8ebab3d4b000 x1628593190861904/t0(0) o101->bf6bd63b-9206-2f81-2780-2b483791a8c1@10.8.17.15@o2ib6:22/0 lens 568/2296 e 0 to 0 dl 1554821456 ref 1 fl Complete:/0/0 rc -107/-107 [2666584.340725] Lustre: 116295:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 3 previous similar messages [2666618.100820] Lustre: fir-MDT0001: Connection restored to (at 10.8.8.3@o2ib6) [2666618.108053] Lustre: Skipped 293 previous similar messages [2666787.578918] LNet: Service thread pid 115931 was inactive for 200.42s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2666787.596139] LNet: Skipped 7 previous similar messages [2666787.601381] Pid: 115931, comm: mdt01_072 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2666787.611406] Call Trace: [2666787.614060] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2666787.621193] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2666787.628602] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2666787.635619] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2666787.642808] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2666787.649569] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2666787.656257] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2666787.663201] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2666787.670512] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2666787.676870] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2666787.683998] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2666787.691933] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2666787.698455] [] kthread+0xd1/0xe0 [2666787.703558] [] ret_from_fork_nospec_begin+0xe/0x21 [2666787.710222] [] 0xffffffffffffffff [2666787.715442] LustreError: dumping log to /tmp/lustre-log.1554821953.115931 [2666787.788331] Pid: 115824, comm: mdt01_047 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2666787.798336] Call Trace: [2666787.800980] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2666787.808108] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2666787.815503] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2666787.822512] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2666787.829688] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2666787.836334] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2666787.843512] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2666787.850254] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2666787.856903] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2666787.863825] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2666787.871098] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2666787.877449] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2666787.884566] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2666787.892468] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2666787.898970] [] kthread+0xd1/0xe0 [2666787.904074] [] ret_from_fork_nospec_begin+0xe/0x21 [2666787.910721] [] 0xffffffffffffffff [2666787.915933] Pid: 115348, comm: mdt01_002 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2666787.925963] Call Trace: [2666787.928600] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2666787.935708] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2666787.943099] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2666787.950097] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2666787.957316] [] mdt_object_lock+0x20/0x30 [mdt] [2666787.963624] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2666787.970107] [] mdt_intent_brw+0x1f/0x30 [mdt] [2666787.976335] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2666787.982994] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2666787.989927] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2666787.997209] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2666788.003558] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2666788.010668] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2666788.018571] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2666788.025071] [] kthread+0xd1/0xe0 [2666788.030149] [] ret_from_fork_nospec_begin+0xe/0x21 [2666788.036814] [] 0xffffffffffffffff [2666788.042002] Pid: 115928, comm: mdt01_071 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2666788.052001] Call Trace: [2666788.054651] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2666788.061749] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2666788.069115] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2666788.076135] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2666788.083303] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2666788.089960] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2666788.097134] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2666788.103892] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2666788.110543] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2666788.117478] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2666788.124772] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2666788.131101] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2666788.138207] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2666788.146095] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2666788.152595] [] kthread+0xd1/0xe0 [2666788.157675] [] ret_from_fork_nospec_begin+0xe/0x21 [2666788.164320] [] 0xffffffffffffffff [2666788.169498] Pid: 115753, comm: mdt01_037 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2666788.179497] Call Trace: [2666788.182128] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2666788.189246] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2666788.196612] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2666788.203609] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2666788.210776] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2666788.217437] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2666788.224606] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2666788.231356] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2666788.238014] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2666788.244950] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2666788.252228] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2666788.258556] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2666788.265660] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2666788.273547] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2666788.280046] [] kthread+0xd1/0xe0 [2666788.285126] [] ret_from_fork_nospec_begin+0xe/0x21 [2666788.291774] [] 0xffffffffffffffff [2666788.296952] LNet: Service thread pid 115865 was inactive for 201.15s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2666788.310073] LNet: Skipped 15 previous similar messages [2666916.277424] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 150s: evicting client at 10.8.17.17@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ebb21338d80/0xbc329461ae9511b8 lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 56 type: IBT flags: 0x60200400000020 nid: 10.8.17.17@o2ib6 remote: 0xe31ee7d826775c91 expref: 113 pid: 115876 timeout: 2666895 lvb_type: 0 [2666916.315859] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 4 previous similar messages [2666916.326090] LNet: Service thread pid 115624 completed after 329.20s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2666916.342513] LNet: Skipped 21 previous similar messages [2666936.572656] LustreError: dumping log to /tmp/lustre-log.1554822102.115917 [2666942.204726] LustreError: dumping log to /tmp/lustre-log.1554822107.116479 [2666942.716735] LustreError: dumping log to /tmp/lustre-log.1554822108.115759 [2666947.324791] Lustre: 115870:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ebe0cc13900 x1629298147594048/t0(0) o101->a4daaf47-6ec9-4753-388e-0d0b7a7f70d6@10.8.27.25@o2ib6:27/0 lens 568/0 e 0 to 0 dl 1554822117 ref 2 fl Interpret:/0/ffffffff rc 0/-1 [2666947.354484] Lustre: 115870:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 31 previous similar messages [2666952.772674] Lustre: fir-MDT0003: Client ecd69873-9a0a-0293-0ce7-d92dc18bf8d9 (at 10.9.101.11@o2ib4) reconnecting [2666952.783026] Lustre: Skipped 279 previous similar messages [2666967.805026] LustreError: dumping log to /tmp/lustre-log.1554822133.115839 [2667066.280046] LustreError: 115931:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8eca0b38d800 ns: mdt-fir-MDT0001_UUID lock: ffff8eca2b770900/0xbc329461ae952c06 lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 49 type: IBT flags: 0x50200000000000 nid: 10.8.8.1@o2ib6 remote: 0xbeca8e2774bef426 expref: 2 pid: 115931 timeout: 0 lvb_type: 0 [2667066.314924] LustreError: 115931:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 4 previous similar messages [2667066.325361] Lustre: 115931:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (154:325s); client may timeout. req@ffff8ec58ea51800 x1628647039728864/t0(0) o101->535a7c41-41b9-ec51-bbd6-e028c4ebcb2c@10.8.8.1@o2ib6:22/0 lens 568/2296 e 0 to 0 dl 1554821906 ref 1 fl Complete:/0/0 rc -107/-107 [2667066.354523] Lustre: 115931:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 1 previous similar message [2667077.322015] LustreError: 115836:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ecf1897f000 ns: mdt-fir-MDT0001_UUID lock: ffff8eb6d8d8bcc0/0xbc329461b1a69fca lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 57 type: IBT flags: 0x50200000000000 nid: 10.8.27.25@o2ib6 remote: 0x47916098feb8bea8 expref: 2 pid: 115836 timeout: 0 lvb_type: 0 [2667077.357074] LustreError: 115836:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 1 previous similar message [2667077.367435] Lustre: 116479:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (30:305s); client may timeout. req@ffff8ee7209a7800 x1628639970607328/t0(0) o101->dc990748-ca32-0960-545e-99af2316a63e@10.9.107.8@o2ib4:27/0 lens 568/2296 e 0 to 0 dl 1554821937 ref 1 fl Complete:/0/0 rc -107/-107 [2667077.396679] Lustre: 116479:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 2 previous similar messages [2667167.512401] LustreError: 116306:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554822243, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8eb553b4c140/0xbc329461b30e88a9 lrc: 3/0,1 mode: --/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 80 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 116306 timeout: 0 lvb_type: 0 [2667167.552323] LustreError: 116306:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 30 previous similar messages [2667231.791931] Lustre: fir-MDT0003: Connection restored to (at 10.9.101.11@o2ib4) [2667231.799442] Lustre: Skipped 280 previous similar messages [2667257.676987] LustreError: 115653:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8edf63223400 ns: mdt-fir-MDT0001_UUID lock: ffff8ecfea7a2f40/0xbc329461b47a31ca lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 78 type: IBT flags: 0x50200000000000 nid: 10.8.27.26@o2ib6 remote: 0x3a52d8e5f910a12e expref: 7 pid: 115653 timeout: 0 lvb_type: 0 [2667257.712044] LustreError: 115653:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 2 previous similar messages [2667257.722574] Lustre: 115625:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (154:26s); client may timeout. req@ffff8ebc222a8900 x1628593191008304/t0(0) o101->bf6bd63b-9206-2f81-2780-2b483791a8c1@10.8.17.15@o2ib6:3/0 lens 568/2296 e 0 to 0 dl 1554822397 ref 1 fl Complete:/0/0 rc -107/-107 [2667458.818852] LNet: Service thread pid 115834 was inactive for 200.30s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2667458.836056] LNet: Skipped 4 previous similar messages [2667458.841293] Pid: 115834, comm: mdt01_050 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2667458.851306] Call Trace: [2667458.853942] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2667458.861046] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2667458.868414] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2667458.875433] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2667458.882626] [] mdt_object_lock+0x20/0x30 [mdt] [2667458.888937] [] mdt_hsm_state_set+0xc9/0x830 [mdt] [2667458.895505] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2667458.902621] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2667458.910508] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2667458.917010] [] kthread+0xd1/0xe0 [2667458.922131] [] ret_from_fork_nospec_begin+0xe/0x21 [2667458.928778] [] 0xffffffffffffffff [2667458.933980] LustreError: dumping log to /tmp/lustre-log.1554822624.115834 [2667459.042023] Pid: 115937, comm: mdt01_074 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2667459.052023] Call Trace: [2667459.054669] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2667459.061786] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2667459.069153] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2667459.076156] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2667459.083333] [] mdt_object_lock+0x20/0x30 [mdt] [2667459.089661] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2667459.096143] [] mdt_intent_brw+0x1f/0x30 [mdt] [2667459.102366] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2667459.109040] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2667459.115971] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2667459.123251] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2667459.129595] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2667459.136711] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2667459.144605] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2667459.151114] [] kthread+0xd1/0xe0 [2667459.156204] [] ret_from_fork_nospec_begin+0xe/0x21 [2667459.162840] [] 0xffffffffffffffff [2667459.168054] Pid: 115753, comm: mdt01_037 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2667459.178082] Call Trace: [2667459.180718] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2667459.187830] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2667459.195195] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2667459.202212] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2667459.209391] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2667459.216045] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2667459.223223] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2667459.229966] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2667459.236622] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2667459.243555] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2667459.250833] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2667459.257172] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2667459.264300] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2667459.272216] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2667459.278741] [] kthread+0xd1/0xe0 [2667459.283834] [] ret_from_fork_nospec_begin+0xe/0x21 [2667459.290487] [] 0xffffffffffffffff [2667459.295680] Pid: 115930, comm: mdt02_073 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2667459.305683] Call Trace: [2667459.308323] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2667459.315420] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2667459.322786] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2667459.329789] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2667459.336965] [] mdt_object_lock+0x20/0x30 [mdt] [2667459.343274] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2667459.349759] [] mdt_intent_brw+0x1f/0x30 [mdt] [2667459.355995] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2667459.362655] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2667459.369587] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2667459.376885] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2667459.383219] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2667459.390336] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2667459.398222] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2667459.404730] [] kthread+0xd1/0xe0 [2667459.409818] [] ret_from_fork_nospec_begin+0xe/0x21 [2667459.416466] [] 0xffffffffffffffff [2667459.421660] Pid: 115659, comm: mdt00_023 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2667459.431653] Call Trace: [2667459.434294] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2667459.441401] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2667459.448766] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2667459.455787] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2667459.462953] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2667459.469610] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2667459.476792] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2667459.483538] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2667459.490211] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2667459.497153] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2667459.504425] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2667459.510761] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2667459.517875] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2667459.525771] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2667459.532271] [] kthread+0xd1/0xe0 [2667459.537376] [] ret_from_fork_nospec_begin+0xe/0x21 [2667459.544014] [] 0xffffffffffffffff [2667459.549193] LNet: Service thread pid 116295 was inactive for 200.99s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2667459.562338] LNet: Skipped 14 previous similar messages [2667565.489662] Lustre: fir-MDT0001: Client a4daaf47-6ec9-4753-388e-0d0b7a7f70d6 (at 10.8.27.25@o2ib6) reconnecting [2667565.499926] Lustre: Skipped 268 previous similar messages [2667588.285396] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 150s: evicting client at 10.8.8.3@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ec3ff6e4140/0xbc329461b4b60204 lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 69 type: IBT flags: 0x60200400000020 nid: 10.8.8.3@o2ib6 remote: 0x59829cb704519b2d expref: 68 pid: 115404 timeout: 2667567 lvb_type: 0 [2667588.323420] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 5 previous similar messages [2667588.333565] LNet: Service thread pid 115939 completed after 329.85s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2667588.334343] LustreError: 115817:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ed287270800 ns: mdt-fir-MDT0001_UUID lock: ffff8eccd65633c0/0xbc329461b614ee67 lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 59 type: IBT flags: 0x50200000000000 nid: 10.8.17.19@o2ib6 remote: 0x8282471ef025a4ff expref: 2 pid: 115817 timeout: 0 lvb_type: 0 [2667588.334346] LustreError: 115817:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 3 previous similar messages [2667588.334402] Lustre: 115817:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (30:148s); client may timeout. req@ffff8ecb04864800 x1628647349683840/t0(0) o101->eeceaf0a-f64b-44e4-4f28-fac655ebb0a4@10.8.17.19@o2ib6:5/0 lens 568/2296 e 0 to 0 dl 1554822605 ref 1 fl Complete:/0/0 rc -107/-107 [2667588.424577] LNet: Skipped 41 previous similar messages [2667613.380687] Lustre: 116306:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ed28fc83900 x1628638022188912/t0(0) o101->7d8ca85d-8b80-6a23-8fa9-83dca7eb7196@10.8.27.28@o2ib6:3/0 lens 576/3264 e 0 to 0 dl 1554822783 ref 2 fl Interpret:/0/0 rc 0/0 [2667613.409846] Lustre: 116306:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 76 previous similar messages [2667789.062819] Pid: 115590, comm: mdt01_011 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2667789.072824] Call Trace: [2667789.075467] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2667789.082578] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2667789.089962] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2667789.096982] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2667789.104157] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2667789.110814] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2667789.117989] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2667789.124733] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2667789.131389] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2667789.138314] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2667789.145610] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2667789.151961] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2667789.159086] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2667789.166973] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2667789.173489] [] kthread+0xd1/0xe0 [2667789.178580] [] ret_from_fork_nospec_begin+0xe/0x21 [2667789.185257] [] 0xffffffffffffffff [2667789.190453] LustreError: dumping log to /tmp/lustre-log.1554822954.115590 [2667789.265911] Pid: 116124, comm: mdt01_089 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2667789.275938] Call Trace: [2667789.278578] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2667789.285683] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2667789.293057] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2667789.300062] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2667789.307236] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2667789.313895] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2667789.321086] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2667789.327837] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2667789.334493] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2667789.341448] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2667789.348734] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2667789.355085] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2667789.362202] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2667789.370114] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2667789.376615] [] kthread+0xd1/0xe0 [2667789.381709] [] ret_from_fork_nospec_begin+0xe/0x21 [2667789.388360] [] 0xffffffffffffffff [2667789.393571] Pid: 115753, comm: mdt01_037 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2667789.403570] Call Trace: [2667789.406200] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2667789.413309] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2667789.420669] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2667789.427688] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2667789.434866] [] mdt_reint_object_lock+0x2c/0x60 [mdt] [2667789.441685] [] mdt_reint_striped_lock+0x8c/0x510 [mdt] [2667789.448687] [] mdt_reint_setattr+0x6c8/0x1340 [mdt] [2667789.455446] [] mdt_reint_rec+0x83/0x210 [mdt] [2667789.461669] [] mdt_reint_internal+0x6e3/0xaf0 [mdt] [2667789.468420] [] mdt_reint+0x67/0x140 [mdt] [2667789.474290] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2667789.481410] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2667789.489300] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2667789.495815] [] kthread+0xd1/0xe0 [2667789.500913] [] ret_from_fork_nospec_begin+0xe/0x21 [2667789.507553] [] 0xffffffffffffffff [2667789.512746] Pid: 115347, comm: mdt01_001 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2667789.522756] Call Trace: [2667789.525385] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2667789.532495] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2667789.539853] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2667789.546871] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2667789.554041] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2667789.560713] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2667789.567909] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2667789.574651] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2667789.581300] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2667789.588240] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2667789.595513] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2667789.601854] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2667789.608978] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2667789.616898] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2667789.623401] [] kthread+0xd1/0xe0 [2667789.628488] [] ret_from_fork_nospec_begin+0xe/0x21 [2667789.635136] [] 0xffffffffffffffff [2667789.640332] Pid: 115764, comm: mdt02_042 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2667789.650333] Call Trace: [2667789.652968] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2667789.660086] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2667789.667447] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2667789.674463] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2667789.681634] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2667789.688316] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2667789.695508] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2667789.702251] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2667789.708908] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2667789.715847] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2667789.723131] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2667789.729472] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2667789.736582] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2667789.744469] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2667789.750967] [] kthread+0xd1/0xe0 [2667789.756067] [] ret_from_fork_nospec_begin+0xe/0x21 [2667789.762721] [] 0xffffffffffffffff [2667798.288856] LustreError: 115594:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ec7fd7e8800 ns: mdt-fir-MDT0001_UUID lock: ffff8ec6b3943f00/0xbc329461b7b8f4a3 lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x1b/0x0 rrc: 60 type: IBT flags: 0x50200400000020 nid: 10.8.27.28@o2ib6 remote: 0x1df96237acfcb71b expref: 6 pid: 115594 timeout: 0 lvb_type: 0 [2667798.323915] LustreError: 115594:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 2 previous similar messages [2667798.334336] Lustre: 115594:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (185:25s); client may timeout. req@ffff8ed28fc45a00 x1628638022187808/t0(0) o101->7d8ca85d-8b80-6a23-8fa9-83dca7eb7196@10.8.27.28@o2ib6:3/0 lens 576/1792 e 0 to 0 dl 1554822938 ref 1 fl Complete:/0/0 rc -107/-107 [2667798.363544] Lustre: 115594:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 5 previous similar messages [2667851.837343] Lustre: fir-MDT0003: Connection restored to (at 10.9.101.11@o2ib4) [2667851.844835] Lustre: Skipped 277 previous similar messages [2667889.754005] LustreError: 115347:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554822965, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ec4c37eb600/0xbc329461b9ca521f lrc: 3/0,1 mode: --/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 53 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 115347 timeout: 0 lvb_type: 0 [2667889.793916] LustreError: 115347:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 82 previous similar messages [2668000.009316] LustreError: dumping log to /tmp/lustre-log.1554823165.115854 [2668155.659140] LNet: Service thread pid 115347 was inactive for 200.69s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2668155.676335] LNet: Skipped 9 previous similar messages [2668155.681569] Pid: 115347, comm: mdt01_001 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2668155.691568] Call Trace: [2668155.694203] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2668155.701343] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2668155.708709] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2668155.715726] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2668155.722904] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2668155.729561] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2668155.736736] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2668155.743496] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2668155.750154] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2668155.757104] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2668155.764384] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2668155.770744] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2668155.777860] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2668155.785764] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2668155.792265] [] kthread+0xd1/0xe0 [2668155.797367] [] ret_from_fork_nospec_begin+0xe/0x21 [2668155.804016] [] 0xffffffffffffffff [2668155.809210] LustreError: dumping log to /tmp/lustre-log.1554823321.115347 [2668155.855865] Pid: 115890, comm: mdt01_064 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2668155.865869] Call Trace: [2668155.868509] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2668155.875641] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2668155.883048] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2668155.890062] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2668155.897252] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2668155.903997] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2668155.910668] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2668155.917603] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2668155.924916] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2668155.931279] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2668155.938411] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2668155.946302] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2668155.952818] [] kthread+0xd1/0xe0 [2668155.957904] [] ret_from_fork_nospec_begin+0xe/0x21 [2668155.964566] [] 0xffffffffffffffff [2668165.997949] Lustre: fir-MDT0001: Client 7d8ca85d-8b80-6a23-8fa9-83dca7eb7196 (at 10.8.27.28@o2ib6) reconnecting [2668166.008215] Lustre: Skipped 265 previous similar messages [2668249.293211] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 150s: evicting client at 10.8.8.1@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ec7b98ae540/0xbc329461b9ca5234 lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 53 type: IBT flags: 0x60200400000020 nid: 10.8.8.1@o2ib6 remote: 0xbeca8e2774bf520c expref: 71 pid: 115861 timeout: 2668228 lvb_type: 0 [2668249.331209] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 5 previous similar messages [2668249.341368] LNet: Service thread pid 115939 completed after 449.56s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2668249.357786] LNet: Skipped 27 previous similar messages [2668274.572487] Lustre: 115348:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8eccee63da00 x1629295505776880/t0(0) o101->8c206ea7-4fa6-6560-2c3b-626d4cc9e42f@10.8.8.3@o2ib6:5/0 lens 576/3264 e 0 to 0 dl 1554823445 ref 2 fl Interpret:/0/0 rc 0/0 [2668274.601514] Lustre: 115348:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 31 previous similar messages [2668304.140839] Pid: 115817, comm: mdt01_045 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2668304.150838] Call Trace: [2668304.153480] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2668304.160614] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2668304.167994] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2668304.175008] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2668304.182192] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2668304.188865] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2668304.196042] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2668304.202799] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2668304.209456] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2668304.216414] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2668304.223703] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2668304.230058] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2668304.237195] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2668304.245093] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2668304.251621] [] kthread+0xd1/0xe0 [2668304.256708] [] ret_from_fork_nospec_begin+0xe/0x21 [2668304.263372] [] 0xffffffffffffffff [2668304.268567] LustreError: dumping log to /tmp/lustre-log.1554823469.115817 [2668304.559444] Pid: 115594, comm: mdt02_016 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2668304.569443] Call Trace: [2668304.572089] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2668304.579206] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2668304.586601] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2668304.593611] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2668304.600812] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2668304.607557] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2668304.614212] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2668304.621153] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2668304.628433] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2668304.634778] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2668304.641893] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2668304.649789] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2668304.656298] [] kthread+0xd1/0xe0 [2668304.661385] [] ret_from_fork_nospec_begin+0xe/0x21 [2668304.668033] [] 0xffffffffffffffff [2668449.550522] Pid: 115967, comm: mdt01_084 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2668449.560529] Call Trace: [2668449.563190] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2668449.570319] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2668449.577715] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2668449.584721] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2668449.591898] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2668449.598553] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2668449.605766] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2668449.612508] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2668449.619197] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2668449.626148] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2668449.633428] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2668449.639781] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2668449.646898] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2668449.654823] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2668449.661329] [] kthread+0xd1/0xe0 [2668449.666414] [] ret_from_fork_nospec_begin+0xe/0x21 [2668449.673077] [] 0xffffffffffffffff [2668449.678274] LustreError: dumping log to /tmp/lustre-log.1554823615.115967 [2668450.384500] LNet: Service thread pid 115841 was inactive for 200.89s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2668450.397632] LNet: Skipped 51 previous similar messages [2668459.278628] Pid: 115925, comm: mdt01_070 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2668459.288625] Call Trace: [2668459.291267] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2668459.298380] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2668459.305746] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2668459.312751] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2668459.319916] [] mdt_object_lock+0x20/0x30 [mdt] [2668459.326241] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2668459.332725] [] mdt_intent_brw+0x1f/0x30 [mdt] [2668459.338948] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2668459.345604] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2668459.352553] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2668459.359836] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2668459.366181] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2668459.373295] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2668459.381182] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2668459.387700] [] kthread+0xd1/0xe0 [2668459.392787] [] ret_from_fork_nospec_begin+0xe/0x21 [2668459.399449] [] 0xffffffffffffffff [2668459.404655] LustreError: dumping log to /tmp/lustre-log.1554823624.115925 [2668459.456581] Pid: 115959, comm: mdt01_081 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2668459.466602] Call Trace: [2668459.469238] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2668459.476354] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2668459.483722] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2668459.490763] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2668459.497956] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2668459.504636] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2668459.511820] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2668459.518582] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2668459.525259] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2668459.532195] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2668459.539474] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2668459.545817] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2668459.552948] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2668459.560851] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2668459.567393] [] kthread+0xd1/0xe0 [2668459.572487] [] ret_from_fork_nospec_begin+0xe/0x21 [2668459.579147] [] 0xffffffffffffffff [2668459.584370] Pid: 115937, comm: mdt01_074 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2668459.594372] Call Trace: [2668459.597028] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2668459.604161] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2668459.611528] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2668459.618532] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2668459.625731] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2668459.632476] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2668459.639133] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2668459.646075] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2668459.653355] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2668459.659698] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2668459.666814] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2668459.674710] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2668459.681209] [] kthread+0xd1/0xe0 [2668459.686298] [] ret_from_fork_nospec_begin+0xe/0x21 [2668459.692962] [] 0xffffffffffffffff [2668459.698179] Pid: 115902, comm: mdt00_058 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2668459.708173] Call Trace: [2668459.710807] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2668459.717940] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2668459.725305] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2668459.732339] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2668459.739519] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2668459.746190] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2668459.753369] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2668459.760128] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2668459.766785] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2668459.773734] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2668459.781015] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2668459.787411] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2668459.794556] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2668459.802457] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2668459.808957] [] kthread+0xd1/0xe0 [2668459.814053] [] ret_from_fork_nospec_begin+0xe/0x21 [2668459.820700] [] 0xffffffffffffffff [2668466.490671] Lustre: fir-MDT0001: Connection restored to e3c4cf5f-8e04-bccb-9d13-7eae1b83e1a1 (at 10.8.27.29@o2ib6) [2668466.501223] Lustre: Skipped 293 previous similar messages [2668503.899142] LustreError: 115887:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554823579, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ec9c8556e40/0xbc329461c1318251 lrc: 3/1,0 mode: --/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x13/0x8 rrc: 51 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 115887 timeout: 0 lvb_type: 0 [2668503.939069] LustreError: 115887:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 14 previous similar messages [2668558.297084] LustreError: 115937:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ecb04bf7400 ns: mdt-fir-MDT0001_UUID lock: ffff8ed6627d4c80/0xbc329461bea7d7f3 lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 41 type: IBT flags: 0x50200000000000 nid: 10.8.17.19@o2ib6 remote: 0x8282471ef025c081 expref: 2 pid: 115937 timeout: 0 lvb_type: 0 [2668558.297102] Lustre: 115874:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (464:294s); client may timeout. req@ffff8edbe8c93f00 x1628647039929056/t0(0) o55->535a7c41-41b9-ec51-bbd6-e028c4ebcb2c@10.8.8.1@o2ib6:5/0 lens 472/192 e 0 to 0 dl 1554823429 ref 1 fl Complete:/0/0 rc -22/-22 [2668558.360860] LustreError: 115937:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 9 previous similar messages [2668745.725845] LustreError: 115684:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ec57f791400 ns: mdt-fir-MDT0001_UUID lock: ffff8edcb223a1c0/0xbc329461c38873a1 lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x1b/0x0 rrc: 59 type: IBT flags: 0x50200400000020 nid: 10.8.27.26@o2ib6 remote: 0x3a52d8e5f9110152 expref: 4 pid: 115684 timeout: 0 lvb_type: 0 [2668745.760913] LustreError: 115684:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 4 previous similar messages [2668745.771349] Lustre: 115684:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (154:32s); client may timeout. req@ffff8eda62002400 x1628574042317488/t0(0) o101->40dfa1b3-0e1e-4763-9ec6-4ac10d77215c@10.8.27.26@o2ib6:15/0 lens 576/1792 e 0 to 0 dl 1554823879 ref 1 fl Complete:/0/0 rc -107/-107 [2668745.800612] Lustre: 115684:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 4 previous similar messages [2668759.826115] LNet: Service thread pid 115836 was inactive for 200.21s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2668759.843313] LNet: Skipped 8 previous similar messages [2668759.848547] Pid: 115836, comm: mdt00_038 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2668759.858572] Call Trace: [2668759.861214] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2668759.868344] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2668759.875741] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2668759.882773] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2668759.889982] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2668759.896639] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2668759.903851] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2668759.910594] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2668759.917286] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2668759.924258] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2668759.931588] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2668759.937971] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2668759.945095] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2668759.952998] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2668759.959510] [] kthread+0xd1/0xe0 [2668759.964596] [] ret_from_fork_nospec_begin+0xe/0x21 [2668759.971242] [] 0xffffffffffffffff [2668759.976476] LustreError: dumping log to /tmp/lustre-log.1554823925.115836 [2668760.813629] Pid: 115967, comm: mdt01_084 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2668760.823631] Call Trace: [2668760.826292] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2668760.833421] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2668760.840816] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2668760.847839] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2668760.855037] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2668760.861781] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2668760.868452] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2668760.875403] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2668760.882686] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2668760.889043] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2668760.896178] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2668760.904090] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2668760.910592] [] kthread+0xd1/0xe0 [2668760.915729] [] ret_from_fork_nospec_begin+0xe/0x21 [2668760.922378] [] 0xffffffffffffffff [2668760.927612] Pid: 115592, comm: mdt01_012 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2668760.937608] Call Trace: [2668760.940250] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2668760.947356] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2668760.954732] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2668760.961734] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2668760.968910] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2668760.975598] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2668760.982779] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2668760.989565] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2668760.996228] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2668761.003207] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2668761.010501] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2668761.016881] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2668761.023999] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2668761.031931] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2668761.038447] [] kthread+0xd1/0xe0 [2668761.043539] [] ret_from_fork_nospec_begin+0xe/0x21 [2668761.050187] [] 0xffffffffffffffff [2668761.055409] Pid: 115887, comm: mdt01_063 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2668761.065409] Call Trace: [2668761.068049] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2668761.075165] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2668761.082532] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2668761.089536] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2668761.096710] [] mdt_object_lock+0x20/0x30 [mdt] [2668761.103021] [] mdt_hsm_state_set+0xc9/0x830 [mdt] [2668761.109590] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2668761.116755] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2668761.124653] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2668761.131159] [] kthread+0xd1/0xe0 [2668761.136250] [] ret_from_fork_nospec_begin+0xe/0x21 [2668761.142944] [] 0xffffffffffffffff [2668761.148154] Pid: 115817, comm: mdt01_045 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2668761.158168] Call Trace: [2668761.160810] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2668761.167924] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2668761.175291] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2668761.182313] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2668761.189488] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2668761.196145] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2668761.203335] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2668761.210096] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2668761.216753] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2668761.223701] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2668761.230984] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2668761.237328] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2668761.244445] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2668761.252346] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2668761.258848] [] kthread+0xd1/0xe0 [2668761.263936] [] ret_from_fork_nospec_begin+0xe/0x21 [2668761.270589] [] 0xffffffffffffffff [2668776.588689] Lustre: fir-MDT0001: Client 2c78cdad-2975-ca98-fb36-e7548576f834 (at 10.8.17.17@o2ib6) reconnecting [2668776.598954] Lustre: Skipped 276 previous similar messages [2668895.300647] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 149s: evicting client at 10.8.17.19@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ed2aef34380/0xbc329461c3886fe2 lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 62 type: IBT flags: 0x60200400000020 nid: 10.8.17.19@o2ib6 remote: 0x8282471ef025d12f expref: 47 pid: 115937 timeout: 2668874 lvb_type: 0 [2668895.338996] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 4 previous similar messages [2668895.349189] LNet: Service thread pid 115659 completed after 335.74s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2668895.365642] LNet: Skipped 28 previous similar messages [2668914.963862] LustreError: dumping log to /tmp/lustre-log.1554824080.115902 [2668918.547918] LustreError: dumping log to /tmp/lustre-log.1554824084.115582 [2668920.403926] Lustre: 115836:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8eb4b9d73000 x1628649284175776/t0(0) o101->2c78cdad-2975-ca98-fb36-e7548576f834@10.8.17.17@o2ib6:20/0 lens 480/568 e 0 to 0 dl 1554824090 ref 2 fl Interpret:/0/0 rc 0/0 [2668920.433103] Lustre: 115836:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 45 previous similar messages [2668940.052137] LustreError: dumping log to /tmp/lustre-log.1554824105.115778 [2668946.196209] LustreError: dumping log to /tmp/lustre-log.1554824111.115937 [2669049.715022] LustreError: 115347:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8edc509a5c00 ns: mdt-fir-MDT0001_UUID lock: ffff8ecdba2ea640/0xbc329461c388e4ce lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 54 type: IBT flags: 0x50200400000020 nid: 10.8.17.19@o2ib6 remote: 0x8282471ef025d20f expref: 4 pid: 115347 timeout: 0 lvb_type: 0 [2669049.750194] Lustre: 115347:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (340:150s); client may timeout. req@ffff8eccee6a1200 x1628647349829936/t0(0) o101->eeceaf0a-f64b-44e4-4f28-fac655ebb0a4@10.8.17.19@o2ib6:15/0 lens 480/536 e 0 to 0 dl 1554824065 ref 1 fl Complete:/0/0 rc -107/-107 [2669088.958341] Lustre: fir-MDT0001: Connection restored to 2c78cdad-2975-ca98-fb36-e7548576f834 (at 10.8.17.17@o2ib6) [2669088.968869] Lustre: Skipped 288 previous similar messages [2669147.958463] LustreError: 116303:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554824223, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8eccf4a87500/0xbc329461cbf1bcda lrc: 3/0,1 mode: --/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 57 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 116303 timeout: 0 lvb_type: 0 [2669147.998388] LustreError: 116303:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 55 previous similar messages [2669258.519693] Pid: 115934, comm: mdt01_073 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2669258.529701] Call Trace: [2669258.532340] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2669258.539470] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2669258.546852] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2669258.553865] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2669258.561054] [] mdt_object_lock+0x20/0x30 [mdt] [2669258.567368] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2669258.573890] [] mdt_intent_brw+0x1f/0x30 [mdt] [2669258.580117] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2669258.586795] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2669258.593750] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2669258.601028] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2669258.607388] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2669258.614506] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2669258.622424] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2669258.628927] [] kthread+0xd1/0xe0 [2669258.634015] [] ret_from_fork_nospec_begin+0xe/0x21 [2669258.640662] [] 0xffffffffffffffff [2669258.645875] LustreError: dumping log to /tmp/lustre-log.1554824424.115934 [2669259.610429] Pid: 115813, comm: mdt00_031 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2669259.620442] Call Trace: [2669259.623086] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2669259.630217] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2669259.637585] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2669259.644596] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2669259.651789] [] mdt_object_lock+0x20/0x30 [mdt] [2669259.658097] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2669259.664608] [] mdt_intent_brw+0x1f/0x30 [mdt] [2669259.670830] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2669259.677488] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2669259.684436] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2669259.691733] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2669259.698070] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2669259.705191] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2669259.713080] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2669259.719587] [] kthread+0xd1/0xe0 [2669259.724675] [] ret_from_fork_nospec_begin+0xe/0x21 [2669259.731326] [] 0xffffffffffffffff [2669259.736530] Pid: 115573, comm: mdt00_006 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2669259.746529] Call Trace: [2669259.749169] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2669259.756276] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2669259.763642] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2669259.770643] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2669259.777821] [] mdt_object_lock+0x20/0x30 [mdt] [2669259.784131] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2669259.790621] [] mdt_intent_brw+0x1f/0x30 [mdt] [2669259.796864] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2669259.803528] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2669259.810468] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2669259.817748] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2669259.824115] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2669259.831235] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2669259.839135] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2669259.845639] [] kthread+0xd1/0xe0 [2669259.850741] [] ret_from_fork_nospec_begin+0xe/0x21 [2669259.857391] [] 0xffffffffffffffff [2669259.862586] Pid: 115896, comm: mdt02_064 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2669259.872587] Call Trace: [2669259.875224] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2669259.882334] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2669259.889699] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2669259.896729] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2669259.903921] [] mdt_object_lock+0x20/0x30 [mdt] [2669259.910233] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2669259.916741] [] mdt_intent_brw+0x1f/0x30 [mdt] [2669259.922985] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2669259.929644] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2669259.936589] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2669259.943903] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2669259.950247] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2669259.957372] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2669259.965264] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2669259.971773] [] kthread+0xd1/0xe0 [2669259.976863] [] ret_from_fork_nospec_begin+0xe/0x21 [2669259.983521] [] 0xffffffffffffffff [2669259.988725] Pid: 115874, comm: mdt02_058 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2669259.998723] Call Trace: [2669260.001369] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2669260.008487] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2669260.015852] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2669260.022889] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2669260.030066] [] mdt_object_lock+0x20/0x30 [mdt] [2669260.036392] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2669260.042876] [] mdt_intent_brw+0x1f/0x30 [mdt] [2669260.049100] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2669260.055756] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2669260.062688] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2669260.069969] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2669260.076304] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2669260.083419] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2669260.091309] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2669260.097815] [] kthread+0xd1/0xe0 [2669260.102901] [] ret_from_fork_nospec_begin+0xe/0x21 [2669260.109552] [] 0xffffffffffffffff [2669260.114746] LNet: Service thread pid 115591 was inactive for 201.95s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2669260.127868] LNet: Skipped 22 previous similar messages [2669393.078305] Lustre: fir-MDT0001: Client 535a7c41-41b9-ec51-bbd6-e028c4ebcb2c (at 10.8.8.1@o2ib6) reconnecting [2669393.088398] Lustre: Skipped 298 previous similar messages [2669413.145366] LustreError: dumping log to /tmp/lustre-log.1554824578.115653 [2669450.532799] Lustre: fir-MDT0003: haven't heard from client a954e878-9e3f-06fe-0694-8bc8efe6903b (at 10.8.26.33@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ec2dd321400, cur 1554824616 expire 1554824466 last 1554824389 [2669450.554673] Lustre: Skipped 2 previous similar messages [2669562.651015] LNet: Service thread pid 115590 was inactive for 200.55s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2669562.668213] LNet: Skipped 9 previous similar messages [2669562.673455] Pid: 115590, comm: mdt01_011 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2669562.683467] Call Trace: [2669562.686115] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2669562.693255] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2669562.700657] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2669562.707674] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2669562.714868] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2669562.721650] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2669562.728310] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2669562.735278] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2669562.742577] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2669562.748951] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2669562.756078] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2669562.764021] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2669562.770534] [] kthread+0xd1/0xe0 [2669562.775655] [] ret_from_fork_nospec_begin+0xe/0x21 [2669562.782314] [] 0xffffffffffffffff [2669562.787526] LustreError: dumping log to /tmp/lustre-log.1554824728.115590 [2669563.123329] Pid: 115897, comm: mdt02_065 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2669563.133336] Call Trace: [2669563.135983] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2669563.143125] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2669563.150510] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2669563.157536] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2669563.164743] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2669563.171409] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2669563.178600] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2669563.185351] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2669563.192019] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2669563.199002] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2669563.206325] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2669563.212696] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2669563.219820] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2669563.227740] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2669563.234267] [] kthread+0xd1/0xe0 [2669563.239371] [] ret_from_fork_nospec_begin+0xe/0x21 [2669563.246017] [] 0xffffffffffffffff [2669567.308073] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 150s: evicting client at 10.8.8.6@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ec3f371e0c0/0xbc329461cbf1bd20 lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 55 type: IBT flags: 0x60200400000020 nid: 10.8.8.6@o2ib6 remote: 0x28be6c2d1aabfa83 expref: 127 pid: 115887 timeout: 2669546 lvb_type: 0 [2669567.346180] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 5 previous similar messages [2669567.356502] LNet: Service thread pid 115928 completed after 509.39s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2669567.372949] LNet: Skipped 19 previous similar messages [2669593.371357] Pid: 115566, comm: mdt02_005 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2669593.381355] Call Trace: [2669593.383995] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2669593.391114] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2669593.398479] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2669593.405484] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2669593.412677] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2669593.419418] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2669593.426076] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2669593.433008] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2669593.440289] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2669593.446650] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2669593.453764] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2669593.461652] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2669593.468162] [] kthread+0xd1/0xe0 [2669593.473248] [] ret_from_fork_nospec_begin+0xe/0x21 [2669593.479913] [] 0xffffffffffffffff [2669593.485107] LustreError: dumping log to /tmp/lustre-log.1554824758.115566 [2669593.536360] Pid: 116124, comm: mdt01_089 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2669593.546361] Call Trace: [2669593.549001] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2669593.556120] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2669593.563487] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2669593.570489] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2669593.577676] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2669593.584338] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2669593.591515] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2669593.598258] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2669593.604916] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2669593.611847] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2669593.619126] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2669593.625463] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2669593.632577] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2669593.640466] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2669593.646965] [] kthread+0xd1/0xe0 [2669593.652051] [] ret_from_fork_nospec_begin+0xe/0x21 [2669593.658714] [] 0xffffffffffffffff [2669598.323411] Lustre: 115965:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8eca3de04b00 x1629298149200864/t0(0) o101->a4daaf47-6ec9-4753-388e-0d0b7a7f70d6@10.8.27.25@o2ib6:8/0 lens 568/0 e 0 to 0 dl 1554824768 ref 2 fl Interpret:/0/ffffffff rc 0/-1 [2669598.353008] Lustre: 115965:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 27 previous similar messages [2669624.603704] Pid: 115695, comm: mdt01_029 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2669624.613705] Call Trace: [2669624.616349] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2669624.623467] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2669624.630877] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2669624.637881] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2669624.645084] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2669624.651746] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2669624.658936] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2669624.665683] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2669624.672357] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2669624.679297] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2669624.686595] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2669624.692947] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2669624.700080] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2669624.707987] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2669624.714504] [] kthread+0xd1/0xe0 [2669624.719598] [] ret_from_fork_nospec_begin+0xe/0x21 [2669624.726264] [] 0xffffffffffffffff [2669624.731466] LustreError: dumping log to /tmp/lustre-log.1554824790.115695 [2669697.266167] Lustre: fir-MDT0001: Connection restored to a4daaf47-6ec9-4753-388e-0d0b7a7f70d6 (at 10.8.27.25@o2ib6) [2669697.276689] Lustre: Skipped 310 previous similar messages [2669773.597252] LustreError: dumping log to /tmp/lustre-log.1554824939.115608 [2669812.364682] LustreError: 115824:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554824887, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8eca10106e40/0xbc329461d54487ce lrc: 3/1,0 mode: --/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 53 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 115824 timeout: 0 lvb_type: 0 [2669812.404586] LustreError: 115824:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 11 previous similar messages [2669867.313743] LustreError: 115824:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8eb4b9cc6c00 ns: mdt-fir-MDT0001_UUID lock: ffff8eca10106e40/0xbc329461d54487ce lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 36 type: IBT flags: 0x50200000000000 nid: 10.8.27.29@o2ib6 remote: 0xd991486401930541 expref: 3 pid: 115824 timeout: 0 lvb_type: 0 [2669867.348801] LustreError: 115824:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 4 previous similar messages [2669867.359285] Lustre: 115566:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (36:438s); client may timeout. req@ffff8ed8a3e21800 x1628647353000592/t0(0) o101->eeceaf0a-f64b-44e4-4f28-fac655ebb0a4@10.8.17.19@o2ib6:14/0 lens 568/2296 e 0 to 0 dl 1554824594 ref 1 fl Complete:/0/0 rc -107/-107 [2669867.388533] Lustre: 115566:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 12 previous similar messages [2669993.833238] Lustre: fir-MDT0001: Client eed99957-e395-8d59-f471-3be5bc5334d2 (at 10.8.27.31@o2ib6) reconnecting [2669993.843513] Lustre: Skipped 306 previous similar messages [2670070.560531] Pid: 115347, comm: mdt01_001 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2670070.570589] Call Trace: [2670070.573235] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2670070.580361] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2670070.587744] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2670070.594751] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2670070.601934] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2670070.608667] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2670070.615337] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2670070.622274] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2670070.629554] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2670070.635898] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2670070.643039] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2670070.650926] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2670070.657426] [] kthread+0xd1/0xe0 [2670070.662513] [] ret_from_fork_nospec_begin+0xe/0x21 [2670070.669194] [] 0xffffffffffffffff [2670070.674391] LustreError: dumping log to /tmp/lustre-log.1554825236.115347 [2670071.547301] Pid: 116287, comm: mdt03_067 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2670071.557302] Call Trace: [2670071.559947] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2670071.567062] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2670071.574430] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2670071.581435] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2670071.588610] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2670071.595275] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2670071.602461] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2670071.609203] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2670071.615858] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2670071.622792] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2670071.630071] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2670071.636406] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2670071.643517] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2670071.651411] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2670071.657920] [] kthread+0xd1/0xe0 [2670071.663017] [] ret_from_fork_nospec_begin+0xe/0x21 [2670071.669671] [] 0xffffffffffffffff [2670071.674870] Pid: 115617, comm: mdt01_020 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2670071.684865] Call Trace: [2670071.687506] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2670071.694631] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2670071.701996] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2670071.709001] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2670071.716178] [] mdt_object_lock+0x20/0x30 [mdt] [2670071.722488] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2670071.728979] [] mdt_intent_brw+0x1f/0x30 [mdt] [2670071.735202] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2670071.741856] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2670071.748799] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2670071.756087] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2670071.762434] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2670071.769567] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2670071.777461] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2670071.783969] [] kthread+0xd1/0xe0 [2670071.789057] [] ret_from_fork_nospec_begin+0xe/0x21 [2670071.795706] [] 0xffffffffffffffff [2670071.800910] Pid: 115857, comm: mdt01_057 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2670071.810908] Call Trace: [2670071.813553] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2670071.820676] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2670071.828050] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2670071.835070] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2670071.842252] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2670071.848909] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2670071.856094] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2670071.862845] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2670071.869504] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2670071.876452] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2670071.883742] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2670071.890092] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2670071.897218] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2670071.905115] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2670071.911630] [] kthread+0xd1/0xe0 [2670071.916718] [] ret_from_fork_nospec_begin+0xe/0x21 [2670071.923365] [] 0xffffffffffffffff [2670071.928554] Pid: 115937, comm: mdt01_074 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2670071.938553] Call Trace: [2670071.941197] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2670071.948310] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2670071.955692] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2670071.962704] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2670071.969882] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2670071.976547] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2670071.983746] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2670071.990490] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2670071.997147] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2670072.004097] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2670072.011393] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2670072.017738] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2670072.024880] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2670072.032784] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2670072.039301] [] kthread+0xd1/0xe0 [2670072.044386] [] ret_from_fork_nospec_begin+0xe/0x21 [2670072.051038] [] 0xffffffffffffffff [2670072.056239] LNet: Service thread pid 115876 was inactive for 202.21s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2670072.069364] LNet: Skipped 15 previous similar messages [2670169.314610] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 150s: evicting client at 10.8.17.19@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ed3c7a33f00/0xbc329461d8627800 lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 55 type: IBT flags: 0x60200400000020 nid: 10.8.17.19@o2ib6 remote: 0x8282471ef02b3c39 expref: 67 pid: 115867 timeout: 2670148 lvb_type: 0 [2670169.352978] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 3 previous similar messages [2670169.363132] LNet: Service thread pid 115870 completed after 299.52s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2670169.379552] LNet: Skipped 22 previous similar messages [2670199.073927] Lustre: 115913:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8eb8b5599800 x1628649158950160/t0(0) o101->eed99957-e395-8d59-f471-3be5bc5334d2@10.8.27.31@o2ib6:9/0 lens 568/0 e 0 to 0 dl 1554825369 ref 2 fl Interpret:/0/ffffffff rc 0/-1 [2670199.103558] Lustre: 115913:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 25 previous similar messages [2670225.186208] LustreError: dumping log to /tmp/lustre-log.1554825390.115942 [2670259.315743] Lustre: 116334:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (154:235s); client may timeout. req@ffff8ed4d09bf800 x1628649284400960/t133465098973(0) o36->2c78cdad-2975-ca98-fb36-e7548576f834@10.8.17.17@o2ib6:5/0 lens 488/424 e 0 to 0 dl 1554825189 ref 1 fl Complete:/0/0 rc 0/0 [2670259.316495] LustreError: 115928:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ed806018800 ns: mdt-fir-MDT0001_UUID lock: ffff8ed98d09ec00/0xbc329461df180204 lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 44 type: IBT flags: 0x50200000000000 nid: 10.8.27.26@o2ib6 remote: 0x3a52d8e5f911a18e expref: 3 pid: 115928 timeout: 0 lvb_type: 0 [2670259.316498] LustreError: 115928:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 7 previous similar messages [2670259.390632] Lustre: 116334:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 4 previous similar messages [2670299.437864] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2670299.448479] Lustre: Skipped 300 previous similar messages [2670463.268765] LNet: Service thread pid 115931 was inactive for 200.27s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2670463.285970] LNet: Skipped 9 previous similar messages [2670463.291201] Pid: 115931, comm: mdt01_072 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2670463.301201] Call Trace: [2670463.303842] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2670463.310989] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2670463.318357] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2670463.325360] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2670463.332537] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2670463.339223] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2670463.346404] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2670463.353147] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2670463.359811] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2670463.366770] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2670463.374068] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2670463.380428] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2670463.387553] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2670463.395447] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2670463.401947] [] kthread+0xd1/0xe0 [2670463.407063] [] ret_from_fork_nospec_begin+0xe/0x21 [2670463.413708] [] 0xffffffffffffffff [2670463.418901] LustreError: dumping log to /tmp/lustre-log.1554825628.115931 [2670464.124426] Pid: 115816, comm: mdt02_046 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2670464.134429] Call Trace: [2670464.137067] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2670464.144200] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2670464.151587] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2670464.158593] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2670464.165771] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2670464.172446] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2670464.179629] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2670464.186376] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2670464.193051] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2670464.199993] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2670464.207282] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2670464.213626] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2670464.220739] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2670464.228655] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2670464.235178] [] kthread+0xd1/0xe0 [2670464.240267] [] ret_from_fork_nospec_begin+0xe/0x21 [2670464.246914] [] 0xffffffffffffffff [2670464.252115] Pid: 115937, comm: mdt01_074 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2670464.262111] Call Trace: [2670464.264750] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2670464.271858] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2670464.279225] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2670464.286226] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2670464.293404] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2670464.300077] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2670464.307253] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2670464.313996] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2670464.320653] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2670464.327585] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2670464.334874] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2670464.341217] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2670464.348341] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2670464.356236] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2670464.362746] [] kthread+0xd1/0xe0 [2670464.367834] [] ret_from_fork_nospec_begin+0xe/0x21 [2670464.374500] [] 0xffffffffffffffff [2670464.379705] Pid: 115617, comm: mdt01_020 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2670464.389702] Call Trace: [2670464.392342] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2670464.399457] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2670464.406825] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2670464.413835] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2670464.421013] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2670464.427691] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2670464.434872] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2670464.441614] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2670464.448292] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2670464.455229] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2670464.462523] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2670464.468871] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2670464.476001] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2670464.483891] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2670464.490390] [] kthread+0xd1/0xe0 [2670464.495477] [] ret_from_fork_nospec_begin+0xe/0x21 [2670464.502124] [] 0xffffffffffffffff [2670464.507322] Pid: 115346, comm: mdt01_000 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2670464.517340] Call Trace: [2670464.519978] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2670464.527093] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2670464.534470] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2670464.541472] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2670464.548647] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2670464.555304] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2670464.562482] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2670464.569248] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2670464.575905] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2670464.582846] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2670464.590135] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2670464.596479] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2670464.603597] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2670464.611491] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2670464.618019] [] kthread+0xd1/0xe0 [2670464.623131] [] ret_from_fork_nospec_begin+0xe/0x21 [2670464.629776] [] 0xffffffffffffffff [2670472.318995] LustreError: 115346:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8eb330059c00 ns: mdt-fir-MDT0001_UUID lock: ffff8ebfed3ade80/0xbc329461dfa9f533 lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x1b/0x0 rrc: 48 type: IBT flags: 0x50200000000000 nid: 10.8.27.28@o2ib6 remote: 0x1df96237acffb8ce expref: 5 pid: 115346 timeout: 0 lvb_type: 0 [2670472.354053] LustreError: 115346:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 4 previous similar messages [2670472.364552] Lustre: 116172:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (154:24s); client may timeout. req@ffff8eb692ec0600 x1628574042649184/t0(0) o101->40dfa1b3-0e1e-4763-9ec6-4ac10d77215c@10.8.27.26@o2ib6:9/0 lens 568/2296 e 0 to 0 dl 1554825613 ref 1 fl Complete:/0/0 rc -107/-107 [2670472.393735] Lustre: 116172:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 2 previous similar messages [2670564.819827] LustreError: 116313:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554825640, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8edbb1b2e780/0xbc329461e40f4976 lrc: 3/0,1 mode: --/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 54 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 116313 timeout: 0 lvb_type: 0 [2670564.859768] LustreError: 116313:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 60 previous similar messages [2670598.820230] Lustre: fir-MDT0001: Client 8c206ea7-4fa6-6560-2c3b-626d4cc9e42f (at 10.8.8.3@o2ib6) reconnecting [2670598.830314] Lustre: Skipped 276 previous similar messages [2670675.239025] LNet: Service thread pid 115937 was inactive for 200.41s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2670675.252144] LNet: Skipped 27 previous similar messages [2670675.257483] LustreError: dumping log to /tmp/lustre-log.1554825840.115937 [2670675.751017] LustreError: dumping log to /tmp/lustre-log.1554825841.115931 [2670684.320516] LustreError: 115816:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ec48f917400 ns: mdt-fir-MDT0001_UUID lock: ffff8ede74a45c40/0xbc329461e40f5577 lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x1b/0x0 rrc: 51 type: IBT flags: 0x50200400000020 nid: 10.8.17.17@o2ib6 remote: 0xe31ee7d82679efe1 expref: 4 pid: 115816 timeout: 0 lvb_type: 0 [2670684.355566] LustreError: 115816:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 4 previous similar messages [2670684.366025] Lustre: 115816:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (154:55s); client may timeout. req@ffff8edafcb38300 x1628649285158928/t0(0) o101->2c78cdad-2975-ca98-fb36-e7548576f834@10.8.17.17@o2ib6:10/0 lens 576/1792 e 0 to 0 dl 1554825794 ref 1 fl Complete:/0/0 rc -107/-107 [2670684.395332] Lustre: 115816:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 2 previous similar messages [2670786.321221] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 30s: evicting client at 10.8.27.26@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8eb3ac6945c0/0xbc329461e8e0b0d7 lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 57 type: IBT flags: 0x60200400000020 nid: 10.8.27.26@o2ib6 remote: 0x3a52d8e5f9125780 expref: 134 pid: 115902 timeout: 2670765 lvb_type: 0 [2670786.359613] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 11 previous similar messages [2670813.224502] Lustre: 115850:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8edb88c6bc00 x1628574042803168/t0(0) o101->40dfa1b3-0e1e-4763-9ec6-4ac10d77215c@10.8.27.26@o2ib6:23/0 lens 576/3264 e 0 to 0 dl 1554825983 ref 2 fl Interpret:/0/0 rc 0/0 [2670813.253753] Lustre: 115850:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 78 previous similar messages [2670837.550732] Lustre: fir-MDT0001: haven't heard from client 60db087e-d6f8-e589-8a4d-35121c9f0611 (at 10.8.26.33@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ecdf657f000, cur 1554826003 expire 1554825853 last 1554825776 [2670837.572662] Lustre: Skipped 1 previous similar message [2670852.547994] Lustre: fir-MDT0003: haven't heard from client 60db087e-d6f8-e589-8a4d-35121c9f0611 (at 10.8.26.33@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ecdf6616c00, cur 1554826018 expire 1554825868 last 1554825791 [2670852.569909] Lustre: Skipped 1 previous similar message [2670896.937404] Pid: 115575, comm: mdt01_006 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2670896.947411] Call Trace: [2670896.950049] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2670896.957188] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2670896.964571] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2670896.971591] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2670896.978770] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2670896.985441] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2670896.992676] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2670896.999429] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2670897.006140] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2670897.013444] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2670897.021095] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2670897.027474] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2670897.034646] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2670897.042584] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2670897.049093] [] kthread+0xd1/0xe0 [2670897.054188] [] ret_from_fork_nospec_begin+0xe/0x21 [2670897.060867] [] 0xffffffffffffffff [2670897.066065] LustreError: dumping log to /tmp/lustre-log.1554826062.115575 [2670897.412371] Pid: 116121, comm: mdt01_087 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2670897.422374] Call Trace: [2670897.425017] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2670897.432137] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2670897.439538] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2670897.446539] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2670897.453734] [] mdt_object_lock+0x20/0x30 [mdt] [2670897.460058] [] mdt_hsm_state_set+0xc9/0x830 [mdt] [2670897.466629] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2670897.473773] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2670897.481676] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2670897.488223] [] kthread+0xd1/0xe0 [2670897.493316] [] ret_from_fork_nospec_begin+0xe/0x21 [2670897.499962] [] 0xffffffffffffffff [2670897.505183] Pid: 115869, comm: mdt01_060 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2670897.515184] Call Trace: [2670897.517824] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2670897.524940] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2670897.532359] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2670897.539362] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2670897.546539] [] mdt_object_lock+0x20/0x30 [mdt] [2670897.552848] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2670897.559329] [] mdt_intent_brw+0x1f/0x30 [mdt] [2670897.565552] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2670897.572199] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2670897.579150] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2670897.586473] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2670897.592828] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2670897.599941] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2670897.607845] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2670897.614358] [] kthread+0xd1/0xe0 [2670897.619461] [] ret_from_fork_nospec_begin+0xe/0x21 [2670897.626138] [] 0xffffffffffffffff [2670897.631338] Pid: 115892, comm: mdt02_063 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2670897.641337] Call Trace: [2670897.643968] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2670897.651083] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2670897.658469] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2670897.665472] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2670897.672681] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2670897.679357] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2670897.686547] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2670897.693291] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2670897.699963] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2670897.706908] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2670897.714195] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2670897.720541] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2670897.727665] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2670897.735559] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2670897.742070] [] kthread+0xd1/0xe0 [2670897.747154] [] ret_from_fork_nospec_begin+0xe/0x21 [2670897.753803] [] 0xffffffffffffffff [2670897.759000] Pid: 115890, comm: mdt01_064 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2670897.769003] Call Trace: [2670897.771641] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2670897.778754] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2670897.786139] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2670897.793140] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2670897.800328] [] mdt_object_lock+0x20/0x30 [mdt] [2670897.806636] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2670897.813118] [] mdt_intent_brw+0x1f/0x30 [mdt] [2670897.819331] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2670897.825989] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2670897.832931] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2670897.840219] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2670897.846581] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2670897.853706] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2670897.861592] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2670897.868102] [] kthread+0xd1/0xe0 [2670897.873187] [] ret_from_fork_nospec_begin+0xe/0x21 [2670897.879835] [] 0xffffffffffffffff [2670904.584842] Lustre: fir-MDT0001: Connection restored to a954e878-9e3f-06fe-0694-8bc8efe6903b (at 10.8.26.33@o2ib6) [2670904.595378] Lustre: Skipped 274 previous similar messages [2670927.657724] LustreError: dumping log to /tmp/lustre-log.1554826093.115822 [2670956.842038] LustreError: dumping log to /tmp/lustre-log.1554826122.115566 [2670966.323302] LNet: Service thread pid 115608 completed after 269.88s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2670966.323369] LustreError: 116166:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ec6cb11cc00 ns: mdt-fir-MDT0001_UUID lock: ffff8ed2deb2bf00/0xbc329461eb9c84ed lrc: 5/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 48 type: IBT flags: 0x50200000000000 nid: 10.8.17.17@o2ib6 remote: 0xe31ee7d8267a459a expref: 29 pid: 116166 timeout: 0 lvb_type: 0 [2670966.323372] LustreError: 116166:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 3 previous similar messages [2670966.323471] Lustre: 115930:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (31:147s); client may timeout. req@ffff8edb88c6e600 x1628638025903504/t0(0) o101->7d8ca85d-8b80-6a23-8fa9-83dca7eb7196@10.8.27.28@o2ib6:24/0 lens 568/2296 e 0 to 0 dl 1554825984 ref 1 fl Complete:/0/0 rc -107/-107 [2670966.323474] Lustre: 115930:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 1 previous similar message [2670966.424773] LNet: Skipped 69 previous similar messages [2671122.942311] LustreError: 116084:0:(mdt_io.c:442:mdt_preprw_write()) fir-MDT0003: BRW to missing obj [0x2800118fb:0x13ce:0x0] [2671169.324307] LustreError: dumping log to /tmp/lustre-log.1554826334.115607 [2671200.059555] Lustre: fir-MDT0003: Client ecd69873-9a0a-0293-0ce7-d92dc18bf8d9 (at 10.9.101.11@o2ib4) reconnecting [2671200.069907] Lustre: Skipped 269 previous similar messages [2671210.491217] LustreError: 116119:0:(mdt_io.c:442:mdt_preprw_write()) fir-MDT0003: BRW to missing obj [0x28000f97e:0xcabc:0x0] [2671213.770766] LustreError: 115897:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554826289, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8eef91a9aac0/0xbc329461f10af612 lrc: 3/1,0 mode: --/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 55 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 115897 timeout: 0 lvb_type: 0 [2671213.810698] LustreError: 115897:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 48 previous similar messages [2671323.949953] LNet: Service thread pid 115617 was inactive for 200.13s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2671323.967155] LNet: Skipped 9 previous similar messages [2671323.972390] Pid: 115617, comm: mdt01_020 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2671323.982394] Call Trace: [2671323.985037] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2671323.992184] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2671323.999574] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2671324.006582] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2671324.013756] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2671324.020413] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2671324.027589] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2671324.034354] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2671324.041013] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2671324.047989] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2671324.055271] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2671324.061641] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2671324.068755] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2671324.076707] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2671324.083220] [] kthread+0xd1/0xe0 [2671324.088307] [] ret_from_fork_nospec_begin+0xe/0x21 [2671324.094958] [] 0xffffffffffffffff [2671324.100157] LustreError: dumping log to /tmp/lustre-log.1554826489.115617 [2671324.404038] Pid: 115897, comm: mdt02_065 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2671324.414032] Call Trace: [2671324.416681] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2671324.423799] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2671324.431173] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2671324.438174] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2671324.445354] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2671324.452093] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2671324.458750] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2671324.465692] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2671324.472981] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2671324.479336] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2671324.486459] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2671324.494344] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2671324.500855] [] kthread+0xd1/0xe0 [2671324.505942] [] ret_from_fork_nospec_begin+0xe/0x21 [2671324.512589] [] 0xffffffffffffffff [2671448.328300] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 150s: evicting client at 10.8.17.17@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ee7cd8dad00/0xbc329461eeae2a05 lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 55 type: IBT flags: 0x60200400000020 nid: 10.8.17.17@o2ib6 remote: 0xe31ee7d8267a60e4 expref: 41 pid: 115636 timeout: 2671427 lvb_type: 0 [2671448.366664] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 5 previous similar messages [2671448.376892] LustreError: 115878:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ed4fa9c3400 ns: mdt-fir-MDT0001_UUID lock: ffff8ed0127eb840/0xbc329461eeae3653 lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x1b/0x0 rrc: 46 type: IBT flags: 0x50200400000020 nid: 10.8.27.26@o2ib6 remote: 0x3a52d8e5f9127604 expref: 4 pid: 115878 timeout: 0 lvb_type: 0 [2671448.411993] LustreError: 115878:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 6 previous similar messages [2671448.422437] Lustre: 115878:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (340:139s); client may timeout. req@ffff8ecb04861500 x1628574042848064/t0(0) o101->40dfa1b3-0e1e-4763-9ec6-4ac10d77215c@10.8.27.26@o2ib6:24/0 lens 576/1792 e 0 to 0 dl 1554826474 ref 1 fl Complete:/0/0 rc -107/-107 [2671448.451801] Lustre: 115878:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 9 previous similar messages [2671474.735587] Lustre: 115902:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8eba37e92700 x1628556959539312/t0(0) o101->4cd8fde3-ab19-6a6b-a7ee-5d70c4bd9893@10.8.8.6@o2ib6:25/0 lens 576/3264 e 0 to 0 dl 1554826645 ref 2 fl Interpret:/0/0 rc 0/0 [2671474.764685] Lustre: 115902:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 40 previous similar messages [2671508.553949] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2671508.564565] Lustre: Skipped 291 previous similar messages [2671599.331915] LustreError: 116228:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8eb2b8ddc800 ns: mdt-fir-MDT0001_UUID lock: ffff8ee6c6783180/0xbc329461f6f44181 lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 46 type: IBT flags: 0x50200000000000 nid: 10.8.27.28@o2ib6 remote: 0x1df96237ad00cf77 expref: 3 pid: 116228 timeout: 0 lvb_type: 0 [2671599.367007] LustreError: 116228:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 6 previous similar messages [2671801.651077] Pid: 115878, comm: mdt01_062 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2671801.661084] Call Trace: [2671801.663723] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2671801.670864] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2671801.678275] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2671801.685302] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2671801.692484] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2671801.699141] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2671801.706316] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2671801.713082] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2671801.719759] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2671801.726703] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2671801.734016] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2671801.740359] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2671801.747493] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2671801.755417] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2671801.761938] [] kthread+0xd1/0xe0 [2671801.767043] [] ret_from_fork_nospec_begin+0xe/0x21 [2671801.773694] [] 0xffffffffffffffff [2671801.778885] LustreError: dumping log to /tmp/lustre-log.1554826967.115878 [2671802.790580] Pid: 115598, comm: mdt01_014 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2671802.800585] Call Trace: [2671802.803234] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2671802.810351] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2671802.817736] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2671802.824738] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2671802.831915] [] mdt_object_lock+0x20/0x30 [mdt] [2671802.838222] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2671802.844706] [] mdt_intent_brw+0x1f/0x30 [mdt] [2671802.850930] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2671802.857583] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2671802.864526] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2671802.871850] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2671802.878196] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2671802.885331] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2671802.893231] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2671802.899740] [] kthread+0xd1/0xe0 [2671802.904827] [] ret_from_fork_nospec_begin+0xe/0x21 [2671802.911496] [] 0xffffffffffffffff [2671802.916698] Pid: 115945, comm: mdt01_077 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2671802.926696] Call Trace: [2671802.929339] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2671802.936461] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2671802.943837] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2671802.950855] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2671802.958033] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2671802.964689] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2671802.971866] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2671802.978607] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2671802.985263] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2671802.992225] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2671802.999521] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2671803.005864] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2671803.012979] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2671803.020884] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2671803.027393] [] kthread+0xd1/0xe0 [2671803.032479] [] ret_from_fork_nospec_begin+0xe/0x21 [2671803.039128] [] 0xffffffffffffffff [2671803.044326] Pid: 116301, comm: mdt02_094 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2671803.054326] Call Trace: [2671803.056963] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2671803.064098] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2671803.071472] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2671803.078500] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2671803.085678] [] mdt_object_lock+0x20/0x30 [mdt] [2671803.091986] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2671803.098470] [] mdt_intent_brw+0x1f/0x30 [mdt] [2671803.104690] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2671803.111347] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2671803.118289] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2671803.125570] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2671803.131942] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2671803.139074] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2671803.146978] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2671803.153503] [] kthread+0xd1/0xe0 [2671803.158591] [] ret_from_fork_nospec_begin+0xe/0x21 [2671803.165236] [] 0xffffffffffffffff [2671803.170436] Pid: 115710, comm: mdt00_028 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2671803.180434] Call Trace: [2671803.183078] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2671803.190200] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2671803.197564] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2671803.204567] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2671803.211743] [] mdt_reint_object_lock+0x2c/0x60 [mdt] [2671803.218573] [] mdt_reint_striped_lock+0x8c/0x510 [mdt] [2671803.225575] [] mdt_reint_setattr+0x6c8/0x1340 [mdt] [2671803.232317] [] mdt_reint_rec+0x83/0x210 [mdt] [2671803.238541] [] mdt_reint_internal+0x6e3/0xaf0 [mdt] [2671803.245284] [] mdt_reint+0x67/0x140 [mdt] [2671803.251160] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2671803.258294] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2671803.266190] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2671803.272697] [] kthread+0xd1/0xe0 [2671803.277786] [] ret_from_fork_nospec_begin+0xe/0x21 [2671803.284431] [] 0xffffffffffffffff [2671803.289629] LNet: Service thread pid 115601 was inactive for 202.06s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2671803.302751] LNet: Skipped 47 previous similar messages [2671812.331963] Lustre: fir-MDT0001: Client 40dfa1b3-0e1e-4763-9ec6-4ac10d77215c (at 10.8.27.26@o2ib6) reconnecting [2671812.342228] Lustre: Skipped 285 previous similar messages [2671821.309286] Lustre: 115568:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554826979/real 1554826979] req@ffff8ed70413a400 x1630272327811584/t0(0) o104->fir-MDT0001@10.9.101.36@o2ib4:15/16 lens 296/224 e 0 to 1 dl 1554826986 ref 1 fl Rpc:X/0/ffffffff rc 0/-1 [2671821.336908] Lustre: 115568:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 37 previous similar messages [2671826.049329] Lustre: 115771:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554826984/real 1554826984] req@ffff8ee65736e600 x1630272327867360/t0(0) o104->fir-MDT0001@10.9.101.36@o2ib4:15/16 lens 296/224 e 0 to 1 dl 1554826991 ref 1 fl Rpc:X/0/ffffffff rc 0/-1 [2671826.076927] Lustre: 115771:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 7 previous similar messages [2671834.329420] Lustre: 115346:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554826992/real 1554826992] req@ffff8ecb5d016c00 x1630272327948624/t0(0) o104->fir-MDT0001@10.9.101.36@o2ib4:15/16 lens 296/224 e 0 to 1 dl 1554826999 ref 1 fl Rpc:X/0/ffffffff rc 0/-1 [2671834.357029] Lustre: 115346:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 14 previous similar messages [2671840.332483] LustreError: 115972:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554826915, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ed075d3d340/0xbc329461fd3c7eb0 lrc: 3/0,1 mode: --/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 53 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 115972 timeout: 0 lvb_type: 0 [2671840.332485] LustreError: 115834:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554826915, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ec88c67fbc0/0xbc329461fd3c7e71 lrc: 3/0,1 mode: --/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 52 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 115834 timeout: 0 lvb_type: 0 [2671840.332489] LustreError: 115834:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 47 previous similar messages [2671840.423542] LustreError: 115972:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 3 previous similar messages [2671849.346606] LustreError: 115568:0:(ldlm_lockd.c:682:ldlm_handle_ast_error()) ### client (nid 10.9.101.36@o2ib4) failed to reply to blocking AST (req@ffff8ed70413a400 x1630272327811584 status 0 rc -110), evict it ns: mdt-fir-MDT0001_UUID lock: ffff8ed515d77500/0xbc329461fda248bc lrc: 4/0,0 mode: PR/PR res: [0x24000f5ba:0x17163:0x0].0x0 bits 0x40/0x0 rrc: 32 type: IBT flags: 0x60000400000020 nid: 10.9.101.36@o2ib4 remote: 0x79fbd4145c6947db expref: 1817 pid: 116305 timeout: 2671851 lvb_type: 0 [2671849.390260] LustreError: 138-a: fir-MDT0001: A client on nid 10.9.101.36@o2ib4 was evicted due to a lock blocking callback time out: rc -110 [2671900.333314] LNet: Service thread pid 116301 completed after 299.11s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2671900.349744] LNet: Skipped 22 previous similar messages [2671950.644635] LustreError: dumping log to /tmp/lustre-log.1554827116.116172 [2671960.334715] LustreError: 116361:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ed80623c800 ns: mdt-fir-MDT0001_UUID lock: ffff8eea80933840/0xbc329461fa44234f lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 50 type: IBT flags: 0x50200000000000 nid: 10.9.107.8@o2ib4 remote: 0x309944bb3b191440 expref: 2 pid: 116361 timeout: 0 lvb_type: 0 [2671960.370665] Lustre: 116361:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (154:205s); client may timeout. req@ffff8ee86fcba400 x1628639974136368/t0(0) o101->dc990748-ca32-0960-545e-99af2316a63e@10.9.107.8@o2ib4:26/0 lens 568/2296 e 0 to 0 dl 1554826920 ref 1 fl Complete:/0/0 rc -107/-107 [2672024.560407] Lustre: fir-MDT0003: haven't heard from client c0d56c4d-875b-09b3-2fba-bf596d61ddb1 (at 10.9.101.36@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ee07b082400, cur 1554827190 expire 1554827040 last 1554826963 [2672024.582398] Lustre: Skipped 1 previous similar message [2672119.335383] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 150s: evicting client at 10.9.107.8@o2ib4 ns: mdt-fir-MDT0001_UUID lock: ffff8eefd06ddc40/0xbc329462018b6c48 lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 56 type: IBT flags: 0x60200400000020 nid: 10.9.107.8@o2ib4 remote: 0x309944bb3b192ac8 expref: 44 pid: 115568 timeout: 2672098 lvb_type: 0 [2672119.373738] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 6 previous similar messages [2672124.318182] Lustre: fir-MDT0001: Connection restored to (at 10.8.27.31@o2ib6) [2672124.325584] Lustre: Skipped 307 previous similar messages [2672149.430701] Lustre: 115896:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ed1d4eff800 x1628639974233616/t0(0) o101->dc990748-ca32-0960-545e-99af2316a63e@10.9.107.8@o2ib4:9/0 lens 576/3264 e 0 to 0 dl 1554827319 ref 2 fl Interpret:/0/0 rc 0/0 [2672149.459882] Lustre: 115896:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 80 previous similar messages [2672169.782928] LNet: Service thread pid 115592 was inactive for 200.46s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2672169.800124] LNet: Skipped 6 previous similar messages [2672169.805404] Pid: 115592, comm: mdt01_012 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2672169.815428] Call Trace: [2672169.818066] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2672169.825192] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2672169.832575] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2672169.839594] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2672169.846778] [] mdt_object_lock+0x20/0x30 [mdt] [2672169.853087] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2672169.859570] [] mdt_intent_brw+0x1f/0x30 [mdt] [2672169.865940] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2672169.872597] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2672169.879555] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2672169.886834] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2672169.893178] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2672169.900298] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2672169.908207] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2672169.914733] [] kthread+0xd1/0xe0 [2672169.919820] [] ret_from_fork_nospec_begin+0xe/0x21 [2672169.926469] [] 0xffffffffffffffff [2672169.931678] LustreError: dumping log to /tmp/lustre-log.1554827335.115592 [2672170.322456] Pid: 116172, comm: mdt00_081 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2672170.332463] Call Trace: [2672170.335101] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2672170.342222] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2672170.349615] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2672170.356618] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2672170.363793] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2672170.370449] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2672170.377649] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2672170.384404] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2672170.391060] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2672170.398003] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2672170.405301] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2672170.411654] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2672170.418779] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2672170.426682] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2672170.433217] [] kthread+0xd1/0xe0 [2672170.438312] [] ret_from_fork_nospec_begin+0xe/0x21 [2672170.444968] [] 0xffffffffffffffff [2672170.450166] Pid: 116601, comm: mdt03_107 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2672170.460162] Call Trace: [2672170.462804] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2672170.469935] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2672170.477321] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2672170.484330] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2672170.491508] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2672170.498162] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2672170.505357] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2672170.512099] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2672170.518754] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2672170.525698] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2672170.532987] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2672170.539431] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2672170.546604] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2672170.554489] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2672170.560997] [] kthread+0xd1/0xe0 [2672170.566085] [] ret_from_fork_nospec_begin+0xe/0x21 [2672170.572733] [] 0xffffffffffffffff [2672170.577930] Pid: 115598, comm: mdt01_014 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2672170.587928] Call Trace: [2672170.590569] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2672170.597684] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2672170.605051] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2672170.612054] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2672170.619229] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2672170.625884] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2672170.633052] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2672170.639794] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2672170.646444] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2672170.653376] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2672170.660656] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2672170.667020] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2672170.674132] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2672170.682020] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2672170.688519] [] kthread+0xd1/0xe0 [2672170.693606] [] ret_from_fork_nospec_begin+0xe/0x21 [2672170.700252] [] 0xffffffffffffffff [2672170.705452] Pid: 115925, comm: mdt01_070 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2672170.715454] Call Trace: [2672170.718090] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2672170.725200] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2672170.732566] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2672170.739604] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2672170.746779] [] mdt_object_lock+0x20/0x30 [mdt] [2672170.753096] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2672170.759587] [] mdt_intent_brw+0x1f/0x30 [mdt] [2672170.765812] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2672170.772466] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2672170.779425] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2672170.786708] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2672170.793047] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2672170.800174] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2672170.808071] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2672170.814569] [] kthread+0xd1/0xe0 [2672170.819665] [] ret_from_fork_nospec_begin+0xe/0x21 [2672170.826313] [] 0xffffffffffffffff [2672296.249113] LNetError: 115139:0:(lib-msg.c:811:lnet_is_health_check()) Msg is in inconsistent state, don't perform health checking (0, 5) [2672324.408567] LustreError: dumping log to /tmp/lustre-log.1554827489.116313 [2672350.008829] LustreError: dumping log to /tmp/lustre-log.1554827515.115886 [2672422.503344] Lustre: fir-MDT0001: Client bf6bd63b-9206-2f81-2780-2b483791a8c1 (at 10.8.17.15@o2ib6) reconnecting [2672422.513607] Lustre: Skipped 304 previous similar messages [2672450.503893] LustreError: 115890:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554827525, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8eccbfd26e40/0xbc329462092b97f5 lrc: 3/1,0 mode: --/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 54 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 115890 timeout: 0 lvb_type: 0 [2672450.543828] LustreError: 115890:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 30 previous similar messages [2672504.634457] Pid: 115972, comm: mdt01_086 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2672504.644463] Call Trace: [2672504.647102] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2672504.654232] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2672504.661624] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2672504.668625] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2672504.675803] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2672504.682554] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2672504.689220] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2672504.696165] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2672504.703459] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2672504.709802] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2672504.716917] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2672504.724803] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2672504.731303] [] kthread+0xd1/0xe0 [2672504.736393] [] ret_from_fork_nospec_begin+0xe/0x21 [2672504.743039] [] 0xffffffffffffffff [2672504.748234] LustreError: dumping log to /tmp/lustre-log.1554827670.115972 [2672504.999316] Pid: 115903, comm: mdt01_068 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2672505.009322] Call Trace: [2672505.011971] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2672505.019094] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2672505.026486] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2672505.033489] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2672505.040666] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2672505.047322] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2672505.054497] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2672505.061240] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2672505.067895] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2672505.074854] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2672505.082146] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2672505.088496] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2672505.095628] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2672505.103516] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2672505.110043] [] kthread+0xd1/0xe0 [2672505.115129] [] ret_from_fork_nospec_begin+0xe/0x21 [2672505.121777] [] 0xffffffffffffffff [2672509.340088] LNet: Service thread pid 115925 completed after 540.01s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2672509.356504] LNet: Skipped 22 previous similar messages [2672535.866791] Pid: 115653, comm: mdt00_022 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2672535.876789] Call Trace: [2672535.879431] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2672535.886562] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2672535.893962] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2672535.900967] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2672535.908141] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2672535.914812] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2672535.921992] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2672535.928777] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2672535.935432] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2672535.942366] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2672535.949646] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2672535.955991] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2672535.963113] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2672535.971000] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2672535.977517] [] kthread+0xd1/0xe0 [2672535.982605] [] ret_from_fork_nospec_begin+0xe/0x21 [2672535.989243] [] 0xffffffffffffffff [2672535.994439] LustreError: dumping log to /tmp/lustre-log.1554827701.115653 [2672536.041276] Pid: 115959, comm: mdt01_081 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2672536.051274] Call Trace: [2672536.053910] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2672536.061022] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2672536.068391] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2672536.075394] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2672536.082587] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2672536.089321] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2672536.095974] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2672536.102925] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2672536.110207] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2672536.116567] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2672536.123701] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2672536.131597] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2672536.138134] [] kthread+0xd1/0xe0 [2672536.143228] [] ret_from_fork_nospec_begin+0xe/0x21 [2672536.149874] [] 0xffffffffffffffff [2672560.955051] Pid: 115779, comm: mdt01_040 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2672560.965060] Call Trace: [2672560.967704] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2672560.974820] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2672560.982211] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2672560.989214] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2672560.996390] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2672561.003045] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2672561.010222] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2672561.016956] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2672561.023622] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2672561.030553] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2672561.037850] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2672561.044195] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2672561.051328] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2672561.059214] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2672561.065751] [] kthread+0xd1/0xe0 [2672561.070836] [] ret_from_fork_nospec_begin+0xe/0x21 [2672561.077499] [] 0xffffffffffffffff [2672561.082697] LustreError: dumping log to /tmp/lustre-log.1554827726.115779 [2672561.117396] LNet: Service thread pid 115890 was inactive for 200.61s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2672561.130520] LNet: Skipped 31 previous similar messages [2672689.341607] LustreError: 116601:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ec2c6138400 ns: mdt-fir-MDT0001_UUID lock: ffff8edfeb66a400/0xbc329462018b8c93 lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x1b/0x0 rrc: 54 type: IBT flags: 0x50200400000020 nid: 10.9.107.8@o2ib4 remote: 0x309944bb3b192ba8 expref: 4 pid: 116601 timeout: 0 lvb_type: 0 [2672689.376660] LustreError: 116601:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 3 previous similar messages [2672689.387046] Lustre: 116601:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (154:566s); client may timeout. req@ffff8ee75c9acb00 x1628639974203680/t0(0) o101->dc990748-ca32-0960-545e-99af2316a63e@10.9.107.8@o2ib4:4/0 lens 576/1792 e 0 to 0 dl 1554827288 ref 1 fl Complete:/0/0 rc -107/-107 [2672689.416294] Lustre: 116601:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 12 previous similar messages [2672729.027549] Lustre: fir-MDT0001: Connection restored to eeceaf0a-f64b-44e4-4f28-fac655ebb0a4 (at 10.8.17.19@o2ib6) [2672729.038087] Lustre: Skipped 302 previous similar messages [2672847.343055] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 149s: evicting client at 10.9.107.8@o2ib4 ns: mdt-fir-MDT0001_UUID lock: ffff8eb450c32640/0xbc3294620f50d88f lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 49 type: IBT flags: 0x60200400000020 nid: 10.9.107.8@o2ib4 remote: 0x309944bb3b1953d3 expref: 54 pid: 115886 timeout: 2672826 lvb_type: 0 [2672847.381441] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 7 previous similar messages [2672878.242382] Lustre: 116192:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8eb281145700 x1628546095190720/t0(0) o101->ac4b3f39-4aa4-91f4-89f9-e38a651f2f99@10.9.107.5@o2ib4:18/0 lens 568/0 e 0 to 0 dl 1554828048 ref 2 fl Interpret:/0/ffffffff rc 0/-1 [2672878.272057] Lustre: 116192:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 38 previous similar messages [2672898.366599] LNet: Service thread pid 115759 was inactive for 200.33s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2672898.383812] LNet: Skipped 9 previous similar messages [2672898.389050] Pid: 115759, comm: mdt01_038 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2672898.399092] Call Trace: [2672898.401733] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2672898.408862] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2672898.416243] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2672898.423249] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2672898.430449] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2672898.437105] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2672898.444292] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2672898.451032] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2672898.457699] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2672898.464649] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2672898.471971] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2672898.478339] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2672898.485482] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2672898.493380] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2672898.499927] [] kthread+0xd1/0xe0 [2672898.505027] [] ret_from_fork_nospec_begin+0xe/0x21 [2672898.511689] [] 0xffffffffffffffff [2672898.516886] LustreError: dumping log to /tmp/lustre-log.1554828063.115759 [2672898.943993] Pid: 115890, comm: mdt01_064 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2672898.953993] Call Trace: [2672898.956634] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2672898.963750] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2672898.971118] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2672898.978131] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2672898.985305] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2672898.991963] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2672898.999137] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2672899.005906] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2672899.012563] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2672899.019496] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2672899.026787] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2672899.033137] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2672899.040251] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2672899.048149] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2672899.054659] [] kthread+0xd1/0xe0 [2672899.059752] [] ret_from_fork_nospec_begin+0xe/0x21 [2672899.066401] [] 0xffffffffffffffff [2672899.071614] Pid: 115871, comm: mdt01_061 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2672899.081613] Call Trace: [2672899.084255] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2672899.091368] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2672899.098737] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2672899.105738] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2672899.112931] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2672899.119690] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2672899.126364] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2672899.133299] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2672899.140581] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2672899.146937] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2672899.154056] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2672899.161975] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2672899.168487] [] kthread+0xd1/0xe0 [2672899.173575] [] ret_from_fork_nospec_begin+0xe/0x21 [2672899.180221] [] 0xffffffffffffffff [2672899.185428] Pid: 115886, comm: mdt00_053 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2672899.195427] Call Trace: [2672899.198065] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2672899.205174] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2672899.212550] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2672899.219562] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2672899.226767] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2672899.233453] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2672899.240643] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2672899.247406] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2672899.254077] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2672899.261013] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2672899.268292] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2672899.274643] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2672899.281760] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2672899.289647] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2672899.296156] [] kthread+0xd1/0xe0 [2672899.301252] [] ret_from_fork_nospec_begin+0xe/0x21 [2672899.307901] [] 0xffffffffffffffff [2672899.313086] Pid: 116191, comm: mdt01_107 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2672899.323086] Call Trace: [2672899.325724] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2672899.332834] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2672899.340201] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2672899.347204] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2672899.354396] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2672899.361053] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2672899.368229] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2672899.374972] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2672899.381628] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2672899.388561] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2672899.395840] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2672899.402178] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2672899.409293] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2672899.417198] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2672899.423705] [] kthread+0xd1/0xe0 [2672899.428794] [] ret_from_fork_nospec_begin+0xe/0x21 [2672899.435441] [] 0xffffffffffffffff [2673027.656059] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2673027.666422] Lustre: Skipped 296 previous similar messages [2673053.504240] LustreError: dumping log to /tmp/lustre-log.1554828218.116124 [2673057.345552] LustreError: 115592:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ec907afe400 ns: mdt-fir-MDT0001_UUID lock: ffff8ecd20e63a80/0xbc3294620f50e29f lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x1b/0x0 rrc: 43 type: IBT flags: 0x50200400000020 nid: 10.8.17.19@o2ib6 remote: 0x8282471ef02d8fef expref: 108 pid: 115592 timeout: 0 lvb_type: 0 [2673057.380778] LustreError: 115592:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 9 previous similar messages [2673057.391898] Lustre: 115963:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (30:19s); client may timeout. req@ffff8eb732017800 x1628649285987760/t0(0) o101->2c78cdad-2975-ca98-fb36-e7548576f834@10.8.17.17@o2ib6:23/0 lens 568/2296 e 0 to 0 dl 1554828203 ref 1 fl Complete:/0/0 rc -107/-107 [2673057.421082] Lustre: 115963:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 3 previous similar messages [2673148.106235] LustreError: 115667:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554828223, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8eeb22605e80/0xbc32946216fa7bb6 lrc: 3/0,1 mode: --/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 50 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 115667 timeout: 0 lvb_type: 0 [2673148.146144] LustreError: 115667:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 32 previous similar messages [2673258.306385] Pid: 115963, comm: mdt00_077 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2673258.316408] Call Trace: [2673258.319055] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2673258.326183] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2673258.333558] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2673258.340560] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2673258.347734] [] mdt_object_lock+0x20/0x30 [mdt] [2673258.354046] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2673258.360538] [] mdt_intent_brw+0x1f/0x30 [mdt] [2673258.366776] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2673258.373433] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2673258.380386] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2673258.387698] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2673258.394072] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2673258.401220] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2673258.409123] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2673258.415649] [] kthread+0xd1/0xe0 [2673258.420737] [] ret_from_fork_nospec_begin+0xe/0x21 [2673258.427384] [] 0xffffffffffffffff [2673258.432579] LustreError: dumping log to /tmp/lustre-log.1554828423.115963 [2673258.704078] Pid: 116172, comm: mdt00_081 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2673258.714093] Call Trace: [2673258.716742] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2673258.723874] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2673258.731248] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2673258.738262] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2673258.745447] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2673258.752101] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2673258.759281] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2673258.766028] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2673258.772695] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2673258.779645] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2673258.786951] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2673258.793304] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2673258.800446] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2673258.808354] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2673258.814887] [] kthread+0xd1/0xe0 [2673258.819979] [] ret_from_fork_nospec_begin+0xe/0x21 [2673258.826627] [] 0xffffffffffffffff [2673258.831826] Pid: 115659, comm: mdt00_023 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2673258.841832] Call Trace: [2673258.844477] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2673258.851606] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2673258.858981] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2673258.866010] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2673258.873222] [] mdt_object_lock+0x20/0x30 [mdt] [2673258.879549] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2673258.886402] [] mdt_intent_brw+0x1f/0x30 [mdt] [2673258.892625] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2673258.899282] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2673258.906221] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2673258.913523] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2673258.919901] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2673258.927033] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2673258.934980] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2673258.941515] [] kthread+0xd1/0xe0 [2673258.946620] [] ret_from_fork_nospec_begin+0xe/0x21 [2673258.953277] [] 0xffffffffffffffff [2673258.958481] Pid: 116206, comm: mdt00_088 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2673258.968508] Call Trace: [2673258.971146] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2673258.978261] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2673258.985637] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2673258.992658] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2673258.999850] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2673259.006507] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2673259.013681] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2673259.020423] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2673259.027082] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2673259.034024] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2673259.041321] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2673259.047664] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2673259.054806] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2673259.062722] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2673259.069254] [] kthread+0xd1/0xe0 [2673259.074360] [] ret_from_fork_nospec_begin+0xe/0x21 [2673259.081005] [] 0xffffffffffffffff [2673259.086203] Pid: 115879, comm: mdt02_060 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2673259.096202] Call Trace: [2673259.098841] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2673259.105958] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2673259.113333] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2673259.120335] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2673259.127528] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2673259.134204] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2673259.141397] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2673259.148141] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2673259.154812] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2673259.161781] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2673259.169087] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2673259.175431] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2673259.182547] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2673259.190458] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2673259.197359] [] kthread+0xd1/0xe0 [2673259.202452] [] ret_from_fork_nospec_begin+0xe/0x21 [2673259.209126] [] 0xffffffffffffffff [2673259.214324] LNet: Service thread pid 115778 was inactive for 201.07s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2673259.227455] LNet: Skipped 14 previous similar messages [2673337.109665] Lustre: fir-MDT0001: Connection restored to e3c4cf5f-8e04-bccb-9d13-7eae1b83e1a1 (at 10.8.27.29@o2ib6) [2673337.120200] Lustre: Skipped 296 previous similar messages [2673357.348916] LNet: Service thread pid 115667 completed after 299.23s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2673357.365336] LNet: Skipped 43 previous similar messages [2673413.955971] LustreError: dumping log to /tmp/lustre-log.1554828579.115871 [2673507.349956] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 150s: evicting client at 10.9.107.5@o2ib4 ns: mdt-fir-MDT0001_UUID lock: ffff8eeb22605e80/0xbc32946216fa7bb6 lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 50 type: IBT flags: 0x60200400000020 nid: 10.9.107.5@o2ib4 remote: 0xb9c41693abb61d32 expref: 49 pid: 115667 timeout: 2673486 lvb_type: 0 [2673507.388328] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 5 previous similar messages [2673536.581241] Lustre: 115839:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (4/-6), not sending early reply req@ffff8eb5d824c800 x1628546095329024/t0(0) o101->ac4b3f39-4aa4-91f4-89f9-e38a651f2f99@10.9.107.5@o2ib4:16/0 lens 576/3264 e 0 to 0 dl 1554828706 ref 2 fl Interpret:/0/0 rc 0/0 [2673536.610492] Lustre: 115839:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 25 previous similar messages [2673562.437509] LNet: Service thread pid 115625 was inactive for 200.05s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2673562.454715] LNet: Skipped 9 previous similar messages [2673562.459976] Pid: 115625, comm: mdt00_018 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2673562.469992] Call Trace: [2673562.472632] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2673562.479756] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2673562.487130] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2673562.494132] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2673562.501308] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2673562.508082] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2673562.514746] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2673562.521714] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2673562.529013] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2673562.535371] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2673562.542496] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2673562.550397] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2673562.556904] [] kthread+0xd1/0xe0 [2673562.562009] [] ret_from_fork_nospec_begin+0xe/0x21 [2673562.568660] [] 0xffffffffffffffff [2673562.573850] LustreError: dumping log to /tmp/lustre-log.1554828727.115625 [2673562.812367] Pid: 115833, comm: mdt00_037 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2673562.822365] Call Trace: [2673562.825010] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2673562.832127] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2673562.839504] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2673562.846507] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2673562.853682] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2673562.860338] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2673562.867516] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2673562.874258] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2673562.880905] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2673562.887847] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2673562.895140] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2673562.901539] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2673562.908665] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2673562.916569] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2673562.923076] [] kthread+0xd1/0xe0 [2673562.928165] [] ret_from_fork_nospec_begin+0xe/0x21 [2673562.934828] [] 0xffffffffffffffff [2673597.351132] LustreError: 115869:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ed73ac42c00 ns: mdt-fir-MDT0001_UUID lock: ffff8ecd1afc0d80/0xbc32946220f7edfe lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 40 type: IBT flags: 0x50200000000000 nid: 10.8.27.29@o2ib6 remote: 0xd991486401acd492 expref: 2 pid: 115869 timeout: 0 lvb_type: 0 [2673597.386229] LustreError: 115869:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 5 previous similar messages [2673597.396745] Lustre: 115348:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (30:26s); client may timeout. req@ffff8ec7d3840900 x1629298149848720/t0(0) o101->a4daaf47-6ec9-4753-388e-0d0b7a7f70d6@10.8.27.25@o2ib6:16/0 lens 568/2296 e 0 to 0 dl 1554828736 ref 1 fl Complete:/0/0 rc -107/-107 [2673597.425914] Lustre: 115348:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 4 previous similar messages [2673629.315805] Lustre: fir-MDT0001: Client e98ad6a7-830d-1725-2883-db155e5b9e43 (at 10.8.17.16@o2ib6) reconnecting [2673629.326077] Lustre: Skipped 282 previous similar messages [2673798.983953] Pid: 115903, comm: mdt01_068 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2673798.993960] Call Trace: [2673798.996618] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2673799.003740] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2673799.011125] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2673799.018128] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2673799.025322] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2673799.031978] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2673799.039177] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2673799.045923] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2673799.052592] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2673799.059538] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2673799.066836] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2673799.073181] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2673799.080351] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2673799.088267] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2673799.094774] [] kthread+0xd1/0xe0 [2673799.099875] [] ret_from_fork_nospec_begin+0xe/0x21 [2673799.106561] [] 0xffffffffffffffff [2673799.111766] LustreError: dumping log to /tmp/lustre-log.1554828964.115903 [2673799.493382] Pid: 115878, comm: mdt01_062 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2673799.503434] Call Trace: [2673799.506081] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2673799.513205] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2673799.520585] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2673799.527596] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2673799.534804] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2673799.541486] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2673799.548663] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2673799.555413] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2673799.562077] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2673799.569032] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2673799.576364] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2673799.582735] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2673799.589889] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2673799.597787] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2673799.604310] [] kthread+0xd1/0xe0 [2673799.609401] [] ret_from_fork_nospec_begin+0xe/0x21 [2673799.616065] [] 0xffffffffffffffff [2673799.621291] Pid: 115879, comm: mdt02_060 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2673799.631303] Call Trace: [2673799.633946] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2673799.641074] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2673799.648461] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2673799.655464] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2673799.662641] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2673799.669303] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2673799.676489] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2673799.683251] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2673799.689897] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2673799.696849] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2673799.704153] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2673799.710497] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2673799.717625] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2673799.725526] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2673799.732028] [] kthread+0xd1/0xe0 [2673799.737132] [] ret_from_fork_nospec_begin+0xe/0x21 [2673799.743781] [] 0xffffffffffffffff [2673805.225654] LNetError: 115149:0:(lib-msg.c:811:lnet_is_health_check()) Msg is in inconsistent state, don't perform health checking (0, 5) [2673841.536386] LustreError: 115874:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554828916, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ed4a96d7bc0/0xbc329462261ca38e lrc: 3/1,0 mode: --/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 58 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 115874 timeout: 0 lvb_type: 0 [2673841.576356] LustreError: 115874:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 35 previous similar messages [2673937.571841] Lustre: fir-MDT0001: Connection restored to dc990748-ca32-0960-545e-99af2316a63e (at 10.9.107.8@o2ib4) [2673937.582382] Lustre: Skipped 291 previous similar messages [2673951.561527] Pid: 116172, comm: mdt00_081 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2673951.571540] Call Trace: [2673951.574184] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2673951.581325] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2673951.588724] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2673951.595742] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2673951.602920] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2673951.609688] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2673951.616345] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2673951.623302] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2673951.630598] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2673951.636945] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2673951.644088] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2673951.652007] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2673951.658550] [] kthread+0xd1/0xe0 [2673951.663640] [] ret_from_fork_nospec_begin+0xe/0x21 [2673951.670286] [] 0xffffffffffffffff [2673951.675541] LustreError: dumping log to /tmp/lustre-log.1554829117.116172 [2673951.928907] Pid: 115874, comm: mdt02_058 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2673951.938939] Call Trace: [2673951.941575] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2673951.948726] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2673951.956148] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2673951.963153] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2673951.970372] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2673951.977118] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2673951.983796] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2673951.990740] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2673951.998030] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2673952.004380] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2673952.011527] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2673952.019451] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2673952.025988] [] kthread+0xd1/0xe0 [2673952.031077] [] ret_from_fork_nospec_begin+0xe/0x21 [2673952.037747] [] 0xffffffffffffffff [2673952.073533] Pid: 115892, comm: mdt02_063 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2673952.083542] Call Trace: [2673952.086183] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2673952.093312] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2673952.100698] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2673952.107705] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2673952.114872] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2673952.121538] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2673952.128705] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2673952.135438] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2673952.142086] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2673952.149019] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2673952.156316] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2673952.162667] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2673952.169811] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2673952.177700] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2673952.184229] [] kthread+0xd1/0xe0 [2673952.189320] [] ret_from_fork_nospec_begin+0xe/0x21 [2673952.195990] [] 0xffffffffffffffff [2674056.355860] LNet: Service thread pid 115930 completed after 457.97s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2674056.372299] LNet: Skipped 41 previous similar messages [2674212.357213] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 150s: evicting client at 10.8.27.26@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ec5aefcc380/0xbc3294622ce70854 lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 55 type: IBT flags: 0x60200400000020 nid: 10.8.27.26@o2ib6 remote: 0x3a52d8e5f9136944 expref: 106 pid: 115724 timeout: 2674191 lvb_type: 0 [2674212.395679] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 6 previous similar messages [2674212.407551] LustreError: 115886:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ec6e9e2b000 ns: mdt-fir-MDT0001_UUID lock: ffff8ec02c38d7c0/0xbc3294622ce7481f lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x1b/0x0 rrc: 47 type: IBT flags: 0x50200000000000 nid: 10.8.27.26@o2ib6 remote: 0x3a52d8e5f9136b19 expref: 5 pid: 115886 timeout: 0 lvb_type: 0 [2674212.442658] LustreError: 115886:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 9 previous similar messages [2674236.805631] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2674236.815983] Lustre: Skipped 273 previous similar messages [2674239.820500] Lustre: 115753:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ec1fd0b1e00 x1628647042440592/t0(0) o101->535a7c41-41b9-ec51-bbd6-e028c4ebcb2c@10.8.8.1@o2ib6:0/0 lens 568/0 e 0 to 0 dl 1554829410 ref 2 fl Interpret:/0/ffffffff rc 0/-1 [2674239.849927] Lustre: 115753:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 69 previous similar messages [2674415.438310] LNet: Service thread pid 115942 was inactive for 200.39s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2674415.455506] LNet: Skipped 7 previous similar messages [2674415.460740] Pid: 115942, comm: mdt01_076 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2674415.470769] Call Trace: [2674415.473414] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2674415.480551] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2674415.487942] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2674415.494949] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2674415.502159] [] mdt_object_lock+0x20/0x30 [mdt] [2674415.508475] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2674415.514976] [] mdt_intent_brw+0x1f/0x30 [mdt] [2674415.521213] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2674415.527897] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2674415.534859] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2674415.542178] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2674415.548518] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2674415.555665] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2674415.563570] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2674415.570081] [] kthread+0xd1/0xe0 [2674415.575166] [] ret_from_fork_nospec_begin+0xe/0x21 [2674415.581837] [] 0xffffffffffffffff [2674415.587051] LustreError: dumping log to /tmp/lustre-log.1554829581.115942 [2674416.334031] Pid: 115568, comm: mdt02_007 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2674416.344061] Call Trace: [2674416.346722] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2674416.353845] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2674416.361243] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2674416.368246] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2674416.375454] [] mdt_object_lock+0x20/0x30 [mdt] [2674416.381785] [] mdt_hsm_state_set+0xc9/0x830 [mdt] [2674416.388385] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2674416.395538] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2674416.403435] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2674416.409952] [] kthread+0xd1/0xe0 [2674416.415057] [] ret_from_fork_nospec_begin+0xe/0x21 [2674416.421703] [] 0xffffffffffffffff [2674416.426925] Pid: 116313, comm: mdt02_105 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2674416.436925] Call Trace: [2674416.439580] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2674416.446690] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2674416.454056] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2674416.461075] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2674416.468243] [] mdt_object_lock+0x20/0x30 [mdt] [2674416.474588] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2674416.481072] [] mdt_intent_brw+0x1f/0x30 [mdt] [2674416.487321] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2674416.493966] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2674416.500900] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2674416.508197] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2674416.514546] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2674416.521673] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2674416.529590] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2674416.536125] [] kthread+0xd1/0xe0 [2674416.541219] [] ret_from_fork_nospec_begin+0xe/0x21 [2674416.547882] [] 0xffffffffffffffff [2674416.553112] Pid: 116254, comm: mdt02_091 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2674416.563131] Call Trace: [2674416.565771] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2674416.572878] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2674416.580273] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2674416.587274] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2674416.594449] [] mdt_object_lock+0x20/0x30 [mdt] [2674416.600769] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2674416.607292] [] mdt_intent_brw+0x1f/0x30 [mdt] [2674416.613543] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2674416.620231] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2674416.627176] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2674416.634486] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2674416.640834] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2674416.647970] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2674416.655880] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2674416.662395] [] kthread+0xd1/0xe0 [2674416.667485] [] ret_from_fork_nospec_begin+0xe/0x21 [2674416.674145] [] 0xffffffffffffffff [2674416.679344] Pid: 115624, comm: mdt02_027 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2674416.689369] Call Trace: [2674416.692001] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2674416.699109] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2674416.706503] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2674416.713522] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2674416.720690] [] mdt_object_lock+0x20/0x30 [mdt] [2674416.726999] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2674416.733482] [] mdt_intent_brw+0x1f/0x30 [mdt] [2674416.739704] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2674416.746376] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2674416.753322] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2674416.760638] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2674416.766989] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2674416.774117] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2674416.782016] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2674416.788514] [] kthread+0xd1/0xe0 [2674416.793602] [] ret_from_fork_nospec_begin+0xe/0x21 [2674416.800266] [] 0xffffffffffffffff [2674416.805455] LNet: Service thread pid 115617 was inactive for 202.03s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2674416.818577] LNet: Skipped 29 previous similar messages [2674446.158630] LustreError: dumping log to /tmp/lustre-log.1554829611.115627 [2674485.222041] LustreError: 115859:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554829560, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8edb5ef6bf00/0xbc3294623400e63f lrc: 3/1,0 mode: --/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 47 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 115859 timeout: 0 lvb_type: 0 [2674485.261957] LustreError: 115859:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 49 previous similar messages [2674546.867846] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2674546.878503] Lustre: Skipped 280 previous similar messages [2674595.664176] LustreError: dumping log to /tmp/lustre-log.1554829761.115859 [2674611.587508] Lustre: fir-MDT0003: haven't heard from client 853a6fb5-cc6d-1922-2c09-7dc9f54e4f52 (at 10.8.1.29@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8eb97a5eb000, cur 1554829777 expire 1554829627 last 1554829550 [2674724.362639] LNet: Service thread pid 116313 completed after 509.59s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2674724.379062] LNet: Skipped 7 previous similar messages [2674750.289806] Pid: 115878, comm: mdt01_062 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2674750.299812] Call Trace: [2674750.302454] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2674750.309578] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2674750.316954] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2674750.323956] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2674750.331143] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2674750.337903] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2674750.344565] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2674750.351505] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2674750.358800] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2674750.365151] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2674750.372308] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2674750.380202] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2674750.386709] [] kthread+0xd1/0xe0 [2674750.391800] [] ret_from_fork_nospec_begin+0xe/0x21 [2674750.398446] [] 0xffffffffffffffff [2674750.403638] LustreError: dumping log to /tmp/lustre-log.1554829915.115878 [2674750.801817] Pid: 115857, comm: mdt01_057 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2674750.811833] Call Trace: [2674750.814469] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2674750.821592] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2674750.828977] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2674750.835991] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2674750.843189] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2674750.849848] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2674750.857040] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2674750.863791] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2674750.870448] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2674750.877388] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2674750.884671] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2674750.891023] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2674750.898154] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2674750.906049] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2674750.912570] [] kthread+0xd1/0xe0 [2674750.917673] [] ret_from_fork_nospec_begin+0xe/0x21 [2674750.924338] [] 0xffffffffffffffff [2674750.929543] LustreError: dumping log to /tmp/lustre-log.1554829916.115857 [2674754.364714] Lustre: 115953:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (154:25s); client may timeout. req@ffff8ed506340f00 x1628593193089120/t0(0) o101->bf6bd63b-9206-2f81-2780-2b483791a8c1@10.8.17.15@o2ib6:0/0 lens 568/2296 e 0 to 0 dl 1554829894 ref 1 fl Complete:/0/0 rc -107/-107 [2674754.393904] Lustre: 115953:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 8 previous similar messages [2674856.888143] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2674856.898525] Lustre: Skipped 276 previous similar messages [2674915.364544] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 149s: evicting client at 10.8.17.15@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ec01a1c1b00/0xbc3294623ba11fde lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 49 type: IBT flags: 0x60200400000020 nid: 10.8.17.15@o2ib6 remote: 0x794f201b7349786c expref: 73 pid: 115612 timeout: 2674894 lvb_type: 0 [2674915.402907] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 6 previous similar messages [2674942.035825] Lustre: 115587:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ed44f2a2400 x1628593193158048/t0(0) o101->bf6bd63b-9206-2f81-2780-2b483791a8c1@10.8.17.15@o2ib6:12/0 lens 576/3264 e 0 to 0 dl 1554830112 ref 2 fl Interpret:/0/0 rc 0/0 [2674942.065096] Lustre: 115587:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 37 previous similar messages [2674966.356082] Pid: 115590, comm: mdt01_011 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2674966.366095] Call Trace: [2674966.368737] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2674966.375890] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2674966.383291] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2674966.390324] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2674966.397514] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2674966.404219] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2674966.411408] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2674966.418182] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2674966.424851] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2674966.431793] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2674966.439089] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2674966.445432] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2674966.452604] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2674966.460533] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2674966.467070] [] kthread+0xd1/0xe0 [2674966.472161] [] ret_from_fork_nospec_begin+0xe/0x21 [2674966.478808] [] 0xffffffffffffffff [2674966.484018] LustreError: dumping log to /tmp/lustre-log.1554830131.115590 [2674966.849944] Pid: 115857, comm: mdt01_057 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2674966.859986] Call Trace: [2674966.862631] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2674966.869786] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2674966.877151] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2674966.884154] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2674966.891347] [] mdt_object_lock+0x20/0x30 [mdt] [2674966.897667] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2674966.904151] [] mdt_intent_brw+0x1f/0x30 [mdt] [2674966.910398] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2674966.917057] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2674966.924020] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2674966.931326] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2674966.937671] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2674966.944797] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2674966.952737] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2674966.959251] [] kthread+0xd1/0xe0 [2674966.964338] [] ret_from_fork_nospec_begin+0xe/0x21 [2674966.970988] [] 0xffffffffffffffff [2674966.976219] Pid: 115724, comm: mdt01_033 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2674966.986216] Call Trace: [2674966.988858] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2674966.995972] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2674967.003338] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2674967.010343] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2674967.017548] [] mdt_object_lock+0x20/0x30 [mdt] [2674967.023891] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2674967.030404] [] mdt_intent_brw+0x1f/0x30 [mdt] [2674967.036638] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2674967.043310] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2674967.050270] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2674967.057577] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2674967.063931] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2674967.071099] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2674967.079046] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2674967.085615] [] kthread+0xd1/0xe0 [2674967.090745] [] ret_from_fork_nospec_begin+0xe/0x21 [2674967.097393] [] 0xffffffffffffffff [2675095.367470] LustreError: 115938:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8eb57a7ee000 ns: mdt-fir-MDT0001_UUID lock: ffff8eb58bcd21c0/0xbc3294624201307c lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 39 type: IBT flags: 0x50200000000000 nid: 10.8.27.29@o2ib6 remote: 0xd991486401b3add3 expref: 5 pid: 115938 timeout: 0 lvb_type: 0 [2675095.402533] LustreError: 115938:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 7 previous similar messages [2675158.381053] Lustre: fir-MDT0001: Connection restored to eeceaf0a-f64b-44e4-4f28-fac655ebb0a4 (at 10.8.17.19@o2ib6) [2675158.391613] Lustre: Skipped 271 previous similar messages [2675251.757081] LustreError: 116242:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554830327, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8edb08aa9200/0xbc32946243dc6441 lrc: 3/0,1 mode: --/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 45 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 116242 timeout: 0 lvb_type: 0 [2675251.796987] LustreError: 116242:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 30 previous similar messages [2675362.136277] LNet: Service thread pid 115618 was inactive for 200.29s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2675362.153478] LNet: Skipped 9 previous similar messages [2675362.158712] Pid: 115618, comm: mdt00_016 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2675362.168747] Call Trace: [2675362.171388] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2675362.178518] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2675362.185914] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2675362.192923] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2675362.200097] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2675362.206767] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2675362.213947] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2675362.220705] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2675362.227363] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2675362.234296] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2675362.241576] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2675362.247922] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2675362.255052] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2675362.262941] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2675362.269441] [] kthread+0xd1/0xe0 [2675362.274528] [] ret_from_fork_nospec_begin+0xe/0x21 [2675362.281175] [] 0xffffffffffffffff [2675362.286368] LustreError: dumping log to /tmp/lustre-log.1554830527.115618 [2675362.671536] Pid: 115408, comm: mdt01_004 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2675362.681544] Call Trace: [2675362.684188] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2675362.691313] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2675362.698701] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2675362.705716] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2675362.712891] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2675362.719548] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2675362.726726] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2675362.733484] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2675362.740139] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2675362.747098] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2675362.754388] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2675362.760747] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2675362.767881] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2675362.775780] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2675362.782303] [] kthread+0xd1/0xe0 [2675362.787392] [] ret_from_fork_nospec_begin+0xe/0x21 [2675362.794056] [] 0xffffffffffffffff [2675362.799268] Pid: 115753, comm: mdt01_037 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2675362.809295] Call Trace: [2675362.811938] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2675362.819045] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2675362.826454] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2675362.833457] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2675362.840657] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2675362.847347] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2675362.854540] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2675362.861286] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2675362.867948] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2675362.874892] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2675362.882173] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2675362.888517] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2675362.895631] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2675362.903536] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2675362.910052] [] kthread+0xd1/0xe0 [2675362.915142] [] ret_from_fork_nospec_begin+0xe/0x21 [2675362.921803] [] 0xffffffffffffffff [2675362.927003] Pid: 115624, comm: mdt02_027 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2675362.937000] Call Trace: [2675362.939633] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2675362.946755] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2675362.954123] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2675362.961156] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2675362.968337] [] mdt_object_lock+0x20/0x30 [mdt] [2675362.974646] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2675362.981127] [] mdt_intent_brw+0x1f/0x30 [mdt] [2675362.987361] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2675362.994033] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2675363.000976] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2675363.008256] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2675363.014627] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2675363.021765] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2675363.029655] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2675363.036163] [] kthread+0xd1/0xe0 [2675363.041261] [] ret_from_fork_nospec_begin+0xe/0x21 [2675363.047907] [] 0xffffffffffffffff [2675363.053097] Pid: 115653, comm: mdt00_022 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2675363.063092] Call Trace: [2675363.065727] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2675363.072848] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2675363.080226] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2675363.087242] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2675363.094422] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2675363.101164] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2675363.107821] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2675363.114767] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2675363.122059] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2675363.128395] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2675363.135510] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2675363.143422] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2675363.149924] [] kthread+0xd1/0xe0 [2675363.155028] [] ret_from_fork_nospec_begin+0xe/0x21 [2675363.161676] [] 0xffffffffffffffff [2675363.166862] LNet: Service thread pid 115901 was inactive for 201.40s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2675363.180011] LNet: Skipped 23 previous similar messages [2675393.368610] LustreError: dumping log to /tmp/lustre-log.1554830558.115824 [2675466.071334] Lustre: fir-MDT0001: Client eed99957-e395-8d59-f471-3be5bc5334d2 (at 10.8.27.31@o2ib6) reconnecting [2675466.081595] Lustre: Skipped 263 previous similar messages [2675491.370813] LNet: Service thread pid 115764 completed after 329.60s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2675491.387229] LNet: Skipped 32 previous similar messages [2675497.154839] Lustre: 115866:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (154:1s); client may timeout. req@ffff8ed886c37800 x1628649163322992/t0(0) o101->eed99957-e395-8d59-f471-3be5bc5334d2@10.8.27.31@o2ib6:17/0 lens 568/2296 e 0 to 0 dl 1554830661 ref 1 fl Complete:/0/0 rc -107/-107 [2675497.184028] Lustre: 115866:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 8 previous similar messages [2675647.372399] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 149s: evicting client at 10.8.27.31@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8eefb1f30fc0/0xbc3294624b762fb3 lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 46 type: IBT flags: 0x60200400000020 nid: 10.8.27.31@o2ib6 remote: 0x99d3453de1c03c3a expref: 35 pid: 115753 timeout: 2675626 lvb_type: 0 [2675647.410759] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 7 previous similar messages [2675678.043725] Lustre: 116299:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ee5ac6e8c00 x1628546096992080/t0(0) o101->ac4b3f39-4aa4-91f4-89f9-e38a651f2f99@10.9.107.5@o2ib4:28/0 lens 568/0 e 0 to 0 dl 1554830848 ref 2 fl Interpret:/0/ffffffff rc 0/-1 [2675678.073437] Lustre: 116299:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 63 previous similar messages [2675698.523963] Pid: 115859, comm: mdt02_054 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2675698.533970] Call Trace: [2675698.536613] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2675698.543741] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2675698.551125] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2675698.558136] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2675698.565313] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2675698.571965] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2675698.579145] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2675698.585894] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2675698.592561] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2675698.599511] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2675698.606799] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2675698.613142] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2675698.620257] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2675698.628154] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2675698.634671] [] kthread+0xd1/0xe0 [2675698.639758] [] ret_from_fork_nospec_begin+0xe/0x21 [2675698.646405] [] 0xffffffffffffffff [2675698.651600] LustreError: dumping log to /tmp/lustre-log.1554830864.115859 [2675698.957672] Pid: 115753, comm: mdt01_037 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2675698.967671] Call Trace: [2675698.970313] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2675698.977450] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2675698.984827] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2675698.991840] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2675698.999021] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2675699.005677] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2675699.012872] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2675699.019615] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2675699.026280] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2675699.033215] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2675699.040500] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2675699.046870] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2675699.053985] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2675699.061900] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2675699.068409] [] kthread+0xd1/0xe0 [2675699.073495] [] ret_from_fork_nospec_begin+0xe/0x21 [2675699.080153] [] 0xffffffffffffffff [2675699.085350] Pid: 115963, comm: mdt00_077 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2675699.095345] Call Trace: [2675699.097978] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2675699.105094] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2675699.112459] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2675699.119462] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2675699.126649] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2675699.133306] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2675699.140481] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2675699.147222] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2675699.153882] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2675699.160850] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2675699.168165] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2675699.174516] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2675699.181659] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2675699.189562] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2675699.196060] [] kthread+0xd1/0xe0 [2675699.201149] [] ret_from_fork_nospec_begin+0xe/0x21 [2675699.207820] [] 0xffffffffffffffff [2675699.213021] Pid: 116261, comm: mdt02_092 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2675699.223032] Call Trace: [2675699.225664] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2675699.232773] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2675699.240129] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2675699.247132] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2675699.254301] [] mdt_object_lock+0x20/0x30 [mdt] [2675699.260609] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2675699.267094] [] mdt_intent_brw+0x1f/0x30 [mdt] [2675699.273305] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2675699.279963] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2675699.286896] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2675699.294176] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2675699.300511] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2675699.307627] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2675699.315514] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2675699.322013] [] kthread+0xd1/0xe0 [2675699.327113] [] ret_from_fork_nospec_begin+0xe/0x21 [2675699.333777] [] 0xffffffffffffffff [2675699.338986] Pid: 115892, comm: mdt02_063 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2675699.348990] Call Trace: [2675699.351631] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2675699.358772] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2675699.366153] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2675699.373158] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2675699.380334] [] mdt_object_lock+0x20/0x30 [mdt] [2675699.386644] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2675699.393134] [] mdt_intent_brw+0x1f/0x30 [mdt] [2675699.399367] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2675699.406031] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2675699.412982] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2675699.420289] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2675699.426678] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2675699.433807] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2675699.441695] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2675699.448211] [] kthread+0xd1/0xe0 [2675699.453298] [] ret_from_fork_nospec_begin+0xe/0x21 [2675699.459945] [] 0xffffffffffffffff [2675769.449168] Lustre: fir-MDT0001: Connection restored to e3c4cf5f-8e04-bccb-9d13-7eae1b83e1a1 (at 10.8.27.29@o2ib6) [2675769.459720] Lustre: Skipped 271 previous similar messages [2675824.600333] Lustre: fir-MDT0003: haven't heard from client e19c3f0d-afda-27af-5e91-2202f9ebb395 (at 10.9.107.31@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ed1e983fc00, cur 1554830990 expire 1554830840 last 1554830763 [2675824.622328] Lustre: Skipped 1 previous similar message [2675853.149620] LustreError: dumping log to /tmp/lustre-log.1554831018.115631 [2675864.119857] LustreError: 115886:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ec48432e000 ns: mdt-fir-MDT0001_UUID lock: ffff8ed031599f80/0xbc3294625167a447 lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 40 type: IBT flags: 0x50200000000000 nid: 10.8.27.26@o2ib6 remote: 0x3a52d8e5f91427cc expref: 4 pid: 115886 timeout: 0 lvb_type: 0 [2675864.154922] LustreError: 115886:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 8 previous similar messages [2675956.865756] LustreError: 115631:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554831032, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8eb541248480/0xbc3294625568159a lrc: 3/0,1 mode: --/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 44 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 115631 timeout: 0 lvb_type: 0 [2675956.905689] LustreError: 115631:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 41 previous similar messages [2676067.426211] Lustre: fir-MDT0003: Client ecd69873-9a0a-0293-0ce7-d92dc18bf8d9 (at 10.9.101.11@o2ib4) reconnecting [2676067.436572] Lustre: Skipped 254 previous similar messages [2676203.479469] Lustre: 115967:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (154:32s); client may timeout. req@ffff8ec443f78300 x1628546098719760/t0(0) o101->ac4b3f39-4aa4-91f4-89f9-e38a651f2f99@10.9.107.5@o2ib4:2/0 lens 576/1792 e 0 to 0 dl 1554831336 ref 1 fl Complete:/0/0 rc -107/-107 [2676203.508653] Lustre: 115967:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 5 previous similar messages [2676355.380234] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 149s: evicting client at 10.8.27.28@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ed61aebc800/0xbc3294625da704b3 lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 52 type: IBT flags: 0x60200400000020 nid: 10.8.27.28@o2ib6 remote: 0x1df96237ad0fa52f expref: 90 pid: 116301 timeout: 2676334 lvb_type: 0 [2676355.418580] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 6 previous similar messages [2676375.998604] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2676376.009217] Lustre: Skipped 267 previous similar messages [2676386.019578] Lustre: 115850:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8eda6aff0850 x1628638034008000/t0(0) o101->7d8ca85d-8b80-6a23-8fa9-83dca7eb7196@10.8.27.28@o2ib6:16/0 lens 576/3264 e 0 to 0 dl 1554831556 ref 2 fl Interpret:/0/0 rc 0/0 [2676386.048849] Lustre: 115850:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 65 previous similar messages [2676406.115792] LNet: Service thread pid 115631 was inactive for 200.11s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2676406.133001] LNet: Skipped 9 previous similar messages [2676406.138239] Pid: 115631, comm: mdt01_025 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2676406.148235] Call Trace: [2676406.150880] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2676406.158008] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2676406.165376] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2676406.172456] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2676406.179915] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2676406.186573] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2676406.193797] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2676406.200546] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2676406.207202] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2676406.214142] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2676406.221432] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2676406.227777] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2676406.234891] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2676406.242786] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2676406.249286] [] kthread+0xd1/0xe0 [2676406.254374] [] ret_from_fork_nospec_begin+0xe/0x21 [2676406.261021] [] 0xffffffffffffffff [2676406.266224] LustreError: dumping log to /tmp/lustre-log.1554831571.115631 [2676406.721732] Pid: 115869, comm: mdt01_060 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2676406.731739] Call Trace: [2676406.734417] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2676406.741550] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2676406.748928] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2676406.755956] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2676406.763132] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2676406.769836] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2676406.777026] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2676406.783800] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2676406.790492] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2676406.797450] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2676406.804734] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2676406.811115] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2676406.818236] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2676406.826121] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2676406.832640] [] kthread+0xd1/0xe0 [2676406.837727] [] ret_from_fork_nospec_begin+0xe/0x21 [2676406.844401] [] 0xffffffffffffffff [2676406.849608] Pid: 115779, comm: mdt01_040 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2676406.859664] Call Trace: [2676406.862314] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2676406.869438] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2676406.876825] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2676406.883871] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2676406.891054] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2676406.897728] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2676406.904912] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2676406.911671] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2676406.918320] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2676406.925261] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2676406.932543] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2676406.938893] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2676406.946053] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2676406.953966] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2676406.960533] [] kthread+0xd1/0xe0 [2676406.965638] [] ret_from_fork_nospec_begin+0xe/0x21 [2676406.972285] [] 0xffffffffffffffff [2676406.977486] Pid: 115346, comm: mdt01_000 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2676406.987494] Call Trace: [2676406.990129] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2676406.997268] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2676407.004638] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2676407.011656] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2676407.018843] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2676407.025506] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2676407.032676] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2676407.039418] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2676407.046066] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2676407.053016] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2676407.060297] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2676407.066647] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2676407.073756] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2676407.081660] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2676407.088160] [] kthread+0xd1/0xe0 [2676407.093255] [] ret_from_fork_nospec_begin+0xe/0x21 [2676407.099905] [] 0xffffffffffffffff [2676407.105094] Pid: 116124, comm: mdt01_089 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2676407.115119] Call Trace: [2676407.117755] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2676407.124881] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2676407.132281] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2676407.139327] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2676407.146506] [] mdt_object_lock+0x20/0x30 [mdt] [2676407.152806] [] mdt_hsm_state_set+0xc9/0x830 [mdt] [2676407.159388] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2676407.166553] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2676407.174482] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2676407.181001] [] kthread+0xd1/0xe0 [2676407.186110] [] ret_from_fork_nospec_begin+0xe/0x21 [2676407.192760] [] 0xffffffffffffffff [2676407.197971] LNet: Service thread pid 115796 was inactive for 201.22s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2676407.211119] LNet: Skipped 22 previous similar messages [2676415.381790] LNet: Service thread pid 115942 completed after 209.40s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2676415.398221] LNet: Skipped 40 previous similar messages [2676597.383131] LustreError: 116261:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ec87a7da400 ns: mdt-fir-MDT0001_UUID lock: ffff8ed811904c80/0xbc329462653ebb30 lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 38 type: IBT flags: 0x50200000000000 nid: 10.8.17.19@o2ib6 remote: 0x8282471ef031b17b expref: 223 pid: 116261 timeout: 0 lvb_type: 0 [2676597.418389] LustreError: 116261:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 13 previous similar messages [2676686.018014] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2676686.028361] Lustre: Skipped 239 previous similar messages [2676751.697633] LustreError: 115753:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554831827, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ecd31a2bcc0/0xbc3294626715c837 lrc: 3/0,1 mode: --/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 42 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 115753 timeout: 0 lvb_type: 0 [2676751.737907] LustreError: 115753:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 65 previous similar messages [2676861.800912] Pid: 115901, comm: mdt01_067 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2676861.810915] Call Trace: [2676861.813554] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2676861.820694] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2676861.828092] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2676861.835116] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2676861.842316] [] mdt_object_lock+0x20/0x30 [mdt] [2676861.848625] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2676861.855134] [] mdt_intent_brw+0x1f/0x30 [mdt] [2676861.861356] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2676861.868045] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2676861.874989] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2676861.882295] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2676861.888649] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2676861.895783] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2676861.903699] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2676861.910227] [] kthread+0xd1/0xe0 [2676861.915316] [] ret_from_fork_nospec_begin+0xe/0x21 [2676861.921964] [] 0xffffffffffffffff [2676861.927166] LustreError: dumping log to /tmp/lustre-log.1554832027.115901 [2676862.323288] Pid: 116166, comm: mdt02_081 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2676862.333324] Call Trace: [2676862.335974] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2676862.343125] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2676862.350492] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2676862.357528] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2676862.364722] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2676862.371481] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2676862.378153] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2676862.385107] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2676862.392442] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2676862.398790] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2676862.405927] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2676862.413842] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2676862.420370] [] kthread+0xd1/0xe0 [2676862.425481] [] ret_from_fork_nospec_begin+0xe/0x21 [2676862.432171] [] 0xffffffffffffffff [2676862.437383] Pid: 115617, comm: mdt01_020 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2676862.447394] Call Trace: [2676862.450028] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2676862.457152] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2676862.464536] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2676862.471555] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2676862.478733] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2676862.485396] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2676862.492600] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2676862.499359] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2676862.506018] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2676862.512964] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2676862.520265] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2676862.526637] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2676862.533765] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2676862.541665] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2676862.548201] [] kthread+0xd1/0xe0 [2676862.553295] [] ret_from_fork_nospec_begin+0xe/0x21 [2676862.559972] [] 0xffffffffffffffff [2676862.565199] Pid: 116306, comm: mdt02_099 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2676862.575212] Call Trace: [2676862.577846] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2676862.584970] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2676862.592329] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2676862.599362] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2676862.606535] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2676862.613194] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2676862.620381] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2676862.627142] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2676862.633791] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2676862.640741] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2676862.648037] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2676862.654389] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2676862.661497] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2676862.669418] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2676862.675921] [] kthread+0xd1/0xe0 [2676862.681028] [] ret_from_fork_nospec_begin+0xe/0x21 [2676862.687680] [] 0xffffffffffffffff [2676862.692927] Pid: 115753, comm: mdt01_037 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2676862.702928] Call Trace: [2676862.705582] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2676862.712694] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2676862.720085] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2676862.727090] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2676862.734282] [] mdt_object_lock+0x20/0x30 [mdt] [2676862.740591] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2676862.747075] [] mdt_intent_brw+0x1f/0x30 [mdt] [2676862.753326] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2676862.760011] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2676862.766962] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2676862.774237] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2676862.780589] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2676862.787695] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2676862.795581] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2676862.802099] [] kthread+0xd1/0xe0 [2676862.807189] [] ret_from_fork_nospec_begin+0xe/0x21 [2676862.813864] [] 0xffffffffffffffff [2676893.545264] LustreError: dumping log to /tmp/lustre-log.1554832058.115346 [2676924.265610] LustreError: dumping log to /tmp/lustre-log.1554832089.115967 [2676961.387039] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 30s: evicting client at 10.8.8.6@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ebff7d4a400/0xbc3294626715d947 lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 43 type: IBT flags: 0x60200400000020 nid: 10.8.8.6@o2ib6 remote: 0x28be6c2d1aad70cb expref: 195 pid: 115622 timeout: 2676940 lvb_type: 0 [2676961.425062] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 11 previous similar messages [2676961.435812] Lustre: 115583:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (30:27s); client may timeout. req@ffff8ebc244d5700 x1629298150458656/t0(0) o101->a4daaf47-6ec9-4753-388e-0d0b7a7f70d6@10.8.27.25@o2ib6:19/0 lens 568/2296 e 0 to 0 dl 1554832099 ref 1 fl Complete:/0/0 rc -107/-107 [2676961.464978] Lustre: 115583:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 9 previous similar messages [2676991.658381] Lustre: 115408:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ec4332b9500 x1628556960288880/t0(0) o101->4cd8fde3-ab19-6a6b-a7ee-5d70c4bd9893@10.8.8.6@o2ib6:22/0 lens 576/3264 e 0 to 0 dl 1554832162 ref 2 fl Interpret:/0/0 rc 0/0 [2676991.687468] Lustre: 115408:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 67 previous similar messages [2676996.062542] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2676996.073182] Lustre: Skipped 231 previous similar messages [2677306.081832] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2677306.092181] Lustre: Skipped 246 previous similar messages [2677324.654131] LNet: Service thread pid 116245 was inactive for 200.12s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2677324.671346] LNet: Skipped 9 previous similar messages [2677324.676581] Pid: 116245, comm: mdt02_086 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2677324.686597] Call Trace: [2677324.689239] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2677324.696344] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2677324.703727] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2677324.710730] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2677324.717906] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2677324.724564] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2677324.731738] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2677324.738483] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2677324.745130] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2677324.752063] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2677324.759341] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2677324.765695] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2677324.772812] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2677324.780700] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2677324.787199] [] kthread+0xd1/0xe0 [2677324.792286] [] ret_from_fork_nospec_begin+0xe/0x21 [2677324.798931] [] 0xffffffffffffffff [2677324.804125] LustreError: dumping log to /tmp/lustre-log.1554832490.116245 [2677325.172043] Pid: 115902, comm: mdt00_058 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2677325.182044] Call Trace: [2677325.184684] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2677325.191791] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2677325.199189] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2677325.206212] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2677325.213417] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2677325.220077] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2677325.227253] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2677325.233994] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2677325.240653] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2677325.247584] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2677325.254866] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2677325.261202] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2677325.268337] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2677325.276231] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2677325.282754] [] kthread+0xd1/0xe0 [2677325.287842] [] ret_from_fork_nospec_begin+0xe/0x21 [2677325.294492] [] 0xffffffffffffffff [2677325.299688] Pid: 115870, comm: mdt00_048 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2677325.309684] Call Trace: [2677325.312316] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2677325.319423] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2677325.326791] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2677325.333794] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2677325.340970] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2677325.347634] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2677325.354811] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2677325.361563] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2677325.368218] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2677325.375151] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2677325.382432] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2677325.388785] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2677325.395890] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2677325.403780] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2677325.410280] [] kthread+0xd1/0xe0 [2677325.415381] [] ret_from_fork_nospec_begin+0xe/0x21 [2677325.422031] [] 0xffffffffffffffff [2677325.427229] Pid: 115886, comm: mdt00_053 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2677325.437234] Call Trace: [2677325.439875] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2677325.446982] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2677325.454356] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2677325.461360] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2677325.468537] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2677325.475196] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2677325.482376] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2677325.489122] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2677325.495775] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2677325.502711] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2677325.509989] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2677325.516326] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2677325.523464] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2677325.531354] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2677325.537853] [] kthread+0xd1/0xe0 [2677325.542940] [] ret_from_fork_nospec_begin+0xe/0x21 [2677325.549603] [] 0xffffffffffffffff [2677325.554803] Pid: 115967, comm: mdt01_084 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2677325.564803] Call Trace: [2677325.567440] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2677325.574550] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2677325.581932] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2677325.588934] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2677325.596112] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2677325.602777] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2677325.609955] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2677325.616723] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2677325.623379] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2677325.630328] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2677325.637608] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2677325.643941] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2677325.651050] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2677325.658934] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2677325.665436] [] kthread+0xd1/0xe0 [2677325.670523] [] ret_from_fork_nospec_begin+0xe/0x21 [2677325.677190] [] 0xffffffffffffffff [2677325.682385] LNet: Service thread pid 115869 was inactive for 201.15s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2677325.695523] LNet: Skipped 19 previous similar messages [2677369.577633] LustreError: 115753:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554832444, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8edf663cc800/0xbc32946273376913 lrc: 3/1,0 mode: --/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 50 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 115753 timeout: 0 lvb_type: 0 [2677369.617590] LustreError: 115753:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 43 previous similar messages [2677454.392685] LNet: Service thread pid 116602 completed after 329.86s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2677454.409122] LNet: Skipped 24 previous similar messages [2677480.303816] LustreError: dumping log to /tmp/lustre-log.1554832645.115753 [2677484.393544] LustreError: 115902:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ee2258c4400 ns: mdt-fir-MDT0001_UUID lock: ffff8edf50672880/0xbc32946270243aff lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x1b/0x0 rrc: 46 type: IBT flags: 0x50200000000000 nid: 10.8.17.19@o2ib6 remote: 0x8282471ef0323dba expref: 5 pid: 115902 timeout: 0 lvb_type: 0 [2677484.428597] LustreError: 115902:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 11 previous similar messages [2677610.944360] Lustre: fir-MDT0001: Connection restored to (at 10.9.107.5@o2ib4) [2677610.951769] Lustre: Skipped 252 previous similar messages [2677636.394530] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 149s: evicting client at 10.8.17.15@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8eb1e935ee40/0xbc32946277573815 lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 38 type: IBT flags: 0x60200400000020 nid: 10.8.17.15@o2ib6 remote: 0x794f201b734b0d38 expref: 75 pid: 115583 timeout: 2677615 lvb_type: 0 [2677636.432897] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 5 previous similar messages [2677661.489802] Lustre: 115796:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ec6c7d61b00 x1628647044544112/t0(0) o101->535a7c41-41b9-ec51-bbd6-e028c4ebcb2c@10.8.8.1@o2ib6:1/0 lens 576/3264 e 0 to 0 dl 1554832831 ref 2 fl Interpret:/0/0 rc 0/0 [2677661.518803] Lustre: 115796:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 56 previous similar messages [2677666.395485] Lustre: 116206:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (154:25s); client may timeout. req@ffff8ebaaee30000 x1628593193736544/t0(0) o101->bf6bd63b-9206-2f81-2780-2b483791a8c1@10.8.17.15@o2ib6:2/0 lens 568/2296 e 0 to 0 dl 1554832806 ref 1 fl Complete:/0/0 rc -107/-107 [2677666.424665] Lustre: 116206:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 10 previous similar messages [2677926.162637] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2677926.172996] Lustre: Skipped 206 previous similar messages [2677975.413262] LNet: Service thread pid 115406 was inactive for 200.39s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2677975.430469] LNet: Skipped 4 previous similar messages [2677975.435705] Pid: 115406, comm: mdt00_005 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2677975.445717] Call Trace: [2677975.448355] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2677975.455519] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2677975.462895] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2677975.469904] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2677975.477101] [] mdt_object_lock+0x20/0x30 [mdt] [2677975.483408] [] mdt_hsm_state_set+0xc9/0x830 [mdt] [2677975.489999] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2677975.497134] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2677975.505044] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2677975.511545] [] kthread+0xd1/0xe0 [2677975.516657] [] ret_from_fork_nospec_begin+0xe/0x21 [2677975.523316] [] 0xffffffffffffffff [2677975.528536] LustreError: dumping log to /tmp/lustre-log.1554833140.115406 [2677975.935634] Pid: 115408, comm: mdt01_004 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2677975.945634] Call Trace: [2677975.948277] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2677975.955415] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2677975.962825] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2677975.969846] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2677975.977041] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2677975.983714] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2677975.990896] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2677975.997658] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2677976.004334] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2677976.011284] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2677976.018600] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2677976.024960] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2677976.032112] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2677976.040013] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2677976.046538] [] kthread+0xd1/0xe0 [2677976.051636] [] ret_from_fork_nospec_begin+0xe/0x21 [2677976.058300] [] 0xffffffffffffffff [2677976.063532] Pid: 116242, comm: mdt02_084 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2677976.073555] Call Trace: [2677976.076197] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2677976.083336] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2677976.090725] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2677976.097763] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2677976.104944] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2677976.111631] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2677976.118820] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2677976.125611] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2677976.132270] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2677976.139260] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2677976.146564] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2677976.152924] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2677976.160058] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2677976.167986] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2677976.174512] [] kthread+0xd1/0xe0 [2677976.179637] [] ret_from_fork_nospec_begin+0xe/0x21 [2677976.186300] [] 0xffffffffffffffff [2677976.191516] Pid: 115879, comm: mdt02_060 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2677976.201539] Call Trace: [2677976.204180] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2677976.211312] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2677976.218695] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2677976.225737] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2677976.232928] [] mdt_object_lock+0x20/0x30 [mdt] [2677976.239271] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2677976.245787] [] mdt_intent_brw+0x1f/0x30 [mdt] [2677976.252028] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2677976.258694] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2677976.265661] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2677976.272967] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2677976.279335] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2677976.286472] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2677976.294374] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2677976.300893] [] kthread+0xd1/0xe0 [2677976.306000] [] ret_from_fork_nospec_begin+0xe/0x21 [2677976.312661] [] 0xffffffffffffffff [2677976.317896] Pid: 115870, comm: mdt00_048 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2677976.327900] Call Trace: [2677976.330542] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2677976.337679] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2677976.345065] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2677976.352100] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2677976.359290] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2677976.365976] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2677976.373156] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2677976.379918] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2677976.386590] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2677976.393555] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2677976.400863] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2677976.407237] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2677976.414374] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2677976.422295] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2677976.428823] [] kthread+0xd1/0xe0 [2677976.433943] [] ret_from_fork_nospec_begin+0xe/0x21 [2677976.440625] [] 0xffffffffffffffff [2677976.445842] LNet: Service thread pid 115913 was inactive for 201.43s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2677976.458973] LNet: Skipped 11 previous similar messages [2678006.645611] LustreError: dumping log to /tmp/lustre-log.1554833172.115779 [2678037.365955] LustreError: dumping log to /tmp/lustre-log.1554833202.116165 [2678068.598294] LustreError: dumping log to /tmp/lustre-log.1554833233.115937 [2678104.470697] LustreError: 116124:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554833179, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8eda31f3d100/0xbc3294628244c915 lrc: 3/1,0 mode: --/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x13/0x8 rrc: 39 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 116124 timeout: 0 lvb_type: 0 [2678104.510615] LustreError: 116124:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 37 previous similar messages [2678164.400569] LustreError: 115913:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ee88fb7a800 ns: mdt-fir-MDT0001_UUID lock: ffff8ec00fb4d580/0xbc32946282543649 lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 33 type: IBT flags: 0x50200000000000 nid: 10.8.27.31@o2ib6 remote: 0x99d3453de1d92269 expref: 3 pid: 115913 timeout: 0 lvb_type: 0 [2678164.400589] LNet: Service thread pid 115879 completed after 389.38s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2678164.400592] LNet: Skipped 11 previous similar messages [2678164.457346] LustreError: 115913:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 9 previous similar messages [2678225.977031] Lustre: fir-MDT0001: Connection restored to e3c4cf5f-8e04-bccb-9d13-7eae1b83e1a1 (at 10.8.27.29@o2ib6) [2678225.987560] Lustre: Skipped 206 previous similar messages [2678344.402298] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 149s: evicting client at 10.8.17.15@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ec89d37f2c0/0xbc329462866cd1ea lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 44 type: IBT flags: 0x60200400000020 nid: 10.8.17.15@o2ib6 remote: 0x794f201b734bc5e6 expref: 55 pid: 115753 timeout: 2678323 lvb_type: 0 [2678344.440645] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 10 previous similar messages [2678375.225627] Lustre: 116249:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ebac65d7500 x1628593193986592/t0(0) o101->bf6bd63b-9206-2f81-2780-2b483791a8c1@10.8.17.15@o2ib6:25/0 lens 576/3264 e 0 to 0 dl 1554833545 ref 2 fl Interpret:/0/0 rc 0/0 [2678375.254908] Lustre: 116249:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 85 previous similar messages [2678395.257849] Pid: 115779, comm: mdt01_040 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2678395.267856] Call Trace: [2678395.270501] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2678395.277629] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2678395.285002] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2678395.292015] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2678395.299200] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2678395.305854] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2678395.313033] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2678395.319785] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2678395.326449] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2678395.333390] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2678395.340690] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2678395.347041] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2678395.354155] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2678395.362057] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2678395.368568] [] kthread+0xd1/0xe0 [2678395.373656] [] ret_from_fork_nospec_begin+0xe/0x21 [2678395.380303] [] 0xffffffffffffffff [2678395.385497] LustreError: dumping log to /tmp/lustre-log.1554833560.115779 [2678395.766751] Pid: 115898, comm: mdt01_066 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2678395.776757] Call Trace: [2678395.779402] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2678395.786594] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2678395.793960] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2678395.800961] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2678395.808137] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2678395.814795] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2678395.821970] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2678395.828724] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2678395.835384] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2678395.842328] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2678395.849626] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2678395.855979] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2678395.863104] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2678395.871033] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2678395.877543] [] kthread+0xd1/0xe0 [2678395.882628] [] ret_from_fork_nospec_begin+0xe/0x21 [2678395.889275] [] 0xffffffffffffffff [2678395.894474] Pid: 116191, comm: mdt01_107 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2678395.904488] Call Trace: [2678395.907127] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2678395.914252] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2678395.921613] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2678395.928612] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2678395.935790] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2678395.942439] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2678395.949613] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2678395.956356] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2678395.963013] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2678395.969954] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2678395.977244] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2678395.983599] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2678395.990720] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2678395.998618] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2678396.005128] [] kthread+0xd1/0xe0 [2678396.010224] [] ret_from_fork_nospec_begin+0xe/0x21 [2678396.016881] [] 0xffffffffffffffff [2678396.022084] Pid: 115918, comm: mdt02_070 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2678396.032084] Call Trace: [2678396.034724] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2678396.041864] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2678396.049232] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2678396.056242] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2678396.063436] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2678396.070092] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2678396.077270] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2678396.084025] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2678396.090686] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2678396.097643] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2678396.104940] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2678396.111300] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2678396.118419] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2678396.126339] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2678396.132847] [] kthread+0xd1/0xe0 [2678396.137926] [] ret_from_fork_nospec_begin+0xe/0x21 [2678396.144572] [] 0xffffffffffffffff [2678396.149769] Pid: 115880, comm: mdt00_051 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2678396.159768] Call Trace: [2678396.162410] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2678396.169524] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2678396.176909] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2678396.183922] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2678396.191107] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2678396.197772] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2678396.204946] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2678396.211689] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2678396.218346] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2678396.225289] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2678396.232576] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2678396.238945] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2678396.246070] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2678396.253957] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2678396.260467] [] kthread+0xd1/0xe0 [2678396.265554] [] ret_from_fork_nospec_begin+0xe/0x21 [2678396.272201] [] 0xffffffffffffffff [2678524.405199] Lustre: 115796:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (154:175s); client may timeout. req@ffff8ebb2d168600 x1628593193952640/t0(0) o101->bf6bd63b-9206-2f81-2780-2b483791a8c1@10.8.17.15@o2ib6:20/0 lens 568/2296 e 0 to 0 dl 1554833514 ref 1 fl Complete:/0/0 rc -107/-107 [2678524.434530] Lustre: 115796:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 7 previous similar messages [2678546.210404] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2678546.220756] Lustre: Skipped 229 previous similar messages [2678768.922868] LustreError: 115880:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554833844, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8eb9154918c0/0xbc329462910dd942 lrc: 3/1,0 mode: --/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 48 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 115880 timeout: 0 lvb_type: 0 [2678768.962820] LustreError: 115880:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 43 previous similar messages [2678826.679331] Lustre: fir-MDT0003: Connection restored to (at 10.9.101.11@o2ib4) [2678826.686829] Lustre: Skipped 229 previous similar messages [2678879.103078] LNet: Service thread pid 115607 was inactive for 200.16s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2678879.120284] LNet: Skipped 9 previous similar messages [2678879.125546] Pid: 115607, comm: mdt01_018 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2678879.135561] Call Trace: [2678879.138201] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2678879.145356] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2678879.152730] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2678879.159735] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2678879.166926] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2678879.173583] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2678879.180758] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2678879.187501] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2678879.194171] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2678879.201108] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2678879.208389] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2678879.214732] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2678879.221848] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2678879.229745] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2678879.236251] [] kthread+0xd1/0xe0 [2678879.241341] [] ret_from_fork_nospec_begin+0xe/0x21 [2678879.247986] [] 0xffffffffffffffff [2678879.253190] LustreError: dumping log to /tmp/lustre-log.1554834044.115607 [2678879.705649] Pid: 115869, comm: mdt01_060 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2678879.715651] Call Trace: [2678879.718298] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2678879.725433] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2678879.732810] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2678879.739820] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2678879.746995] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2678879.753738] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2678879.760394] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2678879.767334] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2678879.774616] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2678879.780959] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2678879.788073] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2678879.795961] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2678879.802469] [] kthread+0xd1/0xe0 [2678879.807557] [] ret_from_fork_nospec_begin+0xe/0x21 [2678879.814204] [] 0xffffffffffffffff [2678879.819402] Pid: 116295, comm: mdt00_109 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2678879.829416] Call Trace: [2678879.832049] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2678879.839157] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2678879.846522] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2678879.853527] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2678879.860702] [] mdt_object_lock+0x20/0x30 [mdt] [2678879.867012] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2678879.873495] [] mdt_intent_brw+0x1f/0x30 [mdt] [2678879.879718] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2678879.886389] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2678879.893342] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2678879.900622] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2678879.906957] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2678879.914071] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2678879.921958] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2678879.928460] [] kthread+0xd1/0xe0 [2678879.933577] [] ret_from_fork_nospec_begin+0xe/0x21 [2678879.940229] [] 0xffffffffffffffff [2678879.945426] Pid: 115915, comm: mdt00_063 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2678879.955425] Call Trace: [2678879.958065] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2678879.965170] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2678879.972537] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2678879.979542] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2678879.986708] [] mdt_object_lock+0x20/0x30 [mdt] [2678879.993009] [] mdt_hsm_state_set+0xc9/0x830 [mdt] [2678879.999579] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2678880.006703] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2678880.014589] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2678880.021090] [] kthread+0xd1/0xe0 [2678880.026179] [] ret_from_fork_nospec_begin+0xe/0x21 [2678880.032825] [] 0xffffffffffffffff [2678880.038022] Pid: 116207, comm: mdt01_111 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2678880.048021] Call Trace: [2678880.050663] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2678880.057786] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2678880.065152] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2678880.072155] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2678880.079322] [] mdt_object_lock+0x20/0x30 [mdt] [2678880.085623] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2678880.092106] [] mdt_intent_brw+0x1f/0x30 [mdt] [2678880.098335] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2678880.104984] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2678880.111918] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2678880.119198] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2678880.125533] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2678880.132639] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2678880.140526] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2678880.147026] [] kthread+0xd1/0xe0 [2678880.152114] [] ret_from_fork_nospec_begin+0xe/0x21 [2678880.158752] [] 0xffffffffffffffff [2678880.163949] LNet: Service thread pid 115880 was inactive for 201.23s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2678880.177076] LNet: Skipped 20 previous similar messages [2678888.408582] LNet: Service thread pid 115880 completed after 209.48s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2678888.425013] LNet: Skipped 32 previous similar messages [2678896.251085] LustreError: 115869:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ec44f5da800 ns: mdt-fir-MDT0001_UUID lock: ffff8ecd1aed2f40/0xbc329462910ed08e lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 43 type: IBT flags: 0x50200000000000 nid: 10.8.27.25@o2ib6 remote: 0x47916098febde7fd expref: 2 pid: 115869 timeout: 0 lvb_type: 0 [2678896.286166] LustreError: 115869:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 9 previous similar messages [2678960.408942] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 29s: evicting client at 10.8.8.1@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ed0e1860fc0/0xbc329462965571ba lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 44 type: IBT flags: 0x60200400000020 nid: 10.8.8.1@o2ib6 remote: 0xbeca8e2774cbea49 expref: 648 pid: 115710 timeout: 2678939 lvb_type: 0 [2678960.446973] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 7 previous similar messages [2678987.008229] Lustre: 115942:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ec28db79800 x1628569901758224/t0(0) o101->e98ad6a7-830d-1725-2883-db155e5b9e43@10.8.17.16@o2ib6:7/0 lens 576/3264 e 0 to 0 dl 1554834157 ref 2 fl Interpret:/0/0 rc 0/0 [2678987.037399] Lustre: 115942:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 68 previous similar messages [2679148.212563] Lustre: fir-MDT0001: Client 2c78cdad-2975-ca98-fb36-e7548576f834 (at 10.8.17.17@o2ib6) reconnecting [2679148.222829] Lustre: Skipped 177 previous similar messages [2679317.379714] Pid: 115917, comm: mdt01_069 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2679317.389718] Call Trace: [2679317.392360] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2679317.399505] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2679317.406888] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2679317.413895] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2679317.421069] [] mdt_object_lock+0x20/0x30 [mdt] [2679317.427378] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2679317.433862] [] mdt_intent_brw+0x1f/0x30 [mdt] [2679317.440084] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2679317.446757] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2679317.453715] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2679317.460997] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2679317.467342] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2679317.474458] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2679317.482352] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2679317.488869] [] kthread+0xd1/0xe0 [2679317.493958] [] ret_from_fork_nospec_begin+0xe/0x21 [2679317.500605] [] 0xffffffffffffffff [2679317.505800] LustreError: dumping log to /tmp/lustre-log.1554834482.115917 [2679317.890267] Pid: 115839, comm: mdt00_040 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2679317.900265] Call Trace: [2679317.902899] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2679317.910013] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2679317.917410] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2679317.924440] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2679317.931636] [] mdt_object_lock+0x20/0x30 [mdt] [2679317.937965] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2679317.944450] [] mdt_intent_brw+0x1f/0x30 [mdt] [2679317.950678] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2679317.957328] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2679317.964283] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2679317.971582] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2679317.977935] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2679317.985069] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2679317.992963] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2679317.999489] [] kthread+0xd1/0xe0 [2679318.004577] [] ret_from_fork_nospec_begin+0xe/0x21 [2679318.011224] [] 0xffffffffffffffff [2679318.016440] Pid: 115962, comm: mdt01_082 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2679318.026449] Call Trace: [2679318.029085] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2679318.036210] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2679318.043595] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2679318.050615] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2679318.057792] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2679318.064470] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2679318.071641] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2679318.078373] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2679318.085022] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2679318.091973] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2679318.099254] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2679318.105618] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2679318.112756] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2679318.120654] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2679318.127188] [] kthread+0xd1/0xe0 [2679318.132272] [] ret_from_fork_nospec_begin+0xe/0x21 [2679318.138919] [] 0xffffffffffffffff [2679318.144117] Pid: 115963, comm: mdt00_077 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2679318.154122] Call Trace: [2679318.156756] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2679318.163917] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2679318.171282] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2679318.178301] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2679318.185476] [] mdt_object_lock+0x20/0x30 [mdt] [2679318.191779] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2679318.198262] [] mdt_intent_brw+0x1f/0x30 [mdt] [2679318.204485] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2679318.211142] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2679318.218085] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2679318.225372] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2679318.231717] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2679318.238821] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2679318.246726] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2679318.253227] [] kthread+0xd1/0xe0 [2679318.258313] [] ret_from_fork_nospec_begin+0xe/0x21 [2679318.264961] [] 0xffffffffffffffff [2679318.270156] Pid: 116292, comm: mdt00_106 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2679318.280156] Call Trace: [2679318.282786] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2679318.289894] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2679318.297255] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2679318.304247] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2679318.311424] [] mdt_object_lock+0x20/0x30 [mdt] [2679318.317724] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2679318.324207] [] mdt_intent_brw+0x1f/0x30 [mdt] [2679318.330420] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2679318.337067] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2679318.344004] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2679318.351292] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2679318.357626] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2679318.364734] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2679318.372620] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2679318.379137] [] kthread+0xd1/0xe0 [2679318.384224] [] ret_from_fork_nospec_begin+0xe/0x21 [2679318.390879] [] 0xffffffffffffffff [2679326.413515] Lustre: 116313:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (30:179s); client may timeout. req@ffff8ed2bc95d400 x1628647358775424/t0(0) o101->eeceaf0a-f64b-44e4-4f28-fac655ebb0a4@10.8.17.19@o2ib6:12/0 lens 568/2296 e 0 to 0 dl 1554834312 ref 1 fl Complete:/0/0 rc -107/-107 [2679326.442765] Lustre: 116313:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 8 previous similar messages [2679418.768780] LustreError: 115587:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554834494, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8edc07b89200/0xbc3294629e08dfb3 lrc: 3/1,0 mode: --/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 39 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 115587 timeout: 0 lvb_type: 0 [2679418.808690] LustreError: 115587:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 41 previous similar messages [2679445.303864] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2679445.314470] Lustre: Skipped 195 previous similar messages [2679508.414954] LustreError: 115848:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8eb7b712cc00 ns: mdt-fir-MDT0001_UUID lock: ffff8ec1d7891b00/0xbc3294629e08fae1 lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 28 type: IBT flags: 0x50200000000000 nid: 10.8.17.19@o2ib6 remote: 0x8282471ef036eb07 expref: 2 pid: 115848 timeout: 0 lvb_type: 0 [2679508.450016] LustreError: 115848:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 13 previous similar messages [2679696.416820] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 150s: evicting client at 10.8.27.28@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ebcca212400/0xbc329462a124f8a7 lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 43 type: IBT flags: 0x60200400000020 nid: 10.8.27.28@o2ib6 remote: 0x1df96237ad3413c1 expref: 122 pid: 116313 timeout: 2679675 lvb_type: 0 [2679696.455265] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 11 previous similar messages [2679723.080113] Lustre: 116156:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ed5b7301b00 x1629298151227920/t0(0) o101->a4daaf47-6ec9-4753-388e-0d0b7a7f70d6@10.8.27.25@o2ib6:23/0 lens 576/3264 e 0 to 0 dl 1554834893 ref 2 fl Interpret:/0/0 rc 0/0 [2679723.109357] Lustre: 116156:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 88 previous similar messages [2679755.323435] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2679755.333801] Lustre: Skipped 208 previous similar messages [2679774.644022] Lustre: fir-MDT0003: haven't heard from client 37756ecf-401e-392e-3027-0955474be57d (at 10.8.1.29@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ebb2d3a5400, cur 1554834940 expire 1554834790 last 1554834713 [2679774.665810] Lustre: Skipped 1 previous similar message [2679951.262728] Lustre: 115867:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (30:1s); client may timeout. req@ffff8ecf765e1e00 x1628638051913520/t0(0) o101->7d8ca85d-8b80-6a23-8fa9-83dca7eb7196@10.8.27.28@o2ib6:5/0 lens 576/1792 e 0 to 0 dl 1554835115 ref 1 fl Complete:/0/0 rc 0/0 [2679951.291203] Lustre: 115867:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 6 previous similar messages [2680046.436392] Lustre: fir-MDT0001: Connection restored to a4daaf47-6ec9-4753-388e-0d0b7a7f70d6 (at 10.8.27.25@o2ib6) [2680046.446932] Lustre: Skipped 199 previous similar messages [2680143.581948] LustreError: 115573:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554835218, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8eb71e306e40/0xbc329462a9f7a385 lrc: 3/0,1 mode: --/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 36 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 115573 timeout: 0 lvb_type: 0 [2680143.621891] LustreError: 115573:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 37 previous similar messages [2680177.737772] LustreError: 115406:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ed79d4e2400 ns: mdt-fir-MDT0001_UUID lock: ffff8ebec58c0d80/0xbc329462ab7c9bc0 lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 28 type: IBT flags: 0x50200000000000 nid: 10.8.27.25@o2ib6 remote: 0x47916098fec0938e expref: 2 pid: 115406 timeout: 0 lvb_type: 0 [2680177.772846] LustreError: 115406:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 12 previous similar messages [2680328.424239] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 149s: evicting client at 10.8.27.25@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ec478700900/0xbc329462ac001fb3 lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 34 type: IBT flags: 0x60200400000020 nid: 10.8.27.25@o2ib6 remote: 0x47916098fec0b43b expref: 48 pid: 115347 timeout: 2680307 lvb_type: 0 [2680328.462583] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 11 previous similar messages [2680353.551562] Lustre: 115878:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ecea5209b00 x1628637840973008/t0(0) o101->e3c4cf5f-8e04-bccb-9d13-7eae1b83e1a1@10.8.27.29@o2ib6:23/0 lens 480/568 e 0 to 0 dl 1554835523 ref 2 fl Interpret:/0/0 rc 0/0 [2680353.580731] Lustre: 115878:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 94 previous similar messages [2680359.473572] Lustre: fir-MDT0001: Client 8c206ea7-4fa6-6560-2c3b-626d4cc9e42f (at 10.8.8.3@o2ib6) reconnecting [2680359.483662] Lustre: Skipped 192 previous similar messages [2680615.118989] Lustre: 115611:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (154:1s); client may timeout. req@ffff8ed0e910ec00 x1628647046373616/t0(0) o101->535a7c41-41b9-ec51-bbd6-e028c4ebcb2c@10.8.8.1@o2ib6:5/0 lens 568/2296 e 0 to 0 dl 1554835779 ref 1 fl Complete:/0/0 rc -107/-107 [2680615.147915] Lustre: 115611:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 9 previous similar messages [2680654.435893] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2680654.446503] Lustre: Skipped 209 previous similar messages [2680767.177583] LustreError: 115857:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554835842, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ecfc05e4380/0xbc329462b446fabf lrc: 3/1,0 mode: --/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 36 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 115857 timeout: 0 lvb_type: 0 [2680767.217517] LustreError: 115857:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 46 previous similar messages [2680815.509167] LNet: Service thread pid 115963 was inactive for 200.38s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2680815.526365] LNet: Skipped 9 previous similar messages [2680815.531601] Pid: 115963, comm: mdt00_077 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2680815.541619] Call Trace: [2680815.544283] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2680815.551407] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2680815.558790] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2680815.565798] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2680815.573009] [] mdt_object_lock+0x20/0x30 [mdt] [2680815.579324] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2680815.585837] [] mdt_intent_brw+0x1f/0x30 [mdt] [2680815.592079] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2680815.598767] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2680815.605705] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2680815.613007] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2680815.619368] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2680815.626492] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2680815.634399] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2680815.640917] [] kthread+0xd1/0xe0 [2680815.646019] [] ret_from_fork_nospec_begin+0xe/0x21 [2680815.652671] [] 0xffffffffffffffff [2680815.657886] LustreError: dumping log to /tmp/lustre-log.1554835981.115963 [2680816.600205] Pid: 115587, comm: mdt02_014 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2680816.610213] Call Trace: [2680816.612859] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2680816.619994] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2680816.627363] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2680816.634403] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2680816.641583] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2680816.648342] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2680816.655036] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2680816.661996] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2680816.669294] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2680816.675681] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2680816.682802] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2680816.690703] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2680816.697223] [] kthread+0xd1/0xe0 [2680816.702311] [] ret_from_fork_nospec_begin+0xe/0x21 [2680816.708966] [] 0xffffffffffffffff [2680816.714169] Pid: 115915, comm: mdt00_063 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2680816.724188] Call Trace: [2680816.726826] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2680816.733943] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2680816.741343] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2680816.748355] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2680816.755557] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2680816.762323] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2680816.769008] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2680816.775959] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2680816.783248] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2680816.789619] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2680816.796728] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2680816.804630] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2680816.811157] [] kthread+0xd1/0xe0 [2680816.816247] [] ret_from_fork_nospec_begin+0xe/0x21 [2680816.822922] [] 0xffffffffffffffff [2680816.828115] Pid: 115710, comm: mdt00_028 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2680816.838140] Call Trace: [2680816.840778] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2680816.847886] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2680816.855277] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2680816.862284] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2680816.869475] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2680816.876134] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2680816.883324] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2680816.890071] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2680816.896727] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2680816.903682] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2680816.910965] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2680816.917300] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2680816.924424] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2680816.932311] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2680816.938811] [] kthread+0xd1/0xe0 [2680816.943898] [] ret_from_fork_nospec_begin+0xe/0x21 [2680816.950537] [] 0xffffffffffffffff [2680816.955731] Pid: 115622, comm: mdt00_017 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2680816.965733] Call Trace: [2680816.968374] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2680816.975479] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2680816.982883] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2680816.989924] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2680816.997114] [] mdt_object_lock+0x20/0x30 [mdt] [2680817.003411] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2680817.009909] [] mdt_intent_brw+0x1f/0x30 [mdt] [2680817.016150] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2680817.022814] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2680817.029760] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2680817.037050] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2680817.043398] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2680817.050515] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2680817.058420] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2680817.064954] [] kthread+0xd1/0xe0 [2680817.070050] [] ret_from_fork_nospec_begin+0xe/0x21 [2680817.076708] [] 0xffffffffffffffff [2680817.081917] LNet: Service thread pid 115870 was inactive for 201.95s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2680817.095062] LNet: Skipped 9 previous similar messages [2680824.430577] LNet: Service thread pid 115963 completed after 209.30s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2680824.447003] LNet: Skipped 12 previous similar messages [2680846.229535] LustreError: dumping log to /tmp/lustre-log.1554836011.116124 [2680846.741548] LustreError: dumping log to /tmp/lustre-log.1554836012.116191 [2680854.430826] LustreError: 115622:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8eb5fe52f800 ns: mdt-fir-MDT0001_UUID lock: ffff8ebfecd8f080/0xbc329462b34308b3 lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 34 type: IBT flags: 0x50200000000000 nid: 10.8.17.19@o2ib6 remote: 0x8282471ef03af031 expref: 4 pid: 115622 timeout: 0 lvb_type: 0 [2680854.465881] LustreError: 115622:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 10 previous similar messages [2680867.097290] LNetError: 115145:0:(lib-msg.c:811:lnet_is_health_check()) Msg is in inconsistent state, don't perform health checking (0, 5) [2680964.456808] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2680964.467157] Lustre: Skipped 193 previous similar messages [2681040.432945] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 149s: evicting client at 10.8.27.28@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ed00056f2c0/0xbc329462b7f23c87 lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 37 type: IBT flags: 0x60200400000020 nid: 10.8.27.28@o2ib6 remote: 0x1df96237ad450b80 expref: 52 pid: 115953 timeout: 2681019 lvb_type: 0 [2681040.471311] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 10 previous similar messages [2681071.202311] Lustre: 116301:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8edf1dc7b900 x1629295509514192/t0(0) o101->8c206ea7-4fa6-6560-2c3b-626d4cc9e42f@10.8.8.3@o2ib6:21/0 lens 568/0 e 0 to 0 dl 1554836241 ref 2 fl Interpret:/0/ffffffff rc 0/-1 [2681071.231854] Lustre: 116301:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 93 previous similar messages [2681090.968539] LNet: Service thread pid 115898 was inactive for 200.45s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2681090.981673] LNet: Skipped 8 previous similar messages [2681090.986909] LustreError: dumping log to /tmp/lustre-log.1554836256.115898 [2681220.435257] LNet: Service thread pid 115573 completed after 329.95s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2681220.451695] LNet: Skipped 20 previous similar messages [2681220.455812] Lustre: 115844:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (30:144s); client may timeout. req@ffff8edf1dc7b900 x1629295509514192/t0(0) o101->8c206ea7-4fa6-6560-2c3b-626d4cc9e42f@10.8.8.3@o2ib6:21/0 lens 568/2296 e 0 to 0 dl 1554836241 ref 1 fl Complete:/0/0 rc -107/-107 [2681220.455815] Lustre: 115844:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 4 previous similar messages [2681274.519549] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2681274.530171] Lustre: Skipped 222 previous similar messages [2681423.260563] LNet: Service thread pid 116313 was inactive for 200.44s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2681423.277762] LNet: Skipped 4 previous similar messages [2681423.283001] Pid: 116313, comm: mdt02_105 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2681423.293004] Call Trace: [2681423.295648] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2681423.302768] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2681423.310133] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2681423.317137] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2681423.324315] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2681423.330981] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2681423.338164] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2681423.344906] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2681423.351579] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2681423.358523] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2681423.365827] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2681423.372181] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2681423.379315] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2681423.387217] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2681423.393725] [] kthread+0xd1/0xe0 [2681423.398813] [] ret_from_fork_nospec_begin+0xe/0x21 [2681423.405461] [] 0xffffffffffffffff [2681423.410673] LustreError: dumping log to /tmp/lustre-log.1554836588.116313 [2681423.665146] Pid: 115724, comm: mdt01_033 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2681423.675149] Call Trace: [2681423.677801] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2681423.684934] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2681423.692314] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2681423.699334] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2681423.706511] [] mdt_object_lock+0x20/0x30 [mdt] [2681423.712820] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2681423.719312] [] mdt_intent_brw+0x1f/0x30 [mdt] [2681423.725545] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2681423.732198] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2681423.739139] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2681423.746429] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2681423.752774] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2681423.759898] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2681423.767801] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2681423.774327] [] kthread+0xd1/0xe0 [2681423.779416] [] ret_from_fork_nospec_begin+0xe/0x21 [2681423.786062] [] 0xffffffffffffffff [2681423.791259] Pid: 115638, comm: mdt02_029 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2681423.801258] Call Trace: [2681423.803900] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2681423.811014] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2681423.818380] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2681423.825383] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2681423.832569] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2681423.839222] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2681423.846400] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2681423.853160] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2681423.859833] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2681423.866776] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2681423.874058] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2681423.880406] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2681423.887527] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2681423.895421] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2681423.901945] [] kthread+0xd1/0xe0 [2681423.907033] [] ret_from_fork_nospec_begin+0xe/0x21 [2681423.913680] [] 0xffffffffffffffff [2681423.918885] Pid: 116244, comm: mdt02_085 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2681423.928886] Call Trace: [2681423.931526] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2681423.938642] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2681423.946027] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2681423.953036] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2681423.960221] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2681423.966963] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2681423.973620] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2681423.980563] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2681423.987859] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2681423.994205] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2681424.001318] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2681424.009205] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2681424.015725] [] kthread+0xd1/0xe0 [2681424.020810] [] ret_from_fork_nospec_begin+0xe/0x21 [2681424.027468] [] 0xffffffffffffffff [2681424.032666] Pid: 115801, comm: mdt01_044 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2681424.042663] Call Trace: [2681424.045306] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2681424.052428] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2681424.059815] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2681424.066822] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2681424.074000] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2681424.080654] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2681424.087830] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2681424.094575] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2681424.101265] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2681424.108205] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2681424.115487] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2681424.121821] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2681424.128943] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2681424.136840] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2681424.143339] [] kthread+0xd1/0xe0 [2681424.148429] [] ret_from_fork_nospec_begin+0xe/0x21 [2681424.155079] [] 0xffffffffffffffff [2681424.160282] LNet: Service thread pid 115898 was inactive for 201.25s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2681424.173407] LNet: Skipped 10 previous similar messages [2681432.437814] LNet: Service thread pid 115844 completed after 209.61s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2681432.454238] LNet: Skipped 7 previous similar messages [2681469.059218] LustreError: 115622:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ed4a4c8c800 ns: mdt-fir-MDT0001_UUID lock: ffff8ec06da55340/0xbc329462c1238c41 lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 36 type: IBT flags: 0x50200000000000 nid: 10.8.27.29@o2ib6 remote: 0xd991486402059397 expref: 7 pid: 115622 timeout: 0 lvb_type: 0 [2681469.094278] LustreError: 115622:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 11 previous similar messages [2681565.252340] Lustre: fir-MDT0001: Client e98ad6a7-830d-1725-2883-db155e5b9e43 (at 10.8.17.16@o2ib6) reconnecting [2681565.262603] Lustre: Skipped 208 previous similar messages [2681593.249636] LustreError: 115583:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554836668, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8eb8adff0480/0xbc329462c23dcf42 lrc: 3/0,1 mode: --/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 37 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 115583 timeout: 0 lvb_type: 0 [2681593.289565] LustreError: 115583:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 42 previous similar messages [2681682.440730] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 150s: evicting client at 10.8.17.19@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8eccd65321c0/0xbc329462c23dcf18 lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 37 type: IBT flags: 0x60200400000020 nid: 10.8.17.19@o2ib6 remote: 0x8282471ef03be3f6 expref: 40 pid: 115408 timeout: 2681661 lvb_type: 0 [2681682.479113] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 8 previous similar messages [2681703.327985] LustreError: dumping log to /tmp/lustre-log.1554836868.115917 [2681703.839992] LustreError: dumping log to /tmp/lustre-log.1554836869.115345 [2681708.704055] Lustre: 115346:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (4/-6), not sending early reply req@ffff8eba9c3b4b00 x1628647360265968/t0(0) o101->eeceaf0a-f64b-44e4-4f28-fac655ebb0a4@10.8.17.19@o2ib6:28/0 lens 576/3264 e 0 to 0 dl 1554836878 ref 2 fl Interpret:/0/0 rc 0/0 [2681708.733322] Lustre: 115346:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 68 previous similar messages [2681735.072372] LNet: Service thread pid 115962 was inactive for 200.58s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2681735.089570] LNet: Skipped 4 previous similar messages [2681735.094820] Pid: 115962, comm: mdt01_082 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2681735.104818] Call Trace: [2681735.107480] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2681735.114616] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2681735.122000] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2681735.129021] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2681735.136216] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2681735.142901] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2681735.150083] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2681735.156857] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2681735.163508] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2681735.170471] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2681735.177758] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2681735.184129] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2681735.191250] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2681735.199151] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2681735.205670] [] kthread+0xd1/0xe0 [2681735.210760] [] ret_from_fork_nospec_begin+0xe/0x21 [2681735.217438] [] 0xffffffffffffffff [2681735.222635] LustreError: dumping log to /tmp/lustre-log.1554836900.115962 [2681735.246755] Pid: 115832, comm: mdt01_049 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2681735.256785] Call Trace: [2681735.259423] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2681735.266540] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2681735.273906] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2681735.280925] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2681735.288118] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2681735.294892] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2681735.301567] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2681735.308551] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2681735.315836] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2681735.322186] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2681735.329313] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2681735.337222] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2681735.343748] [] kthread+0xd1/0xe0 [2681735.348838] [] ret_from_fork_nospec_begin+0xe/0x21 [2681735.355502] [] 0xffffffffffffffff [2681832.442773] LNet: Service thread pid 115594 completed after 329.17s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2681832.442885] Lustre: 115832:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (154:144s); client may timeout. req@ffff8ed0f41b3000 x1628647360102032/t0(0) o101->eeceaf0a-f64b-44e4-4f28-fac655ebb0a4@10.8.17.19@o2ib6:29/0 lens 568/2296 e 0 to 0 dl 1554836853 ref 1 fl Complete:/0/0 rc -107/-107 [2681832.442890] Lustre: 115832:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 6 previous similar messages [2681832.498882] LNet: Skipped 11 previous similar messages [2681874.543951] Lustre: fir-MDT0001: Connection restored to e3c4cf5f-8e04-bccb-9d13-7eae1b83e1a1 (at 10.8.27.29@o2ib6) [2681874.554472] Lustre: Skipped 229 previous similar messages [2682083.446910] LustreError: 115764:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ec54a082800 ns: mdt-fir-MDT0001_UUID lock: ffff8ed63f7318c0/0xbc329462c6742d9f lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 31 type: IBT flags: 0x50200000000000 nid: 10.8.27.31@o2ib6 remote: 0x99d3453de1fe1cca expref: 3 pid: 115764 timeout: 0 lvb_type: 0 [2682083.481975] LustreError: 115764:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 5 previous similar messages [2682173.589485] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2682173.599830] Lustre: Skipped 213 previous similar messages [2682217.982459] LustreError: 115899:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554837293, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8eb6316f5a00/0xbc329462c6ad1604 lrc: 3/0,1 mode: --/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 35 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 115899 timeout: 0 lvb_type: 0 [2682218.022385] LustreError: 115899:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 23 previous similar messages [2682328.487834] LNet: Service thread pid 115844 was inactive for 200.42s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2682328.505050] LNet: Skipped 1 previous similar message [2682328.510201] Pid: 115844, comm: mdt02_051 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2682328.520219] Call Trace: [2682328.522864] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2682328.529994] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2682328.537420] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2682328.544444] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2682328.551642] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2682328.558323] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2682328.565517] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2682328.572278] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2682328.578950] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2682328.585902] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2682328.593183] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2682328.599540] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2682328.606689] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2682328.614614] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2682328.621142] [] kthread+0xd1/0xe0 [2682328.626245] [] ret_from_fork_nospec_begin+0xe/0x21 [2682328.632908] [] 0xffffffffffffffff [2682328.638106] LustreError: dumping log to /tmp/lustre-log.1554837493.115844 [2682329.065693] Pid: 116242, comm: mdt02_084 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2682329.075712] Call Trace: [2682329.078355] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2682329.085475] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2682329.092886] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2682329.099903] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2682329.107094] [] mdt_object_lock+0x20/0x30 [mdt] [2682329.113422] [] mdt_hsm_state_set+0xc9/0x830 [mdt] [2682329.120010] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2682329.127135] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2682329.135031] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2682329.141539] [] kthread+0xd1/0xe0 [2682329.146626] [] ret_from_fork_nospec_begin+0xe/0x21 [2682329.153290] [] 0xffffffffffffffff [2682329.158487] Pid: 115612, comm: mdt00_014 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2682329.168485] Call Trace: [2682329.171115] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2682329.178258] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2682329.185628] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2682329.192662] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2682329.199840] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2682329.206600] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2682329.213258] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2682329.220221] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2682329.227522] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2682329.233871] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2682329.240991] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2682329.248892] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2682329.255395] [] kthread+0xd1/0xe0 [2682329.260512] [] ret_from_fork_nospec_begin+0xe/0x21 [2682329.267163] [] 0xffffffffffffffff [2682329.272356] Pid: 115587, comm: mdt02_014 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2682329.282350] Call Trace: [2682329.285004] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2682329.292145] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2682329.299532] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2682329.306537] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2682329.313727] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2682329.320404] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2682329.327596] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2682329.334354] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2682329.341031] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2682329.347980] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2682329.355262] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2682329.361613] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2682329.368737] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2682329.376626] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2682329.383141] [] kthread+0xd1/0xe0 [2682329.388230] [] ret_from_fork_nospec_begin+0xe/0x21 [2682329.394905] [] 0xffffffffffffffff [2682329.400091] Pid: 115801, comm: mdt01_044 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2682329.410124] Call Trace: [2682329.412775] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2682329.419881] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2682329.427265] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2682329.434284] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2682329.441475] [] mdt_reint_object_lock+0x2c/0x60 [mdt] [2682329.448326] [] mdt_reint_striped_lock+0x8c/0x510 [mdt] [2682329.455343] [] mdt_reint_setattr+0x6c8/0x1340 [mdt] [2682329.462089] [] mdt_reint_rec+0x83/0x210 [mdt] [2682329.468326] [] mdt_reint_internal+0x6e3/0xaf0 [mdt] [2682329.475087] [] mdt_reint+0x67/0x140 [mdt] [2682329.480979] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2682329.488115] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2682329.496032] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2682329.502535] [] kthread+0xd1/0xe0 [2682329.507640] [] ret_from_fork_nospec_begin+0xe/0x21 [2682329.514290] [] 0xffffffffffffffff [2682329.519488] LNet: Service thread pid 115594 was inactive for 201.40s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2682329.532621] LNet: Skipped 12 previous similar messages [2682359.720218] LustreError: dumping log to /tmp/lustre-log.1554837525.115346 [2682367.449320] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 150s: evicting client at 10.9.107.5@o2ib4 ns: mdt-fir-MDT0001_UUID lock: ffff8ec1b93c8480/0xbc329462c6ad150f lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 38 type: IBT flags: 0x60200400000020 nid: 10.9.107.5@o2ib4 remote: 0xb9c41693abe51342 expref: 47 pid: 115764 timeout: 2682346 lvb_type: 0 [2682367.487686] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 9 previous similar messages [2682390.952606] LustreError: dumping log to /tmp/lustre-log.1554837556.115832 [2682394.280659] Lustre: 115406:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8eb18c9d8300 x1628546114631968/t0(0) o101->ac4b3f39-4aa4-91f4-89f9-e38a651f2f99@10.9.107.5@o2ib4:24/0 lens 576/3264 e 0 to 0 dl 1554837564 ref 2 fl Interpret:/0/0 rc 0/0 [2682394.309931] Lustre: 115406:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 70 previous similar messages [2682420.136978] LustreError: dumping log to /tmp/lustre-log.1554837585.116359 [2682483.652377] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2682483.662985] Lustre: Skipped 222 previous similar messages [2682517.451380] LNet: Service thread pid 115917 completed after 389.46s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2682517.452249] Lustre: 115832:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (30:297s); client may timeout. req@ffff8ec9da93aa00 x1628649192135008/t0(0) o101->eed99957-e395-8d59-f471-3be5bc5334d2@10.8.27.31@o2ib6:25/0 lens 568/2296 e 0 to 0 dl 1554837385 ref 1 fl Complete:/0/0 rc -107/-107 [2682517.452251] Lustre: 115832:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 4 previous similar messages [2682517.507411] LNet: Skipped 16 previous similar messages [2682793.672242] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2682793.682587] Lustre: Skipped 226 previous similar messages [2682857.455719] LustreError: 115898:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8eb18c8b2800 ns: mdt-fir-MDT0001_UUID lock: ffff8ecaff9a7080/0xbc329462c804dc7a lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 25 type: IBT flags: 0x50200000000000 nid: 10.8.27.29@o2ib6 remote: 0xd9914864020c78b6 expref: 2 pid: 115898 timeout: 0 lvb_type: 0 [2682857.490776] LustreError: 115898:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 10 previous similar messages [2682982.652093] LustreError: 115859:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554838057, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8edd68e63a80/0xbc329462c872697c lrc: 3/0,1 mode: --/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 32 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 115859 timeout: 0 lvb_type: 0 [2682982.692020] LustreError: 115859:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 53 previous similar messages [2683042.458010] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 150s: evicting client at 10.8.27.29@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ed7137672c0/0xbc329462c87268aa lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 32 type: IBT flags: 0x60200400000020 nid: 10.8.27.29@o2ib6 remote: 0xd9914864020d614c expref: 112 pid: 115920 timeout: 2683021 lvb_type: 0 [2683042.496445] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 5 previous similar messages [2683067.569195] Lustre: 115837:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ec84ddd5d00 x1628647047753904/t0(0) o101->535a7c41-41b9-ec51-bbd6-e028c4ebcb2c@10.8.8.1@o2ib6:7/0 lens 480/568 e 0 to 0 dl 1554838237 ref 2 fl Interpret:/0/0 rc 0/0 [2683067.598109] Lustre: 115837:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 57 previous similar messages [2683103.747142] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2683103.757748] Lustre: Skipped 195 previous similar messages [2683259.697552] Lustre: 115859:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (154:2s); client may timeout. req@ffff8ed4f5cb5d00 x1628649192464320/t0(0) o101->eed99957-e395-8d59-f471-3be5bc5334d2@10.8.27.31@o2ib6:9/0 lens 568/2296 e 0 to 0 dl 1554838423 ref 1 fl Complete:/0/0 rc -107/-107 [2683259.726659] Lustre: 115859:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 9 previous similar messages [2683413.767023] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2683413.777375] Lustre: Skipped 161 previous similar messages [2683580.466524] LustreError: 115920:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ed840b52000 ns: mdt-fir-MDT0001_UUID lock: ffff8ed459783cc0/0xbc329462c966457b lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x1b/0x0 rrc: 30 type: IBT flags: 0x50200000000000 nid: 10.8.17.19@o2ib6 remote: 0x8282471ef04230ce expref: 4 pid: 115920 timeout: 0 lvb_type: 0 [2683580.501576] LustreError: 115920:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 12 previous similar messages [2683673.841843] LustreError: 115573:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554838749, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ebd59781440/0xbc329462c9c3a23e lrc: 3/1,0 mode: --/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 33 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 115573 timeout: 0 lvb_type: 0 [2683673.881766] LustreError: 115573:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 33 previous similar messages [2683707.842499] Lustre: fir-MDT0001: Connection restored to 2c78cdad-2975-ca98-fb36-e7548576f834 (at 10.8.17.17@o2ib6) [2683707.853025] Lustre: Skipped 174 previous similar messages [2683733.466595] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 149s: evicting client at 10.8.27.31@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ed06ff757c0/0xbc329462c9c3a165 lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 33 type: IBT flags: 0x60200400000020 nid: 10.8.27.31@o2ib6 remote: 0x99d3453de2012f4e expref: 57 pid: 115832 timeout: 2683712 lvb_type: 0 [2683733.504956] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 9 previous similar messages [2683759.225924] Lustre: 115345:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8eb26cb94800 x1628546115817040/t0(0) o101->ac4b3f39-4aa4-91f4-89f9-e38a651f2f99@10.9.107.5@o2ib4:9/0 lens 576/3264 e 0 to 0 dl 1554838929 ref 2 fl Interpret:/0/0 rc 0/0 [2683759.255100] Lustre: 115345:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 83 previous similar messages [2684033.833887] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2684033.844239] Lustre: Skipped 152 previous similar messages [2684086.471676] Lustre: 116361:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (30:29s); client may timeout. req@ffff8eeb3bf5a400 x1628546116839552/t0(0) o101->ac4b3f39-4aa4-91f4-89f9-e38a651f2f99@10.9.107.5@o2ib4:2/0 lens 568/2296 e 0 to 0 dl 1554839222 ref 1 fl Complete:/0/0 rc -107/-107 [2684086.500744] Lustre: 116361:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 6 previous similar messages [2684211.473117] LustreError: 115948:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ecabb2b7400 ns: mdt-fir-MDT0001_UUID lock: ffff8ef044a0f2c0/0xbc329462cb0c7dfc lrc: 5/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 24 type: IBT flags: 0x50200000000000 nid: 10.8.27.28@o2ib6 remote: 0x1df96237ad52a50b expref: 10 pid: 115948 timeout: 0 lvb_type: 0 [2684211.508256] LustreError: 115948:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 15 previous similar messages [2684311.476923] Lustre: fir-MDT0001: Connection restored to a4daaf47-6ec9-4753-388e-0d0b7a7f70d6 (at 10.8.27.25@o2ib6) [2684311.487445] Lustre: Skipped 114 previous similar messages [2684344.474247] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 30s: evicting client at 10.8.27.25@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ec8ce5e3180/0xbc329462cb58f31b lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 31 type: IBT flags: 0x60200400000020 nid: 10.8.27.25@o2ib6 remote: 0x47916098fecc1625 expref: 418 pid: 116292 timeout: 2684323 lvb_type: 0 [2684344.512612] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 14 previous similar messages [2684369.601560] Lustre: 115886:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8eb872cefb00 x1628593199087264/t0(0) o101->bf6bd63b-9206-2f81-2780-2b483791a8c1@10.8.17.15@o2ib6:19/0 lens 576/3264 e 0 to 0 dl 1554839539 ref 2 fl Interpret:/0/0 rc 0/0 [2684369.630821] Lustre: 115886:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 126 previous similar messages [2684404.504987] LustreError: 115568:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554839479, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8eb7843d33c0/0xbc329462cb58f4db lrc: 3/1,0 mode: --/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 30 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 115568 timeout: 0 lvb_type: 0 [2684404.544900] LustreError: 115568:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 23 previous similar messages [2684653.900480] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2684653.910830] Lustre: Skipped 131 previous similar messages [2684717.479672] Lustre: 115721:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (30:29s); client may timeout. req@ffff8ec734552a00 x1628546117628896/t0(0) o101->ac4b3f39-4aa4-91f4-89f9-e38a651f2f99@10.9.107.5@o2ib4:3/0 lens 568/2296 e 0 to 0 dl 1554839853 ref 1 fl Complete:/0/0 rc -107/-107 [2684717.508764] Lustre: 115721:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 9 previous similar messages [2684874.483070] LustreError: 115950:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8edd26fbc800 ns: mdt-fir-MDT0001_UUID lock: ffff8ee3b4fac140/0xbc329462cc5d5bbe lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x1b/0x0 rrc: 29 type: IBT flags: 0x50200000000000 nid: 10.9.107.5@o2ib4 remote: 0xb9c41693abf1a7ff expref: 4 pid: 115950 timeout: 0 lvb_type: 0 [2684874.518118] LustreError: 115950:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 12 previous similar messages [2684932.977985] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2684932.988597] Lustre: Skipped 159 previous similar messages [2684970.312992] Lustre: 116204:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ec3cec06900 x1628593199735008/t0(0) o101->bf6bd63b-9206-2f81-2780-2b483791a8c1@10.8.17.15@o2ib6:20/0 lens 576/3264 e 0 to 0 dl 1554840140 ref 2 fl Interpret:/0/0 rc 0/0 [2684970.342259] Lustre: 116204:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 72 previous similar messages [2685035.398800] LustreError: 115837:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554840110, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ec5db7898c0/0xbc329462cccf0671 lrc: 3/1,0 mode: --/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 35 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 115837 timeout: 0 lvb_type: 0 [2685035.438737] LustreError: 115837:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 31 previous similar messages [2685094.483526] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 149s: evicting client at 10.8.17.15@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ec2f47e18c0/0xbc329462cccf0250 lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 35 type: IBT flags: 0x60200400000020 nid: 10.8.17.15@o2ib6 remote: 0x794f201b7362abbe expref: 191 pid: 115890 timeout: 2685073 lvb_type: 0 [2685094.521973] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 11 previous similar messages [2685266.207161] Lustre: fir-MDT0001: Client ac4b3f39-4aa4-91f4-89f9-e38a651f2f99 (at 10.9.107.5@o2ib4) reconnecting [2685266.217429] Lustre: Skipped 127 previous similar messages [2685415.536303] Lustre: 115837:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (30:150s); client may timeout. req@ffff8ec9ede95700 x1628647364065616/t133504265748(0) o36->eeceaf0a-f64b-44e4-4f28-fac655ebb0a4@10.8.17.19@o2ib6:10/0 lens 488/424 e 0 to 0 dl 1554840430 ref 1 fl Complete:/0/0 rc 0/0 [2685415.565822] Lustre: 115837:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 9 previous similar messages [2685478.490224] LustreError: 116142:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ec9ddbddc00 ns: mdt-fir-MDT0001_UUID lock: ffff8ec15b769440/0xbc329462cdfa1c08 lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 19 type: IBT flags: 0x50200000000000 nid: 10.8.27.25@o2ib6 remote: 0x47916098fecf8810 expref: 3 pid: 116142 timeout: 0 lvb_type: 0 [2685478.525275] LustreError: 116142:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 16 previous similar messages [2685551.503757] Lustre: fir-MDT0001: Connection restored to 7d8ca85d-8b80-6a23-8fa9-83dca7eb7196 (at 10.8.27.28@o2ib6) [2685551.514285] Lustre: Skipped 133 previous similar messages [2685578.832452] Lustre: 115857:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ed048d1ef00 x1628649195103728/t0(0) o101->eed99957-e395-8d59-f471-3be5bc5334d2@10.8.27.31@o2ib6:29/0 lens 480/568 e 0 to 0 dl 1554840749 ref 2 fl Interpret:/0/0 rc 0/0 [2685578.861607] Lustre: 115857:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 87 previous similar messages [2685717.491163] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 29s: evicting client at 10.8.17.15@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ecb25cbc380/0xbc329462ce8378b9 lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 25 type: IBT flags: 0x60200400000020 nid: 10.8.17.15@o2ib6 remote: 0x794f201b7368d7db expref: 347 pid: 116159 timeout: 2685696 lvb_type: 0 [2685717.529538] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 14 previous similar messages [2685894.048745] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2685894.059093] Lustre: Skipped 73 previous similar messages [2686055.003141] Lustre: 115724:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (30:1s); client may timeout. req@ffff8ee002b5e900 x1628647055511440/t0(0) o101->535a7c41-41b9-ec51-bbd6-e028c4ebcb2c@10.8.8.1@o2ib6:19/0 lens 568/2296 e 0 to 0 dl 1554841219 ref 1 fl Complete:/0/0 rc -107/-107 [2686055.032035] Lustre: 115724:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 8 previous similar messages [2686110.509384] LustreError: 115611:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ec9ca78fc00 ns: mdt-fir-MDT0001_UUID lock: ffff8edab0b2af40/0xbc329462cf771962 lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0x13376:0x0].0x0 bits 0x20/0x0 rrc: 18 type: IBT flags: 0x50200000000000 nid: 10.8.17.15@o2ib6 remote: 0x794f201b737c1ae4 expref: 2956 pid: 115611 timeout: 0 lvb_type: 0 [2686110.544697] LustreError: 115611:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 17 previous similar messages [2686173.093240] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2686173.103850] Lustre: Skipped 65 previous similar messages [2686192.560027] Lustre: 115892:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ed100121800 x1628593206274784/t0(0) o101->bf6bd63b-9206-2f81-2780-2b483791a8c1@10.8.17.15@o2ib6:12/0 lens 576/3264 e 0 to 0 dl 1554841362 ref 2 fl Interpret:/0/0 rc 0/0 [2686192.589288] Lustre: 115892:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 100 previous similar messages [2686363.768134] LustreError: 116206:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554841439, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8eb4c1f55100/0xbc329462cfefcd6f lrc: 3/0,1 mode: --/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 21 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 116206 timeout: 0 lvb_type: 0 [2686363.808061] LustreError: 116206:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 17 previous similar messages [2686423.499862] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 149s: evicting client at 10.8.8.1@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ed6bfeff500/0xbc329462cfefcba8 lrc: 3/0,0 mode: PW/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 20 type: IBT flags: 0x60200400000020 nid: 10.8.8.1@o2ib6 remote: 0xbeca8e277501e108 expref: 39 pid: 115892 timeout: 2686402 lvb_type: 0 [2686423.537857] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 13 previous similar messages [2686512.728524] Lustre: fir-MDT0001: haven't heard from client 2d10c5cd-abf6-dba8-fbcc-52f8092510eb (at 10.8.26.33@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ecc14037c00, cur 1554841678 expire 1554841528 last 1554841451 [2686512.750401] Lustre: Skipped 1 previous similar message [2686514.135337] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2686514.145689] Lustre: Skipped 76 previous similar messages [2686544.168349] LustreError: 115962:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554841619, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ec033c69b00/0xbc329462d0429984 lrc: 3/0,1 mode: --/PW res: [0x24000f611:0x13376:0x0].0x0 bits 0x40/0x0 rrc: 20 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 115962 timeout: 0 lvb_type: 0 [2686544.208278] LustreError: 115962:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 5 previous similar messages [2686793.179988] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2686793.190609] Lustre: Skipped 98 previous similar messages [2687134.219082] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2687134.229434] Lustre: Skipped 57 previous similar messages [2687413.263659] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2687413.274277] Lustre: Skipped 39 previous similar messages [2687754.283803] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2687754.294149] Lustre: Skipped 39 previous similar messages [2688033.328311] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2688033.338927] Lustre: Skipped 39 previous similar messages [2688374.372588] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2688374.382943] Lustre: Skipped 39 previous similar messages [2688653.417140] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2688653.427749] Lustre: Skipped 39 previous similar messages [2688994.437297] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2688994.447642] Lustre: Skipped 39 previous similar messages [2689273.481777] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2689273.492391] Lustre: Skipped 39 previous similar messages [2689614.501969] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2689614.512316] Lustre: Skipped 39 previous similar messages [2689634.765786] Lustre: fir-MDT0001: haven't heard from client 682fc84c-f203-4299-faa4-f7ee9e17c3fb (at 10.8.9.8@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ebc271f5000, cur 1554844800 expire 1554844650 last 1554844573 [2689634.787490] Lustre: Skipped 1 previous similar message [2689811.774767] Lustre: 115891:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8ee5bc264800 x1628739434338016/t181396944539(0) o36->e133d3ba-741b-8aba-d6d6-045d6382599c@10.9.108.54@o2ib4:2/0 lens 512/2888 e 1 to 0 dl 1554844982 ref 2 fl Interpret:/0/0 rc 0/0 [2689811.804796] Lustre: 115891:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 38 previous similar messages [2689881.850231] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2689881.860758] Lustre: Skipped 51 previous similar messages [2689887.670682] LustreError: 116172:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554844962, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0003_UUID lock: ffff8ebb5cc1d100/0xbc329462d6c0d0b3 lrc: 3/1,0 mode: --/PR res: [0x28000f74a:0x1336a:0x0].0x0 bits 0x12/0x0 rrc: 4 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 116172 timeout: 0 lvb_type: 0 [2689887.710527] LustreError: 116172:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 5 previous similar messages [2689997.830989] LNet: Service thread pid 116172 was inactive for 200.15s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2689997.848181] LNet: Skipped 4 previous similar messages [2689997.853435] Pid: 116172, comm: mdt00_081 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2689997.863433] Call Trace: [2689997.866075] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2689997.873188] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2689997.880571] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2689997.887594] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2689997.894798] [] mdt_getattr_name_lock+0x11d/0x1c30 [mdt] [2689997.901892] [] mdt_getattr_name+0xc4/0x2b0 [mdt] [2689997.908379] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2689997.915511] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2689997.923445] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2689997.929948] [] kthread+0xd1/0xe0 [2689997.935033] [] ret_from_fork_nospec_begin+0xe/0x21 [2689997.941682] [] 0xffffffffffffffff [2689997.946891] LustreError: dumping log to /tmp/lustre-log.1554845163.116172 [2689999.250696] Pid: 115560, comm: mdt03_005 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2689999.260734] Call Trace: [2689999.263377] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2689999.270496] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2689999.277891] [] mdt_dom_discard_data+0x101/0x130 [mdt] [2689999.284832] [] mdt_reint_unlink+0x331/0x1480 [mdt] [2689999.291503] [] mdt_reint_rec+0x83/0x210 [mdt] [2689999.297745] [] mdt_reint_internal+0x6e3/0xaf0 [mdt] [2689999.304495] [] mdt_reint+0x67/0x140 [mdt] [2689999.310380] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2689999.317512] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2689999.325403] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2689999.331918] [] kthread+0xd1/0xe0 [2689999.337006] [] ret_from_fork_nospec_begin+0xe/0x21 [2689999.343679] [] 0xffffffffffffffff [2690218.087355] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2690218.097531] Lustre: Skipped 77 previous similar messages [2690482.615806] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2690482.626413] Lustre: Skipped 95 previous similar messages [2690823.635970] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2690823.646323] Lustre: Skipped 95 previous similar messages [2690905.781457] Lustre: fir-MDT0001: haven't heard from client 3ac9e354-6111-8e51-fd56-30089bd085b0 (at 10.8.26.33@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8eb7b7021800, cur 1554846071 expire 1554845921 last 1554845844 [2690905.803350] Lustre: Skipped 5 previous similar messages [2690906.861433] Lustre: fir-MDT0003: haven't heard from client 3ac9e354-6111-8e51-fd56-30089bd085b0 (at 10.8.26.33@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ed3704ebc00, cur 1554846072 expire 1554845922 last 1554845845 [2691100.269235] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2691100.279667] Lustre: Skipped 101 previous similar messages [2691436.289060] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2691436.299252] Lustre: Skipped 97 previous similar messages [2691619.789736] Lustre: fir-MDT0003: haven't heard from client 0ca6981c-0ba4-6115-f42c-a883e9ce9e93 (at 10.8.17.25@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ef057988c00, cur 1554846785 expire 1554846635 last 1554846558 [2691695.790650] Lustre: fir-MDT0001: haven't heard from client 8250f13d-e190-d3ce-3c10-a468d3298428 (at 10.9.107.4@o2ib4) in 224 seconds. I think it's dead, and I am evicting it. exp ffff8eb5066a2400, cur 1554846861 expire 1554846711 last 1554846637 [2691695.812554] Lustre: Skipped 5 previous similar messages [2691698.808347] Lustre: fir-MDT0003: haven't heard from client 8250f13d-e190-d3ce-3c10-a468d3298428 (at 10.9.107.4@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ed07c2ba400, cur 1554846864 expire 1554846714 last 1554846637 [2691698.830238] Lustre: Skipped 22 previous similar messages [2691709.385519] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2691709.395951] Lustre: Skipped 97 previous similar messages [2691771.792053] Lustre: fir-MDT0001: haven't heard from client b56cfab4-9432-2f85-7f2c-fc6928253cc7 (at 10.9.106.63@o2ib4) in 226 seconds. I think it's dead, and I am evicting it. exp ffff8ed078d5f400, cur 1554846937 expire 1554846787 last 1554846711 [2691771.814037] Lustre: Skipped 29 previous similar messages [2691947.794562] Lustre: fir-MDT0001: haven't heard from client 9613f5ca-1ec4-4b69-69b6-b9d2007eb68e (at 10.8.27.10@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ee051f46400, cur 1554847113 expire 1554846963 last 1554846886 [2691947.816441] Lustre: Skipped 30 previous similar messages [2691957.797978] Lustre: fir-MDT0003: haven't heard from client 9613f5ca-1ec4-4b69-69b6-b9d2007eb68e (at 10.8.27.10@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ee29fb81800, cur 1554847123 expire 1554846973 last 1554846896 [2691957.819869] Lustre: Skipped 1 previous similar message [2692045.405379] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2692045.415555] Lustre: Skipped 97 previous similar messages [2692311.796268] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2692311.806874] Lustre: Skipped 95 previous similar messages [2692502.259275] Lustre: 115658:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554847660/real 1554847660] req@ffff8edbc9278300 x1630272523997872/t0(0) o104->fir-MDT0003@10.8.18.15@o2ib6:15/16 lens 296/224 e 0 to 1 dl 1554847667 ref 1 fl Rpc:X/0/ffffffff rc 0/-1 [2692502.286810] Lustre: 115658:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 48 previous similar messages [2692505.615310] Lustre: 115351:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554847663/real 1554847663] req@ffff8eddca330600 x1630272524041984/t0(0) o104->fir-MDT0003@10.8.18.15@o2ib6:15/16 lens 296/224 e 0 to 1 dl 1554847670 ref 1 fl Rpc:X/0/ffffffff rc 0/-1 [2692510.351365] Lustre: 115859:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8ed0dc955700 x1628582110904736/t0(0) o101->49479271-414c-ecea-b074-b345e2606124@10.9.108.9@o2ib4:0/0 lens 480/568 e 1 to 0 dl 1554847680 ref 2 fl Interpret:/0/0 rc 0/0 [2692510.380372] Lustre: 115859:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 1 previous similar message [2692512.642393] Lustre: 115351:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554847670/real 1554847670] req@ffff8eddca330600 x1630272524041984/t0(0) o104->fir-MDT0003@10.8.18.15@o2ib6:15/16 lens 296/224 e 0 to 1 dl 1554847677 ref 1 fl Rpc:X/2/ffffffff rc 0/-1 [2692512.669908] Lustre: 115351:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 4 previous similar messages [2692520.397484] Lustre: 115844:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8ed691dc8f00 x1628696015302240/t0(0) o101->8f06907a-36b8-ba84-2a7c-5d8586764554@10.9.107.65@o2ib4:10/0 lens 480/568 e 1 to 0 dl 1554847690 ref 2 fl Interpret:/0/0 rc 0/0 [2692520.426668] Lustre: 115844:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 3 previous similar messages [2692523.296533] Lustre: 115658:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554847681/real 1554847681] req@ffff8edbc9278300 x1630272523997872/t0(0) o104->fir-MDT0003@10.8.18.15@o2ib6:15/16 lens 296/224 e 0 to 1 dl 1554847688 ref 1 fl Rpc:X/2/ffffffff rc 0/-1 [2692523.324077] Lustre: 115658:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 14 previous similar messages [2692530.333648] LustreError: 115658:0:(ldlm_lockd.c:682:ldlm_handle_ast_error()) ### client (nid 10.8.18.15@o2ib6) failed to reply to blocking AST (req@ffff8edbc9278300 x1630272523997872 status 0 rc -110), evict it ns: mdt-fir-MDT0003_UUID lock: ffff8eb5c169bcc0/0xbc329462da3a13e9 lrc: 4/0,0 mode: PR/PR res: [0x28000f4f2:0xa7:0x0].0x0 bits 0x5b/0x0 rrc: 72 type: IBT flags: 0x60200400000020 nid: 10.8.18.15@o2ib6 remote: 0x9ba7471cf62d4211 expref: 2950 pid: 115618 timeout: 2692531 lvb_type: 0 [2692530.376881] LustreError: 138-a: fir-MDT0003: A client on nid 10.8.18.15@o2ib6 was evicted due to a lock blocking callback time out: rc -110 [2692530.389617] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 35s: evicting client at 10.8.18.15@o2ib6 ns: mdt-fir-MDT0003_UUID lock: ffff8eb5c169bcc0/0xbc329462da3a13e9 lrc: 3/0,0 mode: PR/PR res: [0x28000f4f2:0xa7:0x0].0x0 bits 0x5b/0x0 rrc: 72 type: IBT flags: 0x60200400000020 nid: 10.8.18.15@o2ib6 remote: 0x9ba7471cf62d4211 expref: 2951 pid: 115618 timeout: 0 lvb_type: 0 [2692530.427273] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 4 previous similar messages [2692530.510970] LustreError: 115658:0:(client.c:1175:ptlrpc_import_delay_req()) @@@ IMP_CLOSED req@ffff8edbc9253300 x1630272524237392/t0(0) o104->fir-MDT0003@10.8.18.15@o2ib6:15/16 lens 296/224 e 0 to 0 dl 0 ref 1 fl Rpc:/0/ffffffff rc 0/-1 [2692540.730751] Lustre: 115587:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554847698/real 1554847698] req@ffff8ed0dc9fce00 x1630272524087472/t0(0) o104->fir-MDT0003@10.8.18.17@o2ib6:15/16 lens 296/224 e 0 to 1 dl 1554847705 ref 1 fl Rpc:X/2/ffffffff rc 0/-1 [2692540.758285] Lustre: 115587:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 13 previous similar messages [2692540.768418] LustreError: 115587:0:(ldlm_lockd.c:682:ldlm_handle_ast_error()) ### client (nid 10.8.18.17@o2ib6) failed to reply to blocking AST (req@ffff8ed0dc9fce00 x1630272524087472 status 0 rc -110), evict it ns: mdt-fir-MDT0003_UUID lock: ffff8edb39216540/0xbc329462da3b5f04 lrc: 4/0,0 mode: PR/PR res: [0x2800102fc:0x84:0x0].0x0 bits 0x5b/0x0 rrc: 30 type: IBT flags: 0x60200400000020 nid: 10.8.18.17@o2ib6 remote: 0x30ff5905b89d22ca expref: 2963 pid: 116159 timeout: 2692542 lvb_type: 0 [2692540.811642] LustreError: 138-a: fir-MDT0003: A client on nid 10.8.18.17@o2ib6 was evicted due to a lock blocking callback time out: rc -110 [2692549.705840] Lustre: 115922:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8ebecedc9800 x1628584633290016/t0(0) o101->062b6f4e-85d8-c11f-5a12-27ec1dc7a3ae@10.9.107.67@o2ib4:9/0 lens 480/568 e 1 to 0 dl 1554847719 ref 2 fl Interpret:/0/0 rc 0/0 [2692574.567149] Lustre: 115764:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554847732/real 1554847732] req@ffff8ed691e95d00 x1630272524360144/t0(0) o104->fir-MDT0003@10.8.18.13@o2ib6:15/16 lens 296/224 e 0 to 1 dl 1554847739 ref 1 fl Rpc:X/2/ffffffff rc 0/-1 [2692574.594664] Lustre: 115764:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 34 previous similar messages [2692588.632309] Lustre: 115659:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ebececf3f00 x1628592858092480/t0(0) o101->ac76845a-f296-9da7-b752-06a24face459@10.8.18.12@o2ib6:18/0 lens 576/3264 e 0 to 0 dl 1554847758 ref 2 fl Interpret:/0/0 rc 0/0 [2692588.661571] Lustre: 115659:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 28 previous similar messages [2692631.545828] LustreError: 116165:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554847706, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0003_UUID lock: ffff8ecf5bd08900/0xbc329462dfc45e4f lrc: 3/1,0 mode: --/PR res: [0x28000f8d0:0x92c3:0x0].0x0 bits 0x13/0x0 rrc: 128 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 116165 timeout: 0 lvb_type: 0 [2692638.614924] Lustre: 115919:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554847796/real 1554847796] req@ffff8ee3904d3f00 x1630272524475344/t0(0) o104->fir-MDT0003@10.8.18.13@o2ib6:15/16 lens 296/224 e 0 to 1 dl 1554847803 ref 1 fl Rpc:X/2/ffffffff rc 0/-1 [2692638.642457] Lustre: 115919:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 111 previous similar messages [2692641.807957] LustreError: 116301:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554847717, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0003_UUID lock: ffff8ed6ade50d80/0xbc329462dfc8cb22 lrc: 3/1,0 mode: --/PR res: [0x28000f8d0:0x92c3:0x0].0x0 bits 0x13/0x0 rrc: 128 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 116301 timeout: 0 lvb_type: 0 [2692645.503473] Lustre: fir-MDT0003: Client c23d08d1-0a95-9c2f-945e-9baed1e23306 (at 10.9.102.53@o2ib4) reconnecting [2692645.513822] Lustre: Skipped 332 previous similar messages [2692648.024024] LustreError: 116212:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554847723, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0003_UUID lock: ffff8eb76217d100/0xbc329462dfcb3a45 lrc: 3/1,0 mode: --/PR res: [0x280000dbb:0x18a:0x0].0x0 bits 0x13/0x0 rrc: 556 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 116212 timeout: 0 lvb_type: 0 [2692648.063861] LustreError: 116212:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 19 previous similar messages [2692659.825216] LustreError: 115587:0:(ldlm_lockd.c:682:ldlm_handle_ast_error()) ### client (nid 10.8.18.13@o2ib6) failed to reply to blocking AST (req@ffff8ed0dc9ffb00 x1630272524087728 status 0 rc -110), evict it ns: mdt-fir-MDT0003_UUID lock: ffff8ec51df48fc0/0xbc329462da29f90c lrc: 4/0,0 mode: PR/PR res: [0x2800102fc:0x84:0x0].0x0 bits 0x5b/0x0 rrc: 29 type: IBT flags: 0x60200400000020 nid: 10.8.18.13@o2ib6 remote: 0x62028c164870431e expref: 2966 pid: 115890 timeout: 2692781 lvb_type: 0 [2692659.868436] LustreError: 138-a: fir-MDT0003: A client on nid 10.8.18.13@o2ib6 was evicted due to a lock blocking callback time out: rc -110 [2692659.881165] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 155s: evicting client at 10.8.18.13@o2ib6 ns: mdt-fir-MDT0003_UUID lock: ffff8ec51df48fc0/0xbc329462da29f90c lrc: 3/0,0 mode: PR/PR res: [0x2800102fc:0x84:0x0].0x0 bits 0x5b/0x0 rrc: 29 type: IBT flags: 0x60200400000020 nid: 10.8.18.13@o2ib6 remote: 0x62028c164870431e expref: 2967 pid: 115890 timeout: 0 lvb_type: 0 [2692659.918941] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 1 previous similar message [2692659.929350] Lustre: 115913:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (125:1s); client may timeout. req@ffff8ebecedc9800 x1628584633290016/t0(0) o101->062b6f4e-85d8-c11f-5a12-27ec1dc7a3ae@10.9.107.67@o2ib4:9/0 lens 480/536 e 1 to 0 dl 1554847824 ref 1 fl Complete:/0/0 rc 0/0 [2692659.940948] LustreError: 115957:0:(client.c:1175:ptlrpc_import_delay_req()) @@@ IMP_CLOSED req@ffff8ebbddc4b300 x1630272524917392/t0(0) o104->fir-MDT0003@10.8.18.13@o2ib6:15/16 lens 296/224 e 0 to 0 dl 0 ref 1 fl Rpc:/0/ffffffff rc 0/-1 [2692659.979309] Lustre: 115913:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 5 previous similar messages [2692693.671614] Lustre: 115832:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8eca34fa1200 x1628869429550608/t0(0) o101->65c38a59-13b6-ad9d-d264-242871bd2192@10.8.27.18@o2ib6:3/0 lens 480/568 e 0 to 0 dl 1554847863 ref 2 fl Interpret:/0/0 rc 0/0 [2692693.700705] Lustre: 115832:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 193 previous similar messages [2692702.802735] Lustre: fir-MDT0001: haven't heard from client 9a813140-ad3d-15f6-bd96-54d3805fc928 (at 10.8.18.17@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8edf7d7cd400, cur 1554847868 expire 1554847718 last 1554847641 [2692702.824612] Lustre: Skipped 1 previous similar message [2692703.657771] LustreError: 116124:0:(ldlm_lockd.c:682:ldlm_handle_ast_error()) ### client (nid 10.8.18.16@o2ib6) failed to reply to blocking AST (req@ffff8ec1b34e6900 x1630272525094832 status 0 rc -110), evict it ns: mdt-fir-MDT0003_UUID lock: ffff8ebfe8f5e0c0/0xbc329462da2fc84d lrc: 4/0,0 mode: PR/PR res: [0x28000f839:0x9287:0x0].0x0 bits 0x40/0x0 rrc: 76 type: IBT flags: 0x60000400000020 nid: 10.8.18.16@o2ib6 remote: 0xffc69be677deda78 expref: 2965 pid: 115618 timeout: 2692705 lvb_type: 0 [2692703.701171] LustreError: 138-a: fir-MDT0003: A client on nid 10.8.18.16@o2ib6 was evicted due to a lock blocking callback time out: rc -110 [2692725.573040] LustreError: 115912:0:(ldlm_lockd.c:682:ldlm_handle_ast_error()) ### client (nid 10.8.18.12@o2ib6) failed to reply to blocking AST (req@ffff8eba6dc5a700 x1630272525159008 status 0 rc -110), evict it ns: mdt-fir-MDT0003_UUID lock: ffff8ec65ca3b3c0/0xbc329462dff952cb lrc: 4/0,0 mode: PR/PR res: [0x280000dbb:0x18a:0x0].0x0 bits 0x13/0x0 rrc: 652 type: IBT flags: 0x60200400000020 nid: 10.8.18.12@o2ib6 remote: 0xefb71e87a1ec5082 expref: 3002 pid: 115617 timeout: 2692726 lvb_type: 0 [2692725.616428] LustreError: 138-a: fir-MDT0003: A client on nid 10.8.18.12@o2ib6 was evicted due to a lock blocking callback time out: rc -110 [2692885.806190] Lustre: fir-MDT0001: haven't heard from client 0e5ee8fd-25ed-1aae-9a3a-3fc4362ec502 (at 10.8.18.16@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ec078da3400, cur 1554848051 expire 1554847901 last 1554847824 [2692885.828064] Lustre: Skipped 2 previous similar messages [2692927.588091] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2692927.598524] Lustre: Skipped 458 previous similar messages [2693263.608149] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2693263.618319] Lustre: Skipped 213 previous similar messages [2693536.689489] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2693536.699930] Lustre: Skipped 155 previous similar messages [2693711.815260] Lustre: fir-MDT0001: haven't heard from client 417a8c3f-2135-6cfd-4833-16427e9861f1 (at 10.8.26.16@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ed198b59c00, cur 1554848877 expire 1554848727 last 1554848650 [2693711.837142] Lustre: Skipped 1 previous similar message [2693872.710552] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2693872.720731] Lustre: Skipped 97 previous similar messages [2694022.819121] Lustre: fir-MDT0001: haven't heard from client bcf64f18-80a8-ed4d-5922-949d80750823 (at 10.8.8.33@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ed184e2dc00, cur 1554849188 expire 1554849038 last 1554848961 [2694022.840908] Lustre: Skipped 1 previous similar message [2694140.922466] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2694140.933081] Lustre: Skipped 135 previous similar messages [2694337.823666] Lustre: fir-MDT0003: haven't heard from client 524aa348-c2f6-75a0-0025-2824d3644a3f (at 10.8.25.21@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ee3a43adc00, cur 1554849503 expire 1554849353 last 1554849276 [2694337.845572] Lustre: Skipped 1 previous similar message [2694481.802045] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2694481.812220] Lustre: Skipped 95 previous similar messages [2694754.886576] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2694754.897009] Lustre: Skipped 107 previous similar messages [2695090.911738] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2695090.921926] Lustre: Skipped 97 previous similar messages [2695132.832835] Lustre: fir-MDT0001: haven't heard from client 8f76a491-ee82-ce9f-f9c4-6dc9eb637daf (at 10.9.114.13@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ec07ba29800, cur 1554850298 expire 1554850148 last 1554850071 [2695132.854814] Lustre: Skipped 3 previous similar messages [2695208.835908] Lustre: fir-MDT0001: haven't heard from client 5f6d3747-ddfa-fc24-2c77-efd7e80ad18a (at 10.9.108.47@o2ib4) in 172 seconds. I think it's dead, and I am evicting it. exp ffff8edcb42c8400, cur 1554850374 expire 1554850224 last 1554850202 [2695208.857887] Lustre: Skipped 1 previous similar message [2695363.995170] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2695364.005610] Lustre: Skipped 97 previous similar messages [2695412.836136] Lustre: fir-MDT0001: haven't heard from client 250b848b-16c2-24b0-6c48-cac4d326e580 (at 10.8.2.15@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ec076f1ec00, cur 1554850578 expire 1554850428 last 1554850351 [2695412.857942] Lustre: Skipped 1 previous similar message [2695630.839701] Lustre: fir-MDT0001: haven't heard from client c1b7b606-cf3c-ce5f-31cb-45f37f0f59dc (at 10.9.114.9@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ebb46dc7400, cur 1554850796 expire 1554850646 last 1554850569 [2695630.861596] Lustre: Skipped 1 previous similar message [2695691.051474] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2695691.061820] Lustre: Skipped 95 previous similar messages [2695706.840146] Lustre: fir-MDT0001: haven't heard from client d8ab8b41-d578-d839-2400-f99293fe9723 (at 10.8.21.17@o2ib6) in 161 seconds. I think it's dead, and I am evicting it. exp ffff8ed91ae2c800, cur 1554850872 expire 1554850722 last 1554850711 [2695706.862019] Lustre: Skipped 1 previous similar message [2695782.840787] Lustre: fir-MDT0003: haven't heard from client df2c2787-f2f0-9f7b-851f-bd66240636b4 (at 10.8.24.14@o2ib6) in 164 seconds. I think it's dead, and I am evicting it. exp ffff8ed1fb157c00, cur 1554850948 expire 1554850798 last 1554850784 [2695782.862668] Lustre: Skipped 1 previous similar message [2695970.071019] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2695970.081625] Lustre: Skipped 99 previous similar messages [2696309.096671] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2696309.106848] Lustre: Skipped 97 previous similar messages [2696582.180014] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2696582.190449] Lustre: Skipped 105 previous similar messages [2696918.205079] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2696918.215258] Lustre: Skipped 97 previous similar messages [2696937.854833] Lustre: fir-MDT0001: haven't heard from client 016cbaed-1f79-b150-7080-ac5e42984022 (at 10.8.12.12@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ed720ed5000, cur 1554852103 expire 1554851953 last 1554851876 [2696937.876728] Lustre: Skipped 1 previous similar message [2697191.305459] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2697191.315899] Lustre: Skipped 99 previous similar messages [2697520.201178] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2697520.211529] Lustre: Skipped 95 previous similar messages [2697578.862917] Lustre: fir-MDT0001: haven't heard from client 98a44362-65d0-d4ef-d06c-bc3321505252 (at 10.8.30.1@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8edce34c8000, cur 1554852744 expire 1554852594 last 1554852517 [2697578.884707] Lustre: Skipped 1 previous similar message [2697654.863819] Lustre: fir-MDT0001: haven't heard from client 9f3d5e5a-32c4-8ffd-0ceb-12eaf6850297 (at 10.8.28.1@o2ib6) in 151 seconds. I think it's dead, and I am evicting it. exp ffff8eca93375000, cur 1554852820 expire 1554852670 last 1554852669 [2697654.885617] Lustre: Skipped 1 previous similar message [2697799.220765] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2697799.231373] Lustre: Skipped 96 previous similar messages [2697865.866526] Lustre: fir-MDT0001: haven't heard from client ecd69873-9a0a-0293-0ce7-d92dc18bf8d9 (at 10.9.101.11@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8edf187cb800, cur 1554853031 expire 1554852881 last 1554852804 [2697865.888487] Lustre: Skipped 1 previous similar message [2698136.417568] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2698136.427747] Lustre: Skipped 81 previous similar messages [2698237.871808] Lustre: fir-MDT0003: haven't heard from client f3fc7f57-7bbe-e8dd-3226-dc6d0d68a0ee (at 10.8.23.19@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ee0324c9800, cur 1554853403 expire 1554853253 last 1554853176 [2698237.893700] Lustre: Skipped 1 previous similar message [2698409.516946] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2698409.527378] Lustre: Skipped 77 previous similar messages [2698460.874020] Lustre: fir-MDT0001: haven't heard from client 68d0205c-c4f2-e29d-2052-3be40b877fcd (at 10.8.25.14@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ec07ba10400, cur 1554853626 expire 1554853476 last 1554853399 [2698460.895913] Lustre: Skipped 1 previous similar message [2698536.874987] Lustre: fir-MDT0003: haven't heard from client 52925f15-540a-1147-9a28-79194854cfac (at 10.8.29.5@o2ib6) in 167 seconds. I think it's dead, and I am evicting it. exp ffff8ed07c2b9400, cur 1554853702 expire 1554853552 last 1554853535 [2698536.896794] Lustre: Skipped 7 previous similar messages [2698612.876654] Lustre: fir-MDT0001: haven't heard from client 865d59e7-acb6-3aab-007f-1990f939aae4 (at 10.9.113.2@o2ib4) in 226 seconds. I think it's dead, and I am evicting it. exp ffff8eddae344800, cur 1554853778 expire 1554853628 last 1554853552 [2698612.898554] Lustre: Skipped 16 previous similar messages [2698688.878927] Lustre: fir-MDT0001: haven't heard from client 13899261-37ac-b403-0f0e-87f0f3708d1d (at 10.9.113.1@o2ib4) in 156 seconds. I think it's dead, and I am evicting it. exp ffff8ec06fb3f800, cur 1554853854 expire 1554853704 last 1554853698 [2698745.541052] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2698745.551250] Lustre: Skipped 77 previous similar messages [2698759.887215] Lustre: fir-MDT0003: haven't heard from client 13899261-37ac-b403-0f0e-87f0f3708d1d (at 10.9.113.1@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ed5cf0d6400, cur 1554853925 expire 1554853775 last 1554853698 [2698759.909121] Lustre: Skipped 1 previous similar message [2699018.642487] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2699018.652923] Lustre: Skipped 79 previous similar messages [2699349.331080] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2699349.341449] Lustre: Skipped 76 previous similar messages [2699627.751166] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2699627.761603] Lustre: Skipped 82 previous similar messages [2699963.772402] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2699963.782573] Lustre: Skipped 77 previous similar messages [2700236.868821] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2700236.879257] Lustre: Skipped 111 previous similar messages [2700537.900582] Lustre: fir-MDT0003: haven't heard from client af46dcf0-5f7b-95b2-4146-c6ad2f0a40dd (at 10.8.25.2@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ee434e3b000, cur 1554855703 expire 1554855553 last 1554855476 [2700537.922369] Lustre: Skipped 5 previous similar messages [2700572.894143] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2700572.904319] Lustre: Skipped 77 previous similar messages [2700837.448807] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2700837.459419] Lustre: Skipped 78 previous similar messages [2701078.907519] Lustre: fir-MDT0001: haven't heard from client 692aaeae-6e6d-ed7a-cc7e-274cc7b5e9c2 (at 10.9.103.42@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ec07ba2ec00, cur 1554856244 expire 1554856094 last 1554856017 [2701078.929498] Lustre: Skipped 1 previous similar message [2701178.494104] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2701178.504479] Lustre: Skipped 76 previous similar messages [2701455.066132] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2701455.076575] Lustre: Skipped 77 previous similar messages [2701456.911544] Lustre: fir-MDT0001: haven't heard from client b63d04aa-5121-0f9f-7ee5-d03203a0c186 (at 10.8.23.16@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8edf7d7c9400, cur 1554856622 expire 1554856472 last 1554856395 [2701456.933436] Lustre: Skipped 1 previous similar message [2701791.091300] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2701791.101479] Lustre: Skipped 77 previous similar messages [2702064.175723] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2702064.186166] Lustre: Skipped 77 previous similar messages [2702251.921405] Lustre: fir-MDT0003: haven't heard from client ec8d9d2f-4773-1f30-67c5-39104ab44883 (at 10.8.26.33@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8eb8872c5c00, cur 1554857417 expire 1554857267 last 1554857190 [2702251.943309] Lustre: Skipped 1 previous similar message [2702400.199813] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2702400.209993] Lustre: Skipped 77 previous similar messages [2702420.926342] Lustre: fir-MDT0001: haven't heard from client 03231c7f-99cd-2243-1259-642fbbab230a (at 10.8.20.2@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8edf187ccc00, cur 1554857586 expire 1554857436 last 1554857359 [2702420.948136] Lustre: Skipped 1 previous similar message [2702666.579574] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2702666.590186] Lustre: Skipped 82 previous similar messages [2703007.642743] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2703007.653096] Lustre: Skipped 76 previous similar messages [2703282.368790] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2703282.379232] Lustre: Skipped 79 previous similar messages [2703384.935650] Lustre: fir-MDT0001: haven't heard from client e0b92664-577c-171c-b7d4-d76214a1f9b8 (at 10.8.24.30@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ec07b9edc00, cur 1554858550 expire 1554858400 last 1554858323 [2703384.957525] Lustre: Skipped 1 previous similar message [2703618.391881] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2703618.402053] Lustre: Skipped 77 previous similar messages [2703891.477366] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2703891.487799] Lustre: Skipped 77 previous similar messages [2704227.503518] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2704227.513711] Lustre: Skipped 77 previous similar messages [2704495.745292] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2704495.755902] Lustre: Skipped 78 previous similar messages [2704529.949798] Lustre: fir-MDT0003: haven't heard from client 92c2c453-81a9-337d-c5c6-b78bb0f7bbf2 (at 10.8.23.20@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8edd22612000, cur 1554859695 expire 1554859545 last 1554859468 [2704529.971675] Lustre: Skipped 3 previous similar messages [2704836.608086] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2704836.618268] Lustre: Skipped 76 previous similar messages [2705109.705492] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2705109.715930] Lustre: Skipped 83 previous similar messages [2705445.730668] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2705445.740846] Lustre: Skipped 78 previous similar messages [2705460.961393] Lustre: fir-MDT0001: haven't heard from client 6829a361-40ff-611c-75ed-f5242e05fe1c (at 10.8.24.11@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ee07bc15400, cur 1554860626 expire 1554860476 last 1554860399 [2705460.983278] Lustre: Skipped 1 previous similar message [2705718.822035] Lustre: fir-MDT0003: Connection restored to e133d3ba-741b-8aba-d6d6-045d6382599c (at 10.9.108.54@o2ib4) [2705718.832646] Lustre: Skipped 78 previous similar messages [2706045.884077] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2706045.894426] Lustre: Skipped 75 previous similar messages [2706287.971569] Lustre: fir-MDT0003: haven't heard from client 0872ba06-433d-bfbf-f6f8-6ec5d4c9ec04 (at 10.8.23.34@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8edd22612800, cur 1554861453 expire 1554861303 last 1554861226 [2706287.993460] Lustre: Skipped 1 previous similar message [2706324.902161] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2706324.912777] Lustre: Skipped 77 previous similar messages [2706663.911835] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2706663.922030] Lustre: Skipped 77 previous similar messages [2706936.995209] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2706937.005642] Lustre: Skipped 77 previous similar messages [2707273.016297] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2707273.026475] Lustre: Skipped 77 previous similar messages [2707546.114749] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2707546.125183] Lustre: Skipped 79 previous similar messages [2707781.990091] Lustre: fir-MDT0003: haven't heard from client b455a33c-b7a8-4823-dd3e-850e58569e83 (at 10.9.109.3@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ee3a43ac400, cur 1554862947 expire 1554862797 last 1554862720 [2707782.011987] Lustre: Skipped 1 previous similar message [2707875.039378] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2707875.049728] Lustre: Skipped 76 previous similar messages [2708154.058552] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2708154.069167] Lustre: Skipped 77 previous similar messages [2708181.995544] Lustre: fir-MDT0003: haven't heard from client 15cd0242-c007-9677-1119-6847160897d1 (at 10.8.28.4@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ed3bf586800, cur 1554863347 expire 1554863197 last 1554863120 [2708182.017343] Lustre: Skipped 1 previous similar message [2708199.000867] Lustre: fir-MDT0001: haven't heard from client 15cd0242-c007-9677-1119-6847160897d1 (at 10.8.28.4@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ec078da4000, cur 1554863364 expire 1554863214 last 1554863137 [2708491.214538] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2708491.224715] Lustre: Skipped 77 previous similar messages [2708764.300117] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2708764.310551] Lustre: Skipped 77 previous similar messages [2709100.324307] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2709100.334487] Lustre: Skipped 78 previous similar messages [2709373.401009] Lustre: fir-MDT0003: Connection restored to e133d3ba-741b-8aba-d6d6-045d6382599c (at 10.9.108.54@o2ib4) [2709373.411619] Lustre: Skipped 79 previous similar messages [2709652.014133] Lustre: fir-MDT0003: haven't heard from client 8decdc5e-0e10-453e-6547-5a76533d9f7c (at 10.8.1.29@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ed5b72fe400, cur 1554864817 expire 1554864667 last 1554864590 [2709704.186362] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2709704.196712] Lustre: Skipped 75 previous similar messages [2709982.498856] Lustre: fir-MDT0003: Connection restored to e133d3ba-741b-8aba-d6d6-045d6382599c (at 10.9.108.54@o2ib4) [2709982.509469] Lustre: Skipped 78 previous similar messages [2710318.526638] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2710318.536817] Lustre: Skipped 78 previous similar messages [2710591.608088] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2710591.618522] Lustre: Skipped 77 previous similar messages [2710927.628177] Lustre: fir-MDT0003: Client e133d3ba-741b-8aba-d6d6-045d6382599c (at 10.9.108.54@o2ib4) reconnecting [2710927.638529] Lustre: Skipped 77 previous similar messages [2711192.291342] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2711192.301957] Lustre: Skipped 75 previous similar messages [2711533.337249] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2711533.347603] Lustre: Skipped 75 previous similar messages [2711683.039199] Lustre: fir-MDT0003: haven't heard from client 22529811-0101-a89a-781a-7493a8f25b6b (at 10.9.102.70@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ee434e38800, cur 1554866848 expire 1554866698 last 1554866621 [2711683.061167] Lustre: Skipped 1 previous similar message [2711759.040297] Lustre: fir-MDT0003: haven't heard from client 71bd1389-41d9-d380-693e-d732397cdf93 (at 10.8.24.12@o2ib6) in 162 seconds. I think it's dead, and I am evicting it. exp ffff8ed257abe800, cur 1554866924 expire 1554866774 last 1554866762 [2711759.062178] Lustre: Skipped 1 previous similar message [2711809.812455] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2711809.822898] Lustre: Skipped 77 previous similar messages [2712073.044207] Lustre: fir-MDT0003: haven't heard from client 2c77939b-87c9-c8d7-b123-0924936b6fae (at 10.8.20.20@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8eb798839000, cur 1554867238 expire 1554867088 last 1554867011 [2712073.066404] Lustre: Skipped 1 previous similar message [2712145.836592] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2712145.846771] Lustre: Skipped 78 previous similar messages [2712418.914734] Lustre: fir-MDT0003: Connection restored to e133d3ba-741b-8aba-d6d6-045d6382599c (at 10.9.108.54@o2ib4) [2712418.925344] Lustre: Skipped 77 previous similar messages [2712754.926250] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2712754.936423] Lustre: Skipped 76 previous similar messages [2713021.441460] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2713021.452066] Lustre: Skipped 75 previous similar messages [2713362.461533] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2713362.471883] Lustre: Skipped 76 previous similar messages [2713637.099400] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2713637.109837] Lustre: Skipped 81 previous similar messages [2713738.065346] Lustre: fir-MDT0001: haven't heard from client e4435807-cb03-0ec8-5c3b-557dd0ed9607 (at 10.8.25.3@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ed3f98a3000, cur 1554868903 expire 1554868753 last 1554868676 [2713738.087156] Lustre: Skipped 1 previous similar message [2713745.066141] Lustre: fir-MDT0003: haven't heard from client e4435807-cb03-0ec8-5c3b-557dd0ed9607 (at 10.8.25.3@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8eee6cb44800, cur 1554868910 expire 1554868760 last 1554868683 [2713973.127558] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2713973.137732] Lustre: Skipped 77 previous similar messages [2714246.228976] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2714246.239408] Lustre: Skipped 80 previous similar messages [2714582.252008] Lustre: fir-MDT0003: Client e133d3ba-741b-8aba-d6d6-045d6382599c (at 10.9.108.54@o2ib4) reconnecting [2714582.262360] Lustre: Skipped 78 previous similar messages [2714625.076104] Lustre: fir-MDT0003: haven't heard from client 2f2deed1-bd16-d1c8-d67c-b6db931629d2 (at 10.9.102.36@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ee434e3c800, cur 1554869790 expire 1554869640 last 1554869563 [2714701.076993] Lustre: fir-MDT0003: haven't heard from client ef148919-2f18-019d-5c6c-d3ca079dc19c (at 10.9.114.5@o2ib4) in 160 seconds. I think it's dead, and I am evicting it. exp ffff8ebd821e5c00, cur 1554869866 expire 1554869716 last 1554869706 [2714701.098892] Lustre: Skipped 1 previous similar message [2714850.590187] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2714850.600813] Lustre: Skipped 75 previous similar messages [2715191.333656] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2715191.343831] Lustre: Skipped 75 previous similar messages [2715464.433093] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2715464.443554] Lustre: Skipped 81 previous similar messages [2715534.087342] Lustre: fir-MDT0003: haven't heard from client d924362c-8c55-8d42-2fc4-a04d9845569e (at 10.9.103.3@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ef07ef9d400, cur 1554870699 expire 1554870549 last 1554870472 [2715534.109217] Lustre: Skipped 1 previous similar message [2715610.088282] Lustre: fir-MDT0001: haven't heard from client 420de317-c24c-be46-8796-d52cfdbc9803 (at 10.8.20.5@o2ib6) in 209 seconds. I think it's dead, and I am evicting it. exp ffff8ee07bc13400, cur 1554870775 expire 1554870625 last 1554870566 [2715610.110068] Lustre: Skipped 1 previous similar message [2715800.458210] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2715800.468397] Lustre: Skipped 77 previous similar messages [2716073.556638] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2716073.567073] Lustre: Skipped 77 previous similar messages [2716400.718891] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2716400.729247] Lustre: Skipped 76 previous similar messages [2716529.099695] Lustre: fir-MDT0003: haven't heard from client 361b6430-6664-17c2-0023-4a55a6bd84b5 (at 10.8.23.14@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8eda36f9e400, cur 1554871694 expire 1554871544 last 1554871467 [2716529.121591] Lustre: Skipped 1 previous similar message [2716679.763400] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2716679.774015] Lustre: Skipped 78 previous similar messages [2716729.502169] Lustre: 116204:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554871887/real 1554871887] req@ffff8ec17a5a5100 x1630272758750176/t0(0) o104->fir-MDT0003@10.9.101.46@o2ib4:15/16 lens 296/224 e 0 to 1 dl 1554871894 ref 1 fl Rpc:X/0/ffffffff rc 0/-1 [2716729.529770] Lustre: 116204:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 81 previous similar messages [2716737.533266] Lustre: 115842:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8ecad75e5d00 x1628734539946112/t0(0) o101->253c07d0-1dda-ba64-13bf-14531ffca918@10.9.101.18@o2ib4:17/0 lens 1784/3288 e 1 to 0 dl 1554871907 ref 2 fl Interpret:/0/0 rc 0/0 [2716737.562608] Lustre: 115842:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 36 previous similar messages [2716750.539428] Lustre: 116204:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554871908/real 1554871908] req@ffff8ec17a5a5100 x1630272758750176/t0(0) o104->fir-MDT0003@10.9.101.46@o2ib4:15/16 lens 296/224 e 0 to 1 dl 1554871915 ref 1 fl Rpc:X/2/ffffffff rc 0/-1 [2716750.567038] Lustre: 116204:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 2 previous similar messages [2716769.301664] Lustre: 115585:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8ebc93226c00 x1629300618830576/t0(0) o101->57942e14-8354-c6f0-f568-9aab0cc28619@10.9.108.52@o2ib4:19/0 lens 584/3264 e 1 to 0 dl 1554871939 ref 2 fl Interpret:/0/0 rc 0/0 [2716769.330931] Lustre: 115585:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 2 previous similar messages [2716785.577866] Lustre: 116204:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554871943/real 1554871943] req@ffff8ec17a5a5100 x1630272758750176/t0(0) o104->fir-MDT0003@10.9.101.46@o2ib4:15/16 lens 296/224 e 0 to 1 dl 1554871950 ref 1 fl Rpc:X/2/ffffffff rc 0/-1 [2716785.605470] Lustre: 116204:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 4 previous similar messages [2716816.761249] LustreError: 116593:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554871891, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0003_UUID lock: ffff8eefe0815e80/0xbc329463af2f09bb lrc: 3/1,0 mode: --/PR res: [0x280000404:0x157:0x0].0x0 bits 0x13/0x0 rrc: 22 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 116593 timeout: 0 lvb_type: 0 [2716816.800992] LustreError: 116593:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 2 previous similar messages [2716820.750308] Lustre: 115929:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ebc99d5ad00 x1630285938601392/t0(0) o101->ed4a110c-e924-fdde-dd0b-bcf07f7a98fc@10.9.101.26@o2ib4:10/0 lens 576/3264 e 0 to 0 dl 1554871990 ref 2 fl Interpret:/0/0 rc 0/0 [2716824.342346] LustreError: 115941:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554871899, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0003_UUID lock: ffff8eb695ab8900/0xbc329463af5f2ff7 lrc: 3/1,0 mode: --/PR res: [0x280000404:0x157:0x0].0x0 bits 0x13/0x0 rrc: 22 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 115941 timeout: 0 lvb_type: 0 [2716844.261595] LustreError: 115880:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554871919, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0003_UUID lock: ffff8ebb2120e0c0/0xbc329463afdf1018 lrc: 3/1,0 mode: --/PR res: [0x280000404:0x157:0x0].0x0 bits 0x13/0x0 rrc: 22 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 115880 timeout: 0 lvb_type: 0 [2716855.615730] Lustre: 116204:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554872013/real 1554872013] req@ffff8ec17a5a5100 x1630272758750176/t0(0) o104->fir-MDT0003@10.9.101.46@o2ib4:15/16 lens 296/224 e 0 to 1 dl 1554872020 ref 1 fl Rpc:X/2/ffffffff rc 0/-1 [2716855.643343] Lustre: 116204:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 9 previous similar messages [2716876.653018] LustreError: 116204:0:(ldlm_lockd.c:682:ldlm_handle_ast_error()) ### client (nid 10.9.101.46@o2ib4) failed to reply to blocking AST (req@ffff8ec17a5a5100 x1630272758750176 status 0 rc -110), evict it ns: mdt-fir-MDT0003_UUID lock: ffff8ee87458d580/0xbc329463acdc1aa8 lrc: 4/0,0 mode: PR/PR res: [0x280000404:0x157:0x0].0x0 bits 0x13/0x0 rrc: 22 type: IBT flags: 0x60200400000020 nid: 10.9.101.46@o2ib4 remote: 0xee223cdd76c89bf expref: 31 pid: 116178 timeout: 2716997 lvb_type: 0 [2716876.696241] LustreError: 138-a: fir-MDT0003: A client on nid 10.9.101.46@o2ib4 was evicted due to a lock blocking callback time out: rc -110 [2716876.709043] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 154s: evicting client at 10.9.101.46@o2ib4 ns: mdt-fir-MDT0003_UUID lock: ffff8ee87458d580/0xbc329463acdc1aa8 lrc: 3/0,0 mode: PR/PR res: [0x280000404:0x157:0x0].0x0 bits 0x13/0x0 rrc: 22 type: IBT flags: 0x60200400000020 nid: 10.9.101.46@o2ib4 remote: 0xee223cdd76c89bf expref: 32 pid: 116178 timeout: 0 lvb_type: 0 [2716876.746855] LustreError: 115329:0:(ldlm_lockd.c:256:expired_lock_main()) Skipped 2 previous similar messages [2716941.105187] Lustre: fir-MDT0001: haven't heard from client 45691b6c-9faa-1f1e-36f6-ab8ff09b1e0d (at 10.9.101.46@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ec07ba13c00, cur 1554872106 expire 1554871956 last 1554871879 [2716941.127147] Lustre: Skipped 1 previous similar message [2717018.652271] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2717018.662448] Lustre: Skipped 99 previous similar messages [2717291.740612] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2717291.751072] Lustre: Skipped 99 previous similar messages [2717443.111902] Lustre: fir-MDT0003: haven't heard from client 92964478-08d5-6a0e-08a3-b26f8dfeffde (at 10.9.103.16@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8eb79896c000, cur 1554872608 expire 1554872458 last 1554872381 [2717627.763703] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2717627.773882] Lustre: Skipped 77 previous similar messages [2717900.862004] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2717900.872448] Lustre: Skipped 81 previous similar messages [2718231.849738] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2718231.860100] Lustre: Skipped 76 previous similar messages [2718509.972384] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2718509.982836] Lustre: Skipped 78 previous similar messages [2718845.994385] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2718846.004563] Lustre: Skipped 77 previous similar messages [2719100.131129] Lustre: fir-MDT0003: haven't heard from client 4ca4c2ad-cd6a-dfde-f226-169e39a2885b (at 10.9.104.26@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ee07bdc8000, cur 1554874265 expire 1554874115 last 1554874038 [2719100.153090] Lustre: Skipped 1 previous similar message [2719119.092774] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2719119.103209] Lustre: Skipped 79 previous similar messages [2719455.117833] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2719455.128013] Lustre: Skipped 77 previous similar messages [2719585.136828] Lustre: fir-MDT0003: haven't heard from client e13f22ad-3a51-95c3-d402-3545a241a47e (at 10.8.13.24@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ed257ab8000, cur 1554874750 expire 1554874600 last 1554874523 [2719585.158713] Lustre: Skipped 1 previous similar message [2719720.030714] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2719720.041338] Lustre: Skipped 78 previous similar messages [2720049.142481] Lustre: fir-MDT0003: haven't heard from client f64ddd3f-cc88-e3a4-198f-2a79d1e00d9f (at 10.9.113.10@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ef07bcd1000, cur 1554875214 expire 1554875064 last 1554874987 [2720049.164475] Lustre: Skipped 1 previous similar message [2720061.051949] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2720061.062298] Lustre: Skipped 76 previous similar messages [2720337.317918] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2720337.328358] Lustre: Skipped 77 previous similar messages [2720409.146698] Lustre: fir-MDT0003: haven't heard from client c7588c5a-5486-537c-a91f-40591c2c2605 (at 10.8.23.11@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ed46cd23c00, cur 1554875574 expire 1554875424 last 1554875347 [2720409.168586] Lustre: Skipped 1 previous similar message [2720673.338874] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2720673.349054] Lustre: Skipped 77 previous similar messages [2720897.152183] Lustre: fir-MDT0001: haven't heard from client 3436b90e-b3c0-9d88-8c74-80d242262cfb (at 10.9.114.5@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8eedbc6d6000, cur 1554876062 expire 1554875912 last 1554875835 [2720897.174227] Lustre: Skipped 1 previous similar message [2720946.436989] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2720946.447430] Lustre: Skipped 79 previous similar messages [2720978.152778] Lustre: fir-MDT0003: haven't heard from client 3d7a4588-750c-d9f2-ef2e-b2a802ec36db (at 10.8.22.35@o2ib6) in 152 seconds. I think it's dead, and I am evicting it. exp ffff8eec45b90800, cur 1554876143 expire 1554875993 last 1554875991 [2720978.174680] Lustre: Skipped 1 previous similar message [2721053.154102] Lustre: fir-MDT0001: haven't heard from client 3d7a4588-750c-d9f2-ef2e-b2a802ec36db (at 10.8.22.35@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8edcc5c05800, cur 1554876218 expire 1554876068 last 1554875991 [2721282.459856] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2721282.470040] Lustre: Skipped 77 previous similar messages [2721549.229556] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2721549.240173] Lustre: Skipped 81 previous similar messages [2721674.160874] Lustre: fir-MDT0003: haven't heard from client 7dbaa523-612e-e29b-9b49-504e08eec32a (at 10.9.103.34@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8eee6cb42c00, cur 1554876839 expire 1554876689 last 1554876612 [2721890.250645] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2721890.261006] Lustre: Skipped 75 previous similar messages [2722164.650455] Lustre: fir-MDT0003: Connection restored to e133d3ba-741b-8aba-d6d6-045d6382599c (at 10.9.108.54@o2ib4) [2722164.661064] Lustre: Skipped 80 previous similar messages [2722500.660062] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2722500.670240] Lustre: Skipped 77 previous similar messages [2722773.757265] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2722773.767739] Lustre: Skipped 79 previous similar messages [2722848.174509] Lustre: fir-MDT0003: haven't heard from client e7625a94-c63b-d973-91d1-00899d3fc926 (at 10.9.114.5@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8eb9d2b34800, cur 1554878013 expire 1554877863 last 1554877786 [2722848.196397] Lustre: Skipped 1 previous similar message [2723109.779183] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2723109.789369] Lustre: Skipped 77 previous similar messages [2723314.348419] LNetError: 115147:0:(lib-msg.c:811:lnet_is_health_check()) Msg is in inconsistent state, don't perform health checking (0, 5) [2723378.414404] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2723378.425025] Lustre: Skipped 79 previous similar messages [2723718.888236] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2723718.898415] Lustre: Skipped 76 previous similar messages [2723967.187577] Lustre: fir-MDT0003: haven't heard from client 3598d71b-3a03-8d91-c081-2856587527fa (at 10.8.30.2@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8eec45b96000, cur 1554879132 expire 1554878982 last 1554878905 [2723967.209374] Lustre: Skipped 1 previous similar message [2723991.987454] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2723991.997889] Lustre: Skipped 77 previous similar messages [2724328.014287] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2724328.024468] Lustre: Skipped 77 previous similar messages [2724601.114451] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2724601.124909] Lustre: Skipped 77 previous similar messages [2724928.535078] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2724928.545438] Lustre: Skipped 76 previous similar messages [2725207.579522] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2725207.590142] Lustre: Skipped 76 previous similar messages [2725214.300290] LNetError: 115142:0:(lib-msg.c:811:lnet_is_health_check()) Msg is in inconsistent state, don't perform health checking (0, 5) [2725546.232218] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2725546.242396] Lustre: Skipped 77 previous similar messages [2725661.206994] Lustre: fir-MDT0003: haven't heard from client 8f5040ee-c5c4-c8ea-d244-5982ed763c94 (at 10.8.22.32@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ed210f23400, cur 1554880826 expire 1554880676 last 1554880599 [2725661.228878] Lustre: Skipped 1 previous similar message [2725819.334332] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2725819.344775] Lustre: Skipped 79 previous similar messages [2726155.357087] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2726155.367263] Lustre: Skipped 77 previous similar messages [2726428.455248] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2726428.465689] Lustre: Skipped 77 previous similar messages [2726591.217471] Lustre: fir-MDT0003: haven't heard from client f853739d-19d3-f847-48c2-b362ce5f1edc (at 10.8.25.19@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ef0bfc96c00, cur 1554881756 expire 1554881606 last 1554881529 [2726591.239352] Lustre: Skipped 1 previous similar message [2726757.700248] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2726757.710608] Lustre: Skipped 76 previous similar messages [2727036.744631] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2727036.755237] Lustre: Skipped 76 previous similar messages [2727210.699353] LNetError: 115144:0:(lib-msg.c:811:lnet_is_health_check()) Msg is in inconsistent state, don't perform health checking (0, 5) [2727373.552095] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2727373.562269] Lustre: Skipped 77 previous similar messages [2727646.652289] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2727646.662734] Lustre: Skipped 79 previous similar messages [2727982.672206] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2727982.682379] Lustre: Skipped 77 previous similar messages [2728255.752477] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2728255.762918] Lustre: Skipped 77 previous similar messages [2728586.898864] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2728586.909220] Lustre: Skipped 76 previous similar messages [2728864.858654] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2728864.869088] Lustre: Skipped 78 previous similar messages [2729200.277992] Lustre: fir-MDT0003: Client e133d3ba-741b-8aba-d6d6-045d6382599c (at 10.9.108.54@o2ib4) reconnecting [2729200.288345] Lustre: Skipped 77 previous similar messages [2729473.381864] Lustre: fir-MDT0003: Connection restored to e133d3ba-741b-8aba-d6d6-045d6382599c (at 10.9.108.54@o2ib4) [2729473.392489] Lustre: Skipped 77 previous similar messages [2729809.438864] Lustre: fir-MDT0003: Client e133d3ba-741b-8aba-d6d6-045d6382599c (at 10.9.108.54@o2ib4) reconnecting [2729809.449221] Lustre: Skipped 77 previous similar messages [2729863.446804] LNetError: 115150:0:(lib-msg.c:811:lnet_is_health_check()) Msg is in inconsistent state, don't perform health checking (0, 5) [2730075.000895] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2730075.011517] Lustre: Skipped 76 previous similar messages [2730416.063984] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2730416.074333] Lustre: Skipped 76 previous similar messages [2730656.264714] Lustre: fir-MDT0003: haven't heard from client f0c222b7-84e4-d268-86a3-fba53dfe791f (at 10.9.107.6@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ef07bcd7c00, cur 1554885821 expire 1554885671 last 1554885594 [2730656.286607] Lustre: Skipped 1 previous similar message [2730691.687394] Lustre: fir-MDT0003: Connection restored to e133d3ba-741b-8aba-d6d6-045d6382599c (at 10.9.108.54@o2ib4) [2730691.698021] Lustre: Skipped 77 previous similar messages [2731027.749066] Lustre: fir-MDT0003: Client e133d3ba-741b-8aba-d6d6-045d6382599c (at 10.9.108.54@o2ib4) reconnecting [2731027.759416] Lustre: Skipped 77 previous similar messages [2731300.876317] Lustre: fir-MDT0003: Connection restored to e133d3ba-741b-8aba-d6d6-045d6382599c (at 10.9.108.54@o2ib4) [2731300.886940] Lustre: Skipped 77 previous similar messages [2731636.970297] Lustre: fir-MDT0003: Client e133d3ba-741b-8aba-d6d6-045d6382599c (at 10.9.108.54@o2ib4) reconnecting [2731636.980669] Lustre: Skipped 77 previous similar messages [2731669.276203] Lustre: fir-MDT0003: haven't heard from client 39512890-4d65-084d-b3fa-38ea8a8ecdc4 (at 10.8.23.36@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ed24be1ac00, cur 1554886834 expire 1554886684 last 1554886607 [2731669.298086] Lustre: Skipped 1 previous similar message [2731904.164661] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2731904.175285] Lustre: Skipped 76 previous similar messages [2731999.280013] Lustre: fir-MDT0003: haven't heard from client 090f2f35-616a-e6c0-3001-dc96ac2213bb (at 10.8.26.22@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ed723610400, cur 1554887164 expire 1554887014 last 1554886937 [2731999.301905] Lustre: Skipped 1 previous similar message [2732245.227736] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2732245.238092] Lustre: Skipped 76 previous similar messages [2732519.262139] Lustre: fir-MDT0003: Connection restored to e133d3ba-741b-8aba-d6d6-045d6382599c (at 10.9.108.54@o2ib4) [2732519.272752] Lustre: Skipped 79 previous similar messages [2732855.327741] Lustre: fir-MDT0003: Client e133d3ba-741b-8aba-d6d6-045d6382599c (at 10.9.108.54@o2ib4) reconnecting [2732855.338094] Lustre: Skipped 77 previous similar messages [2732907.290045] Lustre: fir-MDT0001: haven't heard from client b12295be-89ae-2059-d129-fafcf57fe599 (at 10.8.26.7@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ebe198a9400, cur 1554888072 expire 1554887922 last 1554887845 [2732907.311844] Lustre: Skipped 1 previous similar message [2733128.434857] Lustre: fir-MDT0003: Connection restored to e133d3ba-741b-8aba-d6d6-045d6382599c (at 10.9.108.54@o2ib4) [2733128.445476] Lustre: Skipped 77 previous similar messages [2733464.486053] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2733464.496235] Lustre: Skipped 78 previous similar messages [2733733.328855] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2733733.339475] Lustre: Skipped 80 previous similar messages [2734073.579912] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2734073.590092] Lustre: Skipped 75 previous similar messages [2734157.415118] LNetError: 115144:0:(lib-msg.c:811:lnet_is_health_check()) Msg is in inconsistent state, don't perform health checking (0, 5) [2734346.676012] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2734346.686458] Lustre: Skipped 77 previous similar messages [2734682.701831] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2734682.712022] Lustre: Skipped 77 previous similar messages [2734877.312428] Lustre: fir-MDT0001: haven't heard from client 18579504-8257-6c23-4ce6-c57bbcfa5a86 (at 10.8.31.3@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ee07b267400, cur 1554890042 expire 1554889892 last 1554889815 [2734877.334238] Lustre: Skipped 1 previous similar message [2734955.799943] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2734955.810381] Lustre: Skipped 79 previous similar messages [2735283.474583] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2735283.484936] Lustre: Skipped 76 previous similar messages [2735562.493737] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2735562.504345] Lustre: Skipped 76 previous similar messages [2735737.322300] Lustre: fir-MDT0003: haven't heard from client cb7f47b6-0d87-ca9f-a5cd-07266ca2e534 (at 10.8.25.32@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ed24be1f400, cur 1554890902 expire 1554890752 last 1554890675 [2735737.344183] Lustre: Skipped 1 previous similar message [2735900.913752] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2735900.923948] Lustre: Skipped 77 previous similar messages [2736174.010864] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2736174.021296] Lustre: Skipped 77 previous similar messages [2736510.033548] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2736510.043722] Lustre: Skipped 77 previous similar messages [2736783.120621] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2736783.131074] Lustre: Skipped 79 previous similar messages [2737112.619189] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2737112.629545] Lustre: Skipped 76 previous similar messages [2737391.637901] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2737391.648513] Lustre: Skipped 76 previous similar messages [2737454.340070] Lustre: fir-MDT0003: haven't heard from client 19a181a8-8152-911a-364c-094943207f91 (at 10.9.108.50@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ed87129f800, cur 1554892619 expire 1554892469 last 1554892392 [2737454.362051] Lustre: Skipped 1 previous similar message [2737728.230479] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2737728.240653] Lustre: Skipped 77 previous similar messages [2738001.330269] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2738001.340703] Lustre: Skipped 79 previous similar messages [2738337.355812] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2738337.365997] Lustre: Skipped 77 previous similar messages [2738610.453739] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2738610.464176] Lustre: Skipped 77 previous similar messages [2738732.669779] LNetError: 115148:0:(lib-msg.c:811:lnet_is_health_check()) Msg is in inconsistent state, don't perform health checking (0, 5) [2738941.762891] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2738941.773246] Lustre: Skipped 76 previous similar messages [2739219.547420] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2739219.557862] Lustre: Skipped 78 previous similar messages [2739555.571100] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2739555.581278] Lustre: Skipped 77 previous similar messages [2739828.668199] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2739828.678651] Lustre: Skipped 77 previous similar messages [2740120.368908] Lustre: fir-MDT0003: haven't heard from client e76b5818-8853-efcb-dcd6-6c04b41910b6 (at 10.8.20.11@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ed723617400, cur 1554895285 expire 1554895135 last 1554895058 [2740120.390792] Lustre: Skipped 1 previous similar message [2740164.690000] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2740164.700196] Lustre: Skipped 77 previous similar messages [2740429.895945] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2740429.906559] Lustre: Skipped 76 previous similar messages [2740770.958698] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2740770.969054] Lustre: Skipped 76 previous similar messages [2741046.861903] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2741046.872346] Lustre: Skipped 77 previous similar messages [2741382.884688] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2741382.894870] Lustre: Skipped 77 previous similar messages [2741417.581088] LustreError: 137-5: fir-MDT0002_UUID: not available for connect from 10.8.9.8@o2ib6 (no target). If you are running an HA pair check that the target is mounted on the other server. [2741424.371040] LustreError: 137-5: fir-MDT0000_UUID: not available for connect from 10.8.28.12@o2ib6 (no target). If you are running an HA pair check that the target is mounted on the other server. [2741435.165704] LustreError: 137-5: fir-MDT0002_UUID: not available for connect from 10.8.18.2@o2ib6 (no target). If you are running an HA pair check that the target is mounted on the other server. [2741451.225791] LustreError: 137-5: fir-MDT0000_UUID: not available for connect from 10.8.21.27@o2ib6 (no target). If you are running an HA pair check that the target is mounted on the other server. [2741451.243260] LustreError: Skipped 821 previous similar messages [2741483.732018] LustreError: 137-5: fir-MDT0000_UUID: not available for connect from 10.8.8.1@o2ib6 (no target). If you are running an HA pair check that the target is mounted on the other server. [2741483.749310] LustreError: Skipped 545 previous similar messages [2741547.744318] LustreError: 137-5: fir-MDT0000_UUID: not available for connect from 10.8.30.26@o2ib6 (no target). If you are running an HA pair check that the target is mounted on the other server. [2741547.761768] LustreError: Skipped 1871 previous similar messages [2741602.556309] LNetError: 115150:0:(lib-msg.c:811:lnet_is_health_check()) Msg is in inconsistent state, don't perform health checking (0, 5) [2741655.983822] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2741655.994259] Lustre: Skipped 77 previous similar messages [2741858.388545] Lustre: fir-MDT0003: haven't heard from client 62a9f54e-6f08-6ca5-8c20-d06d639f0b65 (at 10.8.19.2@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ee0324cdc00, cur 1554897023 expire 1554896873 last 1554896796 [2741858.410341] Lustre: Skipped 1 previous similar message [2741992.007605] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2741992.017827] Lustre: Skipped 77 previous similar messages [2742172.392159] Lustre: fir-MDT0003: haven't heard from client 07df4bf0-4e62-42e4-56bb-423049f84e7f (at 10.8.30.8@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8edd22613800, cur 1554897337 expire 1554897187 last 1554897110 [2742172.413971] Lustre: Skipped 1 previous similar message [2742259.060097] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2742259.070717] Lustre: Skipped 78 previous similar messages [2742600.104935] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2742600.115297] Lustre: Skipped 76 previous similar messages [2742874.183779] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2742874.194226] Lustre: Skipped 77 previous similar messages [2743003.766365] LNetError: 115150:0:(lib-msg.c:811:lnet_is_health_check()) Msg is in inconsistent state, don't perform health checking (0, 5) [2743017.401821] Lustre: fir-MDT0003: haven't heard from client bf6bd63b-9206-2f81-2780-2b483791a8c1 (at 10.8.17.15@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ed5cf0d5c00, cur 1554898182 expire 1554898032 last 1554897955 [2743017.423714] Lustre: Skipped 1 previous similar message [2743210.209603] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2743210.219788] Lustre: Skipped 77 previous similar messages [2743319.405257] Lustre: fir-MDT0001: haven't heard from client fc73222d-6665-f06d-6918-124a48772130 (at 10.8.20.35@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ede9c5a2400, cur 1554898484 expire 1554898334 last 1554898257 [2743319.427139] Lustre: Skipped 1 previous similar message [2743483.308741] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2743483.319192] Lustre: Skipped 79 previous similar messages [2743819.334606] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2743819.344788] Lustre: Skipped 77 previous similar messages [2744088.205663] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2744088.216329] Lustre: Skipped 78 previous similar messages [2744200.957230] LNetError: 115147:0:(lib-msg.c:811:lnet_is_health_check()) Msg is in inconsistent state, don't perform health checking (0, 5) [2744428.423690] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2744428.433875] Lustre: Skipped 76 previous similar messages [2744701.522763] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2744701.533208] Lustre: Skipped 77 previous similar messages [2744710.421281] Lustre: fir-MDT0003: haven't heard from client b5b7198b-e79f-767a-ff00-9700c6c841f3 (at 10.8.26.14@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ed871299800, cur 1554899875 expire 1554899725 last 1554899648 [2744710.443166] Lustre: Skipped 1 previous similar message [2744885.824473] LNetError: 115140:0:(lib-msg.c:811:lnet_is_health_check()) Msg is in inconsistent state, don't perform health checking (0, 5) [2745037.545563] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2745037.555739] Lustre: Skipped 77 previous similar messages [2745310.643623] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2745310.654073] Lustre: Skipped 81 previous similar messages [2745638.315095] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2745638.325450] Lustre: Skipped 76 previous similar messages [2745766.433101] Lustre: fir-MDT0003: haven't heard from client 6644986b-6ded-2fb4-0b62-220707003113 (at 10.8.9.8@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ecb3aa30800, cur 1554900931 expire 1554900781 last 1554900704 [2745766.454838] Lustre: Skipped 1 previous similar message [2745917.352232] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2745917.362845] Lustre: Skipped 76 previous similar messages [2746029.147519] LustreError: 119929:0:(ldlm_lockd.c:2322:ldlm_cancel_handler()) ldlm_cancel from 10.8.9.8@o2ib6 arrived at 1554901193 with bad export cookie 13561064580192762757 [2746029.163160] LustreError: 119929:0:(ldlm_lockd.c:2322:ldlm_cancel_handler()) Skipped 1 previous similar message [2746255.756093] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2746255.766282] Lustre: Skipped 77 previous similar messages [2746528.852054] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2746528.862507] Lustre: Skipped 81 previous similar messages [2746864.876683] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2746864.886868] Lustre: Skipped 77 previous similar messages [2746963.446155] Lustre: fir-MDT0003: haven't heard from client 1ad132ad-6cc0-575b-4310-386ad50d33b6 (at 10.8.9.10@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ee3a43a8800, cur 1554902128 expire 1554901978 last 1554901901 [2746963.467949] Lustre: Skipped 1 previous similar message [2747137.957712] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2747137.968157] Lustre: Skipped 77 previous similar messages [2747467.436150] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2747467.446513] Lustre: Skipped 76 previous similar messages [2747746.480327] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2747746.490937] Lustre: Skipped 76 previous similar messages [2748034.980649] LNetError: 115139:0:(lib-msg.c:811:lnet_is_health_check()) Msg is in inconsistent state, don't perform health checking (0, 5) [2748083.066024] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2748083.076210] Lustre: Skipped 77 previous similar messages [2748356.163975] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2748356.174418] Lustre: Skipped 77 previous similar messages [2748692.187635] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2748692.197842] Lustre: Skipped 77 previous similar messages [2748877.467064] Lustre: fir-MDT0003: haven't heard from client d1685f63-4372-789e-d3e6-62c1d7fdb2ea (at 10.8.22.22@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ee07a7e8c00, cur 1554904042 expire 1554903892 last 1554903815 [2748877.488960] Lustre: Skipped 1 previous similar message [2748965.285603] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2748965.296056] Lustre: Skipped 79 previous similar messages [2749296.600126] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2749296.610480] Lustre: Skipped 76 previous similar messages [2749574.394215] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2749574.404662] Lustre: Skipped 76 previous similar messages [2749910.419837] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2749910.430022] Lustre: Skipped 77 previous similar messages [2750071.465808] LNetError: 115144:0:(lib-msg.c:811:lnet_is_health_check()) Msg is in inconsistent state, don't perform health checking (0, 5) [2750183.517824] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2750183.528265] Lustre: Skipped 77 previous similar messages [2750519.540266] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2750519.550498] Lustre: Skipped 77 previous similar messages [2750784.744360] Lustre: fir-MDT0003: Connection restored to 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) [2750784.754973] Lustre: Skipped 78 previous similar messages [2751125.763928] Lustre: fir-MDT0003: Client 253c07d0-1dda-ba64-13bf-14531ffca918 (at 10.9.101.18@o2ib4) reconnecting [2751125.774292] Lustre: Skipped 76 previous similar messages [2751401.734682] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2751401.745129] Lustre: Skipped 77 previous similar messages [2751737.758071] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2751737.768251] Lustre: Skipped 77 previous similar messages [2751964.165118] LustreError: 11-0: fir-MDT0000-lwp-MDT0001: operation mds_disconnect to node 10.0.10.51@o2ib7 failed: rc = -107 [2751964.167654] Lustre: Failing over fir-MDT0003 [2751964.180870] LustreError: Skipped 3 previous similar messages [2751964.212274] Lustre: fir-MDT0001: Not available for connect from 10.9.104.44@o2ib4 (stopping) [2751964.220894] Lustre: Skipped 16 previous similar messages [2751964.239774] LustreError: 94365:0:(osp_dev.c:485:osp_disconnect()) fir-MDT0000-osp-MDT0001: can't disconnect: rc = -19 [2751964.254308] LustreError: 94365:0:(lod_dev.c:265:lod_sub_process_config()) fir-MDT0001-mdtlov: error cleaning up LOD index 0: cmd 0xcf031: rc = -19 [2751964.717173] Lustre: fir-MDT0001: Not available for connect from 10.9.102.6@o2ib4 (stopping) [2751964.725700] Lustre: Skipped 57 previous similar messages [2751964.842458] LNet: Service thread pid 115584 completed after 83245.77s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2751964.859051] LNet: Skipped 1 previous similar message [2751965.102450] LustreError: 116029:0:(ldlm_lockd.c:2322:ldlm_cancel_handler()) ldlm_cancel from 10.9.103.42@o2ib4 arrived at 1554907129 with bad export cookie 13561064580945032450 [2751965.227772] Lustre: 115705:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (71925:54320s); client may timeout. req@ffff8ed3886d1b00 x1628598725314096/t180553472270(0) o36->ecd69873-9a0a-0293-0ce7-d92dc18bf8d9@10.9.101.11@o2ib4:14/0 lens 544/424 e 0 to 0 dl 1554852809 ref 1 fl Complete:/0/0 rc -19/-19 [2751965.258155] Lustre: 115705:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 1 previous similar message [2751965.456922] LustreError: 115205:0:(client.c:1175:ptlrpc_import_delay_req()) @@@ IMP_CLOSED req@ffff8ee8a6b5e000 x1630272945348640/t0(0) o41->fir-MDT0000-osp-MDT0003@10.0.10.51@o2ib7:24/4 lens 224/368 e 0 to 0 dl 0 ref 1 fl Rpc:/0/ffffffff rc 0/-1 [2751965.479060] LustreError: 115205:0:(client.c:1175:ptlrpc_import_delay_req()) Skipped 2 previous similar messages [2751965.611994] LustreError: 119917:0:(ldlm_lockd.c:2322:ldlm_cancel_handler()) ldlm_cancel from 10.9.101.56@o2ib4 arrived at 1554907130 with bad export cookie 13561064561773290903 [2751965.721288] Lustre: fir-MDT0003: Not available for connect from 10.8.7.34@o2ib6 (stopping) [2751965.729736] Lustre: Skipped 117 previous similar messages [2751966.790256] LustreError: 116029:0:(ldlm_lockd.c:2322:ldlm_cancel_handler()) ldlm_cancel from 10.8.9.9@o2ib6 arrived at 1554907131 with bad export cookie 13561064561773286885 [2751967.769941] Lustre: fir-MDT0003: Not available for connect from 10.9.108.7@o2ib4 (stopping) [2751967.778472] Lustre: Skipped 168 previous similar messages [2751968.987083] LustreError: 116029:0:(ldlm_lockd.c:2322:ldlm_cancel_handler()) ldlm_cancel from 10.8.27.6@o2ib6 arrived at 1554907133 with bad export cookie 13561064562341942210 [2751969.002807] LustreError: 116029:0:(ldlm_lockd.c:2322:ldlm_cancel_handler()) Skipped 3 previous similar messages [2751970.652829] LustreError: 137-5: fir-MDT0003_UUID: not available for connect from 10.9.113.10@o2ib4 (no target). If you are running an HA pair check that the target is mounted on the other server. [2751970.670366] LustreError: Skipped 845 previous similar messages [2751971.129248] Lustre: server umount fir-MDT0003 complete [2751971.774255] Lustre: fir-MDT0001: Not available for connect from 10.8.21.14@o2ib6 (stopping) [2751971.782779] Lustre: Skipped 312 previous similar messages [2751974.505886] LustreError: 116029:0:(ldlm_lockd.c:2322:ldlm_cancel_handler()) ldlm_cancel from 10.8.8.7@o2ib6 arrived at 1554907139 with bad export cookie 13561064561773291862 [2751974.521523] LustreError: 116029:0:(ldlm_lockd.c:2322:ldlm_cancel_handler()) Skipped 11 previous similar messages [2751979.784090] Lustre: fir-MDT0001: Not available for connect from 10.8.23.2@o2ib6 (stopping) [2751979.792530] Lustre: Skipped 413 previous similar messages [2751984.504632] LustreError: 117441:0:(ldlm_lockd.c:2322:ldlm_cancel_handler()) ldlm_cancel from 10.9.107.53@o2ib4 arrived at 1554907149 with bad export cookie 13561064561773292996 [2751984.520527] LustreError: 117441:0:(ldlm_lockd.c:2322:ldlm_cancel_handler()) Skipped 1 previous similar message [2751986.657230] LustreError: 137-5: fir-MDT0003_UUID: not available for connect from 10.9.106.53@o2ib4 (no target). If you are running an HA pair check that the target is mounted on the other server. [2751986.674775] LustreError: Skipped 779 previous similar messages [2751993.512375] Lustre: server umount fir-MDT0001 complete [2752026.980911] LNetError: 114936:0:(o2iblnd_cb.c:2469:kiblnd_passive_connect()) Can't accept conn from 10.0.10.202@o2ib7 on NA (ib0:1:10.0.10.52): bad dst nid 10.0.10.52@o2ib7 [2752027.510914] LNetError: 94667:0:(o2iblnd_cb.c:2469:kiblnd_passive_connect()) Can't accept conn from 10.0.10.203@o2ib7 on NA (ib0:1:10.0.10.52): bad dst nid 10.0.10.52@o2ib7 [2752027.526384] LNetError: 94667:0:(o2iblnd_cb.c:2469:kiblnd_passive_connect()) Skipped 14 previous similar messages [2752028.973569] LNet: Removed LNI 10.0.10.52@o2ib7 [2752076.742035] LNet: HW NUMA nodes: 4, HW CPU cores: 48, npartitions: 4 [2752076.749838] alg: No test for adler32 (adler32-zlib) [2752077.554248] Lustre: Lustre: Build Version: 2.12.0.pl4 [2752077.676235] LNet: Using FastReg for registration [2752077.686933] LNetError: 94667:0:(o2iblnd_cb.c:2469:kiblnd_passive_connect()) Can't accept conn from 10.0.10.210@o2ib7 on NA (ib0:0:10.0.10.52): bad dst nid 10.0.10.52@o2ib7 [2752077.693785] LNet: Added LNI 10.0.10.52@o2ib7 [8/256/0/180] [2752139.918851] LDISKFS-fs (dm-0): file extents enabled, maximum tree depth=5 [2752139.936866] LDISKFS-fs (dm-3): file extents enabled, maximum tree depth=5 [2752140.087207] LDISKFS-fs (dm-3): mounted filesystem with ordered data mode. Opts: user_xattr,errors=remount-ro,acl,no_mbcache,nodelalloc [2752140.117300] LDISKFS-fs (dm-0): mounted filesystem with ordered data mode. Opts: user_xattr,errors=remount-ro,acl,no_mbcache,nodelalloc [2752140.890897] LustreError: 137-5: fir-MDT0000_UUID: not available for connect from 10.9.113.5@o2ib4 (no target). If you are running an HA pair check that the target is mounted on the other server. [2752141.710721] LustreError: 137-5: fir-MDT0000_UUID: not available for connect from 10.9.104.50@o2ib4 (no target). If you are running an HA pair check that the target is mounted on the other server. [2752141.728288] LustreError: Skipped 1 previous similar message [2752143.181048] LustreError: 137-5: fir-MDT0000_UUID: not available for connect from 10.9.106.15@o2ib4 (no target). If you are running an HA pair check that the target is mounted on the other server. [2752143.198617] LustreError: Skipped 5 previous similar messages [2752145.192233] LustreError: 137-5: fir-MDT0001_UUID: not available for connect from 10.8.25.18@o2ib6 (no target). If you are running an HA pair check that the target is mounted on the other server. [2752145.209712] LustreError: Skipped 131 previous similar messages [2752148.494393] LustreError: 11-0: fir-MDT0002-osp-MDT0001: operation mds_connect to node 10.0.10.51@o2ib7 failed: rc = -114 [2752148.605117] Lustre: fir-MDT0001: Imperative Recovery not enabled, recovery window 300-900 [2752148.786895] Lustre: fir-MDD0001: changelog on [2752148.795388] Lustre: fir-MDT0001: in recovery but waiting for the first client to connect [2752148.810556] Lustre: fir-MDT0001: Will be in recovery for at least 5:00, or until 1323 clients reconnect [2752149.120685] LustreError: 11-0: fir-MDT0002-osp-MDT0003: operation mds_connect to node 10.0.10.51@o2ib7 failed: rc = -114 [2752149.131727] LustreError: Skipped 1 previous similar message [2752149.242663] Lustre: fir-MDT0003: Imperative Recovery not enabled, recovery window 300-900 [2752149.293681] Lustre: fir-MDD0003: changelog on [2752149.302338] Lustre: fir-MDT0003: in recovery but waiting for the first client to connect [2752149.320874] Lustre: fir-MDT0003: Will be in recovery for at least 5:00, or until 1323 clients reconnect [2752149.533914] LustreError: 137-5: fir-MDT0000_UUID: not available for connect from 10.9.108.60@o2ib4 (no target). If you are running an HA pair check that the target is mounted on the other server. [2752149.551455] LustreError: Skipped 245 previous similar messages [2752149.820353] Lustre: fir-MDT0001: Connection restored to (at 10.8.12.12@o2ib6) [2752149.827885] Lustre: Skipped 132 previous similar messages [2752150.355147] Lustre: fir-MDT0003: Connection restored to ec9c8464-eb47-cb23-3b42-bb47517ad6ae (at 10.8.23.18@o2ib6) [2752150.365668] Lustre: Skipped 61 previous similar messages [2752151.490122] Lustre: fir-MDT0003: Connection restored to 1c0f2aa2-2b39-3a12-cd1c-1bded24a45b7 (at 10.9.106.36@o2ib4) [2752151.500732] Lustre: Skipped 29 previous similar messages [2752153.551298] Lustre: fir-MDT0003: Connection restored to 92370bd6-9a15-6556-d9c9-7bdf5c971973 (at 10.8.8.5@o2ib6) [2752153.561647] Lustre: Skipped 51 previous similar messages [2752156.823434] LustreError: 95568:0:(mdt_open.c:1364:mdt_reint_open()) @@@ OPEN & CREAT not in open replay/by_fid. req@ffff8ebc806f5a00 x1628739457720992/t0(181400170295) o101->e133d3ba-741b-8aba-d6d6-045d6382599c@10.9.108.54@o2ib4:7/0 lens 1768/3288 e 0 to 0 dl 1554907327 ref 1 fl Interpret:/4/0 rc 0/0 [2752157.552252] Lustre: fir-MDT0003: Connection restored to a68f9963-772f-f022-ef02-79f9f156cc77 (at 10.9.108.17@o2ib4) [2752157.562860] Lustre: Skipped 1501 previous similar messages [2752158.581001] LustreError: 95533:0:(mdt_open.c:1364:mdt_reint_open()) @@@ OPEN & CREAT not in open replay/by_fid. req@ffff8ed8154eb000 x1628734478078752/t0(181061105584) o101->253c07d0-1dda-ba64-13bf-14531ffca918@10.9.101.18@o2ib4:9/0 lens 1768/3288 e 0 to 0 dl 1554907329 ref 1 fl Interpret:/4/0 rc 0/0 [2752160.312626] LustreError: 137-5: fir-MDT0000_UUID: not available for connect from 10.0.10.108@o2ib7 (no target). If you are running an HA pair check that the target is mounted on the other server. [2752160.330170] LustreError: Skipped 24 previous similar messages [2752169.796441] Lustre: fir-MDT0001: Connection restored to (at 10.8.24.12@o2ib6) [2752169.803848] Lustre: Skipped 811 previous similar messages [2752185.817864] Lustre: fir-MDT0003: Connection restored to 10.0.10.102@o2ib7 (at 10.0.10.102@o2ib7) [2752185.826827] Lustre: Skipped 96 previous similar messages [2752199.948134] Lustre: fir-MDT0001: Recovery already passed deadline 0:50. If you do not want to wait more, please abort the recovery by force. [2752199.960914] Lustre: Skipped 2 previous similar messages [2752199.996770] Lustre: fir-MDT0001: Recovery over after 0:51, of 1323 clients 1323 recovered and 0 were evicted. [2752507.758682] Lustre: 95551:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554907665/real 1554907665] req@ffff8ebab7e25400 x1630438418338672/t0(0) o104->fir-MDT0001@10.9.101.10@o2ib4:15/16 lens 296/224 e 0 to 1 dl 1554907672 ref 1 fl Rpc:X/0/ffffffff rc 0/-1 [2752508.493685] Lustre: 95180:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554907665/real 1554907665] req@ffff8eed12780900 x1630438418340160/t0(0) o106->fir-MDT0001@10.9.101.10@o2ib4:15/16 lens 296/280 e 0 to 1 dl 1554907672 ref 1 fl Rpc:X/0/ffffffff rc 0/-1 [2752509.630703] Lustre: 96222:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554907667/real 1554907667] req@ffff8eb8fd205400 x1630438418341264/t0(0) o106->fir-MDT0001@10.9.101.10@o2ib4:15/16 lens 296/280 e 0 to 1 dl 1554907674 ref 1 fl Rpc:X/0/ffffffff rc 0/-1 [2752511.832720] Lustre: 95401:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554907669/real 1554907669] req@ffff8eefeabf5d00 x1630438418344352/t0(0) o106->fir-MDT0001@10.9.101.10@o2ib4:15/16 lens 296/280 e 0 to 1 dl 1554907676 ref 1 fl Rpc:X/0/ffffffff rc 0/-1 [2752511.860231] Lustre: 95401:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 2 previous similar messages [2752516.658779] Lustre: 96222:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554907674/real 1554907674] req@ffff8eb8fd205400 x1630438418341264/t0(0) o106->fir-MDT0001@10.9.101.10@o2ib4:15/16 lens 296/280 e 0 to 1 dl 1554907681 ref 1 fl Rpc:X/2/ffffffff rc 0/-1 [2752516.686301] Lustre: 96222:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 5 previous similar messages [2752525.065871] Lustre: 95930:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554907682/real 1554907682] req@ffff8eef277f7500 x1630438418363152/t0(0) o104->fir-MDT0001@10.9.101.10@o2ib4:15/16 lens 296/224 e 0 to 1 dl 1554907689 ref 1 fl Rpc:X/0/ffffffff rc 0/-1 [2752525.093386] Lustre: 95930:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 13 previous similar messages [2752525.798875] Lustre: 95925:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ed4b0ef9e00 x1628668227518048/t0(0) o101->6bb6b586-f426-e568-1fd9-7be05a8314e1@10.9.101.8@o2ib4:15/0 lens 480/568 e 0 to 0 dl 1554907695 ref 2 fl Interpret:/0/0 rc 0/0 [2752526.566883] Lustre: 96022:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (4/-6), not sending early reply req@ffff8ee3cbba2a00 x1628744016817840/t0(0) o101->3dd549d8-d4a4-54cb-063f-e718a86a8731@10.9.101.27@o2ib4:15/0 lens 480/568 e 0 to 0 dl 1554907695 ref 2 fl Interpret:/0/0 rc 0/0 [2752527.802897] Lustre: 95581:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ed4b0efc200 x1628668854515440/t0(0) o101->310b526c-2d9c-12b8-e9c3-eec1a3d9d55d@10.9.101.13@o2ib4:17/0 lens 480/568 e 0 to 0 dl 1554907697 ref 2 fl Interpret:/0/0 rc 0/0 [2752531.238932] Lustre: 95887:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ed99d33a100 x1628668227574944/t0(0) o101->6bb6b586-f426-e568-1fd9-7be05a8314e1@10.9.101.8@o2ib4:20/0 lens 480/568 e 0 to 0 dl 1554907700 ref 2 fl Interpret:/0/0 rc 0/0 [2752531.268036] Lustre: 95887:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 3 previous similar messages [2752531.815975] Lustre: fir-MDT0001: Client 6bb6b586-f426-e568-1fd9-7be05a8314e1 (at 10.9.101.8@o2ib4) reconnecting [2752531.826279] Lustre: fir-MDT0001: Connection restored to 6bb6b586-f426-e568-1fd9-7be05a8314e1 (at 10.9.101.8@o2ib4) [2752531.836813] Lustre: Skipped 51 previous similar messages [2752533.536333] Lustre: fir-MDT0001: Client 3dd549d8-d4a4-54cb-063f-e718a86a8731 (at 10.9.101.27@o2ib4) reconnecting [2752535.403450] Lustre: fir-MDT0001: Client a487b03f-d5d2-c729-8ddf-d69e191e351d (at 10.9.101.12@o2ib4) reconnecting [2752535.413802] Lustre: Skipped 1 previous similar message [2752537.255002] Lustre: 96005:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ee00c389e00 x1630350056149504/t0(0) o101->c927c3a4-10d5-5d74-f8fb-42ec7ee8b67c@10.9.101.43@o2ib4:26/0 lens 480/568 e 0 to 0 dl 1554907706 ref 2 fl Interpret:/0/0 rc 0/0 [2752537.284170] Lustre: 96005:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 2 previous similar messages [2752538.554826] Lustre: fir-MDT0001: Client 51925813-97dc-db3b-8c07-12e05c18abbc (at 10.9.101.30@o2ib4) reconnecting [2752538.565181] Lustre: Skipped 1 previous similar message [2752541.184041] Lustre: 95353:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554907698/real 1554907698] req@ffff8ec619b7d700 x1630438418384528/t0(0) o106->fir-MDT0001@10.9.101.10@o2ib4:15/16 lens 296/280 e 0 to 1 dl 1554907705 ref 1 fl Rpc:X/0/ffffffff rc 0/-1 [2752541.211565] Lustre: 95353:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 41 previous similar messages [2752543.029012] Lustre: fir-MDT0001: Client c927c3a4-10d5-5d74-f8fb-42ec7ee8b67c (at 10.9.101.43@o2ib4) reconnecting [2752543.039364] Lustre: Skipped 1 previous similar message [2752549.863139] Lustre: 95537:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8eba8f1ca700 x1630356475236752/t0(0) o101->e9aa8893-8856-db92-99ea-6ec688e98582@10.9.101.36@o2ib4:9/0 lens 480/568 e 0 to 0 dl 1554907719 ref 2 fl Interpret:/0/0 rc 0/0 [2752549.892215] Lustre: 95537:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 5 previous similar messages [2752556.393460] Lustre: fir-MDT0001: Client e9aa8893-8856-db92-99ea-6ec688e98582 (at 10.9.101.36@o2ib4) reconnecting [2752556.403818] Lustre: Skipped 5 previous similar messages [2752565.895307] Lustre: 95566:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ebdb6c3b600 x1629065440370144/t0(0) o101->f56c06c4-effd-a509-8171-07daf442df55@10.9.108.37@o2ib4:25/0 lens 480/568 e 0 to 0 dl 1554907735 ref 2 fl Interpret:/0/0 rc 0/0 [2752565.924497] Lustre: 95566:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 3 previous similar messages [2752573.260388] Lustre: 96312:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554907730/real 1554907730] req@ffff8eb7af240c00 x1630438418409440/t0(0) o106->fir-MDT0001@10.9.101.10@o2ib4:15/16 lens 296/280 e 0 to 1 dl 1554907737 ref 1 fl Rpc:X/2/ffffffff rc 0/-1 [2752573.287929] Lustre: 96312:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 111 previous similar messages [2752574.114501] Lustre: fir-MDT0001: Client c927c3a4-10d5-5d74-f8fb-42ec7ee8b67c (at 10.9.101.43@o2ib4) reconnecting [2752574.124862] Lustre: Skipped 10 previous similar messages [2752596.142495] Lustre: fir-MDT0001: Connection restored to 2e0c4648-6e21-c09c-2281-ea766335a2a8 (at 10.9.101.35@o2ib4) [2752596.153108] Lustre: Skipped 40 previous similar messages [2752597.927653] Lustre: 95925:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8eb217366300 x1628600427187328/t0(0) o101->d3813457-4417-03b3-a468-74026cb0ff17@10.9.101.5@o2ib4:27/0 lens 480/568 e 0 to 0 dl 1554907767 ref 2 fl Interpret:/0/0 rc 0/0 [2752597.956733] Lustre: 95925:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 10 previous similar messages [2752607.383819] Lustre: fir-MDT0001: Client 4215d6e6-d370-4df7-a891-0157cd0f211c (at 10.9.101.22@o2ib4) reconnecting [2752607.394174] Lustre: Skipped 27 previous similar messages [2752637.274069] Lustre: 96046:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554907794/real 1554907794] req@ffff8ee5bc6be000 x1630438418342608/t0(0) o106->fir-MDT0001@10.9.101.10@o2ib4:15/16 lens 296/280 e 0 to 1 dl 1554907801 ref 1 fl Rpc:X/2/ffffffff rc 0/-1 [2752637.301581] Lustre: 96046:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 360 previous similar messages [2752654.789291] LustreError: 95551:0:(ldlm_lockd.c:682:ldlm_handle_ast_error()) ### client (nid 10.9.101.10@o2ib4) failed to reply to blocking AST (req@ffff8ebab7e25400 x1630438418338672 status 0 rc -110), evict it ns: mdt-fir-MDT0001_UUID lock: ffff8ee41ef69440/0x857bd8c99d77d52 lrc: 4/0,0 mode: PR/PR res: [0x2400122a1:0xabf9:0x0].0x0 bits 0x40/0x0 rrc: 45 type: IBT flags: 0x60000400000020 nid: 10.9.101.10@o2ib4 remote: 0xf5e95f570f616d5a expref: 1769 pid: 96046 timeout: 2752775 lvb_type: 0 [2752654.832605] LustreError: 138-a: fir-MDT0001: A client on nid 10.9.101.10@o2ib4 was evicted due to a lock blocking callback time out: rc -110 [2752654.845416] LustreError: 95108:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 154s: evicting client at 10.9.101.10@o2ib4 ns: mdt-fir-MDT0001_UUID lock: ffff8ee41ef69440/0x857bd8c99d77d52 lrc: 3/0,0 mode: PR/PR res: [0x2400122a1:0xabf9:0x0].0x0 bits 0x40/0x0 rrc: 45 type: IBT flags: 0x60000400000020 nid: 10.9.101.10@o2ib4 remote: 0xf5e95f570f616d5a expref: 1770 pid: 96046 timeout: 0 lvb_type: 0 [2752654.883474] Lustre: 96286:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (92:9s); client may timeout. req@ffff8ef0760bef00 x1628751103150272/t0(0) o101->6ffc2e8b-b062-5b88-97a9-a61177559c11@10.9.101.3@o2ib4:8/0 lens 480/536 e 0 to 0 dl 1554907810 ref 1 fl Complete:/0/0 rc 301/301 [2752705.507822] Lustre: fir-MDT0003: haven't heard from client b424099b-0e6f-9f1b-e9e1-b67f18b90317 (at 10.9.101.10@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ed3bf582800, cur 1554907870 expire 1554907720 last 1554907643 [2752726.510011] Lustre: fir-MDT0001: haven't heard from client 81876adf-0ff1-a02b-8db5-b9a3c2273260 (at 10.9.114.7@o2ib4) in 213 seconds. I think it's dead, and I am evicting it. exp ffff8ef076ca0400, cur 1554907891 expire 1554907741 last 1554907678 [2752726.531883] Lustre: Skipped 1 previous similar message [2754001.338390] Lustre: fir-MDT0001: Connection restored to (at 10.9.114.7@o2ib4) [2754001.345800] Lustre: Skipped 67 previous similar messages [2754424.800060] Lustre: fir-MDT0001: Connection restored to b424099b-0e6f-9f1b-e9e1-b67f18b90317 (at 10.9.101.10@o2ib4) [2754424.810674] Lustre: Skipped 1 previous similar message [2757174.556501] Lustre: fir-MDT0001: haven't heard from client 672fb03f-9053-1684-c128-790aeca874a8 (at 10.9.103.39@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ef07bcd7000, cur 1554912339 expire 1554912189 last 1554912112 [2757185.555676] Lustre: fir-MDT0003: haven't heard from client 672fb03f-9053-1684-c128-790aeca874a8 (at 10.9.103.39@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ebab761f800, cur 1554912350 expire 1554912200 last 1554912123 [2757312.433434] LNetError: 94922:0:(lib-msg.c:811:lnet_is_health_check()) Msg is in inconsistent state, don't perform health checking (0, 5) [2758522.236746] Lustre: fir-MDT0001: Connection restored to 672fb03f-9053-1684-c128-790aeca874a8 (at 10.9.103.39@o2ib4) [2758522.247358] Lustre: Skipped 1 previous similar message [2758580.571892] Lustre: fir-MDT0003: haven't heard from client 26e9285d-a17e-4504-ddeb-2f45459e682f (at 10.8.23.27@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8eb6f3dcac00, cur 1554913745 expire 1554913595 last 1554913518 [2759717.585118] Lustre: fir-MDT0003: haven't heard from client d681b0c6-fb2c-bc67-5eb0-adc9739bfde6 (at 10.9.114.6@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ebed7f9a400, cur 1554914882 expire 1554914732 last 1554914655 [2759717.606994] Lustre: Skipped 1 previous similar message [2759993.343350] LNetError: 94919:0:(lib-msg.c:811:lnet_is_health_check()) Msg is in inconsistent state, don't perform health checking (0, 5) [2760279.886649] Lustre: fir-MDT0001: Connection restored to 26e9285d-a17e-4504-ddeb-2f45459e682f (at 10.8.23.27@o2ib6) [2760279.897175] Lustre: Skipped 1 previous similar message [2761150.384086] Lustre: fir-MDT0001: Connection restored to 4b15a192-8c10-6520-238f-0538231b18f1 (at 10.8.15.3@o2ib6) [2761150.394532] Lustre: Skipped 1 previous similar message [2761461.604789] Lustre: fir-MDT0003: haven't heard from client 6ca6652e-0187-99bc-e850-704338e05c6f (at 10.9.114.5@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8eb218244800, cur 1554916626 expire 1554916476 last 1554916399 [2761461.626671] Lustre: Skipped 1 previous similar message [2761480.063462] Lustre: fir-MDT0001: Connection restored to 6ca6652e-0187-99bc-e850-704338e05c6f (at 10.9.114.5@o2ib4) [2761480.073988] Lustre: Skipped 1 previous similar message [2761518.360120] Lustre: fir-MDT0001: Connection restored to d681b0c6-fb2c-bc67-5eb0-adc9739bfde6 (at 10.9.114.6@o2ib4) [2761518.370646] Lustre: Skipped 1 previous similar message [2761751.597853] Lustre: fir-MDT0001: Connection restored to f407a3e9-1c92-7c5f-ecf2-01d7df9a66ab (at 10.8.1.29@o2ib6) [2761751.608292] Lustre: Skipped 1 previous similar message [2762092.611919] Lustre: fir-MDT0003: haven't heard from client 60436878-76bd-e2ae-c7ae-914483a7aa01 (at 10.8.21.22@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ed3bf584400, cur 1554917257 expire 1554917107 last 1554917030 [2762092.633797] Lustre: Skipped 1 previous similar message [2763700.968669] Lustre: 95354:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554918858/real 1554918858] req@ffff8ed6f3224e00 x1630438475983360/t0(0) o106->fir-MDT0001@10.9.101.40@o2ib4:15/16 lens 296/280 e 0 to 1 dl 1554918865 ref 1 fl Rpc:X/0/ffffffff rc 0/-1 [2763700.996181] Lustre: 95354:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 109 previous similar messages [2763709.002769] Lustre: 95973:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8edfa37e1b00 x1630332563818400/t0(0) o101->13201a86-baaf-e717-5368-ad7fbde16e3b@10.9.101.4@o2ib4:8/0 lens 480/568 e 1 to 0 dl 1554918878 ref 2 fl Interpret:/0/0 rc 0/0 [2763709.031670] Lustre: 95973:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 11 previous similar messages [2763715.095605] Lustre: fir-MDT0001: Client 13201a86-baaf-e717-5368-ad7fbde16e3b (at 10.9.101.4@o2ib4) reconnecting [2763715.105869] Lustre: Skipped 55 previous similar messages [2763715.111404] Lustre: fir-MDT0001: Connection restored to 13201a86-baaf-e717-5368-ad7fbde16e3b (at 10.9.101.4@o2ib4) [2763716.230949] Lustre: fir-MDT0001: Connection restored to 98c09566-8207-6420-49d3-d7a016768ffe (at 10.9.108.50@o2ib4) [2763717.206880] Lustre: 96067:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554918874/real 1554918874] req@ffff8ed8d703d100 x1630438475994464/t0(0) o106->fir-MDT0001@10.9.101.40@o2ib4:15/16 lens 296/280 e 0 to 1 dl 1554918881 ref 1 fl Rpc:X/2/ffffffff rc 0/-1 [2763717.234399] Lustre: 96067:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 29 previous similar messages [2763720.041944] Lustre: 95558:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8ed79a1bce00 x1630440837703152/t0(0) o101->e75095d8-7c9e-d169-de21-b278ca100ccd@10.9.101.10@o2ib4:19/0 lens 480/568 e 1 to 0 dl 1554918889 ref 2 fl Interpret:/0/0 rc 0/0 [2763720.071023] Lustre: 95558:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 4 previous similar messages [2763721.897004] Lustre: fir-MDT0001: Connection restored to (at 10.9.101.17@o2ib4) [2763721.904498] Lustre: Skipped 2 previous similar messages [2763725.852663] Lustre: fir-MDT0001: Client e75095d8-7c9e-d169-de21-b278ca100ccd (at 10.9.101.10@o2ib4) reconnecting [2763725.863025] Lustre: Skipped 4 previous similar messages [2763725.868456] Lustre: fir-MDT0001: Connection restored to b424099b-0e6f-9f1b-e9e1-b67f18b90317 (at 10.9.101.10@o2ib4) [2763730.185064] LustreError: 95589:0:(ldlm_lockd.c:682:ldlm_handle_ast_error()) ### client (nid 10.9.101.40@o2ib4) failed to reply to blocking AST (req@ffff8eb6e6fd9200 x1630438475991024 status 0 rc -110), evict it ns: mdt-fir-MDT0001_UUID lock: ffff8eb59bfc8d80/0x857bd8ddf268c2b lrc: 4/0,0 mode: PR/PR res: [0x24001260b:0x5:0x0].0x0 bits 0x40/0x0 rrc: 52 type: IBT flags: 0x60000400000020 nid: 10.9.101.40@o2ib4 remote: 0xfdaf792fe005dde1 expref: 1812 pid: 95514 timeout: 2763730 lvb_type: 0 [2763730.228099] LustreError: 138-a: fir-MDT0001: A client on nid 10.9.101.40@o2ib4 was evicted due to a lock blocking callback time out: rc -110 [2763730.240926] LustreError: 95108:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 35s: evicting client at 10.9.101.40@o2ib4 ns: mdt-fir-MDT0001_UUID lock: ffff8eb59bfc8d80/0x857bd8ddf268c2b lrc: 3/0,0 mode: PR/PR res: [0x24001260b:0x5:0x0].0x0 bits 0x40/0x0 rrc: 52 type: IBT flags: 0x60000400000020 nid: 10.9.101.40@o2ib4 remote: 0xfdaf792fe005dde1 expref: 1813 pid: 95514 timeout: 0 lvb_type: 0 [2763730.278556] Lustre: 96067:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (30:4s); client may timeout. req@ffff8edccae15d00 x1628873382268800/t0(0) o101->a1d7b991-cc5a-550d-12be-e2d6e8d019b4@10.9.101.34@o2ib4:20/0 lens 480/536 e 0 to 0 dl 1554918890 ref 1 fl Complete:/0/0 rc 301/301 [2763730.307384] Lustre: 96067:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 1 previous similar message [2763795.188889] Lustre: fir-MDT0001: Connection restored to 60436878-76bd-e2ae-c7ae-914483a7aa01 (at 10.8.21.22@o2ib6) [2763795.199413] Lustre: Skipped 3 previous similar messages [2763911.633397] Lustre: fir-MDT0003: haven't heard from client e0982516-a138-a328-edef-e2447d7c8df2 (at 10.9.101.40@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8eb8a6ace000, cur 1554919076 expire 1554918926 last 1554918849 [2763911.655360] Lustre: Skipped 1 previous similar message [2764878.645207] Lustre: fir-MDT0003: haven't heard from client d3813457-4417-03b3-a468-74026cb0ff17 (at 10.9.101.5@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8edb7ee90c00, cur 1554920043 expire 1554919893 last 1554919816 [2764880.650238] Lustre: fir-MDT0001: haven't heard from client d3813457-4417-03b3-a468-74026cb0ff17 (at 10.9.101.5@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ed97f086800, cur 1554920045 expire 1554919895 last 1554919818 [2765564.741566] Lustre: fir-MDT0001: Connection restored to e0982516-a138-a328-edef-e2447d7c8df2 (at 10.9.101.40@o2ib4) [2765564.752173] Lustre: Skipped 1 previous similar message [2766047.659506] Lustre: fir-MDT0003: haven't heard from client 6d1dbc37-4cd8-8085-186b-d10709d6a0dc (at 10.9.114.5@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ec5dc36f000, cur 1554921212 expire 1554921062 last 1554920985 [2766047.831029] Lustre: fir-MDT0001: Connection restored to 6ca6652e-0187-99bc-e850-704338e05c6f (at 10.9.114.5@o2ib4) [2766047.841555] Lustre: Skipped 1 previous similar message [2766388.300762] LNetError: 94912:0:(lib-msg.c:811:lnet_is_health_check()) Msg is in inconsistent state, don't perform health checking (0, 5) [2766546.195514] Lustre: 96085:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554921703/real 1554921703] req@ffff8ed6f3226c00 x1630438491960080/t0(0) o104->fir-MDT0001@10.9.101.37@o2ib4:15/16 lens 296/224 e 0 to 1 dl 1554921710 ref 1 fl Rpc:X/0/ffffffff rc 0/-1 [2766546.223062] Lustre: 96085:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 40 previous similar messages [2766550.998564] Lustre: 96323:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554921708/real 1554921708] req@ffff8ef07d442d00 x1630438491974976/t0(0) o104->fir-MDT0001@10.9.101.37@o2ib4:15/16 lens 296/224 e 0 to 1 dl 1554921715 ref 1 fl Rpc:X/0/ffffffff rc 0/-1 [2766551.026080] Lustre: 96323:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 1 previous similar message [2766554.226609] Lustre: 95351:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8edc917a8600 x1630356561534384/t0(0) o101->e9aa8893-8856-db92-99ea-6ec688e98582@10.9.101.36@o2ib4:3/0 lens 480/568 e 1 to 0 dl 1554921723 ref 2 fl Interpret:/0/0 rc 0/0 [2766554.255601] Lustre: 95351:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 9 previous similar messages [2766559.236663] Lustre: 96251:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8ed9d45f9b00 x1630440850151920/t0(0) o101->e75095d8-7c9e-d169-de21-b278ca100ccd@10.9.101.10@o2ib4:8/0 lens 480/568 e 1 to 0 dl 1554921728 ref 2 fl Interpret:/0/0 rc 0/0 [2766559.265693] Lustre: 96251:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 2 previous similar messages [2766559.641674] Lustre: 95926:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554921716/real 1554921716] req@ffff8ed00a501b00 x1630438491981536/t0(0) o104->fir-MDT0001@10.9.101.37@o2ib4:15/16 lens 296/224 e 0 to 1 dl 1554921723 ref 1 fl Rpc:X/2/ffffffff rc 0/-1 [2766559.669202] Lustre: 95926:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 10 previous similar messages [2766560.247602] Lustre: fir-MDT0001: Client e9aa8893-8856-db92-99ea-6ec688e98582 (at 10.9.101.36@o2ib4) reconnecting [2766560.257988] Lustre: Skipped 2 previous similar messages [2766560.263335] Lustre: fir-MDT0001: Connection restored to (at 10.9.101.17@o2ib4) [2766565.525311] Lustre: fir-MDT0001: Client e75095d8-7c9e-d169-de21-b278ca100ccd (at 10.9.101.10@o2ib4) reconnecting [2766565.535669] Lustre: Skipped 1 previous similar message [2766565.541014] Lustre: fir-MDT0001: Connection restored to b424099b-0e6f-9f1b-e9e1-b67f18b90317 (at 10.9.101.10@o2ib4) [2766565.551651] Lustre: Skipped 2 previous similar messages [2766568.688775] Lustre: 95353:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8eca78b71500 x1628764488968848/t0(0) o101->a5d1449b-840e-ceb7-788a-dd5f9c5805aa@10.9.101.32@o2ib4:18/0 lens 480/568 e 0 to 0 dl 1554921738 ref 2 fl Interpret:/0/0 rc 0/0 [2766568.717935] Lustre: 95353:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 1 previous similar message [2766574.214884] LustreError: 96067:0:(ldlm_lockd.c:682:ldlm_handle_ast_error()) ### client (nid 10.9.101.37@o2ib4) failed to reply to blocking AST (req@ffff8ee03e2a8600 x1630438491960592 status 0 rc -110), evict it ns: mdt-fir-MDT0001_UUID lock: ffff8ecf50bd45c0/0x857bd8e1a6e19ae lrc: 4/0,0 mode: PR/PR res: [0x24001083b:0xdb90:0x0].0x0 bits 0x40/0x0 rrc: 43 type: IBT flags: 0x60000400000020 nid: 10.9.101.37@o2ib4 remote: 0xd1e43eba640975e4 expref: 1817 pid: 95538 timeout: 2766574 lvb_type: 0 [2766574.232867] LustreError: 138-a: fir-MDT0001: A client on nid 10.9.101.37@o2ib4 was evicted due to a lock blocking callback time out: rc -110 [2766574.232895] LustreError: 95108:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 35s: evicting client at 10.9.101.37@o2ib4 ns: mdt-fir-MDT0001_UUID lock: ffff8ed329b6d580/0x857bd8e1a5addc8 lrc: 3/0,0 mode: PR/PR res: [0x2400125da:0x11:0x0].0x0 bits 0x40/0x0 rrc: 43 type: IBT flags: 0x60000400000020 nid: 10.9.101.37@o2ib4 remote: 0xd1e43eba6408bbbc expref: 1818 pid: 95990 timeout: 0 lvb_type: 0 [2766574.308554] LustreError: 96067:0:(ldlm_lockd.c:682:ldlm_handle_ast_error()) Skipped 1 previous similar message [2766624.531844] Lustre: fir-MDT0001: Connection restored to d3813457-4417-03b3-a468-74026cb0ff17 (at 10.9.101.5@o2ib4) [2766624.542379] Lustre: Skipped 2 previous similar messages [2766752.668023] Lustre: fir-MDT0003: haven't heard from client 558f6111-251f-9917-b7f1-a1e757b9c3a9 (at 10.9.101.37@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ed3bf587000, cur 1554921917 expire 1554921767 last 1554921690 [2766752.689999] Lustre: Skipped 1 previous similar message [2768295.650159] Lustre: 96403:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554923452/real 1554923452] req@ffff8eee6d626000 x1630438501432128/t0(0) o104->fir-MDT0001@10.9.101.35@o2ib4:15/16 lens 296/224 e 0 to 1 dl 1554923459 ref 1 fl Rpc:X/0/ffffffff rc 0/-1 [2768295.677673] Lustre: 96403:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 19 previous similar messages [2768302.687246] Lustre: 96403:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554923460/real 1554923460] req@ffff8eee6d626000 x1630438501432128/t0(0) o104->fir-MDT0001@10.9.101.35@o2ib4:15/16 lens 296/224 e 0 to 1 dl 1554923467 ref 1 fl Rpc:X/2/ffffffff rc 0/-1 [2768309.714334] Lustre: 96403:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554923467/real 1554923467] req@ffff8eee6d626000 x1630438501432128/t0(0) o104->fir-MDT0001@10.9.101.35@o2ib4:15/16 lens 296/224 e 0 to 1 dl 1554923474 ref 1 fl Rpc:X/2/ffffffff rc 0/-1 [2768313.066376] Lustre: 95401:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8eefff786000 x1628752027645312/t0(0) o101->fe0b67cc-88bf-64bd-33d4-8b260bad0c72@10.9.101.17@o2ib4:22/0 lens 480/568 e 0 to 0 dl 1554923482 ref 2 fl Interpret:/0/0 rc 0/0 [2768313.095548] Lustre: 95401:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 2 previous similar messages [2768319.701993] Lustre: fir-MDT0001: Client fe0b67cc-88bf-64bd-33d4-8b260bad0c72 (at 10.9.101.17@o2ib4) reconnecting [2768319.712348] Lustre: Skipped 1 previous similar message [2768319.717695] Lustre: fir-MDT0001: Connection restored to (at 10.9.101.17@o2ib4) [2768323.741506] Lustre: 96403:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554923481/real 1554923481] req@ffff8eee6d626000 x1630438501432128/t0(0) o104->fir-MDT0001@10.9.101.35@o2ib4:15/16 lens 296/224 e 0 to 1 dl 1554923488 ref 1 fl Rpc:X/2/ffffffff rc 0/-1 [2768323.769022] Lustre: 96403:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 1 previous similar message [2768323.778899] LustreError: 96403:0:(ldlm_lockd.c:682:ldlm_handle_ast_error()) ### client (nid 10.9.101.35@o2ib4) failed to reply to blocking AST (req@ffff8eee6d626000 x1630438501432128 status 0 rc -110), evict it ns: mdt-fir-MDT0001_UUID lock: ffff8ec240658240/0x857bd8e27d2a247 lrc: 4/0,0 mode: PR/PR res: [0x24001083b:0xdb98:0x0].0x0 bits 0x40/0x0 rrc: 46 type: IBT flags: 0x60000400000020 nid: 10.9.101.35@o2ib4 remote: 0xcd62e4185e95507d expref: 1809 pid: 96218 timeout: 2768324 lvb_type: 0 [2768323.822215] LustreError: 138-a: fir-MDT0001: A client on nid 10.9.101.35@o2ib4 was evicted due to a lock blocking callback time out: rc -110 [2768323.835003] LustreError: Skipped 1 previous similar message [2768323.840779] LustreError: 95108:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 36s: evicting client at 10.9.101.35@o2ib4 ns: mdt-fir-MDT0001_UUID lock: ffff8ec240658240/0x857bd8e27d2a247 lrc: 3/0,0 mode: PR/PR res: [0x24001083b:0xdb98:0x0].0x0 bits 0x40/0x0 rrc: 46 type: IBT flags: 0x60000400000020 nid: 10.9.101.35@o2ib4 remote: 0xcd62e4185e95507d expref: 1810 pid: 96218 timeout: 0 lvb_type: 0 [2768336.118956] Lustre: fir-MDT0001: Connection restored to 558f6111-251f-9917-b7f1-a1e757b9c3a9 (at 10.9.101.37@o2ib4) [2768336.129570] Lustre: Skipped 1 previous similar message [2768342.688692] Lustre: fir-MDT0001: haven't heard from client 4be11fe3-b6ee-1f5f-0a23-14b3c54d5e8d (at 10.9.106.61@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ee4e948a800, cur 1554923507 expire 1554923357 last 1554923280 [2768508.696064] Lustre: fir-MDT0003: haven't heard from client 2e0c4648-6e21-c09c-2281-ea766335a2a8 (at 10.9.101.35@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8eba74da3800, cur 1554923673 expire 1554923523 last 1554923446 [2768508.718026] Lustre: Skipped 1 previous similar message [2768902.189298] Lustre: fir-MDT0001: Connection restored to 55aafc93-9dff-fa25-67e4-1bdcf1bf214f (at 10.8.14.7@o2ib6) [2768902.199737] Lustre: Skipped 1 previous similar message [2769685.154085] Lustre: fir-MDT0001: Connection restored to 4be11fe3-b6ee-1f5f-0a23-14b3c54d5e8d (at 10.9.106.61@o2ib4) [2769685.164699] Lustre: Skipped 1 previous similar message [2769909.707246] Lustre: fir-MDT0001: haven't heard from client 55aafc93-9dff-fa25-67e4-1bdcf1bf214f (at 10.8.14.7@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8eca15a07400, cur 1554925074 expire 1554924924 last 1554924847 [2770185.710911] Lustre: fir-MDT0001: haven't heard from client 4e7a41ff-599c-6fd3-1588-d7b9b8fbf455 (at 10.9.114.5@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ecff57c6c00, cur 1554925350 expire 1554925200 last 1554925123 [2770185.732789] Lustre: Skipped 1 previous similar message [2770189.748438] Lustre: fir-MDT0001: Connection restored to 6ca6652e-0187-99bc-e850-704338e05c6f (at 10.9.114.5@o2ib4) [2770189.758960] Lustre: Skipped 1 previous similar message [2770190.718998] Lustre: fir-MDT0003: haven't heard from client 4e7a41ff-599c-6fd3-1588-d7b9b8fbf455 (at 10.9.114.5@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8eebbc977c00, cur 1554925355 expire 1554925205 last 1554925128 [2770197.717979] Lustre: fir-MDT0001: Connection restored to 2e0c4648-6e21-c09c-2281-ea766335a2a8 (at 10.9.101.35@o2ib4) [2770197.728592] Lustre: Skipped 1 previous similar message [2770339.169131] Lustre: 95550:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554925496/real 1554925496] req@ffff8ec619b7a100 x1630438513091328/t0(0) o106->fir-MDT0001@10.9.101.7@o2ib4:15/16 lens 296/280 e 0 to 1 dl 1554925503 ref 1 fl Rpc:X/0/ffffffff rc 0/-1 [2770339.196571] Lustre: 95550:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 1 previous similar message [2770341.642167] Lustre: 95955:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554925498/real 1554925498] req@ffff8ebffe102a00 x1630438513098976/t0(0) o104->fir-MDT0001@10.9.101.7@o2ib4:15/16 lens 296/224 e 0 to 1 dl 1554925505 ref 1 fl Rpc:X/0/ffffffff rc 0/-1 [2770341.669596] Lustre: 95955:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 5 previous similar messages [2770345.790202] Lustre: 96046:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554925503/real 1554925503] req@ffff8eefb1e36c00 x1630438513111152/t0(0) o104->fir-MDT0001@10.9.101.7@o2ib4:15/16 lens 296/224 e 0 to 1 dl 1554925510 ref 1 fl Rpc:X/0/ffffffff rc 0/-1 [2770345.817627] Lustre: 96046:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 9 previous similar messages [2770347.257228] Lustre: 95930:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8ee32a11ce00 x1628748313075408/t0(0) o101->3936a9aa-9a26-d302-25ed-f408e6cd8b5f@10.9.101.1@o2ib4:16/0 lens 480/568 e 1 to 0 dl 1554925516 ref 2 fl Interpret:/0/0 rc 0/0 [2770348.601241] Lustre: 96304:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8ec06876aa00 x1628734716980464/t0(0) o101->253c07d0-1dda-ba64-13bf-14531ffca918@10.9.101.18@o2ib4:17/0 lens 480/568 e 1 to 0 dl 1554925517 ref 2 fl Interpret:/0/0 rc 0/0 [2770348.630328] Lustre: 96304:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 3 previous similar messages [2770350.693268] Lustre: 95351:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8ed9555cd400 x1629020156276976/t0(0) o101->a6c677eb-4ace-e7b4-4376-24acf7debcee@10.9.108.51@o2ib4:19/0 lens 480/568 e 1 to 0 dl 1554925519 ref 2 fl Interpret:/0/0 rc 0/0 [2770350.722339] Lustre: 95351:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 1 previous similar message [2770353.277818] Lustre: fir-MDT0001: Client 98c09566-8207-6420-49d3-d7a016768ffe (at 10.9.108.50@o2ib4) reconnecting [2770353.282611] Lustre: fir-MDT0001: Connection restored to 310b526c-2d9c-12b8-e9c3-eec1a3d9d55d (at 10.9.101.13@o2ib4) [2770353.298782] Lustre: Skipped 1 previous similar message [2770353.670300] Lustre: 95969:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8ecd0c777500 x1628754907326400/t0(0) o101->a487b03f-d5d2-c729-8ddf-d69e191e351d@10.9.101.12@o2ib4:22/0 lens 480/568 e 1 to 0 dl 1554925522 ref 2 fl Interpret:/0/0 rc 0/0 [2770353.699384] Lustre: 95969:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 1 previous similar message [2770353.989300] Lustre: 95358:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554925511/real 1554925511] req@ffff8eccce384500 x1630438513115344/t0(0) o106->fir-MDT0001@10.9.101.7@o2ib4:15/16 lens 296/280 e 0 to 1 dl 1554925518 ref 1 fl Rpc:X/2/ffffffff rc 0/-1 [2770354.016746] Lustre: 95358:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 28 previous similar messages [2770354.303049] Lustre: fir-MDT0001: Client 6bb6b586-f426-e568-1fd9-7be05a8314e1 (at 10.9.101.8@o2ib4) reconnecting [2770354.313333] Lustre: Skipped 2 previous similar messages [2770355.862356] Lustre: fir-MDT0001: Client a6c677eb-4ace-e7b4-4376-24acf7debcee (at 10.9.108.51@o2ib4) reconnecting [2770355.872708] Lustre: Skipped 1 previous similar message [2770358.276354] Lustre: 95351:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8eb4ca5f3300 x1630350127047104/t0(0) o101->c927c3a4-10d5-5d74-f8fb-42ec7ee8b67c@10.9.101.43@o2ib4:27/0 lens 480/568 e 1 to 0 dl 1554925527 ref 2 fl Interpret:/0/0 rc 0/0 [2770358.305427] Lustre: 95351:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 2 previous similar messages [2770359.476814] Lustre: fir-MDT0001: Client a487b03f-d5d2-c729-8ddf-d69e191e351d (at 10.9.101.12@o2ib4) reconnecting [2770363.338809] Lustre: fir-MDT0001: Connection restored to 35a6c719-a690-c287-6e64-f26cf0bc0032 (at 10.9.108.43@o2ib4) [2770363.349419] Lustre: Skipped 8 previous similar messages [2770364.383143] Lustre: fir-MDT0001: Client c927c3a4-10d5-5d74-f8fb-42ec7ee8b67c (at 10.9.101.43@o2ib4) reconnecting [2770364.393490] Lustre: Skipped 2 previous similar messages [2770367.365490] LustreError: 96415:0:(ldlm_lockd.c:682:ldlm_handle_ast_error()) ### client (nid 10.9.101.7@o2ib4) failed to reply to blocking AST (req@ffff8eee7eedd700 x1630438513092048 status 0 rc -110), evict it ns: mdt-fir-MDT0001_UUID lock: ffff8ee8c770f500/0x857bd8e2c6c180a lrc: 4/0,0 mode: PR/PR res: [0x240010873:0x5ba1:0x0].0x0 bits 0x40/0x0 rrc: 43 type: IBT flags: 0x60000400000020 nid: 10.9.101.7@o2ib4 remote: 0x2104414dfdd522a5 expref: 1862 pid: 95947 timeout: 2770367 lvb_type: 0 [2770367.408623] LustreError: 138-a: fir-MDT0001: A client on nid 10.9.101.7@o2ib4 was evicted due to a lock blocking callback time out: rc -110 [2770367.421334] LustreError: 95108:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 35s: evicting client at 10.9.101.7@o2ib4 ns: mdt-fir-MDT0001_UUID lock: ffff8ee8c770f500/0x857bd8e2c6c180a lrc: 3/0,0 mode: PR/PR res: [0x240010873:0x5ba1:0x0].0x0 bits 0x40/0x0 rrc: 43 type: IBT flags: 0x60000400000020 nid: 10.9.101.7@o2ib4 remote: 0x2104414dfdd522a5 expref: 1863 pid: 95947 timeout: 0 lvb_type: 0 [2770367.459076] Lustre: 96310:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (30:1s); client may timeout. req@ffff8eb3bc480900 x1628752037196928/t0(0) o101->fe0b67cc-88bf-64bd-33d4-8b260bad0c72@10.9.101.17@o2ib4:0/0 lens 480/536 e 0 to 0 dl 1554925530 ref 1 fl Complete:/0/0 rc 301/301 [2770367.487822] Lustre: 96310:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 1 previous similar message [2770478.252811] Lustre: 95176:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554925635/real 1554925635] req@ffff8eb7bf850000 x1630438513608656/t0(0) o106->fir-MDT0001@10.9.104.38@o2ib4:15/16 lens 296/280 e 0 to 1 dl 1554925642 ref 1 fl Rpc:X/0/ffffffff rc 0/-1 [2770478.280344] Lustre: 95176:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 48 previous similar messages [2770495.863029] Lustre: 96196:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ec0077aec00 x1629298516591792/t0(0) o101->bb4082ce-df8b-595a-09d9-4fe9f1e40877@10.9.103.38@o2ib4:15/0 lens 480/568 e 0 to 0 dl 1554925665 ref 2 fl Interpret:/0/0 rc 0/0 [2770495.892188] Lustre: 96196:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 9 previous similar messages [2770502.300860] Lustre: fir-MDT0001: Client bb4082ce-df8b-595a-09d9-4fe9f1e40877 (at 10.9.103.38@o2ib4) reconnecting [2770502.311210] Lustre: Skipped 3 previous similar messages [2770502.316643] Lustre: fir-MDT0001: Connection restored to bb4082ce-df8b-595a-09d9-4fe9f1e40877 (at 10.9.103.38@o2ib4) [2770502.327265] Lustre: Skipped 4 previous similar messages [2770513.290244] Lustre: 95176:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554925670/real 1554925670] req@ffff8eb7bf850000 x1630438513608656/t0(0) o106->fir-MDT0001@10.9.104.38@o2ib4:15/16 lens 296/280 e 0 to 1 dl 1554925677 ref 1 fl Rpc:X/2/ffffffff rc 0/-1 [2770513.317776] Lustre: 95176:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 4 previous similar messages [2770533.449399] Lustre: fir-MDT0001: Client bb4082ce-df8b-595a-09d9-4fe9f1e40877 (at 10.9.103.38@o2ib4) reconnecting [2770557.723779] Lustre: fir-MDT0003: haven't heard from client 618e27a5-1163-e0cc-db2b-5b7bd2c4fd87 (at 10.9.101.7@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ee053670000, cur 1554925722 expire 1554925572 last 1554925495 [2770564.561330] Lustre: fir-MDT0001: Connection restored to bb4082ce-df8b-595a-09d9-4fe9f1e40877 (at 10.9.103.38@o2ib4) [2770564.571957] Lustre: Skipped 1 previous similar message [2770583.330093] Lustre: 95176:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554925740/real 1554925740] req@ffff8eb7bf850000 x1630438513608656/t0(0) o106->fir-MDT0001@10.9.104.38@o2ib4:15/16 lens 296/280 e 0 to 1 dl 1554925747 ref 1 fl Rpc:X/2/ffffffff rc 0/-1 [2770583.357625] Lustre: 95176:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 9 previous similar messages [2770595.600234] Lustre: fir-MDT0001: Client bb4082ce-df8b-595a-09d9-4fe9f1e40877 (at 10.9.103.38@o2ib4) reconnecting [2770595.610589] Lustre: Skipped 1 previous similar message [2770641.716097] Lustre: fir-MDT0003: haven't heard from client 2cfc41a5-db0c-965a-d142-7bbd5278a4b1 (at 10.9.104.38@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ebce3fc1800, cur 1554925806 expire 1554925656 last 1554925579 [2770656.716159] Lustre: fir-MDT0001: haven't heard from client 2cfc41a5-db0c-965a-d142-7bbd5278a4b1 (at 10.9.104.38@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ef05798cc00, cur 1554925821 expire 1554925671 last 1554925594 [2770656.738217] Lustre: 95176:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (185:1s); client may timeout. req@ffff8ec0077aec00 x1629298516591792/t0(0) o101->bb4082ce-df8b-595a-09d9-4fe9f1e40877@10.9.103.38@o2ib4:15/0 lens 480/536 e 0 to 0 dl 1554925820 ref 1 fl Complete:/0/0 rc 301/301 [2770707.575380] Lustre: fir-MDT0001: Connection restored to 55aafc93-9dff-fa25-67e4-1bdcf1bf214f (at 10.8.14.7@o2ib6) [2770707.585825] Lustre: Skipped 3 previous similar messages [2772151.643559] Lustre: fir-MDT0001: Connection restored to 618e27a5-1163-e0cc-db2b-5b7bd2c4fd87 (at 10.9.101.7@o2ib4) [2772151.654108] Lustre: Skipped 1 previous similar message [2772404.078672] Lustre: fir-MDT0001: Connection restored to 2cfc41a5-db0c-965a-d142-7bbd5278a4b1 (at 10.9.104.38@o2ib4) [2772404.089282] Lustre: Skipped 1 previous similar message [2773047.745763] Lustre: fir-MDT0001: haven't heard from client 766c24dc-05fc-2c20-7902-bb94de21535e (at 10.8.10.29@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ed64f66d400, cur 1554928212 expire 1554928062 last 1554927985 [2773060.442894] Lustre: 95918:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554928217/real 1554928217] req@ffff8ed92d5de600 x1630438535210976/t0(0) o104->fir-MDT0001@10.9.114.5@o2ib4:15/16 lens 296/224 e 0 to 1 dl 1554928224 ref 1 fl Rpc:X/0/ffffffff rc 0/-1 [2773060.470328] Lustre: 95918:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 10 previous similar messages [2773068.491981] Lustre: 96001:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8ed3c0356300 x1629299486786464/t137479246150(0) o36->5fcbc0cf-d60c-9adf-ab6c-2fa9131c2ee1@10.9.105.55@o2ib4:7/0 lens 488/3152 e 1 to 0 dl 1554928237 ref 2 fl Interpret:/0/0 rc 0/0 [2773074.494565] Lustre: fir-MDT0001: Client 5fcbc0cf-d60c-9adf-ab6c-2fa9131c2ee1 (at 10.9.105.55@o2ib4) reconnecting [2773074.504913] Lustre: Skipped 1 previous similar message [2773074.510285] Lustre: fir-MDT0001: Connection restored to 5fcbc0cf-d60c-9adf-ab6c-2fa9131c2ee1 (at 10.9.105.55@o2ib4) [2773081.480147] Lustre: 95918:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554928238/real 1554928238] req@ffff8ed92d5de600 x1630438535210976/t0(0) o104->fir-MDT0001@10.9.114.5@o2ib4:15/16 lens 296/224 e 0 to 1 dl 1554928245 ref 1 fl Rpc:X/2/ffffffff rc 0/-1 [2773081.507571] Lustre: 95918:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 2 previous similar messages [2773088.517252] LustreError: 95918:0:(ldlm_lockd.c:682:ldlm_handle_ast_error()) ### client (nid 10.9.114.5@o2ib4) failed to reply to blocking AST (req@ffff8ed92d5de600 x1630438535210976 status 0 rc -110), evict it ns: mdt-fir-MDT0001_UUID lock: ffff8ec9c6d80d80/0x857bd8e30c6e83a lrc: 4/0,0 mode: PR/PR res: [0x2400122bb:0x16:0x0].0x0 bits 0x1b/0x0 rrc: 116 type: IBT flags: 0x60200400000020 nid: 10.9.114.5@o2ib4 remote: 0xcadf6e682818844 expref: 284 pid: 95408 timeout: 2773088 lvb_type: 0 [2773088.560112] LustreError: 138-a: fir-MDT0001: A client on nid 10.9.114.5@o2ib4 was evicted due to a lock blocking callback time out: rc -110 [2773088.572852] LustreError: 95108:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 35s: evicting client at 10.9.114.5@o2ib4 ns: mdt-fir-MDT0001_UUID lock: ffff8ec9c6d80d80/0x857bd8e30c6e83a lrc: 3/0,0 mode: PR/PR res: [0x2400122bb:0x16:0x0].0x0 bits 0x1b/0x0 rrc: 116 type: IBT flags: 0x60200400000020 nid: 10.9.114.5@o2ib4 remote: 0xcadf6e682818844 expref: 285 pid: 95408 timeout: 0 lvb_type: 0 [2773228.746982] Lustre: fir-MDT0003: haven't heard from client 80c07668-3931-2a59-f84a-8b9dac6db9e5 (at 10.9.114.5@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ef0497f3400, cur 1554928393 expire 1554928243 last 1554928166 [2773228.768861] Lustre: Skipped 1 previous similar message [2773245.950596] Lustre: fir-MDT0001: Connection restored to 6ca6652e-0187-99bc-e850-704338e05c6f (at 10.9.114.5@o2ib4) [2773245.961122] Lustre: Skipped 1 previous similar message [2773656.986211] Lustre: 95573:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554928814/real 1554928814] req@ffff8ebbc2ee6000 x1630438540282176/t0(0) o106->fir-MDT0003@10.8.27.23@o2ib6:15/16 lens 296/280 e 0 to 1 dl 1554928821 ref 1 fl Rpc:X/0/ffffffff rc 0/-1 [2773657.013652] Lustre: 95573:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 1 previous similar message [2773665.029311] Lustre: 95874:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8ec000cc8000 x1629245967208560/t0(0) o101->bafe047a-89ea-21d5-a24e-5c407648cfc7@10.0.10.3@o2ib7:4/0 lens 480/568 e 1 to 0 dl 1554928834 ref 2 fl Interpret:/0/0 rc 0/0 [2773671.038562] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2773671.048761] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2773692.616260] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2773692.626473] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2773727.026092] Lustre: 95573:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554928884/real 1554928884] req@ffff8ebbc2ee6000 x1630438540282176/t0(0) o106->fir-MDT0003@10.8.27.23@o2ib6:15/16 lens 296/280 e 0 to 1 dl 1554928891 ref 1 fl Rpc:X/2/ffffffff rc 0/-1 [2773727.053532] Lustre: 95573:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 9 previous similar messages [2773734.637635] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2773734.647814] Lustre: Skipped 1 previous similar message [2773734.653183] Lustre: fir-MDT0003: Connection restored to bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) [2773734.663657] Lustre: Skipped 1 previous similar message [2773794.064485] LustreError: 95902:0:(ldlm_lockd.c:682:ldlm_handle_ast_error()) ### client (nid 10.8.27.23@o2ib6) returned error from blocking AST (req@ffff8edb7e85d400 x1630438541341536 status -107 rc -107), evict it ns: mdt-fir-MDT0001_UUID lock: ffff8ec3d1b67740/0x857bd8e35e6ba6e lrc: 4/0,0 mode: PR/PR res: [0x24000cd10:0x194c1:0x0].0x0 bits 0x13/0x0 rrc: 91 type: IBT flags: 0x60200400000020 nid: 10.8.27.23@o2ib6 remote: 0x2c978d64b4c3e64e expref: 50 pid: 95936 timeout: 2773921 lvb_type: 0 [2773794.107884] LustreError: 138-a: fir-MDT0001: A client on nid 10.8.27.23@o2ib6 was evicted due to a lock blocking callback time out: rc -107 [2773794.120622] LustreError: 95108:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 0s: evicting client at 10.8.27.23@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ec3d1b67740/0x857bd8e35e6ba6e lrc: 3/0,0 mode: PR/PR res: [0x24000cd10:0x194c1:0x0].0x0 bits 0x13/0x0 rrc: 86 type: IBT flags: 0x60200400000020 nid: 10.8.27.23@o2ib6 remote: 0x2c978d64b4c3e64e expref: 51 pid: 95936 timeout: 0 lvb_type: 0 [2773797.065170] LustreError: 95573:0:(ldlm_lockd.c:682:ldlm_handle_ast_error()) ### client (nid 10.8.27.23@o2ib6) returned error from glimpse AST (req@ffff8ebbc2ee6000 x1630438540282176 status -107 rc -107), evict it ns: mdt-fir-MDT0003_UUID lock: ffff8ecbd12e98c0/0x857bd8e353dc98f lrc: 4/0,0 mode: PW/PW res: [0x28000f7fd:0x50:0x0].0x0 bits 0x40/0x0 rrc: 6 type: IBT flags: 0x40200000000000 nid: 10.8.27.23@o2ib6 remote: 0x2c978d64b4c3cc2a expref: 21 pid: 95855 timeout: 0 lvb_type: 0 [2773797.107617] LustreError: 138-a: fir-MDT0003: A client on nid 10.8.27.23@o2ib6 was evicted due to a lock glimpse callback time out: rc -107 [2773797.120254] LustreError: 95108:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 447s: evicting client at 10.8.27.23@o2ib6 ns: mdt-fir-MDT0003_UUID lock: ffff8ecbd12e98c0/0x857bd8e353dc98f lrc: 4/0,0 mode: PW/PW res: [0x28000f7fd:0x50:0x0].0x0 bits 0x40/0x0 rrc: 6 type: IBT flags: 0x40200000000000 nid: 10.8.27.23@o2ib6 remote: 0x2c978d64b4c3cc2a expref: 22 pid: 95855 timeout: 0 lvb_type: 0 [2773894.798190] Lustre: 96001:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554929052/real 1554929052] req@ffff8ed4c4997500 x1630438542319648/t0(0) o104->fir-MDT0001@10.9.101.33@o2ib4:15/16 lens 296/224 e 0 to 1 dl 1554929059 ref 1 fl Rpc:X/0/ffffffff rc 0/-1 [2773894.825727] Lustre: 96001:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 10 previous similar messages [2773902.878284] Lustre: 96259:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8ed95de02100 x1630350141829888/t0(0) o101->c927c3a4-10d5-5d74-f8fb-42ec7ee8b67c@10.9.101.43@o2ib4:2/0 lens 480/568 e 1 to 0 dl 1554929072 ref 2 fl Interpret:/0/0 rc 0/0 [2773908.849622] Lustre: fir-MDT0001: Client c927c3a4-10d5-5d74-f8fb-42ec7ee8b67c (at 10.9.101.43@o2ib4) reconnecting [2773908.860009] Lustre: Skipped 3 previous similar messages [2773908.865460] Lustre: fir-MDT0001: Connection restored to c927c3a4-10d5-5d74-f8fb-42ec7ee8b67c (at 10.9.101.43@o2ib4) [2773908.876069] Lustre: Skipped 5 previous similar messages [2773913.378420] Lustre: 96282:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8ede42032400 x1630290222239568/t0(0) o101->712f8bd7-e015-5753-6533-6ad5fd7bc4e5@10.9.101.24@o2ib4:12/0 lens 480/568 e 1 to 0 dl 1554929082 ref 2 fl Interpret:/0/0 rc 0/0 [2773922.835559] LustreError: 96001:0:(ldlm_lockd.c:682:ldlm_handle_ast_error()) ### client (nid 10.9.101.33@o2ib4) failed to reply to blocking AST (req@ffff8ed4c4997500 x1630438542319648 status 0 rc -110), evict it ns: mdt-fir-MDT0001_UUID lock: ffff8ebd3ae89680/0x857bd8e36f90925 lrc: 4/0,0 mode: PR/PR res: [0x2400124ed:0x13f73:0x0].0x0 bits 0x40/0x0 rrc: 48 type: IBT flags: 0x60000400000020 nid: 10.9.101.33@o2ib4 remote: 0x28aedc8697f60eff expref: 1884 pid: 96001 timeout: 2773923 lvb_type: 0 [2773922.878972] LustreError: 138-a: fir-MDT0001: A client on nid 10.9.101.33@o2ib4 was evicted due to a lock blocking callback time out: rc -110 [2773922.891806] LustreError: 95108:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 35s: evicting client at 10.9.101.33@o2ib4 ns: mdt-fir-MDT0001_UUID lock: ffff8ebd3ae89680/0x857bd8e36f90925 lrc: 3/0,0 mode: PR/PR res: [0x2400124ed:0x13f73:0x0].0x0 bits 0x40/0x0 rrc: 48 type: IBT flags: 0x60000400000020 nid: 10.9.101.33@o2ib4 remote: 0x28aedc8697f60eff expref: 1885 pid: 96001 timeout: 0 lvb_type: 0 [2774101.757988] Lustre: fir-MDT0003: haven't heard from client 98af2bbe-c574-a824-5582-485a32691920 (at 10.9.101.33@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8edf77aa3800, cur 1554929266 expire 1554929116 last 1554929039 [2774149.676022] Lustre: fir-MDT0001: Connection restored to ff5b12dd-8ca8-48e9-b029-fe7197b561ba (at 10.8.11.10@o2ib6) [2774149.686561] Lustre: Skipped 7 previous similar messages [2774218.806300] Lustre: 96178:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554929376/real 1554929376] req@ffff8ec677b1dd00 x1630438545067472/t0(0) o106->fir-MDT0001@10.8.27.23@o2ib6:15/16 lens 296/280 e 0 to 1 dl 1554929383 ref 1 fl Rpc:X/0/ffffffff rc 0/-1 [2774218.833724] Lustre: 96178:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 39 previous similar messages [2774236.900538] Lustre: 95952:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ec5538c5d00 x1630377321918960/t0(0) o101->72836ae9-c6ce-1b1b-18a6-171eb9b92f9b@10.8.1.11@o2ib6:6/0 lens 480/568 e 0 to 0 dl 1554929406 ref 2 fl Interpret:/0/0 rc 0/0 [2774236.929457] Lustre: 95952:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 7 previous similar messages [2774243.119425] Lustre: fir-MDT0001: Client 72836ae9-c6ce-1b1b-18a6-171eb9b92f9b (at 10.8.1.11@o2ib6) reconnecting [2774243.129605] Lustre: Skipped 6 previous similar messages [2774267.761049] Lustre: fir-MDT0001: haven't heard from client c7a325a4-90c0-0ef9-2b1d-f81ba35ea8f1 (at 10.8.27.23@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ecd162f9400, cur 1554929432 expire 1554929282 last 1554929205 [2774452.757012] Lustre: fir-MDT0001: Connection restored to 6ca6652e-0187-99bc-e850-704338e05c6f (at 10.9.114.5@o2ib4) [2774452.767539] Lustre: Skipped 13 previous similar messages [2774546.664421] Lustre: 96077:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ec4fb3ea400 x1629293467003328/t0(0) o101->7d7e8bb3-20d9-424d-1b68-5bd451b4f2d7@10.8.2.30@o2ib6:15/0 lens 600/3264 e 0 to 0 dl 1554929715 ref 2 fl Interpret:/0/0 rc 0/0 [2774546.693498] Lustre: 96077:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 19 previous similar messages [2774552.631696] Lustre: fir-MDT0003: Client 969d9ccb-e18d-fe21-6ff7-1558592a2ce9 (at 10.8.2.28@o2ib6) reconnecting [2774552.641883] Lustre: Skipped 11 previous similar messages [2774611.632226] LustreError: 95510:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554929685, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0003_UUID lock: ffff8ed02117ec00/0x857bd8e3bc4c032 lrc: 3/0,1 mode: --/PW res: [0x280010292:0x340:0x0].0x0 bits 0x21/0x0 rrc: 48 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 95510 timeout: 0 lvb_type: 0 [2774611.632228] LustreError: 96019:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554929685, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0003_UUID lock: ffff8ec22ee11d40/0x857bd8e3bc4c0e8 lrc: 3/1,0 mode: --/PR res: [0x280010292:0x340:0x0].0x0 bits 0x20/0x0 rrc: 49 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 96019 timeout: 0 lvb_type: 0 [2774611.632233] LustreError: 96019:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 3 previous similar messages [2774611.632276] LustreError: dumping log to /tmp/lustre-log.1554929775.95533 [2774611.729096] LustreError: 95510:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 15 previous similar messages [2774613.980260] LustreError: 95855:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554929688, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0003_UUID lock: ffff8ec2a5fb0240/0x857bd8e3bc82c4a lrc: 3/1,0 mode: --/PR res: [0x280010292:0x340:0x0].0x0 bits 0x13/0x8 rrc: 48 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 95855 timeout: 0 lvb_type: 0 [2774614.019735] LustreError: 95855:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 3 previous similar messages [2774615.831273] LustreError: 95532:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554929690, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0003_UUID lock: ffff8ec1a6b3d7c0/0x857bd8e3bcb250c lrc: 3/1,0 mode: --/PR res: [0x280010292:0x340:0x0].0x0 bits 0x13/0x8 rrc: 48 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 95532 timeout: 0 lvb_type: 0 [2774671.538943] LustreError: 95108:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 150s: evicting client at 10.8.2.28@o2ib6 ns: mdt-fir-MDT0003_UUID lock: ffff8ec862b01d40/0x857bd8e3bc4bef7 lrc: 3/0,0 mode: PW/PW res: [0x280010292:0x340:0x0].0x0 bits 0x40/0x0 rrc: 48 type: IBT flags: 0x60200400000020 nid: 10.8.2.28@o2ib6 remote: 0xf57fd75e3d5f1467 expref: 34 pid: 95860 timeout: 2774649 lvb_type: 0 [2774671.576853] LustreError: 95593:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8eb6e4a28000 ns: mdt-fir-MDT0003_UUID lock: ffff8ede50a48fc0/0x857bd8e3bc4bfbb lrc: 5/0,0 mode: PW/PW res: [0x280010292:0x340:0x0].0x0 bits 0x40/0x0 rrc: 46 type: IBT flags: 0x50200400000020 nid: 10.8.2.28@o2ib6 remote: 0xf57fd75e3d5f147c expref: 13 pid: 95593 timeout: 0 lvb_type: 0 [2774696.682238] Lustre: 95546:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ed00b584b00 x1629297381122592/t0(0) o101->a77148c9-d434-623d-b422-9d417c28aa4b@10.8.1.32@o2ib6:15/0 lens 568/0 e 0 to 0 dl 1554929865 ref 2 fl Interpret:/0/ffffffff rc 0/-1 [2774696.711737] Lustre: 95546:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 37 previous similar messages [2774759.766053] Lustre: fir-MDT0001: haven't heard from client a2be9cb9-225a-12bb-3d5f-c18b08c767d0 (at 10.8.27.23@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8eb967a9ec00, cur 1554929924 expire 1554929774 last 1554929697 [2774759.787955] Lustre: Skipped 3 previous similar messages [2774761.666030] LustreError: 95415:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554929835, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0003_UUID lock: ffff8edea8bc0d80/0x857bd8e3cf2a866 lrc: 3/1,0 mode: --/PR res: [0x280010292:0x340:0x0].0x0 bits 0x13/0x8 rrc: 97 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 95415 timeout: 0 lvb_type: 0 [2774761.705529] LustreError: 95415:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 36 previous similar messages [2774821.540765] LustreError: 95108:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 150s: evicting client at 10.8.7.8@o2ib6 ns: mdt-fir-MDT0003_UUID lock: ffff8edd6e6db180/0x857bd8e3cf2a740 lrc: 3/0,0 mode: PW/PW res: [0x280010292:0x340:0x0].0x0 bits 0x40/0x0 rrc: 95 type: IBT flags: 0x60200400000020 nid: 10.8.7.8@o2ib6 remote: 0xb0c1a57e4144802d expref: 44 pid: 95550 timeout: 2774799 lvb_type: 0 [2774846.728080] Lustre: 95575:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8eca72e68c00 x1629293467042096/t0(0) o101->7d7e8bb3-20d9-424d-1b68-5bd451b4f2d7@10.8.2.30@o2ib6:15/0 lens 568/0 e 0 to 0 dl 1554930015 ref 2 fl Interpret:/0/ffffffff rc 0/-1 [2774846.757591] Lustre: 95575:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 14 previous similar messages [2774872.044402] LNet: Service thread pid 95564 was inactive for 200.37s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2774872.061516] Pid: 95564, comm: mdt00_022 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2774872.071453] Call Trace: [2774872.074096] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2774872.081216] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2774872.088583] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2774872.095624] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2774872.102805] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2774872.109479] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2774872.116655] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2774872.123398] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2774872.130056] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2774872.136990] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2774872.144269] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2774872.150636] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2774872.157753] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2774872.165681] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2774872.172186] [] kthread+0xd1/0xe0 [2774872.177281] [] ret_from_fork_nospec_begin+0xe/0x21 [2774872.183928] [] 0xffffffffffffffff [2774872.189154] LustreError: dumping log to /tmp/lustre-log.1554930036.95564 [2774872.909671] LNet: Service thread pid 96019 was inactive for 201.23s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2774872.926781] Pid: 96019, comm: mdt01_075 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2774872.936711] Call Trace: [2774872.939352] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2774872.946470] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2774872.953866] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2774872.960888] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2774872.968084] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2774872.974837] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2774872.981511] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2774872.988467] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2774872.995784] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2774873.002149] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2774873.009277] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2774873.017188] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2774873.023688] [] kthread+0xd1/0xe0 [2774873.028793] [] ret_from_fork_nospec_begin+0xe/0x21 [2774873.035454] [] 0xffffffffffffffff [2774873.040671] Pid: 96105, comm: mdt02_069 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2774873.050597] Call Trace: [2774873.053232] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2774873.060350] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2774873.067733] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2774873.074770] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2774873.081948] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2774873.088612] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2774873.095816] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2774873.102563] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2774873.109250] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2774873.116188] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2774873.123470] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2774873.129830] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2774873.136965] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2774873.144885] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2774873.151385] [] kthread+0xd1/0xe0 [2774873.156504] [] ret_from_fork_nospec_begin+0xe/0x21 [2774873.163154] [] 0xffffffffffffffff [2774873.168356] Pid: 95178, comm: mdt01_003 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2774873.178288] Call Trace: [2774873.180927] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2774873.188039] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2774873.195444] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2774873.202479] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2774873.209651] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2774873.216417] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2774873.223066] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2774873.230029] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2774873.237314] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2774873.243687] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2774873.250812] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2774873.258742] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2774873.265267] [] kthread+0xd1/0xe0 [2774873.270371] [] ret_from_fork_nospec_begin+0xe/0x21 [2774873.277033] [] 0xffffffffffffffff [2774873.282248] Pid: 95855, comm: mdt01_046 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2774873.292170] Call Trace: [2774873.294810] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2774873.301917] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2774873.309302] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2774873.316334] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2774873.323513] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2774873.330200] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2774873.337386] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2774873.344169] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2774873.350834] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2774873.357811] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2774873.365103] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2774873.371451] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2774873.378605] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2774873.386520] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2774873.393041] [] kthread+0xd1/0xe0 [2774873.398166] [] ret_from_fork_nospec_begin+0xe/0x21 [2774873.404833] [] 0xffffffffffffffff [2774873.410047] LNet: Service thread pid 96243 was inactive for 201.73s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2774911.579875] LustreError: 95585:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554929985, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0003_UUID lock: ffff8ed02fd760c0/0x857bd8e3de92dcd lrc: 3/1,0 mode: --/PR res: [0x280010292:0x340:0x0].0x0 bits 0x20/0x0 rrc: 99 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 95585 timeout: 0 lvb_type: 0 [2774911.579877] LustreError: 96002:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554929985, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0003_UUID lock: ffff8ecffb3b0900/0x857bd8e3de92e05 lrc: 3/1,0 mode: --/PR res: [0x280010292:0x340:0x0].0x0 bits 0x13/0x8 rrc: 98 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 96002 timeout: 0 lvb_type: 0 [2774911.579882] LustreError: 96002:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 1 previous similar message [2774939.550211] LustreError: 96191:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554930013, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0003_UUID lock: ffff8ec525ac6780/0x857bd8e3e0ef0fe lrc: 3/1,0 mode: --/PR res: [0x280010292:0x340:0x0].0x0 bits 0x13/0x8 rrc: 98 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 96191 timeout: 0 lvb_type: 0 [2774939.589714] LustreError: 96191:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 2 previous similar messages [2774949.768358] Lustre: fir-MDT0001: haven't heard from client d40514bf-f5f1-614f-2e6c-056d786841f0 (at 10.8.24.25@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ef06de98c00, cur 1554930114 expire 1554929964 last 1554929887 [2774949.790255] Lustre: Skipped 1 previous similar message [2774969.551580] LustreError: 96030:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554930043, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0003_UUID lock: ffff8edd21355c40/0x857bd8e3e350e17 lrc: 3/1,0 mode: --/PR res: [0x280010292:0x340:0x0].0x0 bits 0x13/0x8 rrc: 98 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 96030 timeout: 0 lvb_type: 0 [2774971.542613] LustreError: 95108:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 150s: evicting client at 10.8.2.30@o2ib6 ns: mdt-fir-MDT0003_UUID lock: ffff8ed23aab86c0/0x857bd8e3cf2a819 lrc: 3/0,0 mode: PW/PW res: [0x280010292:0x340:0x0].0x0 bits 0x40/0x0 rrc: 98 type: IBT flags: 0x60200400000020 nid: 10.8.2.30@o2ib6 remote: 0x688e8829129436ba expref: 25 pid: 95543 timeout: 2774949 lvb_type: 0 [2774971.580503] LNet: Service thread pid 95573 completed after 299.91s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2774971.596872] LNet: Skipped 15 previous similar messages [2774971.599727] Lustre: 96005:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (154:146s); client may timeout. req@ffff8edc646b6600 x1628549298827584/t0(0) o55->d65897f0-41a6-773e-c926-3cb8be200fda@10.8.7.8@o2ib6:15/0 lens 472/192 e 0 to 0 dl 1554929989 ref 1 fl Complete:/0/0 rc -22/-22 [2774971.600731] LustreError: 96189:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8eb7d80d9400 ns: mdt-fir-MDT0003_UUID lock: ffff8ec46976f500/0x857bd8e3cf2aea2 lrc: 3/0,0 mode: PW/PW res: [0x280010292:0x340:0x0].0x0 bits 0x40/0x0 rrc: 76 type: IBT flags: 0x50200400000020 nid: 10.8.7.8@o2ib6 remote: 0xb0c1a57e4144803b expref: 6 pid: 96189 timeout: 0 lvb_type: 0 [2774971.600734] LustreError: 96189:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 2 previous similar messages [2774971.614486] Lustre: fir-MDT0003: Connection restored to 7d7e8bb3-20d9-424d-1b68-5bd451b4f2d7 (at 10.8.2.30@o2ib6) [2774971.614488] Lustre: Skipped 248 previous similar messages [2775061.607706] LustreError: 95178:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554930135, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0003_UUID lock: ffff8ec434b9f980/0x857bd8e3e972b6e lrc: 3/0,1 mode: --/PW res: [0x280010292:0x340:0x0].0x0 bits 0x40/0x0 rrc: 83 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 95178 timeout: 0 lvb_type: 0 [2775061.647200] LustreError: 95178:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 3 previous similar messages [2775064.651457] Lustre: fir-MDT0003: Client 45da2240-7d39-1a64-babd-5c9f7e12bcc9 (at 10.8.7.5@o2ib6) reconnecting [2775064.661558] Lustre: Skipped 271 previous similar messages [2775121.544450] LustreError: 95108:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 150s: evicting client at 10.8.2.25@o2ib6 ns: mdt-fir-MDT0003_UUID lock: ffff8ee027909440/0x857bd8e3e9729fb lrc: 3/0,0 mode: PW/PW res: [0x280010292:0x340:0x0].0x0 bits 0x40/0x0 rrc: 83 type: IBT flags: 0x60200400000020 nid: 10.8.2.25@o2ib6 remote: 0xccb876b61e8b0853 expref: 38 pid: 96243 timeout: 2775099 lvb_type: 0 [2775146.667761] Lustre: 95530:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ecdd82f5a00 x1629292530852640/t0(0) o101->e1eddc12-cf0d-d509-565c-de2bcf13f095@10.8.2.34@o2ib6:15/0 lens 576/3264 e 0 to 0 dl 1554930315 ref 2 fl Interpret:/0/0 rc 0/0 [2775146.696854] Lustre: 95530:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 35 previous similar messages [2775172.080081] LNet: Service thread pid 95902 was inactive for 200.38s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2775172.097213] LNet: Skipped 3 previous similar messages [2775172.102448] Pid: 95902, comm: mdt02_041 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2775172.112384] Call Trace: [2775172.115021] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2775172.122165] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2775172.129551] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2775172.136572] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2775172.143785] [] mdt_object_lock+0x20/0x30 [mdt] [2775172.150101] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2775172.156597] [] mdt_intent_brw+0x1f/0x30 [mdt] [2775172.162832] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2775172.169506] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2775172.176438] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2775172.183727] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2775172.190107] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2775172.197222] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2775172.205149] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2775172.211667] [] kthread+0xd1/0xe0 [2775172.216772] [] ret_from_fork_nospec_begin+0xe/0x21 [2775172.223421] [] 0xffffffffffffffff [2775172.228656] LustreError: dumping log to /tmp/lustre-log.1554930336.95902 [2775172.575177] Pid: 95358, comm: mdt01_010 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2775172.585140] Call Trace: [2775172.587798] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2775172.594929] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2775172.602336] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2775172.609385] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2775172.616597] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2775172.623296] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2775172.630512] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2775172.637284] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2775172.643978] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2775172.650942] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2775172.658260] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2775172.664634] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2775172.671783] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2775172.679721] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2775172.686265] [] kthread+0xd1/0xe0 [2775172.691383] [] ret_from_fork_nospec_begin+0xe/0x21 [2775172.698080] [] 0xffffffffffffffff [2775172.703312] Pid: 95952, comm: mdt01_062 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2775172.713260] Call Trace: [2775172.715919] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2775172.723084] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2775172.730498] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2775172.737539] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2775172.744777] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2775172.751497] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2775172.758714] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2775172.765495] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2775172.772212] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2775172.779227] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2775172.786596] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2775172.792964] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2775172.800116] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2775172.808037] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2775172.814592] [] kthread+0xd1/0xe0 [2775172.819704] [] ret_from_fork_nospec_begin+0xe/0x21 [2775172.826399] [] 0xffffffffffffffff [2775172.831632] Pid: 95925, comm: mdt00_044 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2775172.841572] Call Trace: [2775172.844213] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2775172.851350] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2775172.858748] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2775172.865799] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2775172.873011] [] mdt_object_lock+0x20/0x30 [mdt] [2775172.879344] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2775172.885881] [] mdt_intent_brw+0x1f/0x30 [mdt] [2775172.892155] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2775172.898847] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2775172.905813] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2775172.913160] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2775172.919510] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2775172.926649] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2775172.934594] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2775172.941178] [] kthread+0xd1/0xe0 [2775172.946291] [] ret_from_fork_nospec_begin+0xe/0x21 [2775172.953001] [] 0xffffffffffffffff [2775172.958220] Pid: 96171, comm: mdt01_091 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2775172.968158] Call Trace: [2775172.970817] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2775172.977942] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2775172.985366] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2775172.992439] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2775172.999654] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2775173.006338] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2775173.013544] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2775173.020334] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2775173.027066] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2775173.034034] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2775173.041375] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2775173.047717] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2775173.054877] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2775173.062806] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2775173.069358] [] kthread+0xd1/0xe0 [2775173.074473] [] ret_from_fork_nospec_begin+0xe/0x21 [2775173.081175] [] 0xffffffffffffffff [2775173.086403] LNet: Service thread pid 95575 was inactive for 201.38s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2775173.099490] LNet: Skipped 28 previous similar messages [2775211.583556] LustreError: 96275:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554930285, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0003_UUID lock: ffff8edf4df49d40/0x857bd8e3f497796 lrc: 3/1,0 mode: --/PR res: [0x280010292:0x340:0x0].0x0 bits 0x20/0x0 rrc: 92 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 96275 timeout: 0 lvb_type: 0 [2775211.623055] LustreError: 96275:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 28 previous similar messages [2775271.546301] LustreError: 95108:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 150s: evicting client at 10.8.7.5@o2ib6 ns: mdt-fir-MDT0003_UUID lock: ffff8ec434b9f980/0x857bd8e3e972b6e lrc: 3/0,0 mode: PW/PW res: [0x280010292:0x340:0x0].0x0 bits 0x40/0x0 rrc: 96 type: IBT flags: 0x60200400000020 nid: 10.8.7.5@o2ib6 remote: 0xd48b524de95cd7d4 expref: 49 pid: 95178 timeout: 2775249 lvb_type: 0 [2775271.584105] LNet: Service thread pid 96206 completed after 299.89s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2775271.600457] LNet: Skipped 5 previous similar messages [2775286.772634] Lustre: fir-MDT0001: haven't heard from client 92910762-2800-6855-c3ae-478c6013cc25 (at 10.8.31.5@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ee9c5232c00, cur 1554930451 expire 1554930301 last 1554930224 [2775286.794421] Lustre: Skipped 1 previous similar message [2775321.585922] LNet: Service thread pid 96222 was inactive for 200.00s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2775321.598960] LNet: Skipped 18 previous similar messages [2775321.604299] LustreError: dumping log to /tmp/lustre-log.1554930485.96222 [2775322.097930] LustreError: dumping log to /tmp/lustre-log.1554930486.96259 [2775327.217986] LNet: Service thread pid 95538 was inactive for 200.43s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2775327.231024] LNet: Skipped 5 previous similar messages [2775327.236277] LustreError: dumping log to /tmp/lustre-log.1554930491.95538 [2775338.994135] LNet: Service thread pid 96189 was inactive for 200.72s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2775339.007174] LustreError: dumping log to /tmp/lustre-log.1554930503.96189 [2775348.417257] LustreError: 95548:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554930422, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0003_UUID lock: ffff8eb1d65b5e80/0x857bd8e3fecfcb5 lrc: 3/1,0 mode: --/PR res: [0x280010292:0x340:0x0].0x0 bits 0x13/0x8 rrc: 97 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 95548 timeout: 0 lvb_type: 0 [2775348.456746] LustreError: 95548:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 3 previous similar messages [2775399.410890] LNet: Service thread pid 96202 was inactive for 200.48s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2775399.423926] LustreError: dumping log to /tmp/lustre-log.1554930563.96202 [2775421.548173] LustreError: 95108:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 150s: evicting client at 10.8.8.23@o2ib6 ns: mdt-fir-MDT0003_UUID lock: ffff8ee027903cc0/0x857bd8e3e97462c lrc: 3/0,0 mode: PW/PW res: [0x280010292:0x340:0x0].0x0 bits 0x40/0x0 rrc: 99 type: IBT flags: 0x60200400000020 nid: 10.8.8.23@o2ib6 remote: 0x2a90aeeb40c1e162 expref: 29 pid: 95902 timeout: 2775399 lvb_type: 0 [2775421.586069] LustreError: 95925:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8eb09b7f9000 ns: mdt-fir-MDT0003_UUID lock: ffff8eba96b557c0/0x857bd8e3e974b73 lrc: 3/0,0 mode: PW/PW res: [0x280010292:0x340:0x0].0x0 bits 0x40/0x0 rrc: 96 type: IBT flags: 0x50200400000020 nid: 10.8.2.25@o2ib6 remote: 0xccb876b61e8b08b5 expref: 5 pid: 95925 timeout: 0 lvb_type: 0 [2775421.620614] LustreError: 95925:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 3 previous similar messages [2775421.630902] Lustre: 95925:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (154:296s); client may timeout. req@ffff8ebbcde5b000 x1629293004094864/t0(0) o101->9df0fc78-44aa-b2bc-1b11-0984014fc902@10.8.2.25@o2ib6:15/0 lens 480/536 e 0 to 0 dl 1554930289 ref 1 fl Complete:/0/0 rc -107/-107 [2775421.659989] Lustre: 95925:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 3 previous similar messages [2775421.670258] LNet: Service thread pid 95925 completed after 449.96s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2775458.803633] LNet: Service thread pid 95548 was inactive for 200.38s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2775458.816672] LustreError: dumping log to /tmp/lustre-log.1554930623.95548 [2775471.091784] LustreError: dumping log to /tmp/lustre-log.1554930635.96287 [2775472.115801] LNet: Service thread pid 95358 was inactive for 200.52s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2775472.132918] LNet: Skipped 4 previous similar messages [2775472.138183] Pid: 95358, comm: mdt01_010 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2775472.148123] Call Trace: [2775472.150777] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2775472.157909] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2775472.165277] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2775472.172303] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2775472.179513] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2775472.186274] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2775472.192945] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2775472.199905] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2775472.207207] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2775472.213566] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2775472.220697] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2775472.228610] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2775472.235135] [] kthread+0xd1/0xe0 [2775472.240255] [] ret_from_fork_nospec_begin+0xe/0x21 [2775472.246925] [] 0xffffffffffffffff [2775472.252136] LustreError: dumping log to /tmp/lustre-log.1554930636.95358 [2775472.260750] Pid: 95902, comm: mdt02_041 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2775472.270682] Call Trace: [2775472.273315] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2775472.280429] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2775472.287831] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2775472.294835] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2775472.302011] [] mdt_intent_getxattr+0xb5/0x270 [mdt] [2775472.308753] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2775472.315409] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2775472.322352] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2775472.329667] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2775472.336043] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2775472.343168] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2775472.351064] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2775472.357572] [] kthread+0xd1/0xe0 [2775472.362659] [] ret_from_fork_nospec_begin+0xe/0x21 [2775472.369307] [] 0xffffffffffffffff [2775472.374506] Pid: 95344, comm: mdt00_006 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2775472.384442] Call Trace: [2775472.387084] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2775472.394213] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2775472.401593] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2775472.408593] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2775472.415798] [] mdt_object_lock_try+0x27/0xb0 [mdt] [2775472.422470] [] mdt_getattr_name_lock+0x1287/0x1c30 [mdt] [2775472.429658] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2775472.436421] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2775472.443078] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2775472.450028] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2775472.457311] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2775472.463664] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2775472.470798] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2775472.478709] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2775472.485235] [] kthread+0xd1/0xe0 [2775472.490322] [] ret_from_fork_nospec_begin+0xe/0x21 [2775472.496970] [] 0xffffffffffffffff [2775472.502173] Pid: 95990, comm: mdt02_052 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2775472.512088] Call Trace: [2775472.514728] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2775472.521891] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2775472.529289] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2775472.536304] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2775472.543501] [] mdt_object_lock+0x20/0x30 [mdt] [2775472.549810] [] mdt_brw_enqueue+0x44b/0x760 [mdt] [2775472.556294] [] mdt_intent_brw+0x1f/0x30 [mdt] [2775472.562530] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2775472.569188] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2775472.576132] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2775472.583411] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2775472.589757] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2775472.596903] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2775472.604801] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2775472.611318] [] kthread+0xd1/0xe0 [2775472.616405] [] ret_from_fork_nospec_begin+0xe/0x21 [2775472.623055] [] 0xffffffffffffffff [2775472.628243] Pid: 95363, comm: mdt01_011 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2775472.638206] Call Trace: [2775472.640848] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2775472.647996] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2775472.655373] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2775472.662389] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2775472.669584] [] mdt_object_lock+0x20/0x30 [mdt] [2775472.675911] [] mdt_hsm_state_set+0xc9/0x830 [mdt] [2775472.682506] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2775472.689632] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2775472.697518] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2775472.704049] [] kthread+0xd1/0xe0 [2775472.709142] [] ret_from_fork_nospec_begin+0xe/0x21 [2775472.715792] [] 0xffffffffffffffff [2775476.211849] LustreError: dumping log to /tmp/lustre-log.1554930640.96047 [2775519.220385] LNet: Service thread pid 95359 was inactive for 200.55s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2775519.233443] LNet: Skipped 2 previous similar messages [2775519.238676] LustreError: dumping log to /tmp/lustre-log.1554930683.95359 [2775571.739708] Lustre: fir-MDT0003: Connection restored to ca268eff-8bf8-0ad8-eaa9-213224fab174 (at 10.9.0.63@o2ib4) [2775571.750146] Lustre: Skipped 281 previous similar messages [2775578.613115] LustreError: dumping log to /tmp/lustre-log.1554930742.96114 [2775631.349766] LNet: Service thread pid 96206 was inactive for 200.42s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2775631.362803] LNet: Skipped 1 previous similar message [2775631.367949] LustreError: dumping log to /tmp/lustre-log.1554930795.96206 [2775639.029868] LustreError: dumping log to /tmp/lustre-log.1554930803.95385 [2775674.104873] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2775674.115053] Lustre: Skipped 291 previous similar messages [2775698.934606] LustreError: dumping log to /tmp/lustre-log.1554930863.95573 [2775762.778559] Lustre: fir-MDT0001: haven't heard from client 3839258f-4e4a-0265-f390-0213a4accfb2 (at 10.8.27.23@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ecf41983400, cur 1554930927 expire 1554930777 last 1554930700 [2775762.800439] Lustre: Skipped 7 previous similar messages [2775838.779927] Lustre: fir-MDT0001: haven't heard from client 98179822-15bb-223e-365a-0f77834fe7fa (at 10.8.10.1@o2ib6) in 202 seconds. I think it's dead, and I am evicting it. exp ffff8ee9402e5c00, cur 1554931003 expire 1554930853 last 1554930801 [2775838.801733] Lustre: Skipped 1 previous similar message [2776054.910100] LustreError: 95575:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8eb6f3dc9400 ns: mdt-fir-MDT0003_UUID lock: ffff8ecff6722880/0x857bd8e3e974b81 lrc: 3/0,0 mode: PW/PW res: [0x280010292:0x340:0x0].0x0 bits 0x40/0x0 rrc: 100 type: IBT flags: 0x50200400000020 nid: 10.8.7.5@o2ib6 remote: 0xd48b524de95cd844 expref: 5 pid: 95575 timeout: 0 lvb_type: 0 [2776054.944665] LNet: Service thread pid 96090 completed after 1083.23s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources). [2776054.944669] Lustre: 95575:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (309:775s); client may timeout. req@ffff8ec52f1d3f00 x1629292475421808/t0(0) o101->45da2240-7d39-1a64-babd-5c9f7e12bcc9@10.8.7.5@o2ib6:15/0 lens 480/536 e 0 to 0 dl 1554930444 ref 1 fl Complete:/0/0 rc -107/-107 [2776054.990111] LNet: Skipped 38 previous similar messages [2776079.995334] Lustre: 95517:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ed65f60b600 x1629298767724976/t0(0) o101->de5dce88-12c8-ba11-d59d-2561e7b2cce5@10.8.7.9@o2ib6:19/0 lens 600/3264 e 0 to 0 dl 1554931249 ref 2 fl Interpret:/0/0 rc 0/0 [2776080.024343] Lustre: 95517:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 25 previous similar messages [2776090.573274] LustreError: 96259:0:(ldlm_lockd.c:682:ldlm_handle_ast_error()) ### client (nid 10.8.27.23@o2ib6) returned error from blocking AST (req@ffff8ed1435a0c00 x1630438559497520 status -107 rc -107), evict it ns: mdt-fir-MDT0001_UUID lock: ffff8ec3309f8240/0x857bd8e42ab653a lrc: 4/0,0 mode: PR/PR res: [0x24000cd10:0x18ec9:0x0].0x0 bits 0x13/0x0 rrc: 124 type: IBT flags: 0x60200400000020 nid: 10.8.27.23@o2ib6 remote: 0xba3c07706a8d7028 expref: 75 pid: 96002 timeout: 2776218 lvb_type: 0 [2776090.616783] LustreError: 138-a: fir-MDT0001: A client on nid 10.8.27.23@o2ib6 was evicted due to a lock blocking callback time out: rc -107 [2776090.629515] LustreError: 95108:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 0s: evicting client at 10.8.27.23@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8ec3309f8240/0x857bd8e42ab653a lrc: 3/0,0 mode: PR/PR res: [0x24000cd10:0x18ec9:0x0].0x0 bits 0x13/0x0 rrc: 132 type: IBT flags: 0x60200400000020 nid: 10.8.27.23@o2ib6 remote: 0xba3c07706a8d7028 expref: 76 pid: 96002 timeout: 0 lvb_type: 0 [2776103.800284] Lustre: fir-MDT0003: haven't heard from client deba5a81-3a13-8b5b-dbdf-2ecdb7eb2d38 (at 10.8.27.23@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8eda06b46c00, cur 1554931268 expire 1554931118 last 1554931041 [2776103.822178] Lustre: Skipped 1 previous similar message [2776144.977134] LustreError: 96282:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554931219, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0003_UUID lock: ffff8edd5b021f80/0x857bd8e4386afa4 lrc: 3/0,1 mode: --/PW res: [0x280010292:0x341:0x0].0x0 bits 0x40/0x0 rrc: 66 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 96282 timeout: 0 lvb_type: 0 [2776145.016631] LustreError: 96282:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 22 previous similar messages [2776178.977616] Lustre: fir-MDT0003: Connection restored to 9df0fc78-44aa-b2bc-1b11-0984014fc902 (at 10.8.2.25@o2ib6) [2776178.988053] Lustre: Skipped 265 previous similar messages [2776204.557876] LustreError: 95108:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 149s: evicting client at 10.8.2.25@o2ib6 ns: mdt-fir-MDT0003_UUID lock: ffff8ec354389200/0x857bd8e4386add6 lrc: 3/0,0 mode: PW/PW res: [0x280010292:0x341:0x0].0x0 bits 0x40/0x0 rrc: 66 type: IBT flags: 0x60200400000020 nid: 10.8.2.25@o2ib6 remote: 0xccb876b61e8b0ff4 expref: 30 pid: 96189 timeout: 2776182 lvb_type: 0 [2776204.596492] LustreError: 96282:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8eda292c6400 ns: mdt-fir-MDT0003_UUID lock: ffff8edd5b021f80/0x857bd8e4386afa4 lrc: 3/0,0 mode: PW/PW res: [0x280010292:0x341:0x0].0x0 bits 0x40/0x0 rrc: 61 type: IBT flags: 0x50200400000020 nid: 10.8.2.25@o2ib6 remote: 0xccb876b61e8b0ffb expref: 5 pid: 96282 timeout: 0 lvb_type: 0 [2776204.631026] LustreError: 96282:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 4 previous similar messages [2776297.649367] Lustre: fir-MDT0003: Client 3e4cef37-4508-c87a-8ca1-5992fb5ac7d8 (at 10.8.2.26@o2ib6) reconnecting [2776297.659545] Lustre: Skipped 228 previous similar messages [2776354.559719] LustreError: 95108:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 150s: evicting client at 10.8.2.26@o2ib6 ns: mdt-fir-MDT0003_UUID lock: ffff8ec1bf26d580/0x857bd8e4425421e lrc: 3/0,0 mode: PW/PW res: [0x280010292:0x341:0x0].0x0 bits 0x40/0x0 rrc: 49 type: IBT flags: 0x60200400000020 nid: 10.8.2.26@o2ib6 remote: 0x338e4779fa49a0ad expref: 30 pid: 95363 timeout: 2776332 lvb_type: 0 [2776354.597613] LustreError: 95344:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ec07b068400 ns: mdt-fir-MDT0003_UUID lock: ffff8ebbf3247740/0x857bd8e44254367 lrc: 5/0,0 mode: PW/PW res: [0x280010292:0x341:0x0].0x0 bits 0x40/0x0 rrc: 46 type: IBT flags: 0x50200400000020 nid: 10.8.2.26@o2ib6 remote: 0x338e4779fa49a0c2 expref: 12 pid: 95344 timeout: 0 lvb_type: 0 [2776354.632262] LustreError: 95344:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 1 previous similar message [2776389.786193] Lustre: fir-MDT0003: haven't heard from client 7d769bde-3cd1-e339-2607-177aa4aa40cf (at 10.9.101.6@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8eb58dc7fc00, cur 1554931554 expire 1554931404 last 1554931327 [2776520.788007] Lustre: fir-MDT0001: haven't heard from client eacb59ca-647f-dd3d-4d0d-fe167b61d004 (at 10.8.9.2@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ee36bcd8400, cur 1554931685 expire 1554931535 last 1554931458 [2776520.809709] Lustre: Skipped 2 previous similar messages [2776960.884953] Lustre: fir-MDT0001: Connection restored to 3387f135-17d5-9331-2fb1-15691a69b5a2 (at 10.8.22.14@o2ib6) [2776960.895482] Lustre: Skipped 32 previous similar messages [2777001.794121] Lustre: fir-MDT0001: haven't heard from client 43db767e-1175-1089-38dd-81d5c5f316da (at 10.8.27.23@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ec368707c00, cur 1554932166 expire 1554932016 last 1554931939 [2777001.816018] Lustre: Skipped 2 previous similar messages [2777735.802710] Lustre: fir-MDT0001: haven't heard from client 04c36c72-b4ce-d99a-b97d-bb0d21c053dc (at 10.8.27.23@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ed005bca000, cur 1554932900 expire 1554932750 last 1554932673 [2777735.824589] Lustre: Skipped 3 previous similar messages [2777747.762175] Lustre: fir-MDT0001: Connection restored to f324518e-874c-2e94-6825-98f2d3de708f (at 10.8.27.23@o2ib6) [2777747.772723] Lustre: Skipped 7 previous similar messages [2778383.636389] Lustre: fir-MDT0001: Connection restored to f324518e-874c-2e94-6825-98f2d3de708f (at 10.8.27.23@o2ib6) [2778383.646916] Lustre: Skipped 5 previous similar messages [2778391.815983] Lustre: fir-MDT0003: haven't heard from client 5a7ac3a2-9a33-8f4d-0bba-8552442e9a4c (at 10.9.101.23@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8eba74da4400, cur 1554933556 expire 1554933406 last 1554933329 [2778391.837951] Lustre: Skipped 3 previous similar messages [2779052.824045] Lustre: fir-MDT0001: Connection restored to 766c24dc-05fc-2c20-7902-bb94de21535e (at 10.8.10.29@o2ib6) [2779052.834569] Lustre: Skipped 3 previous similar messages [2779607.825224] Lustre: fir-MDT0001: haven't heard from client a113795d-26c3-ae3f-6fff-6a09176dbfb5 (at 10.8.10.29@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ecf19fe1400, cur 1554934772 expire 1554934622 last 1554934545 [2779607.847102] Lustre: Skipped 7 previous similar messages [2779767.996446] Lustre: fir-MDT0001: Connection restored to 766c24dc-05fc-2c20-7902-bb94de21535e (at 10.8.10.29@o2ib6) [2779768.006972] Lustre: Skipped 1 previous similar message [2779776.828507] Lustre: fir-MDT0001: haven't heard from client a7ac5e88-186b-7af2-35cc-0d64af89702d (at 10.9.104.52@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ee4b3fc7800, cur 1554934941 expire 1554934791 last 1554934714 [2779776.850467] Lustre: Skipped 1 previous similar message [2779919.419877] Lustre: 96287:0:(mdd_device.c:1794:mdd_changelog_clear()) fir-MDD0001: Failure to clear the changelog for user 1: -22 [2780043.241646] Lustre: 95537:0:(mdd_device.c:1794:mdd_changelog_clear()) fir-MDD0001: Failure to clear the changelog for user 1: -22 [2780161.831883] Lustre: fir-MDT0001: haven't heard from client 3abdd817-3f5f-e715-4a35-bb8529b9524d (at 10.9.101.39@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ef055c89000, cur 1554935326 expire 1554935176 last 1554935099 [2780161.853874] Lustre: Skipped 1 previous similar message [2780327.269997] Lustre: 96222:0:(mdd_device.c:1794:mdd_changelog_clear()) fir-MDD0001: Failure to clear the changelog for user 1: -22 [2780401.727657] Lustre: 95533:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554935558/real 1554935558] req@ffff8edc6a1af800 x1630438588387344/t0(0) o104->fir-MDT0001@10.9.101.28@o2ib4:15/16 lens 296/224 e 0 to 1 dl 1554935565 ref 1 fl Rpc:X/0/ffffffff rc 0/-1 [2780401.755192] Lustre: 95533:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 68 previous similar messages [2780409.777748] Lustre: 95902:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8edf1c013c00 x1630383433263184/t0(0) o101->7b2d8a4b-21d1-3f9e-8366-ba99121229d6@10.9.101.11@o2ib4:28/0 lens 480/568 e 1 to 0 dl 1554935578 ref 2 fl Interpret:/0/0 rc 0/0 [2780409.806872] Lustre: 95902:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 13 previous similar messages [2780415.778304] Lustre: fir-MDT0001: Client 7b2d8a4b-21d1-3f9e-8366-ba99121229d6 (at 10.9.101.11@o2ib4) reconnecting [2780415.788650] Lustre: Skipped 5 previous similar messages [2780415.794104] Lustre: fir-MDT0001: Connection restored to 7b2d8a4b-21d1-3f9e-8366-ba99121229d6 (at 10.9.101.11@o2ib4) [2780415.804740] Lustre: Skipped 2 previous similar messages [2780429.766010] LustreError: 95533:0:(ldlm_lockd.c:682:ldlm_handle_ast_error()) ### client (nid 10.9.101.28@o2ib4) failed to reply to blocking AST (req@ffff8edc6a1af800 x1630438588387344 status 0 rc -110), evict it ns: mdt-fir-MDT0001_UUID lock: ffff8ed1e202d100/0x857bd8e5783b3b0 lrc: 4/0,0 mode: PR/PR res: [0x24001260c:0xa:0x0].0x0 bits 0x40/0x0 rrc: 53 type: IBT flags: 0x60000400000020 nid: 10.9.101.28@o2ib4 remote: 0xd0946e936ea0c2a3 expref: 1880 pid: 96101 timeout: 2780430 lvb_type: 0 [2780429.809066] LustreError: 138-a: fir-MDT0001: A client on nid 10.9.101.28@o2ib4 was evicted due to a lock blocking callback time out: rc -110 [2780429.821866] LustreError: 95108:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 35s: evicting client at 10.9.101.28@o2ib4 ns: mdt-fir-MDT0001_UUID lock: ffff8ed1e202d100/0x857bd8e5783b3b0 lrc: 3/0,0 mode: PR/PR res: [0x24001260c:0xa:0x0].0x0 bits 0x40/0x0 rrc: 53 type: IBT flags: 0x60000400000020 nid: 10.9.101.28@o2ib4 remote: 0xd0946e936ea0c2a3 expref: 1881 pid: 96101 timeout: 0 lvb_type: 0 [2780469.845839] Lustre: fir-MDT0003: haven't heard from client 8f06907a-36b8-ba84-2a7c-5d8586764554 (at 10.9.107.65@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ee07bc13800, cur 1554935634 expire 1554935484 last 1554935407 [2780469.867817] Lustre: Skipped 7 previous similar messages [2780872.586194] LustreError: 96143:0:(mdt_io.c:442:mdt_preprw_write()) fir-MDT0001: BRW to missing obj [0x240010707:0x88:0x0] [2781514.255565] Lustre: fir-MDT0001: Connection restored to a7ac5e88-186b-7af2-35cc-0d64af89702d (at 10.9.104.52@o2ib4) [2781514.266180] Lustre: Skipped 7 previous similar messages [2781605.849294] Lustre: fir-MDT0001: haven't heard from client 73e4334b-e1a4-a4d7-284d-1cd9fb4562bc (at 10.9.114.5@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ed3dac96000, cur 1554936770 expire 1554936620 last 1554936543 [2781605.871165] Lustre: Skipped 2 previous similar messages [2782239.857015] Lustre: fir-MDT0001: haven't heard from client 4396a612-edbd-dc95-0827-14aad3fc5f61 (at 10.9.104.45@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ef07c49a800, cur 1554937404 expire 1554937254 last 1554937177 [2782239.878998] Lustre: Skipped 1 previous similar message [2782284.009544] Lustre: fir-MDT0001: Connection restored to 59184f70-fdbe-1430-fc35-de4e56cbaa44 (at 10.9.101.28@o2ib4) [2782284.020181] Lustre: Skipped 13 previous similar messages [2783529.448127] Lustre: fir-MDT0003: Client 39b21bb1-06bc-8c69-c9c3-5986c42b070c (at 10.8.0.65@o2ib6) reconnecting [2783529.458302] Lustre: Skipped 4 previous similar messages [2783529.463753] Lustre: fir-MDT0003: Connection restored to 39b21bb1-06bc-8c69-c9c3-5986c42b070c (at 10.8.0.65@o2ib6) [2783589.356862] Lustre: 96243:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554938746/real 1554938746] req@ffff8ed8c0be3f00 x1630438612876752/t0(0) o104->fir-MDT0001@10.8.0.65@o2ib6:15/16 lens 296/224 e 0 to 1 dl 1554938753 ref 1 fl Rpc:X/0/ffffffff rc 0/-1 [2783589.384223] Lustre: 96243:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 22 previous similar messages [2783597.407974] Lustre: 96282:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8ed9f694f800 x1629928196650096/t0(0) o101->d688f6d7-017e-8817-dbcd-6b016c3633d8@10.9.0.61@o2ib4:6/0 lens 1784/3288 e 1 to 0 dl 1554938766 ref 2 fl Interpret:/0/0 rc 0/0 [2783597.436994] Lustre: 96282:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 4 previous similar messages [2783603.394031] Lustre: 96243:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554938760/real 1554938760] req@ffff8ed8c0be3f00 x1630438612876752/t0(0) o104->fir-MDT0001@10.8.0.65@o2ib6:15/16 lens 296/224 e 0 to 1 dl 1554938767 ref 1 fl Rpc:X/2/ffffffff rc 0/-1 [2783603.409980] Lustre: fir-MDT0001: Client d688f6d7-017e-8817-dbcd-6b016c3633d8 (at 10.9.0.61@o2ib4) reconnecting [2783603.431562] Lustre: 96243:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 1 previous similar message [2783624.427795] Lustre: fir-MDT0001: Client d688f6d7-017e-8817-dbcd-6b016c3633d8 (at 10.9.0.61@o2ib4) reconnecting [2783624.438001] Lustre: fir-MDT0001: Connection restored to d688f6d7-017e-8817-dbcd-6b016c3633d8 (at 10.9.0.61@o2ib4) [2783624.442290] Lustre: 96243:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554938781/real 1554938781] req@ffff8ed8c0be3f00 x1630438612876752/t0(0) o104->fir-MDT0001@10.8.0.65@o2ib6:15/16 lens 296/224 e 0 to 1 dl 1554938788 ref 1 fl Rpc:X/2/ffffffff rc 0/-1 [2783624.442292] Lustre: 96243:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 2 previous similar messages [2783624.485734] Lustre: Skipped 1 previous similar message [2783659.442733] Lustre: 96243:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554938816/real 1554938816] req@ffff8ed8c0be3f00 x1630438612876752/t0(0) o104->fir-MDT0001@10.8.0.65@o2ib6:15/16 lens 296/224 e 0 to 1 dl 1554938823 ref 1 fl Rpc:X/2/ffffffff rc 0/-1 [2783659.470097] Lustre: 96243:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 4 previous similar messages [2783659.948771] Lustre: 96030:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-10), not sending early reply req@ffff8eddf26ffb00 x1629928196696816/t0(0) o101->d688f6d7-017e-8817-dbcd-6b016c3633d8@10.9.0.61@o2ib4:9/0 lens 576/3264 e 1 to 0 dl 1554938829 ref 2 fl Interpret:/0/0 rc 0/0 [2783659.977868] Lustre: 96030:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 1 previous similar message [2783666.503260] Lustre: fir-MDT0001: Client d688f6d7-017e-8817-dbcd-6b016c3633d8 (at 10.9.0.61@o2ib4) reconnecting [2783666.513458] Lustre: Skipped 1 previous similar message [2783676.985945] LustreError: 96314:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554938751, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ee01bae69c0/0x857bd8e6650ef4f lrc: 3/1,0 mode: --/PR res: [0x240010083:0xb082:0x0].0x0 bits 0x13/0x0 rrc: 17 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 96314 timeout: 0 lvb_type: 0 [2783677.025523] LustreError: 96314:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 10 previous similar messages [2783681.056996] Lustre: 96275:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-2), not sending early reply req@ffff8ed7b7f8f200 x1629928196720160/t0(0) o101->d688f6d7-017e-8817-dbcd-6b016c3633d8@10.9.0.61@o2ib4:0/0 lens 576/3264 e 1 to 0 dl 1554938850 ref 2 fl Interpret:/0/0 rc 0/0 [2783720.130992] LustreError: 137-5: fir-MDT0002_UUID: not available for connect from 10.8.0.65@o2ib6 (no target). If you are running an HA pair check that the target is mounted on the other server. [2783720.148382] LustreError: Skipped 25 previous similar messages [2783727.617262] LustreError: 137-5: fir-MDT0000_UUID: not available for connect from 10.8.0.65@o2ib6 (no target). If you are running an HA pair check that the target is mounted on the other server. [2783752.705955] Lustre: fir-MDT0001: Client 39b21bb1-06bc-8c69-c9c3-5986c42b070c (at 10.8.0.65@o2ib6) reconnecting [2783752.716132] Lustre: Skipped 4 previous similar messages [2783902.311479] Lustre: fir-MDT0001: Connection restored to 4396a612-edbd-dc95-0827-14aad3fc5f61 (at 10.9.104.45@o2ib4) [2783902.322106] Lustre: Skipped 8 previous similar messages [2783950.655248] Lustre: 96304:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554939107/real 1554939107] req@ffff8eb0a6246f00 x1630438615319152/t0(0) o104->fir-MDT0001@10.9.101.1@o2ib4:15/16 lens 296/224 e 0 to 1 dl 1554939114 ref 1 fl Rpc:X/0/ffffffff rc 0/-1 [2783950.682713] Lustre: 96304:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 8 previous similar messages [2783958.336333] Lustre: 95597:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8ed16cc9b900 x1628600964562480/t0(0) o101->4215d6e6-d370-4df7-a891-0157cd0f211c@10.9.101.22@o2ib4:7/0 lens 576/3264 e 1 to 0 dl 1554939127 ref 2 fl Interpret:/0/0 rc 0/0 [2783964.719283] Lustre: fir-MDT0001: Client 4215d6e6-d370-4df7-a891-0157cd0f211c (at 10.9.101.22@o2ib4) reconnecting [2783978.692607] LustreError: 96304:0:(ldlm_lockd.c:682:ldlm_handle_ast_error()) ### client (nid 10.9.101.1@o2ib4) failed to reply to blocking AST (req@ffff8eb0a6246f00 x1630438615319152 status 0 rc -110), evict it ns: mdt-fir-MDT0001_UUID lock: ffff8ed38978e9c0/0x857bd8e58da9094 lrc: 4/0,0 mode: PR/PR res: [0x240006028:0x16432:0x0].0x0 bits 0x13/0x0 rrc: 42 type: IBT flags: 0x60200400000020 nid: 10.9.101.1@o2ib4 remote: 0xa12569eefb281665 expref: 1867 pid: 95359 timeout: 2783979 lvb_type: 0 [2783978.735835] LustreError: 138-a: fir-MDT0001: A client on nid 10.9.101.1@o2ib4 was evicted due to a lock blocking callback time out: rc -110 [2783978.748575] LustreError: 95108:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 35s: evicting client at 10.9.101.1@o2ib4 ns: mdt-fir-MDT0001_UUID lock: ffff8ed38978e9c0/0x857bd8e58da9094 lrc: 3/0,0 mode: PR/PR res: [0x240006028:0x16432:0x0].0x0 bits 0x13/0x0 rrc: 42 type: IBT flags: 0x60200400000020 nid: 10.9.101.1@o2ib4 remote: 0xa12569eefb281665 expref: 1868 pid: 95359 timeout: 0 lvb_type: 0 [2784034.395253] Lustre: 95931:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ebdaee50f00 x1629292131523648/t0(0) o101->9a39e07b-d2eb-1e26-18c2-91ada219e613@10.8.7.7@o2ib6:23/0 lens 480/568 e 0 to 0 dl 1554939203 ref 2 fl Interpret:/0/0 rc 0/0 [2784034.424171] Lustre: 95931:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 17 previous similar messages [2784038.969485] LustreError: 137-5: fir-MDT0002_UUID: not available for connect from 10.8.0.65@o2ib6 (no target). If you are running an HA pair check that the target is mounted on the other server. [2784043.889930] Lustre: fir-MDT0003: haven't heard from client 3936a9aa-9a26-d302-25ed-f408e6cd8b5f (at 10.9.101.1@o2ib4) in 194 seconds. I think it's dead, and I am evicting it. exp ffff8eb6f3dc8000, cur 1554939208 expire 1554939058 last 1554939014 [2784043.911810] Lustre: Skipped 3 previous similar messages [2784052.050894] LNetError: 94916:0:(lib-msg.c:811:lnet_is_health_check()) Msg is in inconsistent state, don't perform health checking (-125, 0) [2784137.559508] LustreError: 95945:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554939211, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ebd90b41440/0x857bd8e68183843 lrc: 3/0,1 mode: --/CW res: [0x240003366:0xf85:0x0].0x0 bits 0x2/0x0 rrc: 6 type: IBT flags: 0x40210400000020 nid: local remote: 0x0 expref: -99 pid: 95945 timeout: 0 lvb_type: 0 [2784137.598832] LustreError: 95945:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 3 previous similar messages [2784146.959632] Lustre: 95401:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554939301/real 1554939301] req@ffff8ee25465c500 x1630438616398144/t0(0) o104->fir-MDT0001@10.8.0.65@o2ib6:15/16 lens 296/224 e 0 to 1 dl 1554939311 ref 1 fl Rpc:X/0/ffffffff rc 0/-1 [2784146.986967] Lustre: 95401:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 12 previous similar messages [2784176.998014] LustreError: 95401:0:(ldlm_lockd.c:682:ldlm_handle_ast_error()) ### client (nid 10.8.0.65@o2ib6) failed to reply to blocking AST (req@ffff8ee25465c500 x1630438616398144 status 0 rc -110), evict it ns: mdt-fir-MDT0001_UUID lock: ffff8eb4f60ff080/0x857bd8e64b3c14a lrc: 4/0,0 mode: PR/PR res: [0x24000f611:0xecd1:0x0].0x0 bits 0x13/0x0 rrc: 4 type: IBT flags: 0x60200400000020 nid: 10.8.0.65@o2ib6 remote: 0x249bbee8e2e29bef expref: 120375 pid: 95124 timeout: 2784174 lvb_type: 0 [2784177.041065] LustreError: 138-a: fir-MDT0001: A client on nid 10.8.0.65@o2ib6 was evicted due to a lock blocking callback time out: rc -110 [2784177.053706] LustreError: 95108:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 40s: evicting client at 10.8.0.65@o2ib6 ns: mdt-fir-MDT0001_UUID lock: ffff8eb4f60ff080/0x857bd8e64b3c14a lrc: 3/0,0 mode: PR/PR res: [0x24000f611:0xecd1:0x0].0x0 bits 0x13/0x0 rrc: 4 type: IBT flags: 0x60200400000020 nid: 10.8.0.65@o2ib6 remote: 0x249bbee8e2e29bef expref: 120376 pid: 95124 timeout: 0 lvb_type: 0 [2784177.091490] LustreError: 96030:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8ee65c2ec000 ns: mdt-fir-MDT0001_UUID lock: ffff8ef0560dd580/0x857bd8e681b7b11 lrc: 3/0,0 mode: PR/PR res: [0x240003366:0xf85:0x0].0x0 bits 0x13/0x0 rrc: 3 type: IBT flags: 0x50200000000000 nid: 10.8.0.65@o2ib6 remote: 0x249bbee8e33ae6e8 expref: 120302 pid: 96030 timeout: 0 lvb_type: 0 [2784177.126370] LustreError: 96030:0:(ldlm_lockd.c:1357:ldlm_handle_enqueue0()) Skipped 1 previous similar message [2784177.136575] Lustre: 96030:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (30:96s); client may timeout. req@ffff8ed476f07800 x1629179583860768/t0(0) o101->39b21bb1-06bc-8c69-c9c3-5986c42b070c@10.8.0.65@o2ib6:5/0 lens 584/536 e 0 to 0 dl 1554939245 ref 1 fl Complete:/0/0 rc -107/-107 [2784177.165382] Lustre: 96030:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 8 previous similar messages [2784208.395893] LustreError: 95097:0:(ldlm_lockd.c:2322:ldlm_cancel_handler()) ldlm_cancel from 10.8.0.65@o2ib6 arrived at 1554939372 with bad export cookie 601157486662820825 [2784208.411363] LustreError: 95097:0:(ldlm_lockd.c:2322:ldlm_cancel_handler()) Skipped 3 previous similar messages [2784208.906419] LustreError: 96035:0:(ldlm_lockd.c:2322:ldlm_cancel_handler()) ldlm_cancel from 10.8.0.65@o2ib6 arrived at 1554939373 with bad export cookie 601157486662820825 [2784208.921898] LustreError: 96035:0:(ldlm_lockd.c:2322:ldlm_cancel_handler()) Skipped 7 previous similar messages [2784209.974976] LustreError: 96828:0:(ldlm_lockd.c:2322:ldlm_cancel_handler()) ldlm_cancel from 10.8.0.65@o2ib6 arrived at 1554939374 with bad export cookie 601157486662820825 [2784209.990434] LustreError: 96828:0:(ldlm_lockd.c:2322:ldlm_cancel_handler()) Skipped 19 previous similar messages [2784212.062027] LustreError: 95096:0:(ldlm_lockd.c:2322:ldlm_cancel_handler()) ldlm_cancel from 10.8.0.65@o2ib6 arrived at 1554939376 with bad export cookie 601157486662820825 [2784212.077505] LustreError: 95096:0:(ldlm_lockd.c:2322:ldlm_cancel_handler()) Skipped 31 previous similar messages [2784216.089025] LustreError: 96035:0:(ldlm_lockd.c:2322:ldlm_cancel_handler()) ldlm_cancel from 10.8.0.65@o2ib6 arrived at 1554939380 with bad export cookie 601157486662820825 [2784216.104500] LustreError: 96035:0:(ldlm_lockd.c:2322:ldlm_cancel_handler()) Skipped 77 previous similar messages [2784224.108274] LustreError: 95096:0:(ldlm_lockd.c:2322:ldlm_cancel_handler()) ldlm_cancel from 10.8.0.65@o2ib6 arrived at 1554939388 with bad export cookie 601157486662820825 [2784224.123733] LustreError: 95096:0:(ldlm_lockd.c:2322:ldlm_cancel_handler()) Skipped 227 previous similar messages [2784240.130341] LustreError: 95097:0:(ldlm_lockd.c:2322:ldlm_cancel_handler()) ldlm_cancel from 10.8.0.65@o2ib6 arrived at 1554939404 with bad export cookie 601157486662820825 [2784240.145819] LustreError: 95097:0:(ldlm_lockd.c:2322:ldlm_cancel_handler()) Skipped 530 previous similar messages [2784272.138810] LustreError: 95096:0:(ldlm_lockd.c:2322:ldlm_cancel_handler()) ldlm_cancel from 10.8.0.65@o2ib6 arrived at 1554939436 with bad export cookie 601157486662820825 [2784272.154286] LustreError: 95096:0:(ldlm_lockd.c:2322:ldlm_cancel_handler()) Skipped 1069 previous similar messages [2784322.742440] Lustre: fir-MDT0003: Client 39b21bb1-06bc-8c69-c9c3-5986c42b070c (at 10.8.0.65@o2ib6) reconnecting [2784322.752615] Lustre: Skipped 26 previous similar messages [2784322.758136] Lustre: fir-MDT0003: Connection restored to 39b21bb1-06bc-8c69-c9c3-5986c42b070c (at 10.8.0.65@o2ib6) [2784322.768585] Lustre: Skipped 27 previous similar messages [2784336.151460] LustreError: 96828:0:(ldlm_lockd.c:2322:ldlm_cancel_handler()) ldlm_cancel from 10.8.0.65@o2ib6 arrived at 1554939500 with bad export cookie 601157486662820825 [2784336.166926] LustreError: 96828:0:(ldlm_lockd.c:2322:ldlm_cancel_handler()) Skipped 4886 previous similar messages [2784451.684821] LustreError: 137-5: fir-MDT0000_UUID: not available for connect from 10.8.0.65@o2ib6 (no target). If you are running an HA pair check that the target is mounted on the other server. [2784451.702191] LustreError: Skipped 1 previous similar message [2784479.322027] LustreError: 137-5: fir-MDT0002_UUID: not available for connect from 10.8.0.65@o2ib6 (no target). If you are running an HA pair check that the target is mounted on the other server. [2784549.153497] Lustre: 95899:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8eb6be2c8900 x1628644301592192/t0(0) o101->f8614979-4372-eafb-d847-ef6d85cec60d@10.9.106.57@o2ib4:28/0 lens 480/568 e 0 to 0 dl 1554939718 ref 2 fl Interpret:/0/0 rc 0/0 [2784549.182657] Lustre: 95899:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 5 previous similar messages [2784616.204036] LustreError: 137-5: fir-MDT0002_UUID: not available for connect from 10.8.0.65@o2ib6 (no target). If you are running an HA pair check that the target is mounted on the other server. [2784616.221411] LustreError: Skipped 1 previous similar message [2784763.888088] Lustre: fir-MDT0001: haven't heard from client 445cb3a8-ea98-c57a-69c4-8c31c4fef696 (at 10.8.10.29@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ec7c3b5b800, cur 1554939928 expire 1554939778 last 1554939701 [2784763.909964] Lustre: Skipped 4 previous similar messages [2784776.368344] LustreError: 137-5: fir-MDT0002_UUID: not available for connect from 10.8.0.65@o2ib6 (no target). If you are running an HA pair check that the target is mounted on the other server. [2785493.899315] Lustre: fir-MDT0001: haven't heard from client 12994599-ad9c-a9cf-bbe3-80e2e4fd455c (at 10.9.106.61@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ee449aef000, cur 1554940658 expire 1554940508 last 1554940431 [2785493.921297] Lustre: Skipped 1 previous similar message [2785748.648908] Lustre: fir-MDT0001: Connection restored to c400b380-b490-5a7d-2708-06c02b267645 (at 10.9.101.2@o2ib4) [2785748.659434] Lustre: Skipped 17 previous similar messages [2785975.903344] Lustre: fir-MDT0003: haven't heard from client 6644986b-6ded-2fb4-0b62-220707003113 (at 10.8.9.8@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ee07e451c00, cur 1554941140 expire 1554940990 last 1554940913 [2785975.925052] Lustre: Skipped 3 previous similar messages [2785988.902170] Lustre: fir-MDT0001: haven't heard from client 6644986b-6ded-2fb4-0b62-220707003113 (at 10.8.9.8@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ed97f083c00, cur 1554941153 expire 1554941003 last 1554940926 [2786091.992442] Lustre: 95385:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8ec404ba6c00 x1628554767880304/t0(0) o36->4941db80-d61d-3570-a7d5-2e5e8b9d41b4@10.9.101.55@o2ib4:11/0 lens 528/2888 e 1 to 0 dl 1554941261 ref 2 fl Interpret:/0/0 rc 0/0 [2786092.021534] Lustre: 95385:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 1 previous similar message [2786098.015699] Lustre: fir-MDT0003: Client 4941db80-d61d-3570-a7d5-2e5e8b9d41b4 (at 10.9.101.55@o2ib4) reconnecting [2786098.026045] Lustre: Skipped 11 previous similar messages [2786098.031561] Lustre: fir-MDT0003: Connection restored to 4941db80-d61d-3570-a7d5-2e5e8b9d41b4 (at 10.9.101.55@o2ib4) [2786098.042164] Lustre: Skipped 4 previous similar messages [2786295.906100] Lustre: fir-MDT0001: haven't heard from client c65ff351-a3e9-9dcc-ef16-10d61a063f1a (at 10.8.16.4@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ee480b4ac00, cur 1554941460 expire 1554941310 last 1554941233 [2786483.908529] Lustre: fir-MDT0001: haven't heard from client f0d2ac09-a0c3-12ba-906e-832d6c09b0ea (at 10.8.22.28@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ee2bf8a8800, cur 1554941648 expire 1554941498 last 1554941421 [2786483.930410] Lustre: Skipped 25 previous similar messages [2786559.909430] Lustre: fir-MDT0001: haven't heard from client 404b3112-4942-66cc-d3d2-d2daa56b1359 (at 10.8.15.4@o2ib6) in 151 seconds. I think it's dead, and I am evicting it. exp ffff8ee9c5230000, cur 1554941724 expire 1554941574 last 1554941573 [2786559.931222] Lustre: Skipped 9 previous similar messages [2786635.910355] Lustre: fir-MDT0001: haven't heard from client d16d3de7-a181-7578-6b96-a5a55a343c96 (at 10.8.28.12@o2ib6) in 196 seconds. I think it's dead, and I am evicting it. exp ffff8ed64f66f400, cur 1554941800 expire 1554941650 last 1554941604 [2786635.932250] Lustre: Skipped 1 previous similar message [2786807.618574] Lustre: fir-MDT0001: Connection restored to 922fcc09-f6de-aac7-6b5a-edebfa27d078 (at 10.9.108.5@o2ib4) [2786807.629090] Lustre: Skipped 3 previous similar messages [2786860.877031] Lustre: 95359:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554942017/real 1554942017] req@ffff8ed30820aa00 x1630438632942096/t0(0) o106->fir-MDT0001@10.9.101.3@o2ib4:15/16 lens 296/280 e 0 to 1 dl 1554942024 ref 1 fl Rpc:X/0/ffffffff rc 0/-1 [2786860.904452] Lustre: 95359:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 3 previous similar messages [2786867.924123] Lustre: 95351:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8edaf3b33000 x1628752922845968/t0(0) o101->d4014899-b33d-ac76-e84c-04c52d0b75a9@10.9.101.21@o2ib4:7/0 lens 480/568 e 1 to 0 dl 1554942037 ref 2 fl Interpret:/0/0 rc 0/0 [2786867.953134] Lustre: 95351:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 2 previous similar messages [2786874.717357] Lustre: fir-MDT0001: Client d4014899-b33d-ac76-e84c-04c52d0b75a9 (at 10.9.101.21@o2ib4) reconnecting [2786874.727709] Lustre: Skipped 2 previous similar messages [2786876.438216] Lustre: 96022:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8ec51b7f2400 x1630350194967856/t0(0) o101->c927c3a4-10d5-5d74-f8fb-42ec7ee8b67c@10.9.101.43@o2ib4:15/0 lens 480/568 e 1 to 0 dl 1554942045 ref 2 fl Interpret:/0/0 rc 0/0 [2786876.467311] Lustre: 96022:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 2 previous similar messages [2786886.264157] Lustre: fir-MDT0001: Client ba312893-3b9e-ec41-cd1b-c94457862133 (at 10.9.101.16@o2ib4) reconnecting [2786886.274506] Lustre: Skipped 4 previous similar messages [2786886.604338] Lustre: 95996:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8eca1c69aa00 x1628638509909776/t0(0) o101->341f5684-4263-80ac-3a8b-2cbfc9878a0a@10.9.108.42@o2ib4:25/0 lens 480/568 e 0 to 0 dl 1554942055 ref 2 fl Interpret:/0/0 rc 0/0 [2786886.633522] Lustre: 95996:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 4 previous similar messages [2786889.504397] LustreError: 96002:0:(ldlm_lockd.c:682:ldlm_handle_ast_error()) ### client (nid 10.9.101.3@o2ib4) failed to reply to blocking AST (req@ffff8ec1696e3300 x1630438632943664 status 0 rc -110), evict it ns: mdt-fir-MDT0001_UUID lock: ffff8ed923eba400/0x857bd8e74fb1ca1 lrc: 4/0,0 mode: PR/PR res: [0x2400125da:0x2:0x0].0x0 bits 0x40/0x0 rrc: 47 type: IBT flags: 0x60000400000020 nid: 10.9.101.3@o2ib4 remote: 0x3efddecc146eb9f9 expref: 1934 pid: 95918 timeout: 2786889 lvb_type: 0 [2786889.547266] LustreError: 138-a: fir-MDT0001: A client on nid 10.9.101.3@o2ib4 was evicted due to a lock blocking callback time out: rc -110 [2786889.559996] LustreError: 95108:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 35s: evicting client at 10.9.101.3@o2ib4 ns: mdt-fir-MDT0001_UUID lock: ffff8ed923eba400/0x857bd8e74fb1ca1 lrc: 3/0,0 mode: PR/PR res: [0x2400125da:0x2:0x0].0x0 bits 0x40/0x0 rrc: 47 type: IBT flags: 0x60000400000020 nid: 10.9.101.3@o2ib4 remote: 0x3efddecc146eb9f9 expref: 1935 pid: 95918 timeout: 0 lvb_type: 0 [2786889.598184] Lustre: 96030:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (30:1s); client may timeout. req@ffff8edc49e98300 x1630455516820640/t0(0) o101->1e6f74c6-3496-d691-5a66-fdc2823a3b7a@10.9.101.37@o2ib4:22/0 lens 480/536 e 0 to 0 dl 1554942052 ref 1 fl Complete:/0/0 rc 301/301 [2786889.913394] Lustre: fir-MDT0001: haven't heard from client be32f736-8681-f0da-6eae-ec0a11160081 (at 10.8.13.27@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ee3b8b6cc00, cur 1554942054 expire 1554941904 last 1554941827 [2786889.935274] Lustre: Skipped 1 previous similar message [2787058.917536] Lustre: fir-MDT0003: haven't heard from client 6ffc2e8b-b062-5b88-97a9-a61177559c11 (at 10.9.101.3@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ee0553edc00, cur 1554942223 expire 1554942073 last 1554941996 [2787058.939414] Lustre: Skipped 3 previous similar messages [2787570.796517] Lustre: fir-MDT0001: Connection restored to (at 10.9.115.6@o2ib4) [2787570.803921] Lustre: Skipped 9 previous similar messages [2787714.923806] Lustre: fir-MDT0001: haven't heard from client 09ccae74-da49-3ed9-15f5-3662d8bb22c0 (at 10.9.103.5@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8eefc87c5c00, cur 1554942879 expire 1554942729 last 1554942652 [2788180.793765] Lustre: fir-MDT0001: Connection restored to 6ca6652e-0187-99bc-e850-704338e05c6f (at 10.9.114.5@o2ib4) [2788180.804292] Lustre: Skipped 27 previous similar messages [2788404.538981] Lustre: 95899:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554943561/real 1554943561] req@ffff8ebfe38d6600 x1630438643660032/t0(0) o104->fir-MDT0003@10.9.103.9@o2ib4:15/16 lens 296/224 e 0 to 1 dl 1554943568 ref 1 fl Rpc:X/0/ffffffff rc 0/-1 [2788404.566409] Lustre: 95899:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 51 previous similar messages [2788411.577072] Lustre: 95899:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554943568/real 1554943568] req@ffff8ebfe38d6600 x1630438643660032/t0(0) o104->fir-MDT0003@10.9.103.9@o2ib4:15/16 lens 296/224 e 0 to 1 dl 1554943575 ref 1 fl Rpc:X/2/ffffffff rc 0/-1 [2788411.604496] Lustre: 95899:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 3 previous similar messages [2788412.578074] Lustre: 95925:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8ecdbba7aa00 x1629179824157392/t0(0) o36->39b21bb1-06bc-8c69-c9c3-5986c42b070c@10.8.0.65@o2ib6:21/0 lens 528/2888 e 1 to 0 dl 1554943581 ref 2 fl Interpret:/0/0 rc 0/0 [2788412.606969] Lustre: 95925:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 2 previous similar messages [2788418.609049] Lustre: fir-MDT0003: Client 39b21bb1-06bc-8c69-c9c3-5986c42b070c (at 10.8.0.65@o2ib6) reconnecting [2788422.879210] Lustre: 96091:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8ec42cbd0900 x1629293817294896/t0(0) o36->96b650c1-a3a9-82fa-55ba-34c48caa51d0@10.9.107.17@o2ib4:1/0 lens 544/2888 e 1 to 0 dl 1554943591 ref 2 fl Interpret:/0/0 rc 0/0 [2788422.908211] Lustre: 96091:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 1 previous similar message [2788425.614240] Lustre: 95899:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554943582/real 1554943582] req@ffff8ebfe38d6600 x1630438643660032/t0(0) o104->fir-MDT0003@10.9.103.9@o2ib4:15/16 lens 296/224 e 0 to 1 dl 1554943589 ref 1 fl Rpc:X/2/ffffffff rc 0/-1 [2788425.641668] Lustre: 95899:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 7 previous similar messages [2788428.874679] Lustre: fir-MDT0001: Client 96b650c1-a3a9-82fa-55ba-34c48caa51d0 (at 10.9.107.17@o2ib4) reconnecting [2788439.618972] Lustre: fir-MDT0003: Client 39b21bb1-06bc-8c69-c9c3-5986c42b070c (at 10.8.0.65@o2ib6) reconnecting [2788439.629147] Lustre: Skipped 1 previous similar message [2788441.527435] Lustre: 95946:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8edbc431e600 x1628641681225264/t0(0) o36->7ad56f83-0cd6-077b-3bd8-38487ece975e@10.9.107.18@o2ib4:20/0 lens 544/2888 e 0 to 0 dl 1554943610 ref 2 fl Interpret:/0/0 rc 0/0 [2788441.556600] Lustre: 95946:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 1 previous similar message [2788446.651497] Lustre: 95899:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554943603/real 1554943603] req@ffff8ebfe38d6600 x1630438643660032/t0(0) o104->fir-MDT0003@10.9.103.9@o2ib4:15/16 lens 296/224 e 0 to 1 dl 1554943610 ref 1 fl Rpc:X/2/ffffffff rc 0/-1 [2788446.678941] Lustre: 95899:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 11 previous similar messages [2788449.947506] Lustre: fir-MDT0001: Client 96b650c1-a3a9-82fa-55ba-34c48caa51d0 (at 10.9.107.17@o2ib4) reconnecting [2788449.957861] Lustre: Skipped 1 previous similar message [2788457.144625] Lustre: 95537:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8eb7bf97f800 x1630290286910928/t0(0) o36->712f8bd7-e015-5753-6533-6ad5fd7bc4e5@10.9.101.24@o2ib4:6/0 lens 528/2888 e 1 to 0 dl 1554943626 ref 2 fl Interpret:/0/0 rc 0/0 [2788471.019715] Lustre: fir-MDT0001: Client 96b650c1-a3a9-82fa-55ba-34c48caa51d0 (at 10.9.107.17@o2ib4) reconnecting [2788471.030065] Lustre: Skipped 4 previous similar messages [2788481.688928] Lustre: 95899:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554943638/real 1554943638] req@ffff8ebfe38d6600 x1630438643660032/t0(0) o104->fir-MDT0003@10.9.103.9@o2ib4:15/16 lens 296/224 e 0 to 1 dl 1554943645 ref 1 fl Rpc:X/2/ffffffff rc 0/-1 [2788481.716383] Lustre: 95899:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 19 previous similar messages [2788487.936012] LustreError: 96005:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554943562, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0003_UUID lock: ffff8eda9a7c0480/0x857bd8e7ed582a2 lrc: 3/1,0 mode: --/PR res: [0x28000f8d0:0x8542:0x0].0x0 bits 0x13/0x0 rrc: 8 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 96005 timeout: 0 lvb_type: 0 [2788487.975490] LustreError: 96005:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 1 previous similar message [2788491.754057] Lustre: 96091:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ece4b3d8c00 x1628752929742720/t0(0) o36->d4014899-b33d-ac76-e84c-04c52d0b75a9@10.9.101.21@o2ib4:10/0 lens 528/2888 e 0 to 0 dl 1554943660 ref 2 fl Interpret:/0/0 rc 0/0 [2788491.783248] Lustre: 96091:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 1 previous similar message [2788497.824124] Lustre: fir-MDT0000-osp-MDT0001: Connection to fir-MDT0000 (at 10.0.10.51@o2ib7) was lost; in progress operations using this service will wait for recovery to complete [2788497.840319] LustreError: 95593:0:(ldlm_request.c:147:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554943571, 90s ago), entering recovery for fir-MDT0000_UUID@10.0.10.51@o2ib7 ns: fir-MDT0000-osp-MDT0001 lock: ffff8ecff0e83600/0x857bd8e7ee6aeef lrc: 4/0,1 mode: --/EX res: [0x200000004:0x1:0x0].0x0 bits 0x2/0x0 rrc: 4 type: IBT flags: 0x1000001000000 nid: local remote: 0x53d0f215b4280936 expref: -99 pid: 95593 timeout: 0 lvb_type: 0 [2788497.933965] Lustre: fir-MDT0001: haven't heard from client 4761d91d-2dfe-5702-4fe8-b846b6ec7501 (at 10.9.103.25@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ee35c025400, cur 1554943662 expire 1554943512 last 1554943435 [2788497.955928] Lustre: Skipped 5 previous similar messages [2788499.804151] Lustre: fir-MDT0000-osp-MDT0003: Connection to fir-MDT0000 (at 10.0.10.51@o2ib7) was lost; in progress operations using this service will wait for recovery to complete [2788499.820346] LustreError: 95575:0:(ldlm_request.c:147:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554943573, 90s ago), entering recovery for fir-MDT0000_UUID@10.0.10.51@o2ib7 ns: fir-MDT0000-osp-MDT0003 lock: ffff8ec2ca7f5580/0x857bd8e7ee9f27a lrc: 4/0,1 mode: --/EX res: [0x200000004:0x1:0x0].0x0 bits 0x2/0x0 rrc: 8 type: IBT flags: 0x1000001000000 nid: local remote: 0x53d0f215b42fa1e6 expref: -99 pid: 95575 timeout: 0 lvb_type: 0 [2788506.389241] LustreError: 96279:0:(ldlm_request.c:147:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554943580, 90s ago), entering recovery for fir-MDT0000_UUID@10.0.10.51@o2ib7 ns: fir-MDT0000-osp-MDT0001 lock: ffff8ecdd96c5c40/0x857bd8e7ef475f3 lrc: 4/0,1 mode: --/EX res: [0x200000004:0x1:0x0].0x0 bits 0x2/0x0 rrc: 4 type: IBT flags: 0x1000001000000 nid: local remote: 0x53d0f215b44b3ae5 expref: -99 pid: 96279 timeout: 0 lvb_type: 0 [2788509.560059] Lustre: fir-MDT0001: Client 7ad56f83-0cd6-077b-3bd8-38487ece975e (at 10.9.107.18@o2ib4) reconnecting [2788509.570426] Lustre: Skipped 10 previous similar messages [2788523.308441] LustreError: 95860:0:(ldlm_request.c:147:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554943597, 90s ago), entering recovery for fir-MDT0000_UUID@10.0.10.51@o2ib7 ns: fir-MDT0000-osp-MDT0001 lock: ffff8ec15b743840/0x857bd8e7f0fda80 lrc: 4/0,1 mode: --/EX res: [0x200000004:0x1:0x0].0x0 bits 0x2/0x0 rrc: 8 type: IBT flags: 0x1000001000000 nid: local remote: 0x53d0f215b49042d5 expref: -99 pid: 95860 timeout: 0 lvb_type: 0 [2788532.078552] LustreError: 96310:0:(ldlm_request.c:147:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554943606, 90s ago), entering recovery for fir-MDT0000_UUID@10.0.10.51@o2ib7 ns: fir-MDT0000-osp-MDT0003 lock: ffff8ec4405e45c0/0x857bd8e7f1de340 lrc: 4/0,1 mode: --/EX res: [0x200000004:0x1:0x0].0x0 bits 0x2/0x0 rrc: 8 type: IBT flags: 0x1000001000000 nid: local remote: 0x53d0f215b4b4572b expref: -99 pid: 96310 timeout: 0 lvb_type: 0 [2788532.144553] Lustre: 96323:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ee6d959f800 x1630408058466752/t0(0) o36->5a7f137e-7364-babe-5e04-cab73eaf6cfb@10.9.103.34@o2ib4:21/0 lens 560/2888 e 0 to 0 dl 1554943701 ref 2 fl Interpret:/0/0 rc 0/0 [2788532.173733] Lustre: 96323:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 4 previous similar messages [2788548.230743] LustreError: 95991:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554943622, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0003_UUID lock: ffff8ed040669200/0x857bd8e7f423959 lrc: 3/1,0 mode: --/PR res: [0x28000f8d0:0x8542:0x0].0x0 bits 0x13/0x0 rrc: 10 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 95991 timeout: 0 lvb_type: 0 [2788551.728792] Lustre: 95899:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554943708/real 1554943708] req@ffff8ebfe38d6600 x1630438643660032/t0(0) o104->fir-MDT0003@10.9.103.9@o2ib4:15/16 lens 296/224 e 0 to 1 dl 1554943715 ref 1 fl Rpc:X/2/ffffffff rc 0/-1 [2788551.756217] Lustre: 95899:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 39 previous similar messages [2788551.766280] LustreError: 95899:0:(ldlm_lockd.c:682:ldlm_handle_ast_error()) ### client (nid 10.9.103.9@o2ib4) failed to reply to blocking AST (req@ffff8ebfe38d6600 x1630438643660032 status 0 rc -110), evict it ns: mdt-fir-MDT0003_UUID lock: ffff8ee058a33cc0/0x857bd8e7e7cc182 lrc: 4/0,0 mode: CR/CR res: [0x28000f8d0:0x85e2:0x0].0x0 bits 0x9/0x0 rrc: 7 type: IBT flags: 0x60200400000020 nid: 10.9.103.9@o2ib4 remote: 0xe92ab4debb920175 expref: 92 pid: 96047 timeout: 2788672 lvb_type: 0 [2788551.809084] LustreError: 138-a: fir-MDT0003: A client on nid 10.9.103.9@o2ib4 was evicted due to a lock blocking callback time out: rc -110 [2788551.821800] LustreError: 95108:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 154s: evicting client at 10.9.103.9@o2ib4 ns: mdt-fir-MDT0003_UUID lock: ffff8ee058a33cc0/0x857bd8e7e7cc182 lrc: 3/0,0 mode: CR/CR res: [0x28000f8d0:0x85e2:0x0].0x0 bits 0x9/0x0 rrc: 8 type: IBT flags: 0x60200400000020 nid: 10.9.103.9@o2ib4 remote: 0xe92ab4debb920175 expref: 93 pid: 96047 timeout: 0 lvb_type: 0 [2788795.182327] Lustre: fir-MDT0001: Connection restored to (at 10.8.28.12@o2ib6) [2788795.189735] Lustre: Skipped 60 previous similar messages [2789121.877912] Lustre: 96254:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554944278/real 1554944278] req@ffff8ed9dd747200 x1630438648868432/t0(0) o106->fir-MDT0001@10.9.115.3@o2ib4:15/16 lens 296/280 e 0 to 1 dl 1554944285 ref 1 fl Rpc:X/0/ffffffff rc 0/-1 [2789121.905340] Lustre: 96254:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 3 previous similar messages [2789129.920012] Lustre: 95946:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8edeef7f0f00 x1629179828899344/t0(0) o101->39b21bb1-06bc-8c69-c9c3-5986c42b070c@10.8.0.65@o2ib6:18/0 lens 480/568 e 1 to 0 dl 1554944298 ref 2 fl Interpret:/0/0 rc 0/0 [2789129.948938] Lustre: 95946:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 2 previous similar messages [2789136.160469] Lustre: fir-MDT0001: Client 39b21bb1-06bc-8c69-c9c3-5986c42b070c (at 10.8.0.65@o2ib6) reconnecting [2789136.170647] Lustre: Skipped 19 previous similar messages [2789225.943398] Lustre: fir-MDT0003: haven't heard from client a5c8a438-2f61-6c1a-7178-b80e1ca94d85 (at 10.9.115.3@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ebe1d945000, cur 1554944390 expire 1554944240 last 1554944163 [2789225.965271] Lustre: Skipped 2 previous similar messages [2789226.007326] LustreError: 96101:0:(client.c:1175:ptlrpc_import_delay_req()) @@@ IMP_CLOSED req@ffff8ec334364800 x1630438649563040/t0(0) o104->fir-MDT0001@10.9.115.3@o2ib4:15/16 lens 296/224 e 0 to 0 dl 0 ref 1 fl Rpc:/0/ffffffff rc 0/-1 [2789445.565885] Lustre: fir-MDT0001: Connection restored to 09ccae74-da49-3ed9-15f5-3662d8bb22c0 (at 10.9.103.5@o2ib4) [2789445.576408] Lustre: Skipped 6 previous similar messages [2790303.776315] Lustre: fir-MDT0001: Connection restored to 879034ed-1710-bc72-970e-ba28c4c7d4e9 (at 10.9.103.9@o2ib4) [2790303.786845] Lustre: Skipped 5 previous similar messages [2791176.970395] Lustre: fir-MDT0001: haven't heard from client b2ab8d48-0658-141b-0b32-f738fa655f05 (at 10.9.105.19@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ef07c454400, cur 1554946341 expire 1554946191 last 1554946114 [2791176.992371] Lustre: Skipped 1 previous similar message [2791406.970712] Lustre: fir-MDT0003: haven't heard from client f6f4af16-1479-93f2-5af6-1e2c846a1ab6 (at 10.9.112.6@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8edf406be800, cur 1554946571 expire 1554946421 last 1554946344 [2791406.992605] Lustre: Skipped 15 previous similar messages [2792489.189784] Lustre: fir-MDT0001: Connection restored to (at 10.9.108.63@o2ib4) [2792489.197278] Lustre: Skipped 3 previous similar messages [2792759.750244] Lustre: fir-MDT0001: Connection restored to 404b3112-4942-66cc-d3d2-d2daa56b1359 (at 10.8.15.4@o2ib6) [2792759.760686] Lustre: Skipped 9 previous similar messages [2792912.555742] Lustre: fir-MDT0001: Connection restored to (at 10.8.27.31@o2ib6) [2792912.563147] Lustre: Skipped 7 previous similar messages [2793261.075937] Lustre: fir-MDT0001: Client 708f469b-f504-7275-d236-e90aa8bcec67 (at 10.9.0.64@o2ib4) reconnecting [2793261.086114] Lustre: Skipped 4 previous similar messages [2793261.091544] Lustre: fir-MDT0001: Connection restored to 708f469b-f504-7275-d236-e90aa8bcec67 (at 10.9.0.64@o2ib4) [2793261.102005] Lustre: Skipped 2 previous similar messages [2793596.998532] Lustre: fir-MDT0003: haven't heard from client 9fb7f2d4-f4f7-ebda-25d8-184ef751d76f (at 10.9.113.13@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8eba4a370800, cur 1554948761 expire 1554948611 last 1554948534 [2793597.020515] Lustre: Skipped 1 previous similar message [2793768.000273] Lustre: fir-MDT0003: haven't heard from client 39b21bb1-06bc-8c69-c9c3-5986c42b070c (at 10.8.0.65@o2ib6) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ee07e455400, cur 1554948932 expire 1554948782 last 1554948705 [2793768.022079] Lustre: Skipped 1 previous similar message [2794328.007562] Lustre: fir-MDT0003: haven't heard from client 685c7782-a6d3-9ae9-f6a1-6708a0b985ae (at 10.9.114.14@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8ed210f20c00, cur 1554949492 expire 1554949342 last 1554949265 [2794437.015985] Lustre: 95538:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8ec84e2aa700 x1630377123442784/t0(0) o101->6ad98c65-f004-7186-b4a3-025eb3e3a442@10.8.17.24@o2ib6:16/0 lens 480/568 e 1 to 0 dl 1554949606 ref 2 fl Interpret:/0/0 rc 0/0 [2794444.030225] Lustre: fir-MDT0001: Client 6ad98c65-f004-7186-b4a3-025eb3e3a442 (at 10.8.17.24@o2ib6) reconnecting [2794444.040514] Lustre: fir-MDT0001: Connection restored to (at 10.8.17.24@o2ib6) [2794444.047916] Lustre: Skipped 2 previous similar messages [2794465.053653] Lustre: fir-MDT0001: Client 6ad98c65-f004-7186-b4a3-025eb3e3a442 (at 10.8.17.24@o2ib6) reconnecting [2794486.063838] Lustre: fir-MDT0001: Client 6ad98c65-f004-7186-b4a3-025eb3e3a442 (at 10.8.17.24@o2ib6) reconnecting [2794493.545733] Lustre: 95347:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8ee3798a3600 x1628544889706176/t0(0) o101->09af055d-73b0-0f86-9143-6e4e7900b1c7@10.9.107.42@o2ib4:12/0 lens 480/568 e 1 to 0 dl 1554949662 ref 2 fl Interpret:/0/0 rc 0/0 [2794499.562379] Lustre: fir-MDT0003: Client 09af055d-73b0-0f86-9143-6e4e7900b1c7 (at 10.9.107.42@o2ib4) reconnecting [2794518.203114] Lustre: 95593:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (20:1s); client may timeout. req@ffff8ed1b0d63600 x1629293037659280/t0(0) o101->18792659-3f71-143f-ec4b-3ede54b6d0b7@10.8.1.20@o2ib6:1/0 lens 480/536 e 1 to 0 dl 1554949681 ref 1 fl Complete:/0/0 rc 0/0 [2794520.265716] Lustre: fir-MDT0003: Client 9def1eab-085d-16da-14fb-4131b000cf36 (at 10.8.27.1@o2ib6) reconnecting [2794520.275897] Lustre: Skipped 2 previous similar messages [2794524.738065] LustreError: 137-5: fir-MDT0002_UUID: not available for connect from 10.8.0.65@o2ib6 (no target). If you are running an HA pair check that the target is mounted on the other server. [2794524.755454] LustreError: Skipped 3 previous similar messages [2794532.782253] Lustre: 96314:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554949689/real 1554949689] req@ffff8ebbd2ac3600 x1630438688879440/t0(0) o106->fir-MDT0003@10.9.101.53@o2ib4:15/16 lens 296/280 e 0 to 1 dl 1554949696 ref 1 fl Rpc:X/0/ffffffff rc 0/-1 [2794532.809786] Lustre: 96314:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 14 previous similar messages [2794540.775351] Lustre: 95353:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ec3369a6900 x1629290129016880/t185311235487(0) o36->9a6d5107-a0dd-89cc-cc60-368ce86bc790@10.8.17.4@o2ib6:29/0 lens 488/3152 e 0 to 0 dl 1554949709 ref 2 fl Interpret:/0/0 rc 0/0 [2794540.805323] Lustre: 95353:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 7 previous similar messages [2794551.011583] Lustre: fir-MDT0001: haven't heard from client 8bee312c-adaa-4011-4be3-7f8f8eac98cb (at 10.9.101.56@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8eefc87c2000, cur 1554949715 expire 1554949565 last 1554949488 [2794551.033551] Lustre: Skipped 1 previous similar message [2794558.856972] Lustre: fir-MDT0003: Client 39b21bb1-06bc-8c69-c9c3-5986c42b070c (at 10.8.0.65@o2ib6) reconnecting [2794558.867147] Lustre: Skipped 7 previous similar messages [2794589.377989] Lustre: 95581:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554949746/real 1554949746] req@ffff8ec35628a100 x1630438689273872/t0(0) o106->fir-MDT0003@10.9.101.50@o2ib4:15/16 lens 296/280 e 0 to 1 dl 1554949753 ref 1 fl Rpc:X/0/ffffffff rc 0/-1 [2794589.405526] Lustre: 95581:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 4 previous similar messages [2794644.021306] Lustre: fir-MDT0003: Client bafe047a-89ea-21d5-a24e-5c407648cfc7 (at 10.0.10.3@o2ib7) reconnecting [2794644.031483] Lustre: Skipped 1 previous similar message [2794654.794240] LustreError: 137-5: fir-MDT0002_UUID: not available for connect from 10.8.0.65@o2ib6 (no target). If you are running an HA pair check that the target is mounted on the other server. [2794654.811631] LustreError: Skipped 1 previous similar message [2794659.416917] Lustre: 95581:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1554949816/real 1554949816] req@ffff8ec35628a100 x1630438689273872/t0(0) o106->fir-MDT0003@10.9.101.50@o2ib4:15/16 lens 296/280 e 0 to 1 dl 1554949823 ref 1 fl Rpc:X/2/ffffffff rc 0/-1 [2794659.444447] Lustre: 95581:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 9 previous similar messages [2794704.161506] Lustre: 95581:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/-5), not sending early reply req@ffff8ebfe3e38000 x1629247700281440/t0(0) o101->bafe047a-89ea-21d5-a24e-5c407648cfc7@10.0.10.3@o2ib7:13/0 lens 480/568 e 0 to 0 dl 1554949873 ref 2 fl Interpret:/0/0 rc 0/0 [2794704.190514] Lustre: 95581:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 3 previous similar messages [2794732.013151] Lustre: fir-MDT0001: haven't heard from client 056b5f54-6fde-d24e-91db-e32a1e5acf9e (at 10.9.101.51@o2ib4) in 227 seconds. I think it's dead, and I am evicting it. exp ffff8eefc63f8c00, cur 1554949896 expire 1554949746 last 1554949669 [2794732.035113] Lustre: Skipped 10 previous similar messages [2794779.403925] LustreError: 137-5: fir-MDT0000_UUID: not available for connect from 10.8.0.65@o2ib6 (no target). If you are running an HA pair check that the target is mounted on the other server. [2794779.421317] LustreError: Skipped 1 previous similar message [2794794.134121] Lustre: fir-MDT0001: Client d2973835-df0e-7fd3-e160-4ee0dd527af4 (at 10.9.102.54@o2ib4) reconnecting [2794794.144471] Lustre: Skipped 74 previous similar messages [2794846.706390] LustreError: 95947:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554949920, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8eedfaee3840/0x857bd8ea1f9eaf1 lrc: 3/1,0 mode: --/PR res: [0x24000ff77:0x101e8:0x0].0x0 bits 0x13/0x0 rrc: 6 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 95947 timeout: 0 lvb_type: 0 [2794854.391483] Lustre: 95893:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8eb89289c850 x1628640777528496/t0(0) o4->e5407977-2002-52d7-281b-57025bf38084@10.9.107.13@o2ib4:13/0 lens 1384/0 e 0 to 0 dl 1554950023 ref 2 fl New:/2/ffffffff rc 0/-1 [2794854.420473] Lustre: 95893:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 682 previous similar messages [2794854.920495] LustreError: 96422:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554949928, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ee29aa50d80/0x857bd8ea1fdae5f lrc: 3/1,0 mode: --/PR res: [0x240000caf:0x6:0x0].0x0 bits 0x13/0x0 rrc: 11 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 96422 timeout: 0 lvb_type: 0 [2794854.959795] LustreError: 96422:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 2 previous similar messages [2794915.044287] LNet: Service thread pid 96778 was inactive for 200.02s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2794915.061406] LNet: Skipped 4 previous similar messages [2794915.066643] Pid: 96778, comm: mdt_rdpg00_058 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2794915.077008] Call Trace: [2794915.079670] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2794915.086752] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2794915.094022] [] start_this_handle+0x1a1/0x430 [jbd2] [2794915.100759] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2794915.107595] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2794915.115208] [] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs] [2794915.122388] [] dqget+0x3fa/0x450 [2794915.127478] [] dquot_get_dqblk+0x14/0x1f0 [2794915.133351] [] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs] [2794915.141052] [] lquota_disk_read+0x12f/0x390 [lquota] [2794915.147886] [] qsd_refresh_usage+0x6a/0x2b0 [lquota] [2794915.154724] [] qsd_op_adjust+0x2f1/0x730 [lquota] [2794915.161302] [] osd_object_delete+0x22c/0x340 [osd_ldiskfs] [2794915.168646] [] lu_object_free.isra.30+0x68/0x170 [obdclass] [2794915.176116] [] lu_site_purge_objects+0x2fe/0x530 [obdclass] [2794915.183562] [] lu_cache_shrink+0x259/0x2d0 [obdclass] [2794915.190502] [] shrink_slab+0x175/0x340 [2794915.196111] [] zone_reclaim+0x1d1/0x2f0 [2794915.201810] [] get_page_from_freelist+0x87b/0xa70 [2794915.208378] [] __alloc_pages_nodemask+0x176/0x420 [2794915.214949] [] alloc_pages_current+0x98/0x110 [2794915.221182] [] new_slab+0x2c5/0x390 [2794915.226545] [] ___slab_alloc+0x3ac/0x4f0 [2794915.232321] [] __slab_alloc+0x40/0x5c [2794915.237841] [] __kmalloc+0x1c0/0x230 [2794915.243275] [] ldiskfs_htree_store_dirent+0x7f/0x190 [ldiskfs] [2794915.250977] [] htree_dirblock_to_tree+0x169/0x190 [ldiskfs] [2794915.258409] [] ldiskfs_htree_fill_tree+0xb5/0x2f0 [ldiskfs] [2794915.265847] [] ldiskfs_readdir+0x61c/0x850 [ldiskfs] [2794915.272673] [] osd_ldiskfs_it_fill+0xbe/0x260 [osd_ldiskfs] [2794915.280109] [] osd_it_ea_load+0x37/0x100 [osd_ldiskfs] [2794915.287116] [] lod_it_load+0x27/0x90 [lod] [2794915.293097] [] dt_index_walk+0xf8/0x430 [obdclass] [2794915.299768] [] mdd_readpage+0x25f/0x5a0 [mdd] [2794915.306003] [] mdt_readpage+0x63a/0x880 [mdt] [2794915.312231] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2794915.319375] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2794915.327265] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2794915.333781] [] kthread+0xd1/0xe0 [2794915.338869] [] ret_from_fork_nospec_begin+0xe/0x21 [2794915.345531] [] 0xffffffffffffffff [2794915.350747] LustreError: dumping log to /tmp/lustre-log.1554950079.96778 [2794938.769450] Lustre: 96734:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (30:82s); client may timeout. req@ffff8ed8d4dbdc50 x1630459477547616/t0(0) o4->a41ae2bb-8d19-1967-3ec4-70e471172f6c@10.9.101.7@o2ib4:10/0 lens 2280/0 e 0 to 0 dl 1554950020 ref 1 fl Interpret:H/0/ffffffff rc 0/-1 [2794946.240754] INFO: task systemd-logind:16948 blocked for more than 120 seconds. [2794946.248153] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [2794946.256156] systemd-logind D ffff8ee05443b0c0 0 16948 1 0x00000084 [2794946.263430] Call Trace: [2794946.266064] [] schedule+0x29/0x70 [2794946.271209] [] rwsem_down_write_failed+0x225/0x3a0 [2794946.277849] [] ? get_from_free_list+0x42/0x50 [2794946.284053] [] call_rwsem_down_write_failed+0x17/0x30 [2794946.290936] [] ? ida_get_new_above+0x230/0x2a0 [2794946.297223] [] down_write+0x2d/0x3d [2794946.302540] [] register_shrinker+0x21/0x50 [2794946.308471] [] sget_userns+0x489/0x4c0 [2794946.314063] [] ? get_anon_bdev+0x110/0x110 [2794946.319983] [] sget+0x7d/0xb0 [2794946.324777] [] ? get_anon_bdev+0x110/0x110 [2794946.330706] [] ? shmem_rename+0x20/0x20 [2794946.336392] [] mount_nodev+0x30/0xb0 [2794946.341826] [] shmem_mount+0x18/0x20 [2794946.347233] [] mount_fs+0x3e/0x1b0 [2794946.352468] [] vfs_kern_mount+0x67/0x110 [2794946.358223] [] do_mount+0x1ef/0xce0 [2794946.363559] [] ? __check_object_size+0x1ca/0x250 [2794946.370002] [] ? kmem_cache_alloc_trace+0x3c/0x200 [2794946.376624] [] SyS_mount+0x83/0xd0 [2794946.381871] [] system_call_fastpath+0x22/0x27 [2794946.388098] INFO: task jbd2/dm-3-8:95076 blocked for more than 120 seconds. [2794946.395240] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [2794946.403261] jbd2/dm-3-8 D ffff8ee054069040 0 95076 2 0x00000080 [2794946.410531] Call Trace: [2794946.413161] [] schedule+0x29/0x70 [2794946.418334] [] jbd2_journal_commit_transaction+0x23c/0x19b0 [jbd2] [2794946.426351] [] ? dequeue_task_fair+0x41e/0x660 [2794946.432621] [] ? __switch_to+0xce/0x580 [2794946.438303] [] ? wake_up_atomic_t+0x30/0x30 [2794946.444318] [] ? __schedule+0x3ff/0x890 [2794946.449989] [] ? try_to_del_timer_sync+0x5e/0x90 [2794946.456452] [] kjournald2+0xc9/0x260 [jbd2] [2794946.462482] [] ? wake_up_atomic_t+0x30/0x30 [2794946.468493] [] ? commit_timeout+0x10/0x10 [jbd2] [2794946.474955] [] kthread+0xd1/0xe0 [2794946.480017] [] ? insert_kthread_work+0x40/0x40 [2794946.486292] [] ret_from_fork_nospec_begin+0xe/0x21 [2794946.492914] [] ? insert_kthread_work+0x40/0x40 [2794946.499196] INFO: task mdt00_002:95124 blocked for more than 120 seconds. [2794946.506178] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [2794946.514181] mdt00_002 D ffff8ee02f9d30c0 0 95124 2 0x00000080 [2794946.521476] Call Trace: [2794946.524137] [] ? ldlm_pool_recalc+0x12e/0x1f0 [ptlrpc] [2794946.531100] [] schedule+0x29/0x70 [2794946.536250] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2794946.543301] [] ? wake_up_atomic_t+0x30/0x30 [2794946.549334] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2794946.556563] [] start_this_handle+0x1a1/0x430 [jbd2] [2794946.563281] [] ? osd_declare_write+0x330/0x470 [osd_ldiskfs] [2794946.570767] [] ? kmem_cache_alloc+0x1c2/0x1f0 [2794946.576958] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2794946.583773] [] ? osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2794946.591105] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2794946.598680] [] osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2794946.605852] [] mdt_empty_transno+0xf7/0x850 [mdt] [2794946.612395] [] mdt_mfd_open+0x8de/0xe70 [mdt] [2794946.618591] [] ? mdt_pack_acl2body+0x1c2/0x9f0 [mdt] [2794946.625409] [] mdt_finish_open+0x64b/0x760 [mdt] [2794946.631862] [] mdt_open_by_fid_lock+0x672/0x9b0 [mdt] [2794946.638750] [] mdt_reint_open+0x760/0x27d0 [mdt] [2794946.645234] [] ? upcall_cache_get_entry+0x218/0x8b0 [obdclass] [2794946.652910] [] ? lu_ucred+0x1e/0x30 [obdclass] [2794946.659193] [] ? mdt_ucred+0x15/0x20 [mdt] [2794946.665130] [] ? mdt_root_squash+0x21/0x430 [mdt] [2794946.671676] [] mdt_reint_rec+0x83/0x210 [mdt] [2794946.677886] [] mdt_reint_internal+0x6e3/0xaf0 [mdt] [2794946.684594] [] ? mdt_intent_fixup_resent+0x36/0x220 [mdt] [2794946.691824] [] mdt_intent_open+0x82/0x350 [mdt] [2794946.698216] [] ? lprocfs_counter_add+0xf9/0x160 [obdclass] [2794946.705533] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2794946.712158] [] ? mdt_intent_fixup_resent+0x220/0x220 [mdt] [2794946.719513] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2794946.726409] [] ? cfs_hash_bd_add_locked+0x63/0x80 [libcfs] [2794946.733727] [] ? cfs_hash_add+0xbe/0x1a0 [libcfs] [2794946.740303] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2794946.747557] [] ? lustre_swab_ldlm_lock_desc+0x30/0x30 [ptlrpc] [2794946.755250] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2794946.761571] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2794946.768653] [] ? ptlrpc_nrs_req_get_nolock0+0xd1/0x170 [ptlrpc] [2794946.776401] [] ? ktime_get_real_seconds+0xe/0x10 [libcfs] [2794946.783677] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2794946.788701] LNet: Service thread pid 96775 was inactive for 200.73s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2794946.788704] Pid: 96775, comm: mdt_rdpg00_055 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2794946.788704] Call Trace: [2794946.788722] [] call_rwsem_down_write_failed+0x17/0x30 [2794946.788733] [] osd_write_lock+0x5c/0xe0 [osd_ldiskfs] [2794946.788745] [] lod_write_lock+0x3b/0xb0 [lod] [2794946.788756] [] mdd_write_lock+0x3b/0xd0 [mdd] [2794946.788764] [] mdd_attr_set+0x749/0xca0 [mdd] [2794946.788783] [] mdt_mfd_close+0x750/0x850 [mdt] [2794946.788795] [] mdt_close_internal+0x121/0x220 [mdt] [2794946.788806] [] mdt_close+0x220/0x780 [mdt] [2794946.788862] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2794946.788894] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2794946.788926] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2794946.788930] [] kthread+0xd1/0xe0 [2794946.788934] [] ret_from_fork_nospec_begin+0xe/0x21 [2794946.788963] [] 0xffffffffffffffff [2794946.788965] LustreError: dumping log to /tmp/lustre-log.1554950110.96775 [2794946.917797] [] ? ptlrpc_wait_event+0xa5/0x360 [ptlrpc] [2794946.924763] [] ? default_wake_function+0x12/0x20 [2794946.931211] [] ? __wake_up_common+0x5b/0x90 [2794946.937266] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2794946.943735] [] ? ptlrpc_register_service+0xf80/0xf80 [ptlrpc] [2794946.951308] [] kthread+0xd1/0xe0 [2794946.956383] [] ? insert_kthread_work+0x40/0x40 [2794946.962652] [] ret_from_fork_nospec_begin+0xe/0x21 [2794946.969274] [] ? insert_kthread_work+0x40/0x40 [2794946.975564] INFO: task mdt02_001:95129 blocked for more than 120 seconds. [2794946.982524] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [2794946.990525] mdt02_001 D ffff8ed1776ea080 0 95129 2 0x00000080 [2794946.997819] Call Trace: [2794947.000474] [] ? ldlm_pool_recalc+0x12e/0x1f0 [ptlrpc] [2794947.007443] [] schedule+0x29/0x70 [2794947.012596] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2794947.019645] [] ? wake_up_atomic_t+0x30/0x30 [2794947.025662] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2794947.032892] [] start_this_handle+0x1a1/0x430 [jbd2] [2794947.039604] [] ? osd_declare_write+0x330/0x470 [osd_ldiskfs] [2794947.047108] [] ? kmem_cache_alloc+0x1c2/0x1f0 [2794947.053292] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2794947.060091] [] ? osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2794947.067421] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2794947.074997] [] osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2794947.082142] [] mdt_empty_transno+0xf7/0x850 [mdt] [2794947.088701] [] mdt_mfd_open+0x8de/0xe70 [mdt] [2794947.094897] [] ? mdt_pack_acl2body+0x1c2/0x9f0 [mdt] [2794947.101695] [] mdt_finish_open+0x64b/0x760 [mdt] [2794947.108164] [] mdt_open_by_fid_lock+0x672/0x9b0 [mdt] [2794947.115052] [] mdt_reint_open+0x760/0x27d0 [mdt] [2794947.121518] [] ? upcall_cache_get_entry+0x218/0x8b0 [obdclass] [2794947.129210] [] ? lu_ucred+0x1e/0x30 [obdclass] [2794947.135486] [] ? mdt_ucred+0x15/0x20 [mdt] [2794947.141423] [] ? mdt_root_squash+0x21/0x430 [mdt] [2794947.147981] [] mdt_reint_rec+0x83/0x210 [mdt] [2794947.154174] [] mdt_reint_internal+0x6e3/0xaf0 [mdt] [2794947.160889] [] ? mdt_intent_fixup_resent+0x36/0x220 [mdt] [2794947.168142] [] mdt_intent_open+0x82/0x350 [mdt] [2794947.174521] [] ? lprocfs_counter_add+0xf9/0x160 [obdclass] [2794947.181844] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2794947.188489] [] ? mdt_intent_fixup_resent+0x220/0x220 [mdt] [2794947.195822] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2794947.202712] [] ? cfs_hash_bd_add_locked+0x63/0x80 [libcfs] [2794947.210027] [] ? cfs_hash_add+0xbe/0x1a0 [libcfs] [2794947.216589] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2794947.223864] [] ? lustre_swab_ldlm_lock_desc+0x30/0x30 [ptlrpc] [2794947.231556] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2794947.237865] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2794947.244952] [] ? ptlrpc_nrs_req_get_nolock0+0xd1/0x170 [ptlrpc] [2794947.252701] [] ? ktime_get_real_seconds+0xe/0x10 [libcfs] [2794947.259968] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2794947.267833] [] ? ptlrpc_wait_event+0xa5/0x360 [ptlrpc] [2794947.274796] [] ? default_wake_function+0x12/0x20 [2794947.281243] [] ? __wake_up_common+0x5b/0x90 [2794947.287288] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2794947.293776] [] ? ptlrpc_register_service+0xf80/0xf80 [ptlrpc] [2794947.301351] [] kthread+0xd1/0xe0 [2794947.306412] [] ? insert_kthread_work+0x40/0x40 [2794947.312707] [] ret_from_fork_nospec_begin+0xe/0x21 [2794947.319325] [] ? insert_kthread_work+0x40/0x40 [2794947.325604] INFO: task mdt03_001:95132 blocked for more than 120 seconds. [2794947.332583] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [2794947.340601] mdt03_001 D ffff8edc58ee8000 0 95132 2 0x00000080 [2794947.347879] Call Trace: [2794947.350537] [] ? ldlm_pool_recalc+0x12e/0x1f0 [ptlrpc] [2794947.357520] [] schedule+0x29/0x70 [2794947.362671] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2794947.369723] [] ? wake_up_atomic_t+0x30/0x30 [2794947.375755] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2794947.382985] [] start_this_handle+0x1a1/0x430 [jbd2] [2794947.389702] [] ? osd_declare_write+0x330/0x470 [osd_ldiskfs] [2794947.397205] [] ? kmem_cache_alloc+0x1c2/0x1f0 [2794947.403411] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2794947.410213] [] ? osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2794947.417555] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2794947.425136] [] osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2794947.432282] [] mdt_empty_transno+0xf7/0x850 [mdt] [2794947.438840] [] mdt_mfd_open+0x8de/0xe70 [mdt] [2794947.445035] [] ? mdt_pack_acl2body+0x1c2/0x9f0 [mdt] [2794947.451841] [] mdt_finish_open+0x64b/0x760 [mdt] [2794947.458314] [] mdt_open_by_fid_lock+0x672/0x9b0 [mdt] [2794947.465207] [] mdt_reint_open+0x760/0x27d0 [mdt] [2794947.471672] [] ? upcall_cache_get_entry+0x218/0x8b0 [obdclass] [2794947.479349] [] ? lu_ucred+0x1e/0x30 [obdclass] [2794947.485633] [] ? mdt_ucred+0x15/0x20 [mdt] [2794947.491568] [] ? mdt_root_squash+0x21/0x430 [mdt] [2794947.498114] [] mdt_reint_rec+0x83/0x210 [mdt] [2794947.504324] [] mdt_reint_internal+0x6e3/0xaf0 [mdt] [2794947.511035] [] ? mdt_intent_fixup_resent+0x36/0x220 [mdt] [2794947.518272] [] mdt_intent_open+0x82/0x350 [mdt] [2794947.524665] [] ? lprocfs_counter_add+0xf9/0x160 [obdclass] [2794947.531981] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2794947.538611] [] ? mdt_intent_fixup_resent+0x220/0x220 [mdt] [2794947.545953] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2794947.552839] [] ? cfs_hash_bd_add_locked+0x63/0x80 [libcfs] [2794947.560170] [] ? cfs_hash_add+0xbe/0x1a0 [libcfs] [2794947.566730] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2794947.573984] [] ? lustre_swab_ldlm_lock_desc+0x30/0x30 [ptlrpc] [2794947.581690] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2794947.587993] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2794947.595075] [] ? ptlrpc_nrs_req_get_nolock0+0xd1/0x170 [ptlrpc] [2794947.602831] [] ? ktime_get_real_seconds+0xe/0x10 [libcfs] [2794947.610099] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2794947.617975] [] ? ptlrpc_wait_event+0xa5/0x360 [ptlrpc] [2794947.624943] [] ? default_wake_function+0x12/0x20 [2794947.631406] [] ? __wake_up_common+0x5b/0x90 [2794947.637453] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2794947.643924] [] ? ptlrpc_register_service+0xf80/0xf80 [ptlrpc] [2794947.651512] [] kthread+0xd1/0xe0 [2794947.656567] [] ? insert_kthread_work+0x40/0x40 [2794947.662842] [] ret_from_fork_nospec_begin+0xe/0x21 [2794947.669481] [] ? insert_kthread_work+0x40/0x40 [2794947.675771] INFO: task mdt03_002:95133 blocked for more than 120 seconds. [2794947.682731] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [2794947.690746] mdt03_002 D ffff8edc58eea080 0 95133 2 0x00000080 [2794947.698018] Call Trace: [2794947.700673] [] ? ldlm_pool_recalc+0x12e/0x1f0 [ptlrpc] [2794947.707643] [] schedule+0x29/0x70 [2794947.712809] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2794947.719859] [] ? wake_up_atomic_t+0x30/0x30 [2794947.725879] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2794947.733119] [] start_this_handle+0x1a1/0x430 [jbd2] [2794947.739842] [] ? osd_declare_write+0x330/0x470 [osd_ldiskfs] [2794947.747326] [] ? kmem_cache_alloc+0x1c2/0x1f0 [2794947.753530] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2794947.760332] [] ? osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2794947.767649] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2794947.775246] [] osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2794947.782394] [] mdt_empty_transno+0xf7/0x850 [mdt] [2794947.788937] [] mdt_mfd_open+0x8de/0xe70 [mdt] [2794947.795145] [] ? mdt_pack_acl2body+0x1c2/0x9f0 [mdt] [2794947.801943] [] mdt_finish_open+0x64b/0x760 [mdt] [2794947.808403] [] mdt_open_by_fid_lock+0x672/0x9b0 [mdt] [2794947.815291] [] mdt_reint_open+0x760/0x27d0 [mdt] [2794947.821756] [] ? upcall_cache_get_entry+0x218/0x8b0 [obdclass] [2794947.829450] [] ? lu_ucred+0x1e/0x30 [obdclass] [2794947.835726] [] ? mdt_ucred+0x15/0x20 [mdt] [2794947.841663] [] ? mdt_root_squash+0x21/0x430 [mdt] [2794947.848223] [] mdt_reint_rec+0x83/0x210 [mdt] [2794947.854422] [] mdt_reint_internal+0x6e3/0xaf0 [mdt] [2794947.861138] [] ? mdt_intent_fixup_resent+0x36/0x220 [mdt] [2794947.868390] [] mdt_intent_open+0x82/0x350 [mdt] [2794947.874777] [] ? lprocfs_counter_add+0xf9/0x160 [obdclass] [2794947.882094] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2794947.888739] [] ? mdt_intent_fixup_resent+0x220/0x220 [mdt] [2794947.896071] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2794947.902953] [] ? cfs_hash_bd_add_locked+0x63/0x80 [libcfs] [2794947.910290] [] ? cfs_hash_add+0xbe/0x1a0 [libcfs] [2794947.916848] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2794947.924105] [] ? lustre_swab_ldlm_lock_desc+0x30/0x30 [ptlrpc] [2794947.931811] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2794947.938114] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2794947.945194] [] ? ptlrpc_nrs_req_get_nolock0+0xd1/0x170 [ptlrpc] [2794947.952939] [] ? ktime_get_real_seconds+0xe/0x10 [libcfs] [2794947.960193] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2794947.968068] [] ? ptlrpc_wait_event+0xa5/0x360 [ptlrpc] [2794947.975037] [] ? default_wake_function+0x12/0x20 [2794947.981484] [] ? __wake_up_common+0x5b/0x90 [2794947.987529] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2794947.994003] [] ? ptlrpc_register_service+0xf80/0xf80 [ptlrpc] [2794948.001590] [] kthread+0xd1/0xe0 [2794948.006644] [] ? insert_kthread_work+0x40/0x40 [2794948.012920] [] ret_from_fork_nospec_begin+0xe/0x21 [2794948.019540] [] ? insert_kthread_work+0x40/0x40 [2794948.025815] INFO: task mdt_rdpg00_000:95134 blocked for more than 120 seconds. [2794948.033217] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [2794948.041225] mdt_rdpg00_000 D ffff8edc58ee9040 0 95134 2 0x00000080 [2794948.048518] Call Trace: [2794948.051149] [] schedule+0x29/0x70 [2794948.056299] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2794948.063366] [] ? wake_up_atomic_t+0x30/0x30 [2794948.069377] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2794948.076612] [] ? osd_trunc_lock+0xe0/0x210 [osd_ldiskfs] [2794948.083756] [] start_this_handle+0x1a1/0x430 [jbd2] [2794948.090474] [] ? lod_sub_declare_write+0xe1/0x2a0 [lod] [2794948.097541] [] ? kmem_cache_alloc+0x1c2/0x1f0 [2794948.103732] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2794948.110531] [] ? osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2794948.117871] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2794948.125446] [] osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2794948.132616] [] top_trans_start+0x702/0x940 [ptlrpc] [2794948.139346] [] lod_trans_start+0x34/0x40 [lod] [2794948.145640] [] mdd_trans_start+0x1a/0x20 [mdd] [2794948.151914] [] mdd_attr_set+0x56f/0xca0 [mdd] [2794948.158120] [] mdt_mfd_close+0x750/0x850 [mdt] [2794948.164398] [] mdt_close_internal+0x121/0x220 [mdt] [2794948.171117] [] mdt_close+0x220/0x780 [mdt] [2794948.177090] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2794948.184172] [] ? ptlrpc_nrs_req_get_nolock0+0xd1/0x170 [ptlrpc] [2794948.191918] [] ? ktime_get_real_seconds+0xe/0x10 [libcfs] [2794948.199189] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2794948.207047] [] ? ptlrpc_wait_event+0xa5/0x360 [ptlrpc] [2794948.214015] [] ? default_wake_function+0x12/0x20 [2794948.220462] [] ? __wake_up_common+0x5b/0x90 [2794948.226506] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2794948.232998] [] ? ptlrpc_register_service+0xf80/0xf80 [ptlrpc] [2794948.240570] [] kthread+0xd1/0xe0 [2794948.245634] [] ? insert_kthread_work+0x40/0x40 [2794948.251920] [] ret_from_fork_nospec_begin+0xe/0x21 [2794948.258536] [] ? insert_kthread_work+0x40/0x40 [2794948.264812] INFO: task mdt_rdpg00_001:95135 blocked for more than 120 seconds. [2794948.272212] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [2794948.280233] mdt_rdpg00_001 D ffff8edc58eeb0c0 0 95135 2 0x00000080 [2794948.287505] Call Trace: [2794948.290135] [] schedule+0x29/0x70 [2794948.295287] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2794948.302353] [] ? wake_up_atomic_t+0x30/0x30 [2794948.308364] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2794948.315611] [] ? osd_trunc_lock+0xe0/0x210 [osd_ldiskfs] [2794948.322751] [] start_this_handle+0x1a1/0x430 [jbd2] [2794948.329466] [] ? lod_sub_declare_write+0xe1/0x2a0 [lod] [2794948.336535] [] ? kmem_cache_alloc+0x1c2/0x1f0 [2794948.342734] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2794948.349536] [] ? osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2794948.356874] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2794948.364450] [] osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2794948.371619] [] top_trans_start+0x702/0x940 [ptlrpc] [2794948.378334] [] lod_trans_start+0x34/0x40 [lod] [2794948.384610] [] mdd_trans_start+0x1a/0x20 [mdd] [2794948.390911] [] mdd_attr_set+0x56f/0xca0 [mdd] [2794948.397109] [] mdt_mfd_close+0x750/0x850 [mdt] [2794948.403394] [] mdt_close_internal+0x121/0x220 [mdt] [2794948.410125] [] mdt_close+0x220/0x780 [mdt] [2794948.416088] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2794948.423174] [] ? ptlrpc_nrs_req_get_nolock0+0xd1/0x170 [ptlrpc] [2794948.430923] [] ? ktime_get_real_seconds+0xe/0x10 [libcfs] [2794948.438176] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2794948.446051] [] ? ptlrpc_wait_event+0xa5/0x360 [ptlrpc] [2794948.453017] [] ? default_wake_function+0x12/0x20 [2794948.459468] [] ? __wake_up_common+0x5b/0x90 [2794948.465529] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2794948.472018] [] ? ptlrpc_register_service+0xf80/0xf80 [ptlrpc] [2794948.479592] [] kthread+0xd1/0xe0 [2794948.484654] [] ? insert_kthread_work+0x40/0x40 [2794948.490928] [] ret_from_fork_nospec_begin+0xe/0x21 [2794948.497548] [] ? insert_kthread_work+0x40/0x40 [2794948.503829] INFO: task mdt_io00_000:95164 blocked for more than 120 seconds. [2794948.511064] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [2794948.519068] mdt_io00_000 D ffff8ee0541c30c0 0 95164 2 0x00000080 [2794948.526363] Call Trace: [2794948.528991] [] schedule+0x29/0x70 [2794948.534144] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2794948.541209] [] ? wake_up_atomic_t+0x30/0x30 [2794948.547222] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2794948.554463] [] start_this_handle+0x1a1/0x430 [jbd2] [2794948.561177] [] ? osd_declare_xattr_set+0xf1/0x3a0 [osd_ldiskfs] [2794948.568921] [] ? kmem_cache_alloc+0x1c2/0x1f0 [2794948.575126] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2794948.581927] [] ? osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2794948.589252] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2794948.596847] [] osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2794948.603999] [] mdt_obd_commitrw+0x635/0x1150 [mdt] [2794948.610647] [] obd_commitrw+0x9c/0x370 [ptlrpc] [2794948.617034] [] tgt_brw_write+0x100d/0x1a90 [ptlrpc] [2794948.623738] [] ? load_balance+0x1be/0x9a0 [2794948.629618] [] ? target_send_reply_msg+0x170/0x170 [ptlrpc] [2794948.637044] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2794948.644127] [] ? ptlrpc_nrs_req_get_nolock0+0xd1/0x170 [ptlrpc] [2794948.651891] [] ? ktime_get_real_seconds+0xe/0x10 [libcfs] [2794948.659144] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2794948.667004] [] ? ptlrpc_wait_event+0xa5/0x360 [ptlrpc] [2794948.673985] [] ? default_wake_function+0x12/0x20 [2794948.680426] [] ? __wake_up_common+0x5b/0x90 [2794948.686469] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2794948.692959] [] ? ptlrpc_register_service+0xf80/0xf80 [ptlrpc] [2794948.700532] [] kthread+0xd1/0xe0 [2794948.705595] [] ? insert_kthread_work+0x40/0x40 [2794948.711885] [] ret_from_fork_nospec_begin+0xe/0x21 [2794948.718500] [] ? insert_kthread_work+0x40/0x40 [2794948.724774] INFO: task mdt_io00_001:95165 blocked for more than 120 seconds. [2794948.732015] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [2794948.740017] mdt_io00_001 D ffff8ee300a78000 0 95165 2 0x00000080 [2794948.747287] Call Trace: [2794948.749932] [] schedule+0x29/0x70 [2794948.755078] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2794948.762143] [] ? wake_up_atomic_t+0x30/0x30 [2794948.768153] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2794948.775381] [] start_this_handle+0x1a1/0x430 [jbd2] [2794948.782092] [] ? osd_declare_xattr_set+0xf1/0x3a0 [osd_ldiskfs] [2794948.789838] [] ? kmem_cache_alloc+0x1c2/0x1f0 [2794948.796042] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2794948.802842] [] ? osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2794948.810159] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2794948.817740] [] osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2794948.824886] [] mdt_obd_commitrw+0x635/0x1150 [mdt] [2794948.831553] [] obd_commitrw+0x9c/0x370 [ptlrpc] [2794948.836777] LNet: Service thread pid 96022 was inactive for 200.49s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2794948.836780] Pid: 96022, comm: mdt03_028 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2794948.836780] Call Trace: [2794948.836819] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2794948.836823] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2794948.836826] [] start_this_handle+0x1a1/0x430 [jbd2] [2794948.836831] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2794948.836846] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2794948.836857] [] osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2794948.836874] [] mdt_empty_transno+0xf7/0x850 [mdt] [2794948.836886] [] mdt_mfd_open+0x8de/0xe70 [mdt] [2794948.836897] [] mdt_finish_open+0x64b/0x760 [mdt] [2794948.836908] [] mdt_open_by_fid_lock+0x672/0x9b0 [mdt] [2794948.836918] [] mdt_reint_open+0x760/0x27d0 [mdt] [2794948.836929] [] mdt_reint_rec+0x83/0x210 [mdt] [2794948.836938] [] mdt_reint_internal+0x6e3/0xaf0 [mdt] [2794948.836948] [] mdt_intent_open+0x82/0x350 [mdt] [2794948.836958] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2794948.836997] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2794948.837027] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2794948.837071] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2794948.837107] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2794948.837139] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2794948.837171] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2794948.837177] [] kthread+0xd1/0xe0 [2794948.837181] [] ret_from_fork_nospec_begin+0xe/0x21 [2794948.837192] [] 0xffffffffffffffff [2794948.837195] LustreError: dumping log to /tmp/lustre-log.1554950112.96022 [2794948.837873] Pid: 96784, comm: mdt_rdpg03_046 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2794948.837874] Call Trace: [2794948.837885] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2794948.837888] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2794948.837891] [] start_this_handle+0x1a1/0x430 [jbd2] [2794948.837894] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2794948.837904] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2794948.837911] [] osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2794948.837948] [] top_trans_start+0x702/0x940 [ptlrpc] [2794948.837960] [] lod_trans_start+0x34/0x40 [lod] [2794948.837973] [] mdd_trans_start+0x1a/0x20 [mdd] [2794948.837980] [] mdd_attr_set+0x56f/0xca0 [mdd] [2794948.837992] [] mdt_mfd_close+0x750/0x850 [mdt] [2794948.838003] [] mdt_close_internal+0x121/0x220 [mdt] [2794948.838014] [] mdt_close+0x220/0x780 [mdt] [2794948.838049] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2794948.838081] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2794948.838112] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2794948.838114] [] kthread+0xd1/0xe0 [2794948.838116] [] ret_from_fork_nospec_begin+0xe/0x21 [2794948.838119] [] 0xffffffffffffffff [2794948.838123] Pid: 96700, comm: mdt_rdpg03_041 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2794948.838123] Call Trace: [2794948.838132] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2794948.838135] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2794948.838138] [] start_this_handle+0x1a1/0x430 [jbd2] [2794948.838142] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2794948.838151] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2794948.838158] [] osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2794948.838192] [] top_trans_start+0x702/0x940 [ptlrpc] [2794948.838198] [] lod_trans_start+0x34/0x40 [lod] [2794948.838206] [] mdd_trans_start+0x1a/0x20 [mdd] [2794948.838214] [] mdd_attr_set+0x56f/0xca0 [mdd] [2794948.838225] [] mdt_mfd_close+0x750/0x850 [mdt] [2794948.838236] [] mdt_close_internal+0x121/0x220 [mdt] [2794948.838247] [] mdt_close+0x220/0x780 [mdt] [2794948.838280] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2794948.838311] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2794948.838342] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2794948.838344] [] kthread+0xd1/0xe0 [2794948.838346] [] ret_from_fork_nospec_begin+0xe/0x21 [2794948.838349] [] 0xffffffffffffffff [2794948.838351] LNet: Service thread pid 96694 was inactive for 200.17s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2794948.838353] LNet: Skipped 2 previous similar messages [2794949.327639] [] tgt_brw_write+0x100d/0x1a90 [ptlrpc] [2794949.334373] [] ? lustre_msg_buf_v2+0x1b0/0x1b0 [ptlrpc] [2794949.341453] [] ? lustre_msg_buf+0x17/0x60 [ptlrpc] [2794949.348087] [] ? update_curr+0x14c/0x1e0 [2794949.348746] LustreError: dumping log to /tmp/lustre-log.1554950113.95517 [2794949.360734] [] ? account_entity_dequeue+0xae/0xd0 [2794949.367308] [] ? target_send_reply_msg+0x170/0x170 [ptlrpc] [2794949.374738] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2794949.381824] [] ? ptlrpc_nrs_req_get_nolock0+0xd1/0x170 [ptlrpc] [2794949.389574] [] ? ktime_get_real_seconds+0xe/0x10 [libcfs] [2794949.396826] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2794949.404702] [] ? ptlrpc_wait_event+0xa5/0x360 [ptlrpc] [2794949.411668] [] ? default_wake_function+0x12/0x20 [2794949.418118] [] ? __wake_up_common+0x5b/0x90 [2794949.424173] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2794949.430642] [] ? ptlrpc_register_service+0xf80/0xf80 [ptlrpc] [2794949.438215] [] kthread+0xd1/0xe0 [2794949.443290] [] ? insert_kthread_work+0x40/0x40 [2794949.449561] [] ret_from_fork_nospec_begin+0xe/0x21 [2794949.456180] [] ? insert_kthread_work+0x40/0x40 [2794950.372746] LustreError: dumping log to /tmp/lustre-log.1554950114.95991 [2794951.396758] LustreError: dumping log to /tmp/lustre-log.1554950115.96222 [2794952.420772] LustreError: dumping log to /tmp/lustre-log.1554950116.96673 [2794953.956794] LustreError: dumping log to /tmp/lustre-log.1554950117.96082 [2794954.468804] LustreError: dumping log to /tmp/lustre-log.1554950118.96758 [2794957.028833] LustreError: dumping log to /tmp/lustre-log.1554950121.95924 [2794958.052846] LustreError: dumping log to /tmp/lustre-log.1554950122.96038 [2794959.076864] LustreError: dumping log to /tmp/lustre-log.1554950123.96663 [2794960.100880] LustreError: dumping log to /tmp/lustre-log.1554950124.96503 [2794961.124892] LustreError: dumping log to /tmp/lustre-log.1554950125.96765 [2794962.148900] LustreError: dumping log to /tmp/lustre-log.1554950126.95532 [2794962.896927] Lustre: 108395:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (20:16s); client may timeout. req@ffff8ebac7aadc50 x1628544889779200/t0(0) o3->09af055d-73b0-0f86-9143-6e4e7900b1c7@10.9.107.42@o2ib4:10/0 lens 488/0 e 0 to 0 dl 1554950110 ref 1 fl Interpret:H/2/ffffffff rc 0/-1 [2794962.926084] Lustre: 108395:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 2 previous similar messages [2794963.172913] LustreError: dumping log to /tmp/lustre-log.1554950127.95993 [2794964.196926] LustreError: dumping log to /tmp/lustre-log.1554950128.96388 [2794965.220944] LNet: Service thread pid 96501 was inactive for 200.17s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2794965.233976] LNet: Skipped 192 previous similar messages [2794965.239385] LustreError: dumping log to /tmp/lustre-log.1554950129.96501 [2794966.244961] LustreError: dumping log to /tmp/lustre-log.1554950130.95533 [2794967.268969] LustreError: dumping log to /tmp/lustre-log.1554950131.95847 [2794968.292984] LustreError: dumping log to /tmp/lustre-log.1554950132.95548 [2794969.317002] LustreError: dumping log to /tmp/lustre-log.1554950133.96182 [2794970.341019] LustreError: dumping log to /tmp/lustre-log.1554950134.96149 [2794971.877028] LustreError: dumping log to /tmp/lustre-log.1554950135.96055 [2794972.389043] LustreError: dumping log to /tmp/lustre-log.1554950136.95954 [2794973.413052] LustreError: dumping log to /tmp/lustre-log.1554950137.96601 [2794974.437067] LustreError: dumping log to /tmp/lustre-log.1554950138.95994 [2794975.461080] LustreError: dumping log to /tmp/lustre-log.1554950139.96137 [2794976.997100] LustreError: dumping log to /tmp/lustre-log.1554950140.96165 [2794977.509103] LustreError: dumping log to /tmp/lustre-log.1554950141.96434 [2794978.533124] LustreError: dumping log to /tmp/lustre-log.1554950142.95888 [2794979.045129] LustreError: dumping log to /tmp/lustre-log.1554950143.96310 [2794980.069143] LustreError: dumping log to /tmp/lustre-log.1554950144.108329 [2794981.093151] LustreError: dumping log to /tmp/lustre-log.1554950145.96395 [2794982.117164] LustreError: dumping log to /tmp/lustre-log.1554950146.108337 [2794983.141178] LustreError: dumping log to /tmp/lustre-log.1554950147.108341 [2794984.165199] LustreError: dumping log to /tmp/lustre-log.1554950148.96203 [2794985.701218] LustreError: dumping log to /tmp/lustre-log.1554950149.95915 [2794986.213217] LustreError: dumping log to /tmp/lustre-log.1554950150.96291 [2794987.749241] LustreError: dumping log to /tmp/lustre-log.1554950151.96506 [2794988.261244] LustreError: dumping log to /tmp/lustre-log.1554950152.96202 [2794989.797270] LustreError: dumping log to /tmp/lustre-log.1554950153.96253 [2794990.309274] LustreError: dumping log to /tmp/lustre-log.1554950154.95129 [2794991.333285] LustreError: dumping log to /tmp/lustre-log.1554950155.108352 [2794992.869314] LustreError: dumping log to /tmp/lustre-log.1554950156.96139 [2794993.893319] LustreError: dumping log to /tmp/lustre-log.1554950157.96529 [2794994.405327] LustreError: dumping log to /tmp/lustre-log.1554950158.108358 [2794995.429337] LustreError: dumping log to /tmp/lustre-log.1554950159.108362 [2794996.453353] LustreError: dumping log to /tmp/lustre-log.1554950160.95917 [2794997.477365] LNet: Service thread pid 108364 was inactive for 200.44s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2794997.490482] LNet: Skipped 244 previous similar messages [2794997.495888] LustreError: dumping log to /tmp/lustre-log.1554950161.108364 [2794999.013387] LustreError: dumping log to /tmp/lustre-log.1554950162.108365 [2794999.525395] LustreError: dumping log to /tmp/lustre-log.1554950163.95920 [2795000.037400] LustreError: dumping log to /tmp/lustre-log.1554950164.95551 [2795001.061409] LustreError: dumping log to /tmp/lustre-log.1554950165.96401 [2795002.085428] LustreError: dumping log to /tmp/lustre-log.1554950166.96745 [2795003.109439] LustreError: dumping log to /tmp/lustre-log.1554950167.108370 [2795004.133448] LustreError: dumping log to /tmp/lustre-log.1554950168.95530 [2795005.157488] LustreError: dumping log to /tmp/lustre-log.1554950169.95914 [2795006.181485] LustreError: dumping log to /tmp/lustre-log.1554950170.96492 [2795007.717527] LustreError: dumping log to /tmp/lustre-log.1554950171.96101 [2795008.229510] LustreError: dumping log to /tmp/lustre-log.1554950172.96572 [2795009.253522] LustreError: dumping log to /tmp/lustre-log.1554950173.96723 [2795010.277556] LustreError: dumping log to /tmp/lustre-log.1554950174.96591 [2795011.301564] LustreError: dumping log to /tmp/lustre-log.1554950175.108379 [2795012.325573] LustreError: dumping log to /tmp/lustre-log.1554950176.108389 [2795013.349581] LustreError: dumping log to /tmp/lustre-log.1554950177.96750 [2795014.373593] LustreError: dumping log to /tmp/lustre-log.1554950178.108398 [2795015.397612] LustreError: dumping log to /tmp/lustre-log.1554950179.108400 [2795016.421614] LustreError: dumping log to /tmp/lustre-log.1554950180.96740 [2795017.445626] LustreError: dumping log to /tmp/lustre-log.1554950181.108408 [2795020.517675] LustreError: dumping log to /tmp/lustre-log.1554950184.96312 [2795021.541684] LustreError: dumping log to /tmp/lustre-log.1554950185.96751 [2795022.565695] LustreError: dumping log to /tmp/lustre-log.1554950186.108412 [2795023.077716] LustreError: dumping log to /tmp/lustre-log.1554950187.96210 [2795024.613723] LustreError: dumping log to /tmp/lustre-log.1554950188.108414 [2795026.661754] LustreError: dumping log to /tmp/lustre-log.1554950190.96522 [2795027.173765] LustreError: dumping log to /tmp/lustre-log.1554950191.96259 [2795028.197776] LustreError: dumping log to /tmp/lustre-log.1554950192.108415 [2795035.877870] LustreError: dumping log to /tmp/lustre-log.1554950199.95362 [2795038.949913] LustreError: dumping log to /tmp/lustre-log.1554950202.96074 [2795044.183648] Lustre: fir-MDT0001: Connection restored to c6802582-b94d-a600-4745-181267927202 (at 10.9.105.48@o2ib4) [2795044.194259] Lustre: Skipped 1596 previous similar messages [2795047.654030] LustreError: dumping log to /tmp/lustre-log.1554950211.95562 [2795059.942188] LustreError: dumping log to /tmp/lustre-log.1554950223.95592 [2795060.454197] LustreError: dumping log to /tmp/lustre-log.1554950224.95969 [2795061.478210] LustreError: dumping log to /tmp/lustre-log.1554950225.95526 [2795067.622288] LNet: Service thread pid 95930 was inactive for 200.56s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2795067.635322] LNet: Skipped 166 previous similar messages [2795067.640729] LustreError: dumping log to /tmp/lustre-log.1554950231.95930 [2795073.609416] Lustre: 96716:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (30:42s); client may timeout. req@ffff8ecce76d3850 x1629294845491632/t0(0) o3->f3b5dfe4-1917-2d20-fd73-c3a1a450a23e@10.8.18.1@o2ib6:5/0 lens 488/0 e 0 to 0 dl 1554950195 ref 1 fl Interpret:H/2/ffffffff rc 0/-1 [2795073.638229] Lustre: 96716:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 1 previous similar message [2795076.839404] LustreError: dumping log to /tmp/lustre-log.1554950240.95509 [2795094.134003] Lustre: fir-MDT0001: Client 547db8d7-46dc-41fc-3cbe-14d2ac9e4c9e (at 10.9.102.8@o2ib4) reconnecting [2795094.144262] Lustre: Skipped 1935 previous similar messages [2795102.438743] LustreError: dumping log to /tmp/lustre-log.1554950266.96286 [2795103.346270] Lustre: 95907:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (20:35s); client may timeout. req@ffff8ec313166850 x1630000354744080/t0(0) o3->32c80f01-51ca-2acc-b61a-cb7b510385c5@10.9.105.56@o2ib4:12/0 lens 488/0 e 0 to 0 dl 1554950232 ref 1 fl Interpret:H/2/ffffffff rc 0/-1 [2795103.375345] Lustre: 95907:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 7 previous similar messages [2795107.046804] LustreError: dumping log to /tmp/lustre-log.1554950271.96525 [2795112.192901] Lustre: 95893:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (20:111s); client may timeout. req@ffff8eb6c0219c50 x1628761643517120/t0(0) o3->146b965a-8924-f96f-1eab-c9060c03ef24@10.9.105.51@o2ib4:5/0 lens 488/0 e 0 to 0 dl 1554950165 ref 1 fl Interpret:H/2/ffffffff rc 0/-1 [2795112.221970] Lustre: 95893:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 2 previous similar messages [2795136.231185] LustreError: dumping log to /tmp/lustre-log.1554950300.96226 [2795137.767205] LustreError: dumping log to /tmp/lustre-log.1554950301.96091 [2795154.451431] Lustre: 95907:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8ec853fa4850 x1629293874449264/t0(0) o4->1029bc9d-5417-6a10-709f-a2c75b097c08@10.9.102.5@o2ib4:13/0 lens 4456/0 e 0 to 0 dl 1554950323 ref 2 fl New:/2/ffffffff rc 0/-1 [2795154.480328] Lustre: 95907:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 3965 previous similar messages [2795166.951581] LustreError: dumping log to /tmp/lustre-log.1554950330.95891 [2795175.284901] Lustre: 96734:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (30:33s); client may timeout. req@ffff8ed327a4f850 x1628648590813856/t0(0) o3->b8034fa4-eaf4-cc28-568b-f59fc8afdda7@10.9.107.15@o2ib4:26/0 lens 488/0 e 0 to 0 dl 1554950306 ref 1 fl Interpret:H/0/ffffffff rc 0/-1 [2795175.313977] Lustre: 96734:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 6 previous similar messages [2795187.431854] LustreError: dumping log to /tmp/lustre-log.1554950351.96079 [2795204.841079] LNet: Service thread pid 96255 was inactive for 200.29s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2795204.854116] LNet: Skipped 8 previous similar messages [2795204.859350] LustreError: dumping log to /tmp/lustre-log.1554950368.96255 [2795226.061987] Lustre: 95893:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (20:6s); client may timeout. req@ffff8ebf8b0b5450 x1629292181394368/t0(0) o3->9871e41f-42f0-e762-946d-b12a7d261ff0@10.9.105.12@o2ib4:14/0 lens 488/0 e 0 to 0 dl 1554950384 ref 1 fl Interpret:H/2/ffffffff rc 0/-1 [2795226.090991] Lustre: 95893:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 7 previous similar messages [2795236.072491] LNet: Service thread pid 96164 was inactive for 200.38s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2795236.089609] LNet: Skipped 2 previous similar messages [2795236.094842] Pid: 96164, comm: mdt01_087 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2795236.104753] Call Trace: [2795236.107415] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2795236.114501] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2795236.121771] [] start_this_handle+0x1a1/0x430 [jbd2] [2795236.128522] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2795236.135344] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2795236.142966] [] osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2795236.150146] [] mdt_empty_transno+0xf7/0x850 [mdt] [2795236.156740] [] mdt_mfd_open+0x8de/0xe70 [mdt] [2795236.162966] [] mdt_finish_open+0x64b/0x760 [mdt] [2795236.169459] [] mdt_open_by_fid_lock+0x672/0x9b0 [mdt] [2795236.176382] [] mdt_reint_open+0x760/0x27d0 [mdt] [2795236.182864] [] mdt_reint_rec+0x83/0x210 [mdt] [2795236.189101] [] mdt_reint_internal+0x6e3/0xaf0 [mdt] [2795236.195861] [] mdt_intent_open+0x82/0x350 [mdt] [2795236.202254] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2795236.208922] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2795236.215865] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2795236.223171] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2795236.229509] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2795236.236628] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2795236.244519] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2795236.251034] [] kthread+0xd1/0xe0 [2795236.256137] [] ret_from_fork_nospec_begin+0xe/0x21 [2795236.262803] [] 0xffffffffffffffff [2795236.267982] LustreError: dumping log to /tmp/lustre-log.1554950400.96164 [2795239.656548] Pid: 96429, comm: mdt03_110 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2795239.666462] Call Trace: [2795239.669124] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2795239.676208] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2795239.683478] [] start_this_handle+0x1a1/0x430 [jbd2] [2795239.690215] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2795239.697035] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2795239.704659] [] osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2795239.711852] [] mdt_empty_transno+0xf7/0x850 [mdt] [2795239.718432] [] mdt_mfd_open+0x8de/0xe70 [mdt] [2795239.724671] [] mdt_finish_open+0x64b/0x760 [mdt] [2795239.731147] [] mdt_open_by_fid_lock+0x672/0x9b0 [mdt] [2795239.738078] [] mdt_reint_open+0x760/0x27d0 [mdt] [2795239.744563] [] mdt_reint_rec+0x83/0x210 [mdt] [2795239.750801] [] mdt_reint_internal+0x6e3/0xaf0 [mdt] [2795239.757546] [] mdt_intent_open+0x82/0x350 [mdt] [2795239.763957] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2795239.770623] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2795239.777566] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2795239.784848] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2795239.791181] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2795239.798288] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2795239.806189] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2795239.812692] [] kthread+0xd1/0xe0 [2795239.817793] [] ret_from_fork_nospec_begin+0xe/0x21 [2795239.824445] [] 0xffffffffffffffff [2795239.829647] LustreError: dumping log to /tmp/lustre-log.1554950403.96429 [2795240.681548] LNet: Service thread pid 95358 was inactive for 200.14s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2795240.698658] LNet: Skipped 1 previous similar message [2795240.703805] Pid: 95358, comm: mdt01_010 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2795240.713734] Call Trace: [2795240.716376] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2795240.723459] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2795240.730726] [] start_this_handle+0x1a1/0x430 [jbd2] [2795240.737455] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2795240.744288] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2795240.751903] [] osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2795240.759068] [] mdt_empty_transno+0xf7/0x850 [mdt] [2795240.765640] [] mdt_mfd_open+0x8de/0xe70 [mdt] [2795240.771877] [] mdt_finish_open+0x64b/0x760 [mdt] [2795240.778353] [] mdt_open_by_fid_lock+0x672/0x9b0 [mdt] [2795240.785276] [] mdt_reint_open+0x760/0x27d0 [mdt] [2795240.791760] [] mdt_reint_rec+0x83/0x210 [mdt] [2795240.797984] [] mdt_reint_internal+0x6e3/0xaf0 [mdt] [2795240.804718] [] mdt_intent_open+0x82/0x350 [mdt] [2795240.811120] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2795240.817771] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2795240.824732] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2795240.832018] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2795240.838353] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2795240.845469] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2795240.853372] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2795240.859873] [] kthread+0xd1/0xe0 [2795240.864973] [] ret_from_fork_nospec_begin+0xe/0x21 [2795240.871624] [] 0xffffffffffffffff [2795240.876815] LustreError: dumping log to /tmp/lustre-log.1554950404.95358 [2795243.752593] Pid: 96243, comm: mdt02_090 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2795243.762511] Call Trace: [2795243.765177] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2795243.772257] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2795243.779531] [] start_this_handle+0x1a1/0x430 [jbd2] [2795243.786263] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2795243.793099] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2795243.800709] [] osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2795243.807901] [] mdt_empty_transno+0xf7/0x850 [mdt] [2795243.814481] [] mdt_mfd_open+0x8de/0xe70 [mdt] [2795243.820721] [] mdt_finish_open+0x64b/0x760 [mdt] [2795243.827220] [] mdt_open_by_fid_lock+0x672/0x9b0 [mdt] [2795243.834151] [] mdt_reint_open+0x760/0x27d0 [mdt] [2795243.840640] [] mdt_reint_rec+0x83/0x210 [mdt] [2795243.846866] [] mdt_reint_internal+0x6e3/0xaf0 [mdt] [2795243.853603] [] mdt_intent_open+0x82/0x350 [mdt] [2795243.859992] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2795243.866648] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2795243.873605] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2795243.880888] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2795243.887223] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2795243.894328] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2795243.902215] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2795243.908714] [] kthread+0xd1/0xe0 [2795243.913818] [] ret_from_fork_nospec_begin+0xe/0x21 [2795243.920468] [] 0xffffffffffffffff [2795243.925666] LustreError: dumping log to /tmp/lustre-log.1554950407.96243 [2795244.435505] LustreError: 137-5: fir-MDT0000_UUID: not available for connect from 10.8.0.65@o2ib6 (no target). If you are running an HA pair check that the target is mounted on the other server. [2795244.435506] LustreError: 137-5: fir-MDT0002_UUID: not available for connect from 10.8.0.65@o2ib6 (no target). If you are running an HA pair check that the target is mounted on the other server. [2795250.920686] LNet: Service thread pid 96077 was inactive for 200.19s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2795250.937800] LNet: Skipped 1 previous similar message [2795250.942950] Pid: 96077, comm: mdt01_079 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2795250.952860] Call Trace: [2795250.955519] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2795250.962600] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2795250.969853] [] start_this_handle+0x1a1/0x430 [jbd2] [2795250.976609] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2795250.983434] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2795250.991060] [] osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2795250.998238] [] mdt_empty_transno+0xf7/0x850 [mdt] [2795251.004831] [] mdt_mfd_open+0x8de/0xe70 [mdt] [2795251.011070] [] mdt_finish_open+0x64b/0x760 [mdt] [2795251.017562] [] mdt_open_by_fid_lock+0x672/0x9b0 [mdt] [2795251.024471] [] mdt_reint_open+0x760/0x27d0 [mdt] [2795251.030944] [] mdt_reint_rec+0x83/0x210 [mdt] [2795251.037158] [] mdt_reint_internal+0x6e3/0xaf0 [mdt] [2795251.043919] [] mdt_intent_open+0x82/0x350 [mdt] [2795251.050317] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2795251.056989] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2795251.063932] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2795251.071226] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2795251.077587] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2795251.084719] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2795251.092611] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2795251.099124] [] kthread+0xd1/0xe0 [2795251.104214] [] ret_from_fork_nospec_begin+0xe/0x21 [2795251.110878] [] 0xffffffffffffffff [2795251.116064] LustreError: dumping log to /tmp/lustre-log.1554950415.96077 [2795269.197930] LustreError: 96245:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554950343, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8eb40b7c6c00/0x857bd8ea29e3100 lrc: 3/1,0 mode: --/PR res: [0x24000f649:0x1212d:0x0].0x0 bits 0x13/0x0 rrc: 9 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 96245 timeout: 0 lvb_type: 0 [2795270.888949] LustreError: dumping log to /tmp/lustre-log.1554950434.96289 [2795284.713133] LustreError: dumping log to /tmp/lustre-log.1554950448.96263 [2795301.097348] LustreError: dumping log to /tmp/lustre-log.1554950465.96223 [2795306.217410] LustreError: dumping log to /tmp/lustre-log.1554950470.95578 [2795324.649634] LustreError: dumping log to /tmp/lustre-log.1554950488.95879 [2795329.260692] LustreError: 95598:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554950403, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ed4b638bcc0/0x857bd8ea2b48487 lrc: 3/1,0 mode: --/PR res: [0x24000f649:0x1212d:0x0].0x0 bits 0x13/0x0 rrc: 11 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 95598 timeout: 0 lvb_type: 0 [2795331.817726] LustreError: dumping log to /tmp/lustre-log.1554950495.96277 [2795341.033835] LustreError: dumping log to /tmp/lustre-log.1554950505.96305 [2795350.761950] LustreError: dumping log to /tmp/lustre-log.1554950514.95955 [2795369.194194] LustreError: dumping log to /tmp/lustre-log.1554950533.95962 [2795379.946331] LustreError: dumping log to /tmp/lustre-log.1554950543.96245 [2795386.090409] LustreError: dumping log to /tmp/lustre-log.1554950550.95128 [2795389.272452] LustreError: 95552:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554950463, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ec8047d0d80/0x857bd8ea2cad521 lrc: 3/1,0 mode: --/PR res: [0x24000f649:0x1212d:0x0].0x0 bits 0x13/0x0 rrc: 13 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 95552 timeout: 0 lvb_type: 0 [2795390.616152] Lustre: 96716:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (30:134s); client may timeout. req@ffff8ec27d3ef850 x1628594440640352/t0(0) o3->6499b812-cd9e-69cd-2f7e-a245f3cf6b05@10.9.104.41@o2ib4:20/0 lens 488/0 e 0 to 0 dl 1554950420 ref 1 fl Interpret:H/0/ffffffff rc 0/-1 [2795390.645331] Lustre: 96716:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 2 previous similar messages [2795392.234490] LustreError: dumping log to /tmp/lustre-log.1554950556.95574 [2795395.818541] LustreError: dumping log to /tmp/lustre-log.1554950559.96555 [2795402.474628] LustreError: dumping log to /tmp/lustre-log.1554950566.96110 [2795406.570686] LustreError: dumping log to /tmp/lustre-log.1554950570.95343 [2795410.666732] LustreError: dumping log to /tmp/lustre-log.1554950574.95408 [2795415.786801] LustreError: dumping log to /tmp/lustre-log.1554950579.96096 [2795417.834825] LustreError: dumping log to /tmp/lustre-log.1554950581.95596 [2795423.978909] LustreError: dumping log to /tmp/lustre-log.1554950587.95970 [2795438.003983] Lustre: fir-MDT0001: Export ffff8edfbaaafc00 already connecting from 10.9.113.13@o2ib4 [2795439.339108] LustreError: dumping log to /tmp/lustre-log.1554950603.95598 [2795474.155547] LNet: Service thread pid 95973 was inactive for 200.38s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2795474.168593] LNet: Skipped 23 previous similar messages [2795474.173917] LustreError: dumping log to /tmp/lustre-log.1554950638.95973 [2795481.323647] LustreError: dumping log to /tmp/lustre-log.1554950645.95999 [2795485.419703] LustreError: dumping log to /tmp/lustre-log.1554950649.96299 [2795488.180645] Lustre: fir-MDT0001: Export ffff8edfbaaafc00 already connecting from 10.9.113.13@o2ib4 [2795499.755881] LustreError: dumping log to /tmp/lustre-log.1554950663.95552 [2795505.899964] LustreError: dumping log to /tmp/lustre-log.1554950669.95586 [2795509.383008] LustreError: 95131:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554950583, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ee2f6c0de80/0x857bd8ea2f71cba lrc: 3/1,0 mode: --/PR res: [0x24000f649:0x1212d:0x0].0x0 bits 0x13/0x0 rrc: 17 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 95131 timeout: 0 lvb_type: 0 [2795509.422652] LustreError: 95131:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 1 previous similar message [2795513.262379] LustreError: 137-5: fir-MDT0002_UUID: not available for connect from 10.8.0.65@o2ib6 (no target). If you are running an HA pair check that the target is mounted on the other server. [2795513.279753] LustreError: Skipped 1 previous similar message [2795538.357179] Lustre: fir-MDT0001: Export ffff8edfbaaafc00 already connecting from 10.9.113.13@o2ib4 [2795559.148649] LNet: Service thread pid 96292 was inactive for 200.03s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2795559.165763] Pid: 96292, comm: mdt00_101 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2795559.175676] Call Trace: [2795559.178321] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2795559.185449] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2795559.192830] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2795559.199836] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2795559.207025] [] mdt_getattr_name_lock+0x90a/0x1c30 [mdt] [2795559.214118] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2795559.220878] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2795559.227534] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2795559.234490] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2795559.241776] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2795559.248142] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2795559.255259] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2795559.263160] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2795559.269664] [] kthread+0xd1/0xe0 [2795559.274765] [] ret_from_fork_nospec_begin+0xe/0x21 [2795559.281419] [] 0xffffffffffffffff [2795559.286644] LustreError: dumping log to /tmp/lustre-log.1554950723.96292 [2795585.772999] Pid: 95354, comm: mdt02_006 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2795585.782962] Call Trace: [2795585.785617] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2795585.792715] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2795585.799976] [] start_this_handle+0x1a1/0x430 [jbd2] [2795585.806720] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2795585.813567] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2795585.821184] [] osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2795585.828399] [] tgt_client_data_update+0x303/0x5e0 [ptlrpc] [2795585.835796] [] tgt_client_new+0x41b/0x610 [ptlrpc] [2795585.842494] [] mdt_obd_connect+0x465/0x850 [mdt] [2795585.848977] [] target_handle_connect+0x109e/0x2950 [ptlrpc] [2795585.856447] [] tgt_request_handle+0x50a/0x1580 [ptlrpc] [2795585.863556] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2795585.871457] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2795585.877961] [] kthread+0xd1/0xe0 [2795585.883070] [] ret_from_fork_nospec_begin+0xe/0x21 [2795585.889722] [] 0xffffffffffffffff [2795585.894944] LustreError: dumping log to /tmp/lustre-log.1554950749.95354 [2795588.533855] Lustre: fir-MDT0001: Export ffff8edfbaaafc00 already connecting from 10.9.113.13@o2ib4 [2795598.217335] Lustre: 96716:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (20:49s); client may timeout. req@ffff8ecf2ad76450 x1629299333903632/t0(0) o3->39b26b25-3151-384d-34d9-da20c3dc7f78@10.8.7.35@o2ib6:13/0 lens 488/0 e 0 to 0 dl 1554950713 ref 1 fl Interpret:H/2/ffffffff rc 0/-1 [2795598.246245] Lustre: 96716:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 32 previous similar messages [2795619.565439] LNet: Service thread pid 95131 was inactive for 200.18s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2795619.582555] LNet: Skipped 1 previous similar message [2795619.587704] Pid: 95131, comm: mdt03_000 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2795619.597633] Call Trace: [2795619.600277] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2795619.607408] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2795619.614805] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2795619.621811] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2795619.629003] [] mdt_getattr_name_lock+0x90a/0x1c30 [mdt] [2795619.636085] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2795619.642844] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2795619.649502] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2795619.656449] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2795619.663732] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2795619.670084] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2795619.677200] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2795619.685117] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2795619.691623] [] kthread+0xd1/0xe0 [2795619.696725] [] ret_from_fork_nospec_begin+0xe/0x21 [2795619.703375] [] 0xffffffffffffffff [2795619.708583] LustreError: dumping log to /tmp/lustre-log.1554950783.95131 [2795623.149486] Pid: 96218, comm: mdt01_107 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2795623.159402] Call Trace: [2795623.162056] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2795623.169142] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2795623.176428] [] start_this_handle+0x1a1/0x430 [jbd2] [2795623.183165] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2795623.190000] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2795623.197611] [] osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2795623.204795] [] top_trans_start+0x702/0x940 [ptlrpc] [2795623.211583] [] lod_trans_start+0x34/0x40 [lod] [2795623.217896] [] mdd_trans_start+0x1a/0x20 [mdd] [2795623.224210] [] mdd_create+0xa50/0x1440 [mdd] [2795623.230351] [] mdt_reint_open+0x19d0/0x27d0 [mdt] [2795623.236948] [] mdt_reint_rec+0x83/0x210 [mdt] [2795623.243176] [] mdt_reint_internal+0x6e3/0xaf0 [mdt] [2795623.249916] [] mdt_intent_open+0x82/0x350 [mdt] [2795623.256317] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2795623.262967] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2795623.269906] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2795623.277188] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2795623.283530] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2795623.290641] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2795623.298546] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2795623.305061] [] kthread+0xd1/0xe0 [2795623.310162] [] ret_from_fork_nospec_begin+0xe/0x21 [2795623.316814] [] 0xffffffffffffffff [2795623.322004] LustreError: dumping log to /tmp/lustre-log.1554950787.96218 [2795631.853595] Pid: 96198, comm: mdt00_068 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2795631.863509] Call Trace: [2795631.866172] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2795631.873267] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2795631.880538] [] start_this_handle+0x1a1/0x430 [jbd2] [2795631.887276] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2795631.894116] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2795631.901743] [] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs] [2795631.908928] [] dqget+0x3fa/0x450 [2795631.914018] [] dquot_get_dqblk+0x14/0x1f0 [2795631.919877] [] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs] [2795631.927581] [] lquotactl_slv+0x27d/0x9d0 [lquota] [2795631.934143] [] mdt_quotactl+0x4d2/0x770 [mdt] [2795631.940380] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2795631.947516] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2795631.955415] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2795631.961933] [] kthread+0xd1/0xe0 [2795631.967039] [] ret_from_fork_nospec_begin+0xe/0x21 [2795631.973690] [] 0xffffffffffffffff [2795631.978887] LustreError: dumping log to /tmp/lustre-log.1554950795.96198 [2795637.997669] LustreError: dumping log to /tmp/lustre-log.1554950801.95515 [2795638.710448] Lustre: fir-MDT0001: Export ffff8edfbaaafc00 already connecting from 10.9.113.13@o2ib4 [2795642.605736] LustreError: dumping log to /tmp/lustre-log.1554950806.96296 [2795644.241736] Lustre: fir-MDT0003: Connection restored to (at 10.9.102.3@o2ib4) [2795644.249142] Lustre: Skipped 6260 previous similar messages [2795645.165797] LustreError: dumping log to /tmp/lustre-log.1554950809.95929 [2795651.821854] LustreError: dumping log to /tmp/lustre-log.1554950815.95589 [2795655.917966] LustreError: dumping log to /tmp/lustre-log.1554950819.96034 [2795672.302118] LustreError: dumping log to /tmp/lustre-log.1554950836.95869 [2795679.982217] LustreError: dumping log to /tmp/lustre-log.1554950843.96421 [2795688.887120] Lustre: fir-MDT0001: Export ffff8edfbaaafc00 already connecting from 10.9.113.13@o2ib4 [2795694.009977] Lustre: fir-MDT0001: Client f2e3283d-7971-ad13-864e-1dc90cb5dd2a (at 10.8.7.20@o2ib6) reconnecting [2795694.020153] Lustre: Skipped 6450 previous similar messages [2795701.486491] LustreError: dumping log to /tmp/lustre-log.1554950865.95122 [2795706.094549] LustreError: dumping log to /tmp/lustre-log.1554950870.95953 [2795710.887201] Lustre: fir-MDT0001: Export ffff8ec43ab3b800 already connecting from 10.9.114.14@o2ib4 [2795711.214621] LustreError: dumping log to /tmp/lustre-log.1554950875.95527 [2795719.918722] LustreError: dumping log to /tmp/lustre-log.1554950883.96386 [2795754.649165] Lustre: 96734:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8ed3284e3850 x1629019244926432/t0(0) o3->2fa1d9ab-19d7-0304-aef1-9c6080150aa9@10.8.8.31@o2ib6:13/0 lens 488/0 e 0 to 0 dl 1554950923 ref 2 fl New:/2/ffffffff rc 0/-1 [2795754.677899] Lustre: 96734:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 11324 previous similar messages [2795760.800412] LustreError: 95108:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 30s: evicting client at 10.8.8.32@o2ib6 ns: mdt-fir-MDT0003_UUID lock: ffff8ec235790240/0x857bd8ea311c10d lrc: 3/0,0 mode: PW/PW res: [0x280002b7f:0x535:0x0].0x0 bits 0x40/0x0 rrc: 18 type: IBT flags: 0x60200400000020 nid: 10.8.8.32@o2ib6 remote: 0xa4f795a250b329c expref: 39046 pid: 96174 timeout: 2795738 lvb_type: 0 [2795761.063830] Lustre: fir-MDT0001: Export ffff8ec43ab3b800 already connecting from 10.9.114.14@o2ib4 [2795761.072971] Lustre: Skipped 1 previous similar message [2795764.649541] LustreError: 95836:0:(ldlm_lockd.c:2322:ldlm_cancel_handler()) ldlm_cancel from 10.8.8.32@o2ib6 arrived at 1554950928 with bad export cookie 601157486662831927 [2795764.665005] LustreError: 95836:0:(ldlm_lockd.c:2322:ldlm_cancel_handler()) Skipped 17737 previous similar messages [2795788.527584] LustreError: dumping log to /tmp/lustre-log.1554950952.95896 [2795819.759987] LustreError: dumping log to /tmp/lustre-log.1554950983.96234 [2795822.320021] LustreError: dumping log to /tmp/lustre-log.1554950986.95529 [2795839.417108] Lustre: fir-MDT0001: Export ffff8edfbaaafc00 already connecting from 10.9.113.13@o2ib4 [2795839.426248] Lustre: Skipped 2 previous similar messages [2795840.752259] LustreError: dumping log to /tmp/lustre-log.1554951004.96326 [2795858.672486] LustreError: dumping log to /tmp/lustre-log.1554951022.96212 [2795872.229340] Lustre: 96734:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (30:58s); client may timeout. req@ffff8ed98a17f450 x1629290130585632/t0(0) o3->9a6d5107-a0dd-89cc-cc60-368ce86bc790@10.8.17.4@o2ib6:8/0 lens 488/0 e 0 to 0 dl 1554950978 ref 1 fl Interpret:H/2/ffffffff rc 0/-1 [2795872.258158] Lustre: 96734:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 439 previous similar messages [2795878.502395] LustreError: 96734:0:(service.c:2128:ptlrpc_server_handle_request()) @@@ Dropping timed-out request from 12345-10.8.17.4@o2ib6: deadline 30:2s ago req@ffff8edc84785450 x1629290130602096/t0(0) o3->9a6d5107-a0dd-89cc-cc60-368ce86bc790@10.8.17.4@o2ib6:10/0 lens 488/0 e 0 to 0 dl 1554951040 ref 1 fl Interpret:H/2/ffffffff rc 0/-1 [2795989.946919] Lustre: fir-MDT0001: Export ffff8edfbaaafc00 already connecting from 10.9.113.13@o2ib4 [2795989.956060] Lustre: Skipped 5 previous similar messages [2796054.770962] LNet: Service thread pid 96094 was inactive for 200.74s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2796054.788074] LNet: Skipped 2 previous similar messages [2796054.793309] Pid: 96094, comm: mdt00_063 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2796054.803244] Call Trace: [2796054.805903] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2796054.812996] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2796054.820262] [] start_this_handle+0x1a1/0x430 [jbd2] [2796054.827001] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2796054.833838] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2796054.841462] [] osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2796054.848654] [] mdt_empty_transno+0xf7/0x850 [mdt] [2796054.855228] [] mdt_mfd_open+0x8de/0xe70 [mdt] [2796054.861467] [] mdt_finish_open+0x64b/0x760 [mdt] [2796054.867955] [] mdt_open_by_fid_lock+0x672/0x9b0 [mdt] [2796054.874889] [] mdt_reint_open+0x760/0x27d0 [mdt] [2796054.881373] [] mdt_reint_rec+0x83/0x210 [mdt] [2796054.887613] [] mdt_reint_internal+0x6e3/0xaf0 [mdt] [2796054.894358] [] mdt_intent_open+0x82/0x350 [mdt] [2796054.900772] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2796054.907441] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2796054.914403] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2796054.921695] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2796054.928044] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2796054.935153] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2796054.943056] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2796054.949559] [] kthread+0xd1/0xe0 [2796054.954658] [] ret_from_fork_nospec_begin+0xe/0x21 [2796054.961308] [] 0xffffffffffffffff [2796054.966509] LustreError: dumping log to /tmp/lustre-log.1554951218.96094 [2796076.787246] Pid: 96176, comm: mdt01_093 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2796076.797158] Call Trace: [2796076.799818] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2796076.806897] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2796076.814168] [] start_this_handle+0x1a1/0x430 [jbd2] [2796076.820901] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2796076.827737] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2796076.835350] [] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs] [2796076.842532] [] dqget+0x3fa/0x450 [2796076.847623] [] dquot_get_dqblk+0x14/0x1f0 [2796076.853481] [] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs] [2796076.861186] [] lquotactl_slv+0x27d/0x9d0 [lquota] [2796076.867748] [] mdt_quotactl+0x4d2/0x770 [mdt] [2796076.873985] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2796076.881121] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2796076.889020] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2796076.895534] [] kthread+0xd1/0xe0 [2796076.900633] [] ret_from_fork_nospec_begin+0xe/0x21 [2796076.907285] [] 0xffffffffffffffff [2796076.912477] LustreError: dumping log to /tmp/lustre-log.1554951240.96176 [2796077.299249] Pid: 96338, comm: mdt_rdpg00_007 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2796077.309600] Call Trace: [2796077.312269] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2796077.319355] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2796077.326631] [] start_this_handle+0x1a1/0x430 [jbd2] [2796077.333368] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2796077.340204] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2796077.347817] [] osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2796077.355006] [] top_trans_start+0x702/0x940 [ptlrpc] [2796077.361807] [] lod_trans_start+0x34/0x40 [lod] [2796077.368134] [] mdd_trans_start+0x1a/0x20 [mdd] [2796077.374449] [] mdd_xattr_set+0x3d5/0x17d0 [mdd] [2796077.380861] [] mdt_set_som+0xf4/0x2f0 [mdt] [2796077.386921] [] mdt_lsom_update+0x120/0x440 [mdt] [2796077.393418] [] mdt_mfd_close+0x516/0x850 [mdt] [2796077.399730] [] mdt_close_internal+0x121/0x220 [mdt] [2796077.406487] [] mdt_close+0x220/0x780 [mdt] [2796077.412455] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2796077.419584] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2796077.427476] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2796077.434013] [] kthread+0xd1/0xe0 [2796077.439104] [] ret_from_fork_nospec_begin+0xe/0x21 [2796077.445768] [] 0xffffffffffffffff [2796077.450955] LustreError: dumping log to /tmp/lustre-log.1554951241.96338 [2796081.907302] Pid: 95979, comm: mdt03_023 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2796081.917211] Call Trace: [2796081.919867] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2796081.926953] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2796081.934222] [] start_this_handle+0x1a1/0x430 [jbd2] [2796081.940961] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2796081.947817] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2796081.955432] [] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs] [2796081.962614] [] dqget+0x3fa/0x450 [2796081.967705] [] dquot_get_dqblk+0x14/0x1f0 [2796081.973571] [] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs] [2796081.981275] [] lquotactl_slv+0x27d/0x9d0 [lquota] [2796081.987838] [] mdt_quotactl+0x4d2/0x770 [mdt] [2796081.994074] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2796082.001203] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2796082.009102] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2796082.015621] [] kthread+0xd1/0xe0 [2796082.020726] [] ret_from_fork_nospec_begin+0xe/0x21 [2796082.027375] [] 0xffffffffffffffff [2796082.032572] LustreError: dumping log to /tmp/lustre-log.1554951246.95979 [2796086.515354] Pid: 95401, comm: mdt03_008 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2796086.525269] Call Trace: [2796086.527925] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2796086.535009] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2796086.542284] [] start_this_handle+0x1a1/0x430 [jbd2] [2796086.549015] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2796086.555852] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2796086.563454] [] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs] [2796086.570636] [] dqget+0x3fa/0x450 [2796086.575727] [] dquot_get_dqblk+0x14/0x1f0 [2796086.581594] [] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs] [2796086.589298] [] lquotactl_slv+0x27d/0x9d0 [lquota] [2796086.595859] [] mdt_quotactl+0x4d2/0x770 [mdt] [2796086.602097] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2796086.609225] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2796086.617126] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2796086.623644] [] kthread+0xd1/0xe0 [2796086.628738] [] ret_from_fork_nospec_begin+0xe/0x21 [2796086.635390] [] 0xffffffffffffffff [2796086.640586] LustreError: dumping log to /tmp/lustre-log.1554951250.95401 [2796119.283703] LNet: Service thread pid 95430 was inactive for 200.25s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2796119.296739] LNet: Skipped 22 previous similar messages [2796119.302062] LustreError: dumping log to /tmp/lustre-log.1554951283.95430 [2796120.825736] LustreError: 137-5: fir-MDT0000_UUID: not available for connect from 10.8.0.65@o2ib6 (no target). If you are running an HA pair check that the target is mounted on the other server. [2796120.843100] LustreError: Skipped 3 previous similar messages [2796139.763928] LustreError: dumping log to /tmp/lustre-log.1554951303.95126 [2796158.708174] LustreError: dumping log to /tmp/lustre-log.1554951322.95966 [2796163.828237] LustreError: dumping log to /tmp/lustre-log.1554951327.95934 [2796168.436298] LustreError: dumping log to /tmp/lustre-log.1554951332.95547 [2796215.540898] LustreError: dumping log to /tmp/lustre-log.1554951379.96174 [2796216.052896] LustreError: dumping log to /tmp/lustre-log.1554951380.95870 [2796244.269990] Lustre: fir-MDT0001: Connection restored to 3d0bb7bb-d871-200a-0553-40fe5b229159 (at 10.9.103.11@o2ib4) [2796244.280601] Lustre: Skipped 7899 previous similar messages [2796246.556546] Lustre: fir-MDT0001: Export ffff8ebad3157400 already connecting from 10.9.101.53@o2ib4 [2796246.565681] Lustre: Skipped 10 previous similar messages [2796249.333300] LustreError: dumping log to /tmp/lustre-log.1554951413.96013 [2796294.061603] Lustre: fir-MDT0001: Client 392d8331-072b-04ee-380d-4c214c504d41 (at 10.9.101.40@o2ib4) reconnecting [2796294.071953] Lustre: Skipped 7949 previous similar messages [2796310.774029] LustreError: dumping log to /tmp/lustre-log.1554951474.96424 [2796331.766287] LustreError: dumping log to /tmp/lustre-log.1554951495.96228 [2796336.374352] LustreError: dumping log to /tmp/lustre-log.1554951500.96023 [2796342.006427] LustreError: dumping log to /tmp/lustre-log.1554951505.96281 [2796347.638482] LustreError: dumping log to /tmp/lustre-log.1554951511.96178 [2796354.854610] Lustre: 96734:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8ed99b30a450 x1630440980689856/t0(0) o4->e75095d8-7c9e-d169-de21-b278ca100ccd@10.9.101.10@o2ib4:13/0 lens 488/0 e 0 to 0 dl 1554951523 ref 2 fl New:/2/ffffffff rc 0/-1 [2796354.883515] Lustre: 96734:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 14397 previous similar messages [2796394.231048] LNet: Service thread pid 95568 was inactive for 200.49s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2796394.248161] LNet: Skipped 4 previous similar messages [2796394.253396] Pid: 95568, comm: mdt00_024 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2796394.263325] Call Trace: [2796394.265987] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2796394.273073] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2796394.280341] [] start_this_handle+0x1a1/0x430 [jbd2] [2796394.287078] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2796394.293926] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2796394.301534] [] osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2796394.308717] [] tgt_client_data_update+0x303/0x5e0 [ptlrpc] [2796394.316113] [] tgt_client_new+0x41b/0x610 [ptlrpc] [2796394.322800] [] mdt_obd_connect+0x465/0x850 [mdt] [2796394.329285] [] target_handle_connect+0x109e/0x2950 [ptlrpc] [2796394.336754] [] tgt_request_handle+0x50a/0x1580 [ptlrpc] [2796394.343872] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2796394.351773] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2796394.358276] [] kthread+0xd1/0xe0 [2796394.363392] [] ret_from_fork_nospec_begin+0xe/0x21 [2796394.370044] [] 0xffffffffffffffff [2796394.375246] LustreError: dumping log to /tmp/lustre-log.1554951558.95568 [2796409.079229] Pid: 96543, comm: mdt_rdpg02_025 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2796409.089594] Call Trace: [2796409.092246] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2796409.099342] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2796409.106612] [] start_this_handle+0x1a1/0x430 [jbd2] [2796409.113363] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2796409.120200] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2796409.127810] [] osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2796409.135002] [] mdt_empty_transno+0xf7/0x850 [mdt] [2796409.141574] [] mdt_close+0x232/0x780 [mdt] [2796409.147552] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2796409.154678] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2796409.162580] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2796409.169083] [] kthread+0xd1/0xe0 [2796409.174169] [] ret_from_fork_nospec_begin+0xe/0x21 [2796409.180838] [] 0xffffffffffffffff [2796409.186028] LustreError: dumping log to /tmp/lustre-log.1554951573.96543 [2796420.343359] Pid: 96419, comm: mdt03_102 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2796420.353268] Call Trace: [2796420.355932] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2796420.363019] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2796420.370296] [] start_this_handle+0x1a1/0x430 [jbd2] [2796420.377031] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2796420.383866] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2796420.391478] [] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs] [2796420.398683] [] dqget+0x3fa/0x450 [2796420.403778] [] dquot_get_dqblk+0x14/0x1f0 [2796420.409635] [] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs] [2796420.417347] [] lquotactl_slv+0x27d/0x9d0 [lquota] [2796420.423911] [] mdt_quotactl+0x4d2/0x770 [mdt] [2796420.430145] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2796420.437273] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2796420.445175] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2796420.451678] [] kthread+0xd1/0xe0 [2796420.456779] [] ret_from_fork_nospec_begin+0xe/0x21 [2796420.463444] [] 0xffffffffffffffff [2796420.468650] LustreError: dumping log to /tmp/lustre-log.1554951584.96419 [2796421.367377] Pid: 96080, comm: mdt01_082 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2796421.377289] Call Trace: [2796421.379945] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2796421.387032] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2796421.394306] [] start_this_handle+0x1a1/0x430 [jbd2] [2796421.401046] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2796421.407882] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2796421.415493] [] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs] [2796421.422690] [] dqget+0x3fa/0x450 [2796421.427782] [] dquot_get_dqblk+0x14/0x1f0 [2796421.433642] [] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs] [2796421.441354] [] lquotactl_slv+0x27d/0x9d0 [lquota] [2796421.447924] [] mdt_quotactl+0x4d2/0x770 [mdt] [2796421.454160] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2796421.461298] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2796421.469199] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2796421.475703] [] kthread+0xd1/0xe0 [2796421.480804] [] ret_from_fork_nospec_begin+0xe/0x21 [2796421.487469] [] 0xffffffffffffffff [2796421.492668] LustreError: dumping log to /tmp/lustre-log.1554951585.96080 [2796424.951419] Pid: 95582, comm: mdt00_031 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2796424.961332] Call Trace: [2796424.963979] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2796424.971074] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2796424.978344] [] start_this_handle+0x1a1/0x430 [jbd2] [2796424.985079] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2796424.991915] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2796424.999529] [] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs] [2796425.006703] [] dqget+0x3fa/0x450 [2796425.011790] [] dquot_get_dqblk+0x14/0x1f0 [2796425.017663] [] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs] [2796425.025363] [] lquotactl_slv+0x27d/0x9d0 [lquota] [2796425.031937] [] mdt_quotactl+0x4d2/0x770 [mdt] [2796425.038165] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2796425.045302] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2796425.053191] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2796425.059707] [] kthread+0xd1/0xe0 [2796425.064796] [] ret_from_fork_nospec_begin+0xe/0x21 [2796425.071473] [] 0xffffffffffffffff [2796425.076665] LustreError: dumping log to /tmp/lustre-log.1554951589.95582 [2796427.511451] LustreError: dumping log to /tmp/lustre-log.1554951591.95545 [2796430.071492] LustreError: dumping log to /tmp/lustre-log.1554951594.96195 [2796431.095494] LustreError: dumping log to /tmp/lustre-log.1554951595.95587 [2796458.231823] LustreError: dumping log to /tmp/lustre-log.1554951622.96061 [2796464.375908] LustreError: dumping log to /tmp/lustre-log.1554951628.95894 [2796469.288965] LustreError: 96180:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554951543, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ebe94c08900/0x857bd8ea313ebdb lrc: 3/1,0 mode: --/PR res: [0x240012262:0x1dd3:0x0].0x0 bits 0x13/0x0 rrc: 14 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 96180 timeout: 0 lvb_type: 0 [2796469.328523] LustreError: 96180:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 1 previous similar message [2796469.495962] LustreError: dumping log to /tmp/lustre-log.1554951633.96408 [2796473.080008] LustreError: dumping log to /tmp/lustre-log.1554951637.96088 [2796477.688057] LustreError: dumping log to /tmp/lustre-log.1554951641.95887 [2796481.272100] LustreError: dumping log to /tmp/lustre-log.1554951645.96056 [2796482.808121] LustreError: dumping log to /tmp/lustre-log.1554951646.96085 [2796504.312389] LustreError: dumping log to /tmp/lustre-log.1554951668.95513 [2796509.618859] Lustre: 96387:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (30:41s); client may timeout. req@ffff8eee5a6e5c50 x1629292181564608/t0(0) o3->9871e41f-42f0-e762-946d-b12a7d261ff0@10.9.105.12@o2ib4:2/0 lens 488/0 e 0 to 0 dl 1554951632 ref 1 fl Interpret:H/2/ffffffff rc 0/-1 [2796509.647835] Lustre: 96387:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 278 previous similar messages [2796522.744591] LustreError: dumping log to /tmp/lustre-log.1554951686.96316 [2796552.440939] LustreError: dumping log to /tmp/lustre-log.1554951716.96027 [2796578.041254] LustreError: dumping log to /tmp/lustre-log.1554951742.96325 [2796579.577275] LustreError: dumping log to /tmp/lustre-log.1554951743.96180 [2796580.089278] LustreError: dumping log to /tmp/lustre-log.1554951744.96011 [2796583.161318] LustreError: dumping log to /tmp/lustre-log.1554951747.95575 [2796584.697337] LustreError: dumping log to /tmp/lustre-log.1554951748.95571 [2796585.209342] LustreError: dumping log to /tmp/lustre-log.1554951749.96309 [2796586.233356] LustreError: dumping log to /tmp/lustre-log.1554951750.95421 [2796587.257377] LustreError: dumping log to /tmp/lustre-log.1554951751.96418 [2796599.545519] LustreError: dumping log to /tmp/lustre-log.1554951763.96197 [2796606.713614] LustreError: dumping log to /tmp/lustre-log.1554951770.95130 [2796640.835063] LustreError: 96300:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554951714, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ee9a6bcb840/0x857bd8ea314518c lrc: 3/1,0 mode: --/PR res: [0x240012262:0x1dd3:0x0].0x0 bits 0x13/0x0 rrc: 15 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 96300 timeout: 0 lvb_type: 0 [2796640.874621] LustreError: 96300:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 9 previous similar messages [2796677.164992] LustreError: 95893:0:(service.c:2128:ptlrpc_server_handle_request()) @@@ Dropping timed-out request from 12345-10.8.17.19@o2ib6: deadline 30:1s ago req@ffff8eb9e82a2850 x1628647419313600/t0(0) o3->eeceaf0a-f64b-44e4-4f28-fac655ebb0a4@10.8.17.19@o2ib6:0/0 lens 488/0 e 0 to 0 dl 1554951840 ref 1 fl Interpret:H/2/ffffffff rc 0/-1 [2796677.370526] LustreError: dumping log to /tmp/lustre-log.1554951841.95932 [2796710.650940] LNet: Service thread pid 95177 was inactive for 200.39s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2796710.668054] LNet: Skipped 4 previous similar messages [2796710.673287] Pid: 95177, comm: mdt02_003 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2796710.683218] Call Trace: [2796710.685878] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2796710.692971] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2796710.700241] [] start_this_handle+0x1a1/0x430 [jbd2] [2796710.706994] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2796710.713831] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2796710.721441] [] osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2796710.728631] [] tgt_client_data_update+0x303/0x5e0 [ptlrpc] [2796710.736033] [] tgt_client_new+0x41b/0x610 [ptlrpc] [2796710.742733] [] mdt_obd_connect+0x465/0x850 [mdt] [2796710.749220] [] target_handle_connect+0x109e/0x2950 [ptlrpc] [2796710.756687] [] tgt_request_handle+0x50a/0x1580 [ptlrpc] [2796710.763805] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2796710.771707] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2796710.778226] [] kthread+0xd1/0xe0 [2796710.783328] [] ret_from_fork_nospec_begin+0xe/0x21 [2796710.789979] [] 0xffffffffffffffff [2796710.795178] LustreError: dumping log to /tmp/lustre-log.1554951874.95177 [2796734.203233] Pid: 96070, comm: mdt01_076 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2796734.213147] Call Trace: [2796734.215804] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2796734.222898] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2796734.230167] [] start_this_handle+0x1a1/0x430 [jbd2] [2796734.236906] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2796734.243748] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2796734.251359] [] osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2796734.258548] [] tgt_client_data_update+0x303/0x5e0 [ptlrpc] [2796734.265946] [] tgt_client_new+0x41b/0x610 [ptlrpc] [2796734.272643] [] mdt_obd_connect+0x465/0x850 [mdt] [2796734.279127] [] target_handle_connect+0x109e/0x2950 [ptlrpc] [2796734.286595] [] tgt_request_handle+0x50a/0x1580 [ptlrpc] [2796734.293713] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2796734.301614] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2796734.308118] [] kthread+0xd1/0xe0 [2796734.313203] [] ret_from_fork_nospec_begin+0xe/0x21 [2796734.319879] [] 0xffffffffffffffff [2796734.325055] LustreError: dumping log to /tmp/lustre-log.1554951898.96070 [2796738.299284] Pid: 96247, comm: mdt03_055 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2796738.309196] Call Trace: [2796738.311855] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2796738.318939] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2796738.326210] [] start_this_handle+0x1a1/0x430 [jbd2] [2796738.332944] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2796738.339780] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2796738.347417] [] osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2796738.354609] [] mdt_empty_transno+0xf7/0x850 [mdt] [2796738.361189] [] mdt_mfd_open+0x8de/0xe70 [mdt] [2796738.367428] [] mdt_finish_open+0x64b/0x760 [mdt] [2796738.373913] [] mdt_open_by_fid_lock+0x672/0x9b0 [mdt] [2796738.380845] [] mdt_reint_open+0x760/0x27d0 [mdt] [2796738.387330] [] mdt_reint_rec+0x83/0x210 [mdt] [2796738.393566] [] mdt_reint_internal+0x6e3/0xaf0 [mdt] [2796738.400313] [] mdt_intent_open+0x82/0x350 [mdt] [2796738.406719] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2796738.413396] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2796738.420346] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2796738.427634] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2796738.433989] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2796738.441097] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2796738.448999] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2796738.455500] [] kthread+0xd1/0xe0 [2796738.460605] [] ret_from_fork_nospec_begin+0xe/0x21 [2796738.467256] [] 0xffffffffffffffff [2796738.472466] LustreError: dumping log to /tmp/lustre-log.1554951902.96247 [2796744.955370] Pid: 96225, comm: mdt01_109 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2796744.965286] Call Trace: [2796744.967949] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2796744.975043] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2796744.982320] [] start_this_handle+0x1a1/0x430 [jbd2] [2796744.989058] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2796744.995890] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2796745.003506] [] osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2796745.010695] [] tgt_client_data_update+0x303/0x5e0 [ptlrpc] [2796745.018107] [] tgt_client_new+0x41b/0x610 [ptlrpc] [2796745.024804] [] mdt_obd_connect+0x465/0x850 [mdt] [2796745.031291] [] target_handle_connect+0x109e/0x2950 [ptlrpc] [2796745.038760] [] tgt_request_handle+0x50a/0x1580 [ptlrpc] [2796745.045877] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2796745.053787] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2796745.060289] [] kthread+0xd1/0xe0 [2796745.065392] [] ret_from_fork_nospec_begin+0xe/0x21 [2796745.072042] [] 0xffffffffffffffff [2796745.077241] LustreError: dumping log to /tmp/lustre-log.1554951909.96225 [2796751.099448] Pid: 96300, comm: mdt03_067 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2796751.109368] Call Trace: [2796751.112032] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2796751.119138] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2796751.126524] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2796751.133542] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2796751.140735] [] mdt_getattr_name_lock+0x90a/0x1c30 [mdt] [2796751.147817] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2796751.154566] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2796751.161232] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2796751.168187] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2796751.175463] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2796751.181815] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2796751.188947] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2796751.196853] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2796751.203355] [] kthread+0xd1/0xe0 [2796751.208457] [] ret_from_fork_nospec_begin+0xe/0x21 [2796751.215121] [] 0xffffffffffffffff [2796751.220339] LustreError: dumping log to /tmp/lustre-log.1554951915.96300 [2796762.548502] Lustre: fir-MDT0001: Export ffff8ec43ab3b800 already connecting from 10.9.114.14@o2ib4 [2796762.557638] Lustre: Skipped 93 previous similar messages [2796781.819830] LNet: Service thread pid 96113 was inactive for 200.07s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2796781.832866] LNet: Skipped 41 previous similar messages [2796781.838188] LustreError: dumping log to /tmp/lustre-log.1554951945.96113 [2796783.355844] LustreError: dumping log to /tmp/lustre-log.1554951947.96609 [2796789.499917] LustreError: dumping log to /tmp/lustre-log.1554951953.96100 [2796829.948426] LustreError: dumping log to /tmp/lustre-log.1554951993.96116 [2796839.894559] LustreError: 96306:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554951913, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ebb5cc94140/0x857bd8ea319ac7b lrc: 3/1,0 mode: --/PR res: [0x240012262:0x1dd3:0x0].0x0 bits 0x13/0x0 rrc: 17 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 96306 timeout: 0 lvb_type: 0 [2796844.279583] Lustre: fir-MDT0003: Connection restored to 58b75192-b975-3fa8-f93d-47a68dd1093f (at 10.9.103.14@o2ib4) [2796844.290193] Lustre: Skipped 8664 previous similar messages [2796860.156806] LustreError: dumping log to /tmp/lustre-log.1554952024.95127 [2796866.300899] LustreError: dumping log to /tmp/lustre-log.1554952030.95926 [2796894.074967] Lustre: fir-MDT0003: Client c38b3c3a-063d-bcab-2e04-567bdde86398 (at 10.9.107.39@o2ib4) reconnecting [2796894.085321] Lustre: Skipped 8733 previous similar messages [2796916.989530] LustreError: dumping log to /tmp/lustre-log.1554952080.96290 [2796923.133614] LustreError: dumping log to /tmp/lustre-log.1554952087.95525 [2796927.229674] LustreError: dumping log to /tmp/lustre-log.1554952091.95537 [2796950.269962] LustreError: dumping log to /tmp/lustre-log.1554952114.96306 [2796951.805979] LustreError: dumping log to /tmp/lustre-log.1554952115.95512 [2796952.829989] LustreError: dumping log to /tmp/lustre-log.1554952116.95906 [2796953.854004] LustreError: dumping log to /tmp/lustre-log.1554952117.95508 [2796955.056026] Lustre: 96734:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8edbaa83a850 x1628640777528496/t0(0) o4->e5407977-2002-52d7-281b-57025bf38084@10.9.107.13@o2ib4:14/0 lens 1384/0 e 0 to 0 dl 1554952124 ref 2 fl New:/2/ffffffff rc 0/-1 [2796955.085008] Lustre: 96734:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 16870 previous similar messages [2796970.750216] LustreError: dumping log to /tmp/lustre-log.1554952134.96111 [2796975.358272] LustreError: dumping log to /tmp/lustre-log.1554952139.96273 [2796980.478334] LustreError: dumping log to /tmp/lustre-log.1554952144.95420 [2797005.151655] LustreError: 96264:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554952079, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ee258056c00/0x857bd8ea319fa70 lrc: 3/1,0 mode: --/PR res: [0x240012262:0x1dd3:0x0].0x0 bits 0x13/0x0 rrc: 19 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 96264 timeout: 0 lvb_type: 0 [2797005.191215] LustreError: 96264:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 2 previous similar messages [2797027.582936] Pid: 95563, comm: mdt01_033 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2797027.592850] Call Trace: [2797027.595508] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2797027.602594] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2797027.609864] [] start_this_handle+0x1a1/0x430 [jbd2] [2797027.616600] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2797027.623433] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2797027.631047] [] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs] [2797027.638213] [] dqget+0x3fa/0x450 [2797027.643302] [] dquot_get_dqblk+0x14/0x1f0 [2797027.649159] [] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs] [2797027.656871] [] lquotactl_slv+0x27d/0x9d0 [lquota] [2797027.663432] [] mdt_quotactl+0x4d2/0x770 [mdt] [2797027.669673] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2797027.676809] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2797027.684717] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2797027.691220] [] kthread+0xd1/0xe0 [2797027.696305] [] ret_from_fork_nospec_begin+0xe/0x21 [2797027.702955] [] 0xffffffffffffffff [2797027.708154] LustreError: dumping log to /tmp/lustre-log.1554952191.95563 [2797032.703002] Pid: 95355, comm: mdt01_009 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2797032.712916] Call Trace: [2797032.715573] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2797032.722656] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2797032.729930] [] start_this_handle+0x1a1/0x430 [jbd2] [2797032.736657] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2797032.743499] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2797032.751112] [] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs] [2797032.758302] [] dqget+0x3fa/0x450 [2797032.763392] [] dquot_get_dqblk+0x14/0x1f0 [2797032.769251] [] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs] [2797032.776973] [] lquotactl_slv+0x27d/0x9d0 [lquota] [2797032.783541] [] mdt_quotactl+0x4d2/0x770 [mdt] [2797032.789781] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2797032.796913] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2797032.804817] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2797032.811318] [] kthread+0xd1/0xe0 [2797032.816420] [] ret_from_fork_nospec_begin+0xe/0x21 [2797032.823072] [] 0xffffffffffffffff [2797032.828273] LustreError: dumping log to /tmp/lustre-log.1554952196.95355 [2797037.311059] Pid: 96208, comm: mdt02_081 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2797037.320975] Call Trace: [2797037.323638] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2797037.330724] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2797037.337996] [] start_this_handle+0x1a1/0x430 [jbd2] [2797037.344730] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2797037.351564] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2797037.359175] [] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs] [2797037.366359] [] dqget+0x3fa/0x450 [2797037.371451] [] dquot_get_dqblk+0x14/0x1f0 [2797037.377309] [] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs] [2797037.385028] [] lquotactl_slv+0x27d/0x9d0 [lquota] [2797037.391599] [] mdt_quotactl+0x4d2/0x770 [mdt] [2797037.397844] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2797037.404990] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2797037.412893] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2797037.419396] [] kthread+0xd1/0xe0 [2797037.424497] [] ret_from_fork_nospec_begin+0xe/0x21 [2797037.431146] [] 0xffffffffffffffff [2797037.436345] LustreError: dumping log to /tmp/lustre-log.1554952201.96208 [2797053.183260] Pid: 95123, comm: mdt00_001 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2797053.193175] Call Trace: [2797053.195833] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2797053.202917] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2797053.210190] [] start_this_handle+0x1a1/0x430 [jbd2] [2797053.216922] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2797053.223759] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2797053.231369] [] osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2797053.238561] [] mdt_empty_transno+0xf7/0x850 [mdt] [2797053.245135] [] mdt_mfd_open+0x8de/0xe70 [mdt] [2797053.251411] [] mdt_finish_open+0x64b/0x760 [mdt] [2797053.257905] [] mdt_open_by_fid_lock+0x672/0x9b0 [mdt] [2797053.264840] [] mdt_reint_open+0x760/0x27d0 [mdt] [2797053.271325] [] mdt_reint_rec+0x83/0x210 [mdt] [2797053.277564] [] mdt_reint_internal+0x6e3/0xaf0 [mdt] [2797053.284308] [] mdt_intent_open+0x82/0x350 [mdt] [2797053.290719] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2797053.297378] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2797053.304336] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2797053.311618] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2797053.317998] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2797053.325119] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2797053.333031] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2797053.339531] [] kthread+0xd1/0xe0 [2797053.344636] [] ret_from_fork_nospec_begin+0xe/0x21 [2797053.351286] [] 0xffffffffffffffff [2797053.356483] LustreError: dumping log to /tmp/lustre-log.1554952217.95123 [2797115.648041] Pid: 96264, comm: mdt03_060 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2797115.657953] Call Trace: [2797115.660596] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2797115.667709] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2797115.675095] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2797115.682097] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2797115.689286] [] mdt_getattr_name_lock+0x90a/0x1c30 [mdt] [2797115.696372] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2797115.703145] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2797115.709797] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2797115.716736] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2797115.724017] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2797115.730368] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2797115.737478] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2797115.745378] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2797115.751882] [] kthread+0xd1/0xe0 [2797115.756967] [] ret_from_fork_nospec_begin+0xe/0x21 [2797115.763624] [] 0xffffffffffffffff [2797115.768826] LustreError: dumping log to /tmp/lustre-log.1554952279.96264 [2797125.623276] Lustre: 96507:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (30:93s); client may timeout. req@ffff8ed880096c50 x1630378142622608/t0(0) o3->91ab2c2b-c1df-91e3-c2aa-127eb3d0faf2@10.8.18.17@o2ib6:26/0 lens 488/0 e 0 to 0 dl 1554952196 ref 1 fl Interpret:H/2/ffffffff rc 0/-1 [2797125.652264] Lustre: 96507:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 840 previous similar messages [2797169.325721] LustreError: 96194:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554952243, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8ebffdacc380/0x857bd8ea31a5554 lrc: 3/1,0 mode: --/PR res: [0x240012262:0x1dd3:0x0].0x0 bits 0x13/0x0 rrc: 24 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 96194 timeout: 0 lvb_type: 0 [2797169.365280] LustreError: 96194:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 2 previous similar messages [2797169.408719] LustreError: dumping log to /tmp/lustre-log.1554952333.95580 [2797174.528785] LustreError: dumping log to /tmp/lustre-log.1554952338.96078 [2797179.648845] LustreError: dumping log to /tmp/lustre-log.1554952343.96224 [2797201.665121] LustreError: dumping log to /tmp/lustre-log.1554952365.95349 [2797206.273181] LustreError: dumping log to /tmp/lustre-log.1554952370.96205 [2797211.905261] LustreError: dumping log to /tmp/lustre-log.1554952375.96241 [2797240.577611] LustreError: dumping log to /tmp/lustre-log.1554952404.96293 [2797245.697678] LustreError: dumping log to /tmp/lustre-log.1554952409.96166 [2797250.817741] LustreError: dumping log to /tmp/lustre-log.1554952414.95565 [2797271.297999] LustreError: dumping log to /tmp/lustre-log.1554952435.95881 [2797279.490104] LustreError: dumping log to /tmp/lustre-log.1554952443.95992 [2797295.874346] LustreError: dumping log to /tmp/lustre-log.1554952459.96229 [2797363.644155] Lustre: fir-MDT0001: Export ffff8ec43ab3b800 already connecting from 10.9.114.14@o2ib4 [2797363.653292] Lustre: Skipped 179 previous similar messages [2797444.335597] Lustre: fir-MDT0003: Connection restored to 569d2d44-944c-64a8-9fcd-3d42238341e5 (at 10.8.7.25@o2ib6) [2797444.346040] Lustre: Skipped 8990 previous similar messages [2797462.788415] LNet: Service thread pid 96167 was inactive for 200.49s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2797462.805538] LNet: Skipped 9 previous similar messages [2797462.810773] Pid: 96167, comm: mdt01_089 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2797462.820687] Call Trace: [2797462.823351] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2797462.830449] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2797462.837706] [] start_this_handle+0x1a1/0x430 [jbd2] [2797462.844456] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2797462.851280] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2797462.858901] [] osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2797462.866081] [] mdt_empty_transno+0xf7/0x850 [mdt] [2797462.872673] [] mdt_mfd_open+0x8de/0xe70 [mdt] [2797462.878900] [] mdt_finish_open+0x64b/0x760 [mdt] [2797462.885395] [] mdt_open_by_fid_lock+0x672/0x9b0 [mdt] [2797462.892317] [] mdt_reint_open+0x760/0x27d0 [mdt] [2797462.898834] [] mdt_reint_rec+0x83/0x210 [mdt] [2797462.905046] [] mdt_reint_internal+0x6e3/0xaf0 [mdt] [2797462.911805] [] mdt_intent_open+0x82/0x350 [mdt] [2797462.918204] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2797462.924864] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2797462.931808] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2797462.939104] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2797462.945444] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2797462.952571] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2797462.960460] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2797462.966989] [] kthread+0xd1/0xe0 [2797462.972074] [] ret_from_fork_nospec_begin+0xe/0x21 [2797462.978738] [] 0xffffffffffffffff [2797462.983927] LustreError: dumping log to /tmp/lustre-log.1554952626.96167 [2797470.980525] Pid: 95916, comm: mdt02_043 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2797470.990474] Call Trace: [2797470.993117] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2797471.000234] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2797471.007602] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2797471.014623] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2797471.021812] [] mdt_getattr_name_lock+0x90a/0x1c30 [mdt] [2797471.028895] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2797471.035651] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2797471.042311] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2797471.049260] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2797471.056544] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2797471.062893] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2797471.070011] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2797471.077914] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2797471.084430] [] kthread+0xd1/0xe0 [2797471.089534] [] ret_from_fork_nospec_begin+0xe/0x21 [2797471.096186] [] 0xffffffffffffffff [2797471.101393] LustreError: dumping log to /tmp/lustre-log.1554952635.95916 [2797471.109674] Pid: 95522, comm: mdt02_019 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2797471.119624] Call Trace: [2797471.122259] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2797471.129367] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2797471.136741] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2797471.143748] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2797471.150942] [] mdt_getattr_name_lock+0x90a/0x1c30 [mdt] [2797471.158022] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2797471.164770] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2797471.171422] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2797471.178359] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2797471.185634] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2797471.191975] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2797471.199085] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2797471.206985] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2797471.213489] [] kthread+0xd1/0xe0 [2797471.218596] [] ret_from_fork_nospec_begin+0xe/0x21 [2797471.225241] [] 0xffffffffffffffff [2797489.412807] Pid: 96236, comm: mdt00_081 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2797489.422723] Call Trace: [2797489.425379] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2797489.432465] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2797489.439747] [] start_this_handle+0x1a1/0x430 [jbd2] [2797489.446486] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2797489.453309] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2797489.460915] [] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs] [2797489.468086] [] dqget+0x3fa/0x450 [2797489.473187] [] dquot_get_dqblk+0x14/0x1f0 [2797489.479055] [] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs] [2797489.486768] [] lquotactl_slv+0x27d/0x9d0 [lquota] [2797489.493333] [] mdt_quotactl+0x4d2/0x770 [mdt] [2797489.499571] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2797489.506725] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2797489.514632] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2797489.521135] [] kthread+0xd1/0xe0 [2797489.526222] [] ret_from_fork_nospec_begin+0xe/0x21 [2797489.532871] [] 0xffffffffffffffff [2797489.538063] LustreError: dumping log to /tmp/lustre-log.1554952653.96236 [2797494.106638] Lustre: fir-MDT0001: Client f88aeb6b-7dd2-5816-4332-40384c49c964 (at 10.9.104.15@o2ib4) reconnecting [2797494.116984] Lustre: Skipped 8986 previous similar messages [2797494.532817] Pid: 96285, comm: mdt02_104 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2797494.542727] Call Trace: [2797494.545382] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2797494.552470] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2797494.559755] [] start_this_handle+0x1a1/0x430 [jbd2] [2797494.566490] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2797494.573327] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2797494.580940] [] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs] [2797494.588130] [] dqget+0x3fa/0x450 [2797494.593219] [] dquot_get_dqblk+0x14/0x1f0 [2797494.599087] [] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs] [2797494.606800] [] lquotactl_slv+0x27d/0x9d0 [lquota] [2797494.613371] [] mdt_quotactl+0x4d2/0x770 [mdt] [2797494.619607] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2797494.626759] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2797494.634659] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2797494.641164] [] kthread+0xd1/0xe0 [2797494.646267] [] ret_from_fork_nospec_begin+0xe/0x21 [2797494.652917] [] 0xffffffffffffffff [2797494.658115] LustreError: dumping log to /tmp/lustre-log.1554952658.96285 [2797499.140880] LNet: Service thread pid 95357 was inactive for 200.48s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2797499.153916] LNet: Skipped 32 previous similar messages [2797499.159241] LustreError: dumping log to /tmp/lustre-log.1554952663.95357 [2797555.261593] Lustre: 96734:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8ede0ef67450 x1628646094988416/t0(0) o4->99d57a3c-a90a-aca9-dbc0-aac5914e3bde@10.8.1.3@o2ib6:14/0 lens 488/0 e 0 to 0 dl 1554952724 ref 2 fl New:/2/ffffffff rc 0/-1 [2797555.290233] Lustre: 96734:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 18135 previous similar messages [2797565.189720] LustreError: dumping log to /tmp/lustre-log.1554952729.95179 [2797570.309781] LustreError: dumping log to /tmp/lustre-log.1554952734.95560 [2797571.333797] LustreError: dumping log to /tmp/lustre-log.1554952735.95590 [2797575.429843] LustreError: dumping log to /tmp/lustre-log.1554952739.96382 [2797576.453867] LustreError: dumping log to /tmp/lustre-log.1554952740.96307 [2797581.061915] LustreError: dumping log to /tmp/lustre-log.1554952745.96177 [2797588.230003] LustreError: dumping log to /tmp/lustre-log.1554952752.95569 [2797622.534443] LustreError: dumping log to /tmp/lustre-log.1554952786.96097 [2797630.214531] LustreError: dumping log to /tmp/lustre-log.1554952794.96006 [2797635.846603] LustreError: dumping log to /tmp/lustre-log.1554952799.95342 [2797640.454663] LustreError: dumping log to /tmp/lustre-log.1554952804.96221 [2797701.650421] LustreError: 96317:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1554952775, 90s ago); not entering recovery in server code, just going back to sleep ns: mdt-fir-MDT0001_UUID lock: ffff8eef913be300/0x857bd8ea31e6f31 lrc: 3/1,0 mode: --/PR res: [0x240012262:0x1dd3:0x0].0x0 bits 0x13/0x0 rrc: 26 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 96317 timeout: 0 lvb_type: 0 [2797701.689980] LustreError: 96317:0:(ldlm_request.c:129:ldlm_expired_completion_wait()) Skipped 4 previous similar messages [2797735.326663] Lustre: 95907:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (30:55s); client may timeout. req@ffff8ecfe39eb850 x1629293845476976/t0(0) o3->083c2635-6aea-5ac3-6cf6-90334c2fbbb7@10.9.105.24@o2ib4:14/0 lens 488/0 e 0 to 0 dl 1554952844 ref 1 fl Interpret:H/2/ffffffff rc 0/-1 [2797735.355751] Lustre: 95907:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 343 previous similar messages [2797780.232393] Pid: 95950, comm: mdt00_047 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2797780.242304] Call Trace: [2797780.244962] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2797780.252056] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2797780.259341] [] start_this_handle+0x1a1/0x430 [jbd2] [2797780.266077] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2797780.272912] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2797780.280525] [] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs] [2797780.287693] [] dqget+0x3fa/0x450 [2797780.292779] [] dquot_get_dqblk+0x14/0x1f0 [2797780.298645] [] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs] [2797780.306350] [] lquotactl_slv+0x27d/0x9d0 [lquota] [2797780.312914] [] mdt_quotactl+0x4d2/0x770 [mdt] [2797780.319150] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2797780.326285] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2797780.334187] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2797780.340690] [] kthread+0xd1/0xe0 [2797780.345794] [] ret_from_fork_nospec_begin+0xe/0x21 [2797780.352441] [] 0xffffffffffffffff [2797780.357640] LustreError: dumping log to /tmp/lustre-log.1554952944.95950 [2797785.352530] Pid: 96183, comm: mdt01_097 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2797785.362440] Call Trace: [2797785.365095] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2797785.372181] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2797785.379457] [] start_this_handle+0x1a1/0x430 [jbd2] [2797785.386197] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2797785.393040] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2797785.400649] [] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs] [2797785.407832] [] dqget+0x3fa/0x450 [2797785.412924] [] dquot_get_dqblk+0x14/0x1f0 [2797785.418782] [] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs] [2797785.426522] [] lquotactl_slv+0x27d/0x9d0 [lquota] [2797785.433083] [] mdt_quotactl+0x4d2/0x770 [mdt] [2797785.439320] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2797785.446455] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2797785.454356] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2797785.460861] [] kthread+0xd1/0xe0 [2797785.465961] [] ret_from_fork_nospec_begin+0xe/0x21 [2797785.472610] [] 0xffffffffffffffff [2797785.477811] LustreError: dumping log to /tmp/lustre-log.1554952949.96183 [2797790.472520] Pid: 96416, comm: mdt03_100 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2797790.482438] Call Trace: [2797790.485101] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2797790.492185] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2797790.499456] [] start_this_handle+0x1a1/0x430 [jbd2] [2797790.506192] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2797790.513025] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2797790.520636] [] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs] [2797790.527821] [] dqget+0x3fa/0x450 [2797790.532910] [] dquot_get_dqblk+0x14/0x1f0 [2797790.538778] [] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs] [2797790.546498] [] lquotactl_slv+0x27d/0x9d0 [lquota] [2797790.553060] [] mdt_quotactl+0x4d2/0x770 [mdt] [2797790.559305] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2797790.566431] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2797790.574335] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2797790.580837] [] kthread+0xd1/0xe0 [2797790.585938] [] ret_from_fork_nospec_begin+0xe/0x21 [2797790.592589] [] 0xffffffffffffffff [2797790.597786] LustreError: dumping log to /tmp/lustre-log.1554952954.96416 [2797811.976796] Pid: 96317, comm: mdt03_072 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2797811.986710] Call Trace: [2797811.989354] [] ldlm_completion_ast+0x63d/0x920 [ptlrpc] [2797811.996470] [] ldlm_cli_enqueue_local+0x23c/0x870 [ptlrpc] [2797812.003852] [] mdt_object_local_lock+0x50b/0xb20 [mdt] [2797812.010854] [] mdt_object_lock_internal+0x70/0x3e0 [mdt] [2797812.018033] [] mdt_getattr_name_lock+0x90a/0x1c30 [mdt] [2797812.025118] [] mdt_intent_getattr+0x2b5/0x480 [mdt] [2797812.031866] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2797812.038519] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2797812.045473] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2797812.052750] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2797812.059097] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2797812.066225] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2797812.074129] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2797812.080629] [] kthread+0xd1/0xe0 [2797812.085715] [] ret_from_fork_nospec_begin+0xe/0x21 [2797812.092373] [] 0xffffffffffffffff [2797812.097568] LustreError: dumping log to /tmp/lustre-log.1554952976.96317 [2797845.769215] Pid: 96315, comm: mdt00_111 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2797845.779126] Call Trace: [2797845.781792] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2797845.788872] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2797845.796126] [] start_this_handle+0x1a1/0x430 [jbd2] [2797845.802860] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2797845.809683] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2797845.817291] [] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs] [2797845.824459] [] dqget+0x3fa/0x450 [2797845.829547] [] dquot_get_dqblk+0x14/0x1f0 [2797845.835414] [] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs] [2797845.843110] [] lquotactl_slv+0x27d/0x9d0 [lquota] [2797845.849670] [] mdt_quotactl+0x4d2/0x770 [mdt] [2797845.855893] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2797845.863046] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2797845.870938] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2797845.877455] [] kthread+0xd1/0xe0 [2797845.882545] [] ret_from_fork_nospec_begin+0xe/0x21 [2797845.889189] [] 0xffffffffffffffff [2797845.894375] LustreError: dumping log to /tmp/lustre-log.1554953009.96315 [2797850.889277] LustreError: dumping log to /tmp/lustre-log.1554953014.96237 [2797856.009344] LustreError: dumping log to /tmp/lustre-log.1554953019.95998 [2797908.746001] LustreError: dumping log to /tmp/lustre-log.1554953072.95889 [2797913.354061] LustreError: dumping log to /tmp/lustre-log.1554953077.95877 [2797915.402088] LustreError: dumping log to /tmp/lustre-log.1554953079.95125 [2797918.474180] LustreError: dumping log to /tmp/lustre-log.1554953082.95591 [2797921.034156] LustreError: dumping log to /tmp/lustre-log.1554953084.96214 [2797925.642218] LustreError: dumping log to /tmp/lustre-log.1554953089.96298 [2797929.738264] LustreError: dumping log to /tmp/lustre-log.1554953093.96288 [2797934.858334] LustreError: dumping log to /tmp/lustre-log.1554953098.96209 [2797939.978395] LustreError: dumping log to /tmp/lustre-log.1554953103.96231 [2797964.739795] Lustre: fir-MDT0001: Export ffff8ec43ab3b800 already connecting from 10.9.114.14@o2ib4 [2797964.748932] Lustre: Skipped 181 previous similar messages [2797972.746825] LustreError: dumping log to /tmp/lustre-log.1554953136.95350 [2798022.923449] LustreError: dumping log to /tmp/lustre-log.1554953186.96327 [2798044.427724] LustreError: dumping log to /tmp/lustre-log.1554953208.96274 [2798044.434247] Lustre: fir-MDT0003: Connection restored to 8ee7c285-dd5a-acbb-69cd-3d9ec3cccebe (at 10.8.27.11@o2ib6) [2798044.434249] Lustre: Skipped 9194 previous similar messages [2798054.667877] LustreError: dumping log to /tmp/lustre-log.1554953218.95511 [2798094.117996] Lustre: fir-MDT0003: Client 61e57ad5-5940-5a04-c390-5bcbf8468cfd (at 10.9.107.9@o2ib4) reconnecting [2798094.128266] Lustre: Skipped 9266 previous similar messages [2798107.916520] LNet: Service thread pid 95368 was inactive for 200.36s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2798107.933629] LNet: Skipped 9 previous similar messages [2798107.938865] Pid: 95368, comm: mdt01_012 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2798107.948793] Call Trace: [2798107.951456] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2798107.958540] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2798107.965809] [] start_this_handle+0x1a1/0x430 [jbd2] [2798107.972562] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2798107.979396] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2798107.987012] [] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs] [2798107.994201] [] dqget+0x3fa/0x450 [2798107.999291] [] dquot_get_dqblk+0x14/0x1f0 [2798108.005149] [] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs] [2798108.012854] [] lquotactl_slv+0x27d/0x9d0 [lquota] [2798108.019416] [] mdt_quotactl+0x4d2/0x770 [mdt] [2798108.025662] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2798108.032797] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2798108.040714] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2798108.047229] [] kthread+0xd1/0xe0 [2798108.052314] [] ret_from_fork_nospec_begin+0xe/0x21 [2798108.058970] [] 0xffffffffffffffff [2798108.064155] LustreError: dumping log to /tmp/lustre-log.1554953272.95368 [2798113.036584] Pid: 95858, comm: mdt01_048 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2798113.046498] Call Trace: [2798113.049160] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2798113.056246] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2798113.063523] [] start_this_handle+0x1a1/0x430 [jbd2] [2798113.070277] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2798113.077113] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2798113.084724] [] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs] [2798113.091915] [] dqget+0x3fa/0x450 [2798113.097005] [] dquot_get_dqblk+0x14/0x1f0 [2798113.102873] [] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs] [2798113.110577] [] lquotactl_slv+0x27d/0x9d0 [lquota] [2798113.117146] [] mdt_quotactl+0x4d2/0x770 [mdt] [2798113.123384] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2798113.130538] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2798113.138460] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2798113.144958] [] kthread+0xd1/0xe0 [2798113.150060] [] ret_from_fork_nospec_begin+0xe/0x21 [2798113.156711] [] 0xffffffffffffffff [2798113.161912] LustreError: dumping log to /tmp/lustre-log.1554953277.95858 [2798117.644648] Pid: 96086, comm: mdt01_083 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2798117.654563] Call Trace: [2798117.657196] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2798117.664275] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2798117.671549] [] start_this_handle+0x1a1/0x430 [jbd2] [2798117.678298] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2798117.685120] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2798117.692721] [] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs] [2798117.699895] [] dqget+0x3fa/0x450 [2798117.704986] [] dquot_get_dqblk+0x14/0x1f0 [2798117.710843] [] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs] [2798117.718547] [] lquotactl_slv+0x27d/0x9d0 [lquota] [2798117.725110] [] mdt_quotactl+0x4d2/0x770 [mdt] [2798117.731347] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2798117.738464] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2798117.746380] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2798117.752885] [] kthread+0xd1/0xe0 [2798117.757978] [] ret_from_fork_nospec_begin+0xe/0x21 [2798117.764621] [] 0xffffffffffffffff [2798117.769810] LustreError: dumping log to /tmp/lustre-log.1554953281.96086 [2798155.469134] Lustre: 96734:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8edc2a6ffc50 x1628745021209488/t0(0) o4->3f5657b8-e32f-30cd-8fb3-11d7e558b9e3@10.9.109.6@o2ib4:14/0 lens 4640/0 e 0 to 0 dl 1554953324 ref 2 fl New:/2/ffffffff rc 0/-1 [2798155.498035] Lustre: 96734:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 19033 previous similar messages [2798213.389857] Pid: 95884, comm: mdt01_054 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2798213.399775] Call Trace: [2798213.402437] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2798213.409523] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2798213.416796] [] start_this_handle+0x1a1/0x430 [jbd2] [2798213.423530] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2798213.430366] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2798213.437978] [] osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2798213.445181] [] mdt_empty_transno+0xf7/0x850 [mdt] [2798213.451765] [] mdt_mfd_open+0x8de/0xe70 [mdt] [2798213.458003] [] mdt_finish_open+0x64b/0x760 [mdt] [2798213.464488] [] mdt_open_by_fid_lock+0x672/0x9b0 [mdt] [2798213.471418] [] mdt_reint_open+0x760/0x27d0 [mdt] [2798213.477903] [] mdt_reint_rec+0x83/0x210 [mdt] [2798213.484141] [] mdt_reint_internal+0x6e3/0xaf0 [mdt] [2798213.490888] [] mdt_intent_open+0x82/0x350 [mdt] [2798213.497293] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2798213.503950] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2798213.510928] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2798213.518215] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2798213.524571] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2798213.531691] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2798213.539593] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2798213.546095] [] kthread+0xd1/0xe0 [2798213.551197] [] ret_from_fork_nospec_begin+0xe/0x21 [2798213.557850] [] 0xffffffffffffffff [2798213.563057] LustreError: dumping log to /tmp/lustre-log.1554953377.95884 [2798214.413870] Pid: 95544, comm: mdt01_027 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2798214.423779] Call Trace: [2798214.426436] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2798214.433528] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2798214.440800] [] start_this_handle+0x1a1/0x430 [jbd2] [2798214.447535] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2798214.454371] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2798214.461980] [] osd_trans_start+0x20e/0x4e0 [osd_ldiskfs] [2798214.469172] [] mdt_empty_transno+0xf7/0x850 [mdt] [2798214.475755] [] mdt_mfd_open+0x8de/0xe70 [mdt] [2798214.481992] [] mdt_finish_open+0x64b/0x760 [mdt] [2798214.488491] [] mdt_open_by_fid_lock+0x672/0x9b0 [mdt] [2798214.495425] [] mdt_reint_open+0x760/0x27d0 [mdt] [2798214.501913] [] mdt_reint_rec+0x83/0x210 [mdt] [2798214.508156] [] mdt_reint_internal+0x6e3/0xaf0 [mdt] [2798214.514893] [] mdt_intent_open+0x82/0x350 [mdt] [2798214.521288] [] mdt_intent_policy+0x2e8/0xd00 [mdt] [2798214.527937] [] ldlm_lock_enqueue+0x366/0xa60 [ptlrpc] [2798214.534894] [] ldlm_handle_enqueue0+0xa47/0x15a0 [ptlrpc] [2798214.542178] [] tgt_enqueue+0x62/0x210 [ptlrpc] [2798214.548527] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2798214.555659] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2798214.563562] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2798214.570065] [] kthread+0xd1/0xe0 [2798214.575167] [] ret_from_fork_nospec_begin+0xe/0x21 [2798214.581819] [] 0xffffffffffffffff [2798214.587018] LustreError: dumping log to /tmp/lustre-log.1554953378.95544 [2798231.822091] LNet: Service thread pid 96283 was inactive for 200.28s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2798231.835127] LNet: Skipped 27 previous similar messages [2798231.840450] LustreError: dumping log to /tmp/lustre-log.1554953395.96283 [2798389.617482] Lustre: 96716:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (30:61s); client may timeout. req@ffff8ecf56148c50 x1629035066259920/t0(0) o3->36e2649c-5bde-7360-b2cc-505510b79efe@10.8.7.26@o2ib6:2/0 lens 488/0 e 0 to 0 dl 1554953492 ref 1 fl Interpret:H/2/ffffffff rc 0/-1 [2798389.646297] Lustre: 96716:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 565 previous similar messages [2798426.384570] Pid: 95898, comm: mdt01_056 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2798426.394483] Call Trace: [2798426.397143] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2798426.404255] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2798426.411528] [] start_this_handle+0x1a1/0x430 [jbd2] [2798426.418263] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2798426.425106] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2798426.432719] [] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs] [2798426.439899] [] dqget+0x3fa/0x450 [2798426.444991] [] dquot_get_dqblk+0x14/0x1f0 [2798426.450849] [] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs] [2798426.458579] [] lquotactl_slv+0x27d/0x9d0 [lquota] [2798426.465139] [] mdt_quotactl+0x4d2/0x770 [mdt] [2798426.471377] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2798426.478513] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2798426.486412] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2798426.492916] [] kthread+0xd1/0xe0 [2798426.498019] [] ret_from_fork_nospec_begin+0xe/0x21 [2798426.504670] [] 0xffffffffffffffff [2798426.509878] LustreError: dumping log to /tmp/lustre-log.1554953590.95898 [2798431.504660] Pid: 96019, comm: mdt01_075 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2798431.514574] Call Trace: [2798431.517232] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2798431.524312] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2798431.531582] [] start_this_handle+0x1a1/0x430 [jbd2] [2798431.538320] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2798431.545163] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2798431.552774] [] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs] [2798431.559957] [] dqget+0x3fa/0x450 [2798431.565048] [] dquot_get_dqblk+0x14/0x1f0 [2798431.570905] [] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs] [2798431.578626] [] lquotactl_slv+0x27d/0x9d0 [lquota] [2798431.585189] [] mdt_quotactl+0x4d2/0x770 [mdt] [2798431.591425] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2798431.598561] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2798431.606480] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2798431.612983] [] kthread+0xd1/0xe0 [2798431.618082] [] ret_from_fork_nospec_begin+0xe/0x21 [2798431.624734] [] 0xffffffffffffffff [2798431.629925] LustreError: dumping log to /tmp/lustre-log.1554953595.96019 [2798436.112704] Pid: 96075, comm: mdt01_078 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2798436.122609] Call Trace: [2798436.125248] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2798436.132335] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2798436.139597] [] start_this_handle+0x1a1/0x430 [jbd2] [2798436.146330] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2798436.153165] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2798436.160769] [] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs] [2798436.167952] [] dqget+0x3fa/0x450 [2798436.173034] [] dquot_get_dqblk+0x14/0x1f0 [2798436.178892] [] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs] [2798436.186612] [] lquotactl_slv+0x27d/0x9d0 [lquota] [2798436.193175] [] mdt_quotactl+0x4d2/0x770 [mdt] [2798436.199413] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2798436.206522] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2798436.214423] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2798436.220925] [] kthread+0xd1/0xe0 [2798436.226019] [] ret_from_fork_nospec_begin+0xe/0x21 [2798436.232661] [] 0xffffffffffffffff [2798436.237843] LustreError: dumping log to /tmp/lustre-log.1554953600.96075 [2798466.321072] Pid: 96089, comm: mdt00_061 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2798466.330984] Call Trace: [2798466.333642] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2798466.340735] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2798466.348007] [] start_this_handle+0x1a1/0x430 [jbd2] [2798466.354740] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2798466.361575] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2798466.369204] [] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs] [2798466.376396] [] dqget+0x3fa/0x450 [2798466.381503] [] dquot_get_dqblk+0x14/0x1f0 [2798466.387363] [] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs] [2798466.395065] [] lquotactl_slv+0x27d/0x9d0 [lquota] [2798466.401645] [] mdt_quotactl+0x4d2/0x770 [mdt] [2798466.407883] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2798466.415017] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2798466.422920] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2798466.429424] [] kthread+0xd1/0xe0 [2798466.434549] [] ret_from_fork_nospec_begin+0xe/0x21 [2798466.441200] [] 0xffffffffffffffff [2798466.446399] LustreError: dumping log to /tmp/lustre-log.1554953630.96089 [2798470.929147] Pid: 96250, comm: mdt00_085 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2798470.939077] Call Trace: [2798470.941721] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2798470.948809] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2798470.956079] [] start_this_handle+0x1a1/0x430 [jbd2] [2798470.962816] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2798470.969653] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2798470.977280] [] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs] [2798470.984472] [] dqget+0x3fa/0x450 [2798470.989560] [] dquot_get_dqblk+0x14/0x1f0 [2798470.995428] [] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs] [2798471.003139] [] lquotactl_slv+0x27d/0x9d0 [lquota] [2798471.009702] [] mdt_quotactl+0x4d2/0x770 [mdt] [2798471.015941] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2798471.023067] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2798471.030967] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2798471.037471] [] kthread+0xd1/0xe0 [2798471.042588] [] ret_from_fork_nospec_begin+0xe/0x21 [2798471.049242] [] 0xffffffffffffffff [2798471.054449] LustreError: dumping log to /tmp/lustre-log.1554953634.96250 [2798475.537188] LustreError: dumping log to /tmp/lustre-log.1554953639.96267 [2798490.897384] LustreError: dumping log to /tmp/lustre-log.1554953654.96099 [2798496.017449] LustreError: dumping log to /tmp/lustre-log.1554953659.95557 [2798501.137509] LustreError: dumping log to /tmp/lustre-log.1554953665.95360 [2798510.865634] LustreError: dumping log to /tmp/lustre-log.1554953674.96207 [2798515.474023] LustreError: dumping log to /tmp/lustre-log.1554953679.95542 [2798520.593757] LustreError: dumping log to /tmp/lustre-log.1554953684.95985 [2798556.434203] LustreError: dumping log to /tmp/lustre-log.1554953720.96257 [2798561.554270] LustreError: dumping log to /tmp/lustre-log.1554953725.95539 [2798565.835490] Lustre: fir-MDT0001: Export ffff8ec43ab3b800 already connecting from 10.9.114.14@o2ib4 [2798565.844627] Lustre: Skipped 191 previous similar messages [2798566.674332] LustreError: dumping log to /tmp/lustre-log.1554953730.95965 [2798610.706888] LustreError: dumping log to /tmp/lustre-log.1554953774.96391 [2798644.513215] Lustre: fir-MDT0003: Connection restored to ca2e9722-ff1a-a77d-8af9-e4d6b8d00fb3 (at 10.9.105.61@o2ib4) [2798644.523827] Lustre: Skipped 10017 previous similar messages [2798650.131379] LustreError: dumping log to /tmp/lustre-log.1554953814.96425 [2798676.243707] LustreError: dumping log to /tmp/lustre-log.1554953840.95868 [2798680.851773] LustreError: dumping log to /tmp/lustre-log.1554953844.96104 [2798685.971832] LustreError: dumping log to /tmp/lustre-log.1554953849.95489 [2798694.145631] Lustre: fir-MDT0003: Client 338dce11-1d8c-d11c-4363-af216d5b0418 (at 10.9.108.48@o2ib4) reconnecting [2798694.155978] Lustre: Skipped 10034 previous similar messages [2798734.612434] LNet: Service thread pid 95361 was inactive for 200.14s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [2798734.629542] LNet: Skipped 9 previous similar messages [2798734.634778] Pid: 95361, comm: mdt00_008 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2798734.644713] Call Trace: [2798734.647373] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2798734.654456] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2798734.661731] [] start_this_handle+0x1a1/0x430 [jbd2] [2798734.668469] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2798734.675307] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2798734.682918] [] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs] [2798734.690100] [] dqget+0x3fa/0x450 [2798734.695189] [] dquot_get_dqblk+0x14/0x1f0 [2798734.701055] [] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs] [2798734.708768] [] lquotactl_slv+0x27d/0x9d0 [lquota] [2798734.715332] [] mdt_quotactl+0x4d2/0x770 [mdt] [2798734.721570] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2798734.728731] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2798734.736639] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2798734.743154] [] kthread+0xd1/0xe0 [2798734.748240] [] ret_from_fork_nospec_begin+0xe/0x21 [2798734.754914] [] 0xffffffffffffffff [2798734.760112] LustreError: dumping log to /tmp/lustre-log.1554953898.95361 [2798739.732500] Pid: 96284, comm: mdt00_098 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2798739.742410] Call Trace: [2798739.745056] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2798739.752142] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2798739.759404] [] start_this_handle+0x1a1/0x430 [jbd2] [2798739.766140] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2798739.772975] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2798739.780581] [] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs] [2798739.787769] [] dqget+0x3fa/0x450 [2798739.792861] [] dquot_get_dqblk+0x14/0x1f0 [2798739.798716] [] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs] [2798739.806422] [] lquotactl_slv+0x27d/0x9d0 [lquota] [2798739.812984] [] mdt_quotactl+0x4d2/0x770 [mdt] [2798739.819214] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2798739.826355] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2798739.834257] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2798739.840759] [] kthread+0xd1/0xe0 [2798739.845854] [] ret_from_fork_nospec_begin+0xe/0x21 [2798739.852505] [] 0xffffffffffffffff [2798739.857695] LustreError: dumping log to /tmp/lustre-log.1554953903.96284 [2798744.852570] Pid: 95400, comm: mdt01_014 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2798744.862483] Call Trace: [2798744.865142] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2798744.872236] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2798744.879509] [] start_this_handle+0x1a1/0x430 [jbd2] [2798744.886250] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2798744.893069] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2798744.900677] [] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs] [2798744.907864] [] dqget+0x3fa/0x450 [2798744.912952] [] dquot_get_dqblk+0x14/0x1f0 [2798744.918809] [] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs] [2798744.926513] [] lquotactl_slv+0x27d/0x9d0 [lquota] [2798744.933077] [] mdt_quotactl+0x4d2/0x770 [mdt] [2798744.939323] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2798744.946457] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2798744.954360] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2798744.960863] [] kthread+0xd1/0xe0 [2798744.965964] [] ret_from_fork_nospec_begin+0xe/0x21 [2798744.972614] [] 0xffffffffffffffff [2798744.977814] LustreError: dumping log to /tmp/lustre-log.1554953908.95400 [2798755.682731] Lustre: 96734:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8ed17cc18450 x1629295050400160/t0(0) o4->1bb33ff9-4145-26df-e6ad-0325b49afddd@10.8.1.26@o2ib6:14/0 lens 488/0 e 0 to 0 dl 1554953924 ref 2 fl New:/2/ffffffff rc 0/-1 [2798755.711456] Lustre: 96734:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 20428 previous similar messages [2798826.773630] Pid: 95341, comm: mdt00_004 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2798826.783544] Call Trace: [2798826.786208] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2798826.793300] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2798826.800570] [] start_this_handle+0x1a1/0x430 [jbd2] [2798826.807306] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2798826.814148] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2798826.821779] [] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs] [2798826.828960] [] dqget+0x3fa/0x450 [2798826.834051] [] dquot_get_dqblk+0x14/0x1f0 [2798826.839919] [] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs] [2798826.847624] [] lquotactl_slv+0x27d/0x9d0 [lquota] [2798826.854190] [] mdt_quotactl+0x4d2/0x770 [mdt] [2798826.860428] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2798826.867563] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2798826.875466] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2798826.881970] [] kthread+0xd1/0xe0 [2798826.887085] [] ret_from_fork_nospec_begin+0xe/0x21 [2798826.893738] [] 0xffffffffffffffff [2798826.898936] LustreError: dumping log to /tmp/lustre-log.1554953990.95341 [2798859.542040] Pid: 96319, comm: mdt03_074 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2798859.551952] Call Trace: [2798859.554611] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2798859.561690] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2798859.568961] [] start_this_handle+0x1a1/0x430 [jbd2] [2798859.575697] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2798859.582535] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2798859.590143] [] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs] [2798859.597340] [] dqget+0x3fa/0x450 [2798859.602435] [] dquot_get_dqblk+0x14/0x1f0 [2798859.608292] [] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs] [2798859.616005] [] lquotactl_slv+0x27d/0x9d0 [lquota] [2798859.622576] [] mdt_quotactl+0x4d2/0x770 [mdt] [2798859.628812] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2798859.635939] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2798859.643841] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2798859.650344] [] kthread+0xd1/0xe0 [2798859.655459] [] ret_from_fork_nospec_begin+0xe/0x21 [2798859.662113] [] 0xffffffffffffffff [2798859.667310] LustreError: dumping log to /tmp/lustre-log.1554954023.96319 [2798862.127031] LustreError: 96734:0:(service.c:2128:ptlrpc_server_handle_request()) @@@ Dropping timed-out request from 12345-10.8.7.23@o2ib6: deadline 30:1s ago req@ffff8ed6742a2850 x1630377396431648/t0(0) o3->35f134a7-b0d1-f34f-600d-20d902f3663f@10.8.7.23@o2ib6:25/0 lens 488/0 e 0 to 0 dl 1554954025 ref 1 fl Interpret:H/2/ffffffff rc 0/-1 [2798866.710140] LNet: Service thread pid 96269 was inactive for 200.68s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2798866.723175] LNet: Skipped 15 previous similar messages [2798866.728500] LustreError: dumping log to /tmp/lustre-log.1554954030.96269 [2798871.318205] LustreError: dumping log to /tmp/lustre-log.1554954035.96270 [2798997.431622] Lustre: 108395:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (30:170s); client may timeout. req@ffff8ebf197b5450 x1628642466707568/t0(0) o3->6d84db07-946f-3a55-fea2-55904c526e1b@10.8.1.21@o2ib6:21/0 lens 488/0 e 0 to 0 dl 1554953991 ref 1 fl Interpret:H/2/ffffffff rc 0/-1 [2798997.460714] Lustre: 108395:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 638 previous similar messages [2799166.931116] Lustre: fir-MDT0001: Export ffff8ec43ab3b800 already connecting from 10.9.114.14@o2ib4 [2799166.940253] Lustre: Skipped 191 previous similar messages [2799202.582455] LustreError: 96507:0:(service.c:2128:ptlrpc_server_handle_request()) @@@ Dropping timed-out request from 12345-10.8.27.4@o2ib6: deadline 30:1s ago req@ffff8ed726e75450 x1629291936804464/t0(0) o3->c721e6fd-5107-7d3f-3603-1773b6ecd4de@10.8.27.4@o2ib6:5/0 lens 488/0 e 0 to 0 dl 1554954365 ref 1 fl Interpret:H/2/ffffffff rc 0/-1 [2799203.610405] Pid: 95857, comm: mdt01_047 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2799203.620315] Call Trace: [2799203.622970] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2799203.630056] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2799203.637326] [] start_this_handle+0x1a1/0x430 [jbd2] [2799203.644062] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2799203.650897] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2799203.658508] [] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs] [2799203.665690] [] dqget+0x3fa/0x450 [2799203.670781] [] dquot_get_dqblk+0x14/0x1f0 [2799203.676640] [] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs] [2799203.684359] [] lquotactl_slv+0x27d/0x9d0 [lquota] [2799203.690930] [] mdt_quotactl+0x4d2/0x770 [mdt] [2799203.697168] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2799203.704303] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2799203.712205] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2799203.718724] [] kthread+0xd1/0xe0 [2799203.723825] [] ret_from_fork_nospec_begin+0xe/0x21 [2799203.730478] [] 0xffffffffffffffff [2799203.735678] LustreError: dumping log to /tmp/lustre-log.1554954367.95857 [2799209.754488] Pid: 96256, comm: mdt02_095 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2799209.764408] Call Trace: [2799209.767070] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2799209.774149] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2799209.781421] [] start_this_handle+0x1a1/0x430 [jbd2] [2799209.788157] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2799209.794992] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2799209.802603] [] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs] [2799209.809790] [] dqget+0x3fa/0x450 [2799209.814876] [] dquot_get_dqblk+0x14/0x1f0 [2799209.820736] [] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs] [2799209.828457] [] lquotactl_slv+0x27d/0x9d0 [lquota] [2799209.835016] [] mdt_quotactl+0x4d2/0x770 [mdt] [2799209.841254] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2799209.848389] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2799209.856293] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2799209.862799] [] kthread+0xd1/0xe0 [2799209.867904] [] ret_from_fork_nospec_begin+0xe/0x21 [2799209.874556] [] 0xffffffffffffffff [2799209.879747] LustreError: dumping log to /tmp/lustre-log.1554954373.96256 [2799214.874552] Pid: 95540, comm: mdt00_016 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2799214.884468] Call Trace: [2799214.887127] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2799214.894208] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2799214.901482] [] start_this_handle+0x1a1/0x430 [jbd2] [2799214.908225] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2799214.915060] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2799214.922670] [] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs] [2799214.929853] [] dqget+0x3fa/0x450 [2799214.934944] [] dquot_get_dqblk+0x14/0x1f0 [2799214.940803] [] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs] [2799214.948523] [] lquotactl_slv+0x27d/0x9d0 [lquota] [2799214.955084] [] mdt_quotactl+0x4d2/0x770 [mdt] [2799214.961320] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2799214.968457] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2799214.976356] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2799214.982861] [] kthread+0xd1/0xe0 [2799214.987961] [] ret_from_fork_nospec_begin+0xe/0x21 [2799214.994614] [] 0xffffffffffffffff [2799214.999814] LustreError: dumping log to /tmp/lustre-log.1554954378.95540 [2799218.118595] LustreError: 96507:0:(ldlm_lib.c:3248:target_bulk_io()) @@@ timeout on bulk READ after 0+0s req@ffff8edf774fd050 x1630011730087568/t0(0) o3->3e4cef37-4508-c87a-8ca1-5992fb5ac7d8@10.8.2.26@o2ib6:22/0 lens 488/440 e 0 to 0 dl 1554954382 ref 1 fl Interpret:H/2/0 rc 0/0 [2799218.143427] Lustre: fir-MDT0003: Bulk IO read error with 3e4cef37-4508-c87a-8ca1-5992fb5ac7d8 (at 10.8.2.26@o2ib6), client will retry: rc -110 [2799244.553433] Lustre: fir-MDT0001: Connection restored to 6bb6b586-f426-e568-1fd9-7be05a8314e1 (at 10.9.101.8@o2ib4) [2799244.563954] Lustre: Skipped 10375 previous similar messages [2799294.199428] Lustre: fir-MDT0003: Client 837ddb3e-ef73-e0a1-07e2-e8ac1e8f67f1 (at 10.8.7.30@o2ib6) reconnecting [2799294.209601] Lustre: Skipped 10412 previous similar messages [2799322.907935] Pid: 95940, comm: mdt02_047 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2799322.917848] Call Trace: [2799322.920504] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2799322.927598] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2799322.934868] [] start_this_handle+0x1a1/0x430 [jbd2] [2799322.941603] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2799322.948440] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2799322.956052] [] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs] [2799322.963232] [] dqget+0x3fa/0x450 [2799322.968340] [] dquot_get_dqblk+0x14/0x1f0 [2799322.974213] [] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs] [2799322.981914] [] lquotactl_slv+0x27d/0x9d0 [lquota] [2799322.988496] [] mdt_quotactl+0x4d2/0x770 [mdt] [2799322.994733] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2799323.001887] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2799323.009776] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2799323.016292] [] kthread+0xd1/0xe0 [2799323.021381] [] ret_from_fork_nospec_begin+0xe/0x21 [2799323.028044] [] 0xffffffffffffffff [2799323.033245] LustreError: dumping log to /tmp/lustre-log.1554954486.95940 [2799328.027984] Pid: 95595, comm: mdt01_045 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Dec 7 14:50:35 PST 2018 [2799328.037896] Call Trace: [2799328.040552] [] wait_transaction_locked+0x85/0xd0 [jbd2] [2799328.047635] [] add_transaction_credits+0x268/0x2f0 [jbd2] [2799328.054900] [] start_this_handle+0x1a1/0x430 [jbd2] [2799328.061626] [] jbd2__journal_start+0xf3/0x1f0 [jbd2] [2799328.068459] [] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs] [2799328.076072] [] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs] [2799328.083255] [] dqget+0x3fa/0x450 [2799328.088360] [] dquot_get_dqblk+0x14/0x1f0 [2799328.094221] [] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs] [2799328.101924] [] lquotactl_slv+0x27d/0x9d0 [lquota] [2799328.108485] [] mdt_quotactl+0x4d2/0x770 [mdt] [2799328.114721] [] tgt_request_handle+0xaea/0x1580 [ptlrpc] [2799328.121858] [] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] [2799328.129759] [] ptlrpc_main+0xafc/0x1fc0 [ptlrpc] [2799328.136264] [] kthread+0xd1/0xe0 [2799328.141363] [] ret_from_fork_nospec_begin+0xe/0x21 [2799328.148015] [] 0xffffffffffffffff [2799328.153220] LustreError: dumping log to /tmp/lustre-log.1554954492.95595 [2799333.148055] LustreError: dumping log to /tmp/lustre-log.1554954497.95862 [2799355.890368] Lustre: 96507:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8edd9e771850 x1628655362080128/t0(0) o3->b9f34394-bbd7-2caf-04a9-589b5380aaa7@10.9.106.19@o2ib4:14/0 lens 488/0 e 0 to 0 dl 1554954524 ref 2 fl New:/2/ffffffff rc 0/-1 [2799355.919268] Lustre: 96507:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 21948 previous similar messages [2799386.908729] LustreError: dumping log to /tmp/lustre-log.1554954550.96301 [2799392.028798] LustreError: dumping log to /tmp/lustre-log.1554954555.96103 [2799397.148857] LustreError: dumping log to /tmp/lustre-log.1554954561.96069 [2799415.581099] LustreError: dumping log to /tmp/lustre-log.1554954579.96024 [2799420.701155] LustreError: dumping log to /tmp/lustre-log.1554954584.96266 [2799425.821587] LustreError: dumping log to /tmp/lustre-log.1554954589.95559 [2799443.741864] LustreError: dumping log to /tmp/lustre-log.1554954607.95976 [2799449.373517] LustreError: dumping log to /tmp/lustre-log.1554954613.96252 [2799453.469579] LustreError: dumping log to /tmp/lustre-log.1554954617.95577 [2799481.117936] LNet: Service thread pid 96483 was inactive for 200.48s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [2799481.130971] LNet: Skipped 11 previous similar messages [2799481.136292] LustreError: dumping log to /tmp/lustre-log.1554954645.96483 [2799603.565212] Lustre: 96716:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (30:28s); client may timeout. req@ffff8ecdefa28050 x1629035069095184/t0(0) o3->36e2649c-5bde-7360-b2cc-505510b79efe@10.8.7.26@o2ib6:19/0 lens 488/0 e 0 to 0 dl 1554954739 ref 1 fl Interpret:H/2/ffffffff rc 0/-1 [2799603.594109] Lustre: 96716:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 476 previous similar messages [2799619.782037] SysRq : Trigger a crash [2799619.785785] BUG: unable to handle kernel NULL pointer dereference at (null) [2799619.793836] IP: [] sysrq_handle_crash+0x16/0x20 [2799619.800139] PGD baf0aa067 PUD 41561a067 PMD 0 [2799619.804835] Oops: 0002 [#1] SMP [2799619.808294] Modules linked in: osp(OE) mdd(OE) lod(OE) mdt(OE) lfsck(OE) mgc(OE) osd_ldiskfs(OE) lquota(OE) ldiskfs(OE) lustre(OE) lmv(OE) mdc(OE) osc(OE) lov(OE) fid(OE) fld(OE) ko2iblnd(OE) ptlrpc(OE) obdclass(OE) lnet(OE) libcfs(OE) rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfs lockd grace fscache rdma_ucm(OE) ib_ucm(OE) rdma_cm(OE) iw_cm(OE) ib_ipoib(OE) ib_cm(OE) ib_umad(OE) mlx5_fpga_tools(OE) mlx4_en(OE) mlx4_ib(OE) mlx4_core(OE) sunrpc vfat fat dm_round_robin amd64_edac_mod edac_mce_amd kvm_amd kvm irqbypass crc32_pclmul ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper ablk_helper cryptd dcdbas ses ipmi_si dm_multipath pcspkr enclosure dm_mod ipmi_devintf i2c_piix4 ccp sg k10temp ipmi_msghandler acpi_power_meter knem(OE) ip_tables ext4 mbcache jbd2 sd_mod crc_t10dif crct10dif_generic [2799619.881442] mlx5_ib(OE) ib_uverbs(OE) ib_core(OE) i2c_algo_bit drm_kms_helper mlx5_core(OE) syscopyarea sysfillrect sysimgblt fb_sys_fops mlxfw(OE) devlink ttm ahci crct10dif_pclmul crct10dif_common mlx_compat(OE) libahci drm tg3 crc32c_intel libata ptp megaraid_sas drm_panel_orientation_quirks pps_core mpt3sas(OE) raid_class scsi_transport_sas [last unloaded: libcfs] [2799619.913539] CPU: 18 PID: 109549 Comm: bash Kdump: loaded Tainted: G OEL ------------ 3.10.0-957.1.3.el7_lustre.x86_64 #1 [2799619.925607] Hardware name: Dell Inc. PowerEdge R6415/065PKD, BIOS 1.6.7 10/29/2018 [2799619.933348] task: ffff8ee05406c100 ti: ffff8edeb689c000 task.ti: ffff8edeb689c000 [2799619.941001] RIP: 0010:[] [] sysrq_handle_crash+0x16/0x20 [2799619.949720] RSP: 0018:ffff8edeb689fe58 EFLAGS: 00010246 [2799619.955204] RAX: ffffffffb3a61e50 RBX: ffffffffb42e4c60 RCX: 0000000000000000 [2799619.962512] RDX: 0000000000000000 RSI: ffff8ee07f713898 RDI: 0000000000000063 [2799619.969816] RBP: ffff8edeb689fe58 R08: ffffffffb45e38bc R09: ffffffffb462dbab [2799619.977123] R10: 000000000002295f R11: 000000000002295e R12: 0000000000000063 [2799619.984430] R13: 0000000000000000 R14: 0000000000000007 R15: 0000000000000000 [2799619.991736] FS: 00007f3b1b868740(0000) GS:ffff8ee07f700000(0000) knlGS:0000000000000000 [2799619.999994] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [2799620.005913] CR2: 0000000000000000 CR3: 0000000beac40000 CR4: 00000000003407e0 [2799620.013220] Call Trace: [2799620.015851] [] __handle_sysrq+0x10d/0x170 [2799620.021680] [] write_sysrq_trigger+0x28/0x40 [2799620.027775] [] proc_reg_write+0x40/0x80 [2799620.033434] [] vfs_write+0xc0/0x1f0 [2799620.038743] [] SyS_write+0x7f/0xf0 [2799620.043974] [] system_call_fastpath+0x22/0x27 [2799620.050148] Code: eb 9b 45 01 f4 45 39 65 34 75 e5 4c 89 ef e8 e2 f7 ff ff eb db 66 66 66 66 90 55 48 89 e5 c7 05 b1 54 7e 00 01 00 00 00 0f ae f8 04 25 00 00 00 00 01 5d c3 66 66 66 66 90 55 31 c0 c7 05 2e [2799620.070844] RIP [] sysrq_handle_crash+0x16/0x20 [2799620.077225] RSP [2799620.080889] CR2: 0000000000000000