<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:02:02 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-13526] Client eviction mitigation</title>
                <link>https://jira.whamcloud.com/browse/LU-13526</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;We have a user that writes/reads 52000 restart files using 52000 cores on 1300 nodes. During the write cycle, several clients were evicted. We would like to know if this is expected. What can we do to mitigate client evictions or manage client load?&lt;/p&gt;

&lt;p&gt;Filesystem background:&lt;/p&gt;

&lt;p&gt;OST# = 359&lt;/p&gt;

&lt;p&gt;OSS# = 20&lt;/p&gt;

&lt;p&gt;Filesystem Size= 18P&lt;/p&gt;

&lt;p&gt;Free space = 5%&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 Apr 29 14:36:12 nbp2-oss11 rsyslogd: -- MARK --
Apr 29 14:39:27 nbp2-oss11 kernel: [13220183.158983] Lustre: 17955:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn&apos;t add any time (30/30), not sending early reply
Apr 29 14:39:27 nbp2-oss11 kernel: [13220183.158983]   req@ffff91a2c9552050 x1664344156868944/t0(0) o4-&amp;gt;fd329a67-7aa9-5ce6-277a-db7f93ba95aa@10.149.13.21@o2ib313:292/0 lens 488/448 e 0 to 0 dl 1588196397 ref 2 fl Interpret:/0/0 rc 0/0
Apr 29 14:39:27 nbp2-oss11 kernel: [13220183.247695] Lustre: 17955:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 34664 previous similar messages
Apr 29 14:40:28 nbp2-oss11 kernel: [13220244.019145] LustreError: 17999:0:(ldlm_lib.c:3243:target_bulk_io()) @@@ timeout on bulk WRITE after 0+0s  req@ffff91a64368a850 x1664344138126592/t0(0) o4-&amp;gt;b563eaec-b2d0-3747-168c-69ed81119736@10.149.6.81@o2ib313:323/0 lens 488/448 e 0 to 0 dl 1588196428 ref 1 fl Interpret:/0/0 rc 0/0
Apr 29 14:40:28 nbp2-oss11 kernel: [13220244.025295] Lustre: nbp2-OST00fa: Bulk IO write error with 0739fb77-7542-56ca-6966-bffa4195aecf (at 10.149.16.35@o2ib313), client will retry: rc = -110
Apr 29 14:40:28 nbp2-oss11 kernel: [13220244.025297] Lustre: Skipped 75 previous similar messages
Apr 29 14:40:28 nbp2-oss11 kernel: [13220244.153362] LustreError: 17999:0:(ldlm_lib.c:3243:target_bulk_io()) Skipped 75 previous similar messages
Apr 29 14:40:28 nbp2-oss11 sec[10946]: SEC_EVENT |msg lustre bulk W Error - |client 10.149.6.81@o2ib313:323/0 lens 488/448 e 0 to 0 dl 1588196428 ref 1 fl Interpret
Apr 29 14:40:29 nbp2-oss11 kernel: [13220245.018264] LustreError: 17858:0:(ldlm_lib.c:3243:target_bulk_io()) @@@ timeout on bulk WRITE after 0+0s  req@ffff918d61360050 x1664344138633104/t0(0) o4-&amp;gt;34a9ec93-e179-3f1b-62a7-22ad5a22aea9@10.149.6.119@o2ib313:324/0 lens 488/448 e 0 to 0 dl 1588196429 ref 1 fl Interpret:/0/0 rc 0/0
Apr 29 14:40:29 nbp2-oss11 kernel: [13220245.094523] LustreError: 17858:0:(ldlm_lib.c:3243:target_bulk_io()) Skipped 30 previous similar messages
Apr 29 14:40:29 nbp2-oss11 sec[10946]: SEC_EVENT |msg lustre bulk W Error - |client 10.149.6.119@o2ib313:324/0 lens 488/448 e 0 to 0 dl 1588196429 ref 1 fl Interpret
Apr 29 14:40:30 nbp2-oss11 kernel: [13220246.008901] LustreError: 17939:0:(service.c:2128:ptlrpc_server_handle_request()) @@@ Dropping timed-out request from 12345-10.149.7.10@o2ib313: deadline 348:1s ago
Apr 29 14:40:30 nbp2-oss11 kernel: [13220246.008901]   req@ffff919fd3fb2050 x1664344146942672/t0(0) o4-&amp;gt;66debb2a-1df9-c27b-c48d-871b7d8fa964@10.149.7.10@o2ib313:324/0 lens 488/0 e 0 to 0 dl 1588196429 ref 1 fl Interpret:/0/ffffffff rc 0/-1
Apr 29 14:40:30 nbp2-oss11 kernel: [13220246.013387] Lustre: 17026:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (348:1s); client may timeout.  req@ffff919fd3fb4050 x1664344147733408/t0(0) o4-&amp;gt;167cdb14-3f34-ede1-05b4-a382f87e8f1d@10.149.15.213@o2ib313:324/0 lens 488/448 e 0 to 0 dl 1588196429 ref 1 fl Complete:/0/ffffffff rc -110/-1
Apr 29 14:40:30 nbp2-oss11 kernel: [13220246.013389] Lustre: 17026:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 27666 previous similar messages
Apr 29 14:40:30 nbp2-oss11 kernel: [13220246.124392] LustreError: 17854:0:(ldlm_lib.c:3243:target_bulk_io()) @@@ timeout on bulk WRITE after -1+1s  req@ffff919516f30050 x1664344144144480/t0(0) o4-&amp;gt;dbc49150-1d9b-0b95-85cb-88713b8bcc5c@10.149.8.228@o2ib313:324/0 lens 488/448 e 0 to 0 dl 1588196429 ref 1 fl Interpret:/0/0 rc 0/0
Apr 29 14:40:30 nbp2-oss11 kernel: [13220246.124394] LustreError: 17854:0:(ldlm_lib.c:3243:target_bulk_io()) Skipped 122 previous similar messages
Apr 29 14:40:30 nbp2-oss11 kernel: [13220246.124434] Lustre: nbp2-OST014a: Bulk IO write error with dbc49150-1d9b-0b95-85cb-88713b8bcc5c (at 10.149.8.228@o2ib313), client will retry: rc = -110
Apr 29 14:40:30 nbp2-oss11 kernel: [13220246.124435] Lustre: Skipped 177 previous similar messages
Apr 29 14:40:30 nbp2-oss11 kernel: [13220246.392656] LustreError: 17939:0:(service.c:2128:ptlrpc_server_handle_request()) Skipped 27619 previous similar messages
Apr 29 14:40:30 nbp2-oss11 sec[10946]: SEC_EVENT |msg lustre bulk W Error - |client 10.149.8.228@o2ib313:324/0 lens 488/448 e 0 to 0 dl 1588196429 ref 1 fl Interpret
Apr 29 14:40:42 nbp2-oss11 kernel: [13220258.327977] Lustre: 18004:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn&apos;t add any time (30/30), not sending early reply
Apr 29 14:40:42 nbp2-oss11 kernel: [13220258.327977]   req@ffff91a4e91df850 x1664344145875136/t0(0) o4-&amp;gt;c4bf71af-35ca-1c64-8e3c-551577dd6547@10.149.15.180@o2ib313:367/0 lens 488/0 e 1 to 0 dl 1588196472 ref 2 fl New:/0/ffffffff rc 0/-1
Apr 29 14:40:42 nbp2-oss11 kernel: [13220258.416941] Lustre: 18004:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 20239 previous similar messages
Apr 29 14:41:38 nbp2-oss11 kernel: [13220314.386763] Lustre: nbp2-OST0096: Connection restored to 665eb153-c8b4-ef08-9aa9-bc7678541e13 (at 10.151.48.225@o2ib)
Apr 29 14:41:38 nbp2-oss11 kernel: [13220314.386768] Lustre: Skipped 7880 previous similar messages
Apr 29 14:41:45 nbp2-oss11 kernel: [13220321.021082] LustreError: 17858:0:(service.c:2128:ptlrpc_server_handle_request()) @@@ Dropping timed-out request from 12345-10.149.12.51@o2ib313: deadline 349:36s ago
Apr 29 14:41:45 nbp2-oss11 kernel: [13220321.021082]   req@ffff91a58fec7050 x1664344142537360/t0(0) o4-&amp;gt;3df9289b-2501-4c35-2d7e-72ef6a69b9d1@10.149.12.51@o2ib313:364/0 lens 488/0 e 1 to 0 dl 1588196469 ref 1 fl Interpret:/0/ffffffff rc 0/-1
Apr 29 14:41:45 nbp2-oss11 kernel: [13220321.027758] Lustre: 26698:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (349:36s); client may timeout.  req@ffff91a58fec6050 x1664344142537344/t0(0) o4-&amp;gt;3df9289b-2501-4c35-2d7e-72ef6a69b9d1@10.149.12.51@o2ib313:364/0 lens 488/0 e 1 to 0 dl 1588196469 ref 1 fl Interpret:/0/ffffffff rc 0/-1
Apr 29 14:41:45 nbp2-oss11 kernel: [13220321.027760] Lustre: 26698:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 10633 previous similar messages
Apr 29 14:41:45 nbp2-oss11 kernel: [13220321.240374] LustreError: 17858:0:(service.c:2128:ptlrpc_server_handle_request()) Skipped 10622 previous similar messages
Apr 29 14:43:12 nbp2-oss11 kernel: [13220408.647670] Lustre: 26697:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn&apos;t add any time (30/30), not sending early reply
Apr 29 14:43:12 nbp2-oss11 kernel: [13220408.647670]   req@ffff91a3564d9850 x1664344152516880/t0(0) o4-&amp;gt;53e3df5f-39de-a589-433b-daa41393218b@10.149.8.94@o2ib313:517/0 lens 488/0 e 1 to 0 dl 1588196622 ref 2 fl New:/0/ffffffff rc 0/-1
Apr 29 14:43:12 nbp2-oss11 kernel: [13220408.736173] Lustre: 26697:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 34738 previous similar messages
Apr 29 14:44:15 nbp2-oss11 kernel: [13220471.042441] LustreError: 14440:0:(service.c:2128:ptlrpc_server_handle_request()) @@@ Dropping timed-out request from 12345-10.149.6.20@o2ib313: deadline 349:100s ago
Apr 29 14:44:15 nbp2-oss11 kernel: [13220471.042441]   req@ffff91a67453a850 x1664344143041968/t0(0) o4-&amp;gt;446d7667-304c-c3a9-7da5-ea05a2e11855@10.149.6.20@o2ib313:450/0 lens 488/0 e 1 to 0 dl 1588196555 ref 1 fl Interpret:/0/ffffffff rc 0/-1
Apr 29 14:44:15 nbp2-oss11 kernel: [13220471.053248] Lustre: 26698:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (349:100s); client may timeout.  req@ffff91a67453a050 x1664344151066320/t0(0) o4-&amp;gt;6734aee0-1342-6be1-80c5-28dcfe56f3d5@10.149.15.121@o2ib313:450/0 lens 488/0 e 1 to 0 dl 1588196555 ref 1 fl Interpret:/0/ffffffff rc 0/-1
Apr 29 14:44:15 nbp2-oss11 kernel: [13220471.053250] Lustre: 26698:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 21018 previous similar messages
Apr 29 14:44:15 nbp2-oss11 kernel: [13220471.262026] LustreError: 14440:0:(service.c:2128:ptlrpc_server_handle_request()) Skipped 21013 previous similar messages
Apr 29 14:45:03 nbp2-oss11 kernel: [13220519.060553] Lustre: nbp2-OST0122: Client c6660b13-b7df-d499-0da4-0bcd5b1ab97e (at 10.149.12.124@o2ib313) reconnecting
Apr 29 14:45:03 nbp2-oss11 kernel: [13220519.093100] Lustre: Skipped 4078 previous similar messages
Apr 29 14:46:07 nbp2-oss11 kernel: [13220583.068725] Lustre: nbp2-OST001e: Client e1d5b360-ccb3-148c-b5d5-84fa7d138daf (at 10.149.12.79@o2ib313) reconnecting
Apr 29 14:46:07 nbp2-oss11 kernel: [13220583.101037] Lustre: Skipped 6438 previous similar messages
Apr 29 14:48:13 nbp2-oss11 kernel: [13220709.291045] Lustre: 26723:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn&apos;t add any time (30/30), not sending early reply
Apr 29 14:48:13 nbp2-oss11 kernel: [13220709.291045]   req@ffff91a5f248d050 x1664344180110656/t0(0) o4-&amp;gt;4c9dc38a-8f49-9a8d-327d-79c3d54a46c4@10.149.6.139@o2ib313:63/0 lens 488/0 e 1 to 0 dl 1588196923 ref 2 fl New:/0/ffffffff rc 0/-1
Apr 29 14:48:13 nbp2-oss11 kernel: [13220709.379519] Lustre: 26723:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 29510 previous similar messages
Apr 29 14:48:15 nbp2-oss11 kernel: [13220711.121311] Lustre: nbp2-OST0032: Client 0afcf4fb-d9af-39fd-ea80-b5690be000e5 (at 10.149.11.180@o2ib313) reconnecting
Apr 29 14:48:15 nbp2-oss11 kernel: [13220711.153862] Lustre: Skipped 1964 previous similar messages
Apr 29 14:49:15 nbp2-oss11 kernel: [13220771.090816] Lustre: 17088:0:(service.c:2165:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (441:86s); client may timeout.  req@ffff91a366d40050 x1664344157372688/t0(0) o4-&amp;gt;7ba7dfa5-dbae-cafb-8bc0-e9253d5f248b@10.149.13.10@o2ib313:9/0 lens 504/0 e 1 to 0 dl 1588196869 ref 1 fl Interpret:/0/ffffffff rc 0/-1
Apr 29 14:49:15 nbp2-oss11 kernel: [13220771.178775] Lustre: 17088:0:(service.c:2165:ptlrpc_server_handle_request()) Skipped 42720 previous similar messages
Apr 29 14:49:15 nbp2-oss11 kernel: [13220771.396142] LustreError: 26723:0:(service.c:2128:ptlrpc_server_handle_request()) @@@ Dropping timed-out request from 12345-10.149.7.93@o2ib313: deadline 441:85s ago
Apr 29 14:49:15 nbp2-oss11 kernel: [13220771.396142]   req@ffff91a5a79da850 x1664344159391440/t0(0) o4-&amp;gt;9a170297-06d1-674c-6a91-07c8fca552d6@10.149.7.93@o2ib313:10/0 lens 488/0 e 1 to 0 dl 1588196870 ref 1 fl Interpret:/0/ffffffff rc 0/-1
Apr 29 14:49:15 nbp2-oss11 kernel: [13220771.494220] LustreError: 26723:0:(service.c:2128:ptlrpc_server_handle_request()) Skipped 15128 previous similar messages
Apr 29 14:51:38 nbp2-oss11 kernel: [13220914.549113] Lustre: nbp2-OST010e: Connection restored to f69b2fd4-9b38-54c5-3208-db9c7f846e17 (at 10.151.2.31@o2ib)
Apr 29 14:51:38 nbp2-oss11 kernel: [13220914.549118] Lustre: Skipped 13576 previous similar messages
Apr 29 14:52:41 nbp2-oss11 kernel: [13220977.944989] Lustre: nbp2-OST006e: Client 46ab1987-e3e4-99f0-7564-ed9662109994 (at 10.149.16.61@o2ib313) reconnecting
Apr 29 14:52:41 nbp2-oss11 kernel: [13220977.977308] Lustre: Skipped 205 previous similar messages
Apr 29 14:52:47 nbp2-oss11 kernel: [13220983.133316] LustreError: 18054:0:(ldlm_lib.c:3243:target_bulk_io()) @@@ timeout on bulk WRITE after 0+0s  req@ffff91a4d33dc850 x1664344159308096/t0(0) o4-&amp;gt;4adb3af8-dc82-8ffb-8736-61b354e478c4@10.149.6.251@o2ib313:307/0 lens 488/448 e 1 to 0 dl 1588197167 ref 1 fl Interpret:/2/0 rc 0/0
Apr 29 14:52:47 nbp2-oss11 kernel: [13220983.134944] Lustre: nbp2-OST0046: Bulk IO write error with 4adb3af8-dc82-8ffb-8736-61b354e478c4 (at 10.149.6.251@o2ib313), client will retry: rc = -110
Apr 29 14:52:47 nbp2-oss11 kernel: [13220983.134946] Lustre: Skipped 110 previous similar messages
Apr 29 14:52:47 nbp2-oss11 kernel: [13220983.267840] LustreError: 18054:0:(ldlm_lib.c:3243:target_bulk_io()) Skipped 132 previous similar messages
Apr 29 14:52:47 nbp2-oss11 sec[10946]: SEC_EVENT |msg lustre bulk W Error - |client 10.149.6.251@o2ib313:307/0 lens 488/448 e 1 to 0 dl 1588197167 ref 1 fl Interpret
Apr 29 14:52:47 nbp2-oss11 kernel: [13220983.654382] LustreError: 18054:0:(ldlm_lib.c:3243:target_bulk_io()) @@@ timeout on bulk WRITE after 0+0s  req@ffff919b2904f050 x1664344191291856/t0(0) o4-&amp;gt;e4aed8fe-bccd-ec15-769f-6b0e9cc30004@10.149.7.223@o2ib313:307/0 lens 488/448 e 1 to 0 dl 1588197167 ref 1 fl Interpret:/2/0 rc 0/0
Apr 29 14:52:47 nbp2-oss11 kernel: [13220983.657505] Lustre: nbp2-OST0122: Bulk IO write error with e4aed8fe-bccd-ec15-769f-6b0e9cc30004 (at 10.149.7.223@o2ib313), client will retry: rc = -110
Apr 29 14:52:47 nbp2-oss11 kernel: [13220983.657507] Lustre: Skipped 66 previous similar messages
Apr 29 14:52:47 nbp2-oss11 kernel: [13220983.788633] LustreError: 18054:0:(ldlm_lib.c:3243:target_bulk_io()) Skipped 58 previous similar messages
Apr 29 14:52:47 nbp2-oss11 sec[10946]: SEC_EVENT |msg lustre bulk W Error - |client 10.149.7.223@o2ib313:307/0 lens 488/448 e 1 to 0 dl 1588197167 ref 1 fl Interpret
Apr 29 14:57:13 nbp2-oss11 kernel: [13221250.041828] LustreError: 93070:0:(service.c:3336:ptlrpc_svcpt_health_check()) ost_io: unhealthy - request has been waiting 635s
Apr 29 14:58:13 nbp2-oss11 kernel: [13221309.565832] Lustre: 17380:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn&apos;t add any time (30/30), not sending early reply
Apr 29 14:58:13 nbp2-oss11 kernel: [13221309.565832]   req@ffff919d8c688850 x1664344177163312/t0(0) o4-&amp;gt;a3a7a9ca-a291-5357-a974-27d6aca23eb6@10.149.16.70@o2ib313:663/0 lens 488/0 e 2 to 0 dl 1588197523 ref 2 fl New:/0/ffffffff rc 0/-1
Apr 29 14:58:13 nbp2-oss11 kernel: [13221309.654554] Lustre: 17380:0:(service.c:1372:ptlrpc_at_send_early_reply()) Skipped 53492 previous similar messages
Apr 29 14:58:21 nbp2-oss11 kernel: [13221317.290723] LustreError: 18024:0:(ldlm_lib.c:3243:target_bulk_io()) @@@ timeout on bulk WRITE after 0+0s  req@ffff919d530c9850 x1664344171829184/t0(0) o4-&amp;gt;4ee4a5be-cd77-6c61-ff43-8f84801e5802@10.149.6.196@o2ib313:641/0 lens 504/448 e 2 to 0 dl 1588197501 ref 1 fl Interpret:/0/0 rc 0/0
Apr 29 14:58:21 nbp2-oss11 kernel: [13221317.296858] Lustre: nbp2-OST00e6: Bulk IO write error with 7cb63a26-acc1-c21e-deb5-06da7541e125 (at 10.149.7.105@o2ib313), client will retry: rc = -110
Apr 29 14:58:21 nbp2-oss11 kernel: [13221317.296860] Lustre: Skipped 63 previous similar messages
Apr 29 14:58:21 nbp2-oss11 kernel: [13221317.424993] LustreError: 18024:0:(ldlm_lib.c:3243:target_bulk_io()) Skipped 50 previous similar messages
Apr 29 14:58:21 nbp2-oss11 sec[10946]: SEC_EVENT |msg lustre bulk W Error - |client 10.149.6.196@o2ib313:641/0 lens 504/448 e 2 to 0 dl 1588197501 ref 1 fl Interpret
Apr 29 14:59:28 nbp2-oss11 kernel: [13221384.262654] LNetError: 12896:0:(o2iblnd_cb.c:3335:kiblnd_check_txs_locked()) Timed out tx: active_txs, 1 seconds
Apr 29 14:59:28 nbp2-oss11 kernel: [13221384.293877] LNetError: 12896:0:(o2iblnd_cb.c:3410:kiblnd_check_conns()) Timed out RDMA with 10.151.51.188@o2ib (339): c: 31, oc: 0, rc: 32
Apr 29 14:59:28 nbp2-oss11 sec[10946]: SEC_EVENT |msg lustre rdma timeout|nid 10.151.51.188@o2ib
Apr 29 15:01:30 nbp2-oss11 kernel: [13221506.622717] Lustre: nbp2-OST00fa: Client 0ecf2450-94bc-ebc0-dd57-47522e44f80c (at 10.149.15.82@o2ib313) reconnecting
Apr 29 15:01:30 nbp2-oss11 kernel: [13221506.654991] Lustre: Skipped 6279 previous similar messages
Apr 29 15:01:39 nbp2-oss11 kernel: [13221515.678756] Lustre: nbp2-OST00d2: Connection restored to  (at 10.141.5.250@o2ib417)
Apr 29 15:01:39 nbp2-oss11 kernel: [13221515.678761] Lustre: Skipped 15038 previous similar messages
Apr 29 15:02:46 nbp2-oss11 kernel: [13221582.426999] LustreError: 14419:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 762s: evicting client at 10.149.15.193@o2ib313  ns: filter-nbp2-OST00e6_UUID lock: ffff918dbfc28900/0x51a717b6c3666956 lrc: 3/0,0 mode: PW/PW res: [0xadd8f1a:0x0:0x0].0x0 rrc: 3 type: EXT [0-&amp;gt;18446744073709551615] (req 0-&amp;gt;18446744073709551615) flags: 0x60000480020020 nid: 10.149.15.193@o2ib313 remote: 0xc7b02ca87786dcb expref: 16 pid: 17818 timeout: 13220018 lvb_type: 0
Apr 29 15:02:46 nbp2-oss11 sec[10946]: SEC_EVENT |msg lustre eviction - server lock callback|nid 10.149.15.193@o2ib313|target nbp2-OST00e6
Apr 29 15:03:45 nbp2-oss11 kernel: [13221641.234530] Lustre: nbp2-OST010e: haven&apos;t heard from client 998a6981-bf59-9f90-8853-473782cbfa09 (at 10.151.51.188@o2ib) in 597 seconds. I think it&apos;s dead, and I am evicting it. exp ffff91932d10b400, cur 1588197825 expire 1588197675 last 1588197228
Apr 29 15:03:45 nbp2-oss11 kernel: [13221641.301156] Lustre: Skipped 5 previous similar messages
Apr 29 15:03:45 nbp2-oss11 sec[10946]: Evaluating code &apos;5 &amp;gt; 1500&apos; and setting variable &apos;%num&apos;
Apr 29 15:03:45 nbp2-oss11 sec[10946]: Variable &apos;%num&apos; set to &apos;&apos;
Apr 29 15:04:22 nbp2-oss11 kernel: [13221678.438319] LustreError: 14419:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 761s: evicting client at 10.149.11.54@o2ib313  ns: filter-nbp2-OST00e6_UUID lock: ffff918c2388f3c0/0x51a717b6c366750a lrc: 3/0,0 mode: PW/PW res: [0xadd8fd7:0x0:0x0].0x0 rrc: 3 type: EXT [0-&amp;gt;18446744073709551615] (req 0-&amp;gt;18446744073709551615) flags: 0x60000480020020 nid: 10.149.11.54@o2ib313 remote: 0x82d6b40bef341db9 expref: 17 pid: 17619 timeout: 13220114 lvb_type: 0
Apr 29 15:04:22 nbp2-oss11 sec[10946]: SEC_EVENT |msg lustre eviction - server lock callback|nid 10.149.11.54@o2ib313|target nbp2-OST00e6
Apr 29 15:05:33 nbp2-oss11 kernel: [13221749.241169] Lustre: nbp2-OST00e6: haven&apos;t heard from client 998a6981-bf59-9f90-8853-473782cbfa09 (at 10.151.51.188@o2ib) in 705 seconds. I think it&apos;s dead, and I am evicting it. exp ffff9194bb54c800, cur 1588197933 expire 1588197783 last 1588197228
Apr 29 15:11:40 nbp2-oss11 kernel: [13222116.235086] Lustre: nbp2-OST015e: Connection restored to 65ec87b7-8769-2c1a-a9b0-6a06a50f62a9 (at 10.151.5.38@o2ib)
Apr 29 15:11:40 nbp2-oss11 kernel: [13222116.235091] Lustre: Skipped 4232 previous similar messages
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="59072">LU-13526</key>
            <summary>Client eviction mitigation</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="3">Duplicate</resolution>
                                        <assignee username="green">Oleg Drokin</assignee>
                                    <reporter username="mhanafi">Mahmoud Hanafi</reporter>
                        <labels>
                    </labels>
                <created>Wed, 6 May 2020 22:28:56 +0000</created>
                <updated>Wed, 6 Jan 2021 13:04:27 +0000</updated>
                            <resolved>Wed, 6 Jan 2021 13:04:27 +0000</resolved>
                                    <version>Lustre 2.12.2</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                    <comments>
                            <comment id="269594" author="green" created="Thu, 7 May 2020 18:37:37 +0000"  >&lt;p&gt;All of your messages point at a severely overloaded disk system that takes way longer than expected to process.&lt;/p&gt;

&lt;p&gt;You can try increasing obd_timeout and at_max cluster-wide to see if that helps nodes be more patient, though it might also just mean waiting longer and timing out anyway. I see you are hitting at_max, which defaults to 600 seconds.&lt;/p&gt;
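&lt;p&gt;For example (a sketch only, not commands from this ticket; nbp2 is the fsname taken from the logs above, and 900 is a placeholder value), the cluster-wide tuning would look something like this with lctl:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
# Persistent, cluster-wide (run on the MGS); nbp2 is the fsname from the logs,
# 900s is a placeholder -- tune to your backend latency.
lctl conf_param nbp2.sys.timeout=900   # obd_timeout
lctl conf_param nbp2.sys.at_max=900    # adaptive-timeout ceiling

# Or temporarily, on each server and client node, to test first:
lctl set_param timeout=900
lctl set_param at_max=900
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;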

&lt;p&gt;I think you just need to lower the load on the backend. Ideally, spread the load over more OSTs/spindles if you have any; failing that, reducing the number of I/O threads on the OSS nodes might also help (see the sketch below).&lt;/p&gt;
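&lt;p&gt;A sketch of the thread-count side, with placeholder values (ost.OSS.ost_io.threads_max is the runtime tunable, oss_num_threads the module option):&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
# On each OSS, check how many ost_io threads are running/allowed:
lctl get_param ost.OSS.ost_io.threads_started ost.OSS.ost_io.threads_max

# Cap them at runtime (128 is a placeholder; extra threads retire as requests drain):
lctl set_param ost.OSS.ost_io.threads_max=128

# Persist across reboots via a module option, e.g. in /etc/modprobe.d/lustre.conf:
#   options ost oss_num_threads=128
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>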
                            <comment id="269654" author="pjones" created="Fri, 8 May 2020 12:34:45 +0000"  >&lt;p&gt;Oleg as per the discussion on the call yesterday, I understand that you are going to point to some patches that might be of interest to NASA&lt;/p&gt;</comment>
                            <comment id="269804" author="green" created="Mon, 11 May 2020 06:15:04 +0000"  >&lt;p&gt;yes, sorry for the delay just as we discussed, the patches are from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13131&quot; title=&quot;Partial writes on multi-client strided files&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13131&quot;&gt;&lt;del&gt;LU-13131&lt;/del&gt;&lt;/a&gt;: 37468, 37918, 37992, 38114 and 38292&lt;/p&gt;

&lt;p&gt;there&apos;s going to be another patch for some further pathological cases, but the 5 above should be good to have for now.&lt;/p&gt;</comment>
                            <comment id="271465" author="pjones" created="Thu, 28 May 2020 22:12:32 +0000"  >&lt;p&gt;All these fixes are in 2.12.5&lt;/p&gt;</comment>
                            <comment id="288749" author="mhanafi" created="Wed, 6 Jan 2021 01:26:24 +0000"  >&lt;p&gt;We can close this&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00zo7:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                </customfields>
    </item>
</channel>
</rss>