<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:11:06 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary, append 'field=key&field=summary' to the URL of your request.
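For instance, the full request for this issue (an illustrative URL, assuming the standard JIRA issue-xml view path) would be:
https://jira.whamcloud.com/si/jira.issueviews:issue-xml/LU-7692/LU-7692.xml?field=key&field=summary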
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-7692] LNet: Service thread Hung</title>
                <link>https://jira.whamcloud.com/browse/LU-7692</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;On the MDT, an LNet service thread hung. Dumped stack trace:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Jan 20 13:42:10 nbp8-mds1 kernel: LNet: Service thread pid 17862 was inactive &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 424.00s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; debugging purposes:
Jan 20 13:42:10 nbp8-mds1 kernel: Pid: 17862, comm: mdt00_057
Jan 20 13:42:14 nbp8-mds1 kernel: 
Jan 20 13:42:14 nbp8-mds1 kernel: Call Trace:
Jan 20 13:42:14 nbp8-mds1 kernel: [&amp;lt;ffffffff8155a9c2&amp;gt;] schedule_timeout+0x192/0x2e0
Jan 20 13:42:14 nbp8-mds1 kernel: [&amp;lt;ffffffff81083300&amp;gt;] ? process_timeout+0x0/0x10
Jan 20 13:42:14 nbp8-mds1 kernel: [&amp;lt;ffffffffa078af70&amp;gt;] ? ldlm_expired_completion_wait+0x0/0x360 [ptlrpc]
Jan 20 13:42:14 nbp8-mds1 kernel: [&amp;lt;ffffffffa078f7a1&amp;gt;] ldlm_completion_ast+0x4b1/0x920 [ptlrpc]
Jan 20 13:42:14 nbp8-mds1 kernel: [&amp;lt;ffffffff81061fe0&amp;gt;] ? default_wake_function+0x0/0x20
Jan 20 13:42:14 nbp8-mds1 kernel: [&amp;lt;ffffffffa078ef00&amp;gt;] ldlm_cli_enqueue_local+0x1f0/0x5e0 [ptlrpc]
Jan 20 13:42:14 nbp8-mds1 kernel: [&amp;lt;ffffffffa078f2f0&amp;gt;] ? ldlm_completion_ast+0x0/0x920 [ptlrpc]
Jan 20 13:42:14 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e72de0&amp;gt;] ? mdt_blocking_ast+0x0/0x2a0 [mdt]
Jan 20 13:42:17 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e7cde4&amp;gt;] mdt_object_lock0+0x394/0xb30 [mdt]
Jan 20 13:42:17 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e72de0&amp;gt;] ? mdt_blocking_ast+0x0/0x2a0 [mdt]
Jan 20 13:42:17 nbp8-mds1 kernel: [&amp;lt;ffffffffa078f2f0&amp;gt;] ? ldlm_completion_ast+0x0/0x920 [ptlrpc]
Jan 20 13:42:17 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e7d644&amp;gt;] mdt_object_lock+0x14/0x20 [mdt]
Jan 20 13:42:17 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e7d801&amp;gt;] mdt_object_find_lock+0x61/0x170 [mdt]
Jan 20 13:42:17 nbp8-mds1 kernel: [&amp;lt;ffffffffa0ea982c&amp;gt;] mdt_reint_open+0x88c/0x21a0 [mdt]
Jan 20 13:42:17 nbp8-mds1 kernel: [&amp;lt;ffffffffa04fb5d6&amp;gt;] ? upcall_cache_get_entry+0x296/0x880 [libcfs]
Jan 20 13:42:17 nbp8-mds1 kernel: [&amp;lt;ffffffffa0652ef0&amp;gt;] ? lu_ucred+0x20/0x30 [obdclass]
Jan 20 13:42:17 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e71935&amp;gt;] ? mdt_ucred+0x15/0x20 [mdt]
Jan 20 13:42:17 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e8e51c&amp;gt;] ? mdt_root_squash+0x2c/0x3f0 [mdt]
Jan 20 13:42:17 nbp8-mds1 kernel: [&amp;lt;ffffffffa07df766&amp;gt;] ? __req_capsule_get+0x166/0x710 [ptlrpc]
Jan 20 13:42:17 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e92481&amp;gt;] mdt_reint_rec+0x41/0xe0 [mdt]
Jan 20 13:42:17 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e77ed3&amp;gt;] mdt_reint_internal+0x4c3/0x780 [mdt]
Jan 20 13:42:17 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e7845e&amp;gt;] mdt_intent_reint+0x1ee/0x410 [mdt]
Jan 20 13:42:17 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e75c3e&amp;gt;] mdt_intent_policy+0x3ae/0x770 [mdt]
Jan 20 13:42:17 nbp8-mds1 kernel: [&amp;lt;ffffffffa076f2c5&amp;gt;] ldlm_lock_enqueue+0x135/0x980 [ptlrpc]
Jan 20 13:42:17 nbp8-mds1 kernel: [&amp;lt;ffffffffa0798ebb&amp;gt;] ldlm_handle_enqueue0+0x51b/0x10c0 [ptlrpc]
Jan 20 13:42:17 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e76106&amp;gt;] mdt_enqueue+0x46/0xe0 [mdt]
Jan 20 13:42:17 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e7aada&amp;gt;] mdt_handle_common+0x52a/0x1470 [mdt]
Jan 20 13:42:17 nbp8-mds1 kernel: [&amp;lt;ffffffffa0eb74a5&amp;gt;] mds_regular_handle+0x15/0x20 [mdt]
Jan 20 13:42:17 nbp8-mds1 kernel: [&amp;lt;ffffffffa07c80c5&amp;gt;] ptlrpc_server_handle_request+0x385/0xc00 [ptlrpc]
Jan 20 13:42:17 nbp8-mds1 kernel: [&amp;lt;ffffffffa04f08d5&amp;gt;] ? lc_watchdog_touch+0x65/0x170 [libcfs]
Jan 20 13:42:17 nbp8-mds1 kernel: [&amp;lt;ffffffffa07c0a69&amp;gt;] ? ptlrpc_wait_event+0xa9/0x2d0 [ptlrpc]
Jan 20 13:42:17 nbp8-mds1 kernel: [&amp;lt;ffffffffa07ca89d&amp;gt;] ptlrpc_main+0xafd/0x1780 [ptlrpc]
Jan 20 13:42:17 nbp8-mds1 kernel: [&amp;lt;ffffffff8100c20a&amp;gt;] child_rip+0xa/0x20
Jan 20 13:42:17 nbp8-mds1 kernel: [&amp;lt;ffffffffa07c9da0&amp;gt;] ? ptlrpc_main+0x0/0x1780 [ptlrpc]
Jan 20 13:42:17 nbp8-mds1 kernel: [&amp;lt;ffffffff8100c200&amp;gt;] ? child_rip+0x0/0x20

Jan 20 13:42:18 nbp8-mds1 kernel: Pid: 17780, comm: mdt00_044
Jan 20 13:42:18 nbp8-mds1 kernel: 
Jan 20 13:42:18 nbp8-mds1 kernel: Call Trace:
Jan 20 13:42:18 nbp8-mds1 kernel: [&amp;lt;ffffffff8155a9c2&amp;gt;] schedule_timeout+0x192/0x2e0
Jan 20 13:42:18 nbp8-mds1 kernel: [&amp;lt;ffffffff81083300&amp;gt;] ? process_timeout+0x0/0x10
Jan 20 13:42:18 nbp8-mds1 kernel: [&amp;lt;ffffffffa078af70&amp;gt;] ? ldlm_expired_completion_wait+0x0/0x360 [ptlrpc]
Jan 20 13:42:18 nbp8-mds1 kernel: [&amp;lt;ffffffffa078f7a1&amp;gt;] ldlm_completion_ast+0x4b1/0x920 [ptlrpc]
Jan 20 13:42:18 nbp8-mds1 kernel: [&amp;lt;ffffffff81061fe0&amp;gt;] ? default_wake_function+0x0/0x20
Jan 20 13:42:18 nbp8-mds1 kernel: [&amp;lt;ffffffffa078ef00&amp;gt;] ldlm_cli_enqueue_local+0x1f0/0x5e0 [ptlrpc]
Jan 20 13:42:18 nbp8-mds1 kernel: [&amp;lt;ffffffffa078f2f0&amp;gt;] ? ldlm_completion_ast+0x0/0x920 [ptlrpc]
Jan 20 13:42:18 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e72de0&amp;gt;] ? mdt_blocking_ast+0x0/0x2a0 [mdt]
Jan 20 13:42:18 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e7cc06&amp;gt;] mdt_object_lock0+0x1b6/0xb30 [mdt]
Jan 20 13:42:18 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e72de0&amp;gt;] ? mdt_blocking_ast+0x0/0x2a0 [mdt]
Jan 20 13:42:18 nbp8-mds1 kernel: [&amp;lt;ffffffffa078f2f0&amp;gt;] ? ldlm_completion_ast+0x0/0x920 [ptlrpc]
Jan 20 13:42:18 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e7d644&amp;gt;] mdt_object_lock+0x14/0x20 [mdt]
Jan 20 13:42:18 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e85b8e&amp;gt;] mdt_getattr_name_lock+0x8fe/0x19d0 [mdt]
Jan 20 13:42:18 nbp8-mds1 kernel: [&amp;lt;ffffffffa07df766&amp;gt;] ? __req_capsule_get+0x166/0x710 [ptlrpc]
Jan 20 13:42:18 nbp8-mds1 kernel: [&amp;lt;ffffffffa07ba7b4&amp;gt;] ? lustre_msg_get_flags+0x34/0xb0 [ptlrpc]
Jan 20 13:42:18 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e86ef9&amp;gt;] mdt_intent_getattr+0x299/0x480 [mdt]
Jan 20 13:42:18 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e75c3e&amp;gt;] mdt_intent_policy+0x3ae/0x770 [mdt]
Jan 20 13:42:18 nbp8-mds1 kernel: [&amp;lt;ffffffffa076f2c5&amp;gt;] ldlm_lock_enqueue+0x135/0x980 [ptlrpc]
Jan 20 13:42:18 nbp8-mds1 kernel: [&amp;lt;ffffffffa0798ebb&amp;gt;] ldlm_handle_enqueue0+0x51b/0x10c0 [ptlrpc]
Jan 20 13:42:18 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e76106&amp;gt;] mdt_enqueue+0x46/0xe0 [mdt]
Jan 20 13:42:18 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e7aada&amp;gt;] mdt_handle_common+0x52a/0x1470 [mdt]
Jan 20 13:42:18 nbp8-mds1 kernel: [&amp;lt;ffffffffa0eb74a5&amp;gt;] mds_regular_handle+0x15/0x20 [mdt]
Jan 20 13:42:18 nbp8-mds1 kernel: [&amp;lt;ffffffffa07c80c5&amp;gt;] ptlrpc_server_handle_request+0x385/0xc00 [ptlrpc]
Jan 20 13:42:18 nbp8-mds1 kernel: [&amp;lt;ffffffff81061ff2&amp;gt;] ? default_wake_function+0x12/0x20
Jan 20 13:42:18 nbp8-mds1 kernel: [&amp;lt;ffffffffa07ca89d&amp;gt;] ptlrpc_main+0xafd/0x1780 [ptlrpc]
Jan 20 13:42:18 nbp8-mds1 kernel: [&amp;lt;ffffffff8100c20a&amp;gt;] child_rip+0xa/0x20
Jan 20 13:42:18 nbp8-mds1 kernel: [&amp;lt;ffffffffa07c9da0&amp;gt;] ? ptlrpc_main+0x0/0x1780 [ptlrpc]
Jan 20 13:42:19 nbp8-mds1 kernel: [&amp;lt;ffffffff8100c200&amp;gt;] ? child_rip+0x0/0x20
Jan 20 13:42:19 nbp8-mds1 kernel: 
Jan 20 13:42:19 nbp8-mds1 kernel: Pid: 16840, comm: mdt03_017
Jan 20 13:42:19 nbp8-mds1 kernel: 
Jan 20 13:42:19 nbp8-mds1 kernel: Call Trace:
Jan 20 13:42:19 nbp8-mds1 kernel: [&amp;lt;ffffffffa076a885&amp;gt;] ? _ldlm_lock_debug+0x2d5/0x660 [ptlrpc]
Jan 20 13:42:19 nbp8-mds1 kernel: [&amp;lt;ffffffff8155a9c2&amp;gt;] schedule_timeout+0x192/0x2e0
Jan 20 13:42:19 nbp8-mds1 kernel: [&amp;lt;ffffffff81083300&amp;gt;] ? process_timeout+0x0/0x10
Jan 20 13:42:19 nbp8-mds1 kernel: [&amp;lt;ffffffffa078af70&amp;gt;] ? ldlm_expired_completion_wait+0x0/0x360 [ptlrpc]
Jan 20 13:42:19 nbp8-mds1 kernel: [&amp;lt;ffffffffa078f7a1&amp;gt;] ldlm_completion_ast+0x4b1/0x920 [ptlrpc]
Jan 20 13:42:20 nbp8-mds1 kernel: [&amp;lt;ffffffff81061fe0&amp;gt;] ? default_wake_function+0x0/0x20
Jan 20 13:42:20 nbp8-mds1 kernel: [&amp;lt;ffffffffa078ef00&amp;gt;] ldlm_cli_enqueue_local+0x1f0/0x5e0 [ptlrpc]
Jan 20 13:42:20 nbp8-mds1 kernel: [&amp;lt;ffffffffa078f2f0&amp;gt;] ? ldlm_completion_ast+0x0/0x920 [ptlrpc]
Jan 20 13:42:20 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e72de0&amp;gt;] ? mdt_blocking_ast+0x0/0x2a0 [mdt]
Jan 20 13:42:20 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e7cc06&amp;gt;] mdt_object_lock0+0x1b6/0xb30 [mdt]
Jan 20 13:42:20 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e72de0&amp;gt;] ? mdt_blocking_ast+0x0/0x2a0 [mdt]
Jan 20 13:42:20 nbp8-mds1 kernel: [&amp;lt;ffffffffa078f2f0&amp;gt;] ? ldlm_completion_ast+0x0/0x920 [ptlrpc]
Jan 20 13:42:20 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e7d644&amp;gt;] mdt_object_lock+0x14/0x20 [mdt]
Jan 20 13:42:20 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e85b8e&amp;gt;] mdt_getattr_name_lock+0x8fe/0x19d0 [mdt]
Jan 20 13:42:20 nbp8-mds1 kernel: [&amp;lt;ffffffffa07df766&amp;gt;] ? __req_capsule_get+0x166/0x710 [ptlrpc]
Jan 20 13:42:20 nbp8-mds1 kernel: [&amp;lt;ffffffffa07ba7b4&amp;gt;] ? lustre_msg_get_flags+0x34/0xb0 [ptlrpc]
Jan 20 13:42:20 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e86ef9&amp;gt;] mdt_intent_getattr+0x299/0x480 [mdt]
Jan 20 13:42:20 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e75c3e&amp;gt;] mdt_intent_policy+0x3ae/0x770 [mdt]
Jan 20 13:42:20 nbp8-mds1 kernel: [&amp;lt;ffffffffa076f2c5&amp;gt;] ldlm_lock_enqueue+0x135/0x980 [ptlrpc]
Jan 20 13:42:20 nbp8-mds1 kernel: [&amp;lt;ffffffffa0798ebb&amp;gt;] ldlm_handle_enqueue0+0x51b/0x10c0 [ptlrpc]
Jan 20 13:42:20 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e76106&amp;gt;] mdt_enqueue+0x46/0xe0 [mdt]
Jan 20 13:42:20 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e7aada&amp;gt;] mdt_handle_common+0x52a/0x1470 [mdt]
Jan 20 13:42:20 nbp8-mds1 kernel: [&amp;lt;ffffffffa0eb74a5&amp;gt;] mds_regular_handle+0x15/0x20 [mdt]
Jan 20 13:42:20 nbp8-mds1 kernel: [&amp;lt;ffffffffa07c80c5&amp;gt;] ptlrpc_server_handle_request+0x385/0xc00 [ptlrpc]
Jan 20 13:42:20 nbp8-mds1 kernel: [&amp;lt;ffffffffa04f08d5&amp;gt;] ? lc_watchdog_touch+0x65/0x170 [libcfs]
Jan 20 13:42:20 nbp8-mds1 kernel: [&amp;lt;ffffffffa07c0a69&amp;gt;] ? ptlrpc_wait_event+0xa9/0x2d0 [ptlrpc]
Jan 20 13:42:20 nbp8-mds1 kernel: [&amp;lt;ffffffffa07ca89d&amp;gt;] ptlrpc_main+0xafd/0x1780 [ptlrpc]
Jan 20 13:42:20 nbp8-mds1 kernel: [&amp;lt;ffffffff8100c20a&amp;gt;] child_rip+0xa/0x20
Jan 20 13:42:20 nbp8-mds1 kernel: [&amp;lt;ffffffffa07c9da0&amp;gt;] ? ptlrpc_main+0x0/0x1780 [ptlrpc]
Jan 20 13:42:20 nbp8-mds1 kernel: [&amp;lt;ffffffff8100c200&amp;gt;] ? child_rip+0x0/0x20
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;This looks the same as &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7232&quot; title=&quot;racer deadlock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7232&quot;&gt;&lt;del&gt;LU-7232&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;I can upload debug logs if needed.&lt;/p&gt;</description>
                <environment></environment>
        <key id="34203">LU-7692</key>
            <summary>LNet: Service thread Hung</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="5">Cannot Reproduce</resolution>
                                        <assignee username="bobijam">Zhenyu Xu</assignee>
                                    <reporter username="mhanafi">Mahmoud Hanafi</reporter>
                        <labels>
                    </labels>
                <created>Wed, 20 Jan 2016 22:17:10 +0000</created>
                <updated>Mon, 2 Oct 2017 19:37:09 +0000</updated>
                            <resolved>Thu, 8 Sep 2016 18:06:39 +0000</resolved>
                                    <version>Lustre 2.5.3</version>
                                    <fixVersion>Lustre 2.7.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>8</watches>
                                                                            <comments>
                            <comment id="139524" author="jaylan" created="Wed, 20 Jan 2016 23:48:04 +0000"  >&lt;p&gt;It seems another instance of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7640&quot; title=&quot;stuck mdt thread required reboot of mds&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7640&quot;&gt;&lt;del&gt;LU-7640&lt;/del&gt;&lt;/a&gt; to me.&lt;/p&gt;</comment>
                            <comment id="139618" author="green" created="Thu, 21 Jan 2016 18:16:30 +0000"  >&lt;p&gt;So the problem at hand is that the lock a server queued is not being grnted for a long time - either becase a client is not releasing a conflicting lock or for some other reason.&lt;/p&gt;

&lt;p&gt;There&apos;s a whole bunch of different problems that could cause this, and I think &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7232&quot; title=&quot;racer deadlock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7232&quot;&gt;&lt;del&gt;LU-7232&lt;/del&gt;&lt;/a&gt; is not it, since that one was caused by a particular master patch that I bet you are not carrying.&lt;/p&gt;

&lt;p&gt;As for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7640&quot; title=&quot;stuck mdt thread required reboot of mds&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7640&quot;&gt;&lt;del&gt;LU-7640&lt;/del&gt;&lt;/a&gt; - it might be a bug of a similar nature, but again it&apos;s hard to tell what the underlying cause is at this stage.&lt;/p&gt;

&lt;p&gt;Just as a wild guess - you do not happen to have any SELinux-enabled client nodes (even if permissive), do you?&lt;/p&gt;</comment>
                            <comment id="139658" author="mhanafi" created="Thu, 21 Jan 2016 19:39:28 +0000"  >&lt;p&gt;We are not running SELinux on any client. We had 3 incidents of this yesterday. Even with &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7372&quot; title=&quot;replay-dual test_26: test failed to respond and timed out&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7372&quot;&gt;&lt;del&gt;LU-7372&lt;/del&gt;&lt;/a&gt; applied.&lt;br/&gt;
This is getting trigger by a particular user. I tried to find the client/clients triggering it, but didn&apos;t have any luck.&lt;/p&gt;
</comment>
                            <comment id="139663" author="mhanafi" created="Thu, 21 Jan 2016 19:45:22 +0000"  >&lt;p&gt;Uploading debug dump to see if can help.&lt;/p&gt;</comment>
                            <comment id="139685" author="green" created="Thu, 21 Jan 2016 22:40:07 +0000"  >&lt;p&gt;is this a particular job of this user that can trigger it? how large is the scale of that job?&lt;/p&gt;</comment>
                            <comment id="139688" author="pjones" created="Thu, 21 Jan 2016 22:56:48 +0000"  >&lt;p&gt;Oleg&lt;/p&gt;

&lt;p&gt;NASA shared that there is one specific user who can trigger this issue with their job runs, and they will supply details when they are able to.&lt;/p&gt;

&lt;p&gt;Bobijam&lt;/p&gt;

&lt;p&gt;Could you please review the uploaded debug logs and see if it is possible to deduce anything about the situation?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="139691" author="mhanafi" created="Thu, 21 Jan 2016 23:23:42 +0000"  >&lt;p&gt;I am checking with a suspected user now. His job is 484 node and 5832 Tasks. The nodes are part of a remote cluster connected via routers. &lt;/p&gt;

&lt;p&gt;Can a client eviction cause this? Here we see a client getting evicted:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Jan 21 13:16:19 nbp8-mds1 kernel: LustreError: 0:0:(ldlm_lockd.c:346:waiting_locks_callback()) ### lock callback timer expired after 251s: evicting client at 10.153.10.226@o2ib233  ns: mdt-nbp8-MDT0000_UUID lock: ffff883cfce50dc0/0x8ea5cf5f62ad392a lrc: 3/0,0 mode: PR/PR res: [0x3603b8f6b:0xb4b:0x0].0 bits 0x13 rrc: 584 type: IBT flags: 0x60200400000020 nid: 10.153.10.226@o2ib233 remote: 0x4103cc31cf8b204c expref: 131 pid: 16613 timeout: 4333267923 lvb_type: 0
Jan 21 13:17:34 nbp8-mds1 kernel: LNet: Service thread pid 16447 was inactive &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 300.00s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; debugging purposes:
Jan 21 13:17:34 nbp8-mds1 kernel: Pid: 16447, comm: mdt01_085
Jan 21 13:17:38 nbp8-mds1 kernel: 
Jan 21 13:17:38 nbp8-mds1 kernel: Call Trace:
Jan 21 13:17:38 nbp8-mds1 kernel: [&amp;lt;ffffffffa076a885&amp;gt;] ? _ldlm_lock_debug+0x2d5/0x660 [ptlrpc]
Jan 21 13:17:38 nbp8-mds1 kernel: [&amp;lt;ffffffff8155a9c2&amp;gt;] schedule_timeout+0x192/0x2e0
Jan 21 13:17:38 nbp8-mds1 kernel: [&amp;lt;ffffffff81083300&amp;gt;] ? process_timeout+0x0/0x10
Jan 21 13:17:38 nbp8-mds1 kernel: [&amp;lt;ffffffffa078af70&amp;gt;] ? ldlm_expired_completion_wait+0x0/0x360 [ptlrpc]
Jan 21 13:17:38 nbp8-mds1 kernel: [&amp;lt;ffffffffa078f7a1&amp;gt;] ldlm_completion_ast+0x4b1/0x920 [ptlrpc]
Jan 21 13:17:38 nbp8-mds1 kernel: [&amp;lt;ffffffff81061fe0&amp;gt;] ? default_wake_function+0x0/0x20
Jan 21 13:17:38 nbp8-mds1 kernel: [&amp;lt;ffffffffa078ef00&amp;gt;] ldlm_cli_enqueue_local+0x1f0/0x5e0 [ptlrpc]
Jan 21 13:17:38 nbp8-mds1 kernel: [&amp;lt;ffffffffa078f2f0&amp;gt;] ? ldlm_completion_ast+0x0/0x920 [ptlrpc]
Jan 21 13:17:41 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e73de0&amp;gt;] ? mdt_blocking_ast+0x0/0x2a0 [mdt]
Jan 21 13:17:41 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e7dde4&amp;gt;] mdt_object_lock0+0x394/0xb30 [mdt]
Jan 21 13:17:41 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e73de0&amp;gt;] ? mdt_blocking_ast+0x0/0x2a0 [mdt]
Jan 21 13:17:41 nbp8-mds1 kernel: [&amp;lt;ffffffffa078f2f0&amp;gt;] ? ldlm_completion_ast+0x0/0x920 [ptlrpc]
Jan 21 13:17:41 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e7e644&amp;gt;] mdt_object_lock+0x14/0x20 [mdt]
Jan 21 13:17:41 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e7e801&amp;gt;] mdt_object_find_lock+0x61/0x170 [mdt]
Jan 21 13:17:41 nbp8-mds1 kernel: [&amp;lt;ffffffffa0eaa88c&amp;gt;] mdt_reint_open+0x88c/0x21a0 [mdt]
Jan 21 13:17:41 nbp8-mds1 kernel: [&amp;lt;ffffffffa04fb5d6&amp;gt;] ? upcall_cache_get_entry+0x296/0x880 [libcfs]
Jan 21 13:17:41 nbp8-mds1 kernel: [&amp;lt;ffffffffa0652ef0&amp;gt;] ? lu_ucred+0x20/0x30 [obdclass]
Jan 21 13:17:41 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e72935&amp;gt;] ? mdt_ucred+0x15/0x20 [mdt]
Jan 21 13:17:41 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e8f51c&amp;gt;] ? mdt_root_squash+0x2c/0x3f0 [mdt]
Jan 21 13:17:41 nbp8-mds1 kernel: [&amp;lt;ffffffffa07df766&amp;gt;] ? __req_capsule_get+0x166/0x710 [ptlrpc]
Jan 21 13:17:41 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e93481&amp;gt;] mdt_reint_rec+0x41/0xe0 [mdt]
Jan 21 13:17:41 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e78ed3&amp;gt;] mdt_reint_internal+0x4c3/0x780 [mdt]
Jan 21 13:17:41 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e7945e&amp;gt;] mdt_intent_reint+0x1ee/0x410 [mdt]
Jan 21 13:17:41 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e76c3e&amp;gt;] mdt_intent_policy+0x3ae/0x770 [mdt]
Jan 21 13:17:41 nbp8-mds1 kernel: [&amp;lt;ffffffffa076f2c5&amp;gt;] ldlm_lock_enqueue+0x135/0x980 [ptlrpc]
Jan 21 13:17:41 nbp8-mds1 kernel: [&amp;lt;ffffffffa0798ebb&amp;gt;] ldlm_handle_enqueue0+0x51b/0x10c0 [ptlrpc]
Jan 21 13:17:41 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e77106&amp;gt;] mdt_enqueue+0x46/0xe0 [mdt]
Jan 21 13:17:41 nbp8-mds1 kernel: [&amp;lt;ffffffffa0e7bada&amp;gt;] mdt_handle_common+0x52a/0x1470 [mdt]
Jan 21 13:17:41 nbp8-mds1 kernel: [&amp;lt;ffffffffa0eb8505&amp;gt;] mds_regular_handle+0x15/0x20 [mdt]
Jan 21 13:17:41 nbp8-mds1 kernel: [&amp;lt;ffffffffa07c80c5&amp;gt;] ptlrpc_server_handle_request+0x385/0xc00 [ptlrpc]
Jan 21 13:17:41 nbp8-mds1 kernel: [&amp;lt;ffffffffa04f08d5&amp;gt;] ? lc_watchdog_touch+0x65/0x170 [libcfs]
Jan 21 13:17:41 nbp8-mds1 kernel: [&amp;lt;ffffffffa07c0a69&amp;gt;] ? ptlrpc_wait_event+0xa9/0x2d0 [ptlrpc]
Jan 21 13:17:41 nbp8-mds1 kernel: [&amp;lt;ffffffffa07ca89d&amp;gt;] ptlrpc_main+0xafd/0x1780 [ptlrpc]
Jan 21 13:17:41 nbp8-mds1 kernel: [&amp;lt;ffffffff8100c20a&amp;gt;] child_rip+0xa/0x20
Jan 21 13:17:41 nbp8-mds1 kernel: [&amp;lt;ffffffffa07c9da0&amp;gt;] ? ptlrpc_main+0x0/0x1780 [ptlrpc]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Here is the client console log&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[1453410882.304755] LNetError: 7223:0:(o2iblnd_cb.c:3018:kiblnd_check_txs_locked()) Timed out tx: active_txs, 3 seconds
[1453410882.316755] LNetError: 7223:0:(o2iblnd_cb.c:3081:kiblnd_check_conns()) Timed out RDMA with 10.153.26.93@o2ib233 (13): c: 61, oc: 0, rc: 63
[1453410882.328756] LustreError: 7224:0:(events.c:203:client_bulk_callback()) event type 1, status -5, desc ffff8807b8798000
[1453410882.340756] Lustre: 7260:0:(client.c:1940:ptlrpc_expire_one_request()) @@@ Request sent has failed due to network error: [sent 1453410728/real 1453410728]  req@ffff880c045b5000 x1523566274464988/t0(0) o4-&amp;gt;nbp8-OST00eb-osc-ffff88060b4b8800@10.151.27.63@o2ib:6/4 lens 488/448 e 0 to 1 dl 1453411070 ref 2 fl Rpc:X/0/ffffffff rc 0/-1
[1453410882.340756] Lustre: 7260:0:(client.c:1940:ptlrpc_expire_one_request()) Skipped 82 previous similar messages
[1453410882.340756] Lustre: nbp8-OST00eb-osc-ffff88060b4b8800: Connection to nbp8-OST00eb (at 10.151.27.63@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
[1453410882.340756] Lustre: Skipped 133 previous similar messages
[1453410882.404758] Lustre: nbp8-OST00eb-osc-ffff88060b4b8800: Connection restored to nbp8-OST00eb (at 10.151.27.63@o2ib)
[1453410882.404758] Lustre: Skipped 56 previous similar messages
[1453410919.469932] LustreError: 7232:0:(events.c:203:client_bulk_callback()) event type 1, status -5, desc ffff880bce378000
[1453411070.294708] Lustre: 24291:0:(client.c:1940:ptlrpc_expire_one_request()) @@@ Request sent has timed out &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; slow reply: [sent 1453410728/real 1453410728]  req@ffff88061692bc00 x1523566274464784/t0(0) o101-&amp;gt;nbp8-MDT0000-mdc-ffff88060b4b8800@10.151.27.60@o2ib:12/10 lens 624/4944 e 0 to 1 dl 1453411070 ref 2 fl Rpc:X/0/ffffffff rc 0/-1
[1453411070.322709] Lustre: 24291:0:(client.c:1940:ptlrpc_expire_one_request()) Skipped 1 previous similar message
[1453411070.334710] Lustre: nbp8-MDT0000-mdc-ffff88060b4b8800: Connection to nbp8-MDT0000 (at 10.151.27.60@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
[1453411071.346742] Lustre: nbp9-OST00e3-osc-ffff880607dea400: Connection restored to nbp9-OST00e3 (at 10.151.26.9@o2ib)
[1453411096.347534] Lustre: nbp8-OST00be-osc-ffff88060b4b8800: Connection to nbp8-OST00be (at 10.151.27.70@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
[1453411096.363534] Lustre: Skipped 169 previous similar messages
[1453411121.368326] Lustre: nbp2-OST001f-osc-ffff880607d01c00: Connection restored to nbp2-OST001f (at 10.151.26.108@o2ib)
[1453411121.368326] Lustre: Skipped 178 previous similar messages
[1453411146.369118] Lustre: 7247:0:(client.c:1940:ptlrpc_expire_one_request()) @@@ Request sent has timed out &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; slow reply: [sent 1453410804/real 1453410804]  req@ffff880a414bb400 x1523566274482484/t0(0) o400-&amp;gt;nbp7-OST0014-osc-ffff880607e0e000@10.151.27.45@o2ib:28/4 lens 224/224 e 0 to 1 dl 1453411146 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1
[1453411146.369118] Lustre: nbp7-OST0023-osc-ffff880607e0e000: Connection to nbp7-OST0023 (at 10.151.27.51@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
[1453411146.369118] Lustre: Skipped 34 previous similar messages
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="139779" author="mhanafi" created="Fri, 22 Jan 2016 19:30:55 +0000"  >&lt;p&gt;Some info from the user. Each task reads a restart file does some processing and writes the results. All the 5832 restart files are in the same directory.&lt;/p&gt;</comment>
                            <comment id="139854" author="green" created="Mon, 25 Jan 2016 01:55:12 +0000"  >&lt;p&gt;The relationship between the message and the eviction is direct more or less.&lt;/p&gt;

&lt;p&gt;What happens here is:&lt;br/&gt;
1. A client gets a lock and starts to do IO under the lock.&lt;br/&gt;
2. Somebody else wants the lock, so the server asks the above client to finish the IO and drop the lock.&lt;br/&gt;
3. The IO encounters a timeout, and the client is evicted.&lt;br/&gt;
4. The lock is cancelled.&lt;br/&gt;
5. If some thread spent a long time waiting for the lock, you&apos;ll get the message.&lt;/p&gt;

&lt;p&gt;Now in your case the eviction is on one server (OST), but the message is on another (MDT). They should not really influence each other as such.&lt;br/&gt;
But if there&apos;s some message dropping going on, then if the blocking AST is lost in transit, you&apos;ll see exactly this:&lt;br/&gt;
&quot;lock callback timer expired after&quot; on the server, which then evicts the offending client; the thread that was waiting for the lock is then able to get it, do whatever processing it needed, and complain that the processing took too long once it was done - all thanks to the long wait to get the lock.&lt;/p&gt;</comment>
                            <comment id="139967" author="mhanafi" created="Mon, 25 Jan 2016 21:06:22 +0000"  >&lt;p&gt;This issues is triggered when our remote cluster loses connectivity to the servers. We are trying to find the root cause of the network errors. But the MDT threads shouldn&apos;t lockup.&lt;/p&gt;
</comment>
                            <comment id="140161" author="bobijam" created="Wed, 27 Jan 2016 04:12:30 +0000"  >&lt;p&gt;I think the patch &lt;a href=&quot;http://review.whamcloud.com/#/c/17853/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/17853/&lt;/a&gt; addresses the MDT thread lockup issue, where the thread won&apos;t come out of ldlm_completion_ast() waiting.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Jan 21 13:17:38 nbp8-mds1 kernel: [&amp;lt;ffffffffa076a885&amp;gt;] ? _ldlm_lock_debug+0x2d5/0x660 [ptlrpc]
Jan 21 13:17:38 nbp8-mds1 kernel: [&amp;lt;ffffffff8155a9c2&amp;gt;] schedule_timeout+0x192/0x2e0
Jan 21 13:17:38 nbp8-mds1 kernel: [&amp;lt;ffffffff81083300&amp;gt;] ? process_timeout+0x0/0x10
Jan 21 13:17:38 nbp8-mds1 kernel: [&amp;lt;ffffffffa078af70&amp;gt;] ? ldlm_expired_completion_wait+0x0/0x360 [ptlrpc]
Jan 21 13:17:38 nbp8-mds1 kernel: [&amp;lt;ffffffffa078f7a1&amp;gt;] ldlm_completion_ast+0x4b1/0x920 [ptlrpc]
...
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;ldlm_expired_completion_wait() should return -ETIMEDOUT instead of 0; otherwise ldlm_completion_ast() won&apos;t finish and keeps waiting.&lt;/p&gt;</comment>
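<!--
A minimal standalone C sketch (illustrative only, not the actual Lustre
source) of the wait semantics described in the comment above: the waiter
keeps sleeping until its timeout callback returns a non-zero error such as
-ETIMEDOUT; a callback that returns 0 leaves the thread waiting forever,
which is the hang that patch 17853 addresses. All names here are
hypothetical stand-ins for ldlm_completion_ast() and
ldlm_expired_completion_wait().

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static bool lock_granted;      /* would be set by the lock-granting path */

/* stand-in for ldlm_expired_completion_wait() */
static int expired_completion_wait(void)
{
        fprintf(stderr, "completion wait expired\n");
        return -ETIMEDOUT;     /* returning 0 here would mean "keep waiting" */
}

/* stand-in for the wait loop inside ldlm_completion_ast() */
static int completion_ast_wait(int max_ticks)
{
        for (int tick = 0; tick < max_ticks; tick++) {
                if (lock_granted)
                        return 0;               /* lock granted, done */
                if (tick == max_ticks / 2) {    /* timeout fires mid-wait */
                        int rc = expired_completion_wait();
                        if (rc != 0)
                                return rc;      /* abort the wait */
                        /* rc == 0: loop on, i.e. the hung behaviour */
                }
        }
        return -ETIMEDOUT;
}

int main(void)
{
        printf("wait rc = %d\n", completion_ast_wait(10));
        return 0;
}
-->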
                            <comment id="140269" author="jaylan" created="Wed, 27 Jan 2016 19:18:01 +0000"  >&lt;p&gt;I always wonder how I can tell which patchset of a review was cherry-picked into our tree. Note that we cherry-picked some patch not in b2_5_fe repo, so the commit id is different than the one that I cherry-picked from. I have been using date that I did cherry-pick and examining codes to judge. Is there a good way to tell?&lt;/p&gt;

&lt;p&gt;As for &lt;a href=&quot;http://review.whamcloud.com/#/c/17853/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/17853/&lt;/a&gt;, I picked patch set #2 of 17853 (see &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7640&quot; title=&quot;stuck mdt thread required reboot of mds&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7640&quot;&gt;&lt;del&gt;LU-7640&lt;/del&gt;&lt;/a&gt;) but the MDS still hung. I noticed that the latest patch set is #4. I will cherry-pick #4 when it passes your internal testing.&lt;/p&gt;</comment>
                            <comment id="140421" author="jaylan" created="Thu, 28 Jan 2016 19:30:22 +0000"  >&lt;p&gt;I just posted a comment to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7372&quot; title=&quot;replay-dual test_26: test failed to respond and timed out&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7372&quot;&gt;&lt;del&gt;LU-7372&lt;/del&gt;&lt;/a&gt;:&lt;/p&gt;

&lt;p&gt;&quot;&lt;br/&gt;
Zhenyu Xu ported an earlier version (patch set 3, IIRC) of #17853 for b2_5_fe at&lt;br/&gt;
&lt;a href=&quot;http://review.whamcloud.com/17976&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/17976&lt;/a&gt; (see comment at 12/Jan/16 9:26 PM)&lt;/p&gt;

&lt;p&gt;Could someone from Intel update 17976 to the latest patch set #4 for us? Thanks!&lt;br/&gt;
&quot;&lt;/p&gt;</comment>
                            <comment id="140424" author="jaylan" created="Thu, 28 Jan 2016 20:00:46 +0000"  >&lt;p&gt;I checked the difference of patch set 3 and patch set 4 of &lt;br/&gt;
&lt;a href=&quot;http://review.whamcloud.com/#/c/17853/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/17853/&lt;/a&gt;&lt;br/&gt;
Well, no difference. They are the same.&lt;/p&gt;

&lt;p&gt;Assuming&lt;br/&gt;
&lt;a href=&quot;http://review.whamcloud.com/17976&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/17976&lt;/a&gt; is a b2_5_fe port of 17853 patch set 3, and we already have that patch on our MDS/MGS, then the patch did not address our problem.&lt;/p&gt;</comment>
                            <comment id="141276" author="jay" created="Thu, 4 Feb 2016 23:40:52 +0000"  >&lt;p&gt;Hi Jay,&lt;/p&gt;

&lt;p&gt;Can you please extract the log of all processes from the crash dump and post it here?&lt;/p&gt;

&lt;p&gt;This doesn&apos;t look like a duplicate of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7372&quot; title=&quot;replay-dual test_26: test failed to respond and timed out&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7372&quot;&gt;&lt;del&gt;LU-7372&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;From the backtrace, the MDT thread is doing an open; it&apos;s also conflicting with a PR lock with LOOKUP|UPDATE|LAYOUT bits, so the evicted client had opened the file for execution. However, I don&apos;t know what type of lock the stuck MDT thread was requesting.&lt;/p&gt;</comment>
                            <comment id="141289" author="mhanafi" created="Fri, 5 Feb 2016 01:03:52 +0000"  >&lt;p&gt;Attaching the process trace and dmesg output. &lt;/p&gt;

&lt;p&gt;mds.processtrace&lt;br/&gt;
mds.dmesg&lt;/p&gt;</comment>
                            <comment id="141303" author="jay" created="Fri, 5 Feb 2016 07:00:00 +0000"  >&lt;p&gt;From the backtrace, ALL mdt threads were blocked at LDLM lock enqueue. This implied a deadlock case if the DLM lock that blocked those MDT threads was canceled by ELC so that it was piggy-backed in a RPC request, but that RPC request couldn&apos;t be served due to lack of MDT threads. This is really possible based on the behavior of that user;s application.&lt;/p&gt;

&lt;p&gt;Let&apos;s try to disable ELC by setting this:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lctl set_param ldlm.namespaces.*-mdc-*.early_lock_cancel=0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;on all client nodes.&lt;/p&gt;

&lt;p&gt;This may have some negative impact on performance; definitely do not try it at peak hours.&lt;/p&gt;

&lt;p&gt;One thing I have noticed is this message from the mds:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;&amp;lt;6&amp;gt;Lustre: nbp8-MDT0000: Recovery over after 2:29, of 12307 clients 12307 recovered and 0 were evicted.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;12307 clients were recovered in 2:29; this is really impressive.&lt;/p&gt;</comment>
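<!--
If trying the workaround above, the current value can be read back on a
client with "lctl get_param ldlm.namespaces.*-mdc-*.early_lock_cancel",
and ELC can be re-enabled later by setting the same parameter back to 1
(assuming the parameter path quoted in the comment above).
-->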
                            <comment id="142010" author="mhanafi" created="Thu, 11 Feb 2016 19:14:45 +0000"  >&lt;p&gt;Can you provide any additional info on a permanent solution. &lt;/p&gt;</comment>
                            <comment id="142021" author="jay" created="Thu, 11 Feb 2016 20:42:58 +0000"  >&lt;p&gt;I need to understand the problem first and then propose a solution.&lt;/p&gt;</comment>
                            <comment id="142022" author="jay" created="Thu, 11 Feb 2016 20:43:51 +0000"  >&lt;p&gt;wait, does it imply it actually helped by disabling ELC?&lt;/p&gt;</comment>
                            <comment id="142472" author="mhanafi" created="Wed, 17 Feb 2016 19:35:11 +0000"  >&lt;p&gt;We haven&apos;t tried &apos;disabling ELC&apos; because the filesystem is in production. We know that this is triggered when clients are writing/read a large number of files and the clients get disconnected due to a network issue.&lt;/p&gt;</comment>
                            <comment id="165345" author="mhanafi" created="Thu, 8 Sep 2016 16:50:08 +0000"  >&lt;p&gt;Please close this case. No longer an issue.&lt;/p&gt;</comment>
                            <comment id="165360" author="pjones" created="Thu, 8 Sep 2016 18:06:39 +0000"  >&lt;p&gt;ok - thanks Mahmoud&lt;/p&gt;</comment>
                            <comment id="174471" author="spiechurski" created="Mon, 21 Nov 2016 16:26:52 +0000"  >&lt;p&gt;Just to make sure: Is this no longer an issue because it was fixed by patch 17853 for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7372&quot; title=&quot;replay-dual test_26: test failed to respond and timed out&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7372&quot;&gt;&lt;del&gt;LU-7372&lt;/del&gt;&lt;/a&gt;, or simply because it was never seen again (maybe it was fixed on the application side) ?&lt;/p&gt;</comment>
                            <comment id="174477" author="mhanafi" created="Mon, 21 Nov 2016 17:10:05 +0000"  >&lt;p&gt;It was not fixed by &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7372&quot; title=&quot;replay-dual test_26: test failed to respond and timed out&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7372&quot;&gt;&lt;del&gt;LU-7372&lt;/del&gt;&lt;/a&gt;. We upgraded to 2.7 and was not seen again.&lt;/p&gt;</comment>
                            <comment id="174481" author="spiechurski" created="Mon, 21 Nov 2016 17:26:35 +0000"  >&lt;p&gt;Ok, thanks Mahmoud.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="32965">LU-7372</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                                        </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="20173" name="lustre-log.1453326130.17862.gz" size="257" author="mhanafi" created="Thu, 21 Jan 2016 19:45:22 +0000"/>
                            <attachment id="20332" name="mds.dmesg" size="243124" author="mhanafi" created="Fri, 5 Feb 2016 01:04:53 +0000"/>
                            <attachment id="20331" name="mds.processtrace" size="2110138" author="mhanafi" created="Fri, 5 Feb 2016 01:04:36 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10490" key="com.atlassian.jira.plugin.system.customfieldtypes:datepicker">
                        <customfieldname>End date</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>Fri, 29 Apr 2016 22:17:10 +0000</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                            <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzxyon:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                        <customfield id="customfield_10493" key="com.atlassian.jira.plugin.system.customfieldtypes:datepicker">
                        <customfieldname>Start date</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>Wed, 20 Jan 2016 22:17:10 +0000</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                    </customfields>
    </item>
</channel>
</rss>