<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:38:17 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-10800] Mount hangs on clients. </title>
                <link>https://jira.whamcloud.com/browse/LU-10800</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Mounts frequently hang on clients. &lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Mar  9 18:14:41 soak-36 kernel: INFO: task mount.lustre:2807 blocked &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; more than 120 seconds.
Mar  9 18:14:41 soak-36 kernel: &lt;span class=&quot;code-quote&quot;&gt;&quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot;&lt;/span&gt; disables &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; message.
Mar  9 18:14:41 soak-36 kernel: mount.lustre    D ffff88085b7a0000     0  2807   2806 0x00000080
Mar  9 18:14:41 soak-36 kernel: Call Trace:
Mar  9 18:14:41 soak-36 kernel: [&amp;lt;ffffffff816ab8a9&amp;gt;] schedule+0x29/0x70
Mar  9 18:14:41 soak-36 kernel: [&amp;lt;ffffffff816a92b9&amp;gt;] schedule_timeout+0x239/0x2c0
Mar  9 18:14:41 soak-36 kernel: [&amp;lt;ffffffff81050b4c&amp;gt;] ? native_smp_send_reschedule+0x4c/0x70
Mar  9 18:14:41 soak-36 kernel: [&amp;lt;ffffffff810c2358&amp;gt;] ? resched_curr+0xa8/0xc0
Mar  9 18:14:41 soak-36 kernel: [&amp;lt;ffffffff810c30d8&amp;gt;] ? check_preempt_curr+0x78/0xa0
Mar  9 18:14:41 soak-36 kernel: [&amp;lt;ffffffff810c3119&amp;gt;] ? ttwu_do_wakeup+0x19/0xd0
Mar  9 18:14:41 soak-36 kernel: [&amp;lt;ffffffff816abc5d&amp;gt;] wait_for_completion+0xfd/0x140
Mar  9 18:14:42 soak-36 kernel: [&amp;lt;ffffffff810c6620&amp;gt;] ? wake_up_state+0x20/0x20
Mar  9 18:14:42 soak-36 kernel: [&amp;lt;ffffffffc0b28854&amp;gt;] llog_process_or_fork+0x244/0x450 [obdclass]
Mar  9 18:14:42 soak-36 kernel: [&amp;lt;ffffffffc0b28a74&amp;gt;] llog_process+0x14/0x20 [obdclass]
Mar  9 18:14:42 soak-36 kernel: [&amp;lt;ffffffffc0b5b1c5&amp;gt;] class_config_parse_llog+0x125/0x350 [obdclass]
Mar  9 18:14:42 soak-36 kernel: [&amp;lt;ffffffffc06501c8&amp;gt;] mgc_process_cfg_log+0x788/0xc40 [mgc]
Mar  9 18:14:42 soak-36 kernel: [&amp;lt;ffffffffc0652243&amp;gt;] mgc_process_log+0x3d3/0x8b0 [mgc]
Mar  9 18:14:42 soak-36 kernel: [&amp;lt;ffffffffc0b63240&amp;gt;] ? class_config_dump_handler+0x7e0/0x7e0 [obdclass]
Mar  9 18:14:42 soak-36 kernel: [&amp;lt;ffffffffc0652968&amp;gt;] ? do_config_log_add+0x248/0x580 [mgc]
Mar  9 18:14:42 soak-36 kernel: [&amp;lt;ffffffffc0653840&amp;gt;] mgc_process_config+0x890/0x13f0 [mgc]
Mar  9 18:14:42 soak-36 kernel: [&amp;lt;ffffffffc0b66c85&amp;gt;] lustre_process_log+0x2d5/0xae0 [obdclass]
Mar  9 18:14:42 soak-36 kernel: [&amp;lt;ffffffffc0855e27&amp;gt;] ? libcfs_debug_msg+0x57/0x80 [libcfs]
Mar  9 18:14:43 soak-36 kernel: [&amp;lt;ffffffffc0f3e3bb&amp;gt;] ll_fill_super+0x45b/0x1100 [lustre]
Mar  9 18:14:43 soak-36 kernel: [&amp;lt;ffffffffc0b6caa6&amp;gt;] lustre_fill_super+0x286/0x910 [obdclass]
Mar  9 18:14:43 soak-36 kernel: [&amp;lt;ffffffffc0b6c820&amp;gt;] ? lustre_common_put_super+0x270/0x270 [obdclass]
Mar  9 18:14:43 soak-36 kernel: [&amp;lt;ffffffff81206abd&amp;gt;] mount_nodev+0x4d/0xb0
Mar  9 18:14:43 soak-36 kernel: [&amp;lt;ffffffffc0b64ab8&amp;gt;] lustre_mount+0x38/0x60 [obdclass]
Mar  9 18:14:43 soak-36 kernel: [&amp;lt;ffffffff81207549&amp;gt;] mount_fs+0x39/0x1b0
Mar  9 18:14:43 soak-36 kernel: [&amp;lt;ffffffff81224177&amp;gt;] vfs_kern_mount+0x67/0x110
Mar  9 18:14:43 soak-36 kernel: [&amp;lt;ffffffff81226683&amp;gt;] do_mount+0x233/0xaf0
Mar  9 18:14:43 soak-36 kernel: [&amp;lt;ffffffff811894ee&amp;gt;] ? __get_free_pages+0xe/0x40
Mar  9 18:14:43 soak-36 kernel: [&amp;lt;ffffffff812272c6&amp;gt;] SyS_mount+0x96/0xf0
Mar  9 18:14:43 soak-36 kernel: [&amp;lt;ffffffff816b89fd&amp;gt;] system_call_fastpath+0x16/0x1b
Mar  9 18:16:43 soak-36 kernel: INFO: task mount.lustre:2807 blocked &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; more than 120 seconds.
Mar  9 18:16:43 soak-36 kernel: &lt;span class=&quot;code-quote&quot;&gt;&quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot;&lt;/span&gt; disables &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; message.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I dumped the lustre log during the hang, attached. I also crash-dumped the client, files available on soak &lt;/p&gt;</description>
                <environment>Soak stress cluster, lustre-master-next-ib build 1 </environment>
        <key id="51215">LU-10800</key>
            <summary>Mount hangs on clients. </summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="ashehata">Amir Shehata</assignee>
                                    <reporter username="cliffw">Cliff White</reporter>
                        <labels>
                            <label>soak</label>
                    </labels>
                <created>Fri, 9 Mar 2018 18:28:25 +0000</created>
                <updated>Mon, 19 Mar 2018 17:39:20 +0000</updated>
                            <resolved>Mon, 19 Mar 2018 17:39:20 +0000</resolved>
                                    <version>Lustre 2.11.0</version>
                                    <fixVersion>Lustre 2.11.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>10</watches>
                                                                            <comments>
                            <comment id="223005" author="jhammond" created="Fri, 9 Mar 2018 21:11:46 +0000"  >&lt;p&gt;Lustre: Build Version: 2.10.58_140_ge103776&lt;/p&gt;</comment>
                            <comment id="223014" author="cliffw" created="Fri, 9 Mar 2018 23:00:59 +0000"  >&lt;p&gt;Tried lustre-master-ib build 58, per Peter - same issue version=2.10.58_118_gcf6b8f5&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[  842.767852] INFO: task mount.lustre:2760 blocked for more than 120 seconds.
[  842.791128] &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
[  842.817070] mount.lustre    D ffff88105acb0000     0  2760   2759 0x00000080
[  842.840464] Call Trace:
[  842.848565]  [&amp;lt;ffffffff810cae08&amp;gt;] ? __enqueue_entity+0x78/0x80
[  842.867884]  [&amp;lt;ffffffff810d17fc&amp;gt;] ? enqueue_entity+0x26c/0xb60
[  842.887205]  [&amp;lt;ffffffff816ab8a9&amp;gt;] schedule+0x29/0x70
[  842.903644]  [&amp;lt;ffffffff816a92b9&amp;gt;] schedule_timeout+0x239/0x2c0
[  842.922960]  [&amp;lt;ffffffff810c95d5&amp;gt;] ? sched_clock_cpu+0x85/0xc0
[  842.941987]  [&amp;lt;ffffffff810c30d8&amp;gt;] ? check_preempt_curr+0x78/0xa0
[  842.961880]  [&amp;lt;ffffffff810c3119&amp;gt;] ? ttwu_do_wakeup+0x19/0xd0
[  842.980620]  [&amp;lt;ffffffff816abc5d&amp;gt;] wait_for_completion+0xfd/0x140
[  843.000516]  [&amp;lt;ffffffff810c6620&amp;gt;] ? wake_up_state+0x20/0x20
[  843.019150]  [&amp;lt;ffffffffc0a24854&amp;gt;] llog_process_or_fork+0x244/0x450 [obdclass]
[  843.042950]  [&amp;lt;ffffffffc0a24a74&amp;gt;] llog_process+0x14/0x20 [obdclass]
[  843.063850]  [&amp;lt;ffffffffc0a56da5&amp;gt;] class_config_parse_llog+0x125/0x350 [obdclass]
[  843.088364]  [&amp;lt;ffffffffc05a51c8&amp;gt;] mgc_process_cfg_log+0x788/0xc40 [mgc]
[  843.110283]  [&amp;lt;ffffffffc05a7243&amp;gt;] mgc_process_log+0x3d3/0x8b0 [mgc]
[  843.131181]  [&amp;lt;ffffffffc0a5ee20&amp;gt;] ? class_config_dump_handler+0x7e0/0x7e0 [obdclass]
[  843.156774]  [&amp;lt;ffffffffc05a7968&amp;gt;] ? do_config_log_add+0x248/0x580 [mgc]
[  843.178634]  [&amp;lt;ffffffffc05a8840&amp;gt;] mgc_process_config+0x890/0x13f0 [mgc]
[  843.200574]  [&amp;lt;ffffffffc0a62865&amp;gt;] lustre_process_log+0x2d5/0xae0 [obdclass]
[  843.223581]  [&amp;lt;ffffffffc0939e27&amp;gt;] ? libcfs_debug_msg+0x57/0x80 [libcfs]
[  843.245497]  [&amp;lt;ffffffffc0ea994b&amp;gt;] ll_fill_super+0x45b/0x1100 [lustre]
[  843.266861]  [&amp;lt;ffffffffc0a68686&amp;gt;] lustre_fill_super+0x286/0x910 [obdclass]
[  843.289656]  [&amp;lt;ffffffffc0a68400&amp;gt;] ? lustre_common_put_super+0x270/0x270 [obdclass]
[  843.314667]  [&amp;lt;ffffffff81206abd&amp;gt;] mount_nodev+0x4d/0xb0
[  843.331977]  [&amp;lt;ffffffffc0a60698&amp;gt;] lustre_mount+0x38/0x60 [obdclass]
[  843.352676]  [&amp;lt;ffffffff81207549&amp;gt;] mount_fs+0x39/0x1b0
[  843.369355]  [&amp;lt;ffffffff81224177&amp;gt;] vfs_kern_mount+0x67/0x110
[  843.387756]  [&amp;lt;ffffffff81226683&amp;gt;] do_mount+0x233/0xaf0
[  843.404720]  [&amp;lt;ffffffff811894ee&amp;gt;] ? __get_free_pages+0xe/0x40
[  843.423693]  [&amp;lt;ffffffff812272c6&amp;gt;] SyS_mount+0x96/0xf0
[  843.440374]  [&amp;lt;ffffffff816b89fd&amp;gt;] system_call_fastpath+0x16/0x1b
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="223373" author="jhammond" created="Mon, 12 Mar 2018 22:04:58 +0000"  >&lt;p&gt;I got onto soak-34 while it was stuck in mount and found the following stack trace for the llog processing thread:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;13498 llog_process_th
[&amp;lt;ffffffffc099d61b&amp;gt;] lnet_discover_peer_locked+0x10b/0x380 [lnet]
[&amp;lt;ffffffffc099d900&amp;gt;] LNetPrimaryNID+0x70/0x1a0 [lnet]
[&amp;lt;ffffffffc0c1937e&amp;gt;] ptlrpc_connection_get+0x3e/0x450 [ptlrpc]
[&amp;lt;ffffffffc0c0da1c&amp;gt;] ptlrpc_uuid_to_connection+0xec/0x1a0 [ptlrpc]
[&amp;lt;ffffffffc0bf21b2&amp;gt;] import_set_conn+0xb2/0x7a0 [ptlrpc]
[&amp;lt;ffffffffc0bf35f9&amp;gt;] client_obd_setup+0xd39/0x1450 [ptlrpc]
[&amp;lt;ffffffffc0d83c0b&amp;gt;] osc_setup_common+0x5b/0x330 [osc]
[&amp;lt;ffffffffc0d83f13&amp;gt;] osc_setup+0x33/0x2f0 [osc]
[&amp;lt;ffffffffc0d8f7c5&amp;gt;] osc_device_alloc+0xa5/0x240 [osc]
[&amp;lt;ffffffffc0a2be2a&amp;gt;] obd_setup+0x11a/0x2b0 [obdclass]
[&amp;lt;ffffffffc0a2d188&amp;gt;] class_setup+0x2a8/0x840 [obdclass]
[&amp;lt;ffffffffc0a30662&amp;gt;] class_process_config+0x1ae2/0x27b0 [obdclass]
[&amp;lt;ffffffffc0a32e94&amp;gt;] class_config_llog_handler+0x914/0x1330 [obdclass]
[&amp;lt;ffffffffc09f69b9&amp;gt;] llog_process_thread+0x839/0x1560 [obdclass]
[&amp;lt;ffffffffc09f80ef&amp;gt;] llog_process_thread_daemonize+0x9f/0xe0 [obdclass]
[&amp;lt;ffffffff810b4031&amp;gt;] kthread+0xd1/0xe0
[&amp;lt;ffffffff816c0577&amp;gt;] ret_from_fork+0x77/0xb0
[&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff

13473 mount.lustre
[&amp;lt;ffffffffc09f7924&amp;gt;] llog_process_or_fork+0x244/0x450 [obdclass]
[&amp;lt;ffffffffc09f7b44&amp;gt;] llog_process+0x14/0x20 [obdclass]
[&amp;lt;ffffffffc0a2a495&amp;gt;] class_config_parse_llog+0x125/0x350 [obdclass]
[&amp;lt;ffffffffc0652038&amp;gt;] mgc_process_cfg_log+0x788/0xc40 [mgc]
[&amp;lt;ffffffffc0655233&amp;gt;] mgc_process_log+0x3d3/0x8b0 [mgc]
[&amp;lt;ffffffffc0656830&amp;gt;] mgc_process_config+0x890/0x13f0 [mgc]
[&amp;lt;ffffffffc0a35fd8&amp;gt;] lustre_process_log+0x2d8/0xaf0 [obdclass]
[&amp;lt;ffffffffc0e835eb&amp;gt;] ll_fill_super+0x45b/0x1100 [lustre]
[&amp;lt;ffffffffc0a3be3c&amp;gt;] lustre_fill_super+0x28c/0x920 [obdclass]
[&amp;lt;ffffffff8120948f&amp;gt;] mount_nodev+0x4f/0xb0
[&amp;lt;ffffffffc0a33df8&amp;gt;] lustre_mount+0x38/0x60 [obdclass]
[&amp;lt;ffffffff81209f1e&amp;gt;] mount_fs+0x3e/0x1b0
[&amp;lt;ffffffff81226d57&amp;gt;] vfs_kern_mount+0x67/0x110
[&amp;lt;ffffffff81229263&amp;gt;] do_mount+0x233/0xaf0
[&amp;lt;ffffffff81229ea6&amp;gt;] SyS_mount+0x96/0xf0
[&amp;lt;ffffffff816c0715&amp;gt;] system_call_fastpath+0x1c/0x21
[&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This is at the &lt;tt&gt;schedule()&lt;/tt&gt; call in the &lt;tt&gt;for (;;&amp;#41;&lt;/tt&gt; loop of &lt;tt&gt;lnet_discover_peer_locked()&lt;/tt&gt;:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@soak-34 ~]# nm /lib/modules/3.10.0-693.21.1.el7.x86_64/extra/lustre-client/net/lnet.ko | grep lnet_discover_peer_locked
0000000000035510 T lnet_discover_peer_locked
[root@soak-34 ~]# printf &apos;%#x\n&apos; $((0x0000000000035510 + 0x10b))
0x3561b
[root@soak-34 ~]# addr2line -e /lib/modules/3.10.0-693.21.1.el7.x86_64/extra/lustre-client/net/lnet.ko --functions --inlines 0x3561b
lnet_discover_peer_locked
/usr/src/debug/lustre-2.10.58_139_g630cd49/lnet/lnet/peer.c:2064
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The last lines in the debug buffer for the processing thread were:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000020:01000000:17.0:1520891497.612056:0:13498:0:(obd_config.c:1599:class_config_llog_handler()) Marker, inst_flg=0x2 mark_flg=0x2
00000020:00000080:17.0:1520891497.612058:0:13498:0:(obd_config.c:1135:class_process_config()) processing cmd: cf010
00000020:00000080:17.0:1520891497.612059:0:13498:0:(obd_config.c:1205:class_process_config()) marker 79 (0x2) soaked-OST0017 add osc
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="223666" author="ashehata" created="Thu, 15 Mar 2018 02:51:43 +0000"  >&lt;p&gt;It appears that the issue is that with peer discovery enabled some nodes are not responding to the ping (REPLY to the GET). Discovery is initiated in:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;ptlrpc_connection_get() &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;by calling&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;LNetPrimaryNID() &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;This function blocks until discovery completes. However because discovery is not getting a reply, the timeout for the reply is 3 minutes. In some of the logs I have seen, it shows that two OSTs are timing out, so that adds 6 minutes to mount time. What we see is that after this long period the mount completes.&lt;/p&gt;

&lt;p&gt;So there appears to be two problems:&lt;/p&gt;
&lt;ol&gt;
	&lt;li&gt;Why are some of the nodes not responding to the ping?&lt;/li&gt;
	&lt;li&gt;The 3 minutes timeout for the ping REPLY or push ACK seems to be an overkill&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;For 1, I have run a simple test where I modprobe lustre on all the nodes and then run lctl ping &amp;lt;ost nid&amp;gt; on all the nodes. What I found is that some would fail the ping. If I run the ping again, the nodes which failed to ping succeed the second time around. This behavior is unrelated to discovery. I&apos;ll need to look at it in more details tomorrow.&lt;/p&gt;

&lt;p&gt;To solve 2, there are two options to consider:&lt;/p&gt;
&lt;ol&gt;
	&lt;li&gt;Reduce the timeout to 5 or 10 seconds. This will reduce the time to mount the FS from 3 minutes to 10 or 20 seconds per failed server. This solution I think would be sufficient for 2.11.&lt;/li&gt;
	&lt;li&gt;Change the behavior to do discovery in the background. Create a non-MR peer and initiate discovery on it. And at the same time continue sending messages as normal. When discovery completes the peer will be updated according to the returned reply. If discovery fails then it can be initiated again. In case of discovery failure the system operates no worse than if discovery is disabled. This can be done for 2.12&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="223786" author="gerrit" created="Thu, 15 Mar 2018 19:15:36 +0000"  >&lt;p&gt;Amir Shehata (amir.shehata@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/31663&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/31663&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10800&quot; title=&quot;Mount hangs on clients. &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10800&quot;&gt;&lt;del&gt;LU-10800&lt;/del&gt;&lt;/a&gt; lnet: reduce discovery timeout&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 3daf02b0245d57b4250e5404985b88eb72af3588&lt;/p&gt;</comment>
                            <comment id="223849" author="gerrit" created="Fri, 16 Mar 2018 15:23:04 +0000"  >&lt;p&gt;John L. Hammond (john.hammond@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/31675&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/31675&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10800&quot; title=&quot;Mount hangs on clients. &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10800&quot;&gt;&lt;del&gt;LU-10800&lt;/del&gt;&lt;/a&gt; lnet: Revert &quot;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10270&quot; title=&quot;remove a early rx code from o2ib lnd&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10270&quot;&gt;LU-10270&lt;/a&gt; lnet: remove an early rx code&quot;&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 46eb7f97c029a998ca384ddd208ff8444d4cb126&lt;/p&gt;</comment>
                            <comment id="223877" author="ashehata" created="Fri, 16 Mar 2018 19:38:57 +0000"  >&lt;p&gt;John found the reason for ping being dropped. There was a patch that was introduced which removed handling the early rxs from the o2iblnd. I believe we are hitting this case. Which triggered the discovery ping timeout. We tested the revert on soak and it works properly.&lt;/p&gt;

&lt;p&gt;I believe we should land both the revert and the timeout change.&lt;/p&gt;

&lt;p&gt;The revert will eliminate the drop ping issue.&lt;/p&gt;

&lt;p&gt;The timeout change will deal with cases where there is a legitimate communication failure, like trying to mount while some of the OSTs are down.&lt;/p&gt;</comment>
                            <comment id="223904" author="gerrit" created="Sat, 17 Mar 2018 05:13:48 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/31663/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/31663/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10800&quot; title=&quot;Mount hangs on clients. &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10800&quot;&gt;&lt;del&gt;LU-10800&lt;/del&gt;&lt;/a&gt; lnet: reduce discovery timeout&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 1cf929df259a9aaa5446a4cda637930ca5b27d7a&lt;/p&gt;</comment>
                            <comment id="223961" author="gerrit" created="Mon, 19 Mar 2018 17:36:26 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/31675/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/31675/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10800&quot; title=&quot;Mount hangs on clients. &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10800&quot;&gt;&lt;del&gt;LU-10800&lt;/del&gt;&lt;/a&gt; lnet: Revert &quot;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10270&quot; title=&quot;remove a early rx code from o2ib lnd&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10270&quot;&gt;LU-10270&lt;/a&gt; lnet: remove an early rx code&quot;&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 6224bb44d8d10894f1c21921a0224dd8baf0ded0&lt;/p&gt;</comment>
                            <comment id="223963" author="pjones" created="Mon, 19 Mar 2018 17:39:20 +0000"  >&lt;p&gt;Landed for 2.11&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="48303">LU-9984</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="49427">LU-10270</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="29740" name="soak-36.mount.hang.txt.gz" size="5750" author="cliffw" created="Fri, 9 Mar 2018 18:29:44 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzzu93:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                </customfields>
    </item>
</channel>
</rss>