<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:22:38 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-2133] Peers always report down for non-routers after LND disconnects</title>
                <link>https://jira.whamcloud.com/browse/LU-2133</link>
                <project id="10000" key="LU">Lustre</project>
                    <description></description>
                <environment>Originated from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-630&quot; title=&quot;mount failure after MGS connection lost and file system is unmounted&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-630&quot;&gt;&lt;strike&gt;LU-630&lt;/strike&gt;&lt;/a&gt; patch</environment>
        <key id="16306">LU-2133</key>
            <summary>Peers always report down for non-routers after LND disconnects</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="isaac">Isaac Huang</assignee>
                                    <reporter username="jfilizetti">Jeremy Filizetti</reporter>
                        <labels>
                            <label>ptr</label>
                    </labels>
                <created>Tue, 9 Oct 2012 16:59:01 +0000</created>
                <updated>Sat, 13 Apr 2013 01:34:42 +0000</updated>
                            <resolved>Sat, 13 Apr 2013 01:34:42 +0000</resolved>
                                    <version>Lustre 2.3.0</version>
                    <version>Lustre 2.1.2</version>
                                    <fixVersion>Lustre 2.4.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>5</watches>
                                                                            <comments>
                            <comment id="46279" author="jfilizetti" created="Tue, 9 Oct 2012 17:11:12 +0000"  >&lt;p&gt;After adding the patch from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-630&quot; title=&quot;mount failure after MGS connection lost and file system is unmounted&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-630&quot;&gt;&lt;del&gt;LU-630&lt;/del&gt;&lt;/a&gt; peers always report down after the connection is disconnected even after it is reconnected.  Even though the peer health wasn&apos;t meant as a generic health mechanism as mentioned in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-630&quot; title=&quot;mount failure after MGS connection lost and file system is unmounted&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-630&quot;&gt;&lt;del&gt;LU-630&lt;/del&gt;&lt;/a&gt; reporting incorrectly that the a peer is down even after it just reconnected seems counter-productive.  Is there any reason that LNDs can&apos;t just call lnet_notify to report the link is alive after a the connection is established similar to the following patch:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c
index eb567a5..d620cf3 100644
--- a/lnet/klnds/o2iblnd/o2iblnd_cb.c
+++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c
@@ -2060,6 +2060,7 @@ kiblnd_connreq_done(kib_conn_t *conn, &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; status)
         }
 
         write_unlock_irqrestore(&amp;amp;kiblnd_data.kib_global_lock, flags);
+        lnet_notify(peer-&amp;gt;ibp_ni, peer-&amp;gt;ibp_nid, 1, cfs_time_current());
 
         &lt;span class=&quot;code-comment&quot;&gt;/* Schedule blocked txs */&lt;/span&gt;
         spin_lock (&amp;amp;conn-&amp;gt;ibc_lock);
diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c
index 56e5ad3..e84a37b 100644
--- a/lnet/klnds/socklnd/socklnd_cb.c
+++ b/lnet/klnds/socklnd/socklnd_cb.c
@@ -1987,9 +1987,12 @@ ksocknal_connect (ksock_route_t *route)
                 }
 
                 ksocknal_launch_connection_locked(route);
+                cfs_write_unlock_bh (&amp;amp;ksocknal_data.ksnd_global_lock);
+        } &lt;span class=&quot;code-keyword&quot;&gt;else&lt;/span&gt; {
+                cfs_write_unlock_bh (&amp;amp;ksocknal_data.ksnd_global_lock);
+                lnet_notify(peer-&amp;gt;ksnp_ni, peer-&amp;gt;ksnp_id.nid, 1, cfs_time_current());
         }
 
-        cfs_write_unlock_bh (&amp;amp;ksocknal_data.ksnd_global_lock);
         &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; retry_later;
 
  failed:
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Unfortunately I only have ksocklnd and ko2iblnd clients so I don&apos;t know about other LNDs and can&apos;t test them anyways.&lt;/p&gt;</comment>
                            <comment id="46285" author="pjones" created="Tue, 9 Oct 2012 18:08:25 +0000"  >&lt;p&gt;Isaac&lt;/p&gt;

&lt;p&gt;Could you please comment on this one?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="54376" author="isaac" created="Tue, 19 Mar 2013 15:49:35 +0000"  >&lt;p&gt;Hi, did you mean that peers were reported as &quot;down&quot; in /proc/sys/lnet/peers?&lt;/p&gt;

&lt;p&gt;The reason why lnet_notify() isn&apos;t called from LNDs for &quot;up&quot; is that it was a design choice for LNet to poll LNDs when necessary instead of LNDs interrupting LNet - e.g. when peer health isn&apos;t enabled at LNet layer there isn&apos;t any polling calls into LNDs at all, and there&apos;s other reasons here why polling from upper layer is more efficient than interrupting from lower layer.&lt;/p&gt;

&lt;p&gt;If your concern was wrong status shown in /proc/sys/lnet/peers, I think the solution should be to simply show &quot;NA&quot; in /proc/sys/lnet/peers when peer health not enabled - while your patch would enable proper status in the case of reconnects there could be other cases where incorrect status would still show up in /proc/sys/lnet/peers, because without peer health enabled, the aliveness timestamps aren&apos;t refreshed at all.&lt;/p&gt;</comment>
                            <comment id="54412" author="isaac" created="Tue, 19 Mar 2013 19:26:55 +0000"  >&lt;p&gt;Patch posted at &lt;a href=&quot;http://review.whamcloud.com/#change,5770&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#change,5770&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="54697" author="jfilizetti" created="Fri, 22 Mar 2013 19:08:59 +0000"  >&lt;p&gt;Yes, after the peer is goes down it remains down for non-routers after the patch from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-630&quot; title=&quot;mount failure after MGS connection lost and file system is unmounted&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-630&quot;&gt;&lt;del&gt;LU-630&lt;/del&gt;&lt;/a&gt;.  The patch I had here was incomplete but after digging around for the correct place to actually notify LNet it doesn&apos;t seem there are any really good places and based on your comments I guess I can see why.  The problem with just showing NA is we have a requirement to know who is connected.  I lose that ability with the current &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-630&quot; title=&quot;mount failure after MGS connection lost and file system is unmounted&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-630&quot;&gt;&lt;del&gt;LU-630&lt;/del&gt;&lt;/a&gt; patch so it creates a problem while fixing another.&lt;/p&gt;

&lt;p&gt;Instead of just reverting &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-630&quot; title=&quot;mount failure after MGS connection lost and file system is unmounted&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-630&quot;&gt;&lt;del&gt;LU-630&lt;/del&gt;&lt;/a&gt;&apos;s patch I think I&apos;d like to see it revised to do something along the lines of returning 1 from lnet_peer_alive_locked if (the_lnet.ln_routing == 0) instead of returning 0 but not before querying the LND.&lt;/p&gt;</comment>
                            <comment id="54713" author="isaac" created="Fri, 22 Mar 2013 21:41:13 +0000"  >&lt;p&gt;1. By reverting the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-630&quot; title=&quot;mount failure after MGS connection lost and file system is unmounted&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-630&quot;&gt;&lt;del&gt;LU-630&lt;/del&gt;&lt;/a&gt; patch, you&apos;d essentially enable a feature that does not work except on routers. Then you&apos;d have to make sure that it&apos;s disabled everywhere else by module options, otherwise clients and servers would see dropped messages when the network and peers are in good health. In other words, I believe there&apos;s a very good reason for the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-630&quot; title=&quot;mount failure after MGS connection lost and file system is unmounted&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-630&quot;&gt;&lt;del&gt;LU-630&lt;/del&gt;&lt;/a&gt; patch.&lt;/p&gt;

&lt;p&gt;2. &quot;returning 1 from lnet_peer_alive_locked if (the_lnet.ln_routing == 0)&quot; gives false information. &quot;NA&quot; is correct because LNet does not know. If you want to know who is connected, &quot;lctl conn_list&quot; is a better way. It shows active peers at the LND level. /proc/sys/lnet/peers shows peer state in LNet level, and LNet level peer state is persistent, in that even if the LND has closed connection with a peer (e.g. peer is dead), there&apos;d still be an entry for the peer in /proc/sys/lnet/peers.&lt;/p&gt;</comment>
                            <comment id="54714" author="jfilizetti" created="Fri, 22 Mar 2013 22:07:27 +0000"  >&lt;p&gt;Thanks for the clarification on the lnet state.  conn_list looks like it will suffice for status information although certainly not as nice of a layout or as simple to get a list of all peers.  &lt;/p&gt;</comment>
                            <comment id="55608" author="isaac" created="Fri, 5 Apr 2013 16:21:41 +0000"  >&lt;p&gt;New patch at &lt;a href=&quot;http://review.whamcloud.com/#change,5955&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#change,5955&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="56237" author="pjones" created="Sat, 13 Apr 2013 01:34:42 +0000"  >&lt;p&gt;Landed for 2.4&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzv9vr:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>5135</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>