<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:04:01 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-13764] lp_dc_error is not cleared when new discovery ping is sent as response to incoming push</title>
                <link>https://jira.whamcloud.com/browse/LU-13764</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This bug can result in routes being marked down when they are in fact up:&lt;/p&gt;

&lt;p&gt;Ping fails: lp_dc_error is set, and lp_state is set with FORCE_PING&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000400:00000200:25.0:1594238418.911990:0:44689:0:(peer.c:3029:lnet_peer_send_ping()) peer 601@gni99
00000400:00000200:25.0:1594238418.911990:0:44689:0:(peer.c:3394:lnet_peer_discovery()) peer 601@gni99(ffff90348c6c8000) state 0x30275 rc 0
00000400:00000200:25.0:1594238420.912144:0:44689:0:(peer.c:3260:lnet_peer_discovery_wait_for_work()) woken: 0
00000400:00000200:25.0:1594238420.912146:0:44689:0:(peer.c:3375:lnet_peer_discovery()) peer 601@gni99(ffff90348c6c8000) state 0x30875
00000400:00000200:25.0:1594238420.912147:0:44689:0:(peer.c:2523:lnet_discovery_event_handler()) Received event: 6
00000400:00000200:25.0:1594238420.912148:0:44689:0:(lib-md.c:69:lnet_md_unlink()) Unlinking md ffff905fbb363210
00000400:00000200:25.0:1594238420.912150:0:44689:0:(peer.c:2954:lnet_peer_ping_failed()) peer 601@gni99:-113
00000400:00000200:25.0:1594238420.912150:0:44689:0:(peer.c:3394:lnet_peer_discovery()) peer 601@gni99(ffff90348c6c8000) state 0x32075 rc -113
00000400:00000200:25.0:1594238420.912152:0:44689:0:(peer.c:3161:lnet_peer_discovery_error()) Discovery error 601@gni99: -113
00000400:00000200:25.0:1594238420.912153:0:44689:0:(peer.c:1938:lnet_peer_discovery_complete()) Discovery complete. Dequeue peer 601@gni99
00000400:00000200:25.0:1594238420.912154:0:44689:0:(router.c:576:lnet_router_discovery_complete()) 601@gni99: Router discovery failed -113
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Push arrives&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000400:00000200:0.0:1594238425.096430:0:43969:0:(api-ni.c:2085:lnet_push_target_event_handler()) type 2 status 0 unlinked 1
00000400:00000200:0.0:1594238425.096432:0:43969:0:(peer.c:2065:lnet_peer_push_event()) peer 601@gni99(ffff90348c6c8000) is MR
00000400:00000200:0.0:1594238425.096433:0:43969:0:(peer.c:2142:lnet_peer_push_event()) Received Push 601@gni99 1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Push is processed:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000400:00000200:25.0:1594238425.096441:0:44689:0:(peer.c:3260:lnet_peer_discovery_wait_for_work()) woken: 0
00000400:00000200:25.0:1594238425.096445:0:44689:0:(api-ni.c:2075:lnet_push_target_post()) posted push target ffff9064ad063000
00000400:00000200:25.0:1594238425.096449:0:44689:0:(peer.c:3375:lnet_peer_discovery()) peer 601@gni99(ffff90348c6c8000) state 0x220f5
00000400:00000200:25.0:1594238425.096452:0:44689:0:(peer.c:2659:lnet_peer_merge_data()) cur lpni 10.16.100.10@o2ib10 status 363512030
00000400:00000200:25.0:1594238425.096453:0:44689:0:(peer.c:2659:lnet_peer_merge_data()) cur lpni 601@gni99 status 363512030
00000400:00000200:25.0:1594238425.096454:0:44689:0:(peer.c:2659:lnet_peer_merge_data()) cur lpni 601@gni status 363512030
00000400:00000200:25.0:1594238425.096455:0:44689:0:(peer.c:2726:lnet_peer_merge_data()) peer 601@gni99 (ffff90348c6c8000): 0
00000400:00000200:25.0:1594238425.096456:0:44689:0:(peer.c:2921:lnet_peer_data_present()) peer 601@gni99(ffff90348c6c8000): 0. state = 0x22175
00000400:00000200:25.0:1594238425.096458:0:44689:0:(peer.c:3394:lnet_peer_discovery()) peer 601@gni99(ffff90348c6c8000) state 0x22175 rc 1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Peer is marked with FORCE_PING, so a ping is sent:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000400:00000200:25.0:1594238425.096459:0:44689:0:(peer.c:3375:lnet_peer_discovery()) peer 601@gni99(ffff90348c6c8000) state 0x22175
00000400:00000200:25.0:1594238425.096462:0:44689:0:(lib-move.c:5068:LNetGet()) LNetGet -&amp;gt; 12345-10.16.100.10@o2ib10
...
00000400:00000200:25.0:1594238425.096475:0:44689:0:(lib-move.c:1855:lnet_handle_send()) TRACE: 10.16.100.60@o2ib10(10.16.100.60@o2ib10:&amp;lt;?&amp;gt;) -&amp;gt; 10.16.100.10@o2ib10(10.16.100.10@o2ib10:10.16.100.10@o2ib10) &amp;lt;?&amp;gt; : GET try# 0
00000800:00000200:25.0:1594238425.096477:0:44689:0:(o2iblnd_cb.c:1663:kiblnd_send()) sending 0 bytes in 0 frags to 12345-10.16.100.10@o2ib10
00000800:00000200:25.0:1594238425.096480:0:44689:0:(o2iblnd.c:405:kiblnd_find_peer_locked()) got peer_ni [ffff9034adc34a80] -&amp;gt; 10.16.100.10@o2ib10 (2) version: 12
00000800:00000200:25.0:1594238425.096481:0:44689:0:(o2iblnd_cb.c:1543:kiblnd_launch_tx()) conn[ffff90345d3e1e00] (36)++
00000800:00000200:25.0:1594238425.096482:0:44689:0:(o2iblnd_cb.c:1316:kiblnd_queue_tx_locked()) conn[ffff90345d3e1e00] (37)++
00000800:00000200:25.0:1594238425.096485:0:44689:0:(o2iblnd_cb.c:1549:kiblnd_launch_tx()) conn[ffff90345d3e1e00] (38)--
00000400:00000200:25.0:1594238425.096486:0:44689:0:(peer.c:3029:lnet_peer_send_ping()) peer 601@gni99
00000400:00000200:25.0:1594238425.096486:0:44689:0:(peer.c:3394:lnet_peer_discovery()) peer 601@gni99(ffff90348c6c8000) state 0x20375 rc 0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Ping reply is processed:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000400:00000200:25.0:1594238425.096580:0:44689:0:(peer.c:3260:lnet_peer_discovery_wait_for_work()) woken: 0
00000400:00000200:25.0:1594238425.096581:0:44689:0:(peer.c:3375:lnet_peer_discovery()) peer 601@gni99(ffff90348c6c8000) state 0x201f5
00000400:00000200:25.0:1594238425.096582:0:44689:0:(peer.c:2659:lnet_peer_merge_data()) cur lpni 10.16.100.10@o2ib10 status 363512030
00000400:00000200:25.0:1594238425.096583:0:44689:0:(peer.c:2659:lnet_peer_merge_data()) cur lpni 601@gni99 status 363512030
00000400:00000200:25.0:1594238425.096584:0:44689:0:(peer.c:2659:lnet_peer_merge_data()) cur lpni 601@gni status 363512030
00000400:00000200:25.0:1594238425.096584:0:44689:0:(peer.c:2726:lnet_peer_merge_data()) peer 601@gni99 (ffff90348c6c8000): 0
00000400:00000200:25.0:1594238425.096585:0:44689:0:(peer.c:2921:lnet_peer_data_present()) peer 601@gni99(ffff90348c6c8000): 0. state = 0x20175
00000400:00000200:25.0:1594238425.096586:0:44689:0:(peer.c:3394:lnet_peer_discovery()) peer 601@gni99(ffff90348c6c8000) state 0x20175 rc 1
00000400:00000200:25.0:1594238425.096587:0:44689:0:(peer.c:3375:lnet_peer_discovery()) peer 601@gni99(ffff90348c6c8000) state 0x20175
00000400:00000200:25.0:1594238425.096587:0:44689:0:(peer.c:3180:lnet_peer_discovered()) peer 601@gni99
00000400:00000200:25.0:1594238425.096588:0:44689:0:(peer.c:3394:lnet_peer_discovery()) peer 601@gni99(ffff90348c6c8000) state 0x20115 rc 0
00000400:00000200:25.0:1594238425.096589:0:44689:0:(peer.c:1938:lnet_peer_discovery_complete()) Discovery complete. Dequeue peer 601@gni99
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;lp_dc_error was never cleared, so it looks like discovery failed, and route is marked down:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000400:00000200:25.0:1594238425.096590:0:44689:0:(router.c:576:lnet_router_discovery_complete()) 601@gni99: Router discovery failed -113
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="59891">LU-13764</key>
            <summary>lp_dc_error is not cleared when new discovery ping is sent as response to incoming push</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="hornc">Chris Horn</assignee>
                                    <reporter username="hornc">Chris Horn</reporter>
                        <labels>
                    </labels>
                <created>Wed, 8 Jul 2020 21:28:10 +0000</created>
                <updated>Fri, 7 Aug 2020 19:26:47 +0000</updated>
                            <resolved>Fri, 7 Aug 2020 19:26:39 +0000</resolved>
                                                    <fixVersion>Lustre 2.14.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>4</watches>
                                                                            <comments>
                            <comment id="274804" author="ashehata" created="Wed, 8 Jul 2020 21:33:29 +0000"  >&lt;p&gt;FORCE_PING should be cleared when the ping fails. This will not trigger another PING until the next time the router checker decides to re-discover the router. In which case the error will be cleared. Would that resolve the issue?&lt;/p&gt;

&lt;p&gt;UPDATE: After discussing it with Chris, seems like we should clear the error when the discovery completes successfully. This will prevent the error from bringing down the route.&lt;/p&gt;

&lt;p&gt;Normally, the discovery FSM marks the peer with an error and flags it for rediscovery, such that any action on that peer will trigger it to be rediscovered. The expected actions was a send messages to the peer. However, now that we&apos;re using discovery not only to actively discover, but to actively push updates to peers when the internal state changes, this scenario is possible and the FSM needs to be updated to take the new flow into consideration.&lt;/p&gt;</comment>
                            <comment id="275276" author="gerrit" created="Mon, 13 Jul 2020 20:05:22 +0000"  >&lt;p&gt;Chris Horn (chris.horn@hpe.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/39348&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/39348&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13764&quot; title=&quot;lp_dc_error is not cleared when new discovery ping is sent as response to incoming push&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13764&quot;&gt;&lt;del&gt;LU-13764&lt;/del&gt;&lt;/a&gt; lnet: Clear lp_dc_error when discovery completes&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 56df66290ca67494d71d9354e0f06686683a0608&lt;/p&gt;</comment>
                            <comment id="276902" author="gerrit" created="Fri, 7 Aug 2020 04:59:34 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/39348/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/39348/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13764&quot; title=&quot;lp_dc_error is not cleared when new discovery ping is sent as response to incoming push&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13764&quot;&gt;&lt;del&gt;LU-13764&lt;/del&gt;&lt;/a&gt; lnet: Clear lp_dc_error when discovery completes&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: d3fa46705225f3f17aa10144ab688008d5230daa&lt;/p&gt;</comment>
                            <comment id="276975" author="spitzcor" created="Fri, 7 Aug 2020 19:08:12 +0000"  >&lt;p&gt;I believe that this can be resolved (but I don&apos;t want to do it myself in case there is a specific protocol to follow that I&apos;m not aware of).&lt;/p&gt;</comment>
                            <comment id="276976" author="pjones" created="Fri, 7 Aug 2020 19:26:39 +0000"  >&lt;p&gt;Yes I agree - thanks Cory. No special protocol (other than setting fix version)&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i014pj:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>