<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:31:05 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-9990] MDS fails to mount due to (client.c:96:ptlrpc_uuid_to_connection()) cannot find peer MGC10.37.248.196@o2ib1 _0!</title>
                <link>https://jira.whamcloud.com/browse/LU-9990</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Recently I started to run into issues with the MDT failing to mount randomly. Now with the latest master the MDT fails to mount every single time. Looking at the debug log I noticed the following error on the MDT:&lt;/p&gt;

&lt;p&gt;(client.c:96:ptlrpc_uuid_to_connection()) cannot find peer MGC10.37.248.196@o2ib1_0!&lt;/p&gt;


</description>
                <environment>Latest lustre 2.10.5X running on RHEL7.4 with default OFED. Using IB for LND.</environment>
        <key id="48309">LU-9990</key>
            <summary>MDS fails to mount due to (client.c:96:ptlrpc_uuid_to_connection()) cannot find peer MGC10.37.248.196@o2ib1 _0!</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="ashehata">Amir Shehata</assignee>
                                    <reporter username="simmonsja">James A Simmons</reporter>
                        <labels>
                    </labels>
                <created>Thu, 14 Sep 2017 14:37:13 +0000</created>
                <updated>Mon, 6 Nov 2017 15:57:24 +0000</updated>
                            <resolved>Tue, 24 Oct 2017 12:51:42 +0000</resolved>
                                    <version>Lustre 2.11.0</version>
                                    <fixVersion>Lustre 2.11.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>12</watches>
                                                                            <comments>
                            <comment id="208362" author="simmonsja" created="Thu, 14 Sep 2017 14:38:01 +0000"  >&lt;p&gt;I attached full debug logs from the MDS/MGS.&lt;/p&gt;</comment>
                            <comment id="208365" author="pjones" created="Thu, 14 Sep 2017 15:01:39 +0000"  >&lt;p&gt;Dropping severity because I take it that this is not a production service interruption&lt;/p&gt;</comment>
                            <comment id="208376" author="simmonsja" created="Thu, 14 Sep 2017 16:30:45 +0000"  >&lt;p&gt;Just our test bed is busted &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/sad.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;</comment>
                            <comment id="208377" author="pjones" created="Thu, 14 Sep 2017 17:13:48 +0000"  >&lt;p&gt;James&lt;/p&gt;

&lt;p&gt;Could you please elaborate as to the exact commit you are seeing this with and the last commit that you did not see this problem?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="208378" author="jhammond" created="Thu, 14 Sep 2017 17:17:47 +0000"  >&lt;p&gt;James,&lt;/p&gt;

&lt;p&gt;Could you be more specific about the Lustre version here? (Saying &apos;latest&apos; does not age well.) Also can you give a recent Lustre version where you didn&apos;t see this issue?&lt;/p&gt;

&lt;p&gt;Does this still happen if you set lnet_peer_discovery_disabled=1 in the lnet module parameters?&lt;/p&gt;</comment>
                            <comment id="208522" author="ashehata" created="Fri, 15 Sep 2017 17:35:56 +0000"  >&lt;p&gt;James, can you also try:  &lt;a href=&quot;https://review.whamcloud.com/#/c/29007/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/29007/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="208526" author="simmonsja" created="Fri, 15 Sep 2017 18:00:41 +0000"  >&lt;p&gt;I did and it still fails to bring up the file system &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/sad.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt; John was right. When I set lnet_peer_discovery_disabled=1 everything worked as usual.&lt;/p&gt;</comment>
                            <comment id="208527" author="ashehata" created="Fri, 15 Sep 2017 18:12:00 +0000"  >&lt;p&gt;in the dump-mds.log I see:&lt;br/&gt;
(lib-move.c:3199:LNetGet()) Error sending GET to 12345-10.37.202.59@o2ib1: -113&lt;/p&gt;

&lt;p&gt;Is that issue resolved?&lt;/p&gt;

&lt;p&gt;Can you also turn on net and neterror when you&apos;re mounting and failing, and attach the output.&lt;/p&gt;</comment>
                            <comment id="208529" author="simmonsja" created="Fri, 15 Sep 2017 18:25:23 +0000"  >&lt;p&gt;That is due to patch &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9810&quot; title=&quot;Melanox OFED 4.1 support&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9810&quot;&gt;&lt;del&gt;LU-9810&lt;/del&gt;&lt;/a&gt; lnet: prefer Fast Reg. I had to revert it locally to make things work better. I will post new logs shortly.&lt;/p&gt;</comment>
                            <comment id="208763" author="simmonsja" created="Tue, 19 Sep 2017 16:53:39 +0000"  >&lt;p&gt;With the revert of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9810&quot; title=&quot;Melanox OFED 4.1 support&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9810&quot;&gt;&lt;del&gt;LU-9810&lt;/del&gt;&lt;/a&gt; prefer Fast Reg and the fix from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9933&quot; title=&quot;Hitting ASSERTION in lnet_peer_add_nid()&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9933&quot;&gt;&lt;del&gt;LU-9933&lt;/del&gt;&lt;/a&gt; + &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9992&quot; title=&quot;Multi-Rail: use lolnd when sending locally&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9992&quot;&gt;&lt;del&gt;LU-9992&lt;/del&gt;&lt;/a&gt; everything seems to work now.&lt;/p&gt;</comment>
                            <comment id="208835" author="simmonsja" created="Tue, 19 Sep 2017 23:14:06 +0000"  >&lt;p&gt;Ah, I found what caused this issue. I have a reproducer. It&apos;s very simple: add routes that are all down to your yaml config file and you will not be able to mount your file system. We have Cray routers and they have been down recently. So add to your yaml config file something like this:&lt;/p&gt;

&lt;p&gt;route:&lt;/p&gt;
&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;net: gni1&lt;br/&gt;
      gateway: 10.37.202.60@o2ib1&lt;br/&gt;
      hop: 1&lt;br/&gt;
      priority: 0&lt;br/&gt;
      state: down&lt;/li&gt;
	&lt;li&gt;net: gni1&lt;br/&gt;
      gateway: 10.37.202.59@o2ib1&lt;br/&gt;
      hop: 1&lt;br/&gt;
      priority: 0&lt;br/&gt;
      state: down&lt;/li&gt;
	&lt;li&gt;net: gni1&lt;br/&gt;
      gateway: 10.37.202.61@o2ib1&lt;br/&gt;
      hop: 1&lt;br/&gt;
      priority: 0&lt;br/&gt;
      state: down&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;And you will see problems.&lt;/p&gt;</comment>
                            <comment id="208841" author="ashehata" created="Wed, 20 Sep 2017 02:09:47 +0000"  >&lt;p&gt;I couldn&apos;t reproduce this on my local setup. I&apos;m assuming that the downed gateways are not needed for the FS to be mounted, correct? I.e., do you need those routes for communication? If that&apos;s not the case, can you please enable net and neterror and dump the logs after the problem happens so I can take a look.&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;lctl set_param debug=+net
lctl set_param debug=+neterror
lctl dk &amp;gt; log
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="208872" author="shadow" created="Wed, 20 Sep 2017 04:14:24 +0000"  >&lt;p&gt;James, &lt;/p&gt;

&lt;p&gt;Can you confirm that &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9180&quot; title=&quot;Upstream ko2iblnd does not work with map_on_demand &amp;lt;256&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9180&quot;&gt;&lt;del&gt;LU-9180&lt;/del&gt;&lt;/a&gt; didn&apos;t cause this bug? If not, can you test with the patches from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9810&quot; title=&quot;Melanox OFED 4.1 support&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9810&quot;&gt;&lt;del&gt;LU-9810&lt;/del&gt;&lt;/a&gt;?&lt;br/&gt;
thanks.&lt;/p&gt;</comment>
                            <comment id="208989" author="simmonsja" created="Thu, 21 Sep 2017 03:56:49 +0000"  >&lt;p&gt;I have been testing without &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9810&quot; title=&quot;Melanox OFED 4.1 support&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9810&quot;&gt;&lt;del&gt;LU-9810&lt;/del&gt;&lt;/a&gt; patches. This bug is still there. It&apos;s just that when I run with &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9810&quot; title=&quot;Melanox OFED 4.1 support&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9810&quot;&gt;&lt;del&gt;LU-9810&lt;/del&gt;&lt;/a&gt; patches, new bugs show up &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/sad.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt; Amir, I will get new debug logs for you Friday.&lt;/p&gt;</comment>
                            <comment id="208990" author="shadow" created="Thu, 21 Sep 2017 04:00:26 +0000"  >&lt;p&gt;James,&lt;/p&gt;

&lt;p&gt;What HW do you use for testing? If it is MLX5, these patches do nothing for you. MLX5 uses a FastReg-only model, while MLX4 supports both FastReg and FMR.&lt;/p&gt;

&lt;p&gt;I don&apos;t have access to HW IB for now, so may check only with VM&apos;s + M-OFED 4.1+ Soft IB&lt;/p&gt;</comment>
                            <comment id="209450" author="simmonsja" created="Mon, 25 Sep 2017 17:27:11 +0000"  >&lt;p&gt;Sorry, I was having a hard time reproducing this problem. It&apos;s not the route configuration that breaks lnet, but the numa node setting in my lnet.conf. I ended up removing the numa stuff from my config file. If you add&lt;/p&gt;

&lt;p&gt;numa:&lt;br/&gt;
   range: 0&lt;/p&gt;

&lt;p&gt;to your lnet yaml config file you will see this breakage. &lt;/p&gt;</comment>
                            <comment id="209496" author="ashehata" created="Mon, 25 Sep 2017 22:24:58 +0000"  >&lt;p&gt;I believe the numa range defaults to 0. When you remove it and you do &quot;lnetctl numa show&quot; do you see a different value for the range?&lt;/p&gt;</comment>
                            <comment id="209501" author="simmonsja" created="Mon, 25 Sep 2017 22:53:53 +0000"  >&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@ninja34 ~&amp;#93;&lt;/span&gt;# lnetctl global show&lt;br/&gt;
global:&lt;br/&gt;
    numa_range: 0&lt;br/&gt;
    max_intf: 200&lt;br/&gt;
    discovery: 1&lt;/p&gt;

&lt;p&gt;Does the numa_range have to be &quot;under&quot; global: in the YAML config file?&lt;/p&gt;</comment>
                            <comment id="209504" author="ashehata" created="Mon, 25 Sep 2017 23:22:00 +0000"  >&lt;p&gt;so the output is a little strange. It looks like you&apos;re using the latest master. But the configuration you&apos;re feeding in seems to have been generated from 2.10. numa should be fed in under the global, as shown in the global show output you pasted above. Now thinking about it, this is a backwards compatibility issue since 2.10 is already out. I&apos;ll have to make lnetctl handle the older configuration as well for master. &lt;/p&gt;

&lt;p&gt;Do you see &quot;call back for &apos;numa&apos; not found&quot; error when you configure with the numa block?&lt;/p&gt;

&lt;p&gt;I think I know what the problem is. When the parser encounters a problem in the YAML file, it&apos;ll quit and simply stop configuring the rest of the items. So it could be that when it hits this error it doesn&apos;t finish the configuration leading to the problem you&apos;re seeing.&lt;/p&gt;

&lt;p&gt;Are you calling &quot;lnetctl import&quot; from a script? If so, I think you should be checking if the command succeeds or fails. If it fails you should assume that the node is not configured properly.&lt;/p&gt;

&lt;p&gt;Can you verify if my theory is correct?&lt;/p&gt;
</comment>
                            <comment id="209510" author="simmonsja" created="Tue, 26 Sep 2017 01:12:46 +0000"  >&lt;p&gt;Yes, I do see a &quot;call back for &apos;numa&apos; not found&quot; error when I start up. Also, I was using lnet.conf from my 2.10 setup.  I&apos;m running the lnetctl import from the command line, not from a script.&lt;/p&gt;</comment>
                            <comment id="209601" author="ashehata" created="Tue, 26 Sep 2017 16:53:16 +0000"  >&lt;p&gt;ok. I&apos;ll make a change to handle &quot;numa&quot; entry in YAML file so I can make master backwards compatible with 2.10. In the meantime, if there is an error configuring, you should assume that the configuration is not complete, and the node is not really usable.&lt;/p&gt;</comment>
                            <comment id="210405" author="ashehata" created="Thu, 5 Oct 2017 17:33:05 +0000"  >&lt;p&gt;Amir Shehata (amir.shehata@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/29333&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/29333&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9990&quot; title=&quot;MDS fails to mount due to (client.c:96:ptlrpc_uuid_to_connection()) cannot find peer MGC10.37.248.196@o2ib1 _0!&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9990&quot;&gt;&lt;del&gt;LU-9990&lt;/del&gt;&lt;/a&gt; lnet: add backwards compatibility for YAML config&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: e72cdc1373dfb930eccbc5d9afba215d8368b331&lt;/p&gt;</comment>
                            <comment id="211754" author="gerrit" created="Tue, 24 Oct 2017 07:17:50 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/29333/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/29333/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9990&quot; title=&quot;MDS fails to mount due to (client.c:96:ptlrpc_uuid_to_connection()) cannot find peer MGC10.37.248.196@o2ib1 _0!&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9990&quot;&gt;&lt;del&gt;LU-9990&lt;/del&gt;&lt;/a&gt; lnet: add backwards compatibility for YAML config&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 3187d551d538bd8203c7156daaa617620c6569ab&lt;/p&gt;</comment>
                            <comment id="211779" author="pjones" created="Tue, 24 Oct 2017 12:51:42 +0000"  >&lt;p&gt;Landed for 2.11&lt;/p&gt;</comment>
                            <comment id="212523" author="mdiep" created="Wed, 1 Nov 2017 15:23:09 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=ashehata&quot; class=&quot;user-hover&quot; rel=&quot;ashehata&quot;&gt;ashehata&lt;/a&gt; said we don&apos;t need this for LTS&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="48073">LU-9933</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="48315">LU-9992</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="47572">LU-9810</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="28285" name="dump-mds.log" size="20386" author="simmonsja" created="Thu, 14 Sep 2017 14:37:24 +0000"/>
                            <attachment id="28286" name="dump-mgs.log" size="1786998" author="simmonsja" created="Thu, 14 Sep 2017 14:37:38 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzzk6n:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10021"><![CDATA[2]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>