<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:30:35 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-3056] conf-sanity test_66 - replace nids failed</title>
                <link>https://jira.whamcloud.com/browse/LU-3056</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This issue was created by maloo for Nathaniel Clark &amp;lt;nathaniel.l.clark@intel.com&amp;gt;&lt;/p&gt;

&lt;p&gt;This issue relates to the following test suite runs:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/95fcddea-97b0-11e2-a652-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/95fcddea-97b0-11e2-a652-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/810da798-9760-11e2-9ec7-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/810da798-9760-11e2-9ec7-52540035b04c&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The sub-test test_66 failed with the following error:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;replace nids failed&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;Info required for matching: conf-sanity 66&lt;/p&gt;

&lt;p&gt;All subsequent ZFS test suites (recovery-small, etc) fail with the following error:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Starting mds1: -o user_xattr,acl  lustre-mdt1/mdt1 /mnt/mds1
CMD: wtm-16vm3 mkdir -p /mnt/mds1; mount -t lustre -o user_xattr,acl  		                   lustre-mdt1/mdt1 /mnt/mds1
wtm-16vm3: mount.lustre: according to /etc/mtab lustre-mdt1/mdt1 is already mounted on /mnt/mds1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="18146">LU-3056</key>
            <summary>conf-sanity test_66 - replace nids failed</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="5">Cannot Reproduce</resolution>
                                        <assignee username="keith">Keith Mannthey</assignee>
                                    <reporter username="maloo">Maloo</reporter>
                        <labels>
                            <label>revzfs</label>
                    </labels>
                <created>Thu, 28 Mar 2013 16:11:46 +0000</created>
                <updated>Fri, 22 Sep 2023 22:23:11 +0000</updated>
                            <resolved>Fri, 21 Jun 2013 22:23:07 +0000</resolved>
                                    <version>Lustre 2.4.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>8</watches>
                                                                            <comments>
                            <comment id="55026" author="green" created="Thu, 28 Mar 2013 17:49:46 +0000"  >&lt;p&gt;I asked Artem to look at this and he thinks &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2988&quot; title=&quot;conf-sanity 66: Modules still loaded&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2988&quot;&gt;&lt;del&gt;LU-2988&lt;/del&gt;&lt;/a&gt; might be the culprit, but is still confirming this theory.&lt;/p&gt;</comment>
                            <comment id="55167" author="artem_blagodarenko" created="Mon, 1 Apr 2013 06:51:31 +0000"  >&lt;p&gt;The error occurs because this statement doesn&apos;t return TRUE on the last call:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;&lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; only_mgs_is_running(struct obd_device *mgs_obd)
{
        &lt;span class=&quot;code-comment&quot;&gt;/* TDB: Is global variable with devices count exists? */&lt;/span&gt;
        &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; num_devices = get_devices_count();
        /* osd, MGS and MGC + self_export
           (wc -l /proc/fs/lustre/devices &amp;lt;= 2) &amp;amp;&amp;amp; (num_exports &amp;lt;= 2) */
        &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; (num_devices &amp;lt;= 3) &amp;amp;&amp;amp; (mgs_obd-&amp;gt;obd_num_exports &amp;lt;= 2);
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I am trying to figure out why this happens.&lt;/p&gt;</comment>
                            <comment id="55175" author="artem_blagodarenko" created="Mon, 1 Apr 2013 09:34:25 +0000"  >&lt;p&gt;I can&apos;t reproduce this bug locally. Statistics show &quot;Subtest passes:	99/100&quot;. Is it possible that something was launched in parallel with tests in that 1 failed test execution?&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2988&quot; title=&quot;conf-sanity 66: Modules still loaded&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2988&quot;&gt;&lt;del&gt;LU-2988&lt;/del&gt;&lt;/a&gt; is useful for correct modules unloading.&lt;/p&gt;</comment>
                            <comment id="55453" author="keith" created="Thu, 4 Apr 2013 01:33:33 +0000"  >&lt;p&gt;test_66 	&lt;br/&gt;
    Error: &apos;replace nids failed&apos;&lt;br/&gt;
    Failure Rate: 23.00% of last 100 executions &lt;span class=&quot;error&quot;&gt;&amp;#91;all branches&amp;#93;&lt;/span&gt; &lt;/p&gt;

&lt;p&gt;23% makes me think this may need to be a blocker. &lt;/p&gt;

&lt;p&gt;In general there are no parallel tests.  Tests are safe to assume they are the only thing running at this point in time. &lt;/p&gt;

&lt;p&gt;Just a quick breakdown for casual observers:&lt;/p&gt;

&lt;p&gt;From conf_sanity test_66 &lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;        echo &quot;replace MDS nid&quot;
        do_facet mgs $LCTL replace_nids $FSNAME-MDT0000 $MDS_NID ||
                error &quot;replace nids failed&quot;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;And from that do_facet call we get:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;CMD: wtm-18vm3 /usr/sbin/lctl replace_nids lustre-MDT0000 10.10.16.188@tcp
wtm-18vm3: error: replace_nids: Operation now in progress
 conf-sanity test_66: @@@@@@ FAIL: replace nids failed 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;


&lt;p&gt;The error: replace... comes from here:&lt;br/&gt;
In &quot;utils/obd.c&quot; jt_replace_nids&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;        rc = l2_ioctl(OBD_DEV_ID, OBD_IOC_REPLACE_NIDS, buf);
        if (rc &amp;lt; 0) {
                fprintf(stderr, &quot;error: %s: %s\n&quot;, jt_cmdname(argv[0]),
                        strerror(rc = errno));
        }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This is seen on the MDS:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;LustreError: 28590:0:(mgs_llog.c:1286:mgs_replace_nids()) Only MGS is allowed to be started
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;And that leads to the above only_mgs_is_running and mgs_replace_nids in lustre/mgs/mgs_llog.c&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;        /* We can not change nids if not only MGS is started */
        if (!only_mgs_is_running(mgs_obd)) {
                CERROR(&quot;Only MGS is allowed to be started\n&quot;);
                GOTO(out, rc = -EINPROGRESS);
        }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Also &lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;           (wc -l /proc/fs/lustre/devices &amp;lt;= 2) &amp;amp;&amp;amp; (num_exports &amp;lt;= 2) */
        return (num_devices &amp;lt;= 3) &amp;amp;&amp;amp; (mgs_obd-&amp;gt;obd_num_exports &amp;lt;= 2);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Should num_devices be 2 or 3?  &lt;/p&gt;



&lt;p&gt;The untested debug patch can be found here: &lt;a href=&quot;http://review.whamcloud.com/5940&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5940&lt;/a&gt; &lt;br/&gt;
It is a rare path so it should not hurt.&lt;/p&gt;

&lt;p&gt;If we don&apos;t want to land the patch I will have it run conf_sanity a lot. &lt;/p&gt;</comment>
                            <comment id="55501" author="utopiabound" created="Thu, 4 Apr 2013 16:36:43 +0000"  >&lt;p&gt;I haven&apos;t seen this conf-sanity/66 fail for any patch based past &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2988&quot; title=&quot;conf-sanity 66: Modules still loaded&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2988&quot;&gt;&lt;del&gt;LU-2988&lt;/del&gt;&lt;/a&gt;.  Granted, the sample set is pretty small at this point.&lt;/p&gt;</comment>
                            <comment id="55529" author="keith" created="Thu, 4 Apr 2013 17:44:55 +0000"  >&lt;p&gt;Ahh thanks I didn&apos;t have lu-2988 on my radar. &lt;/p&gt;

&lt;p&gt;I checked as well and I don&apos;t see any errors in the past two days. &lt;/p&gt;</comment>
                            <comment id="55530" author="keith" created="Thu, 4 Apr 2013 17:48:38 +0000"  >&lt;p&gt;Dup of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2988&quot; title=&quot;conf-sanity 66: Modules still loaded&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2988&quot;&gt;&lt;del&gt;LU-2988&lt;/del&gt;&lt;/a&gt;. &lt;/p&gt;

&lt;p&gt;This issue is believed to be fixed. A patch was landed April 1st or so and no errors have been seen since. &lt;/p&gt;

&lt;p&gt;Please reopen if the errors continue. &lt;/p&gt;</comment>
                            <comment id="55865" author="utopiabound" created="Tue, 9 Apr 2013 13:38:07 +0000"  >&lt;p&gt;Failure on current master with fix for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2988&quot; title=&quot;conf-sanity 66: Modules still loaded&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2988&quot;&gt;&lt;del&gt;LU-2988&lt;/del&gt;&lt;/a&gt;:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/ae3e5ec6-a104-11e2-b1c3-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/ae3e5ec6-a104-11e2-b1c3-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="55867" author="artem_blagodarenko" created="Tue, 9 Apr 2013 13:51:20 +0000"  >&lt;p&gt;this message in log &lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;20000000:00020000:0.0:1365501872.365629:0:20267:0:(mgs_llog.c:1286:mgs_replace_nids()) Only MGS is allowed to be started&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Does anybody know what changed in the code tree so that this became wrong?&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;&lt;span class=&quot;code-comment&quot;&gt;/*(wc -l /proc/fs/lustre/devices &amp;lt;= 3) &amp;amp;&amp;amp; (num_exports &amp;lt;= 2) */&lt;/span&gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="55868" author="artem_blagodarenko" created="Tue, 9 Apr 2013 13:56:57 +0000"  >&lt;p&gt;&amp;gt;The untested debug patch can be found here: &lt;a href=&quot;http://review.whamcloud.com/5940&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5940&lt;/a&gt; &lt;/p&gt;

&lt;p&gt;&amp;gt;If we don&apos;t want to land the patch I will have it run conf_sanity a lot.&lt;/p&gt;

&lt;p&gt;Keith, do you have output with patch applied and test failed?&lt;/p&gt;
</comment>
                            <comment id="55889" author="keith" created="Tue, 9 Apr 2013 16:25:06 +0000"  >&lt;p&gt;There is no debug output yet as the problem was thought fixed.  I will revisit the patch. &lt;/p&gt;
</comment>
                            <comment id="55964" author="keith" created="Wed, 10 Apr 2013 06:17:14 +0000"  >&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/6005&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/6005&lt;/a&gt;  is a conf-sanity run with the debug patch. &lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/5940&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5940&lt;/a&gt; has been rebased for possible inclusion.&lt;/p&gt;

</comment>
                            <comment id="57157" author="adilger" created="Fri, 26 Apr 2013 19:05:19 +0000"  >&lt;p&gt;I definitely don&apos;t think this needs to be a 2.4.0 blocker, since replace_nids is a very rarely used code path.  The only potential reason for increased priority might be the frequency to other patches failing due to this bug, but I don&apos;t see very many failures due to this specific bug (several other conf-sanity failures are increasing the test failure rates).&lt;/p&gt;</comment>
                            <comment id="57562" author="keith" created="Thu, 2 May 2013 19:00:36 +0000"  >&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/5940&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5940&lt;/a&gt; has been resubmitted for testing in the effort to land it after the 2.4 split. We still see the issue on Master a few times a week and it will be good to know more about what is causing the issue.&lt;/p&gt;</comment>
                            <comment id="59990" author="keith" created="Tue, 4 Jun 2013 18:22:51 +0000"  >&lt;p&gt;Quick update:&lt;/p&gt;

&lt;p&gt;conf_sanity test_66 has not failed in a few weeks.  The &quot;replace nids failed&quot; error really dropped off after 2013-04-29.  Perhaps some code path has changed. &lt;/p&gt;</comment>
                            <comment id="61047" author="keith" created="Fri, 21 Jun 2013 22:21:26 +0000"  >&lt;p&gt;Still no sign of the &quot;replace nids failed&quot; errors. &lt;/p&gt;</comment>
                            <comment id="61049" author="keith" created="Fri, 21 Jun 2013 22:23:07 +0000"  >&lt;p&gt;We no longer see this issue.  Please reopen if this starts to trigger again.  There is no sense landing a debug patch for a problem that does not happen. &lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="54950">LU-11990</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="20505">LU-3793</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="24996">LU-5137</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvmkn:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>7450</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>