<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:20:42 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-8805] Failover: recovery-mds-scale test_failover_mds: test_failover_mds returned 4</title>
                <link>https://jira.whamcloud.com/browse/LU-8805</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This issue was created by maloo for Saurabh Tandan &amp;lt;saurabh.tandan@intel.com&amp;gt;&lt;/p&gt;

&lt;p&gt;This issue relates to the following test suite run: &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/be9e4ae0-a1c0-11e6-8ed2-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/be9e4ae0-a1c0-11e6-8ed2-5254006e85c2&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;The sub-test test_failover_mds failed with the following error:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;test_failover_mds returned 4
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;test_log:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;== recovery-mds-scale test failover_mds: failover MDS ================================================ 17:03:39 (1478131419)
Started client load: dd on onyx-40vm5
CMD: onyx-40vm5 PATH=/opt/iozone/bin:/opt/iozone/bin:/usr/lib64/lustre/tests/mpi:/usr/lib64/lustre/tests/racer:/usr/lib64/lustre/../lustre-iokit/sgpdd-survey:/usr/lib64/lustre/tests:/usr/lib64/lustre/utils/gss:/usr/lib64/lustre/utils:/usr/lib64/qt-3.3/bin:/usr/lib64/compat-openmpi16/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/usr/sbin:/sbin:/bin: MOUNT=/mnt/lustre ERRORS_OK= BREAK_ON_ERROR= END_RUN_FILE=/shared_test/autotest/2016-11-02/153732-70163256913820/end_run_file LOAD_PID_FILE=/tmp/client-load.pid TESTLOG_PREFIX=/logdir/test_logs/2016-11-02/lustre-master-el7-x86_64--failover--1_15_1__3468__-70163256913820-153732/recovery-mds-scale TESTNAME=test_failover_mds DBENCH_LIB=/usr/share/doc/dbench/loadfiles DBENCH_SRC= CLIENT_COUNT=2 LFS=/usr/bin/lfs run_dd.sh
Started client load: tar on onyx-40vm6
CMD: onyx-40vm6 PATH=/opt/iozone/bin:/opt/iozone/bin:/usr/lib64/lustre/tests/mpi:/usr/lib64/lustre/tests/racer:/usr/lib64/lustre/../lustre-iokit/sgpdd-survey:/usr/lib64/lustre/tests:/usr/lib64/lustre/utils/gss:/usr/lib64/lustre/utils:/usr/lib64/qt-3.3/bin:/usr/lib64/compat-openmpi16/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/usr/sbin:/sbin:/bin: MOUNT=/mnt/lustre ERRORS_OK= BREAK_ON_ERROR= END_RUN_FILE=/shared_test/autotest/2016-11-02/153732-70163256913820/end_run_file LOAD_PID_FILE=/tmp/client-load.pid TESTLOG_PREFIX=/logdir/test_logs/2016-11-02/lustre-master-el7-x86_64--failover--1_15_1__3468__-70163256913820-153732/recovery-mds-scale TESTNAME=test_failover_mds DBENCH_LIB=/usr/share/doc/dbench/loadfiles DBENCH_SRC= CLIENT_COUNT=2 LFS=/usr/bin/lfs run_tar.sh
client loads pids:
CMD: onyx-40vm5,onyx-40vm6 cat /tmp/client-load.pid
onyx-40vm6: 7449
onyx-40vm5: 7479
==== Checking the clients loads BEFORE failover -- failure NOT OK              ELAPSED=0 DURATION=86400 PERIOD=1200
Client load failed on node onyx-40vm5, rc=1
2016-11-02 17:03:46 Terminating clients loads ...
Duration:               86400
Server failover period: 1200 seconds
Exited after:           0 seconds
Number of failovers before exit:
mds1: 0 times
ost1: 0 times
ost2: 0 times
ost3: 0 times
ost4: 0 times
ost5: 0 times
ost6: 0 times
ost7: 0 times
Status: FAIL: rc=4
CMD: onyx-40vm5,onyx-40vm6 test -f /tmp/client-load.pid &amp;amp;&amp;amp;
        { kill -s TERM \$(cat /tmp/client-load.pid); rm -f /tmp/client-load.pid; }
/usr/lib64/lustre/tests/recovery-mds-scale.sh: line 103: 22083 Killed                  do_node $client &quot;PATH=$PATH MOUNT=$MOUNT ERRORS_OK=$ERRORS_OK BREAK_ON_ERROR=$BREAK_ON_ERROR END_RUN_FILE=$END_RUN_FILE LOAD_PID_FILE=$LOAD_PID_FILE TESTLOG_PREFIX=$TESTLOG_PREFIX TESTNAME=$TESTNAME DBENCH_LIB=$DBENCH_LIB DBENCH_SRC=$DBENCH_SRC CLIENT_COUNT=$((CLIENTCOUNT - 1)) LFS=$LFS run_${load}.sh&quot;
/usr/lib64/lustre/tests/recovery-mds-scale.sh: line 103: 22277 Killed                  do_node $client &quot;PATH=$PATH MOUNT=$MOUNT ERRORS_OK=$ERRORS_OK BREAK_ON_ERROR=$BREAK_ON_ERROR END_RUN_FILE=$END_RUN_FILE LOAD_PID_FILE=$LOAD_PID_FILE TESTLOG_PREFIX=$TESTLOG_PREFIX TESTNAME=$TESTNAME DBENCH_LIB=$DBENCH_LIB DBENCH_SRC=$DBENCH_SRC CLIENT_COUNT=$((CLIENTCOUNT - 1)) LFS=$LFS run_${load}.sh&quot;
Dumping lctl log to /logdir/test_logs/2016-11-02/lustre-master-el7-x86_64--failover--1_15_1__3468__-70163256913820-153732/recovery-mds-scale.test_failover_mds.*.1478131427.log
CMD: onyx-40vm3,onyx-40vm4,onyx-40vm7,onyx-40vm8 /usr/sbin/lctl dk &amp;gt; /logdir/test_logs/2016-11-02/lustre-master-el7-x86_64--failover--1_15_1__3468__-70163256913820-153732/recovery-mds-scale.test_failover_mds.debug_log.\$(hostname -s).1478131427.log;
         dmesg &amp;gt; /logdir/test_logs/2016-11-02/lustre-master-el7-x86_64--failover--1_15_1__3468__-70163256913820-153732/recovery-mds-scale.test_failover_mds.dmesg.\$(hostname -s).1478131427.log
onyx-40vm3: invalid parameter &apos;dump_kernel&apos;
onyx-40vm3: open(dump_kernel) failed: No such file or directory
onyx-40vm4: invalid parameter &apos;dump_kernel&apos;
onyx-40vm4: open(dump_kernel) failed: No such file or directory
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Could not find any other useful information.&lt;br/&gt;
May be related to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5483&quot; title=&quot;recovery-mds-scale test failover_mds: oom failure on client&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5483&quot;&gt;LU-5483&lt;/a&gt;&lt;/p&gt;</description>
                <environment>Failover: EL7 Server/Client&lt;br/&gt;
master, build# 3468</environment>
        <key id="41369">LU-8805</key>
            <summary>Failover: recovery-mds-scale test_failover_mds: test_failover_mds returned 4</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="hongchao.zhang">Hongchao Zhang</assignee>
                                    <reporter username="maloo">Maloo</reporter>
                        <labels>
                    </labels>
                <created>Mon, 7 Nov 2016 19:09:29 +0000</created>
                <updated>Wed, 24 May 2017 16:23:58 +0000</updated>
                            <resolved>Mon, 21 Nov 2016 20:38:34 +0000</resolved>
                                    <version>Lustre 2.9.0</version>
                                    <fixVersion>Lustre 2.9.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>7</watches>
                                                                            <comments>
                            <comment id="173164" author="pjones" created="Thu, 10 Nov 2016 18:44:34 +0000"  >&lt;p&gt;Hongchao&lt;/p&gt;

&lt;p&gt;Could you please advise on this issue?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="173165" author="adilger" created="Thu, 10 Nov 2016 18:46:59 +0000"  >&lt;p&gt;These tests are failing in under 20s, so I&apos;d suspect there is something broken in the test scripts, even before it is doing anything in the test.  From the &quot;dump_kernel&quot; messages, it appears maybe even the Lustre modules are not loaded.&lt;/p&gt;</comment>
                            <comment id="173279" author="gerrit" created="Fri, 11 Nov 2016 11:46:29 +0000"  >&lt;p&gt;Hongchao Zhang (hongchao.zhang@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/23717&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/23717&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8805&quot; title=&quot;Failover: recovery-mds-scale test_failover_mds: test_failover_mds returned 4&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8805&quot;&gt;&lt;del&gt;LU-8805&lt;/del&gt;&lt;/a&gt; test: debug patch&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 312ab76e4049a443fc45ac8593825a722240003b&lt;/p&gt;</comment>
                            <comment id="173280" author="hongchao.zhang" created="Fri, 11 Nov 2016 11:47:12 +0000"  >&lt;p&gt;According to the logs ( &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/be9e4ae0-a1c0-11e6-8ed2-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/be9e4ae0-a1c0-11e6-8ed2-5254006e85c2&lt;/a&gt;), the failure should be caused&lt;br/&gt;
by the following script in &quot;test-framework.sh&quot;&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;check_client_load () {
        local client=$1
        local var=$(node_var_name $client)_load
        local testload=run_${!var}.sh
        
        ps -C $testload | grep $client || return 1      &amp;lt;--- this check failed.
    
        # bug 18914: try to connect several times not only when
        # check ps, but  while check_node_health also
        ...
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The load was started successfully on onyx-40vm5 and onyx-40vm6.&lt;br/&gt;
The debug patch &lt;a href=&quot;http://review.whamcloud.com/23717&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/23717&lt;/a&gt; was created to collect more logs.&lt;/p&gt;</comment>
                            <comment id="174283" author="egryaznova" created="Fri, 18 Nov 2016 18:11:32 +0000"  >&lt;p&gt;regression is caused by &lt;a href=&quot;http://review.whamcloud.com/#/c/20539/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/20539/&lt;/a&gt; :&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;commit 35119a60678b970b76dc13d8932f5a59a9d53996
Author:     Parinay Kondekar &amp;lt;parinay.kondekar@seagate.com&amp;gt;
AuthorDate: Thu Sep 29 12:50:28 2016 +0530
Commit:     Vitaly Fertman &amp;lt;vitaly.fertman@seagate.com&amp;gt;
CommitDate: Fri Oct 28 23:13:58 2016 +0300

    LU-8226 tests: Change check_catastrophe() to check_node_health()

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;by the proposed modification :&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;+       ps -C $testload | grep $client || &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; 1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;testload is started on a remote node, so ps -C does not show it:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
[root@fre813 ~]# ps aux | grep run_dd
root     13143  0.0  0.1 103952  1352 ?        Sl   17:54   0:00 /usr/bin/pdsh -R ssh -S -w fre814 (PATH=$PATH:/usr/lib64/lustre/utils:/usr/lib64/lustre/tests:/sbin:/usr/sbin; cd /root; LUSTRE=&lt;span class=&quot;code-quote&quot;&gt;&quot;/usr/lib64/lustre&quot;&lt;/span&gt; sh -c &lt;span class=&quot;code-quote&quot;&gt;&quot;PATH=/opt/iozone/bin:/usr/lib64/lustre/tests/mpi:/usr/lib64/lustre/tests/racer:/usr/lib64/lustre/../lustre-iokit/sgpdd-survey:/usr/lib64/lustre/tests:/usr/lib64/lustre/utils/gss:/usr/lib64/lustre/utils:/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin: MOUNT=/mnt/lustre ERRORS_OK= BREAK_ON_ERROR= END_RUN_FILE=/shared/fremont/test-results/xperior-custom/3114&lt;span class=&quot;code-comment&quot;&gt;//kvm8-octet-2/shared-dir//recovery-mds-scale/end_run_file LOAD_PID_FILE=/tmp/client-load.pid TESTLOG_PREFIX=/tmp/test_logs/1479491651/recovery-mds-scale TESTNAME=test_failover_mds DBENCH_LIB= DBENCH_SRC= CLIENT_COUNT=3 LFS=/usr/bin/lfs run_dd.sh&quot;&lt;/span&gt;)
&lt;/span&gt;root     13152  0.0  0.2  58016  3316 ?        Ss   17:54   0:00 ssh -oConnectTimeout=10 -2 -a -x -lroot fre814 (PATH=$PATH:/usr/lib64/lustre/utils:/usr/lib64/lustre/tests:/sbin:/usr/sbin; cd /root; LUSTRE=&lt;span class=&quot;code-quote&quot;&gt;&quot;/usr/lib64/lustre&quot;&lt;/span&gt; sh -c &lt;span class=&quot;code-quote&quot;&gt;&quot;PATH=/opt/iozone/bin:/usr/lib64/lustre/tests/mpi:/usr/lib64/lustre/tests/racer:/usr/lib64/lustre/../lustre-iokit/sgpdd-survey:/usr/lib64/lustre/tests:/usr/lib64/lustre/utils/gss:/usr/lib64/lustre/utils:/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin: MOUNT=/mnt/lustre ERRORS_OK= BREAK_ON_ERROR= END_RUN_FILE=/shared/fremont/test-results/xperior-custom/3114&lt;span class=&quot;code-comment&quot;&gt;//kvm8-octet-2/shared-dir//recovery-mds-scale/end_run_file LOAD_PID_FILE=/tmp/client-load.pid TESTLOG_PREFIX=/tmp/test_logs/1479491651/recovery-mds-scale TESTNAME=test_failover_mds DBENCH_LIB= DBENCH_SRC= CLIENT_COUNT=3 LFS=/usr/bin/lfs run_dd.sh&quot;&lt;/span&gt;)
&lt;/span&gt;root     15383  0.0  0.0 103236   840 pts/0    R+   17:55   0:00 grep run_dd


[root@fre813 ~]# ps -C run_dd.sh
  PID TTY          TIME CMD

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="174304" author="gerrit" created="Fri, 18 Nov 2016 19:31:13 +0000"  >&lt;p&gt;Elena Gryaznova (elena.gryaznova@seagate.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/23861&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/23861&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8805&quot; title=&quot;Failover: recovery-mds-scale test_failover_mds: test_failover_mds returned 4&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8805&quot;&gt;&lt;del&gt;LU-8805&lt;/del&gt;&lt;/a&gt; tests: fix defect introduced by &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8226&quot; title=&quot;t-f check_catastrophe() defect&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8226&quot;&gt;&lt;del&gt;LU-8226&lt;/del&gt;&lt;/a&gt;&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: ef177978fb0028083e7452ee8ffe6d1cd5ac719b&lt;/p&gt;</comment>
                            <comment id="174524" author="gerrit" created="Mon, 21 Nov 2016 20:22:45 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;http://review.whamcloud.com/23861/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/23861/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8805&quot; title=&quot;Failover: recovery-mds-scale test_failover_mds: test_failover_mds returned 4&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8805&quot;&gt;&lt;del&gt;LU-8805&lt;/del&gt;&lt;/a&gt; tests: fix defect introduced by &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8226&quot; title=&quot;t-f check_catastrophe() defect&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8226&quot;&gt;&lt;del&gt;LU-8226&lt;/del&gt;&lt;/a&gt;&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 300739c76abdaec738c63237249d88097c595cc8&lt;/p&gt;</comment>
                            <comment id="174527" author="pjones" created="Mon, 21 Nov 2016 20:38:34 +0000"  >&lt;p&gt;Landed for 2.9&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="37306">LU-8226</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzyup3:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>