<?xml version="1.0" encoding="UTF-8"?>
<!--
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:27:38 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-9602] recovery-random-scale test_fail_client_mds: PASS but marked as FAIL</title>
                <link>https://jira.whamcloud.com/browse/LU-9602</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;&lt;a href=&quot;https://testing.hpdd.intel.com/test_sessions/2df73ff0-8703-4bf1-9f76-2ea9f98949fb&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sessions/2df73ff0-8703-4bf1-9f76-2ea9f98949fb&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;From suite_log:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;PASS fail_client_mds (85783s)
Starting client: onyx-36vm1.onyx.hpdd.intel.com:  -o user_xattr,flock onyx-36vm7:onyx-36vm3:/lustre /mnt/lustre
CMD: onyx-36vm1.onyx.hpdd.intel.com mkdir -p /mnt/lustre
CMD: onyx-36vm1.onyx.hpdd.intel.com mount -t lustre -o user_xattr,flock onyx-36vm7:onyx-36vm3:/lustre /mnt/lustre
== recovery-random-scale test complete, duration 85822 sec =========================================== 17:10:21 (1495991421)
rm: cannot remove &apos;/mnt/lustre/d0.tar-onyx-36vm6.onyx.hpdd.intel.com/etc&apos;: Directory not empty
 recovery-random-scale : @@@@@@ FAIL: remove sub-test dirs failed 
  Trace dump:
  = /usr/lib64/lustre/tests/test-framework.sh:4952:error()
  = /usr/lib64/lustre/tests/test-framework.sh:4471:check_and_cleanup_lustre()
  = /usr/lib64/lustre/tests/recovery-random-scale.sh:267:main()
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;and&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Status: PASS: rc=0
CMD: onyx-36vm5,onyx-36vm6 test -f /tmp/client-load.pid &amp;amp;&amp;amp;
        { kill -s TERM \$(cat /tmp/client-load.pid); rm -f /tmp/client-load.pid; }
SKIP pairwise_fail (3867s)
Starting client: onyx-36vm1.onyx.hpdd.intel.com:  -o user_xattr,flock onyx-36vm7:onyx-36vm3:/lustre /mnt/lustre
CMD: onyx-36vm1.onyx.hpdd.intel.com mkdir -p /mnt/lustre
CMD: onyx-36vm1.onyx.hpdd.intel.com mount -t lustre -o user_xattr,flock onyx-36vm7:onyx-36vm3:/lustre /mnt/lustre
== recovery-double-scale test complete, duration 3904 sec ============================================ 18:20:55 (1495995655)
rm: cannot remove &apos;/mnt/lustre/d0.tar-onyx-36vm6.onyx.hpdd.intel.com/etc/fonts/conf.d&apos;: Directory not empty
 recovery-double-scale : @@@@@@ FAIL: remove sub-test dirs failed 
  Trace dump:
  = /usr/lib64/lustre/tests/test-framework.sh:4952:error()
  = /usr/lib64/lustre/tests/test-framework.sh:4471:check_and_cleanup_lustre()
  = /usr/lib64/lustre/tests/recovery-double-scale.sh:309:main()
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment>onyx, failover&lt;br/&gt;
&amp;nbsp;&amp;nbsp;clients: EL7, master branch, v2.9.58, b3591&lt;br/&gt;
&amp;nbsp;&amp;nbsp;servers: EL7, zfs, master branch, v2.9.58, b3591&lt;br/&gt;
</environment>
        <key id="46524">LU-9602</key>
            <summary>recovery-random-scale test_fail_client_mds: PASS but marked as FAIL</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="3" iconUrl="https://jira.whamcloud.com/images/icons/statuses/inprogress.png" description="This issue is being actively worked on at the moment by the assignee.">In Progress</status>
                    <statusCategory id="4" key="indeterminate" colorName="inprogress"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="Deiter">Alex Deiter</assignee>
                                    <reporter username="jcasper">James Casper</reporter>
                        <labels>
                    </labels>
                <created>Mon, 5 Jun 2017 21:44:57 +0000</created>
                <updated>Fri, 14 Apr 2023 23:00:50 +0000</updated>
                                            <version>Lustre 2.10.0</version>
                    <version>Lustre 2.11.0</version>
                    <version>Lustre 2.12.0</version>
                    <version>Lustre 2.10.4</version>
                    <version>Lustre 2.10.5</version>
                    <version>Lustre 2.13.0</version>
                    <version>Lustre 2.10.7</version>
                    <version>Lustre 2.12.1</version>
                    <version>Lustre 2.12.3</version>
                    <version>Lustre 2.14.0</version>
                    <version>Lustre 2.12.5</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>5</watches>
                                                                            <comments>
                            <comment id="198207" author="casperjx" created="Mon, 5 Jun 2017 21:50:10 +0000"  >&lt;p&gt;Also seen in this config with recovery-double-scale: test pairwise_fail: SKIP but marked as FAIL&lt;/p&gt;</comment>
                            <comment id="203730" author="gerrit" created="Thu, 27 Jul 2017 22:40:04 +0000"  >&lt;p&gt;James Nunez (james.a.nunez@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/28264&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/28264&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9602&quot; title=&quot;recovery-random-scale test_fail_client_mds: PASS but marked as FAIL&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9602&quot;&gt;LU-9602&lt;/a&gt; test: kill all tar and dd processes&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 51a95be4f89237a7e2dd16d8af99793b0126c5b9&lt;/p&gt;</comment>
                            <comment id="220423" author="jamesanunez" created="Thu, 8 Feb 2018 15:49:39 +0000"  >&lt;p&gt;We see several of the recovery-*-scale tests fail in this way; we are not able to clean up the Lustre directory.&lt;/p&gt;

&lt;p&gt;For example, a recent failure in recovery-mds-scale test_failover_mds provides some new information; &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/4929a310-fded-11e7-bd00-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/4929a310-fded-11e7-bd00-52540065bddc&lt;/a&gt;. Looking at the suite_log, we see the test fail due to rm failing&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;rm: cannot remove &apos;/mnt/lustre/d0.tar-onyx-41vm4.onyx.hpdd.intel.com/etc&apos;: Directory not empty
recovery-mds-scale : @@@@@@ FAIL: remove sub-test dirs failed 
  Trace dump:
  = /usr/lib64/lustre/tests/test-framework.sh:5335:error()
  = /usr/lib64/lustre/tests/test-framework.sh:4829:check_and_cleanup_lustre()
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Later in the log, we can see that run_tar.sh is still running:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Stopping client onyx-41vm1.onyx.hpdd.intel.com /mnt/lustre opts:
Stopping client onyx-41vm4.onyx.hpdd.intel.com /mnt/lustre opts:
COMMAND     PID USER   FD   TYPE      DEVICE SIZE/OFF               NODE NAME
sleep      3581 root  cwd    DIR 1273,181606     4096 144115205289292362 /mnt/lustre/d0.tar-onyx-41vm4.onyx.hpdd.intel.com
run_tar.s 13671 root  cwd    DIR 1273,181606     4096 144115205289292362 /mnt/lustre/d0.tar-onyx-41vm4.onyx.hpdd.intel.com
/mnt/lustre is still busy, wait one second
/mnt/lustre is still busy, wait one second
/mnt/lustre is still busy, wait one second
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The run_tar, run_dd, etc. processes are started and the PID of each is stored in a file called client-load.pid in /tmp on the client node. In this case vm4 started run_tar as seen here  &lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Started client load: tar on onyx-41vm4
CMD: onyx-41vm4 PATH=/ [&#8230;]	LCTL=/usr/sbin/lctl 			FSNAME=lustre 			run_tar.sh
client loads pids:
CMD: onyx-41vm3,onyx-41vm4 cat /tmp/client-load.pid
onyx-41vm3: 12666
onyx-41vm4: 13671
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;In stop_process(), we kill the PID saved in /tmp/client-load.pid&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;2416 # Stop the process which had its PID saved in a file.
2417 stop_process() {
2418     local nodes=$1
2419     local pid_file=$2
2420 
2421     [ -z &lt;span class=&quot;code-quote&quot;&gt;&quot;$nodes&quot;&lt;/span&gt; -o -z &lt;span class=&quot;code-quote&quot;&gt;&quot;$pid_file&quot;&lt;/span&gt; ] &amp;amp;&amp;amp; &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; 0
2422 
2423     do_nodes $nodes &quot;test -f $pid_file &amp;amp;&amp;amp;
2424         { kill -s TERM \\\$(cat $pid_file); rm -f $pid_file; }&quot; || &lt;span class=&quot;code-keyword&quot;&gt;true&lt;/span&gt;
2425 }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;We see that vm3 tries to clean up its run_dd PID and fails. What about vm4:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;CMD: onyx-41vm3,onyx-41vm4 test -f /tmp/client-load.pid &amp;amp;&amp;amp;
        { kill -s TERM \$(cat /tmp/client-load.pid); rm -f /tmp/client-load.pid; }
onyx-41vm3: sh: line 1: kill: (12666) - No such process
/usr/lib64/lustre/tests/recovery-mds-scale.sh: line 104:  9503 Killed                  do_node $client &quot;PATH=$PATH MOUNT=$MOUNT ERRORS_OK=$ERRORS_OK 			BREAK_ON_ERROR=$BREAK_ON_ERROR 			END_RUN_FILE=$END_RUN_FILE 			LOAD_PID_FILE=$LOAD_PID_FILE 			TESTLOG_PREFIX=$TESTLOG_PREFIX 			TESTNAME=$TESTNAME 			DBENCH_LIB=$DBENCH_LIB 			DBENCH_SRC=$DBENCH_SRC 			CLIENT_COUNT=$((CLIENTCOUNT - 1)) 			LFS=$LFS 			LCTL=$LCTL 			FSNAME=$FSNAME 			run_${load}.sh&quot;
/usr/lib64/lustre/tests/recovery-mds-scale.sh: line 104:  9697 Killed                  do_node $client &quot;PATH=$PATH MOUNT=$MOUNT ERRORS_OK=$ERRORS_OK 			BREAK_ON_ERROR=$BREAK_ON_ERROR 			END_RUN_FILE=$END_RUN_FILE 			LOAD_PID_FILE=$LOAD_PID_FILE 			TESTLOG_PREFIX=$TESTLOG_PREFIX 			TESTNAME=$TESTNAME 			DBENCH_LIB=$DBENCH_LIB 			DBENCH_SRC=$DBENCH_SRC 			CLIENT_COUNT=$((CLIENTCOUNT - 1)) 			LFS=$LFS 			LCTL=$LCTL 			FSNAME=$FSNAME 			run_${load}.sh&quot;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
</comment>
                            <comment id="302965" author="jamesanunez" created="Fri, 28 May 2021 15:28:56 +0000"  >&lt;p&gt;Looking at the results at &lt;a href=&quot;https://testing.whamcloud.com/test_sets/3bb67f2c-156c-42e3-8b23-9050d1d799a4&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/3bb67f2c-156c-42e3-8b23-9050d1d799a4&lt;/a&gt;, the test passes, but we can see in the suite_log that not all processes were killed&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Stopping client onyx-42vm3.onyx.whamcloud.com /mnt/lustre opts:
COMMAND     PID USER   FD      TYPE      DEVICE    SIZE/OFF               NODE NAME
run_dd.sh 16846 root  cwd   unknown 1273,181606                                /mnt/lustre/d0.dd-onyx-42vm3.onyx.whamcloud.com
dd        16909 root  cwd   unknown 1273,181606                                /mnt/lustre/d0.dd-onyx-42vm3.onyx.whamcloud.com
dd        16909 root    1w      REG 1273,181606 26873016320 144115205272502274 /mnt/lustre/d0.dd-onyx-42vm3.onyx.whamcloud.com/dd-file
/mnt/lustre is still busy, wait one second
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Looking at the run_dd_debug log on onyx-42vm3, we tried to kill the process but&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2021-05-14 00:35:09: client load was signaled to terminate
++++ ps -eo &apos;%c %p %r&apos;
++++ awk &apos;/ 16845 / {print $3}&apos;
+++ local PGID=
+++ kill -TERM -
/usr/lib64/lustre/tests/functions.sh: line 166: kill: -: arguments must be process or job IDs
+++ sleep 5
+++ kill -KILL -
/usr/lib64/lustre/tests/functions.sh: line 168: kill: -: arguments must be process or job IDs
++ sleep 5
++ kill -KILL -16804
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="318625" author="egryaznova" created="Fri, 19 Nov 2021 10:22:26 +0000"  >&lt;p&gt;one more with &quot;remove sub-test dirs failed&quot; marked as PASS&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/test_logs/966cb14d-a670-47ba-8141-f5f54088c23c/show_text&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_logs/966cb14d-a670-47ba-8141-f5f54088c23c/show_text&lt;/a&gt;&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
== recovery-&lt;span class=&quot;code-object&quot;&gt;double&lt;/span&gt;-scale test complete, duration 10761 sec ========================================================== 07:46:23 (1637307983)
rm: cannot remove &lt;span class=&quot;code-quote&quot;&gt;&apos;/mnt/lustre/d0.tar-onyx-78vm3.onyx.whamcloud.com&apos;&lt;/span&gt;: Directory not empty
rm: cannot remove &lt;span class=&quot;code-quote&quot;&gt;&apos;/mnt/lustre/d0.tar-onyx-78vm7.onyx.whamcloud.com/etc&apos;&lt;/span&gt;: Directory not empty
 recovery-&lt;span class=&quot;code-object&quot;&gt;double&lt;/span&gt;-scale : @@@@@@ FAIL: remove sub-test dirs failed 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="66770">LU-15140</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="64553">LU-14738</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzze87:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>