<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:07:47 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-14208] sanity-hsm test 55 fails with &apos;request on 0xM:0xN:0x0 is not FAILED on mds1&apos;</title>
                <link>https://jira.whamcloud.com/browse/LU-14208</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;sanity-hsm test_55 fails with &apos;request on 0x200001b79:0x4:0x0 is not FAILED on mds1&apos; for RHEL 8.3 client/server testing for review-dne-zfs-part-2 and review-dne-part-2 (ldiskfs). Although test 55 has failed many times with this message, the failures for the RHEL 8.3 testing have a consistent copytool_log where the copy is stuck at 8% (&#8220;%8&#8221;?):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lhsmtool_posix: 1607364549.271481 lhsmtool_posix[152920]: action=0 src=(null) dst=(null) mount_point=/mnt/lustre2
lhsmtool_posix: 1607364549.279162 lhsmtool_posix[152922]: waiting for message from kernel
lhsmtool_posix: 1607364549.852166 lhsmtool_posix[152922]: copytool fs=lustre archive#=2 item_count=1
lhsmtool_posix: 1607364549.852331 lhsmtool_posix[152922]: waiting for message from kernel
lhsmtool_posix: 1607364549.852363 lhsmtool_posix[152923]: &apos;[0x200001b79:0x4:0x0]&apos; action ARCHIVE reclen 72, cookie=0x5fce6f8f
lhsmtool_posix: 1607364549.853177 lhsmtool_posix[152923]: processing file &apos;d55.sanity-hsm/f55.sanity-hsm&apos;
lhsmtool_posix: 1607364549.868566 lhsmtool_posix[152923]: archiving &apos;/mnt/lustre2/.lustre/fid/0x200001b79:0x4:0x0&apos; to &apos;/tmp/arc1/sanity-hsm.test_55//0004/0000/1b79/0000/0002/0000/0x200001b79:0x4:0x0_tmp&apos;
lhsmtool_posix: 1607364549.868732 lhsmtool_posix[152923]: saving stripe info of &apos;/mnt/lustre2/.lustre/fid/0x200001b79:0x4:0x0&apos; in /tmp/arc1/sanity-hsm.test_55//0004/0000/1b79/0000/0002/0000/0x200001b79:0x4:0x0_tmp.lov
lhsmtool_posix: 1607364549.870442 lhsmtool_posix[152923]: start copy of 39000000 bytes from &apos;/mnt/lustre2/.lustre/fid/0x200001b79:0x4:0x0&apos; to &apos;/tmp/arc1/sanity-hsm.test_55//0004/0000/1b79/0000/0002/0000/0x200001b79:0x4:0x0_tmp&apos;
lhsmtool_posix: 1607364579.001120 lhsmtool_posix[152923]: %8 
lhsmtool_posix: 1607364609.001120 lhsmtool_posix[152923]: %8 
lhsmtool_posix: 1607364639.000122 lhsmtool_posix[152923]: %8 
lhsmtool_posix: 1607364669.001121 lhsmtool_posix[152923]: %8 
lhsmtool_posix: 1607364699.001121 lhsmtool_posix[152923]: %8 
lhsmtool_posix: 1607364729.001120 lhsmtool_posix[152923]: %8 
lhsmtool_posix: 1607364759.001121 lhsmtool_posix[152923]: %8 
exiting: Terminated
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The output from the client looks as expected for these failures. From the suite_log, the test writes the file, starts the archive, and waits for the expected state which, in this case, is &#8220;FAILED&#8221;:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;== sanity-hsm test 55: Truncate during an archive cancels it ========================================= 18:09:06 (1607364546)
39+0 records in
39+0 records out
39000000 bytes (39 MB, 37 MiB) copied, 0.146523 s, 266 MB/s
CMD: trevis-202vm3 mkdir -p /tmp/arc1/sanity-hsm.test_55/
Starting copytool agt1 on trevis-202vm3
CMD: trevis-202vm3 lhsmtool_posix  --daemon --hsm-root &quot;/tmp/arc1/sanity-hsm.test_55/&quot; --bandwidth 1 &quot;/mnt/lustre2&quot; &amp;lt; /dev/null &amp;gt; &quot;/autotest/autotest-1/2020-12-07/lustre-reviews_review-dne-zfs-part-2_78162_1_108_a473c2cb-2a7d-4640-8bee-2a175a1428d2/sanity-hsm.test_55.copytool_log.trevis-202vm3.log&quot; 2&amp;gt;&amp;amp;1
CMD: trevis-202vm5 /usr/sbin/lctl get_param -n mdt.lustre-MDT0000.hsm.actions | awk &apos;/&apos;0x200001b79:0x4:0x0&apos;.*action=&apos;ARCHIVE&apos;/ {print \$13}&apos; | cut -f2 -d=
CMD: trevis-202vm5 /usr/sbin/lctl set_param mdt.lustre-MDT0000.hsm.policy=+NRA
mdt.lustre-MDT0000.hsm.policy=+NRA
CMD: trevis-202vm6 /usr/sbin/lctl set_param mdt.lustre-MDT0001.hsm.policy=+NRA
mdt.lustre-MDT0001.hsm.policy=+NRA
CMD: trevis-202vm5 /usr/sbin/lctl set_param mdt.lustre-MDT0002.hsm.policy=+NRA
mdt.lustre-MDT0002.hsm.policy=+NRA
CMD: trevis-202vm6 /usr/sbin/lctl set_param mdt.lustre-MDT0003.hsm.policy=+NRA
mdt.lustre-MDT0003.hsm.policy=+NRA
CMD: trevis-202vm5 /usr/sbin/lctl get_param -n mdt.lustre-MDT0000.hsm.actions | awk &apos;/&apos;0x200001b79:0x4:0x0&apos;.*action=&apos;ARCHIVE&apos;/ {print \$13}&apos; | cut -f2 -d=
Waiting 200s for &apos;FAILED&apos;
CMD: trevis-202vm5 /usr/sbin/lctl get_param -n mdt.lustre-MDT0000.hsm.actions | awk &apos;/&apos;0x200001b79:0x4:0x0&apos;.*action=&apos;ARCHIVE&apos;/ {print \$13}&apos; | cut -f2 -d=
&#8230;
CMD: trevis-202vm5 /usr/sbin/lctl get_param -n mdt.lustre-MDT0000.hsm.actions | awk &apos;/&apos;0x200001b79:0x4:0x0&apos;.*action=&apos;ARCHIVE&apos;/ {print \$13}&apos; | cut -f2 -d=
Update not seen after 200s: want &apos;FAILED&apos; got &apos;STARTED&apos;
 sanity-hsm test_55: @@@@@@ FAIL: request on 0x200001b79:0x4:0x0 is not FAILED on mds1 
  Trace dump:
  = /usr/lib64/lustre/tests/test-framework.sh:6257:error()
  = /usr/lib64/lustre/tests/test-framework.sh:10395:wait_request_state()
  = /usr/lib64/lustre/tests/sanity-hsm.sh:2865:test_55()
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Logs for these failures are at&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/test_sets/358b2b64-faf2-4877-b5af-3f8a6cb54155&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/358b2b64-faf2-4877-b5af-3f8a6cb54155&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/test_sets/1f82471e-e5c3-4057-9064-928b06e616a1&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/1f82471e-e5c3-4057-9064-928b06e616a1&lt;/a&gt;&lt;/p&gt;</description>
                <environment>RHEL8.3 servers/clients</environment>
        <key id="61952">LU-14208</key>
            <summary>sanity-hsm test 55 fails with &apos;request on 0xM:0xN:0x0 is not FAILED on mds1&apos;</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="jhammond">John Hammond</assignee>
                                    <reporter username="jamesanunez">James Nunez</reporter>
                        <labels>
                            <label>rhel8.3</label>
                    </labels>
                <created>Thu, 10 Dec 2020 22:39:54 +0000</created>
                <updated>Thu, 17 Dec 2020 18:25:46 +0000</updated>
                            <resolved>Thu, 17 Dec 2020 18:25:46 +0000</resolved>
                                    <version>Lustre 2.14.0</version>
                                    <fixVersion>Lustre 2.14.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>5</watches>
                                                                            <comments>
                            <comment id="287267" author="gerrit" created="Fri, 11 Dec 2020 00:47:46 +0000"  >&lt;p&gt;John L. Hammond (jhammond@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/40941&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/40941&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-14208&quot; title=&quot;sanity-hsm test 55 fails with &amp;#39;request on 0xM:0xN:0x0 is not FAILED on mds1&amp;#39;&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-14208&quot;&gt;&lt;del&gt;LU-14208&lt;/del&gt;&lt;/a&gt; utils: revert copy_file_range() usage&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: bce74ae6a7254dcffa7fb7d562fd9da08a611a1b&lt;/p&gt;</comment>
                            <comment id="287268" author="simmonsja" created="Fri, 11 Dec 2020 00:56:04 +0000"  >&lt;p&gt;While reverting helps us to pass the test suite, this shows that Lustre does not support copy_file_range() in any form on RHEL8.3. This function is starting to be adopted by external applications, which is worrying.&lt;/p&gt;</comment>
                            <comment id="287272" author="jhammond" created="Fri, 11 Dec 2020 01:26:57 +0000"  >&lt;p&gt;&amp;gt; While reverting helps us to pass the test suite this shows that Lustre does not support copy_file_range() in any form on RHEL8.3 This function is starting to be adopted by external applications which is worrying.&lt;/p&gt;

&lt;p&gt;Then please open a new ticket to introduce standalone test code to exercise this functionality. Once its correctness and performance benefit are demonstrated, I will not object to it being included in any of the utilities.&lt;/p&gt;</comment>
                            <comment id="287311" author="simmonsja" created="Fri, 11 Dec 2020 13:09:11 +0000"  >&lt;p&gt;Since this is using the default kernel path for copy_file_range() that means most file systems will have broken copy_file_range() implementations. We should report this failure to Red Hat since it will impact a much larger&#160;audience. The question is whether this bug only shows up under specific conditions.&lt;/p&gt;</comment>
                            <comment id="287367" author="jhammond" created="Fri, 11 Dec 2020 19:58:52 +0000"  >&lt;p&gt;This is not an upstream bug. When &lt;tt&gt;copy_file_range()&lt;/tt&gt; is used in the copytool we do not check for EOF (by testing for a return value of 0). That is a defect and is why the test fails. From &lt;a href=&quot;https://review.whamcloud.com/#/c/38651/5/lustre/utils/lhsmtool_posix.c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/38651/5/lustre/utils/lhsmtool_posix.c&lt;/a&gt;&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
		/* Try the accelerated copy path first. Once LU-10180 lands
		 * we should deal with holes.
		 */
		wsize = copy_file_range(src_fd, NULL, dst_fd, NULL,
					chunk, 0);
		&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (wsize != -1)
			&lt;span class=&quot;code-keyword&quot;&gt;goto&lt;/span&gt; fini_fastcopy;
		rc = -errno;
		/* Before Linux kernel 5.3 copy_file_range only supported
		 * file copies between filesystems of the same type. In
		 * that &lt;span class=&quot;code-keyword&quot;&gt;case&lt;/span&gt; copy_file_range() will &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; -EXDEV.
		 */
		&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (rc != -EXDEV &amp;amp;&amp;amp; rc != -ENOSYS) {
			CT_ERROR(rc, &lt;span class=&quot;code-quote&quot;&gt;&quot;copy_file_range failed &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;code-quote&quot;&gt;&apos;%s&apos;&lt;/span&gt; to &lt;span class=&quot;code-quote&quot;&gt;&apos;%s&apos;&lt;/span&gt;&quot;&lt;/span&gt;,
				 src, dst);
			&lt;span class=&quot;code-keyword&quot;&gt;break&lt;/span&gt;;
		}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;But still, this should be reverted rather than simply fixed in place since &lt;tt&gt;copy_file_range()&lt;/tt&gt; is currently 2x slower than normal &lt;tt&gt;read()+write()&lt;/tt&gt; copy. Once that is fixed then we can integrate &lt;tt&gt;copy_file_range()&lt;/tt&gt; into &lt;tt&gt;lhsmtool_posix&lt;/tt&gt;.&lt;/p&gt;</comment>
                            <comment id="287368" author="simmonsja" created="Fri, 11 Dec 2020 20:27:30 +0000"  >&lt;p&gt;So the Lustre kernel implementation has to be done first, then. We have tested internally at ORNL and we saw read + write have the same performance as copy_file_range() using real hardware. Andreas did point out that in VMs with fake disks it doesn&apos;t perform as well. Mind you, that is not a real-world example. I also have to ask: is lhsm_posix used in production systems?&lt;/p&gt;</comment>
                            <comment id="287477" author="jhammond" created="Mon, 14 Dec 2020 14:54:22 +0000"  >&lt;p&gt;&amp;gt;  I also have to ask is lhsm_posix used in production systems ?&lt;/p&gt;

&lt;p&gt;Yes. It is also intended to be a reference for implementations of other copytools.&lt;/p&gt;</comment>
                            <comment id="287501" author="gerrit" created="Mon, 14 Dec 2020 18:38:20 +0000"  >&lt;p&gt;John L. Hammond (jhammond@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/40966&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/40966&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-14208&quot; title=&quot;sanity-hsm test 55 fails with &amp;#39;request on 0xM:0xN:0x0 is not FAILED on mds1&amp;#39;&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-14208&quot;&gt;&lt;del&gt;LU-14208&lt;/del&gt;&lt;/a&gt; utils: remove copy_file_range() usage from copytool&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: b03fe0ddb1f0da1af18b16928074b80e3f2f008b&lt;/p&gt;</comment>
                            <comment id="287880" author="gerrit" created="Thu, 17 Dec 2020 17:00:28 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/40966/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/40966/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-14208&quot; title=&quot;sanity-hsm test 55 fails with &amp;#39;request on 0xM:0xN:0x0 is not FAILED on mds1&amp;#39;&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-14208&quot;&gt;&lt;del&gt;LU-14208&lt;/del&gt;&lt;/a&gt; utils: remove copy_file_range() usage from copytool&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 8294b497f27f66c0f388889ae6d8b29d916a17b4&lt;/p&gt;</comment>
                            <comment id="287911" author="pjones" created="Thu, 17 Dec 2020 18:25:46 +0000"  >&lt;p&gt;Landed for 2.14&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="53924">LU-11621</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i01gx3:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>