<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:58:09 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary, append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-6203] sanity-hsm test 251: FAIL: Copytool failed to stop in 20s</title>
                <link>https://jira.whamcloud.com/browse/LU-6203</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;sanity-hsm test 251 failed as follows:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;CMD: shadow-26vm10 pkill -INT -x lhsmtool_posix
CMD: shadow-26vm10 pgrep -x lhsmtool_posix
shadow-26vm10: 7902
Copytool still running on shadow-26vm10
CMD: shadow-26vm10 pgrep -x lhsmtool_posix
shadow-26vm10: 7902
Copytool still running on shadow-26vm10
CMD: shadow-26vm10 pgrep -x lhsmtool_posix
shadow-26vm10: 7902
Copytool still running on shadow-26vm10
CMD: shadow-26vm10 pgrep -x lhsmtool_posix
shadow-26vm10: 7902
Copytool still running on shadow-26vm10
CMD: shadow-26vm10 pgrep -x lhsmtool_posix
shadow-26vm10: 7902
Copytool still running on shadow-26vm10
CMD: shadow-26vm10 pgrep -x lhsmtool_posix
shadow-26vm10: 7902
Copytool still running on shadow-26vm10
CMD: shadow-26vm10 pgrep -x lhsmtool_posix
shadow-26vm10: 7902
Copytool still running on shadow-26vm10
CMD: shadow-26vm10 pgrep -x lhsmtool_posix
shadow-26vm10: 7902
Copytool still running on shadow-26vm10
CMD: shadow-26vm10 pgrep -x lhsmtool_posix
shadow-26vm10: 7902
Copytool still running on shadow-26vm10
CMD: shadow-26vm10 pgrep -x lhsmtool_posix
shadow-26vm10: 7902
 sanity-hsm test_251: @@@@@@ FAIL: Copytool failed to stop in 20s ... 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
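
&lt;p&gt;For reference, the 20s check/wait behaviour that produces the output above can be sketched as follows. This is a simplified reconstruction from the log, not the verbatim sanity-hsm copytool_cleanup() code:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# Send SIGINT to the copytool, then poll for its death for up to 20s
# (10 tries x 2s, matching the 10 pgrep rounds in the log above).
pkill -INT -x lhsmtool_posix
for i in $(seq 1 10); do
    pgrep -x lhsmtool_posix || exit 0   # process gone: cleanup done
    echo &quot;Copytool still running&quot;
    sleep 2
done
echo &quot;FAIL: Copytool failed to stop in 20s&quot;
exit 1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;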

&lt;p&gt;Maloo reports:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/49281842-a9ef-11e4-8c6f-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/49281842-a9ef-11e4-8c6f-5254006e85c2&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/4cac5380-aa11-11e4-a5c6-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/4cac5380-aa11-11e4-a5c6-5254006e85c2&lt;/a&gt;&lt;/p&gt;</description>
                <environment>Lustre Build: &lt;a href=&quot;https://build.hpdd.intel.com/job/lustre-b2_5/112/&quot;&gt;https://build.hpdd.intel.com/job/lustre-b2_5/112/&lt;/a&gt;&lt;br/&gt;
FSTYPE=zfs</environment>
        <key id="28521">LU-6203</key>
            <summary>sanity-hsm test 251: FAIL: Copytool failed to stop in 20s</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="bfaccini">Bruno Faccini</assignee>
                                    <reporter username="yujian">Jian Yu</reporter>
                        <labels>
                            <label>22pl</label>
                            <label>zfs</label>
                    </labels>
                <created>Tue, 3 Feb 2015 07:51:05 +0000</created>
                <updated>Mon, 14 Sep 2015 16:01:25 +0000</updated>
                            <resolved>Wed, 11 Mar 2015 12:42:57 +0000</resolved>
                                    <version>Lustre 2.5.4</version>
                                    <fixVersion>Lustre 2.8.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>10</watches>
                                                                            <comments>
                            <comment id="105478" author="yujian" created="Tue, 3 Feb 2015 07:55:23 +0000"  >&lt;p&gt;This is a regression failure introduced by the following commit in Lustre b2_5 build #112:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Commit 97fc8c8caf41e9d74cdb1e373f19c907ed8481b2 by Oleg Drokin

LU-5622 tests: check/wait for copytool death

Seems that copytool death/kill may take more time so
this condition must be handled in sanity-hsm copytool_cleanup()
function to avoid situations where copytool will then not be
restarted, but only signaled, in next copytool_setup().

This patch is back-ported from the following one:
Lustre-commit: 6facf3953b170832200ca9c111398da8feecd281
Lustre-change: http://review.whamcloud.com/11922

Signed-off-by: Bruno Faccini &amp;lt;bruno.faccini@intel.com&amp;gt;
Change-Id: Ia817936eb030386dbe539ec8d5297812f4b6fff2
Reviewed-on: http://review.whamcloud.com/12967
Tested-by: Jenkins
Tested-by: Maloo &amp;lt;hpdd-maloo@intel.com&amp;gt;
Reviewed-by: James Nunez &amp;lt;james.a.nunez@intel.com&amp;gt;
Reviewed-by: Henri Doreau &amp;lt;henri.doreau@cea.fr&amp;gt;
Reviewed-by: Oleg Drokin &amp;lt;oleg.drokin@intel.com&amp;gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Hi Bruno,&lt;br/&gt;
Could you please take a look at the failure? Thank you.&lt;/p&gt;</comment>
                            <comment id="105480" author="yujian" created="Tue, 3 Feb 2015 08:07:55 +0000"  >&lt;p&gt;The zfs full group test session was not run on master branch, so we do not know whether the failure exists on master branch or not for now.&lt;/p&gt;</comment>
                            <comment id="105481" author="bfaccini" created="Tue, 3 Feb 2015 08:24:13 +0000"  >&lt;p&gt;Yu Jian, thanks for all this research work already!!&lt;br/&gt;
Will try to reproduce with a ZFS-only configuration and also have a look to the logs of the different cases you pointed to.&lt;/p&gt;</comment>
                            <comment id="105562" author="adilger" created="Tue, 3 Feb 2015 18:54:49 +0000"  >&lt;p&gt;Bruno, Yu Jian, is this also happening on master, or only on b2_5?&lt;/p&gt;</comment>
                            <comment id="105653" author="bfaccini" created="Wed, 4 Feb 2015 10:49:19 +0000"  >&lt;p&gt;Andreas: no not on master. Based on Maloo reports search, latest test_251 sub-test failures in master have occurred about a year ago and at this time, my patch for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5622&quot; title=&quot;copytool_cleanup function should check/wait for copytool death&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5622&quot;&gt;&lt;del&gt;LU-5622&lt;/del&gt;&lt;/a&gt; was far to be integrated!, and they were linked to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3852&quot; title=&quot;sanity-hsm test_251: client26-vm &amp;quot;dd: no space left on device&amp;quot;&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3852&quot;&gt;&lt;del&gt;LU-3852&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;There have been only 8 failures, between 2014-12-30 07:31:46 UTC and 2015-01-30 14:05:49 UTC, all after my patch for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5622&quot; title=&quot;copytool_cleanup function should check/wait for copytool death&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5622&quot;&gt;&lt;del&gt;LU-5622&lt;/del&gt;&lt;/a&gt; was integrated, all on the b2_5 (2 occurrences) or b_ieel2_0 (6 occurrences) branches, and only when using ZFS targets. But there are also frequent successes for the b2_5/b_ieel2_0 branches using ZFS!&lt;/p&gt;

&lt;p&gt;I have still not been able to reproduce the problem running with b2_5 build #112, which was reported to trigger the problem.&lt;br/&gt;
I have also analyzed the logs of the different failures, and it appears that:&lt;br/&gt;
               _ the copytool&apos;s PID still being reported as alive is either the one running the archive action or the main one.&lt;br/&gt;
               _ each time, the copytool log shows that at the time of the kill the archive action had a slow start: the last log line is &quot;processing file ...&quot;, whereas the &quot;archiving ...&quot;/&quot;saving stripe info of ...&quot;/&quot;start copy of ...&quot; log lines are present in the logs of successful runs.&lt;br/&gt;
               _ the Agent debug log shows that the PID running the archive action has been stuck waiting for a reply to an OST_GETATTR request, for a variable period of time, but each time exceeding the 20s allowed for the copytool to die.&lt;br/&gt;
               _ during that time, the OSS concerned has been trying/waiting to cancel a contending lock held by the Client that created the file being archived.&lt;/p&gt;

&lt;p&gt;So could this be the consequence of some ZFS/network configuration or performance issue causing the file&apos;s dirty page flush, which occurs at lock cancel time, to take more than 20s under some circumstances?&lt;br/&gt;
I think a fix for these failures could be either to raise the timer waiting for copytool death (40s?) or to ensure that dirty data/blocks from the file creation have been flushed before starting the archive operation (with &quot;cancel_lru_locks osc&quot;?).&lt;/p&gt;
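
&lt;p&gt;A minimal sketch of the second option, assuming the standard test-framework helper cancel_lru_locks, the $LFS variable, and a hypothetical test file $f:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# Flush dirty pages and cancel the client&apos;s OSC locks up front, so the
# archive is not stalled later behind the lock flush/cancel on the OSS.
dd if=/dev/urandom of=$f bs=1M count=1 conv=fsync
cancel_lru_locks osc          # force the lock cancel (and data flush) now
$LFS hsm_archive $f           # the copytool copy can start immediately
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;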

&lt;p&gt;Also, I wonder whether the priority of this ticket should be kept as Blocker?&lt;/p&gt;</comment>
                            <comment id="105684" author="gerrit" created="Wed, 4 Feb 2015 17:00:43 +0000"  >&lt;p&gt;Faccini Bruno (bruno.faccini@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/13646&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/13646&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6203&quot; title=&quot;sanity-hsm test 251: FAIL: Copytool failed to stop in 20s&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6203&quot;&gt;&lt;del&gt;LU-6203&lt;/del&gt;&lt;/a&gt; tests: early lock cancel to allow early copytool death&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: cc1f9bc9c062e07515fe6c08e358a703cee116ae&lt;/p&gt;</comment>
                            <comment id="105685" author="bfaccini" created="Wed, 4 Feb 2015 17:02:57 +0000"  >&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/13646&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/13646&lt;/a&gt; implements early lock cancel solution to speed-up copytool death.&lt;/p&gt;</comment>
                            <comment id="105725" author="yujian" created="Wed, 4 Feb 2015 20:14:07 +0000"  >&lt;p&gt;The zfs full group test session was not run on master branch. Maybe this is the reason that we did not search out failure instances on master branch on Maloo.&lt;/p&gt;

&lt;p&gt;In the following report on b2_5 branch, many sub-tests failed with this issue:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/97fd06d8-ac1c-11e4-992b-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/97fd06d8-ac1c-11e4-992b-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="105790" author="yujian" created="Thu, 5 Feb 2015 08:39:32 +0000"  >&lt;p&gt;The failure started affecting patch review testing on Lustre b2_5 patches:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/66e27944-acde-11e4-872a-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/66e27944-acde-11e4-872a-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="105831" author="bfaccini" created="Thu, 5 Feb 2015 14:32:01 +0000"  >&lt;p&gt;Concerning the &quot;https://testing.hpdd.intel.com/test_sets/97fd06d8-ac1c-11e4-992b-5254006e85c2&quot; case, the &apos;Copytool failed to stop in 20s ...&apos; errors/symptoms there looks more as an additional consequence (due to cleanup()/copytool_cleanup() execute as a trap, set in copytool_cleanup(), upon exit) of previous problem within same sub-test or even preceeding sub-tests :&lt;/p&gt;

&lt;p&gt;           _ The 1st sub-test reported to have failed with &apos;Copytool failed to stop in 20s ...&apos; is test_33, but it had already failed with &quot;sanity-hsm test_33: @@@@@@ FAIL: request on 0x200007931:0x2b:0x0 is not SUCCEED on mds1&quot; while waiting for the archive to complete/succeed ...&lt;/p&gt;

&lt;p&gt;           _ The 2nd sub-test reported to have failed with &apos;Copytool failed to stop in 20s ...&apos; is test_60, but it had already failed with &quot;sanity-hsm test_60: @@@@@@ FAIL: Timed out waiting for progress update!&quot; while waiting for a progress update during the archive.&lt;/p&gt;

&lt;p&gt;           _ The 3rd sub-test reported to have failed with &apos;Copytool failed to stop in 20s ...&apos; is test_70, but it runs just after test_60, and since its 1st command is a copytool_cleanup, it likely encountered the same problem as the preceding one.&lt;/p&gt;

&lt;p&gt;           _ The 4th sub-test reported to have failed with &apos;Copytool failed to stop in 20s ...&apos; is test_71, but it runs just after test_70, and since its 1st command is a copytool_cleanup, it likely encountered the same problem as the 2 preceding ones.&lt;/p&gt;

&lt;p&gt;           _ The 5th sub-test reported to have failed with &apos;Copytool failed to stop in 20s ...&apos; is test_103, and according to its specific logs it is the only one that seems to have triggered the same scenario (huge delay during lock flush/cancel processing) that I described in my previous update. So it may be another candidate for the same change as test_251.&lt;/p&gt;
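
&lt;p&gt;To illustrate the trap mechanism mentioned above, here is a simplified sketch, not the actual sanity-hsm.sh code:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# copytool_cleanup is installed as an EXIT trap, so any earlier sub-test
# failure that exits the script re-runs the cleanup; a slow-dying copytool
# then adds an extra &quot;Copytool failed to stop in 20s&quot; on top of the
# original, unrelated failure.
copytool_cleanup() {
    pkill -INT -x lhsmtool_posix
    sleep 20    # stand-in for the 20s poll loop shown in the description
    if pgrep -x lhsmtool_posix; then
        echo &quot;FAIL: Copytool failed to stop in 20s&quot;
    fi
}
trap copytool_cleanup EXIT
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;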

&lt;p&gt;After more Lustre debug log reading, it seems that the &quot;huge delay during lock flush/cancel processing&quot; that appears to be the root cause of the problem is mainly on the OSS side, after the Client has handled the Blocking callback and sent its Cancel of the lock back to the OSS. The thread handling it on the OSS can then spend multiple tens of seconds in ldlm_request_cancel()-&amp;gt;ldlm_lock_cancel()-&amp;gt;ldlm_cancel_callback()-&amp;gt; .... and&lt;br/&gt;
highly probably tgt_blocking_ast()-&amp;gt;tgt_sync()-&amp;gt;dt_object_sync()-&amp;gt;osd_object_sync(). So is this finally some kind of ZFS performance issue?&lt;/p&gt;</comment>
                            <comment id="107753" author="yong.fan" created="Tue, 24 Feb 2015 08:19:23 +0000"  >&lt;p&gt;Another failure instance on b2_5:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/465bb228-bbfa-11e4-a79b-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/465bb228-bbfa-11e4-a79b-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="107772" author="bfaccini" created="Tue, 24 Feb 2015 15:49:22 +0000"  >&lt;p&gt;Last 2 failures occurrences again show the same delay of copytool death/cleanup due to being stuck, during tens of seconds, because waiting for lock flush/cancel (between Client, who have created file and asked for its archive, and OST&lt;span class=&quot;error&quot;&gt;&amp;#91;s&amp;#93;&lt;/span&gt;) upon archiving file.&lt;/p&gt;

&lt;p&gt;This seems to occur only on ZFS so far, and I have verified that my patch, which adds a cancel_lru_locks() before hsm_archive, allows the copytool copy to start immediately, where it is normally delayed by the previous Client&apos;s lock flush/cancel.&lt;/p&gt;</comment>
                            <comment id="108817" author="gerrit" created="Wed, 4 Mar 2015 22:54:03 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;http://review.whamcloud.com/13646/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/13646/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6203&quot; title=&quot;sanity-hsm test 251: FAIL: Copytool failed to stop in 20s&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6203&quot;&gt;&lt;del&gt;LU-6203&lt;/del&gt;&lt;/a&gt; tests: early lock cancel to allow early copytool death&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 446ad1ef8be5b282224336817936043b42a10fee&lt;/p&gt;</comment>
                            <comment id="109432" author="pjones" created="Wed, 11 Mar 2015 12:42:57 +0000"  >&lt;p&gt;Landed for 2.8&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="26536">LU-5622</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="32096">LU-7150</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzx5hz:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>17337</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>