<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:14:57 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-8136] sanity-hsm test_9 fails with &apos;request on 0x200000405:0x4:0x0 is not SUCCEED on mds1&apos; </title>
                <link>https://jira.whamcloud.com/browse/LU-8136</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;sanity-hsm test 9 fails with &lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;&apos;request on 0x200000405:0x4:0x0 is not SUCCEED on mds1&apos;  
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The last thing seen in the test log before the failure is&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;CMD: trevis-5vm4 /usr/sbin/lctl get_param -n mdt.lustre-MDT0000.hsm.actions | awk &apos;/&apos;0x200000405:0x4:0x0&apos;.*action=&apos;ARCHIVE&apos;/ {print \$13}&apos; | cut -f2 -d=
CMD: trevis-5vm4 /usr/sbin/lctl get_param -n mdt.lustre-MDT0000.hsm.actions | awk &apos;/&apos;0x200000405:0x4:0x0&apos;.*action=&apos;ARCHIVE&apos;/ {print \$13}&apos; | cut -f2 -d=
CMD: trevis-5vm4 /usr/sbin/lctl get_param -n mdt.lustre-MDT0000.hsm.actions | awk &apos;/&apos;0x200000405:0x4:0x0&apos;.*action=&apos;ARCHIVE&apos;/ {print \$13}&apos; | cut -f2 -d=
CMD: trevis-5vm4 /usr/sbin/lctl get_param -n mdt.lustre-MDT0000.hsm.actions | awk &apos;/&apos;0x200000405:0x4:0x0&apos;.*action=&apos;ARCHIVE&apos;/ {print \$13}&apos; | cut -f2 -d=
Update not seen after 200s: wanted &apos;SUCCEED&apos; got &apos;STARTED&apos;
 sanity-hsm test_9: @@@@@@ FAIL: request on 0x200000405:0x4:0x0 is not SUCCEED on mds1 
  Trace dump:
  = /usr/lib64/lustre/tests/test-framework.sh:4769:error()
  = /usr/lib64/lustre/tests/sanity-hsm.sh:766:wait_request_state()
  = /usr/lib64/lustre/tests/sanity-hsm.sh:1010:test_9()
  = /usr/lib64/lustre/tests/test-framework.sh:5033:run_one()
  = /usr/lib64/lustre/tests/test-framework.sh:5072:run_one_logged()
  = /usr/lib64/lustre/tests/test-framework.sh:4919:run_test()
  = /usr/lib64/lustre/tests/sanity-hsm.sh:1016:main()
Dumping lctl log to /logdir/test_logs/2016-05-11/lustre-reviews-el7-x86_64--review-dne-part-2--1_7_1__38816__-70227460739120-004004/sanity-hsm.test_9.*.1462941864.log
 &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Is this the same or similar issue as in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8111&quot; title=&quot;sanity-hsm test_28: request on 0x200000401:0x28:0x0 is not SUCCEED on mds1&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8111&quot;&gt;&lt;del&gt;LU-8111&lt;/del&gt;&lt;/a&gt;?&lt;/p&gt;

&lt;p&gt;So far, this test is only failing in the review-dne-* test groups. Test 9 started failing this way in the past two days; there have been 7 failures. Here are the failures:&lt;br/&gt;
2016-05-10  - &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/dd4e43d0-168a-11e6-855a-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/dd4e43d0-168a-11e6-855a-5254006e85c2&lt;/a&gt;&lt;br/&gt;
2016-05-10  - &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/ab71c010-16d2-11e6-855a-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/ab71c010-16d2-11e6-855a-5254006e85c2&lt;/a&gt;&lt;br/&gt;
2016-05-10  - &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/6f746a34-1710-11e6-855a-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/6f746a34-1710-11e6-855a-5254006e85c2&lt;/a&gt;&lt;br/&gt;
2016-05-10  - &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/ae6d43d4-1713-11e6-9b34-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/ae6d43d4-1713-11e6-9b34-5254006e85c2&lt;/a&gt;&lt;br/&gt;
2016-05-11  - &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/541a3da0-1732-11e6-b5f1-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/541a3da0-1732-11e6-b5f1-5254006e85c2&lt;/a&gt;&lt;br/&gt;
2016-05-11 - &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/0923a158-173f-11e6-855a-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/0923a158-173f-11e6-855a-5254006e85c2&lt;/a&gt;&lt;br/&gt;
2016-05-11  - &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/6f95b094-1757-11e6-9b34-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/6f95b094-1757-11e6-9b34-5254006e85c2&lt;/a&gt;&lt;/p&gt;</description>
                <environment>autotest review-dne</environment>
        <key id="36893">LU-8136</key>
            <summary>sanity-hsm test_9 fails with &apos;request on 0x200000405:0x4:0x0 is not SUCCEED on mds1&apos; </summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="wc-triage">WC Triage</assignee>
                                    <reporter username="jamesanunez">James Nunez</reporter>
                        <labels>
                    </labels>
                <created>Thu, 12 May 2016 20:50:29 +0000</created>
                <updated>Thu, 8 Nov 2018 07:36:30 +0000</updated>
                            <resolved>Thu, 8 Nov 2018 07:36:30 +0000</resolved>
                                    <version>Lustre 2.9.0</version>
                                    <fixVersion>Lustre 2.9.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="152191" author="bfaccini" created="Fri, 13 May 2016 13:05:05 +0000"  >&lt;p&gt;James,&lt;br/&gt;
Well, I had a look at the last/most recent (2016-05-11 - &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/6f95b094-1757-11e6-9b34-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/6f95b094-1757-11e6-9b34-5254006e85c2&lt;/a&gt;) failure you reported. According to the 1st MDS (handling MDT1 and MDT3), it looks like there could be a slow start of the CT (likely due to the need to register with each MDT when running DNE), which may cause the archive request to be trashed/unseen on the CT side after its startup has completed:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;CT registers with 1st MDT:
00000100:00100000:0.0:1462941602.244264:0:5563:0:(service.c:2070:ptlrpc_server_handle_request()) Handling RPC pname:cluuid+ref:pid:xid:nid:opc mdt00_001:50b3d422-923b-abd9-9810-4f7c10608c4b+8:17919:x1533991955961472:12345-10.9.4.43@tcp:59
00000100:00100000:0.0:1462941602.244278:0:5563:0:(service.c:2120:ptlrpc_server_handle_request()) Handled RPC pname:cluuid+ref:pid:xid:nid:opc mdt00_001:50b3d422-923b-abd9-9810-4f7c10608c4b+8:17919:x1533991955961472:12345-10.9.4.43@tcp:59 Request procesed in 13us (26us total) trans 0 rc 0/0
00000100:00100000:0.0:1462941602.244280:0:5563:0:(nrs_fifo.c:241:nrs_fifo_req_stop()) NRS stop fifo request from 12345-10.9.4.43@tcp, seq: 308
00000100:00100000:0.0:1462941602.245002:0:6662:0:(events.c:351:request_in_callback()) peer: 12345-10.9.4.42@tcp
00000100:00100000:0.0:1462941602.245007:0:5563:0:(service.c:1922:ptlrpc_server_handle_req_in()) got req x1534001997146880
00000100:00100000:0.0:1462941602.245012:0:5563:0:(nrs_fifo.c:179:nrs_fifo_req_get()) NRS start fifo request from 12345-10.9.4.42@tcp, seq: 309
Client sends hsm_archive request to MDT/CDT:
00000100:00100000:0.0:1462941602.245014:0:5563:0:(service.c:2070:ptlrpc_server_handle_request()) Handling RPC pname:cluuid+ref:pid:xid:nid:opc mdt00_001:bed34d71-9fc0-26f6-02c9-3e3df67d2b69+38:22988:x1534001997146880:12345-10.9.4.42@tcp:58
00000040:00080000:0.0:1462941602.245032:0:5563:0:(llog_cat.c:735:llog_cat_process_cb()) processing log 0x15:1:0 at index 1 of catalog 0x8:10
00000040:00080000:0.0:1462941602.245138:0:5563:0:(llog_osd.c:696:llog_osd_write_rec()) added record [0x1:0x15:0x0]: idx: 4, 136 off8736
00000100:00100000:0.0:1462941602.245149:0:5563:0:(service.c:2120:ptlrpc_server_handle_request()) Handled RPC pname:cluuid+ref:pid:xid:nid:opc mdt00_001:bed34d71-9fc0-26f6-02c9-3e3df67d2b69+38:22988:x1534001997146880:12345-10.9.4.42@tcp:58 Request procesed in 135us (148us total) trans 0 rc 0/0
00000100:00100000:0.0:1462941602.245152:0:5563:0:(nrs_fifo.c:241:nrs_fifo_req_stop()) NRS stop fifo request from 12345-10.9.4.42@tcp, seq: 309
00000040:00080000:0.0:1462941602.245161:0:12867:0:(llog_cat.c:735:llog_cat_process_cb()) processing log 0x15:1:0 at index 1 of catalog 0x8:10
00000040:00100000:0.0:1462941602.245165:0:12867:0:(llog.c:211:llog_cancel_rec()) Canceling 2 in log 0x15:1
00000040:00100000:0.0:1462941602.245172:0:12867:0:(llog.c:211:llog_cancel_rec()) Canceling 3 in log 0x15:1
MDT/CDT sends archive request to CT:
00000100:00100000:0.0:1462941602.245214:0:12867:0:(client.c:1589:ptlrpc_send_new_req()) Sending RPC pname:cluuid:pid:xid:nid:opc hsm_cdtr:lustre-MDT0000_UUID:12867:1533991886170496:10.9.4.43@tcp:107
00000100:00100000:0.0:1462941602.245224:0:12867:0:(client.c:2287:ptlrpc_set_wait()) set ffff8800482b46c0 going to sleep for 11 seconds
00000100:00100000:0.0:1462941602.245250:0:6662:0:(events.c:351:request_in_callback()) peer: 12345-10.9.4.43@tcp
00000100:00100000:0.0:1462941602.245353:0:5563:0:(service.c:1922:ptlrpc_server_handle_req_in()) got req x1533991955961504
00000100:00100000:0.0:1462941602.245361:0:5563:0:(nrs_fifo.c:179:nrs_fifo_req_get()) NRS start fifo request from 12345-10.9.4.43@tcp, seq: 310
CT registers with 2nd MDT:
00000100:00100000:0.0:1462941602.245363:0:5563:0:(service.c:2070:ptlrpc_server_handle_request()) Handling RPC pname:cluuid+ref:pid:xid:nid:opc mdt00_001:50b3d422-923b-abd9-9810-4f7c10608c4b+5:17919:x1533991955961504:12345-10.9.4.43@tcp:59
00000100:00100000:0.0:1462941602.245377:0:5563:0:(service.c:2120:ptlrpc_server_handle_request()) Handled RPC pname:cluuid+ref:pid:xid:nid:opc mdt00_001:50b3d422-923b-abd9-9810-4f7c10608c4b+5:17919:x1533991955961504:12345-10.9.4.43@tcp:59 Request procesed in 14us (127us total) trans 0 rc 0/0
00000100:00100000:0.0:1462941602.245379:0:5563:0:(nrs_fifo.c:241:nrs_fifo_req_stop()) NRS stop fifo request from 12345-10.9.4.43@tcp, seq: 310
Agent answers the MDT/CDT archive request (after trashing it because of some race during CT start?):
00000100:00100000:0.0:1462941602.245679:0:12867:0:(client.c:1997:ptlrpc_check_set()) Completed RPC pname:cluuid:pid:xid:nid:opc hsm_cdtr:lustre-MDT0000_UUID:12867:1533991886170496:10.9.4.43@tcp:107
00000040:00080000:0.0:1462941602.245689:0:12867:0:(llog_cat.c:735:llog_cat_process_cb()) processing log 0x15:1:0 at index 1 of catalog 0x8:10
00000001:02000400:0.0:1462941602.356340:0:13401:0:(debug.c:335:libcfs_debug_mark_buffer()) DEBUG MARKER: /usr/sbin/lctl get_param -n mdt.lustre-MDT0000.hsm.actions | awk &apos;/&apos;0x200000405:0x4:0x0&apos;.*action=&apos;ARCHIVE&apos;/ {print $13}&apos; | cut -f2 -d=
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;By the way, even though the possible racy situation will need to be investigated further, we may already avoid it by moving the CT start to the beginning of sanity-hsm/test_9(), instead of where it is currently:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;test_9() {
        mkdir -p $DIR/$tdir
        local f=$DIR/$tdir/$tfile
        local fid=$(copy_file /etc/passwd $f)
        # we do not use the default one to be sure
        local new_an=$((HSM_ARCHIVE_NUMBER + 1))
        copytool_cleanup
        copytool_setup $SINGLEAGT $MOUNT $new_an
        $LFS hsm_archive --archive $new_an $f
        wait_request_state $fid ARCHIVE SUCCEED

        check_hsm_flags $f &quot;0x00000009&quot;

        copytool_cleanup
}
run_test 9 &quot;Use of explicit archive number, with dedicated copytool&quot;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
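&lt;p&gt;For illustration only, a minimal sketch of that reordering (not the actual patch; it simply starts the copytool before creating the file and issuing the archive request):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;test_9() {
        # start the dedicated copytool first so it has time to register
        # with all MDTs before any HSM request can be queued
        # we do not use the default archive number to be sure
        local new_an=$((HSM_ARCHIVE_NUMBER + 1))
        copytool_cleanup
        copytool_setup $SINGLEAGT $MOUNT $new_an

        mkdir -p $DIR/$tdir
        local f=$DIR/$tdir/$tfile
        local fid=$(copy_file /etc/passwd $f)

        $LFS hsm_archive --archive $new_an $f
        wait_request_state $fid ARCHIVE SUCCEED

        check_hsm_flags $f &quot;0x00000009&quot;

        copytool_cleanup
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;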

&lt;p&gt;I will push a first patch in this direction and also continue to investigate to better understand the possible interplay between a DNE configuration and CT startup.&lt;/p&gt;</comment>
                            <comment id="152518" author="gerrit" created="Tue, 17 May 2016 09:50:24 +0000"  >&lt;p&gt;Faccini Bruno (bruno.faccini@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/20258&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/20258&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8136&quot; title=&quot;sanity-hsm test_9 fails with &amp;#39;request on 0x200000405:0x4:0x0 is not SUCCEED on mds1&amp;#39; &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8136&quot;&gt;&lt;del&gt;LU-8136&lt;/del&gt;&lt;/a&gt; tests: allow CT to registers with all MDTs&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: b9f23bd58cedf93898bb8a0854f12d181bb0f39b&lt;/p&gt;</comment>
                            <comment id="154187" author="cengku9660" created="Wed, 1 Jun 2016 01:17:51 +0000"  >&lt;p&gt;Hit similar problem on our local autotest:&lt;br/&gt;
Tue May 31 11:28:27 JST 2016: waited 200 secs for update&lt;br/&gt;
Update not seen after 200s: wanted &apos;SUCCEED&apos; got &apos;STARTED&apos;&lt;br/&gt;
 sanity-hsm test_12g: @@@@@@ FAIL: request on 0x280000401:0xe:0x0 is not SUCCEED on mds1 &lt;br/&gt;
  Trace dump:&lt;br/&gt;
  = /usr/lib64/lustre/tests/test-framework.sh:4730:error_noexit()&lt;br/&gt;
  = /usr/lib64/lustre/tests/test-framework.sh:4761:error()&lt;br/&gt;
  = /usr/lib64/lustre/tests/sanity-hsm.sh:704:wait_request_state()&lt;br/&gt;
  = /usr/lib64/lustre/tests/sanity-hsm.sh:1278:test_12g()&lt;br/&gt;
  = /usr/lib64/lustre/tests/test-framework.sh:5008:run_one()&lt;br/&gt;
  = /usr/lib64/lustre/tests/test-framework.sh:5045:run_one_logged()&lt;br/&gt;
  = /usr/lib64/lustre/tests/test-framework.sh:4910:run_test()&lt;br/&gt;
  = /usr/lib64/lustre/tests/sanity-hsm.sh:1291:main()&lt;br/&gt;
So I am wondering whether we should also add the wait step to the other sub-tests; if it happened to test_9, it may happen to other cases too, e.g. test_12g above.&lt;/p&gt;</comment>
                            <comment id="154390" author="gerrit" created="Thu, 2 Jun 2016 04:45:40 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;http://review.whamcloud.com/20258/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/20258/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8136&quot; title=&quot;sanity-hsm test_9 fails with &amp;#39;request on 0x200000405:0x4:0x0 is not SUCCEED on mds1&amp;#39; &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8136&quot;&gt;&lt;del&gt;LU-8136&lt;/del&gt;&lt;/a&gt; tests: allow CT to registers with all MDTs&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: f56b701235674c8661691d73f55570c161fa13cb&lt;/p&gt;</comment>
                            <comment id="155924" author="pjones" created="Thu, 16 Jun 2016 14:47:02 +0000"  >&lt;p&gt;Landed for 2.9&lt;/p&gt;</comment>
                            <comment id="179090" author="dmiter" created="Wed, 28 Dec 2016 08:31:33 +0000"  >&lt;p&gt;The same failure now happens with test_12*, test_33-36, test_57-58, test_110*, test_222*&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/7c0d8752-cc8d-11e6-9816-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/7c0d8752-cc8d-11e6-9816-5254006e85c2&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/85ff554c-ccd8-11e6-9296-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/85ff554c-ccd8-11e6-9296-5254006e85c2&lt;/a&gt;&lt;/p&gt;

</comment>
                            <comment id="179091" author="bfaccini" created="Wed, 28 Dec 2016 09:30:05 +0000"  >&lt;p&gt;Well, may be the tempo and CT registration verification should be, as John H had already suggested when commenting my first patch, generalized (in copytool_setup()?).&lt;br/&gt;
I will check the auto-tests links you have provided and see if they show the same behavior than the original ones.&lt;/p&gt;</comment>
                            <comment id="179092" author="bfaccini" created="Wed, 28 Dec 2016 10:35:24 +0000"  >&lt;p&gt;Well having a look to the recent failed auto-tests logs, it seems that their problem is not the one (CT failing to register with all MDTs in a too short time, requiring to add a tempo) that has been tracked in this ticket.&lt;br/&gt;
My feeling comes from the same set of following lines :&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;CMD: trevis-35vm7 /usr/sbin/lctl get_param -n mdt.lustre-MDT0000.hsm.actions | awk &apos;/&apos;0x200000405:0xf:0x0&apos;.*action=&apos;RESTORE&apos;/ {print \$13}&apos; | cut -f2 -d=
Changed after 16s: from &apos;SUCCEED
FAILED&apos; to &apos;&apos;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;which can be found in each of these tests&apos; main logs, and which shows that the SUCCEED request state may not have been detected due to some (new?) issue in the wait_request_state() function when interpreting the &quot;hsm/actions&quot; proc file output.&lt;/p&gt;
</comment>
                            <comment id="179176" author="bfaccini" created="Thu, 29 Dec 2016 10:42:25 +0000"  >&lt;p&gt;Dmitry,&lt;br/&gt;
I have checked the recent auto-test results in Maloo, and it seems that all recent failures of the sanity-hsm test_12*, test_33-36, test_57-58, test_110*, test_222* sub-tests have occurred only during test sessions for your patches for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8964&quot; title=&quot;use parallel I/O to improve performance on machines with slow single thread performance&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8964&quot;&gt;&lt;del&gt;LU-8964&lt;/del&gt;&lt;/a&gt;, and also a few with the patch for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8709&quot; title=&quot;parallel asynchronous readahead&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8709&quot;&gt;&lt;del&gt;LU-8709&lt;/del&gt;&lt;/a&gt;, and always during implicit restore operations whose result interpretation leads to multiple statuses/lines (due to multiple requests??...).&lt;br/&gt;
If I understand correctly, both tickets/patches intend to optimize/parallelize the client&apos;s regular/read-ahead I/O ops, so could it be that your patch causes multiple hsm_restore requests to be generated for the same file/fid, when only one will succeed??&lt;br/&gt;
If yes, we may only need to strengthen the sanity-hsm wait_request_state() function to filter for just the wanted status in its check command:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@eagle-31 lustre-release]# diff -urpN /usr/lib64/lustre/tests/sanity-hsm.sh.bfi /usr/lib64/lustre/tests/sanity-hsm.sh.bfi+
--- /usr/lib64/lustre/tests/sanity-hsm.sh.bfi   2016-12-28 16:41:09.000000000 +0000
+++ /usr/lib64/lustre/tests/sanity-hsm.sh.bfi+  2016-12-29 10:40:08.000000000 +0000
@@ -724,7 +724,8 @@ wait_request_state() {
        local mds=mds$(($mdtidx + 1))
 
        local cmd=&quot;$LCTL get_param -n ${MDT_PREFIX}${mdtidx}.hsm.actions&quot;
-       cmd+=&quot; | awk &apos;/&apos;$fid&apos;.*action=&apos;$request&apos;/ {print \\\$13}&apos; | cut -f2 -d=&quot;
+       cmd+=&quot; | awk &apos;/&apos;$fid&apos;.*action=&apos;$request&apos;/ {print \\\$13}&apos; |\
+            cut -f2 -d= | uniq | grep $state&quot;
 
        wait_result $mds &quot;$cmd&quot; $state 200 ||
                error &quot;request on $fid is not $state on $mds&quot;
[root@eagle-31 lustre-release]# 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
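&lt;p&gt;As a toy illustration (the sample statuses below are assumed, based on the output quoted above), the extra &quot;uniq | grep&quot; collapses a multi-status output to the single wanted state that wait_result() compares against:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# two requests for the same fid, one succeeded and one failed, produce
# two status lines; without the filter, wait_result() never sees the
# single expected word and times out after 200s
printf &apos;SUCCEED\nFAILED\n&apos; | uniq | grep SUCCEED
# prints only: SUCCEED
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;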
&lt;p&gt;What do you think ??&lt;/p&gt;</comment>
                            <comment id="179178" author="dmiter" created="Thu, 29 Dec 2016 12:05:18 +0000"  >&lt;p&gt;Thanks Bruno,&lt;/p&gt;

&lt;p&gt;I don&apos;t understand how my patch can affect this functionality, but I will look into it. I parallelize only regular I/O; other I/O should use the old pipeline. Even with parallel I/O we should not have multiple requests.&lt;/p&gt;

</comment>
                            <comment id="179184" author="dmiter" created="Thu, 29 Dec 2016 14:11:18 +0000"  >&lt;p&gt;Thanks Bruno,&lt;/p&gt;

&lt;p&gt;You are absolutely right! HSM reads the data in 4MB chunks, even for small files, so we can get many callbacks if the reads are done in small portions. Currently the test file is smaller than the stripe size and we always get a single restore request. But if the test file were larger than the stripe size, or if the reads were done in small portions as happens with my patch, we would get several restore requests and the tests would fail. Your fix resolves this issue. Thanks again.&lt;/p&gt;

</comment>
                            <comment id="179204" author="bfaccini" created="Thu, 29 Dec 2016 22:28:11 +0000"  >&lt;p&gt;Hello Dmitry, I am happy to have helped.&lt;br/&gt;
But concerning my previous patch proposal, I don&apos;t think it can be considered a definitive fix, since it introduces a regression: it no longer reports the wrong/unexpected status in case of a real failure (vs handling multiple failed requests plus one that succeeds, as in your case).&lt;br/&gt;
In your case/patch, can&apos;t you detect that the file needs to be restored, then launch the restore and wait for the file to be available/online before starting your split I/Os?&lt;/p&gt;</comment>
                            <comment id="179222" author="bfaccini" created="Fri, 30 Dec 2016 09:30:50 +0000"  >&lt;p&gt;Last, concerning the generalized need of delay to allow CT to register with all MDTs, after I have reviewed the recent auto-tests failures I still think it is not required and was only, as part of this ticket, for sanity-hsm/test_9 due  to being the only sub-test using copytool_setup() and &quot;lfs hsm_archive&quot; in a raw, without any other cmd in-between to give enough time for CT&apos;s full registering.&lt;/p&gt;

&lt;p&gt;But anyway, I will push a patch to implement this in copytool_setup().&lt;/p&gt;
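&lt;p&gt;A minimal sketch of what such a check inside copytool_setup() could look like (the helper name is hypothetical and the &quot;hsm.agents&quot; parameter is an assumption, not the actual patch):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# hypothetical helper: poll each MDT until the copytool shows up in its
# hsm.agents list, so no request can be queued before the CT has finished
# registering with all MDTs
wait_copytool_registered() {
        local agent_uuid=$1
        local mdtidx

        for mdtidx in $(seq 0 $((MDSCOUNT - 1))); do
                local mds=mds$((mdtidx + 1))
                local cmd=&quot;$LCTL get_param -n ${MDT_PREFIX}${mdtidx}.hsm.agents&quot;
                cmd+=&quot; | grep -c $agent_uuid&quot;
                wait_result $mds &quot;$cmd&quot; 1 60 ||
                        error &quot;copytool $agent_uuid not registered with $mds&quot;
        done
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;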
</comment>
                            <comment id="179223" author="gerrit" created="Fri, 30 Dec 2016 09:31:48 +0000"  >&lt;p&gt;Faccini Bruno (bruno.faccini@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/24542&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/24542&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8136&quot; title=&quot;sanity-hsm test_9 fails with &amp;#39;request on 0x200000405:0x4:0x0 is not SUCCEED on mds1&amp;#39; &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8136&quot;&gt;&lt;del&gt;LU-8136&lt;/del&gt;&lt;/a&gt; tests: ensure CT is registered with all MDTs&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 8117260d68c2fd5b6107d30f163fec89ac4ba317&lt;/p&gt;</comment>
                            <comment id="236406" author="adilger" created="Tue, 6 Nov 2018 10:20:12 +0000"  >&lt;p&gt;It looks like this test is still failing occasionally, and there is an unlanded patch for this ticket.&lt;/p&gt;</comment>
                            <comment id="236413" author="bougetq" created="Tue, 6 Nov 2018 11:49:28 +0000"  >&lt;p&gt;The only &lt;a href=&quot;https://testing.whamcloud.com/sub_tests/query?utf8=%E2%9C%93&amp;amp;warn%5Bnotice%5D=&amp;amp;test_set_script_id=10d5ab1c-78af-11e2-9928-52540035b04c&amp;amp;sub_test_script_id=41889004-0da5-11e8-a7cd-52540065bddc&amp;amp;status%5B%5D=FAIL&amp;amp;query_bugs=&amp;amp;builds=&amp;amp;hosts=&amp;amp;commit_id=&amp;amp;horizon=2332800&amp;amp;window%5Bstart_date%5D=&amp;amp;window%5Bend_date%5D=&amp;amp;os_type_id=&amp;amp;distribution_type_id=&amp;amp;architecture_type_id=&amp;amp;file_system_type_id=&amp;amp;branch_type_id=&amp;amp;network_type_id=&amp;amp;commit=Update+results&amp;amp;buggable_class=SubTest&amp;amp;bug_upstream_id=&amp;amp;num_results=250&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;recent test failures I could find&lt;/a&gt; targeted test_9A and they didn&apos;t seem to relate to the issue described in this ticket (they all happened for the same patch and many other tests failed on those runs). Am I missing something?&lt;/p&gt;</comment>
                            <comment id="236635" author="adilger" created="Thu, 8 Nov 2018 07:36:30 +0000"  >&lt;p&gt;Sorry, I didn&apos;t see that the failures were related to another issue. I&apos;ve abandoned the old patch. &lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzybgf:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>