<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:32:57 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
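For example, a request for this issue restricted to those two fields would look like the following
(an illustrative URL following JIRA's standard issue-xml view pattern):
https://jira.whamcloud.com/si/jira.issueviews:issue-xml/LU-3325/LU-3325.xml?field=key&field=summary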
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-3325] recovery-mds-scale test_failover_mds: tar: Cannot write: Input/output error</title>
                <link>https://jira.whamcloud.com/browse/LU-3325</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;After running recovery-mds-scale test_failover_mds for about 9 hours (MDS failed over 38 times), client load on one of the clients failed as follows:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;tar: etc/ConsoleKit/seats.d/00-primary.seat: Cannot write: Input/output error
tar: etc/ConsoleKit/seats.d/00-primary.seat: Cannot utime: Cannot send after transport endpoint shutdown
tar: Exiting with failure status due to previous errors
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The console log on the client (client-32vm6) showed the following:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;17:52:45:Lustre: DEBUG MARKER: mds1 has failed over 38 times, and counting...
17:52:46:Lustre: Evicted from MGS (at 10.10.4.198@tcp) after server handle changed from 0x9510c59bdf7b69c8 to 0xa78bdb1c4b1a185f
17:52:46:Lustre: MGC10.10.4.198@tcp: Connection restored to MGS (at 10.10.4.198@tcp)
18:00:41:Lustre: lustre-OST0005-osc-ffff880037c72c00: Connection to lustre-OST0005 (at 10.10.4.199@tcp) was lost; in progress operations using this service will wait for recovery to complete
18:00:41:LustreError: 11-0: lustre-OST0004-osc-ffff880037c72c00: Communicating with 10.10.4.199@tcp, operation ost_write failed with -107.
18:00:41:LustreError: Skipped 3 previous similar messages
18:00:41:LustreError: 11-0: lustre-OST0000-osc-ffff880037c72c00: Communicating with 10.10.4.199@tcp, operation ost_write failed with -107.
18:00:41:LustreError: Skipped 1 previous similar message
18:00:41:Lustre: lustre-OST0000-osc-ffff880037c72c00: Connection to lustre-OST0000 (at 10.10.4.199@tcp) was lost; in progress operations using this service will wait for recovery to complete
18:00:41:Lustre: Skipped 1 previous similar message
18:00:41:Lustre: lustre-OST0006-osc-ffff880037c72c00: Connection to lustre-OST0006 (at 10.10.4.199@tcp) was lost; in progress operations using this service will wait for recovery to complete
18:00:41:Lustre: 7109:0:(client.c:1868:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1368234025/real 1368234025]  req@ffff88006dbdbc00 x1434661777629520/t0(0) o103-&amp;gt;lustre-OST0005-osc-ffff880037c72c00@10.10.4.199@tcp:17/18 lens 328/224 e 0 to 1 dl 1368234036 ref 1 fl Rpc:X/0/ffffffff rc 0/-1
18:00:41:Lustre: 7109:0:(client.c:1868:ptlrpc_expire_one_request()) Skipped 3 previous similar messages
18:00:41:LustreError: 11-0: lustre-OST0001-osc-ffff880037c72c00: Communicating with 10.10.4.199@tcp, operation ost_write failed with -107.
18:00:41:LustreError: Skipped 3 previous similar messages
18:00:41:LustreError: 167-0: lustre-OST0005-osc-ffff880037c72c00: This client was evicted by lustre-OST0005; in progress operations using this service will fail.
18:00:41:LustreError: Skipped 1 previous similar message
18:00:41:Lustre: 7109:0:(llite_lib.c:2503:ll_dirty_page_discard_warn()) lustre: dirty page discard: 10.10.4.198@tcp:10.10.4.202@tcp:/lustre/fid: [0x200000401:0x1ce3c:0x0]/ may get corrupted (rc -108)
18:00:41:Lustre: 7109:0:(llite_lib.c:2503:ll_dirty_page_discard_warn()) Skipped 3 previous similar messages
18:00:41:Lustre: lustre-OST0001-osc-ffff880037c72c00: Connection restored to lustre-OST0001 (at 10.10.4.199@tcp)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The console log on the OSS (client-32vm4) showed the following:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;17:52:50:Lustre: DEBUG MARKER: mds1 has failed over 38 times, and counting...
17:53:12:LustreError: 167-0: lustre-MDT0000-lwp-OST0000: This client was evicted by lustre-MDT0000; in progress operations using this service will fail.
17:53:12:LustreError: Skipped 3 previous similar messages
17:53:12:Lustre: lustre-MDT0000-lwp-OST0002: Connection restored to lustre-MDT0000 (at 10.10.4.198@tcp)
18:00:36:Lustre: 5788:0:(client.c:1868:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1368234022/real 1368234024]  req@ffff880068743800 x1434661750825740/t0(0) o104-&amp;gt;lustre-OST0005@10.10.4.201@tcp:15/16 lens 296/224 e 0 to 1 dl 1368234033 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1
18:00:36:Lustre: 5788:0:(client.c:1868:ptlrpc_expire_one_request()) Skipped 105 previous similar messages
18:00:36:LustreError: 138-a: lustre-OST0005: A client on nid 10.10.4.201@tcp was evicted due to a lock blocking callback time out: rc -107
18:00:36:LustreError: Skipped 1 previous similar message
18:00:36:LustreError: 9765:0:(ldlm_lockd.c:709:ldlm_handle_ast_error()) ### client (nid 10.10.4.201@tcp) returned 0 from blocking AST ns: filter-lustre-OST0005_UUID lock: ffff880069496c00/0x15300103a13ef60b lrc: 4/0,0 mode: PR/PR res: 13012/0 rrc: 2 type: EXT [0-&amp;gt;18446744073709551615] (req 0-&amp;gt;18446744073709551615) flags: 0x10020 nid: 10.10.4.201@tcp remote: 0x413d78011f8a8246 expref: 156 pid: 9910 timeout: 4329187148 lvb_type: 1
18:00:36:LustreError: 9765:0:(ldlm_lockd.c:709:ldlm_handle_ast_error()) Skipped 17 previous similar messages
18:00:36:LustreError: 9867:0:(ldlm_lib.c:2706:target_bulk_io()) @@@ Eviction on bulk GET  req@ffff880048f78c00 x1434661777625888/t0(0) o4-&amp;gt;c3d0e929-541a-b945-bf7d-9ade70c3a652@10.10.4.201@tcp:0/0 lens 488/448 e 1 to 0 dl 1368234065 ref 1 fl Interpret:/0/0 rc 0/0
18:00:36:Lustre: lustre-OST0005: Bulk IO write error with c3d0e929-541a-b945-bf7d-9ade70c3a652 (at 10.10.4.201@tcp), client will retry: rc -107
18:00:36:LustreError: 7431:0:(ldlm_lockd.c:2348:ldlm_cancel_handler()) ldlm_cancel from 10.10.4.201@tcp arrived at 1368234033 with bad export cookie 1526721388779448426
18:00:36:LustreError: 7431:0:(ldlm_lockd.c:2348:ldlm_cancel_handler()) Skipped 3 previous similar messages
18:00:36:LustreError: 7431:0:(ldlm_lockd.c:2348:ldlm_cancel_handler()) ldlm_cancel from 10.10.4.201@tcp arrived at 1368234034 with bad export cookie 1526721388779448426
18:00:36:LustreError: 7431:0:(ldlm_lockd.c:2348:ldlm_cancel_handler()) Skipped 1 previous similar message
18:00:36:LustreError: 9427:0:(ldlm_lib.c:2706:target_bulk_io()) @@@ Eviction on bulk GET  req@ffff880048f78400 x1434661777625880/t0(0) o4-&amp;gt;c3d0e929-541a-b945-bf7d-9ade70c3a652@10.10.4.201@tcp:0/0 lens 488/448 e 1 to 0 dl 1368234065 ref 1 fl Interpret:/0/0 rc 0/0
18:00:36:Lustre: lustre-OST0004: Bulk IO write error with c3d0e929-541a-b945-bf7d-9ade70c3a652 (at 10.10.4.201@tcp), client will retry: rc -107
18:00:36:LustreError: 138-a: lustre-OST0000: A client on nid 10.10.4.201@tcp was evicted due to a lock blocking callback time out: rc -107
18:00:36:LustreError: Skipped 1 previous similar message
18:00:36:LustreError: 5063:0:(ldlm_lockd.c:709:ldlm_handle_ast_error()) ### client (nid 10.10.4.201@tcp) returned 0 from blocking AST ns: filter-lustre-OST0000_UUID lock: ffff880069496800/0x15300103a13ef619 lrc: 4/0,0 mode: PR/PR res: 13204/0 rrc: 2 type: EXT [0-&amp;gt;18446744073709551615] (req 0-&amp;gt;18446744073709551615) flags: 0x10020 nid: 10.10.4.201@tcp remote: 0x413d78011f8a8270 expref: 178 pid: 9910 timeout: 4329190079 lvb_type: 1
18:00:36:LustreError: 5063:0:(ldlm_lockd.c:709:ldlm_handle_ast_error()) Skipped 13 previous similar messages
18:00:36:LustreError: 9425:0:(ldlm_lib.c:2706:target_bulk_io()) @@@ Eviction on bulk GET  req@ffff88002d861800 x1434661777625848/t0(0) o4-&amp;gt;c3d0e929-541a-b945-bf7d-9ade70c3a652@10.10.4.201@tcp:0/0 lens 488/448 e 1 to 0 dl 1368234065 ref 1 fl Interpret:/0/0 rc 0/0
18:00:36:Lustre: lustre-OST0000: Bulk IO write error with c3d0e929-541a-b945-bf7d-9ade70c3a652 (at 10.10.4.201@tcp), client will retry: rc -107
18:00:36:LustreError: 7431:0:(ldlm_lockd.c:2348:ldlm_cancel_handler()) ldlm_cancel from 10.10.4.201@tcp arrived at 1368234035 with bad export cookie 1526721388779166123
18:00:36:LustreError: 7431:0:(ldlm_lockd.c:2348:ldlm_cancel_handler()) Skipped 1 previous similar message
18:00:36:LustreError: 138-a: lustre-OST0006: A client on nid 10.10.4.201@tcp was evicted due to a lock blocking callback time out: rc -107
18:00:36:LustreError: Skipped 1 previous similar message
18:00:36:LustreError: 9930:0:(ldlm_lockd.c:709:ldlm_handle_ast_error()) ### client (nid 10.10.4.201@tcp) returned 0 from blocking AST ns: filter-lustre-OST0006_UUID lock: ffff88005eee0800/0x15300103a13ef5e1 lrc: 4/0,0 mode: PR/PR res: 12947/0 rrc: 2 type: EXT [0-&amp;gt;18446744073709551615] (req 0-&amp;gt;18446744073709551615) flags: 0x10020 nid: 10.10.4.201@tcp remote: 0x413d78011f8a81c8 expref: 168 pid: 9910 timeout: 4329191220 lvb_type: 1
18:00:36:LustreError: 9930:0:(ldlm_lockd.c:709:ldlm_handle_ast_error()) Skipped 13 previous similar messages
18:00:36:LustreError: 7431:0:(ldlm_lock.c:2433:ldlm_lock_dump_handle()) ### ### ns: filter-lustre-OST0006_UUID lock: ffff880069496a00/0x15300103a13ef612 lrc: 4/0,0 mode: PR/PR res: 12948/0 rrc: 2 type: EXT [0-&amp;gt;18446744073709551615] (req 0-&amp;gt;18446744073709551615) flags: 0x10020 nid: 10.10.4.201@tcp remote: 0x413d78011f8a825b expref: 161 pid: 9910 timeout: 4329188939 lvb_type: 1
18:00:37:LustreError: 9429:0:(ldlm_lib.c:2706:target_bulk_io()) @@@ Eviction on bulk GET  req@ffff880058037800 x1434661777628724/t0(0) o4-&amp;gt;c3d0e929-541a-b945-bf7d-9ade70c3a652@10.10.4.201@tcp:0/0 lens 488/448 e 0 to 0 dl 1368234045 ref 1 fl Interpret:/0/0 rc 0/0
18:00:37:LustreError: 9429:0:(ldlm_lib.c:2706:target_bulk_io()) Skipped 3 previous similar messages
18:00:37:Lustre: lustre-OST0006: Bulk IO write error with c3d0e929-541a-b945-bf7d-9ade70c3a652 (at 10.10.4.201@tcp), client will retry: rc -107
18:00:37:Lustre: Skipped 3 previous similar messages
18:00:37:LustreError: 7431:0:(ldlm_lock.c:2433:ldlm_lock_dump_handle()) ### ### ns: filter-lustre-OST0002_UUID lock: ffff88005eee0200/0x15300103a13ef5f6 lrc: 4/0,0 mode: PR/PR res: 13043/0 rrc: 2 type: EXT [0-&amp;gt;18446744073709551615] (req 0-&amp;gt;18446744073709551615) flags: 0x10020 nid: 10.10.4.201@tcp remote: 0x413d78011f8a8207 expref: 164 pid: 9910 timeout: 4329189806 lvb_type: 1
18:00:38:LustreError: 5064:0:(ldlm_lockd.c:709:ldlm_handle_ast_error()) ### client (nid 10.10.4.201@tcp) returned 0 from blocking AST ns: filter-lustre-OST0001_UUID lock: ffff88005eee0400/0x15300103a13ef5ef lrc: 4/0,0 mode: PR/PR res: 13011/0 rrc: 2 type: EXT [0-&amp;gt;18446744073709551615] (req 0-&amp;gt;18446744073709551615) flags: 0x10020 nid: 10.10.4.201@tcp remote: 0x413d78011f8a81f2 expref: 188 pid: 9942 timeout: 4329191830 lvb_type: 1
18:00:38:LustreError: 5064:0:(ldlm_lockd.c:709:ldlm_handle_ast_error()) Skipped 13 previous similar messages
18:00:38:LustreError: 7431:0:(ldlm_lockd.c:2348:ldlm_cancel_handler()) ldlm_cancel from 10.10.4.201@tcp arrived at 1368234037 with bad export cookie 1526721388779166137
18:00:38:LustreError: 7431:0:(ldlm_lockd.c:2348:ldlm_cancel_handler()) Skipped 6 previous similar messages
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Maloo report: &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/053120d2-bb19-11e2-8824-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/053120d2-bb19-11e2-8824-52540035b04c&lt;/a&gt;&lt;/p&gt;</description>
                <environment>&lt;br/&gt;
Lustre Branch: master&lt;br/&gt;
Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-master/1486&quot;&gt;http://build.whamcloud.com/job/lustre-master/1486&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL6.4/x86_64&lt;br/&gt;
Test Group: failover&lt;br/&gt;
</environment>
        <key id="18910">LU-3325</key>
            <summary>recovery-mds-scale test_failover_mds: tar: Cannot write: Input/output error</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="hongchao.zhang">Hongchao Zhang</assignee>
                                    <reporter username="yujian">Jian Yu</reporter>
                        <labels>
                            <label>LB</label>
                    </labels>
                <created>Mon, 13 May 2013 04:46:50 +0000</created>
                <updated>Tue, 12 Aug 2014 20:39:50 +0000</updated>
                            <resolved>Tue, 12 Aug 2014 20:39:49 +0000</resolved>
                                    <version>Lustre 2.4.0</version>
                    <version>Lustre 2.4.1</version>
                    <version>Lustre 2.6.0</version>
                    <version>Lustre 2.5.1</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>9</watches>
                                                                            <comments>
                            <comment id="58254" author="yujian" created="Mon, 13 May 2013 04:56:10 +0000"  >&lt;p&gt;This issue also occurred on master build #1481 (after running for 12 hours and MDS failed over 48 times):&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/52c5d99a-b8f9-11e2-891d-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/52c5d99a-b8f9-11e2-891d-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="58301" author="keith" created="Mon, 13 May 2013 18:36:11 +0000"  >&lt;p&gt;I see the tests report over 75% failure rates over all the branches.  Do we know that this is a valid test? Under what conditions has this test passed?&lt;/p&gt;</comment>
                            <comment id="58302" author="keith" created="Mon, 13 May 2013 18:42:37 +0000"  >&lt;p&gt;Also as noted in a client log&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;osc.lustre-OST0000-osc-*.ost_server_uuid in FULL state after 0 sec
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Maybe the OSTs get full during this test?  I would think the IO error on a client would be ok if the OST is FULL. &lt;/p&gt;</comment>
                            <comment id="58304" author="adilger" created="Mon, 13 May 2013 18:57:30 +0000"  >&lt;p&gt;Keith, &quot;FULL&quot; is the &lt;em&gt;connection&lt;/em&gt; state, not that the OST is out of space.&lt;/p&gt;

&lt;p&gt;Yu Jian, there are a steady stream of failures for this test in the past.  Could you go through at least the past several weeks of tests and triage the failures.  We&apos;re trying to determine if this is a new type of failure, or just one that has existed a long time that we haven&apos;t noticed before.&lt;/p&gt;</comment>
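
&lt;p&gt;For reference, the connection state and free space can be checked independently. A minimal sketch (mount point illustrative; lctl and lfs are the standard Lustre utilities):&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# Connection state of each OSC import (FULL means fully connected):
lctl get_param osc.*.import | grep -w state
# Actual space usage per OST, to rule out out-of-space errors:
lfs df -h /mnt/lustre
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>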
                            <comment id="58422" author="hongchao.zhang" created="Tue, 14 May 2013 04:06:09 +0000"  >&lt;p&gt;&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/053120d2-bb19-11e2-8824-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/053120d2-bb19-11e2-8824-52540035b04c&lt;/a&gt;, &lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/52c5d99a-b8f9-11e2-891d-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/52c5d99a-b8f9-11e2-891d-52540035b04c&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;the client was evicted from the OST at 18:00 because a lock blocking AST timed out, but the debug logs related to the eviction had already been overwritten&lt;br/&gt;
by later messages on both the client and the OST, so DEBUG_SIZE needs to be increased and the failure reproduced again in order to collect those debug logs.&lt;/p&gt;
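
&lt;p&gt;Enlarging the debug buffer before re-running the test might look like the following sketch (values illustrative; PTLDEBUG and DEBUG_SIZE are the test-framework variables, and the lctl commands are the equivalent runtime settings):&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# In the test environment, before launching recovery-mds-scale:
export PTLDEBUG=-1      # enable all debug message types
export DEBUG_SIZE=256   # debug buffer size, in MB

# Equivalent runtime settings on a live node:
lctl set_param debug=-1
lctl set_param debug_mb=256

# Dump the buffer once the failure reproduces:
lctl dk /tmp/lustre-debug.log
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>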
                            <comment id="58423" author="yujian" created="Tue, 14 May 2013 04:50:37 +0000"  >&lt;blockquote&gt;&lt;p&gt;it is needed to increase DEBUG_SIZE to reproduce it again to collect these debug logs.&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;The patch for gathering debug logs is in &lt;a href=&quot;http://review.whamcloud.com/6013&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/6013&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="58432" author="yujian" created="Tue, 14 May 2013 08:25:14 +0000"  >&lt;blockquote&gt;&lt;p&gt;Yu Jian, there are a steady stream of failures for this test in the past. Could you go through at least the past several weeks of tests and triage the failures. We&apos;re trying to determine if this is a new type of failure, or just one that has existed a long time that we haven&apos;t noticed before.&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;Here are all of the recovery-mds-scale test failover_mds reports on master branch:&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://maloo.whamcloud.com/sub_tests/query?utf8=%E2%9C%93&amp;amp;test_set[test_set_script_id]=a7fa1cd6-5989-11e0-a272-52540025f9af&amp;amp;sub_test[sub_test_script_id]=da5ac8b8-69de-11e1-9d76-5254004bbbd3&amp;amp;sub_test[status]=&amp;amp;sub_test[query_bugs]=&amp;amp;test_session[test_host]=&amp;amp;test_session[test_group]=failover&amp;amp;test_session[user_id]=&amp;amp;test_session[query_date]=&amp;amp;test_session[query_recent_period]=&amp;amp;test_node[os_type_id]=&amp;amp;test_node[distribution_type_id]=&amp;amp;test_node[architecture_type_id]=&amp;amp;test_node[file_system_type_id]=&amp;amp;test_node[lustre_branch_id]=24a6947e-04a9-11e1-bb5f-52540025f9af&amp;amp;test_node_network[network_type_id]=&amp;amp;commit=Update+results&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;recovery-mds-scale test_failover_mds&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The test was not performed on every master build. Before 2013-04-09, the test was not really run due to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2415&quot; title=&quot;recovery-mds-scale test_failover_mds: lustre:MDT0000/recovery_status found no match&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2415&quot;&gt;&lt;del&gt;LU-2415&lt;/del&gt;&lt;/a&gt;. After that, the test hit &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3142&quot; title=&quot;recovery-mds-scale test_failover_mds: dd: writing `/mnt/lustre/d0.dd-client-32vm5.lab.whamcloud.com/dd-file&amp;#39;: Bad file descriptor&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3142&quot;&gt;&lt;del&gt;LU-3142&lt;/del&gt;&lt;/a&gt; twice; then, since 2013-04-19 (master build #1420), the test has been failing with this ticket. So it is possible that the issue has existed for a long time but has been hidden by &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2415&quot; title=&quot;recovery-mds-scale test_failover_mds: lustre:MDT0000/recovery_status found no match&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2415&quot;&gt;&lt;del&gt;LU-2415&lt;/del&gt;&lt;/a&gt; and &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3142&quot; title=&quot;recovery-mds-scale test_failover_mds: dd: writing `/mnt/lustre/d0.dd-client-32vm5.lab.whamcloud.com/dd-file&amp;#39;: Bad file descriptor&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3142&quot;&gt;&lt;del&gt;LU-3142&lt;/del&gt;&lt;/a&gt;. It&apos;s worth mentioning that two test runs passed for 24 hours, on 2013-04-24 (build #1431) and on 2013-05-07 (build #1478) respectively.&lt;/p&gt;</comment>
                            <comment id="58484" author="jlevi" created="Tue, 14 May 2013 18:28:16 +0000"  >&lt;p&gt;Lowering priority from blocker, but still want to continue digging into this issue.&lt;/p&gt;</comment>
                            <comment id="58548" author="yujian" created="Wed, 15 May 2013 09:05:03 +0000"  >&lt;blockquote&gt;&lt;p&gt;The patch for gathering debug logs is in &lt;a href=&quot;http://review.whamcloud.com/6013&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/6013&lt;/a&gt;.&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;Patch Set: 10&lt;br/&gt;
Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-master/1492&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-master/1492&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;recovery-mds-scale test_failover_mds passed for 24 hours (MDS failed over 96 times): &lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sessions/469a819e-bd2b-11e2-a548-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sessions/469a819e-bd2b-11e2-a548-52540035b04c&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Re-triggering the test to try to reproduce the failure.&lt;/p&gt;</comment>
                            <comment id="58654" author="yujian" created="Thu, 16 May 2013 14:09:51 +0000"  >&lt;blockquote&gt;&lt;p&gt;Re-triggering the test to try to reproduce the failure.&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;Patch Set 11 in &lt;a href=&quot;http://review.whamcloud.com/6013&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/6013&lt;/a&gt; :&lt;br/&gt;
Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-master/1495&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-master/1495&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Again, recovery-mds-scale test_failover_mds passed for 24 hours (MDS failed over 96 times):&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sessions/96fed7ee-be24-11e2-be2a-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sessions/96fed7ee-be24-11e2-be2a-52540035b04c&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The issue cannot be reproduced in manually triggered test runs but occurs consistently in autotest failover test group runs.&lt;/p&gt;</comment>
                            <comment id="58829" author="yujian" created="Sat, 18 May 2013 07:27:12 +0000"  >&lt;p&gt;Lustre Tag: v2_4_0_RC1&lt;br/&gt;
Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-master/1501/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-master/1501/&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL6.4/x86_64&lt;/p&gt;

&lt;p&gt;PTLDEBUG=-1&lt;br/&gt;
DEBUG_SIZE=256&lt;/p&gt;

&lt;p&gt;The issue occurred on recovery-mds-scale test_failover_ost:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/147bf416-bf58-11e2-88e0-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/147bf416-bf58-11e2-88e0-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="58856" author="yujian" created="Mon, 20 May 2013 05:48:32 +0000"  >&lt;p&gt;Lustre Tag: v2_4_0_RC1&lt;br/&gt;
Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-master/1501/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-master/1501/&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL6.4/x86_64&lt;/p&gt;

&lt;p&gt;The issue occurred on recovery-mds-scale test_failover_ost again:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/70ecbb58-c00d-11e2-8398-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/70ecbb58-c00d-11e2-8398-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="58860" author="yujian" created="Mon, 20 May 2013 07:56:19 +0000"  >&lt;p&gt;Lustre Tag: v2_4_0_RC1&lt;br/&gt;
Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-master/1501/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-master/1501/&lt;/a&gt;&lt;br/&gt;
Distro/Arch: SLES11SP2/x86_64 (client), RHEL6.4/x86_64 (server)&lt;/p&gt;

&lt;p&gt;The issue occurred on recovery-mds-scale test_failover_mds:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/a43ff822-bf9e-11e2-8398-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/a43ff822-bf9e-11e2-8398-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="59398" author="yujian" created="Tue, 28 May 2013 03:11:12 +0000"  >&lt;p&gt;Lustre Tag: v2_4_0_RC2&lt;br/&gt;
Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_4/12/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b2_4/12/&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL6.4/x86_64&lt;/p&gt;

&lt;p&gt;The issue occurred on recovery-mds-scale test_failover_mds:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/2e35d81c-c6c5-11e2-ae4e-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/2e35d81c-c6c5-11e2-ae4e-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="60797" author="hongchao.zhang" created="Tue, 18 Jun 2013 09:44:50 +0000"  >&lt;p&gt;this issue is similar with &lt;a href=&quot;https://jira.hpdd.intel.com/browse/LU-1499&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://jira.hpdd.intel.com/browse/LU-1499&lt;/a&gt;, which is caused by the time out of lock AST callback.&lt;/p&gt;

&lt;p&gt;does the issue occur again recently? how about running the test with some big timeout value(currently it&apos;s 10) for the load of the test is a little heavy!&lt;/p&gt;</comment>
                            <comment id="60804" author="yujian" created="Tue, 18 Jun 2013 14:03:28 +0000"  >&lt;blockquote&gt;&lt;p&gt;does the issue occur again recently?&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;Yes, it still occurs on the master branch:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/22470fc8-d578-11e2-9a7f-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/22470fc8-d578-11e2-9a7f-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/c9e1b6ce-d065-11e2-a7e3-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/c9e1b6ce-d065-11e2-a7e3-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/d855b29a-cbca-11e2-b831-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/d855b29a-cbca-11e2-b831-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/2a780870-c39c-11e2-a2df-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/2a780870-c39c-11e2-a2df-52540035b04c&lt;/a&gt;&lt;/p&gt;

&lt;blockquote&gt;&lt;p&gt;How about running the test with a bigger timeout value (currently it is 10), since the load of this test is a little heavy?&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;The default value of the TIMEOUT variable in cfg/local.sh is 20. Since the issue can only be reproduced in autotest runs, the TIMEOUT value in the test configuration file used by autotest would need to be changed; however, that would affect all of the test sessions performed by autotest.&lt;/p&gt;
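
&lt;p&gt;For a one-off manual run, the timeout could be raised in the environment instead of in the shared configuration. A minimal sketch (value and invocation illustrative):&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# Override TIMEOUT for this run only, leaving cfg/local.sh untouched:
cd lustre/tests
TIMEOUT=60 ONLY=failover_mds bash recovery-mds-scale.sh
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>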
                            <comment id="60908" author="hongchao.zhang" created="Thu, 20 Jun 2013 11:23:46 +0000"  >&lt;p&gt;according the these occurrences, there is no IO RPC during the time interval between the blocking AST sent and eviction.&lt;br/&gt;
I&apos;ll create a debug patch to collect more info to help trace the issue.&lt;/p&gt;</comment>
                            <comment id="61086" author="hongchao.zhang" created="Mon, 24 Jun 2013 13:32:59 +0000"  >&lt;p&gt;debug patch is tracked at &lt;a href=&quot;http://review.whamcloud.com/#/c/6747/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/6747/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="66718" author="yujian" created="Mon, 16 Sep 2013 13:21:35 +0000"  >&lt;p&gt;Lustre Tag: v2_4_1_RC2&lt;br/&gt;
Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_4/45/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b2_4/45/&lt;/a&gt;&lt;br/&gt;
Distro/Arch: SLES11SP2/x86_64 (client), RHEL6.4/x86_64 (server)&lt;/p&gt;

&lt;p&gt;The issue occurred on recovery-mds-scale test_failover_ost:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/1f5265dc-1dfe-11e3-a184-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/1f5265dc-1dfe-11e3-a184-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="72960" author="yujian" created="Fri, 6 Dec 2013 04:27:06 +0000"  >&lt;p&gt;Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_4/63/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b2_4/63/&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL6.4/x86_64&lt;/p&gt;

&lt;p&gt;The issue occurred on recovery-mds-scale test_failover_ost:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/5592fa00-5ddf-11e3-aed2-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/5592fa00-5ddf-11e3-aed2-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="75166" author="yujian" created="Fri, 17 Jan 2014 09:15:25 +0000"  >&lt;p&gt;Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_5/13/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b2_5/13/&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL6.4/x86_64&lt;/p&gt;

&lt;p&gt;The issue occurred on recovery-mds-scale test_failover_mds:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/b5aaac10-7f3d-11e3-94f3-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/b5aaac10-7f3d-11e3-94f3-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="76444" author="yujian" created="Fri, 7 Feb 2014 08:19:54 +0000"  >&lt;p&gt;More instance on Lustre b2_5 branch:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/c06adfbc-8f32-11e3-b8e1-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/c06adfbc-8f32-11e3-b8e1-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/7d4fc910-956b-11e3-936f-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/7d4fc910-956b-11e3-936f-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="78827" author="yujian" created="Sun, 9 Mar 2014 07:27:17 +0000"  >&lt;p&gt;Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_5/40/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b2_5/40/&lt;/a&gt; (2.5.1 RC2)&lt;br/&gt;
Distro/Arch: RHEL6.5/x86_64&lt;br/&gt;
Test Group: failover&lt;/p&gt;

&lt;p&gt;The same failure occurred:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/4815f934-a71f-11e3-aad0-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/4815f934-a71f-11e3-aad0-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="80675" author="yujian" created="Tue, 1 Apr 2014 08:36:53 +0000"  >&lt;p&gt;This is blocking recovery-mds-scale testing.&lt;/p&gt;</comment>
                            <comment id="85695" author="hongchao.zhang" created="Wed, 4 Jun 2014 13:43:39 +0000"  >&lt;p&gt;Okay, will check it along with &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4621&quot; title=&quot;recovery-mds-scale: test_failover_ost&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4621&quot;&gt;&lt;del&gt;LU-4621&lt;/del&gt;&lt;/a&gt;, which is similar with this one.&lt;/p&gt;</comment>
                            <comment id="91469" author="pjones" created="Tue, 12 Aug 2014 20:39:49 +0000"  >&lt;p&gt;As per Yu, Jian this ticket can be closed as a duplicate of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4621&quot; title=&quot;recovery-mds-scale: test_failover_ost&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4621&quot;&gt;&lt;del&gt;LU-4621&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="23130">LU-4621</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                    <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                        </customfieldvalues>
                    </customfield>
                    <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvqp3:</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                    <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>8209</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                    <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                </customfields>
    </item>
</channel>
</rss>