<?xml version="1.0" encoding="UTF-8"?>
<!--
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:50:24 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
<language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-5312] sanity test_161a: cannot create regular file &apos;...f.sanity.161a&apos;: Input/output error</title>
                <link>https://jira.whamcloud.com/browse/LU-5312</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This issue was created by maloo for John Hammond &amp;lt;john.hammond@intel.com&amp;gt;&lt;/p&gt;

&lt;p&gt;This issue relates to the following test suite run: &lt;a href=&quot;http://maloo.whamcloud.com/test_sets/ba2bfa4a-6e59-11e3-bae0-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://maloo.whamcloud.com/test_sets/ba2bfa4a-6e59-11e3-bae0-52540035b04c&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;The sub-test test_161a failed with the following error:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;failed to hardlink many files&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;Info required for matching: sanity 161a&lt;/p&gt;

</description>
                <environment></environment>
        <key id="25503">LU-5312</key>
            <summary>sanity test_161a: cannot create regular file &apos;...f.sanity.161a&apos;: Input/output error</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="di.wang">Di Wang</assignee>
                                    <reporter username="maloo">Maloo</reporter>
                        <labels>
                            <label>dne</label>
                    </labels>
                <created>Wed, 9 Jul 2014 17:52:57 +0000</created>
                <updated>Tue, 8 Sep 2015 17:26:26 +0000</updated>
                            <resolved>Mon, 1 Dec 2014 18:22:22 +0000</resolved>
                                    <version>Lustre 2.6.0</version>
                                    <fixVersion>Lustre 2.7.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>5</watches>
                                                                            <comments>
                            <comment id="88625" author="adilger" created="Wed, 9 Jul 2014 17:55:42 +0000"  >&lt;p&gt;This appears to have the same symptoms as &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4420&quot; title=&quot;sanity test_161a: cannot create regular file &amp;#39;...f.sanity.161a&amp;#39;: Input/output error&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4420&quot;&gt;&lt;del&gt;LU-4420&lt;/del&gt;&lt;/a&gt;, but we&apos;re moving it to a separate bug because &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4420&quot; title=&quot;sanity test_161a: cannot create regular file &amp;#39;...f.sanity.161a&amp;#39;: Input/output error&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4420&quot;&gt;&lt;del&gt;LU-4420&lt;/del&gt;&lt;/a&gt; had a fix landed for 2.6.0 and it isn&apos;t clear if this new failure is the same root cause or just the same failure symptoms.  There haven&apos;t been any other failures of this test in the past 4 weeks, while &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4420&quot; title=&quot;sanity test_161a: cannot create regular file &amp;#39;...f.sanity.161a&amp;#39;: Input/output error&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4420&quot;&gt;&lt;del&gt;LU-4420&lt;/del&gt;&lt;/a&gt; failed regularly.&lt;/p&gt;</comment>
                            <comment id="88664" author="di.wang" created="Wed, 9 Jul 2014 23:08:56 +0000"  >&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/11039&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/11039&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="91521" author="green" created="Wed, 13 Aug 2014 13:25:00 +0000"  >&lt;p&gt;Apparently this is still being hit from time to time:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/9c3f9cea-22d0-11e4-94dd-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/9c3f9cea-22d0-11e4-94dd-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="91592" author="di.wang" created="Thu, 14 Aug 2014 03:36:47 +0000"  >&lt;p&gt;This is actually a different problem; it seems related to OSP instead of LWP&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000100:00080000:1.0:1407905310.504717:0:3659:0:(import.c:1004:ptlrpc_connect_interpret()) @@@ lustre-MDT0000-osp-MDT0001: evicting (reconnect/recover flags not set: 4)  req@ffff88007c0c2800 x1476289415644816/t0(0) o38-&amp;gt;lustre-MDT0000-osp-MDT0001@10.2.4.185@tcp:24/4 lens 400/264 e 0 to 0 dl 1407905346 ref 1 fl Interpret:RN/0/0 rc 0/0
00000100:00080000:1.0:1407905310.504720:0:3659:0:(import.c:1007:ptlrpc_connect_interpret()) ffff880079bdf800 lustre-MDT0000_UUID: changing import state from CONNECTING to EVICTED
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;So it seems MDT1 is being evicted by MDT0 (which is being restarted during the test) &lt;/p&gt;

&lt;p&gt;According to the debug log &lt;/p&gt;

&lt;p&gt;1. MDT1 is in the final stage of recovery at 1407570696, so it sends the final PING to MDT0&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000100:00080000:1.0:1407570696.905010:0:3680:0:(import.c:1420:ptlrpc_import_recovery_state_machine()) ffff880067730800 lustre-MDT0000_UUID: changing import state from REPLAY to REPLAY_LOCKS
00010000:00010000:1.0:1407570696.905011:0:3680:0:(ldlm_request.c:2278:ldlm_cancel_unused_locks_for_replay()) Dropping as many unused locks as possible beforereplay for namespace lustre-MDT0000-osp-MDT0001 (0)
00010000:00010000:1.0:1407570696.905012:0:3680:0:(ldlm_request.c:2287:ldlm_cancel_unused_locks_for_replay()) Canceled 0 unused locks from namespace lustre-MDT0000-osp-MDT0001
00000100:00080000:1.0:1407570696.905287:0:3680:0:(import.c:1430:ptlrpc_import_recovery_state_machine()) ffff880067730800 lustre-MDT0000_UUID: changing import state from REPLAY_LOCKS to REPLAY_WAIT
00000100:00100000:1.0:1407570696.905292:0:3680:0:(client.c:1863:ptlrpc_check_set()) Completed RPC pname:cluuid:pid:xid:nid:opc ptlrpcd_rcv:lustre-MDT0001-mdtlov_UUID:3680:1475938571525140:10.2.4.194@tcp:38
00000100:00100000:1.0:1407570696.905297:0:3680:0:(client.c:1480:ptlrpc_send_new_req()) Sending RPC pname:cluuid:pid:xid:nid:opc ptlrpcd_rcv:lustre-MDT0002-mdtlov_UUID:3680:1475938571525152:10.2.4.194@tcp:400
00000100:00100000:1.0:1407570696.905318:0:3680:0:(client.c:1480:ptlrpc_send_new_req()) Sending RPC pname:cluuid:pid:xid:nid:opc ptlrpcd_rcv:lustre-MDT0001-mdtlov_UUID:3680:1475938571525156:10.2.4.194@tcp:400
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;2. MDT0 queues the ping at the same time, 1407570696&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000100:00100000:0.0:1407570696.907586:0:710:0:(service.c:2092:ptlrpc_server_handle_request()) Handling RPC pname:cluuid+ref:pid:xid:nid:opc mdt_out00_001:lustre-MDT0001-mdtlov_UUID+4:3680:x1475938571525156:12345-10.2.4.198@tcp:400
00010000:00080000:0.0:1407570696.907589:0:710:0:(ldlm_lib.c:2226:target_queue_recovery_request()) @@@ queue final req  req@ffff880066ddf000 x1475938571525156/t0(0) o400-&amp;gt;lustre-MDT0001-mdtlov_UUID@10.2.4.198@tcp:0/0 lens 224/0 e 0 to 0 dl 1407570702 ref 2 fl Interpret:/c0/ffffffff rc 0/-1
00000100:00100000:0.0:1407570696.907593:0:710:0:(service.c:2142:ptlrpc_server_handle_request()) Handled RPC pname:cluuid+ref:pid:xid:nid:opc mdt_out00_001:lustre-MDT0001-mdtlov_UUID+4:3680:x1475938571525156:12345-10.2.4.198@tcp:400 Request procesed in 7us (51us total) trans 0 rc 0/-999
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;3. MDT0 processes the final ping req 24 seconds later, and it should reply to MDT1 with &quot;RECOVERY complete&quot; to indicate that the recovery is done.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00010000:00080000:1.0:1407570720.856384:0:747:0:(ldlm_lib.c:2061:target_recovery_thread()) @@@ processing final ping from 10.2.4.198@tcp:   req@ffff880066ddf000 x1475938571525156/t0(0) o400-&amp;gt;lustre-MDT0001-mdtlov_UUID@10.2.4.198@tcp:0/0 lens 224/0 e 710876 to 0 dl 1407570712 ref 1 fl Complete:/c0/ffffffff rc 0/-1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;4. For some unknown reason, MDT1 gets the &quot;RECOVERY complete&quot; reply from MDT0 after 20 seconds&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000100:00080000:1.0:1407570744.024908:0:3680:0:(import.c:1439:ptlrpc_import_recovery_state_machine()) ffff880067730800 lustre-MDT0000_UUID: changing import state from REPLAY_WAIT to RECOVER
00000100:00080000:1.0:1407570744.024910:0:3680:0:(import.c:1446:ptlrpc_import_recovery_state_machine()) reconnected to lustre-MDT0000_UUID@10.2.4.194@tcp
00000100:00080000:1.0:1407570744.024911:0:3680:0:(client.c:2578:ptlrpc_resend_req()) @@@ going to resend  req@ffff88005c8f3400 x1475938571525156/t0(0) o400-&amp;gt;lustre-MDT0000-osp-MDT0001@10.2.4.194@tcp:24/4 lens 224/192 e 710876 to 0 dl 1407570754 ref 1 fl Interpret:R/c0/0 rc 0/0
00000100:00080000:1.0:1407570744.024914:0:3680:0:(client.c:2585:ptlrpc_resend_req()) @@@ it has reply, so skip it  req@ffff88005c8f3400 x1475938571525156/t0(0) o400-&amp;gt;lustre-MDT0000-osp-MDT0001@10.2.4.194@tcp:24/4 lens 224/192 e 710876 to 0 dl 1407570754 ref 1 fl Interpret:R/c0/0 rc 0/0
00000100:00080000:1.0:1407570744.024916:0:3680:0:(import.c:1451:ptlrpc_import_recovery_state_machine()) ffff880067730800 lustre-MDT0000_UUID: changing import state from RECOVER to FULL
00000100:02000000:1.0:1407570744.024918:0:3680:0:(import.c:1459:ptlrpc_import_recovery_state_machine()) lustre-MDT0000-osp-MDT0001: Connection restored to lustre-MDT0000 (at 10.2.4.194@tcp)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;5. In the meantime, the ping_evictor on MDT0 evicts the export from MDT1, because MDT1 cannot ping MDT0 during the recovery stage, i.e. the import state is not FULL&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000100:00080000:0.0:1407570744.291927:0:700:0:(pinger.c:641:ping_evictor_main()) evicting all exports of obd lustre-MDT0000 older than 1407570714
00000100:02000400:0.0:1407570744.291936:0:700:0:(pinger.c:667:ping_evictor_main()) lustre-MDT0000: haven&apos;t heard from client lustre-MDT0001-mdtlov_UUID (at 10.2.4.198@tcp) in 49 seconds. I think it&apos;s dead, and I am evicting it. exp ffff88007a9cec00, cur 1407570744 expire 1407570714 last 1407570695
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I am not sure why we see this now, probably because of some recent changes; I have not dug in yet. I think the way to fix this might be to update exp_last_request_time in stage 3, because the client cannot ping the server while it is waiting for the &quot;final recovery&quot; signal.  So we should refresh exp_last_request_time once the server is ready to accept pings and other requests.&lt;/p&gt;</comment>
                            <comment id="91593" author="di.wang" created="Thu, 14 Aug 2014 04:01:20 +0000"  >&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/11443&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/11443&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="100327" author="jlevi" created="Mon, 1 Dec 2014 18:22:22 +0000"  >&lt;p&gt;Patches landed to Master.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="22590">LU-4420</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="31981">LU-7115</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzwqzz:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>14840</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>