<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:42:34 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary, append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-4420] sanity test_161a: cannot create regular file &apos;...f.sanity.161a&apos;: Input/output error</title>
                <link>https://jira.whamcloud.com/browse/LU-4420</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This issue was created by maloo for John Hammond &amp;lt;john.hammond@intel.com&amp;gt;&lt;/p&gt;

&lt;p&gt;This issue relates to the following test suite run: &lt;a href=&quot;http://maloo.whamcloud.com/test_sets/ba2bfa4a-6e59-11e3-bae0-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://maloo.whamcloud.com/test_sets/ba2bfa4a-6e59-11e3-bae0-52540035b04c&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;The sub-test test_161a failed with the following error:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;failed to hardlink many files&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;Info required for matching: sanity 161a&lt;/p&gt;

</description>
                <environment></environment>
        <key id="22590">LU-4420</key>
            <summary>sanity test_161a: cannot create regular file &apos;...f.sanity.161a&apos;: Input/output error</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="di.wang">Di Wang</assignee>
                                    <reporter username="maloo">Maloo</reporter>
                        <labels>
                            <label>dne</label>
                    </labels>
                <created>Mon, 30 Dec 2013 15:38:25 +0000</created>
                <updated>Wed, 9 Jul 2014 17:55:42 +0000</updated>
                            <resolved>Wed, 9 Jul 2014 17:53:48 +0000</resolved>
                                    <version>Lustre 2.6.0</version>
                                    <fixVersion>Lustre 2.6.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>9</watches>
                                                                            <comments>
                            <comment id="74159" author="jhammond" created="Mon, 30 Dec 2013 15:40:41 +0000"  >&lt;p&gt;This has been seen in some review-dne runs.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;20:43:40:Lustre: DEBUG MARKER: == sanity test 161a: link ea sanity == 20:43:26 (1388033006)
20:43:40:LustreError: 167-0: lustre-MDT0000-lwp-MDT0001: This client was evicted by lustre-MDT0000; in progress operations using this service will fail.
20:43:40:LustreError: Skipped 2 previous similar messages
20:43:40:LustreError: 12222:0:(lod_lov.c:556:lod_generate_and_set_lovea()) lustre-MDT0001-mdtlov: Can not locate [0x2c0000402:0x142:0x0]: rc = -5
20:43:40:Lustre: lustre-MDT0000-lwp-MDT0002: Connection restored to lustre-MDT0000 (at 10.10.4.164@tcp)
20:43:40:Lustre: Skipped 2 previous similar messages
20:43:40:Lustre: DEBUG MARKER: /usr/sbin/lctl mark  sanity test_161a: @@@@@@ IGNORE \(bz17935\): bad link ea 
20:43:40:Lustre: DEBUG MARKER: sanity test_161a: @@@@@@ IGNORE (bz17935): bad link ea
20:43:40:Lustre: DEBUG MARKER: /usr/sbin/lctl dk &amp;gt; /logdir/test_logs/2013-12-25/lustre-reviews-el6-x86_64--review-dne--1_3_1__20485__-70341555671180-185246/sanity.test_161a.debug_log.$(hostname -s).1388033009.log;
20:43:41:         dmesg &amp;gt; /logdir/test_logs/2013-12-25/lustre-reviews-el6-x86_64-
20:43:41:Lustre: DEBUG MARKER: /usr/sbin/lctl mark  sanity test_161a: @@@@@@ IGNORE \(bz17935\): bad link rename 
20:43:41:Lustre: DEBUG MARKER: sanity test_161a: @@@@@@ IGNORE (bz17935): bad link rename
20:43:41:Lustre: DEBUG MARKER: /usr/sbin/lctl dk &amp;gt; /logdir/test_logs/2013-12-25/lustre-reviews-el6-x86_64--review-dne--1_3_1__20485__-70341555671180-185246/sanity.test_161a.debug_log.$(hostname -s).1388033015.log;
20:43:41:         dmesg &amp;gt; /logdir/test_logs/2013-12-25/lustre-reviews-el6-x86_64-
20:43:41:Lustre: DEBUG MARKER: /usr/sbin/lctl mark  sanity test_161a: @@@@@@ FAIL: failed to hardlink many files 
20:43:41:Lustre: DEBUG MARKER: sanity test_161a: @@@@@@ FAIL: failed to hardlink many files
20:43:41:Lustre: DEBUG MARKER: /usr/sbin/lctl dk &amp;gt; /logdir/test_logs/2013-12-25/lustre-reviews-el6-x86_64--review-dne--1_3_1__20485__-70341555671180-185246/sanity.test_161a.debug_log.$(hostname -s).1388033016.log;
20:43:41:         dmesg &amp;gt; /logdir/test_logs/2013-12-25/lustre-reviews-el6-x86_64-
20:43:41:Lustre: DEBUG MARKER: /usr/sbin/lctl mark == sanity test 161b: link ea sanity under remote directory == 20:43:37 \(1388033017\)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="74160" author="jhammond" created="Mon, 30 Dec 2013 15:41:16 +0000"  >&lt;p&gt;Another instance at &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/a4f2742e-702f-11e3-a8cf-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/a4f2742e-702f-11e3-a8cf-52540035b04c&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="74406" author="adilger" created="Mon, 6 Jan 2014 17:00:05 +0000"  >&lt;p&gt;One possibility here is that with many hard links the linkEA size is growing increasingly large and slowing down the creation rate enough to eventually cause a timeout.  That would only be the case if the large xattr (xattr_inode) feature was enabled. Otherwise the linkEA size is limited at around 4000 bytes (maybe 150 links depending on the filename length). &lt;/p&gt;</comment>
                            <comment id="74586" author="adilger" created="Wed, 8 Jan 2014 18:48:34 +0000"  >&lt;p&gt;I think my theory is invalid, since I don&apos;t think that we enable the large_xattr feature by default.&lt;/p&gt;</comment>
                            <comment id="74589" author="adilger" created="Wed, 8 Jan 2014 19:08:19 +0000"  >&lt;p&gt;It looks like this test is only failing on review-dne test runs, and it always starts with:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;mkdir 1 for /mnt/lustre/d0.sanity/d161
cp: cannot create regular file `/mnt/lustre/d0.sanity/d161/f.sanity.161a&apos;: Input/output error
ln: accessing `/mnt/lustre/d0.sanity/d161/f.sanity.161a&apos;: No such file or directory
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;On the MDS it reports:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;09:48:57:Lustre: DEBUG MARKER: == sanity test 161a: link ea sanity ==
LustreError: 167-0: lustre-MDT0000-lwp-MDT0001: This client was evicted by lustre-MDT0000; in progress operations using this service will fail.
LustreError: Skipped 2 previous similar messages
LustreError: 12146:0:(lod_lov.c:582:lod_generate_and_set_lovea()) lustre-MDT0001-mdtlov: Can not locate [0x280000402:0x142:0x0]: rc = -5
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="74591" author="adilger" created="Wed, 8 Jan 2014 19:53:20 +0000"  >&lt;p&gt;Looking a bit at the debug log on MDS2 (&lt;a href=&quot;https://maloo.whamcloud.com/test_logs/2f514104-6ed7-11e3-ad93-52540035b04c/show_text&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_logs/2f514104-6ed7-11e3-ad93-52540035b04c/show_text&lt;/a&gt;), it seems that setting up the LWP connections between the MDTs times out.   It looks like test_160a() is doing an unmount and remount of mds1 at time 1388080102, which is why the other MDTs are trying to re-establish their LWP connections to it.&lt;/p&gt;

&lt;p&gt;The good news is that we detect the disconnection within a fraction of a second and try to reconnect:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000020:02000400:1.0:1388080102.248396:0:16797:0:(obd_mount_server.c:1530:server_put_super()) server umount lustre-MDT0000 complete
00000100:00080000:0.0:1388080102.480343:0:3730:0:(pinger.c:222:ptlrpc_pinger_process_import()) lustre-MDT0000-lwp-MDT0001_UUID-&amp;gt;lustre-MDT0000_UUID: level DISCONN/3 force 0 force_next 0 deactive 0 pingable 1 suppress 0
00000100:00080000:0.0:1388080102.480350:0:3730:0:(recover.c:69:ptlrpc_initiate_recovery()) lustre-MDT0000_UUID: starting recovery
00000100:00080000:0.0:1388080102.480353:0:3730:0:(import.c:635:ptlrpc_connect_import()) ffff88005fc56000 lustre-MDT0000_UUID: changing import state from DISCONN to CONNECTING
00000100:00080000:0.0:1388080102.480360:0:3730:0:(import.c:480:import_select_connection()) lustre-MDT0000-lwp-MDT0001: connect to NID 10.10.19.11@tcp last attempt 4299742283
00000100:00080000:0.0:1388080102.480365:0:3730:0:(import.c:524:import_select_connection()) lustre-MDT0000-lwp-MDT0001: tried all connections, increasing latency to 6s
00000100:00080000:0.0:1388080102.480380:0:3730:0:(import.c:558:import_select_connection()) lustre-MDT0000-lwp-MDT0001: import ffff88005fc56000 using connection 10.10.19.11@tcp/10.10.19.11@tcp
00000100:00100000:0.0:1388080102.480403:0:3730:0:(import.c:727:ptlrpc_connect_import()) @@@ (re)connect request (timeout 5)  req@ffff880079428c00 x1455502439740708/t0(0) o38-&amp;gt;lustre-MDT0000-lwp-MDT0001@10.10.19.11@tcp:12/10 lens 400/544 e 0 to 0 dl 0 ref 1 fl New:N/0/ffffffff rc 0/-1
00000100:00100000:1.0:1388080102.480472:0:3727:0:(client.c:1473:ptlrpc_send_new_req()) Sending RPC pname:cluuid:pid:xid:nid:opc ptlrpcd_rcv:lustre-MDT0000-lwp-MDT0001_UUID:3727:1455502439740708:10.10.19.11@tcp:38
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The bad news is that the LWP reconnect RPCs are sent in the second before the mds1 service threads have started:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000001:02000400:0.0:1388080103.085625:0:16965:0:(debug.c:339:libcfs_debug_mark_buffer()) DEBUG MARKER: mkdir -p /mnt/mds1; mount -t lustre -o user_xattr,acl /dev/lvm-Role_MDS/P1 /mnt/mds1
00000100:00100000:1.0:1388080103.315529:0:16992:0:(service.c:2877:ptlrpc_start_thread()) mdt[0] started 0 min 3 max 80
00000100:00100000:1.0:1388080103.315533:0:16992:0:(service.c:2934:ptlrpc_start_thread()) starting thread &apos;mdt00_000&apos;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;In the next 10s while test_160a() and test_160b() are running, the LWP reconnection RPCs time out, right at the start of test_161a():&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000001:02000400:1.0:1388080113.446804:0:18924:0:(debug.c:339:libcfs_debug_mark_buffer()) DEBUG MARKER: /usr/sbin/lctl mark == sanity test 161a: link ea sanity == 09:48:33 \(1388080113\)
00000100:00000400:1.0:1388080113.532523:0:3727:0:(client.c:1903:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1388080102/real 1388080102]  req@ffff880079428c00 x1455502439740708/t0(0) o38-&amp;gt;lustre-MDT0000-lwp-MDT0001@10.10.19.11@tcp:12/10 lens 400/544 e 0 to 1 dl 1388080113 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1
00000100:00100000:1.0:1388080113.532534:0:3727:0:(client.c:1932:ptlrpc_expire_one_request()) @@@ err -110, sent_state=CONNECTING (now=CONNECTING)  req@ffff880079428c00 x1455502439740708/t0(0) o38-&amp;gt;lustre-MDT0000-lwp-MDT0001@10.10.19.11@tcp:12/10 lens 400/544 e 0 to 1 dl 1388080113 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1
00000100:00080000:1.0:1388080113.532563:0:3727:0:(import.c:1153:ptlrpc_connect_interpret()) ffff88005ffe7800 lustre-MDT0000_UUID: changing import state from CONNECTING to DISCONN
00000100:00080000:1.0:1388080113.532566:0:3727:0:(import.c:1199:ptlrpc_connect_interpret()) recovery of lustre-MDT0000_UUID on 10.10.19.11@tcp failed (-110)
00000100:00100000:1.0:1388080113.532578:0:3727:0:(client.c:1840:ptlrpc_check_set()) Completed RPC pname:cluuid:pid:xid:nid:opc ptlrpcd_rcv:lustre-MDT0000-lwp-MDT0001_UUID:3727:1455502439740708:10.10.19.11@tcp:38
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;It timed out the RPC after only 11s, which is expected based on the AT limit. I guess LWP connections do not retry, so they are evicted from MDT0000 and it returns an error to the client, which is bad.  This is probably hidden most of the time because the MDS2 pinger does not detect the problem right away and by the time it reconnects MDS1 has restarted.  Also, with the length of test_160a() and test_160b() there is probably enough time for MDS2 to retry the connection before the start of test_161a().&lt;/p&gt;
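
&lt;p&gt;For reference, the timeline from the log excerpts above:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;1388080102.248  mds1 umount complete (server_put_super)
1388080102.480  MDT0001 sends LWP (re)connect RPC, timeout 5, deadline 1388080113
1388080103.315  mds1 remounted, MDT service threads starting
1388080113.532  connect RPC expires: 1388080113 - 1388080102 = 11s&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;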

&lt;p&gt;I suspect that this would be visible with almost any kind of DNE recovery if the master is failed when the slaves try to reconnect.&lt;/p&gt;</comment>
                            <comment id="76336" author="yong.fan" created="Thu, 6 Feb 2014 13:45:59 +0000"  >&lt;p&gt;Another failure instances:&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/c7f00bb2-8f15-11e3-a00c-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/c7f00bb2-8f15-11e3-a00c-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/862e37dc-8efa-11e3-a421-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/862e37dc-8efa-11e3-a421-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="77154" author="adilger" created="Sat, 15 Feb 2014 18:09:24 +0000"  >&lt;p&gt;Still seeing this despite &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4571&quot; title=&quot;sanity test_17n: create remote dir error 0&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4571&quot;&gt;&lt;del&gt;LU-4571&lt;/del&gt;&lt;/a&gt; being landed. I thought they were the same, but I guess not quite. &lt;/p&gt;</comment>
                            <comment id="77535" author="di.wang" created="Thu, 20 Feb 2014 21:54:32 +0000"  >&lt;p&gt;I checked the failure, and it is the same reason as &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4571&quot; title=&quot;sanity test_17n: create remote dir error 0&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4571&quot;&gt;&lt;del&gt;LU-4571&lt;/del&gt;&lt;/a&gt; (&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/c7f00bb2-8f15-11e3-a00c-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/c7f00bb2-8f15-11e3-a00c-52540035b04c&lt;/a&gt;)&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000100:00100000:0.0:1391624530.460496:0:852:0:(client.c:2876:ptlrpc_abort_inflight()) @@@ aborting waiting req  req@ffff88006b246000 x1459217185935836/t0(0) o900-&amp;gt;lustre-MDT0000-lwp-MDT0001@10.10.16.116@tcp:29/10 lens 264/248 e 0 to 0 dl 0 ref 2 fl Rpc:W/0/ffffffff rc 0/-1
00000100:00100000:0.0:1391624530.460506:0:852:0:(client.c:2489:ptlrpc_free_committed()) lustre-MDT0000-lwp-MDT0001: committing for last_committed 0 gen 3
00000100:00100000:0.0:1391624530.460509:0:852:0:(import.c:306:ptlrpc_invalidate_import()) Sleeping 20 sec for inflight to error out
00000100:00100000:0.0:1391624530.460518:0:27426:0:(client.c:1844:ptlrpc_check_set()) Completed RPC pname:cluuid:pid:xid:nid:opc mdt00_003:lustre-MDT0000-lwp-MDT0001_UUID:27426:1459217185935836:10.10.16.116@tcp:900
00000004:00020000:0.0:1391624530.460535:0:27426:0:(lod_lov.c:583:lod_generate_and_set_lovea()) lustre-MDT0001-mdtlov: Can not locate [0x340000402:0x162:0x0]: rc = -5
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;So the LWP connection is evicted because of the restart, and the request (fld_lookup) is supposed to be retried with the fix from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4571&quot; title=&quot;sanity test_17n: create remote dir error 0&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4571&quot;&gt;&lt;del&gt;LU-4571&lt;/del&gt;&lt;/a&gt;. But it was not. Hmm, something does not make sense here; see the request message:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000004:00080000:1.0:1391624529.895797:0:27426:0:(osp_object.c:1034:osp_object_create()) lustre-OST0005-osc-MDT0001: Wrote last used FID: [0x340000402:0x162:0x0], index 5: 0
00000100:00080000:1.0:1391624529.895853:0:27426:0:(client.c:1437:ptlrpc_send_new_req()) @@@ req from PID 27426 waiting for recovery: (FULL != DISCONN)  req@ffff88006b246000 x1459217185935836/t0(0) o900-&amp;gt;lustre-MDT0000-lwp-MDT0001@10.10.16.116@tcp:29/10 lens 264/248 e 0 to 0 dl 0 ref 2 fl Rpc:W/0/ffffffff rc 0/-1
00000100:00100000:1.0:1391624529.895941:0:27426:0:(client.c:2127:ptlrpc_set_wait()) set ffff880079ad8080 going to sleep for 0 seconds
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;So in ptlrpc_send_new_req(), ptlrpc_import_delay_req() should not return 1 if the request has the no_delay flag set (see the fix in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4571&quot; title=&quot;sanity test_17n: create remote dir error 0&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4571&quot;&gt;&lt;del&gt;LU-4571&lt;/del&gt;&lt;/a&gt;); instead it should return 0 and set rc to -EWOULDBLOCK. But according to the debug log that is not happening. So I suspect the fix from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4571&quot; title=&quot;sanity test_17n: create remote dir error 0&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4571&quot;&gt;&lt;del&gt;LU-4571&lt;/del&gt;&lt;/a&gt; was not included in this test. Fan Yong, could you please confirm that? Thanks.&lt;/p&gt;
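
&lt;p&gt;For clarity, here is a minimal sketch (hypothetical stand-in types, not the actual Lustre source) of the no_delay behaviour the LU-4571 fix should give:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;#include &amp;lt;errno.h&amp;gt;

/* Hypothetical stand-ins for the import/request state involved. */
struct sketch_import  { int imp_state; };
struct sketch_request { int rq_send_state; int rq_no_delay; };

/* Returns 1 to delay the request until recovery completes, or 0 to
 * proceed immediately with *status holding any error. */
int import_delay_req_sketch(struct sketch_import *imp,
                            struct sketch_request *req, int *status)
{
        *status = 0;
        if (req-&amp;gt;rq_send_state != imp-&amp;gt;imp_state) {
                if (req-&amp;gt;rq_no_delay) {
                        /* no_delay requests (e.g. fld_lookup) must not
                         * park behind recovery: fail fast so the caller
                         * can retry. */
                        *status = -EWOULDBLOCK;
                        return 0;
                }
                return 1;   /* matches &quot;waiting for recovery&quot; above */
        }
        return 0;
}&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;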


</comment>
                            <comment id="77549" author="di.wang" created="Thu, 20 Feb 2014 23:01:26 +0000"  >&lt;p&gt;It is not as same as &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4658&quot; title=&quot;Test failure on test suite replay-single, subtest test_80a creation failed &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4658&quot;&gt;&lt;del&gt;LU-4658&lt;/del&gt;&lt;/a&gt; according to the debug log, but they are related.&lt;/p&gt;</comment>
                            <comment id="79549" author="di.wang" created="Tue, 18 Mar 2014 02:41:49 +0000"  >&lt;p&gt;I believe the problem should go away after fixes of 4571 and 4658 are landed.&lt;/p&gt;</comment>
                            <comment id="79550" author="di.wang" created="Tue, 18 Mar 2014 02:42:35 +0000"  >&lt;p&gt;Please reopen the ticket, if the problem happens again. &lt;/p&gt;</comment>
                            <comment id="82671" author="utopiabound" created="Mon, 28 Apr 2014 19:01:47 +0000"  >&lt;p&gt;I believe I have another instance of this issue:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/7a1ad8c4-ccda-11e3-89e3-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/7a1ad8c4-ccda-11e3-89e3-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="82973" author="di.wang" created="Thu, 1 May 2014 07:11:28 +0000"  >&lt;p&gt;Just checked the log, it seems we need check ESHUTDOWN in fld_client_rpc&lt;/p&gt;

&lt;p&gt;MDS debug log&lt;/p&gt;
{noforrmat}
&lt;p&gt;00000100:00100000:1.0:1398451628.656229:0:17304:0:(import.c:306:ptlrpc_invalidate_import()) Sleeping 20 sec for inflight to error out&lt;br/&gt;
00000100:00100000:0.0:1398451628.656243:0:23953:0:(client.c:2132:ptlrpc_set_wait()) set ffff88005d0ac180 going to sleep for 0 seconds&lt;br/&gt;
00000100:00100000:0.0:1398451628.656245:0:23953:0:(client.c:1849:ptlrpc_check_set()) Completed RPC pname:cluuid:pid:xid:nid:opc mdt00_002:lustre-MDT0000-lwp-MDT0001_UUID:23953:1466376860869008:10.1.4.105@tcp:900&lt;br/&gt;
00000100:00100000:0.0:1398451628.656304:0:23953:0:(service.c:2132:ptlrpc_server_handle_request()) Handled RPC pname:cluuid+ref:pid:xid:nid:opc mdt00_002:6901be4b-4d6f-320d-7490-3e18287b4afe+17:29401:x1466376766328092:12345-10.1.4.103@tcp:36 Request procesed in 148561us (148730us total) trans 0 rc -108/-108&lt;br/&gt;
00000100:00100000:0.0:1398451628.656309:0:23953:0:(nrs_fifo.c:244:nrs_fifo_req_stop()) NRS stop fifo request from 12345-10.1.4.103@tcp, seq: 393&lt;br/&gt;
00000100:00080000:1.0:1398451628.656892:0:17304:0:(import.c:1299:ptlrpc_invalidate_import_thread()) ffff88007589b000 lustre-MDT0000_UUID: changing import state from EVICTED to RECOVER&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;fld_client_rpc(...)&lt;br/&gt;
{&lt;br/&gt;
.....&lt;br/&gt;
        obd_put_request_slot(&amp;amp;exp-&amp;gt;exp_obd-&amp;gt;u.cli);&lt;br/&gt;
        if (rc != 0) {&lt;br/&gt;
                if (rc == -EWOULDBLOCK) &lt;/p&gt;
{        &amp;lt;---  Since the import might be evicted, i.e. imp_invalid = 1, so we might need check ESHUTDOWN as well. 
                        /* For no_delay req(see above), EWOULDBLOCK means the
                         * connection is being evicted, but this seq lookup
                         * should not return error, since it would cause
                         * unecessary failure of the application, instead
                         * it should retry here */
                        ptlrpc_req_finished(req);
                        rc = 0;
                        goto again;
                }
&lt;p&gt;                GOTO(out_req, rc);&lt;br/&gt;
        }&lt;/p&gt;

&lt;p&gt;}&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
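
&lt;p&gt;A minimal sketch of the retry condition proposed above (hypothetical, not an actual patch):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;        if (rc == -EWOULDBLOCK || rc == -ESHUTDOWN) {
                /* -EWOULDBLOCK: connection is being evicted (no_delay req);
                 * -ESHUTDOWN: import already invalidated (imp_invalid = 1).
                 * Either way the seq lookup should retry here rather than
                 * fail the application. */
                ptlrpc_req_finished(req);
                rc = 0;
                goto again;
        }&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;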

&lt;p&gt;I will cook a patch.&lt;/p&gt;</comment>
                            <comment id="83031" author="di.wang" created="Thu, 1 May 2014 18:07:43 +0000"  >&lt;p&gt;I think all of these resend might be just temporary solution,  and also might cost the thread loop forever. The best way to resolve this might be fix LWP as we suggest it in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4571&quot; title=&quot;sanity test_17n: create remote dir error 0&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4571&quot;&gt;&lt;del&gt;LU-4571&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;&quot;Not evict LWP during reconnect after reboot, but then it needs quota to set special flag not to replay the quota lock during recovery.&quot;&lt;/p&gt;

&lt;p&gt;But I am not a quota expert. Niu, could you please look at it? Thanks.&lt;/p&gt;</comment>
                            <comment id="83393" author="bogl" created="Wed, 7 May 2014 14:38:22 +0000"  >&lt;p&gt;According to comment in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4868&quot; title=&quot;Test failure on test suite replay-single, subtest test_80a&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4868&quot;&gt;&lt;del&gt;LU-4868&lt;/del&gt;&lt;/a&gt; the following is probably another &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4420&quot; title=&quot;sanity test_161a: cannot create regular file &amp;#39;...f.sanity.161a&amp;#39;: Input/output error&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4420&quot;&gt;&lt;del&gt;LU-4420&lt;/del&gt;&lt;/a&gt;:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/258c676a-d599-11e3-b3f3-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/258c676a-d599-11e3-b3f3-52540035b04c&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;showing a pretty high failure rate according to maloo:&lt;br/&gt;
Failure Rate: 63.00% of last 100 executions &lt;span class=&quot;error&quot;&gt;&amp;#91;all branches&amp;#93;&lt;/span&gt;&lt;/p&gt;</comment>
                            <comment id="83476" author="di.wang" created="Thu, 8 May 2014 06:35:31 +0000"  >&lt;p&gt;I am not sure every one like this patch, please check&lt;br/&gt;
&lt;a href=&quot;http://review.whamcloud.com/10262&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/10262&lt;/a&gt;  (This patch was abandoned)&lt;/p&gt;</comment>
                            <comment id="83680" author="di.wang" created="Fri, 9 May 2014 21:55:30 +0000"  >&lt;p&gt;Not evict LWP seems cause some quota problem(not easy to fix), instead I might still try to resend fld RPC in 2.6. Here is the patch.&lt;br/&gt;
&lt;a href=&quot;http://review.whamcloud.com/10285&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/10285&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="84161" author="jlevi" created="Thu, 15 May 2014 12:45:21 +0000"  >&lt;p&gt;Patch landed to Master. Please reopen ticket if more work is needed.&lt;/p&gt;</comment>
                            <comment id="88331" author="utopiabound" created="Mon, 7 Jul 2014 17:15:38 +0000"  >&lt;p&gt;review-dne-part-1 on master:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/5911f758-028f-11e4-a3d2-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/5911f758-028f-11e4-a3d2-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="88622" author="jlevi" created="Wed, 9 Jul 2014 17:52:12 +0000"  >&lt;p&gt;Di,&lt;br/&gt;
Could you please comment on this one?&lt;br/&gt;
Thank you!&lt;/p&gt;</comment>
                            <comment id="88624" author="jlevi" created="Wed, 9 Jul 2014 17:53:48 +0000"  >&lt;p&gt;Patch landed to Master. Additional work is being tracked under &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5312&quot; title=&quot;sanity test_161a: cannot create regular file &amp;#39;...f.sanity.161a&amp;#39;: Input/output error&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5312&quot;&gt;&lt;del&gt;LU-5312&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="22967">LU-4571</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="24112">LU-4868</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="23234">LU-4658</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="23191">LU-4644</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="25503">LU-5312</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="19627">LU-3531</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzwbxz:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>12136</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                </customfields>
    </item>
</channel>
</rss>