<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:35:42 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-3645] Interop 2.1.5 &lt;--&gt; 2.4 Write operations during failover errors out instead of stalling</title>
                <link>https://jira.whamcloud.com/browse/LU-3645</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;During acceptance testing for KIT, they tried OSS failover while running several applications. And applications got IO errors (can&apos;t create file and similar messages). This should not happen and IO should just stall till failover happens.&lt;br/&gt;
The clients were running 2.4 and servers were 2.1.5. We tried with 2.1.5 clients and did not see this issue. I have attached the client and server logs.&lt;/p&gt;</description>
                <environment></environment>
        <key id="20016">LU-3645</key>
            <summary>Interop 2.1.5 &lt;--&gt; 2.4 Write operations during failover errors out instead of stalling</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="hongchao.zhang">Hongchao Zhang</assignee>
                                    <reporter username="gshilamkar">Girish Shilamkar</reporter>
                        <labels>
                            <label>mn1</label>
                    </labels>
                <created>Fri, 26 Jul 2013 08:26:15 +0000</created>
                <updated>Sat, 15 Mar 2014 22:48:24 +0000</updated>
                            <resolved>Fri, 22 Nov 2013 22:55:00 +0000</resolved>
                                    <version>Lustre 2.4.0</version>
                    <version>Lustre 2.1.5</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>10</watches>
                                                                            <comments>
                            <comment id="63028" author="pjones" created="Fri, 26 Jul 2013 13:37:05 +0000"  >&lt;p&gt;Hongchao&lt;/p&gt;

&lt;p&gt;Could you please look into this one?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="63034" author="kitwestneat" created="Fri, 26 Jul 2013 14:27:06 +0000"  >&lt;p&gt;This is actually a duplicate of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3621&quot; title=&quot;during failover testing, statahead hangs&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3621&quot;&gt;&lt;del&gt;LU-3621&lt;/del&gt;&lt;/a&gt;, we got our wires crossed here at DDN HQ, sorry about that.&lt;/p&gt;</comment>
                            <comment id="63038" author="pjones" created="Fri, 26 Jul 2013 14:44:18 +0000"  >&lt;p&gt;ok - thanks Kit&lt;/p&gt;</comment>
                            <comment id="63278" author="kitwestneat" created="Tue, 30 Jul 2013 17:18:57 +0000"  >&lt;p&gt;Hi Peter, I was mistaken on this - it is a different issue uncovered during the same testing. Can you reopen the ticket and advise us on what kind of debug information we should get to help diagnose the issue? I&apos;ve attached rpctrace and vfstrace output.&lt;/p&gt;</comment>
                            <comment id="63281" author="kitwestneat" created="Tue, 30 Jul 2013 17:27:14 +0000"  >&lt;blockquote&gt;
&lt;p&gt;The I/O error appeared on ucbn003 and the bonnie++ error message was:&lt;br/&gt;
Create files in random order...Can&apos;t create file v8KnMgSE00000005fb&lt;/p&gt;&lt;/blockquote&gt;</comment>
                            <comment id="63284" author="jamesanunez" created="Tue, 30 Jul 2013 17:33:54 +0000"  >&lt;p&gt;Per Kit Westneat :&lt;br/&gt;
We accidentally closed out &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3645&quot; title=&quot;Interop 2.1.5 &amp;lt;--&amp;gt; 2.4 Write operations during failover errors out instead of stalling&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3645&quot;&gt;&lt;del&gt;LU-3645&lt;/del&gt;&lt;/a&gt; as a duplicate, but it&apos;s actually a different problem which arose at the same time. I just wanted to make sure it didn&apos;t fall off the radar since it was already closed as a duplicate. I have updated the ticket with new logs.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
Kit&lt;/p&gt;</comment>
                            <comment id="63456" author="hongchao.zhang" created="Thu, 1 Aug 2013 11:37:28 +0000"  >&lt;p&gt;status update:&lt;br/&gt;
the logs are under investigation, but still no result, will update this ticket once any progress is made, Thanks!&lt;/p&gt;</comment>
                            <comment id="63530" author="hongchao.zhang" created="Fri, 2 Aug 2013 04:56:08 +0000"  >&lt;p&gt;I can&apos;t find the logs related to eviction in the client logs, but there are some in server (MDS &amp;amp; OSS)&lt;br/&gt;
what is running on the following IPs? Thanks&lt;br/&gt;
172.26.4.6&lt;br/&gt;
172.26.4.1&lt;br/&gt;
172.26.7.228&lt;br/&gt;
172.26.7.229&lt;br/&gt;
172.26.7.231&lt;/p&gt;


&lt;p&gt;LU-XXXX/oss1/messages-20130623:Jun 21 14:34:05 oss1 kernel: : Lustre: pfscdat1-OST0000: haven&apos;t heard from client 52a702a3-cdba-d2a9-b8ee-c95c3cce895b (at 172.26.4.6@o2ib) in 230 seconds. I think it&apos;s dead, and I am evicting it. exp ffff8800b073f800, cur 1371818045 expire 1371817895 last 1371817815&lt;br/&gt;
LU-XXXX/oss1/messages-20130623:Jun 21 14:34:13 oss1 kernel: : Lustre: pfscwor1-OST0000: haven&apos;t heard from client ac49323c-bd2e-c726-9261-28fae514373c (at 172.26.4.6@o2ib) in 238 seconds. I think it&apos;s dead, and I am evicting it. exp ffff880814dda400, cur 1371818053 expire 1371817903 last 1371817815&lt;br/&gt;
LU-XXXX/oss1/messages-20130630:Jun 26 11:21:43 oss1 kernel: : Lustre: pfscwor1-OST0000: haven&apos;t heard from client a12614ba-2215-a27d-dd84-d404b5350351 (at 172.26.7.228@o2ib) in 231 seconds. I think it&apos;s dead, and I am evicting it. exp ffff88081469d400, cur 1372238503 expire 1372238353 last 1372238272&lt;br/&gt;
LU-XXXX/oss1/messages-20130630:Jun 26 15:12:08 oss1 kernel: : Lustre: pfscdat1-OST0000: haven&apos;t heard from client b47094a7-bd11-5591-8fda-789563de29b3 (at 172.26.7.229@o2ib) in 234 seconds. I think it&apos;s dead, and I am evicting it. exp ffff88018615c400, cur 1372252328 expire 1372252178 last 1372252094&lt;br/&gt;
LU-XXXX/oss1/messages-20130630:Jun 26 15:38:53 oss1 kernel: : Lustre: pfscwor1-OST0000: haven&apos;t heard from client 1c07b9d5-3d84-2150-c182-e634ff69e868 (at 172.26.7.229@o2ib) in 227 seconds. I think it&apos;s dead, and I am evicting it. exp ffff880814f3ec00, cur 1372253933 expire 1372253783 last 1372253706&lt;br/&gt;
LU-XXXX/oss1/messages-20130630:Jun 26 17:02:35 oss1 kernel: : Lustre: pfscwor1-OST0000: haven&apos;t heard from client 2aa5cfe7-24e1-82df-7822-0695a2c9a627 (at 172.26.7.229@o2ib) in 228 seconds. I think it&apos;s dead, and I am evicting it. exp ffff880385efc400, cur 1372258955 expire 1372258805 last 1372258727&lt;br/&gt;
LU-XXXX/oss1/messages-20130630:Jun 27 12:45:03 oss1 kernel: : Lustre: pfscwor1-OST0000: haven&apos;t heard from client ff2ba913-cde8-1aa0-ac07-4e6e1a8b775d (at 172.26.7.229@o2ib) in 229 seconds. I think it&apos;s dead, and I am evicting it. exp ffff8803898ba800, cur 1372329903 expire 1372329753 last 1372329674&lt;br/&gt;
LU-XXXX/oss1/messages-20130630:Jun 27 12:45:05 oss1 kernel: : Lustre: pfscdat1-OST0000: haven&apos;t heard from client b7b0858f-7c85-8d52-0995-d93c0d0807c9 (at 172.26.7.229@o2ib) in 231 seconds. I think it&apos;s dead, and I am evicting it. exp ffff88081469d000, cur 1372329905 expire 1372329755 last 1372329674&lt;br/&gt;
LU-XXXX/oss1/messages-20130630:Jun 28 21:12:45 oss1 kernel: : Lustre: pfscwor1-OST0000: recovery is timed out, evict stale exports&lt;br/&gt;
LU-XXXX/oss1/messages-20130630:Jun 28 21:12:46 oss1 kernel: : Lustre: pfscdat2-OST0000: recovery is timed out, evict stale exports&lt;br/&gt;
LU-XXXX/oss1/messages-20130707:Jul  4 15:40:15 oss1 kernel: : Lustre: pfscwor1-OST0000: haven&apos;t heard from client 14f37ec5-cbc1-09f9-5ebb-c2be149e63d8 (at 172.26.7.231@o2ib) in 234 seconds. I think it&apos;s dead, and I am evicting it. exp ffff881013986000, cur 1372945215 expire 1372945065 last 1372944981&lt;br/&gt;
LU-XXXX/oss1/messages-20130707:Jul  4 15:45:50 oss1 kernel: : Lustre: pfscwor1-OST0000: haven&apos;t heard from client 14f37ec5-cbc1-09f9-5ebb-c2be149e63d8 (at 172.26.7.231@o2ib) in 228 seconds. I think it&apos;s dead, and I am evicting it. exp ffff880806639400, cur 1372945550 expire 1372945400 last 1372945322&lt;br/&gt;
LU-XXXX/oss1/messages-20130707:Jul  4 15:50:00 oss1 kernel: : Lustre: pfscdat2-OST0000: haven&apos;t heard from client 0514dccd-a92e-ce94-41cd-9c76585df636 (at 172.26.7.231@o2ib) in 228 seconds. I think it&apos;s dead, and I am evicting it. exp ffff88081b2c8800, cur 1372945800 expire 1372945650 last 1372945572&lt;br/&gt;
LU-XXXX/oss1/messages-20130714:Jul  8 09:32:07 oss1 kernel: : Lustre: pfscdat2-OST0000: haven&apos;t heard from client 9bb010b4-be70-74ca-50c0-d8a26989ff2b (at 172.26.7.229@o2ib) in 274 seconds. I think it&apos;s dead, and I am evicting it. exp ffff88081b336400, cur 1373268727 expire 1373268577 last 1373268453&lt;br/&gt;
LU-XXXX/oss1/messages-20130714:Jul 11 16:21:25 oss1 kernel: : Lustre: pfscwor1-OST0000: haven&apos;t heard from client f91d0897-30fc-c2ca-b15e-36b1093067eb (at 172.26.4.1@o2ib) in 227 seconds. I think it&apos;s dead, and I am evicting it. exp ffff880815de1c00, cur 1373552485 expire 1373552335 last 1373552258&lt;br/&gt;
LU-XXXX/oss2/messages-20130623:Jun 21 14:34:13 oss2 kernel: : Lustre: pfscdat2-OST0000: haven&apos;t heard from client e5a9b005-ddb0-703e-9583-ec501dad4374 (at 172.26.4.6@o2ib) in 238 seconds. I think it&apos;s dead, and I am evicting it. exp ffff880661c34400, cur 1371818053 expire 1371817903 last 1371817815&lt;br/&gt;
LU-XXXX/oss2/messages-20130711:Jun 26 11:21:43 oss2 kernel: : Lustre: pfscdat2-OST0000: haven&apos;t heard from client 72a4d8b7-4cd1-b836-d32e-9870e7cde133 (at 172.26.7.228@o2ib) in 231 seconds. I think it&apos;s dead, and I am evicting it. exp ffff880f4962b800, cur 1372238503 expire 1372238353 last 1372238272&lt;br/&gt;
LU-XXXX/oss2/messages-20130711:Jun 26 15:12:08 oss2 kernel: : Lustre: pfscdat2-OST0000: haven&apos;t heard from client 0b9d9c9d-305a-0c95-263a-ee4767c3649d (at 172.26.7.229@o2ib) in 234 seconds. I think it&apos;s dead, and I am evicting it. exp ffff880ff0366c00, cur 1372252328 expire 1372252178 last 1372252094&lt;br/&gt;
LU-XXXX/oss2/messages-20130711:Jun 26 15:38:53 oss2 kernel: : Lustre: pfscdat2-OST0000: haven&apos;t heard from client 7082363c-bbaf-7a14-8f27-bba2748ac8b6 (at 172.26.7.229@o2ib) in 227 seconds. I think it&apos;s dead, and I am evicting it. exp ffff880ec28e0800, cur 1372253933 expire 1372253783 last 1372253706&lt;br/&gt;
LU-XXXX/oss2/messages-20130711:Jun 26 17:02:35 oss2 kernel: : Lustre: pfscdat2-OST0000: haven&apos;t heard from client 2b993f06-3f9e-d311-c501-1dc637f23b9b (at 172.26.7.229@o2ib) in 228 seconds. I think it&apos;s dead, and I am evicting it. exp ffff880feaeac400, cur 1372258955 expire 1372258805 last 1372258727&lt;br/&gt;
LU-XXXX/oss2/messages-20130711:Jun 27 12:45:03 oss2 kernel: : Lustre: pfscdat2-OST0000: haven&apos;t heard from client 3b957e97-f989-033d-9b2f-d32653cdb7b5 (at 172.26.7.229@o2ib) in 229 seconds. I think it&apos;s dead, and I am evicting it. exp ffff880ff6b27000, cur 1372329903 expire 1372329753 last 1372329674&lt;/p&gt;
</comment>
                            <comment id="63696" author="gshilamkar" created="Tue, 6 Aug 2013 06:39:41 +0000"  >&lt;p&gt;&amp;gt; Hongchao Zhang added a comment - 02/Aug/13 4:56 AM&lt;br/&gt;
&amp;gt; I can&apos;t find the logs related to eviction in the client logs, but&lt;br/&gt;
&amp;gt; there are some in server (MDS &amp;amp; OSS) what is running on the following &amp;gt; IPs? Thanks&lt;br/&gt;
&amp;gt; 172.26.4.6&lt;br/&gt;
&amp;gt; 172.26.4.1&lt;br/&gt;
&amp;gt; 172.26.7.228&lt;br/&gt;
&amp;gt; 172.26.7.229&lt;br/&gt;
&amp;gt; 172.26.7.231&lt;/p&gt;

&lt;p&gt;They are the client IPs. 172.26.7.* had mounted the file systems but were not part of the failover testing.&lt;/p&gt;

&lt;p&gt;&amp;gt; LU-XXXX/oss1/messages-20130623:Jun 21 14:34:05 oss1 kernel: : Lustre: pfscdat1-OST0000: haven&apos;t heard from client 52a702a3-cdba-d2a9-b8ee-c95c3cce895b (at 172.26.4.6@o2ib) in 230 seconds. I think it&apos;s dead, and I am evicting it. exp ffff8800b073f800, cur 1371818045 expire 1371817895 last 1371817815&lt;br/&gt;
...&lt;/p&gt;

&lt;p&gt;All these messages are very old. The complete server system has been&lt;br/&gt;
rebooted 2013-07-16, i.e. all the messages should not be relevant.&lt;/p&gt;</comment>
                            <comment id="63758" author="gshilamkar" created="Wed, 7 Aug 2013 08:11:29 +0000"  >&lt;p&gt;Can you please increase the priority of this issue ? The problem is blocking this new system from getting into production.&lt;br/&gt;
Are there any additional logs or information needed from our side?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;</comment>
                            <comment id="63764" author="hongchao.zhang" created="Wed, 7 Aug 2013 11:45:42 +0000"  >&lt;p&gt;okay!&lt;br/&gt;
I&apos;m trying to reproduce it myself with 2 nodes(2.1.5 client + 2.4 server) by &quot;bonnie++&quot;, and can&apos;t reproduce it up to now.&lt;/p&gt;

&lt;p&gt;is this issue can be reproduced with fewer nodes on your site(say, 2 nodes)?&lt;br/&gt;
could you please attach the new syslog of ucbn003/oss1/oss2, the latest logs in LU-XXXX.tgz is July 16, Thanks&lt;/p&gt;</comment>
                            <comment id="63765" author="gshilamkar" created="Wed, 7 Aug 2013 12:10:39 +0000"  >&lt;p&gt;&amp;gt; I&apos;m trying to reproduce it myself with 2 nodes(2.1.5 client + 2.4 server) by &quot;bonnie++&quot;, and can&apos;t &amp;gt;reproduce it up to now.&lt;br/&gt;
is this issue can be reproduced with fewer nodes on your site(say, 2 nodes)?&lt;/p&gt;

&lt;p&gt;This problem was seen during acceptance testing and it only had mds failover pair and two oss.&lt;/p&gt;

&lt;p&gt;&amp;gt;could you please attach the new syslog of ucbn003/oss1/oss2, the latest logs in LU-XXXX.tgz is July 16, &lt;br/&gt;
The test was run on July 15/16, so all the necessary information should be there. Do you want us to collect logs with any particular debug option ?&lt;/p&gt;

&lt;p&gt;I can see ost_write operations failing with -ENODEV. I ran replay-ost-single on 2.1.5 (ost_writes fail with -ENODEV) and 2.4 ( ost_write fails with -ENOTCONN) &lt;/p&gt;

&lt;p&gt;Does the error returned by ost_write affect the failover behavior ?&lt;/p&gt;</comment>
                            <comment id="63768" author="gshilamkar" created="Wed, 7 Aug 2013 12:41:57 +0000"  >&lt;p&gt;&amp;gt; I&apos;m trying to reproduce it myself with 2 nodes(2.1.5 client + 2.4 server) by &quot;bonnie++&quot;, and can&apos;t reproduce it up to now.&lt;/p&gt;

&lt;p&gt;I am assuming 2.1.5 client + 2.4 server is a typo, as this problem is seen with 2.1.5 &lt;em&gt;servers&lt;/em&gt; and 2.4 &lt;em&gt;client&lt;/em&gt;&lt;/p&gt;</comment>
                            <comment id="63814" author="kitwestneat" created="Wed, 7 Aug 2013 20:07:34 +0000"  >&lt;p&gt;I sorted out the relevant bits from the Lustre log so I could see what was going on a bit better, hopefully this is helpful.&lt;/p&gt;

&lt;p&gt;client:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;00000080:00200000:0.0:1375191524.336997:0:11756:0:(namei.c:503:ll_lookup_it()) VFS Op:name=v8KnMgSE00000005fb,dir=144115440287769024/33554490(ffff8808619835f8),intent=o
pen|creat
00000100:00100000:0.0:1375191524.337573:0:11756:0:(client.c:1441:ptlrpc_send_new_req()) Sending RPC pname:cluuid:pid:xid:nid:opc bonnie++:8ff70f2e-0c20-af6e-ad31-a95bc6b86f3a:11756:1441894532689196:172.26.17.2@o2ib:101
00000100:00100000:0.0:1375191524.337599:0:11756:0:(client.c:2090:ptlrpc_set_wait()) set ffff88086ac7b2c0 going to sleep &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 6 seconds
00000100:00100000:0.0:1375191524.423682:0:11756:0:(client.c:1805:ptlrpc_check_set()) Completed RPC pname:cluuid:pid:xid:nid:opc bonnie++:8ff70f2e-0c20-af6e-ad31-a95bc6b86f3a:11756:1441894532689196:172.26.17.2@o2ib:101
00000002:00100000:0.0:1375191524.423714:0:11756:0:(mdc_locks.c:579:mdc_finish_enqueue()) @@@ op: 3 disposition: 17, status: -5  req@ffff880867f32c00 x1441894532689196/t0(0) o101-&amp;gt;pfscdat2-MDT0000-mdc-ffff8810708aec00@172.26.17.2@o2ib:12/10 lens 592/544 e 0 to 0 dl 1375191531 ref 1 fl Complete:R/0/0 rc 301/301
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;mds2:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;00000100:00100000:1.0:1375191524.337381:0:17462:0:(service.c:1547:ptlrpc_server_handle_req_in()) got req x1441894532689196
00000100:00100000:1.0:1375191524.337395:0:17462:0:(service.c:1724:ptlrpc_server_handle_request()) Handling RPC pname:cluuid+ref:pid:xid:nid:opc mdt_10:8ff70f2e-0c20-af6e-ad31-a95bc6b86f3a+3316:11756:x1441894532689196:12345-172.26.4.3@o2ib:101
00000100:00080000:3.0:1375191524.337404:0:7060:0:(recover.c:216:ptlrpc_request_handle_notconn()) &lt;span class=&quot;code-keyword&quot;&gt;import&lt;/span&gt; pfscdat2-OST0000-osc-MDT0000 of pfscdat2-OST0000_UUID@172.26.17.3@o2ib abruptly disconnected: reconnecting
00000100:02000400:3.0:1375191524.337410:0:7060:0:(&lt;span class=&quot;code-keyword&quot;&gt;import&lt;/span&gt;.c:170:ptlrpc_set_import_discon()) pfscdat2-OST0000-osc-MDT0000: Connection to pfscdat2-OST0000 (at 172.26.17.3@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
00000100:00080000:3.0:1375191524.337419:0:7060:0:(&lt;span class=&quot;code-keyword&quot;&gt;import&lt;/span&gt;.c:180:ptlrpc_set_import_discon()) ffff88082aeed800 pfscdat2-OST0000_UUID: changing &lt;span class=&quot;code-keyword&quot;&gt;import&lt;/span&gt; state from FULL to DISCONN
00000100:00080000:3.0:1375191524.337424:0:7060:0:(&lt;span class=&quot;code-keyword&quot;&gt;import&lt;/span&gt;.c:626:ptlrpc_connect_import()) ffff88082aeed800 pfscdat2-OST0000_UUID: changing &lt;span class=&quot;code-keyword&quot;&gt;import&lt;/span&gt; state from DISCONN to CONNECTING
00000100:00080000:3.0:1375191524.337428:0:7060:0:(&lt;span class=&quot;code-keyword&quot;&gt;import&lt;/span&gt;.c:482:import_select_connection()) pfscdat2-OST0000-osc-MDT0000: connect to NID 172.26.17.3@o2ib last attempt 4982526859
00000100:00080000:3.0:1375191524.337433:0:7060:0:(&lt;span class=&quot;code-keyword&quot;&gt;import&lt;/span&gt;.c:561:import_select_connection()) pfscdat2-OST0000-osc-MDT0000: &lt;span class=&quot;code-keyword&quot;&gt;import&lt;/span&gt; ffff88082aeed800 using connection 172.26.17.3@o2ib/172.26.17.3@o2ib
00000100:00100000:3.0:1375191524.337453:0:7060:0:(&lt;span class=&quot;code-keyword&quot;&gt;import&lt;/span&gt;.c:725:ptlrpc_connect_import()) @@@ (re)connect request (timeout 5)  req@ffff881001505c00 x1441623136776061/t0(0) o8-&amp;gt;pfscdat2-OST0000-osc-MDT0000@172.26.17.3@o2ib:28/4 lens 368/512 e 0 to 0 dl 0 ref 1 fl New:N/0/ffffffff rc 0/-1
00000100:00100000:5.0:1375191524.337462:0:17454:0:(service.c:1547:ptlrpc_server_handle_req_in()) got req x1441894532689216
00000100:00020000:3.0:1375191524.337469:0:7060:0:(layout.c:1659:__req_capsule_get()) @@@ Wrong buffer &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; field `ost_body&lt;span class=&quot;code-quote&quot;&gt;&apos; (1 of 1) in format `OST_CREATE&apos;&lt;/span&gt;: 0 vs. 208 (server)
  req@ffff8810019bf000 x1441623136776022/t0(0) o5-&amp;gt;pfscdat2-OST0000-osc-MDT0000@172.26.17.3@o2ib:7/4 lens 400/192 e 0 to 0 dl 1375191531 ref 1 fl Interpret:RN/0/0 rc -107/-107
00000100:00100000:5.0:1375191524.337475:0:17454:0:(service.c:1724:ptlrpc_server_handle_request()) Handling RPC pname:cluuid+ref:pid:xid:nid:opc mdt_rdpg_02:8ff70f2e-0c20-af6e-ad31-a95bc6b86f3a+3319:11760:x1441894532689216:12345-172.26.4.3@o2ib:35
00000100:00100000:13.0:1375191524.337481:0:7061:0:(client.c:1434:ptlrpc_send_new_req()) Sending RPC pname:cluuid:pid:xid:nid:opc ptlrpcd-rcv:pfscdat2-MDT0000-mdtlov_UUID:7061:1441623136776061:172.26.17.3@o2ib:8
00000100:00100000:5.0:1375191524.337511:0:17454:0:(service.c:1771:ptlrpc_server_handle_request()) Handled RPC pname:cluuid+ref:pid:xid:nid:opc mdt_rdpg_02:8ff70f2e-0c20-af6e-ad31-a95bc6b86f3a+3318:11760:x1441894532689216:12345-172.26.4.3@o2ib:35 Request procesed in 38us (60us total) trans 51541021103 rc 0/0
00000100:02020000:13.0:1375191524.347566:0:7061:0:(client.c:1132:ptlrpc_check_status()) 11-0: an error occurred &lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; communicating with 172.26.17.3@o2ib. The ost_connect operation failed with -19
00000100:00080000:13.0:1375191524.347576:0:7061:0:(&lt;span class=&quot;code-keyword&quot;&gt;import&lt;/span&gt;.c:1112:ptlrpc_connect_interpret()) ffff88082aeed800 pfscdat2-OST0000_UUID: changing &lt;span class=&quot;code-keyword&quot;&gt;import&lt;/span&gt; state from CONNECTING to DISCONN
00000100:00080000:13.0:1375191524.347579:0:7061:0:(&lt;span class=&quot;code-keyword&quot;&gt;import&lt;/span&gt;.c:1158:ptlrpc_connect_interpret()) recovery of pfscdat2-OST0000_UUID on 172.26.17.3@o2ib failed (-19)
00000100:00100000:13.0:1375191524.347583:0:7061:0:(client.c:1773:ptlrpc_check_set()) Completed RPC pname:cluuid:pid:xid:nid:opc ptlrpcd-rcv:pfscdat2-MDT0000-mdtlov_UUID:7061:1441623136776061:172.26.17.3@o2ib:8
00000008:00020000:3.0:1375191524.381692:0:7060:0:(osc_create.c:175:osc_interpret_create()) @@@ Unknown rc -107 from async create: failing oscc  req@ffff8810019bf000 x1441623136776022/t0(0) o5-&amp;gt;pfscdat2-OST0000-osc-MDT0000@172.26.17.3@o2ib:7/4 lens 400/192 e 0 to 0 dl 1375191531 ref 1 fl Interpret:RN/0/0 rc -107/-107
00000100:00080000:3.0:1375191524.410395:0:7060:0:(&lt;span class=&quot;code-keyword&quot;&gt;import&lt;/span&gt;.c:195:ptlrpc_set_import_discon()) osc: &lt;span class=&quot;code-keyword&quot;&gt;import&lt;/span&gt; ffff88082aeed800 already not connected (conn 5, was 4): DISCONN
00000008:00080000:3.0:1375191524.410401:0:7060:0:(osc_create.c:182:osc_interpret_create()) preallocated through id 5539121 (next to use 5534160)
00020000:00080000:3.0:1375191524.410404:0:7060:0:(lov_request.c:579:lov_update_create_set()) error creating fid 0x8bb8 sub-object on OST idx 0/1: rc = -107
00000100:00100000:3.0:1375191524.410411:0:7060:0:(client.c:1773:ptlrpc_check_set()) Completed RPC pname:cluuid:pid:xid:nid:opc ptlrpcd:pfscdat2-MDT0000-mdtlov_UUID:7060:1441623136776022:172.26.17.3@o2ib:5
00020000:00020000:1.0:1375191524.410415:0:17462:0:(lov_request.c:579:lov_update_create_set()) error creating fid 0x8bb8 sub-object on OST idx 0/1: rc = -5
00000100:00100000:1.0:1375191524.423247:0:17462:0:(service.c:1771:ptlrpc_server_handle_request()) Handled RPC pname:cluuid+ref:pid:xid:nid:opc mdt_10:8ff70f2e-0c20-af6e-ad31-a95bc6b86f3a+3317:11756:x1441894532689196:12345-172.26.4.3@o2ib:101 Request procesed in 85853us (85878us total) trans 0 rc 301/301
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;oss1:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;00000100:00100000:8.0:1375191524.325142:0:2026:0:(service.c:1547:ptlrpc_server_handle_req_in()) got req x1441623136776022
00000100:00100000:8.0:1375191524.325163:0:2026:0:(service.c:1724:ptlrpc_server_handle_request()) Handling RPC pname:cluuid+ref:pid:xid:nid:opc ll_ost_creat_02:0+-99:7060:x1441623136776022:12345-172.26.17.2@o2ib:5
00000010:00080000:8.0:1375191524.325170:0:2026:0:(ost_handler.c:2085:ost_handle()) operation 5 on unconnected OST from 12345-172.26.17.2@o2ib
00000100:00100000:8.0:1375191524.325200:0:2026:0:(service.c:1771:ptlrpc_server_handle_request()) Handled RPC pname:cluuid+ref:pid:xid:nid:opc ll_ost_creat_02:0+-99:7060:x1441623136776022:12345-172.26.17.2@o2ib:5 Request procesed in 41us (115us total) trans 0 rc -107/-107
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="63998" author="green" created="Fri, 9 Aug 2013 17:50:07 +0000"  >&lt;p&gt;The last log excerpts you show explains the EIO as due to MDS2 being disconnected from OST, so when we try to create an object, there&apos;s nowhere to get the object from, hence EIO.&lt;/p&gt;

&lt;p&gt;Now, why did MDS2 disconnect from the OST? Is the OST still in recovery and not recognizing the MDS, so it does not let it in?&lt;/p&gt;</comment>
                            <comment id="64014" author="kitwestneat" created="Fri, 9 Aug 2013 20:50:17 +0000"  >&lt;p&gt;I found the kernel logs for the servers:&lt;/p&gt;

&lt;p&gt;mds2:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Jul 30 15:34:08 mds2 kernel: : Lustre: MDS mdd_obd-pfscdat2-MDT0000: pfscdat2-OST0000_UUID now active, resetting orphans
Jul 30 15:38:44 mds2 kernel: : LustreError: 11-0: an error occurred &lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; communicating with 172.26.17.3@o2ib. The ost_create operation failed with -107
Jul 30 15:38:44 mds2 kernel: : Lustre: pfscdat2-OST0000-osc-MDT0000: Connection to pfscdat2-OST0000 (at 172.26.17.3@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Jul 30 15:38:44 mds2 kernel: : LustreError: 7060:0:(layout.c:1659:__req_capsule_get()) @@@ Wrong buffer &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; field `ost_body&lt;span class=&quot;code-quote&quot;&gt;&apos; (1 of 1) in format `OST_CREATE&apos;&lt;/span&gt;: 0 vs. 208 (server)
Jul 30 15:38:44 mds2 kernel: : req@ffff8810019bf000 x1441623136776022/t0(0) o5-&amp;gt;pfscdat2-OST0000-osc-MDT0000@172.26.17.3@o2ib:7/4 lens 400/192 e 0 to 0 dl 1375191531 ref 1 fl Interpret:RN/0/0 rc -107/-107
Jul 30 15:38:44 mds2 kernel: : LustreError: 11-0: an error occurred &lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; communicating with 172.26.17.3@o2ib. The ost_connect operation failed with -19
Jul 30 15:38:44 mds2 kernel: : LustreError: 7060:0:(osc_create.c:175:osc_interpret_create()) @@@ Unknown rc -107 from async create: failing oscc  req@ffff8810019bf000 x1441623136776022/t0(0) o5-&amp;gt;pfscdat2-OST0000-osc-MDT0000@172.26.17.3@o2ib:7/4 lens 400/192 e 0 to 0 dl 1375191531 ref 1 fl Interpret:RN/0/0 rc -107/-107
Jul 30 15:38:44 mds2 kernel: : LustreError: 17462:0:(lov_request.c:579:lov_update_create_set()) error creating fid 0x8bb8 sub-object on OST idx 0/1: rc = -5
Jul 30 15:39:23 mds2 kernel: : Lustre: pfscdat2-OST0000-osc-MDT0000: Connection restored to pfscdat2-OST0000 (at 172.26.17.4@o2ib)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;oss1:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Jul 30 15:38:43 oss1 kernel: : Lustre: Failing over pfscdat2-OST0000
Jul 30 15:38:44 oss1 kernel: : LustreError: 137-5: pfscdat2-OST0000: Not available &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; connect from 172.26.4.3@o2ib (stopping)
Jul 30 15:38:44 oss1 kernel: : LustreError: 137-5: pfscdat2-OST0000: Not available &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; connect from 172.26.17.2@o2ib (stopping)
Jul 30 15:38:44 oss1 kernel: : Lustre: pfscdat2-OST0000: shutting down &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; failover; client state will be preserved.
Jul 30 15:38:45 oss1 kernel: : Lustre: OST pfscdat2-OST0000 has stopped.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;oss2:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Jul 30 15:38:56 oss2 kernel: : Lustre: 4795:0:(ldlm_lib.c:2028:target_recovery_init()) RECOVERY: service pfscdat2-OST0000, 11 recoverable clients, last_transno 42740433
Jul 30 15:38:56 oss2 kernel: : Lustre: pfscdat2-OST0000: Now serving pfscdat2-OST0000/ on /dev/mapper/ost_pfscdat2_0 with recovery enabled
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;So pfscdat2-OST0000 was cleanly unmounted from oss1 and failed over to oss2. Shouldn&apos;t the MDT IO hang until it can connect to oss2?&lt;/p&gt;</comment>
                            <comment id="64083" author="hongchao.zhang" created="Mon, 12 Aug 2013 14:36:14 +0000"  >&lt;p&gt;this is the normal case for MDS to handle the objects precreation on OST, for the creation request is marked as no-delay and no-resend.&lt;br/&gt;
the interesting part of logs is where the MDT returned the error (in this case, -5/-EIO), which was not found in the log.&lt;/p&gt;</comment>
                            <comment id="64098" author="kitwestneat" created="Mon, 12 Aug 2013 17:05:44 +0000"  >&lt;p&gt;Hi Hongchao,&lt;/p&gt;

&lt;p&gt;I am not sure what you mean, I see this in the MDS logs:&lt;br/&gt;
00020000:00020000:1.0:1375191524.410415:0:17462:0:(lov_request.c:579:lov_update_create_set()) error creating fid 0x8bb8 sub-object on OST idx 0/1: rc = -5&lt;br/&gt;
00000100:00100000:1.0:1375191524.423247:0:17462:0:(service.c:1771:ptlrpc_server_handle_request()) Handled RPC pname:cluuid+ref:pid:xid:nid:opc mdt_10:8ff70f2e-0c20-af6e-ad31-a95bc6b86f3a+3317:11756:x1441894532689196:12345-172.26.4.3@o2ib:101 Request procesed in 85853us (85878us total) trans 0 rc 301/301&lt;/p&gt;

&lt;p&gt;Isn&apos;t that the part where the MDT is returning EIO? Let me know if there is another debug level I should use to get more information.&lt;/p&gt;

&lt;p&gt;Thanks.&lt;/p&gt;
</comment>
                            <comment id="64167" author="hongchao.zhang" created="Tue, 13 Aug 2013 13:10:23 +0000"  >&lt;p&gt;Hi Kit,&lt;/p&gt;

&lt;p&gt;At MDS, the OSC will create objects in advance (precreation) to be used by MDT, then the failure of creation request won&apos;t be known to client,&lt;br/&gt;
and the MDT will check and wait if it finds that the specified OSC is in recovery while processing the &quot;open|create&quot; request.&lt;/p&gt;

&lt;p&gt;Is the log of the client running the failed application available for July 30? It could contain the logs showing where and when the MDT returned the error to the client.&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;</comment>
                            <comment id="64176" author="kitwestneat" created="Tue, 13 Aug 2013 14:27:37 +0000"  >&lt;p&gt;Sure, here&apos;s the client syslogs from that period:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Jul 29 13:37:06 ucbn003 kernel: Lustre: Trying to mount a client with IR setting not compatible with current mgc. Force to use current mgc setting that is IR disabled.
Jul 30 15:33:14 ucbn003 kernel: LustreError: 11-0: pfscdat2-OST0000-osc-ffff8810708aec00: Communicating with 172.26.17.4@o2ib, operation ost_destroy failed with -19.
Jul 30 15:33:14 ucbn003 kernel: Lustre: pfscdat2-OST0000-osc-ffff8810708aec00: Connection to pfscdat2-OST0000 (at 172.26.17.4@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Jul 30 15:34:08 ucbn003 kernel: Lustre: pfscdat2-OST0000-osc-ffff8810708aec00: Connection restored to pfscdat2-OST0000 (at 172.26.17.3@o2ib)
Jul 30 15:38:44 ucbn003 kernel: LustreError: 11-0: pfscdat2-OST0000-osc-ffff8810708aec00: Communicating with 172.26.17.3@o2ib, operation ldlm_enqueue failed with -107.
Jul 30 15:38:44 ucbn003 kernel: Lustre: pfscdat2-OST0000-osc-ffff8810708aec00: Connection to pfscdat2-OST0000 (at 172.26.17.3@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Jul 30 15:39:23 ucbn003 kernel: Lustre: pfscdat2-OST0000-osc-ffff8810708aec00: Connection restored to pfscdat2-OST0000 (at 172.26.17.4@o2ib)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Just to make sure we are on the same page, the EIO was in the client lctl dk logs posted earlier:&lt;br/&gt;
00000002:00100000:0.0:1375191524.423714:0:11756:0:(mdc_locks.c:579:mdc_finish_enqueue()) @@@ op: 3 disposition: 17, status: -5  req@ffff880867f32c00 x1441894532689196/t0(0) o101-&amp;gt;pfscdat2-MDT0000-mdc-ffff8810708aec00@172.26.17.2@o2ib:12/10 lens 592/544 e 0 to 0 dl 1375191531 ref 1 fl Complete:R/0/0 rc 301/301&lt;/p&gt;

&lt;p&gt;It looks like pid 17462 gets the EIO in lov_update_create_set and then returns it to the client to complete x1441894532689196, which looks like the open|creat on the file. &lt;/p&gt;

&lt;p&gt;Thanks.&lt;/p&gt;</comment>
                            <comment id="64236" author="hongchao.zhang" created="Wed, 14 Aug 2013 11:20:53 +0000"  >&lt;p&gt;Hi KIT,&lt;/p&gt;

&lt;p&gt;yes, lov_create failed somehow, could you please retest with this patch, which prints more logs to help to find the cause, thanks!&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;diff --git a/lustre/lov/lov_request.c b/lustre/lov/lov_request.c
index 0fc635e..dd7c081 100644
--- a/lustre/lov/lov_request.c
+++ b/lustre/lov/lov_request.c
@@ -573,10 +573,10 @@ &lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; lov_update_create_set(struct lov_request_set *set,
                 /* Pre-creating objects may timeout via -ETIMEDOUT or
                  * -ENOTCONN both are always non-critical events. */
                 CDEBUG(rc == -ETIMEDOUT || rc == -ENOTCONN ? D_HA : D_ERROR,
-                       &lt;span class=&quot;code-quote&quot;&gt;&quot;error creating fid &quot;&lt;/span&gt;LPX64&lt;span class=&quot;code-quote&quot;&gt;&quot; sub-object &quot;&lt;/span&gt;
+                       &lt;span class=&quot;code-quote&quot;&gt;&quot;error creating fid &quot;&lt;/span&gt;LPX64&lt;span class=&quot;code-quote&quot;&gt;&quot;/&quot;&lt;/span&gt;LPX64&lt;span class=&quot;code-quote&quot;&gt;&quot; sub-object &quot;&lt;/span&gt;
                        &lt;span class=&quot;code-quote&quot;&gt;&quot;on OST idx %d/%d: rc = %d\n&quot;&lt;/span&gt;,
-                       set-&amp;gt;set_oi-&amp;gt;oi_oa-&amp;gt;o_id, req-&amp;gt;rq_idx,
-                       lsm-&amp;gt;lsm_stripe_count, rc);
+                       set-&amp;gt;set_oi-&amp;gt;oi_oa-&amp;gt;o_id, set-&amp;gt;set_oi-&amp;gt;oi_oa-&amp;gt;o_seq,
+		       req-&amp;gt;rq_idx, lsm-&amp;gt;lsm_stripe_count, rc);
                 &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (rc &amp;gt; 0) {
                         CERROR(&lt;span class=&quot;code-quote&quot;&gt;&quot;obd_create returned invalid err %d\n&quot;&lt;/span&gt;, rc);
                         rc = -EIO;
@@ -636,6 +636,8 @@ &lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; create_done(struct obd_export *exp, struct lov_request_set *set,
                         req-&amp;gt;rq_complete = 0;
 
                         rc = qos_remedy_create(set, req);
+			&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (rc != 0)
+				CERROR(&lt;span class=&quot;code-quote&quot;&gt;&quot;qos_remedy_create failed %d\n&quot;&lt;/span&gt;, rc);
                         lov_update_create_set(set, req, rc);
                 }
         }
@@ -733,6 +735,8 @@ &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; cb_create_update(void *cookie, &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; rc)
                 &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (lovreq-&amp;gt;rq_idx == cfs_fail_val)
                         rc = -ENOTCONN;
 
+	&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (rc != 0)
+		CERROR(&lt;span class=&quot;code-quote&quot;&gt;&quot;cb_create_update failed %d\n&quot;&lt;/span&gt;, rc);
         rc= lov_update_create_set(lovreq-&amp;gt;rq_rqset, lovreq, rc);
         &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (lov_set_finished(lovreq-&amp;gt;rq_rqset, 0))
                 lov_put_reqset(lovreq-&amp;gt;rq_rqset);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="64272" author="kitwestneat" created="Wed, 14 Aug 2013 18:54:31 +0000"  >&lt;p&gt;Hi Hongchao,&lt;/p&gt;

&lt;p&gt;I was reviewing this ticket and I realized that in this test configuration, the file system only has one OST. Looking at the code for the creates, it appears that it is doing what is intended. When all OSTs are offline, the MDT will return EIO to the client. Is this correct? &lt;/p&gt;

&lt;p&gt;I will work on rebuilding a server with that debug patch.&lt;/p&gt;

&lt;p&gt;Thanks.&lt;/p&gt;</comment>
                            <comment id="64335" author="hongchao.zhang" created="Thu, 15 Aug 2013 11:04:16 +0000"  >&lt;p&gt;Hi Kit,&lt;/p&gt;

&lt;p&gt;Having only one OST isn&apos;t a problem; the MDT/clients will wait for it to come back. In my local test (two VM nodes), both OSTs were unmounted and mounted again without this error.&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;</comment>
                            <comment id="65777" author="kitwestneat" created="Wed, 4 Sep 2013 21:02:52 +0000"  >&lt;p&gt;Hi Hongchao,&lt;/p&gt;

&lt;p&gt;Here are the results of the debug patch:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Sep  4 14:34:48 mds2 kernel: : Lustre: 4591:0:(client.c:1817:ptlrpc_expire_one_request()) @@@ Request  sent has timed out &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; slow reply: [sent 1378298081/real 1378298081]  req@ffff8808173ce000 x1445232561408895/t0(0) o400-&amp;gt;pfscdat2-OST0000-osc-MDT0000@172.26.17.4@o2ib:28/4 lens 192/192 e 0 to 1 dl 1378298088 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1
Sep  4 14:34:48 mds2 kernel: : Lustre: pfscdat2-OST0000-osc-MDT0000: Connection to pfscdat2-OST0000 (at 172.26.17.4@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Sep  4 14:34:54 mds2 kernel: : Lustre: 4592:0:(client.c:1817:ptlrpc_expire_one_request()) @@@ Request  sent has timed out &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; slow reply: [sent 1378298088/real 1378298088]  req@ffff880825dca000 x1445232561408896/t0(0) o8-&amp;gt;pfscdat2-OST0000-osc-MDT0000@172.26.17.4@o2ib:28/4 lens 368/512 e 0 to 1 dl 1378298094 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1
Sep  4 14:35:27 mds2 kernel: : Lustre: pfscdat2-OST0000-osc-MDT0000: Connection restored to pfscdat2-OST0000 (at 172.26.17.3@o2ib)
Sep  4 14:35:27 mds2 kernel: : Lustre: MDS mdd_obd-pfscdat2-MDT0000: pfscdat2-OST0000_UUID now active, resetting orphans
Sep  4 14:40:00 mds2 kernel: : LustreError: 11-0: an error occurred &lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; communicating with 172.26.17.3@o2ib. The ost_setattr operation failed with -19
Sep  4 14:40:00 mds2 kernel: : Lustre: pfscdat2-OST0000-osc-MDT0000: Connection to pfscdat2-OST0000 (at 172.26.17.3@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Sep  4 14:40:00 mds2 kernel: : LustreError: 11-0: an error occurred &lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; communicating with 172.26.17.3@o2ib. The ost_connect operation failed with -19
Sep  4 14:40:51 mds2 kernel: : LustreError: 4591:0:(lov_request.c:739:cb_create_update()) cb_create_update failed -11
Sep  4 14:40:51 mds2 kernel: : LustreError: 4591:0:(lov_request.c:579:lov_update_create_set()) error creating fid 0x9a8/0x0 sub-object on OST idx 0/1: rc = -11
Sep  4 14:40:51 mds2 kernel: : LustreError: 15954:0:(lov_request.c:640:create_done()) qos_remedy_create failed -5
Sep  4 14:40:51 mds2 kernel: : LustreError: 15954:0:(lov_request.c:579:lov_update_create_set()) error creating fid 0x9a8/0x0 sub-object on OST idx 0/1: rc = -5
Sep  4 14:40:51 mds2 kernel: : Lustre: pfscdat2-OST0000-osc-MDT0000: Connection restored to pfscdat2-OST0000 (at 172.26.17.4@o2ib)
Sep  4 14:40:51 mds2 kernel: : Lustre: MDS mdd_obd-pfscdat2-MDT0000: pfscdat2-OST0000_UUID now active, resetting orphans
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="65818" author="hongchao.zhang" created="Thu, 5 Sep 2013 09:58:18 +0000"  >&lt;p&gt;Hi Kit,&lt;/p&gt;

&lt;p&gt;Thanks for your effort to test the debug patch!&lt;/p&gt;


&lt;p&gt;This issue is caused by the failed precreate request (rc = -107/-ENOTCONN), which had just been sent to precreate objects and triggered the recovery.&lt;br/&gt;
The fake request queued by the OSC to wait for the objects to be created is then woken up, and lov_create/osc_create_async fails.&lt;/p&gt;

&lt;p&gt;step 1, the OSC found there were not enough objects, so it called osc_internal_create to send a request to precreate objects&lt;br/&gt;
step 2, one of the following osc_create_async calls found the objects were used up and queued a fake request to wait for the objects to be created&lt;br/&gt;
step 3, the request sent in step 1 triggered the recovery with a recoverable error (-ENOTCONN or -ENODEV), and the fake request queued in step 2 was woken up&lt;br/&gt;
        and the osc_create_async failed.&lt;/p&gt;

&lt;p&gt;Normally, the failure of the request precreating objects won&apos;t affect the OSC, because there should be enough objects available during the processing interval of&lt;br/&gt;
the precreate request (which normally does not take much time, except on a bad network). In this case, the precreated objects were used up before the reply to the precreate&lt;br/&gt;
request was received!&lt;/p&gt;

&lt;p&gt;the patch will be attached soon!&lt;/p&gt;</comment>
                            <comment id="65822" author="hongchao.zhang" created="Thu, 5 Sep 2013 11:07:03 +0000"  >&lt;p&gt;the patch is tracked at &lt;a href=&quot;http://review.whamcloud.com/#/c/7559/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/7559/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="65879" author="kitwestneat" created="Thu, 5 Sep 2013 19:49:03 +0000"  >&lt;p&gt;Hi Hongchao,&lt;/p&gt;

&lt;p&gt;Is this an MDT only patch, or is it for the clients too?&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
Kit&lt;/p&gt;</comment>
                            <comment id="65900" author="hongchao.zhang" created="Fri, 6 Sep 2013 02:45:19 +0000"  >&lt;p&gt;Hi Kit&lt;/p&gt;

&lt;p&gt;the patch is for MDT only.&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;</comment>
                            <comment id="66594" author="kitwestneat" created="Fri, 13 Sep 2013 14:30:00 +0000"  >&lt;p&gt;Hi Hongchao,&lt;/p&gt;

&lt;p&gt;We&apos;ve run into a problem with this patch. After upgrading the MDT, we are starting to get data corruption on the system. Here is a description of the issue:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;Another admin has been working on the test file system this morning.&lt;br/&gt;
First, she could execute normal commands on the file system. Then&lt;br/&gt;
normal commands (like vi) added 112 MB rubbish to files. Even a touch&lt;br/&gt;
on a non-existing file created a file of 112 MB.&lt;/p&gt;

&lt;p&gt;Another admin logged in and his .Xauthority was increased. He looked&lt;br/&gt;
at the binary data and it seemed like the additional data came from a&lt;br/&gt;
software package which is installed on the same Lustre file system,&lt;br/&gt;
i.e. the rubbish seems to be no arbitrary data but seems to come from&lt;br/&gt;
another location.&lt;/p&gt;

&lt;p&gt;root@iccn999:/software/all/tsm/sbin# touch gaga1&lt;br/&gt;
Wed Sep 11-14:38:21 (14/1012) -  ACTIVE&lt;br/&gt;
root@iccn999:/software/all/tsm/sbin# ls -l gaga1&lt;br/&gt;
-rw-r--r-- 1 root root 116430464 Sep 11 14:38 gaga1&lt;/p&gt;

&lt;p&gt;On another client the behaviour is different:&lt;br/&gt;
root@iccn996:/software/all/tsm/sbin# touch gaga2&lt;br/&gt;
touch: setting times of `gaga2&apos;: No such file or directory&lt;br/&gt;
Wed Sep 11-14:39:15 (5/41)&lt;br/&gt;
root@iccn996:/software/all/tsm/sbin# ls -l gaga2&lt;br/&gt;
-rw-r--r-- 1 root root 0 Sep 11 14:39 gaga2&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;I will upload the lctl dk logs (with vfstrace and rpctrace). Is there any other information we should get?&lt;/p&gt;</comment>
                            <comment id="66595" author="kitwestneat" created="Fri, 13 Sep 2013 14:31:11 +0000"  >&lt;p&gt;I should add that we didn&apos;t see the problem any more after downgrading to stock 2.1.5. We did however see the issue at both 2.1.6+patch and 2.1.5+patch.&lt;/p&gt;</comment>
                            <comment id="66714" author="hongchao.zhang" created="Mon, 16 Sep 2013 11:10:41 +0000"  >&lt;p&gt;sorry, there is an error in the previous patch, and it has been updated (&lt;a href=&quot;http://review.whamcloud.com/#/c/7559/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/7559/&lt;/a&gt;)&lt;/p&gt;</comment>
                            <comment id="67732" author="kitwestneat" created="Thu, 26 Sep 2013 17:18:34 +0000"  >&lt;p&gt;Hi Hongchao,&lt;/p&gt;

&lt;p&gt;The patch fixed the data corruption, but we are still seeing the same errors on failover. Should we reproduce the issue with the original debugging patch + fix? Here are the kernel messages from the fix (patch 7559):&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Sep 25 15:16:49 pfscn2 kernel: : LustreError: 11-0: an error occurred &lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; communicating with 172.26.17.3@o2ib. The ost_setattr operation failed with -19
Sep 25 15:16:49 pfscn2 kernel: : Lustre: pfscdat2-OST0000-osc-MDT0000: Connection to pfscdat2-OST0000 (at 172.26.17.3@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Sep 25 15:17:39 pfscn2 kernel: : LustreError: 9001:0:(lov_request.c:579:lov_update_create_set()) error creating fid 0x3288 sub-object on OST idx 0/1: rc = -11
Sep 25 15:17:39 pfscn2 kernel: : LustreError: 7308:0:(lov_request.c:579:lov_update_create_set()) error creating fid 0x3288 sub-object on OST idx 0/1: rc = -5
Sep 25 15:17:40 pfscn2 kernel: : Lustre: pfscdat2-OST0000-osc-MDT0000: Connection restored to pfscdat2-OST0000 (at 172.26.17.4@o2ib)
Sep 25 15:17:40 pfscn2 kernel: : Lustre: MDS mdd_obd-pfscdat2-MDT0000: pfscdat2-OST0000_UUID now active, resetting orphans
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="67850" author="kitwestneat" created="Fri, 27 Sep 2013 17:59:20 +0000"  >&lt;p&gt;Hi Hongchao,&lt;/p&gt;

&lt;p&gt;I was trying to follow the create logic some and it&apos;s confused me some. It seems like the fake requests are handled by oscc_internal_create, but then if the oscc is recovering (due to the EAGAIN) all it does is return 0. Am I reading that right?&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
Kit&lt;/p&gt;</comment>
                            <comment id="67960" author="hongchao.zhang" created="Mon, 30 Sep 2013 16:22:28 +0000"  >&lt;p&gt;Hi&lt;br/&gt;
The fake requests are queued into osc_creator.oscc_wait_create_list in osc_create_async to wait for object creation to complete, and they are woken up&lt;br/&gt;
in osc_interpret_create with the value returned from the OST. In this case, -ENOTCONN or -ENODEV is returned and fails these fake requests, which should instead continue&lt;br/&gt;
waiting for the recovery of the OST to complete, at which point the objects will be ready.&lt;/p&gt;

&lt;p&gt;I&apos;ll create a new debug patch with the &lt;a href=&quot;http://review.whamcloud.com/#/c/7559/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/7559/&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Thanks!&lt;/p&gt;</comment>
                            <comment id="68058" author="kitwestneat" created="Tue, 1 Oct 2013 15:47:28 +0000"  >&lt;p&gt;Ah ok, that makes sense. Should handle_async_create add the RECOVERING flag to the oscc? It seems like osc_interpret_create would add it, but is it possible that it could get to oscc_internal_create without that flag? &lt;/p&gt;</comment>
                            <comment id="68233" author="kitwestneat" created="Thu, 3 Oct 2013 14:30:47 +0000"  >&lt;p&gt;Hi Hongchao,&lt;/p&gt;

&lt;p&gt;Any updates?&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
Kit&lt;/p&gt;</comment>
                            <comment id="68317" author="kitwestneat" created="Thu, 3 Oct 2013 20:49:15 +0000"  >&lt;p&gt;I tested adding the degraded flag to the oscc and it seemed to fix the problem on my test system:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt;(rc != 0 &amp;amp;&amp;amp; rc != -ENOTCONN &amp;amp;&amp;amp; rc != -ENODEV)
                GOTO(out_wake, rc);
           
+        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (rc == -ENOTCONN || rc ==  -ENODEV)
+                oscc-&amp;gt;oscc_flags |= OSCC_FLAG_DEGRADED;

        /* Handle the critical type errors first.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I ran into one issue where the OST ran out of inodes and it caused the lov_create threads on the MDT to deadlock. I&apos;m not sure if that&apos;s a new behavior or in stock 2.1.6 as well. I&apos;ll investigate. &lt;/p&gt;</comment>
                            <comment id="68575" author="hongchao.zhang" created="Tue, 8 Oct 2013 14:35:27 +0000"  >&lt;p&gt;Hi Kit,&lt;/p&gt;

&lt;p&gt;Sorry for the delayed response; we were on holiday for the last several days.&lt;/p&gt;

&lt;p&gt;If "OSCC_FLAG_DEGRADED" is set, then this OST won&apos;t be used preferentially when creating objects. Did you only test the failover on one OSS?&lt;/p&gt;

&lt;p&gt;BTW, is there something like "Unknown rc XXX from async create: failing oscc" in the kernel logs?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;</comment>
                            <comment id="68671" author="kitwestneat" created="Wed, 9 Oct 2013 16:41:20 +0000"  >&lt;p&gt;The fix that worked on my test system didn&apos;t work on the customer system, so it might have been a fluke. &lt;/p&gt;

&lt;p&gt;I haven&apos;t seen that message in the new logs. Here is the latest server dk&apos;s:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;00000100:00100000:14.0:1381243911.199943:0:4598:0:(&lt;span class=&quot;code-keyword&quot;&gt;import&lt;/span&gt;.c:725:ptlrpc_connect_import()) @@@ (re)connect request (timeout 5)  req@ffff8808181fb400 x1448249214211642/t0(
0) o8-&amp;gt;pfscdat2-OST0000-osc-MDT0000@172.26.17.3@o2ib:28/4 lens 368/512 e 0 to 0 dl 0 ref 1 fl New:N/0/ffffffff rc 0/-1
00000100:00100000:13.0:1381243911.212619:0:4596:0:(client.c:1773:ptlrpc_check_set()) Completed RPC pname:cluuid:pid:xid:nid:opc ptlrpcd:pfscdat2-MDT0000-mdtlov_UUID:-1:
1448249214211637:172.26.17.3@o2ib:-1
00020000:00020000:13.0:1381243911.212624:0:4596:0:(lov_request.c:579:lov_update_create_set()) error creating fid 0x10f5a sub-object on OST idx 0/1: rc = -11
00000100:00100000:13.0:1381243911.225308:0:4596:0:(client.c:1773:ptlrpc_check_set()) Completed RPC pname:cluuid:pid:xid:nid:opc ptlrpcd:pfscdat2-MDT0000-mdtlov_UUID:-1:
1448249214211638:172.26.17.3@o2ib:-1
00020000:00020000:13.0:1381243911.225312:0:4596:0:(lov_request.c:579:lov_update_create_set()) error creating fid 0x18c7 sub-object on OST idx 0/1: rc = -11
00000100:00100000:13.0:1381243911.250579:0:4596:0:(client.c:1773:ptlrpc_check_set()) Completed RPC pname:cluuid:pid:xid:nid:opc ptlrpcd:pfscdat2-MDT0000-mdtlov_UUID:-1:
1448249214211639:172.26.17.3@o2ib:-1
00020000:00020000:6.0:1381243911.250579:0:29944:0:(lov_request.c:579:lov_update_create_set()) error creating fid 0x10f5a sub-object on OST idx 0/1: rc = -5
00000100:00100000:13.0:1381243911.250586:0:4596:0:(client.c:1434:ptlrpc_send_new_req()) Sending RPC pname:cluuid:pid:xid:nid:opc ptlrpcd:e76ce10d-1d27-cdc4-1091-649094c70331:4596:1448249214211641:172.26.17.1@o2ib:400
00020000:00020000:5.0:1381243911.250612:0:4755:0:(lov_request.c:579:lov_update_create_set()) error creating fid 0x18c7 sub-object on OST idx 0/1: rc = -5
00000100:00100000:13.0:1381243911.250638:0:4596:0:(client.c:1773:ptlrpc_check_set()) Completed RPC pname:cluuid:pid:xid:nid:opc ptlrpcd:pfscdat2-MDT0000-mdtlov_UUID:0:1448249214211644:172.26.17.3@o2ib:13
00000100:00100000:5.0:1381243911.250741:0:4755:0:(service.c:1771:ptlrpc_server_handle_request()) Handled RPC pname:cluuid+ref:pid:xid:nid:opc mdt_02:94b08aa2-54d7-b32e-019d-2561ed3286b5+6:58211:x1447047461344560:12345-172.26.4.3@o2ib:101 Request procesed in 49643595us (49643620us total) trans 0 rc 301/301
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="68735" author="hongchao.zhang" created="Thu, 10 Oct 2013 10:43:30 +0000"  >&lt;p&gt;Hi Kit,&lt;/p&gt;

&lt;p&gt;could you please test with the following debug patch&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;diff --git a/lustre/lov/lov_request.c b/lustre/lov/lov_request.c
index 0fc635e..f0476cb 100644
--- a/lustre/lov/lov_request.c
+++ b/lustre/lov/lov_request.c
@@ -577,6 +577,14 @@ &lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; lov_update_create_set(struct lov_request_set *se
                        &lt;span class=&quot;code-quote&quot;&gt;&quot;on OST idx %d/%d: rc = %d\n&quot;&lt;/span&gt;,
                        set-&amp;gt;set_oi-&amp;gt;oi_oa-&amp;gt;o_id, req-&amp;gt;rq_idx,
                        lsm-&amp;gt;lsm_stripe_count, rc);
+
+#ifdef __KERNEL__
+               &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (rc == -EAGAIN) {
+                       CDEBUG(D_ERROR, &lt;span class=&quot;code-quote&quot;&gt;&quot;shouldn&apos;t encounter -EAGAIN!\n&quot;&lt;/span&gt;);
+                       dump_stack();
+               }
+#endif
+
                 &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (rc &amp;gt; 0) {
                         CERROR(&lt;span class=&quot;code-quote&quot;&gt;&quot;obd_create returned invalid err %d\n&quot;&lt;/span&gt;, rc);
                         rc = -EIO;
diff --git a/lustre/osc/osc_create.c b/lustre/osc/osc_create.c
index ffc81d4..22265bd 100644
--- a/lustre/osc/osc_create.c
+++ b/lustre/osc/osc_create.c
@@ -412,7 +412,7 @@ &lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; handle_async_create(struct ptlrpc_request *req, i
 
         LASSERT_SPIN_LOCKED(&amp;amp;oscc-&amp;gt;oscc_lock);
 
-        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt;(rc)
+        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt;(rc != 0 &amp;amp;&amp;amp; rc != -ENOTCONN &amp;amp;&amp;amp; rc != -ENODEV)
                 GOTO(out_wake, rc);
 
         /* Handle the critical type errors first.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="69637" author="kitwestneat" created="Wed, 23 Oct 2013 15:02:47 +0000"  >&lt;p&gt;Hi Hongchao,&lt;/p&gt;

&lt;p&gt;Sorry for the delay in getting the output:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Oct 23 10:47:48 pfscn2 kernel: : Lustre: pfscdat2-OST0000-osc-MDT0000: Connection to pfscdat2-OST0000 (at 172.26.17.4@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; 
service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Oct 23 10:48:39 pfscn2 kernel: : LustreError: 9651:0:(lov_request.c:579:lov_update_create_set()) error creating fid 0x10949 sub-object on OST idx 0/1: rc = -11
Oct 23 10:48:39 pfscn2 kernel: : LustreError: 9651:0:(lov_request.c:583:lov_update_create_set()) shouldn&apos;t encounter -EAGAIN!
Oct 23 10:48:39 pfscn2 kernel: : Pid: 9651, comm: ptlrpcd Not tainted 2.6.32-358.11.1.el6_lustre.es124.x86_64 #1
Oct 23 10:48:39 pfscn2 kernel: : Call Trace:
Oct 23 10:48:39 pfscn2 kernel: : [&amp;lt;ffffffffa0ad3505&amp;gt;] ? lov_update_create_set+0x4f5/0x500 [lov]
Oct 23 10:48:39 pfscn2 kernel: : [&amp;lt;ffffffffa0ad78cd&amp;gt;] ? cb_create_update+0x2d/0x100 [lov]
Oct 23 10:48:39 pfscn2 kernel: : [&amp;lt;ffffffffa0a6f28b&amp;gt;] ? handle_async_create+0x7b/0x390 [osc]
Oct 23 10:48:39 pfscn2 kernel: : [&amp;lt;ffffffffa0a6fad9&amp;gt;] ? async_create_interpret+0x39/0x50 [osc]
Oct 23 10:48:39 pfscn2 kernel: : [&amp;lt;ffffffffa0811c9b&amp;gt;] ? ptlrpc_check_set+0x29b/0x1b00 [ptlrpc]
Oct 23 10:48:39 pfscn2 kernel: : [&amp;lt;ffffffff8150ecda&amp;gt;] ? schedule_timeout+0x19a/0x2e0
Oct 23 10:48:39 pfscn2 kernel: : [&amp;lt;ffffffffa0842f20&amp;gt;] ? ptlrpcd_check+0x1a0/0x230 [ptlrpc]
Oct 23 10:48:39 pfscn2 kernel: : [&amp;lt;ffffffffa08431be&amp;gt;] ? ptlrpcd+0x20e/0x370 [ptlrpc]
Oct 23 10:48:39 pfscn2 kernel: : [&amp;lt;ffffffff81063310&amp;gt;] ? default_wake_function+0x0/0x20
Oct 23 10:48:39 pfscn2 kernel: : [&amp;lt;ffffffffa0842fb0&amp;gt;] ? ptlrpcd+0x0/0x370 [ptlrpc]
Oct 23 10:48:39 pfscn2 kernel: : [&amp;lt;ffffffff8100c0ca&amp;gt;] ? child_rip+0xa/0x20
Oct 23 10:48:39 pfscn2 kernel: : [&amp;lt;ffffffffa0842fb0&amp;gt;] ? ptlrpcd+0x0/0x370 [ptlrpc]
Oct 23 10:48:39 pfscn2 kernel: : [&amp;lt;ffffffffa0842fb0&amp;gt;] ? ptlrpcd+0x0/0x370 [ptlrpc]
Oct 23 10:48:39 pfscn2 kernel: : [&amp;lt;ffffffff8100c0c0&amp;gt;] ? child_rip+0x0/0x20
Oct 23 10:48:39 pfscn2 kernel: : LustreError: 10250:0:(lov_request.c:579:lov_update_create_set()) error creating fid 0x10949 sub-object on OST idx 0/1: rc = -5
Oct 23 10:49:21 pfscn2 kernel: : Lustre: pfscdat2-OST0000-osc-MDT0000: Connection restored to pfscdat2-OST0000 (at 172.26.17.3@o2ib)
Oct 23 10:49:21 pfscn2 kernel: : Lustre: MDS mdd_obd-pfscdat2-MDT0000: pfscdat2-OST0000_UUID now active, resetting orphans
Oct 23 10:53:38 pfscn2 kernel: : LustreError: 11-0: an error occurred &lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; communicating with 172.26.17.3@o2ib. The ost_setattr operation failed with -19
Oct 23 10:53:38 pfscn2 kernel: : LustreError: Skipped 1 previous similar message
Oct 23 10:53:38 pfscn2 kernel: : Lustre: pfscdat2-OST0000-osc-MDT0000: Connection to pfscdat2-OST0000 (at 172.26.17.3@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Oct 23 10:54:09 pfscn2 kernel: : LustreError: 11-0: an error occurred &lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; communicating with 172.26.17.4@o2ib. The ost_connect operation failed with -19
Oct 23 10:54:09 pfscn2 kernel: : LustreError: Skipped 1 previous similar message
Oct 23 10:54:16 pfscn2 kernel: : Lustre: 10169:0:(ldlm_lib.c:952:target_handle_connect()) MGS: connection from 6ede50b8-462e-c607-2c6d-f747541fabbd@172.26.17.4@o2ib t0 exp (&lt;span class=&quot;code-keyword&quot;&gt;null&lt;/span&gt;) cur 1382518456 last 0
Oct 23 10:54:28 pfscn2 kernel: : LustreError: 9651:0:(lov_request.c:579:lov_update_create_set()) error creating fid 0xd948 sub-object on OST idx 0/1: rc = -11
Oct 23 10:54:28 pfscn2 kernel: : LustreError: 9651:0:(lov_request.c:583:lov_update_create_set()) shouldn&apos;t encounter -EAGAIN!
Oct 23 10:54:28 pfscn2 kernel: : Pid: 9651, comm: ptlrpcd Not tainted 2.6.32-358.11.1.el6_lustre.es124.x86_64 #1
Oct 23 10:54:28 pfscn2 kernel: : Call Trace:
Oct 23 10:54:28 pfscn2 kernel: : [&amp;lt;ffffffffa0ad3505&amp;gt;] ? lov_update_create_set+0x4f5/0x500 [lov]
Oct 23 10:54:28 pfscn2 kernel: : [&amp;lt;ffffffffa0ad78cd&amp;gt;] ? cb_create_update+0x2d/0x100 [lov]
Oct 23 10:54:28 pfscn2 kernel: : [&amp;lt;ffffffffa0a6f28b&amp;gt;] ? handle_async_create+0x7b/0x390 [osc]
Oct 23 10:54:28 pfscn2 kernel: : [&amp;lt;ffffffffa0a6fad9&amp;gt;] ? async_create_interpret+0x39/0x50 [osc]
Oct 23 10:54:28 pfscn2 kernel: : [&amp;lt;ffffffffa0811c9b&amp;gt;] ? ptlrpc_check_set+0x29b/0x1b00 [ptlrpc]
Oct 23 10:54:28 pfscn2 kernel: : [&amp;lt;ffffffff8150ecda&amp;gt;] ? schedule_timeout+0x19a/0x2e0
Oct 23 10:54:28 pfscn2 kernel: : [&amp;lt;ffffffffa0842f20&amp;gt;] ? ptlrpcd_check+0x1a0/0x230 [ptlrpc]
Oct 23 10:54:28 pfscn2 kernel: : [&amp;lt;ffffffffa08431be&amp;gt;] ? ptlrpcd+0x20e/0x370 [ptlrpc]
Oct 23 10:54:28 pfscn2 kernel: : [&amp;lt;ffffffff81063310&amp;gt;] ? default_wake_function+0x0/0x20
Oct 23 10:54:28 pfscn2 kernel: : [&amp;lt;ffffffffa0842fb0&amp;gt;] ? ptlrpcd+0x0/0x370 [ptlrpc]
Oct 23 10:54:28 pfscn2 kernel: : [&amp;lt;ffffffff8100c0ca&amp;gt;] ? child_rip+0xa/0x20
Oct 23 10:54:28 pfscn2 kernel: : [&amp;lt;ffffffffa0842fb0&amp;gt;] ? ptlrpcd+0x0/0x370 [ptlrpc]
Oct 23 10:54:28 pfscn2 kernel: : [&amp;lt;ffffffffa0842fb0&amp;gt;] ? ptlrpcd+0x0/0x370 [ptlrpc]
Oct 23 10:54:28 pfscn2 kernel: : [&amp;lt;ffffffff8100c0c0&amp;gt;] ? child_rip+0x0/0x20
Oct 23 10:54:28 pfscn2 kernel: : LustreError: 24178:0:(lov_request.c:579:lov_update_create_set()) error creating fid 0xd948 sub-object on OST idx 0/1: rc = -5
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="69675" author="kitwestneat" created="Wed, 23 Oct 2013 20:34:51 +0000"  >&lt;p&gt;should it actually be:&lt;br/&gt;
+        if(rc != 0 &amp;amp;&amp;amp; rc != -ENOTCONN &amp;amp;&amp;amp; rc != -ENODEV &lt;b&gt;&amp;amp;&amp;amp; rc != -EAGAIN&lt;/b&gt;)&lt;/p&gt;

&lt;p&gt;I built a version with that just to test. I didn&apos;t get any IO errors, but I did get:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Oct 23 22:27:56 pfscn2 kernel: : LustreError: 11-0: an error occurred &lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; communicating with 172.26.17.4@o2ib. The ost_create operation failed with -19
Oct 23 22:27:56 pfscn2 kernel: : Lustre: pfscdat2-OST0000-osc-MDT0000: Connection to pfscdat2-OST0000 (at 172.26.17.4@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Oct 23 22:27:56 pfscn2 kernel: : LustreError: 9255:0:(osc_create.c:175:osc_interpret_create()) @@@ Unknown rc -19 from async create: failing oscc  req@ffff88081f6c3800 x1449716075693976/t0(0) o5-&amp;gt;pfscdat2-OST0000-osc-MDT0000@172.26.17.4@o2ib:7/4 lens 400/400 e 0 to 0 dl 1382560083 ref 1 fl Interpret:RN/0/0 rc -19/-19
Oct 23 22:28:53 pfscn2 kernel: : Lustre: pfscdat2-OST0000-osc-MDT0000: Connection restored to pfscdat2-OST0000 (at 172.26.17.3@o2ib)
Oct 23 22:28:53 pfscn2 kernel: : Lustre: MDS mdd_obd-pfscdat2-MDT0000: pfscdat2-OST0000_UUID now active, resetting orphans
Oct 23 22:28:53 pfscn2 kernel: : Lustre: Skipped 1 previous similar message
Oct 23 22:31:16 pfscn2 kernel: : Lustre: Service thread pid 10987 was inactive &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 200.00s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; debugging purposes:
Oct 23 22:31:16 pfscn2 kernel: : Pid: 10987, comm: mdt_13
Oct 23 22:31:16 pfscn2 kernel: :
Oct 23 22:31:16 pfscn2 kernel: : Call Trace:
Oct 23 22:31:16 pfscn2 kernel: : [&amp;lt;ffffffffa04de923&amp;gt;] ? cfs_alloc+0x63/0x90 [libcfs]
Oct 23 22:31:16 pfscn2 kernel: : [&amp;lt;ffffffffa0808a45&amp;gt;] ? ptlrpc_next_xid+0x15/0x40 [ptlrpc]
Oct 23 22:31:16 pfscn2 kernel: : [&amp;lt;ffffffffa04de60e&amp;gt;] cfs_waitq_wait+0xe/0x10 [libcfs]
Oct 23 22:31:16 pfscn2 kernel: : [&amp;lt;ffffffffa0ac650a&amp;gt;] lov_create+0xbaa/0x1400 [lov]
Oct 23 22:31:16 pfscn2 kernel: : [&amp;lt;ffffffffa0d5c0d6&amp;gt;] ? mdd_get_md+0x96/0x2f0 [mdd]
Oct 23 22:31:16 pfscn2 kernel: : [&amp;lt;ffffffff81063310&amp;gt;] ? default_wake_function+0x0/0x20
Oct 23 22:31:16 pfscn2 kernel: : [&amp;lt;ffffffffa0d7c916&amp;gt;] ? mdd_read_unlock+0x26/0x30 [mdd]
Oct 23 22:31:16 pfscn2 kernel: : [&amp;lt;ffffffffa0d6090e&amp;gt;] mdd_lov_create+0x9ee/0x1ba0 [mdd]
Oct 23 22:31:16 pfscn2 kernel: : [&amp;lt;ffffffffa0d72871&amp;gt;] mdd_create+0xf81/0x1a90 [mdd]
Oct 23 22:31:16 pfscn2 kernel: : [&amp;lt;ffffffffa0e72414&amp;gt;] ? osd_object_init+0xe4/0x420 [osd_ldiskfs]
Oct 23 22:31:16 pfscn2 kernel: : [&amp;lt;ffffffffa0eaa3f7&amp;gt;] cml_create+0x97/0x250 [cmm]
Oct 23 22:31:16 pfscn2 kernel: : [&amp;lt;ffffffffa0de55b1&amp;gt;] ? mdt_version_get_save+0x91/0xd0 [mdt]
Oct 23 22:31:16 pfscn2 kernel: : [&amp;lt;ffffffffa0dfb156&amp;gt;] mdt_reint_open+0x1aa6/0x2940 [mdt]
Oct 23 22:31:16 pfscn2 kernel: : [&amp;lt;ffffffffa081d754&amp;gt;] ? lustre_msg_add_version+0x74/0xd0 [ptlrpc]
Oct 23 22:31:16 pfscn2 kernel: : [&amp;lt;ffffffffa0d7556e&amp;gt;] ? md_ucred+0x1e/0x60 [mdd]
Oct 23 22:31:16 pfscn2 kernel: : [&amp;lt;ffffffffa0de3c51&amp;gt;] mdt_reint_rec+0x41/0xe0 [mdt]
Oct 23 22:31:16 pfscn2 kernel: : [&amp;lt;ffffffffa0ddaed4&amp;gt;] mdt_reint_internal+0x544/0x8e0 [mdt]
Oct 23 22:31:16 pfscn2 kernel: : [&amp;lt;ffffffffa0ddb53d&amp;gt;] mdt_intent_reint+0x1ed/0x500 [mdt]
Oct 23 22:31:16 pfscn2 kernel: : [&amp;lt;ffffffffa0dd9c09&amp;gt;] mdt_intent_policy+0x379/0x690 [mdt]
Oct 23 22:31:16 pfscn2 kernel: : [&amp;lt;ffffffffa07d9391&amp;gt;] ldlm_lock_enqueue+0x361/0x8f0 [ptlrpc]
Oct 23 22:31:16 pfscn2 kernel: : [&amp;lt;ffffffffa07ff1dd&amp;gt;] ldlm_handle_enqueue0+0x48d/0xf50 [ptlrpc]
Oct 23 22:31:16 pfscn2 kernel: : [&amp;lt;ffffffffa0dda586&amp;gt;] mdt_enqueue+0x46/0x130 [mdt]
Oct 23 22:31:16 pfscn2 kernel: : [&amp;lt;ffffffffa0dcf772&amp;gt;] mdt_handle_common+0x932/0x1750 [mdt]
Oct 23 22:31:16 pfscn2 kernel: : [&amp;lt;ffffffffa0dd0665&amp;gt;] mdt_regular_handle+0x15/0x20 [mdt]
Oct 23 22:31:16 pfscn2 kernel: : [&amp;lt;ffffffffa082db9e&amp;gt;] ptlrpc_main+0xc4e/0x1a40 [ptlrpc]
Oct 23 22:31:16 pfscn2 kernel: : [&amp;lt;ffffffffa082cf50&amp;gt;] ? ptlrpc_main+0x0/0x1a40 [ptlrpc]
Oct 23 22:31:16 pfscn2 kernel: : [&amp;lt;ffffffff8100c0ca&amp;gt;] child_rip+0xa/0x20
Oct 23 22:31:16 pfscn2 kernel: : [&amp;lt;ffffffffa082cf50&amp;gt;] ? ptlrpc_main+0x0/0x1a40 [ptlrpc]
Oct 23 22:31:16 pfscn2 kernel: : [&amp;lt;ffffffffa082cf50&amp;gt;] ? ptlrpc_main+0x0/0x1a40 [ptlrpc]
Oct 23 22:31:16 pfscn2 kernel: : [&amp;lt;ffffffff8100c0c0&amp;gt;] ? child_rip+0x0/0x20
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="69679" author="kitwestneat" created="Wed, 23 Oct 2013 21:05:47 +0000"  >&lt;p&gt;the decoded lustre log dump&lt;/p&gt;</comment>
                            <comment id="69992" author="hongchao.zhang" created="Mon, 28 Oct 2013 05:16:19 +0000"  >&lt;p&gt;Hi Kit,&lt;/p&gt;

&lt;p&gt;Did you apply any patches on top of the original 2.1.5? According to the logs, the error &quot;-EAGAIN&quot; originates from ptlrpc_check_set, and it should be &quot;ptlrpc_import_delay_req&quot;&lt;br/&gt;
that sets this error when the fake request (queued by osc_create_async) times out, but no &quot;-EAGAIN&quot; is found in that function.&lt;/p&gt;

&lt;p&gt;but the logs show the cause of the issue (the fake request failed and was not restarted if the OSC was recovering); I will try to create the updated patch.&lt;/p&gt;

&lt;p&gt;Thanks!&lt;/p&gt;
</comment>
                            <comment id="70393" author="kitwestneat" created="Thu, 31 Oct 2013 16:27:14 +0000"  >&lt;p&gt;I think it was stock 2.1.5, I will see if I can find any patches.&lt;/p&gt;

&lt;p&gt;Any updates on creating the patch? The customer is getting impatient. When can I tell them to expect the fix?&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
Kit&lt;/p&gt;</comment>
                            <comment id="70458" author="hongchao.zhang" created="Fri, 1 Nov 2013 02:26:07 +0000"  >&lt;p&gt;the patch is under test and will be attached soon&lt;/p&gt;</comment>
                            <comment id="70469" author="hongchao.zhang" created="Fri, 1 Nov 2013 11:26:12 +0000"  >&lt;p&gt;the new patch is tracked at &lt;a href=&quot;http://review.whamcloud.com/#/c/7559/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/7559/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="70703" author="kitwestneat" created="Tue, 5 Nov 2013 12:14:50 +0000"  >&lt;p&gt;Hi Hongchao,&lt;/p&gt;

&lt;p&gt;Still getting errors:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Nov  4 21:33:56 pfscn2 kernel: : LustreError: 11-0: an error occurred &lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; communicating with 172.26.17.4@o2ib. The ost_create operation failed with -107
Nov  4 21:33:56 pfscn2 kernel: : Lustre: pfscdat2-OST0000-osc-MDT0000: Connection to pfscdat2-OST0000 (at 172.26.17.4@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Nov  4 21:33:56 pfscn2 kernel: : LustreError: 9244:0:(layout.c:1659:__req_capsule_get()) @@@ Wrong buffer &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; field `ost_body&lt;span class=&quot;code-quote&quot;&gt;&apos; (1 of 1) in format `OST_CREATE&apos;&lt;/span&gt;: 0 vs. 208 (server)
Nov  4 21:33:56 pfscn2 kernel: : req@ffff8808092fa000 x1450802648479850/t0(0) o5-&amp;gt;pfscdat2-OST0000-osc-MDT0000@172.26.17.4@o2ib:7/4 lens 400/192 e 0 to 0 dl 1383597243 ref 1 fl Interpret:RN/0/0 rc -107/-107
Nov  4 21:33:56 pfscn2 kernel: : LustreError: 11-0: an error occurred &lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; communicating with 172.26.17.4@o2ib. The ost_connect operation failed with -19
Nov  4 21:33:56 pfscn2 kernel: : LustreError: 9244:0:(osc_create.c:177:osc_interpret_create()) @@@ Unknown rc -107 from async create: failing oscc  req@ffff8808092fa000 x1450802648479850/t0(0) o5-&amp;gt;pfscdat2-OST0000-osc-MDT0000@172.26.17.4@o2ib:7/4 lens 400/192 e 0 to 0 dl 1383597243 ref 1 fl Interpret:RN/0/0 rc -107/-107
Nov  4 21:33:56 pfscn2 kernel: : LustreError: 12037:0:(lov_request.c:579:lov_update_create_set()) error creating fid 0x1377d sub-object on OST idx 0/1: rc = -5
Nov  4 21:34:48 pfscn2 kernel: : Lustre: pfscdat2-OST0000-osc-MDT0000: Connection restored to pfscdat2-OST0000 (at 172.26.17.3@o2ib)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Are you able to reproduce the error? Basically we are doing it by having 8 clients writing small files, then failing over an OSS. &lt;/p&gt;</comment>
                            <comment id="70729" author="kitwestneat" created="Tue, 5 Nov 2013 15:27:42 +0000"  >&lt;p&gt;Do you think that patchset 4 would fix this issue? It actually seems as if we are hitting 2 separate issues depending on where in the code path the OSS fails. &lt;/p&gt;</comment>
                            <comment id="70767" author="kitwestneat" created="Tue, 5 Nov 2013 19:28:05 +0000"  >&lt;p&gt;With both patchset 4 and 5, it seems to solve the issue:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;LustreError: 11-0: an error occurred &lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; communicating with 172.26.17.4@o2ib. The ost_create operation failed with -107
Lustre: pfscdat2-OST0000-osc-MDT0000: Connection to pfscdat2-OST0000 (at 172.26.17.4@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
LustreError: 9266:0:(layout.c:1659:__req_capsule_get()) @@@ Wrong buffer &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; field `ost_body&lt;span class=&quot;code-quote&quot;&gt;&apos; (1 of 1) in format `OST_CREATE&apos;&lt;/span&gt;: 0 vs. 208 (server)
  req@ffff880808005800 x1450889175112924/t0(0) o5-&amp;gt;pfscdat2-OST0000-osc-MDT0000@172.26.17.4@o2ib:7/4 lens 400/192 e 0 to 0 dl 1383677590 ref 1 fl Interpret:RN/0/0 rc -107/-107
LustreError: 11-0: an error occurred &lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; communicating with 172.26.17.4@o2ib. The ost_connect operation failed with -19
LustreError: 9266:0:(osc_create.c:177:osc_interpret_create()) @@@ Unknown rc -107 from async create: failing oscc  req@ffff880808005800 x1450889175112924/t0(0) o5-&amp;gt;pfscdat2-OST0000-osc-MDT0000@172.26.17.4@o2ib:7/4 lens 400/192 e 0 to 0 dl 1383677590 ref 1 fl Interpret:RN/0/0 rc -107/-107
Lustre: pfscdat2-OST0000-osc-MDT0000: Connection restored to pfscdat2-OST0000 (at 172.26.17.3@o2ib)
Lustre: MDS mdd_obd-pfscdat2-MDT0000: pfscdat2-OST0000_UUID now active, resetting orphans
LustreError: 11-0: an error occurred &lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; communicating with 172.26.17.3@o2ib. The ost_statfs operation failed with -107
Lustre: pfscdat2-OST0000-osc-MDT0000: Connection to pfscdat2-OST0000 (at 172.26.17.3@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Lustre: pfscdat2-OST0000-osc-MDT0000: Connection restored to pfscdat2-OST0000 (at 172.26.17.4@o2ib)
Lustre: MDS mdd_obd-pfscdat2-MDT0000: pfscdat2-OST0000_UUID now active, resetting orphans
LustreError: 11-0: an error occurred &lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; communicating with 172.26.17.4@o2ib. The ost_statfs operation failed with -107
LustreError: Skipped 1 previous similar message
Lustre: pfscdat2-OST0000-osc-MDT0000: Connection to pfscdat2-OST0000 (at 172.26.17.4@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
LustreError: 9266:0:(osc_create.c:177:osc_interpret_create()) @@@ Unknown rc -19 from async create: failing oscc  req@ffff881001a70000 x1450889176260853/t0(0) o5-&amp;gt;pfscdat2-OST0000-osc-MDT0000@172.26.17.4@o2ib:7/4 lens 400/400 e 0 to 0 dl 1383678377 ref 1 fl Interpret:RN/0/0 rc -19/-19
Lustre: pfscdat2-OST0000-osc-MDT0000: Connection restored to pfscdat2-OST0000 (at 172.26.17.3@o2ib)
Lustre: MDS mdd_obd-pfscdat2-MDT0000: pfscdat2-OST0000_UUID now active, resetting orphans
LustreError: 11-0: an error occurred &lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; communicating with 172.26.17.3@o2ib. The obd_ping operation failed with -107
LustreError: Skipped 2 previous similar messages
Lustre: pfscdat2-OST0000-osc-MDT0000: Connection to pfscdat2-OST0000 (at 172.26.17.3@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Lustre: pfscdat2-OST0000-osc-MDT0000: Connection restored to pfscdat2-OST0000 (at 172.26.17.4@o2ib)
Lustre: MDS mdd_obd-pfscdat2-MDT0000: pfscdat2-OST0000_UUID now active, resetting orphans
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This error message looks somewhat concerning:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;LustreError: 9266:0:(layout.c:1659:__req_capsule_get()) @@@ Wrong buffer &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; field `ost_body&lt;span class=&quot;code-quote&quot;&gt;&apos; (1 of 1) in format `OST_CREATE&apos;&lt;/span&gt;: 0 vs. 208 (server)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Do you know if this should cause issues?&lt;/p&gt;</comment>
                            <comment id="70860" author="hongchao.zhang" created="Wed, 6 Nov 2013 16:16:22 +0000"  >&lt;p&gt;this error message should not cause issues, for it only detected there was no &quot;ost_body&quot; in the replied message (server side) of the failed ost_create request.&lt;br/&gt;
and 208 is just the size of &quot;ost_body&quot;&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;    &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (field-&amp;gt;rmf_flags &amp;amp; RMF_F_STRUCT_ARRAY) {
        /*
         * We&apos;ve already asserted that field-&amp;gt;rmf_size &amp;gt; 0 in
         * req_layout_init().
         */
        len = lustre_msg_buflen(msg, offset);
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; ((len % field-&amp;gt;rmf_size) != 0) {
            CERROR(&lt;span class=&quot;code-quote&quot;&gt;&quot;%s: array field size mismatch &quot;&lt;/span&gt;
                   &lt;span class=&quot;code-quote&quot;&gt;&quot;%d modulo %d != 0 (%d)\n&quot;&lt;/span&gt;,
                   field-&amp;gt;rmf_name, len, field-&amp;gt;rmf_size, loc);
            &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; NULL;
        }
    } &lt;span class=&quot;code-keyword&quot;&gt;else&lt;/span&gt; &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (pill-&amp;gt;rc_area[loc][offset] != -1) {
        len = pill-&amp;gt;rc_area[loc][offset];
    } &lt;span class=&quot;code-keyword&quot;&gt;else&lt;/span&gt; {
        len = max(field-&amp;gt;rmf_size, 0);
    }
    value = getter(msg, offset, len);

    &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (value == NULL) {
        DEBUG_REQ(D_ERROR, pill-&amp;gt;rc_req,
                  &lt;span class=&quot;code-quote&quot;&gt;&quot;Wrong buffer &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; field `%s&apos; (%d of %d) &quot;&lt;/span&gt;
                  &lt;span class=&quot;code-quote&quot;&gt;&quot;in format `%s&apos;: %d vs. %d (%s)\n&quot;&lt;/span&gt;,
                  field-&amp;gt;rmf_name, offset, lustre_msg_bufcount(msg),
                  fmt-&amp;gt;rf_name, lustre_msg_buflen(msg, offset), len,
                  rcl_names[loc]);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Yes, the two patches should be combined, and the patch has been updated.&lt;br/&gt;
Thanks&lt;/p&gt;</comment>
                            <comment id="70933" author="kitwestneat" created="Wed, 6 Nov 2013 23:25:24 +0000"  >&lt;p&gt;Would it be possible to suppress the warning message in this case? I think it is going to confuse sysadmins.&lt;/p&gt;</comment>
                            <comment id="70937" author="hongchao.zhang" created="Thu, 7 Nov 2013 00:38:33 +0000"  >&lt;p&gt;Okay, the patch has been updated to suppress this error message if the corresponding request failed.&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;</comment>
                            <comment id="70949" author="kitwestneat" created="Thu, 7 Nov 2013 03:33:04 +0000"  >&lt;p&gt;Perfect! I confirmed with the customer that this patch fixes their issue, so once this lands, I guess we can close the ticket. &lt;/p&gt;</comment>
                            <comment id="71442" author="kitwestneat" created="Wed, 13 Nov 2013 18:01:31 +0000"  >&lt;p&gt;Hi Hongchao,&lt;/p&gt;

&lt;p&gt;Do you think something like this patch is necessary on master? The osc appears to be a lot different in that version.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
Kit&lt;/p&gt;</comment>
                            <comment id="71722" author="hongchao.zhang" created="Sat, 16 Nov 2013 15:41:34 +0000"  >&lt;p&gt;Hi Kit,&lt;/p&gt;

&lt;p&gt;master already waits for object precreation in &quot;osp_declare_object_create&quot;/&quot;osp_precreate_reserve&quot;/&quot;osp_precreate_ready_condition&quot; in the recovering case&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;&lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; osp_precreate_ready_condition(&lt;span class=&quot;code-keyword&quot;&gt;const&lt;/span&gt; struct lu_env *env,
                                         struct osp_device *d)
{
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (d-&amp;gt;opd_pre_recovering)
                &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; 0;

        &lt;span class=&quot;code-comment&quot;&gt;/* ready &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; got enough precreated objects */&lt;/span&gt;
        &lt;span class=&quot;code-comment&quot;&gt;/* we need to wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; others (opd_pre_reserved) and our object (+1) */&lt;/span&gt;
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (d-&amp;gt;opd_pre_reserved + 1 &amp;lt; osp_objs_precreated(env, d))
                &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; 1;

        &lt;span class=&quot;code-comment&quot;&gt;/* ready &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; OST reported no space and no destroys in progress */&lt;/span&gt;
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (d-&amp;gt;opd_syn_changes + d-&amp;gt;opd_syn_rpc_in_progress == 0 &amp;amp;&amp;amp;
            d-&amp;gt;opd_pre_status == -ENOSPC)
                &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; 1;

        &lt;span class=&quot;code-comment&quot;&gt;/* Bail out I/O fails to OST */&lt;/span&gt;
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (d-&amp;gt;opd_pre_status == -EIO)
                &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; 1;

        &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; 0;
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="13256" name="LU-XXXX.tgz" size="474371" author="gshilamkar" created="Fri, 26 Jul 2013 08:26:15 +0000"/>
                            <attachment id="13462" name="client_lctl_dk_20130911.tgz" size="18927" author="kitwestneat" created="Fri, 13 Sep 2013 14:31:44 +0000"/>
                            <attachment id="13461" name="client_messages_20130911.tgz" size="233968" author="kitwestneat" created="Fri, 13 Sep 2013 14:31:44 +0000"/>
                            <attachment id="13676" name="ll10987.out.gz" size="221" author="kitwestneat" created="Wed, 23 Oct 2013 21:05:47 +0000"/>
                            <attachment id="13279" name="mds1.llog.gz" size="229754" author="kitwestneat" created="Tue, 30 Jul 2013 17:27:14 +0000"/>
                            <attachment id="13278" name="mds2.llog.gz" size="217" author="kitwestneat" created="Tue, 30 Jul 2013 17:27:14 +0000"/>
                            <attachment id="13460" name="server_lctl_dk_20130911.tgz" size="400951" author="kitwestneat" created="Fri, 13 Sep 2013 14:31:44 +0000"/>
                            <attachment id="13280" name="ucbn003.localdomain.llog.gz" size="247" author="kitwestneat" created="Tue, 30 Jul 2013 17:27:14 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvw9z:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9382</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>