<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:40:00 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
<language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-4137] Upgrading Lustre Servers from 1.8.7 to 2.4.1 with data in place</title>
                <link>https://jira.whamcloud.com/browse/LU-4137</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;We are working on getting our Lustre servers upgraded from 1.8.7 to Lustre 2.4.1 with data in place.&lt;/p&gt;</description>
                <environment></environment>
        <key id="21610">LU-4137</key>
            <summary>Upgrading Lustre Servers from 1.8.7 to 2.4.1 with data in place</summary>
                <type id="3" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11318&amp;avatarType=issuetype">Task</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="6">Not a Bug</resolution>
                                        <assignee username="mdiep">Minh Diep</assignee>
                                    <reporter username="haisong">Haisong Cai</reporter>
                        <labels>
                            <label>sdsc</label>
                    </labels>
                <created>Wed, 23 Oct 2013 20:24:19 +0000</created>
                <updated>Thu, 13 Mar 2014 04:37:47 +0000</updated>
                            <resolved>Thu, 13 Mar 2014 04:37:47 +0000</resolved>
                                    <version>Lustre 2.4.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>9</watches>
                                                                            <comments>
                            <comment id="69680" author="pjones" created="Wed, 23 Oct 2013 21:07:29 +0000"  >&lt;p&gt;Minh&lt;/p&gt;

&lt;p&gt;Are you assisting with this activity?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="69683" author="mdiep" created="Wed, 23 Oct 2013 21:25:56 +0000"  >&lt;p&gt;Yes, I have been working on this&lt;/p&gt;</comment>
                            <comment id="79038" author="rpwagner" created="Tue, 11 Mar 2014 20:34:32 +0000"  >&lt;p&gt;On one of our file systems, we&apos;re seeing two problems. One is repeated messages about not being able to clean up orphans. Note that this is after a shutdown of all clients, a reset of the Lustre servers including a lustre_rmmod, and a mount of the MDT with the abort_recov option.&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Mar 11 13:13:22 monkey-mds-10-3 kernel: LustreError: 11286:0:(osp_precreate.c:737:osp_precreate_cleanup_orphans()) monkey-OST0025-osc: cannot cleanup orphans: rc = -11
Mar 11 13:13:22 monkey-mds-10-3 kernel: LustreError: 11286:0:(osp_precreate.c:737:osp_precreate_cleanup_orphans()) Skipped 3 previous similar messages
Mar 11 13:15:43 monkey-mds-10-3 kernel: LustreError: 11234:0:(osp_precreate.c:737:osp_precreate_cleanup_orphans()) monkey-OST000b-osc: cannot cleanup orphans: rc = -11
Mar 11 13:15:43 monkey-mds-10-3 kernel: LustreError: 11234:0:(osp_precreate.c:737:osp_precreate_cleanup_orphans()) Skipped 2 previous similar messages
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;


&lt;p&gt;The second issue is a hanging connection between a client and an OSS, that includes LNET pings failing.&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[ux455215@oasis-dm4 temp_project]$ cd turbulence/tars/
[ux455215@oasis-dm4 tars]$ ls
DD0000_1024ppm.tar  DD0074_1024ppm.tar	DD0098_1024ppm.tar   ts_1024ppmDD0110.tar  ts_1024ppmDD0134.tar  ts_1024ppmDD0154.tar  ts_1024ppmDD0178.tar  ts_1024ppmDD0202.tar  ts_1024ppmDD0226.tar
DD0060_1024ppm.tar  DD0078_1024ppm.tar	DD0102_1024ppm.tar   ts_1024ppmDD0114.tar  ts_1024ppmDD0138.tar  ts_1024ppmDD0158.tar  ts_1024ppmDD0182.tar  ts_1024ppmDD0206.tar
DD0064_1024ppm.tar  DD0082_1024ppm.tar	DD0106_1024ppm.tar   ts_1024ppmDD0118.tar  ts_1024ppmDD0142.tar  ts_1024ppmDD0162.tar  ts_1024ppmDD0186.tar  ts_1024ppmDD0210.tar
DD0068_1024ppm.tar  DD0086_1024ppm.tar	dsets-to-grab.txt    ts_1024ppmDD0122.tar  ts_1024ppmDD0146.tar  ts_1024ppmDD0166.tar  ts_1024ppmDD0190.tar  ts_1024ppmDD0214.tar
DD0070_1024ppm.tar  DD0090_1024ppm.tar	dsets-to-untar.txt   ts_1024ppmDD0126.tar  ts_1024ppmDD0148.tar  ts_1024ppmDD0170.tar  ts_1024ppmDD0194.tar  ts_1024ppmDD0218.tar
DD0072_1024ppm.tar  DD0094_1024ppm.tar	dsets-to-untar.txt~  ts_1024ppmDD0130.tar  ts_1024ppmDD0150.tar  ts_1024ppmDD0174.tar  ts_1024ppmDD0198.tar  ts_1024ppmDD0222.tar
[ux455215@oasis-dm4 tars]$ ls -l
...hangs...
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[root@oasis-dm4 ~]# lctl ping 172.25.33.103@tcp
failed to ping 172.25.33.103@tcp: Input/output error
[root@oasis-dm4 ~]# ping -c 4 172.25.33.103
PING 172.25.33.103 (172.25.33.103) 56(84) bytes of data.
64 bytes from 172.25.33.103: icmp_seq=1 ttl=63 time=0.109 ms
64 bytes from 172.25.33.103: icmp_seq=2 ttl=63 time=0.159 ms
64 bytes from 172.25.33.103: icmp_seq=3 ttl=63 time=0.155 ms
64 bytes from 172.25.33.103: icmp_seq=4 ttl=63 time=0.158 ms

[root@monkey-oss-16-3 ~]# lctl ping 192.168.230.54@tcp
failed to ping 192.168.230.54@tcp: Input/output error

[root@monkey-oss-16-3 ~]# ping -c 4 192.168.230.54    
PING 192.168.230.54 (192.168.230.54) 56(84) bytes of data.
64 bytes from 192.168.230.54: icmp_seq=1 ttl=63 time=0.153 ms
64 bytes from 192.168.230.54: icmp_seq=2 ttl=63 time=0.175 ms
64 bytes from 192.168.230.54: icmp_seq=3 ttl=63 time=0.175 ms
64 bytes from 192.168.230.54: icmp_seq=4 ttl=63 time=0.175 ms
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;dmesg from client:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Lustre: Mounted monkey-client
Lustre: 24401:0:(client.c:1868:ptlrpc_expire_one_request()) @@@ Request sent has timed out &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; slow reply: [sent 1394569229/real 1394569229]  req@ffff8806189e8c00 x1462311819674200/t0(0) o8-&amp;gt;monkey-OST0078-osc-ffff880c3428b800@172.25.33.103@tcp:28/4 lens 400/544 e 0 to 1 dl 1394569234 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1
Lustre: 24401:0:(client.c:1868:ptlrpc_expire_one_request()) @@@ Request sent has timed out &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; slow reply: [sent 1394569254/real 1394569254]  req@ffff8806354eb400 x1462311819674824/t0(0) o8-&amp;gt;monkey-OST0078-osc-ffff880c3428b800@172.25.33.103@tcp:28/4 lens 400/544 e 0 to 1 dl 1394569264 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1
Lustre: 24401:0:(client.c:1868:ptlrpc_expire_one_request()) Skipped 11 previous similar messages
Lustre: 24401:0:(client.c:1868:ptlrpc_expire_one_request()) @@@ Request sent has timed out &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; slow reply: [sent 1394569279/real 1394569279]  req@ffff88063566ac00 x1462311819675332/t0(0) o8-&amp;gt;monkey-OST0078-osc-ffff880c3428b800@172.25.33.103@tcp:28/4 lens 400/544 e 0 to 1 dl 1394569294 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1
Lustre: 24401:0:(client.c:1868:ptlrpc_expire_one_request()) Skipped 11 previous similar messages
[root@oasis-dm4 ~]# lctl dl | grep 0078
123 UP osc monkey-OST0078-osc-ffff880c3428b800 d81c416b-007d-7139-df5b-77b081b31fb4 5
[root@oasis-dm4 ~]# lctl dl -t | grep 0078
123 UP osc monkey-OST0078-osc-ffff880c3428b800 d81c416b-007d-7139-df5b-77b081b31fb4 5 172.25.33.103@tcp
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;dmesg on OSS:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;LDISKFS-fs (sdf): mounted filesystem with ordered data mode. quota=off. Opts: 
Lustre: 28444:0:(ofd_dev.c:221:ofd_process_config()) For interoperability, skip &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; ost.quota_type. It is obsolete.
Lustre: monkey-OST0078: Imperative Recovery enabled, recovery window shrunk from 300-900 down to 150-450
Lustre: monkey-OST0078: Will be in recovery &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; at least 2:30, or until 1 client reconnects
Lustre: monkey-OST0078: Recovery over after 0:01, of 1 clients 1 recovered and 0 were evicted.
Lustre: monkey-OST0078: deleting orphan objects from 0x0:12305142 to 0x0:12305281
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;/var/log/messages on OSS (172.25.32.253@tcp is the MDS):&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Mar 11 13:05:11 monkey-oss-16-3 kernel: LustreError: 137-5: monkey-OST0078_UUID: not available &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; connect from 172.25.32.253@tcp (no target)
Mar 11 13:05:11 monkey-oss-16-3 kernel: LustreError: Skipped 2 previous similar messages
Mar 11 13:07:25 monkey-oss-16-3 kernel: LustreError: 137-5: monkey-OST0018_UUID: not available &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; connect from 172.25.32.253@tcp (no target)
Mar 11 13:07:25 monkey-oss-16-3 kernel: LustreError: Skipped 4 previous similar messages
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="79041" author="rpwagner" created="Tue, 11 Mar 2014 21:02:44 +0000"  >&lt;p&gt;The LNET failure is being repeated between the MDS and an OSS.&lt;/p&gt;

&lt;p&gt;This was spotted by seeing a message about a slow reply:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[root@monkey-mds-10-3 ~]# dmesg 
Lustre: 11080:0:(client.c:1868:ptlrpc_expire_one_request()) @@@ Request sent has timed out &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; slow reply: [sent 1394570829/real 1394570829]  req@ffff880638534000 x1462310351923796/t0(0) o8-&amp;gt;monkey-OST0065-osc@172.25.32.111@tcp:28/4 lens 400/544 e 0 to 1 dl 1394570885 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1
Lustre: 11080:0:(client.c:1868:ptlrpc_expire_one_request()) Skipped 34 previous similar messages
Lustre: MGS: Client 3d8c91ee-addb-8c84-31d6-137714d50e7e (at 172.25.32.111@tcp) reconnecting
Lustre: Skipped 3 previous similar messages
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[root@monkey-mds-10-3 ~]# lctl ping 172.25.32.111@tcp
failed to ping 172.25.32.111@tcp: Input/output error
[root@monkey-mds-10-3 ~]# ping -c 4 172.25.32.111    
PING 172.25.32.111 (172.25.32.111) 56(84) bytes of data.
64 bytes from 172.25.32.111: icmp_seq=1 ttl=64 time=0.223 ms
64 bytes from 172.25.32.111: icmp_seq=2 ttl=64 time=0.124 ms
64 bytes from 172.25.32.111: icmp_seq=3 ttl=64 time=0.146 ms
64 bytes from 172.25.32.111: icmp_seq=4 ttl=64 time=0.151 ms

--- 172.25.32.111 ping statistics ---
4 packets transmitted, 4 received, 0% packet loss, time 2999ms
rtt min/avg/max/mdev = 0.124/0.161/0.223/0.037 ms
[root@monkey-mds-10-3 ~]# 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;It also shows up as packets queued on the network. All of these servers use the TCP LNET driver.&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[root@monkey-mds-10-3 ~]# !181
netstat -at | grep ^tcp | grep -v &lt;span class=&quot;code-quote&quot;&gt;&apos;^tcp        0      0 &apos;&lt;/span&gt;
tcp        0   1984 money-mds-10-3-10g.lus:1023 money-oss-2-1-10g.lustr:988 ESTABLISHED 
[root@monkey-mds-10-3 ~]# netstat -at | grep ^tcp | grep -v &lt;span class=&quot;code-quote&quot;&gt;&apos;^tcp        0      0 &apos;&lt;/span&gt;
tcp        0   2344 money-mds-10-3-10g.lus:1023 money-oss-2-1-10g.lustr:988 ESTABLISHED 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="79057" author="ashehata" created="Tue, 11 Mar 2014 22:32:55 +0000"  >&lt;p&gt;Do we know the network topography?  Are there any routers in between the client and the servers?  Can we get the LNET configuration for the client/router/servers?&lt;/p&gt;</comment>
                            <comment id="79062" author="rpwagner" created="Tue, 11 Mar 2014 23:18:03 +0000"  >&lt;p&gt;The clients are either direct attached via Ethernet, or behind LNET routers bridging between Ethernet and InfiniBand. The client host shown in the example is on Ethernet, and uses the following lnet.conf:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;options lnet networks=tcp(bond0)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The compute nodes on InfiniBand use:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;options lnet networks=&lt;span class=&quot;code-quote&quot;&gt;&quot;o2ib(ib1)&quot;&lt;/span&gt; routes=&lt;span class=&quot;code-quote&quot;&gt;&quot;tcp 1 10.7.104.54@o2ib; tcp 2 10.7.104.[54-101]@o2ib&quot;&lt;/span&gt; dead_router_check_interval=60 live_router_check_interval=60
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;The preferred router is listed first, and then the group of routers.&lt;/p&gt;

&lt;p&gt;The routers use:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;options lnet networks=&lt;span class=&quot;code-quote&quot;&gt;&quot;tcp(bond0),o2ib(ib1)&quot;&lt;/span&gt; forwarding=&lt;span class=&quot;code-quote&quot;&gt;&quot;enabled&quot;&lt;/span&gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The Lustre servers use:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;options lnet networks=&lt;span class=&quot;code-quote&quot;&gt;&quot;tcp(bond0)&quot;&lt;/span&gt; routes=&lt;span class=&quot;code-quote&quot;&gt;&quot;o2ib 192.168.230.[63-78]@tcp 192.168.230.[95-126]@tcp&quot;&lt;/span&gt; dead_router_check_interval=60 live_router_check_interval=60
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="79134" author="rpwagner" created="Wed, 12 Mar 2014 15:39:27 +0000"  >&lt;p&gt;Minh, can we escalate this? After our work last night it seems that this is highly reproducible. Also, I don&apos;t see it as an LNET routing issue, since the other Lustre file system we tested on these clients performed as expected.&lt;/p&gt;

&lt;p&gt;An update: we can reproduce the failure from a single host attached via Ethernet using the TCP LNET driver. This eliminates the routers as a direct cause.&lt;/p&gt;

&lt;p&gt;Here are Mahidhar&apos;s notes on what he observed.&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;The IOR runs are triggering a very consistent set of errors (I did a few runs and got the same set of things happen on multiple nodes). Btw, the node recovers automatically after about 10-15 minutes. During the time we can&apos;t kill the IOR process ... even as root. It cleans up when the node recovers.

#1 As soon as the run starts, &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; a node has problems (&lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; is pretty quick within a few seconds of the job start):

Mar 12 00:08:07 gcn-2-42 kernel: LustreError: 4870:0:(osc_cache.c:1555:osc_enter_cache()) monkey-OST006c-osc-ffff881022873400: grant { dirty: 2097152/33554432 dirty_pages: 512/8519680 dropped: 0 avail: 0, reserved: 0, flight: 2 } lru {in list: 0, left: 513, waiters: 0 }&lt;span class=&quot;code-keyword&quot;&gt;try&lt;/span&gt; to reserve 4096.
Mar 12 00:08:07 gcn-2-42 kernel: LustreError: 4870:0:(osc_cache.c:1160:osc_extent_tree_dump0()) Dump object ffff8807cfdb0de8 extents at osc_enter_cache:1556, mppr: 256.
Mar 12 00:08:07 gcn-2-42 kernel: LustreError: 4870:0:(osc_cache.c:1165:osc_extent_tree_dump0()) extent ffff8807cfde4870@{[0 -&amp;gt; 255/255], [2|0|+|rpc|wiY|ffff8807cfdb0de8], [1048576|256|+|-|ffff8807cfd72eb8|256|ffff88102f3df540]} in tree 1.
Mar 12 00:08:07 gcn-2-42 kernel: LustreError: 4870:0:(osc_cache.c:1165:osc_extent_tree_dump0()) extent ffff8807cfde4918@{[256 -&amp;gt; 511/511], [2|0|+|rpc|wiY|ffff8807cfdb0de8], [1048576|256|+|-|ffff8807cfd72eb8|256|ffff88102f3de080]} in tree 2.


#2 This then triggers timeouts after about 8-10 minutes:

Mar 12 00:16:30 gcn-2-42 kernel: Lustre: 3387:0:(client.c:1868:ptlrpc_expire_one_request()) @@@ Request sent has timed out &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; slow reply: [sent 1394607487/real 1394607487]  req@ffff88101f863400 x1462348057527252/t0(0) o4-&amp;gt;monkey-OST006c-osc-ffff881022873400@172.25.33.240@tcp:6/4 lens 488/448 e 5 to 1 dl 1394608590 ref 2 fl Rpc:X/0/ffffffff rc 0/-1
Mar 12 00:16:30 gcn-2-42 kernel: Lustre: monkey-OST006c-osc-ffff881022873400: Connection to monkey-OST006c (at 172.25.33.240@tcp) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Mar 12 00:16:30 gcn-2-42 kernel: LustreError: 11-0: monkey-OST006c-osc-ffff881022873400: Communicating with 172.25.33.240@tcp, operation ost_connect failed with -16.
Mar 12 00:16:55 gcn-2-42 kernel: Lustre: monkey-OST006c-osc-ffff881022873400: Connection restored to monkey-OST006c (at 172.25.33.240@tcp)
Mar 12 00:27:27 gcn-2-42 kernel: Lustre: 3392:0:(client.c:1868:ptlrpc_expire_one_request()) @@@ Request sent has timed out &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; slow reply: [sent 1394608615/real 1394608615]  req@ffff88101f863000 x1462348057562536/t0(0) o4-&amp;gt;monkey-OST006c-osc-ffff881022873400@172.25.33.240@tcp:6/4 lens 488/448 e 0 to 1 dl 1394609247 ref 2 fl Rpc:X/2/ffffffff rc -11/-1
...
...
goes on &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; some time
...
...

#3 Finally the node recovers and I see the following error:

Mar 12 00:39:02 gcn-2-42 kernel: LustreError: 4870:0:(vvp_io.c:1088:vvp_io_commit_write()) Write page 512 of inode ffff8807d03ecab8 failed -5

-Mahidhar
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="79211" author="adilger" created="Thu, 13 Mar 2014 04:37:47 +0000"  >&lt;p&gt;According to Minh, the root cause of this problem was caused by half of the TCP channel bonding using jumbo MTU=9000 frames and the other half using MTU=1500 frames.  That was causing random packet loss for large IOs.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="23612">LU-4761</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzw6p3:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>11227</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                </customfields>
    </item>
</channel>
</rss>