<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:09:36 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-7520] OSTs are not available to client</title>
                <link>https://jira.whamcloud.com/browse/LU-7520</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>
&lt;p&gt;After OS failure and reinstalled, 3 out of 4 OSTs are unavailable to clients.&lt;br/&gt;
Getting various errors - dmesg attached.&lt;/p&gt;</description>
                <environment>lustre-2.5.3.90-2.6.32_431.29.2.el6_lustre.gb8d9077.x86_64_gb8d9077.x86_64&lt;br/&gt;
kernel-2.6.32-431.29.2.el6_lustre.gb8d9077.x86_64</environment>
        <key id="33475">LU-7520</key>
            <summary>OSTs are not available to client</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="bobijam">Zhenyu Xu</assignee>
                                    <reporter username="haisong">Haisong Cai</reporter>
                        <labels>
                    </labels>
                <created>Sun, 6 Dec 2015 00:37:59 +0000</created>
                <updated>Tue, 8 Dec 2015 01:50:58 +0000</updated>
                            <resolved>Tue, 8 Dec 2015 01:50:58 +0000</resolved>
                                    <version>Lustre 2.5.3</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>5</watches>
                                                                            <comments>
                            <comment id="135336" author="haisong" created="Sun, 6 Dec 2015 00:57:04 +0000"  >&lt;p&gt;What I found strange is the lines below when I first try to mount OST. The OST was never configured for HA.&lt;br/&gt;
Why would client think so?&lt;/p&gt;

&lt;p&gt;LustreError: 137-5: monkey-OST0016_UUID: not available for connect from 10.7.101.42@o2ib (no target). If you are running an HA pair check that the target is mounted on the other server.&lt;br/&gt;
LustreError: 137-5: monkey-OST0016_UUID: not available for connect from 10.7.103.215@o2ib (no target). If you are running an HA pair check that the target is mounted on the other server.&lt;br/&gt;
LustreError: Skipped 19 previous similar messages&lt;br/&gt;
LustreError: 137-5: monkey-OST0016_UUID: not available for connect from 10.7.100.88@o2ib (no target). If you are running an HA pair check that the target is mounted on the other server.&lt;br/&gt;
LustreError: Skipped 8 previous similar messages&lt;br/&gt;
LustreError: 137-5: monkey-OST0036_UUID: not available for connect from 132.249.107.14@tcp (no target). If you are running an HA pair check that the target is mounted on the other server.&lt;br/&gt;
LustreError: 137-5: monkey-OST0016_UUID: not available for connect from 132.249.107.14@tcp (no target). If you are running an HA pair check that the target is mounted on the other server.&lt;/p&gt;

</comment>
                            <comment id="135337" author="haisong" created="Sun, 6 Dec 2015 01:57:46 +0000"  >&lt;p&gt;Client:&lt;/p&gt;

&lt;p&gt;LustreError: 11-0: monkey-OST0016-osc-ffff880339b5d400: Communicating with 172.25.32.234@tcp, operation ost_connect failed with -16.&lt;br/&gt;
LustreError: Skipped 68 previous similar messages&lt;br/&gt;
LustreError: 11-0: monkey-OST0036-osc-ffff880339b5d400: Communicating with 172.25.32.234@tcp, operation ost_connect failed with -16.&lt;br/&gt;
LustreError: Skipped 72 previous similar messages&lt;br/&gt;
LustreError: 11-0: monkey-OST0016-osc-ffff880339b5d400: Communicating with 172.25.32.234@tcp, operation ost_connect failed with -16.&lt;br/&gt;
LustreError: Skipped 73 previous similar messages&lt;br/&gt;
LustreError: 11-0: monkey-OST0076-osc-ffff880339b5d400: Communicating with 172.25.32.234@tcp, operation ost_connect failed with -16.&lt;br/&gt;
LustreError: Skipped 73 previous similar messages&lt;br/&gt;
LustreError: 11-0: monkey-OST0016-osc-ffff880339b5d400: Communicating with 172.25.32.234@tcp, operation ost_connect failed with -16.&lt;br/&gt;
LustreError: Skipped 72 previous similar messages&lt;br/&gt;
LustreError: 11-0: monkey-OST0036-osc-ffff880339b5d400: Communicating with 172.25.32.234@tcp, operation ost_connect failed with -16.&lt;br/&gt;
LustreError: Skipped 72 previous similar messages&lt;br/&gt;
LustreError: 11-0: monkey-OST0016-osc-ffff880339b5d400: Communicating with 172.25.32.234@tcp, operation ost_connect failed with -16.&lt;br/&gt;
LustreError: Skipped 73 previous similar messages&lt;br/&gt;
LustreError: 11-0: monkey-OST0076-osc-ffff880339b5d400: Communicating with 172.25.32.234@tcp, operation ost_connect failed with -16.&lt;br/&gt;
LustreError: Skipped 73 previous similar messages&lt;br/&gt;
LustreError: 11-0: monkey-OST0016-osc-ffff880339b5d400: Communicating with 172.25.32.234@tcp, operation ost_connect failed with -16.&lt;br/&gt;
LustreError: Skipped 72 previous similar messages&lt;/p&gt;</comment>
                            <comment id="135338" author="haisong" created="Sun, 6 Dec 2015 03:13:56 +0000"  >&lt;p&gt;Saw this in dmesg, how to clear it?&lt;/p&gt;

&lt;p&gt;...&lt;br/&gt;
LustreError: 2853:0:(obd_mount_server.c:1120:server_register_target()) monkey-OST0016: error registering with the MGS: rc = -5 (not fatal)&lt;/p&gt;

&lt;p&gt;...&lt;/p&gt;
</comment>
                            <comment id="135340" author="haisong" created="Sun, 6 Dec 2015 06:09:54 +0000"  >&lt;p&gt;Further testing shows that the above MGS registration error occurs when the first OST is mounted on the OSS:&lt;/p&gt;

&lt;p&gt;Dec  5 21:58:35 monkey-oss-16-1 kernel: LNet: HW CPU cores: 16, npartitions: 4&lt;br/&gt;
Dec  5 21:58:35 monkey-oss-16-1 kernel: alg: No test for crc32 (crc32-table)&lt;br/&gt;
Dec  5 21:58:35 monkey-oss-16-1 kernel: alg: No test for adler32 (adler32-zlib)  &lt;br/&gt;
Dec  5 21:58:36 monkey-mds-10-4 kernel: Lustre: 3644:0:(client.c:1940:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: &lt;span class=&quot;error&quot;&gt;&amp;#91;sent 1449381480/real 1449381480&amp;#93;&lt;/span&gt;  req@ffff88031983b000 x1504076313551084/t0(0) o8-&amp;gt;monkey-OST0076-osc@172.25.32.234@tcp:28/4 lens 400/544 e 0 to 1 dl 1449381516 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1&lt;br/&gt;
Dec  5 21:58:36 monkey-mds-10-4 kernel: Lustre: 3644:0:(client.c:1940:ptlrpc_expire_one_request()) Skipped 3 previous similar messages&lt;br/&gt;
Dec  5 21:58:43 monkey-oss-16-1 kernel: Lustre: Lustre: Build Version: jenkins-arch=x86_64,build_type=server,distro=el6,ib_stack=inkernel-31844-gb8d9077-PRISTINE-2.6.32-431.29.2.el6_lustre.gb8d9077.x86_64&lt;br/&gt;
Dec  5 21:58:43 monkey-oss-16-1 kernel: LNet: Added LNI 172.25.32.234@tcp &lt;span class=&quot;error&quot;&gt;&amp;#91;8/256/0/180&amp;#93;&lt;/span&gt;&lt;br/&gt;
Dec  5 21:58:43 monkey-oss-16-1 kernel: LNet: Accept secure, port 988&lt;br/&gt;
Dec  5 22:00:28 monkey-oss-1-1 kernel: Lustre: 4566:0:(client.c:1940:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: &lt;span class=&quot;error&quot;&gt;&amp;#91;sent 1449381582/real 1449381582&amp;#93;&lt;/span&gt;  req@ffff8806056a7000 x1504076377940368/t0(0) o8-&amp;gt;monkey-OST0036-osc-ffff880339b5d400@172.25.32.234@tcp:28/4 lens 400/544 e 0 to 1 dl 1449381628 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1&lt;br/&gt;
Dec  5 22:00:28 monkey-oss-1-1 kernel: Lustre: 4566:0:(client.c:1940:ptlrpc_expire_one_request()) Skipped 8 previous similar messages&lt;br/&gt;
Dec  5 22:00:36 monkey-oss-16-1 kernel: LDISKFS-fs (sde): mounted filesystem with ordered data mode. quota=off. Opts: &lt;br/&gt;
Dec  5 22:00:36 monkey-oss-16-1 kernel: LustreError: 137-5: monkey-OST0056_UUID: not available for connect from 10.7.100.219@o2ib (no target). If you are running an HA pair check that the target is mounted on the other server.&lt;br/&gt;
Dec  5 22:00:36 monkey-oss-16-1 kernel: LustreError: 137-5: monkey-OST0036_UUID: not available for connect from 10.7.103.221@o2ib (no target). If you are running an HA pair check that the target is mounted on the other server.&lt;br/&gt;
Dec  5 22:00:36 monkey-oss-16-1 kernel: LustreError: Skipped 104 previous similar messages&lt;br/&gt;
Dec  5 22:00:37 monkey-oss-16-1 kernel: LustreError: 137-5: monkey-OST0036_UUID: not available for connect from 132.249.107.85@tcp (no target). If you are running an HA pair check that the target is mounted on the other server.&lt;br/&gt;
Dec  5 22:00:37 monkey-oss-16-1 kernel: LustreError: Skipped 213 previous similar messages&lt;br/&gt;
Dec  5 22:00:39 monkey-oss-16-1 kernel: LustreError: 137-5: monkey-OST0016_UUID: not available for connect from 10.7.101.213@o2ib (no target). If you are running an HA pair check that the target is mounted on the other server.&lt;br/&gt;
Dec  5 22:00:39 monkey-oss-16-1 kernel: LustreError: Skipped 321 previous similar messages&lt;br/&gt;
Dec  5 22:00:41 monkey-oss-16-1 kernel: Lustre: 10081:0:(client.c:1940:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: &lt;span class=&quot;error&quot;&gt;&amp;#91;sent 1449381636/real 1449381636&amp;#93;&lt;/span&gt;  req@ffff8805f0cacc00 x1519786679861252/t0(0) o250-&amp;gt;MGC172.25.32.253@tcp@172.25.32.253@tcp:26/25 lens 400/544 e 0 to 1 dl 1449381641 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1&lt;br/&gt;
Dec  5 22:00:43 monkey-oss-16-1 kernel: LustreError: 137-5: monkey-OST0036_UUID: not available for connect from 10.7.102.72@o2ib (no target). If you are running an HA pair check that the target is mounted on the other server.&lt;br/&gt;
Dec  5 22:00:43 monkey-oss-16-1 kernel: LustreError: Skipped 695 previous similar messages&lt;br/&gt;
Dec  5 22:00:47 monkey-oss-16-1 kernel: LustreError: 10124:0:(client.c:1096:ptlrpc_import_delay_req()) @@@ send limit expired   req@ffff8805f0cac800 x1519786679861256/t0(0) o253-&amp;gt;MGC172.25.32.253@tcp@172.25.32.253@tcp:26/25 lens 4768/4768 e 0 to 0 dl 0 ref 2 fl Rpc:W/0/ffffffff rc 0/-1&lt;br/&gt;
Dec  5 22:00:47 monkey-oss-16-1 kernel: LustreError: 10124:0:(obd_mount_server.c:1120:server_register_target()) monkey-OST0056: error registering with the MGS: rc = -5 (not fatal)&lt;br/&gt;
Dec  5 22:00:52 monkey-oss-16-1 kernel: LustreError: 137-5: monkey-OST0036_UUID: not available for connect from 10.7.101.159@o2ib (no target). If you are running an HA pair check that the target is mounted on the other server.&lt;br/&gt;
Dec  5 22:00:52 monkey-oss-16-1 kernel: LustreError: Skipped 393 previous similar messages&lt;br/&gt;
Dec  5 22:00:53 monkey-oss-16-1 kernel: LustreError: 10124:0:(client.c:1096:ptlrpc_import_delay_req()) @@@ send limit expired   req@ffff8805f0cac800 x1519786679861260/t0(0) o101-&amp;gt;MGC172.25.32.253@tcp@172.25.32.253@tcp:26/25 lens 328/344 e 0 to 0 dl 0 ref 2 fl Rpc:W/0/ffffffff rc 0/-1&lt;br/&gt;
Dec  5 22:00:59 monkey-oss-16-1 kernel: LustreError: 10124:0:(client.c:1096:ptlrpc_import_delay_req()) @@@ send limit expired   req@ffff8805f0cac800 x1519786679861264/t0(0) o101-&amp;gt;MGC172.25.32.253@tcp@172.25.32.253@tcp:26/25 lens 328/344 e 0 to 0 dl 0 ref 2 fl Rpc:W/0/ffffffff rc 0/-1&lt;br/&gt;
Dec  5 22:00:59 monkey-oss-16-1 kernel: LustreError: 10124:0:(client.c:1096:ptlrpc_import_delay_req()) @@@ send limit expired   req@ffff8805f0cac800 x1519786679861264/t0(0) o101-&amp;gt;MGC172.25.32.253@tcp@172.25.32.253@tcp:&lt;br/&gt;
 server.&lt;br/&gt;
Dec  5 22:00:52 monkey-oss-16-1 kernel: LustreError: Skipped 393 previous similar messages&lt;br/&gt;
Dec  5 22:00:53 monkey-oss-16-1 kernel: LustreError: 10124:0:(client.c:1096:ptlrpc_import_delay_req()) @@@ send limit expired   req@ffff8805f0cac800 x1519786679861260/t0(0) o101-&amp;gt;MGC172.25.32.253@tcp@172.25.32.253@tcp:&lt;br/&gt;
26/25 lens 328/344 e 0 to 0 dl 0 ref 2 fl Rpc:W/0/ffffffff rc 0/-1&lt;br/&gt;
Dec  5 22:00:59 monkey-oss-16-1 kernel: LustreError: 10124:0:(client.c:1096:ptlrpc_import_delay_req()) @@@ send limit expired   req@ffff8805f0cac800 x1519786679861264/t0(0) o101-&amp;gt;MGC172.25.32.253@tcp@172.25.32.253@tcp:&lt;br/&gt;
26/25 lens 328/344 e 0 to 0 dl 0 ref 2 fl Rpc:W/0/ffffffff rc 0/-1&lt;br/&gt;
Dec  5 22:00:59 monkey-oss-16-1 kernel: Lustre: 10218:0:(ofd_dev.c:255:ofd_process_config()) For interoperability, skip this ost.quota_type. It is obsolete.&lt;br/&gt;
Dec  5 22:01:01 monkey-oss-16-1 kernel: Lustre: monkey-OST0056: Imperative Recovery enabled, recovery window shrunk from 300-900 down to 150-450&lt;br/&gt;
Dec  5 22:01:01 monkey-oss-16-1 kernel: Lustre: monkey-OST0056: Will be in recovery for at least 2:30, or until 1210 clients reconnect&lt;br/&gt;
Dec  5 22:01:06 monkey-oss-16-1 kernel: Lustre: 10081:0:(client.c:1940:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: &lt;span class=&quot;error&quot;&gt;&amp;#91;sent 1449381661/real 1449381661&amp;#93;&lt;/span&gt;  req@ffff8805eaed6c00 x1519786679861372/t0(0) o38-&amp;gt;monkey-MDT0000-lwp-OST0056@172.25.32.253@tcp:12/10 lens 400/544 e 0 to 1 dl 1449381666 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1&lt;br/&gt;
Dec  5 22:01:07 oasis-monkey sshd&lt;span class=&quot;error&quot;&gt;&amp;#91;27267&amp;#93;&lt;/span&gt;: Set /proc/self/oom_score_adj to 0&lt;br/&gt;
Dec  5 22:01:07 oasis-monkey sshd&lt;span class=&quot;error&quot;&gt;&amp;#91;27267&amp;#93;&lt;/span&gt;: Connection from 192.31.21.156 port 59248&lt;br/&gt;
Dec  5 22:01:07 oasis-monkey audispd: node=oasis-monkey.sdsc.edu type=CRYPTO_KEY_USER msg=audit(1449381667.986:112163): user pid=27268 uid=0 auid=4294967295 ses=4294967295 msg=&apos;op=destroy kind=server fp=e0:d7:8e:6c:f6:a4:83:fd:33:cd:ec:c3:fb:f3:1c:b3 direction=? spid=27268 suid=0  exe=&quot;/usr/sbin/sshd&quot; hostname=? addr=192.31.21.156 terminal=? res=success&apos;&lt;br/&gt;
Dec  5 22:01:07 oasis-monkey audispd: node=oasis-monkey.sdsc.edu type=CRYPTO_KEY_USER msg=audit(1449381667.986:112164): user pid=27268 uid=0 auid=4294967295 ses=4294967295 msg=&apos;op=destroy kind=server fp=67:7b:43:dc:e9:d8:b5:30:6e:b5:93:7d:97:ac:94:50 direction=? spid=27268 suid=0  exe=&quot;/usr/sbin/sshd&quot; hostname=? addr=192.31.21.156 terminal=? res=success&apos;&lt;br/&gt;
Dec  5 22:01:08 oasis-monkey sshd&lt;span class=&quot;error&quot;&gt;&amp;#91;27268&amp;#93;&lt;/span&gt;: Postponed keyboard-interactive for cai from 192.31.21.156 port 59248 ssh2&lt;br/&gt;
Dec  5 22:01:08 monkey-oss-16-1 kernel: LustreError: 137-5: monkey-OST0036_UUID: not available for connect from 10.7.101.144@o2ib (no target). If you are running an HA pair check that the target is mounted on the other server.&lt;br/&gt;
Dec  5 22:01:18 oasis-monkey audispd: node=oasis-monkey.sdsc.edu type=CRYPTO_KEY_USER msg=audit(1449381678.853:112165): user pid=27267 uid=0 auid=4294967295 ses=4294967295 msg=&apos;op=destroy kind=session fp=? direction=both spid=27268 suid=74 rport=59248 laddr=192.168.111.6 lport=22  exe=&quot;/usr/sbin/sshd&quot; hostname=? addr=192.31.21.156 terminal=? res=success&apos;&lt;br/&gt;
Dec  5 22:01:18 oasis-monkey sshd&lt;span class=&quot;error&quot;&gt;&amp;#91;27267&amp;#93;&lt;/span&gt;: pam_unix(sshd:session): session opened for user cai by (uid=0)&lt;br/&gt;
Dec  5 22:01:18 oasis-monkey sshd&lt;span class=&quot;error&quot;&gt;&amp;#91;27267&amp;#93;&lt;/span&gt;: User child is on pid 27278  &lt;br/&gt;
Dec  5 22:01:18 oasis-monkey audispd: node=oasis-monkey.sdsc.edu type=CRYPTO_KEY_USER msg=audit(1449381678.866:112166): user pid=27267 uid=0 auid=4294967295 ses=4294967295 msg=&apos;op=destroy kind=session fp=? direction=both spid=27267 suid=0 rport=59248 laddr=192.168.111.6 lport=22  exe=&quot;/usr/sbin/sshd&quot; hostname=? addr=192.31.21.156 terminal=? res=success&apos;  &lt;br/&gt;
Dec  5 22:01:18 oasis-monkey audispd: node=oasis-monkey.sdsc.edu type=CRYPTO_KEY_USER msg=audit(1449381678.867:112167): user pid=27278 uid=0 auid=4294967295 ses=4294967295 msg=&apos;op=destroy kind=server fp=e0:d7:8e:6c:f6:a4:83:fd:33:cd:ec:c3:fb:f3:1c:b3 direction=? spid=27278 suid=0  exe=&quot;/usr/sbin/sshd&quot; hostname=? addr=192.31.21.156 terminal=? res=success&apos;  &lt;br/&gt;
Dec  5 22:01:18 oasis-monkey audispd: node=oasis-monkey.sdsc.edu type=CRYPTO_KEY_USER msg=audit(1449381678.868:112168): user pid=27278 uid=0 auid=4294967295 ses=4294967295 msg=&apos;op=destroy kind=server fp=67:7b:43:dc:e9:d8:b5:30:6e:b5:93:7d:97:ac:94:50 direction=? spid=27278 suid=0  exe=&quot;/usr/sbin/sshd&quot; hostname=? addr=192.31.21.156 terminal=? res=success&apos;  &lt;br/&gt;
Dec  5 22:01:21 monkey-mds-10-4 kernel: Lustre: 3644:0:(client.c:1940:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: &lt;span class=&quot;error&quot;&gt;&amp;#91;sent 1449381630/real 1449381630&amp;#93;&lt;/span&gt;  req@ffff88031b0af400 x1504076313567188/t0(0) o8-&amp;gt;monkey-OST0076-osc@172.25.32.234@tcp:28/4 lens 400/544 e 0 to 1 dl 1449381681 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1&lt;br/&gt;
Dec  5 22:01:21 monkey-mds-10-4 kernel: Lustre: 3644:0:(client.c:1940:ptlrpc_expire_one_request()) Skipped 9 previous similar messages&lt;/p&gt;</comment>
                            <comment id="135342" author="pjones" created="Sun, 6 Dec 2015 14:51:54 +0000"  >&lt;p&gt;Bobijam&lt;/p&gt;

&lt;p&gt;Could you please assist with this issue?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="135349" author="green" created="Sun, 6 Dec 2015 19:15:38 +0000"  >&lt;p&gt;when you get errors about mgs registration, are there any problems reported on the MGS itself?&lt;/p&gt;

&lt;p&gt;The failover pair error can arise even if you do not have failover configured - basically it means that this server got a request for a service that is not started there. It could be due to a failure to register with the MGS (and therefore a server mount failure), the OST might not have been started yet (also visible in your logs with e.g. OST0036), and similar.&lt;/p&gt;

&lt;p&gt;The error -16 means that a client tried to reconnect to a server, but the server is already handling a request from this client. It&apos;s not clear in the logs what might be the reason for this one, possibly just too long recovery.&lt;/p&gt;

&lt;p&gt;I see you aborted recovery later on, so after that do you only see problems related to OSTs that failed to start?&lt;/p&gt;</comment>
                            <comment id="135353" author="haisong" created="Mon, 7 Dec 2015 00:08:42 +0000"  >&lt;p&gt;Oleg,&lt;/p&gt;

&lt;p&gt;1)  Here are the errors on MDS/MGS:&lt;/p&gt;

&lt;p&gt;Dec  5 21:09:59 monkey-mds-10-4 kernel: LustreError: 11-0: monkey-OST0016-osc: Communicating with 172.25.32.234@tcp, operation ost_connect failed with -16.&lt;br/&gt;
Dec  5 21:09:59 monkey-mds-10-4 kernel: LustreError: Skipped 74 previous similar messages&lt;br/&gt;
Dec  5 21:20:24 monkey-mds-10-4 kernel: LustreError: 11-0: monkey-OST0016-osc: Communicating with 172.25.32.234@tcp, operation ost_connect failed with -16.&lt;br/&gt;
Dec  5 21:20:24 monkey-mds-10-4 kernel: LustreError: Skipped 74 previous similar messages&lt;br/&gt;
Dec  5 21:30:49 monkey-mds-10-4 kernel: LustreError: 11-0: monkey-OST0016-osc: Communicating with 172.25.32.234@tcp, operation ost_connect failed with -16.&lt;br/&gt;
Dec  5 21:30:49 monkey-mds-10-4 kernel: LustreError: Skipped 74 previous similar messages&lt;br/&gt;
Dec  5 21:41:14 monkey-mds-10-4 kernel: LustreError: 11-0: monkey-OST0016-osc: Communicating with 172.25.32.234@tcp, operation ost_connect failed with -19.&lt;br/&gt;
Dec  5 21:41:14 monkey-mds-10-4 kernel: LustreError: Skipped 74 previous similar messages&lt;br/&gt;
Dec  5 21:51:45 monkey-mds-10-4 kernel: LustreError: 11-0: monkey-OST0016-osc: Communicating with 172.25.32.234@tcp, operation ost_connect failed with -19.&lt;br/&gt;
Dec  5 21:51:45 monkey-mds-10-4 kernel: LustreError: Skipped 62 previous similar messages&lt;br/&gt;
Dec  5 22:02:10 monkey-mds-10-4 kernel: LustreError: 11-0: monkey-OST0076-osc: Communicating with 172.25.32.234@tcp, operation ost_connect failed with -19.&lt;br/&gt;
Dec  5 22:02:10 monkey-mds-10-4 kernel: LustreError: Skipped 49 previous similar messages&lt;br/&gt;
Dec  5 22:12:35 monkey-mds-10-4 kernel: LustreError: 11-0: monkey-OST0016-osc: Communicating with 172.25.32.234@tcp, operation ost_connect failed with -19.&lt;br/&gt;
Dec  5 22:12:35 monkey-mds-10-4 kernel: LustreError: Skipped 74 previous similar messages&lt;br/&gt;
Dec  5 22:22:49 monkey-mds-10-4 kernel: LustreError: 11-0: monkey-OST0016-osc: Communicating with 172.25.32.234@tcp, operation ost_connect failed with -16.&lt;br/&gt;
Dec  5 22:22:49 monkey-mds-10-4 kernel: LustreError: Skipped 63 previous similar messages&lt;br/&gt;
Dec  5 22:33:14 monkey-mds-10-4 kernel: LustreError: 11-0: monkey-OST0016-osc: Communicating with 172.25.32.234@tcp, operation ost_connect failed with -16.&lt;br/&gt;
Dec  5 22:33:14 monkey-mds-10-4 kernel: LustreError: Skipped 74 previous similar messages&lt;br/&gt;
Dec  5 22:43:39 monkey-mds-10-4 kernel: LustreError: 11-0: monkey-OST0016-osc: Communicating with 172.25.32.234@tcp, operation ost_connect failed with -16.&lt;br/&gt;
Dec  5 22:43:39 monkey-mds-10-4 kernel: LustreError: Skipped 74 previous similar messages&lt;br/&gt;
Dec  5 22:54:04 monkey-mds-10-4 kernel: LustreError: 11-0: monkey-OST0016-osc: Communicating with 172.25.32.234@tcp, operation ost_connect failed with -16.&lt;br/&gt;
Dec  5 22:54:04 monkey-mds-10-4 kernel: LustreError: Skipped 74 previous similar messages&lt;br/&gt;
Dec  5 23:04:29 monkey-mds-10-4 kernel: LustreError: 11-0: monkey-OST0016-osc: Communicating with 172.25.32.234@tcp, operation ost_connect failed with -16.&lt;br/&gt;
Dec  5 23:04:29 monkey-mds-10-4 kernel: LustreError: Skipped 74 previous similar messages&lt;br/&gt;
Dec  5 23:14:54 monkey-mds-10-4 kernel: LustreError: 11-0: monkey-OST0076-osc: Communicating with 172.25.32.234@tcp, operation ost_connect failed with -16.&lt;br/&gt;
Dec  5 23:14:54 monkey-mds-10-4 kernel: LustreError: Skipped 74 previous similar messages&lt;br/&gt;
Dec  5 23:25:19 monkey-mds-10-4 kernel: LustreError: 11-0: monkey-OST0016-osc: Communicating with 172.25.32.234@tcp, operation ost_connect failed with -16.&lt;br/&gt;
Dec  5 23:25:19 monkey-mds-10-4 kernel: LustreError: Skipped 74 previous similar messages&lt;br/&gt;
Dec  5 23:35:44 monkey-mds-10-4 kernel: LustreError: 11-0: monkey-OST0016-osc: Communicating with 172.25.32.234@tcp, operation ost_connect failed with -16.&lt;br/&gt;
Dec  5 23:35:44 monkey-mds-10-4 kernel: LustreError: Skipped 74 previous similar messages&lt;br/&gt;
Dec  5 23:46:09 monkey-mds-10-4 kernel: LustreError: 11-0: monkey-OST0076-osc: Communicating with 172.25.32.234@tcp, operation ost_connect failed with -16.&lt;br/&gt;
Dec  5 23:46:09 monkey-mds-10-4 kernel: LustreError: Skipped 74 previous similar messages&lt;br/&gt;
Dec  5 23:56:09 monkey-mds-10-4 kernel: LustreError: 11-0: monkey-OST0076-osc: Communicating with 172.25.32.234@tcp, operation ost_connect failed with -16.&lt;br/&gt;
Dec  5 23:56:09 monkey-mds-10-4 kernel: LustreError: Skipped 71 previous similar messages&lt;br/&gt;
Dec  6 00:06:34 monkey-mds-10-4 kernel: LustreError: 11-0: monkey-OST0036-osc: Communicating with 172.25.32.234@tcp, operation ost_connect failed with -16.&lt;br/&gt;
Dec  6 00:06:34 monkey-mds-10-4 kernel: LustreError: Skipped 74 previous similar messages&lt;br/&gt;
Dec  6 00:16:59 monkey-mds-10-4 kernel: LustreError: 11-0: monkey-OST0016-osc: Communicating with 172.25.32.234@tcp, operation ost_connect failed with -16.&lt;br/&gt;
Dec  6 00:16:59 monkey-mds-10-4 kernel: LustreError: Skipped 74 previous similar messages&lt;br/&gt;
Dec  6 00:27:24 monkey-mds-10-4 kernel: LustreError: 11-0: monkey-OST0016-osc: Communicating with 172.25.32.234@tcp, operation ost_connect failed with -16.&lt;br/&gt;
Dec  6 00:27:24 monkey-mds-10-4 kernel: LustreError: Skipped 74 previous similar messages&lt;br/&gt;
Dec  6 00:37:49 monkey-mds-10-4 kernel: LustreError: 11-0: monkey-OST0076-osc: Communicating with 172.25.32.234@tcp, operation ost_connect failed with -16.&lt;br/&gt;
Dec  6 00:37:49 monkey-mds-10-4 kernel: LustreError: Skipped 74 previous similar messages&lt;/p&gt;


&lt;p&gt;2) The OSTs always start on the server side. It&apos;s the clients that can&apos;t connect to them.&lt;/p&gt;

</comment>
                            <comment id="135354" author="haisong" created="Mon, 7 Dec 2015 00:28:59 +0000"  >
&lt;p&gt;How long does a typical recovery take? The filesystem has about 1200 clients. We have had the OSTs mounted on the server side since 22:00 last night. For about 18 hours now, clients still can&apos;t access the OSTs.&lt;/p&gt;

&lt;p&gt;I also want to point out that OSS hosts 4 OSTs. 3 are inaccessible from client side (via &quot;df&quot;, &quot;lfs df&quot;) and 1 is accessible.&lt;/p&gt;
</comment>
                            <comment id="135355" author="bobijam" created="Mon, 7 Dec 2015 01:50:50 +0000"  >&lt;p&gt;Can you check the MGS logs (the debug log could be more useful) to find out what caused the -5 error when OST16/36/76 tried to register on the MGS? It seems that OST16/36/76 did not successfully register on the MGS as available devices.&lt;/p&gt;</comment>
                            <comment id="135356" author="haisong" created="Mon, 7 Dec 2015 02:01:58 +0000"  >&lt;p&gt;Zhenyu,&lt;/p&gt;

&lt;p&gt;Do you mean to run &quot;debug_kernel &amp;gt; /tmp/log&quot; on MDS/MGS server?&lt;/p&gt;

&lt;p&gt;Haisong&lt;/p&gt;</comment>
                            <comment id="135357" author="bobijam" created="Mon, 7 Dec 2015 02:04:50 +0000"  >&lt;p&gt;The debug message buffer could have been recycled, you can mount one OST again (like OST16), and collect the debug log from the MGS (lctl dk, or similar)&lt;/p&gt;</comment>
                            <comment id="135358" author="haisong" created="Mon, 7 Dec 2015 02:19:48 +0000"  >&lt;p&gt;Zhenyu,&lt;/p&gt;

&lt;p&gt;I uploaded 2 files both from MGS server&lt;/p&gt;

&lt;p&gt;debug_kernel.20288.gz was taken before I unmounted OST0016&lt;br/&gt;
debug_kernel.123 was taken after I unmounted and then remounted OST0016&lt;/p&gt;</comment>
                            <comment id="135359" author="bobijam" created="Mon, 7 Dec 2015 02:31:36 +0000"  >&lt;p&gt;The OST16/36/76 cannot successfully recover, you can mount them with &quot;-o abort_recov&quot; to abort the recovery process.&lt;/p&gt;</comment>
                            <comment id="135360" author="haisong" created="Mon, 7 Dec 2015 03:01:00 +0000"  >
&lt;p&gt;Zhenyu,&lt;/p&gt;

&lt;p&gt;The abort_recov has made the clients able to access the OSTs now.&lt;br/&gt;
We are running sanity checks.&lt;br/&gt;
I will update this ticket tomorrow morning on the results.&lt;/p&gt;


&lt;p&gt;Thanks very much for your and Oleg&apos;s help,&lt;br/&gt;
Haisong&lt;/p&gt;</comment>
                            <comment id="135420" author="haisong" created="Mon, 7 Dec 2015 20:01:59 +0000"  >&lt;p&gt;Zhenyu,&lt;/p&gt;

&lt;p&gt;Filesystem is operating normally. You may close this ticket.&lt;/p&gt;

&lt;p&gt;Thanks again for all the help,&lt;br/&gt;
Haisong&lt;/p&gt;</comment>
                            <comment id="135454" author="jfc" created="Tue, 8 Dec 2015 01:50:58 +0000"  >&lt;p&gt;Thank you Haisong.&lt;br/&gt;
~ jfc.&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="19824" name="debug_kernel.123" size="2703467" author="haisong" created="Mon, 7 Dec 2015 02:19:48 +0000"/>
                            <attachment id="19823" name="debug_kernel.20288.gz" size="236" author="haisong" created="Mon, 7 Dec 2015 02:19:48 +0000"/>
                            <attachment id="19822" name="dmesg.4251" size="119189" author="haisong" created="Sun, 6 Dec 2015 00:55:01 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzxv1j:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10023"><![CDATA[4]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>