<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:16:47 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-1455] Drop of the performance and &apos;scratch-OSTXXXX: Recovery&apos; </title>
                <link>https://jira.whamcloud.com/browse/LU-1455</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Dear support, during normal usage of the file system we noticed a major drop in I/O performance (e.g. from 5Gb/s to 0.9Gb/s) and, in the log, a bunch of:&lt;/p&gt;


&lt;p&gt;May 31 10:17:39 weisshorn06 kernel: Lustre: scratch-OST001d: Client scratch-MDT0000-mdtlov_UUID (at 148.187.7.101@o2ib2) reconnecting&lt;br/&gt;
May 31 10:17:39 weisshorn06 kernel: Lustre: 7445:0:(filter.c:2699:filter_connect_internal()) scratch-OST001d: Received MDS connection for group 0&lt;br/&gt;
May 31 10:17:39 weisshorn06 kernel: Lustre: scratch-OST001d: received MDS connection from 148.187.7.101@o2ib2&lt;br/&gt;
May 31 10:17:39 weisshorn06 kernel: Lustre: 7445:0:(llog_net.c:168:llog_receptor_accept()) changing the import ffff880f8b248800 - ffff880edff5b800&lt;br/&gt;
May 31 10:17:39 weisshorn06 kernel: Lustre: 7445:0:(llog_net.c:168:llog_receptor_accept()) Skipped 1 previous similar message&lt;br/&gt;
May 31 10:17:39 weisshorn06 kernel: Lustre: 7445:0:(filter.c:2555:filter_llog_connect()) scratch-OST001d: Recovery from log 0x140001f/0x0:b3e57b20&lt;/p&gt;


&lt;p&gt;May 31 10:21:32 weisshorn01 kernel: LustreError: 7733:0:(lov_request.c:560:lov_update_create_set()) error creating fid 0x5ef sub-object on OST idx 49/38: rc = -5&lt;br/&gt;
May 31 10:21:33 weisshorn01 kernel: LustreError: 9518:0:(lov_request.c:560:lov_update_create_set()) error creating fid 0x2560 sub-object on OST idx 12/38: rc = -5&lt;br/&gt;
May 31 10:21:33 weisshorn01 kernel: LustreError: 4004:0:(lov_request.c:560:lov_update_create_set()) error creating fid 0x225b sub-object on OST idx 12/38: rc = -5&lt;br/&gt;
May 31 10:21:33 weisshorn01 kernel: LustreError: 3986:0:(lov_request.c:560:lov_update_create_set()) error creating fid 0x1257 sub-object on OST idx 12/38: rc = -5&lt;br/&gt;
May 31 10:21:33 weisshorn01 kernel: LustreError: 7357:0:(lov_request.c:560:lov_update_create_set()) error creating fid 0x76b0 sub-object on OST idx 49/38: rc = -5&lt;br/&gt;
May 31 10:21:33 weisshorn01 kernel: LustreError: 9539:0:(lov_request.c:560:lov_update_create_set()) error creating fid 0x879 sub-object on OST idx 12/38: rc = -5&lt;br/&gt;
May 31 10:21:33 weisshorn01 kernel: LustreError: 7324:0:(lov_request.c:560:lov_update_create_set()) error creating fid 0xf3ad sub-object on OST idx 12/38: rc = -5&lt;br/&gt;
May 31 10:21:33 weisshorn01 kernel: LustreError: 3988:0:(lov_request.c:560:lov_update_create_set()) error creating fid 0x2d00 sub-object on OST idx 49/38: rc = -5&lt;br/&gt;
May 31 10:21:33 weisshorn01 kernel: LustreError: 7746:0:(lov_request.c:560:lov_update_create_set()) error creating fid 0x1220 sub-object on OST idx 49/38: rc = -5&lt;br/&gt;
May 31 10:21:33 weisshorn01 kernel: LustreError: 9640:0:(lov_request.c:560:lov_update_create_set()) error creating fid 0x230f sub-object on OST idx 12/38: rc = -5&lt;br/&gt;
May 31 10:21:33 weisshorn01 kernel: LustreError: 7328:0:(lov_request.c:560:lov_update_create_set()) error creating fid 0x5b4 sub-object on OST idx 12/38: rc = -5&lt;br/&gt;
May 31 10:21:33 weisshorn01 kernel: LustreError: 9476:0:(lov_request.c:560:lov_update_create_set()) error creating fid 0x2591 sub-object on OST idx 12/38: rc = -5&lt;br/&gt;
May 31 10:21:33 weisshorn01 kernel: LustreError: 4062:0:(lov_request.c:560:lov_update_create_set()) error creating fid 0x28f3 sub-object on OST idx 12/38: rc = -5&lt;br/&gt;
May 31 10:21:33 weisshorn01 kernel: LustreError: 9654:0:(lov_request.c:560:lov_update_create_set()) error creating fid 0x2251 sub-object on OST idx 12/38: rc = -5&lt;br/&gt;
May 31 10:21:33 weisshorn01 kernel: LustreError: 7737:0:(lov_request.c:560:lov_update_create_set()) error creating fid 0x251c sub-object on OST idx 12/38: rc = -5&lt;br/&gt;
May 31 10:21:33 weisshorn01 kernel: LustreError: 7347:0:(lov_request.c:560:lov_update_create_set()) error creating fid 0x2bd9 sub-object on OST idx 12/38: rc = -5&lt;br/&gt;
May 31 10:21:33 weisshorn01 kernel: LustreError: 7337:0:(lov_request.c:560:lov_update_create_set()) error creating fid 0x2560 sub-object on OST idx 49/38: rc = -5&lt;br/&gt;
May 31 10:21:33 weisshorn01 kernel: LustreError: 7725:0:(lov_request.c:560:lov_update_create_set()) error creating fid 0x23d8 sub-object on OST idx 12/38: rc = -5&lt;br/&gt;
May 31 10:21:33 weisshorn01 kernel: LustreError: 3908:0:(lov_request.c:560:lov_update_create_set()) error creating fid 0x8d2 sub-object on OST idx 12/38: rc = -5&lt;br/&gt;
May 31 10:21:33 weisshorn01 kernel: LustreError: 9484:0:(lov_request.c:560:lov_update_create_set()) error creating fid 0x57b sub-object on OST idx 12/38: rc = -5&lt;br/&gt;
May 31 10:21:33 weisshorn01 kernel: LustreError: 7403:0:(lov_request.c:560:lov_update_create_set()) error creating fid 0xc7e sub-object on OST idx 49/38: rc = -5&lt;br/&gt;
May 31 10:21:33 weisshorn01 kernel: LustreError: 9689:0:(lov_request.c:560:lov_update_create_set()) error creating fid 0x16c9b sub-object on OST idx 12/38: rc = -5&lt;/p&gt;

&lt;p&gt;May 31 10:21:49 weisshorn08 kernel: Lustre: scratch-OST0017: Client scratch-MDT0000-mdtlov_UUID (at 148.187.7.101@o2ib2) reconnecting&lt;br/&gt;
May 31 10:21:49 weisshorn08 kernel: Lustre: Skipped 1 previous similar message&lt;br/&gt;
May 31 10:21:49 weisshorn08 kernel: Lustre: 7332:0:(filter.c:2699:filter_connect_internal()) scratch-OST0017: Received MDS connection for group 0&lt;br/&gt;
May 31 10:21:49 weisshorn08 kernel: Lustre: 7332:0:(filter.c:2699:filter_connect_internal()) Skipped 1 previous similar message&lt;br/&gt;
May 31 10:21:49 weisshorn08 kernel: Lustre: scratch-OST0017: received MDS connection from 148.187.7.101@o2ib2&lt;br/&gt;
May 31 10:21:49 weisshorn08 kernel: Lustre: Skipped 1 previous similar message&lt;br/&gt;
May 31 10:21:49 weisshorn08 kernel: Lustre: 7332:0:(llog_net.c:168:llog_receptor_accept()) changing the import ffff880fc7844800 - ffff880fc68b5000&lt;br/&gt;
May 31 10:21:49 weisshorn08 kernel: Lustre: 7332:0:(llog_net.c:168:llog_receptor_accept()) Skipped 3 previous similar messages&lt;br/&gt;
May 31 10:21:49 weisshorn08 kernel: Lustre: 7332:0:(filter.c:2555:filter_llog_connect()) scratch-OST0017: Recovery from log 0x1400019/0x0:b3e57b1a&lt;br/&gt;
May 31 10:21:49 weisshorn08 kernel: Lustre: 7332:0:(filter.c:2555:filter_llog_connect()) Skipped 1 previous similar message&lt;br/&gt;
May 31 10:21:49 weisshorn04 kernel: Lustre: scratch-OST0025: Client scratch-MDT0000-mdtlov_UUID (at 148.187.7.101@o2ib2) reconnecting&lt;br/&gt;
May 31 10:21:49 weisshorn04 kernel: Lustre: 7284:0:(filter.c:2699:filter_connect_internal()) scratch-OST0025: Received MDS connection for group 0&lt;br/&gt;
May 31 10:21:49 weisshorn04 kernel: Lustre: scratch-OST0025: received MDS connection from 148.187.7.101@o2ib2&lt;br/&gt;
May 31 10:21:49 weisshorn04 kernel: Lustre: 7284:0:(llog_net.c:168:llog_receptor_accept()) changing the import ffff8802d1d4c800 - ffff880fbe6d3800&lt;br/&gt;
May 31 10:21:49 weisshorn04 kernel: Lustre: 7284:0:(llog_net.c:168:llog_receptor_accept()) Skipped 1 previous similar message&lt;br/&gt;
May 31 10:21:49 weisshorn04 kernel: Lustre: 7603:0:(filter.c:2555:filter_llog_connect()) scratch-OST0025: Recovery from log 0x1400027/0x0:b3e57b28&lt;/p&gt;


&lt;p&gt;We are running an IOR benchmark (writing an aggregate file of 512Gb); the first run was fine, but now we are monitoring with LTOP and the output of the job, and the performance has dropped. &lt;br/&gt;
Also a few users complain about similar behavior when running hdf5 jobs: the first runs were fine but the following ones were really slow.&lt;/p&gt;

&lt;p&gt;The load on the machine regarding CPU is really low.&lt;/p&gt;</description>
                <environment>MDS HW &lt;br/&gt;
---------------------------------------------------------------------------------------------------- &lt;br/&gt;
Linux XXXX.admin.cscs.ch 2.6.32-220.7.1.el6_lustre.g9c8f747.x86_64 &lt;br/&gt;
Architecture: x86_64 &lt;br/&gt;
CPU op-mode(s): 32-bit, 64-bit &lt;br/&gt;
Byte Order: Little Endian &lt;br/&gt;
CPU(s): 16 &lt;br/&gt;
Vendor ID: AuthenticAMD &lt;br/&gt;
CPU family: 16 &lt;br/&gt;
64Gb RAM &lt;br/&gt;
Interconnect IB 40Gb/s &lt;br/&gt;
&lt;br/&gt;
MDT LSI 5480 Pikes Peak &lt;br/&gt;
SSDs SLC &lt;br/&gt;
---------------------------------------------------------------------------------------------------- &lt;br/&gt;
&lt;br/&gt;
OSS HW &lt;br/&gt;
---------------------------------------------------------------------------------------------------- &lt;br/&gt;
Architecture: x86_64 &lt;br/&gt;
CPU op-mode(s): 32-bit, 64-bit &lt;br/&gt;
Byte Order: Little Endian &lt;br/&gt;
CPU(s): 32 &lt;br/&gt;
Vendor ID: GenuineIntel &lt;br/&gt;
CPU family: 6 &lt;br/&gt;
64Gb RAM &lt;br/&gt;
Interconnect IB 40Gb/s &lt;br/&gt;
&lt;br/&gt;
OST LSI 7900 &lt;br/&gt;
---------------------------------------------------------------------------------------------------- &lt;br/&gt;
&lt;br/&gt;
Router nodes&lt;br/&gt;
-------------------&lt;br/&gt;
12 router nodes - IB 40Gb/s&lt;br/&gt;
&lt;br/&gt;
Clients&lt;br/&gt;
---------&lt;br/&gt;
Cray XE6 - Lustre 1.8.6&lt;br/&gt;
&lt;br/&gt;
&lt;br/&gt;
&lt;br/&gt;
1 MDS + 1 fail over &lt;br/&gt;
12 OSS - 6 OST per OSS </environment>
        <key id="14654">LU-1455</key>
            <summary>Drop of the performance and &apos;scratch-OSTXXXX: Recovery&apos; </summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="5">Cannot Reproduce</resolution>
                                        <assignee username="green">Oleg Drokin</assignee>
                                    <reporter username="fverzell">Fabio Verzelloni</reporter>
                        <labels>
                    </labels>
                <created>Thu, 31 May 2012 04:26:59 +0000</created>
                <updated>Mon, 29 May 2017 03:53:38 +0000</updated>
                            <resolved>Mon, 29 May 2017 03:53:38 +0000</resolved>
                                    <version>Lustre 2.2.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>1</watches>
                                                                            <comments>
                            <comment id="39734" author="green" created="Thu, 31 May 2012 12:24:01 +0000"  >&lt;p&gt;Error -5 is IO error. Could also be returned when local osc enters recovery mode (i.e. disconnected from server).&lt;/p&gt;

&lt;p&gt;Now, in your logs I see that the MDS got disconnected from a lot of OSTs (possibly all of them?) because they did not respond to MDS requests.&lt;br/&gt;
There are not really many messages from the OSSes to indicate they are terribly unhappy with something, so I wonder if it&apos;s just the network that gets overloaded and starts to drop messages?&lt;/p&gt;</comment>
                            <comment id="39792" author="fverzell" created="Fri, 1 Jun 2012 03:31:51 +0000"  >&lt;p&gt;After some debugging we replaced our FDR switch with a QDR switch, and after that we didn&apos;t see the disconnecting and reconnecting of the OSTs anymore. We have been monitoring and running benchmarks since yesterday with no errors so far. &lt;br/&gt;
I&apos;ll keep you up to date with the situation.&lt;/p&gt;

&lt;p&gt;Fabio &lt;/p&gt;</comment>
                            <comment id="40346" author="pjones" created="Mon, 11 Jun 2012 09:53:01 +0000"  >&lt;p&gt;Fabio&lt;/p&gt;

&lt;p&gt;Anything new to report?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="40413" author="fverzell" created="Tue, 12 Jun 2012 04:37:25 +0000"  >&lt;p&gt;Hi Peter, &lt;br/&gt;
  so far it seems that this error has not appeared anymore; we have some drop in performance, but my colleague Nicola has already opened another ticket (1503) and I&apos;ve just added a few more details. &lt;/p&gt;

&lt;p&gt;Thanks&lt;br/&gt;
Fabio&lt;/p&gt;</comment>
                            <comment id="197375" author="adilger" created="Mon, 29 May 2017 03:53:38 +0000"  >&lt;p&gt;Close old ticket.&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="11488" name="cluster.log" size="3154546" author="fverzell" created="Thu, 31 May 2012 04:26:59 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzv32n:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>3997</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>