<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:16:43 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-1447] MDS Load average</title>
                <link>https://jira.whamcloud.com/browse/LU-1447</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Dear support, &lt;br/&gt;
   we are running Lustre 2.2 on our system and we noticed that also with a low usage of the file system, the MDS is always with a load average around 10 and comparing with Lustre 1.8.4 with a high I/O the MDS is basically always around load average of 1.80. Is this the normal behavior of Lustre 2.2?&lt;/p&gt;
</description>
                <environment>MDS HW &lt;br/&gt;
----------------------------------------------------------------------------------------------------&lt;br/&gt;
Linux XXXX.admin.cscs.ch 2.6.32-220.7.1.el6_lustre.g9c8f747.x86_64 &lt;br/&gt;
Architecture:          x86_64&lt;br/&gt;
CPU op-mode(s):        32-bit, 64-bit&lt;br/&gt;
Byte Order:            Little Endian&lt;br/&gt;
CPU(s):                16&lt;br/&gt;
Vendor ID:             AuthenticAMD&lt;br/&gt;
CPU family:            16&lt;br/&gt;
64Gb RAM&lt;br/&gt;
Interconnect IB 40Gb/s&lt;br/&gt;
&lt;br/&gt;
MDT LSI 5480 Pikes Peak &lt;br/&gt;
SSDs SLC &lt;br/&gt;
----------------------------------------------------------------------------------------------------&lt;br/&gt;
&lt;br/&gt;
OSS HW&lt;br/&gt;
----------------------------------------------------------------------------------------------------&lt;br/&gt;
Architecture:          x86_64&lt;br/&gt;
CPU op-mode(s):        32-bit, 64-bit&lt;br/&gt;
Byte Order:            Little Endian&lt;br/&gt;
CPU(s):                32&lt;br/&gt;
Vendor ID:             GenuineIntel&lt;br/&gt;
CPU family:            6&lt;br/&gt;
64Gb RAM&lt;br/&gt;
Interconnect IB 40Gb/s&lt;br/&gt;
&lt;br/&gt;
OST LSI 7900&lt;br/&gt;
----------------------------------------------------------------------------------------------------&lt;br/&gt;
&lt;br/&gt;
1 MDS + 1 fail over&lt;br/&gt;
12 OSS - 6 OST per OSS&lt;br/&gt;
</environment>
        <key id="14618">LU-1447</key>
            <summary>MDS Load average</summary>
                <type id="3" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11318&amp;avatarType=issuetype">Task</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="5">Cannot Reproduce</resolution>
                                        <assignee username="green">Oleg Drokin</assignee>
                                    <reporter username="fverzell">Fabio Verzelloni</reporter>
                        <labels>
                    </labels>
                <created>Wed, 30 May 2012 04:48:32 +0000</created>
                <updated>Mon, 29 May 2017 03:52:48 +0000</updated>
                            <resolved>Mon, 29 May 2017 03:52:48 +0000</resolved>
                                    <version>Lustre 2.2.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>2</watches>
                                                                            <comments>
                            <comment id="39640" author="green" created="Wed, 30 May 2012 15:39:15 +0000"  >&lt;p&gt;I see that there is not an issue with high CPU usage, rather some of the threads are sleeping in D state.&lt;br/&gt;
The top snapshot is not long enough to show them.&lt;br/&gt;
Can you please do ps ax and filter out only threads in the D state and then post the output here?&lt;br/&gt;
(e.g. ps ax | grep D)&lt;/p&gt;</comment>
                            <comment id="39705" author="fverzell" created="Thu, 31 May 2012 04:29:54 +0000"  >&lt;p&gt;Yesterday evening we had a hang of the file system ( ticket &lt;a href=&quot;http://jira.whamcloud.com/browse/LU-1451&quot; class=&quot;external-link&quot; rel=&quot;nofollow&quot;&gt;http://jira.whamcloud.com/browse/LU-1451&lt;/a&gt; ) and now the load average is back to &apos;normal&apos;  (load average: 0.26, 0.24, 0.30 ) also during heavy I/O but we are having drop down of performance, ( &lt;a href=&quot;http://jira.whamcloud.com/browse/LU-1455&quot; class=&quot;external-link&quot; rel=&quot;nofollow&quot;&gt;http://jira.whamcloud.com/browse/LU-1455&lt;/a&gt; ).&lt;/p&gt;

</comment>
                            <comment id="39719" author="fverzell" created="Thu, 31 May 2012 10:22:57 +0000"  >&lt;p&gt;This is the whole-cluster ps -ax | grep D output after a while of working.&lt;/p&gt;</comment>
                            <comment id="39758" author="green" created="Thu, 31 May 2012 15:49:08 +0000"  >&lt;p&gt;Based on these it seems that weisshorn07, 09, 13 ... are having serious overload issues (likely induced by the disk subsystems there). Any chance you can survey your disk subsystem to see what&apos;s going on? I suspect it&apos;s not really happy to have a lot of parallel IO ongoing. Also since some other OSSes are less busy, it appears the IO is not distributed all that evenly.&lt;/p&gt;

&lt;p&gt;In a lot of cases with &quot;weak&quot; (parallel-io wise) disk subsystems limiting number of ost io threads possible should help the situation I think, by reducing the overload.&lt;br/&gt;
Ongoing work on NRS should help the issue even more by allowing to limit number of in-progress RPCs per target.&lt;/p&gt;

&lt;p&gt;The MDS does not have any processes in D state, and I assume at the time this snapshot was taken MDS Load Average was pretty small?&lt;/p&gt;</comment>
                            <comment id="39793" author="fverzell" created="Fri, 1 Jun 2012 03:56:08 +0000"  >&lt;p&gt;The disk HW is:&lt;br/&gt;
LSI 7900 &lt;br/&gt;
6 controllers&lt;br/&gt;
8 enclosure x controller&lt;br/&gt;
RAID 6 - SATA 7.2RPM&lt;/p&gt;

&lt;p&gt;Our &apos;max_rpcs_in_flight&apos; on the MDS is:&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@weisshorn01 lustre&amp;#93;&lt;/span&gt;# cat ./osc/scratch-OST0047-osc-MDT0000/max_rpcs_in_flight&lt;br/&gt;
8&lt;/p&gt;

&lt;p&gt;and the threads_&lt;span class=&quot;error&quot;&gt;&amp;#91;max,min,started&amp;#93;&lt;/span&gt; are:&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@weisshorn01 lustre&amp;#93;&lt;/span&gt;# cat ./mgs/MGS/mgs/threads_max&lt;br/&gt;
32&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@weisshorn01 lustre&amp;#93;&lt;/span&gt;# cat ./mgs/MGS/mgs/threads_started&lt;br/&gt;
32&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@weisshorn01 lustre&amp;#93;&lt;/span&gt;# cat ./mgs/MGS/mgs/threads_min&lt;br/&gt;
3&lt;/p&gt;

&lt;p&gt;&amp;#8211;&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@weisshorn01 scratch-MDT0000&amp;#93;&lt;/span&gt;# cat ./mdt_mds/threads_max&lt;br/&gt;
512&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@weisshorn01 scratch-MDT0000&amp;#93;&lt;/span&gt;# pwd&lt;br/&gt;
/proc/fs/lustre/mdt/scratch-MDT0000&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@weisshorn01 scratch-MDT0000&amp;#93;&lt;/span&gt;# cat ./mdt_mds/threads_min &lt;br/&gt;
2&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@weisshorn01 scratch-MDT0000&amp;#93;&lt;/span&gt;# cat ./mdt_mds/threads_started &lt;br/&gt;
2&lt;/p&gt;



&lt;p&gt;on the OSS:&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@weisshorn03 lustre&amp;#93;&lt;/span&gt;# cat ./ost/OSS/ost/threads_max&lt;br/&gt;
512&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@weisshorn03 lustre&amp;#93;&lt;/span&gt;# cat ./ost/OSS/ost/threads_min &lt;br/&gt;
128&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@weisshorn03 lustre&amp;#93;&lt;/span&gt;# cat ./ost/OSS/ost/threads_started &lt;br/&gt;
512&lt;/p&gt;

&lt;p&gt;on the client/MDS side the max_rpcs_in_flight is:&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@weisshorn01 lustre&amp;#93;&lt;/span&gt;# cat ./osc/scratch-OST0005-osc-MDT0000/max_rpcs_in_flight&lt;br/&gt;
8&lt;/p&gt;

&lt;p&gt;So far we didn&apos;t see anymore the high load average on the MDS instead the load average on the OSS when we run benchmark with block size of 4096k OSS load goes to 200-300 and with &apos;top&apos; we see  &quot;x.x%wa&quot; in some cases increasing.&lt;/p&gt;

&lt;p&gt;Do you have any suggestion about the right tuning based on our hardware/configuration? Also on the client side? ( Cray XE6 1500 nodes~ )&lt;br/&gt;
Thanks&lt;/p&gt;

&lt;p&gt;Fabio&lt;/p&gt;
</comment>
                            <comment id="39798" author="green" created="Fri, 1 Jun 2012 05:39:45 +0000"  >&lt;p&gt;Well, it&apos;s somewhat expected that as you increase the write activity, load average on OSTs goes up.&lt;/p&gt;

&lt;p&gt;Essentially what&apos;s going on is every write RPC (in 1M chunks) from every client is going to consume 1 OSS io thread.&lt;br/&gt;
If the thread blocks doing IO, you get +1 to LA.&lt;br/&gt;
The more threads in this state, the higher is the LA. Additionally a lot of the disk controllers hate a lot of parallel IO (because it&apos;s in effect highly random IO from their perspective).&lt;/p&gt;

&lt;p&gt;So solutions for you are multiple:&lt;br/&gt;
1. If it&apos;s just the high LA that bothers you and the rest of the system performs well and the speed is acceptable - just ignore the LA.&lt;br/&gt;
2. If you think your system could perform better under load (I am not really familiar with that particular LSI controller, you might want to speak with LSI guys to see what&apos;s the optimal IO pattern for it) - you might try decreasing OSS max thread number )to 256, 128 and so on) and see what impact would it have.&lt;br/&gt;
If you gather some more stats from your disk controllers to see the IO pattern as seen by it we might see if something looks out of place too (e.g. if you have a lot of small IO, that would not be good). What is the load-generating application doing?&lt;/p&gt;

&lt;p&gt;There is mostly no client-specific tuning on the client side that you can do that would relieve the situation without additionally ruining e.g. single client performance, so I suggest you to concentrate on servers here. (the possible exception is some read-ahead settings when you expect to have a lot of small read traffic).&lt;/p&gt;</comment>
                            <comment id="197373" author="adilger" created="Mon, 29 May 2017 03:52:48 +0000"  >&lt;p&gt;Close old ticket.&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="11489" name="ps_D" size="68634" author="fverzell" created="Thu, 31 May 2012 10:22:57 +0000"/>
                            <attachment id="11480" name="top.png" size="26174" author="fverzell" created="Wed, 30 May 2012 04:48:32 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvzzb:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>10075</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                </customfields>
    </item>
</channel>
</rss>