<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:17:12 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-1503] Clients application IO errors and overloaded system messages</title>
                <link>https://jira.whamcloud.com/browse/LU-1503</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Dear Support,&lt;br/&gt;
we experienced some problem with our Lustre FS. &lt;br/&gt;
Our users complained that at the following times their jobs were killed due IO errors.&lt;/p&gt;

&lt;p&gt;Saturday 09 June 05:58 &lt;br/&gt;
Monday   11 June 12:47&lt;/p&gt;

&lt;p&gt;We collected the logs from the servers and clients side and actually we saw a lot of messages/errors that we have problem to &quot;decode&quot;.&lt;br/&gt;
Could you please help us to understand why this problem arises?&lt;br/&gt;
In the specific we don&apos;t understand if it&apos;s really a overload problem related to the hardware or configuration we used otherwise some congestion/bug issue...&lt;/p&gt;

&lt;p&gt;Usually the FS seems to work correctly but suddenly the logs fill up with these messages.&lt;/p&gt;

&lt;p&gt;Regards&lt;br/&gt;
  Nicola&lt;/p&gt;</description>
                <environment>----------------------------------------------------------------------------------------------------&lt;br/&gt;
## MDS HW ##&lt;br/&gt;
---------------------------------------------------------------------------------------------------- &lt;br/&gt;
Linux XXXX.admin.cscs.ch 2.6.32-220.7.1.el6_lustre.g9c8f747.x86_64 &lt;br/&gt;
Architecture: x86_64 &lt;br/&gt;
CPU op-mode(s): 32-bit, 64-bit &lt;br/&gt;
Byte Order: Little Endian &lt;br/&gt;
CPU(s): 16 &lt;br/&gt;
Vendor ID: AuthenticAMD &lt;br/&gt;
CPU family: 16 &lt;br/&gt;
64Gb RAM &lt;br/&gt;
Interconnect IB 40Gb/s &lt;br/&gt;
---&lt;br/&gt;
MDT LSI 5480 Pikes Peak &lt;br/&gt;
SSDs SLC &lt;br/&gt;
---------------------------------------------------------------------------------------------------- &lt;br/&gt;
&lt;br/&gt;
&lt;br/&gt;
----------------------------------------------------------------------------------------------------&lt;br/&gt;
## OSS HW ##&lt;br/&gt;
---------------------------------------------------------------------------------------------------- &lt;br/&gt;
Architecture: x86_64 &lt;br/&gt;
CPU op-mode(s): 32-bit, 64-bit &lt;br/&gt;
Byte Order: Little Endian &lt;br/&gt;
CPU(s): 32 &lt;br/&gt;
Vendor ID: GenuineIntel &lt;br/&gt;
CPU family: 6 &lt;br/&gt;
64Gb RAM &lt;br/&gt;
Interconnect IB 40Gb/s &lt;br/&gt;
---&lt;br/&gt;
OSTs ---&amp;gt; LSI 7900 SATA Disks&lt;br/&gt;
---------------------------------------------------------------------------------------------------- &lt;br/&gt;
&lt;br/&gt;
&lt;br/&gt;
----------------------------------------------------------------------------------------------------&lt;br/&gt;
## Router nodes ##&lt;br/&gt;
---------------------------------------------------------------------------------------------------- &lt;br/&gt;
12 Cray XE6 Service nodes as router nodes - IB 40Gb/s &lt;br/&gt;
----------------------------------------------------------------------------------------------------&lt;br/&gt;
&lt;br/&gt;
&lt;br/&gt;
----------------------------------------------------------------------------------------------------&lt;br/&gt;
## Clients ##&lt;br/&gt;
---------------------------------------------------------------------------------------------------- &lt;br/&gt;
~ 1500 Cray XE6 nodes - Lustre 1.8.6 &lt;br/&gt;
----------------------------------------------------------------------------------------------------&lt;br/&gt;
&lt;br/&gt;
&lt;br/&gt;
----------------------------------------------------------------------------------------------------&lt;br/&gt;
## LUSTRE Config ##&lt;br/&gt;
---------------------------------------------------------------------------------------------------- &lt;br/&gt;
1 MDS + 1 fail over  (MDT on SSD array)&lt;br/&gt;
12 OSSs - 6 OSTs per OSS (72 OSTs)&lt;br/&gt;
&lt;br/&gt;
Lustre Servers ---&amp;gt; 2.2.51.0&lt;br/&gt;
Lustre Clients ---&amp;gt; 1.8.6 (~1500 nodes) / 2.2.51.0 (~20 nodes)&lt;br/&gt;
----------------------------------------------------------------------------------------------------</environment>
        <key id="14835">LU-1503</key>
            <summary>Clients application IO errors and overloaded system messages</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="cliffw">Cliff White</assignee>
                                    <reporter username="nbianchi">Nicola Bianchi</reporter>
                        <labels>
                    </labels>
                <created>Mon, 11 Jun 2012 10:39:21 +0000</created>
                <updated>Mon, 20 Jan 2014 18:09:39 +0000</updated>
                            <resolved>Mon, 20 Jan 2014 18:09:39 +0000</resolved>
                                    <version>Lustre 2.2.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>9</watches>
                                                                            <comments>
                            <comment id="40350" author="cliffw" created="Mon, 11 Jun 2012 12:10:27 +0000"  >&lt;p&gt;Do you know what the load factor was on the servers when the outages occurred? It appears you are having client evictions, due to server timeouts.&lt;/p&gt;</comment>
                            <comment id="40408" author="nbianchi" created="Tue, 12 Jun 2012 01:46:33 +0000"  >&lt;p&gt;Cliff,&lt;br/&gt;
it seems that there was more load than usual on the Lustre servers ... &lt;br/&gt;
Could be interesting understand why because the HW behind is pretty powerful (12 brand new Sandy Bridge OSSs and 6 LSI 7900 Controllers with 8Gbit FC interfaces).&lt;/p&gt;

&lt;p&gt;Could be some configuration problem?&lt;/p&gt;

&lt;p&gt;Regards&lt;br/&gt;
 Nicola&lt;/p&gt;</comment>
                            <comment id="40412" author="fverzell" created="Tue, 12 Jun 2012 03:44:42 +0000"  >&lt;p&gt;/modprobe.conf.local of router nodes on our Cray XE system:&lt;/p&gt;

&lt;p&gt;---------------------------------------------------------&lt;br/&gt;
options dvsipc_lnet lnd_name=gni1&lt;br/&gt;
options qla2xxx ql2xfailover=0&lt;br/&gt;
options libcfs libcfs_panic_on_lbug=1&lt;br/&gt;
options lnet ip2nets=&quot;gni 172.27.&lt;b&gt;.&lt;/b&gt;; o2ib2 148.187.7.*&quot;&lt;br/&gt;
options lnet routes=&quot;gni 148.187.6.&lt;span class=&quot;error&quot;&gt;&amp;#91;71-82&amp;#93;&lt;/span&gt;@o2ib2; o2ib2 &lt;span class=&quot;error&quot;&gt;&amp;#91;220,226,270,304,394,436,474,484,1364,1386,1476,1530&amp;#93;&lt;/span&gt;@gni&quot;&lt;br/&gt;
options lnet check_routers_before_use=1 router_ping_timeout=5&lt;br/&gt;
options lnet dead_router_check_interval=60 live_router_check_interval=60&lt;br/&gt;
options kptllnd max_nodes=5000 credits=2048 timeout=250&lt;/p&gt;
&lt;ol&gt;
	&lt;li&gt;
	&lt;ol&gt;
		&lt;li&gt;Enable MSI for Mellanox ConnectX HCAs&lt;br/&gt;
options mlx4_core msi_x=1&lt;br/&gt;
---------------------------------------------------------&lt;/li&gt;
	&lt;/ol&gt;
	&lt;/li&gt;
&lt;/ol&gt;



&lt;p&gt;Modprobe.conf.local of a client that mount the file system on our Cray XE:&lt;br/&gt;
---------------------------------------------------------&lt;br/&gt;
options lnet ip2nets=&quot;gni1,gni 172.27.&lt;b&gt;.&lt;/b&gt; ;o2ib 148.187.6.&lt;b&gt;;o2ib2 148.187.7.&lt;/b&gt;&quot;&lt;br/&gt;
options lnet routes=&quot;gni1 148.187.6.&lt;span class=&quot;error&quot;&gt;&amp;#91;142,140,141,146&amp;#93;&lt;/span&gt;@o2ib; o2ib &lt;span class=&quot;error&quot;&gt;&amp;#91;62,174,967,1110&amp;#93;&lt;/span&gt;@gni1; gni 148.187.7.&lt;span class=&quot;error&quot;&gt;&amp;#91;71-82&amp;#93;&lt;/span&gt;@o2ib2; o2ib2 &lt;span class=&quot;error&quot;&gt;&amp;#91;220,226,270,304,394,436,474,484,1364,1386,1476,1530&amp;#93;&lt;/span&gt;@gni&quot;&lt;br/&gt;
options lnet check_routers_before_use=1 router_ping_timeout=5&lt;br/&gt;
options lnet dead_router_check_interval=60 live_router_check_interval=60&lt;br/&gt;
options qla2xxx ql2xlogintimeout=0&lt;br/&gt;
options ost oss_num_threads=256&lt;br/&gt;
options libcfs libcfs_panic_on_lbug=1&lt;br/&gt;
options dvsipc_lnet lnd_name=gni1&lt;br/&gt;
---------------------------------------------------------&lt;/p&gt;


&lt;p&gt;modprobe.conf of a server:&lt;br/&gt;
---------------------------------------------------------&lt;br/&gt;
options lnet networks=&quot;o2ib2(ib0)&quot;&lt;br/&gt;
options lnet routes=&quot;gni 148.187.7.&lt;span class=&quot;error&quot;&gt;&amp;#91;71-82&amp;#93;&lt;/span&gt;@o2ib2&quot;&lt;br/&gt;
options lnet check_routers_before_use=1&lt;br/&gt;
options lnet router_ping_timeout=5&lt;br/&gt;
options lnet dead_router_check_interval=60&lt;br/&gt;
options lnet live_router_check_interval=60&lt;br/&gt;
---------------------------------------------------------&lt;/p&gt;


</comment>
                            <comment id="40435" author="cliffw" created="Tue, 12 Jun 2012 13:14:23 +0000"  >&lt;p&gt;I don&apos;t think so, with configuration issues things tend to Not Work. I think this may be network-load related, are you sure the network is healthy? What does load look like on the router nodes?&lt;/p&gt;</comment>
                            <comment id="40489" author="nbianchi" created="Wed, 13 Jun 2012 07:42:08 +0000"  >&lt;p&gt;Cliff,&lt;br/&gt;
thanks for the quick answer.&lt;br/&gt;
For what we saw the 12 routers nodes seems to be pretty quiet but we will monitor the situation.&lt;/p&gt;

&lt;p&gt;For the infiniband network part we are checking if all is working well, so far the common IB tests reports a good shape but we see some strange behavior for clients that directly mount the FS through IB. In the logs we found messages like this:&lt;/p&gt;

&lt;p&gt;  &lt;span class=&quot;error&quot;&gt;&amp;#91;163598.503276&amp;#93;&lt;/span&gt; LustreError: 11-0: an error occurred while communicating with 148.187.7.106@o2ib2. The ost_connect operation failed with -16&lt;br/&gt;
  &lt;span class=&quot;error&quot;&gt;&amp;#91;163598.503287&amp;#93;&lt;/span&gt; LustreError: Skipped 1 previous similar message&lt;/p&gt;

&lt;p&gt;...and we figured out that some common operation like untar a 2GB file sometime hang in conjunction the messages above.  &lt;/p&gt;

&lt;p&gt;Any advice is welcome.&lt;/p&gt;

&lt;p&gt;Regards&lt;br/&gt;
  Nicola&lt;/p&gt;</comment>
                            <comment id="40514" author="cliffw" created="Wed, 13 Jun 2012 11:39:15 +0000"  >&lt;p&gt;The -EBUSY would indicate that the OST was in recovery, or disconnecting/reconnecting. You should not see that message in normal operation. Are there any &apos;slow IO&apos; messages in your logs? &lt;br/&gt;
You need to check the logs for 148.187.7.106 and see what was happening on that server when the client reported the -EBUSY (-16)&lt;/p&gt;

&lt;p&gt;I would suggest running lnet_selftest to verify your network health, there are example scripts in the Lustre Manual. &lt;/p&gt;
                            <comment id="40560" author="nbianchi" created="Thu, 14 Jun 2012 08:17:36 +0000"  >&lt;p&gt;Cliff,&lt;br/&gt;
in the logs for 148.187.7.106 (weisshorn06) we got this kind of messages:&lt;/p&gt;

&lt;p&gt;-----------------------------------------------------------------------------------&lt;br/&gt;
Jun 14 09:50:47 weisshorn06 kernel: Lustre: scratch-OST0041: Client 1d3575c8-d2c3-3917-0d4b-ae0b2c9af8d8 (at 148.187.6.237@o2ib2) reconnecting&lt;br/&gt;
Jun 14 09:50:47 weisshorn06 kernel: Lustre: Skipped 5 previous similar messages&lt;br/&gt;
Jun 14 09:50:47 weisshorn06 kernel: Lustre: scratch-OST0041: Client 1d3575c8-d2c3-3917-0d4b-ae0b2c9af8d8 (at 148.187.6.237@o2ib2) refused reconnection, still busy with 11 active R&lt;br/&gt;
PCs&lt;br/&gt;
Jun 14 09:50:47 weisshorn06 kernel: Lustre: Skipped 1 previous similar message&lt;br/&gt;
Jun 14 09:50:47 weisshorn06 kernel: LustreError: 6370:0:(ldlm_lib.c:2725:target_bulk_io()) @@@ bulk GET failed: rc &lt;del&gt;107  req@ffff88101b3b9050 x1403838171590959/t0(0) o4&lt;/del&gt;&amp;gt;1d3575c8-&lt;br/&gt;
d2c3-3917-0d4b-ae0b2c9af8d8@148.187.6.237@o2ib2:0/0 lens 456/416 e 0 to 0 dl 1339660290 ref 1 fl Interpret:/0/0 rc 0/0&lt;br/&gt;
Jun 14 09:50:47 weisshorn06 kernel: Lustre: scratch-OST0041: Bulk IO write error with 1d3575c8-d2c3-3917-0d4b-ae0b2c9af8d8 (at 148.187.6.237@o2ib2), client will retry: rc -107&lt;br/&gt;
Jun 14 09:50:47 weisshorn06 kernel: LustreError: 6370:0:(ldlm_lib.c:2725:target_bulk_io()) Skipped 3 previous similar messages&lt;br/&gt;
Jun 14 09:50:47 weisshorn06 kernel: LustreError: 8029:0:(ldlm_lib.c:2725:target_bulk_io()) @@@ bulk GET failed: rc &lt;del&gt;107  req@ffff880c9942fc00 x1403838171590983/t0(0) o4&lt;/del&gt;&amp;gt;1d3575c8-&lt;br/&gt;
d2c3-3917-0d4b-ae0b2c9af8d8@148.187.6.237@o2ib2:0/0 lens 456/416 e 0 to 0 dl 1339660290 ref 1 fl Interpret:/0/0 rc 0/0&lt;br/&gt;
Jun 14 09:50:47 weisshorn06 kernel: LustreError: 8029:0:(ldlm_lib.c:2725:target_bulk_io()) Skipped 4 previous similar messages&lt;br/&gt;
Jun 14 09:51:11 weisshorn06 kernel: Lustre: scratch-OST0041: Client 1d3575c8-d2c3-3917-0d4b-ae0b2c9af8d8 (at 148.187.6.237@o2ib2) reconnecting&lt;br/&gt;
-----------------------------------------------------------------------------------&lt;/p&gt;

&lt;p&gt;I attach also the complete log that contain all the messages from all the OSS/MDS.&lt;/p&gt;

&lt;p&gt;Regards&lt;br/&gt;
  Nicola&lt;/p&gt;</comment>
                            <comment id="40561" author="nbianchi" created="Thu, 14 Jun 2012 08:28:03 +0000"  >&lt;p&gt;Cliff,&lt;br/&gt;
about the lnet_selftest we run a batch script like in the manual and we got this:&lt;/p&gt;

&lt;p&gt;-------------------------------------------------------------------------------------&lt;br/&gt;
SESSION: read/write TIMEOUT: 300 FORCE: No&lt;br/&gt;
148.187.7.&lt;span class=&quot;error&quot;&gt;&amp;#91;101-114&amp;#93;&lt;/span&gt;@o2ib2 are added to session&lt;br/&gt;
148.187.6.&lt;span class=&quot;error&quot;&gt;&amp;#91;6,7,8&amp;#93;&lt;/span&gt;@o2ib2 are added to session&lt;br/&gt;
148.187.6.&lt;span class=&quot;error&quot;&gt;&amp;#91;6,7,8&amp;#93;&lt;/span&gt;@o2ib2 are added to session&lt;br/&gt;
Test was added successfully&lt;br/&gt;
Test was added successfully&lt;br/&gt;
bulk_rw is running now&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;LNet Rates of servers&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;R&amp;#93;&lt;/span&gt; Avg: 1395     RPC/s Min: 1        RPC/s Max: 5802     RPC/s&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;W&amp;#93;&lt;/span&gt; Avg: 1555     RPC/s Min: 1        RPC/s Max: 6653     RPC/s&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;LNet Bandwidth of servers&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;R&amp;#93;&lt;/span&gt; Avg: 72.86    MB/s  Min: 0.00     MB/s  Max: 163.51   MB/s&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;W&amp;#93;&lt;/span&gt; Avg: 163.19   MB/s  Min: 0.00     MB/s  Max: 855.02   MB/s&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;LNet Rates of servers&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;R&amp;#93;&lt;/span&gt; Avg: 1450     RPC/s Min: 15       RPC/s Max: 5744     RPC/s&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;W&amp;#93;&lt;/span&gt; Avg: 1610     RPC/s Min: 15       RPC/s Max: 6599     RPC/s&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;LNet Bandwidth of servers&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;R&amp;#93;&lt;/span&gt; Avg: 105.09   MB/s  Min: 0.00     MB/s  Max: 240.24   MB/s&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;W&amp;#93;&lt;/span&gt; Avg: 163.03   MB/s  Min: 0.00     MB/s  Max: 858.62   MB/s&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;LNet Rates of servers&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;R&amp;#93;&lt;/span&gt; Avg: 1459     RPC/s Min: 0        RPC/s Max: 5800     RPC/s&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;W&amp;#93;&lt;/span&gt; Avg: 1619     RPC/s Min: 0        RPC/s Max: 6657     RPC/s&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;LNet Bandwidth of servers&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;R&amp;#93;&lt;/span&gt; Avg: 88.47    MB/s  Min: 0.00     MB/s  Max: 199.88   MB/s&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;W&amp;#93;&lt;/span&gt; Avg: 163.60   MB/s  Min: 0.00     MB/s  Max: 858.06   MB/s&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;LNet Rates of servers&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;R&amp;#93;&lt;/span&gt; Avg: 2946     RPC/s Min: 0        RPC/s Max: 7408     RPC/s&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;W&amp;#93;&lt;/span&gt; Avg: 3104     RPC/s Min: 0        RPC/s Max: 8241     RPC/s&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;LNet Bandwidth of servers&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;R&amp;#93;&lt;/span&gt; Avg: 85.92    MB/s  Min: 0.00     MB/s  Max: 192.92   MB/s&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;W&amp;#93;&lt;/span&gt; Avg: 163.32   MB/s  Min: 0.00     MB/s  Max: 838.31   MB/s&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;LNet Rates of servers&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;R&amp;#93;&lt;/span&gt; Avg: 1374     RPC/s Min: 0        RPC/s Max: 5569     RPC/s&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;W&amp;#93;&lt;/span&gt; Avg: 1534     RPC/s Min: 0        RPC/s Max: 6419     RPC/s&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;LNet Bandwidth of servers&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;R&amp;#93;&lt;/span&gt; Avg: 70.01    MB/s  Min: 0.00     MB/s  Max: 162.32   MB/s&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;W&amp;#93;&lt;/span&gt; Avg: 165.01   MB/s  Min: 0.00     MB/s  Max: 854.23   MB/s&lt;br/&gt;
session is ended&lt;br/&gt;
lustre_self_test.sh: line 17: 17805 Terminated              lst stat servers&lt;br/&gt;
-------------------------------------------------------------------------------------&lt;/p&gt;

&lt;p&gt;Still difficult for us to understand the exact meaning of the results but running multiple times we got always the same scores.&lt;br/&gt;
At a glance: we run 3 clients (that time to time loses the connections) against the 14 nodes of the Lustre FS. &lt;/p&gt;

&lt;p&gt;Here the script we used:&lt;br/&gt;
-------------------------------------------------------------------------------------&lt;br/&gt;
#!/bin/bash&lt;br/&gt;
export LST_SESSION=$$&lt;br/&gt;
lst new_session read/write&lt;br/&gt;
lst add_group servers 148.187.7.&lt;span class=&quot;error&quot;&gt;&amp;#91;101-114&amp;#93;&lt;/span&gt;@o2ib2&lt;br/&gt;
lst add_group readers 148.187.6.&lt;span class=&quot;error&quot;&gt;&amp;#91;6,7,8&amp;#93;&lt;/span&gt;@o2ib2&lt;br/&gt;
lst add_group writers 148.187.6.&lt;span class=&quot;error&quot;&gt;&amp;#91;6,7,8&amp;#93;&lt;/span&gt;@o2ib2&lt;br/&gt;
lst add_batch bulk_rw&lt;br/&gt;
lst add_test --batch bulk_rw --from readers --to servers \&lt;br/&gt;
brw read check=simple size=1M&lt;br/&gt;
lst add_test --batch bulk_rw --from writers --to servers \&lt;br/&gt;
brw write check=full size=4K&lt;br/&gt;
##start running&lt;br/&gt;
lst run bulk_rw&lt;br/&gt;
##display server stats for 30 seconds&lt;br/&gt;
lst stat servers &amp;amp; sleep 30; kill $!&lt;br/&gt;
##tear down&lt;br/&gt;
lst end_session&lt;br/&gt;
-------------------------------------------------------------------------------------&lt;/p&gt;

&lt;p&gt;Regards&lt;br/&gt;
  Nicola&lt;/p&gt;</comment>
                            <comment id="40572" author="cliffw" created="Thu, 14 Jun 2012 11:38:40 +0000"  >&lt;p&gt;Those result tell you how much IO you can generate from three clients. How many clients do you normally run?&lt;br/&gt;
I would suggest doing a test with &lt;em&gt;all&lt;/em&gt; your clients, and monitor your network hardware for errors. &lt;br/&gt;
Also, please list the distribution and Lustre version for your clients and servers. &lt;/p&gt;</comment>
                            <comment id="40573" author="cliffw" created="Thu, 14 Jun 2012 12:01:54 +0000"  >&lt;p&gt;From the router logs, it appears the gni side is not especially happy, there are timeouts, mis-routes:&lt;/p&gt;

&lt;p&gt;Jun 11 12:29:48 nid00484 kernel: LNet: 12032:0:(gnilnd_conn.c:1872:kgnilnd_reaper_dgram_check()) GNILND_DGRAM_REQ datagram to 385@gni timed out @ 63s dgram 0xffff8803cf3e26c8 state GNILND_DGRAM_POSTED conn 0xffff880406d64400&lt;br/&gt;
Jun 11 12:29:48 nid00484 kernel: HWERR&lt;span class=&quot;error&quot;&gt;&amp;#91;2899&amp;#93;&lt;/span&gt;:0x0b11:SSID Detected Misrouted Packet:Info1=0x8001025000014281:Info2=0x0:Info3=0x2072&lt;br/&gt;
Jun 11 12:29:51 nid00484 kernel: LNet: could not send to 385@gni due to connection setup failure after 66 seconds&lt;br/&gt;
Jun 11 12:29:51 nid00484 kernel: LNet: Skipped 12 previous similar messages&lt;br/&gt;
Jun 11 12:29:51 nid00484 kernel: LNet: 12032:0:(gnilnd_conn.c:790:kgnilnd_process_dgram()) hardware timeout for connect to 385@gni after 0 seconds. Is node dead?&lt;br/&gt;
Jun 11 12:29:51 nid00484 kernel: HWERR&lt;span class=&quot;error&quot;&gt;&amp;#91;2900&amp;#93;&lt;/span&gt;:0x0b11:SSID Detected Misrouted Packet:Info1=0x8001025000014281:Info2=0x0:Info3=0x20cc&lt;br/&gt;
Jun 11 12:30:12 nid00484 kernel: HWERR&lt;span class=&quot;error&quot;&gt;&amp;#91;2901&amp;#93;&lt;/span&gt;:0x0b11:SSID Detected Misrouted Packet:Info1=0x8001025000014281:Info2=0x0:Info3=0x2126&lt;br/&gt;
Jun 11 12:30:34 nid00484 kernel: HWERR&lt;span class=&quot;error&quot;&gt;&amp;#91;2902&amp;#93;&lt;/span&gt;:0x0b11:SSID Detected Misrouted Packet:Info1=0x8001025000014281:Info2=0x0:Info3=0x2180&lt;br/&gt;
Jun 11 12:30:56 nid00484 kernel: LNet: could not send to 385@gni due to connection setup failure after 65 seconds&lt;/p&gt;

&lt;p&gt;can you identify which node uses &apos;385@gni&apos; as it&apos;s address? Checking that node&apos;s logs might produce some useful information. &lt;/p&gt;</comment>
                            <comment id="40649" author="nbianchi" created="Fri, 15 Jun 2012 09:25:12 +0000"  >&lt;p&gt;Cliff,&lt;br/&gt;
usually we run from about 1500 Cray XE6 nodes and ~20 standard Linux Infiniband nodes. &lt;/p&gt;

&lt;p&gt;Cheers&lt;br/&gt;
  Nicola&lt;/p&gt;</comment>
                            <comment id="40650" author="nbianchi" created="Fri, 15 Jun 2012 09:29:11 +0000"  >&lt;p&gt;Cliff,&lt;br/&gt;
about the &apos;385@gni&apos;:&lt;/p&gt;

&lt;p&gt;  this node failed (HW Failure nid00385 c4-0c0s0n1) and for this reason you see the error.&lt;/p&gt;

&lt;p&gt;Cheers&lt;br/&gt;
 Nicola&lt;/p&gt;</comment>
                            <comment id="40732" author="nbianchi" created="Mon, 18 Jun 2012 05:52:20 +0000"  >&lt;p&gt;Cliff,&lt;br/&gt;
we noticed that when on our 1500 nodes Cray machine we have congestion problems (due HW problem for instance) the Lustre servers suffer the situation, as you can see in the logs.&lt;/p&gt;

&lt;p&gt;There is some timeouts parameter that we can tune to mitigate the situation?&lt;/p&gt;

&lt;p&gt;We know that on the Cray side the time to reconfigure the routing due an error should not be so long but anyway it seems that something with the filesystem is still hanging  ...&lt;/p&gt;

&lt;p&gt;Regards&lt;br/&gt;
  Nicola&lt;/p&gt;</comment>
                            <comment id="40759" author="cliffw" created="Mon, 18 Jun 2012 10:28:19 +0000"  >&lt;p&gt;I will investigate, but most timeouts in lustre are auto-tuned. I am talking to our engineers. &lt;/p&gt;</comment>
                            <comment id="40984" author="liang" created="Thu, 21 Jun 2012 08:26:42 +0000"  >&lt;p&gt;I didn&apos;t get chance to look into it, but seems it&apos;s gnilnd right? It&apos;s not in Lustre source tree yet, where can we find source code?&lt;/p&gt;

&lt;p&gt;Doug, I&apos;ve added you to CC list, could you please look into it? I&apos;m quite busy in recent a few days and have to attend a two days &lt;br/&gt;
meeting since tomorrow.&lt;/p&gt;</comment>
                            <comment id="41033" author="cliffw" created="Fri, 22 Jun 2012 10:09:31 +0000"  >&lt;p&gt;Hi Cliff,&lt;br/&gt;
 for your information, on weisshorn02 you can find in /var/log/cluster.log all the collected log from the entire cluster.&lt;br/&gt;
And if you run the command &apos;ltop&apos; you can have a look of the on line performance.&lt;/p&gt;

&lt;p&gt;What do you think about these parameters to pass to router/client/servers does them make any sense to you?&lt;/p&gt;

&lt;ol&gt;
	&lt;li&gt;
	&lt;ol&gt;
		&lt;li&gt;Router&lt;/li&gt;
	&lt;/ol&gt;
	&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;options ko2iblnd timeout=100 peer_timeout=130&lt;br/&gt;
options ko2iblnd credits=2048 ntx=2048&lt;br/&gt;
options ko2iblnd peer_credits=126 concurrent_sends=63 peer_buffer_credits=128&lt;br/&gt;
options kgnilnd credits=2048 peer_health=1&lt;/p&gt;

&lt;p&gt;options lnet check_routers_before_use=1&lt;br/&gt;
options lnet dead_router_check_interval=60&lt;br/&gt;
options lnet live_router_check_interval=60&lt;br/&gt;
options lnet router_ping_timeout=50&lt;br/&gt;
options lnet large_router_buffers=1024 small_router_buffers=16384&lt;/p&gt;

&lt;ol&gt;
	&lt;li&gt;
	&lt;ol&gt;
		&lt;li&gt;Server&lt;/li&gt;
	&lt;/ol&gt;
	&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;options ko2iblnd timeout=100 peer_timeout=0 keepalive=30&lt;br/&gt;
options ko2iblnd credits=2048 ntx=2048&lt;br/&gt;
options ko2iblnd peer_credits=126 concurrent_sends=63&lt;/p&gt;

&lt;p&gt;options lnet avoid_asym_router_failure=1&lt;br/&gt;
options lnet dead_router_check_interval=60&lt;br/&gt;
options lnet live_router_check_interval=60&lt;br/&gt;
options lnet check_routers_before_use=1&lt;/p&gt;

&lt;ol&gt;
	&lt;li&gt;
	&lt;ol&gt;
		&lt;li&gt;Client&lt;/li&gt;
	&lt;/ol&gt;
	&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;options ko2iblnd timeout=100 peer_timeout=0 keepalive=30&lt;br/&gt;
options ko2iblnd credits=2048 ntx=2048&lt;br/&gt;
options ko2iblnd peer_credits=126 concurrent_sends=63&lt;/p&gt;

&lt;p&gt;options lnet avoid_asym_router_failure=1&lt;br/&gt;
options lnet dead_router_check_interval=60&lt;br/&gt;
options lnet live_router_check_interval=60&lt;br/&gt;
options lnet check_routers_before_use=1&lt;/p&gt;</comment>
                            <comment id="41034" author="cliffw" created="Fri, 22 Jun 2012 10:53:40 +0000"  >&lt;p&gt;The area of concern for me is this:&lt;br/&gt;
#cat /proc/sys/lnet/peers&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;nid                      refs state  last   max   rtr   min    tx   min queue
....
148.187.7.71@o2ib2          6    up  9999     8     8     8     5  -177 1824
148.187.7.72@o2ib2          7    up  9999     8     8     8     5  -177 1824
148.187.7.73@o2ib2          5    up  9999     8     8     8     6  -175 1216
148.187.7.74@o2ib2          6    up  9999     8     8     8     5  -177 1824
148.187.7.75@o2ib2          6    up  9999     8     8     8     5  -178 1824
148.187.7.76@o2ib2          8    up  9999     8     8     8     4  -177 2432
148.187.7.77@o2ib2          6    up  9999     8     8     8     5  -177 1824
148.187.7.78@o2ib2          7    up  9999     8     8     8     5  -178 1824
148.187.7.79@o2ib2          7    up  9999     8     8     8     4  -176 2432
148.187.7.80@o2ib2          7    up  9999     8     8     8     4  -176 2432
148.187.7.81@o2ib2          6    up  9999     8     8     8     5  -178 1824
148.187.7.82@o2ib2          7    up  9999     8     8     8     5  -178 1824
...
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Those are the gni routers, and that is not especially normal.  &lt;br/&gt;
All of your options that have the word &apos;check&apos; in them are good, normal. &lt;br/&gt;
We think the peer_credits is high, we would advise &lt;/p&gt;

&lt;p&gt;options ko2iblnd peer_credits=16 concurrent_sends=16&lt;/p&gt;

&lt;p&gt;The &apos;min&apos; parameter, when negative indicates the number of queued messages, the &apos;queue&apos; indicates&lt;br/&gt;
the number of bytes queued from that peer. So that is an indication things are backed up.&lt;/p&gt;


</comment>
                            <comment id="41044" author="doug" created="Fri, 22 Jun 2012 14:40:59 +0000"  >&lt;p&gt;I&apos;m suspecting that the messages_router_node.log gives a good indication of what may be happening:&lt;/p&gt;

&lt;p&gt;At: &lt;br/&gt;
12:28:23 - problems communicating with one node is reported.&lt;br/&gt;
12:28:28 - hardware quiesce is reported.  I&apos;m not familiar with the gnilnd (or Gemini), but this implies to me that a hardware bit in Gemini has flipped telling software to back off...things are not good.&lt;br/&gt;
12:28:40 - All the gnilnd threads are now paused due to the hardware quiesce.  At this point, timeouts and message drops are rampant.  That would be due to everything being paused.&lt;br/&gt;
12:28:46 - All threads are back up and running again allowing traffic to flow.&lt;/p&gt;

&lt;p&gt;So, this is how I read what is going on (but a Gemini expert is needed to clarify):&lt;br/&gt;
The node which has a hardware fault somehow triggers back pressure on the gnilnd of the router.  This trigger the hardware to complain forcing the gnilnd to shut down all traffic on that interface for 6 seconds!  In that time, all the IB queues which need to forward to that interface will back up possibly causing many critical message drops.  It is hard to say how long after the 6 second lock down it takes for everything to recover.&lt;/p&gt;</comment>
                            <comment id="41073" author="nbianchi" created="Mon, 25 Jun 2012 06:33:31 +0000"  >&lt;p&gt;Dear Support,&lt;/p&gt;

&lt;p&gt;this morning: Jun 25 11:51:18 the MDS crashed due a kernel panic (weisshorn01) just after the crash for unknown reasons of the OSS weisshorn14.&lt;br/&gt;
All logs are on weisshorn02:/var/log/cluster.log&lt;/p&gt;


&lt;p&gt;Regards&lt;br/&gt;
  Nicola &lt;/p&gt;</comment>
                            <comment id="41086" author="cliffw" created="Mon, 25 Jun 2012 10:03:05 +0000"  >
&lt;p&gt;Crash one:&lt;br/&gt;
un  4 10:10:57 weisshorn01 kernel: LustreError: 4365:0:(mdd_object.c:635:mdd_big_lmm_get()) ASSERTION( ma-&amp;gt;ma_lmm_size &amp;gt; 0 ) failed:&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: LustreError: 4365:0:(mdd_object.c:635:mdd_big_lmm_get()) LBUG&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: Pid: 4365, comm: mdt_09&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel:&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: Call Trace:&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa03c0915&amp;gt;&amp;#93;&lt;/span&gt; libcfs_debug_dumpstack+0x55/0x80 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa03c0e47&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0x47/0xb0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0bf29e3&amp;gt;&amp;#93;&lt;/span&gt; mdd_big_lmm_get+0x433/0x4f0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdd&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0bfb9a0&amp;gt;&amp;#93;&lt;/span&gt; ? mdd_get_md+0xa0/0x2d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdd&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0bf34ee&amp;gt;&amp;#93;&lt;/span&gt; __mdd_lmm_get+0x1ce/0x2c0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdd&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0bf6c09&amp;gt;&amp;#93;&lt;/span&gt; mdd_attr_get_internal+0x249/0x770 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdd&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0d1279f&amp;gt;&amp;#93;&lt;/span&gt; ? osd_object_read_lock+0x9f/0x140 &lt;span class=&quot;error&quot;&gt;&amp;#91;osd_ldiskfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0bf7188&amp;gt;&amp;#93;&lt;/span&gt; mdd_attr_get_internal_locked+0x58/0x80 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdd&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0604af0&amp;gt;&amp;#93;&lt;/span&gt; ? ldlm_completion_ast+0x0/0x6d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0c6b3c0&amp;gt;&amp;#93;&lt;/span&gt; ? mdt_blocking_ast+0x0/0x210 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0bf71ed&amp;gt;&amp;#93;&lt;/span&gt; mdd_attr_get+0x3d/0xa0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdd&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0d618bc&amp;gt;&amp;#93;&lt;/span&gt; cml_attr_get+0x6c/0x160 &lt;span class=&quot;error&quot;&gt;&amp;#91;cmm&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0625d84&amp;gt;&amp;#93;&lt;/span&gt; ? lustre_msg_get_opc+0x64/0xa0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0c77884&amp;gt;&amp;#93;&lt;/span&gt; mdt_getattr_internal+0x294/0xd00 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0c7adf5&amp;gt;&amp;#93;&lt;/span&gt; mdt_getattr_name_lock+0xd25/0x1700 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa062607d&amp;gt;&amp;#93;&lt;/span&gt; ? lustre_msg_buf+0x5d/0x60 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa064b756&amp;gt;&amp;#93;&lt;/span&gt; ? __req_capsule_get+0x176/0x640 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0627d74&amp;gt;&amp;#93;&lt;/span&gt; ? lustre_msg_get_flags+0x34/0x70 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0c7bcdd&amp;gt;&amp;#93;&lt;/span&gt; mdt_intent_getattr+0x2cd/0x4a0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0c79591&amp;gt;&amp;#93;&lt;/span&gt; mdt_intent_policy+0x2d1/0x600 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05eae69&amp;gt;&amp;#93;&lt;/span&gt; ldlm_lock_enqueue+0x2f9/0x830 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa060c197&amp;gt;&amp;#93;&lt;/span&gt; ldlm_handle_enqueue0+0x427/0xda0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0c791d6&amp;gt;&amp;#93;&lt;/span&gt; mdt_enqueue+0x46/0x130 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0c7184d&amp;gt;&amp;#93;&lt;/span&gt; mdt_handle_common+0x74d/0x1400 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0c725d5&amp;gt;&amp;#93;&lt;/span&gt; mdt_regular_handle+0x15/0x20 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa06333c1&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_server_handle_request+0x3c1/0xcb0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa03c14ce&amp;gt;&amp;#93;&lt;/span&gt; ? cfs_timer_arm+0xe/0x10 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa03cbef9&amp;gt;&amp;#93;&lt;/span&gt; ? lc_watchdog_touch+0x79/0x110 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa062d462&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_wait_event+0xb2/0x2c0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff810519c3&amp;gt;&amp;#93;&lt;/span&gt; ? __wake_up+0x53/0x70&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa06343cf&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_main+0x71f/0x1210 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0633cb0&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_main+0x0/0x1210 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c14a&amp;gt;&amp;#93;&lt;/span&gt; child_rip+0xa/0x20&lt;br/&gt;
Jun  4 10:10:57 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0633cb0&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_main+0x0/0x1210 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun  4 10:12:28 weisshorn02 heartbeat: &lt;span class=&quot;error&quot;&gt;&amp;#91;3260&amp;#93;&lt;/span&gt;: WARN: node weisshorn01.admin.cscs.ch: is dead&lt;/p&gt;</comment>
                            <comment id="41087" author="cliffw" created="Mon, 25 Jun 2012 10:03:56 +0000"  >&lt;p&gt;Second crash:&lt;/p&gt;

&lt;p&gt;Jun 25 11:51:18 weisshorn01 kernel: LustreError: 3589:0:(ldlm_lock.c:831:ldlm_lock_decref_and_cancel()) ASSERTION( lock != ((void *)0) ) failed:&lt;br/&gt;
Jun 25 11:51:18 weisshorn01 kernel: LustreError: 3589:0:(ldlm_lock.c:831:ldlm_lock_decref_and_cancel()) LBUG&lt;br/&gt;
Jun 25 11:51:18 weisshorn01 kernel: Pid: 3589, comm: mgs_scratch_not&lt;br/&gt;
Jun 25 11:51:18 weisshorn01 kernel:&lt;br/&gt;
Jun 25 11:51:18 weisshorn01 kernel: Call Trace:&lt;br/&gt;
Jun 25 11:51:18 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa03bc915&amp;gt;&amp;#93;&lt;/span&gt; libcfs_debug_dumpstack+0x55/0x80 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 25 11:51:18 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa03bce47&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0x47/0xb0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 25 11:51:18 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05e92e1&amp;gt;&amp;#93;&lt;/span&gt; ldlm_lock_decref_and_cancel+0x111/0x120 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 25 11:51:18 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0b6448b&amp;gt;&amp;#93;&lt;/span&gt; mgs_completion_ast_ir+0xfb/0x110 &lt;span class=&quot;error&quot;&gt;&amp;#91;mgs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 25 11:51:18 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0600810&amp;gt;&amp;#93;&lt;/span&gt; ldlm_cli_enqueue_local+0x1f0/0x4d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 25 11:51:18 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0b64390&amp;gt;&amp;#93;&lt;/span&gt; ? mgs_completion_ast_ir+0x0/0x110 &lt;span class=&quot;error&quot;&gt;&amp;#91;mgs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 25 11:51:18 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05ff940&amp;gt;&amp;#93;&lt;/span&gt; ? ldlm_blocking_ast+0x0/0x130 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 25 11:51:18 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0b6429f&amp;gt;&amp;#93;&lt;/span&gt; mgs_revoke_lock+0x13f/0x230 &lt;span class=&quot;error&quot;&gt;&amp;#91;mgs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 25 11:51:18 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05ff940&amp;gt;&amp;#93;&lt;/span&gt; ? ldlm_blocking_ast+0x0/0x130 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 25 11:51:18 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0b64390&amp;gt;&amp;#93;&lt;/span&gt; ? mgs_completion_ast_ir+0x0/0x110 &lt;span class=&quot;error&quot;&gt;&amp;#91;mgs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 25 11:51:18 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa03c65c1&amp;gt;&amp;#93;&lt;/span&gt; ? libcfs_debug_msg+0x41/0x50 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 25 11:51:18 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0b7a4ec&amp;gt;&amp;#93;&lt;/span&gt; mgs_ir_notify+0x11c/0x230 &lt;span class=&quot;error&quot;&gt;&amp;#91;mgs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 25 11:51:18 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8105e7f0&amp;gt;&amp;#93;&lt;/span&gt; ? default_wake_function+0x0/0x20&lt;br/&gt;
Jun 25 11:51:18 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81007966&amp;gt;&amp;#93;&lt;/span&gt; ? xen_timer_interrupt+0x16/0x1b0&lt;br/&gt;
Jun 25 11:51:18 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0b7a3d0&amp;gt;&amp;#93;&lt;/span&gt; ? mgs_ir_notify+0x0/0x230 &lt;span class=&quot;error&quot;&gt;&amp;#91;mgs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 25 11:51:18 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c14a&amp;gt;&amp;#93;&lt;/span&gt; child_rip+0xa/0x20&lt;br/&gt;
Jun 25 11:51:18 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0b7a3d0&amp;gt;&amp;#93;&lt;/span&gt; ? mgs_ir_notify+0x0/0x230 &lt;span class=&quot;error&quot;&gt;&amp;#91;mgs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 25 11:51:18 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0b7a3d0&amp;gt;&amp;#93;&lt;/span&gt; ? mgs_ir_notify+0x0/0x230 &lt;span class=&quot;error&quot;&gt;&amp;#91;mgs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 25 11:51:18 weisshorn01 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c140&amp;gt;&amp;#93;&lt;/span&gt; ? child_rip+0x0/0x20&lt;br/&gt;
Jun 25 11:51:18 weisshorn01 kernel:&lt;br/&gt;
Jun 25 11:51:18 weisshorn01 kernel: Kernel panic - not syncing: LBUG&lt;br/&gt;
Jun 25 11:51:18 weisshorn01 kernel: Pid: 3589, comm: mgs_scratch_not Not tainted 2.6.32-220.7.1.el6_lustre.g9c8f747.x86_64 #1&lt;/p&gt;</comment>
                            <comment id="41088" author="cliffw" created="Mon, 25 Jun 2012 10:07:45 +0000"  >&lt;p&gt;The first LBUG appears to be this:&lt;a href=&quot;http://jira.whamcloud.com/browse/LU-1384&quot; class=&quot;external-link&quot; rel=&quot;nofollow&quot;&gt;http://jira.whamcloud.com/browse/LU-1384&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="41094" author="cliffw" created="Mon, 25 Jun 2012 10:58:57 +0000"  >&lt;p&gt;It seems that you are running a patched version of 2.2 because Lustre-2.2.51-2.6.32_220.7.1.el6_lustre.g9c8f747.x86_64_gd2c1a39.x86_64 indicates a version of Lustre taken from gerrit rather than an official release. Which issue did this address? Are you comfortable with how to apply patches to your Lustre version so that you can also add the fix for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1384&quot; title=&quot;MDS Kernel Panic when trying to mount the lustre file system&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1384&quot;&gt;&lt;del&gt;LU-1384&lt;/del&gt;&lt;/a&gt;?&lt;/p&gt;</comment>
                            <comment id="41119" author="nbianchi" created="Tue, 26 Jun 2012 02:15:21 +0000"  >&lt;p&gt;Cliff,&lt;br/&gt;
my colleague told me that we are using the version  Lustre-2.2.51-2.6.32_220.7.1.el6_lustre.g9c8f747.x86_64_gd2c1a39.x86_64 because this was the only one that we were able to use on our sandy-bridge OSSes. With other versions we experienced instability or inability to boot the servers.&lt;/p&gt;

&lt;p&gt;About the patches I guess that with some procedure we should be able to get it done.&lt;br/&gt;
Anyway, we have for sure to wait at least for our next maintenance window.&lt;/p&gt;

&lt;p&gt;Regards&lt;br/&gt;
  Nicola&lt;/p&gt;</comment>
                            <comment id="41120" author="nbianchi" created="Tue, 26 Jun 2012 02:27:28 +0000"  >&lt;p&gt;Cliff,&lt;br/&gt;
regarding your suggestion for this parameter:&lt;/p&gt;

&lt;p&gt;  options ko2iblnd peer_credits=16 concurrent_sends=16&lt;/p&gt;

&lt;p&gt;Currently we haven&apos;t specified anything about that in our modprobe.conf; we use the configuration posted by Fabio Verzelloni on this date: 12/Jun/12 3:44 AM.&lt;br/&gt;
The parameters you posted on &quot;22/Jun/12 10:09 AM&quot;, from one of Fabio&apos;s emails, are suggestions found in a 3rd-party manual... and we were wondering if some of these options could improve our situation.&lt;/p&gt;

&lt;p&gt;Regards &lt;br/&gt;
  Nicola&lt;/p&gt;</comment>
                            <comment id="41121" author="cliffw" created="Tue, 26 Jun 2012 02:53:17 +0000"  >&lt;p&gt;We really would like you to try options ko2iblnd peer_credits=16 concurrent_sends=16&lt;br/&gt;
We think that may improve the situation. &lt;br/&gt;
The other options you have listed from the 3rd party manual will not help, and setting the large values for peer_credits and concurrent_sends in that list will likely make things worse. &lt;/p&gt;

&lt;p&gt;Please try our suggestion and report your results.  &lt;/p&gt;</comment>
                            <comment id="41131" author="nbianchi" created="Tue, 26 Jun 2012 07:22:37 +0000"  >&lt;p&gt;Cliff,&lt;/p&gt;

&lt;p&gt;is it OK with you if we put the parameter you suggested into the configuration on the next maintenance day we have planned (4th July)?&lt;/p&gt;

&lt;p&gt;Regards&lt;br/&gt;
 Nicola&lt;/p&gt;</comment>
                            <comment id="41132" author="nbianchi" created="Tue, 26 Jun 2012 08:05:13 +0000"  >&lt;p&gt;Cliff,&lt;br/&gt;
right now we are experiencing some performance issues on the filesystem.&lt;br/&gt;
Users are complaining about a drop in performance, and in fact I can see that a simple &quot;ls /scratch/weisshorn&quot; can take minutes to answer.&lt;/p&gt;

&lt;p&gt;In the logs there are a bunch of this messages:&lt;/p&gt;

&lt;p&gt;---------------------------------------------------------------------------------------------------&lt;br/&gt;
Jun 26 13:50:59 weisshorn02 kernel: LustreError: 9825:0:(lov_obd.c:1068:lov_clear_orphans()) error in orphan recovery on OST idx 65/72: rc = -5&lt;br/&gt;
Jun 26 13:50:59 weisshorn02 kernel: LustreError: 9825:0:(mds_lov.c:884:__mds_lov_synchronize()) scratch-OST0041_UUID failed at mds_lov_clear_orphans: -5&lt;br/&gt;
Jun 26 13:50:59 weisshorn02 kernel: LustreError: 9825:0:(mds_lov.c:905:__mds_lov_synchronize()) scratch-OST0041_UUID sync failed -5, deactivating&lt;br/&gt;
Jun 26 13:51:01 weisshorn02 /usr/sbin/cerebrod&lt;span class=&quot;error&quot;&gt;&amp;#91;2756&amp;#93;&lt;/span&gt;: lmt_mysql: failed to connect to database&lt;br/&gt;
Jun 26 13:51:04 weisshorn11 kernel: Lustre: 5917:0:(client.c:1762:ptlrpc_expire_one_request()) @@@ Request  sent has timed out for slow reply: &lt;span class=&quot;error&quot;&gt;&amp;#91;sent 1340711408/real 1340711408&amp;#93;&lt;/span&gt;  req@ffff880807301400 x1405364552083621/t0(0) o250-&amp;gt;MGC148.187.7.101@o2ib2@148.187.7.101@o2ib2:26/25 lens 368/512 e 0 to 1 dl 1340711464 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1&lt;br/&gt;
Jun 26 13:51:04 weisshorn11 kernel: Lustre: 5917:0:(client.c:1762:ptlrpc_expire_one_request()) Skipped 7 previous similar messages&lt;br/&gt;
Jun 26 13:51:07 weisshorn02 kernel: LustreError: 9835:0:(lov_obd.c:1068:lov_clear_orphans()) error in orphan recovery on OST idx 25/72: rc = -5&lt;br/&gt;
Jun 26 13:51:07 weisshorn02 kernel: LustreError: 9835:0:(mds_lov.c:884:__mds_lov_synchronize()) scratch-OST0019_UUID failed at mds_lov_clear_orphans: -5&lt;br/&gt;
Jun 26 13:51:07 weisshorn02 kernel: LustreError: 9835:0:(mds_lov.c:905:__mds_lov_synchronize()) scratch-OST0019_UUID sync failed -5, deactivating&lt;br/&gt;
Jun 26 13:51:13 weisshorn04 kernel: Lustre: 5655:0:(client.c:1762:ptlrpc_expire_one_request()) @@@ Request  sent has timed out for slow reply: &lt;span class=&quot;error&quot;&gt;&amp;#91;sent 1340711417/real 1340711417&amp;#93;&lt;/span&gt;  req@ffff880feec76000 x1405364532655945/t0(0) o250-&amp;gt;MGC148.187.7.101@o2ib2@148.187.7.101@o2ib2:26/25 lens 368/512 e 0 to 1 dl 1340711473 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1&lt;br/&gt;
Jun 26 13:51:13 weisshorn04 kernel: Lustre: 5655:0:(client.c:1762:ptlrpc_expire_one_request()) Skipped 8 previous similar messages&lt;br/&gt;
Jun 26 13:51:16 weisshorn02 /usr/sbin/cerebrod&lt;span class=&quot;error&quot;&gt;&amp;#91;2756&amp;#93;&lt;/span&gt;: lmt_mysql: failed to connect to database&lt;br/&gt;
---------------------------------------------------------------------------------------------------&lt;/p&gt;

&lt;p&gt;We don&apos;t see any issue with the network, neither IB nor GNI.&lt;/p&gt;

&lt;p&gt;Machine load:&lt;br/&gt;
---------------------------------------------------------------------------------------------------&lt;br/&gt;
weisshorn05:  14:01:32 up 5 days,  5:41,  0 users,  load average: 3.12, 3.61, 3.85&lt;br/&gt;
weisshorn14:  14:01:32 up 46 min,  0 users,  load average: 0.02, 0.01, 0.00    &amp;lt;----- out of production&lt;br/&gt;
weisshorn03:  14:01:32 up 5 days,  5:41,  0 users,  load average: 5.26, 4.41, 3.86&lt;br/&gt;
weisshorn07:  14:01:32 up 5 days,  5:41,  0 users,  load average: 3.21, 3.41, 3.95&lt;br/&gt;
weisshorn04:  14:01:32 up 5 days,  5:41,  0 users,  load average: 4.47, 5.05, 4.90&lt;br/&gt;
weisshorn06:  14:01:32 up 5 days,  5:41,  0 users,  load average: 4.32, 4.04, 4.08&lt;br/&gt;
weisshorn08:  14:01:32 up 5 days,  5:40,  0 users,  load average: 5.11, 4.73, 4.91&lt;br/&gt;
weisshorn01:  14:01:32 up 1 day,  1:53,  0 users,  load average: 0.10, 0.03, 0.01 &amp;lt;---- out of production&lt;br/&gt;
weisshorn09:  14:01:32 up 5 days,  5:41,  0 users,  load average: 3.83, 4.29, 4.68&lt;br/&gt;
weisshorn11:  14:01:32 up 5 days,  5:40,  0 users,  load average: 3.29, 3.69, 3.73&lt;br/&gt;
weisshorn12:  14:01:32 up 5 days,  5:40,  0 users,  load average: 3.37, 3.26, 3.06&lt;br/&gt;
weisshorn10:  14:01:32 up 5 days,  5:40,  0 users,  load average: 4.26, 3.72, 3.36&lt;br/&gt;
weisshorn13:  14:01:32 up 5 days,  5:40,  0 users,  load average: 9.32, 8.07, 7.24&lt;br/&gt;
weisshorn02:  14:01:32 up 5 days,  5:38,  5 users,  load average: 0.26, 1.88, 2.58  &amp;lt;--- MDS&lt;br/&gt;
---------------------------------------------------------------------------------------------------&lt;/p&gt;

&lt;p&gt;ltop screenshot:&lt;br/&gt;
---------------------------------------------------------------------------------------------------&lt;br/&gt;
Filesystem: scratch&lt;br/&gt;
    Inodes:    185.812m total,     16.950m used (  9%),    168.863m free&lt;br/&gt;
     Space:    503.720t total,    296.873t used ( 59%),    206.847t free&lt;br/&gt;
   Bytes/s:	 0.323g read,       0.285g write,              2711 IOPS&lt;br/&gt;
   MDops/s:    245 open,      262 close,      14 getattr,	6 setattr&lt;br/&gt;
                 0 link,        1 unlink,      0 mkdir,         0 rmdir&lt;br/&gt;
                 0 statfs,	3 rename,      0 getxattr&lt;br/&gt;
&amp;gt;OST S        OSS   Exp   CR rMB/s wMB/s  IOPS   LOCKS  LGR  LCR %cpu %mem %spc&lt;br/&gt;
(12) F eisshorn13  1551    0    56    41   525 1156010    9  363    1  100   61&lt;br/&gt;
 (6) F eisshorn03  1551    0    28    28   127  560595    7    2    0  100   59&lt;br/&gt;
 (6) F eisshorn04  1551    0    37    36   110  614243    0    0    0  100   56&lt;br/&gt;
 (6) F eisshorn05  1551    0    37    25   182  621734    0    0    0  100   61&lt;br/&gt;
 (6) F eisshorn06  1551    0    22    21   374  597840    0    0    1  100   57&lt;br/&gt;
 (6) F eisshorn07  1551    0    22    16   234  653836    2    0    0  100   60&lt;br/&gt;
 (6) F eisshorn08  1551    0    24    31   103  713393    3  117    0  100   53&lt;br/&gt;
 (6) F eisshorn09  1551    0    26    25   344  580579    2   98    1  100   55&lt;br/&gt;
 (6) F eisshorn10  1551    0    28    27   445  595866    1  164    1  100   59&lt;br/&gt;
 (6) F eisshorn11  1551    0    24    19   169  565911    4    4    0  100   62&lt;br/&gt;
 (6) F eisshorn12  1551    0    27    24    98  696995    2  152    0  100   62&lt;br/&gt;
---------------------------------------------------------------------------------------------------&lt;/p&gt;

&lt;p&gt;cat /proc/sys/lnet/peers&lt;br/&gt;
---------------------------------------------------------------------------------------------------&lt;br/&gt;
148.187.7.71@o2ib2          3    up  9999     8     8     8     8  -611 0&lt;br/&gt;
148.187.7.72@o2ib2          3    up  9999     8     8     8     8  -592 0&lt;br/&gt;
148.187.7.73@o2ib2          3    up  9999     8     8     8     8  -633 0&lt;br/&gt;
148.187.7.74@o2ib2          3    up  9999     8     8     8     8  -608 0&lt;br/&gt;
148.187.7.75@o2ib2          3    up  9999     8     8     8     8  -617 0&lt;br/&gt;
148.187.7.76@o2ib2          3    up  9999     8     8     8     8  -642 0&lt;br/&gt;
148.187.7.77@o2ib2          3    up  9999     8     8     8     8  -618 0&lt;br/&gt;
148.187.7.78@o2ib2          3    up  9999     8     8     8     8  -626 0&lt;br/&gt;
148.187.7.79@o2ib2          3    up  9999     8     8     8     8  -636 0&lt;br/&gt;
148.187.7.80@o2ib2          3    up  9999     8     8     8     8  -630 0&lt;br/&gt;
148.187.7.81@o2ib2          3    up  9999     8     8     8     8  -636 0&lt;br/&gt;
148.187.7.82@o2ib2          3    up  9999     8     8     8     8  -635 0&lt;br/&gt;
---------------------------------------------------------------------------------------------------&lt;/p&gt;

&lt;p&gt;Regards&lt;br/&gt;
  Nicola&lt;/p&gt;</comment>
                            <comment id="41137" author="cliffw" created="Tue, 26 Jun 2012 10:20:55 +0000"  >&lt;p&gt;You are reporting issues every day - i would NOT wait for the maint window to change the configuration. You could do a rolling failover and set this up without halting the filesystem, simply failover one node at a time.&lt;/p&gt;</comment>
                            <comment id="41139" author="cliffw" created="Tue, 26 Jun 2012 10:26:03 +0000"  >&lt;p&gt;The current issue may be related to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1247&quot; title=&quot;After recovery 7 of 16 OST were deactivated by MDS.&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1247&quot;&gt;&lt;del&gt;LU-1247&lt;/del&gt;&lt;/a&gt;, am consulting engineering. I think we need to point you to a newer release, and should plan on a system upgrade on the 4th. &lt;/p&gt;</comment>
                            <comment id="41140" author="nbianchi" created="Tue, 26 Jun 2012 10:26:59 +0000"  >&lt;p&gt;Cliff,&lt;br/&gt;
do I have to put this line only on the servers? (OSS and MDS)&lt;/p&gt;

&lt;p&gt;  options ko2iblnd peer_credits=16 concurrent_sends=16&lt;/p&gt;

&lt;p&gt;Regards&lt;br/&gt;
  Nicola&lt;/p&gt;</comment>
                            <comment id="41144" author="cliffw" created="Tue, 26 Jun 2012 11:15:59 +0000"  >&lt;p&gt;I am sorry, I was in error. The &quot;options ko2iblnd peer_credits=16 concurrent_sends=16&quot; needs to be set on all nodes, client and server. You would have to stop the clients for a moment to do this, the rolling failover idea could be used for servers, but likely better to do all at once.&lt;/p&gt;</comment>
                            <comment id="41145" author="nbianchi" created="Tue, 26 Jun 2012 11:19:32 +0000"  >&lt;p&gt;Cliff,&lt;br/&gt;
is it a real problem if I begin to put that on the servers only?&lt;/p&gt;

&lt;p&gt; Nicola&lt;/p&gt;</comment>
                            <comment id="41217" author="green" created="Wed, 27 Jun 2012 18:23:02 +0000"  >&lt;p&gt;the second crash is &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1259&quot; title=&quot;Test failure on test suite conf-sanity, subtest test_41a&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1259&quot;&gt;&lt;del&gt;LU-1259&lt;/del&gt;&lt;/a&gt;, patch at &lt;a href=&quot;http://review.whamcloud.com/2390&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/2390&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="41258" author="cliffw" created="Thu, 28 Jun 2012 09:52:59 +0000"  >&lt;p&gt;This via email:&lt;br/&gt;
Cliff,&lt;br/&gt;
thanks a lot for the informations.&lt;/p&gt;

&lt;p&gt;Hence the next Wednesday Fabio, who read us in copy, will proceed with the downgrade to the 2.1.2 version. Since this afternoon I will be away for 7 weeks so he will take over this task ...&lt;/p&gt;

&lt;p&gt;For sure Fabio will come back to you about the details and the schedule for this intervention.&lt;/p&gt;

&lt;p&gt;------&lt;/p&gt;</comment>
                            <comment id="41313" author="cliffw" created="Fri, 29 Jun 2012 10:13:44 +0000"  >&lt;p&gt;Okay, if you need help on the 4th, please email &lt;br/&gt;
dutymanager@whamcloud.com in the event of problems during the upgrade.&lt;br/&gt;
It is a US holiday, so some of us will not be available.&lt;/p&gt;</comment>
                            <comment id="41314" author="cliffw" created="Fri, 29 Jun 2012 10:14:46 +0000"  >&lt;p&gt;And of course, the bits you need are available here: &lt;a href=&quot;http://downloads.whamcloud.com/public/lustre/latest-maintenance-release/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://downloads.whamcloud.com/public/lustre/latest-maintenance-release/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="41369" author="fverzell" created="Mon, 2 Jul 2012 10:12:26 +0000"  >&lt;p&gt;I just tried on a test machine to recreate the situation will happen on Wednesday, basically downgrade from 2.2 --&amp;gt; 2.1.2 and I&apos;ve got the following error message:&lt;/p&gt;

&lt;p&gt;console log&lt;/p&gt;

&lt;p&gt;Jul 02 16:08 &lt;span class=&quot;error&quot;&gt;&amp;#91;root@wn47:~&amp;#93;&lt;/span&gt;# mount -t lustre /dev/vg_root/mds /mnt/lustre&lt;br/&gt;
mount.lustre: mount /dev/mapper/vg_root-mds at /mnt/lustre failed: Invalid argument&lt;br/&gt;
This may have multiple causes.&lt;br/&gt;
Are the mount options correct?&lt;br/&gt;
Check the syslog for more info.&lt;/p&gt;

&lt;p&gt;messages&lt;/p&gt;

&lt;p&gt;Jul  2 16:10:17 wn47 kernel: LDISKFS-fs warning (device dm-2): ldiskfs_fill_super: extents feature not enabled on this filesystem, use tune2fs.&lt;br/&gt;
Jul  2 16:10:17 wn47 kernel: LDISKFS-fs (dm-2): mounted filesystem with ordered data mode. Opts: &lt;br/&gt;
Jul  2 16:10:17 wn47 kernel: LDISKFS-fs warning (device dm-2): ldiskfs_fill_super: extents feature not enabled on this filesystem, use tune2fs.&lt;br/&gt;
Jul  2 16:10:17 wn47 kernel: LDISKFS-fs (dm-2): mounted filesystem with ordered data mode. Opts: &lt;br/&gt;
Jul  2 16:10:17 wn47 kernel: Lustre: MGS MGS started&lt;br/&gt;
Jul  2 16:10:17 wn47 kernel: Lustre: 8536:0:(ldlm_lib.c:933:target_handle_connect()) MGS: connection from bd7d7986-674e-84d2-d489-d6f9028b0d52@0@lo t0 exp (null) cur 1341238217 last 0&lt;br/&gt;
Jul  2 16:10:17 wn47 kernel: Lustre: MGC10.10.65.47@tcp: Reactivating import&lt;br/&gt;
Jul  2 16:10:17 wn47 kernel: Lustre: Enabling ACL&lt;br/&gt;
Jul  2 16:10:17 wn47 kernel: Lustre: lustre-MDT0000: used disk, loading&lt;br/&gt;
Jul  2 16:10:17 wn47 kernel: LustreError: 8540:0:(mdt_recovery.c:409:mdt_server_data_init()) lustre-MDT0000: unsupported incompat filesystem feature(s) 200&lt;br/&gt;
Jul  2 16:10:17 wn47 kernel: LustreError: 8540:0:(obd_config.c:522:class_setup()) setup lustre-MDT0000 failed (-22)&lt;br/&gt;
Jul  2 16:10:17 wn47 kernel: LustreError: 8540:0:(obd_config.c:1363:class_config_llog_handler()) Err -22 on cfg command:&lt;br/&gt;
Jul  2 16:10:17 wn47 kernel: Lustre:    cmd=cf003 0:lustre-MDT0000  1:lustre-MDT0000_UUID  2:0  3:lustre-MDT0000-mdtlov  4:f  &lt;br/&gt;
Jul  2 16:10:17 wn47 kernel: LustreError: 15b-f: MGC10.10.65.47@tcp: The configuration from log &apos;lustre-MDT0000&apos;failed from the MGS (-22).  Make sure this client and the MGS are running compatible versions of Lustre.&lt;br/&gt;
Jul  2 16:10:17 wn47 kernel: LustreError: 15c-8: MGC10.10.65.47@tcp: The configuration from log &apos;lustre-MDT0000&apos; failed (-22). This may be the result of communication errors between this node and the MGS, a bad configuration, or other errors. See the syslog for more information.&lt;br/&gt;
Jul  2 16:10:17 wn47 kernel: LustreError: 8458:0:(obd_mount.c:1192:server_start_targets()) failed to start server lustre-MDT0000: -22&lt;br/&gt;
Jul  2 16:10:17 wn47 kernel: LustreError: 8458:0:(obd_mount.c:1738:server_fill_super()) Unable to start targets: -22&lt;br/&gt;
Jul  2 16:10:17 wn47 kernel: LustreError: 8458:0:(obd_config.c:567:class_cleanup()) Device 3 not setup&lt;br/&gt;
Jul  2 16:10:17 wn47 kernel: LustreError: 8458:0:(ldlm_request.c:1172:ldlm_cli_cancel_req()) Got rc -108 from cancel RPC: canceling anyway&lt;br/&gt;
Jul  2 16:10:17 wn47 kernel: Lustre: MGS has stopped.&lt;br/&gt;
Jul  2 16:10:17 wn47 kernel: LustreError: 8458:0:(ldlm_request.c:1799:ldlm_cli_cancel_list()) ldlm_cli_cancel_list: -108&lt;br/&gt;
Jul  2 16:10:23 wn47 kernel: Lustre: 8458:0:(client.c:1780:ptlrpc_expire_one_request()) @@@ Request  sent has timed out for slow reply: &lt;span class=&quot;error&quot;&gt;&amp;#91;sent 1341238217/real 1341238217&amp;#93;&lt;/span&gt;  req@ffff88041bdcac00 x1406387715309722/t0(0) o251-&amp;gt;MGC10.10.65.47@tcp@0@lo:26/25 lens 192/192 e 0 to 1 dl 1341238223 ref 2 fl Rpc:XN/0/ffffffff rc 0/-1&lt;br/&gt;
Jul  2 16:10:23 wn47 kernel: Lustre: server umount lustre-MDT0000 complete&lt;br/&gt;
Jul  2 16:10:23 wn47 kernel: LustreError: 8458:0:(obd_mount.c:2198:lustre_fill_super()) Unable to mount  (-22)&lt;br/&gt;
Jul  2 16:10:23 wn47 kernel: LustreError: 8458:0:(obd_mount.c:2198:lustre_fill_super()) Skipped 1 previous similar message&lt;/p&gt;

&lt;p&gt;And attached you can find the lustre dk&lt;/p&gt;

&lt;p&gt;Fabio&lt;/p&gt;
</comment>
                            <comment id="41372" author="cliffw" created="Mon, 2 Jul 2012 10:47:18 +0000"  >&lt;p&gt;Please tell us exactly what you did, list all steps if possible.&lt;/p&gt;
&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;Did you download the 2.1.2 release from our site?&lt;/li&gt;
	&lt;li&gt;Did you install the latest e2fsprogs, also from our site?&lt;/li&gt;
&lt;/ul&gt;
</comment>
                            <comment id="41394" author="johann" created="Tue, 3 Jul 2012 04:21:22 +0000"  >&lt;p&gt;Hi Fabio,&lt;/p&gt;

&lt;p&gt;As far as upgrade/downgrade is concerned, we only support the downgrade of a filesystem that was formatted with an &quot;old&quot; version of lustre. I mean that, you can format a filesystem with 2.x, upgrade it to 2.x+1 and then downgrade to 2.x. However, if you format with 2.x+1, we might enable new features (e.g. flex_bg is the first example i can think of) which won&apos;t be supported by prior releases. This has been a general rule with lustre releases for a while.&lt;/p&gt;

&lt;p&gt;In the present case, you can&apos;t mount the filesystem because of this message:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Jul 2 16:10:17 wn47 kernel: LustreError: 8540:0:(mdt_recovery.c:409:mdt_server_data_init()) lustre-MDT0000: unsupported incompat filesystem feature(s) 200
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The incompat feature is actually multiple object index support which was added in 2.2 (see &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-822&quot; title=&quot;allow multiple Object Index files to be created&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-822&quot;&gt;&lt;del&gt;LU-822&lt;/del&gt;&lt;/a&gt;).&lt;br/&gt;
Because of this new feature which is enabled by default for any new filesystem formatted with 2.2, you can&apos;t downgrade to 2.1.&lt;/p&gt;

&lt;p&gt;HTH&lt;/p&gt;</comment>
                            <comment id="41475" author="colinmcmurtrie" created="Thu, 5 Jul 2012 04:49:15 +0000"  >&lt;p&gt;Johann,&lt;/p&gt;

&lt;p&gt;Unfortunately this latest piece of information (namely the fact that the filesystem needs to be reformatted because of incompatibilities between it and a v2.1.2) came too late for us to adequately inform our user community.  The filesystem contained nearly 400TB of user data and we must allow at least 1 weeks advance notice so that users can move their crucial data to a more permanent filesystem.  Consequently we will not be able to perform this downgrade until mid-July at the earliest.  In fact we would prefer to wait until our next scheduled maintenance on Wednesday 8 August.  This is a regrettable situation.&lt;/p&gt;

&lt;p&gt;On the plus side we have unmounted the second Lustre filesystem from the 1500 node XE6 (which was an additional complication) and this seems to have made the Lustre logs less &quot;chatty&quot; (i.e. not logging as many problems).  The XE6 is however not completely full of jobs yet so we will keep a close eye on the situation over the coming days.  Furthermore one of the OSSes needed a motherboard swap last week so it may have had a hardware fault that was also contributing to destabilise the filesystem.&lt;/p&gt;

&lt;p&gt;Finally FYI we have ordered 4 additional OSSes of identical spec to the ones we have in production.  When they arrive (likely later in July) we will build a preproduction test environment (we have some unused DS4800 storage to use at the backend).&lt;/p&gt;

&lt;p&gt;Regards,&lt;/p&gt;

&lt;p&gt;Colin &lt;/p&gt;</comment>
                            <comment id="41499" author="cliffw" created="Thu, 5 Jul 2012 14:26:42 +0000"  >&lt;p&gt;Thanks, please let us know what we can do to assist.&lt;/p&gt;</comment>
                            <comment id="41555" author="fverzell" created="Fri, 6 Jul 2012 05:14:34 +0000"  >&lt;p&gt;I have two questions regarding the issue of the reconnecting compute nodes between Lustre &amp;amp; Cray (gni interconnect), as my latest attached file we had this issue yesterday, the question is Lustre version 2.1.2 is running the same kind of code for connecting to gni that Lustre 2.2 is using? &lt;/p&gt;

&lt;p&gt;If the code is the same regarding the gni part will be really helpful to downgrade or we would experiencing the same issue?&lt;/p&gt;

&lt;p&gt;The second question is, is it possible that a tuning of the router/client nodes side, maybe increasing the timeout in the modprobe would help? &lt;/p&gt;

&lt;p&gt;Right now we have on the router node a timeout=250 but there is not any specification on the client side.&lt;/p&gt;

&lt;p&gt;Regards&lt;br/&gt;
Fabio&lt;/p&gt;


</comment>
                            <comment id="41611" author="cliffw" created="Mon, 9 Jul 2012 13:07:47 +0000"  >&lt;p&gt;The gnilnd code is supplied by Cray - have you consulted them on this issue? We are at the present time unable to access the gnilnd source, so it is a bit difficult for us to say anything on this issue. We believe you would be better off in general running our current maintenance release (2.1.2) as you are currently hitting two known bugs that are not present in this release. Given your current state of Lustre experience 2.1.2 will be in general more stable. If you wish to continue with the feature release (2.2), you will need to patch your version of Lustre with the fixes for the bugs you have hit.&lt;/p&gt;</comment>
                            <comment id="41639" author="spitzcor" created="Mon, 9 Jul 2012 18:50:04 +0000"  >&lt;p&gt;Cray has been planning on pushing gnilnd upstream.  We&apos;ve opened &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1419&quot; title=&quot;Tracking ticket for gnilnd push&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1419&quot;&gt;&lt;del&gt;LU-1419&lt;/del&gt;&lt;/a&gt; to track that effort.&lt;/p&gt;

&lt;p&gt;For this ticket though, I think that it would help to be clear about what components and Lustre version you are discussing.  It&apos;s my guess that this customer is still running 1.8.x clients on their Cray mainframe, and not version 2.2 or 2.1.2.  I believe that in the course of this ticket, when 2.x is referenced, it refers to the server version only.&lt;/p&gt;

&lt;p&gt;BTW, the gnilnd has a feature to &quot;quiesce&quot; when the Cray high speed network is under a re-route for HW warmswap or link failure.  If the Cray has gone quiescent, then you won&apos;t hear from its clients until the quiesce event is complete.  During that time, messages to the Cray clients will probably even back up on the LNET routers such that all router buffers are consumed.&lt;/p&gt;</comment>
                            <comment id="41739" author="fverzell" created="Thu, 12 Jul 2012 09:14:33 +0000"  >&lt;p&gt;I&apos;ve just uploaded 2 new log files, related to a problem occurred  yesterday, a lot of clients from our XE6 system lost connectivity with Lustre. &lt;/p&gt;

&lt;p&gt;I hope the logs help, log_11_jul is the one from the lustre servers (weisshorn07) the drop_conn is the log from Cray XE6, from 19:00 related to the error with weisshorn07.&lt;/p&gt;</comment>
                            <comment id="41878" author="fverzell" created="Mon, 16 Jul 2012 09:16:47 +0000"  >&lt;p&gt;Tomorrow we are going to apply some tuning on Server, router &amp;amp; client side, the following are the changes we are going to apply:&lt;/p&gt;


&lt;ol&gt;
	&lt;li&gt;
	&lt;ol&gt;
		&lt;li&gt;Router ##&lt;/li&gt;
	&lt;/ol&gt;
	&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;options ko2iblnd timeout=100 peer_timeout=130&lt;br/&gt;
options ko2iblnd credits=2048 ntx=2048&lt;br/&gt;
options ko2iblnd peer_credits=126 concurrent_sends=63&lt;br/&gt;
peer_buffer_credits=128&lt;br/&gt;
options kgnilnd credits=2048 peer_health=1&lt;/p&gt;

&lt;p&gt;options lnet check_routers_before_use=1&lt;br/&gt;
options lnet dead_router_check_interval=60&lt;br/&gt;
options lnet live_router_check_interval=60&lt;br/&gt;
options lnet router_ping_timeout=50&lt;br/&gt;
options lnet large_router_buffers=1024 small_router_buffers=16384&lt;/p&gt;

&lt;ol&gt;
	&lt;li&gt;
	&lt;ol&gt;
		&lt;li&gt;Server ##&lt;/li&gt;
	&lt;/ol&gt;
	&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;options ko2iblnd timeout=100 peer_timeout=0 keepalive=30&lt;br/&gt;
options ko2iblnd credits=2048 ntx=2048&lt;br/&gt;
options ko2iblnd peer_credits=126 concurrent_sends=63&lt;/p&gt;

&lt;p&gt;options lnet avoid_asym_router_failure=1&lt;br/&gt;
options lnet dead_router_check_interval=60&lt;br/&gt;
options lnet live_router_check_interval=60&lt;br/&gt;
options lnet check_routers_before_use=1&lt;/p&gt;

&lt;ol&gt;
	&lt;li&gt;
	&lt;ol&gt;
		&lt;li&gt;Client ##&lt;/li&gt;
	&lt;/ol&gt;
	&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;LOGIN - COMPUTE&lt;/p&gt;

&lt;p&gt;No need ko2ilbnd tuning on ROSA Login nodes because there are not IB cards.&lt;br/&gt;
#options ko2iblnd timeout=100 peer_timeout=0 keepalive=30&lt;br/&gt;
#options ko2iblnd credits=2048 ntx=2048&lt;br/&gt;
#options ko2iblnd peer_credits=126 concurrent_sends=63&lt;/p&gt;

&lt;p&gt;options lnet avoid_asym_router_failure=1&lt;br/&gt;
options lnet dead_router_check_interval=60&lt;br/&gt;
options lnet live_router_check_interval=60&lt;br/&gt;
options lnet check_routers_before_use=1&lt;/p&gt;


&lt;ol&gt;
	&lt;li&gt;
	&lt;ol&gt;
		&lt;li&gt;tuning ##&lt;/li&gt;
	&lt;/ol&gt;
	&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;MDS&lt;/p&gt;

&lt;p&gt;echo &quot;100&quot; &amp;gt;  /proc/fs/lustre/lov/scratch-MDT0000-mdtlov/qos_threshold_rr &lt;br/&gt;
echo &quot;256&quot; &amp;gt;  /proc/fs/lustre/mdt/scratch-MDT0000/mdt/threads_max&lt;br/&gt;
lctl set_param lov.*.stripecount=4&lt;br/&gt;
echo &quot;240&quot; &amp;gt; /proc/fs/lustre/mdt/scratch-MDT0000/identity_acquire_expire&lt;/p&gt;

&lt;p&gt;OSS&lt;/p&gt;

&lt;p&gt;pdsh  -a -x weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;01-02&amp;#93;&lt;/span&gt; &quot;lctl set_param timeout=300&quot;&lt;br/&gt;
pdsh  -a -x weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;01-02&amp;#93;&lt;/span&gt; &quot;lctl set_param ldlm_timeout=230&quot;&lt;br/&gt;
pdsh  -a -x weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;01-02&amp;#93;&lt;/span&gt; &quot;lctl set_param at_min=230&quot;&lt;/p&gt;


&lt;p&gt;Can you please confirm me that the qos_threshold is enough to tune only on the MDS and no need of changes on the client side?&lt;br/&gt;
I noticed that the value on the clients is slightly different than the one on the server:&lt;/p&gt;

&lt;p&gt;fverzell@rosa5:~&amp;gt; cat /proc/fs/lustre/lov/scratch-clilov-ffff8803fc233000/qos_threshold_rr &lt;br/&gt;
16%&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@weisshorn01 ~&amp;#93;&lt;/span&gt;# cat /proc/fs/lustre/lov/scratch-MDT0000-mdtlov/qos_threshold_rr&lt;br/&gt;
17%&lt;/p&gt;


&lt;p&gt;Based on the fact that the FS seems to be unbalanced, it is necessary lfs_migrate to &apos;fix&apos; the situation?&lt;/p&gt;

&lt;p&gt;fverzell@rosa5:~&amp;gt; lfs df &lt;br/&gt;
UUID                   1K-blocks        Used   Available Use% Mounted on&lt;br/&gt;
scratch-MDT0000_UUID   292202572   118544144   154174788  43% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;MDT:0&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0000_UUID  7512024488  4689026804  2447151520  66% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:0&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0001_UUID  7512024488  4745725856  2390486948  67% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:1&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0002_UUID  7512024488  5075884136  2060329692  71% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:2&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0003_UUID  7512024488  4897558604  2238653176  69% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:3&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0004_UUID  7512024488  4454906816  2681307012  62% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:4&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0005_UUID  7512024488  4587438680  2548773100  64% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:5&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0006_UUID  7512024488  3992329488  3143885212  56% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:6&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0007_UUID  7512024488  4661958960  2474223000  65% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:7&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0008_UUID  7512024488  4167201580  2969012160  58% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:8&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0009_UUID  7512024488  4819927884  2316257380  68% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:9&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST000a_UUID  7512024488  4754286656  2381901152  67% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:10&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST000b_UUID  7512024488  4640072608  2496099572  65% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:11&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST000c_UUID  7512024488  4742974012  2393214728  66% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:12&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST000d_UUID  7512024488  2549614552  4586599524  36% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:13&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST000e_UUID  7512024488  4827107008  2309104772  68% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:14&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST000f_UUID  7512024488  4624081852  2512129928  65% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:15&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0010_UUID  7512024488  4579714656  2556475388  64% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:16&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0011_UUID  7512024488  4712685460  2423526320  66% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:17&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0012_UUID  7512024488  4467238040  2668940296  63% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:18&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0013_UUID  7512024488  3964966088  3171248460  56% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:19&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0014_UUID  7512024488  4467610568  2668572328  63% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:20&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0015_UUID  7512024488  4087648640  3048563952  57% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:21&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0016_UUID  7512024488  2768510532  4367704256  39% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:22&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0017_UUID  7512024488  4762759028  2373420232  67% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:23&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0018_UUID  7512024488  4818779532  2317404560  68% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:24&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0019_UUID  7512024488  2286805384  4849409388  32% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:25&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST001a_UUID  7512024488  4654062568  2482120052  65% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:26&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST001b_UUID  7512024488  4716531932  2419679848  66% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:27&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST001c_UUID  7512024488  4829306016  2306855832  68% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:28&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST001d_UUID  7512024488  4724954000  2411258804  66% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:29&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST001e_UUID  7512024488  4734034796  2402147544  66% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:30&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST001f_UUID  7512024488  4764160748  2372052056  67% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:31&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0020_UUID  7512024488  4749913620  2386283024  67% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:32&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0021_UUID  7512024488  4051468944  3084745524  57% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:33&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0022_UUID  7512024488  2519479924  4616734864  35% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:34&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0023_UUID  7512024488  5047050388  2089132448  71% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:35&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0024_UUID  7512024488  4777844024  2358343904  67% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:36&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0025_UUID  7512024488  2515885904  4620324508  35% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:37&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0026_UUID  7512024488  4612974860  2523237944  65% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:38&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0027_UUID  7512024488  4766933908  2369277872  67% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:39&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0028_UUID  7512024488  4732878372  2403333408  66% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:40&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0029_UUID  7512024488  4805835488  2330355492  67% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:41&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST002a_UUID  7512024488  4623393332  2512795728  65% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:42&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST002b_UUID  7512024488  4817231168  2318980612  68% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:43&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST002c_UUID  7512024488  2429377284  4706837500  34% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:44&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST002d_UUID  7512024488  4825606516  2310560384  68% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:45&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST002e_UUID  7512024488  2666838228  4469376560  37% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:46&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST002f_UUID  7512024488  3925127408  3211050572  55% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:47&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0030_UUID  7512024488  4727638656  2408540452  66% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:48&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0031_UUID  7512024488  2693940128  4442271700  38% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:49&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0032_UUID  7512024488  4764748760  2371463020  67% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:50&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0033_UUID  7512024488  4864564316  2271648488  68% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:51&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0034_UUID  7512024488  4625740760  2510443620  65% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:52&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0035_UUID  7512024488  4693269256  2442910852  66% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:53&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0036_UUID  7512024488  4544146008  2592030580  64% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:54&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0037_UUID  7512024488  4860051744  2276133072  68% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:55&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0038_UUID  7512024488  2854611264  4281603524  40% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:56&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0039_UUID  7512024488  4750769272  2385415720  67% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:57&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST003a_UUID  7512024488  2387156708  4749058080  33% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:58&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST003b_UUID  7512024488  4971879748  2164295048  70% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:59&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST003c_UUID  7512024488  4768532532  2367679248  67% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:60&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST003d_UUID  7512024488  2486278472  4649936292  35% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:61&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST003e_UUID  7512024488  4799709104  2336473268  67% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:62&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST003f_UUID  7512024488  4804003128  2332208652  67% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:63&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0040_UUID  7512024488  4737174800  2399036980  66% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:64&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0041_UUID  7512024488  4570719888  2565491892  64% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:65&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0042_UUID  7512024488  4680780720  2455398272  66% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:66&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0043_UUID  7512024488  4876523204  2259658024  68% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:67&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0044_UUID  7512024488  4016775868  3119437956  56% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:68&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0045_UUID  7512024488  4909041500  2227172328  69% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:69&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0046_UUID  7512024488  4894087388  2242097828  69% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:70&amp;#93;&lt;/span&gt;&lt;br/&gt;
scratch-OST0047_UUID  7512024488  4750555368  2385615184  67% /scratch/weisshorn&lt;span class=&quot;error&quot;&gt;&amp;#91;OST:71&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;Thanks&lt;br/&gt;
Fabio&lt;/p&gt;</comment>
                            <comment id="41879" author="fverzell" created="Mon, 16 Jul 2012 09:30:55 +0000"  >&lt;p&gt;this is the second option of tuning to apply:&lt;/p&gt;

&lt;ol&gt;
	&lt;li&gt;
	&lt;ol&gt;
		&lt;li&gt;Router nodes ##&lt;/li&gt;
	&lt;/ol&gt;
	&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;options mlx4_core msi_x=1&lt;/p&gt;

&lt;p&gt;options kgnilnd credits=4000 timeout=120&lt;/p&gt;

&lt;p&gt;options qla2xxx ql2xlogintimeout=0&lt;br/&gt;
options ost oss_num_threads=256&lt;br/&gt;
options libcfs libcfs_panic_on_lbug=1&lt;/p&gt;

&lt;p&gt;options ko2iblnd credits=1024 ntx=2048 &lt;/p&gt;

&lt;p&gt;options lnet small_router_buffers=16384&lt;br/&gt;
############################################&lt;/p&gt;

&lt;ol&gt;
	&lt;li&gt;
	&lt;ol&gt;
		&lt;li&gt;compute nodes ##&lt;/li&gt;
	&lt;/ol&gt;
	&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;options lnet check_routers_before_use=1&lt;br/&gt;
options lnet dead_router_check_interval=150&lt;br/&gt;
options lnet live_router_check_interval=150&lt;br/&gt;
options lnet router_ping_timeout=130&lt;br/&gt;
options kgnilnd credits=1024 timeout=120&lt;br/&gt;
############################################&lt;/p&gt;

&lt;ol&gt;
	&lt;li&gt;
	&lt;ol&gt;
		&lt;li&gt;Servers ##&lt;/li&gt;
	&lt;/ol&gt;
	&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;options mlx4_core msi_x=1&lt;br/&gt;
options lnet networks=o2ib(ib0)&lt;/p&gt;

&lt;p&gt;options lnet check_routers_before_use=1&lt;br/&gt;
options lnet router_ping_timeout=130&lt;br/&gt;
options lnet dead_router_check_interval=150&lt;br/&gt;
options lnet live_router_check_interval=150&lt;/p&gt;

&lt;p&gt;options ko2iblnd credits=1024 ntx=2048 peer_credits=8&lt;/p&gt;

&lt;p&gt;options libcfs libcfs_panic_on_lbug=1&lt;br/&gt;
############################################&lt;/p&gt;

&lt;p&gt;There&apos;s any pro &amp;amp; cons at a first glance about the first &amp;amp; the second set of tuning we want to try tomorrow?&lt;/p&gt;

&lt;p&gt;Fabio&lt;/p&gt;</comment>
                            <comment id="41925" author="cliffw" created="Tue, 17 Jul 2012 08:48:42 +0000"  >&lt;p&gt;The filesystem does not seem especially unbalanced, lfs migrate is used when adding new storage. What exactly is your concern? Normal usage should balance the space out, depending on your mix of workloads. it looks like your average OST is 66% full, with a few at ~30%. That should balance out in normal usage.&lt;/p&gt;</comment>
                            <comment id="41926" author="cliffw" created="Tue, 17 Jul 2012 08:53:40 +0000"  >&lt;p&gt;And yes, QOS parameters are set on the MDS, as MDS controls that allocation.&lt;/p&gt;</comment>
                            <comment id="44172" author="cliffw" created="Tue, 4 Sep 2012 17:10:02 +0000"  >&lt;p&gt;Is there anything more we can do on this issue, or is it okay to close?&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="11598" name="cluster.log" size="5991823" author="nbianchi" created="Thu, 14 Jun 2012 08:17:24 +0000"/>
                            <attachment id="11560" name="cluster.log-2012-06-09_05" size="2386089" author="nbianchi" created="Mon, 11 Jun 2012 10:39:21 +0000"/>
                            <attachment id="11559" name="cluster.log-2012-06-11_12" size="1185111" author="nbianchi" created="Mon, 11 Jun 2012 10:39:21 +0000"/>
                            <attachment id="11558" name="craylog-2012-06-09.log" size="7415113" author="nbianchi" created="Mon, 11 Jun 2012 10:39:21 +0000"/>
                            <attachment id="11557" name="craylog-2012-06-11.log" size="4746625" author="nbianchi" created="Mon, 11 Jun 2012 10:39:21 +0000"/>
                            <attachment id="11667" name="debug_lustre" size="35927" author="fverzell" created="Mon, 2 Jul 2012 10:12:37 +0000"/>
                            <attachment id="11684" name="drop_conn.log" size="3818626" author="fverzell" created="Thu, 12 Jul 2012 09:07:17 +0000"/>
                            <attachment id="11566" name="ganglia-load-2012-06-09.pdf" size="175706" author="nbianchi" created="Tue, 12 Jun 2012 01:40:51 +0000"/>
                            <attachment id="11567" name="ganglia-load-2012-06-11.pdf" size="177613" author="nbianchi" created="Tue, 12 Jun 2012 01:41:06 +0000"/>
                            <attachment id="11680" name="log1" size="1186153" author="fverzell" created="Fri, 6 Jul 2012 05:15:46 +0000"/>
                            <attachment id="11685" name="log_11_jul" size="101250" author="fverzell" created="Thu, 12 Jul 2012 09:07:27 +0000"/>
                            <attachment id="11568" name="messages_router_node.log" size="157439" author="fverzell" created="Tue, 12 Jun 2012 03:44:31 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzv3cn:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>4042</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10020"><![CDATA[1]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>