<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:18:25 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-1642] Clients get disconnected and reconnected during heavy IO immediately after the halt of a blade.</title>
                <link>https://jira.whamcloud.com/browse/LU-1642</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;During Lustre testing yesterday we observe this behaviours:&lt;/p&gt;

&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;Halt 4 nodes on a blade&lt;/li&gt;
	&lt;li&gt;jobs doing IO intense such as IOR or MPIIO starts&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;Jul 17 16:38:15 nid00475 aprun.x&lt;span class=&quot;error&quot;&gt;&amp;#91;13684&amp;#93;&lt;/span&gt;: apid=1177710, Starting, user=20859, batch_id=377847, cmd_line=&quot;/usr/bin/apr&lt;br/&gt;
un.x -n 256 src/C/IOR -a MPIIO -B -b 4096m -t 4096K -k -r -w -e -g -s 1 -i 2 -F -C -o /scratch/weisshorn/fverzell/te&lt;br/&gt;
st5/IORtest-377847 &quot;, num_nodes=64, node_list=64-65,126-129,190-191,702-705,766-769,830-833,894-897,958-961,1022-102&lt;br/&gt;
5,1086-1089,1150-1153,1214-1217,1278-1281,1294-1295,1342-1345,1406-1409,1470-1473,1534-1535&lt;/p&gt;

&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;then a few minutes later Lustre is acting up&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;Lustre server log:&lt;/p&gt;

&lt;p&gt;Jul 17 16:39:57 weisshorn03 kernel: LNetError: 4754:0:(o2iblnd_cb.c:2991:kiblnd_check_txs_locked()) Timed out tx: tx_queue, 11 seconds&lt;br/&gt;
Jul 17 16:39:57 weisshorn03 kernel: LNetError: 4754:0:(o2iblnd_cb.c:3054:kiblnd_check_conns()) Timed out RDMA with 148.187.7.73@o2ib2 (0): c: 0, oc: 1, rc: 5&lt;/p&gt;

&lt;p&gt;Jul 17 16:39:58 weisshorn08 kernel: LNetError: 5045:0:(o2iblnd_cb.c:2991:kiblnd_check_txs_locked()) Timed out tx: tx_queue, 12 seconds&lt;br/&gt;
Jul 17 16:39:58 weisshorn08 kernel: LNetError: 5045:0:(o2iblnd_cb.c:3054:kiblnd_check_conns()) Timed out RDMA with 148.187.7.78@o2ib2 (0): c: 0, oc: 3, rc: 4&lt;/p&gt;

&lt;p&gt;Jul 17 16:39:59 weisshorn13 kernel: LNet: 3394:0:(o2iblnd_cb.c:2340:kiblnd_passive_connect()) Conn race 148.187.7.81@o2ib2&lt;br/&gt;
Jul 17 16:39:59 weisshorn05 kernel: LNetError: 4875:0:(o2iblnd_cb.c:2991:kiblnd_check_txs_locked()) Timed out tx: tx_queue, 12 seconds&lt;/p&gt;

&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;Notice &quot;IO Bulk write error&quot; for nid833, which is part of the job mentioned above, followed by &quot;inactive thread&quot; then dumptrace:&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;Jul 17 16:40:05 weisshorn14 kernel: LustreError: 7929:0:(ldlm_lib.c:2717:target_bulk_io()) @@@ network error on bulk GET 0(1048576)  req@f&lt;br/&gt;
fff880f07f8b400 x1407748581382025/t0(0) o4-&amp;gt;412fabdd-3b3a-df4b-bdc6-264145113d70@833@gni:0/0 lens 448/416 e 0 to 0 dl 1342536406 ref 1 fl Interpret:/0/0 rc 0/0&lt;br/&gt;
Jul 17 16:40:05 weisshorn14 kernel: Lustre: scratch-OST003f: Bulk IO write error with 412fabdd-3b3a-df4b-bdc6-264145113d70 (at 833@gni), c&lt;br/&gt;
lient will retry: rc -110&lt;/p&gt;

&lt;p&gt;Jul 17 16:43:19 weisshorn13 kernel: Lustre: 6182:0:(service.c:1034:ptlrpc_at_send_early_reply()) @@@ Couldn&apos;t add any time (5/1), not sending early reply&lt;br/&gt;
Jul 17 16:43:19 weisshorn13 kernel:  req@ffff880ddeb81400 x1407748579298657/t0(0) o4-&amp;gt;a34c7ab8-980f-db22-6596-e1db30724c4d@12@gni:0/0 lens 448/416 e 1 to 0 dl 1342536204 ref 2 fl Interpret:/0/0 rc 0/0&lt;br/&gt;
Jul 17 16:43:19 weisshorn13 kernel: Lustre: 6182:0:(service.c:1034:ptlrpc_at_send_early_reply()) Skipped 19 previous similar messages&lt;/p&gt;

&lt;p&gt;Jul 17 16:43:20 weisshorn13 kernel: LNet: Service thread pid 8102 was inactive for 600.00s. The thread might be hung, or it might only be&lt;br/&gt;
slow and will resume later. Dumping the stack trace for debugging purposes:&lt;br/&gt;
Jul 17 16:43:20 weisshorn13 kernel: Pid: 8102, comm: ll_ost_io_153&lt;br/&gt;
Jul 17 16:43:20 weisshorn13 kernel:&lt;br/&gt;
Jul 17 16:43:20 weisshorn13 kernel: Call Trace:&lt;br/&gt;
Jul 17 16:43:20 weisshorn13 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8107bf8c&amp;gt;&amp;#93;&lt;/span&gt; ? lock_timer_base+0x3c/0x70&lt;br/&gt;
Jul 17 16:43:20 weisshorn13 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff814edc52&amp;gt;&amp;#93;&lt;/span&gt; schedule_timeout+0x192/0x2e0&lt;br/&gt;
Jul 17 16:43:20 weisshorn13 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8107c0a0&amp;gt;&amp;#93;&lt;/span&gt; ? process_timeout+0x0/0x10&lt;br/&gt;
Jul 17 16:43:20 weisshorn13 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa03a65c1&amp;gt;&amp;#93;&lt;/span&gt; cfs_waitq_timedwait+0x11/0x20 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 17 16:43:20 weisshorn13 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05df0ad&amp;gt;&amp;#93;&lt;/span&gt; target_bulk_io+0x38d/0x8b0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 17 16:43:20 weisshorn13 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8105e7f0&amp;gt;&amp;#93;&lt;/span&gt; ? default_wake_function+0x0/0x20&lt;br/&gt;
Jul 17 16:43:20 weisshorn13 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0b4c792&amp;gt;&amp;#93;&lt;/span&gt; ost_brw_write+0x1172/0x1380 &lt;span class=&quot;error&quot;&gt;&amp;#91;ost&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 17 16:43:20 weisshorn13 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa03a527b&amp;gt;&amp;#93;&lt;/span&gt; ? cfs_set_ptldebug_header+0x2b/0xc0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 17 16:43:20 weisshorn13 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05d64a0&amp;gt;&amp;#93;&lt;/span&gt; ? target_bulk_timeout+0x0/0x80 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 17 16:43:20 weisshorn13 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0b507c4&amp;gt;&amp;#93;&lt;/span&gt; ost_handle+0x2764/0x39e0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ost&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 17 16:43:20 weisshorn13 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0612c83&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_update_export_timer+0x1c3/0x360 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 17 16:43:20 weisshorn13 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa06183c1&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_server_handle_request+0x3c1/0xcb0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 17 16:43:20 weisshorn13 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa03a64ce&amp;gt;&amp;#93;&lt;/span&gt; ? cfs_timer_arm+0xe/0x10 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 17 16:43:20 weisshorn13 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa03b0ef9&amp;gt;&amp;#93;&lt;/span&gt; ? lc_watchdog_touch+0x79/0x110 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 17 16:43:20 weisshorn13 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0612462&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_wait_event+0xb2/0x2c0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 17 16:43:20 weisshorn13 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa06193cf&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_main+0x71f/0x1210 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 17 16:43:20 weisshorn13 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0618cb0&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_main+0x0/0x1210 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 17 16:43:20 weisshorn13 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c14a&amp;gt;&amp;#93;&lt;/span&gt; child_rip+0xa/0x20&lt;br/&gt;
Jul 17 16:43:20 weisshorn13 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0618cb0&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_main+0x0/0x1210 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 17 16:43:20 weisshorn13 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0618cb0&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_main+0x0/0x1210 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 17 16:43:20 weisshorn13 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c140&amp;gt;&amp;#93;&lt;/span&gt; ? child_rip+0x0/0x20&lt;br/&gt;
Jul 17 16:43:20 weisshorn13 kernel:&lt;/p&gt;

&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;Nid833 is then being evicted:&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;Jul 17 16:47:44 weisshorn14 kernel: LustreError: 0:0:(ldlm_lockd.c:357:waiting_locks_callback()) ### lock callback timer expired after 568&lt;br/&gt;
s: evicting client at 833@gni  ns: filter-scratch-OST003f_UUID lock: ffff8808aa9fc480/0x9cc518d034bea3c8 lrc: 3/0,0 mode: PW/PW res: 22186&lt;br/&gt;
722/0 rrc: 2 type: EXT &lt;span class=&quot;error&quot;&gt;&amp;#91;0-&amp;gt;18446744073709551615&amp;#93;&lt;/span&gt; (req 0-&amp;gt;1048575) flags: 0x20 remote: 0x629daae2e6cf7351 expref: 5 pid: 5811 timeout 42997&lt;br/&gt;
03474&lt;/p&gt;

&lt;p&gt;On SMW console log:&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;2012-07-17 16:48:07&amp;#93;&lt;/span&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;c7-0c1s0n1&amp;#93;&lt;/span&gt;Lustre: 9196:0:(client.c:1492:ptlrpc_expire_one_request()) @@@ Request x1407748581382025&lt;br/&gt;
 sent from scratch-OST003f-osc-ffff88041e142400 to NID 148.187.7.114@o2ib2 471s ago has timed out (471s prior to deadline&lt;br/&gt;
).&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;2012-07-17 16:48:07&amp;#93;&lt;/span&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;c7-0c1s0n1&amp;#93;&lt;/span&gt;  req@ffff8801bea18800 x1407748581382025/t0 o4-&amp;gt;scratch-OST003f_UUID@148.187.7.114@o2ib2&lt;br/&gt;
:6/4 lens 448/608 e 0 to 1 dl 1342536485 ref 2 fl Rpc:/0/0 rc 0/0&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;2012-07-17 16:48:07&amp;#93;&lt;/span&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;c7-0c1s0n1&amp;#93;&lt;/span&gt;Lustre: scratch-OST003f-osc-ffff88041e142400: Connection to service scratch-OST003f via&lt;br/&gt;
nid 148.187.7.114@o2ib2 was lost; in progress operations using this service will wait for recovery to complete.&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;2012-07-17 16:48:07&amp;#93;&lt;/span&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;c7-0c1s0n1&amp;#93;&lt;/span&gt;LustreError: 167-0: This client was evicted by scratch-OST003f; in progress operations u&lt;br/&gt;
sing this service will fail.&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;2012-07-17 16:48:07&amp;#93;&lt;/span&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;c7-0c1s0n1&amp;#93;&lt;/span&gt;Lustre: Server scratch-OST003f_UUID version (2.2.51.0) is much newer than client version&lt;br/&gt;
 (1.8.6)&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;2012-07-17 16:48:07&amp;#93;&lt;/span&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;c7-0c1s0n1&amp;#93;&lt;/span&gt;Lustre: Skipped 72 previous similar messages&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;2012-07-17 16:48:07&amp;#93;&lt;/span&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;c7-0c1s0n1&amp;#93;&lt;/span&gt;Lustre: scratch-OST003f-osc-ffff88041e142400: Connection restored to service scratch-OST&lt;br/&gt;
003f using nid 148.187.7.114@o2ib2.&lt;/p&gt;

&lt;p&gt;The job is stalled and then finally killed due to the cputime limit being exceeded&lt;/p&gt;

&lt;p&gt;slurmd&lt;span class=&quot;error&quot;&gt;&amp;#91;rosa12&amp;#93;&lt;/span&gt;: *** JOB 377847 CANCELLED AT 17:03:36 DUE TO TIME LIMIT ***&lt;br/&gt;
aprun.x: Apid 1177710: Caught signal Terminated, sending to application&lt;/p&gt;


&lt;p&gt;Attached the log file of Cray XE machine of the specific time range.&lt;/p&gt;</description>
                <environment>---------------------------------------------------------------------------------------------------- &lt;br/&gt;
## MDS HW ## &lt;br/&gt;
---------------------------------------------------------------------------------------------------- &lt;br/&gt;
Linux XXXX.admin.cscs.ch 2.6.32-220.7.1.el6_lustre.g9c8f747.x86_64 &lt;br/&gt;
Architecture: x86_64 &lt;br/&gt;
CPU op-mode(s): 32-bit, 64-bit &lt;br/&gt;
Byte Order: Little Endian &lt;br/&gt;
CPU(s): 16 &lt;br/&gt;
Vendor ID: AuthenticAMD &lt;br/&gt;
CPU family: 16 &lt;br/&gt;
64Gb RAM &lt;br/&gt;
Interconnect IB 40Gb/s &lt;br/&gt;
--- &lt;br/&gt;
MDT LSI 5480 Pikes Peak &lt;br/&gt;
SSDs SLC &lt;br/&gt;
---------------------------------------------------------------------------------------------------- &lt;br/&gt;
&lt;br/&gt;
---------------------------------------------------------------------------------------------------- &lt;br/&gt;
## OSS HW ## &lt;br/&gt;
---------------------------------------------------------------------------------------------------- &lt;br/&gt;
Architecture: x86_64 &lt;br/&gt;
CPU op-mode(s): 32-bit, 64-bit &lt;br/&gt;
Byte Order: Little Endian &lt;br/&gt;
CPU(s): 32 &lt;br/&gt;
Vendor ID: GenuineIntel &lt;br/&gt;
CPU family: 6 &lt;br/&gt;
64Gb RAM &lt;br/&gt;
Interconnect IB 40Gb/s &lt;br/&gt;
--- &lt;br/&gt;
OSTs ---&amp;gt; LSI 7900 SATA Disks &lt;br/&gt;
---------------------------------------------------------------------------------------------------- &lt;br/&gt;
&lt;br/&gt;
---------------------------------------------------------------------------------------------------- &lt;br/&gt;
## Router nodes ## &lt;br/&gt;
---------------------------------------------------------------------------------------------------- &lt;br/&gt;
12 Cray XE6 Service nodes as router nodes - IB 40Gb/s &lt;br/&gt;
---------------------------------------------------------------------------------------------------- &lt;br/&gt;
&lt;br/&gt;
---------------------------------------------------------------------------------------------------- &lt;br/&gt;
## Clients ## &lt;br/&gt;
---------------------------------------------------------------------------------------------------- &lt;br/&gt;
~ 1500 Cray XE6 nodes - Lustre 1.8.6 &lt;br/&gt;
---------------------------------------------------------------------------------------------------- &lt;br/&gt;
&lt;br/&gt;
---------------------------------------------------------------------------------------------------- &lt;br/&gt;
## LUSTRE Config ## &lt;br/&gt;
---------------------------------------------------------------------------------------------------- &lt;br/&gt;
1 MDS + 1 fail over (MDT on SSD array) &lt;br/&gt;
12 OSSs - 6 OSTs per OSS (72 OSTs) &lt;br/&gt;
&lt;br/&gt;
Luster Servers ---&amp;gt; 2.2.51.0 &lt;br/&gt;
Lustre Clients ---&amp;gt; 1.8.6 (~1500 nodes) / 2.2.51.0 (~20 nodes) &lt;br/&gt;
----------------------------------------------------------------------------------------------------</environment>
        <key id="15252">LU-1642</key>
            <summary>Clients get disconnected and reconnected during heavy IO immediately after the halt of a blade.</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="4">Incomplete</resolution>
                                        <assignee username="green">Oleg Drokin</assignee>
                                    <reporter username="fverzell">Fabio Verzelloni</reporter>
                        <labels>
                    </labels>
                <created>Wed, 18 Jul 2012 04:17:12 +0000</created>
                <updated>Mon, 29 May 2017 04:34:50 +0000</updated>
                            <resolved>Mon, 29 May 2017 04:34:50 +0000</resolved>
                                    <version>Lustre 2.2.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>7</watches>
                                                                            <comments>
                            <comment id="41965" author="pjones" created="Wed, 18 Jul 2012 08:06:50 +0000"  >&lt;p&gt;Oleg&lt;/p&gt;

&lt;p&gt;Could you please look into this one?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="41973" author="cliffw" created="Wed, 18 Jul 2012 09:51:40 +0000"  >&lt;p&gt;I am still seeing these messages:&lt;/p&gt;

&lt;p&gt;LNetError: 1480:0:(o2iblnd_cb.c:2273:kiblnd_passive_connect()) Can&apos;t accept 148.187.6.201@o2ib2: incompatible queue depth 8 (16 wanted)&lt;/p&gt;

&lt;p&gt;which would indicate your parameters are not the same on all nodes. This could impact the issue, please make sure that IB parameters match every where. &lt;br/&gt;
We are escalating this issue with engineering.&lt;/p&gt;</comment>
                            <comment id="41975" author="fverzell" created="Wed, 18 Jul 2012 10:06:32 +0000"  >&lt;p&gt;Hi Cliff, &lt;br/&gt;
  we are in the process of updating the missing external clients with the new parameters; the problem will be fixed soon.&lt;/p&gt;

&lt;p&gt;Thanks&lt;br/&gt;
Fabio&lt;/p&gt;</comment>
                            <comment id="41976" author="liang" created="Wed, 18 Jul 2012 11:16:25 +0000"  >&lt;p&gt;Hi Fabio, a few questions:&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;could you give some descriptions about those two log files (sdb.log and smw.log), i.e: which nodes they are from (client? OSS?)&lt;/li&gt;
	&lt;li&gt;what&apos;s the difference between Nid833 and c7-0c1s0n*?&lt;/li&gt;
	&lt;li&gt;we can see there are some error messages from o2iblnd, I assume 148.187.7.73@o2ib2 is a router right? is there any errors in dmesg or console output on that router?
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;o2iblnd_cb.c:3054:kiblnd_check_conns()) Timed out RDMA with 148.187.7.73@o2ib2 (0): c: 0, oc: 1, rc: 5
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;&lt;/li&gt;
	&lt;li&gt;dmesg on the client &amp;amp; OSS could be helpful as well&lt;/li&gt;
&lt;/ul&gt;
</comment>
                            <comment id="41978" author="isaac" created="Wed, 18 Jul 2012 12:03:26 +0000"  >&lt;p&gt;Hi Fabio, can you please also run a &apos;ibcheckerrors&apos; to make sure the IB fabric is clean? Sometimes those RDMA timeout errors are caused by faulty fabric. It makes sense to me to first make sure the network itself is healthy.&lt;/p&gt;</comment>
                            <comment id="41997" author="colinmcmurtrie" created="Thu, 19 Jul 2012 06:25:25 +0000"  >&lt;p&gt;The two log files (sdb.log and smw.log) are from the Cray XE6 so the events logged there relate to the clients running on the Cray (compute nodes and service nodes running the LNET routers).&lt;/p&gt;

&lt;p&gt;Nid833 and c7-0c1s0n1 refer to the same compute node (i.e. they are different names for the same thing).  I have attached the file nid2CrayMapping.txt so that you can see this mapping for all nodes on our Cray XE6.&lt;/p&gt;</comment>
                            <comment id="42005" author="liang" created="Thu, 19 Jul 2012 12:04:33 +0000"  >&lt;p&gt;I&apos;m still trying to understand the network topology here; could you give a description of the network topology?&lt;/p&gt;

&lt;p&gt;I think OSSs are on o2ib network correct? But after I checked the sdb.log I saw this:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Jul 17 16:37:52 nid00394 kernel: LNet: 12047:0:(gnilnd_conn.c:1872:kgnilnd_reaper_dgram_check()) GNILND_DGRAM_REQ datagram to 12@gni timed out @ 128s dgram 0xffff8803ef766b48 state GNILND_DGRAM_POSTED conn 0xffff8803b4b07000
Jul 17 16:37:52 nid01530 kernel: LNet: 12033:0:(gnilnd_conn.c:1872:kgnilnd_reaper_dgram_check()) GNILND_DGRAM_REQ datagram to 50@gni timed out @ 128s dgram 0xffff880407221b08 state GNILND_DGRAM_POSTED conn 0xffff8803cedc0000
Jul 17 16:37:54 nid01530 kernel: LNet: could not send to 50@gni due to connection setup failure after 130 seconds
Jul 17 16:37:54 nid01530 kernel: LNet: 12028:0:(gnilnd_cb.c:1104:kgnilnd_tx_done()) $$ error -113 on tx 0xffff8803e6be9b68-&amp;gt;&amp;lt;?&amp;gt; id 0/0 state GNILND_TX_ALLOCD age 130s  msg@0xffff8803e6be9be8 m/v/ty/ck/pck/pl b00fbabe/8/2/0/22d/0 x0:GNILND_MSG_IMMEDIATE
Jul 17 16:37:58 nid00394 kernel: LNet: could not send to 12@gni due to connection setup failure after 134 seconds
Jul 17 16:37:58 nid00394 kernel: LNet: 12042:0:(gnilnd_cb.c:1104:kgnilnd_tx_done()) $$ error -113 on tx 0xffff8803fdcab248-&amp;gt;&amp;lt;?&amp;gt; id 0/0 state GNILND_TX_ALLOCD age 134s  msg@0xffff8803fdcab2c8 m/v/ty/ck/pck/pl b00fbabe/8/2/0/24d4/0 x0:GNILND_MSG_IMMEDIATE
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I checked nid2CrayMapping.txt, these nodes (nid00394, nid01530) are marked as &quot;service&quot;, does it mean that those OSSs are also acting as &quot;router&quot;, or they are dedicated routers but also marked as &quot;service&quot; nodes?&lt;/p&gt;

&lt;p&gt;Also, what&apos;s o2ib NIDs of these two nodes(nid00394, nid01530)? what&apos;s hostname of 12@gni, 50@gni? We need to find out errors/logs on a message path (OSS&amp;lt;&lt;del&gt;&amp;gt;router&amp;lt;&lt;/del&gt;&amp;gt;client) at the same moment.&lt;/p&gt;</comment>
                            <comment id="42014" author="isaac" created="Thu, 19 Jul 2012 14:26:07 +0000"  >&lt;p&gt;Hi Colin and Fabio, I&apos;m an Intel/Whamcloud engineer (despite that my email isn&apos;t from either one) working with Liang on this bug. I&apos;d appreciate a couple of things from you:&lt;/p&gt;
&lt;ol&gt;
	&lt;li&gt;As I requested above, please run &apos;ibcheckerrors&apos; on the IB network. I want to make sure that the network itself is OK. This is important - if the networking is faulty but we assume it&apos;s good, then we could be led to wrong directions and it&apos;d take us longer to solve the problem.&lt;/li&gt;
	&lt;li&gt;On the router nodes, particularly 148.187.7.&lt;span class=&quot;error&quot;&gt;&amp;#91;73,78&amp;#93;&lt;/span&gt;@o2ib2, please collect all files under /proc/sys/lnet/, e.g. by &quot;tar -czvf `hostname`.tgz /proc/sys/lnet/&quot;. As long as these nodes have not rebooted since the problem happened, the files would contain very useful history data to help us understand what was happening. Of course, it&apos;d be a lot more useful to gather these files while the problem is happening.&lt;/li&gt;
&lt;/ol&gt;
</comment>
                            <comment id="42015" author="fverzell" created="Thu, 19 Jul 2012 14:40:06 +0000"  >
&lt;p&gt;Dear Liang, &lt;br/&gt;
  that&apos;s an overview of the network topology.&lt;/p&gt;

&lt;p&gt;These are the router node &quot;inside&quot; Cray XE System:&lt;br/&gt;
rosa4:~ # lctl show_route&lt;br/&gt;
net              o2ib2 hops 1 gw                          220@gni up&lt;br/&gt;
net              o2ib2 hops 1 gw                          304@gni up&lt;br/&gt;
net              o2ib2 hops 1 gw                          394@gni up&lt;br/&gt;
net              o2ib2 hops 1 gw                          436@gni up&lt;br/&gt;
net              o2ib2 hops 1 gw                          226@gni up&lt;br/&gt;
net              o2ib2 hops 1 gw                         1530@gni up&lt;br/&gt;
net              o2ib2 hops 1 gw                         1476@gni up&lt;br/&gt;
net              o2ib2 hops 1 gw                          270@gni up&lt;br/&gt;
net              o2ib2 hops 1 gw                          474@gni up&lt;br/&gt;
net              o2ib2 hops 1 gw                          484@gni up&lt;br/&gt;
net              o2ib2 hops 1 gw                         1364@gni up&lt;br/&gt;
net              o2ib2 hops 1 gw                         1386@gni up&lt;/p&gt;

&lt;p&gt;These node have the IB &amp;amp; GNI.&lt;/p&gt;

&lt;p&gt;The following lines are from weisshorn, that is basically were Lustre is built:&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@weisshorn01 ~&amp;#93;&lt;/span&gt;# lctl show_route&lt;br/&gt;
net                gni hops 1 gw               148.187.7.77@o2ib2 up&lt;br/&gt;
net                gni hops 1 gw               148.187.7.72@o2ib2 up&lt;br/&gt;
net                gni hops 1 gw               148.187.7.71@o2ib2 up&lt;br/&gt;
net                gni hops 1 gw               148.187.7.78@o2ib2 up&lt;br/&gt;
net                gni hops 1 gw               148.187.7.81@o2ib2 up&lt;br/&gt;
net                gni hops 1 gw               148.187.7.74@o2ib2 up&lt;br/&gt;
net                gni hops 1 gw               148.187.7.76@o2ib2 up&lt;br/&gt;
net                gni hops 1 gw               148.187.7.73@o2ib2 up&lt;br/&gt;
net                gni hops 1 gw               148.187.7.82@o2ib2 up&lt;br/&gt;
net                gni hops 1 gw               148.187.7.79@o2ib2 up&lt;br/&gt;
net                gni hops 1 gw               148.187.7.75@o2ib2 up&lt;br/&gt;
net                gni hops 1 gw               148.187.7.80@o2ib2 up&lt;/p&gt;

&lt;p&gt;These nodes have only IB.&lt;/p&gt;

&lt;p&gt;Weisshorn &amp;amp; Cray are two completely separated things, the nodes nid00394, nid01530 are router:&lt;br/&gt;
nid01530:~ # lctl list_nids&lt;br/&gt;
1530@gni&lt;br/&gt;
148.187.7.82@o2ib2&lt;/p&gt;

&lt;p&gt;nid00394:~ # lctl list_nids&lt;br/&gt;
394@gni&lt;br/&gt;
148.187.7.75@o2ib2&lt;/p&gt;

&lt;p&gt;In Cray terminology, a compute node is called 12@gni or &quot;any_number&quot;@gni; basically a compute node can be referred to in several different ways. As an example, a node called 50@gni will be:&lt;/p&gt;

&lt;p&gt;50@gni&lt;br/&gt;
nid00050&lt;br/&gt;
c0-0c0s6n2&lt;br/&gt;
xxx.xx.0.51&lt;/p&gt;

&lt;p&gt;What you see marked as service should be router nodes or frontend nodes.&lt;br/&gt;
All the compute nodes have to pass through the router nodes, because they only have gni. &lt;br/&gt;
As for the mentioned 12@gni &amp;amp; 50@gni, I can confirm that they are compute nodes.&lt;/p&gt;

&lt;p&gt;Please let me know if you need more details.&lt;br/&gt;
Fabio&lt;/p&gt;
</comment>
                            <comment id="42016" author="isaac" created="Thu, 19 Jul 2012 14:50:35 +0000"  >&lt;p&gt;Hi Fabio, thanks for the feedback. It&apos;s about 3AM for Liang now, so he&apos;s likely not going to respond soon. Please have a look at my previous comment where additional data was requested.&lt;/p&gt;</comment>
                            <comment id="42019" author="cliffw" created="Thu, 19 Jul 2012 17:33:46 +0000"  >&lt;p&gt;Tarball of /proc/sys/lnet on the MDS&lt;/p&gt;</comment>
                            <comment id="42026" author="cliffw" created="Thu, 19 Jul 2012 19:31:25 +0000"  >&lt;p&gt;Better tarball. Still need this from the routers&lt;/p&gt;</comment>
                            <comment id="42027" author="isaac" created="Thu, 19 Jul 2012 19:40:56 +0000"  >&lt;p&gt;First of all, Cliff did an ibcheckerrors, which said:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;Summary: 236 nodes checked, 2 bad nodes found&lt;br/&gt;
         786 ports checked, 342 ports have errors beyond threshold&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;I&apos;m not sure how serious the problem is, as the error counters could have been accumulating for months. But I think it&apos;s a good idea to have your IB admin double check that the IB fabric is running OK. Such problems can be hard to nail down when they begin manifesting themselves at upper layers.&lt;/p&gt;

&lt;p&gt;From the data available, and under the assumption that the nodes&apos; system clocks are roughly synchronized, to seconds at least, here&apos;s my speculation of what happened.&lt;/p&gt;
&lt;ol&gt;
	&lt;li&gt;&lt;b&gt;16:35:01&lt;/b&gt; Timeouts and errors began to show up in the @gni network:
&lt;blockquote&gt;
&lt;p&gt;No gnilnd traffic received from 50@gni for 120 seconds, terminating connection. Is node down?&lt;br/&gt;
kgnilnd_close_conn_locked()) closing conn to 12@gni: error -110&lt;br/&gt;
kgnilnd_tx_done()) $$ error &lt;del&gt;113 on tx 0xffff8803b4b87b68&lt;/del&gt;&amp;gt;&amp;lt;?&amp;gt;&lt;br/&gt;
kgnilnd_reaper_dgram_check()) GNILND_DGRAM_REQ datagram to 12@gni timed out&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;These errors could indicate problems in the GNI network, or they could be OK if they were errors about messages to the halted blade.&lt;/p&gt;&lt;/li&gt;
	&lt;li&gt;Then, on routers, messages to nodes in the @gni network got queued up, as the GNI network couldn&apos;t forward them out. Then servers ran out of router buffer credits. As a result, routers couldn&apos;t return TX credits back to servers. Then next step...&lt;/li&gt;
	&lt;li&gt;&lt;b&gt;16:39:57&lt;/b&gt; Servers began to see RDMA timeouts:
&lt;blockquote&gt;
&lt;p&gt;Jul 17 16:39:57 weisshorn03 kernel: LNetError: 4754:0:(o2iblnd_cb.c:2991:kiblnd_check_txs_locked()) Timed out tx: tx_queue, 11 seconds&lt;br/&gt;
Jul 17 16:39:57 weisshorn03 kernel: LNetError: 4754:0:(o2iblnd_cb.c:3054:kiblnd_check_conns()) Timed out RDMA with 148.187.7.73@o2ib2 (0): c: 0, oc: 1, rc: 5&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;These TXs were never put out on wire. Instead they waited for TX credits for too long and timed out before reaching the wire. The 1st message said they were waiting for TX credit, and the 2nd said the connection had no credit to use. The lack of tx credits could be seen from the peers file on the MDS:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;nid                      refs state  last   max   rtr   min    tx   min queue&lt;br/&gt;
148.187.7.71@o2ib2          3    up    -1    16    16    16    16  -153 0&lt;br/&gt;
148.187.7.72@o2ib2          3    up    -1    16    16    16    16  -150 0&lt;br/&gt;
148.187.7.73@o2ib2          3    up    -1    16    16    16    16  -152 0&lt;br/&gt;
148.187.7.74@o2ib2          3    up    -1    16    16    16    16  -152 0&lt;br/&gt;
148.187.7.75@o2ib2          3    up    -1    16    16    16    16  -152 0&lt;br/&gt;
148.187.7.76@o2ib2          3    up    -1    16    16    16    16  -151 0&lt;br/&gt;
148.187.7.77@o2ib2          3    up    -1    16    16    16    16  -151 0&lt;br/&gt;
148.187.7.78@o2ib2          3    up    -1    16    16    16    16  -151 0&lt;br/&gt;
148.187.7.79@o2ib2          3    up    -1    16    16    16    16  -152 0&lt;br/&gt;
148.187.7.80@o2ib2          3    up    -1    16    16    16    16  -151 0&lt;br/&gt;
148.187.7.81@o2ib2          3    up    -1    16    16    16    16  -151 0&lt;br/&gt;
148.187.7.82@o2ib2          3    up    -1    16    16    16    16  -150 0&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;The TX queues for routers became quite long at one point.&lt;/p&gt;&lt;/li&gt;
	&lt;li&gt;Active clients now wouldn&apos;t see any progress, as the servers couldn&apos;t send messages to the routers.&lt;/li&gt;
&lt;/ol&gt;



&lt;p&gt;To fix it, I&apos;d suggest to:&lt;/p&gt;
&lt;ol&gt;
	&lt;li&gt;Server IB network: the errors reported by ibcheckerrors should be double checked. Also, the error counters should be reset, so that later we can query them again and be able to interpret the results better.&lt;/li&gt;
	&lt;li&gt;Client GNI network: If the errors were all about nodes in the halted blade, then they can be ignored. Otherwise, they must be investigated.&lt;/li&gt;
	&lt;li&gt;On routers:
	&lt;ol&gt;
		&lt;li&gt;More buffer credits should be granted to the servers. I&apos;d need to see the module options on routers and the files under /proc/sys/lnet/ to make suggestion on router buffer settings.&lt;/li&gt;
		&lt;li&gt;Peer health option must be turned on for both the ko2iblnd and the gnilnd:&lt;br/&gt;
options ko2iblnd peer_timeout=180&lt;br/&gt;
options kgnilnd peer_health=60&lt;/li&gt;
	&lt;/ol&gt;
	&lt;/li&gt;
&lt;/ol&gt;
</comment>
                            <comment id="42034" author="fverzell" created="Fri, 20 Jul 2012 02:01:02 +0000"  >&lt;p&gt;Dear Isaac, &lt;br/&gt;
  I&apos;ll get in touch with our network admin to have a look at our IB network for errors. Regarding your second question, the Cray XE machine has been rebooted, so the contents of /proc/sys/lnet are no longer the ones that would be helpful. If something happens again, I&apos;ll immediately take a dump of all the router nodes if that could help.&lt;/p&gt;

&lt;p&gt;Regards&lt;br/&gt;
Fabio &lt;/p&gt;</comment>
                            <comment id="42036" author="isaac" created="Fri, 20 Jul 2012 02:48:30 +0000"  >&lt;p&gt;Hi Fabio, three more notes on collecting data on routers:&lt;/p&gt;
&lt;ol&gt;
	&lt;li&gt;Please enable console logging of network errors: echo +neterror &amp;gt; /proc/sys/lnet/printk&lt;/li&gt;
	&lt;li&gt;Cliff mentioned that &quot;tar -czvf `hostname`.tgz /proc/sys/lnet/&quot; might fail to grab the files as some were not readable. Cliff, can you advise how you managed to get the &quot;Better tarball&quot;? Or maybe you could find out from the shell command-line history on 148.187.7.102@o2ib2.&lt;/li&gt;
	&lt;li&gt;It&apos;d be helpful to include a timestamp in the tarball, e.g. tar -czvf `hostname`_`date +%T`.tgz. That&apos;ll help me correlate the data with events reported in the log files.&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;Thanks!&lt;/p&gt;</comment>
                            <comment id="42037" author="liang" created="Fri, 20 Jul 2012 03:05:01 +0000"  >&lt;p&gt;Isaac, I think you meant &quot;options kgnilnd peer_health=1&quot; correct? because peer_health is a boolean, &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;

&lt;p&gt;I remember that the ko2iblnd peer_buffer_credits on the routers is set to 128, but this needs to be verified. Fabio, could you check this? As Isaac said, all module parameters on the routers could be helpful. I saw various versions of the parameters posted on the other ticket but am not sure which one is your final choice, so could you post them here.&lt;/p&gt;</comment>
                            <comment id="42039" author="fverzell" created="Fri, 20 Jul 2012 03:55:30 +0000"  >&lt;p&gt;We are having a file system hang right now, can you please connect to weisshorn and have a look, I&apos;m here in case of any kind of needs to help you with logs, details, etc.&lt;/p&gt;

&lt;p&gt;Thanks&lt;br/&gt;
Fabio&lt;/p&gt;</comment>
                            <comment id="42041" author="isaac" created="Fri, 20 Jul 2012 03:55:46 +0000"  >&lt;ol&gt;
	&lt;li&gt;You&apos;re correct on kgnilnd peer_health. Thanks.&lt;/li&gt;
	&lt;li&gt;If there&apos;s only a small number of servers, ko2iblnd peer_buffer_credits could be set to higher than 128.&lt;/li&gt;
&lt;/ol&gt;
</comment>
                            <comment id="42042" author="fverzell" created="Fri, 20 Jul 2012 04:07:15 +0000"  >&lt;p&gt;&amp;gt; Please enable console logging of network errors: echo +neterror &amp;gt; /proc/sys/lnet/printk&lt;br/&gt;
Done on all the Lustre Cluster (weisshorn).&lt;/p&gt;


&lt;p&gt;&amp;gt;1- Server IB network: &lt;br/&gt;
&amp;gt;the errors reported by ibcheckerrors should be double checked. Also, the error counters should be reset, so that later we can query them again and be able to interpret &amp;gt;the results better.&lt;/p&gt;

&lt;p&gt;Today I&apos;ll have a look with the network administrator.&lt;/p&gt;

&lt;p&gt;&amp;gt;2- Client GNI network: &lt;br/&gt;
&amp;gt;If the errors were all about nodes in the halted blade, then they can be ignored. Otherwise, they must be investigated.&lt;/p&gt;

&lt;p&gt;Yes the gni network errors were from nodes halted.&lt;/p&gt;

&lt;p&gt;&amp;gt;3- On routers:&lt;br/&gt;
&amp;gt;-More buffer credits should be granted to the servers. I&apos;d need to see the module options on routers and the files under /proc/sys/lnet/ to make suggestion on router &amp;gt;buffer settings.&lt;br/&gt;
&amp;gt;-Peer health option must be turned on for both the ko2iblnd and the gnilnd:&lt;br/&gt;
&amp;gt; options ko2iblnd peer_timeout=180&lt;br/&gt;
&amp;gt; options kgnilnd peer_health=60&lt;/p&gt;

&lt;p&gt;I&apos;ll do it at the first reboot of the cluster.&lt;/p&gt;</comment>
                            <comment id="42043" author="fverzell" created="Fri, 20 Jul 2012 04:10:33 +0000"  >&lt;p&gt;I&apos;m going to make a fsck on the MDT.&lt;/p&gt;

&lt;p&gt;Fabio&lt;/p&gt;</comment>
                            <comment id="42044" author="fverzell" created="Fri, 20 Jul 2012 04:40:07 +0000"  >&lt;p&gt;Is it normal that the MDT is using 110Gb? I think I&apos;ve never seen the MDT so full.&lt;/p&gt;

&lt;p&gt;Thanks&lt;br/&gt;
Fabio&lt;/p&gt;</comment>
                            <comment id="42045" author="liang" created="Fri, 20 Jul 2012 05:14:37 +0000"  >&lt;p&gt;I&apos;ve added Fanyong to CC list, I think if MDT size is growing faster than you thought, then it&apos;s very likely because our OI files are growing forever (&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1512&quot; title=&quot;OI leaks&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1512&quot;&gt;&lt;del&gt;LU-1512&lt;/del&gt;&lt;/a&gt;). It&apos;s a design defect of IAM, Fanyong has already worked out a patch, but I don&apos;t know if it&apos;s possible to apply to existed filesystem.&lt;/p&gt;

&lt;p&gt;Fanyong, could you comment on this?&lt;/p&gt;</comment>
                            <comment id="42055" author="liang" created="Fri, 20 Jul 2012 08:25:33 +0000"  >&lt;p&gt;sorry, delete comment on wrong ticket&lt;/p&gt;</comment>
                            <comment id="42130" author="yong.fan" created="Mon, 23 Jul 2012 10:17:31 +0000"  >&lt;p&gt;Hi Fabio,&lt;/p&gt;

&lt;p&gt;Can you please mount the MDT device as type &quot;ldiskfs&quot;, then check which file(s) consumed such a large amount of space?&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="11700" name="nid2CrayMapping.txt" size="90683" author="colinmcmurtrie" created="Thu, 19 Jul 2012 06:24:15 +0000"/>
                            <attachment id="11697" name="sdb.log" size="587971" author="fverzell" created="Wed, 18 Jul 2012 04:24:51 +0000"/>
                            <attachment id="11696" name="smw.log" size="132118" author="fverzell" created="Wed, 18 Jul 2012 04:17:12 +0000"/>
                            <attachment id="11704" name="weiss02.tar.gz" size="1644" author="cliffw" created="Thu, 19 Jul 2012 19:31:25 +0000"/>
                            <attachment id="11702" name="weisshorn02.tar.gz" size="565" author="cliffw" created="Thu, 19 Jul 2012 17:33:46 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzv34n:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>4006</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>