<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:11:24 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-890] MDS Failover Issue - Clients not reconnecting after MGT/MDT fail over to other MDS.</title>
                <link>https://jira.whamcloud.com/browse/LU-890</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;The production compute nodes and login nodes can access both filesystems when the MGT/MDT is running on the primary MDS of scratch1.  When the MGT and MDT are failed over to the backup MDS, the clients fail to reconnect.&lt;/p&gt;

&lt;p&gt;The basic configuration information is as follows:&lt;/p&gt;

&lt;p&gt;The primary MDS for scratch1 is named lfs-mds-1-1 and the secondary MDS is named lfs-mds-1-2.&lt;br/&gt;
/etc/modprobe.d/lustre.conf:&lt;br/&gt;
options lnet networks=&quot;o2ib0(ib0), o2ib1(ib2), o2ib2(ib3)&quot;&lt;/p&gt;

&lt;p&gt;lfs-mds-1-1:&lt;br/&gt;
ib0   inet addr:10.174.31.241  Bcast:10.174.31.255  Mask:255.255.224.0&lt;br/&gt;
ib1   inet addr:10.175.31.241  Bcast:10.175.31.255  Mask:255.255.224.0&lt;br/&gt;
ib2   inet addr:10.174.79.241  Bcast:10.174.79.255  Mask:255.255.240.0&lt;br/&gt;
ib3   inet addr:10.174.80.40   Bcast:10.174.111.255 Mask:255.255.240.0&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@lfs-mds-1-1 config&amp;#93;&lt;/span&gt;# lctl list_nids&lt;br/&gt;
10.174.31.241@o2ib&lt;br/&gt;
10.174.79.241@o2ib1&lt;br/&gt;
10.174.80.40@o2ib2&lt;/p&gt;

&lt;p&gt;lfs-mds-1-2:&lt;br/&gt;
ib0   inet addr:10.174.31.251  Bcast:10.174.31.255  Mask:255.255.224.0&lt;br/&gt;
ib1   inet addr:10.175.31.251  Bcast:10.175.31.255  Mask:255.255.224.0&lt;br/&gt;
ib2   inet addr:10.174.79.251  Bcast:10.174.79.255  Mask:255.255.240.0&lt;br/&gt;
ib3   inet addr:10.174.80.41   Bcast:10.174.111.255 Mask:255.255.240.0&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@lfs-mds-1-2 ~&amp;#93;&lt;/span&gt;# lctl list_nids&lt;br/&gt;
10.174.31.251@o2ib&lt;br/&gt;
10.174.79.251@o2ib1&lt;br/&gt;
10.174.80.41@o2ib2&lt;/p&gt;

&lt;p&gt;r1i0n0 config (compute node):&lt;br/&gt;
ib0   inet addr:10.174.0.55  Bcast:10.174.31.255  Mask:255.255.224.0&lt;br/&gt;
ib1   inet addr:10.175.0.55  Bcast:10.175.31.255  Mask:255.255.224.0&lt;/p&gt;

&lt;p&gt;/etc/modprobe.d/lustre.conf&lt;br/&gt;
options lnet networks=&quot;o2ib0(ib0), o2ib1(ib1)&quot;&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@r1i0n0 ~&amp;#93;&lt;/span&gt;# lctl list_nids&lt;br/&gt;
10.174.0.55@o2ib&lt;br/&gt;
10.175.0.55@o2ib1&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@r1i0n0 ~&amp;#93;&lt;/span&gt;# lctl ping 10.174.31.241@o2ib&lt;br/&gt;
12345-0@lo&lt;br/&gt;
12345-10.174.31.241@o2ib&lt;br/&gt;
12345-10.174.79.241@o2ib1&lt;br/&gt;
12345-10.174.80.40@o2ib2&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@r1i0n0 ~&amp;#93;&lt;/span&gt;# lctl ping 10.174.31.251@o2ib&lt;br/&gt;
12345-0@lo&lt;br/&gt;
12345-10.174.31.251@o2ib&lt;br/&gt;
12345-10.174.79.251@o2ib1&lt;br/&gt;
12345-10.174.80.41@o2ib2&lt;/p&gt;

&lt;p&gt;fe1 (login node):&lt;br/&gt;
          inet addr:10.174.0.37  Bcast:10.255.255.255  Mask:255.255.224.0&lt;br/&gt;
          inet addr:10.175.0.37  Bcast:10.255.255.255  Mask:255.255.224.0&lt;br/&gt;
          inet addr:10.174.81.1  Bcast:10.174.95.255   Mask:255.255.240.0&lt;/p&gt;

&lt;p&gt;/etc/modprobe.d/lustre.conf&lt;br/&gt;
options lnet networks=&quot;o2ib0(ib0), o2ib1(ib1), o2ib2(ib2)&quot;&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@fe1 ~&amp;#93;&lt;/span&gt;# lctl list_nids&lt;br/&gt;
10.174.0.37@o2ib&lt;br/&gt;
10.175.0.37@o2ib1&lt;br/&gt;
10.174.81.10@o2ib2&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@fe1 ~&amp;#93;&lt;/span&gt;# lctl ping 10.174.80.40@o2ib2&lt;br/&gt;
12345-0@lo&lt;br/&gt;
12345-10.174.31.241@o2ib&lt;br/&gt;
12345-10.174.79.241@o2ib1&lt;br/&gt;
12345-10.174.80.40@o2ib2&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@fe1 ~&amp;#93;&lt;/span&gt;# lctl ping 10.174.80.41@o2ib2&lt;br/&gt;
12345-0@lo&lt;br/&gt;
12345-10.174.31.251@o2ib&lt;br/&gt;
12345-10.174.79.251@o2ib1&lt;br/&gt;
12345-10.174.80.41@o2ib2&lt;/p&gt;


&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@lfs-mds-1-1 ~&amp;#93;&lt;/span&gt;# tunefs.lustre --dryrun /dev/vg_scratch1/mdt&lt;br/&gt;
checking for existing Lustre data: found CONFIGS/mountdata&lt;br/&gt;
Reading CONFIGS/mountdata&lt;/p&gt;

&lt;p&gt;   Read previous values:&lt;br/&gt;
Target:     scratch1-MDT0000&lt;br/&gt;
Index:      0&lt;br/&gt;
Lustre FS:  scratch1&lt;br/&gt;
Mount type: ldiskfs&lt;br/&gt;
Flags:      0x1401&lt;br/&gt;
              (MDT no_primnode )&lt;br/&gt;
Persistent mount opts: iopen_nopriv,user_xattr,errors=remount-ro&lt;br/&gt;
Parameters: mgsnode=10.174.31.241@o2ib,10.174.79.241@o2ib1,10.174.80.40@o2ib2 mgsnode=10.174.31.251@o2ib,10.174.79.251@o2ib1,10.174.80.41@o2ib2 failover.node=10.174.31.241@o2ib,10.174.79.241@o2ib1,10.174.80.40@o2ib2 failover.node=10.174.31.251@o2ib,10.174.79.251@o2ib1,10.174.80.41@o2ib2 mdt.quota_type=ug&lt;/p&gt;


&lt;p&gt;   Permanent disk data:&lt;br/&gt;
Target:     scratch1-MDT0000&lt;br/&gt;
Index:      0&lt;br/&gt;
Lustre FS:  scratch1&lt;br/&gt;
Mount type: ldiskfs&lt;br/&gt;
Flags:      0x1401&lt;br/&gt;
              (MDT no_primnode )&lt;br/&gt;
Persistent mount opts: iopen_nopriv,user_xattr,errors=remount-ro&lt;br/&gt;
Parameters: mgsnode=10.174.31.241@o2ib,10.174.79.241@o2ib1,10.174.80.40@o2ib2 mgsnode=10.174.31.251@o2ib,10.174.79.251@o2ib1,10.174.80.41@o2ib2 failover.node=10.174.31.241@o2ib,10.174.79.241@o2ib1,10.174.80.40@o2ib2 failover.node=10.174.31.251@o2ib,10.174.79.251@o2ib1,10.174.80.41@o2ib2 mdt.quota_type=ug&lt;/p&gt;

&lt;p&gt;exiting before disk write.&lt;/p&gt;

&lt;p&gt;After failing over the MGT and MDT to the backup MDS (lfs-mds-1-2) it appears to have never started recovery:&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@lfs-mds-1-2 lustre&amp;#93;&lt;/span&gt;# cat&lt;br/&gt;
/proc/fs/lustre/mds/scratch1-MDT0000/recovery_status&lt;br/&gt;
status: RECOVERING&lt;br/&gt;
recovery_start: 0&lt;br/&gt;
time_remaining: 0&lt;br/&gt;
connected_clients: 0/2275&lt;br/&gt;
delayed_clients: 0/2275&lt;br/&gt;
completed_clients: 0/2275&lt;br/&gt;
replayed_requests: 0/??&lt;br/&gt;
queued_requests: 0&lt;br/&gt;
next_transno: 55834575147&lt;/p&gt;


&lt;p&gt;Once I moved the MGT and MDT back to the original system, the client reconnected again in less than a minute:&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@lfs-mds-1-1 ~&amp;#93;&lt;/span&gt;# cat&lt;br/&gt;
/proc/fs/lustre/mds/scratch1-MDT0000/recovery_status&lt;br/&gt;
status: RECOVERING&lt;br/&gt;
recovery_start: 1322752821&lt;br/&gt;
time_remaining: 267&lt;br/&gt;
connected_clients: 1896/2275&lt;br/&gt;
delayed_clients: 0/2275&lt;br/&gt;
completed_clients: 1896/2275&lt;br/&gt;
replayed_requests: 0/??&lt;br/&gt;
queued_requests: 0&lt;br/&gt;
next_transno: 55834575147&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@lfs-mds-1-1 ~&amp;#93;&lt;/span&gt;# cat&lt;br/&gt;
/proc/fs/lustre/mds/scratch1-MDT0000/recovery_status&lt;br/&gt;
status: COMPLETE&lt;br/&gt;
recovery_start: 1322752821&lt;br/&gt;
recovery_duration: 56&lt;br/&gt;
delayed_clients: 0/2275&lt;br/&gt;
completed_clients: 2275/2275&lt;br/&gt;
replayed_requests: 0&lt;br/&gt;
last_transno: 55834575146&lt;/p&gt;


&lt;p&gt;The log file on fe1 showed this:&lt;br/&gt;
Dec  1 15:08:21 fe1 kernel: Lustre: 7508:0:(client.c:1487:ptlrpc_expire_one_request()) @@@ Request x1386944264150314 sent from scratch1-MDT0000-mdc-ffff880be72aec00 to NID 10.174.31.241@o2ib 7s ago has timed out (7s prior to deadline).&lt;br/&gt;
Dec  1 15:08:21 fe1 kernel:  req@ffff880bee44fc00 x1386944264150314/t0 o35-&amp;gt;scratch1-MDT0000_UUID@10.174.31.241@o2ib:23/10 lens 408/9864 e 0 to 1 dl 1322752101 ref 1 fl Rpc:/0/0 rc 0/0&lt;br/&gt;
Dec  1 15:08:21 fe1 kernel: Lustre: 7508:0:(client.c:1487:ptlrpc_expire_one_request()) Skipped 19 previous similar messages&lt;br/&gt;
Dec  1 15:08:21 fe1 kernel: Lustre: scratch1-MDT0000-mdc-ffff880be72aec00: Connection to service scratch1-MDT0000 via nid 10.174.31.241@o2ib was lost; in progress operations using this service will wait for recovery to complete.&lt;br/&gt;
Dec  1 15:08:36 fe1 kernel: Lustre: 5587:0:(import.c:517:import_select_connection()) scratch1-MDT0000-mdc-ffff880be72aec00: tried all connections, increasing latency to 2s&lt;br/&gt;
Dec  1 15:08:38 fe1 kernel: Lustre: 5585:0:(client.c:1487:ptlrpc_expire_one_request()) @@@ Request x1386944264150337 sent from MGC10.174.80.40@o2ib2 to NID 10.174.80.40@o2ib2 17s ago has timed out (17s prior to deadline).&lt;br/&gt;
Dec  1 15:08:38 fe1 kernel:  req@ffff880becc30000 x1386944264150337/t0 o400-&amp;gt;MGS@MGC10.174.80.40@o2ib2_0:26/25 lens 192/384 e 0 to 1 dl 1322752117 ref 1 fl Rpc:N/0/0 rc 0/0&lt;br/&gt;
Dec  1 15:08:38 fe1 kernel: Lustre: 5585:0:(client.c:1487:ptlrpc_expire_one_request()) Skipped 2 previous similar messages&lt;br/&gt;
Dec  1 15:08:38 fe1 kernel: LustreError: 166-1: MGC10.174.80.40@o2ib2: Connection to service MGS via nid 10.174.80.40@o2ib2 was lost; in progress operations using this service will fail.&lt;br/&gt;
Dec  1 15:08:52 fe1 kernel: Lustre: 5587:0:(import.c:517:import_select_connection()) scratch1-MDT0000-mdc-ffff880be72aec00: tried all connections, increasing latency to 3s&lt;br/&gt;
Dec  1 15:08:59 fe1 kernel: Lustre: 5586:0:(client.c:1487:ptlrpc_expire_one_request()) @@@ Request x1386944264151143 sent from MGC10.174.80.40@o2ib2 to NID 10.174.80.40@o2ib2 6s ago has timed out (6s prior to deadline).&lt;br/&gt;
Dec  1 15:08:59 fe1 kernel:  req@ffff880bed70c400 x1386944264151143/t0 o250-&amp;gt;MGS@MGC10.174.80.40@o2ib2_0:26/25 lens 368/584 e 0 to 1 dl 1322752139 ref 1 fl Rpc:N/0/0 rc 0/0&lt;br/&gt;
Dec  1 15:08:59 fe1 kernel: Lustre: 5586:0:(client.c:1487:ptlrpc_expire_one_request()) Skipped 5 previous similar messages&lt;br/&gt;
Dec  1 15:09:00 fe1 kernel: Lustre: 5586:0:(import.c:855:ptlrpc_connect_interpret()) MGS@MGC10.174.80.40@o2ib2_1 changed server handle from 0x242210f6584197b7 to 0xa6cae1b09294c1a2&lt;br/&gt;
Dec  1 15:09:00 fe1 kernel: Lustre: MGC10.174.80.40@o2ib2: Reactivating import&lt;br/&gt;
Dec  1 15:09:00 fe1 kernel: Lustre: MGC10.174.80.40@o2ib2: Connection restored to service MGS using nid 10.174.80.41@o2ib2.&lt;br/&gt;
Dec  1 15:09:11 fe1 kernel: Lustre: 5587:0:(import.c:517:import_select_connection()) scratch1-MDT0000-mdc-ffff880be72aec00: tried all connections, increasing latency to 4s&lt;br/&gt;
Dec  1 15:09:31 fe1 kernel: Lustre: 5587:0:(import.c:517:import_select_connection()) scratch1-MDT0000-mdc-ffff880be72aec00: tried all connections, increasing latency to 5s&lt;br/&gt;
Dec  1 15:09:41 fe1 kernel: Lustre: 5586:0:(client.c:1487:ptlrpc_expire_one_request()) @@@ Request x1386944264151550 sent from scratch1-MDT0000-mdc-ffff880be72aec00 to NID 10.174.31.241@o2ib 10s ago has timed out (10s prior to deadline).&lt;br/&gt;
Dec  1 15:09:41 fe1 kernel:  req@ffff8817eea7d000 x1386944264151550/t0 o38-&amp;gt;scratch1-MDT0000_UUID@10.174.31.241@o2ib:12/10 lens 368/584 e 0 to 1 dl 1322752181 ref 1 fl Rpc:N/0/0 rc 0/0&lt;br/&gt;
Dec  1 15:09:41 fe1 kernel: Lustre: 5586:0:(client.c:1487:ptlrpc_expire_one_request()) Skipped 4 previous similar messages&lt;br/&gt;
Dec  1 15:10:17 fe1 kernel: Lustre: 5587:0:(import.c:517:import_select_connection()) scratch1-MDT0000-mdc-ffff880be72aec00: tried all connections, increasing latency to 7s&lt;br/&gt;
Dec  1 15:10:17 fe1 kernel: Lustre: 5587:0:(import.c:517:import_select_connection()) Skipped 1 previous similar message&lt;br/&gt;
Dec  1 15:10:56 fe1 kernel: Lustre: 5586:0:(client.c:1487:ptlrpc_expire_one_request()) @@@ Request x1386944264152753 sent from scratch1-MDT0000-mdc-ffff880be72aec00 to NID 10.174.31.241@o2ib 13s ago has timed out (13s prior to deadline).&lt;br/&gt;
Dec  1 15:10:56 fe1 kernel:  req@ffff881808992800 x1386944264152753/t0 o38-&amp;gt;scratch1-MDT0000_UUID@10.174.31.241@o2ib:12/10 lens 368/584 e 0 to 1 dl 1322752256 ref 1 fl Rpc:N/0/0 rc 0/0&lt;br/&gt;
Dec  1 15:10:56 fe1 kernel: Lustre: 5586:0:(client.c:1487:ptlrpc_expire_one_request()) Skipped 5 previous similar messages&lt;br/&gt;
Dec  1 15:11:41 fe1 kernel: Lustre: 5587:0:(import.c:517:import_select_connection()) scratch1-MDT0000-mdc-ffff880be72aec00: tried all connections, increasing latency to 10s&lt;br/&gt;
Dec  1 15:11:41 fe1 kernel: Lustre: 5587:0:(import.c:517:import_select_connection()) Skipped 2 previous similar messages&lt;br/&gt;
Dec  1 15:13:41 fe1 kernel: Lustre: 5586:0:(client.c:1487:ptlrpc_expire_one_request()) @@@ Request x1386944264155556 sent from scratch1-MDT0000-mdc-ffff880be72aec00 to NID 10.174.31.241@o2ib 18s ago has timed out (18s prior to deadline).&lt;br/&gt;
Dec  1 15:13:41 fe1 kernel:  req@ffff880be9f5e000 x1386944264155556/t0 o38-&amp;gt;scratch1-MDT0000_UUID@10.174.31.241@o2ib:12/10 lens 368/584 e 0 to 1 dl 1322752421 ref 1 fl Rpc:N/0/0 rc 0/0&lt;br/&gt;
Dec  1 15:13:41 fe1 kernel: Lustre: 5586:0:(client.c:1487:ptlrpc_expire_one_request()) Skipped 9 previous similar messages&lt;br/&gt;
Dec  1 15:14:41 fe1 kernel: Lustre: 5587:0:(import.c:517:import_select_connection()) scratch1-MDT0000-mdc-ffff880be72aec00: tried all connections, increasing latency to 15s&lt;br/&gt;
Dec  1 15:14:41 fe1 kernel: Lustre: 5587:0:(import.c:517:import_select_connection()) Skipped 4 previous similar messages&lt;br/&gt;
Dec  1 15:18:56 fe1 kernel: Lustre: 5586:0:(client.c:1487:ptlrpc_expire_one_request()) @@@ Request x1386944264160358 sent from scratch1-MDT0000-mdc-ffff880be72aec00 to NID 10.174.31.241@o2ib 25s ago has timed out (25s prior to deadline).&lt;br/&gt;
Dec  1 15:18:56 fe1 kernel:  req@ffff880beb4a4000 x1386944264160358/t0 o38-&amp;gt;scratch1-MDT0000_UUID@10.174.31.241@o2ib:12/10 lens 368/584 e 0 to 1 dl 1322752736 ref 1 fl Rpc:N/0/0 rc 0/0&lt;br/&gt;
Dec  1 15:18:56 fe1 kernel: Lustre: 5586:0:(client.c:1487:ptlrpc_expire_one_request()) Skipped 13 previous similar messages&lt;br/&gt;
Dec  1 15:20:17 fe1 kernel: LustreError: 166-1: MGC10.174.80.40@o2ib2: Connection to service MGS via nid 10.174.80.41@o2ib2 was lost; in progress operations using this service will fail.&lt;br/&gt;
Dec  1 15:20:18 fe1 kernel: Lustre: 5587:0:(import.c:517:import_select_connection()) scratch1-MDT0000-mdc-ffff880be72aec00: tried all connections, increasing latency to 22s&lt;br/&gt;
Dec  1 15:20:18 fe1 kernel: Lustre: 5587:0:(import.c:517:import_select_connection()) Skipped 6 previous similar messages&lt;br/&gt;
Dec  1 15:20:24 fe1 kernel: Lustre: 5586:0:(import.c:855:ptlrpc_connect_interpret()) MGS@MGC10.174.80.40@o2ib2_0 changed server handle from 0xa6cae1b09294c1a2 to 0x242210f65845423c&lt;br/&gt;
Dec  1 15:20:24 fe1 kernel: Lustre: MGC10.174.80.40@o2ib2: Reactivating import&lt;br/&gt;
Dec  1 15:20:24 fe1 kernel: Lustre: MGC10.174.80.40@o2ib2: Connection restored to service MGS using nid 10.174.80.40@o2ib2.&lt;br/&gt;
Dec  1 15:21:14 fe1 kernel: LustreError: 5586:0:(client.c:2347:ptlrpc_replay_interpret()) @@@ status 301, old was 0  req@ffff880be96a8000 x1386944264100092/t55834575126 o101-&amp;gt;scratch1-MDT0000_UUID@10.174.31.241@o2ib:12/10 lens 512/4880 e 0 to 1 dl 1322752939 ref 2 fl Interpret:RP/4/0 rc 301/301&lt;br/&gt;
Dec  1 15:21:17 fe1 kernel: Lustre: scratch1-MDT0000-mdc-ffff880be72aec00: Connection restored to service scratch1-MDT0000 using nid 10.174.31.241@o2ib.&lt;br/&gt;
Dec  1 15:21:17 fe1 kernel: LustreError: 11-0: an error occurred while communicating with 10.174.31.241@o2ib. The mds_close operation failed with -116&lt;br/&gt;
Dec  1 15:21:17 fe1 kernel: LustreError: Skipped 7 previous similar messages&lt;br/&gt;
Dec  1 15:21:17 fe1 kernel: LustreError: 7508:0:(file.c:116:ll_close_inode_openhandle()) inode 1905262791 mdc close failed: rc = -116&lt;/p&gt;

&lt;p&gt;The log files on lfs-mds-1-1 and lfs-mds-1-2 are void of any useful data.&lt;/p&gt;

</description>
                <environment>This is a rather complex configuration.  It consists of two large Lustre filesystems.  scratch1 is comprised of 2, MDS, 16 OSS, 4 DDN SFA 10K storage arrays.  scratch2 is comprised of 2 MDS, 20 OSS, 5 DDN SFA 10K storage arrays.  The Lustre servers all have 4 IB ports for client access to the filesystems.  The compute nodes access scratch1 via their ib0 port (ib0 on the lustre servers).  They access scratch2 vi ib1 (also ib1 on the servers).  The various login nodes of the cluster access both scratch1 and scratch2 through their ib2 port (also ib2 on the servers).  Finally, ib3 is for access to the production filesystems from clients in a test cluster.&lt;br/&gt;
&lt;br/&gt;
The servers are running CentOS 5.5 (2.6.18-238.12.1.el5)&lt;br/&gt;
Lustre 1.8.6 with the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-530&quot; title=&quot;group qoutas not enforced&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-530&quot;&gt;&lt;strike&gt;LU-530&lt;/strike&gt;&lt;/a&gt; patch installed&lt;br/&gt;
The clients are currently running RHEL 6.0.</environment>
        <key id="12585">LU-890</key>
            <summary>MDS Failover Issue - Clients not reconnecting after MGT/MDT fail over to other MDS.</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="hongchao.zhang">Hongchao Zhang</assignee>
                                    <reporter username="dnelson@ddn.com">Dennis Nelson</reporter>
                        <labels>
                    </labels>
                <created>Fri, 2 Dec 2011 15:40:14 +0000</created>
                <updated>Mon, 12 Dec 2011 13:06:36 +0000</updated>
                            <resolved>Mon, 12 Dec 2011 13:06:36 +0000</resolved>
                                                    <fixVersion>Lustre 1.8.6</fixVersion>
                    <fixVersion>Lustre 1.8.x (1.8.0 - 1.8.5)</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>2</watches>
                                                                            <comments>
                            <comment id="23648" author="pjones" created="Fri, 2 Dec 2011 17:25:05 +0000"  >&lt;p&gt;Hongchao&lt;/p&gt;

&lt;p&gt;Could you please comment on this one?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="23696" author="dnelson@ddn.com" created="Mon, 5 Dec 2011 22:00:33 +0000"  >&lt;p&gt;Peter,&lt;/p&gt;

&lt;p&gt;Any word on this?  I have not heard back from the assigned engineer.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
&amp;#8211;&lt;br/&gt;
Dennis Nelson&lt;/p&gt;

&lt;p&gt;Applications Support Engineer&lt;br/&gt;
DataDirect Networks, Inc.&lt;br/&gt;
dnelson@ddn.com&lt;/p&gt;





&lt;p&gt;On 12/2/11 4:26 PM, &quot;Peter Jones (JIRA)&quot; &amp;lt;jira@whamcloud.com&amp;gt; wrote:&lt;/p&gt;

</comment>
                            <comment id="23702" author="dnelson@ddn.com" created="Mon, 5 Dec 2011 23:25:51 +0000"  >&lt;p&gt;Another data point...  I just tested mds failover on scratch2 and it worked as expected.  I cannot see any differences between the scratch1 configuration and the scratch2 configuration:&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@lfs-mds-2-1 ~&amp;#93;&lt;/span&gt;# cat /etc/modprobe.d/lustre.conf &lt;br/&gt;
options lnet networks=&quot;o2ib0(ib1), o2ib1(ib2), o2ib2(ib3)&quot;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@lfs-mds-2-1 ~&amp;#93;&lt;/span&gt;# ifconfig ib0 | grep &quot;inet &quot;&lt;br/&gt;
          inet addr:10.174.31.242  Bcast:10.174.31.255  Mask:255.255.224.0&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@lfs-mds-2-1 ~&amp;#93;&lt;/span&gt;# ifconfig ib1 | grep &quot;inet &quot;&lt;br/&gt;
          inet addr:10.175.31.242  Bcast:10.175.31.255  Mask:255.255.224.0&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@lfs-mds-2-1 ~&amp;#93;&lt;/span&gt;# ifconfig ib2 | grep &quot;inet &quot;&lt;br/&gt;
          inet addr:10.174.79.242  Bcast:10.174.79.255  Mask:255.255.240.0&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@lfs-mds-2-1 ~&amp;#93;&lt;/span&gt;# ifconfig ib3 | grep &quot;inet &quot;&lt;br/&gt;
          inet addr:10.174.80.42  Bcast:10.174.111.255  Mask:255.255.240.0&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@lfs-mds-2-1 ~&amp;#93;&lt;/span&gt;# lctl list_nids&lt;br/&gt;
10.175.31.242@o2ib&lt;br/&gt;
10.174.79.242@o2ib1&lt;br/&gt;
10.174.80.42@o2ib2&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@lfs-mds-2-1 ~&amp;#93;&lt;/span&gt;# ssh lfs-mds-2-2&lt;br/&gt;
Last login: Tue Dec  6 04:11:53 2011 from 192.168.7.172&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@lfs-mds-2-2 ~&amp;#93;&lt;/span&gt;# ifconfig ib0 | grep &quot;inet &quot;&lt;br/&gt;
          inet addr:10.174.31.252  Bcast:10.174.31.255  Mask:255.255.224.0&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@lfs-mds-2-2 ~&amp;#93;&lt;/span&gt;# ifconfig ib1 | grep &quot;inet &quot;&lt;br/&gt;
          inet addr:10.175.31.252  Bcast:10.175.31.255  Mask:255.255.224.0&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@lfs-mds-2-2 ~&amp;#93;&lt;/span&gt;# ifconfig ib2 | grep &quot;inet &quot;&lt;br/&gt;
          inet addr:10.174.79.252  Bcast:10.174.79.255  Mask:255.255.240.0&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@lfs-mds-2-2 ~&amp;#93;&lt;/span&gt;# ifconfig ib3 | grep &quot;inet &quot;&lt;br/&gt;
          inet addr:10.174.80.43  Bcast:10.174.111.255  Mask:255.255.240.0&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@lfs-mds-2-2 ~&amp;#93;&lt;/span&gt;# lctl list_nids&lt;br/&gt;
10.175.31.252@o2ib&lt;br/&gt;
10.174.79.252@o2ib1&lt;br/&gt;
10.174.80.43@o2ib2&lt;/p&gt;


&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@lfs-mds-2-1 ~&amp;#93;&lt;/span&gt;# tunefs.lustre --dryrun /dev/vg_scratch2/mdt&lt;br/&gt;
checking for existing Lustre data: found CONFIGS/mountdata&lt;br/&gt;
Reading CONFIGS/mountdata&lt;/p&gt;

&lt;p&gt;   Read previous values:&lt;br/&gt;
Target:     scratch2-MDT0000&lt;br/&gt;
Index:      0&lt;br/&gt;
Lustre FS:  scratch2&lt;br/&gt;
Mount type: ldiskfs&lt;br/&gt;
Flags:      0x1401&lt;br/&gt;
              (MDT no_primnode )&lt;br/&gt;
Persistent mount opts: iopen_nopriv,user_xattr,errors=remount-ro&lt;br/&gt;
Parameters: mgsnode=10.175.31.242@o2ib,10.174.79.242@o2ib1,10.174.80.42@o2ib2 mgsnode=10.175.31.252@o2ib,10.174.79.252@o2ib1,10.174.80.43@o2ib2 failover.node=10.175.31.242@o2ib,10.174.79.242@o2ib1,10.174.80.42@o2ib2 failover.node=10.175.31.252@o2ib,10.174.79.252@o2ib1,10.174.80.43@o2ib2 mdt.quota_type=ug&lt;/p&gt;


&lt;p&gt;   Permanent disk data:&lt;br/&gt;
Target:     scratch2-MDT0000&lt;br/&gt;
Index:      0&lt;br/&gt;
Lustre FS:  scratch2&lt;br/&gt;
Mount type: ldiskfs&lt;br/&gt;
Flags:      0x1401&lt;br/&gt;
              (MDT no_primnode )&lt;br/&gt;
Persistent mount opts: iopen_nopriv,user_xattr,errors=remount-ro&lt;br/&gt;
Parameters: mgsnode=10.175.31.242@o2ib,10.174.79.242@o2ib1,10.174.80.42@o2ib2 mgsnode=10.175.31.252@o2ib,10.174.79.252@o2ib1,10.174.80.43@o2ib2 failover.node=10.175.31.242@o2ib,10.174.79.242@o2ib1,10.174.80.42@o2ib2 failover.node=10.175.31.252@o2ib,10.174.79.252@o2ib1,10.174.80.43@o2ib2 mdt.quota_type=ug&lt;/p&gt;

&lt;p&gt;exiting before disk write.&lt;/p&gt;</comment>
                            <comment id="23704" author="hongchao.zhang" created="Tue, 6 Dec 2011 00:42:43 +0000"  >&lt;p&gt;the problem could be related to the client config log, and i am investigating it currently, and will ask you to provide&lt;br/&gt;
some extra information later, thanks.&lt;/p&gt;</comment>
                            <comment id="23707" author="hongchao.zhang" created="Tue, 6 Dec 2011 06:40:30 +0000"  >&lt;p&gt;according to the log, the clients don&apos;t know there is a failover node of MDT, for they never try to connect to lfs-mds-1-2,&lt;br/&gt;
the possible reason for it is when MDT registered it at MGS, it failed to process the failover.node params, and the failover&lt;br/&gt;
info isn&apos;t contained in the client config log for MDC.&lt;/p&gt;

&lt;p&gt;could you please remount one client and attach its debug logs? thanks in advance.&lt;/p&gt;</comment>
                            <comment id="23734" author="dnelson@ddn.com" created="Tue, 6 Dec 2011 10:36:19 +0000"  >&lt;p&gt;OK, I need some help in how to gather the debug logs.  I assume that I have to change /proc/sys/lnet/debug to a different value but I am not sure what that should be to capture the right information.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;/p&gt;</comment>
                            <comment id="23757" author="cliffw" created="Tue, 6 Dec 2011 13:57:16 +0000"  >&lt;p&gt;Adding +trace should be enough, as in:&lt;/p&gt;

&lt;ol&gt;
	&lt;li&gt;lctl get_param debug&lt;br/&gt;
debug=ioctl neterror warning error emerg ha config console&lt;/li&gt;
	&lt;li&gt;lctl set_param debug=+trace&lt;br/&gt;
debug=+trace&lt;/li&gt;
	&lt;li&gt;lctl get_param debug&lt;br/&gt;
debug=trace ioctl neterror warning error emerg ha config console&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;Then do the mount test, and &lt;/p&gt;
&lt;ol&gt;
	&lt;li&gt;lctl dk &amp;gt; &amp;lt;somefilename&amp;gt;&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;and attach.&lt;/p&gt;

&lt;p&gt;I am confused however by one thing - you say the compute nodes connect via ib1 on the servers, but you do not have ib1 in your lnet config:&lt;/p&gt;

&lt;p&gt;options lnet networks=&quot;o2ib0(ib0), o2ib1(ib2), o2ib2(ib3)&quot;&lt;/p&gt;

&lt;p&gt;And thus you don&apos;t show a 10.175.xx.xx address for your servers:&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@lfs-mds-1-1 config&amp;#93;&lt;/span&gt;# lctl list_nids&lt;br/&gt;
10.174.31.241@o2ib&lt;br/&gt;
10.174.79.241@o2ib1&lt;br/&gt;
10.174.80.40@o2ib2&lt;br/&gt;
That bit I find confusing. &lt;/p&gt;</comment>
                            <comment id="23758" author="dnelson@ddn.com" created="Tue, 6 Dec 2011 14:03:44 +0000"  >&lt;p&gt;I did not design this.  The design specified that the compute nodes access scratch1 through ib0 and scratch2 through ib1.  So you will see that the scratch1 servers use ib0, ib2 and ib3, while the scratch2 servers use ib1, ib2, and ib3.&lt;/p&gt;</comment>
                            <comment id="23759" author="cliffw" created="Tue, 6 Dec 2011 14:10:04 +0000"  >&lt;p&gt;Okay, understood. We will need the debug logs for the mount attempt.&lt;/p&gt;</comment>
                            <comment id="23762" author="dnelson@ddn.com" created="Tue, 6 Dec 2011 14:28:55 +0000"  >&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@fe2 ~&amp;#93;&lt;/span&gt;# modprobe lustre&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@fe2 ~&amp;#93;&lt;/span&gt;# lctl get_param debug&lt;br/&gt;
lnet.debug=ioctl neterror warning error emerg ha config console&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@fe2 ~&amp;#93;&lt;/span&gt;# lctl set_param debug=+trace&lt;br/&gt;
lnet.debug=+trace&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@fe2 ~&amp;#93;&lt;/span&gt;#  lctl get_param debug&lt;br/&gt;
lnet.debug=trace ioctl neterror warning error emerg ha config console&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@fe2 ~&amp;#93;&lt;/span&gt;# mount /mnt/lustre1&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@fe2 ~&amp;#93;&lt;/span&gt;# lctl dk &amp;gt; /tmp/lustre-scratch1&lt;/p&gt;</comment>
                            <comment id="23764" author="dnelson@ddn.com" created="Tue, 6 Dec 2011 14:38:13 +0000"  >&lt;p&gt;I added the attachment.  The trace is only from mounting /mnt/lustre1.  I performed another trace where I performed a mount -at mounting both filesystems but it is a large file.  It is 32 MB, The JIRA interface says it has a 10 MB limit.  I&apos;ll be glad to forward the larger file if you give me a way to send it.&lt;/p&gt;</comment>
                            <comment id="23791" author="hongchao.zhang" created="Wed, 7 Dec 2011 05:06:57 +0000"  >&lt;p&gt;the log is a little strange, and there is no attach&amp;amp;setup info of the obd_device scratch1-MDT0000-mdc-ffff88180455bc00 and&lt;br/&gt;
some OSC devices (e.g. scratch1-OST0001-osc-ffff88180455bc00), but these devices are used by the newly mounted Lustre.&lt;/p&gt;

&lt;p&gt;could you please retry to get the debug log on a clean node (meaning it has never mounted Lustre) and without preloading the &quot;lustre&quot; module&lt;br/&gt;
(the default &quot;debug&quot; config is enough, so it doesn&apos;t need to be changed)? just dump the log after mounting Lustre.&lt;/p&gt;

&lt;p&gt;mount /mnt/lustre1&lt;br/&gt;
lctl dk &amp;gt; /tmp/lustre-scratch1&lt;/p&gt;

&lt;p&gt;thanks!&lt;/p&gt;</comment>
                            <comment id="23797" author="dnelson@ddn.com" created="Wed, 7 Dec 2011 11:02:14 +0000"  >&lt;p&gt;Sorry, I sent the trace for the other case but did not send the trace for dtn1.  Here it is.&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@dtn1 ~&amp;#93;&lt;/span&gt;# lustre_rmmod&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@dtn1 ~&amp;#93;&lt;/span&gt;# modprobe lustre&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@dtn1 ~&amp;#93;&lt;/span&gt;# lctl set_param debug=+trace lnet.debug=+trace&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@dtn1 ~&amp;#93;&lt;/span&gt;# lctl get_param debug&lt;br/&gt;
lnet.debug=trace ioctl neterror warning error emerg ha config console&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@dtn1 ~&amp;#93;&lt;/span&gt;# mount /mnt/lustre1&lt;br/&gt;
mount.lustre: mount 10.174.80.40@o2ib2:10.174.80.41@o2ib2:/scratch1 at /mnt/lustre1 failed: No such file or directory&lt;br/&gt;
Is the MGS specification correct?&lt;br/&gt;
Is the filesystem name correct?&lt;br/&gt;
If upgrading, is the copied client log valid? (see upgrade docs)&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@dtn1 ~&amp;#93;&lt;/span&gt;# lctl dk &amp;gt; /tmp/lustre-scratch1&lt;/p&gt;</comment>
                            <comment id="23798" author="dnelson@ddn.com" created="Wed, 7 Dec 2011 11:05:56 +0000"  >&lt;p&gt;Please ignore the last entry and attachment.  It was intended for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-899&quot; title=&quot;Client Connectivity Issues in Complex Lustre Environment&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-899&quot;&gt;&lt;del&gt;LU-899&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="23800" author="dnelson@ddn.com" created="Wed, 7 Dec 2011 11:16:10 +0000"  >&lt;p&gt;I realized that I did not run the test exactly as you had asked.  I did preload the Lustre modules.  I performed the test again:&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@fe2 ~&amp;#93;&lt;/span&gt;# lustre_rmmod&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@fe2 ~&amp;#93;&lt;/span&gt;# mount /mnt/lustre1 &lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@fe2 ~&amp;#93;&lt;/span&gt;# lctl dk &amp;gt; /tmp/lustre-scratch1&lt;/p&gt;</comment>
                            <comment id="23889" author="hongchao.zhang" created="Wed, 7 Dec 2011 21:18:38 +0000"  >&lt;p&gt;in the last debug log, the connection to failover node of MDT 10.174.31.251 is added to MDC, but it wasn&apos;t shown in the logs&lt;br/&gt;
of the description section of this ticket, which only used the main 10.174.31.241 MDT node, is there any change for the &lt;br/&gt;
system? could you please retry to test whether this node (fe2) can fail over to 10.174.31.251 or not? thanks!&lt;/p&gt;</comment>
                            <comment id="23894" author="hongchao.zhang" created="Thu, 8 Dec 2011 05:15:29 +0000"  >&lt;p&gt;just like the comment in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-899&quot; title=&quot;Client Connectivity Issues in Complex Lustre Environment&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-899&quot;&gt;&lt;del&gt;LU-899&lt;/del&gt;&lt;/a&gt;, could you please run the following commands and attach the config files in JIRA,&lt;/p&gt;

&lt;p&gt;umount /mnt/mgs&lt;br/&gt;
mount -t ldiskfs /dev/your_mgs_device /mnt/mgs&lt;/p&gt;

&lt;p&gt;the config files are in directory /mnt/mgs/CONFIGS/&lt;/p&gt;

&lt;p&gt;thanks&lt;/p&gt;</comment>
                            <comment id="23913" author="dnelson@ddn.com" created="Thu, 8 Dec 2011 10:45:50 +0000"  >&lt;p&gt;Sorry for the delay.  I had laptop issues.  Here are the uuids.&lt;/p&gt;</comment>
                            <comment id="23934" author="cliffw" created="Thu, 8 Dec 2011 15:16:07 +0000"  >&lt;p&gt;I have been reviewing this again, wanted to add some clarification. Server recovery will NOT start until the first client attempts a connection. &lt;br/&gt;
We do this so a node with a dead network won&apos;t have a failed recovery, it waits for the network to be restored before starting recovery, and looks for a client connection attempt to happen. In your case, I think this is telling us that clients cannot find the backup MDS, since we do not see connection attempts. Are you certain all network routing, masks, etc are correct for clients to reach lfs-mds-1-2? Might be worth a re-check and another round of lctl pings. &lt;/p&gt;</comment>
                            <comment id="23935" author="dnelson@ddn.com" created="Thu, 8 Dec 2011 16:07:31 +0000"  >&lt;p&gt;OK, here is the server info once again:&lt;/p&gt;

&lt;p&gt;lfs-mds-1-2:&lt;br/&gt;
ib0          inet addr:10.174.31.251  Bcast:10.174.31.255  Mask:255.255.224.0&lt;br/&gt;
ib1          inet addr:10.175.31.251  Bcast:10.175.31.255  Mask:255.255.224.0&lt;br/&gt;
ib2          inet addr:10.174.79.251  Bcast:10.174.79.255  Mask:255.255.240.0&lt;br/&gt;
ib3          inet addr:10.174.80.41  Bcast:10.174.111.255  Mask:255.255.240.0&lt;/p&gt;

&lt;p&gt;Although configured with an ip address, the scratch1 filesystem does not use the ib1 fabric.  On lfs-mds-2-x, the ib0 fabric is not used.&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@lfs-mds-1-2 ~&amp;#93;&lt;/span&gt;# cat /etc/modprobe.d/lustre.conf &lt;br/&gt;
options lnet networks=&quot;o2ib0(ib0), o2ib1(ib2), o2ib2(ib3)&quot;&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@lfs-mds-1-2 ~&amp;#93;&lt;/span&gt;# lctl list_nids&lt;br/&gt;
10.174.31.251@o2ib&lt;br/&gt;
10.174.79.251@o2ib1&lt;br/&gt;
10.174.80.41@o2ib2&lt;/p&gt;

&lt;p&gt;Client fe1 (login node)&lt;br/&gt;
ib0          inet addr:10.174.0.37  Bcast:10.255.255.255  Mask:255.255.224.0&lt;br/&gt;
ib1          inet addr:10.175.0.37  Bcast:10.255.255.255  Mask:255.255.224.0&lt;br/&gt;
ib2          inet addr:10.174.81.10  Bcast:10.174.95.255  Mask:255.255.240.0&lt;/p&gt;

&lt;p&gt;Although the login nodes have a connection to the ib0 and ib1 fabrics (same as ib0 and ib1 on the Lustre servers), the design of the system was such that the login nodes should use the ib2 port (Same fabric as ib3 on Lustre servers) for mounting the Lustre filesystems.  Having all three entries in the file might be an issue.  I had difficulties making the mount work with just ib2 defined in modprobe.d/lustre.conf.  This configuration allows the mounts to work although, scratch2 does take a while to mount (about 2.5 minutes).&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@fe1 ~&amp;#93;&lt;/span&gt;# cat /etc/modprobe.d/lustre.conf &lt;/p&gt;
&lt;ol&gt;
	&lt;li&gt;Lustre module configuration file&lt;br/&gt;
options lnet networks=&quot;o2ib0(ib0), o2ib1(ib1), o2ib2(ib2)&quot;&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@fe1 ~&amp;#93;&lt;/span&gt;# lctl list_nids&lt;br/&gt;
10.174.0.37@o2ib&lt;br/&gt;
10.175.0.37@o2ib1&lt;br/&gt;
10.174.81.10@o2ib2&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@fe1 ~&amp;#93;&lt;/span&gt;# mount&lt;br/&gt;
10.174.80.40@o2ib2:10.174.80.41@o2ib2:/scratch1 on /mnt/lustre1 type lustre (rw,flock)&lt;br/&gt;
10.174.80.42@o2ib2:10.174.80.43@o2ib2:/scratch2 on /mnt/lustre2 type lustre (rw,flock)&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@fe1 ~&amp;#93;&lt;/span&gt;# lctl ping 10.174.80.40@o2ib2&lt;br/&gt;
12345-0@lo&lt;br/&gt;
12345-10.174.31.241@o2ib&lt;br/&gt;
12345-10.174.79.241@o2ib1&lt;br/&gt;
12345-10.174.80.40@o2ib2&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@fe1 ~&amp;#93;&lt;/span&gt;# lctl ping 10.174.80.41@o2ib2&lt;br/&gt;
12345-0@lo&lt;br/&gt;
12345-10.174.31.251@o2ib&lt;br/&gt;
12345-10.174.79.251@o2ib1&lt;br/&gt;
12345-10.174.80.41@o2ib2&lt;/p&gt;

&lt;p&gt;Client dtn1 (data transfer node):&lt;br/&gt;
dtn1 accesses the filesystems just like the login nodes but dtn1 does not have interfaces that connect to the ib0 and ib1 ports of the servers.&lt;/p&gt;

&lt;p&gt;ib0          inet addr:10.174.81.1  Bcast:10.174.95.255  Mask:255.255.240.0&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@dtn1 ~&amp;#93;&lt;/span&gt;# cat /etc/modprobe.d/lustre.conf&lt;/p&gt;
&lt;ol&gt;
	&lt;li&gt;Lustre module configuration file&lt;br/&gt;
options lnet networks=&quot;o2ib2(ib0)&quot;&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@dtn1 ~&amp;#93;&lt;/span&gt;# lctl list_nids&lt;br/&gt;
10.174.81.1@o2ib2&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@dtn1 ~&amp;#93;&lt;/span&gt;# lctl ping 10.174.80.40@o2ib2&lt;br/&gt;
12345-0@lo&lt;br/&gt;
12345-10.174.31.241@o2ib&lt;br/&gt;
12345-10.174.79.241@o2ib1&lt;br/&gt;
12345-10.174.80.40@o2ib2&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@dtn1 ~&amp;#93;&lt;/span&gt;# lctl ping 10.174.80.41@o2ib2&lt;br/&gt;
12345-0@lo&lt;br/&gt;
12345-10.174.31.251@o2ib&lt;br/&gt;
12345-10.174.79.251@o2ib1&lt;br/&gt;
12345-10.174.80.41@o2ib2&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@dtn1 ~&amp;#93;&lt;/span&gt;# mount /mnt/lustre1&lt;br/&gt;
mount.lustre: mount 10.174.80.40@o2ib2:10.174.80.41@o2ib2:/scratch1 at /mnt/lustre1 failed: No such file or directory&lt;br/&gt;
Is the MGS specification correct?&lt;br/&gt;
Is the filesystem name correct?&lt;br/&gt;
If upgrading, is the copied client log valid? (see upgrade docs)&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@dtn1 ~&amp;#93;&lt;/span&gt;# cat /etc/fstab&lt;br/&gt;
10.174.80.40@o2ib2:10.174.80.41@o2ib2:/scratch1 /mnt/lustre1 lustre defaults,flock 0 0&lt;br/&gt;
10.174.80.42@o2ib2:10.174.80.43@o2ib2:/scratch2 /mnt/lustre2 lustre defaults,flock 0 0&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@dtn1 ~&amp;#93;&lt;/span&gt;# mount&lt;br/&gt;
10.174.80.42@o2ib2:10.174.80.43@o2ib2:/scratch2 on /mnt/lustre2 type lustre (rw,flock)&lt;/p&gt;

&lt;p&gt;Did I miss anything that you wanted to see?&lt;/p&gt;

</comment>
                            <comment id="23936" author="dnelson@ddn.com" created="Thu, 8 Dec 2011 16:10:42 +0000"  >&lt;p&gt;hold, on.  I just noticed that we have a broadcast address problem.&lt;/p&gt;</comment>
                            <comment id="23937" author="dnelson@ddn.com" created="Thu, 8 Dec 2011 16:34:16 +0000"  >&lt;p&gt;I am fixing the incorrect broadcast addresses.  I&apos;m not sure that will fix the issues but it is wrong and needs to be fixed.  I&apos;ll report back with new info after that is completed.&lt;/p&gt;</comment>
                            <comment id="24096" author="dnelson@ddn.com" created="Mon, 12 Dec 2011 12:37:52 +0000"  >&lt;p&gt;I believe this can be declared resolved by the writeconf.  I am curious though if anyone has any insight on what might have gone wrong.  We did not change any parameters, yet after the writeconf, it now works.  I still have a client connectivity issue being worked in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-899&quot; title=&quot;Client Connectivity Issues in Complex Lustre Environment&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-899&quot;&gt;&lt;del&gt;LU-899&lt;/del&gt;&lt;/a&gt; which may or may not be related.&lt;/p&gt;</comment>
                            <comment id="24106" author="cliffw" created="Mon, 12 Dec 2011 13:06:03 +0000"  >&lt;p&gt;As Johann said, we think there was an issue with the initial creation of the config log, recreation fixed it. We are also working on replication in the lab. &lt;br/&gt;
Closing&lt;/p&gt;</comment>
                            <comment id="24107" author="cliffw" created="Mon, 12 Dec 2011 13:06:36 +0000"  >&lt;p&gt;Recreating config logs with writeconf fixed the failover issue&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="10655" name="lustre-scratch1" size="1514210" author="dnelson@ddn.com" created="Wed, 7 Dec 2011 11:16:10 +0000"/>
                            <attachment id="10653" name="lustre-scratch1" size="845463" author="dnelson@ddn.com" created="Wed, 7 Dec 2011 11:03:41 +0000"/>
                            <attachment id="10652" name="lustre-scratch1" size="1514356" author="dnelson@ddn.com" created="Wed, 7 Dec 2011 08:56:28 +0000"/>
                            <attachment id="10651" name="lustre-scratch1" size="10181428" author="dnelson@ddn.com" created="Tue, 6 Dec 2011 14:28:55 +0000"/>
                            <attachment id="10660" name="lustre1_uuids.txt" size="142241" author="dnelson@ddn.com" created="Thu, 8 Dec 2011 10:45:50 +0000"/>
                            <attachment id="10661" name="lustre2_uuids.txt" size="355813" author="dnelson@ddn.com" created="Thu, 8 Dec 2011 10:45:50 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvho7:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>6512</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>