<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:47:11 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
<language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-4943] Client Fails to mount filesystem</title>
                <link>https://jira.whamcloud.com/browse/LU-4943</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Upgrading to ofed3.5 we have started to get random mount failures during client boot. The filesystem that failed to mount is random. Here it client side debug output. &lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;0000000:00000001:1.0:1398271322.986806:0:7677:0:(mgc_request.c:947:mgc_enqueue()) &lt;span class=&quot;code-object&quot;&gt;Process&lt;/span&gt; leaving (rc=18446744073709551611 : -5 : fffffffffffffffb)
10000000:01000000:1.0:1398271322.986808:0:7677:0:(mgc_request.c:1852:mgc_process_log()) Can&apos;t get cfg lock: -5
10000000:00000001:1.0:1398271322.986810:0:7677:0:(mgc_request.c:125:config_log_get()) &lt;span class=&quot;code-object&quot;&gt;Process&lt;/span&gt; entered
10000000:00000001:1.0:1398271322.986811:0:7677:0:(mgc_request.c:129:config_log_get()) &lt;span class=&quot;code-object&quot;&gt;Process&lt;/span&gt; leaving (rc=0 : 0 : 0)
10000000:00000001:1.0:1398271322.986813:0:7677:0:(mgc_request.c:1713:mgc_process_cfg_log()) &lt;span class=&quot;code-object&quot;&gt;Process&lt;/span&gt; entered
10000000:00000001:1.0:1398271322.986815:0:7677:0:(mgc_request.c:1774:mgc_process_cfg_log()) &lt;span class=&quot;code-object&quot;&gt;Process&lt;/span&gt; leaving via out_pop (rc=18446744073709551611 : -5 : 0xfffffffffffffffb)
10000000:00000001:1.0:1398271322.986818:0:7677:0:(mgc_request.c:1811:mgc_process_cfg_log()) &lt;span class=&quot;code-object&quot;&gt;Process&lt;/span&gt; leaving (rc=18446744073709551611 : -5 : fffffffffffffffb)
10000000:01000000:1.0:1398271322.986819:0:7677:0:(mgc_request.c:1871:mgc_process_log()) MGC10.151.25.171@o2ib: configuration from log &lt;span class=&quot;code-quote&quot;&gt;&apos;nbp3-client&apos;&lt;/span&gt; failed (-5).
10000000:00000001:1.0:1398271322.986822:0:7677:0:(mgc_request.c:1883:mgc_process_log()) &lt;span class=&quot;code-object&quot;&gt;Process&lt;/span&gt; leaving (rc=18446744073709551611 : -5 : fffffffffffffffb)
10000000:00000001:1.0:1398271322.986824:0:7677:0:(mgc_request.c:136:config_log_put()) &lt;span class=&quot;code-object&quot;&gt;Process&lt;/span&gt; entered
10000000:00000001:1.0:1398271322.986825:0:7677:0:(mgc_request.c:160:config_log_put()) &lt;span class=&quot;code-object&quot;&gt;Process&lt;/span&gt; leaving
10000000:00000001:1.0:1398271322.986826:0:7677:0:(mgc_request.c:1982:mgc_process_config()) &lt;span class=&quot;code-object&quot;&gt;Process&lt;/span&gt; leaving (rc=18446744073709551611 : -5 : fffffffffffffffb)
00000020:00000001:1.0:1398271322.986829:0:7677:0:(obd_class.h:714:obd_process_config()) &lt;span class=&quot;code-object&quot;&gt;Process&lt;/span&gt; leaving (rc=18446744073709551611 : -5 : fffffffffffffffb)
00000020:00000001:1.0:1398271322.986830:0:7677:0:(lustre_cfg.h:214:lustre_cfg_len()) &lt;span class=&quot;code-object&quot;&gt;Process&lt;/span&gt; entered
00000020:00000001:1.0:1398271322.986831:0:7677:0:(lustre_cfg.h:220:lustre_cfg_len()) &lt;span class=&quot;code-object&quot;&gt;Process&lt;/span&gt; leaving (rc=176 : 176 : b0)
00000020:00000001:1.0:1398271322.986833:0:7677:0:(lustre_cfg.h:259:lustre_cfg_free()) &lt;span class=&quot;code-object&quot;&gt;Process&lt;/span&gt; leaving
00000020:02020000:1.0:1398271322.986834:0:7677:0:(obd_mount.c:119:lustre_process_log()) 15c-8: MGC10.151.25.171@o2ib: The configuration from log &lt;span class=&quot;code-quote&quot;&gt;&apos;nbp3-client&apos;&lt;/span&gt; failed (-5). This may be the result of communication errors between &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; node and the MGS, a bad configuration, or other errors. See the syslog &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; more information.
00000020:00000001:1.0:1398271323.010020:0:7677:0:(obd_mount.c:122:lustre_process_log()) &lt;span class=&quot;code-object&quot;&gt;Process&lt;/span&gt; leaving (rc=18446744073709551611 : -5 : fffffffffffffffb)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Complete Debug output is attached&lt;/p&gt;</description>
                <environment>lustre-client-modules-2.4.1-6nasC OFED3.5&lt;br/&gt;
server lustre2.4.1 and 2.1.5 OFED1.5.4</environment>
        <key id="24353">LU-4943</key>
            <summary>Client Fails to mount filesystem</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="bobijam">Zhenyu Xu</assignee>
                                    <reporter username="mhanafi">Mahmoud Hanafi</reporter>
                        <labels>
                            <label>patch</label>
                    </labels>
                <created>Wed, 23 Apr 2014 18:33:15 +0000</created>
                <updated>Tue, 12 Apr 2016 07:50:02 +0000</updated>
                            <resolved>Tue, 4 Nov 2014 04:18:45 +0000</resolved>
                                    <version>Lustre 2.4.1</version>
                                    <fixVersion>Lustre 2.7.0</fixVersion>
                    <fixVersion>Lustre 2.5.4</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>12</watches>
                                                                            <comments>
                            <comment id="82341" author="pjones" created="Wed, 23 Apr 2014 23:24:58 +0000"  >&lt;p&gt;Amir is looking into this issue&lt;/p&gt;</comment>
                            <comment id="82344" author="ashehata" created="Thu, 24 Apr 2014 00:59:06 +0000"  >&lt;p&gt;From the logs:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;00000100:00020000:1.0:1398271322.963863:0:7677:0:(client.c:1052:ptlrpc_import_delay_req()) @@@ send limit expired   req@ffff8807f6f84400 x1466193689968652/t0(0) o101-&amp;gt;MGC10.151.25.171@o2ib@10.151.25.171@o2ib:26/25 lens 328/344 e 0 to 0 dl 0 ref 2 fl Rpc:W/0/ffffffff rc 0/-1

This indicates that requests being sent from the client are queued, but never sent within a specified time limit.  Thus -5 (-EIO) is returned.

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Looking further back in the log&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;00000800:00000100:0.0:1398271416.231812:0:7154:0:(o2iblnd_cb.c:2844:kiblnd_cm_callback()) 10.151.25.171@o2ib: ADDR ERROR -110
00000800:00000100:0.0:1398271416.231822:0:7154:0:(o2iblnd_cb.c:2072:kiblnd_peer_connect_failed()) Deleting messages &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 10.151.25.171@o2ib: connection failed
...
00000100:00080000:0.0:1398271370.987478:0:7628:0:(&lt;span class=&quot;code-keyword&quot;&gt;import&lt;/span&gt;.c:1187:ptlrpc_connect_interpret()) recovery of MGS on MGC10.151.25.171@o2ib_0 failed (-110)

This seems to indicate that the connection between client and server is timing out, -110 (-ETIMEDOUT)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Since clients are using OFED3.5 and servers are using OFED 1.5.4, could there be incompatibility between the versions?&lt;/p&gt;</comment>
                            <comment id="82396" author="mhanafi" created="Thu, 24 Apr 2014 15:21:32 +0000"  >&lt;p&gt;Agreed this could be an issue between ofed3.5 and ofed1.5.4. &lt;/p&gt;

&lt;p&gt;But there is a problem with how the failure is handled by lustre. When the filesystem failed to mount the mgc is put into a Stale state, which prevents any further mount attempts. The only way to recover from that is unmounting all the filesystems to clear the Stale mgc, then remounting works.&lt;/p&gt;
</comment>
                            <comment id="82443" author="pjones" created="Thu, 24 Apr 2014 22:55:32 +0000"  >&lt;p&gt;Bobijam&lt;/p&gt;

&lt;p&gt;Could you please advise on this one?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="82470" author="bobijam" created="Fri, 25 Apr 2014 12:46:32 +0000"  >&lt;p&gt;What is the address of the MGS? Is 10.151.25.172@o2ib a valid server NID?&lt;/p&gt;</comment>
                            <comment id="82510" author="mhanafi" created="Fri, 25 Apr 2014 17:42:00 +0000"  >&lt;p&gt;10.151.25.172? the MGS nid is 10.151.25.171@o2ib. it is a valid MGS server NID address.&lt;/p&gt;</comment>
                            <comment id="82514" author="jaylan" created="Fri, 25 Apr 2014 17:53:10 +0000"  >&lt;p&gt;10.151.25.172@o2ib is also a valid NID. It is a client.&lt;/p&gt;</comment>
                            <comment id="82574" author="bobijam" created="Mon, 28 Apr 2014 02:03:56 +0000"  >&lt;p&gt;From the log I can see that client nbp3-client cannot mount because of the first lustre_fill_super() failed for -5 (I could not find out the root cause of this failure, only know that the client MGC cannot get its lock from MGS to process MGS llogs), and this mount process has registered the mgc device, and later mount fails because of this existing mgc device attached. &lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;$ grep lustre_fill_super r403i1n1.1398271435.out 
18:00000020:00000001:0.0:1398271265.936424:0:7677:0:(obd_mount.c:1223:lustre_fill_super()) Process entered
19:00000020:01200004:0.0:1398271265.936425:0:7677:0:(obd_mount.c:1225:lustre_fill_super()) VFS Op: sb ffff8807f6f9e000
29:00000020:01000004:0.0:1398271265.936437:0:7677:0:(obd_mount.c:1250:lustre_fill_super()) Mounting client nbp3-client
37282:00000020:00000001:1.0:1398271370.987877:0:7677:0:(obd_mount.c:1285:lustre_fill_super()) Process leaving via out (rc=18446744073709551611 : -5 : 0xfffffffffffffffb)
37283:00000020:00020000:1.0:1398271370.987879:0:7677:0:(obd_mount.c:1289:lustre_fill_super()) Unable to mount  (-5)
38596:00000020:00000001:0.0:1398271375.047962:0:7703:0:(obd_mount.c:1223:lustre_fill_super()) Process entered
38597:00000020:01200004:0.0:1398271375.047964:0:7703:0:(obd_mount.c:1225:lustre_fill_super()) VFS Op: sb ffff880822c25c00
38607:00000020:01000004:0.0:1398271375.047985:0:7703:0:(obd_mount.c:1250:lustre_fill_super()) Mounting client nbp3-client
38656:00000020:00000001:0.0:1398271375.080789:0:7703:0:(obd_mount.c:1261:lustre_fill_super()) Process leaving via out (rc=18446744073709551599 : -17 : 0xffffffffffffffef)
38657:00000020:00020000:0.0:1398271375.080790:0:7703:0:(obd_mount.c:1289:lustre_fill_super()) Unable to mount  (-17)
41938:00000020:00000001:0.0:1398271385.134777:0:7706:0:(obd_mount.c:1223:lustre_fill_super()) Process entered
41939:00000020:01200004:0.0:1398271385.134780:0:7706:0:(obd_mount.c:1225:lustre_fill_super()) VFS Op: sb ffff8807eaaa8c00
41949:00000020:01000004:0.0:1398271385.134800:0:7706:0:(obd_mount.c:1250:lustre_fill_super()) Mounting client nbp3-client
41998:00000020:00000001:0.0:1398271385.167599:0:7706:0:(obd_mount.c:1261:lustre_fill_super()) Process leaving via out (rc=18446744073709551599 : -17 : 0xffffffffffffffef)
41999:00000020:00020000:0.0:1398271385.167601:0:7706:0:(obd_mount.c:1289:lustre_fill_super()) Unable to mount  (-17)
48640:00000020:00000001:0.0:1398271405.222478:0:7709:0:(obd_mount.c:1223:lustre_fill_super()) Process entered
48641:00000020:01200004:0.0:1398271405.222480:0:7709:0:(obd_mount.c:1225:lustre_fill_super()) VFS Op: sb ffff88083c939400
48651:00000020:01000004:0.0:1398271405.222501:0:7709:0:(obd_mount.c:1250:lustre_fill_super()) Mounting client nbp3-client
48700:00000020:00000001:0.0:1398271405.255289:0:7709:0:(obd_mount.c:1261:lustre_fill_super()) Process leaving via out (rc=18446744073709551599 : -17 : 0xffffffffffffffef)
48701:00000020:00020000:0.0:1398271405.255291:0:7709:0:(obd_mount.c:1289:lustre_fill_super()) Unable to mount  (-17)
58847:00000020:00000001:0.0:1398271435.314276:0:7712:0:(obd_mount.c:1223:lustre_fill_super()) Process entered
58848:00000020:01200004:0.0:1398271435.314278:0:7712:0:(obd_mount.c:1225:lustre_fill_super()) VFS Op: sb ffff8807f6f76800
58858:00000020:01000004:0.0:1398271435.314300:0:7712:0:(obd_mount.c:1250:lustre_fill_super()) Mounting client nbp3-client
85984:00000020:00000001:1.0:1398271435.639442:0:7712:0:(obd_mount.c:1285:lustre_fill_super()) Process leaving via out (rc=0 : 0 : 0x0)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;We should handle lustre_fill_super() error path to release this device on error. And you can work around this issue to bring down this client (rmmod all lustre modules) then start lustre modules again and mount this client. I think you don&apos;t need to bring down the whole system.&lt;/p&gt;</comment>
                            <comment id="82575" author="bobijam" created="Mon, 28 Apr 2014 02:22:30 +0000"  >&lt;p&gt;b2_4 patch tracking at &lt;a href=&quot;http://review.whamcloud.com/10127&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/10127&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="82621" author="bobijam" created="Mon, 28 Apr 2014 15:52:14 +0000"  >&lt;p&gt;master patch tracking at &lt;a href=&quot;http://review.whamcloud.com/10129&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/10129&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="82723" author="parinay" created="Tue, 29 Apr 2014 08:10:12 +0000"  >&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;To test the patch for b2_4 ( Lustre version: 2.4.3-g35d73cf-CHANGED-2.6.32-358.23.2.el6.x86_64) , I tried to reproduce the issue with the help of &quot;lctl set_param fail_val=120 ; lctl set_param fail_loc=0x8000050&quot;.  This is to introduce the &quot;first lustre_fill_super() failed for -5&quot;&lt;/li&gt;
	&lt;li&gt;The setup is, 2 MDS/MDT, 2 OSS/OST, 1 client, mounting the two file systems ( namely lustre, lustre1)&lt;/li&gt;
	&lt;li&gt;The first file system lustre is mounted. For the second file system, lustre1, before executing &quot;mount&quot;, I set fail_val and fail_loc as mentioned above.&lt;/li&gt;
	&lt;li&gt;what happens thereafter, is through the logs below,&lt;/li&gt;
&lt;/ul&gt;


&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Apr 28 23:49:09 osh-1 kernel: Lustre: Mounted lustre-client
Apr 28 23:49:35 osh-1 kernel: LustreError: 29991:0:(fail.c:133:__cfs_fail_timeout_set()) cfs_fail_timeout id 50c sleeping for 120000ms
Apr 28 23:49:41 osh-1 kernel: LustreError: 18261:0:(client.c:1052:ptlrpc_import_delay_req()) @@@ send limit expired   req@ffff8800181b2400 x1466653941460808/t0(0) o101-&amp;gt;MGC192.168.177.145@tcp@192.168.177.145@tcp:26/25 lens 328/344 e 0 to 0 dl 0 ref 2 fl Rpc:W/0/ffffffff rc 0/-1
Apr 28 23:50:32 osh-1 kernel: LustreError: 18261:0:(client.c:1052:ptlrpc_import_delay_req()) @@@ send limit expired   req@ffff8800181b2400 x1466653941460812/t0(0) o101-&amp;gt;MGC192.168.177.145@tcp@192.168.177.145@tcp:26/25 lens 328/344 e 0 to 0 dl 0 ref 2 fl Rpc:W/0/ffffffff rc 0/-1
Apr 28 23:50:32 osh-1 kernel: LustreError: 15c-8: MGC192.168.177.145@tcp: The configuration from log &apos;lustre1-client&apos; failed (-5). This may be the result of communication errors between this node and the MGS, a bad configuration, or other errors. See the syslog for more information.
Apr 28 23:50:32 osh-1 kernel: LustreError: 18261:0:(llite_lib.c:1043:ll_fill_super()) Unable to process log: -5
Apr 28 23:50:32 osh-1 kernel: Lustre: Unmounted lustre1-client
Apr 28 23:51:35 osh-1 kernel: LustreError: 29991:0:(fail.c:137:__cfs_fail_timeout_set()) cfs_fail_timeout id 50c awake
Apr 28 23:51:35 osh-1 kernel: LustreError: 18261:0:(obd_mount.c:1293:lustre_fill_super()) Unable to mount  (-5)
Apr 28 23:51:36 osh-1 kernel: LustreError: 18278:0:(genops.c:320:class_newdev()) Device MGC192.168.177.145@tcp already exists at 6, won&apos;t add
Apr 28 23:51:36 osh-1 kernel: LustreError: 18278:0:(obd_config.c:374:class_attach()) Cannot create device MGC192.168.177.145@tcp of type mgc : -17
Apr 28 23:51:36 osh-1 kernel: LustreError: 18278:0:(obd_mount.c:196:lustre_start_simple()) MGC192.168.177.145@tcp attach error -17
Apr 28 23:51:36 osh-1 kernel: LustreError: 18278:0:(obd_mount.c:1293:lustre_fill_super()) Unable to mount  (-17)
Apr 28 23:51:38 osh-1 kernel: LustreError: 18280:0:(genops.c:320:class_newdev()) Device MGC192.168.177.145@tcp already exists at 6, won&apos;t add
Apr 28 23:51:38 osh-1 kernel: LustreError: 18280:0:(obd_config.c:374:class_attach()) Cannot create device MGC192.168.177.145@tcp of type mgc : -17
Apr 28 23:51:38 osh-1 kernel: LustreError: 18280:0:(obd_mount.c:196:lustre_start_simple()) MGC192.168.177.145@tcp attach error -17
Apr 28 23:51:38 osh-1 kernel: LustreError: 18280:0:(obd_mount.c:1293:lustre_fill_super()) Unable to mount  (-17)
Apr 28 23:51:39 osh-1 kernel: LustreError: 18282:0:(genops.c:320:class_newdev()) Device MGC192.168.177.145@tcp already exists at 6, won&apos;t add
Apr 28 23:51:39 osh-1 kernel: LustreError: 18282:0:(obd_config.c:374:class_attach()) Cannot create device MGC192.168.177.145@tcp of type mgc : -17
Apr 28 23:51:39 osh-1 kernel: LustreError: 18282:0:(obd_mount.c:196:lustre_start_simple()) MGC192.168.177.145@tcp attach error -17
Apr 28 23:51:39 osh-1 kernel: LustreError: 18282:0:(obd_mount.c:1293:lustre_fill_super()) Unable to mount  (-17)
Apr 28 23:51:41 osh-1 kernel: LustreError: 18284:0:(genops.c:320:class_newdev()) Device MGC192.168.177.145@tcp already exists at 6, won&apos;t add
Apr 28 23:51:41 osh-1 kernel: LustreError: 18284:0:(obd_config.c:374:class_attach()) Cannot create device MGC192.168.177.145@tcp of type mgc : -17
Apr 28 23:51:41 osh-1 kernel: LustreError: 18284:0:(obd_mount.c:196:lustre_start_simple()) MGC192.168.177.145@tcp attach error -17
Apr 28 23:51:48 osh-1 kernel: Lustre: Mounted lustre1-client
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;It seems the issue isn&apos;t fixed with patch &lt;a href=&quot;http://review.whamcloud.com/#/c/10127/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/10127/&lt;/a&gt;. But feel free to correct me, if I am wrong or the way I am trying to reproduce the problem isn&apos;t right.&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;Thanks&lt;/p&gt;</comment>
                            <comment id="82730" author="bobijam" created="Tue, 29 Apr 2014 12:14:48 +0000"  >&lt;p&gt;updated the patch&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedHeader panelHeader&quot; style=&quot;border-bottom-width: 1px;&quot;&gt;&lt;b&gt;commit message&lt;/b&gt;&lt;/div&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;    LU-4943 obdclass: detach MGC dev on error
    
    lustre_start_mgc() creates MGC device, if error happens later on
    ll_fill_super(), this device is still attached, and later mount
    fails by keep complaining that the MGC device&apos;s already in the
    client node.
    
    It turns out that the device was referenced by mgc config llog data
    which is arranged in the mgc lock requeue threadn for re-trying to
    get mgc lock, and in normal case, this llog reference only released
    in mgc_blocking_ast() when the system is umount.
    
    This patch handles the error path when mgc connect fails, it releases
    the config llog data reference in config_log_end() since there will
    be no mgc_blocking_ast() happen in this case.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="83279" author="parinay" created="Tue, 6 May 2014 11:14:38 +0000"  >&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;I tried latest patch set (6). The logs,
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt; 66 10000000:01000000:0.0:1399359209.816771:0:9520:0:(mgc_request.c:1864:mgc_process_log()) Can&apos;t get cfg lock: -5
 67 10000000:01000000:0.0:1399359209.816775:0:9520:0:(mgc_request.c:1883:mgc_process_log()) MGC192.168.177.145@tcp: configuration from log &apos;lustre1-client&apos; failed (-5).
 68 00000020:02020000:0.0:1399359209.816779:0:9520:0:(obd_mount.c:119:lustre_process_log()) 15c-8: MGC192.168.177.145@tcp: The configuration from log &apos;lustre1-client&apos; failed (-5). This may be the result of commun    ication errors between this node and the MGS, a bad configuration, or other errors. See the syslog for more information.
 69 00000080:00020000:0.0:1399359209.816787:0:9520:0:(llite_lib.c:1043:ll_fill_super()) Unable to process log: -5
 70 10000000:01000000:0.0:1399359209.816800:0:9520:0:(mgc_request.c:147:config_log_put()) dropping config log lustre1-cliir
 71 10000000:01000000:0.0:1399359209.816802:0:9520:0:(mgc_request.c:147:config_log_put()) dropping config log lustre1-client
 72 10000000:01000000:0.0:1399359209.816803:0:9520:0:(mgc_request.c:419:config_log_end()) end config log lustre1-client (0)
 73 00000080:02000400:0.0:1399359209.816921:0:9520:0:(llite_lib.c:1132:ll_put_super()) Unmounted lustre1-client
 74 00000020:01000000:0.0:1399359209.816924:0:9520:0:(obd_config.c:911:class_del_profile()) Del profile lustre1-client
 75 00000020:01000004:0.0:1399359209.817022:0:9520:0:(obd_mount.c:785:lustre_common_put_super()) dropping sb ffff88001bfea800
 76 00000100:00080000:0.0:1399359209.817026:0:9520:0:(pinger.c:499:ptlrpc_pinger_del_import()) removing pingable import 547e172a-7802-ef66-bd18-f473852ac6b1-&amp;gt;MGS
 77 00010000:00080000:0.0:1399359227.816973:0:9416:0:(ldlm_request.c:1311:ldlm_cli_update_pool()) @@@ Zero SLV or Limit found (SLV: 0, Limit: 25600)  req@ffff88000ee7ac00 x1467333992251936/t0(0) o400-&amp;gt;lustre-OST0    001-osc-ffff880019eaac00@192.168.177.129@tcp:28/4 lens 224/192 e 0 to 0 dl 1399359234 ref 1 fl Rpc:RN/0/0 rc 0/0
 78 00010000:00080000:0.0:1399359227.816992:0:9416:0:(ldlm_request.c:1311:ldlm_cli_update_pool()) @@@ Zero SLV or Limit found (SLV: 0, Limit: 25600)  req@ffff88000ee7a800 x1467333992251932/t0(0) o400-&amp;gt;lustre-OST0    000-osc-ffff880019eaac00@192.168.177.129@tcp:28/4 lens 224/192 e 0 to 0 dl 1399359234 ref 1 fl Rpc:RN/0/0 rc 0/0
 79 00010000:00080000:0.0:1399359252.816299:0:9416:0:(ldlm_request.c:1311:ldlm_cli_update_pool()) @@@ Zero SLV or Limit found (SLV: 0, Limit: 25600)  req@ffff880006505800 x1467333992251952/t0(0) o400-&amp;gt;lustre-OST0    001-osc-ffff880019eaac00@192.168.177.129@tcp:28/4 lens 224/192 e 0 to 0 dl 1399359259 ref 1 fl Rpc:RN/0/0 rc 0/0
 80 00010000:00080000:0.0:1399359252.816315:0:9416:0:(ldlm_request.c:1311:ldlm_cli_update_pool()) @@@ Zero SLV or Limit found (SLV: 0, Limit: 25600)  req@ffff88000ee7a800 x1467333992251948/t0(0) o400-&amp;gt;lustre-OST0    000-osc-ffff880019eaac00@192.168.177.129@tcp:28/4 lens 224/192 e 0 to 0 dl 1399359259 ref 1 fl Rpc:RN/0/0 rc 0/0
 81 00000001:00020000:0.0:1399359272.816677:0:9414:0:(fail.c:137:__cfs_fail_timeout_set()) cfs_fail_timeout id 50c awake
 82 00000100:00080000:0.0:1399359272.816729:0:9414:0:(import.c:816:ptlrpc_connect_interpret()) MGC192.168.177.145@tcp: connect to target with instance 0
 83 10000000:01000000:0.0:1399359272.816738:0:9414:0:(mgc_request.c:1160:mgc_import_event()) import event 0x808005
 84 00000100:00080000:0.0:1399359272.816741:0:9414:0:(import.c:871:ptlrpc_connect_interpret()) ffff88001d374800 MGS: changing import state from CONNECTING to FULL
 85 10000000:01000000:0.0:1399359272.816745:0:9414:0:(mgc_request.c:1160:mgc_import_event()) import event 0x808004
 86 00000100:00080000:0.0:1399359272.816756:0:9414:0:(pinger.c:239:ptlrpc_pinger_ir_up()) IR up
 87 00000100:00080000:0.0:1399359272.816763:0:9414:0:(import.c:1106:ptlrpc_connect_interpret()) MGC192.168.177.145@tcp: Resetting ns_connect_flags to server flags: 0x11005000020
 88 00000100:00080000:0.0:1399359272.816822:0:9520:0:(import.c:1482:ptlrpc_disconnect_import()) ffff88001d374800 MGS: changing import state from FULL to CONNECTING
 89 00000100:00080000:0.0:1399359272.817942:0:9520:0:(import.c:1495:ptlrpc_disconnect_import()) ffff88001d374800 MGS: changing import state from CONNECTING to CLOSED
 90 00000100:00080000:0.0:1399359272.817948:0:9520:0:(import.c:204:ptlrpc_deactivate_and_unlock_import()) setting import MGS INVALID
 91 10000000:01000000:0.0:1399359272.817951:0:9520:0:(mgc_request.c:1160:mgc_import_event()) import event 0x808002
 92 10000000:01000000:0.0:1399359272.817954:0:9520:0:(mgc_request.c:1160:mgc_import_event()) import event 0x808003
 93 00000020:00000080:0.0:1399359272.817961:0:9520:0:(genops.c:1225:class_disconnect()) disconnect: cookie 0xa9a320b358ce08f3
 94 00000020:00000080:0.0:1399359272.817965:0:9520:0:(genops.c:825:class_export_put()) final put ffff880019ec0000/547e172a-7802-ef66-bd18-f473852ac6b1
 95 00000020:01000000:0.0:1399359272.817972:0:9520:0:(obd_config.c:1727:class_manual_cleanup()) Manual cleanup of MGC192.168.177.145@tcp (flags=&apos;&apos;)
 96 00000020:00000080:0.0:1399359272.817977:0:9520:0:(obd_config.c:1068:class_process_config()) processing cmd: cf004
 97 00000020:00000080:0.0:1399359272.818061:0:9520:0:(obd_config.c:674:class_cleanup()) MGC192.168.177.145@tcp: forcing exports to disconnect: 2
 98 00000020:00080000:0.0:1399359272.818065:0:9520:0:(genops.c:1541:print_export_data()) MGC192.168.177.145@tcp: ACTIVE ffff880019eaa000 547e172a-7802-ef66-bd18-f473852ac6b1 (no nid) 3 (0 0 0) 0 0 0 0: (null)  0
 99 00000020:00080000:0.0:1399359272.818073:0:9520:0:(genops.c:1541:print_export_data()) MGC192.168.177.145@tcp: ZOMBIE ffff880019ec0000 547e172a-7802-ef66-bd18-f473852ac6b1 192.168.177.145@tcp 0 (0 0 0) 1 0 0 0:     (null)  0
100 00000020:00080000:0.0:1399359272.818077:0:9520:0:(genops.c:1314:class_disconnect_exports()) OBD device 6 (ffff88001b13e3b8) has exports, disconnecting them
101 00000020:00080000:0.0:1399359272.818080:0:9520:0:(genops.c:1277:class_disconnect_export_list()) exp ffff880019eaa000 export uuid == obd uuid, don&apos;t discon
102 10000000:01000000:0.0:1399359272.818083:0:9520:0:(obd_class.h:673:obd_cleanup_client_import()) MGC192.168.177.145@tcp: client import never connected
103 10000000:01000000:0.0:1399359272.818085:0:9520:0:(mgc_request.c:1160:mgc_import_event()) import event 0x808003
104 00000020:00000080:0.0:1399359272.818107:0:9520:0:(obd_config.c:1068:class_process_config()) processing cmd: cf002
105 00000020:00000080:0.0:1399359272.818109:0:9520:0:(obd_config.c:599:class_detach()) detach on obd MGC192.168.177.145@tcp (uuid 547e172a-7802-ef66-bd18-f473852ac6b1)
106 00000020:00000080:0.0:1399359272.818115:0:9520:0:(obd_config.c:1068:class_process_config()) processing cmd: cf006
107 00000020:00000080:0.0:1399359272.818117:0:9520:0:(obd_config.c:1087:class_process_config()) removing mappings for uuid MGC192.168.177.145@tcp_0
108 00000020:01000004:0.0:1399359272.818121:0:9520:0:(obd_mount.c:654:lustre_put_lsi()) put ffff88001bfea800 1
109 00000020:01000004:0.0:1399359272.818123:0:9520:0:(obd_mount.c:604:lustre_free_lsi()) Freeing lsi ffff88001b314800
110 00000020:00020000:0.0:1399359272.818133:0:9520:0:(obd_mount.c:1289:lustre_fill_super()) Unable to mount  (-5)
111 00000020:00000080:0.0:1399359272.818509:0:9402:0:(genops.c:955:class_import_destroy()) destroying import ffff88001d374800 for MGC192.168.177.145@tcp
112 00000020:00000080:0.0:1399359272.818514:0:9402:0:(genops.c:779:class_export_destroy()) destroying export ffff880019ec0000/547e172a-7802-ef66-bd18-f473852ac6b1 for MGC192.168.177.145@tcp
113 00000020:01200004:0.0:1399359274.021599:0:9559:0:(obd_mount.c:1225:lustre_fill_super()) VFS Op: sb ffff88001e3db800
114 00000020:01000004:0.0:1399359274.021606:0:9559:0:(obd_mount.c:809:lmd_print())   mount data:
115 00000020:01000004:0.0:1399359274.021606:0:9559:0:(obd_mount.c:811:lmd_print()) profile: lustre1-client
116 00000020:01000004:0.0:1399359274.021606:0:9559:0:(obd_mount.c:812:lmd_print()) device:  192.168.177.145@tcp:/lustre1
117 00000020:01000004:0.0:1399359274.021607:0:9559:0:(obd_mount.c:813:lmd_print()) flags:   2
118 00000020:01000004:0.0:1399359274.021607:0:9559:0:(obd_mount.c:1250:lustre_fill_super()) Mounting client lustre1-client
119 00000020:01000004:0.0:1399359274.021615:0:9559:0:(obd_mount.c:333:lustre_start_mgc()) Start MGC &apos;MGC192.168.177.145@tcp&apos;
120 00000020:00000080:0.0:1399359274.021617:0:9559:0:(obd_config.c:1068:class_process_config()) processing cmd: cf005
121 00000020:00000080:0.0:1399359274.021619:0:9559:0:(obd_config.c:1079:class_process_config()) adding mapping from uuid MGC192.168.177.145@tcp_0 to nid 0x20000c0a8b191 (192.168.177.145@tcp)
122 00000020:01000004:0.0:1399359274.021633:0:9559:0:(obd_mount.c:192:lustre_start_simple()) Starting obd MGC192.168.177.145@tcp (typ=mgc)
123 00000020:00000080:0.0:1399359274.021634:0:9559:0:(obd_config.c:1068:class_process_config()) processing cmd: cf001
124 00000020:00000080:0.0:1399359274.021635:0:9559:0:(obd_config.c:366:class_attach()) attach type mgc name: MGC192.168.177.145@tcp uuid: 36f7e595-bc95-0a5c-6291-bb650109f125
125 00000020:00020000:0.0:1399359274.021660:0:9559:0:(genops.c:320:class_newdev()) Device MGC192.168.177.145@tcp already exists at 6, won&apos;t add
126 00000020:00020000:0.0:1399359274.021666:0:9559:0:(obd_config.c:374:class_attach()) Cannot create device MGC192.168.177.145@tcp of type mgc : -17
127 00000020:00020000:0.0:1399359274.021668:0:9559:0:(obd_mount.c:196:lustre_start_simple()) MGC192.168.177.145@tcp attach error -17
128 00000020:01000004:0.0:1399359274.021671:0:9559:0:(obd_mount.c:654:lustre_put_lsi()) put ffff88001e3db800 1
129 00000020:01000004:0.0:1399359274.021673:0:9559:0:(obd_mount.c:604:lustre_free_lsi()) Freeing lsi ffff88001bfea800
130 00000020:00020000:0.0:1399359274.021674:0:9559:0:(obd_mount.c:1289:lustre_fill_super()) Unable to mount  (-17)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;&lt;/li&gt;
	&lt;li&gt;messages
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;LustreError: 9414:0:(fail.c:133:__cfs_fail_timeout_set()) cfs_fail_timeout id 50c sleeping for 120000ms
LustreError: 9520:0:(client.c:1052:ptlrpc_import_delay_req()) @@@ send limit expired   req@ffff880006505800 x1467333992251884/t0(0) o101-&amp;gt;MGC192.168.177.145@tcp@192.168.177.145@tcp:26/25 lens 328/344 e 0 to 0 dl 0 ref 2 fl Rpc:W/0/ffffffff rc 0/-1
LustreError: 9520:0:(client.c:1052:ptlrpc_import_delay_req()) @@@ send limit expired   req@ffff880006505800 x1467333992251888/t0(0) o101-&amp;gt;MGC192.168.177.145@tcp@192.168.177.145@tcp:26/25 lens 328/344 e 0 to 0 dl 0 ref 2 fl Rpc:W/0/ffffffff rc 0/-1
LustreError: 15c-8: MGC192.168.177.145@tcp: The configuration from log &apos;lustre1-client&apos; failed (-5). This may be the result of communication errors between this node and the MGS, a bad configuration, or other errors. See the syslog for more information.
LustreError: 9520:0:(llite_lib.c:1043:ll_fill_super()) Unable to process log: -5
Lustre: Unmounted lustre1-client
LustreError: 9414:0:(fail.c:137:__cfs_fail_timeout_set()) cfs_fail_timeout id 50c awake
LustreError: 9520:0:(obd_mount.c:1289:lustre_fill_super()) Unable to mount  (-5)
LustreError: 9559:0:(genops.c:320:class_newdev()) Device MGC192.168.177.145@tcp already exists at 6, won&apos;t add
LustreError: 9559:0:(obd_config.c:374:class_attach()) Cannot create device MGC192.168.177.145@tcp of type mgc : -17
LustreError: 9559:0:(obd_mount.c:196:lustre_start_simple()) MGC192.168.177.145@tcp attach error -17
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;&lt;/li&gt;
&lt;/ul&gt;
</comment>
                            <comment id="83289" author="bobijam" created="Tue, 6 May 2014 13:19:17 +0000"  >&lt;p&gt;would you please collect -1 log and upload here? I&apos;ve tried the patch with your reproduction procedure and it works.&lt;/p&gt;</comment>
                            <comment id="83304" author="parinay" created="Tue, 6 May 2014 15:27:13 +0000"  >&lt;p&gt;attached are the logs requested. Let me know, if I am missing something.&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;</comment>
                            <comment id="83324" author="bobijam" created="Tue, 6 May 2014 17:09:12 +0000"  >&lt;p&gt;in the -1 log&lt;br/&gt;
at line 8711 the mgc device reference count is 1&lt;/p&gt;
&lt;blockquote&gt;&lt;p&gt;00000020:00000040:0.0:1399389218.949028:0:4386:0:(obd_config.c:733:class_decref()) Decref MGC192.168.177.145@tcp (ffff88001a5861b8) now 1&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;and not until line 9810 does its reference count become 0, after the lustre1-sptlrpc llog finally quits from its mgc requeue thread&lt;/p&gt;
&lt;blockquote&gt;&lt;p&gt;00000020:00000040:0.0:1399389225.336730:0:4386:0:(obd_config.c:733:class_decref()) Decref MGC192.168.177.145@tcp (ffff88001a5861b8) now 0&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;and all the lustre_start_mgc() between them returns -17 since the llog request has reference of the mgc device.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;$ grep -n &quot;lustre_start_mgc().*-17&quot; lu-4943.log
8825:00000020:00000001:0.0:1399389220.091202:0:4506:0:(obd_mount.c:404:lustre_start_mgc()) Process leaving via out_free (rc=18446744073709551599 : -17 : 0xffffffffffffffef)
8829:00000020:00000001:0.0:1399389220.091209:0:4506:0:(obd_mount.c:495:lustre_start_mgc()) Process leaving (rc=18446744073709551599 : -17 : ffffffffffffffef)
8917:00000020:00000001:0.0:1399389220.794620:0:4508:0:(obd_mount.c:404:lustre_start_mgc()) Process leaving via out_free (rc=18446744073709551599 : -17 : 0xffffffffffffffef)
8921:00000020:00000001:0.0:1399389220.794622:0:4508:0:(obd_mount.c:495:lustre_start_mgc()) Process leaving (rc=18446744073709551599 : -17 : ffffffffffffffef)
9041:00000020:00000001:0.0:1399389221.706351:0:4510:0:(obd_mount.c:404:lustre_start_mgc()) Process leaving via out_free (rc=18446744073709551599 : -17 : 0xffffffffffffffef)
9045:00000020:00000001:0.0:1399389221.706357:0:4510:0:(obd_mount.c:495:lustre_start_mgc()) Process leaving (rc=18446744073709551599 : -17 : ffffffffffffffef)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt; 

&lt;p&gt;So I think the patch should work, we just need to wait longer.&lt;/p&gt;</comment>
                            <comment id="83382" author="parinay" created="Wed, 7 May 2014 09:50:59 +0000"  >&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;Second mount works after wait.&lt;/li&gt;
	&lt;li&gt;Do you think,  large number of lustre clients, would affect this wait? (IMO increasing it,as refcount would be more)&lt;/li&gt;
	&lt;li&gt;Can we change any timeouts to control/reduce this wait time ?&lt;br/&gt;
e.g.  
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;484 #define MGC_TIMEOUT_MIN_SECONDS   5
485 #define MGC_TIMEOUT_RAND_CENTISEC 0x1ff /* ~500 */
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;Please feel free to correct me, if wrong.&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;</comment>
                            <comment id="83383" author="bobijam" created="Wed, 7 May 2014 10:38:12 +0000"  >&lt;blockquote&gt;&lt;p&gt;Do you think, large number of lustre clients, would affect this wait? (IMO increasing it,as refcount would be more)&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;Basically no, later lustre clients only increase the mgc&apos;s obd::u::cli::cl_mgc_refcount, which would be decreased when they fail to mount by calling lustre_common_put_super()-&amp;gt;lustre_stop_mgc(), while it does not affect the cld in the requeue thread. But of course unless these large number of lustre clients error-out take too much time it sure will delay the mgc device cleanup.&lt;/p&gt;

&lt;p&gt;I think shorten these mgc timeout values can reduce the wait time, but this really is a rare case and in the case the situation happens, we can umount lustre related devices and unload all lustre modules to expedite the cleanup and after fixing the issue preventing the mgc from connecting, we can restart the mount procedure again.&lt;/p&gt;</comment>
                            <comment id="83646" author="haasken" created="Fri, 9 May 2014 17:21:37 +0000"  >&lt;p&gt;I have tested patch-set 7 of the master version of the patch: &lt;a href=&quot;http://review.whamcloud.com/#/c/10129/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/10129/&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Using Parinay&apos;s reproducer steps, it does not seem to resolve the issue.  I have uploaded &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4943&quot; title=&quot;Client Failes to mount filesystem&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4943&quot;&gt;&lt;del&gt;LU-4943&lt;/del&gt;&lt;/a&gt;-mgc-cleanup.tar.gz, which contains the test script and the full dk logs.&lt;/p&gt;

&lt;p&gt;Note that after I unmounted the first file system, the mgc from the first failed mount of the second file system was cleaned up.&lt;/p&gt;</comment>
                            <comment id="83798" author="bobijam" created="Mon, 12 May 2014 03:46:14 +0000"  >&lt;p&gt;The explanation of this phenomenon (until umount the first client got the 2nd mgc released) is that mgc requeue thread is for getting updated MGS config logs if necessary, if the whole system config is still without any config data change, the mgc requeue thread will not be woken, so the 2nd mgc llog data will stay in the thread. &lt;/p&gt;</comment>

&lt;p&gt;In your test, the umount of the 1st client stirs up the client&apos;s mgc requeue thread which makes the 2nd mgc release, you can also simply make the stirring by &quot;lctl set_param -P debug=$olddebug&quot;&lt;/p&gt;</comment>
                            <comment id="83902" author="haasken" created="Mon, 12 May 2014 18:37:40 +0000"  >&lt;p&gt;So this command (or another command affecting config data) has to be run on the MGS of the mounted file system, is that right?&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lctl set_param -P debug=&quot;$oldDebug&quot;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;In my testing, running this command on the MGS for the first, mounted file system caused the client&apos;s mgc device from the second file system to be cleaned up after about 5 seconds.  Running this command on the MGS for the second file system did not affect the stale mgc device.  Does that sound right?  Is there some better fix in the works that does not require us to &quot;stir up&quot; the client&apos;s mgc requeue thread?  This does not seem like an optimal solution.&lt;/p&gt;

&lt;p&gt;Parinay, you did not mention this step in your reproducer.  Did you do something else to &quot;stir up&quot; the mgc requeue thread?&lt;/p&gt;</comment>
                            <comment id="83940" author="haasken" created="Mon, 12 May 2014 22:41:58 +0000"  >&lt;p&gt;I just tested the reproducer steps on a pristine master client without your patch, and I got the same results as I did with your patch.  Here are the steps.&lt;/p&gt;

&lt;p&gt;On the client:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;mount -t lustre centss01:/centss01 /mnt/centss01 -o rw,flock,lazystatfs
lctl set_param fail_loc=0x8000050c
lctl set_param fail_val=120
# The following mount attempts will fail, first with rc=5, then with rc=17
mount -t lustre centss06:/centss06 /mnt/centss06 -o rw,flock,lazystatfs
mount -t lustre centss06:/centss06 /mnt/centss06 -o rw,flock,lazystatfs
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;On the MGS for the first file system, execute the following command&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lctl set_param -P debug=&quot;ioctl neterror warning error emerg ha config console&quot;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Then wait about a minute to be safe, and execute the mount command on the client again.  This time it will succeed.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;mount -t lustre centss06:/centss06 /mnt/centss06 -o rw,flock,lazystatfs
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;These steps give the same behavior between a master+patch client and a pristine master client.  Is there some reproducer which distinguishes between a client with the patch and a client without the patch?&lt;/p&gt;</comment>
                            <comment id="83953" author="bobijam" created="Tue, 13 May 2014 01:16:58 +0000"  >&lt;p&gt;1. yes, better execute it on MGS, since the 2nd system client does not established the mgc-&amp;gt;mgs connection, the stir up won&apos;t work.&lt;br/&gt;
2. the mgc requeue mechanism requires that mgc requeue thread won&apos;t be waked unless there is system config change happens in the whole filesystem, so if a client is unfortunately failing to establish its mgc-&amp;gt;mgs connection, this is the way to release its mgc device. Given this is a really rare case, I think it&apos;s not too much a burden solution.&lt;br/&gt;
3. there are several places to fail the mgc device setup, for example a) before any llog is inserted in the requeue thread; b) llog is inserted, while llog process failed for connection failure. &lt;/p&gt;</comment>
                            <comment id="83978" author="parinay" created="Tue, 13 May 2014 12:22:48 +0000"  >&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;No, your reproducer is what I followed; nothing to &quot;stir up&quot; the mgc requeue.&lt;/li&gt;
	&lt;li&gt;I have created a new patch, will soon upload it here.&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;Thanks&lt;/p&gt;</comment>
                            <comment id="83979" author="bobijam" created="Tue, 13 May 2014 12:35:24 +0000"  >&lt;p&gt;It&apos;s timing that makes the difference, when llog hasn&apos;t sit tight in the requeue thread, my patch will remove it from the requeue thread list and release mgc device earlier, but if the llog is already passed to the bottom of the requeue process and the thread is waiting for system config change, then my patch won&apos;t speed up the release, we need to manually stir up the requeue thread for another loop to release the mgc device.&lt;/p&gt;</comment>
                            <comment id="85697" author="parinay" created="Wed, 4 Jun 2014 14:09:48 +0000"  >&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/#/c/10569/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/10569/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="86154" author="haasken" created="Mon, 9 Jun 2014 20:43:37 +0000"  >&lt;p&gt;I have tested the master patch, and I&apos;ve found that it fixes this issue.&lt;/p&gt;

&lt;p&gt;Parinay, please update the patch when you get a chance to fix the style issues.  Then I&apos;ll inspect it again and approve.  Thanks!&lt;/p&gt;</comment>
                            <comment id="91109" author="mhanafi" created="Thu, 7 Aug 2014 20:57:27 +0000"  >&lt;p&gt;Can be close from our end&lt;/p&gt;</comment>
                            <comment id="91111" author="pjones" created="Thu, 7 Aug 2014 21:02:22 +0000"  >&lt;p&gt;Mahmoud&lt;/p&gt;

&lt;p&gt;To be clear - which patches (if any) did you end up using to meet your requirements?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="91113" author="haasken" created="Thu, 7 Aug 2014 21:08:06 +0000"  >&lt;p&gt;From our end, we need &lt;a href=&quot;http://review.whamcloud.com/#/c/10569/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/10569/&lt;/a&gt; to land, but there are some style issues with it right now.&lt;/p&gt;</comment>
                            <comment id="91120" author="mhanafi" created="Thu, 7 Aug 2014 21:55:10 +0000"  >&lt;p&gt;we used &lt;a href=&quot;http://review.whamcloud.com/10127&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/10127&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="91123" author="pjones" created="Thu, 7 Aug 2014 22:07:02 +0000"  >&lt;p&gt;Thanks Mahmoud. I suggest that we keep this ticket open until at least the master patch - &lt;a href=&quot;http://review.whamcloud.com/#/c/10129/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/10129/&lt;/a&gt; - lands.&lt;/p&gt;

&lt;p&gt;Ryan it would probably be best to decouple the upstreaming of &lt;a href=&quot;http://review.whamcloud.com/#/c/10569/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/10569/&lt;/a&gt; from this NASA support issue and use a unique JIRA ticket reference on the next push.&lt;/p&gt;
</comment>
                            <comment id="91441" author="haasken" created="Tue, 12 Aug 2014 18:31:26 +0000"  >&lt;p&gt;Are the two changes compatible with each other?&lt;/p&gt;</comment>
                            <comment id="91501" author="bobijam" created="Wed, 13 Aug 2014 01:33:55 +0000"  >&lt;p&gt;yes, I think they are compatible.&lt;/p&gt;</comment>
                            <comment id="93220" author="haasken" created="Thu, 4 Sep 2014 19:35:22 +0000"  >&lt;p&gt;Mahmoud, do you know which patch-set of the change &lt;a href=&quot;http://review.whamcloud.com/#/c/10129/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/10129/&lt;/a&gt; you used?  The newest version of the patch looks like it addresses the problem similar to the way &lt;a href=&quot;http://review.whamcloud.com/#/c/10569/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/10569/&lt;/a&gt; addresses the problem.&lt;/p&gt;

&lt;p&gt;Also, there is a problem with the current patch-set (PS10) of &lt;a href=&quot;http://review.whamcloud.com/#/c/10129/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/10129/&lt;/a&gt; .  With this patch-set applied mount.lustre hangs with the following trace in dmesg:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;INFO: task mount.lustre:12025 blocked for more than 120 seconds.
      Not tainted 2.6.32.431.23.3.el6_lustre #2
&quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
mount.lustre  D 0000000000000001     0 12025  12024 0x00000080
 ffff88013bb15828 0000000000000086 ffff88013bb157b8 ffffffff81069f15
 ffff88013bb15798 ffff88013dc08ad8 ffff8800283168e8 ffff880028316880
 ffff88013a56fab8 ffff88013bb15fd8 000000000000fbc8 ffff88013a56fab8
Call Trace:
 [&amp;lt;ffffffff81069f15&amp;gt;] ? enqueue_entity+0x125/0x450
 [&amp;lt;ffffffff8152a365&amp;gt;] schedule_timeout+0x215/0x2e0
 [&amp;lt;ffffffff81069f15&amp;gt;] ? enqueue_entity+0x125/0x450
 [&amp;lt;ffffffff81529fe3&amp;gt;] wait_for_common+0x123/0x180
 [&amp;lt;ffffffff81061d00&amp;gt;] ? default_wake_function+0x0/0x20
 [&amp;lt;ffffffff8152a0fd&amp;gt;] wait_for_completion+0x1d/0x20
 [&amp;lt;ffffffffa11fd0a8&amp;gt;] mgc_setup+0x4c8/0x5a0 [mgc]
 [&amp;lt;ffffffffa0c4de8b&amp;gt;] obd_setup+0x19b/0x290 [obdclass]
 [&amp;lt;ffffffffa0afc181&amp;gt;] ? libcfs_debug_msg+0x41/0x50 [libcfs]
 [&amp;lt;ffffffffa0af63a8&amp;gt;] ? libcfs_log_return+0x28/0x40 [libcfs]
 [&amp;lt;ffffffffa0c4e188&amp;gt;] class_setup+0x208/0x870 [obdclass]
 [&amp;lt;ffffffffa0c56a6c&amp;gt;] class_process_config+0xc6c/0x1ad0 [obdclass]
 [&amp;lt;ffffffffa0af63a8&amp;gt;] ? libcfs_log_return+0x28/0x40 [libcfs]
 [&amp;lt;ffffffffa0c5baab&amp;gt;] ? lustre_cfg_new+0x40b/0x6f0 [obdclass]
 [&amp;lt;ffffffffa0c5bee8&amp;gt;] do_lcfg+0x158/0x450 [obdclass]
 [&amp;lt;ffffffff8128daa0&amp;gt;] ? sprintf+0x40/0x50
 [&amp;lt;ffffffffa0c5c274&amp;gt;] lustre_start_simple+0x94/0x200 [obdclass]
 [&amp;lt;ffffffffa0c60993&amp;gt;] lustre_start_mgc+0xbd3/0x1e00 [obdclass]
 [&amp;lt;ffffffffa0af63a8&amp;gt;] ? libcfs_log_return+0x28/0x40 [libcfs]
 [&amp;lt;ffffffffa0afc181&amp;gt;] ? libcfs_debug_msg+0x41/0x50 [libcfs]
 [&amp;lt;ffffffffa0c61ccc&amp;gt;] lustre_fill_super+0x10c/0x550 [obdclass]
 [&amp;lt;ffffffffa0c61bc0&amp;gt;] ? lustre_fill_super+0x0/0x550 [obdclass]
 [&amp;lt;ffffffff8118c5df&amp;gt;] get_sb_nodev+0x5f/0xa0
 [&amp;lt;ffffffffa0c59995&amp;gt;] lustre_get_sb+0x25/0x30 [obdclass]
 [&amp;lt;ffffffff8118bc3b&amp;gt;] vfs_kern_mount+0x7b/0x1b0
 [&amp;lt;ffffffff8118bde2&amp;gt;] do_kern_mount+0x52/0x130
 [&amp;lt;ffffffff811ad7bb&amp;gt;] do_mount+0x2fb/0x930
 [&amp;lt;ffffffff811ade80&amp;gt;] sys_mount+0x90/0xe0
 [&amp;lt;ffffffff8100b072&amp;gt;] system_call_fastpath+0x16/0x1b
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I think this is because rq_start is never completed in mgc_requeue_thread().&lt;/p&gt;</comment>
                            <comment id="93230" author="jaylan" created="Thu, 4 Sep 2014 20:15:23 +0000"  >&lt;p&gt;Ryan, we use &lt;a href=&quot;http://review.whamcloud.com/10127&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/10127&lt;/a&gt; (for b2_4 branch) instead.&lt;/p&gt;

&lt;p&gt;The latest PS for #10129 (for master branch) is PS10 as you pointed out, but the latest PS for #10127 is still PS7. We use PS7 of #10127.&lt;/p&gt;</comment>
                            <comment id="93248" author="haasken" created="Thu, 4 Sep 2014 22:36:43 +0000"  >&lt;p&gt;Thanks Jay.  I have been testing on master, so that may explain why PS7 didn&apos;t fix the problem for me.  PS10 of #10129 takes a different approach than the earlier patch-sets.  It now takes a similar approach to #10569.  PS10 of #10129 is broken, but when I fixed it locally and rebuilt, it resolved the problem in the same way that #10569 did.&lt;/p&gt;

&lt;p&gt;It seems to me that we only need either #10129 or #10569.  Can anybody confirm this?&lt;/p&gt;</comment>
                            <comment id="93290" author="bobijam" created="Fri, 5 Sep 2014 00:29:35 +0000"  >&lt;p&gt;update &lt;a href=&quot;http://review.whamcloud.com/#/c/10127/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/10127/&lt;/a&gt; (b2_4) to be sync with master.&lt;/p&gt;</comment>
                            <comment id="95715" author="haasken" created="Mon, 6 Oct 2014 16:46:22 +0000"  >&lt;p&gt;Is the test failure in replay-ost-single on &lt;a href=&quot;http://review.whamcloud.com/#/c/10129/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/10129/&lt;/a&gt; related to the patch?  It doesn&apos;t seem like it to me, but I don&apos;t see a bug matching that failure.&lt;/p&gt;</comment>
                            <comment id="96408" author="haasken" created="Wed, 15 Oct 2014 17:38:12 +0000"  >&lt;p&gt;The patch for master has landed.&lt;/p&gt;

&lt;p&gt;This issue also exists in 2.5.  Here is a port for b2_5: &lt;a href=&quot;http://review.whamcloud.com/#/c/12303&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/12303&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="96493" author="haasken" created="Thu, 16 Oct 2014 17:09:45 +0000"  >&lt;p&gt;It looks like we may have gotten the same spurious Maloo failures on the b2_5 patch as we did on other branches.  Can somebody restart Maloo?&lt;/p&gt;</comment>
                            <comment id="98265" author="pjones" created="Tue, 4 Nov 2014 04:18:45 +0000"  >&lt;p&gt;Landed for 2.7&lt;/p&gt;</comment>
                            <comment id="100043" author="haasken" created="Tue, 25 Nov 2014 16:12:44 +0000"  >&lt;p&gt;I didn&apos;t notice that there was already a b2_5 version of this fix, so &lt;a href=&quot;http://review.whamcloud.com/#/c/12303/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/12303/&lt;/a&gt; has been abandoned in favor of &lt;a href=&quot;http://review.whamcloud.com/#/c/11765&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/11765&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="100741" author="gerrit" created="Thu, 4 Dec 2014 20:24:05 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;http://review.whamcloud.com/11765/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/11765/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4943&quot; title=&quot;Client Failes to mount filesystem&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4943&quot;&gt;&lt;del&gt;LU-4943&lt;/del&gt;&lt;/a&gt; obdclass: detach MGC dev on error&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_5&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 8d1e9394d3a984e257e1e4b0f46f16b7ff2183cd&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                                        </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="26303">LU-5582</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="14850" name="LU-4943-mgc-cleanup.tar.gz" size="458318" author="haasken" created="Fri, 9 May 2014 17:21:37 +0000"/>
                            <attachment id="14845" name="lu-4943.log.tar.bz2" size="60561" author="parinay" created="Tue, 6 May 2014 15:27:13 +0000"/>
                            <attachment id="14770" name="r403i1n1.1398271435.out.gz" size="550794" author="mhanafi" created="Wed, 23 Apr 2014 18:33:15 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzwku7:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>13682</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>