<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:24:37 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-2368] OSTs stuck in perpetual recovery</title>
                <link>https://jira.whamcloud.com/browse/LU-2368</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;An MDS failover happened during OST recovery, and the OST got two MDS connections from different IPs. The first was processed by the OST; the second connection caused class_fail_export() in target_handle_connect(), and we got perpetual recovery.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Oct 26 17:09:07 snx11001n008 kernel: [  838.638847] Lustre: 90700:0:(ldlm_lib.c:2007:target_recovery_init()) RECOVERY: service snx11001-OST0012, 3 recoverable clients, last_transno 54526017
Oct 26 17:09:07 snx11001n008 kernel: [  838.708354] Lustre: snx11001-OST0012: Now serving snx11001-OST0012/ on /dev/md4 with recovery enabled
Oct 26 17:09:07 snx11001n008 kernel: [  838.717732] Lustre: snx11001-OST0012: Will be in recovery for at least 15:00, or until 3 clients reconnect
Oct 26 17:11:05 snx11001n008 kernel: [  956.648093] LustreError: 88011:0:(ldlm_lib.c:927:target_handle_connect()) snx11001-OST0012: NID 10.10.101.3@o2ib1 (snx11001-MDT0000-mdtlov_UUID) reconnected with 1 conn_cnt; cookies not random?
Oct 26 17:15:10 snx11001n008 kernel: [ 1201.718217] Lustre: 88009:0:(ldlm_lib.c:941:target_handle_connect()) snx11001-OST0012: connection from snx11001-MDT0000-mdtlov_UUID@10.10.101.3@o2ib1 recovering/t0 exp ffff88072cb90400 cur 1351289710 last 1351289346
Oct 26 17:18:40 snx11001n008 kernel: [ 1410.931800] Lustre: 88010:0:(ldlm_lib.c:854:target_handle_connect()) snx11001-OST0012: received MDS connection from NID 10.10.101.4@o2ib1, removing former export from NID 10.10.101.3@o2ib1
Oct 26 17:18:40 snx11001n008 kernel: [ 1410.948937] Lustre: 88010:0:(ldlm_lib.c:941:target_handle_connect()) snx11001-OST0012: connection from snx11001-MDT0000-mdtlov_UUID@10.10.101.4@o2ib1 recovering/t0 exp (null) cur 1351289920 last 0
Oct 26 17:18:40 snx11001n008 kernel: [ 1410.976334] LustreError: 88010:0:(ldlm_lib.c:974:target_handle_connect()) snx11001-OST0012: denying connection for new client 10.10.101.4@o2ib1 (snx11001-MDT0000-mdtlov_UUID): 0 clients in recovery for 381s
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="16738">LU-2368</key>
            <summary>OSTs stuck in perpetual recovery</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="keith">Keith Mannthey</assignee>
                                    <reporter username="aboyko">Alexander Boyko</reporter>
                        <labels>
                            <label>patch</label>
                    </labels>
                <created>Wed, 21 Nov 2012 01:23:17 +0000</created>
                <updated>Wed, 22 May 2013 20:54:04 +0000</updated>
                            <resolved>Wed, 22 May 2013 20:54:04 +0000</resolved>
                                    <version>Lustre 2.1.0</version>
                                    <fixVersion>Lustre 2.1.6</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>5</watches>
                                                                            <comments>
                            <comment id="48166" author="aboyko" created="Wed, 21 Nov 2012 01:44:08 +0000"  >&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/4641&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/4641&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="48183" author="tappro" created="Wed, 21 Nov 2012 11:54:29 +0000"  >&lt;p&gt;Alexander, could you give more info about how that causes perpetual recovery? Just logs showing that would be good. I wonder whether this is the same as &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2104&quot; title=&quot;conf-sanity test 47 never completes, negative time to recovery&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2104&quot;&gt;&lt;del&gt;LU-2104&lt;/del&gt;&lt;/a&gt;?&lt;/p&gt;</comment>
                            <comment id="48191" author="aboyko" created="Wed, 21 Nov 2012 14:10:49 +0000"  >&lt;p&gt;I can see updated recovery timer (recovery is timed out, evict stale exports), so this does not relate to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2104&quot; title=&quot;conf-sanity test 47 never completes, negative time to recovery&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2104&quot;&gt;&lt;del&gt;LU-2104&lt;/del&gt;&lt;/a&gt;. &lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Oct 26 17:18:40 snx11001n008 kernel: [ 1410.976334] LustreError: 88010:0:(ldlm_lib.c:974:target_handle_connect()) snx11001-OST0012: denying connection for new client 10.10.101.4@o2ib1 (snx11001-MDT0000-mdtlov_UUID): 0 clients in recovery for 381s
Oct 26 17:21:31 snx11001n008 kernel: [ 1582.137247] LustreError: 88010:0:(ldlm_lib.c:974:target_handle_connect()) snx11001-OST0012: denying connection for new client 10.10.101.4@o2ib1 (snx11001-MDT0000-mdtlov_UUID): 0 clients in recovery for 209s
Oct 26 17:24:01 snx11001n008 kernel: [ 1731.873364] Lustre: 88010:0:(ldlm_lib.c:941:target_handle_connect()) snx11001-OST0012: connection from snx11001-MDT0000-mdtlov_UUID@10.10.101.4@o2ib1 recovering/t0 exp (null) cur 1351290241 last 0
Oct 26 17:25:01 snx11001n008 kernel: [ 1791.526174] Lustre: snx11001-OST0012: disconnecting 1 stale clients
Oct 26 17:25:16 snx11001n008 kernel: [ 1806.790647] LustreError: 88010:0:(ldlm_lib.c:974:target_handle_connect()) snx11001-OST0012: denying connection for new client 10.10.101.4@o2ib1 (snx11001-MDT0000-mdtlov_UUID): 0 clients in recovery for 54s
Oct 26 17:26:11 snx11001n008 kernel: [ 1861.406988] Lustre: snx11001-OST0012: recovery is timed out, evict stale exports
Oct 26 17:27:21 snx11001n008 kernel: [ 1931.296723] Lustre: snx11001-OST0012: recovery is timed out, evict stale exports
Oct 26 17:28:31 snx11001n008 kernel: [ 2001.192849] Lustre: snx11001-OST0012: recovery is timed out, evict stale exports
Oct 26 17:30:16 snx11001n008 kernel: [ 2106.354440] LustreError: 88010:0:(ldlm_lib.c:974:target_handle_connect()) snx11001-OST0012: denying connection for new client 10.10.101.4@o2ib1 (snx11001-MDT0000-mdtlov_UUID): 0 clients in recovery for 34s
Oct 26 17:32:01 snx11001n008 kernel: [ 2210.862684] Lustre: snx11001-OST0012: recovery is timed out, evict stale exports
Oct 26 17:39:01 snx11001n008 kernel: [ 2630.201143] Lustre: snx11001-OST0012: recovery is timed out, evict stale exports
Oct 26 17:47:46 snx11001n008 kernel: [ 3154.655964] LustreError: 88010:0:(ldlm_lib.c:974:target_handle_connect()) snx11001-OST0012: denying connection for new client 10.10.101.4@o2ib1 (snx11001-MDT0000-mdtlov_UUID): 0 clients in recovery for 34s
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="48193" author="tappro" created="Wed, 21 Nov 2012 14:17:54 +0000"  >&lt;p&gt;but recovery never ends still? Or it just lasts too long?&lt;/p&gt;</comment>
                            <comment id="48208" author="nrutman" created="Wed, 21 Nov 2012 16:50:35 +0000"  >&lt;p&gt;Xyratex-bug-id: &lt;a href=&quot;http://jira-nss.xy01.xyratex.com:8080/browse/MRP-738&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;MRP-738&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="48272" author="tappro" created="Thu, 22 Nov 2012 03:59:59 +0000"  >&lt;p&gt;I cannot access the Xyratex site to check the bug details there.&lt;/p&gt;</comment>
                            <comment id="48304" author="aboyko" created="Fri, 23 Nov 2012 05:21:13 +0000"  >&lt;p&gt;&amp;gt; but recovery never ends still? Or it just lasts too long?&lt;br/&gt;
never ends&lt;br/&gt;
above, you can see how timer was restarted during 30 mins &lt;/p&gt;</comment>
                            <comment id="48314" author="tappro" created="Fri, 23 Nov 2012 13:06:16 +0000"  >&lt;p&gt;Strictly speaking, the situation with two MDS connections is nothing special: the old one is evicted through class_fail_export(), and the second is not allowed until recovery is finished. So the problem is why recovery cannot finish, and the reason is that class_fail_export() call. During recovery all evicted/failed clients are counted in obd_stale_clients; an inconsistent counter may cause recovery to get stuck. I think this patch should solve your problem:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;	/* &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; called during recovery then should keep obd_stale_clients
	 * consistent */
	&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (exp-&amp;gt;exp_obd-&amp;gt;obd_recovering)
		exp-&amp;gt;exp_obd-&amp;gt;obd_stale_clients++;

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I&apos;d prefer this solution because it fixes the source of the problem. Your patch is correct too, but it covers only the case with that one particular call to class_fail_export(). I tried to simulate a similar situation in the master branch and the patch above works; I&apos;d appreciate it if you would check whether it helps in your case. If this is not easy to do, then I will agree with your patch for b2_1.&lt;/p&gt;

&lt;p&gt;The &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2104&quot; title=&quot;conf-sanity test 47 never completes, negative time to recovery&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2104&quot;&gt;&lt;del&gt;LU-2104&lt;/del&gt;&lt;/a&gt; is the same problem actually, but there is also miscalculation in recovery timer reset which stops timer at all so recovery exceed timeout without being woken up, but the reason is the same and master will be fixed with lu-2104 patch anyway.&lt;/p&gt;</comment>
                            <comment id="48341" author="aboyko" created="Sun, 25 Nov 2012 01:42:41 +0000"  >&lt;p&gt;Thanks Mikhail, I will try it.&lt;/p&gt;</comment>
                            <comment id="48342" author="aboyko" created="Sun, 25 Nov 2012 02:26:33 +0000"  >&lt;p&gt;I approve, the patch&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;/* if called during recovery then should keep obd_stale_clients
	 * consistent */
	if (exp-&amp;gt;exp_obd-&amp;gt;obd_recovering)
		exp-&amp;gt;exp_obd-&amp;gt;obd_stale_clients++;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;fixes this issue, and it is better than &lt;a href=&quot;http://review.whamcloud.com/4641&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/4641&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;Mikhail, do you plan to land &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2104&quot; title=&quot;conf-sanity test 47 never completes, negative time to recovery&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2104&quot;&gt;&lt;del&gt;LU-2104&lt;/del&gt;&lt;/a&gt; patch to b2_1 branch, or small change?&lt;/p&gt;</comment>
                            <comment id="48343" author="tappro" created="Sun, 25 Nov 2012 12:54:43 +0000"  >&lt;p&gt;I had no such plan so far. I think that code you checked is enough for b2_1, I&apos;d use just that fix as it is sufficient. Can you push it for b2_1 in context of this ticket?&lt;/p&gt;</comment>
                            <comment id="48347" author="aboyko" created="Mon, 26 Nov 2012 01:43:37 +0000"  >&lt;p&gt;Sure, I have replaced my previous patch with this one.&lt;/p&gt;</comment>
                            <comment id="59110" author="keith" created="Wed, 22 May 2013 20:53:48 +0000"  >&lt;p&gt;Both patches were landed for 2.1 . &lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvclz:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>5631</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>