<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:09:42 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-7531] MDT recovery stalled if resources are failed back immediately</title>
                <link>https://jira.whamcloud.com/browse/LU-7531</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;The error occurred during soak testing of build &apos;20151201.1&apos; (see &lt;a href=&quot;https://wiki.hpdd.intel.com/pages/viewpage.action?title=Soak+Testing+on+Lola&amp;amp;spaceKey=Releases#SoakTestingonLola-20151201.1&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://wiki.hpdd.intel.com/pages/viewpage.action?title=Soak+Testing+on+Lola&amp;amp;spaceKey=Releases#SoakTestingonLola-20151201.1&lt;/a&gt;). DNE is enabled. MDSes are set-up in active-active HA failover configuration.&lt;/p&gt;

&lt;p&gt;The MDT recovery process stalls on the primary node in case the recovery process is interrupted on the secondary node by failing back the&lt;br/&gt;
resources immediately. This affects all running and new jobs using the remote MDTs.&lt;/p&gt;

&lt;p&gt;Sequence of events:&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;2015-12-09 04:35:10   - Failover MDTs owned by &lt;tt&gt;lola-9&lt;/tt&gt; --&amp;gt; &lt;tt&gt;lola-8&lt;/tt&gt;&lt;/li&gt;
	&lt;li&gt;2015-12-09 04:43:52   - MDTs mounted successfully on secondary (&lt;tt&gt;lola-8&lt;/tt&gt;)&lt;/li&gt;
	&lt;li&gt;2015-12-09 04:44:13   - Stop recovery process (incomplete at that time, see soak.log), and initiated failback&lt;/li&gt;
	&lt;li&gt;2015-12-09 04:44:25   - mds_failover (failback) completed&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;Double checked MDTs are active and mounted:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@lola-16 lola]# ssh lola-9 &apos;lctl dl | grep &quot; mdt &quot;&apos; | less -i
  4 UP mdt soaked-MDT0003 soaked-MDT0003_UUID 67
 32 UP mdt soaked-MDT0002 soaked-MDT0002_UUID 63

[root@lola-16 lola]# ssh lola-9 &apos;mount | grep lustre&apos; | less -i
/dev/mapper/360080e50002ffd8200000251520130a4p1 on /mnt/soaked-mdt3 type lustre (rw,user_xattr)
/dev/mapper/360080e50002ff4f00000026d52013098p1 on /mnt/soaked-mdt2 type lustre (rw,user_xattr)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Recovery still ongoing after ~ 50 mins:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@lola-9 ~]# date
Wed Dec  9 05:30:04 PST 2015
[root@lola-9 ~]# lctl get_param mdt.*.recovery_status
mdt.soaked-MDT0002.recovery_status=
status: RECOVERING
recovery_start: 1449667442
time_remaining: 0
connected_clients: 16/16
req_replay_clients: 5
lock_repay_clients: 5
completed_clients: 11
evicted_clients: 0
replayed_requests: 0
queued_requests: 4
next_transno: 1090929750241
mdt.soaked-MDT0003.recovery_status=
status: RECOVERING
recovery_start: 1449667442
time_remaining: 0
connected_clients: 16/16
req_replay_clients: 5
lock_repay_clients: 5
completed_clients: 11
evicted_clients: 0
replayed_requests: 0
queued_requests: 4
next_transno: 1047980457114
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Attached messages, console log file of MDT (&lt;tt&gt;lola-8&lt;/tt&gt;), debug log file created  manually at 04:55 and soak.log file.&lt;/p&gt;</description>
                <environment>lola&lt;br/&gt;
build: 2.7.63-28-g5fda01f, 5fda01f3002e7e742a206ce149652c6b78356828 + patches</environment>
        <key id="33521">LU-7531</key>
            <summary>MDT recovery stalled if resources are failed back immediately</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="di.wang">Di Wang</assignee>
                                    <reporter username="heckes">Frank Heckes</reporter>
                        <labels>
                            <label>soak</label>
                    </labels>
                <created>Wed, 9 Dec 2015 13:41:06 +0000</created>
                <updated>Mon, 21 Dec 2015 14:17:58 +0000</updated>
                            <resolved>Mon, 21 Dec 2015 14:17:58 +0000</resolved>
                                    <version>Lustre 2.8.0</version>
                                    <fixVersion>Lustre 2.8.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="135701" author="jgmitter" created="Wed, 9 Dec 2015 19:34:52 +0000"  >&lt;p&gt;Assigning to Di who reported he will be looking into it shortly to assess.&lt;/p&gt;</comment>
                            <comment id="135762" author="di.wang" created="Thu, 10 Dec 2015 00:12:44 +0000"  >&lt;p&gt;I just checked the debug log, and it looks like the recovery process is stuck during update recovery. Though I can not figure out why the recovery is stuck by current log. &lt;/p&gt;

&lt;p&gt;Frank, Could you please re-run test, and get me the threads stack trace when the endless recovery happens? Unfortunately lola-9 has been restarted, thanks.&lt;/p&gt;</comment>
                            <comment id="135775" author="di.wang" created="Thu, 10 Dec 2015 06:59:32 +0000"  >&lt;p&gt;I saw a similar issue in soak-test, and got the trace&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;tgt_recov     S 0000000000000013     0 13622      2 0x00000080
 ffff8803f5fb57e0 0000000000000046 0000000000000000 ffff8803f5fb57a4
 ffffffff00000000 ffff88043fe84400 000003162493ac87 0000000000000286
 ffff8803f5fb5780 ffffffff8108742c ffff8803f5fad068 ffff8803f5fb5fd8
Call Trace:
 [&amp;lt;ffffffff8108742c&amp;gt;] ? lock_timer_base+0x3c/0x70
 [&amp;lt;ffffffff8152b222&amp;gt;] schedule_timeout+0x192/0x2e0
 [&amp;lt;ffffffff81087540&amp;gt;] ? process_timeout+0x0/0x10
 [&amp;lt;ffffffffa0bad779&amp;gt;] ptlrpc_set_wait+0x319/0xa20 [ptlrpc]
 [&amp;lt;ffffffffa0967344&amp;gt;] ? lprocfs_stats_alloc_one+0x344/0x360 [obdclass]
 [&amp;lt;ffffffffa0ba2c20&amp;gt;] ? ptlrpc_interrupted_set+0x0/0x110 [ptlrpc]
 [&amp;lt;ffffffff81064c00&amp;gt;] ? default_wake_function+0x0/0x20
 [&amp;lt;ffffffffa0bb9b15&amp;gt;] ? lustre_msg_set_jobid+0xf5/0x130 [ptlrpc]
 [&amp;lt;ffffffffa0badf01&amp;gt;] ptlrpc_queue_wait+0x81/0x220 [ptlrpc]
 [&amp;lt;ffffffffa0b85f6e&amp;gt;] ldlm_cli_enqueue+0x37e/0x870 [ptlrpc]
 [&amp;lt;ffffffffa0b8b050&amp;gt;] ? ldlm_completion_ast+0x0/0x9b0 [ptlrpc]
 [&amp;lt;ffffffffa122b390&amp;gt;] ? mdt_remote_blocking_ast+0x0/0x210 [mdt]
 [&amp;lt;ffffffffa140f615&amp;gt;] osp_md_object_lock+0x185/0x240 [osp]
 [&amp;lt;ffffffffa131c557&amp;gt;] lod_object_lock+0x147/0x860 [lod]
 [&amp;lt;ffffffffa0855109&amp;gt;] ? cfs_hash_bd_add_locked+0x29/0x90 [libcfs]
 [&amp;lt;ffffffffa099161f&amp;gt;] ? lu_object_find_try+0x1df/0x260 [obdclass]
 [&amp;lt;ffffffffa139e90b&amp;gt;] mdd_object_lock+0x3b/0xd0 [mdd]
 [&amp;lt;ffffffffa1238bba&amp;gt;] mdt_remote_object_lock+0x14a/0x310 [mdt]
 [&amp;lt;ffffffffa1238ea9&amp;gt;] mdt_object_lock_internal+0x129/0x2d0 [mdt]
 [&amp;lt;ffffffffa1239111&amp;gt;] mdt_object_lock+0x11/0x20 [mdt]
 [&amp;lt;ffffffffa1250474&amp;gt;] mdt_reint_unlink+0x204/0xff0 [mdt]
 [&amp;lt;ffffffffa1247a2d&amp;gt;] mdt_reint_rec+0x5d/0x200 [mdt]
 [&amp;lt;ffffffffa123381b&amp;gt;] mdt_reint_internal+0x62b/0xb80 [mdt]
 [&amp;lt;ffffffffa123420b&amp;gt;] mdt_reint+0x6b/0x120 [mdt]
 [&amp;lt;ffffffffa0c2350c&amp;gt;] tgt_request_handle+0x8ec/0x1470 [ptlrpc]
 [&amp;lt;ffffffffa0c22c20&amp;gt;] ? tgt_request_handle+0x0/0x1470 [ptlrpc]
 [&amp;lt;ffffffffa0b741eb&amp;gt;] handle_recovery_req+0x16b/0x290 [ptlrpc]
 [&amp;lt;ffffffffa0b7b17c&amp;gt;] target_recovery_thread+0x10dc/0x2550 [ptlrpc]
 [&amp;lt;ffffffff81064c12&amp;gt;] ? default_wake_function+0x12/0x20
 [&amp;lt;ffffffffa0b7a0a0&amp;gt;] ? target_recovery_thread+0x0/0x2550 [ptlrpc]
 [&amp;lt;ffffffff8109e78e&amp;gt;] kthread+0x9e/0xc0
 [&amp;lt;ffffffff8100c28a&amp;gt;] child_rip+0xa/0x20
 [&amp;lt;ffffffff8109e6f0&amp;gt;] ? kthread+0x0/0xc0
 [&amp;lt;ffffffff8100c280&amp;gt;] ? child_rip+0x0/0x20
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Ah, we should allow the lock enqueue during recovering, I will cook a patch.&lt;/p&gt;</comment>
                            <comment id="135777" author="gerrit" created="Thu, 10 Dec 2015 07:23:50 +0000"  >&lt;p&gt;wangdi (di.wang@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/17539&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/17539&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7531&quot; title=&quot;MDT recovery stalled if resources are failed back immediately&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7531&quot;&gt;&lt;del&gt;LU-7531&lt;/del&gt;&lt;/a&gt; osp: allow enqueue during recovery&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 240fc3cf7188c4291085862225cf469ac3d29850&lt;/p&gt;</comment>
                            <comment id="135787" author="heckes" created="Thu, 10 Dec 2015 09:59:15 +0000"  >&lt;p&gt;Error happens again, even if recovery time is 1h.  6 MDTs are stalled for over 1 h now:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;----------------
lola-9
----------------
mdt.soaked-MDT0002.recovery_status=
status: RECOVERING
recovery_start: 1449724636
time_remaining: 0
connected_clients: 9/16
req_replay_clients: 1
lock_repay_clients: 1
completed_clients: 8
evicted_clients: 7
replayed_requests: 10
queued_requests: 0
next_transno: 1090929751042
mdt.soaked-MDT0003.recovery_status=
status: RECOVERING
recovery_start: 1449724648
time_remaining: 0
connected_clients: 9/16
req_replay_clients: 1
lock_repay_clients: 1
completed_clients: 8
evicted_clients: 7
replayed_requests: 0
queued_requests: 0
next_transno: 1047980457124
----------------
lola-10
----------------
mdt.soaked-MDT0004.recovery_status=
status: RECOVERING
recovery_start: 1449735425
time_remaining: 0
connected_clients: 14/14
req_replay_clients: 0
lock_repay_clients: 0
completed_clients: 14
evicted_clients: 0
replayed_requests: 0
queued_requests: 0
next_transno: 1129576470250
mdt.soaked-MDT0005.recovery_status=
status: RECOVERING
recovery_start: 1449735422
time_remaining: 0
connected_clients: 14/14
req_replay_clients: 0
lock_repay_clients: 0
completed_clients: 14
evicted_clients: 0
replayed_requests: 0
queued_requests: 0
next_transno: 1138166481892
----------------
lola-11
----------------
mdt.soaked-MDT0006.recovery_status=
status: RECOVERING
recovery_start: 1449729112
time_remaining: 0
connected_clients: 14/14
req_replay_clients: 0
lock_repay_clients: 0
completed_clients: 14
evicted_clients: 0
replayed_requests: 0
queued_requests: 0
next_transno: 1116691658301
mdt.soaked-MDT0007.recovery_status=
status: RECOVERING
recovery_start: 1449729069
time_remaining: 0
connected_clients: 14/14
req_replay_clients: 0
lock_repay_clients: 0
completed_clients: 14
evicted_clients: 0
replayed_requests: 0
queued_requests: 0
next_transno: 1125281595927
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;I attached the messages file containing the stack traces  of the nodes affected as requested above.&lt;/p&gt;</comment>
                            <comment id="137018" author="gerrit" created="Mon, 21 Dec 2015 12:41:28 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;http://review.whamcloud.com/17539/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/17539/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7531&quot; title=&quot;MDT recovery stalled if resources are failed back immediately&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7531&quot;&gt;&lt;del&gt;LU-7531&lt;/del&gt;&lt;/a&gt; osp: allow few requests during recovery&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 2fe2d1e82005746180309d9b79057a418a729e54&lt;/p&gt;</comment>
                            <comment id="137031" author="jgmitter" created="Mon, 21 Dec 2015 14:17:58 +0000"  >&lt;p&gt;Landed for 2.8&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="19835" name="console-lola-9.log.bz2" size="96228" author="heckes" created="Wed, 9 Dec 2015 13:53:53 +0000"/>
                            <attachment id="19836" name="lola-9-lustre-log-recovery-stalled-2015-12-09-0455.log.bz2" size="415313" author="heckes" created="Wed, 9 Dec 2015 13:53:53 +0000"/>
                            <attachment id="19855" name="messages+stack-trace-lola-10.log.bz2" size="298969" author="heckes" created="Thu, 10 Dec 2015 10:02:43 +0000"/>
                            <attachment id="19856" name="messages+stack-trace-lola-11.log.bz2" size="245510" author="heckes" created="Thu, 10 Dec 2015 10:02:43 +0000"/>
                            <attachment id="19854" name="messages+stack-trace-lola-9.log.bz2" size="269386" author="heckes" created="Thu, 10 Dec 2015 10:02:43 +0000"/>
                            <attachment id="19837" name="messages-lola-9.log.bz2" size="178255" author="heckes" created="Wed, 9 Dec 2015 13:53:53 +0000"/>
                            <attachment id="19838" name="soak.log.bz2" size="14667" author="heckes" created="Wed, 9 Dec 2015 14:00:34 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzxvan:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>