<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:09:24 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-14399] mount MDT takes very long with hsm enable</title>
                <link>https://jira.whamcloud.com/browse/LU-14399</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;We observed that when mounting an MDT with HSM enabled, the mount command takes minutes, compared to seconds before. We saw this in the log:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;

[53618.238941] Lustre: DEBUG MARKER: mkdir -p /mnt/lustre-mds2; mount -t lustre -o localrecov  /dev/mapper/mds2_flakey /mnt/lustre-mds2
[53618.624098] LDISKFS-fs (dm-6): mounted filesystem with ordered data mode. Opts: user_xattr,errors=remount-ro,no_mbcache,nodelalloc
[53720.390690] Lustre: 1722736:0:(mdt_coordinator.c:1114:mdt_hsm_cdt_start()) lustre-MDT0001: trying to init HSM before MDD
[53720.392834] LustreError: 1722736:0:(mdt_coordinator.c:1125:mdt_hsm_cdt_start()) lustre-MDT0001: cannot take the layout locks needed &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; registered restore: -2
[53720.398049] LustreError: 1722741:0:(mdt_coordinator.c:1090:mdt_hsm_cdt_start()) lustre-MDT0001: Coordinator already started or stopping
[53720.400681] Lustre: lustre-MDT0001: Imperative Recovery not enabled, recovery window 60-180
[53720.424872] Lustre: lustre-MDT0001: in recovery but waiting &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; the first client to connect
[53720.953893] Lustre: DEBUG MARKER: /usr/sbin/lctl get_param -n health_check
[53722.067555] Lustre: DEBUG MARKER:  
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Seems related to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13920&quot; title=&quot;HSM: hsm_actions are not processed after MDS failover&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13920&quot;&gt;&lt;del&gt;LU-13920&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</description>
                <environment>&lt;a href=&quot;https://testing.whamcloud.com/test_sets/20fcc789-47a3-44a2-a17e-d6f39fbb3ec0&quot;&gt;https://testing.whamcloud.com/test_sets/20fcc789-47a3-44a2-a17e-d6f39fbb3ec0&lt;/a&gt;</environment>
        <key id="62692">LU-14399</key>
            <summary>mount MDT takes very long with hsm enable</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="scherementsev">Sergey Cheremencev</assignee>
                                    <reporter username="mdiep">Minh Diep</reporter>
                        <labels>
                    </labels>
                <created>Wed, 3 Feb 2021 21:19:02 +0000</created>
                <updated>Mon, 14 Feb 2022 23:20:57 +0000</updated>
                            <resolved>Tue, 18 Jan 2022 14:54:30 +0000</resolved>
                                                    <fixVersion>Lustre 2.15.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>9</watches>
                                                                            <comments>
                            <comment id="291170" author="gerrit" created="Wed, 3 Feb 2021 22:11:31 +0000"  >&lt;p&gt;John L. Hammond (jhammond@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/41409&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/41409&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-14399&quot; title=&quot;mount MDT takes very long with hsm enable&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-14399&quot;&gt;&lt;del&gt;LU-14399&lt;/del&gt;&lt;/a&gt; Revert &quot;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13920&quot; title=&quot;HSM: hsm_actions are not processed after MDS failover&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13920&quot;&gt;&lt;del&gt;LU-13920&lt;/del&gt;&lt;/a&gt; hsm: process hsm_actions only after mdd setup&quot;&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 5b375e90e3cc3538f4dd92dc81f98fcf5b98e41e&lt;/p&gt;</comment>
                            <comment id="291229" author="sergey" created="Thu, 4 Feb 2021 15:29:07 +0000"  >&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;&#160; &#160; &#160; &#160; /* wait until MDD initialize hsm actions llog */
&#160; &#160; &#160; &#160; while (!test_bit(MDT_FL_CFGLOG, &amp;amp;mdt-&amp;gt;mdt_state) &amp;amp;&amp;amp; i &amp;lt; obd_timeout) {
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; schedule_timeout_interruptible(cfs_time_seconds(1));
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; i++;
&#160; &#160; &#160; &#160; }
&#160; &#160; &#160; &#160; if (!test_bit(MDT_FL_CFGLOG, &amp;amp;mdt-&amp;gt;mdt_state))
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; CWARN(&quot;%s: trying to init HSM before MDD\n&quot;, mdt_obd_name(mdt));&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13920&quot; title=&quot;HSM: hsm_actions are not processed after MDS failover&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13920&quot;&gt;&lt;del&gt;LU-13920&lt;/del&gt;&lt;/a&gt; just waits until MDT_FL_CFGLOG will be set.&lt;br/&gt;
This flag is set at the end of mdt_prepare. There is a chance mdt_prepare was stuck for some reasons.&lt;br/&gt;
Anyway I need debug logs from MDT to say something.&lt;br/&gt;
Is it possible to reproduce it again and gather debug logs?&lt;/p&gt;</comment>
                            <comment id="291231" author="gerrit" created="Thu, 4 Feb 2021 15:50:43 +0000"  >&lt;p&gt;John L. Hammond (jhammond@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/41415&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/41415&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-14399&quot; title=&quot;mount MDT takes very long with hsm enable&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-14399&quot;&gt;&lt;del&gt;LU-14399&lt;/del&gt;&lt;/a&gt;: Revert &quot;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13651&quot; title=&quot;Conditionally skip finding compatible HSM requests&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13651&quot;&gt;&lt;del&gt;LU-13651&lt;/del&gt;&lt;/a&gt; hsm: call hsm_find_compatible_cb() only for cancel&quot;&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 4ed2cd37d3331511ffff1fcd2bbe53c9b2513502&lt;/p&gt;</comment>
                            <comment id="291378" author="mdiep" created="Fri, 5 Feb 2021 23:26:27 +0000"  >&lt;p&gt;here is the sequence that I did to hit this bug&lt;/p&gt;
&lt;ol&gt;
	&lt;li&gt;mkfs.lustre --mdt --mgsnode ....&lt;/li&gt;
	&lt;li&gt;tunefs.lustre --param mdt.hsm_control=enabled ...&lt;/li&gt;
	&lt;li&gt;mount -t lustre ...&lt;/li&gt;
&lt;/ol&gt;
</comment>
                            <comment id="291438" author="sergey" created="Mon, 8 Feb 2021 16:02:00 +0000"  >&lt;blockquote&gt;&lt;p&gt;here is the sequence that I did to hit this bug&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;Thank you. Now the problem is clear.&lt;/p&gt;

&lt;p&gt;I&apos;ll provide a fix in a few days.&lt;/p&gt;</comment>
                            <comment id="291474" author="gerrit" created="Mon, 8 Feb 2021 22:28:52 +0000"  >&lt;p&gt;Sergey Cheremencev (sergey.cheremencev@hpe.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/41445&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/41445&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-14399&quot; title=&quot;mount MDT takes very long with hsm enable&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-14399&quot;&gt;&lt;del&gt;LU-14399&lt;/del&gt;&lt;/a&gt; hsm: process hsm_actions in coordinator&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: ec9b9b8d2e05568f75ca75e596585503ae0d4216&lt;/p&gt;</comment>
                            <comment id="294656" author="gerrit" created="Thu, 11 Mar 2021 08:54:52 +0000"  >&lt;p&gt;Sergey Cheremencev (sergey.cheremencev@hpe.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/42005&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/42005&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-14399&quot; title=&quot;mount MDT takes very long with hsm enable&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-14399&quot;&gt;&lt;del&gt;LU-14399&lt;/del&gt;&lt;/a&gt; tests: hsm_actions after failover&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 87493ef365d9faaf1f6c1e1a40f65157d37f72dc&lt;/p&gt;</comment>
                            <comment id="322980" author="gerrit" created="Tue, 18 Jan 2022 09:09:16 +0000"  >&lt;p&gt;&quot;Oleg Drokin &amp;lt;green@whamcloud.com&amp;gt;&quot; merged in patch &lt;a href=&quot;https://review.whamcloud.com/41445/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/41445/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-14399&quot; title=&quot;mount MDT takes very long with hsm enable&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-14399&quot;&gt;&lt;del&gt;LU-14399&lt;/del&gt;&lt;/a&gt; hsm: process hsm_actions in coordinator&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: e26d7cc3992252e5fce5a51aee716f933b04c13a&lt;/p&gt;</comment>
                            <comment id="323019" author="pjones" created="Tue, 18 Jan 2022 14:54:30 +0000"  >&lt;p&gt;Landed for 2.15&lt;/p&gt;</comment>
                            <comment id="323262" author="bzzz" created="Thu, 20 Jan 2022 05:35:04 +0000"  >&lt;p&gt;the patch just landed fails every run on my setup:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;...&lt;br/&gt;
Writing CONFIGS/mountdata&lt;br/&gt;
start mds service on tmp.BKaRODHgLn&lt;br/&gt;
Starting mds1: -o localrecov  /dev/mapper/mds1_flakey /mnt/lustre-mds1&lt;br/&gt;
Started lustre-MDT0000&lt;br/&gt;
 conf-sanity test_132: @@@@@@ FAIL: Can not take the layout lock &lt;br/&gt;
  Trace dump:&lt;br/&gt;
  = ./../tests/test-framework.sh:6389:error()&lt;br/&gt;
  = conf-sanity.sh:9419:test_132()&lt;br/&gt;
  = ./../tests/test-framework.sh:6693:run_one()&lt;br/&gt;
  = ./../tests/test-framework.sh:6740:run_one_logged()&lt;br/&gt;
  = ./../tests/test-framework.sh:6581:run_test()&lt;br/&gt;
  = conf-sanity.sh:9422:main()&lt;br/&gt;
Dumping lctl log to /tmp/ltest-logs/conf-sanity.test_132.*.1642612854.log&lt;br/&gt;
Dumping logs only on local client.&lt;br/&gt;
FAIL 132 (84s)&lt;/p&gt;&lt;/blockquote&gt;</comment>
                            <comment id="323284" author="sergey" created="Thu, 20 Jan 2022 12:43:45 +0000"  >&lt;p&gt;Hello Alex,&lt;/p&gt;

&lt;p&gt;what setup did you use? It doesn&apos;t fail on my local VM on the latest master(0feec5a3).&lt;/p&gt;</comment>
                            <comment id="323313" author="bzzz" created="Thu, 20 Jan 2022 15:57:30 +0000"  >&lt;p&gt;well, essentially just a local VM:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;FSTYPE=ldiskfs MDSCOUNT=2 MDSSIZE=300000 OSTSIZE=400000 OSTCOUNT=2 LOGDIR=/tmp/ltest-logs REFORMAT=yes HONOR_EXCEPT=y bash conf-sanity.sh &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="323467" author="sergey" created="Fri, 21 Jan 2022 14:42:43 +0000"  >&lt;p&gt;Alex, it still doesn&apos;t fail on my local VM.&lt;br/&gt;
I can look into the logs if you attach them to the ticket.&lt;/p&gt;</comment>
                            <comment id="323583" author="bzzz" created="Sat, 22 Jan 2022 07:22:15 +0000"  >&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
[ 8878.042110] Lustre: Found index 0 &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; lustre-MDT0000, updating log
[ 8878.044766] Lustre: Modifying parameter lustre-MDT0000.mdt.identity_upcall in log lustre-MDT0000
[ 8898.880071] Lustre: 497204:0:(mdt_coordinator.c:1145:mdt_hsm_cdt_start()) lustre-MDT0000: trying to init HSM before MDD
[ 8898.888824] LustreError: 497204:0:(mdt_coordinator.c:1156:mdt_hsm_cdt_start()) lustre-MDT0000: cannot take the layout locks needed &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; registered restore: -2
[ 8899.150565] Lustre: DEBUG MARKER: conf-sanity test_132: @@@@@@ FAIL: Can not take the layout lock
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;I guess  &quot;trying to init HSM before MDD&quot; is a hint?&lt;/p&gt;</comment>
                            <comment id="323812" author="sergey" created="Tue, 25 Jan 2022 13:53:39 +0000"  >&lt;blockquote&gt;&lt;blockquote&gt;&lt;p&gt;&#160;I guess &quot;trying to init HSM before MDD&quot; is a hint?&lt;/p&gt;&lt;/blockquote&gt;&lt;/blockquote&gt;
&lt;p&gt;IMO, it hints your build doesn&apos;t include &quot;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-14399&quot; title=&quot;mount MDT takes very long with hsm enable&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-14399&quot;&gt;&lt;del&gt;LU-14399&lt;/del&gt;&lt;/a&gt; hsm: process hsm_actions in coordinator&quot;.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;static void cdt_start_pending_restore(struct mdt_device *mdt,
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; struct coordinator *cdt)
{
&#160; &#160; struct mdt_thread_info *cdt_mti;
&#160; &#160; unsigned int i = 0;
&#160; &#160; int rc;&#160; &#160; /* wait until MDD initialize hsm actions llog */
&#160; &#160; while (!test_bit(MDT_FL_CFGLOG, &amp;amp;mdt-&amp;gt;mdt_state) &amp;amp;&amp;amp; i &amp;lt; obd_timeout) {
&#160; &#160; &#160; &#160; schedule_timeout_interruptible(cfs_time_seconds(1));
&#160; &#160; &#160; &#160; i++;
&#160; &#160; }
&#160; &#160; if (!test_bit(MDT_FL_CFGLOG, &amp;amp;mdt-&amp;gt;mdt_state))
&#160; &#160; &#160; &#160; CWARN(&quot;%s: trying to init HSM before MDD\n&quot;, mdt_obd_name(mdt));
 &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&quot;trying to init HSM before MDD&quot; message should be printed by cdt_start_pending_restore, while in your case it is mdt_hsm_cdt_start.&lt;/p&gt;</comment>
                            <comment id="323980" author="bzzz" created="Wed, 26 Jan 2022 05:52:41 +0000"  >&lt;p&gt;well, I&apos;m running master branch, so clearly &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-14399&quot; title=&quot;mount MDT takes very long with hsm enable&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-14399&quot;&gt;&lt;del&gt;LU-14399&lt;/del&gt;&lt;/a&gt; is in. This is what helps:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
-       &lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; (!test_bit(MDT_FL_CFGLOG, &amp;amp;mdt-&amp;gt;mdt_state) &amp;amp;&amp;amp; i &amp;lt; obd_timeout) {
+       &lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; (!test_bit(MDT_FL_CFGLOG, &amp;amp;mdt-&amp;gt;mdt_state) &amp;amp;&amp;amp; i &amp;lt; obd_timeout * 2) {
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;not sure how good it is...&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="60471">LU-13920</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                                        </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i01lh3:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>