<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:11:58 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-14694] racer timeouts with LU-10948</title>
                <link>https://jira.whamcloud.com/browse/LU-14694</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;many racer runs timeout, bisected that to patch:&lt;br/&gt;
&lt;a href=&quot;https://review.whamcloud.com/32158/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/32158/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10948&quot; title=&quot;client cache open lock after N opens&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10948&quot;&gt;LU-10948&lt;/a&gt; llite: Introduce inode open heat counter&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Commit: 41d99c4902836b7265db946dfa49cf99381f0db4&lt;/p&gt;</description>
                <environment></environment>
        <key id="64347">LU-14694</key>
            <summary>racer timeouts with LU-10948</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="bzzz">Alex Zhuravlev</assignee>
                                    <reporter username="bzzz">Alex Zhuravlev</reporter>
                        <labels>
                    </labels>
                <created>Thu, 20 May 2021 04:22:30 +0000</created>
                <updated>Tue, 10 Aug 2021 13:12:35 +0000</updated>
                            <resolved>Tue, 10 Aug 2021 13:12:35 +0000</resolved>
                                    <version>Upstream</version>
                                    <fixVersion>Lustre 2.15.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="302090" author="adilger" created="Thu, 20 May 2021 05:04:55 +0000"  >&lt;p&gt;Alex, what kind of timeout?  Stuck processes (with stack?), unable to unmount, oops, other?  Single client mountpoint or multiple?&lt;/p&gt;

&lt;p&gt;Previously Oleg and I had discussed whether opencache locks needed to time out sooner. Could you try running with &lt;tt&gt;ldlm.namespaces.&amp;#42;.lru_max_age=30&lt;/tt&gt; to see if that helps?&lt;/p&gt;</comment>
                            <comment id="302091" author="adilger" created="Thu, 20 May 2021 05:05:21 +0000"  >&lt;p&gt;Also, is this with opencache enabled, or just with the patch applied?&lt;/p&gt;</comment>
                            <comment id="302145" author="bzzz" created="Thu, 20 May 2021 17:12:22 +0000"  >&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
...
Resetting fail_loc on all nodes...done.
PASS 1 (2720s)
== racer test complete, duration 2768 sec ============================================================ 16:47:13 (1621529233)
Stopping clients: tmp.xMEAXwSwZi /mnt/lustre (opts:-f)
Stopping client tmp.xMEAXwSwZi /mnt/lustre opts:-f
Stopping clients: tmp.xMEAXwSwZi /mnt/lustre2 (opts:-f)
Stopping client tmp.xMEAXwSwZi /mnt/lustre2 opts:-f
Stopping /mnt/lustre-mds1 (opts:-f) on tmp.xMEAXwSwZi
modprobe: FATAL: Module dm-flakey not found.
Stopping /mnt/lustre-mds2 (opts:-f) on tmp.xMEAXwSwZi
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;then unique traces:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
schedule,do_select,trace_hardirqs_off_thunk
	PIDs(1): &lt;span class=&quot;code-quote&quot;&gt;&quot;sshd&quot;&lt;/span&gt;:2287 

schedule,do_select,_copy_to_user
	PIDs(1): &lt;span class=&quot;code-quote&quot;&gt;&quot;sshd&quot;&lt;/span&gt;:2587 

schedule,top_trans_stop,lod_trans_stop,mdd_trans_stop,mdt_mfd_close,mdt_obd_disconnect,class_fail_export,ping_evictor_main
	PIDs(1): &lt;span class=&quot;code-quote&quot;&gt;&quot;ll_evictor&quot;&lt;/span&gt;:7754 

schedule,ptlrpc_set_wait,ptlrpc_queue_wait,osp_send_update_req,osp_send_update_thread
	PIDs(1): &lt;span class=&quot;code-quote&quot;&gt;&quot;osp_up0-1&quot;&lt;/span&gt;:7989 

schedule,top_trans_stop,lod_trans_stop,mdd_trans_stop,mdt_mfd_close,mdt_obd_disconnect,class_disconnect_export_list,class_disconnect_exports,class_cleanup,class_process_config,class_manual_cleanup,server_put_super,generic_shutdown_super,kill_anon_super,deactivate_locked_super,cleanup_mnt,task_work_run,exit_to_usermode_loop
	PIDs(1): &lt;span class=&quot;code-quote&quot;&gt;&quot;umount&quot;&lt;/span&gt;:535030 

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="302146" author="bzzz" created="Thu, 20 May 2021 17:12:46 +0000"  >&lt;p&gt;basically it&apos;s MDSCOUNT=2 REFORMAT=yes bash racer.sh&lt;/p&gt;</comment>
                            <comment id="302151" author="bzzz" created="Thu, 20 May 2021 18:02:10 +0000"  >&lt;blockquote&gt;&lt;p&gt;Also, is this with opencache enabled, or just with the patch applied?&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;this is the latest clean master&lt;/p&gt;</comment>
                            <comment id="302228" author="bzzz" created="Fri, 21 May 2021 11:54:34 +0000"  >&lt;p&gt;sanity/29 fails as well:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
== sanity test 29: IT_GETATTR regression  ============================================================ 11:52:13 (1621597933)
Lustre: DEBUG MARKER: == sanity test 29: IT_GETATTR regression ============================================================ 11:52:13 (1621597933)
striped dir -i1 -c2 -H all_char /mnt/lustre/d29
first d29
Lustre: DEBUG MARKER: first d29
total 0
-rw-r--r-- 1 root root 0 May 21 11:52 foo
second d29
Lustre: DEBUG MARKER: second d29
total 0
-rw-r--r-- 1 root root 0 May 21 11:52 foo
done
Lustre: DEBUG MARKER: done
 sanity test_29: @@@@@@ FAIL: CURRENT: 10 &amp;gt; 9 
Lustre: DEBUG MARKER: sanity test_29: @@@@@@ FAIL: CURRENT: 10 &amp;gt; 9
  Trace dump:
  = ./../tests/test-framework.sh:6280:error()
  = sanity.sh:3398:test_29()
  = ./../tests/test-framework.sh:6583:run_one()
  = ./../tests/test-framework.sh:6630:run_one_logged()
  = ./../tests/test-framework.sh:6457:run_test()
  = sanity.sh:3410:main()
Dumping lctl log to /tmp/ltest-logs/sanity.test_29.*.1621597935.log
Dumping logs only on local client.
Resetting fail_loc on all nodes...done.
FAIL 29 (2s)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="302318" author="bzzz" created="Mon, 24 May 2021 07:25:57 +0000"  >&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
--- a/lustre/tests/racer.sh
+++ b/lustre/tests/racer.sh
@@ -180,6 +180,7 @@ test_1() {
                lss_cleanup
        fi
 
+       cancel_lru_locks mdc
        &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; $rrc
 }
 run_test 1 &lt;span class=&quot;code-quote&quot;&gt;&quot;racer on clients: ${CLIENTS:-$(hostname)} DURATION=$DURATION&quot;&lt;/span&gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;this helps.. it looks like openhandle locks are lost at umount, causing MDT to close them at export cleanup.&lt;/p&gt;</comment>
                            <comment id="302332" author="bzzz" created="Mon, 24 May 2021 13:29:19 +0000"  >&lt;p&gt;basically the problem is that:&lt;br/&gt;
1) openlock is taken on directories&lt;br/&gt;
2) the client doesn&apos;t cancel locks on umount:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
                &lt;span class=&quot;code-comment&quot;&gt;/* obd_force == local only */&lt;/span&gt;
                ldlm_cli_cancel_unused(obd-&amp;gt;obd_namespace, NULL,
                                       obd-&amp;gt;obd_force ? LCF_LOCAL : 0, NULL);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;3) thus MDT has to clear those &quot;lost&quot; locks, so close the directories&lt;br/&gt;
4) a close may result in directory removal&lt;br/&gt;
5) directory can be striped, thus needs all involved MDTs to be healthy&lt;br/&gt;
6) MDTs are stopped one by one&lt;/p&gt;

&lt;p&gt;locally I &quot;solved&quot; the problem by disabling opencache for directories:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
--- a/lustre/mdt/mdt_open.c
+++ b/lustre/mdt/mdt_open.c
@@ -820,6 +820,9 @@ &lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; mdt_object_open_lock(struct mdt_thread_info *info,
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (req_is_replay(mdt_info_req(info)))
                RETURN(0);
 
+       &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (S_ISDIR(lu_object_attr(&amp;amp;obj-&amp;gt;mot_obj)))
+               open_flags &amp;amp;= ~MDS_OPEN_LOCK;
+
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (S_ISREG(lu_object_attr(&amp;amp;obj-&amp;gt;mot_obj))) {
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;but it looks like we&apos;ve got a few real problems to solve here. the most serious one, IMO, is to handle such a close on MDT in a better manner, so that umount doesn&apos;t get stuck indefinitely.&lt;/p&gt;</comment>
                            <comment id="302338" author="bzzz" created="Mon, 24 May 2021 15:15:18 +0000"  >&lt;p&gt;probably a bit better option is to re-use existing infrastructure to skip orphan removal:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
--- a/lustre/mdt/mdt_handler.c
+++ b/lustre/mdt/mdt_handler.c
@@ -6438,7 +6438,8 @@ &lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; mdt_export_cleanup(struct obd_export *exp)
                                rc = mdt_ctxt_add_dirty_flag(&amp;amp;env, info, mfd);
 
                        &lt;span class=&quot;code-comment&quot;&gt;/* Don&apos;t unlink orphan on failover umount, LU-184 */&lt;/span&gt;
-                       &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (exp-&amp;gt;exp_flags &amp;amp; OBD_OPT_FAILOVER) {
+                       &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (exp-&amp;gt;exp_flags &amp;amp; OBD_OPT_FAILOVER ||
+                               exp-&amp;gt;exp_obd-&amp;gt;obd_stopping) {
                                ma-&amp;gt;ma_valid = MA_FLAGS;
                                ma-&amp;gt;ma_attr_flags |= MDS_KEEP_ORPHAN;
                        }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="302415" author="bzzz" created="Tue, 25 May 2021 09:30:31 +0000"  >&lt;p&gt;I think we need something like above on its own as umount with just evicted client is a legal case.&lt;/p&gt;</comment>
                            <comment id="302458" author="gerrit" created="Tue, 25 May 2021 15:40:43 +0000"  >&lt;p&gt;Alex Zhuravlev (bzzz@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/43783&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/43783&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-14694&quot; title=&quot;racer timeouts with LU-10948&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-14694&quot;&gt;&lt;del&gt;LU-14694&lt;/del&gt;&lt;/a&gt; mdt: do not remove orphans at umount&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 9f9e98a5e94e021c68882db5ab3d03484048b810&lt;/p&gt;</comment>
                            <comment id="309735" author="gerrit" created="Tue, 10 Aug 2021 08:07:43 +0000"  >&lt;p&gt;&quot;Oleg Drokin &amp;lt;green@whamcloud.com&amp;gt;&quot; merged in patch &lt;a href=&quot;https://review.whamcloud.com/43783/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/43783/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-14694&quot; title=&quot;racer timeouts with LU-10948&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-14694&quot;&gt;&lt;del&gt;LU-14694&lt;/del&gt;&lt;/a&gt; mdt: do not remove orphans at umount&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 0b94a058fe6732907d7aa3f5de0cbd7888d28ce1&lt;/p&gt;</comment>
                            <comment id="309755" author="pjones" created="Tue, 10 Aug 2021 13:12:35 +0000"  >&lt;p&gt;Landed for 2.15&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="51963">LU-10948</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i01v1j:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>