<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:19:15 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-15546] Shared Directory File Creates regression seen in 2.15 when comparing to 2.12.6</title>
                <link>https://jira.whamcloud.com/browse/LU-15546</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;When testing mdtest 2.15 (2.14.57) and comparing to 2.12.6, I see a large 25% regression with Shared Directory File Creates. Perf traces (attached) show a lot of extra ldlm overhead.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;#!/bin/bash

NODES=21
PPN=16
PROCS=$(( $NODES * $PPN ))
MDT_COUNT=1
PAUSED=120

srun -N $NODES --ntasks-per-node $PPN ~bloewe/benchmarks/ior-3.3.0-CentOS-8.2/install/bin/mdtest -v -i 5 -p $PAUSED -C -E -T -r -n $(( $MDT_COUNT * 1048576 / $PROCS )) -d /mnt/kjlmo2/pkoutoupis/mdt0/test.`date +&quot;%Y%m%d.%H%M%S&quot;` 2&amp;gt;&amp;amp;1 |&amp;amp; tee f_mdt0_0k_ost_shared.out

srun -N $NODES --ntasks-per-node $PPN ~bloewe/benchmarks/ior-3.3.0-CentOS-8.2/install/bin/mdtest -v -i 5 -p $PAUSED -C -w 32768 -E -e 32768 -T -r -n $(( $MDT_COUNT * 1048576 / $PROCS )) -d /mnt/kjlmo2/pkoutoupis/mdt0/test.`date +&quot;%Y%m%d.%H%M%S&quot;` 2&amp;gt;&amp;amp;1 |&amp;amp; tee f_mdt0_32k_ost_shared.out
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="68618">LU-15546</key>
            <summary>Shared Directory File Creates regression seen in 2.15 when comparing to 2.12.6</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="eaujames">Etienne Aujames</assignee>
                                    <reporter username="koutoupis">Petros Koutoupis</reporter>
                        <labels>
                    </labels>
                <created>Thu, 10 Feb 2022 17:05:32 +0000</created>
                <updated>Fri, 1 Jul 2022 04:08:23 +0000</updated>
                            <resolved>Fri, 18 Mar 2022 19:06:07 +0000</resolved>
                                    <version>Lustre 2.15.0</version>
                                    <fixVersion>Lustre 2.15.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>10</watches>
                                                                            <comments>
                            <comment id="325891" author="paf0186" created="Thu, 10 Feb 2022 17:16:46 +0000"  >&lt;p&gt;Petros,&lt;/p&gt;

&lt;p&gt;You&apos;ve provided a bunch of different perf traces, but it looks like there&apos;s a flame graph summary for only 2.15 and not for 2.12, so I can&apos;t compare them?&lt;/p&gt;

&lt;p&gt;Can you be more specific about which perf traces show LDLM overhead, exactly how they&apos;re showing it, and where they were gathered from?&#160; It looks like based on the node name these were gathered on an MDT?&lt;/p&gt;

&lt;p&gt;For what it&apos;s worth, the on CPU perf traces here can wave in the general direction, but unless we&apos;re CPU bound, they&apos;re not going to tell us much.&lt;/p&gt;

&lt;p&gt;Generally a DNE issue like this is related to LDLM locking behavior, but that shows up mostly in sleeping time as nodes wait for locks to be passed around, not on-CPU time.&#160; Most likely we&apos;re going to need comparative ldlm stats and, ideally, dlmtrace logs from the server or clients.&lt;/p&gt;</comment>
                            <comment id="326191" author="koutoupis" created="Mon, 14 Feb 2022 15:01:49 +0000"  >&lt;p&gt;Let me delete the one tarball as it is not really needed and may add confusion (LUS-10749-perf-results.tar.gz). The remaining tarball LUS-10749-perf-traces.tar.gz shows a comparison between a 2.12.6 baseline which performs fairly well in our environment and a 2.15 build which does not. In the 2.15 flamegraphs I see a significant amount of ldlm_reprocess_all occurring which is not reflected in the 2.12.6 traces. And yes, this is a trace on the MDT I am writing to. We are also not pegging the CPU.&lt;/p&gt;</comment>
                            <comment id="326203" author="paf0186" created="Mon, 14 Feb 2022 15:34:47 +0000"  >&lt;p&gt;OK, that makes sense, and I see we have both svgs now.&lt;/p&gt;

&lt;p&gt;So the greater LDLM activity suggests worse locking behavior, eg, a conflict where one did not exist before.&lt;/p&gt;

&lt;p&gt;It might be possible to guess why that&apos;s occurring from the greater LDLM thread activity and looking at patches, but it&apos;s a huge gap in changes (~2 full releases).&#160; So what&apos;s really needed is at least ldlm stats (ldlm_cbd_stats I believe it&apos;s called) or, ideally, a snippet of debug logs from a client in both cases.&#160; (A client is better than a server because hopefully we can see which operations are leading to conflicts.)&lt;/p&gt;</comment>
                            <comment id="326234" author="koutoupis" created="Mon, 14 Feb 2022 16:50:03 +0000"  >&lt;p&gt;Patrick, You are correct, it is a very large gap which is quite overwhelming. I will work on grabbing client traces and gather the requested debug data shortly. Thank you.&lt;/p&gt;</comment>
                            <comment id="326265" author="paf0186" created="Mon, 14 Feb 2022 19:17:50 +0000"  >&lt;p&gt;Note on a mistake in my previous - ldlm_cbd_stats are client side stats, I don&apos;t have the server side stats name handy.&lt;/p&gt;</comment>
                            <comment id="326654" author="pjones" created="Thu, 17 Feb 2022 18:57:20 +0000"  >&lt;p&gt;From a community release point of view, shouldn&apos;t the comparison be between 2.14 and 2.15 (with no patches applied)? If a drop is seen then can you do a git bisect to identify which change(s) has introduced the performance regression?&lt;/p&gt;</comment>
                            <comment id="327469" author="koutoupis" created="Fri, 25 Feb 2022 21:00:45 +0000"  >&lt;p&gt;Uploaded&#160;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-15546&quot; title=&quot;Shared Directory File Creates regression seen in 2.15 when comparing to 2.12.6&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-15546&quot;&gt;&lt;del&gt;LU-15546&lt;/del&gt;&lt;/a&gt;&amp;#45;reverted&amp;#45;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10262&quot; title=&quot;Lock contention when doing creates for the same name&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10262&quot;&gt;&lt;del&gt;LU-10262&lt;/del&gt;&lt;/a&gt;-performance.tar.gz which contains the shared directory results of our 2.12, 2.15 and again, 2.15 with &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10262&quot; title=&quot;Lock contention when doing creates for the same name&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10262&quot;&gt;&lt;del&gt;LU-10262&lt;/del&gt;&lt;/a&gt; reverted. A bisect of the master branch revealed the patch for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10262&quot; title=&quot;Lock contention when doing creates for the same name&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10262&quot;&gt;&lt;del&gt;LU-10262&lt;/del&gt;&lt;/a&gt; being the culprit and once the patch was reverted, the performance for shared directory file creates was restored and more in line with what we were seeing in our 2.12.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;commit 33dc40d58ef6eb8b384fce1da9f8d21cad4ef6d8
Author: Dominique Martinet &amp;lt;dominique.martinet@cea.fr&amp;gt;
Date:   Fri Aug 31 18:03:36 2018 +0900

    LU-10262 mdt: mdt_reint_open: check EEXIST without lock
    
    Many applications blindly open files with O_CREAT, and the mds gets a
    write lock to the parent directory for these even if the file already
    exists.
    Checking for file existence first lets us take a PR lock if file
    already existed even if O_CREAT was specified.
    
    This opens up multiple races between the first lookup and the actual
    locking, in each of them drop the resources we aquired and retry from
    scratch to keep things as far from complicated as possible, with mixed
    success.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="327474" author="spitzcor" created="Fri, 25 Feb 2022 21:20:38 +0000"  >&lt;blockquote&gt;&lt;p&gt;From a community release point of view, shouldn&apos;t the comparison be between 2.14 and 2.15 (with no patches applied)? &lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;Yes, of course. I think that assumes that 2.14.0 was deemed &apos;good&apos;, and perhaps it was, but with &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10262&quot; title=&quot;Lock contention when doing creates for the same name&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10262&quot;&gt;&lt;del&gt;LU-10262&lt;/del&gt;&lt;/a&gt; identified as the culprit, we now know that 2.14.0 may not have been as good as thought.  No matter, what&apos;s important now is to understand how to move forward.&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=vitaly_fertman&quot; class=&quot;user-hover&quot; rel=&quot;vitaly_fertman&quot;&gt;vitaly_fertman&lt;/a&gt; has indicated that it would be best to revert &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10262&quot; title=&quot;Lock contention when doing creates for the same name&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10262&quot;&gt;&lt;del&gt;LU-10262&lt;/del&gt;&lt;/a&gt; because reverting it would not reintroduce any defect.  &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10262&quot; title=&quot;Lock contention when doing creates for the same name&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10262&quot;&gt;&lt;del&gt;LU-10262&lt;/del&gt;&lt;/a&gt; was inserted as an &apos;improvement&apos;.  In fact, there appears to be plenty of performance test results provided as landing collateral that indicated that it was &apos;good&apos;.  I&apos;d like to try and understand how to reconcile that info with the shared-dir mdtest results that demonstrate a regression with the patch.  I know that &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10262&quot; title=&quot;Lock contention when doing creates for the same name&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10262&quot;&gt;&lt;del&gt;LU-10262&lt;/del&gt;&lt;/a&gt; was made as an optimization targeting FORTRAN applications, but perhaps it was made at the expense of the general case?  Is this a simple tradeoff or is there more to it?&lt;/p&gt;</comment>
                            <comment id="327479" author="pjones" created="Fri, 25 Feb 2022 22:10:07 +0000"  >&lt;p&gt;So it looks like this patch was in 2.14 and also the last several 2.12.x releases. Perhaps, as with the performance issue we found late in the 2.14 release cycle, this performance issue is more apparent on some systems than others....&lt;/p&gt;</comment>
                            <comment id="327488" author="pjones" created="Fri, 25 Feb 2022 22:12:39 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=eaujames&quot; class=&quot;user-hover&quot; rel=&quot;eaujames&quot;&gt;eaujames&lt;/a&gt; &lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=asmadeus&quot; class=&quot;user-hover&quot; rel=&quot;asmadeus&quot;&gt;asmadeus&lt;/a&gt; any thoughts on this reported issue?&lt;/p&gt;</comment>
                            <comment id="327497" author="asmadeus" created="Fri, 25 Feb 2022 22:48:23 +0000"  >&lt;p&gt;The original rationale for this patch was indeed something we saw on FORTRAN code, but any sloppy program opening files with O_CREAT on all ranks with a large number of ranks would trigger this &#8211; basically we are doing a lock storm without this patch where each client has to revoke the lock of other clients for each open, and we&apos;ve pushed hard for the patch because on very large jobs (12-32k ranks) we&apos;ve seen MDS hangs from it. Had it been just sloppy programs we&apos;d fix the programs, but FORTRAN open always pass O_CREAT so there is no fix (we&apos;ve also temporarily had a LD_PRELOAD lib that traps opens with O_CREAT, does access() and strips the flags if file exists but that&apos;s just ugly...)&lt;/p&gt;

&lt;p&gt;afair it&apos;s been known from day 1 that patterns like mdtest (each rank creating new files in the directory) would be a slowdown, while cases where a shared file is open get immensely faster (the O_CREAT+precreate line of Etienne&apos;s benchmark in the other LU), and that also fixed the MDS hang we had.&lt;/p&gt;

&lt;p&gt;The LDLM overhead is not so much a conflict as that we&apos;re taking the lock one more time, iirc. gerrit doesn&apos;t load for me right now so I didn&apos;t re-read the patch, but we first take the lock for read, check file existence then upgrade it to write if it didn&apos;t exist &#8211; that will be slower in the real create case. iirc the final patch for file creation vs. directory creation was different and directory might have been checking without taking the read lock, in which case the impact would be lower? Perhaps a similar optimization could be done for file creation... we want to take care of catching MPI-style &quot;wall of open(O_CREAT) on same file&quot; so I was thinking locking is required but I recall a comment on the later patch saying that this check is on MDT so it&apos;s actually not required.&lt;/p&gt;</comment>
                            <comment id="327729" author="eaujames" created="Tue, 1 Mar 2022 10:00:29 +0000"  >&lt;p&gt;The  &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10262&quot; title=&quot;Lock contention when doing creates for the same name&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10262&quot;&gt;&lt;del&gt;LU-10262&lt;/del&gt;&lt;/a&gt; test results are not from mdtests, they represent a common production case where all the nodes of a job attempt to create the same files at the same time with the same parent directory (&quot;wall of open(O_CREAT) on same file&quot;). In that case without the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10262&quot; title=&quot;Lock contention when doing creates for the same name&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10262&quot;&gt;&lt;del&gt;LU-10262&lt;/del&gt;&lt;/a&gt;, for each O_CREAT request the MDT will take a LCK_PW on the parent (blocking further operation on the directory) while the first request creates the file.&lt;/p&gt;

&lt;p&gt;But yes, the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10262&quot; title=&quot;Lock contention when doing creates for the same name&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10262&quot;&gt;&lt;del&gt;LU-10262&lt;/del&gt;&lt;/a&gt; degrades performance for the case of pure file creations: for each file to create we take a LCK_PR on the parent lock to check if the child exists and then a LCK_PW.&lt;/p&gt;

&lt;p&gt;To prevent this overhead, we could determine the lock_mode with a lookup without DLM lock and then we have to redo the lookup with the lock. If the file has been removed in the meantime, we have to re-lock the parent with PW but hopefully this will be rare.&lt;/p&gt;

&lt;p&gt;What do you think about this?&lt;/p&gt;</comment>
                            <comment id="327904" author="gerrit" created="Wed, 2 Mar 2022 18:54:19 +0000"  >&lt;p&gt;&quot;Etienne AUJAMES &amp;lt;eaujames@ddn.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/46679&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/46679&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-15546&quot; title=&quot;Shared Directory File Creates regression seen in 2.15 when comparing to 2.12.6&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-15546&quot;&gt;&lt;del&gt;LU-15546&lt;/del&gt;&lt;/a&gt; mdt: mdt_reint_open lookup before locking&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 2df91b9c5aa46c5e3cfbe7206f7e2644e304f81d&lt;/p&gt;</comment>
                            <comment id="327940" author="adilger" created="Thu, 3 Mar 2022 04:37:11 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=koutoupis&quot; class=&quot;user-hover&quot; rel=&quot;koutoupis&quot;&gt;koutoupis&lt;/a&gt;, would you be able to test this patch in your testbed to see if it resolves the performance problem?&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=eaujames&quot; class=&quot;user-hover&quot; rel=&quot;eaujames&quot;&gt;eaujames&lt;/a&gt;, thanks for the patch.  One further option that might avoid the extra lookup is to track a history of recent open modes (e.g. most recent 256 opens), and then use that to decide whether to do the pre-lookup or not.&lt;/p&gt;

&lt;p&gt;For workloads like mdtest that never try to recreate existing files the pre-lookup is purely overhead, so defaulting to PW after a number of such opens would avoid this.  For many FORTRAN threads, repeatedly opening an existing file with &lt;tt&gt;O_CREAT&lt;/tt&gt; this would default over time to PR.  A mixed workload would always do the pre-lookup, which is still a win compared to getting the wrong DLM lock.  In the rare case that the decision is wrong (e.g. workload change), it can retry the same as if the lookup was racy.  Something like:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
struct mdt_device {
         unsigned &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; mdt_open_lock_history;
};
#define MDT_OPEN_LOCK_COUNT 256
&lt;span class=&quot;code-comment&quot;&gt;/* bias toward LCK_PW since it does not need a full retry loop */&lt;/span&gt;
#define MDT_OPEN_LOCK_THRESH_PW LCK_PW * (MDT_OPEN_LOCK_COUNT - 32)
#define MDT_OPEN_LOCK_THRESH_PR LCK_PR * (MDT_OPEN_LOCK_COUNT + 8)

&lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; mdt_init0(&lt;span class=&quot;code-keyword&quot;&gt;const&lt;/span&gt; struct lu_env *env, struct mdt_device *m,
                     struct lu_device_type *ldt, struct lustre_cfg *cfg)
{
        mdt-&amp;gt;mdt_open_history  = (LCK_PR + LCK_PW) * MDT_OPEN_LOCK_COUNT / 2;
}

void mdt_open_lock_history_add(struct mdt_device *mdt, &lt;span class=&quot;code-keyword&quot;&gt;enum&lt;/span&gt; ldlm_mode mode)
{
        mdt-&amp;gt;mdt_open_history = (mdt-&amp;gt;mdt_open_history * (MDT_OPEN_LOCK_COUNT - 1) + mode) /
                MDT_OPEN_LOCK_COUNT;
}

&lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; inline &lt;span class=&quot;code-keyword&quot;&gt;enum&lt;/span&gt; ldlm_mode mdt_open_lock_mode(struct mdt_thread_info *info,
						struct mdt_object *p,
						struct lu_name *name,
						u64 open_flags)
{
	struct lu_fid fid;
	&lt;span class=&quot;code-keyword&quot;&gt;enum&lt;/span&gt; ldlm_lock_mode mode;

	&lt;span class=&quot;code-comment&quot;&gt;/* We don&apos;t need to take the DLM lock &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; a &lt;span class=&quot;code-keyword&quot;&gt;volatile&lt;/span&gt; */&lt;/span&gt;
	&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (open_flags &amp;amp; MDS_OPEN_VOLATILE)
		&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; LCK_NL;

        &lt;span class=&quot;code-comment&quot;&gt;/* &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; most recent opens used the same mode, assume next one will also */&lt;/span&gt;
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (info-&amp;gt;mti_mdt-&amp;gt;mdt_open_history &amp;gt; MDT_OPEN_LOCK_THRESH_PW)
               &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt;  LCK_PW;
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (info-&amp;gt;mti_mdt-&amp;gt;mdt_open_history &amp;gt; MDT_OPEN_LOCK_THRESH_PR)
               &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt;  LCK_PR;

	&lt;span class=&quot;code-comment&quot;&gt;/* If the file exists we only need a read lock on the parent */&lt;/span&gt;
	mode = mdo_lookup(info-&amp;gt;mti_env, mdt_object_child(p), name, &amp;amp;fid,
			&amp;amp;info-&amp;gt;mti_spec) == 0 ? LCK_PR : LCK_PW;

        mdt_open_lock_history_add(info-&amp;gt;mti_mdt, mode);

	&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; mode;
}

&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; mdt_reint_open(struct mdt_thread_info *info, struct mdt_lock_handle *lhc)
{
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (result == -ENOENT) {
                &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (lh != NULL &amp;amp;&amp;amp; lock_mode == LCK_PR) {
                        &lt;span class=&quot;code-comment&quot;&gt;/* first pass: get write lock and restart */&lt;/span&gt;
                        mdt_object_unlock(info, parent, lh, 1);
                        mdt_clear_disposition(info, ldlm_rep, DISP_LOOKUP_NEG);
                        mdt_lock_handle_init(lh);
                        lock_mode = LCK_PW;
                        mdt_open_lock_history_add(info-&amp;gt;mti_mdt, lock_mode);
                        &lt;span class=&quot;code-keyword&quot;&gt;goto&lt;/span&gt; again_pw;
                }
        }
        :
                &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (created) {
                        :
                } &lt;span class=&quot;code-keyword&quot;&gt;else&lt;/span&gt; &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (lock_mode == LCK_PW) {
                        &lt;span class=&quot;code-comment&quot;&gt;/* LCK_PW was not needed, child already exists */&lt;/span&gt;
                        mdt_open_lock_history_add(info-&amp;gt;mti_mdt, lock_mode);
                }
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="327974" author="koutoupis" created="Thu, 3 Mar 2022 14:34:30 +0000"  >&lt;p&gt;@Andreas Dilger. Of course. I can do so over the next few days.&lt;/p&gt;</comment>
                            <comment id="328045" author="gerrit" created="Fri, 4 Mar 2022 06:49:08 +0000"  >&lt;p&gt;&quot;Andreas Dilger &amp;lt;adilger@whamcloud.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/46696&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/46696&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-15546&quot; title=&quot;Shared Directory File Creates regression seen in 2.15 when comparing to 2.12.6&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-15546&quot;&gt;&lt;del&gt;LU-15546&lt;/del&gt;&lt;/a&gt; mdt: keep history of mdt_reint_open() lock&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 47cf66750b44a34be3543e0f3e629cbf5103f8da&lt;/p&gt;</comment>
                            <comment id="328125" author="koutoupis" created="Fri, 4 Mar 2022 18:32:16 +0000"  >&lt;p&gt;I tested (mdtest shared directory file creates) change &lt;a href=&quot;https://review.whamcloud.com/46679&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;46679&lt;/a&gt; against my original 2.15 baseline and it is about a 20-22% improvement but still a 5% regression from a revert of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10262&quot; title=&quot;Lock contention when doing creates for the same name&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10262&quot;&gt;&lt;del&gt;LU-10262&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="328186" author="adilger" created="Sat, 5 Mar 2022 02:34:21 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=koutoupis&quot; class=&quot;user-hover&quot; rel=&quot;koutoupis&quot;&gt;koutoupis&lt;/a&gt; feel free to play with my patch  &lt;a href=&quot;https://review.whamcloud.com/46696&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/46696&lt;/a&gt; - it isn&apos;t quite ready, just something I whipped together and have not tested, but close to something that would work.&lt;/p&gt;</comment>
                            <comment id="328388" author="gerrit" created="Tue, 8 Mar 2022 12:14:27 +0000"  >&lt;p&gt;&quot;Dominique Martinet &amp;lt;qhufhnrynczannqp.f@noclue.notk.org&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/46738&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/46738&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-15546&quot; title=&quot;Shared Directory File Creates regression seen in 2.15 when comparing to 2.12.6&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-15546&quot;&gt;&lt;del&gt;LU-15546&lt;/del&gt;&lt;/a&gt; mdt: optimistically trust non-locked lookup&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 55162c4d0fc798b5070d08a670f1e36b575672c6&lt;/p&gt;</comment>
                            <comment id="329407" author="koutoupis" created="Wed, 16 Mar 2022 20:15:23 +0000"  >&lt;p&gt;I tested change &lt;a href=&quot;https://review.whamcloud.com/46696&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;46696&lt;/a&gt; and it performs a bit better than just 46679 (on shared directory file creates). It is within 5% of my original baseline. Actually, around 3.5-4% less which I consider great.&lt;/p&gt;</comment>
                            <comment id="329410" author="adilger" created="Wed, 16 Mar 2022 20:53:51 +0000"  >&lt;p&gt;Petros, did you make fixes to 46696 before testing it?  There are a couple of bugs in the current version of the patch (though clearly described there). They could be fixed relatively easily, but I have no fast system to benchmark it myself.  If you have made fixes, please push a new version, otherwise I will take another crack at it. &lt;/p&gt;</comment>
                            <comment id="329411" author="koutoupis" created="Wed, 16 Mar 2022 21:07:46 +0000"  >&lt;p&gt;Andreas, I have not made the fixes. I see the bugs that you are referring to now.&lt;/p&gt;</comment>
                            <comment id="329633" author="gerrit" created="Fri, 18 Mar 2022 17:34:24 +0000"  >&lt;p&gt;&quot;Oleg Drokin &amp;lt;green@whamcloud.com&amp;gt;&quot; merged in patch &lt;a href=&quot;https://review.whamcloud.com/46679/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/46679/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-15546&quot; title=&quot;Shared Directory File Creates regression seen in 2.15 when comparing to 2.12.6&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-15546&quot;&gt;&lt;del&gt;LU-15546&lt;/del&gt;&lt;/a&gt; mdt: mdt_reint_open lookup before locking&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: f14090e56c9d94e3cfaa6f13f357173d6d570547&lt;/p&gt;</comment>
                            <comment id="329646" author="pjones" created="Fri, 18 Mar 2022 19:05:22 +0000"  >&lt;p&gt;As per the discussion on the LWG call, Etienne&apos;s patch (which just landed) is what we will address for 2.15. I suggest that the other patches in flight get moved to a new JIRA for possible inclusion in a future release&lt;/p&gt;</comment>
                            <comment id="329670" author="adilger" created="Sat, 19 Mar 2022 17:16:57 +0000"  >&lt;p&gt;Petros and Shuichi, the latest version of my patch: &lt;a href=&quot;https://review.whamcloud.com/46696&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/46696&lt;/a&gt; &quot;&lt;tt&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-15546&quot; title=&quot;Shared Directory File Creates regression seen in 2.15 when comparing to 2.12.6&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-15546&quot;&gt;&lt;del&gt;LU-15546&lt;/del&gt;&lt;/a&gt; mdt: keep history of mdt_reint_open() lock&lt;/tt&gt;&quot; looks like it is working properly in my local testing, but needs some benchmarking on real hardware to see whether it provides a performance improvement. &lt;/p&gt;

&lt;p&gt;The patch has been updated to have a per-directory history counter. In my local testing it takes about 128 open+creates (with pre-lookup, like Etienne&apos;s just-landed patch) before it gets &quot;into the zone&quot; and speculatively skips the lookup to predict the PW lock mode and skip the pre-lookup. It takes about 16 &quot;bad&quot; lookups in the same directory before it reverts to doing the pre-lookup again, and 256 open-existing before it swings to the opposite end to predict PR locks and skip the pre-lookup. &lt;/p&gt;

&lt;p&gt;Mixed workloads within a single directory will be essentially the same as the current code, so it will always do a pre-lookup in the directory if the open mode doesn&apos;t give enough info. &lt;/p&gt;</comment>
                            <comment id="336321" author="gerrit" created="Mon, 30 May 2022 17:26:16 +0000"  >&lt;p&gt;&quot;Etienne AUJAMES &amp;lt;eaujames@ddn.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/47487&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/47487&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-15546&quot; title=&quot;Shared Directory File Creates regression seen in 2.15 when comparing to 2.12.6&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-15546&quot;&gt;&lt;del&gt;LU-15546&lt;/del&gt;&lt;/a&gt; mdt: mdt_reint_open lookup before locking&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: cfb5c55e9550a04aa22a5849ac8e86a2dc36eada&lt;/p&gt;</comment>
                            <comment id="338964" author="adilger" created="Tue, 28 Jun 2022 06:06:10 +0000"  >&lt;p&gt;Shuichi tested my patch but didn&apos;t find it changed the performance significantly:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;I tested the patch and compared it against lustre-2.15.0-RC3.&lt;/p&gt;
&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;1 x MDS(1xMDT, 12 CPU cores, 142GB RAM)&lt;/li&gt;
	&lt;li&gt;4 x OSS(2xOST/OSS)&lt;/li&gt;
	&lt;li&gt;40 x client(16 CPU cores, 96GB RAM)&lt;/li&gt;
	&lt;li&gt;IB-HDR100 network&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;The workload is many processes (640 processes) writing a huge number of files (19.2M files) into a single shared directory.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;mpirun -np 640 mdtest -n 30000 -F -v -d /exafs/d0/d1/d2/mdtest.out -C -r -p 30 -i 3
lustre-2.15.0-RC3
SUMMARY rate: (of 3 iterations)
   Operation                     Max            Min           Mean        Std Dev
   ---------                     ---            ---           ----        -------
   File creation               64713.164      53431.547      60835.215       6414.183
   File stat                       0.000          0.000          0.000          0.000
   File read                       0.000          0.000          0.000          0.000
   File removal                46277.792      44080.164      45406.512       1167.351
   Tree creation                4629.475       3495.253       4131.971        579.784
   Tree removal                    2.302          2.019          2.137          0.147

lustre-2.15.0-RC3 + patch46696
SUMMARY rate: (of 3 iterations)
   Operation                     Max            Min           Mean        Std Dev
   ---------                     ---            ---           ----        -------
   File creation               67544.538      52056.964      61429.964       8241.920
   File stat                       0.000          0.000          0.000          0.000
   File read                       0.000          0.000          0.000          0.000
   File removal                45532.402      41724.110      43966.753       1992.363
   Tree creation                4132.319       3472.106       3721.652        358.386
   Tree removal                    2.251          1.837          2.030          0.209
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;In my test environment, I didn&apos;t see huge improvements from the patch (it might be limited by the MDS&apos;s CPU resources), but I didn&apos;t find any regressions either.&lt;/p&gt;&lt;/blockquote&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="49395">LU-10262</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="69261">LU-15692</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="69578">LU-15720</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="42499" name="LU-15546-reverted-LU-10262-performance.tar.gz" size="4691" author="koutoupis" created="Fri, 25 Feb 2022 20:53:06 +0000"/>
                            <attachment id="42311" name="LUS-10749-perf-traces.tar.gz" size="1979792" author="koutoupis" created="Thu, 10 Feb 2022 17:04:58 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i02i1r:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>