<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:20:46 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-15725] Client side Mdtest File Read Regression introduced with fix for LU-11623</title>
                <link>https://jira.whamcloud.com/browse/LU-15725</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;While testing 2.15 and comparing it to our 2.12 branch, I observed a noticeable file read regression on the client side and after doing a git bisect, I narrowed it down to the patch &lt;a href=&quot;https://review.whamcloud.com/38763&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/38763&lt;/a&gt; &quot;&lt;tt&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11623&quot; title=&quot;Allow caching of open-created dentries&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11623&quot;&gt;LU-11623&lt;/a&gt; llite: hash just created files if lock allows&lt;/tt&gt;&quot;.&lt;/p&gt;

&lt;p&gt;After reverting the patch, my read performance was immediately restored but it was at the expense of the huge file stat boost.&lt;/p&gt;

&lt;p&gt;&lt;b&gt;File stats&lt;/b&gt;&lt;br/&gt;
Original (2.12 results): 399893&lt;br/&gt;
Before Revert (2.15): 683490 &#8212;&amp;gt; +73%&lt;br/&gt;
After Revert (2.15): 401637&lt;/p&gt;

&lt;p&gt;&lt;b&gt;File Reads&lt;/b&gt;&lt;br/&gt;
Original (2.12 results) 297644&lt;br/&gt;
Before Revert (2.15): 250536 &#8212;&amp;gt; -15%&lt;br/&gt;
After Revert (2.15): 295096&lt;/p&gt;

&lt;p&gt;mdtest script:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;#!/bin/bash


NODES=21
PPN=16
PROCS=$(( $NODES * $PPN ))
MDT_COUNT=1
PAUSED=120


# Unique directory #
srun -N $NODES --ntasks-per-node $PPN ~bloewe/benchmarks/ior-3.3.0-CentOS-8.2/install/bin/mdtest -v -i 5 -p $PAUSED -C -E -T -r -n $(( $MDT_COUNT * 1048576 / $PROCS )) -u -d /mnt/kjlmo13/pkoutoupis/mdt0/test.`date +&quot;%Y%m%d.%H%M%S&quot;` 2&amp;gt;&amp;amp;1 |&amp;amp; tee f_mdt0_0k_ost_uniq.out

srun -N $NODES --ntasks-per-node $PPN ~bloewe/benchmarks/ior-3.3.0-CentOS-8.2/install/bin/mdtest -v -i 5 -p $PAUSED -C -w 32768 -E -e 32768 -T -r -n $(( $MDT_COUNT * 1048576 / $PROCS )) -u -d /mnt/kjlmo13/pkoutoupis/mdt0/test.`date +&quot;%Y%m%d.%H%M%S&quot;` 2&amp;gt;&amp;amp;1 |&amp;amp; tee f_mdt0_32k_ost_uniq.out &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;</description>
                <environment></environment>
        <key id="69588">LU-15725</key>
            <summary>Client side Mdtest File Read Regression introduced with fix for LU-11623</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="laisiyao">Lai Siyao</assignee>
                                    <reporter username="koutoupis">Petros Koutoupis</reporter>
                        <labels>
                    </labels>
                <created>Wed, 6 Apr 2022 13:32:39 +0000</created>
                <updated>Tue, 10 May 2022 23:15:30 +0000</updated>
                                            <version>Lustre 2.13.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>7</watches>
                                                                            <comments>
                            <comment id="331262" author="adilger" created="Wed, 6 Apr 2022 22:47:25 +0000"  >&lt;p&gt;Petros, it would be useful if you edited your original description to indicate &quot;&lt;tt&gt;git describe&lt;/tt&gt;&quot; versions for the &quot;Original&quot; and &quot;Before Revert&quot; tests.  Is &quot;Original&quot; the commit before the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11623&quot; title=&quot;Allow caching of open-created dentries&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11623&quot;&gt;LU-11623&lt;/a&gt; patch, and &quot;After Revert&quot; on master with that patch reverted?  Or is &quot;Original&quot; the 2.12.x test results?&lt;/p&gt;</comment>
                            <comment id="331263" author="adilger" created="Wed, 6 Apr 2022 22:57:51 +0000"  >&lt;p&gt;The first thing to check is whether there is something that is not being done correctly in this case.  Unfortunately, the original patch did not show the &quot;File read&quot; results, or it might have been more visible if there was a regression.  In some cases, performance issues like this are caused by incorrectly conflicting/cancelling the lock on the client, and it &lt;em&gt;might&lt;/em&gt; be possible to &quot;have your lock and read it too&quot; by avoiding the extra cancellation(s) or efficiently handling the cancellation (if needed) with ELC (Early Lock Cancellation).&lt;/p&gt;

&lt;p&gt;In situations where there is no single &quot;good answer&quot; for whether the extra lock should be taken or not, it may be that there should be a weighted history of what is done to the file (e.g. similar to patch &lt;a href=&quot;https://review.whamcloud.com/46696&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/46696&lt;/a&gt; &quot;&lt;tt&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-15546&quot; title=&quot;Shared Directory File Creates regression seen in 2.15 when comparing to 2.12.6&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-15546&quot;&gt;&lt;del&gt;LU-15546&lt;/del&gt;&lt;/a&gt; mdt: keep history of mdt_reint_open() lock&lt;/tt&gt;&quot;) so that the performance can be dynamically optimized for the current workload (stat() vs. read() intensive, or &quot;don&apos;t grant the extra lock under heavy contention&quot;).  IMHO, this is preferable to any kind of static tunable that just enables/disables the extra locking, which will be sub-optimal at one time or another.&lt;/p&gt;</comment>
                            <comment id="331270" author="laisiyao" created="Thu, 7 Apr 2022 02:58:25 +0000"  >&lt;p&gt;Petros, what are the results of &quot;Directory stat&quot; before and after the revert?&lt;/p&gt;</comment>
                            <comment id="331278" author="green" created="Thu, 7 Apr 2022 06:20:07 +0000"  >&lt;p&gt;There was a follow-on patch &lt;a href=&quot;https://review.whamcloud.com/#/c/33585/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/33585/&lt;/a&gt; that was not landed for a variety of reasons, I wonder if it could be tried too.&lt;/p&gt;</comment>
                            <comment id="331314" author="koutoupis" created="Thu, 7 Apr 2022 13:56:10 +0000"  >&lt;p&gt;Andreas,&lt;/p&gt;

&lt;p&gt;I modified the description. I hope that clarifies things.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;Lai,&lt;/p&gt;

&lt;p&gt;Directory stats are unchanged in all cases.&lt;/p&gt;</comment>
                            <comment id="332758" author="laisiyao" created="Mon, 25 Apr 2022 01:40:57 +0000"  >&lt;p&gt;The last patch of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11623&quot; title=&quot;Allow caching of open-created dentries&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11623&quot;&gt;LU-11623&lt;/a&gt; &lt;a href=&quot;https://review.whamcloud.com/#/c/33585/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/33585/&lt;/a&gt; was updated, will you cherry-pick and try again?&lt;/p&gt;</comment>
                            <comment id="332899" author="koutoupis" created="Mon, 25 Apr 2022 20:53:16 +0000"  >&lt;p&gt;@Lai Siyao,&lt;/p&gt;

&lt;p&gt;I cherry picked the patch on top of 2.15.0 RC3 and reran the same tests. Unfortunately, the file read performance looks worse.&lt;/p&gt;

&lt;p&gt;2.15.0 RC3 &lt;b&gt;without&lt;/b&gt; the patch:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[ ... ]
   Operation                      Max            Min           Mean        Std Dev
   ---------                      ---            ---           ----        -------
   File stat                 :     710652.674     680830.320     695315.322      10282.708
   File read                 :     267242.290     211957.110     243331.807      20164.563
[ ... ]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;2.15.0 RC3 &lt;b&gt;with&lt;/b&gt; the patch:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[ ... ]
   Operation                      Max            Min           Mean        Std Dev
   ---------                      ---            ---           ----        -------
   File stat                 :     704615.924     665430.996     690638.517      13355.073
   File read                 :     255746.075     194060.211     226496.114      21414.336
[ ... ]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="333381" author="laisiyao" created="Fri, 29 Apr 2022 01:23:35 +0000"  >&lt;p&gt;Petros, &lt;a href=&quot;https://review.whamcloud.com/#/c/33585/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/33585/&lt;/a&gt; is updated, local test looks promising.&lt;/p&gt;</comment>
                            <comment id="334200" author="koutoupis" created="Mon, 9 May 2022 19:56:19 +0000"  >&lt;p&gt;With updated patch cherry-picked on top of 2.15:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;&#160;&#160; File stat &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; : &#160; &#160; 703505.087 &#160; &#160; 689172.890 &#160; &#160; 696705.795 &#160; &#160; &#160; 4933.824
&#160;&#160; File read &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; : &#160; &#160; 270560.870 &#160; &#160; 217336.834 &#160; &#160; 248256.171&#160; &#160; &#160; 17416.326 &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;There does not seem to be much difference with 2.15.0 RC3&#160;&lt;b&gt;without&lt;/b&gt; the patch. Please refer to my mdtest script above for testing parameters. Thank you for working on this.&lt;/p&gt;</comment>
                            <comment id="334256" author="laisiyao" created="Tue, 10 May 2022 12:53:43 +0000"  >&lt;p&gt;I did more testing; it looks like when the total number of files is too large, the client can&apos;t cache all the locks, so the cached locks won&apos;t help. I&apos;ll see how to improve this.&lt;/p&gt;</comment>
                            <comment id="334343" author="adilger" created="Tue, 10 May 2022 23:15:30 +0000"  >&lt;p&gt;Petros, Lai,&lt;br/&gt;
has there been any kind of analysis done as to where the read performance is being lost with/without the open lock?  Is there an increase of DLM locks/cancellations (MDT or OST), extra RPCs being sent, overhead in the VFS, delay in cancelling the DLM lock that increases latency on the mdtest read operation, other?&lt;/p&gt;

&lt;p&gt;Collecting a flame graph during the test on the client and server with/without the open cache would definitely help isolate where the time is being spent.   Initially I thought it might relate to the delay in cancelling the open lock when a second client node is reading the file, and that hurts read performance (either because of the extra lock cancel, or possibly delayed flushing due to write cache).  However, there is a 120s sleep between phases, and I didn&apos;t see the &quot;&lt;tt&gt;mdtest -N stride&lt;/tt&gt;&quot; option being used to force file access from a different node, so reads should be local to the node that wrote the file.&lt;/p&gt;

&lt;p&gt;There are only about 50k files and 1.6GB of data being created on each client, so this shouldn&apos;t exceed the client cache size, and reads should be &quot;free&quot; in this case.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="53926">LU-11623</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i02mlr:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>