<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:20:29 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-15692] performance regressions for files in stripe directory</title>
                <link>https://jira.whamcloud.com/browse/LU-15692</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Performance regressions in a striped directory on 2.15.0 (commit:4d93fd7) were found against b2_14 (commit:d4b9557).&lt;br/&gt;
Here is the configuration.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;4 x MDS (1 x MDT per MDS)
4 x OSS (2 x OST per OSS)
40 x client

[root@ec01 ~]# mkdir -p /exafs/d0/d1/d2/mdt_stripe/
[root@ec01 ~]# lfs setdirstripe -c 4 -D /exafs/d0/d1/d2/mdt_stripe/
[root@ec01 ~]# salloc -p 40n -N 40 --ntasks-per-node=16 mpirun --allow-run-as-root -oversubscribe -mca btl_openib_if_include mlx5_1:1 -x UCX_NET_DEVICES=mlx5_1:1 /work/tools/bin/mdtest -n 2000 -F -i 3 -p 10 -v -d /exafs/d0/d1/d2/mdt_stripe/
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Here are the test results.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;server: version=2.15.0_RC2_22_g4d93fd7
client: version=2.15.0_RC2_22_g4d93fd7

SUMMARY rate: (of 3 iterations)
   Operation                     Max            Min           Mean        Std Dev
   ---------                     ---            ---           ----        -------
   File creation              103733.203      76276.410      93728.713      15168.101
   File stat                  693152.731     656461.448     671671.960      19132.425
   File read                  259081.462     247951.008     253393.168       5569.308
   File removal               145137.390     142142.699     143590.068       1499.846
   Tree creation                  48.035          1.922         17.475         26.467
   Tree removal                   35.643         15.861         24.045         10.323
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;server: version=2.14.0_21_gd4b9557
client: version=2.14.0_21_gd4b9557

SUMMARY rate: (of 3 iterations)
   Operation                     Max            Min           Mean        Std Dev
   ---------                     ---            ---           ----        -------
   File creation              138939.425      81336.388     117014.695      31167.261
   File stat                 1678888.952    1580356.340    1645190.276      56162.463
   File read                  569731.788     528830.155     546121.363      21170.387
   File removal               191837.291     186597.900     188595.661       2832.527
   Tree creation                 120.108          0.986         51.078         61.778
   Tree removal                   40.863         33.203         37.987          4.171
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;As far as I observed, this seems to be a server-side regression, because performance with lustre-2.15 clients + lustre-2.14 servers was OK, as shown below.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;server: version=2.14.0_21_gd4b9557
client: version=2.15.0_RC2_22_g4d93fd7

SUMMARY rate: (of 3 iterations)
   Operation                     Max            Min           Mean        Std Dev
   ---------                     ---            ---           ----        -------
   File creation              132009.360      74074.615     106514.108      29585.056
   File stat                 1570754.679    1457120.401    1532703.082      65457.038
   File read                  563710.286     540228.432     553871.772      12194.544
   File removal               189557.092     186065.253     187536.946       1809.374
   Tree creation                  54.678          1.883         19.576         30.399
   Tree removal                   42.065         41.677         41.875          0.194
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I am running &apos;git bisect&apos; and hope to find the commit where the regression started soon.&lt;/p&gt;</description>
                <environment></environment>
        <key id="69261">LU-15692</key>
            <summary>performance regressions for files in stripe directory</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="laisiyao">Lai Siyao</assignee>
                                    <reporter username="sihara">Shuichi Ihara</reporter>
                        <labels>
                    </labels>
                <created>Fri, 25 Mar 2022 00:21:55 +0000</created>
                <updated>Wed, 14 Sep 2022 06:24:15 +0000</updated>
                            <resolved>Sat, 2 Apr 2022 15:05:49 +0000</resolved>
                                    <version>Lustre 2.15.0</version>
                                    <fixVersion>Lustre 2.15.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>8</watches>
                                                                            <comments>
                            <comment id="330176" author="adilger" created="Fri, 25 Mar 2022 01:08:43 +0000"  >&lt;p&gt;Tentatively adding 2.15.0 fix version for tracking until we understand this better.&lt;/p&gt;</comment>
                            <comment id="330178" author="sihara" created="Fri, 25 Mar 2022 02:10:16 +0000"  >&lt;p&gt;It seems that the following patch is where the regression started.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;    LU-14459 lmv: change default hash type to crush
    
    Change the default hash type to CRUSH to minimize the number
    of directory entries that need to be migrated.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;server: version=2.14.51_197_gf269497
client: version=2.15.0_RC2_22_g4d93fd7

SUMMARY rate: (of 3 iterations)
   Operation                     Max            Min           Mean        Std Dev
   ---------                     ---            ---           ----        -------
   File creation              148072.690      87600.145     127000.919      34149.618
   File stat                 1523849.471    1388808.972    1441253.182      72393.681
   File read                  562840.721     505515.837     538333.864      29552.364
   File removal               197259.873     191117.823     194934.244       3331.372
   Tree creation                 111.869          1.707         39.426         62.755
   Tree removal                   44.113         30.518         36.562          6.922
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;server: version=2.14.51_198_gbb60caa
client: version=2.15.0_RC2_22_g4d93fd7

SUMMARY rate: (of 3 iterations)
   Operation                     Max            Min           Mean        Std Dev
   ---------                     ---            ---           ----        -------
   File creation               86531.781      63506.794      72790.003      12142.761
   File stat                  808075.643     746570.771     784071.104      32898.551
   File read                  260064.500     249212.881     256291.924       6135.058
   File removal               159592.539     155603.788     157752.556       2012.224
   Tree creation                 120.060          1.138         41.069         68.410
   Tree removal                   37.780         37.263         37.450          0.287
V-1: Entering PrintTimestamp...
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;The Lustre client version didn&apos;t change; only the server version changed. Does it make sense that patch &lt;a href=&quot;https://review.whamcloud.com/#/c/43684/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/43684/&lt;/a&gt; has an impact on the server as well?&lt;/p&gt;</comment>
                            <comment id="330183" author="adilger" created="Fri, 25 Mar 2022 03:31:40 +0000"  >&lt;p&gt;It looks like &lt;tt&gt;LMV_HASH_TYPE_DEFAULT&lt;/tt&gt; is used on both the client and server, so patch &lt;a href=&quot;https://review.whamcloud.com/43684&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/43684&lt;/a&gt; &quot;&lt;tt&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-14459&quot; title=&quot;DNE3: directory auto split during create&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-14459&quot;&gt;LU-14459&lt;/a&gt; lmv: change default hash type to crush&lt;/tt&gt;&quot; would definitely affect behavior of the server if the client is not specifying the directory hash type (which it is not).&lt;/p&gt;</comment>
                            <comment id="330184" author="laisiyao" created="Fri, 25 Mar 2022 03:54:33 +0000"  >&lt;p&gt;Ihara, can you collect flamegraph on MDS for both cases?&lt;/p&gt;</comment>
                            <comment id="330185" author="adilger" created="Fri, 25 Mar 2022 04:00:23 +0000"  >&lt;p&gt;Shuichi, do you have any idea of why this is slower on the server?  Higher CPU usage, uneven file distribution across MDTs, something else?  If high MDS CPU usage, would you be able to collect a flame graph of the before/after the patch?  The CRUSH hash is a bit more CPU intensive, but I wouldn&apos;t think it would hurt performance by 50%, but it would be good to know where the performance is lost so possibly it can be optimized.&lt;/p&gt;</comment>
                            <comment id="330193" author="sihara" created="Fri, 25 Mar 2022 05:27:08 +0000"  >&lt;p&gt;Before collecting flamegraph or other information in detail, I just found MDT load balancing seems to be not working well after patch. It&apos;s unbalanced file distribution across MDTs at create.&lt;br/&gt;
For instance, here is just a file creation test in a striped directory.&lt;/p&gt;

&lt;p&gt;Before patch (commit:f269497)&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;mpirun -np 640 mdtest -n 2000 -F -C -i 1 -p 10 -v -d /exafs/d0/d1/d2/mdt_stripe/

[root@ec01 ~]# lfs df -i | grep MDT
exafs-MDT0000_UUID &#160; &#160; &#160;83050496 &#160; &#160; &#160;320298 &#160; &#160;82730198 &#160; 1% /exafs[MDT:0]&#160;
exafs-MDT0001_UUID &#160; &#160; &#160;83050496 &#160; &#160; &#160;320283 &#160; &#160;82730213 &#160; 1% /exafs[MDT:1]&#160;
exafs-MDT0002_UUID &#160; &#160; &#160;83050496 &#160; &#160; &#160;320334 &#160; &#160;82730162 &#160; 1% /exafs[MDT:2]&#160;
exafs-MDT0003_UUID &#160; &#160; &#160;83050496 &#160; &#160; &#160;320293 &#160; &#160;82730203 &#160; 1% /exafs[MDT:3]&#160; &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;After patch (commit:bb60caa)&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@ec01 ~]# lfs df -i | grep MDT
exafs-MDT0000_UUID      83050496      192404    82858092   1% /exafs[MDT:0] 
exafs-MDT0001_UUID      83050496      190698    82859798   1% /exafs[MDT:1] 
exafs-MDT0002_UUID      83050496      177266    82873230   1% /exafs[MDT:2] 
exafs-MDT0003_UUID      83050496      720852    82329644   1% /exafs[MDT:3] 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;That&apos;s why mdtest&apos;s numbers were slower: one of the MDS/MDTs (MDT3 in this case) is working longer than the others. Consequently, mdtest&apos;s elapsed time is longer than in the balanced case.&lt;/p&gt;</comment>
                            <comment id="330420" author="adilger" created="Mon, 28 Mar 2022 17:30:47 +0000"  >&lt;p&gt;Lai, can you please look into the CRUSH hash to see if this is causing imbalance, and why. It would be possible to just create a bunch of names in the format of mdtest and count the distribution of files. &lt;/p&gt;

&lt;p&gt;One thing that might be happening is that the format of the mdtest filenames may be triggering the &quot;no rename across MDTs&quot; heuristic for temp files if they are of the form .xxxxxxxx at the end? &lt;/p&gt;</comment>
                            <comment id="330569" author="adilger" created="Tue, 29 Mar 2022 23:32:02 +0000"  >&lt;p&gt;Shuichi, it should be possible to see if changing the hash function back to the old one solves the problem.  Use:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# lfs mkdir -H fnv_1a_64 /exafs/d0/d1/d2/mdt_stripe
# lfs setdirstripe -H fnv_1a_64 -D /exafs/d0/d1/d2/mdt_stripe
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;It is easy enough to make a patch that will change the default hash function back to &lt;tt&gt;fnv_1a_64&lt;/tt&gt; for 2.15.0, but it would be &lt;b&gt;preferable&lt;/b&gt; to understand why the performance/balance is so bad because this hash function &lt;em&gt;should&lt;/em&gt; be better for directory migration/restriping.&lt;/p&gt;

&lt;p&gt;Also, did you give patch &lt;a href=&quot;https://review.whamcloud.com/46696&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/46696&lt;/a&gt; &quot;&lt;tt&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-15546&quot; title=&quot;Shared Directory File Creates regression seen in 2.15 when comparing to 2.12.6&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-15546&quot;&gt;&lt;del&gt;LU-15546&lt;/del&gt;&lt;/a&gt; mdt: keep history of mdt_reint_open() lock&lt;/tt&gt;&quot; a try to see if this further restores the file create performance?&lt;/p&gt;</comment>
                            <comment id="330584" author="gerrit" created="Wed, 30 Mar 2022 04:13:47 +0000"  >&lt;p&gt;&quot;Andreas Dilger &amp;lt;adilger@whamcloud.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/46950&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/46950&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-15692&quot; title=&quot;performance regressions for files in stripe directory&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-15692&quot;&gt;&lt;del&gt;LU-15692&lt;/del&gt;&lt;/a&gt; lmv: change default hash back to fnv_1a_64&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: a70be6f15e5f742443c2ca2e854d0bdab53a3dc5&lt;/p&gt;</comment>
                            <comment id="330603" author="laisiyao" created="Wed, 30 Mar 2022 07:59:00 +0000"  >&lt;p&gt;If the testing system is fresh new, /exafs/d0/d1/d2/mdt_stripe/ will be created on MDT3 (because max-inherit-rr is 3 by default, d0 will be created on MDT0, d1 on MDT1, d2 on MDT2), and the new directories created by mdtest will all be located on MDT3 (master inode) because at this level max-inherit-rr doesn&apos;t work any more. This explains why there are far more inodes used on MDT3. But I don&apos;t understand why the inodes are used equally before this patch, will you count the number of directories on each MDT? You can get it via &quot;for i in 0 1 2 3; do lfs getstripe -m /exafs/d0/d1/d2/mdt_stripe/* | grep -c $i; done&quot;.&lt;/p&gt;</comment>
                            <comment id="330653" author="gerrit" created="Wed, 30 Mar 2022 18:37:38 +0000"  >&lt;p&gt;&lt;del&gt;&quot;Andreas Dilger &amp;lt;adilger@whamcloud.com&amp;gt;&quot; uploaded a new patch:&lt;/del&gt; &lt;a href=&quot;https://review.whamcloud.com/46958&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/46958&lt;/a&gt;&lt;br/&gt;
&lt;del&gt;Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-15692&quot; title=&quot;performance regressions for files in stripe directory&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-15692&quot;&gt;&lt;del&gt;LU-15692&lt;/del&gt;&lt;/a&gt; tests: clean up sanity test_316/test_319&lt;/del&gt;&lt;br/&gt;
&lt;del&gt;Project: fs/lustre-release&lt;/del&gt;&lt;br/&gt;
&lt;del&gt;Branch: master&lt;/del&gt;&lt;br/&gt;
&lt;del&gt;Current Patch Set: 1&lt;/del&gt;&lt;br/&gt;
&lt;del&gt;Commit: c65ed52cf04219ef247e7764c5c45e36add6f825&lt;/del&gt;&lt;/p&gt;</comment>
                            <comment id="330671" author="sihara" created="Wed, 30 Mar 2022 23:04:34 +0000"  >&lt;p&gt;Andreas, &quot;fnv_1a_64&quot; worked fine below.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@ec01 ~]# lfs mkdir -H fnv_1a_64 /exafs/d0/d1/d2/mdt_stripe
[root@ec01 ~]# lfs setdirstripe -c 4 -H fnv_1a_64 -D /exafs/d0/d1/d2/mdt_stripe
[root@ec01 ~]# lfs getdirstripe  /exafs/d0/d1/d2/mdt_stripe
lmv_stripe_count: 0 lmv_stripe_offset: 2 lmv_hash_type: none
[root@ec01 ~]# lfs getdirstripe -D /exafs/d0/d1/d2/mdt_stripe
lmv_stripe_count: 4 lmv_stripe_offset: -1 lmv_hash_type: fnv_1a_64 lmv_max_inherit: 3 lmv_max_inherit_rr: 0

[root@ec01 ~]# mpirun -np 640 mdtest -n 2000 -F -i 1 -v -d /exafs/d0/d1/d2/mdt_stripe/ -C
SUMMARY rate: (of 1 iterations)
   Operation                     Max            Min           Mean        Std Dev
   ---------                     ---            ---           ----        -------
   File creation              146900.918     146900.918     146900.918          0.000
   File stat                       0.000          0.000          0.000          0.000
   File read                       0.000          0.000          0.000          0.000
   File removal                    0.000          0.000          0.000          0.000
   Tree creation                  34.638         34.638         34.638          0.000
   Tree removal                    0.000          0.000          0.000          0.000

[root@ec01 ~]# lfs df -i | grep MDT
exafs-MDT0000_UUID      83050496      320295    82730201   1% /exafs[MDT:0] 
exafs-MDT0001_UUID      83050496      320290    82730206   1% /exafs[MDT:1] 
exafs-MDT0002_UUID      83050496      320285    82730211   1% /exafs[MDT:2] 
exafs-MDT0003_UUID      83050496      320284    82730212   1% /exafs[MDT:3] 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;When I tested &quot;fnv_1a_64&quot; earlier, it didn&apos;t work because I ran another &quot;lfs setdirstripe -D&quot; after &quot;lfs setdirstripe -H fnv_1a_64&quot;, e.g.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@ec01 ~]# lfs mkdir -H fnv_1a_64 /exafs/d0/d1/d2/mdt_stripe
[root@ec01 ~]# lfs setdirstripe -H fnv_1a_64 -D /exafs/d0/d1/d2/mdt_stripe
[root@ec01 ~]# lfs setdirstripe -c 4 -D /exafs/d0/d1/d2/mdt_stripe
[root@ec01 ~]# lfs getdirstripe -D /exafs/d0/d1/d2/mdt_stripe
lmv_stripe_count: 4 lmv_stripe_offset: -1 lmv_hash_type: none lmv_max_inherit: 3 lmv_max_inherit_rr: 0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;The stripe count is configured properly, but hash_type changed back to the default CRUSH algorithm.&lt;br/&gt;
All options must be specified together in a single command when the &apos;-D&apos; (default) option is used.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;Also, did you give patch &lt;a href=&quot;https://review.whamcloud.com/46696&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/46696&lt;/a&gt; &quot;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-15546&quot; title=&quot;Shared Directory File Creates regression seen in 2.15 when comparing to 2.12.6&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-15546&quot;&gt;&lt;del&gt;LU-15546&lt;/del&gt;&lt;/a&gt; mdt: keep history of mdt_reint_open() lock&quot; a try to see if this further restores the file create performance?&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;let me test patch too.&lt;/p&gt;</comment>
                            <comment id="330672" author="sihara" created="Wed, 30 Mar 2022 23:12:58 +0000"  >&lt;blockquote&gt;&lt;p&gt;If the testing system is fresh new, /exafs/d0/d1/d2/mdt_stripe/ will be created on MDT3 (because max-inherit-rr is 3 by default, d0 wil&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;Lai, /exafs/d0/d1/d2/mdt_stripe/ is a striped directory across all MDTs. I don&apos;t think inherit-rr is related in this case, no?&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@ec01 ~]# lfs setdirstripe -c 4 -D /exafs/d0/d1/d2/mdt_stripe/
[root@ec01 ~]# lfs getdirstripe -D /exafs/d0/d1/d2/mdt_stripe
lmv_stripe_count: 4 lmv_stripe_offset: -1 lmv_hash_type: none lmv_max_inherit: 3 lmv_max_inherit_rr: 0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="330675" author="laisiyao" created="Thu, 31 Mar 2022 01:34:37 +0000"  >&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@ec01 ~]# mkdir -p /exafs/d0/d1/d2/mdt_stripe/
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;According to the description, mdt_stripe is created with &quot;mkdir&quot;, it should be a plain directory. And it has a default LMV to stripe to all MDTs.&lt;/p&gt;</comment>
                            <comment id="330683" author="adilger" created="Thu, 31 Mar 2022 03:40:04 +0000"  >&lt;p&gt;I don&apos;t think the MDT/striping of &lt;tt&gt;/exafs/d0/d1/d2/mdt_stripe/&lt;/tt&gt; itself is so critical, because it is only holding another level of subdirectories in it, like &lt;tt&gt;/exafs/d0/d1/d2/mdt_stripe/test-dir.0-0/mdtest_tree.M.N/&lt;/tt&gt; (for different client nodes and MPI ranks), and those &lt;em&gt;should&lt;/em&gt; be striped across all MDTs.&lt;/p&gt;

&lt;p&gt;Rather than checking the MDT index of specific directories, it would probably be better to check all subdirectories like:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# lfs find /exafs/d0/d1/d2/mdt_stripe -type d | xargs lfs getdirstripe -m | sort | uniq -c
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;That would at least print the starting MDT index for each directory, and presumably all of those directories should be striped over all 4 MDTs. This would be useful to check in the &lt;tt&gt;crush&lt;/tt&gt; hash test if the imbalance is because of &lt;b&gt;directory&lt;/b&gt; imbalance (e.g. 3/4 of subdirectories are created on MDT0003), or if it is a &lt;b&gt;file&lt;/b&gt; imbalance within a single directory (e.g. 3/4 of files in &lt;b&gt;every&lt;/b&gt; directory are created on MDT0003).&lt;/p&gt;

&lt;p&gt;The MDT distribution of files within each directory could probably be checked something like:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# find /exafs/d0/d1/d2/mdt_stripe -mindepth 2 -maxdepth 2 -type d |
    while read D; do
        echo $D
        find $D -type f | xargs lfs getstripe -m | sort | uniq -c
    done
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;though this may take a few minutes if there are a lot of files.&lt;/p&gt;</comment>
                            <comment id="330694" author="sihara" created="Thu, 31 Mar 2022 05:03:53 +0000"  >&lt;p&gt;This is mdtest to a single shared dir, not a unique-dir operation. So, all files are created in a single subdirectory.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@ec01 ~]# lfs find /exafs/d0/d1/d2/mdt_stripe -type d 
/exafs/d0/d1/d2/mdt_stripe
[root@ec01 ~]# mpirun -np 640 mdtest -n 2000 -F -i 1 -v -d /exafs/d0/d1/d2/mdt_stripe/ -C
[root@ec01 ~]# lfs df -i | grep MDT
exafs-MDT0000_UUID      83050496      720861    82329635   1% /exafs[MDT:0] 
exafs-MDT0001_UUID      83050496      192389    82858107   1% /exafs[MDT:1] 
exafs-MDT0002_UUID      83050496      190695    82859801   1% /exafs[MDT:2] 
exafs-MDT0003_UUID      83050496      177213    82873283   1% /exafs[MDT:3] 
[root@ec01 ~]# lfs find /exafs/d0/d1/d2/mdt_stripe -type d 
/exafs/d0/d1/d2/mdt_stripe
/exafs/d0/d1/d2/mdt_stripe/test-dir.0-0
/exafs/d0/d1/d2/mdt_stripe/test-dir.0-0/mdtest_tree.0
[root@ec01 ~]# lfs find /exafs/d0/d1/d2/mdt_stripe -type d | xargs lfs getdirstripe -m
1
1
1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;And, the unbalanced distribution is not associated with a particular MDT.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@ec01 ~]# lfs df -i | grep MDT
exafs-MDT0000_UUID      83050496         301    83050195   1% /exafs[MDT:0] 
exafs-MDT0001_UUID      83050496         281    83050215   1% /exafs[MDT:1] 
exafs-MDT0002_UUID      83050496         281    83050215   1% /exafs[MDT:2] 
exafs-MDT0003_UUID      83050496         280    83050216   1% /exafs[MDT:3] 

[root@ec01 ~]# mpirun -np 640 mdtest -n 2000 -F -i 1 -v -d /exafs/d0/d1/d2/mdt_stripe/ -C
[root@ec01 ~]# lfs df -i | grep MDT
exafs-MDT0000_UUID      83050496      190707    82859789   1% /exafs[MDT:0] 
exafs-MDT0001_UUID      83050496      192389    82858107   1% /exafs[MDT:1] 
exafs-MDT0002_UUID      83050496      177215    82873281   1% /exafs[MDT:2] 
exafs-MDT0003_UUID      83050496      720848    82329648   1% /exafs[MDT:3] 

[root@ec01 ~]# mpirun -np 640 mdtest -n 2000 -F -i 1 -v -d /exafs/d0/d1/d2/mdt_stripe/ -r
[root@ec01 ~]# mpirun -np 640 mdtest -n 2000 -F -i 1 -v -d /exafs/d0/d1/d2/mdt_stripe/ -C
[root@ec01 ~]# lfs df -i | grep MDT
exafs-MDT0000_UUID      83050496      720861    82329635   1% /exafs[MDT:0] 
exafs-MDT0001_UUID      83050496      192389    82858107   1% /exafs[MDT:1] 
exafs-MDT0002_UUID      83050496      190703    82859793   1% /exafs[MDT:2] 
exafs-MDT0003_UUID      83050496      177214    82873282   1% /exafs[MDT:3] 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="330700" author="laisiyao" created="Thu, 31 Mar 2022 09:06:12 +0000"  >&lt;p&gt;This should be because files are treated as temp file, which are created on the same MDT. All digit suffix filename (if the suffix length after &quot;.&quot; is 8) are treated as temp file with crush hash type.&lt;/p&gt;

&lt;p&gt;Ihara, can you list some typical filename in the bottom level?&lt;/p&gt;</comment>
                            <comment id="330764" author="adilger" created="Thu, 31 Mar 2022 18:57:01 +0000"  >&lt;p&gt;Lai - two things about the &quot;temp&quot; filenames:&lt;/p&gt;
&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;files &lt;em&gt;should&lt;/em&gt; still be created on same MDT as the &quot;non-temp&quot; filename.  However, if the filename is like &quot;&lt;tt&gt;foo.12345678&lt;/tt&gt;&quot; then the hashing code will only ever use &quot;&lt;tt&gt;foo&lt;/tt&gt;&quot; to determine the &quot;proper&quot; MDT index.&lt;/li&gt;
	&lt;li&gt;the &quot;temp filename&quot; code should &lt;b&gt;not&lt;/b&gt; consider suffixes with &lt;b&gt;only&lt;/b&gt; numbers as a temp filename.  That is specifically to avoid putting all &quot;&lt;tt&gt;foo.nnnnnnnn&lt;/tt&gt;&quot; filenames on the same MDT.  However, if there is a mix of numbers and letters (e.g. hex suffix?) then it might be doing the wrong thing.&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;The version of mdtest that I&apos;m using locally only has numbers in the suffix:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# ls mdtest-easy/test-dir.0-0/mdtest_tree.0.0
file.mdtest.1.127
file.mdtest.1.128
file.mdtest.1.129
file.mdtest.1.13
file.mdtest.1.130
file.mdtest.1.131
file.mdtest.1.132
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;However, it might be getting confused by the extra &apos;&lt;tt&gt;.&lt;/tt&gt;&apos; in the name if there are more files, like &quot;&lt;tt&gt;file.mdtest.1.345678&lt;/tt&gt;&quot; or &quot;&lt;tt&gt;file.mdtest.12.45678&lt;/tt&gt;&quot;?  This would &lt;b&gt;incorrectly&lt;/b&gt; fail the &quot;&lt;tt&gt;(digit &amp;gt;= suffixlen -1)&lt;/tt&gt;&quot; check because the second &apos;&lt;tt&gt;.&lt;/tt&gt;&apos; is not counted in &lt;tt&gt;digit&lt;/tt&gt; or &lt;tt&gt;upper&lt;/tt&gt; or &lt;tt&gt;lower&lt;/tt&gt;.  There should probably be an additional check that there aren&apos;t non-alphanumeric characters in the suffix:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; ((digit &amp;gt;= suffixlen - 1 &amp;amp;&amp;amp; !isdigit(name[namelen - suffixlen])) ||
            upper == suffixlen || lower == suffixlen)
                &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;code-keyword&quot;&gt;false&lt;/span&gt;;
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (type == LMV_HASH_TYPE_CRUSH2 &amp;amp;&amp;amp; digit + upper + lower != suffixlen)
                &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;code-keyword&quot;&gt;false&lt;/span&gt;;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Unfortunately, this changes the hash function subtly, so a new &quot;&lt;tt&gt;LMV_HASH_TYPE_CRUSH2&lt;/tt&gt;&quot; hash type is needed for the new behavior.  Otherwise, clients may &lt;em&gt;think&lt;/em&gt; they know which MDT a particular filename is on but it would be wrong.&lt;/p&gt;</comment>
                            <comment id="330780" author="adilger" created="Thu, 31 Mar 2022 23:48:52 +0000"  >&lt;p&gt;I&apos;m working on a patch to add the CRUSH2 hash, but for now we are planning to land patch &lt;a href=&quot;https://review.whamcloud.com/46950&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/46950&lt;/a&gt; &quot;&lt;tt&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-15692&quot; title=&quot;performance regressions for files in stripe directory&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-15692&quot;&gt;&lt;del&gt;LU-15692&lt;/del&gt;&lt;/a&gt; lmv: change default hash back to fnv_1a_64&lt;/tt&gt;&quot; for 2.15.0 since that is the lowest risk option.  I don&apos;t think the current CRUSH hash is totally broken, but it unfortunately has imbalance in some important use cases.&lt;/p&gt;</comment>
                            <comment id="330782" author="laisiyao" created="Fri, 1 Apr 2022 01:05:09 +0000"  >&lt;p&gt;Andreas, how about retry lookup on client? if the filename looks to be a temp file in above format, if it&apos;s not located on the MDT with name like &quot;foo&quot;, then lookup again with full name hash?&lt;/p&gt;</comment>
                            <comment id="330887" author="adilger" created="Sat, 2 Apr 2022 00:27:18 +0000"  >&lt;blockquote&gt;
&lt;p&gt;how about retry lookup on client?&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;that would only work for new clients that are patched to repeat the lookup, and at that point the clients could also be patched to understand CRUSH2.  Older clients will &lt;b&gt;not&lt;/b&gt; repeat the lookup, so they would not be able to find the filename on the MDT where it should be located (based on older CRUSH hash).  That might also cause the same filename to be created twice in the same directory (old clients on one MDT, new clients on a different MDT).  Having a different hash function that old clients do not understand &lt;b&gt;should&lt;/b&gt; result in them looking for the filename on &lt;b&gt;all&lt;/b&gt; MDTs, and prevent them from creating new files in that directory (return &lt;tt&gt;-EBADFD&lt;/tt&gt;).&lt;/p&gt;

&lt;p&gt;I&apos;m having a hard time convincing myself this code is correct. The comment in the commit message says:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;LU-13481 dne: improve temp file name check

Previously if all but two characters in file name suffix are digit,
it&apos;s not treated as temp file, as is too strict if suffix length is
short, e.g. 6. Change it to allow one character, and this non-digit
character should not be the starting character.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Besides the problem with &quot;&lt;tt&gt;.-_&lt;/tt&gt;&quot; characters in the suffix (which would make count of digits/upper/lower too small and fail the &lt;tt&gt;suffixlen&lt;/tt&gt; check), it doesn&apos;t look like the &lt;tt&gt;isdigit()&lt;/tt&gt; check is correct:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; ((digit &amp;gt;= suffixlen - 1 &amp;amp;&amp;amp; !isdigit(name[namelen - suffixlen])) ||
            upper == suffixlen || lower == suffixlen)
                &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;code-keyword&quot;&gt;false&lt;/span&gt;;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;If &quot;&lt;tt&gt;digit &amp;gt;= suffixlen -1&lt;/tt&gt;&quot; (say &lt;tt&gt;name = &quot;foo.12345678&quot;&lt;/tt&gt;, &lt;tt&gt;digit = 8&lt;/tt&gt;, &lt;tt&gt;suffixlen = 8&lt;/tt&gt;) this check will fail (and return &quot;true&quot; for the temp filename check) because &quot;&lt;tt&gt;1&lt;/tt&gt;&quot; is a digit.  I think this is supposed to be just &quot;&lt;tt&gt;isdigit(name[])&lt;/tt&gt;&quot; (no &apos;&lt;tt&gt;!&lt;/tt&gt;&apos;).&lt;/p&gt;</comment>
                            <comment id="330890" author="laisiyao" created="Sat, 2 Apr 2022 01:36:41 +0000"  >&lt;p&gt;It&apos;s in accordance with the comment, isn&apos;t it? The comment says &quot;foo.a1234567&quot; shouldn&apos;t be treated as temp file, but &quot;foo.12a34567&quot; and &quot;foo.12345678&quot; should be. IMO &quot;foo.123.4567&quot; shouldn&apos;t be treated as a temp file, that is, isdot should be checked here, if suffix contains &quot;.&quot;, it should return false.&lt;/p&gt;</comment>
                            <comment id="330900" author="gerrit" created="Sat, 2 Apr 2022 06:58:19 +0000"  >&lt;p&gt;&quot;Oleg Drokin &amp;lt;green@whamcloud.com&amp;gt;&quot; merged in patch &lt;a href=&quot;https://review.whamcloud.com/46950/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/46950/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-15692&quot; title=&quot;performance regressions for files in stripe directory&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-15692&quot;&gt;&lt;del&gt;LU-15692&lt;/del&gt;&lt;/a&gt; lmv: change default hash back to fnv_1a_64&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 0090b6f6f6cfd65fcacfa4dda23bee2cec11cb70&lt;/p&gt;</comment>
                            <comment id="330902" author="adilger" created="Sat, 2 Apr 2022 08:18:06 +0000"  >&lt;p&gt;Definitely &quot;foo.12345678&quot; should &lt;b&gt;not&lt;/b&gt; be considered a temp file, since this is a common case (eg file.YYYYMMDD). The chance of a 6-number temp file is 1/40k, and an 8-number temp file being hit randomly is less than 1/2M. &lt;/p&gt;

&lt;p&gt;The original code even considered 6/6, 5/6 and 4/6 numbers to not be temp files (i.e. &quot;digit &amp;gt;= suffixlen - 2&quot;), but 4/6 numbers were too easily hit by mktemp. It was &lt;b&gt;supposed&lt;/b&gt; to keep 8/8 and 7/8 as non-temp files as long as 7/8 was like &quot;file.f1234567&quot;.  The problem is that the 8/8 case also fails because the first char &lt;b&gt;is&lt;/b&gt; a digit, so &quot;&lt;tt&gt;!isdigit(name[namelen-suffixlen])&lt;/tt&gt;&quot; fails, and it doesn&apos;t matter if the &quot;&lt;tt&gt;(digit &amp;gt;= suffixlen - 1)&lt;/tt&gt;&quot; part is true or not because the &quot;false&quot; check is not met, and &quot;true&quot; is returned. &lt;/p&gt;

&lt;p&gt;The proper check should be something like:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (digit == suffixlen || upper == suffixlen || lower == suffixlen ||
            (digit == suffixlen - 1 &amp;amp;&amp;amp; !isdigit(name[namelen - suffixlen])))
                &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;code-keyword&quot;&gt;false&lt;/span&gt;;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="330905" author="pjones" created="Sat, 2 Apr 2022 15:05:49 +0000"  >&lt;p&gt;I believe that the part intended for 2.15 has now landed. There is still some outstanding work to track to work out the issues with the new hash function so we can switch back to that but that can be tracked under a new JIRA for 2.16/2.15.x&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10324">
                    <name>Cloners</name>
                                                                <inwardlinks description="is cloned by">
                                        <issuelink>
            <issuekey id="69578">LU-15720</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="68194">LU-15479</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="68406">LU-15502</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="68618">LU-15546</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="58881">LU-13481</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="52253">LU-11025</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="62971">LU-14459</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i02lpj:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>