<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:49:41 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-12103] Improve block allocation for large partitions</title>
                <link>https://jira.whamcloud.com/browse/LU-12103</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;The block allocator uses some heuristics while choosing a group to allocate new blocks from. This works well in most cases, but takes a long time for a large, low-free-space partition. The algorithm should be adjusted for this special case.&lt;/p&gt;</description>
                <environment></environment>
        <key id="55236">LU-12103</key>
            <summary>Improve block allocation for large partitions</summary>
                <type id="4" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11310&amp;avatarType=issuetype">Improvement</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="artem_blagodarenko">Artem Blagodarenko</assignee>
                                    <reporter username="artem_blagodarenko">Artem Blagodarenko</reporter>
                        <labels>
                    </labels>
                <created>Mon, 25 Mar 2019 07:54:39 +0000</created>
                <updated>Tue, 16 Feb 2021 17:01:01 +0000</updated>
                            <resolved>Wed, 25 Sep 2019 12:17:11 +0000</resolved>
                                                    <fixVersion>Lustre 2.13.0</fixVersion>
                    <fixVersion>Lustre 2.12.4</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>14</watches>
                                                                            <comments>
                            <comment id="244612" author="artem_blagodarenko" created="Mon, 25 Mar 2019 09:33:17 +0000"  >&lt;p&gt;Hello&#160;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=adilger&quot; class=&quot;user-hover&quot; rel=&quot;adilger&quot;&gt;adilger&lt;/a&gt;, what do you think about optimisation idea from&#160;&lt;a href=&quot;https://patchwork.ozlabs.org/patch/1054251/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://patchwork.ozlabs.org/patch/1054251/&lt;/a&gt;&#160;? Do you know other optimisation&#160;to suggest? I attached test I used and testing output to this issue. Thanks.&lt;/p&gt;</comment>
                            <comment id="244613" author="adilger" created="Mon, 25 Mar 2019 09:52:03 +0000"  >&lt;p&gt;I think in the long run, it seems like a better approach would be to have a tree-based allocator using the extent status tree that already exists.  Otherwise, searching through 3-4 million groups becomes too slow regardless of how the iteration is done.&lt;/p&gt;</comment>
                            <comment id="244614" author="artem_blagodarenko" created="Mon, 25 Mar 2019 10:33:15 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=adilger&quot; class=&quot;user-hover&quot; rel=&quot;adilger&quot;&gt;adilger&lt;/a&gt;, thank you for fast answer! I like this long-run idea.&lt;/p&gt;

&lt;p&gt;We faced very slow OST operations on a filled target.&#160;Do you think my patch can solve this problem as a short-run solution?&lt;/p&gt;

&lt;p&gt;Thanks.&lt;/p&gt;</comment>
                            <comment id="244616" author="bzzz" created="Mon, 25 Mar 2019 11:17:26 +0000"  >&lt;p&gt;it would be interesting to understand where the most time is spent in: checking (nearly)empty groups or search for a better chunk? or probably waiting on IO to fill bitmaps?&lt;/p&gt;</comment>
                            <comment id="246210" author="artem_blagodarenko" created="Tue, 23 Apr 2019 15:11:00 +0000"  >&lt;p&gt;Hello &lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=bzzz&quot; class=&quot;user-hover&quot; rel=&quot;bzzz&quot;&gt;bzzz&lt;/a&gt;,&lt;/p&gt;

&lt;p&gt;Here is data from one &#160;of the stacked OST&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
&#160;4.80% &#160; &#160; 0.00%&#160; ll_ost_io00_031&#160; [ptlrpc]&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; [k] ptlrpc_server_handle_request
&#160; &#160; &#160; &#160; &#160; &#160; |
&#160; &#160; &#160; &#160; &#160; &#160; ---ptlrpc_server_handle_request
&#160;&#160; &#160; &#160; &#160; &#160; &#160; &#160; |
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; --4.80%--tgt_request_handle
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; |
&#160;&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; --4.80%--tgt_brw_write
&#160;&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; |
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; --4.80%--obd_commitrw.constprop.39
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; ofd_commitrw
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; ofd_commitrw_write.isra.32
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; |
&#160;&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; --4.80%--osd_write_commit
&#160;&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; |
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; --4.80%--osd_ldiskfs_map_inode_pages
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; |
&#160;&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; --4.80%--ldiskfs_map_blocks
&#160;&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; |
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; --4.80%--ldiskfs_ext_map_blocks
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; |
&#160;&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; --4.80%--ldiskfs_mb_new_blocks
&#160;&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; |
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; --4.43%--ldiskfs_mb_regular_allocator
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; |
&#160;&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; --4.16%--ldiskfs_mb_good_group
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Most time is spent in ldiskfs_mb_regular_allocator() loops (4 loops over all groups)&lt;/p&gt;</comment>
                            <comment id="246687" author="adilger" created="Sat, 4 May 2019 04:59:57 +0000"  >&lt;p&gt;Another possibility is to improve large OST allocation by using the &lt;tt&gt;bigalloc&lt;/tt&gt; feature.  This will reduce the number of block groups to search by the factor of the chunk size, and increase the efficiency of block allocations.&lt;/p&gt;

&lt;p&gt;bigalloc has been in use by Google for many years, though there may be some issues to be fixed with osd-ldiskfs in order to convert block allocations to cluster allocations.&lt;/p&gt;</comment>
                            <comment id="246903" author="adilger" created="Thu, 9 May 2019 18:03:28 +0000"  >&lt;p&gt;The benefit of bigalloc is that it reduces metadata size and handling overhead by a significant factor. The number of bits to allocate per unit size is reduced linearly by the chunk factor.  This will help mballoc significantly, since huge OSTs can have millions of block groups to search, and a bigalloc chunk size of, say, 128kB would reduce allocation overhead &lt;b&gt;and&lt;/b&gt; the number of block groups by a factor of 32. &lt;/p&gt;

&lt;p&gt;The main drawback of bigalloc is that it can waste space because the chunk size is the minimum allocation unit of the filesystem (eg. any file &amp;lt; chunk_size will consume a full chunk of space, even though only one 4KB block might be written).  The space in a chunk &lt;b&gt;cannot&lt;/b&gt; be shared between files. However, this is not worse than if the block size was actually increased to match the bigalloc chunk size, and better in several regards. The one drawback vs. a larger block size is that it does not increase the maximum extent size or maximum file size, since the blocksize and block addressing is the same, only the allocation size is changed. &lt;/p&gt;

&lt;p&gt;Has anyone tested bigalloc on an OST, and are there any known issues?&lt;/p&gt;</comment>
                            <comment id="246904" author="adilger" created="Thu, 9 May 2019 18:13:13 +0000"  >&lt;p&gt;Note that I&apos;m not &lt;b&gt;against&lt;/b&gt; improving mballoc to be more efficient, but I think bigalloc is a very easy way to improve allocation performance with minimum effort (mainly going through osd-ldiskfs and &lt;em&gt;maybe&lt;/em&gt; LFSCK and mapping blocks to chunks during allocation), vs. significant work to rewrite the block allocation code, which would also touch lots of core code and need a long time to validate correctness and allocator behavior. &lt;/p&gt;</comment>
                            <comment id="247633" author="artem_blagodarenko" created="Fri, 24 May 2019 09:23:05 +0000"  >&lt;p&gt;Hello &lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=adilger&quot; class=&quot;user-hover&quot; rel=&quot;adilger&quot;&gt;adilger&lt;/a&gt;,&lt;/p&gt;

&lt;p&gt;I agree that bigalloc can improve metadata operation performance and save space. But it looks like it cannot help with the allocator problems. Here are results of testing that show that the allocator makes ~1 million useless group scans. If this number became 4 times smaller, nothing would change dramatically.&lt;/p&gt;

&lt;p&gt;During the test, the system was fragmented with the pattern &quot;50 free blocks - 50 occupied blocks&quot;. Performance degraded from 1.2 GB/sec to 10 MB/sec.&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;&#160;
1. dd on non fragmented fs : ~1.2 Gb/sec
[root@cslmo1704 ~]# df -T /mnt/ldiskfs
Filesystem&#160;&#160;&#160;&#160;&#160;Type&#160;&#160;&#160;&#160;&#160;&#160;&#160;1K-blocks&#160;&#160;Used&#160;&#160;&#160;&#160;Available Use% Mounted on
/dev/md0&#160;&#160;&#160;&#160;&#160;&#160;&#160;ldiskfs 121226819924&#160;&#160;1260 120014147240&#160;&#160;&#160;1% /mnt/ldiskfs
[root@cslmo1704 ~]#
&#160;
cslmo1704 ~]# time dd &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt;=/dev/zero of=/mnt/ldiskfs/foo bs=$((1024*1024)) count=$((32*10*1024)) &amp;amp;
[1] 74048
[root@cslmo1704 ~]# 327680+0 records in
327680+0 records out
343597383680 bytes (344 GB) copied, 292.264 s, 1.2 GB/s
&#160;
real&#160;&#160;&#160;&#160;4m52.267s
user&#160;&#160;&#160;&#160;0m0.287s
sys&#160;&#160;&#160;&#160;&#160;4m51.010s
&#160;
2. fragmented fs, mb_c[1-3]_threshold are &lt;span class=&quot;code-keyword&quot;&gt;default&lt;/span&gt; (24,14, 4): write: ~10Mb/sec (93.89system, 90.98system) :
fake_fill_fs 50
1 Fri May 17 15:41:27 UTC 2019 ==================
+ cat /sys/fs/ldiskfs/md0/mb_c1_threshold
24
+ cat /sys/fs/ldiskfs/md0/mb_c2_threshold
14
+ cat /sys/fs/ldiskfs/md0/mb_c3_threshold
4
+ set +x
WRITE 1: =======-===
Linux 3.10.0-693.21.1.x3.2.12.x86_64 (cslmo1704)&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;05/17/2019&#160;&#160;&#160;&#160;&#160;&#160;_x86_64_&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;(20 CPU)
&#160;
avg-cpu:&#160;&#160;%user&#160;&#160;&#160;%nice %system %iowait&#160;&#160;%steal&#160;&#160;&#160;%idle
&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;1.84&#160;&#160;&#160;&#160;0.00&#160;&#160;&#160;&#160;0.41&#160;&#160;&#160;&#160;0.13&#160;&#160;&#160;&#160;0.00&#160;&#160;&#160;97.61
&#160;
Device:&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;tps&#160;&#160;&#160;&#160;kB_read/s&#160;&#160;&#160;&#160;kB_wrtn/s&#160;&#160;&#160;&#160;kB_read&#160;&#160;&#160;&#160;kB_wrtn
md0&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;78.09&#160;&#160;&#160;&#160;&#160;&#160;&#160;239.67&#160;&#160;&#160;&#160;&#160;&#160;7133.00&#160;&#160;&#160;22778200&#160;&#160;677929976
&#160;
65536+0 records in
65536+0 records out
68719476736 bytes (69 GB) copied, 6530.55 s, 10.5 MB/s
0.23user 93.89system 1:48:50elapsed 1%CPU (0avgtext+0avgdata 1824maxresident)k
168inputs+134217728outputs (1major+501minor)pagefaults 0swaps
Linux 3.10.0-693.21.1.x3.2.12.x86_64 (cslmo1704)&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;05/17/2019&#160;&#160;&#160;&#160;&#160;&#160;_x86_64_&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;(20 CPU)
&#160;
avg-cpu:&#160;&#160;%user&#160;&#160;&#160;%nice %system %iowait&#160;&#160;%steal&#160;&#160;&#160;%idle
&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;1.79&#160;&#160;&#160;&#160;0.00&#160;&#160;&#160;&#160;0.80&#160;&#160;&#160;&#160;0.24&#160;&#160;&#160;&#160;0.00&#160;&#160;&#160;97.18
&#160;
Device:&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;tps&#160;&#160;&#160;&#160;kB_read/s&#160;&#160;&#160;&#160;kB_wrtn/s&#160;&#160;&#160;&#160;kB_read&#160;&#160;&#160;&#160;kB_wrtn
md0&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;80.86&#160;&#160;&#160;&#160;&#160;&#160;&#160;224.30&#160;&#160;&#160;&#160;&#160;&#160;7279.30&#160;&#160;&#160;22782608&#160;&#160;739372736
&#160;
-rw-r--r-- 1 root root 68719476736 May 17 17:30 /mnt/ldiskfs/foo
READ 1: ========
65536+0 records in
65536+0 records out
68719476736 bytes (69 GB) copied, 56.2515 s, 1.2 GB/s
0.04user 24.33system 0:56.25elapsed 43%CPU (0avgtext+0avgdata 1828maxresident)k
134217784inputs+0outputs (1major+502minor)pagefaults 0swaps
RM 1: =====
0.00user 4.14system 0:04.89elapsed 84%CPU (0avgtext+0avgdata 684maxresident)k
1264inputs+0outputs (1major+214minor)pagefaults 0swaps
2 Fri May 17 17:43:21 UTC 2019 ==================
+ cat /sys/fs/ldiskfs/md0/mb_c1_threshold
24
+ cat /sys/fs/ldiskfs/md0/mb_c2_threshold
14
+ cat /sys/fs/ldiskfs/md0/mb_c3_threshold
4
+ set +x
2 Fri May 17 17:43:21 UTC 2019 ==================
+ cat /sys/fs/ldiskfs/md0/mb_c1_threshold
24
+ cat /sys/fs/ldiskfs/md0/mb_c2_threshold
14
+ cat /sys/fs/ldiskfs/md0/mb_c3_threshold
4
+ set +x
WRITE 2: =======-===
Linux 3.10.0-693.21.1.x3.2.12.x86_64 (cslmo1704)&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;05/17/2019&#160;&#160;&#160;&#160;&#160;&#160;_x86_64_&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;(20 CPU)
&#160;
avg-cpu:&#160;&#160;%user&#160;&#160;&#160;%nice %system %iowait&#160;&#160;%steal&#160;&#160;&#160;%idle
&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;1.78&#160;&#160;&#160;&#160;0.00&#160;&#160;&#160;&#160;0.84&#160;&#160;&#160;&#160;0.24&#160;&#160;&#160;&#160;0.00&#160;&#160;&#160;97.14
&#160;
Device:&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;tps&#160;&#160;&#160;&#160;kB_read/s&#160;&#160;&#160;&#160;kB_wrtn/s&#160;&#160;&#160;&#160;kB_read&#160;&#160;&#160;&#160;kB_wrtn
md0&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;89.69&#160;&#160;&#160;&#160;&#160;&#160;&#160;878.23&#160;&#160;&#160;&#160;&#160;&#160;7278.99&#160;&#160;&#160;89892436&#160;&#160;745047808
&#160;
65536+0 records in
65536+0 records out
68719476736 bytes (69 GB) copied, 6619.02 s, 10.4 MB/s
0.25user 90.98system 1:50:19elapsed 1%CPU (0avgtext+0avgdata 1828maxresident)k
64inputs+134217728outputs (1major+502minor)pagefaults 0swaps
Linux 3.10.0-693.21.1.x3.2.12.x86_64 (cslmo1704)&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;05/17/2019&#160;&#160;&#160;&#160;&#160;&#160;_x86_64_&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;(20 CPU)
&#160;
avg-cpu:&#160;&#160;%user&#160;&#160;&#160;%nice %system %iowait&#160;&#160;%steal&#160;&#160;&#160;%idle
&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;1.73&#160;&#160;&#160;&#160;0.00&#160;&#160;&#160;&#160;1.18&#160;&#160;&#160;&#160;0.34&#160;&#160;&#160;&#160;0.00&#160;&#160;&#160;96.75
&#160;
Device:&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;tps&#160;&#160;&#160;&#160;kB_read/s&#160;&#160;&#160;&#160;kB_wrtn/s&#160;&#160;&#160;&#160;kB_read&#160;&#160;&#160;&#160;kB_wrtn
md0&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;91.65&#160;&#160;&#160;&#160;&#160;&#160;&#160;824.93&#160;&#160;&#160;&#160;&#160;&#160;7400.80&#160;&#160;&#160;89896736&#160;&#160;806502028
&#160;
-rw-r--r-- 1 root root 68719476736 May 17 19:33 /mnt/ldiskfs/foo
&#160;&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
For given fragmentation (50 free blocks -  50 occupied blocks) excluding c1 doesn&apos;t give improvement, but excluding c2 loops gives 500M/s writing performance:
 
Filesystem     Type       1K-blocks        Used   Available Use% Mounted on
/dev/md0       ldiskfs 121226819924 60622252404 59391896096  51% /mnt/ldiskfs
 
 
/dev/md0:
 Timing buffered disk reads: 6256 MB in  3.00 seconds = 2084.39 MB/sec
1 Tue May 21 12:25:00 UTC 2019 ============================================
+ cat /sys/fs/ldiskfs/md0/mb_c1_threshold
59
+ cat /sys/fs/ldiskfs/md0/mb_c2_threshold
59
+ cat /sys/fs/ldiskfs/md0/mb_c3_threshold
4
+ echo 1
+ set +x
WRITE 1: ================================
Linux 3.10.0-693.21.1.x3.2.12.x86_64 (cslmo1704) 	05/21/2019 	_x86_64_	(20 CPU)
 
 
avg-cpu:  %user   %nice %system %iowait  %steal   %idle
           1.11    0.00    0.99    0.29    0.00   97.61
 
 
Device:            tps    kB_read/s    kB_wrtn/s    kB_read    kB_wrtn
md0              51.63      1384.45      2903.89  593729636 1245346552
 
 
	65536+0 records in
65536+0 records out
68719476736 bytes (69 GB) copied, 133.066 s, 516 MB/s
0.07user 84.88system 2:14.63elapsed 63%CPU (0avgtext+0avgdata 1828maxresident)k
2928inputs+134217728outputs (0major+502minor)pagefaults 0swaps
Linux 3.10.0-693.21.1.x3.2.12.x86_64 (cslmo1704) 	05/21/2019 	_x86_64_	(20 CPU)
 
 
avg-cpu:  %user   %nice %system %iowait  %steal   %idle
           1.11    0.00    0.99    0.30    0.00   97.60
 
 
Device:            tps    kB_read/s    kB_wrtn/s    kB_read    kB_wrtn
md0              53.68      1384.03      3059.43  593735516 1312462836
 
 
mballoc: 16777208 blocks 335827 reqs (325 success)
mballoc: 67425781 extents scanned, 177 goal hits, 0 2^N hits, 335442 breaks, 0 lost
mballoc: (0, 0, 0) useless c(0,1,2) loops
mballoc: (1425456, 1083502, 0) skipped c(0,1,2) loops
-rw-r--r-- 1 root root 68719476736 May 21 12:27 /mnt/ldiskfs/foo
READ 1: ========
65536+0 records in
65536+0 records out
68719476736 bytes (69 GB) copied, 64.7674 s, 1.1 GB/s
0.05user 24.76system 1:04.77elapsed 38%CPU (0avgtext+0avgdata 1828maxresident)k
134217784inputs+0outputs (1major+501minor)pagefaults 0swaps
Filesystem     Type       1K-blocks        Used   Available Use% Mounted on
/dev/md0       ldiskfs 121226819924 60680523488 59333625012  51% /mnt/ldiskfs
RM 1: =====
0.00user 4.02system 0:06.78elapsed 59%CPU (0avgtext+0avgdata 680maxresident)k
4784inputs+0outputs (1major+214minor)pagefaults 0swaps
2 Tue May 21 12:33:37 UTC 2019 ============================================
+ cat /sys/fs/ldiskfs/md0/mb_c1_threshold
59
+ cat /sys/fs/ldiskfs/md0/mb_c2_threshold
59
+ cat /sys/fs/ldiskfs/md0/mb_c3_threshold
4
+ echo 1
+ set +x
WRITE 2: ================================
Linux 3.10.0-693.21.1.x3.2.12.x86_64 (cslmo1704) 	05/21/2019 	_x86_64_	(20 CPU)
 
 
avg-cpu:  %user   %nice %system %iowait  %steal   %idle
           1.11    0.00    1.00    0.30    0.00   97.60
 
 
Device:            tps    kB_read/s    kB_wrtn/s    kB_read    kB_wrtn
md0              55.76      1539.10      3056.71  660846708 1312464796
 
 
65536+0 records in
65536+0 records out
68719476736 bytes (69 GB) copied, 123.318 s, 557 MB/s
0.05user 78.67system 2:03.32elapsed 63%CPU (0avgtext+0avgdata 1828maxresident)k
56inputs+134217728outputs (1major+501minor)pagefaults 0swaps
Linux 3.10.0-693.21.1.x3.2.12.x86_64 (cslmo1704) 	05/21/2019 	_x86_64_	(20 CPU)
 
 
avg-cpu:  %user   %nice %system %iowait  %steal   %idle
           1.11    0.00    1.00    0.30    0.00   97.59
 
 
Device:            tps    kB_read/s    kB_wrtn/s    kB_read    kB_wrtn
md0              57.64      1538.67      3199.87  660851024 1374326816
 
 
mballoc: 15462744 blocks 309333 reqs (149 success)
mballoc: 62147084 extents scanned, 6 goal hits, 0 2^N hits, 309183 breaks, 0 lost
mballoc: (0, 0, 0) useless c(0,1,2) loops
mballoc: (1425456, 1393743, 0) skipped c(0,1,2) loops
-rw-r--r-- 1 root root 68719476736 May 21 12:35 /mnt/ldiskfs/foo
READ 2: ========
65536+0 records in
65536+0 records out
68719476736 bytes (69 GB) copied, 65.9076 s, 1.0 GB/s
0.05user 24.05system 1:05.91elapsed 36%CPU (0avgtext+0avgdata 1824maxresident)k
134217784inputs+0outputs (1major+500minor)pagefaults 0swaps
Filesystem     Type       1K-blocks        Used   Available Use% Mounted on
/dev/md0       ldiskfs 121226819924 60680523488 59333625012  51% /mnt/ldiskfs
RM 2: =====
0.00user 3.94system 0:06.80elapsed 57%CPU (0avgtext+0avgdata 680maxresident)k
4752inputs+0outputs (1major+214minor)pagefaults 0swaps 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
The reason why in &lt;span class=&quot;code-keyword&quot;&gt;case&lt;/span&gt; &lt;span class=&quot;code-quote&quot;&gt;&quot;50 free blocks - 50 occupied blocks&quot;&lt;/span&gt; setting &lt;span class=&quot;code-quote&quot;&gt;&quot;60-0-0&quot;&lt;/span&gt; can be illustrated by statistics:
mballoc: (7829, 1664192, 0) useless c(0,1,2) loops
mballoc: (981753, 0, 0) skipped c(0,1,2) loops
Yes, there are 7829 c1 loops, but 1664192 c2 loops x1000 times, so we can drop c1 loops influence. In &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; &lt;span class=&quot;code-keyword&quot;&gt;case&lt;/span&gt; we need set &lt;span class=&quot;code-quote&quot;&gt;&quot;60-60-0&quot;&lt;/span&gt; options. Statistic with &lt;span class=&quot;code-quote&quot;&gt;&quot;60-60-0&quot;&lt;/span&gt; shows 1393743 c2 loops skipped and &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; returns to 500M/s write performance:
mballoc: (0, 0, 0) useless c(0,1,2) loops
mballoc: (1425456, 1393743, 0) skipped c(0,1,2) loops
Read performance hasn&apos;t changed - 1.0 GB/s.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="247654" author="adilger" created="Fri, 24 May 2019 16:38:25 +0000"  >&lt;p&gt;A summary of these statistics (table showing total group scans, performance vs for unpatched and patches code for a few different configs) should be included in the commit message for the patch submitted upstream. That makes it clear the patch is providing a real benefit (improved performance, reduced CPU usage).  I think that would make it much easier to get the patch accepted, otherwise just a vague &quot;improves performance&quot; in the comment is not a compelling reason to land it. &lt;/p&gt;</comment>
                            <comment id="248250" author="artem_blagodarenko" created="Mon, 3 Jun 2019 08:44:25 +0000"  >&lt;p&gt;&#160;&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
Artem, we discussed &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; patch on the Ext4 concall today. A couple
 of items came up during discussion:

the patch submission should include performance results to
 &#160;&#160;show that the patch is providing an improvement it would be preferable &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; the thresholds &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; the stages were found
 &#160;&#160;dynamically in the kernel based on how many groups have been skipped
 &#160;&#160;and the free chunk size in each group there would need to be some way to dynamically reset the scanning
 &#160;&#160;level when lots of blocks have been freed &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;Hello &#160;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=adilger&quot; class=&quot;user-hover&quot; rel=&quot;adilger&quot;&gt;adilger&lt;/a&gt;, what do you think about the idea of splitting this into two phases: first as I already sent, second with some autotune logic?&#160;&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="248285" author="adilger" created="Mon, 3 Jun 2019 18:59:52 +0000"  >&lt;p&gt;Definitely including statistics for the performance improvement should be part of the first patch.  I didn&apos;t see a copy of the patch in Gerrit.  Have you submitted it yet?&lt;/p&gt;

&lt;p&gt;I think a very simple heuristic could be used for auto-tune, something like &quot;skip groups with number of free blocks less than 1/2 of average&quot;, or possibly &quot;skip groups with free allocation order less than 1/2 average&quot;, and adjust which scanning stage this applies when, say, 1/2, 3/4, ... of groups are below this level. &lt;/p&gt;</comment>
                            <comment id="248428" author="artem_blagodarenko" created="Wed, 5 Jun 2019 07:41:45 +0000"  >&lt;p&gt;Hello &lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=adilger&quot; class=&quot;user-hover&quot; rel=&quot;adilger&quot;&gt;adilger&lt;/a&gt;,&lt;/p&gt;

&lt;p&gt;&amp;gt;I didn&apos;t see a copy of the patch in Gerrit. Have you submitted it yet?&lt;/p&gt;

&lt;p&gt;Do we need LDISKFS patches here? The right way is to land it in ext4 directly. I am going to send a new patch series, with test results and a debugfs fake-fragmentation patch for tests.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="248457" author="adilger" created="Wed, 5 Jun 2019 15:34:33 +0000"  >&lt;p&gt;I agree that it makes sense to get the patches reviewed and accepted upstream if possible, but after that it might take several years before the change is available in a vendor kernel, so it would make sense to have an ldiskfs patch as well. &lt;/p&gt;

&lt;p&gt;Also, in some cases, patches that improve performance and/or functionality still do not get accepted upstream because of various reasons, so in this case it would still make sense to carry this patch in the Lustre tree because it mostly affects very large OST filesystems. &lt;/p&gt;</comment>
                            <comment id="248880" author="artem_blagodarenko" created="Mon, 10 Jun 2019 15:42:11 +0000"  >&lt;p&gt;Here is a summary from the EXT4 developers call discussion last Thursday.&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;EXT4 users prefer automatic block allocator settings adjusting&#160;&lt;/li&gt;
	&lt;li&gt;The current version of the skipping-loops patch is not interesting, because it requires manual settings.&lt;/li&gt;
	&lt;li&gt;A heuristic that changes allocator behaviour is preferred.&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;My next steps:&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;Continue discussion in EXT4 email list to find the best heuristic&lt;/li&gt;
	&lt;li&gt;Upload current patch version to the Gerrit&lt;/li&gt;
&lt;/ul&gt;
</comment>
                            <comment id="248985" author="gerrit" created="Tue, 11 Jun 2019 13:59:33 +0000"  >&lt;p&gt;Artem Blagodarenko (c17828@cray.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/35180&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/35180&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12103&quot; title=&quot;Improve block allocation for large partitions&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12103&quot;&gt;&lt;del&gt;LU-12103&lt;/del&gt;&lt;/a&gt; ldiskfs: don&apos;t search large block range if disk full&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 5444c2b2d17a58f7b0d2d8aeb23b652ae8d6ecd4&lt;/p&gt;</comment>
                            <comment id="255366" author="artem_blagodarenko" created="Wed, 25 Sep 2019 11:53:13 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=green&quot; class=&quot;user-hover&quot; rel=&quot;green&quot;&gt;green&lt;/a&gt;, I added an issue about porting to RHEL8 - &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12801&quot; title=&quot;Port &amp;quot;ldiskfs: don&amp;#39;t search large block range if disk full&amp;quot; to RHEL 8&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12801&quot;&gt;&lt;del&gt;LU-12801&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="255367" author="gerrit" created="Wed, 25 Sep 2019 12:07:38 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/35180/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/35180/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12103&quot; title=&quot;Improve block allocation for large partitions&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12103&quot;&gt;&lt;del&gt;LU-12103&lt;/del&gt;&lt;/a&gt; ldiskfs: don&apos;t search large block range if disk full&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 95f8ae5677491508ae7182b4f61ead3d413434ae&lt;/p&gt;</comment>
                            <comment id="255368" author="pjones" created="Wed, 25 Sep 2019 12:17:11 +0000"  >&lt;p&gt;Landed for 2.13&lt;/p&gt;</comment>
                            <comment id="257780" author="gerrit" created="Tue, 5 Nov 2019 21:15:56 +0000"  >&lt;p&gt;Minh Diep (mdiep@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/36681&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/36681&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12103&quot; title=&quot;Improve block allocation for large partitions&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12103&quot;&gt;&lt;del&gt;LU-12103&lt;/del&gt;&lt;/a&gt; ldiskfs: don&apos;t search large block range if disk full&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 0da3a44425f5ab1c0417663281f7cc626f99b675&lt;/p&gt;</comment>
                            <comment id="258966" author="adilger" created="Thu, 28 Nov 2019 22:29:34 +0000"  >&lt;p&gt;Hi Artem, I noticed that this patch was only added to the rhel7.6 series, but not the rhel7.7 and rhel8.0 series.  Could you please submit a patch to add &lt;tt&gt;ext4-simple-blockalloc.patch&lt;/tt&gt; to these newer series.&lt;/p&gt;</comment>
                            <comment id="259207" author="gerrit" created="Thu, 5 Dec 2019 14:58:25 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/36681/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/36681/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12103&quot; title=&quot;Improve block allocation for large partitions&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12103&quot;&gt;&lt;del&gt;LU-12103&lt;/del&gt;&lt;/a&gt; ldiskfs: don&apos;t search large block range if disk full&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 810a952303969ca0ee01639a5408ff2f0e3456d9&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="37967">LU-8365</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="55762">LU-12345</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="56993">LU-12801</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="55742">LU-12335</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="57389">LU-12970</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="57424">LU-12988</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="62254">LU-14305</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="32312" name="0002-LUS-6746-ldiskfs-block-allocator-tests.patch" size="3359" author="artem_blagodarenko" created="Mon, 25 Mar 2019 09:34:04 +0000"/>
                            <attachment id="32313" name="allocator-skip-loops-test-results.txt" size="4226" author="artem_blagodarenko" created="Mon, 25 Mar 2019 09:35:51 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00dt3:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                </customfields>
    </item>
</channel>
</rss>