<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:53:11 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-12505] mounting bigalloc enabled large OST takes a long time</title>
                <link>https://jira.whamcloud.com/browse/LU-12505</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Not only Lustre OST, but also when OSS mounts large OST device which &apos;bigalloc&apos; is enabled, it takes huge amount of time to complete.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# time mount -t ldiskfs /dev/ddn/scratch0_ost0000 /lustre/scratch0/ost0000

real    12m32.153s
user    0m0.000s
sys     11m49.887s
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# dumpe2fs -h /dev/ddn/scratch0_ost0000
dumpe2fs 1.45.2.wc1 (27-May-2019)
Filesystem volume name:   scratch0-OST0000
Last mounted on:          /
Filesystem UUID:          1ca9dd81-8b70-4805-a430-78b0eafc1c45
Filesystem magic number:  0xEF53
Filesystem revision #:    1 (dynamic)
Filesystem features:      has_journal ext_attr dir_index filetype needs_recovery meta_bg extent 64bit mmp flex_bg sparse_super large_file huge_file uninit_bg dir_nlink quota bigalloc
Filesystem flags:         signed_directory_hash 
Default mount options:    user_xattr acl
Filesystem state:         clean
Errors behavior:          Continue
Filesystem OS type:       Linux
Inode count:              1074397184
Block count:              275045679104
Reserved block count:     2750456791
Free blocks:              274909403680
Free inodes:              1074396851
First block:              0
Block size:               4096
Cluster size:             131072
Group descriptor size:    64
Blocks per group:         1048576
Clusters per group:       32768
Inodes per group:         4096
Inode blocks per group:   512
RAID stride:              512
RAID stripe width:        512
Flex block group size:    256
Filesystem created:       Mon Jul  1 00:43:14 2019
Last mount time:          Wed Jul  3 05:55:22 2019
Last write time:          Wed Jul  3 05:55:22 2019
Mount count:              8
Maximum mount count:      -1
Last checked:             Mon Jul  1 00:43:14 2019
Check interval:           0 (&amp;lt;none&amp;gt;)
Lifetime writes:          2693 GB
Reserved blocks uid:      0 (user root)
Reserved blocks gid:      0 (group root)
First inode:              11
Inode size:               512
Required extra isize:     32
Desired extra isize:      32
Journal inode:            8
Default directory hash:   half_md4
Directory Hash Seed:      4eeb2234-062d-4af5-8973-872baabd2e9f
Journal backup:           inode blocks
MMP block number:         131680
MMP update interval:      5
User quota inode:         3
Group quota inode:        4
Journal features:         journal_incompat_revoke journal_64bit
Journal size:             4096M
Journal length:           1048576
Journal sequence:         0x00000494
Journal start:            0
MMP_block:
    mmp_magic: 0x4d4d50
    mmp_check_interval: 10
    mmp_sequence: 0x0000cd
    mmp_update_date: Wed Jul  3 06:00:33 2019
    mmp_update_time: 1562133633
    mmp_node_name: es18k-vm11
    mmp_device_name: sda
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Without bigalloc&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# time mount -t ldiskfs /dev/ddn/scratch0_ost0000 /lustre/scratch0/ost0000

real	0m6.484s
user	0m0.000s
sys	0m4.954s
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment>master</environment>
        <key id="56260">LU-12505</key>
            <summary>mounting bigalloc enabled large OST takes a long time</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="dongyang">Dongyang Li</assignee>
                                    <reporter username="sihara">Shuichi Ihara</reporter>
                        <labels>
                            <label>e2fsprogs</label>
                    </labels>
                <created>Wed, 3 Jul 2019 06:13:23 +0000</created>
                <updated>Wed, 10 May 2023 20:55:48 +0000</updated>
                            <resolved>Wed, 10 May 2023 20:55:48 +0000</resolved>
                                                                        <due></due>
                            <votes>0</votes>
                                    <watches>12</watches>
                                                                            <comments>
                            <comment id="250586" author="adilger" created="Wed, 3 Jul 2019 06:36:33 +0000"  >&lt;p&gt;Could you please add the &quot;&lt;tt&gt;dumpe2fs&lt;/tt&gt;&quot; output from the OST (gzipped).  Normally when mount is slow it is because the disk is seeking between thousands/millions of different data structures at 10ms/seek.  Those problems were largely fixed by &lt;tt&gt;flex_bg&lt;/tt&gt;, but it may be that &lt;tt&gt;meta_bg&lt;/tt&gt; has reintroduced this problem again.  It may be that with &lt;tt&gt;bigalloc&lt;/tt&gt; we don&apos;t need &lt;tt&gt;meta_bg&lt;/tt&gt; anymore because the number of block groups are reduced?&lt;/p&gt;</comment>
                            <comment id="250587" author="sihara" created="Wed, 3 Jul 2019 07:01:22 +0000"  >&lt;p&gt;uploaded dumpe2fs.out.gz. i&apos;ve tested without &apos;meta_bg&apos; before, but it was same and took a long time.&lt;br/&gt;
 And there were nothing disk I/O at most of time and 100% cpu bound below.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Tasks: 237 total,   2 running, 235 sleeping,   0 stopped,   0 zombie
%Cpu(s):  0.4 us,  6.6 sy,  0.0 ni, 93.0 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
KiB Mem : 15456899+total, 15270185+free,  1553196 used,   313944 buff/cache
KiB Swap:  5472252 total,  5472252 free,        0 used. 15216664+avail Mem 

  PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM     TIME+ COMMAND                                                                            
13008 root      20   0   19940   1052    868 R 100.0  0.0   0:29.86 mount                                                                              
    1 root      20   0   44604   4908   2552 S   0.0  0.0   0:02.00 systemd         
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;it looks ldiskfs_get_group_desc() and ldiskfs_calculate_overhead() are taking most of CPU cycle a long while during mount.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Samples: 108K of event &apos;cycles&apos;, Event count (approx.): 26372312997                                                                                     
Overhead  Shared Object          Symbol                                                                                                                 
  52.20%  [kernel]               [k] ldiskfs_get_group_desc                                                                                             
  45.13%  [kernel]               [k] ldiskfs_calculate_overhead                                                                                         
   0.31%  [kernel]               [k] native_write_msr_safe                                                                                               
   0.23%  [kernel]               [k] crc16                                                                                                               
   0.21%  [kernel]               [k] apic_timer_interrupt                                                                                               
   0.19%  [kernel]               [k] arch_cpu_idle                    
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="250590" author="adilger" created="Wed, 3 Jul 2019 07:43:32 +0000"  >&lt;p&gt;It looks like the problem is in &lt;tt&gt;ext4_calculate_overhead()&lt;/tt&gt; and &lt;tt&gt;count_overhead()&lt;/tt&gt;, since there is a simple calculation for normal filesystems, and a complex one that loads and checks every group in the &lt;tt&gt;bigalloc&lt;/tt&gt; case, and &lt;tt&gt;ext4_calculate_ovehead()&lt;/tt&gt; calls &lt;tt&gt;count_overhead()&lt;/tt&gt; for &lt;b&gt;every group&lt;/b&gt; as well:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
&lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; count_overhead(struct super_block *sb, ext4_group_t grp,
                          &lt;span class=&quot;code-object&quot;&gt;char&lt;/span&gt; *buf)
{
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC))
                &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) +
                        sbi-&amp;gt;s_itb_per_group + 2);
        
        first_block = le32_to_cpu(sbi-&amp;gt;s_es-&amp;gt;s_first_data_block) +
                (grp * EXT4_BLOCKS_PER_GROUP(sb));
        last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1;
        &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; (i = 0; i &amp;lt; ngroups; i++) {
                gdp = ext4_get_group_desc(sb, i, NULL);
                :

&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; ext4_calculate_overhead(struct super_block *sb)
{
        /* Compute the overhead (FS structures).  This is constant
         * &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; a given filesystem unless the number of block groups
         * changes so we cache the previous value until it does. */

        &lt;span class=&quot;code-comment&quot;&gt;/* All of the blocks before first_data_block are overhead */&lt;/span&gt;
        overhead = EXT4_B2C(sbi, le32_to_cpu(es-&amp;gt;s_first_data_block));

        &lt;span class=&quot;code-comment&quot;&gt;/* Add the overhead found in each block group */&lt;/span&gt;
        &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; (i = 0; i &amp;lt; ngroups; i++) {
                blks = count_overhead(sb, i, buf);
                overhead += blks;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;That means for a 1024 TiB filesystem (&lt;tt&gt;num_groups = 1024TB / (32768 * chunk_size/group) = 256K groups&lt;/tt&gt;) it will do 256K*256K = 68B checks, which would be very slow and pointless.  I did read somewhere that mke2fs should store this overhead into the superblock at format time, so the kernel can avoid doing this pointless operation, but possibly that isn&apos;t in the kernel you are using, or it isn&apos;t working properly and nobody noticed for small filesystems?&lt;/p&gt;</comment>
                            <comment id="250594" author="sihara" created="Wed, 3 Jul 2019 08:50:57 +0000"  >&lt;p&gt;maybe, it would be better to test with newer kernel if same behavior reproduced?&lt;br/&gt;
btw, mke2fs to bigalloc enabled OST, is also very slow either.&lt;/p&gt;

&lt;p&gt;without bigalloc&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# time mkfs.lustre --ost --servicenode=127.0.0.2@tcp --fsname=scratch0 --index=2 --mgsnode=127.0.0.2@tcp --mkfsoptions=&apos;-E lazy_itable_init=0,lazy_journal_init=0,stripe_width=512,stride=512 -O meta_bg,^resize_inode -m1 -J size=4096&apos; --reformat --backfstype=ldiskfs /dev/ddn/scratch0_ost0tune2fs -E mmp_update_interval=5 /dev/ddn/scratch0_ost0002

real    9m11.614s
user    0m59.894s
sys     7m10.594s
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;with bigalloc&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# time mkfs.lustre --ost --servicenode=127.0.0.2@tcp --fsname=scratch0 --index=0 --mgsnode=127.0.0.2@tcp --mkfsoptions=&apos;-E lazy_itable_init=0,lazy_journal_init=0,stripe_width=512,stride=512 -O bigalloc -C 131072 -m1 -J size=4096&apos; --reformat --backfstype=ldiskfs /dev/ddn/scratch0_ost0000

real    43m5.349s
user    24m29.652s
sys     18m35.058s
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The most of CPU time are consumed at the following functions which I didn&apos;t see mke2fs without &apos;-O bigalloc&apos;.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Samples: 24K of event &apos;cycles&apos;, Event count (approx.): 14154870804              
Overhead  Shared Object      Symbol                                             
  46.30%  libext2fs.so.2.4   [.] rb_test_bmap                                   
  32.98%  libext2fs.so.2.4   [.] ext2fs_test_generic_bmap                       
  13.10%  libext2fs.so.2.4   [.] ext2fs_convert_subcluster_bitmap               
   6.96%  libext2fs.so.2.4   [.] ext2fs_test_generic_bmap@plt       
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="250634" author="adilger" created="Wed, 3 Jul 2019 18:49:55 +0000"  >&lt;p&gt;It wouldn&apos;t be a bad idea to post an email to linux-ext4 with this information.  Maybe we can get some input on how to fix it, or Ted will &quot;just know&quot; the best way to fix the problem.&lt;/p&gt;</comment>
                            <comment id="252363" author="gerrit" created="Wed, 31 Jul 2019 23:52:54 +0000"  >&lt;p&gt;Li Dongyang (dongyangli@ddn.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/35659&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/35659&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12505&quot; title=&quot;mounting bigalloc enabled large OST takes a long time&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12505&quot;&gt;&lt;del&gt;LU-12505&lt;/del&gt;&lt;/a&gt; libext2fs: optimize ext2fs_convert_subcluster_bitmap()&lt;br/&gt;
Project: tools/e2fsprogs&lt;br/&gt;
Branch: master-lustre&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 47d5bc9d922585229dfd5da82a1f19ff93bea28e&lt;/p&gt;</comment>
                            <comment id="252383" author="shadow" created="Thu, 1 Aug 2019 12:55:38 +0000"  >&lt;p&gt;&amp;gt;it looks ldiskfs_get_group_desc() and ldiskfs_calculate_overhead() are taking most of CPU cycle a long while during mount.&lt;/p&gt;

&lt;p&gt;only once and store to super block for later use.&lt;/p&gt;

&lt;p&gt; &amp;gt;46.30%  libext2fs.so.2.4   &lt;span class=&quot;error&quot;&gt;&amp;#91;.&amp;#93;&lt;/span&gt; rb_test_bmap                                   &lt;br/&gt;
 &amp;gt;32.98%  libext2fs.so.2.4   &lt;span class=&quot;error&quot;&gt;&amp;#91;.&amp;#93;&lt;/span&gt; ext2fs_test_generic_bmap                       &lt;/p&gt;

&lt;p&gt;it&apos;s know problem. bitmaps on e2fsprogs isn&apos;t good designed in case word have a several bits set, replace with IDR (from kernel) can improve speed dramatically.&lt;/p&gt;</comment>
                            <comment id="252985" author="gerrit" created="Tue, 13 Aug 2019 04:59:37 +0000"  >&lt;p&gt;Li Dongyang (dongyangli@ddn.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/35781&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/35781&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12505&quot; title=&quot;mounting bigalloc enabled large OST takes a long time&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12505&quot;&gt;&lt;del&gt;LU-12505&lt;/del&gt;&lt;/a&gt; mke2fs: set overhead in super block for bigalloc&lt;br/&gt;
Project: tools/e2fsprogs&lt;br/&gt;
Branch: master-lustre&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 8624a496ff7c3e4fd69fb7217ff56030111f4460&lt;/p&gt;</comment>
                            <comment id="257466" author="adilger" created="Fri, 1 Nov 2019 01:00:01 +0000"  >&lt;p&gt;Dongyang, have these patches been submitted upstream yet?&lt;/p&gt;</comment>
                            <comment id="271208" author="adilger" created="Wed, 27 May 2020 02:29:48 +0000"  >&lt;p&gt;To answer my own question, the bigalloc patches are on the master branch of the e2fsprogs repo, but not in the maint branch for 1.45.6.&lt;/p&gt;</comment>
                            <comment id="371834" author="adilger" created="Wed, 10 May 2023 20:55:48 +0000"  >&lt;p&gt;Patch was landed upstream for 1.46 via commit 59037c5357d39c6d0f14a0aff70e67dc13eafc84&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="59334">LU-13604</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="33063" name="dumpe2fs.out.gz" size="15180458" author="sihara" created="Wed, 3 Jul 2019 07:00:08 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00j53:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>