<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:29:11 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-16691] optimize ldiskfs prealloc (PA) under random read workloads</title>
                <link>https://jira.whamcloud.com/browse/LU-16691</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;In some cases, ldiskfs block allocation can consume a large amount of CPU cycles handling block allocations and cause OST threads to become blocked:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;crmd[16542]:  notice: High CPU load detected: 261.019989
crmd[16542]:  notice: High CPU load detected: 258.720001
crmd[16542]:  notice: High CPU load detected: 265.029999
crmd[16542]:  notice: High CPU load detected: 270.309998

 INFO: task ll_ost00_027:20788 blocked for more than 90 seconds.
 ll_ost00_027    D ffff92242eda9080     0 20788      2 0x00000080
 Call Trace:
 schedule+0x29/0x70
 wait_transaction_locked+0x85/0xd0 [jbd2]
 add_transaction_credits+0x278/0x310 [jbd2]
 start_this_handle+0x1a1/0x430 [jbd2]
 jbd2__journal_start+0xf3/0x1f0 [jbd2]
 __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs]
 osd_trans_start+0x1e7/0x570 [osd_ldiskfs]
 ofd_trans_start+0x75/0xf0 [ofd]
 ofd_attr_set+0x586/0xb00 [ofd]
 ofd_setattr_hdl+0x31d/0x960 [ofd]
 tgt_request_handle+0xb7e/0x1700 [ptlrpc]
 ptlrpc_server_handle_request+0x253/0xbd0 [ptlrpc]
 ptlrpc_main+0xc09/0x1c30 [ptlrpc]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Perf stats show that a large amount of CPU time is used in preallocation:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Samples: 86M of event &apos;cycles&apos;, 4000 Hz, Event count (approx.): 25480688920 lost: 0/0 drop: 0/0
Overhead  Shared Object               Symbol
  23,81%  [kernel]                    [k] _raw_qspin_lock
  21,90%  [kernel]                    [k] ldiskfs_mb_use_preallocated
  20,16%  [kernel]                    [k] __raw_callee_save___pv_queued_spin_unlock
  15,46%  [kernel]                    [k] ldiskfs_mb_normalize_request
   1,21%  [kernel]                    [k] rwsem_spin_on_owner
   0,98%  [kernel]                    [k] native_write_msr_safe
   0,54%  [kernel]                    [k] apic_timer_interrupt
   0,51%  [kernel]                    [k] ktime_get
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
</description>
                <environment></environment>
        <key id="75370">LU-16691</key>
            <summary>optimize ldiskfs prealloc (PA) under random read workloads</summary>
                <type id="4" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11310&amp;avatarType=issuetype">Improvement</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="bzzz">Alex Zhuravlev</assignee>
                                    <reporter username="adilger">Andreas Dilger</reporter>
                        <labels>
                            <label>ldiskfs</label>
                    </labels>
                <created>Fri, 31 Mar 2023 05:01:25 +0000</created>
                <updated>Sat, 29 Jul 2023 07:57:53 +0000</updated>
                            <resolved>Sun, 9 Jul 2023 14:25:08 +0000</resolved>
                                    <version>Lustre 2.16.0</version>
                    <version>Lustre 2.15.2</version>
                                    <fixVersion>Lustre 2.16.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="367955" author="adilger" created="Fri, 31 Mar 2023 05:04:41 +0000"  >&lt;p&gt;Looking at the flame graphs, I would suspect that something may be wrong with the preallocation (PA), for example too many PA regions, or something else that is causing these functions to be slow.  According to the flame graph &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/48613/48613_oss07.perf.svg&quot; title=&quot;oss07.perf.svg attached to LU-16691&quot;&gt;oss07.perf.svg&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;, for each call to &lt;tt&gt;ldiskfs_mb_new_blocks()&lt;/tt&gt; there is a large amount of time spent in _raw_spin_lock(), ldiskfs_mb_normalize_request(), and ldiskfs_mb_use_preallocated().&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
ldiskfs_fsblk_t ldiskfs_mb_new_blocks(handle_t *handle,
                                struct ldiskfs_allocation_request *ar, &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; *errp)
{
        :
        :
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (!ldiskfs_mb_use_preallocated(ac)) {
                ac-&amp;gt;ac_op = LDISKFS_MB_HISTORY_ALLOC;
                ldiskfs_mb_normalize_request(ac, ar);
repeat:
                &lt;span class=&quot;code-comment&quot;&gt;/* allocate space in core */&lt;/span&gt;
                *errp = ldiskfs_mb_regular_allocator(ac);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;so these heavy functions are &lt;b&gt;before&lt;/b&gt; &lt;tt&gt;ldiskfs_mb_regular_allocator()&lt;/tt&gt; is called.  There is a loop in &lt;tt&gt;ldiskfs_mb_use_preallocated()&lt;/tt&gt; that is repeatedly getting a spinlock, but it doesn&apos;t appear to be successful in finding a good PA, since the function ends up returning &quot;0&quot; and then &lt;tt&gt;ldiskfs_mb_normalize_request()&lt;/tt&gt; is called anyway:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
ldiskfs_mb_use_preallocated(struct ldiskfs_allocation_context *ac)
{
        &lt;span class=&quot;code-comment&quot;&gt;/* first, &lt;span class=&quot;code-keyword&quot;&gt;try&lt;/span&gt; per-file preallocation */&lt;/span&gt;
        list_for_each_entry_rcu(pa, &amp;amp;ei-&amp;gt;i_prealloc_list, pa_inode_list) {
                :
                &lt;span class=&quot;code-comment&quot;&gt;/* found preallocated blocks, use them */&lt;/span&gt;
                spin_lock(&amp;amp;pa-&amp;gt;pa_lock);
                &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (pa-&amp;gt;pa_deleted == 0 &amp;amp;&amp;amp; pa-&amp;gt;pa_free) {
                        :
                        &lt;span class=&quot;code-comment&quot;&gt;/* &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; branch is never taken */&lt;/span&gt;
                        :
                        &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; 1;
                }
                spin_unlock(&amp;amp;pa-&amp;gt;pa_lock);
        }
        :
        /*
         * search &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; the prealloc space that is having
         * minimal distance from the goal block.                
         */             
        &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; (i = order; i &amp;lt; PREALLOC_TB_SIZE; i++) {
                list_for_each_entry_rcu(pa, &amp;amp;lg-&amp;gt;lg_prealloc_list[i],
                                        pa_inode_list) {
                        spin_lock(&amp;amp;pa-&amp;gt;pa_lock);
                        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (pa-&amp;gt;pa_deleted == 0 &amp;amp;&amp;amp;
                            pa-&amp;gt;pa_free &amp;gt;= ac-&amp;gt;ac_o_ex.fe_len) {
        
                                cpa = ldiskfs_mb_check_group_pa(goal_block,
                                                                pa, cpa);
                        }
                        spin_unlock(&amp;amp;pa-&amp;gt;pa_lock);
                }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;and then in &lt;tt&gt;ldiskfs_mb_normalize_request()&lt;/tt&gt; it looks like the same PA lists are walked again and the same locks are contended:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
ldiskfs_mb_normalize_request(struct ldiskfs_allocation_context *ac,
                                struct ldiskfs_allocation_request *ar)
{
         :
        list_for_each_entry_rcu(pa, &amp;amp;ei-&amp;gt;i_prealloc_list, pa_inode_list) {
                ldiskfs_lblk_t pa_end;

                &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (pa-&amp;gt;pa_deleted)
                        &lt;span class=&quot;code-keyword&quot;&gt;continue&lt;/span&gt;;
                spin_lock(&amp;amp;pa-&amp;gt;pa_lock);
                :
                &lt;span class=&quot;code-comment&quot;&gt;/* lots of checks */&lt;/span&gt;
                :
                spin_unlock(&amp;amp;pa-&amp;gt;pa_lock);
        }
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;By all rights, since these PA lists are on a single inode, there shouldn&apos;t be much contention, but it seems to fit the pattern shown by the flame graphs.  Unfortunately, it isn&apos;t possible to know if the slow threads were all accessing a single file or different files.&lt;/p&gt;

&lt;p&gt;I think it makes sense to backport either &lt;a href=&quot;https://patchwork.ozlabs.org/project/linux-ext4/list/?series=346731&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://patchwork.ozlabs.org/project/linux-ext4/list/?series=346731&lt;/a&gt; to ldiskfs, or at least the prealloc list fixed limit patch &lt;a href=&quot;https://lore.kernel.org/all/d7a98178-056b-6db5-6bce-4ead23f4a257@gmail.com/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://lore.kernel.org/all/d7a98178-056b-6db5-6bce-4ead23f4a257@gmail.com/&lt;/a&gt; to prevent the PA list from getting too long...&lt;/p&gt;</comment>
                            <comment id="367957" author="bzzz" created="Fri, 31 Mar 2023 05:17:20 +0000"  >&lt;blockquote&gt;&lt;p&gt; &lt;a href=&quot;https://lore.kernel.org/all/d7a98178-056b-6db5-6bce-4ead23f4a257@gmail.com/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://lore.kernel.org/all/d7a98178-056b-6db5-6bce-4ead23f4a257@gmail.com/&lt;/a&gt; &lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;this one looks simple enough&lt;/p&gt;</comment>
                            <comment id="367958" author="gerrit" created="Fri, 31 Mar 2023 05:42:31 +0000"  >&lt;p&gt;&quot;Alex Zhuravlev &amp;lt;bzzz@whamcloud.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/50481&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/50481&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16691&quot; title=&quot;optimize ldiskfs prealloc (PA) under random read workloads&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16691&quot;&gt;&lt;del&gt;LU-16691&lt;/del&gt;&lt;/a&gt; ldiskfs: limit preallocation list&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: acf2f540db47d223e6999e5923aec8549be52d0b&lt;/p&gt;</comment>
                            <comment id="378036" author="gerrit" created="Sat, 8 Jul 2023 22:34:56 +0000"  >&lt;p&gt;&quot;Oleg Drokin &amp;lt;green@whamcloud.com&amp;gt;&quot; merged in patch &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/50481/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/50481/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16691&quot; title=&quot;optimize ldiskfs prealloc (PA) under random read workloads&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16691&quot;&gt;&lt;del&gt;LU-16691&lt;/del&gt;&lt;/a&gt; ldiskfs: limit length of per-inode prealloc list&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: b16c9333a00802faea419dfe6fbb013c4477c9c6&lt;/p&gt;</comment>
                            <comment id="378068" author="pjones" created="Sun, 9 Jul 2023 14:25:08 +0000"  >&lt;p&gt;Landed for 2.16&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="57389">LU-12970</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                                        </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="48613" name="oss07.perf.svg" size="1342323" author="adilger" created="Fri, 31 Mar 2023 04:58:21 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i03hq7:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                </customfields>
    </item>
</channel>
</rss>