<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:14:56 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-1248] all mdt_rdpg_* threads busy in osd_ea_fid_get()</title>
                <link>https://jira.whamcloud.com/browse/LU-1248</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;The load average on the MDS for a classified production 2.1 filesystem jumped to over 400.  Top showed mdt_rdpg_* threads all using 4-7% CPU time.  This may have been due to a pathological workload, but we were wondering if there&apos;s something like an overly contended lock in ldiskfs going on here. &lt;/p&gt;

&lt;p&gt;Most of the stacks looked like this:&lt;/p&gt;

&lt;p&gt;__cond_resched    &lt;br/&gt;
_cond_resched                                                                  &lt;br/&gt;
ifind_fast                                                                     &lt;br/&gt;
iget_locked                                                                    &lt;br/&gt;
ldiskfs_iget                                                                   &lt;br/&gt;
? generic_detach_inode                                                         &lt;br/&gt;
osd_iget                                                                       &lt;br/&gt;
osd_ea_fid_get                                                                 &lt;br/&gt;
osd_it_ea_rec                                                                  &lt;br/&gt;
mdd_readpage                                                                   &lt;br/&gt;
cml_readpage                                                                   &lt;br/&gt;
mdt_readpage                                                                   &lt;br/&gt;
? mdt_unpack_req_pack_rep                                                      &lt;br/&gt;
mdt_handle_common                                                              &lt;br/&gt;
?  lustre_msg_get_transno                                                      &lt;br/&gt;
mdt_readpage_handle                                                            &lt;br/&gt;
ptlrpc_main                                                                    &lt;br/&gt;
child_rip&lt;/p&gt;
</description>
                <environment>&lt;a href=&quot;https://github.com/chaos/lustre/tree/2.1.0-llnl&quot;&gt;https://github.com/chaos/lustre/tree/2.1.0-llnl&lt;/a&gt;</environment>
        <key id="13654">LU-1248</key>
            <summary>all mdt_rdpg_* threads busy in osd_ea_fid_get()</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="laisiyao">Lai Siyao</assignee>
                                    <reporter username="nedbass">Ned Bass</reporter>
                        <labels>
                    </labels>
                <created>Wed, 21 Mar 2012 17:16:55 +0000</created>
                <updated>Sat, 1 Feb 2014 08:35:15 +0000</updated>
                            <resolved>Tue, 26 Jun 2012 10:03:13 +0000</resolved>
                                    <version>Lustre 2.1.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>8</watches>
                                                                            <comments>
                            <comment id="31952" author="pjones" created="Thu, 22 Mar 2012 22:38:32 +0000"  >&lt;p&gt;Lai&lt;/p&gt;

&lt;p&gt;Could you please comment on this one?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="33219" author="laisiyao" created="Sun, 1 Apr 2012 04:58:56 +0000"  >&lt;p&gt;This looks normal from the code. Basically it&apos;s an inode scalability problem, these busy threads are contending on inode_lock, and Nick Piggin&apos;s inode lock scalability patches are getting merged into kernel 3.x.&lt;/p&gt;

&lt;p&gt;Currently MDT still uses directory+ea to store metadata, while IAM looks to have better performance and scalability, but I&apos;m not clear why it&apos;s not enabled yet.&lt;/p&gt;</comment>
                            <comment id="33517" author="laisiyao" created="Thu, 5 Apr 2012 06:36:45 +0000"  >&lt;p&gt;Ned, is the system upgraded from 1.8? Normally readdir can get fid from dir data, and then it doesn&apos;t need to read fid from ea, but for an upgraded system, it needs to query each inode. Besides, did you see any error messages related with this dir?&lt;/p&gt;</comment>
                            <comment id="33593" author="nedbass" created="Thu, 5 Apr 2012 17:54:47 +0000"  >&lt;p&gt;Yes the system was upgraded from 1.8.  Will files created after the upgrade store the fid in the dir data?&lt;/p&gt;

&lt;p&gt;There are a few &quot;osd_object_delete() Failed to cleanup: -2&quot; console messages on the MDS from around that time.  I don&apos;t find any other errors worth mentioning.&lt;/p&gt;</comment>
                            <comment id="33638" author="laisiyao" created="Fri, 6 Apr 2012 00:37:14 +0000"  >&lt;p&gt;Yes, on upgraded system even newly created dir won&apos;t store fid in dir data; I&apos;ll try whether it&apos;s easy to implement this.&lt;/p&gt;</comment>
                            <comment id="33647" author="laisiyao" created="Fri, 6 Apr 2012 01:52:36 +0000"  >&lt;p&gt;I don&apos;t find an easy way to implement: without change for disk format, there&apos;s no way to distinguish 1.8 directory and newly created directory. The original design for 1.8 &amp;lt;-&amp;gt; 2.x interoperability server is on &lt;a href=&quot;https://bugzilla.lustre.org/show_bug.cgi?id=11826&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;bz11826&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="39907" author="yong.fan" created="Mon, 4 Jun 2012 05:06:27 +0000"  >&lt;p&gt;There are incompatible formats between b1_8 and b2_1: in b1_8, lvfs_dentry_params is appended after the name entry in parent directory; but in b2_1, it is ldiskfs_dentry_param. They are different and incompatible. So when system upgraded from b1_8 to b2_1, the newly created file cannot append ldiskfs_dentry_param (which contains the FID) after its name entry in parent directory, otherwise, the system cannot downgrade back to b1_8.&lt;/p&gt;

&lt;p&gt;But if without FID appended after name entry in parent directory, it will cause performance (for dir readpage) regression. I do not think it is good solution, because upgrade is more often used than downgrade.&lt;/p&gt;

&lt;p&gt;We should make some patch in b2_x to support appending FID after the name entry in parent directory for upgrading case and skip lvfs_dentry_params after the name entry for old files. On the other hand, need another patch against b1_8_x (x &amp;gt;= 8) to skip FID after the name entry in parent directory to support downgrade back to b1_8_x (x &amp;gt;= 8).&lt;/p&gt;</comment>
                            <comment id="39909" author="laisiyao" created="Mon, 4 Jun 2012 05:17:06 +0000"  >&lt;p&gt;Andreas, it looks like we need change both 2.x and 1.8 ldiskfs code to keep both backward and forward compatibility for this, any suggestion for this?&lt;/p&gt;</comment>
                            <comment id="39952" author="adilger" created="Mon, 4 Jun 2012 16:21:27 +0000"  >&lt;p&gt;The need to be able to downgrade from 2.x to 1.8 is only in the case of &quot;simple&quot; upgrade to 2.x that has hit problems and needs to be able to downgrade.  If the upgrade has been successful, and then the admin (separately) enables the &quot;dir_data&quot; feature using tune2fs on the filesystem, this should be enough to allow storing FIDs in the directory entries.  After that point, the filesystem should not be downgraded to 1.8 anymore.&lt;/p&gt;

&lt;p&gt;What definitely &lt;em&gt;should&lt;/em&gt; be avoided is any automatic enabling of the &quot;dir_data&quot; feature on the filesystem when it is first mounted, since this will cause problems if there are FIDs stored in the directory entries, then the filesystem is downgraded to 1.8, the FID-in-LMA is deleted upon access (reverting to IGIF for that inode), and then the filesystem is upgraded again.  That would cause the FID-in-dirent to contain invalid data that OI scrub and e2fsck will not fix yet.&lt;/p&gt;

&lt;p&gt;So, my understanding is that if you are sure there is no need to downgrade to 1.8, it should be possible with 2.1+ to use:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;tune2fs -O dirdata /dev/{mdtdev}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;to enable this feature, and then newly-created files/links will store the FID in the directory.  I don&apos;t know if we have tested this process or not.&lt;/p&gt;

&lt;p&gt;Assuming this is OK, it would then be possible in that case to &quot;refresh&quot; the directory with a script to re-link filenames that are expected to live for a long time, assuming they are not in use, something like:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lfs find /mnt/lustre/some/dir -type f | while read F; do
        FTMP=&quot;$F.XXXXXX.$$.$RANDOM&quot;
        mv &quot;$F&quot; &quot;$FTMP&quot; &amp;amp;&amp;amp; mv &quot;$FTMP&quot; &quot;$F&quot;
done
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;In a later phase of LFSCK, the FID-in-dirent data will be verified and refreshed if missing, but this is not part of the Phase I deliverable.&lt;/p&gt;

&lt;p&gt;This existing behaviour is not a net performance loss in many use cases, since it is prefetching the inode into MDS memory for use as soon as e.g. &quot;ls&quot; does a stat() on the file.  There would only be a visible slowdown in the case of e.g. &quot;find&quot; that is not accessing any of the file attributes, and only generating pathnames.&lt;/p&gt;</comment>
                            <comment id="39972" author="laisiyao" created="Mon, 4 Jun 2012 22:03:05 +0000"  >&lt;p&gt;Andreas, thanks for your detailed explanation! I&apos;ll verify `tune2fs` to enable &apos;dir_data&apos; feature later.&lt;/p&gt;

&lt;p&gt;Ned, are you fine with the result?&lt;/p&gt;</comment>
                            <comment id="39979" author="laisiyao" created="Tue, 5 Jun 2012 00:11:27 +0000"  >&lt;p&gt;The command should be `tune2fs -O dirdata /dev/{mdtdev}`, and I&apos;ve verified that newly created dir will store FID in it.&lt;/p&gt;</comment>
                            <comment id="40026" author="nedbass" created="Tue, 5 Jun 2012 12:19:04 +0000"  >&lt;p&gt;Lai, enabling dir_data seems like a reasonable course of action.  We&apos;ll start some local testing and propose it to our sysadmin team.  Thanks&lt;/p&gt;</comment>
                            <comment id="41136" author="laisiyao" created="Tue, 26 Jun 2012 10:03:13 +0000"  >&lt;p&gt;If 1.8 system is upgraded to 2.x successfully, tune2fs can be used to enable dirdata feature, then new directory will contain inode FID in its data.&lt;/p&gt;</comment>
                            <comment id="41829" author="adilger" created="Fri, 13 Jul 2012 15:52:31 +0000"  >&lt;p&gt;The Lustre Manual should be updated to inform users about how to enable &quot;dirdata&quot; on an upgraded 1.8-&amp;gt;2.x MDT, once they are sure that they will not be downgrading the MDS to 1.8 again.  This will minimize performance impact on newly created files.&lt;/p&gt;</comment>
                            <comment id="41880" author="pjones" created="Mon, 16 Jul 2012 09:34:31 +0000"  >&lt;p&gt;Cliff could you please create an LUDOC ticket to track Andreas&apos;s request?&lt;/p&gt;</comment>
                            <comment id="41895" author="cliffw" created="Mon, 16 Jul 2012 14:29:53 +0000"  >&lt;p&gt;&lt;a href=&quot;http://jira.whamcloud.com/browse/LUDOC-68&quot; class=&quot;external-link&quot; rel=&quot;nofollow&quot;&gt;http://jira.whamcloud.com/browse/LUDOC-68&lt;/a&gt; has been created to track the manual changes&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvh5j:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>6428</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>