<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:11:51 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-933] allow disabling the mdc_rpc_lock for performance testing</title>
                <link>https://jira.whamcloud.com/browse/LU-933</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;It is desirable to allow disabling the client mdc_{get,put}_rpc_lock() in order to allow clients to send multiple filesystem-modifying RPCs at the same time.  While this would break MDS recovery (due to insufficient transaction slots in the MDS last_rcvd file) it would allow a smaller number of clients to generate a much higher RPC load on the MDS.  This is ideal for MDS/RPC load testing purposes, and can also be used to help evaluate the potential benefits of implementing the multi-slot last_rcvd feature.&lt;/p&gt;

&lt;p&gt;A simple mechanism to do this would be to set the client fail_loc to a specific value, which allows the client to send multiple metadata-modifying requests at one time.  Some care must be taken when setting and clearing this fail_loc, since it could lead to inconsistencies where mdc_get_rpc_lock() is skipped when the fail_loc is set, but mdc_put_rpc_lock() for that same RPC is run when fail_loc is cleared.&lt;/p&gt;

&lt;p&gt;One possibility is something like the following, though there are many others.  This implementation:&lt;/p&gt;
&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;ensures that requests sent when OBD_FAIL_MDC_SEM is turned off do not happen concurrent with other requests&lt;/li&gt;
	&lt;li&gt;is race free even in the transition period when OBD_FAIL_MDC_SEM is turned on or off&lt;/li&gt;
&lt;/ul&gt;


&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;struct mdc_rpc_lock {
        cfs_semaphore_t       rpcl_sem;
        struct lookup_intent *rpcl_it;
        int                   rpcl_fakes;
};

#define MDC_FAKE_RPCL_IT ((void *)0x2c0012bfUL)

static inline void mdc_get_rpc_lock(struct mdc_rpc_lock *lck,
                                    struct lookup_intent *it)
{
        ENTRY;
        if (it == NULL || (it-&amp;gt;it_op != IT_GETATTR &amp;amp;&amp;amp; it-&amp;gt;it_op != IT_LOOKUP)) {
                /* This would normally block until the existing request finishes.
                 * If fail_loc is set it will block until the regular request is
                 * done, then set rpcl_it to MDC_FAKE_RPCL_IT.  Once that is set
                 * it will only be cleared when all fake requests are finished.
                 * Only when all fake requests are finished can normal requests
                 * be sent, to ensure they are recoverable again. */
                cfs_down(&amp;amp;lck-&amp;gt;rpcl_sem);
                if (CFS_FAIL_CHECK(OBD_FAIL_MDC_RPCS_SEM)) {
                        lck-&amp;gt;rpcl_it = MDC_FAKE_RPCL_IT;
                        lck-&amp;gt;rpcl_fakes++;
                        cfs_up(&amp;amp;lck-&amp;gt;rpcl_sem);
                } else {
                        /* This will only happen when the CFS_FAIL_CHECK() was
                         * just turned off but there are still requests in progress.
                         * Wait until they finish.  It doesn&apos;t need to be efficient
                         * in this extremely rare case, just have low overhead in
                         * the common case when it isn&apos;t true. */
                        while (unlikely(lck-&amp;gt;rpcl_it == MDC_FAKE_RPCL_IT))
                                cfs_schedule_timeout(cfs_time_seconds(1));
                        LASSERT(lck-&amp;gt;rpcl_it == NULL);
                        lck-&amp;gt;rpcl_it = it;
                }
        }
}

static inline void mdc_put_rpc_lock(struct mdc_rpc_lock *lck,
                                    struct lookup_intent *it)
{
        if (it == NULL || (it-&amp;gt;it_op != IT_GETATTR &amp;amp;&amp;amp; it-&amp;gt;it_op != IT_LOOKUP)) {
                if (lck-&amp;gt;rpcl_it == MDC_FAKE_RPCL_IT) {
                        cfs_down(&amp;amp;lck-&amp;gt;rpcl_sem);
                        LASSERTF(lck-&amp;gt;rpcl_fakes &amp;gt; 0, &quot;%d\n&quot;, lck-&amp;gt;rpcl_fakes);
                        if (--lck-&amp;gt;rpcl_fakes == 0) {
                                lck-&amp;gt;rpcl_it = NULL;
                        }
                } else {
                        LASSERTF(it == lck-&amp;gt;rpcl_it, &quot;%p != %p\n&quot;, it, lck-&amp;gt;rpcl_it);
                        lck-&amp;gt;rpcl_it = NULL;
                }
                cfs_up(&amp;amp;lck-&amp;gt;rpcl_sem);
        }
        EXIT;
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="12696">LU-933</key>
            <summary>allow disabling the mdc_rpc_lock for performance testing</summary>
                <type id="4" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11310&amp;avatarType=issuetype">Improvement</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="liang">Liang Zhen</assignee>
                                    <reporter username="adilger">Andreas Dilger</reporter>
                        <labels>
                            <label>opensfs</label>
                    </labels>
                <created>Fri, 16 Dec 2011 01:57:56 +0000</created>
                <updated>Wed, 17 Sep 2014 20:44:24 +0000</updated>
                            <resolved>Fri, 28 Sep 2012 12:08:38 +0000</resolved>
                                                    <fixVersion>Lustre 2.3.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>7</watches>
                                                                            <comments>
                            <comment id="24862" author="adilger" created="Fri, 16 Dec 2011 02:01:55 +0000"  >&lt;p&gt;Note that the reason I picked fail_loc as the mechanism for setting this is because this is hard for a user to accidentally think is &quot;just a tunable&quot;.  If there was a tunable like mdc.*.write_rpcs_in_flight someone might set this and find improved performance, and not realize that it breaks recovery even if that were documented in the manual.&lt;/p&gt;</comment>
                            <comment id="27880" author="liang" created="Fri, 3 Feb 2012 11:07:34 +0000"  >&lt;p&gt;I&apos;ve posted a patch at here: &lt;a href=&quot;http://review.whamcloud.com/#change,2084&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#change,2084&lt;/a&gt;&lt;br/&gt;
it&apos;s just from code sample of Andreas with some small adjustments.&lt;/p&gt;</comment>
                            <comment id="45732" author="jlevi" created="Fri, 28 Sep 2012 12:08:38 +0000"  >&lt;p&gt;Please let me know if additional work is needed and I will reopen this ticket.&lt;/p&gt;</comment>
                            <comment id="71514" author="gabriele.paciucci" created="Thu, 14 Nov 2013 10:05:54 +0000"  >&lt;p&gt;Hi,&lt;br/&gt;
I have received two different behaviors on two identical clients:&lt;/p&gt;

&lt;p&gt;root@pilatus11:~# rpm -qa | grep lustre&lt;br/&gt;
lustre-client-modules-2.4.1-3.0.80_0.7_default&lt;br/&gt;
lustre-client-2.4.1-3.0.80_0.7_default&lt;/p&gt;

&lt;p&gt;root@pilatus11:~# /usr/sbin/lctl set_param fail_loc=0x804&lt;br/&gt;
fail_loc=0x804&lt;/p&gt;

&lt;p&gt;root@pilatus11:~# /usr/sbin/lctl get_param fail_loc&lt;br/&gt;
fail_loc=1073743876&lt;/p&gt;

&lt;p&gt;this is for another client:&lt;/p&gt;

&lt;p&gt;root@pilatus31:~# rpm -qa | grep lustre&lt;br/&gt;
lustre-client-modules-2.4.1-3.0.80_0.7_default&lt;br/&gt;
lustre-client-2.4.1-3.0.80_0.7_default&lt;/p&gt;

&lt;p&gt;root@pilatus31:~# lctl set_param fail_loc=0x804&lt;br/&gt;
fail_loc=0x804&lt;/p&gt;

&lt;p&gt;root@pilatus31:~# lctl get_param fail_loc&lt;br/&gt;
fail_loc=2052&lt;/p&gt;


</comment>
                            <comment id="71515" author="gabriele.paciucci" created="Thu, 14 Nov 2013 10:51:54 +0000"  >&lt;p&gt;which is the default value for fail_loc? I have remounted the client and I have received the same value.&lt;/p&gt;</comment>
                            <comment id="71516" author="adilger" created="Thu, 14 Nov 2013 11:46:00 +0000"  >&lt;p&gt;1073743876 = 0x40000804, and 2052 = 0x804 = OBD_FAIL_MDC_RPCS_SEM.  The 0x40000000 value is CFS_FAILED, which means that the OBD_FAIL_MDC_RPCS_SEM check was hit at least once.&lt;/p&gt;

&lt;p&gt;The default value for fail_loc is &quot;0&quot;, which means no failures are being injected into the code.  Since the cfs_fail_loc variable is in the libcfs code, it will only be reset if you unmount the client and remove all of the Lustre modules.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="25521">LU-5319</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzv6jr:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>4595</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                            <customfield id="customfield_10002" key="com.atlassian.jira.plugin.system.customfieldtypes:float">
                        <customfieldname>Story Points</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>2.0</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>