<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:13:38 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-1114] ptrlrpd thread spinning since Lustre start on Client</title>
                <link>https://jira.whamcloud.com/browse/LU-1114</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Example of affected thread :&lt;br/&gt;
============================&lt;br/&gt;
PID: 7947 TASK: ffff881030721850 CPU: 1 COMMAND: &quot;ptlrpcd_4&quot;&lt;br/&gt;
 #0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880044e27e90&amp;#93;&lt;/span&gt; crash_nmi_callback at ffffffff8101fd06&lt;br/&gt;
 0000001 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880044e27ea0&amp;#93;&lt;/span&gt; notifier_call_chain at ffffffff814837f5&lt;br/&gt;
 0000002 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880044e27ee0&amp;#93;&lt;/span&gt; atomic_notifier_call_chain at ffffffff8148385a&lt;br/&gt;
 0000003 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880044e27ef0&amp;#93;&lt;/span&gt; notify_die at ffffffff8108026e&lt;br/&gt;
 0000004 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880044e27f20&amp;#93;&lt;/span&gt; do_nmi at ffffffff81481443&lt;br/&gt;
 0000005 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880044e27f50&amp;#93;&lt;/span&gt; nmi at ffffffff81480d50&lt;br/&gt;
    &lt;span class=&quot;error&quot;&gt;&amp;#91;exception RIP: _spin_lock+30&amp;#93;&lt;/span&gt;&lt;br/&gt;
    RIP: ffffffff8148062e RSP: ffff881030757da0 RFLAGS: 00000202&lt;br/&gt;
    RAX: 0000000000000000 RBX: ffff881030632540 RCX: 0000000000000000&lt;br/&gt;
    RDX: 0000000000000001 RSI: ffff88103c45e3d0 RDI: ffff88103c45e498&lt;br/&gt;
    RBP: ffff881030757da0 R8: ebc0de0100000000 R9: ffffffff00000100&lt;br/&gt;
    R10: 0000000000000000 R11: 000000000000000f R12: ffff881030632540&lt;br/&gt;
    R13: ffff881030632570 R14: ffff88103c45e3d0 R15: ffff88103c45e498&lt;br/&gt;
    ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018&lt;br/&gt;
&amp;#8212; &amp;lt;NMI exception stack&amp;gt; &amp;#8212;&lt;br/&gt;
 0000006 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff881030757da0&amp;#93;&lt;/span&gt; _spin_lock at ffffffff8148062e&lt;br/&gt;
 0000007 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff881030757da8&amp;#93;&lt;/span&gt; ptlrpcd_check at ffffffffa05cfecc &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 0000008 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff881030757e38&amp;#93;&lt;/span&gt; ptlrpcd at ffffffffa05d03ff &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 0000009 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff881030757f48&amp;#93;&lt;/span&gt; kernel_thread at ffffffff810041aa&lt;br/&gt;
============================&lt;/p&gt;

&lt;p&gt;Concerned &quot;partner&quot; ptlrpcds-&amp;gt;pd_threads[]-&amp;gt;pc_lock spin_lock seems not initialized causing current ptlrpcd thread to spin for-ever !!!&lt;/p&gt;

&lt;p&gt;A possible fix for this problem should be to wait for all ptlrpcd/partner threads to fully initialize prior to start operations .... &lt;/p&gt;
</description>
                <environment></environment>
        <key id="13225">LU-1114</key>
            <summary>ptrlrpd thread spinning since Lustre start on Client</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="3">Duplicate</resolution>
                                        <assignee username="yong.fan">nasf</assignee>
                                    <reporter username="louveta">Alexandre Louvet</reporter>
                        <labels>
                    </labels>
                <created>Fri, 17 Feb 2012 13:03:57 +0000</created>
                <updated>Mon, 19 Nov 2012 17:14:38 +0000</updated>
                            <resolved>Fri, 2 Mar 2012 06:56:03 +0000</resolved>
                                    <version>Lustre 2.1.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>5</watches>
                                                                            <comments>
                            <comment id="29372" author="pjones" created="Fri, 17 Feb 2012 14:31:47 +0000"  >&lt;p&gt;Bruno&lt;/p&gt;

&lt;p&gt;Could you please confirm what version of the code you are running? You have marked this as a 2.1 issue but this seems to be related to multi-threaded ptlrpc which is a 2.2 feature&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="29500" author="pjones" created="Tue, 21 Feb 2012 08:01:58 +0000"  >&lt;p&gt;Bruno?&lt;/p&gt;</comment>
                            <comment id="29501" author="adegremont" created="Tue, 21 Feb 2012 08:59:15 +0000"  >&lt;p&gt;In fact this is Lustre 2.1 + patch ORNL-22 applied.&lt;/p&gt;</comment>
                            <comment id="29502" author="pjones" created="Tue, 21 Feb 2012 09:08:41 +0000"  >&lt;p&gt;Fanyong&lt;/p&gt;

&lt;p&gt;Could you please comment on this issue?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="29504" author="bfaccini" created="Tue, 21 Feb 2012 11:01:12 +0000"  >&lt;p&gt;Sorry for my silence Peter, but in between debugging other problems, I was waiting for the confirmation from Bull R&amp;amp;D of the info/details that Aurelien added ...&lt;/p&gt;</comment>
                            <comment id="29668" author="bfaccini" created="Thu, 23 Feb 2012 13:19:37 +0000"  >&lt;p&gt;Concerning the scenario/conditions to trigger this issue we have no real nor interesting infos ...&lt;/p&gt;

&lt;p&gt;The only thing we can imagine/explain is that each ptlrpcd thread when starting, and according to PDB_POLICY_NODE, has only one partner chosen as &quot;next core in the same NUMA node&quot; but unfortunately this 2nd guy startup/init was not completed when the 1st guy tried to access the 2nd one&apos;s private data and to protect itself via the other&apos;s ptlrpcd_ctl.pc_lock which was found un-initialized, hence the dead-lock !!!&lt;/p&gt;

&lt;p&gt;As a possible fix, may be some kind of a &quot;barrier&quot; (with some additional+specific &quot;started&quot; flag) should be implemented to synchronize/ensure all ptlrpcd threads/partners start in ptlrpcd_init() ??...&lt;/p&gt;</comment>
                            <comment id="29867" author="pichong" created="Mon, 27 Feb 2012 09:05:30 +0000"  >&lt;p&gt;The PDB_POLICY_NODE is a ptlrpcd binding policy implemented by Bull that is NUMA aware.&lt;br/&gt;
I have uploaded the patch into Gerrit &lt;a href=&quot;http://review.whamcloud.com/2212&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/2212&lt;/a&gt; so the source code is available and can be integrated in the master.&lt;/p&gt;

&lt;p&gt;Note that there was a bug in the version running on the customer cluster.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeHeader panelHeader&quot; style=&quot;border-bottom-width: 1px;&quot;&gt;&lt;b&gt;&quot;lustre/ptlrpc/ptlrpcd.c line 600&quot;&lt;/b&gt;&lt;/div&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;&lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; (i = index+1;
     i != index;
     i = (i+1)%max) {
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;which has been fixed in the uploaded patch:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;&lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; (i = (index+1)%max;
     i != index;
     i = (i+1)%max) {
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="29869" author="pjones" created="Mon, 27 Feb 2012 09:53:43 +0000"  >&lt;p&gt;Gregoire&lt;/p&gt;

&lt;p&gt;Could you please resubmit your patch with &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1114&quot; title=&quot;ptrlrpd thread spinning since Lustre start on Client&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1114&quot;&gt;&lt;del&gt;LU-1114&lt;/del&gt;&lt;/a&gt; as the identified ticket number? We are trying to ensure that all landings to master have public LU tickets so that interested parties can read the full details relating to the change&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="29871" author="pichong" created="Mon, 27 Feb 2012 10:41:35 +0000"  >&lt;p&gt;I have created a separate ticket &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1144&quot; title=&quot;implement a NUMA aware ptlrpcd binding policy&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1144&quot;&gt;&lt;del&gt;LU-1144&lt;/del&gt;&lt;/a&gt; to track the implementation of the ptlrpcd binding policy PDB_POLICY_NODE.&lt;/p&gt;</comment>
                            <comment id="29964" author="pichong" created="Wed, 29 Feb 2012 11:27:06 +0000"  >&lt;p&gt;I have uploaded a patch that ensures partner thread control structure is accessed only when it is completely initialized: &lt;a href=&quot;http://review.whamcloud.com/2227&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/2227&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="30094" author="yong.fan" created="Thu, 1 Mar 2012 10:44:49 +0000"  >&lt;p&gt;The failure should not happen on original Lustre-2.2 branch if without other bind mode introduced. Because in original implementation, there is order control, when the partnership is established, private data for the partner should have been initialized already. For example: for the default bind mode &quot;PDB_POLICY_PAIR&quot;, &quot;&amp;lt;0,1&amp;gt;&quot; are partners for each other, the partnership between &quot;0&quot; and &quot;1&quot; are established after &quot;1&quot;&apos;s private data initialized. And according to ptlrpcd threads starting order, &quot;0&quot; should have started before &quot;1&quot;. So it can guarantee that: when &amp;lt;0,1&amp;gt; pair is established, both &quot;0&quot;&apos;s and &quot;1&quot;&apos;s private are ready.&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;        pc-&amp;gt;pc_index = index;
        cfs_init_completion(&amp;amp;pc-&amp;gt;pc_starting);
        cfs_init_completion(&amp;amp;pc-&amp;gt;pc_finishing);
        cfs_spin_lock_init(&amp;amp;pc-&amp;gt;pc_lock);
        strncpy(pc-&amp;gt;pc_name, name, sizeof(pc-&amp;gt;pc_name) - 1);
        pc-&amp;gt;pc_set = ptlrpc_prep_set();
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (pc-&amp;gt;pc_set == NULL)
                GOTO(out, rc = -ENOMEM);
        /*
         * So far only &lt;span class=&quot;code-quote&quot;&gt;&quot;client&quot;&lt;/span&gt; ptlrpcd uses an environment. In the &lt;span class=&quot;code-keyword&quot;&gt;future&lt;/span&gt;,
         * ptlrpcd thread (or a thread-set) has to be given an argument,
         * describing its &lt;span class=&quot;code-quote&quot;&gt;&quot;scope&quot;&lt;/span&gt;.
         */
        rc = lu_context_init(&amp;amp;pc-&amp;gt;pc_env.le_ctx, LCT_CL_THREAD|LCT_REMEMBER);
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (rc != 0)
                GOTO(out, rc);

        env = 1;
#ifdef __KERNEL__
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (index &amp;gt;= 0) {
&lt;span class=&quot;code-comment&quot;&gt;/* XXX: When &lt;span class=&quot;code-quote&quot;&gt;&quot;1&quot;&lt;/span&gt; comes here, &lt;span class=&quot;code-quote&quot;&gt;&quot;1&quot;&lt;/span&gt;&apos;s &lt;span class=&quot;code-keyword&quot;&gt;private&lt;/span&gt; data has been initialized, &lt;span class=&quot;code-quote&quot;&gt;&quot;0&quot;&lt;/span&gt; is ready before &lt;span class=&quot;code-quote&quot;&gt;&quot;1&quot;&lt;/span&gt; started. So here, we can establish the partnership between &lt;span class=&quot;code-quote&quot;&gt;&quot;0&quot;&lt;/span&gt; and &lt;span class=&quot;code-quote&quot;&gt;&quot;1&quot;&lt;/span&gt;.*/&lt;/span&gt;
                rc = ptlrpcd_bind(index, max);
                &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (rc &amp;lt; 0)
                        GOTO(out, rc);
        }

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="30095" author="yong.fan" created="Thu, 1 Mar 2012 10:48:38 +0000"  >&lt;p&gt;So consider my above comment, what&apos;s your thought? If it is introduced by the new bind mode &quot;PDB_POLICY_NODE&quot;, do you think whether it is better to fix the issue inside such &quot;PDB_POLICY_NODE&quot; implementation patch?&lt;/p&gt;</comment>
                            <comment id="30278" author="pichong" created="Fri, 2 Mar 2012 05:29:32 +0000"  >&lt;p&gt;Thanks for looking.&lt;br/&gt;
You are right the problem is specific to the PDB_POLICY_NODE which currently does not take care of establishing partnership only with ptlrpcd threads that are initialized.&lt;/p&gt;

&lt;p&gt;Therefore, I think this ticket can be closed (or marked duplicate of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1144&quot; title=&quot;implement a NUMA aware ptlrpcd binding policy&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1144&quot;&gt;&lt;del&gt;LU-1144&lt;/del&gt;&lt;/a&gt;) and I will take this into account in the new version of the PDB_POLICY_NODE implementation I will post under &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1144&quot; title=&quot;implement a NUMA aware ptlrpcd binding policy&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1144&quot;&gt;&lt;del&gt;LU-1144&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="30280" author="pjones" created="Fri, 2 Mar 2012 06:56:03 +0000"  >&lt;p&gt;ok thanks Gregoire!&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvhav:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>6452</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>