<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:13:23 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
<language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-1088] mgs threads go nuts</title>
                <link>https://jira.whamcloud.com/browse/LU-1088</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;While investigating &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1087&quot; title=&quot;mdt thread spinning out of control&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1087&quot;&gt;&lt;del&gt;LU-1087&lt;/del&gt;&lt;/a&gt;, the ll_mgs_* threads suddenly went nuts and shot the load through the roof, to the point where the node is almost completely unresponsive, and a &quot;top&quot; that I had running is only able to redraw ever minute or so.&lt;/p&gt;

&lt;p&gt;The console is mostly unresponsive, but it did respond to a sysreq-l, so I can see that they are all in a backtrace similar to this:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Call Trace:
 [&amp;lt;ffffffffa06da060&amp;gt;] lock_res_and_lock+0x30/0x40 [ptlrpc]
 [&amp;lt;ffffffffa06deca3&amp;gt;] ldlm_lock_enqueue+0x453/0x7e0 [ptlrpc]
 [&amp;lt;ffffffffa06fd206&amp;gt;] ldlm_handle_enqueue0+0x406/0xd70 [ptlrpc]
 [&amp;lt;ffffffffa06fdbd6&amp;gt;] ldlm_handle_enqueue+0x66/0x70 [ptlrpc]
 [&amp;lt;ffffffffa06fdbe0&amp;gt;] ? ldlm_server_completion_ast+0x0/0x590 [ptlrpc]
 [&amp;lt;ffffffffa06fe170&amp;gt;] ? ldlm_server_blocking_ast+0x0/0x740 [ptlrpc]
 [&amp;lt;ffffffffa0b55245&amp;gt;] mgs_handle+0x545/0x1350 [mgs]
 [&amp;lt;ffffffffa04933f1&amp;gt;] ? libcfs_debug_vmsg1+0x41/0x50 [libcfs]
 [&amp;lt;ffffffffa04933f1&amp;gt;] ? libcfs_debug_vmsg1+0x41/0x50 [libcfs]
 [&amp;lt;ffffffffa0723181&amp;gt;] ptlrpc_main+0xcd1/0x1690 [ptlrpc]
 [&amp;lt;ffffffffa07224b0&amp;gt;] ? ptlrpc_main+0x0/0x1690 [ptlrpc]
 [&amp;lt;ffffffff8100c14a&amp;gt;] child_rip+0xa/0x20
 [&amp;lt;ffffffffa07224b0&amp;gt;] ? ptlrpc_main+0x0/0x1690 [ptlrpc]
 [&amp;lt;ffffffffa07224b0&amp;gt;] ? ptlrpc_main+0x0/0x1690 [ptlrpc]
 [&amp;lt;ffffffff8100c140&amp;gt;] ? child_rip+0x0/0x20
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment>lustre 2.1.0-21chaos (github.com/chaos/lustre)</environment>
        <key id="13148">LU-1088</key>
            <summary>mgs threads go nuts</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="laisiyao">Lai Siyao</assignee>
                                    <reporter username="morrone">Christopher Morrone</reporter>
                        <labels>
                    </labels>
                <created>Thu, 9 Feb 2012 20:24:01 +0000</created>
                <updated>Mon, 4 Jun 2012 02:21:41 +0000</updated>
                            <resolved>Mon, 4 Jun 2012 02:21:41 +0000</resolved>
                                                    <fixVersion>Lustre 2.3.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="28325" author="morrone" created="Thu, 9 Feb 2012 20:38:57 +0000"  >&lt;p&gt;We gave up on waiting for the node to recover on its own.  We forced a crash dump and are rebooting now.&lt;/p&gt;</comment>
                            <comment id="28326" author="morrone" created="Thu, 9 Feb 2012 20:53:08 +0000"  >&lt;p&gt;MDS went through recovery and seems happy for the moment.&lt;/p&gt;</comment>
                            <comment id="28336" author="bzzz" created="Fri, 10 Feb 2012 03:48:38 +0000"  >&lt;p&gt;once you meet this problem again, please grab all the traces and attach them here.&lt;/p&gt;</comment>
                            <comment id="28365" author="morrone" created="Fri, 10 Feb 2012 13:16:14 +0000"  >&lt;p&gt;Alex, we have a crash dump.  If you want backtraces from all tasks, we&apos;ll get you that.  No need to wait for another instance.&lt;/p&gt;</comment>
                            <comment id="28372" author="morrone" created="Fri, 10 Feb 2012 14:05:51 +0000"  >&lt;p&gt;Attach &quot;foreach bt&quot; from momus mds.&lt;/p&gt;</comment>
                            <comment id="28395" author="pjones" created="Fri, 10 Feb 2012 18:38:10 +0000"  >&lt;p&gt;Added Alex as a watcher so he is aware of Chris&apos;s answer&lt;/p&gt;</comment>
                            <comment id="28399" author="green" created="Fri, 10 Feb 2012 19:18:32 +0000"  >&lt;p&gt;From the stack trace:&lt;br/&gt;
PID: 21012  TASK: ffff880812dd7500  CPU: 8   COMMAND: &quot;ll_mgs_22&quot;&lt;br/&gt;
 #0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88043f407e90&amp;#93;&lt;/span&gt; crash_nmi_callback at ffffffff81029746&lt;br/&gt;
 #1 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88043f407ea0&amp;#93;&lt;/span&gt; notifier_call_chain at ffffffff814f3eb5&lt;br/&gt;
 #2 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88043f407ee0&amp;#93;&lt;/span&gt; atomic_notifier_call_chain at ffffffff814f3f1a&lt;br/&gt;
 #3 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88043f407ef0&amp;#93;&lt;/span&gt; notify_die at ffffffff81096a6e&lt;br/&gt;
 #4 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88043f407f20&amp;#93;&lt;/span&gt; do_nmi at ffffffff814f1b33&lt;br/&gt;
 #5 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88043f407f50&amp;#93;&lt;/span&gt; nmi at ffffffff814f1440&lt;br/&gt;
    &lt;span class=&quot;error&quot;&gt;&amp;#91;exception RIP: vsnprintf+779&amp;#93;&lt;/span&gt;&lt;br/&gt;
    RIP: ffffffff81276c2b  RSP: ffff880812da5810  RFLAGS: 00000206&lt;br/&gt;
    RAX: ffff880812da5a40  RBX: ffff8805aa88a529  RCX: 0000000000000002&lt;br/&gt;
    RDX: ffffffffa076a2fc  RSI: ffff8805aa88b000  RDI: ffff8805aa88a529&lt;br/&gt;
    RBP: ffff880812da58a0   R8: 0000000000000073   R9: 0000000000000000&lt;br/&gt;
    R10: 0000000000000001  R11: 00000000000000dc  R12: ffffffffa075b626&lt;br/&gt;
    R13: ffffffffa075b624  R14: ffff880812da5980  R15: ffff8805aa88b000&lt;br/&gt;
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018&lt;br/&gt;
&amp;#8212; &amp;lt;NMI exception stack&amp;gt; &amp;#8212;&lt;br/&gt;
 #6 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880812da5810&amp;#93;&lt;/span&gt; vsnprintf at ffffffff81276c2b&lt;br/&gt;
 #7 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880812da5868&amp;#93;&lt;/span&gt; cfs_set_ptldebug_header at ffffffffa048919b &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #8 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880812da58a8&amp;#93;&lt;/span&gt; libcfs_debug_vmsg2 at ffffffffa0492b13 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #9 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880812da5a08&amp;#93;&lt;/span&gt; libcfs_debug_vmsg1 at ffffffffa04933f1 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
#10 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880812da5a68&amp;#93;&lt;/span&gt; ldlm_lock_dump at ffffffffa06da97e &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
#11 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880812da5aa8&amp;#93;&lt;/span&gt; ldlm_resource_dump at ffffffffa06e0ae3 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
#12 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880812da5b08&amp;#93;&lt;/span&gt; ldlm_granted_list_add_lock at ffffffffa06dd280 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
#13 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880812da5b48&amp;#93;&lt;/span&gt; ldlm_grant_lock at ffffffffa06dd83c &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
#14 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880812da5ba8&amp;#93;&lt;/span&gt; ldlm_process_plain_lock at ffffffffa06edc1a &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
#15 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880812da5c48&amp;#93;&lt;/span&gt; ldlm_lock_enqueue at ffffffffa06deb6d &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
#16 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880812da5cb8&amp;#93;&lt;/span&gt; ldlm_handle_enqueue0 at ffffffffa06fd206 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
#17 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880812da5d28&amp;#93;&lt;/span&gt; ldlm_handle_enqueue at ffffffffa06fdbd6 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
#18 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880812da5d68&amp;#93;&lt;/span&gt; mgs_handle at ffffffffa0b55245 &lt;span class=&quot;error&quot;&gt;&amp;#91;mgs&amp;#93;&lt;/span&gt;&lt;br/&gt;
#19 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880812da5df8&amp;#93;&lt;/span&gt; ptlrpc_main at ffffffffa0723181 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
#20 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880812da5f48&amp;#93;&lt;/span&gt; kernel_thread at ffffffff8100c14a&lt;/p&gt;

&lt;p&gt;So I imagine you have a lot of clients (tens of thousands?), and once all of them somehow got disconnected, they all come rushing to reconnect back and get their config lock too (all on the same resource)..&lt;br/&gt;
Now in the middle of the locking we have this ldlm_resource_dump() call that would print all locks on the resource, which would take quite a while I can imagine, and everybody else is spinning on the resource spinlock meanwhile.&lt;/p&gt;

&lt;p&gt;That ldlm_resource_dump is D_INFO which is probably a bad idea and should be D_DLMTRACE too, what is the lustre debug level you are running at?&lt;br/&gt;
I wonder if we should just change ldlm_resource_dump() to only print the individual locks if the refcount is below some threshold, as otherwise it is just too much info I suspect.&lt;/p&gt;</comment>
                            <comment id="28401" author="morrone" created="Fri, 10 Feb 2012 19:41:58 +0000"  >&lt;p&gt;Yes, I had recently added D_INFO to get a look at what the spinning mdt thread was doing for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1087&quot; title=&quot;mdt thread spinning out of control&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1087&quot;&gt;&lt;del&gt;LU-1087&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;We have a few thousand clients.  Most of them should not be disconnected.  There are on the order of a couple hundred that might reboot and reconnect at any time (BGP nodes).&lt;/p&gt;

&lt;p&gt;I think that we definitely need ldlm_resource_dump changed.  I certainly accept that performance is reduced when I enable higher logging levels, but I don&apos;t expect a denial-of-service attack. &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;</comment>
                            <comment id="30369" author="laisiyao" created="Fri, 2 Mar 2012 22:10:13 +0000"  >&lt;p&gt;It looks okay to use RCU for resource lock dump, and compared to ldlm_lock_debug() ldlm_lock_dump() is inefficient, I&apos;ll replace it with the former one.&lt;/p&gt;</comment>
                            <comment id="30372" author="laisiyao" created="Fri, 2 Mar 2012 23:00:48 +0000"  >&lt;p&gt;Review is on &lt;a href=&quot;http://review.whamcloud.com/#change,2250&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#change,2250&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="39891" author="pjones" created="Mon, 4 Jun 2012 02:21:41 +0000"  >&lt;p&gt;Landed for 2.3&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="10823" name="momus-mds1-backtraces.txt" size="555505" author="morrone" created="Fri, 10 Feb 2012 14:05:51 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzv6nz:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>4614</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10021"><![CDATA[2]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>