<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:13:07 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-7926] MDS sits idle with extreme slow response to clients</title>
                <link>https://jira.whamcloud.com/browse/LU-7926</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;We had our MDS become idle with no errors on the console or logs. It responds extremely slow (takes mins to just do an ls). This has happened twice within 24 hours. We took crash dump in both cases. The crash dumps show all mdt threads sitting in &apos; qsd_op_begin&apos;&lt;br/&gt;
like this&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;PID: 14754  TASK: ffff88400a83c040  CPU: 7   COMMAND: &lt;span class=&quot;code-quote&quot;&gt;&quot;mdt01_002&quot;&lt;/span&gt;
 #0 [ffff883f634cd500] schedule at ffffffff81565692
 #1 [ffff883f634cd5d8] schedule_timeout at ffffffff81566572
 #2 [ffff883f634cd688] qsd_op_begin at ffffffffa0d04909 [lquota]
 #3 [ffff883f634cd738] osd_declare_qid at ffffffffa0d88449 [osd_ldiskfs]
 #4 [ffff883f634cd798] osd_declare_inode_qid at ffffffffa0d88702 [osd_ldiskfs]
 #5 [ffff883f634cd7f8] osd_declare_object_create at ffffffffa0d65d53 [osd_ldiskfs]
 #6 [ffff883f634cd858] lod_declare_object_create at ffffffffa0f4b482 [lod]
 #7 [ffff883f634cd8b8] mdd_declare_object_create_internal at ffffffffa0fa78cf [mdd]
 #8 [ffff883f634cd918] mdd_declare_create at ffffffffa0fbb4ce [mdd]
 #9 [ffff883f634cd988] mdd_create at ffffffffa0fbc631 [mdd]
#10 [ffff883f634cda88] mdo_create at ffffffffa0e88058 [mdt]
#11 [ffff883f634cda98] mdt_reint_open at ffffffffa0e923f4 [mdt]
#12 [ffff883f634cdb78] mdt_reint_rec at ffffffffa0e7a481 [mdt]
#13 [ffff883f634cdb98] mdt_reint_internal at ffffffffa0e5fed3 [mdt]
#14 [ffff883f634cdbd8] mdt_intent_reint at ffffffffa0e6045e [mdt]
#15 [ffff883f634cdc28] mdt_intent_policy at ffffffffa0e5dc3e [mdt]
#16 [ffff883f634cdc68] ldlm_lock_enqueue at ffffffffa075e2c5 [ptlrpc]
#17 [ffff883f634cdcd8] ldlm_handle_enqueue0 at ffffffffa0787ebb [ptlrpc]
#18 [ffff883f634cdd48] mdt_enqueue at ffffffffa0e5e106 [mdt]
#19 [ffff883f634cdd68] mdt_handle_common at ffffffffa0e62ada [mdt]
#20 [ffff883f634cddb8] mds_regular_handle at ffffffffa0e9f505 [mdt]
#21 [ffff883f634cddc8] ptlrpc_server_handle_request at ffffffffa07b70c5 [ptlrpc]
#22 [ffff883f634cdea8] ptlrpc_main at ffffffffa07b989d [ptlrpc]
#23 [ffff883f634cdf48] kernel_thread at ffffffff8100c28a
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I am attaching backtrace from the 2 crash dumps and also to lustre debug dumps. &lt;/p&gt;</description>
                <environment></environment>
        <key id="35623">LU-7926</key>
            <summary>MDS sits idle with extreme slow response to clients</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="3">Duplicate</resolution>
                                        <assignee username="niu">Niu Yawei</assignee>
                                    <reporter username="mhanafi">Mahmoud Hanafi</reporter>
                        <labels>
                    </labels>
                <created>Sun, 27 Mar 2016 18:32:26 +0000</created>
                <updated>Tue, 25 Oct 2016 14:10:59 +0000</updated>
                            <resolved>Thu, 12 May 2016 12:55:26 +0000</resolved>
                                    <version>Lustre 2.5.3</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>8</watches>
                                                                            <comments>
                            <comment id="147007" author="pjones" created="Sun, 27 Mar 2016 20:14:38 +0000"  >&lt;p&gt;Niu&lt;/p&gt;

&lt;p&gt;Could you please advise here?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="147012" author="niu" created="Mon, 28 Mar 2016 06:23:15 +0000"  >&lt;p&gt;Looks lots of MDT threads (creating objects) were trying to acquire quota for id 11361, but they all waited on an in-flight DQACQ. And I found lots of OSTs were acquiring quota lock from quota master, probably MDT was busy on these acquire request and the DQACQ from MDT wasn&apos;t processed in time?&lt;/p&gt;

&lt;p&gt;Are the OSTs just get rebooted (or network to MDT just recovered) when this happened? I&apos;m wondering why there are so many requests from different OSTs to acquire quota locks.&lt;/p&gt;</comment>
                            <comment id="147077" author="mhanafi" created="Mon, 28 Mar 2016 18:46:57 +0000"  >&lt;p&gt;The OST/MDT and networks were not having any issues. UID 11361 user was running a 30K core job and they have an inode quota of 600K. They currently have over 530K files which is over their softquota of 500K.&lt;/p&gt;

&lt;p&gt;I have sent a request to the user to get additional info about their job.&lt;/p&gt;

&lt;p&gt;So it sounds like the user who was over their inode soft quota was creating a lot of files. Could this be a scaling issue in inode quota acquire? &lt;/p&gt;</comment>
                            <comment id="147101" author="mhanafi" created="Mon, 28 Mar 2016 20:43:07 +0000"  >&lt;p&gt;The user creates 90K files during their 30K core run.&lt;/p&gt;</comment>
                            <comment id="147142" author="niu" created="Tue, 29 Mar 2016 02:58:31 +0000"  >&lt;blockquote&gt;
&lt;p&gt;So it sounds like the user who was over their inode soft quota was creates a lot of files. Could this be a scalling issue in inode quota acquire?&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;Right, from the log we can see the user was creating lots of files. I don&apos;t think it&apos;s a scaling problem, because the root cause is that a former DQACQ from MDT didn&apos;t get replied to somehow, but unfortunately that part of the log is missing.&lt;/p&gt;

&lt;p&gt;My speculation is: There was a network hiccup caused all OSTs lost connections then reconnect, because there were tons of quota lock acquire from OSTs (pure quota lock acquire should only happen when quota slave start or reconnect to master). And probably that hiccup caused the DQACQ reply being lost?&lt;/p&gt;</comment>
                            <comment id="147220" author="mhanafi" created="Tue, 29 Mar 2016 17:32:59 +0000"  >&lt;p&gt;Going through the logs there is nothing to indicate that there was a loss of connections between OST,  MDT or clients. There was a high load on a number of OSSes at 9:27:00. Are there any quota allocations adjusted once you&apos;re over your inode softlimit?&lt;/p&gt;
</comment>
                            <comment id="147303" author="niu" created="Wed, 30 Mar 2016 02:58:18 +0000"  >&lt;p&gt;Ah, I think it&apos;s a livelock problem, from the stack trace we can see that all mdt service threads were busy on sending DQACQ requests and no available threads to handle the requests. This has been fixed in master by &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6433&quot; title=&quot;MDS deadlock in qouta  &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6433&quot;&gt;&lt;del&gt;LU-6433&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="147408" author="niu" created="Thu, 31 Mar 2016 03:21:25 +0000"  >&lt;p&gt;port to b2_5_fe: &lt;a href=&quot;http://review.whamcloud.com/#/c/19250/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/19250/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="147876" author="bzzz" created="Tue, 5 Apr 2016 18:37:16 +0000"  >&lt;p&gt;I&apos;d think that MDT shouldn&apos;t get blocked in this case at all - just return to the client with -EINPROGRESS or something?&lt;/p&gt;</comment>
                            <comment id="147951" author="niu" created="Wed, 6 Apr 2016 02:59:06 +0000"  >&lt;p&gt;Alex, the problem is that the thread sending DQACQ has to wait for the RPC timeout, once it get timeout, it&apos;ll reply -EINPROGRESS to client. So literally it&apos;s not a deadlock but a livelock problem, our solution is to use different set of threads for sending and handling the DQACQ requests.&lt;/p&gt;</comment>
                            <comment id="147956" author="bzzz" created="Wed, 6 Apr 2016 06:30:51 +0000"  >&lt;p&gt;well, exactly. the fact that we send an RPC to get another quota unit can be the point at which we interrupt and return -EINPROGRESS to the client? I guess a potential problem is to avoid a livelock where a single unit is ping-pong&apos;ed among 2+ servers because nobody consumes that right away..&lt;/p&gt;</comment>
                            <comment id="147974" author="niu" created="Wed, 6 Apr 2016 14:03:17 +0000"  >&lt;blockquote&gt;
&lt;p&gt;well, exactly. the fact that we send an RPC to get another quota unit can be the point at which we interrupt and return -EINPROGRESS to the client? &lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;That sounds doable, it could eliminate all the sync acquirings.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;I guess a potential problem is to avoid a livelock where a single unit is ping-pong&apos;ed among 2+ servers because nobody consumes that right away..&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;I don&apos;t know exactly what you are referring to, could you illustrate?&lt;/p&gt;</comment>
                            <comment id="148276" author="jaylan" created="Fri, 8 Apr 2016 19:40:09 +0000"  >&lt;p&gt;I cherry-picked the b2_5_fe port of &lt;a href=&quot;http://review.whamcloud.com/#/c/19250/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/19250/&lt;/a&gt; that Niu Yawei commented on 30/Mar/16 8:21 PM:&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6433&quot; title=&quot;MDS deadlock in qouta  &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6433&quot;&gt;&lt;del&gt;LU-6433&lt;/del&gt;&lt;/a&gt; quota: handle QUOTA_DQACQ in READPAGE po&lt;br/&gt;
rtal&lt;/p&gt;

&lt;p&gt;Not in production yet, but in our 2.5.3 code now.&lt;/p&gt;</comment>
                            <comment id="149472" author="jaylan" created="Tue, 19 Apr 2016 19:35:07 +0000"  >&lt;p&gt;I cherry-picked the patch to b2_7_fe.&lt;/p&gt;</comment>
                            <comment id="150646" author="pjones" created="Sat, 30 Apr 2016 12:55:48 +0000"  >&lt;p&gt;So is it ok to close this ticket as a duplicate of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6433&quot; title=&quot;MDS deadlock in qouta  &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6433&quot;&gt;&lt;del&gt;LU-6433&lt;/del&gt;&lt;/a&gt;? &lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="29401">LU-6433</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="20902" name="bt.all.Mar26.gz" size="111432" author="mhanafi" created="Sun, 27 Mar 2016 18:32:26 +0000"/>
                            <attachment id="20903" name="bt.all.Mar27.05.25.28.gz" size="106368" author="mhanafi" created="Sun, 27 Mar 2016 18:32:27 +0000"/>
                            <attachment id="20904" name="debug.out.1.gz" size="225" author="mhanafi" created="Sun, 27 Mar 2016 18:32:27 +0000"/>
                            <attachment id="20905" name="debug.out.2.gz" size="225" author="mhanafi" created="Sun, 27 Mar 2016 18:32:27 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzy5sn:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10023"><![CDATA[4]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>