<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:51:51 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-12354] MDT threads stuck at ldlm_completion_ast</title>
                <link>https://jira.whamcloud.com/browse/LU-12354</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;MDT with very high load and lots of threads stuck at ldlm_completion_ast and &lt;br/&gt;
osp_sync_process_queues. &lt;/p&gt;

&lt;p&gt;May be a dup of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11359&quot; title=&quot;racer test 1 times out with client hung in dir_create.sh, ls, &#8230; and MDS in ldlm_completion_ast()&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11359&quot;&gt;&lt;del&gt;LU-11359&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
[2418705.962173] LNet: Service thread pid 34570 was inactive &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 400.29s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; debugging purposes:
[2418706.013418] Pid: 34570, comm: mdt01_236 3.10.0-693.21.1.el7.20180508.x86_64.lustre2106 #1 SMP Wed Jan 30 00:30:34 UTC 2019
[2418706.013419] Call Trace:
[2418706.013433]  [&amp;lt;ffffffffa0beba11&amp;gt;] ldlm_completion_ast+0x5b1/0x920 [ptlrpc]
[2418706.034654]  [&amp;lt;ffffffffa0becb53&amp;gt;] ldlm_cli_enqueue_local+0x233/0x860 [ptlrpc]
[2418706.034673]  [&amp;lt;ffffffffa1248d72&amp;gt;] mdt_object_local_lock+0x512/0xb00 [mdt]
[2418706.034680]  [&amp;lt;ffffffffa12493be&amp;gt;] mdt_object_lock_internal+0x5e/0x300 [mdt]
[2418706.034687]  [&amp;lt;ffffffffa124a164&amp;gt;] mdt_getattr_name_lock+0x8a4/0x1910 [mdt]
[2418706.034694]  [&amp;lt;ffffffffa124b480&amp;gt;] mdt_intent_getattr+0x2b0/0x480 [mdt]
[2418706.034701]  [&amp;lt;ffffffffa124746b&amp;gt;] mdt_intent_opc+0x1eb/0xaf0 [mdt]
[2418706.034708]  [&amp;lt;ffffffffa124fd08&amp;gt;] mdt_intent_policy+0x138/0x320 [mdt]
[2418706.034727]  [&amp;lt;ffffffffa0bd12cd&amp;gt;] ldlm_lock_enqueue+0x38d/0x980 [ptlrpc]
[2418706.034751]  [&amp;lt;ffffffffa0bfac13&amp;gt;] ldlm_handle_enqueue0+0xa83/0x1670 [ptlrpc]
[2418706.034787]  [&amp;lt;ffffffffa0c80622&amp;gt;] tgt_enqueue+0x62/0x210 [ptlrpc]
[2418706.034818]  [&amp;lt;ffffffffa0c8428a&amp;gt;] tgt_request_handle+0x92a/0x1370 [ptlrpc]
[2418706.034845]  [&amp;lt;ffffffffa0c2c6cb&amp;gt;] ptlrpc_server_handle_request+0x23b/0xaa0 [ptlrpc]
[2418706.034870]  [&amp;lt;ffffffffa0c306b2&amp;gt;] ptlrpc_main+0xa92/0x1e40 [ptlrpc]
[2418706.034873]  [&amp;lt;ffffffff810b1131&amp;gt;] kthread+0xd1/0xe0
[2418706.034875]  [&amp;lt;ffffffff816a14f7&amp;gt;] ret_from_fork+0x77/0xb0
[2418706.034894]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
[2418706.034897] LustreError: dumping log to /tmp/lustre-log.1557763688.34570
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;



&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
PID: 32153  TASK: ffff883ed28d5ee0  CPU: 10  COMMAND: &lt;span class=&quot;code-quote&quot;&gt;&quot;osp-syn-153-0&quot;&lt;/span&gt;
 #0 [ffff883ed292f908] __schedule at ffffffff816946af
 #1 [ffff883ed292f990] schedule at ffffffff81694cd9
 #2 [ffff883ed292f9a0] osp_sync_process_queues at ffffffffa13f8197 [osp]
 #3 [ffff883ed292fab0] llog_process_thread at ffffffffa09ad8bb [obdclass]
 #4 [ffff883ed292fb78] llog_process_or_fork at ffffffffa09ae5cc [obdclass]
 #5 [ffff883ed292fbc0] llog_cat_process_cb at ffffffffa09b3dad [obdclass]
 #6 [ffff883ed292fc30] llog_process_thread at ffffffffa09ad8bb [obdclass]
 #7 [ffff883ed292fcf8] llog_process_or_fork at ffffffffa09ae5cc [obdclass]
 #8 [ffff883ed292fd40] llog_cat_process_or_fork at ffffffffa09b2e49 [obdclass]
 #9 [ffff883ed292fdb8] llog_cat_process at ffffffffa09b2f7e [obdclass]
#10 [ffff883ed292fdd8] osp_sync_thread at ffffffffa13f63af [osp]
#11 [ffff883ed292fec8] kthread at ffffffff810b1131
#12 [ffff883ed292ff50] ret_from_fork at ffffffff816a14f7
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;And&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
PID: 33442  TASK: ffff881fcfa38000  CPU: 21  COMMAND: &lt;span class=&quot;code-quote&quot;&gt;&quot;mdt01_073&quot;&lt;/span&gt;
 #0 [ffff883eb683f718] __schedule at ffffffff816946af
 #1 [ffff883eb683f7a8] schedule at ffffffff81694cd9
 #2 [ffff883eb683f7b8] schedule_timeout at ffffffff81692574
 #3 [ffff883eb683f860] ldlm_completion_ast at ffffffffa0beba11 [ptlrpc]
 #4 [ffff883eb683f900] ldlm_cli_enqueue_local at ffffffffa0becb53 [ptlrpc]
 #5 [ffff883eb683f998] mdt_object_local_lock at ffffffffa1248d72 [mdt]
 #6 [ffff883eb683fa48] mdt_object_lock_internal at ffffffffa12493be [mdt]
 #7 [ffff883eb683fa98] mdt_getattr_name_lock at ffffffffa124a164 [mdt]
 #8 [ffff883eb683fb20] mdt_intent_getattr at ffffffffa124b480 [mdt]
 #9 [ffff883eb683fb60] mdt_intent_opc at ffffffffa124746b [mdt]
#10 [ffff883eb683fbc0] mdt_intent_policy at ffffffffa124fd08 [mdt]
#11 [ffff883eb683fbf8] ldlm_lock_enqueue at ffffffffa0bd12cd [ptlrpc]
#12 [ffff883eb683fc50] ldlm_handle_enqueue0 at ffffffffa0bfac13 [ptlrpc]
#13 [ffff883eb683fce0] tgt_enqueue at ffffffffa0c80622 [ptlrpc]
#14 [ffff883eb683fd00] tgt_request_handle at ffffffffa0c8428a [ptlrpc]
#15 [ffff883eb683fd48] ptlrpc_server_handle_request at ffffffffa0c2c6cb [ptlrpc]
#16 [ffff883eb683fde8] ptlrpc_main at ffffffffa0c306b2 [ptlrpc]
#17 [ffff883eb683fec8] kthread at ffffffff810b1131
#18 [ffff883eb683ff50] ret_from_fork at ffffffff816a14f7
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="55781">LU-12354</key>
            <summary>MDT threads stuck at ldlm_completion_ast</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="4">Incomplete</resolution>
                                        <assignee username="hongchao.zhang">Hongchao Zhang</assignee>
                                    <reporter username="mhanafi">Mahmoud Hanafi</reporter>
                        <labels>
                    </labels>
                <created>Wed, 29 May 2019 18:42:20 +0000</created>
                <updated>Mon, 6 Mar 2023 15:55:46 +0000</updated>
                            <resolved>Wed, 6 Jan 2021 13:33:02 +0000</resolved>
                                    <version>Lustre 2.10.7</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>8</watches>
                                                                            <comments>
                            <comment id="248050" author="pjones" created="Thu, 30 May 2019 14:14:42 +0000"  >&lt;p&gt;Hongchao&lt;/p&gt;

&lt;p&gt;Could you please investigate?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="248051" author="pfarrell" created="Thu, 30 May 2019 14:15:14 +0000"  >&lt;p&gt;This is not related to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11359&quot; title=&quot;racer test 1 times out with client hung in dir_create.sh, ls, &#8230; and MDS in ldlm_completion_ast()&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11359&quot;&gt;&lt;del&gt;LU-11359&lt;/del&gt;&lt;/a&gt;, as that is a DOM specific issue.&lt;/p&gt;

&lt;p&gt;It would be good to have dmesg from the MDS and also the dumped lustre log(s), like the one listed here:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
[2418706.034897] LustreError: dumping log to /tmp/lustre-log.1557763688.34570
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="248149" author="mhanafi" created="Fri, 31 May 2019 21:56:03 +0000"  >&lt;p&gt;Attaching dmesg. We don&apos;t have /tmp/lustre-log.1557763688.34570; it was deleted.  &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/32700/32700_vmcore-dmesg.txt.gz&quot; title=&quot;vmcore-dmesg.txt.gz attached to LU-12354&quot;&gt;vmcore-dmesg.txt.gz&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt; &lt;/p&gt;</comment>
                            <comment id="248366" author="degremoa" created="Tue, 4 Jun 2019 12:30:00 +0000"  >&lt;p&gt;@mhanafi, there are plenty of stack traces in your vmcore logs related to stuck read I/O, like:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
[2415376.231996] Pid: 35506, comm: mdt_rdpg00_051 3.10.0-693.21.1.el7.20180508.x86_64.lustre2106 #1 SMP Wed Jan 30 00:30:34 UTC 2019
[2415376.231998] Call Trace:
[2415376.232023] [&amp;lt;ffffffff8123499a&amp;gt;] __wait_on_buffer+0x2a/0x30
[2415376.249546] [&amp;lt;ffffffffa0ee553c&amp;gt;] ldiskfs_bread+0x7c/0xc0 [ldiskfs]
[2415376.249554] [&amp;lt;ffffffffa0ed3e7a&amp;gt;] __ldiskfs_read_dirblock+0x4a/0x400 [ldiskfs]
[2415376.249560] [&amp;lt;ffffffffa0ed4270&amp;gt;] htree_dirblock_to_tree+0x40/0x190 [ldiskfs]
[2415376.249566] [&amp;lt;ffffffffa0ed6451&amp;gt;] ldiskfs_htree_fill_tree+0x201/0x2f0 [ldiskfs]&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Are you sure you do not have any disk issue? This could be a side effect, but you should first verify there is not a disk issue that prevents Lustre threads from doing I/O.&lt;/p&gt;</comment>
                            <comment id="248439" author="hongchao.zhang" created="Wed, 5 Jun 2019 12:28:08 +0000"  >&lt;p&gt;As per the logs, an IBITS lock on the root directory is held by the client at 10.151.10.140, but there is a network error&lt;br/&gt;
when sending the blocking callback to the client.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[2418411.342868] LNetError: 21360:0:(o2iblnd_cb.c:3147:kiblnd_check_txs_locked()) Timed out tx: active_txs, 5 seconds
[2418411.373829] LNetError: 21360:0:(o2iblnd_cb.c:3222:kiblnd_check_conns()) Timed out RDMA with 10.151.10.140@o2ib (207): c: 54, oc: 0, rc: 63
[2418411.411904] Lustre: 33336:0:(client.c:2116:ptlrpc_expire_one_request()) @@@ Request sent has failed due to network error: [sent 1557763288/real 1557763393]  req@ffff880139dd6600 x1630930117996992/t0(0) o104-&amp;gt;nbp2-MDT0000@10.151.10.140@o2ib:15/16 lens 296/224 e 0 to 1 dl 1557763743 ref 1 fl Rpc:eX/0/ffffffff rc 0/-1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The lock waiting timer expired and the lock was released only after 407 seconds, so all other lock enqueue requests were blocked&lt;br/&gt;
in ldlm_completion_ast, waiting for their locks to be granted.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[2418713.129522] LustreError: 0:0:(ldlm_lockd.c:334:waiting_locks_callback()) ### lock callback timer expired after 407s: evicting client at 10.151.10.140@o2ib  ns: mdt-nbp2-MDT0000_UUID lock: ffff8810b458a200/0x843e968fdb2cd3cc lrc: 4/0,0 mode: PR/PR res: [0x200000007:0x1:0x0].0x0 bits 0x13 rrc: 2189 type: IBT flags: 0x60200400000020 nid: 10.151.10.140@o2ib remote: 0x7e3f2695c7cccb4e expref: 160144 pid: 33420 timeout: 6713595267 lvb_type: 0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;What is the output of &quot;lctl get_param at_max&quot; and &quot;lctl get_param timeout&quot;?&lt;br/&gt;
This issue can be mitigated by reducing the timeout, but that could cause some operations to fail if they take longer&lt;br/&gt;
to process, especially under high load.&lt;/p&gt;</comment>
                            <comment id="248589" author="mhanafi" created="Thu, 6 Jun 2019 21:02:41 +0000"  >&lt;ol&gt;
	&lt;li&gt;lctl get_param at_max&lt;br/&gt;
at_max=600&lt;/li&gt;
&lt;/ol&gt;


&lt;ol&gt;
	&lt;li&gt;# lctl get_param timeout&lt;br/&gt;
timeout=100&lt;/li&gt;
&lt;/ol&gt;
</comment>
                            <comment id="288757" author="mhanafi" created="Wed, 6 Jan 2021 01:44:31 +0000"  >&lt;p&gt;We can close this for now&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                                        </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="63772">LU-14611</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="32700" name="vmcore-dmesg.txt.gz" size="208382" author="mhanafi" created="Fri, 31 May 2019 21:55:56 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00h5r:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10021"><![CDATA[2]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>