<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:15:52 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-15148] Service thread pid 6006 was inactive for 200.10s; filesystem hangs for specific MDT but eventually works</title>
                <link>https://jira.whamcloud.com/browse/LU-15148</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;A user was first unable to &lt;b&gt;ls&lt;/b&gt; his home directory. &lt;b&gt;ls&lt;/b&gt; hung for a long time, but eventually worked. Then, he had the same issue for a sub-directory called MDT3, which is set to have its meta-data on MDT3. After the initial &lt;b&gt;ls&lt;/b&gt; which took several minutes, subsequent &lt;b&gt;ls&lt;/b&gt; attempts returned quickly.&lt;/p&gt;

&lt;p&gt;The host for MDT3 is jet4, and jet4 was the only server that was generating the following logs messages.&lt;/p&gt;

&lt;p&gt;Here&apos;s the stack trace from the first time it happened.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2021-10-20 12:06:19 [3552825.135826] LNet: Service thread pid 21159 was inactive for 200.13s. The thread might be hung, or it might only be slow and will resume later. Dum\
ping the stack trace for debugging purposes:
2021-10-20 12:06:19 [3552825.156754] Pid: 21159, comm: mdt01_059 3.10.0-1160.36.2.1chaos.ch6.x86_64 #1 SMP Wed Jul 21 15:34:23 PDT 2021
2021-10-20 12:06:19 [3552825.168961] Call Trace:
2021-10-20 12:06:19 [3552825.172738]  [&amp;lt;ffffffffc13352e1&amp;gt;] top_trans_wait_result+0xa9/0x158 [ptlrpc]
2021-10-20 12:06:19 [3552825.181613]  [&amp;lt;ffffffffc13156d9&amp;gt;] top_trans_stop+0x4e9/0xa70 [ptlrpc]
2021-10-20 12:06:19 [3552825.189848]  [&amp;lt;ffffffffc16bee2c&amp;gt;] lod_trans_stop+0x25c/0x340 [lod]
2021-10-20 12:06:19 [3552825.197769]  [&amp;lt;ffffffffc1771b5e&amp;gt;] mdd_trans_stop+0x2e/0x174 [mdd]
2021-10-20 12:06:19 [3552825.205594]  [&amp;lt;ffffffffc1755b99&amp;gt;] mdd_create+0x11a9/0x14a0 [mdd]
2021-10-20 12:06:19 [3552825.213311]  [&amp;lt;ffffffffc160fc54&amp;gt;] mdt_create+0xb54/0x1090 [mdt]
2021-10-20 12:06:19 [3552825.220936]  [&amp;lt;ffffffffc16102fb&amp;gt;] mdt_reint_create+0x16b/0x360 [mdt]
2021-10-20 12:06:19 [3552825.229030]  [&amp;lt;ffffffffc1612db3&amp;gt;] mdt_reint_rec+0x83/0x210 [mdt]
2021-10-20 12:06:19 [3552825.236714]  [&amp;lt;ffffffffc15ee4b3&amp;gt;] mdt_reint_internal+0x6e3/0xaf0 [mdt]
2021-10-20 12:06:19 [3552825.244990]  [&amp;lt;ffffffffc15f9cc7&amp;gt;] mdt_reint+0x67/0x140 [mdt]
2021-10-20 12:06:19 [3552825.252289]  [&amp;lt;ffffffffc130266a&amp;gt;] tgt_request_handle+0xada/0x1570 [ptlrpc]
2021-10-20 12:06:19 [3552825.260986]  [&amp;lt;ffffffffc12a55fb&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
2021-10-20 12:06:19 [3552825.270534]  [&amp;lt;ffffffffc12a99ed&amp;gt;] ptlrpc_main+0xc4d/0x2280 [ptlrpc]
2021-10-20 12:06:19 [3552825.278536]  [&amp;lt;ffffffffba8cb201&amp;gt;] kthread+0xd1/0xe0
2021-10-20 12:06:19 [3552825.284919]  [&amp;lt;ffffffffbafc3ff7&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
2021-10-20 12:06:19 [3552825.292863]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;After the first time, the stack trace changed to:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2021-10-21 16:27:57 [3654921.183865] LNet: Service thread pid 6006 was inactive for 200.10s. The thread might be hung, or it might only be slow and will resume later. Dump\
ing the stack trace for debugging purposes:
2021-10-21 16:27:57 [3654921.204706] Pid: 6006, comm: mdt00_034 3.10.0-1160.36.2.1chaos.ch6.x86_64 #1 SMP Wed Jul 21 15:34:23 PDT 2021
2021-10-21 16:27:57 [3654921.216836] Call Trace:
2021-10-21 16:27:57 [3654921.220584]  [&amp;lt;ffffffffc1262650&amp;gt;] ldlm_completion_ast+0x440/0x870 [ptlrpc]
2021-10-21 16:27:57 [3654921.229352]  [&amp;lt;ffffffffc12638c1&amp;gt;] ldlm_cli_enqueue_local+0x231/0x830 [ptlrpc]
2021-10-21 16:27:57 [3654921.238377]  [&amp;lt;ffffffffc15f331b&amp;gt;] mdt_object_local_lock+0x50b/0xb20 [mdt]
2021-10-21 16:27:57 [3654921.246994]  [&amp;lt;ffffffffc15f39a0&amp;gt;] mdt_object_lock_internal+0x70/0x360 [mdt]
2021-10-21 16:27:57 [3654921.255788]  [&amp;lt;ffffffffc15f4371&amp;gt;] mdt_getattr_name_lock+0x121/0x1df0 [mdt]
2021-10-21 16:27:57 [3654921.264477]  [&amp;lt;ffffffffc15fc5c5&amp;gt;] mdt_intent_getattr+0x2b5/0x480 [mdt]
2021-10-21 16:27:57 [3654921.272769]  [&amp;lt;ffffffffc15f17ea&amp;gt;] mdt_intent_opc+0x1ba/0xb50 [mdt]
2021-10-21 16:27:57 [3654921.280652]  [&amp;lt;ffffffffc15f9aa4&amp;gt;] mdt_intent_policy+0x1a4/0x360 [mdt]
2021-10-21 16:27:57 [3654921.288834]  [&amp;lt;ffffffffc1249586&amp;gt;] ldlm_lock_enqueue+0x376/0x9b0 [ptlrpc]
2021-10-21 16:27:57 [3654921.297322]  [&amp;lt;ffffffffc1271176&amp;gt;] ldlm_handle_enqueue0+0xa86/0x1620 [ptlrpc]
2021-10-21 16:27:57 [3654921.306178]  [&amp;lt;ffffffffc12fbe72&amp;gt;] tgt_enqueue+0x62/0x210 [ptlrpc]
2021-10-21 16:27:57 [3654921.313963]  [&amp;lt;ffffffffc130266a&amp;gt;] tgt_request_handle+0xada/0x1570 [ptlrpc]
2021-10-21 16:27:57 [3654921.322597]  [&amp;lt;ffffffffc12a55fb&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
2021-10-21 16:27:57 [3654921.332123]  [&amp;lt;ffffffffc12a99ed&amp;gt;] ptlrpc_main+0xc4d/0x2280 [ptlrpc]
2021-10-21 16:27:57 [3654921.340086]  [&amp;lt;ffffffffba8cb201&amp;gt;] kthread+0xd1/0xe0
2021-10-21 16:27:57 [3654921.346479]  [&amp;lt;ffffffffbafc3ff7&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
2021-10-21 16:27:57 [3654921.354429]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;There are a bunch of these in the logs from the last few days, and the processes still exist and their stacks are still stuck in the same state as the dump.&lt;/p&gt;

&lt;p&gt;&#160;&#160;&lt;/p&gt;

&lt;p&gt;The problem was noticed this morning and the full logs for that &lt;b&gt;ls&lt;/b&gt; attempt are&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2021-10-22 08:09:57 [3711440.339065] LNet: Service thread pid 21278 was inactive for 200.48s. The thread might be hung, or it might only be slow and will resume later. Dum\
ping the stack trace for debugging purposes:
2021-10-22 08:09:57 [3711440.360090] Pid: 21278, comm: mdt00_123 3.10.0-1160.36.2.1chaos.ch6.x86_64 #1 SMP Wed Jul 21 15:34:23 PDT 2021
2021-10-22 08:09:57 [3711440.372373] Call Trace:
2021-10-22 08:09:57 [3711440.376201]  [&amp;lt;ffffffffc1262650&amp;gt;] ldlm_completion_ast+0x440/0x870 [ptlrpc]
2021-10-22 08:09:57 [3711440.385068]  [&amp;lt;ffffffffc12638c1&amp;gt;] ldlm_cli_enqueue_local+0x231/0x830 [ptlrpc]
2021-10-22 08:09:57 [3711440.394166]  [&amp;lt;ffffffffc15f331b&amp;gt;] mdt_object_local_lock+0x50b/0xb20 [mdt]
2021-10-22 08:09:57 [3711440.402898]  [&amp;lt;ffffffffc15f39a0&amp;gt;] mdt_object_lock_internal+0x70/0x360 [mdt]
2021-10-22 08:09:57 [3711440.411797]  [&amp;lt;ffffffffc15f4371&amp;gt;] mdt_getattr_name_lock+0x121/0x1df0 [mdt]
2021-10-22 08:09:57 [3711440.420608]  [&amp;lt;ffffffffc15fc5c5&amp;gt;] mdt_intent_getattr+0x2b5/0x480 [mdt]
2021-10-22 08:09:57 [3711440.429022]  [&amp;lt;ffffffffc15f17ea&amp;gt;] mdt_intent_opc+0x1ba/0xb50 [mdt]
2021-10-22 08:09:57 [3711440.437045]  [&amp;lt;ffffffffc15f9aa4&amp;gt;] mdt_intent_policy+0x1a4/0x360 [mdt]
2021-10-22 08:09:57 [3711440.445338]  [&amp;lt;ffffffffc1249586&amp;gt;] ldlm_lock_enqueue+0x376/0x9b0 [ptlrpc]
2021-10-22 08:09:57 [3711440.453922]  [&amp;lt;ffffffffc1271176&amp;gt;] ldlm_handle_enqueue0+0xa86/0x1620 [ptlrpc]
2021-10-22 08:09:57 [3711440.462899]  [&amp;lt;ffffffffc12fbe72&amp;gt;] tgt_enqueue+0x62/0x210 [ptlrpc]
2021-10-22 08:09:57 [3711440.470790]  [&amp;lt;ffffffffc130266a&amp;gt;] tgt_request_handle+0xada/0x1570 [ptlrpc]
2021-10-22 08:09:57 [3711440.479509]  [&amp;lt;ffffffffc12a55fb&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
2021-10-22 08:09:57 [3711440.489097]  [&amp;lt;ffffffffc12a99ed&amp;gt;] ptlrpc_main+0xc4d/0x2280 [ptlrpc]
2021-10-22 08:09:57 [3711440.497117]  [&amp;lt;ffffffffba8cb201&amp;gt;] kthread+0xd1/0xe0
2021-10-22 08:09:57 [3711440.503545]  [&amp;lt;ffffffffbafc3ff7&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
2021-10-22 08:09:57 [3711440.511516]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
2021-10-22 08:09:57 [3711440.518045] LustreError: dumping log to /tmp/lustre-log.1634915397.21278
2021-10-22 08:11:36 [3711539.849250] LustreError: 21278:0:(ldlm_request.c:130:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1634915196, 300s ago); not en\
tering recovery in server code, just going back to sleep ns: mdt-lquake-MDT0003_UUID lock: ffff9f5eade79b00/0xc43735e83ac402f0 lrc: 3/1,0 mode: --/PR res: [0x4c00c7f52:0x5\
:0x0].0x0 bits 0x12/0x0 rrc: 22 type: IBT flags: 0x40210000000000 nid: local remote: 0x0 expref: -99 pid: 21278 timeout: 0 lvb_type: 0
2021-10-22 08:16:31 [3711834.827788] Lustre: 21121:0:(service.c:1372:ptlrpc_at_send_early_reply()) @@@ Couldn&apos;t add any time (5/5), not sending early reply
2021-10-22 08:16:31 [3711834.827788]   req@ffff9f25e8a75100 x1714259348772800/t0(0) o101-&amp;gt;98d2fd76-058a-227d-d9be-65ffdb9bb578@192.168.128.124@o2ib18:291/0 lens 576/3272 e\
 22 to 0 dl 1634915796 ref 2 fl Interpret:/0/0 rc 0/0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;There are some similar messages on jet4 before 2021-10-20, but not very many and not as frequently. There are very few such similar messages on other machines in jet, most have none this month.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</description>
                <environment>TOSS 3.7-11&lt;br/&gt;
3.10.0-1160.36.2.1chaos.ch6.x86_64&lt;br/&gt;
lustre 2.12.7_2.llnl&lt;br/&gt;
zfs-0.7.11-9.8llnl.ch6.x86_64</environment>
        <key id="66795">LU-15148</key>
            <summary>Service thread pid 6006 was inactive for 200.10s; filesystem hangs for specific MDT but eventually works</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="green">Oleg Drokin</assignee>
                                    <reporter username="defazio">Gian-Carlo Defazio</reporter>
                        <labels>
                            <label>llnl</label>
                    </labels>
                <created>Fri, 22 Oct 2021 18:02:56 +0000</created>
                <updated>Sat, 6 Nov 2021 15:53:09 +0000</updated>
                                            <version>Lustre 2.12.7</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="316480" author="green" created="Mon, 25 Oct 2021 17:34:03 +0000"  >&lt;p&gt;Just to confirm, when you say &quot;After the first time, the stack trace changed to&quot; does that mean the very first stack trace is always the one in the create-&amp;gt;trans stop path?&lt;/p&gt;</comment>
                            <comment id="316485" author="defazio" created="Mon, 25 Oct 2021 19:28:02 +0000"  >&lt;p&gt;What do you mean by create-&amp;gt;trans stop path?&lt;/p&gt;

&lt;p&gt;Here&apos;s me grepping for stack dumps in October&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2021-10-01 02:24:35 [1876351.651648] LustreError: dumping log to /tmp/lustre-log.1633080275.21241
2021-10-01 15:40:12 [1924088.038346] LustreError: dumping log to /tmp/lustre-log.1633128012.12376
2021-10-04 14:46:28 [2180059.422390] LustreError: dumping log to /tmp/lustre-log.1633383988.31179
2021-10-05 04:12:51 [2228441.432712] LustreError: dumping log to /tmp/lustre-log.1633432371.19181
2021-10-20 12:06:19 [3552825.299378] LustreError: dumping log to /tmp/lustre-log.1634756779.21159
2021-10-21 01:33:05 [3601230.956954] LustreError: dumping log to /tmp/lustre-log.1634805185.21311
2021-10-21 01:34:45 [3601330.752850] LustreError: dumping log to /tmp/lustre-log.1634805285.21311
2021-10-21 07:47:19 [3623683.794004] LustreError: dumping log to /tmp/lustre-log.1634827639.16605
2021-10-21 08:21:17 [3625722.539271] LustreError: dumping log to /tmp/lustre-log.1634829677.16601
2021-10-21 08:58:26 [3627950.722608] LustreError: dumping log to /tmp/lustre-log.1634831906.21124
2021-10-21 09:41:07 [3630512.212157] LustreError: dumping log to /tmp/lustre-log.1634834467.31952
2021-10-21 12:45:17 [3641561.995392] LustreError: dumping log to /tmp/lustre-log.1634845517.21154
2021-10-21 12:57:34 [3642298.747487] LustreError: dumping log to /tmp/lustre-log.1634846254.21276
2021-10-21 14:34:54 [3648138.512019] LustreError: dumping log to /tmp/lustre-log.1634852094.21240
2021-10-21 16:27:57 [3654921.360927] LustreError: dumping log to /tmp/lustre-log.1634858877.6006
2021-10-22 08:09:57 [3711440.518045] LustreError: dumping log to /tmp/lustre-log.1634915397.21278
2021-10-22 08:39:53 [3713236.067411] LustreError: dumping log to /tmp/lustre-log.1634917192.16602
2021-10-22 09:18:54 [3715577.400409] LustreError: dumping log to /tmp/lustre-log.1634919534.21261
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;By &lt;b&gt;first&lt;/b&gt; stack trace I mean the one that occurs at &lt;b&gt;2021-10-20 12:06:19&lt;/b&gt; and is the first stack trace shown in the description.&lt;/p&gt;

&lt;p&gt;Starting &lt;b&gt;2021-10-21 01:33:05&lt;/b&gt;, that stack trace, and all subsequent traces, look like the second stack trace shown in the description.&lt;/p&gt;</comment>
                            <comment id="316721" author="green" created="Wed, 27 Oct 2021 17:22:10 +0000"  >&lt;p&gt;So can we assume you have a way to reliably hit this? Would it be feasible to increase debug levels on the client and servers just before this first access happens and then dump lustre debug logs when complete?&lt;/p&gt;

&lt;p&gt;It&apos;s strange that there&apos;s no evictions or anything in the logs and the lock being held is a server side lock.&lt;/p&gt;

&lt;p&gt;The extra debug levels we want are: rpctrace dlmtrace ha vfstrace&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;Do increase debug_mb to 1000 as well please and then once done dump the log with lctl dk. Ideally you&apos;ll do this on all MDS and OST servers and the client that initiated that first request.&lt;/p&gt;</comment>
                            <comment id="316794" author="defazio" created="Wed, 27 Oct 2021 23:43:36 +0000"  >&lt;p&gt;For my records, the local ticket is &lt;a href=&quot;https://lc.llnl.gov/jira/browse/TOSS-5346&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;TOSS-5346&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="316893" author="defazio" created="Thu, 28 Oct 2021 18:49:23 +0000"  >&lt;p&gt;We reran the tests that we believe resulted in the first error, but were unable to reproduce the error.&lt;/p&gt;

&lt;p&gt;The differences were that the debug levels rpctrace dlmtrace ha vfstrace were set, and that one node that had failed over had been restored. The node that had failed over did so over a month ago, well before 2021-10-20 when this issue seems to have started.&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="41105" name="console.jet4" size="1642758" author="defazio" created="Mon, 25 Oct 2021 19:28:36 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i0284v:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>