<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:37:11 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
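For example (assuming JIRA's standard XML issue-view path, which is not shown in this export):
https://jira.whamcloud.com/si/jira.issueviews:issue-xml/LU-10674/LU-10674.xml?field=key&field=summary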
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-10674] MGS very unstable in 2.10.x</title>
                <link>https://jira.whamcloud.com/browse/LU-10674</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;We have been having issues with the MGS ever since the upgrade from 2.9 to 2.10 LTS. As soon as we fail over or fail back a target, the MGS seems to get stuck. Additionally, stopping the MGS always triggers a crash (reported in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10390&quot; title=&quot;MGS crashes in ldlm_reprocess_queue() when stopping&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10390&quot;&gt;LU-10390&lt;/a&gt;). This is concerning for a stable release.&lt;/p&gt;
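
&lt;p&gt;For context, by failover/failback I mean the usual manual sequence below (a sketch only; device and mount point names here are hypothetical, not from our actual setup):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# failover: stop the target on its primary server, start it on the backup
oss1# umount /mnt/lustre/ost0
oss2# mount -t lustre /dev/mapper/ost0 /mnt/lustre/ost0
# failback: the reverse, once the primary is healthy again
oss2# umount /mnt/lustre/ost0
oss1# mount -t lustre /dev/mapper/ost0 /mnt/lustre/ost0&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;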

&lt;p&gt;MGS stuck this morning when trying to add a new OST:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[669739.991439] LNet: Service thread pid 136320 was inactive for 200.27s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:
[669740.010557] Pid: 136320, comm: ll_mgs_0011
[669740.015223] 
Call Trace:
[669740.019798]  [&amp;lt;ffffffff816a94e9&amp;gt;] schedule+0x29/0x70
[669740.025437]  [&amp;lt;ffffffff816a6f34&amp;gt;] schedule_timeout+0x174/0x2c0
[669740.032077]  [&amp;lt;ffffffffc0b6bef1&amp;gt;] ? ldlm_run_ast_work+0x1d1/0x3a0 [ptlrpc]
[669740.039848]  [&amp;lt;ffffffff81098b20&amp;gt;] ? process_timeout+0x0/0x10
[669740.046276]  [&amp;lt;ffffffffc0b85020&amp;gt;] ? ldlm_expired_completion_wait+0x0/0x240 [ptlrpc]
[669740.054934]  [&amp;lt;ffffffffc0b85811&amp;gt;] ldlm_completion_ast+0x5b1/0x920 [ptlrpc]
[669740.062704]  [&amp;lt;ffffffff810c4810&amp;gt;] ? default_wake_function+0x0/0x20
[669740.069704]  [&amp;lt;ffffffffc138e75c&amp;gt;] mgs_completion_ast_generic+0x5c/0x200 [mgs]
[669740.077777]  [&amp;lt;ffffffffc0b6a6bc&amp;gt;] ? ldlm_lock_create+0x1fc/0xa30 [ptlrpc]
[669740.085451]  [&amp;lt;ffffffffc138e973&amp;gt;] mgs_completion_ast_config+0x13/0x20 [mgs]
[669740.093331]  [&amp;lt;ffffffffc0b87730&amp;gt;] ldlm_cli_enqueue_local+0x230/0x860 [ptlrpc]
[669740.101394]  [&amp;lt;ffffffffc138e960&amp;gt;] ? mgs_completion_ast_config+0x0/0x20 [mgs]
[669740.109372]  [&amp;lt;ffffffffc0b8ae00&amp;gt;] ? ldlm_blocking_ast+0x0/0x170 [ptlrpc]
[669740.116950]  [&amp;lt;ffffffffc139335c&amp;gt;] mgs_revoke_lock+0xfc/0x370 [mgs]
[669740.123956]  [&amp;lt;ffffffffc0b8ae00&amp;gt;] ? ldlm_blocking_ast+0x0/0x170 [ptlrpc]
[669740.131534]  [&amp;lt;ffffffffc138e960&amp;gt;] ? mgs_completion_ast_config+0x0/0x20 [mgs]
[669740.139498]  [&amp;lt;ffffffffc1393ae5&amp;gt;] mgs_target_reg+0x515/0x1370 [mgs]
[669740.146608]  [&amp;lt;ffffffffc0bbb0b1&amp;gt;] ? lustre_pack_reply+0x11/0x20 [ptlrpc]
[669740.154208]  [&amp;lt;ffffffffc0c1dda5&amp;gt;] tgt_request_handle+0x925/0x1370 [ptlrpc]
[669740.161997]  [&amp;lt;ffffffffc0bc6b16&amp;gt;] ptlrpc_server_handle_request+0x236/0xa90 [ptlrpc]
[669740.170655]  [&amp;lt;ffffffffc0bc3148&amp;gt;] ? ptlrpc_wait_event+0x98/0x340 [ptlrpc]
[669740.178328]  [&amp;lt;ffffffff810c4822&amp;gt;] ? default_wake_function+0x12/0x20
[669740.185420]  [&amp;lt;ffffffff810ba588&amp;gt;] ? __wake_up_common+0x58/0x90
[669740.192041]  [&amp;lt;ffffffffc0bca252&amp;gt;] ptlrpc_main+0xa92/0x1e40 [ptlrpc]
[669740.199133]  [&amp;lt;ffffffff81029557&amp;gt;] ? __switch_to+0xd7/0x510
[669740.205350]  [&amp;lt;ffffffff816a8f00&amp;gt;] ? __schedule+0x2f0/0x8b0
[669740.211583]  [&amp;lt;ffffffffc0bc97c0&amp;gt;] ? ptlrpc_main+0x0/0x1e40 [ptlrpc]
[669740.218675]  [&amp;lt;ffffffff810b098f&amp;gt;] kthread+0xcf/0xe0
[669740.224215]  [&amp;lt;ffffffff810b08c0&amp;gt;] ? kthread+0x0/0xe0
[669740.229851]  [&amp;lt;ffffffff816b4f58&amp;gt;] ret_from_fork+0x58/0x90
[669740.235970]  [&amp;lt;ffffffff810b08c0&amp;gt;] ? kthread+0x0/0xe0

[669740.243363] LustreError: dumping log to /tmp/lustre-log.1518720988.136320

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Clients output something like this:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[1466043.295178] Lustre: MGC10.0.2.51@o2ib5: Connection restored to MGC10.0.2.51@o2ib5_0 (at 10.0.2.51@o2ib5)
[1466043.295179] Lustre: Skipped 1 previous similar message
[1466043.767551] LustreError: 5993:0:(ldlm_request.c:148:ldlm_expired_completion_wait()) Skipped 1 previous similar message
[1466351.198284] LustreError: 368700:0:(ldlm_resource.c:1682:ldlm_resource_dump()) --- Resource: [0x6b616f:0x2:0x0].0x0 (ffff8823eebbb2c0) refcount = 2
[1466351.242084] LustreError: 368700:0:(ldlm_resource.c:1703:ldlm_resource_dump()) Waiting locks:
[1466657.253528] LustreError: 166-1: MGC10.0.2.51@o2ib5: Connection to MGS (at 10.0.2.51@o2ib5) was lost; in progress operations using this service will fail
[1466657.299037] LustreError: Skipped 1 previous similar message
[1466657.317969] LustreError: 5993:0:(ldlm_request.c:148:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1518719899, 300s ago), entering recovery for MGS@MGC10.0.2.51@o2ib5_0 ns: MGC10.0.2.51@o2i
[1466657.318229] LustreError: 372154:0:(ldlm_resource.c:1100:ldlm_resource_complain()) MGC10.0.2.51@o2ib5: namespace resource [0x6b616f:0x2:0x0].0x0 (ffff883aca373200) refcount nonzero (2) after lock cleanup; fo
[1466657.318230] LustreError: 372154:0:(ldlm_resource.c:1100:ldlm_resource_complain()) Skipped 1 previous similar message
[1466657.318232] LustreError: 372154:0:(ldlm_resource.c:1682:ldlm_resource_dump()) --- Resource: [0x6b616f:0x2:0x0].0x0 (ffff883aca373200) refcount = 3
[1466657.318233] LustreError: 372154:0:(ldlm_resource.c:1703:ldlm_resource_dump()) Waiting locks:
[1466657.318238] LustreError: 372154:0:(ldlm_resource.c:1705:ldlm_resource_dump()) ### ### ns: MGC10.0.2.51@o2ib5 lock: ffff883193225800/0xe5ac076a284d2d lrc: 4/1,0 mode: --/CR res: [0x6b616f:0x2:0x0].0x0 rrc: 4
[1466657.318239] LustreError: 372154:0:(ldlm_resource.c:1705:ldlm_resource_dump()) Skipped 1 previous similar message
[1466657.318244] Lustre: MGC10.0.2.51@o2ib5: Connection restored to MGC10.0.2.51@o2ib5_0 (at 10.0.2.51@o2ib5)

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Rebooting the MGS fixes the issue, until the next target failover/failback.&lt;/p&gt;

&lt;p&gt;Stephane&lt;/p&gt;</description>
                <environment>3.10.0-693.2.2.el7_lustre.pl1.x86_64</environment>
        <key id="50844">LU-10674</key>
            <summary>MGS very unstable in 2.10.x</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="hongchao.zhang">Hongchao Zhang</assignee>
                                    <reporter username="sthiell">Stephane Thiell</reporter>
                        <labels>
                    </labels>
                <created>Thu, 15 Feb 2018 19:12:29 +0000</created>
                <updated>Fri, 2 Oct 2020 09:28:53 +0000</updated>
                                            <version>Lustre 2.10.3</version>
                    <version>Lustre 2.10.4</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>14</watches>
                                                                            <comments>
                            <comment id="221158" author="bfaccini" created="Fri, 16 Feb 2018 11:32:49 +0000"  >&lt;p&gt;Hello Stephane!&lt;br/&gt;
We can try to reproduce this in-house, but since you appear to be able to reproduce it easily, maybe you could provide a full Lustre debug log from all sides when causing a target failover/failback?&lt;/p&gt;
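
&lt;p&gt;Something like the following on the MGS, the OSSes, and a couple of clients would do (a sketch; the debug mask and buffer size are only suggestions, adjust as needed):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# before the failover: enable full debugging and clear the debug buffer
lctl set_param debug=-1 debug_mb=1024
lctl clear
# ... perform the target failover/failback ...
# afterwards: dump the kernel debug buffer on each node
lctl dk /tmp/lustre-debug.log&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;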

&lt;p&gt;Also, about &quot;stopping the MGS always triggers a crash (reported in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10390&quot; title=&quot;MGS crashes in ldlm_reprocess_queue() when stopping&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10390&quot;&gt;LU-10390&lt;/a&gt;)&quot;, again we can try to reproduce this in-house, but maybe you already have a crash dump available that could be analyzed as a first step?&lt;/p&gt;</comment>
                            <comment id="221189" author="jhammond" created="Fri, 16 Feb 2018 18:45:42 +0000"  >&lt;p&gt;Hi Stephane,&lt;/p&gt;

&lt;p&gt;Could you describe the interop situation here? What version is the MGS and what versions are the targets?&lt;/p&gt;</comment>
                            <comment id="221223" author="sthiell" created="Fri, 16 Feb 2018 23:00:39 +0000"  >&lt;p&gt;Hi Bruno and John,&lt;/p&gt;

&lt;p&gt;For this case, all targets are running Lustre 2.10.3 RC1.&lt;/p&gt;

&lt;p&gt;Because this system running 2.10 has been quite unstable lately, we don&apos;t want to trigger new problems right now, but the next time I have to failover/failback, I&apos;ll try to gather more Lustre debug logs! Thanks!&lt;/p&gt;</comment>
                            <comment id="247737" author="gerrit" created="Sun, 26 May 2019 09:34:27 +0000"  >&lt;p&gt;Hongchao Zhang (hongchao@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/34958&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/34958&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10674&quot; title=&quot;MGS very unstable in 2.10.x&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10674&quot;&gt;LU-10674&lt;/a&gt; ldlm: only check granted plain locks&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: bd8a05bde10b4e8a22f941522e06b033c1c88a68&lt;/p&gt;</comment>
                            <comment id="276911" author="hongchao.zhang" created="Fri, 7 Aug 2020 06:10:23 +0000"  >&lt;p&gt;this issue could be caused by the same reason in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13356&quot; title=&quot;lctl conf_param hung on the MGS node&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13356&quot;&gt;&lt;del&gt;LU-13356&lt;/del&gt;&lt;/a&gt; (the patch is  &lt;a href=&quot;https://review.whamcloud.com/37880&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/37880&lt;/a&gt;)&lt;/p&gt;</comment>
                            <comment id="281244" author="degremoa" created="Thu, 1 Oct 2020 16:43:23 +0000"  >&lt;p&gt;Hello Hongchao Zhang,&lt;/p&gt;

&lt;p&gt;I&apos;m not sure the patch you pointed to will really fix this problem.&lt;/p&gt;

&lt;p&gt;If I understand correctly, the problem should occur whenever a client is dead but not yet evicted by the MGS while imperative recovery (IR) is in progress. I tested that multiple times without being able to reproduce it: either the MGS evicted the client, or the revoke returned an error without leaving anything stale.&lt;/p&gt;
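&lt;p&gt;For the record, this is the shape of the test I ran, as a sketch (hypothetical device and host names, and a tcp LNet rig; on an o2ib fabric like the one in this report, blocking traffic needs different tooling):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# 1. on the victim client: silently drop LNet traffic so the MGS sees it as dead
client# iptables -A INPUT -p tcp --dport 988 -j DROP
client# iptables -A OUTPUT -p tcp --dport 988 -j DROP
# 2. fail a target over to its backup node to trigger imperative recovery
oss1# umount /mnt/lustre/ost0
oss2# mount -t lustre /dev/mapper/ost0 /mnt/lustre/ost0
# 3. on the MGS: watch for stuck ll_mgs threads or revoke errors
mgs# dmesg -w | grep -E &apos;ll_mgs|mgs_revoke_lock&apos;&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;The revoke error I did see was:&lt;/p&gt;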
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;6895:0:(mgs_handler.c:282:mgs_revoke_lock()) MGS: can&apos;t take cfg lock for 0x736d61726170/0x3 : rc = -11&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="58344">LU-13356</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="59747">LU-13719</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzzswf:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                </customfields>
    </item>
</channel>
</rss>