<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:09:13 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
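
For example, to fetch this issue with only those fields, the full request would look like
this (a sketch, assuming the standard JIRA XML issue view URL; the exact path may differ
per installation):

    https://jira.whamcloud.com/si/jira.issueviews:issue-xml/LU-14378/LU-14378.xml?field=key&field=summary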
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-14378] ldlm_resource_complain()) MGC172.19.3.1@o2ib600: namespace resource [0x68736c:0x2:0x0].0x0 (ffff972b9abea0c0) refcount nonzero (1) after lock cleanup; forcing cleanup.</title>
                <link>https://jira.whamcloud.com/browse/LU-14378</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Console log on node with MDT0005 (NID 172.19.3.1@o2ib600) reports:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;LustreError: 18990:0:(ldlm_request.c:148:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1609295691, 300s ago), entering recovery for MGS@172.19.3.1@o2ib600 ns: MGC172.19.3.1@o2ib600 lock: ffff97619d5133c0/0xa8790faf05ee75fc lrc: 4/1,0 mode: --/CR res: [0x68736c:0x2:0x0].0x0 rrc: 2 type: PLN flags: 0x1000000000000 nid: local remote: 0x3481cb7270a3b1bc expref: -99 pid: 18990 timeout: 0 lvb_type: 0
LustreError: 18990:0:(ldlm_request.c:148:ldlm_expired_completion_wait()) Skipped 1 previous similar message
LustreError: 25121:0:(ldlm_resource.c:1147:ldlm_resource_complain()) MGC172.19.3.1@o2ib600: namespace resource [0x68736c:0x2:0x0].0x0 (ffff972b9abea0c0) refcount nonzero (1) after lock cleanup; forcing cleanup.
LustreError: 25121:0:(ldlm_resource.c:1147:ldlm_resource_complain()) Skipped 1 previous similar message 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment>MDT&lt;br/&gt;
3.10.0-1160.4.1.1chaos.ch6.x86_64&lt;br/&gt;
lustre-2.12.5_10.llnl-3.ch6.x86_64</environment>
        <key id="62545">LU-14378</key>
            <summary>ldlm_resource_complain()) MGC172.19.3.1@o2ib600: namespace resource [0x68736c:0x2:0x0].0x0 (ffff972b9abea0c0) refcount nonzero (1) after lock cleanup; forcing cleanup.</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                <status id="6" iconUrl="https://jira.whamcloud.com/images/icons/statuses/closed.png" description="The issue is considered finished, the resolution is correct. Issues which are closed can be reopened.">Closed</status>
                <statusCategory id="3" key="done" colorName="success"/>
                <resolution id="1">Fixed</resolution>
                <assignee username="tappro">Mikhail Pershin</assignee>
                <reporter username="ofaaland">Olaf Faaland</reporter>
                <labels>
                    <label>llnll</label>
                </labels>
                <created>Thu, 28 Jan 2021 05:55:57 +0000</created>
                <updated>Wed, 11 Jan 2023 18:44:48 +0000</updated>
                <resolved>Sun, 16 Jan 2022 08:57:50 +0000</resolved>
                <due></due>
                <votes>0</votes>
                <watches>5</watches>
                <comments>
                            <comment id="290552" author="ofaaland" created="Thu, 28 Jan 2021 05:56:32 +0000"  >&lt;p&gt;For my records my local ticket is TOSS5036&lt;/p&gt;</comment>
                            <comment id="290553" author="ofaaland" created="Thu, 28 Jan 2021 05:57:10 +0000"  >&lt;p&gt;The day of the issue we see repeated communication failures with lsh-MDT000c, within the hours of 2am, 7am, and 9am:&lt;/p&gt;

&lt;p&gt;For example&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2020-12-29 02:40:20 [4122150.157428] LNetError: 12879:0:(o2iblnd_cb.c:3359:kiblnd_check_txs_locked()) Timed out tx: active_txs, 0 seconds
2020-12-29 02:40:20 [4122150.168998] LNetError: 12879:0:(o2iblnd_cb.c:3434:kiblnd_check_conns()) Timed out RDMA with 172.19.3.13@o2ib600 (11): c: 6, oc: 0, rc: 8
2020-12-29 02:40:20 [4122150.183730] Lustre: 18519:0:(client.c:2133:ptlrpc_expire_one_request()) @@@ Request sent has failed due to network error: [sent 1609238415/real 1609238420] req@ffff9724f4650900 x1683090791024000/t0(0) o41-&amp;gt;lsh-MDT000c-osp-MDT0005@172.19.3.13@o2ib600:24/4 lens 224/368 e 0 to 1 dl 1609238521 ref 1 fl Rpc:eX/0/ffffffff rc 0/-1
2020-12-29 02:40:20 [4122150.189175] Lustre: lsh-MDT0005: Received new LWP connection from 172.19.3.13@o2ib600, removing former export from same NID
2020-12-29 02:40:20 [4122150.189193] Lustre: lsh-MDT0005: Connection restored to (at 172.19.3.13@o2ib600)
2020-12-29 02:40:20 [4122150.237218] Lustre: lsh-MDT000c-osp-MDT0005: Connection to lsh-MDT000c (at 172.19.3.13@o2ib600) was lost; in progress operations using this service will wait for recovery to complete&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;and&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2020-12-29 07:27:28 [4139376.886078] LNetError: 12879:0:(o2iblnd_cb.c:3359:kiblnd_check_txs_locked()) Timed out tx: active_txs, 0 seconds
2020-12-29 07:27:28 [4139376.897645] LNetError: 12879:0:(o2iblnd_cb.c:3434:kiblnd_check_conns()) Timed out RDMA with 172.19.3.13@o2ib600 (11): c: 7, oc: 0, rc: 8
2020-12-29 07:27:28 [4139376.912177] Lustre: 18530:0:(client.c:2133:ptlrpc_expire_one_request()) @@@ Request sent has failed due to network error: [sent 1609255643/real 1609255648] req@ffff9760e9b92400 x1683090802516928/t0(0) o41-&amp;gt;lsh-MDT000c-osp-MDT0005@172.19.3.13@o2ib600:24/4 lens 224/368 e 0 to 1 dl 1609255749 ref 1 fl Rpc:eX/0/ffffffff rc 0/-1
2020-12-29 07:27:28 [4139376.917039] Lustre: lsh-MDT0005: Received new LWP connection from 172.19.3.13@o2ib600, removing former export from same NID
2020-12-29 07:27:28 [4139376.917054] Lustre: lsh-MDT0005: Connection restored to (at 172.19.3.13@o2ib600)
2020-12-29 07:27:28 [4139376.965671] Lustre: lsh-MDT000c-osp-MDT0005: Connection to lsh-MDT000c (at 172.19.3.13@o2ib600) was lost; in progress operations using this service will wait for recovery to complete &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;and then timed-out requests&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2020-12-29 09:49:57 [4147924.759391] Lustre: 18536:0:(client.c:2133:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1609264091/real 1609264091] req@ffff9724efdeb180 x1683090808250432/t0(0) o400-&amp;gt;lsh-MDT000c-osp-MDT0005@172.19.3.14@o2ib600:24/4 lens 224/224 e 0 to 1 dl 1609264197 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1
2020-12-29 09:50:22 [4147949.701554] Lustre: 18531:0:(client.c:2133:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1609264116/real 1609264116] req@ffff9724efdebf00 x1683090808267264/t0(0) o400-&amp;gt;lsh-MDT000c-osp-MDT0005@172.19.3.14@o2ib600:24/4 lens 224/224 e 0 to 1 dl 1609264222 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="290554" author="ofaaland" created="Thu, 28 Jan 2021 05:57:24 +0000"  >&lt;p&gt;Possibly related to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12735&quot; title=&quot;MGS misbehaving in 2.12.2+&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12735&quot;&gt;&lt;del&gt;LU-12735&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="290556" author="ofaaland" created="Thu, 28 Jan 2021 06:28:35 +0000"  >&lt;p&gt;I have console logs for all 3 nodes.&lt;/p&gt;

&lt;p&gt;I do not have debug logs from the dates/times above because those entries were pushed out by newer ones.&lt;/p&gt;

&lt;p&gt;The problem seems to have persisted until I crashed the nodes today. I do have the crash dumps for the nodes with MGS, MDT0, and MDT5 in case there&apos;s useful information to be obtained there.&lt;/p&gt;</comment>
                            <comment id="290595" author="pjones" created="Thu, 28 Jan 2021 15:28:38 +0000"  >&lt;p&gt;Mike&lt;/p&gt;

&lt;p&gt;Could you please look into the reason behind the initial eviction here?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="291125" author="eaujames" created="Wed, 3 Feb 2021 18:21:26 +0000"  >&lt;p&gt;Hello,&lt;/p&gt;

&lt;p&gt;We experienced the same type of issue recently on a Lustre 2.12.4 MGS:&lt;/p&gt;

&lt;p&gt;After re-mounting an OST, the MGS displayed the same type of message, first for an EX lock and then for a CR lock on the same MGS filesystem resources (LDLM_PLAIN, type=CONFIG_T_RECOVER).&lt;/p&gt;

&lt;p&gt;Then all the MGCs disconnected and reconnected to the MGS every 5 minutes.&lt;/p&gt;

&lt;p&gt;After 1h30, the HA detected a connection issue with the MGS node (not responding) and killed it (STONITH).&lt;/p&gt;

&lt;p&gt;After some research we found that our issue could be associated with the following tickets: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13356&quot; title=&quot;lctl conf_param hung on the MGS node&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13356&quot;&gt;&lt;del&gt;LU-13356&lt;/del&gt;&lt;/a&gt;, &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10674&quot; title=&quot;MGS very unstable in 2.10.x&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10674&quot;&gt;LU-10674&lt;/a&gt;, or &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12735&quot; title=&quot;MGS misbehaving in 2.12.2+&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12735&quot;&gt;&lt;del&gt;LU-12735&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="291215" author="tappro" created="Thu, 4 Feb 2021 14:01:47 +0000"  >&lt;p&gt;Etienne, does your &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13356&quot; title=&quot;lctl conf_param hung on the MGS node&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13356&quot;&gt;&lt;del&gt;LU-13356&lt;/del&gt;&lt;/a&gt; patch help with that issue? So far I&apos;d say that is what needed&lt;/p&gt;</comment>
                            <comment id="291334" author="eaujames" created="Fri, 5 Feb 2021 08:56:57 +0000"  >&lt;p&gt;I think so but I don&apos;t know for sure: there are no reproducer or tests.&lt;/p&gt;

&lt;p&gt;We are determining the causes of the MGS&apos;s failover from a crash dump. We think it might be that the mgs threads took too much CPU time (as in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12735&quot; title=&quot;MGS misbehaving in 2.12.2+&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12735&quot;&gt;&lt;del&gt;LU-12735&lt;/del&gt;&lt;/a&gt;).&lt;/p&gt;

&lt;p&gt;In dmesg we have a message like this:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
[3653076.838597] sched: RT throttling activated
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;In the vmcore, we see an accumulation of locks in the resource waiting list and nothing in the granted list (as noted in a comment on the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10674&quot; title=&quot;MGS very unstable in 2.10.x&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10674&quot;&gt;LU-10674&lt;/a&gt; patch).&lt;/p&gt;</comment>
                </comments>
                <issuelinks>
                    <issuelinktype id="10011">
                        <name>Related</name>
                        <inwardlinks description="is related to">
                            <issuelink>
                                <issuekey id="62546">LU-14379</issuekey>
                            </issuelink>
                        </inwardlinks>
                    </issuelinktype>
                </issuelinks>
                <attachments>
                </attachments>
                <subtasks>
                </subtasks>
                <customfields>
                    <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                        </customfieldvalues>
                    </customfield>
                    <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i01kkf:</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                    <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                    <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                </customfields>
    </item>
</channel>
</rss>