<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:56:04 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-12834] MDT hung during failover</title>
                <link>https://jira.whamcloud.com/browse/LU-12834</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;1 MDT got stuck after it came back from a reboot.  This issue looks like &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12354&quot; title=&quot;MDT threads stuck at ldlm_completion_ast&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12354&quot;&gt;&lt;del&gt;LU-12354&lt;/del&gt;&lt;/a&gt;, not sure if it is dup&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[2019-10-04T18:13:18+00:00] INFO: template[/etc/ssh/sshd_config] sending restart action to service[sshd] (delayed)
[2019-10-04T18:13:18+00:00] INFO: Processing service[sshd] action restart (ssh::server line 19)
[2019-10-04T18:13:18+00:00] INFO: service[sshd] restarted
[2019-10-04T18:13:18+00:00] INFO: Chef Run complete in 40.090920258 seconds
[2019-10-04T18:13:18+00:00] INFO: Running report handlers
[2019-10-04T18:13:18+00:00] INFO: Creating JSON run report
[2019-10-04T18:13:18+00:00] INFO: Report handlers complete
[  238.881257] LNet: HW NUMA nodes: 2, HW CPU cores: 32, npartitions: 2
[  238.891233] alg: No test for adler32 (adler32-zlib)
[  239.698265] Lustre: Lustre: Build Version: 2.12.58_104_g279c264
[  239.880390] LNet: Using FMR for registration
[  239.882506] LNetError: 215:0:(o2iblnd_cb.c:2496:kiblnd_passive_connect()) Can&apos;t accept conn from 192.168.1.118@o2ib on NA (ib0:0:192.168.1.109): bad dst nid
 192.168.1.109@o2ib
[  239.915188] LNet: Added LNI 192.168.1.109@o2ib [8/256/0/180]
[  239.969648] LDISKFS-fs warning (device dm-1): ldiskfs_multi_mount_protect:321: MMP interval 42 higher than expected, please wait.
[  239.969648] 
[  292.347499] LDISKFS-fs (dm-1): recovery complete
[  292.353139] LDISKFS-fs (dm-1): mounted filesystem with ordered data mode. Opts: user_xattr,errors=remount-ro,user_xattr,no_mbcache,nodelalloc
[  293.137144] Lustre: osd-ldiskfs create tunables for soaked-MDT0001
[  293.641326] Lustre: soaked-MDT0001: Imperative Recovery enabled, recovery window shrunk from 300-900 down to 150-900
[  299.332695] Lustre: soaked-MDT0001: Will be in recovery for at least 2:30, or until 28 clients reconnect
[  300.344031] Lustre: soaked-MDT0001: Connection restored to 431caaf2-d303-4 (at 192.168.1.136@o2ib)
[  301.115589] Lustre: soaked-MDT0001: Connection restored to 81b942e1-4e72-4 (at 192.168.1.125@o2ib)
[  302.209118] Lustre: soaked-MDT0001: Connection restored to d81c2840-9c51-4 (at 192.168.1.128@o2ib)
[  302.219237] Lustre: Skipped 2 previous similar messages
[  305.751655] Lustre: soaked-MDT0001: Connection restored to 6780d2ed-c1f6-4 (at 192.168.1.119@o2ib)
[  305.761769] Lustre: Skipped 5 previous similar messages
[  312.381584] Lustre: soaked-MDT0001: Connection restored to 156ff267-111c-4 (at 192.168.1.122@o2ib)
[  312.391687] Lustre: Skipped 2 previous similar messages
[  320.759167] Lustre: soaked-MDT0001: Connection restored to 925c0ae9-c415-4 (at 192.168.1.126@o2ib)
[  320.769289] Lustre: Skipped 4 previous similar messages
[  338.178116] Lustre: soaked-MDT0001: Connection restored to soaked-MDT0001-lwp-OST0008_UUID (at 192.168.1.104@o2ib)
[  338.189826] Lustre: Skipped 21 previous similar messages
[  348.048244] Lustre: soaked-MDT0001: Recovery over after 0:49, of 28 clients 28 recovered and 0 were evicted.
[  548.286678] Lustre: mdt00_005: service thread pid 5290 was inactive for 200.215 seconds. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one.
[  548.286697] Lustre: mdt01_004: service thread pid 5269 was inactive for 200.229 seconds. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:
[  548.286701] Pid: 5271, comm: mdt01_006 3.10.0-1062.el7_lustre.x86_64 #1 SMP Mon Sep 30 22:06:44 UTC 2019
[  548.286702] Lustre: Skipped 5 previous similar messages
[  548.286703] Call Trace:
[  548.286811]  [&amp;lt;ffffffffc1035b10&amp;gt;] ldlm_completion_ast+0x430/0x860 [ptlrpc]
[  548.286864]  [&amp;lt;ffffffffc1037caf&amp;gt;] ldlm_cli_enqueue_fini+0x96f/0xdf0 [ptlrpc]
[  548.286917]  [&amp;lt;ffffffffc103a561&amp;gt;] ldlm_cli_enqueue+0x421/0x930 [ptlrpc]
[  548.286935]  [&amp;lt;ffffffffc1655d62&amp;gt;] osp_md_object_lock+0x162/0x2d0 [osp]
[  548.286959]  [&amp;lt;ffffffffc1566974&amp;gt;] lod_object_lock+0xf4/0x780 [lod]
[  548.286980]  [&amp;lt;ffffffffc15ebbfe&amp;gt;] mdd_object_lock+0x3e/0xe0 [mdd]
[  548.287009]  [&amp;lt;ffffffffc14847d1&amp;gt;] mdt_remote_object_lock_try+0x1e1/0x520 [mdt]
[  548.287028]  [&amp;lt;ffffffffc1484b3a&amp;gt;] mdt_remote_object_lock+0x2a/0x30 [mdt]
[  548.287050]  [&amp;lt;ffffffffc149947e&amp;gt;] mdt_rename_lock+0xbe/0x4b0 [mdt]
[  548.287071]  [&amp;lt;ffffffffc149ad75&amp;gt;] mdt_reint_rename+0x2c5/0x2b60 [mdt]
[  548.287092]  [&amp;lt;ffffffffc14a6883&amp;gt;] mdt_reint_rec+0x83/0x210 [mdt]
[  548.287110]  [&amp;lt;ffffffffc1480930&amp;gt;] mdt_reint_internal+0x7b0/0xba0 [mdt]
[  548.287129]  [&amp;lt;ffffffffc148be37&amp;gt;] mdt_reint+0x67/0x140 [mdt]
[  548.287216]  [&amp;lt;ffffffffc10d772a&amp;gt;] tgt_request_handle+0x98a/0x1630 [ptlrpc]
[  548.287278]  [&amp;lt;ffffffffc1079976&amp;gt;] ptlrpc_server_handle_request+0x256/0xb10 [ptlrpc]
[  548.287338]  [&amp;lt;ffffffffc107d4ac&amp;gt;] ptlrpc_main+0xbac/0x1540 [ptlrpc]
[  548.287345]  [&amp;lt;ffffffff838c50d1&amp;gt;] kthread+0xd1/0xe0
[  548.287350]  [&amp;lt;ffffffff83f8bd37&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
[  548.287383]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
[  548.287386] Pid: 5270, comm: mdt01_005 3.10.0-1062.el7_lustre.x86_64 #1 SMP Mon Sep 30 22:06:44 UTC 2019
[  548.287387] Call Trace:
[  548.287463]  [&amp;lt;ffffffffc1035b10&amp;gt;] ldlm_completion_ast+0x430/0x860 [ptlrpc]
[  548.287516]  [&amp;lt;ffffffffc1037caf&amp;gt;] ldlm_cli_enqueue_fini+0x96f/0xdf0 [ptlrpc]
[  548.287568]  [&amp;lt;ffffffffc103a561&amp;gt;] ldlm_cli_enqueue+0x421/0x930 [ptlrpc]
[  548.287582]  [&amp;lt;ffffffffc1655d62&amp;gt;] osp_md_object_lock+0x162/0x2d0 [osp]
[  548.287599]  [&amp;lt;ffffffffc1566974&amp;gt;] lod_object_lock+0xf4/0x780 [lod]
[  548.287614]  [&amp;lt;ffffffffc15ebbfe&amp;gt;] mdd_object_lock+0x3e/0xe0 [mdd]
[  548.287634]  [&amp;lt;ffffffffc14847d1&amp;gt;] mdt_remote_object_lock_try+0x1e1/0x520 [mdt]
[  548.287678]  [&amp;lt;ffffffffc1484b3a&amp;gt;] mdt_remote_object_lock+0x2a/0x30 [mdt]
[  548.287701]  [&amp;lt;ffffffffc149947e&amp;gt;] mdt_rename_lock+0xbe/0x4b0 [mdt]
[  548.287722]  [&amp;lt;ffffffffc149ad75&amp;gt;] mdt_reint_rename+0x2c5/0x2b60 [mdt]
[  548.287744]  [&amp;lt;ffffffffc14a6883&amp;gt;] mdt_reint_rec+0x83/0x210 [mdt]
[  548.287764]  [&amp;lt;ffffffffc1480930&amp;gt;] mdt_reint_internal+0x7b0/0xba0 [mdt]
[  548.287784]  [&amp;lt;ffffffffc148be37&amp;gt;] mdt_reint+0x67/0x140 [mdt]
[  548.287863]  [&amp;lt;ffffffffc10d772a&amp;gt;] tgt_request_handle+0x98a/0x1630 [ptlrpc]
[  548.287930]  [&amp;lt;ffffffffc1079976&amp;gt;] ptlrpc_server_handle_request+0x256/0xb10 [ptlrpc]
[  548.287996]  [&amp;lt;ffffffffc107d4ac&amp;gt;] ptlrpc_main+0xbac/0x1540 [ptlrpc]
[  548.288001]  [&amp;lt;ffffffff838c50d1&amp;gt;] kthread+0xd1/0xe0
[  548.288005]  [&amp;lt;ffffffff83f8bd37&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
[  548.288017]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment>lustre-master-ib #328 EL7.7</environment>
        <key id="57104">LU-12834</key>
            <summary>MDT hung during failover</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="hongchao.zhang">Hongchao Zhang</assignee>
                                    <reporter username="sarah">Sarah Liu</reporter>
                        <labels>
                            <label>soak</label>
                    </labels>
                <created>Mon, 7 Oct 2019 21:56:33 +0000</created>
                <updated>Mon, 19 Sep 2022 21:03:07 +0000</updated>
                                            <version>Lustre 2.13.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="256082" author="pjones" created="Tue, 8 Oct 2019 17:37:42 +0000"  >&lt;p&gt;Hongchao&lt;/p&gt;

&lt;p&gt;Can you please investigate?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="256115" author="hongchao.zhang" created="Wed, 9 Oct 2019 13:02:06 +0000"  >&lt;p&gt;It could be related to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12037&quot; title=&quot;Possible DNE issue leading to hung filesystem&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12037&quot;&gt;&lt;del&gt;LU-12037&lt;/del&gt;&lt;/a&gt;, which cause deadlock when rename between MDTs.&lt;/p&gt;</comment>
                            <comment id="256118" author="pjones" created="Wed, 9 Oct 2019 13:52:41 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=hongchao.zhang&quot; class=&quot;user-hover&quot; rel=&quot;hongchao.zhang&quot;&gt;hongchao.zhang&lt;/a&gt; you mean that the fix is not 100% effective?&lt;/p&gt;</comment>
                            <comment id="256119" author="hongchao.zhang" created="Wed, 9 Oct 2019 14:00:06 +0000"  >&lt;p&gt;the patch &lt;a href=&quot;https://review.whamcloud.com/#/c/34410/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/34410/&lt;/a&gt;  in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12037&quot; title=&quot;Possible DNE issue leading to hung filesystem&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12037&quot;&gt;&lt;del&gt;LU-12037&lt;/del&gt;&lt;/a&gt; added an option to disable the rename between MDTs,&lt;br/&gt;
but the default value is &quot;1&quot;, which allow the rename between MDTs. the customer tried the patch in their site to disable&lt;br/&gt;
the rename operation between MDTs, which avoids the issue to occur.&lt;/p&gt;</comment>
                            <comment id="256120" author="pjones" created="Wed, 9 Oct 2019 14:03:00 +0000"  >&lt;p&gt;Ah I see. So you are suggesting that we utilize this option on soak? &lt;/p&gt;</comment>
                            <comment id="256121" author="hongchao.zhang" created="Wed, 9 Oct 2019 14:11:08 +0000"  >&lt;p&gt;Yes, it would be better to disable rename between MDTs before it&apos;s fixed thoroughly.&lt;/p&gt;</comment>
                            <comment id="256131" author="sarah" created="Wed, 9 Oct 2019 16:32:50 +0000"  >&lt;p&gt;Got it, I will disable it&lt;/p&gt;</comment>
                            <comment id="317564" author="adilger" created="Fri, 5 Nov 2021 17:25:48 +0000"  >&lt;p&gt;According to comments in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12035&quot; title=&quot;DoM/HSM: released file appears with &amp;quot;unknown&amp;quot; layout type&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12035&quot;&gt;&lt;del&gt;LU-12035&lt;/del&gt;&lt;/a&gt;, it was the patch &lt;a href=&quot;https://review.whamcloud.com/33077&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/33077&lt;/a&gt; &quot;&lt;tt&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11285&quot; title=&quot;don&amp;#39;t stop on the first blocked lock in ldlm_reprocess_queue()&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11285&quot;&gt;&lt;del&gt;LU-11285&lt;/del&gt;&lt;/a&gt; ldlm: reprocess whole waiting queue for IBITS&lt;/tt&gt;&quot; that resolved this issue. &lt;/p&gt;

&lt;p&gt;I would recommend to re-enable the remote rename on soak, so that we can see if this issue is fixed properly, since disabling remote rename is at best a temporary workaround. &lt;/p&gt;</comment>
                            <comment id="317566" author="adilger" created="Fri, 5 Nov 2021 17:31:44 +0000"  >&lt;p&gt;Hmm, it looks like the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11285&quot; title=&quot;don&amp;#39;t stop on the first blocked lock in ldlm_reprocess_queue()&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11285&quot;&gt;&lt;del&gt;LU-11285&lt;/del&gt;&lt;/a&gt; patch was abandoned. Trying to figure out the details. &lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                                        </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="70637">LU-15913</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="55041">LU-12037</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="67360">LU-15285</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="53122">LU-11285</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                                        </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00nq7:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>