<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:20:07 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-8736] stuck during umount in soak-test</title>
                <link>https://jira.whamcloud.com/browse/LU-8736</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;In latest soak-test, one of MDT stuck during umount&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;LustreError: 0-0: Forced cleanup waiting for soaked-MDT0000-osp-MDT0002 namespace with 1 resources in use, (rc=-110)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The stack trace&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;umount        S 0000000000000011     0  8015   8013 0x00000080
 ffff8803d9b33808 0000000000000086 ffff8803d9b337d0 ffff8803d9b337cc
 ffff8803d9b33868 ffff88043fe84000 00001b24f314dc54 ffff880038635a00
 00000000000003ff 0000000101c3089b ffff8803f3c31ad8 ffff8803d9b33fd8
Call Trace:
 [&amp;lt;ffffffff8153a9b2&amp;gt;] schedule_timeout+0x192/0x2e0
 [&amp;lt;ffffffff81089fa0&amp;gt;] ? process_timeout+0x0/0x10
 [&amp;lt;ffffffffa0abded0&amp;gt;] __ldlm_namespace_free+0x1c0/0x560 [ptlrpc]
 [&amp;lt;ffffffff81067650&amp;gt;] ? default_wake_function+0x0/0x20
 [&amp;lt;ffffffffa0abe2df&amp;gt;] ldlm_namespace_free_prior+0x6f/0x220 [ptlrpc]
 [&amp;lt;ffffffffa13b0db2&amp;gt;] osp_process_config+0x4a2/0x680 [osp]
 [&amp;lt;ffffffff81291947&amp;gt;] ? find_first_bit+0x47/0x80
 [&amp;lt;ffffffffa12c5650&amp;gt;] lod_sub_process_config+0x100/0x1f0 [lod]
 [&amp;lt;ffffffffa12cad66&amp;gt;] lod_process_config+0x646/0x1580 [lod]
 [&amp;lt;ffffffffa113e4ff&amp;gt;] ? lfsck_stop+0x15f/0x4c0 [lfsck]
 [&amp;lt;ffffffffa0801032&amp;gt;] ? cfs_hash_bd_from_key+0x42/0xd0 [libcfs]
 [&amp;lt;ffffffffa1343253&amp;gt;] mdd_process_config+0x113/0x5e0 [mdd]
 [&amp;lt;ffffffffa11fee62&amp;gt;] mdt_device_fini+0x482/0x13e0 [mdt]
 [&amp;lt;ffffffffa08df626&amp;gt;] ? class_disconnect_exports+0x116/0x2f0 [obdclass]
 [&amp;lt;ffffffffa08f82c2&amp;gt;] class_cleanup+0x582/0xd30 [obdclass]
 [&amp;lt;ffffffffa08dae56&amp;gt;] ? class_name2dev+0x56/0xe0 [obdclass]
 [&amp;lt;ffffffffa08fa5d6&amp;gt;] class_process_config+0x1b66/0x24c0 [obdclass]
 [&amp;lt;ffffffffa07fc151&amp;gt;] ? libcfs_debug_msg+0x41/0x50 [libcfs]
 [&amp;lt;ffffffff8117904c&amp;gt;] ? __kmalloc+0x21c/0x230
 [&amp;lt;ffffffffa08fb3ef&amp;gt;] class_manual_cleanup+0x4bf/0xc90 [obdclass]
 [&amp;lt;ffffffffa08dae56&amp;gt;] ? class_name2dev+0x56/0xe0 [obdclass]
 [&amp;lt;ffffffffa092983c&amp;gt;] server_put_super+0x8bc/0xcd0 [obdclass]
 [&amp;lt;ffffffff81194aeb&amp;gt;] generic_shutdown_super+0x5b/0xe0
 [&amp;lt;ffffffff81194bd6&amp;gt;] kill_anon_super+0x16/0x60
 [&amp;lt;ffffffffa08fe596&amp;gt;] lustre_kill_super+0x36/0x60 [obdclass]
 [&amp;lt;ffffffff81195377&amp;gt;] deactivate_super+0x57/0x80
 [&amp;lt;ffffffff811b533f&amp;gt;] mntput_no_expire+0xbf/0x110
 [&amp;lt;ffffffff811b5e8b&amp;gt;] sys_umount+0x7b/0x3a0
 [&amp;lt;ffffffff8100b0d2&amp;gt;] system_call_fastpath+0x16/0x1b
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;And it seems there is an MDT handler thread (mdt_rename), which holds the remote lock on soaked-MDT0000-osp-MDT0002, but is then stuck on local lock enqueue, which then blocks the namespace cleanup of umount.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;mdt01_016     S 000000000000000a     0  7405      2 0x00000080
 ffff8804027ab900 0000000000000046 0000000000000000 ffffffff810a1c1c
 ffff880433fef520 ffff8804027ab880 00000a768c137fd5 0000000000000000
 ffff8804027ab8c0 0000000100ab043e ffff880433fefad8 ffff8804027abfd8
Call Trace:
 [&amp;lt;ffffffff810a1c1c&amp;gt;] ? remove_wait_queue+0x3c/0x50
 [&amp;lt;ffffffffa0ad54b0&amp;gt;] ? ldlm_expired_completion_wait+0x0/0x250 [ptlrpc]
 [&amp;lt;ffffffffa0ada07d&amp;gt;] ldlm_completion_ast+0x68d/0x9b0 [ptlrpc]
 [&amp;lt;ffffffff81067650&amp;gt;] ? default_wake_function+0x0/0x20
 [&amp;lt;ffffffffa0ad93fe&amp;gt;] ldlm_cli_enqueue_local+0x21e/0x810 [ptlrpc]
 [&amp;lt;ffffffffa0ad99f0&amp;gt;] ? ldlm_completion_ast+0x0/0x9b0 [ptlrpc]
 [&amp;lt;ffffffffa11fa770&amp;gt;] ? mdt_blocking_ast+0x0/0x2e0 [mdt]
 [&amp;lt;ffffffffa12074a4&amp;gt;] mdt_object_local_lock+0x3a4/0xb00 [mdt]
 [&amp;lt;ffffffffa11fa770&amp;gt;] ? mdt_blocking_ast+0x0/0x2e0 [mdt]
 [&amp;lt;ffffffffa0ad99f0&amp;gt;] ? ldlm_completion_ast+0x0/0x9b0 [ptlrpc]
 [&amp;lt;ffffffffa1208103&amp;gt;] mdt_object_lock_internal+0x63/0x320 [mdt]
 [&amp;lt;ffffffffa1218e9e&amp;gt;] ? mdt_lookup_version_check+0x9e/0x350 [mdt]
 [&amp;lt;ffffffffa1208580&amp;gt;] mdt_reint_object_lock+0x20/0x60 [mdt]
 [&amp;lt;ffffffffa121cba7&amp;gt;] mdt_reint_rename_or_migrate+0x1317/0x2690 [mdt]
 [&amp;lt;ffffffffa11fa770&amp;gt;] ? mdt_blocking_ast+0x0/0x2e0 [mdt]
 [&amp;lt;ffffffffa0ad99f0&amp;gt;] ? ldlm_completion_ast+0x0/0x9b0 [ptlrpc]
 [&amp;lt;ffffffffa09238c0&amp;gt;] ? lu_ucred+0x20/0x30 [obdclass]
 [&amp;lt;ffffffffa0b06b00&amp;gt;] ? lustre_pack_reply_v2+0xf0/0x280 [ptlrpc]
 [&amp;lt;ffffffffa121df53&amp;gt;] mdt_reint_rename+0x13/0x20 [mdt]
 [&amp;lt;ffffffffa121704d&amp;gt;] mdt_reint_rec+0x5d/0x200 [mdt]
 [&amp;lt;ffffffffa1201d5b&amp;gt;] mdt_reint_internal+0x62b/0xa50 [mdt]
 [&amp;lt;ffffffffa120262b&amp;gt;] mdt_reint+0x6b/0x120 [mdt]
 [&amp;lt;ffffffffa0b6b0cc&amp;gt;] tgt_request_handle+0x8ec/0x1440 [ptlrpc]
 [&amp;lt;ffffffffa0b17821&amp;gt;] ptlrpc_main+0xd31/0x1800 [ptlrpc]
 [&amp;lt;ffffffff81539b0e&amp;gt;] ? thread_return+0x4e/0x7d0
 [&amp;lt;ffffffffa0b16af0&amp;gt;] ? ptlrpc_main+0x0/0x1800 [ptlrpc]
 [&amp;lt;ffffffff810a138e&amp;gt;] kthread+0x9e/0xc0
 [&amp;lt;ffffffff8100c28a&amp;gt;] child_rip+0xa/0x20
 [&amp;lt;ffffffff810a12f0&amp;gt;] ? kthread+0x0/0xc0
 [&amp;lt;ffffffff8100c280&amp;gt;] ? child_rip+0x0/0x20
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="40821">LU-8736</key>
            <summary>stuck during umount in soak-test</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="3" iconUrl="https://jira.whamcloud.com/images/icons/statuses/inprogress.png" description="This issue is being actively worked on at the moment by the assignee.">In Progress</status>
                    <statusCategory id="4" key="indeterminate" colorName="inprogress"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="laisiyao">Lai Siyao</assignee>
                                    <reporter username="di.wang">Di Wang</reporter>
                        <labels>
                            <label>soak</label>
                    </labels>
                <created>Wed, 19 Oct 2016 18:01:27 +0000</created>
                <updated>Mon, 18 Dec 2017 18:26:47 +0000</updated>
                                            <version>Lustre 2.9.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="170480" author="jgmitter" created="Thu, 20 Oct 2016 17:31:26 +0000"  >&lt;p&gt;Cliff, can you file the above comment as a separate ticket?&lt;/p&gt;</comment>
                            <comment id="170481" author="jgmitter" created="Thu, 20 Oct 2016 17:33:21 +0000"  >&lt;p&gt;Hi Lai,&lt;/p&gt;

&lt;p&gt;Can you please look at the first issue in this ticket?&lt;/p&gt;

&lt;p&gt;Thanks.&lt;br/&gt;
Joe&lt;/p&gt;</comment>
                            <comment id="170484" author="adilger" created="Thu, 20 Oct 2016 17:37:37 +0000"  >&lt;p&gt;Cliff, Di, &lt;br/&gt;
For the initial problem, was MDT0002 already stopped or in the middle of unmounting at the time that MDT0000 got stuck?  If it was up, were there any errors on the MDT0000 or MDT0002 console that indicate RPC timeouts or other errors? Could you please attach the MDS console logs from this time period. &lt;/p&gt;

&lt;p&gt;The other possibility is if this is a circular locking deadlock, which would need stack traces on both MDS nodes to see if there is another thread also stuck waiting for a lock. &lt;/p&gt;</comment>
                            <comment id="170490" author="cliffw" created="Thu, 20 Oct 2016 17:42:40 +0000"  >&lt;p&gt;For the initial problem, no -  MDT0002 is mounted on lola-10, and was not involved in the failover at all. It was neither stopped nor umounted&lt;/p&gt;</comment>
                            <comment id="170499" author="bogl" created="Thu, 20 Oct 2016 18:03:55 +0000"  >&lt;p&gt;console logs covering the time period&lt;/p&gt;</comment>
                            <comment id="170512" author="di.wang" created="Thu, 20 Oct 2016 18:47:55 +0000"  >&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;The other possibility is if this is a circular locking deadlock, which would need stack traces on both MDS nodes to see if there is another thread also stuck waiting for a lock.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Yes, it looks like a circular locking deadlock here; see the mdt_reint_rename trace (on MDT2) I posted above. After the rename process on MDT2 got the remote lock (lock_A), it was then blocked in local lock (lock_B) enqueue, which is probably held by MDT0. Then, in the meantime, umount happened, so MDT0 cannot return the lock to MDT2 successfully because of the umount on MDT2 (Can it? That is my guess, but it needs some investigation). Then we saw umount hanging there during namespace cleanup because lock_A is still held by the rename process.&lt;/p&gt;

&lt;p&gt;So it looks like we need to re-order the umount process to make sure all of the remote locks have been released before namespace cleanup. Need to think about it a bit.&lt;/p&gt;</comment>
                            <comment id="192720" author="adilger" created="Wed, 19 Apr 2017 17:07:43 +0000"  >&lt;p&gt;Di, any further ideas on this?&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="23718" name="lola-10.log" size="1459277" author="bogl" created="Thu, 20 Oct 2016 18:03:55 +0000"/>
                            <attachment id="23717" name="lola-8.log" size="4511720" author="bogl" created="Thu, 20 Oct 2016 18:03:54 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzysjz:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>