<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:20:12 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-8746] lustre umount hangs in distribute_txn_fini</title>
                <link>https://jira.whamcloud.com/browse/LU-8746</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;on our DNE testbed, umount hangs with the following call stack:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2016-10-20 15:07:39 [14285.581098] INFO: task umount:182090 blocked for more than 120 seconds.
2016-10-20 15:07:39 [14285.589651] &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
2016-10-20 15:07:39 [14285.599447] umount          D ffff887918187c20     0 182090 182089 0x00000080
2016-10-20 15:07:39 [14285.608492]  ffff887918187a58 0000000000000086 ffff887918187fd8 ffff887f6e073ec0
2016-10-20 15:07:39 [14285.617837]  ffff887918187fd8 ffff887918187fd8 ffff887f6e073ec0 ffff887efb6fb0b0
2016-10-20 15:07:39 [14285.627161]  ffff887ec3f50240 ffff887efb6fb100 ffff887918187c20 ffff887918187c20
2016-10-20 15:07:39 [14285.636476] Call Trace:
2016-10-20 15:07:39 [14285.640213]  [&amp;lt;ffffffff8169f039&amp;gt;] schedule+0x29/0x70
2016-10-20 15:07:39 [14285.646853]  [&amp;lt;ffffffffa0c679bd&amp;gt;] distribute_txn_fini+0xcd/0x130 [ptlrpc]
2016-10-20 15:07:39 [14285.655433]  [&amp;lt;ffffffff810b4520&amp;gt;] ? wake_up_atomic_t+0x30/0x30
2016-10-20 15:07:39 [14285.662928]  [&amp;lt;ffffffffa0f953ee&amp;gt;] lod_process_config+0x93e/0x15c0 [lod]
2016-10-20 15:07:39 [14285.671314]  [&amp;lt;ffffffffa0e170a8&amp;gt;] ? lfsck_stop+0x1b8/0x4f0 [lfsck]
2016-10-20 15:07:39 [14285.679182]  [&amp;lt;ffffffff811e5a63&amp;gt;] ? __kmalloc+0x233/0x280
2016-10-20 15:07:39 [14285.686155]  [&amp;lt;ffffffffa0ff6ee2&amp;gt;] mdd_process_config+0x82/0x5c0 [mdd]
2016-10-20 15:07:39 [14285.694330]  [&amp;lt;ffffffffa0eb91d6&amp;gt;] mdt_device_fini+0x1c6/0xfc0 [mdt]
2016-10-20 15:07:39 [14285.702290]  [&amp;lt;ffffffffa099821c&amp;gt;] class_cleanup+0x8dc/0xd70 [obdclass]
2016-10-20 15:07:39 [14285.710504]  [&amp;lt;ffffffffa099abfc&amp;gt;] class_process_config+0x1e2c/0x2f70 [obdclass]
2016-10-20 15:07:39 [14285.719555]  [&amp;lt;ffffffff811e5a63&amp;gt;] ? __kmalloc+0x233/0x280
2016-10-20 15:07:39 [14285.726494]  [&amp;lt;ffffffffa099411b&amp;gt;] ? lustre_cfg_new+0x8b/0x400 [obdclass]
2016-10-20 15:07:39 [14285.734866]  [&amp;lt;ffffffffa099be2f&amp;gt;] class_manual_cleanup+0xef/0x810 [obdclass]
2016-10-20 15:07:39 [14285.743601]  [&amp;lt;ffffffffa09cde8e&amp;gt;] server_put_super+0x8de/0xcd0 [obdclass]
2016-10-20 15:07:39 [14285.752002]  [&amp;lt;ffffffff81209572&amp;gt;] generic_shutdown_super+0x72/0xf0
2016-10-20 15:07:39 [14285.759721]  [&amp;lt;ffffffff81209942&amp;gt;] kill_anon_super+0x12/0x20
2016-10-20 15:07:39 [14285.766748]  [&amp;lt;ffffffffa099f592&amp;gt;] lustre_kill_super+0x32/0x50 [obdclass]
2016-10-20 15:07:39 [14285.775033]  [&amp;lt;ffffffff81209cf9&amp;gt;] deactivate_locked_super+0x49/0x60
2016-10-20 15:07:39 [14285.782803]  [&amp;lt;ffffffff8120a2f6&amp;gt;] deactivate_super+0x46/0x60
2016-10-20 15:07:39 [14285.789884]  [&amp;lt;ffffffff812282c5&amp;gt;] mntput_no_expire+0xc5/0x120
2016-10-20 15:07:39 [14285.797046]  [&amp;lt;ffffffff81229440&amp;gt;] SyS_umount+0xa0/0x3b0
2016-10-20 15:07:39 [14285.803615]  [&amp;lt;ffffffff816aa4c9&amp;gt;] system_call_fastpath+0x16/0x1b
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="40947">LU-8746</key>
            <summary>lustre umount hangs in distribute_txn_fini</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="laisiyao">Lai Siyao</assignee>
                                    <reporter username="dinatale2">Giuseppe Di Natale</reporter>
                        <labels>
                            <label>llnl</label>
                    </labels>
                <created>Fri, 21 Oct 2016 00:40:44 +0000</created>
                <updated>Tue, 9 May 2017 04:20:18 +0000</updated>
                            <resolved>Tue, 9 May 2017 04:20:18 +0000</resolved>
                                    <version>Lustre 2.8.0</version>
                    <version>Lustre 2.10.0</version>
                                    <fixVersion>Lustre 2.10.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="170622" author="jgmitter" created="Fri, 21 Oct 2016 17:36:34 +0000"  >&lt;p&gt;Hi Lai,&lt;/p&gt;

&lt;p&gt;Can you please investigate this issue?&lt;/p&gt;

&lt;p&gt;Thanks.&lt;br/&gt;
Joe&lt;/p&gt;</comment>
                            <comment id="170625" author="adilger" created="Fri, 21 Oct 2016 17:42:32 +0000"  >&lt;p&gt;Are the other MDS nodes being stopped at this time also?  It looks like this MDS is waiting on finishing up some operation with another MDS when it is being shut down, and if that MDS is already stopped it won&apos;t get the answer it is waiting for any time soon. &lt;/p&gt;

&lt;p&gt;It might make sense to add checks in the places where the MDS is waiting on other MDS RPCs to see if it is in the process of unmounting, and stop waiting on it?&lt;/p&gt;</comment>
                            <comment id="170629" author="di.wang" created="Fri, 21 Oct 2016 17:58:10 +0000"  >&lt;p&gt;Hmm, distribute_txn_fini() is waiting for the tdtd thread to stop here, which is used to tracking distribute transaction committed status, canceling committed logs, etc.  So that thread must be stuck somewhere. Could you please post the stack trace for that thread as well? (named as tdtd-xx). Thanks&lt;/p&gt;</comment>
                            <comment id="170647" author="dinatale2" created="Fri, 21 Oct 2016 20:15:53 +0000"  >&lt;p&gt;I&apos;m going to have to try and reproduce this issue, at which point I could try and get the call stacks you need.&lt;/p&gt;

&lt;p&gt;Also, Andreas, looking through the logs, it does in fact appear that two MDSs were trying to unmount their targets.&lt;/p&gt;</comment>
                            <comment id="193678" author="adilger" created="Wed, 26 Apr 2017 21:01:48 +0000"  >&lt;p&gt;I hit this same hang during cleanup with v2.9.55_78.  It seems like the threads aren&apos;t doing anything, just waiting for the refcount to drop to zero:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;kernel: umount          D ffff8800251afc30     0 18172  18171 0x00000080
kernel: Call Trace:
kernel: schedule+0x29/0x70
kernel: distribute_txn_fini+0xcd/0x130 [ptlrpc]
kernel: lod_process_config+0x98f/0x15f0 [lod]
kernel: mdd_process_config+0x88/0x600 [mdd]
kernel: mdt_device_fini+0x1c6/0x10b0 [mdt]
kernel: class_cleanup+0x7f4/0xd80 [obdclass]
kernel: class_process_config+0x1f84/0x2c30 [obdclass]
kernel: class_manual_cleanup+0xef/0x810 [obdclass]
kernel: server_put_super+0x8de/0xcd0 [obdclass]
kernel: generic_shutdown_super+0x56/0xe0
kernel: kill_anon_super+0x12/0x20
kernel: lustre_kill_super+0x32/0x50 [obdclass]
kernel: deactivate_locked_super+0x49/0x60
kernel: deactivate_super+0x46/0x60
kernel: mntput_no_expire+0xc5/0x120
kernel: SyS_umount+0x9f/0x3c0

(gdb) list *distribute_txn_fini+0xcd
0xcb4dd is in distribute_txn_fini
(lustre/target/update_trans.c:1744).
1739
1740            spin_lock(&amp;amp;tdtd-&amp;gt;tdtd_batchid_lock);
1741            lut-&amp;gt;lut_tdtd_commit_thread.t_flags = SVC_STOPPING;
1742            spin_unlock(&amp;amp;tdtd-&amp;gt;tdtd_batchid_lock);
1743            wake_up(&amp;amp;tdtd-&amp;gt;tdtd_commit_thread_waitq);
1744            wait_event(lut-&amp;gt;lut_tdtd_commit_thread.t_ctl_waitq,
1745                       lut-&amp;gt;lut_tdtd_commit_thread.t_flags &amp;amp; SVC_STOPPED);
1746
1747            dtrq_list_destroy(tdtd);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;(minor note, it would be good to give these threads a better name than &quot;&lt;tt&gt;tdtd&lt;/tt&gt;&quot;, like &quot;&lt;tt&gt;tgt_txn&lt;/tt&gt;&quot; or &quot;&lt;tt&gt;dist_txn&lt;/tt&gt;&quot; or similar.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;kernel: tdtd-0          S ffff8800376dd080     0 17234      2 0x00000080
kernel: Call Trace:
kernel: schedule+0x29/0x70
kernel: distribute_txn_commit_thread+0x110c/0x1410 [ptlrpc]
kernel: kthread+0xcf/0xe0

(gdb) list *distribute_txn_commit_thread+0x110c
0xcd621 is in distribute_txn_commit_thread (lustre/target/update_trans.c:1644).
1641         };
1642 
1643         l_wait_event(tdtd-&amp;gt;tdtd_commit_thread_waitq,
1644                      atomic_read(&amp;amp;tdtd-&amp;gt;tdtd_refcount) == 0, &amp;amp;lwi);
1645 
1646         spin_lock(&amp;amp;tdtd-&amp;gt;tdtd_batchid_lock);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="193681" author="di.wang" created="Wed, 26 Apr 2017 21:37:32 +0000"  >&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;1643         l_wait_event(tdtd-&amp;gt;tdtd_commit_thread_waitq,
1644                      atomic_read(&amp;amp;tdtd-&amp;gt;tdtd_refcount) == 0, &amp;amp;lwi);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;tdtd_refcount != 0 means there are some inflight transactions to wait for batchid updating. The transaction callback should decrease the refcount. Somehow the transaction seems cancelled, but refcount is not decreased. Hmm, it seems tdtd_refcount is not restored in some error handler cases in distribute_txn_commit_batchid_update().  I will cook a patch.&lt;/p&gt;</comment>
                            <comment id="193930" author="gerrit" created="Fri, 28 Apr 2017 17:46:06 +0000"  >&lt;p&gt;wangdi (di.wang@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/26888&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/26888&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8746&quot; title=&quot;lustre umount hangs in distribute_txn_fini&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8746&quot;&gt;&lt;del&gt;LU-8746&lt;/del&gt;&lt;/a&gt; update: restore tdtd_refcount during failure&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 5a74baab7cf18a42090f9b348537f0cd5ae173ad&lt;/p&gt;</comment>
                            <comment id="195027" author="gerrit" created="Tue, 9 May 2017 03:47:18 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/26888/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/26888/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8746&quot; title=&quot;lustre umount hangs in distribute_txn_fini&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8746&quot;&gt;&lt;del&gt;LU-8746&lt;/del&gt;&lt;/a&gt; update: restore tdtd_refcount during failure&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 92bbd06e4ddd2505fceaada9cda4bb0974733204&lt;/p&gt;</comment>
                            <comment id="195047" author="pjones" created="Tue, 9 May 2017 04:20:18 +0000"  >&lt;p&gt;Landed for 2.10&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzyszz:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>