<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:53:32 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-12544] mds marked unhealthy after txg_quiesce thread hanging</title>
                <link>https://jira.whamcloud.com/browse/LU-12544</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;While waiting for a resolution to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12510&quot; title=&quot;mds server hangs cv_wait_common&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12510&quot;&gt;&lt;del&gt;LU-12510&lt;/del&gt;&lt;/a&gt;, we rolled back to our previous production image which is at lustre 2.11.0 and using zfs 0.7.13. We had run into issues, but it was deemed more stable than what we were seeing. Since then, we have repeatedly been hitting this issue which is causing our MDS hosts to get marked unhealthy by lustre.&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
[12692.736688] INFO: task txg_quiesce:37482 blocked &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; more than 120 seconds.
[12692.744369] &lt;span class=&quot;code-quote&quot;&gt;&quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot;&lt;/span&gt; disables &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; message.
[12692.752901] txg_quiesce     D ffff8afcb464c100     0 37482      2 0x00000000
[12692.760699] Call Trace:
[12692.763852]  [&amp;lt;ffffffff84b67c49&amp;gt;] schedule+0x29/0x70
[12692.769527]  [&amp;lt;ffffffffc0a102d5&amp;gt;] cv_wait_common+0x125/0x150 [spl]
[12692.776399]  [&amp;lt;ffffffff844c2d40&amp;gt;] ? wake_up_atomic_t+0x30/0x30
[12692.782916]  [&amp;lt;ffffffffc0a10315&amp;gt;] __cv_wait+0x15/0x20 [spl]
[12692.789191]  [&amp;lt;ffffffffc129fc6b&amp;gt;] txg_quiesce_thread+0x2fb/0x410 [zfs]
[12692.796401]  [&amp;lt;ffffffffc129f970&amp;gt;] ? txg_init+0x2b0/0x2b0 [zfs]
[12692.802893]  [&amp;lt;ffffffffc0a0b063&amp;gt;] thread_generic_wrapper+0x73/0x80 [spl]
[12692.810243]  [&amp;lt;ffffffffc0a0aff0&amp;gt;] ? __thread_exit+0x20/0x20 [spl]
[12692.816976]  [&amp;lt;ffffffff844c1c71&amp;gt;] kthread+0xd1/0xe0
[12692.822482]  [&amp;lt;ffffffff844c1ba0&amp;gt;] ? insert_kthread_work+0x40/0x40
[12692.829195]  [&amp;lt;ffffffff84b74c1d&amp;gt;] ret_from_fork_nospec_begin+0x7/0x21
[12692.836264]  [&amp;lt;ffffffff844c1ba0&amp;gt;] ? insert_kthread_work+0x40/0x40
[12692.842995] INFO: task mdt01_001:38593 blocked &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; more than 120 seconds.
[12692.850408] &lt;span class=&quot;code-quote&quot;&gt;&quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot;&lt;/span&gt; disables &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; message.
[12692.858865] mdt01_001       D ffff8afc5f579040     0 38593      2 0x00000000
[12692.866600] Call Trace:
[12692.869719]  [&amp;lt;ffffffffc19d4a19&amp;gt;] ? lod_sub_declare_xattr_set+0xf9/0x300 [lod]
[12692.877592]  [&amp;lt;ffffffff84b67c49&amp;gt;] schedule+0x29/0x70
[12692.883203]  [&amp;lt;ffffffffc0a102d5&amp;gt;] cv_wait_common+0x125/0x150 [spl]
[12692.890015]  [&amp;lt;ffffffff844c2d40&amp;gt;] ? wake_up_atomic_t+0x30/0x30
[12692.896475]  [&amp;lt;ffffffffc0a10315&amp;gt;] __cv_wait+0x15/0x20 [spl]
[12692.902677]  [&amp;lt;ffffffffc1254c33&amp;gt;] dmu_tx_wait+0x213/0x3c0 [zfs]
[12692.909203]  [&amp;lt;ffffffffc1254e72&amp;gt;] dmu_tx_assign+0x92/0x490 [zfs]
[12692.915805]  [&amp;lt;ffffffffc0d35f57&amp;gt;] osd_trans_start+0xa7/0x3c0 [osd_zfs]
[12692.922968]  [&amp;lt;ffffffffc1841fa2&amp;gt;] top_trans_start+0x702/0x940 [ptlrpc]
[12692.930062]  [&amp;lt;ffffffffc1a2b173&amp;gt;] ? mdd_declare_create+0x5a3/0xdb0 [mdd]
[12692.937324]  [&amp;lt;ffffffffc199a3f1&amp;gt;] lod_trans_start+0x31/0x40 [lod]
[12692.943964]  [&amp;lt;ffffffffc1a4980a&amp;gt;] mdd_trans_start+0x1a/0x20 [mdd]
[12692.950591]  [&amp;lt;ffffffffc1a2f507&amp;gt;] mdd_create+0xb77/0x13a0 [mdd]
[12692.957049]  [&amp;lt;ffffffffc11146c8&amp;gt;] mdt_reint_open+0x2218/0x3270 [mdt]
[12692.963948]  [&amp;lt;ffffffffc15f4241&amp;gt;] ? upcall_cache_get_entry+0x211/0x8d0 [obdclass]
[12692.971950]  [&amp;lt;ffffffffc1108883&amp;gt;] mdt_reint_rec+0x83/0x210 [mdt]
[12692.978467]  [&amp;lt;ffffffffc10e81ab&amp;gt;] mdt_reint_internal+0x5fb/0x9c0 [mdt]
[12692.985486]  [&amp;lt;ffffffffc10f4737&amp;gt;] mdt_intent_reint+0x157/0x420 [mdt]
[12692.992317]  [&amp;lt;ffffffffc10eb315&amp;gt;] mdt_intent_opc+0x455/0xae0 [mdt]
[12692.999000]  [&amp;lt;ffffffffc17ca710&amp;gt;] ? lustre_swab_ldlm_policy_data+0x30/0x30 [ptlrpc]
[12693.007136]  [&amp;lt;ffffffffc10f2f63&amp;gt;] mdt_intent_policy+0x1a3/0x360 [mdt]
[12693.014050]  [&amp;lt;ffffffffc177a235&amp;gt;] ldlm_lock_enqueue+0x385/0x8f0 [ptlrpc]
[12693.021223]  [&amp;lt;ffffffffc17a2913&amp;gt;] ldlm_handle_enqueue0+0x8f3/0x13e0 [ptlrpc]
[12693.028726]  [&amp;lt;ffffffffc17ca790&amp;gt;] ? lustre_swab_ldlm_lock_desc+0x30/0x30 [ptlrpc]
[12693.036662]  [&amp;lt;ffffffffc1828bf2&amp;gt;] tgt_enqueue+0x62/0x210 [ptlrpc]
[12693.043205]  [&amp;lt;ffffffffc182f05a&amp;gt;] tgt_request_handle+0x92a/0x13b0 [ptlrpc]
[12693.050507]  [&amp;lt;ffffffffc17d4843&amp;gt;] ptlrpc_server_handle_request+0x253/0xab0 [ptlrpc]
[12693.058580]  [&amp;lt;ffffffffc17d16f8&amp;gt;] ? ptlrpc_wait_event+0x98/0x340 [ptlrpc]
[12693.065766]  [&amp;lt;ffffffff844d6802&amp;gt;] ? default_wake_function+0x12/0x20
[12693.072431]  [&amp;lt;ffffffff844cbadb&amp;gt;] ? __wake_up_common+0x5b/0x90
[12693.078674]  [&amp;lt;ffffffffc17d7ff2&amp;gt;] ptlrpc_main+0xa92/0x1e40 [ptlrpc]
[12693.085351]  [&amp;lt;ffffffffc17d7560&amp;gt;] ? ptlrpc_register_service+0xe90/0xe90 [ptlrpc]
[12693.093130]  [&amp;lt;ffffffff844c1c71&amp;gt;] kthread+0xd1/0xe0
[12693.098403]  [&amp;lt;ffffffff844c1ba0&amp;gt;] ? insert_kthread_work+0x40/0x40
[12693.104890]  [&amp;lt;ffffffff84b74c1d&amp;gt;] ret_from_fork_nospec_begin+0x7/0x21
[12693.111718]  [&amp;lt;ffffffff844c1ba0&amp;gt;] ? insert_kthread_work+0x40/0x40
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I wasn&apos;t sure if this is related to the same issue we were seeing with 2.12.2 and zfs 0.8.1. The dnodestats did not look like they were getting backed up on lock retries though.&lt;/p&gt;</description>
                <environment>RHEL7.6, Lustre 2.11.0, ZFS 0.7.13</environment>
        <key id="56402">LU-12544</key>
            <summary>mds marked unhealthy after txg_quiesce thread hanging</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="6">Not a Bug</resolution>
                                        <assignee username="bzzz">Alex Zhuravlev</assignee>
                                    <reporter username="curtispb">Philip B Curtis</reporter>
                        <labels>
                            <label>ORNL</label>
                    </labels>
                <created>Sun, 14 Jul 2019 02:25:44 +0000</created>
                <updated>Thu, 22 Aug 2019 13:58:12 +0000</updated>
                            <resolved>Thu, 22 Aug 2019 13:58:12 +0000</resolved>
                                    <version>Lustre 2.11.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>7</watches>
                                                                            <comments>
                            <comment id="251348" author="curtispb" created="Sun, 14 Jul 2019 03:17:43 +0000"  >&lt;p&gt;Attaching the dmesg output from the host before I ran a crashdump as well as a copy of the zfs kstats.&lt;/p&gt;</comment>
                            <comment id="251356" author="pjones" created="Sun, 14 Jul 2019 14:25:22 +0000"  >&lt;p&gt;Alex&lt;/p&gt;

&lt;p&gt;Any advice here?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="251386" author="bzzz" created="Mon, 15 Jul 2019 10:40:23 +0000"  >&lt;p&gt;it would be great to see all backtraces for the case. it looks like one (few) OSTs were down?&lt;br/&gt;
also, the trace in the description can&apos;t be found in the logs you attached. they look like different cases?&lt;/p&gt;</comment>
                            <comment id="251389" author="curtispb" created="Mon, 15 Jul 2019 11:15:07 +0000"  >&lt;p&gt;We hit this issue again, just now. I am attaching the new data which is hopefully more useful than what I previously attached.&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/33209/33209_f2-mds2_lustre_unhealthy_20190715.tgz&quot; title=&quot;f2-mds2_lustre_unhealthy_20190715.tgz attached to LU-12544&quot;&gt;f2-mds2_lustre_unhealthy_20190715.tgz&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;&lt;/p&gt;</comment>
                            <comment id="251395" author="ezell" created="Mon, 15 Jul 2019 12:46:38 +0000"  >&lt;p&gt;Is this something that &lt;a href=&quot;https://github.com/zfsonlinux/zfs/issues/8426&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/zfsonlinux/zfs/issues/8426&lt;/a&gt; might help with?&lt;/p&gt;</comment>
                            <comment id="251396" author="simmonsja" created="Mon, 15 Jul 2019 13:03:08 +0000"  >&lt;p&gt;I patched our 0.7.13 ZFS version with the patch that 8426 referenced too.&lt;/p&gt;</comment>
                            <comment id="253415" author="pjones" created="Wed, 21 Aug 2019 22:00:21 +0000"  >&lt;p&gt;@James is it too early to tell whether the ZFS patch helped?&lt;/p&gt;</comment>
                            <comment id="253420" author="simmonsja" created="Wed, 21 Aug 2019 23:32:57 +0000"  >&lt;p&gt;We have since moved to ZFS 0.8.1, which has fewer problems. Matt, are you okay with closing this ticket?&lt;/p&gt;</comment>
                            <comment id="253434" author="ezell" created="Thu, 22 Aug 2019 13:56:23 +0000"  >&lt;p&gt;Yes, I think it&apos;s fine to close this one.&lt;/p&gt;</comment>
                            <comment id="253435" author="pjones" created="Thu, 22 Aug 2019 13:58:12 +0000"  >&lt;p&gt;ok thanks!&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                            <outwardlinks description="duplicates">
                                        <issuelink>
            <issuekey id="56275">LU-12510</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="33207" name="f2-mds2_lustre_unhealthy_20190713.tgz" size="19432979" author="curtispb" created="Sun, 14 Jul 2019 03:16:17 +0000"/>
                            <attachment id="33209" name="f2-mds2_lustre_unhealthy_20190715.tgz" size="17242019" author="curtispb" created="Mon, 15 Jul 2019 11:15:06 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00jmn:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10021"><![CDATA[2]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>