<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:45:54 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-4794] MDS threads all stuck in jbd2_journal_start</title>
                <link>https://jira.whamcloud.com/browse/LU-4794</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This seems to be a duplicate of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1276&quot; title=&quot;MDS threads all stuck in jbd2_journal_start&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1276&quot;&gt;&lt;del&gt;LU-1276&lt;/del&gt;&lt;/a&gt;, which was closed with &quot;Cannot Reproduce&quot;, and in which I initially added the following note.&lt;/p&gt;

&lt;p&gt;One of the Bull customers (TGCC) had the same deadlock as described in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1276&quot; title=&quot;MDS threads all stuck in jbd2_journal_start&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1276&quot;&gt;&lt;del&gt;LU-1276&lt;/del&gt;&lt;/a&gt; twice during the past six months: one thread is stuck in jbd2_journal_commit_transaction() and many other threads are stuck in jbd2_journal_start().&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;PID: 29225 TASK: ffff88107c3bb040 CPU: 15 COMMAND: &quot;jbd2/dm-2-8&quot;
 #0 [ffff88107a343c60] schedule at ffffffff81485765
 #1 [ffff88107a343d28] jbd2_journal_commit_transaction at ffffffffa006a94f [jbd2]
 #2 [ffff88107a343e68] kjournald2 at ffffffffa0070c08 [jbd2]
 #3 [ffff88107a343ee8] kthread at ffffffff8107b5f6
 #4 [ffff88107a343f48] kernel_thread at ffffffff8100412a
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;and most of the threads:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;PID: 15585 TASK: ffff88063062a790 CPU: 0 COMMAND: &quot;mdt_503&quot;
PID: 15586 TASK: ffff88063062a040 CPU: 23 COMMAND: &quot;mdt_504&quot;
PID: 15587 TASK: ffff88020f3ad7d0 CPU: 30 COMMAND: &quot;mdt_505&quot;
PID: 29286 TASK: ffff88087505e790 CPU: 25 COMMAND: &quot;mdt_01&quot;
...
#0 [ffff881949c078f0] schedule at ffffffff81485765
#1 [ffff881949c079b8] start_this_handle at ffffffffa006908a [jbd2]
#2 [ffff881949c07a78] jbd2_journal_start at ffffffffa0069500 [jbd2]
#3 [ffff881949c07ac8] ldiskfs_journal_start_sb at ffffffffa0451ca8 [ldiskfs]
#4 [ffff881949c07ad8] osd_trans_start at ffffffffa0d4a324 [osd_ldiskfs]
#5 [ffff881949c07b18] mdd_trans_start at ffffffffa0c4c4e3 [mdd]
#6 [ffff881949c07b38] mdd_unlink at ffffffffa0c401eb [mdd]
#7 [ffff881949c07bf8] cml_unlink at ffffffffa0d82e07 [cmm]
#8 [ffff881949c07c38] mdt_reint_unlink at ffffffffa0cba0f4 [mdt]
#9 [ffff881949c07cb8] mdt_reint_rec at ffffffffa0cb7cb1 [mdt]
#10 [ffff881949c07cd8] mdt_reint_internal at ffffffffa0caeed4 [mdt]
#11 [ffff881949c07d28] mdt_reint at ffffffffa0caf2b4 [mdt]
#12 [ffff881949c07d48] mdt_handle_common at ffffffffa0ca3762 [mdt]
#13 [ffff881949c07d98] mdt_regular_handle at ffffffffa0ca4655 [mdt]
#14 [ffff881949c07da8] ptlrpc_main at ffffffffa071f4f6 [ptlrpc]
#15 [ffff881949c07f48] kernel_thread at ffffffff8100412a
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;They are running lustre 2.1.6 which contains &lt;a href=&quot;http://review.whamcloud.com/4743&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/4743&lt;/a&gt; from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1648&quot; title=&quot;MDS Crash&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1648&quot;&gt;&lt;del&gt;LU-1648&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;I attach two files containing the dmesg output and the crash backtrace of all threads.&lt;/p&gt;
</description>
                <environment></environment>
        <key id="23814">LU-4794</key>
            <summary>MDS threads all stuck in jbd2_journal_start</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="5">Cannot Reproduce</resolution>
                                        <assignee username="bobijam">Zhenyu Xu</assignee>
                                    <reporter username="patrick.valentin">Patrick Valentin</reporter>
                        <labels>
                    </labels>
                <created>Thu, 20 Mar 2014 18:22:07 +0000</created>
                <updated>Tue, 14 Dec 2021 22:47:38 +0000</updated>
                            <resolved>Tue, 14 Dec 2021 22:47:38 +0000</resolved>
                                    <version>Lustre 2.1.6</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>8</watches>
                                                                            <comments>
                            <comment id="79931" author="pjones" created="Thu, 20 Mar 2014 21:05:26 +0000"  >&lt;p&gt;Bobijam&lt;/p&gt;

&lt;p&gt;Could you please advise?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="82133" author="bobijam" created="Tue, 22 Apr 2014 13:32:44 +0000"  >&lt;p&gt;Relates to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1648&quot; title=&quot;MDS Crash&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1648&quot;&gt;&lt;del&gt;LU-1648&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;Thread 27469 stack trace from bt-all.merged.txt&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;PID: 27469  TASK: ffff88199d25a080  CPU: 12  COMMAND: &quot;ldlm_cn_88&quot;
 #0 [ffff881c4f78b490] schedule at ffffffff81485765
 #1 [ffff881c4f78b558] start_this_handle at ffffffffa006908a [jbd2]
 #2 [ffff881c4f78b618] jbd2_journal_restart at ffffffffa00693d1 [jbd2]
 #3 [ffff881c4f78b668] ldiskfs_truncate_restart_trans at ffffffffa042791a [ldiskfs]
 #4 [ffff881c4f78b698] ldiskfs_clear_blocks at ffffffffa042cc3d [ldiskfs]
 #5 [ffff881c4f78b6f8] ldiskfs_free_data at ffffffffa042ce24 [ldiskfs]
 #6 [ffff881c4f78b758] ldiskfs_free_branches at ffffffffa042d063 [ldiskfs]
 #7 [ffff881c4f78b7b8] ldiskfs_free_branches at ffffffffa042cf56 [ldiskfs]
 #8 [ffff881c4f78b818] ldiskfs_truncate at ffffffffa042d659 [ldiskfs]
 #9 [ffff881c4f78b938] ldiskfs_delete_inode at ffffffffa042e9d0 [ldiskfs]
#10 [ffff881c4f78b958] generic_delete_inode at ffffffff8117f0de
#11 [ffff881c4f78b988] generic_drop_inode at ffffffff8117f235
#12 [ffff881c4f78b9a8] iput at ffffffff8117df52
#13 [ffff881c4f78b9c8] mds_obd_destroy at ffffffffa0bf717d [mds]
#14 [ffff881c4f78bb08] llog_lvfs_destroy at ffffffffa05705cd [obdclass]
#15 [ffff881c4f78bbd8] llog_cancel_rec at ffffffffa0566424 [obdclass]
#16 [ffff881c4f78bc08] llog_cat_cancel_records at ffffffffa056a3a1 [obdclass]
#17 [ffff881c4f78bc68] llog_origin_handle_cancel at ffffffffa072923b [ptlrpc]
#18 [ffff881c4f78bd68] ldlm_cancel_handler at ffffffffa06ee8ff [ptlrpc]
#19 [ffff881c4f78bda8] ptlrpc_main at ffffffffa071f4f6 [ptlrpc]
#20 [ffff881c4f78bf48] kernel_thread at ffffffff8100412a
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The transaction credits are not enough, and the thread restarts the transaction while holding log_handle::lgh_lock.&lt;/p&gt;</comment>
                            <comment id="82136" author="bobijam" created="Tue, 22 Apr 2014 14:06:21 +0000"  >&lt;p&gt;the competing deadlock thread 15514&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;PID: 15514  TASK: ffff88199fe40850  CPU: 16  COMMAND: &quot;mdt_432&quot;
 #0 [ffff881cc6f1b548] schedule at ffffffff81485765
 #1 [ffff881cc6f1b610] rwsem_down_failed_common at ffffffff81487d65
 #2 [ffff881cc6f1b670] rwsem_down_read_failed at ffffffff81487f16
 #3 [ffff881cc6f1b6b0] call_rwsem_down_read_failed at ffffffff81262b24
 #4 [ffff881cc6f1b718] llog_cat_current_log.clone.0 at ffffffffa056ada5 [obdclass]
 #5 [ffff881cc6f1b7b8] llog_cat_add_rec at ffffffffa056baca [obdclass]
 #6 [ffff881cc6f1b808] llog_obd_origin_add at ffffffffa0571627 [obdclass]
 #7 [ffff881cc6f1b838] llog_add at ffffffffa0571801 [obdclass]
 #8 [ffff881cc6f1b888] lov_llog_origin_add at ffffffffa09f70fc [lov]
 #9 [ffff881cc6f1b908] llog_add at ffffffffa0571801 [obdclass]
#10 [ffff881cc6f1b958] mds_llog_origin_add at ffffffffa0bf8d53 [mds]
#11 [ffff881cc6f1b9a8] llog_add at ffffffffa0571801 [obdclass]
#12 [ffff881cc6f1b9f8] mds_llog_add_unlink at ffffffffa0bf93ca [mds]
#13 [ffff881cc6f1ba48] mds_log_op_unlink at ffffffffa0bf9a08 [mds]
#14 [ffff881cc6f1baa8] mdd_unlink_log at ffffffffa0c2df31 [mdd]
#15 [ffff881cc6f1bac8] mdd_object_kill at ffffffffa0c2526b [mdd]
#16 [ffff881cc6f1baf8] mdd_finish_unlink at ffffffffa0c3b13e [mdd]
#17 [ffff881cc6f1bb38] mdd_unlink at ffffffffa0c40696 [mdd]
#18 [ffff881cc6f1bbf8] cml_unlink at ffffffffa0d82e07 [cmm]
#19 [ffff881cc6f1bc38] mdt_reint_unlink at ffffffffa0cba0f4 [mdt]
#20 [ffff881cc6f1bcb8] mdt_reint_rec at ffffffffa0cb7cb1 [mdt]
#21 [ffff881cc6f1bcd8] mdt_reint_internal at ffffffffa0caeed4 [mdt]
#22 [ffff881cc6f1bd28] mdt_reint at ffffffffa0caf2b4 [mdt]
#23 [ffff881cc6f1bd48] mdt_handle_common at ffffffffa0ca3762 [mdt]
#24 [ffff881cc6f1bd98] mdt_regular_handle at ffffffffa0ca4655 [mdt]
#25 [ffff881cc6f1bda8] ptlrpc_main at ffffffffa071f4f6 [ptlrpc]
#26 [ffff881cc6f1bf48] kernel_thread at ffffffff8100412a
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="82355" author="bobijam" created="Thu, 24 Apr 2014 04:27:10 +0000"  >&lt;p&gt;patch tracking at &lt;a href=&quot;http://review.whamcloud.com/10076&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/10076&lt;/a&gt; (b2_1 needs it, b2_4 does not)&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="24688">LU-5052</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="15262">LU-1648</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="14516" name="bt-all.merged.txt" size="235103" author="lustre-bull" created="Thu, 20 Mar 2014 18:24:36 +0000"/>
                            <attachment id="14517" name="dmesg.txt" size="128208" author="lustre-bull" created="Thu, 20 Mar 2014 18:24:36 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10490" key="com.atlassian.jira.plugin.system.customfieldtypes:datepicker">
                        <customfieldname>End date</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>Fri, 18 Jul 2014 18:22:07 +0000</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                            <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzwi2f:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>13194</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                        <customfield id="customfield_10493" key="com.atlassian.jira.plugin.system.customfieldtypes:datepicker">
                        <customfieldname>Start date</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>Thu, 20 Mar 2014 18:22:07 +0000</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                    </customfields>
    </item>
</channel>
</rss>