<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:41:13 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary, append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-4271] mds load goes very high and filesystem hangs after mounting mdt</title>
                <link>https://jira.whamcloud.com/browse/LU-4271</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;After recovery of a crashed MDS, the system load goes to &amp;gt;800.&lt;br/&gt;
The filesystem is DOWN. We need help to bring the filesystem online!&lt;/p&gt;

&lt;p&gt;Here is the error:&lt;br/&gt;
Lustre: Skipped 2 previous similar messages&lt;br/&gt;
Lustre: Service thread pid 7014 was inactive for 200.00s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:&lt;br/&gt;
Pid: 7014, comm: mdt_01&lt;/p&gt;

&lt;p&gt;Call Trace:&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8151d552&amp;gt;&amp;#93;&lt;/span&gt; schedule_timeout+0x192/0x2e0&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8107bf80&amp;gt;&amp;#93;&lt;/span&gt; ? process_timeout+0x0/0x10&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa04e45e1&amp;gt;&amp;#93;&lt;/span&gt; cfs_waitq_timedwait+0x11/0x20 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0da2508&amp;gt;&amp;#93;&lt;/span&gt; osc_create+0x528/0xdc0 &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8105fab0&amp;gt;&amp;#93;&lt;/span&gt; ? default_wake_function+0x0/0x20&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e13337&amp;gt;&amp;#93;&lt;/span&gt; lov_check_and_create_object+0x187/0x570 &lt;span class=&quot;error&quot;&gt;&amp;#91;lov&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e13a1b&amp;gt;&amp;#93;&lt;/span&gt; qos_remedy_create+0x1db/0x220 &lt;span class=&quot;error&quot;&gt;&amp;#91;lov&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e1059a&amp;gt;&amp;#93;&lt;/span&gt; lov_fini_create_set+0x24a/0x1200 &lt;span class=&quot;error&quot;&gt;&amp;#91;lov&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0dfa0f2&amp;gt;&amp;#93;&lt;/span&gt; lov_create+0x792/0x1400 &lt;span class=&quot;error&quot;&gt;&amp;#91;lov&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa11000d6&amp;gt;&amp;#93;&lt;/span&gt; ? mdd_get_md+0x96/0x2f0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdd&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8105fab0&amp;gt;&amp;#93;&lt;/span&gt; ? default_wake_function+0x0/0x20&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa1120916&amp;gt;&amp;#93;&lt;/span&gt; ? mdd_read_unlock+0x26/0x30 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdd&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa110490e&amp;gt;&amp;#93;&lt;/span&gt; mdd_lov_create+0x9ee/0x1ba0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdd&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa1116871&amp;gt;&amp;#93;&lt;/span&gt; mdd_create+0xf81/0x1a90 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdd&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa121edf3&amp;gt;&amp;#93;&lt;/span&gt; ? osd_oi_lookup+0x83/0x110 &lt;span class=&quot;error&quot;&gt;&amp;#91;osd_ldiskfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa121956c&amp;gt;&amp;#93;&lt;/span&gt; ? osd_object_init+0xdc/0x3e0 &lt;span class=&quot;error&quot;&gt;&amp;#91;osd_ldiskfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa124f3f7&amp;gt;&amp;#93;&lt;/span&gt; cml_create+0x97/0x250 &lt;span class=&quot;error&quot;&gt;&amp;#91;cmm&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa118b5e1&amp;gt;&amp;#93;&lt;/span&gt; ? mdt_version_get_save+0x91/0xd0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa11a106e&amp;gt;&amp;#93;&lt;/span&gt; mdt_reint_open+0x1aae/0x28a0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa077a724&amp;gt;&amp;#93;&lt;/span&gt; ? lustre_msg_add_version+0x74/0xd0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa111956e&amp;gt;&amp;#93;&lt;/span&gt; ? md_ucred+0x1e/0x60 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdd&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa1189c81&amp;gt;&amp;#93;&lt;/span&gt; mdt_reint_rec+0x41/0xe0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa1180ed4&amp;gt;&amp;#93;&lt;/span&gt; mdt_reint_internal+0x544/0x8e0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa118153d&amp;gt;&amp;#93;&lt;/span&gt; mdt_intent_reint+0x1ed/0x530 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa117fc09&amp;gt;&amp;#93;&lt;/span&gt; mdt_intent_policy+0x379/0x690 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0736351&amp;gt;&amp;#93;&lt;/span&gt; ldlm_lock_enqueue+0x361/0x8f0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa075c1ad&amp;gt;&amp;#93;&lt;/span&gt; ldlm_handle_enqueue0+0x48d/0xf50 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa1180586&amp;gt;&amp;#93;&lt;/span&gt; mdt_enqueue+0x46/0x130 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa1175772&amp;gt;&amp;#93;&lt;/span&gt; mdt_handle_common+0x932/0x1750 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa1176665&amp;gt;&amp;#93;&lt;/span&gt; mdt_regular_handle+0x15/0x20 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa078ab4e&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_main+0xc4e/0x1a40 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0789f00&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_main+0x0/0x1a40 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c0ca&amp;gt;&amp;#93;&lt;/span&gt; child_rip+0xa/0x20&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0789f00&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_main+0x0/0x1a40 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0789f00&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_main+0x0/0x1a40 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c0c0&amp;gt;&amp;#93;&lt;/span&gt; ? child_rip+0x0/0x20&lt;/p&gt;


</description>
                <environment></environment>
        <key id="22158">LU-4271</key>
            <summary>mds load goes very high and filesystem hangs after mounting mdt</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="2">Won&apos;t Fix</resolution>
                                        <assignee username="bfaccini">Bruno Faccini</assignee>
                                    <reporter username="mhanafi">Mahmoud Hanafi</reporter>
                        <labels>
                    </labels>
                <created>Tue, 19 Nov 2013 08:41:38 +0000</created>
                <updated>Thu, 14 Aug 2014 21:18:44 +0000</updated>
                            <resolved>Thu, 14 Aug 2014 21:18:44 +0000</resolved>
                                    <version>Lustre 2.1.5</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>9</watches>
                                                                            <comments>
                            <comment id="71857" author="mhanafi" created="Tue, 19 Nov 2013 09:07:02 +0000"  >&lt;p&gt;Console log showing the start of the issue.&lt;/p&gt;</comment>
                            <comment id="71861" author="bfaccini" created="Tue, 19 Nov 2013 09:49:00 +0000"  >&lt;p&gt;Hello mahmoud,&lt;br/&gt;
A load of 800 is quite huge, but it does not mean there are 800 threads being runnable/schedulable at the same time because threads in UNinterruptible state are also take into account.&lt;/p&gt;
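
&lt;p&gt;For illustration, here is a minimal userspace sketch (my own illustrative code, not from Lustre) that counts tasks in uninterruptible sleep; these are the tasks the load average counts in addition to runnable ones:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;/* Count tasks in uninterruptible (&quot;D&quot;) sleep by scanning /proc.
 * Illustrative only: such tasks raise the load average even though
 * they consume no CPU, which is how a hung MDS can show a load &amp;gt;800.
 * (Per-process; per-thread counting would also scan /proc/PID/task.) */
#include &amp;lt;ctype.h&amp;gt;
#include &amp;lt;dirent.h&amp;gt;
#include &amp;lt;stdio.h&amp;gt;
#include &amp;lt;string.h&amp;gt;

int main(void)
{
    DIR *proc = opendir(&quot;/proc&quot;);
    struct dirent *de;
    char path[64], line[512];
    int dcount = 0;

    while (proc &amp;amp;&amp;amp; (de = readdir(proc)) != NULL) {
        FILE *f;

        if (!isdigit((unsigned char)de-&amp;gt;d_name[0]))
            continue;                       /* not a PID entry */
        snprintf(path, sizeof(path), &quot;/proc/%s/stat&quot;, de-&amp;gt;d_name);
        if ((f = fopen(path, &quot;r&quot;)) == NULL)
            continue;                       /* task exited meanwhile */
        if (fgets(line, sizeof(line), f)) {
            char *p = strrchr(line, &apos;)&apos;);   /* comm may contain spaces */
            if (p &amp;amp;&amp;amp; p[1] == &apos; &apos; &amp;amp;&amp;amp; p[2] == &apos;D&apos;)
                dcount++;
        }
        fclose(f);
    }
    if (proc)
        closedir(proc);
    printf(&quot;%d task(s) in D state\n&quot;, dcount);
    return 0;
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;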

&lt;p&gt;I suppose the stack you attached is one that shows up frequently during the problem; are there others showing attached/running/spinning processes instead of blocked ones?&lt;/p&gt;

&lt;p&gt;This stack indicates we are waiting for an OSS reply; did you check the health of the OSSes?&lt;/p&gt;</comment>
                            <comment id="71863" author="bfaccini" created="Tue, 19 Nov 2013 10:25:59 +0000"  >&lt;p&gt;Having a look to the Console log you attached, I think more interesting is the fact there are multiple threads reported as stuck in JBD2 layer. And particulary the jbd2/dm-3-8 instances with stack like :&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;sp                ip                Function (args)
0xffff881e643c9c60 0xffffffff8151c712 thread_return
0xffff881e643c9d28 0xffffffffa0c338df [jbd2]jbd2_journal_commit_transaction+0x19f (0xffff883f0c696000)
0xffff881e643c9e68 0xffffffffa0c3a6c8 [jbd2]kjournald2+0xb8 (0xffff883f0c696000)
0xffff881e643c9ee8 0xffffffff8108fb96 kthread+0x96 (0xffff881e641458e8)
0xffff881e643c9f48 0xffffffff8100c0ca child_rip+0xa (unknown, unknown)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This looks like the situation described in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-81&quot; title=&quot;Some JBD2 journaling deadlock at BULL&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-81&quot;&gt;&lt;del&gt;LU-81&lt;/del&gt;&lt;/a&gt;, so are the ChangeLogs activated?&lt;/p&gt;</comment>
                            <comment id="71865" author="mhanafi" created="Tue, 19 Nov 2013 10:40:50 +0000"  >&lt;p&gt;we were able to get things working by&lt;br/&gt;
1. mounting the mgs&lt;br/&gt;
2. then all ost&lt;br/&gt;
3. then waiting for a bit&lt;br/&gt;
4. then mdt&lt;br/&gt;
I checked we have the patch that was part &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-81&quot; title=&quot;Some JBD2 journaling deadlock at BULL&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-81&quot;&gt;&lt;del&gt;LU-81&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;
</comment>
                            <comment id="71902" author="bfaccini" created="Tue, 19 Nov 2013 16:52:06 +0000"  >&lt;p&gt;Hello Mahmoud, since you have been able to restart your file-system, can we downgrade the priority from blocker ?&lt;br/&gt;
Also is the &quot;dm-3-8&quot; device, with the JBD2 ops stuck, the one where stands your MDT ??&lt;/p&gt;</comment>
                            <comment id="71918" author="jaylan" created="Tue, 19 Nov 2013 19:22:47 +0000"  >&lt;p&gt;Please note that I cherry-picked patch set #3 of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2943&quot; title=&quot;LBUG mdt_reconstruct_open()) ASSERTION( (!(rc &amp;lt; 0) || (lustre_msg_get_transno(req-&amp;gt;rq_repmsg) == 0)) )&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2943&quot;&gt;&lt;del&gt;LU-2943&lt;/del&gt;&lt;/a&gt;, which has not closed yet into our server codes. We were hit with the same problem in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2943&quot; title=&quot;LBUG mdt_reconstruct_open()) ASSERTION( (!(rc &amp;lt; 0) || (lustre_msg_get_transno(req-&amp;gt;rq_repmsg) == 0)) )&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2943&quot;&gt;&lt;del&gt;LU-2943&lt;/del&gt;&lt;/a&gt; multiple times.&lt;/p&gt;
</comment>
                            <comment id="71922" author="mhanafi" created="Tue, 19 Nov 2013 19:44:04 +0000"  >&lt;p&gt;Yes we can downgrade it from blocker&lt;/p&gt;

&lt;p&gt;dm-3-8 is the MDT;&lt;br/&gt;
dm-2-8 is the MGS.&lt;/p&gt;

&lt;p&gt;I have gone through the RAID logs and nothing shows up there,&lt;br/&gt;
and there is nothing in the logs calling out any host-to-RAID errors.&lt;/p&gt;

&lt;p&gt;This is the first trace that is printed. Wouldn&apos;t this suggest that it is stuck in ko2iblnd?&lt;br/&gt;
Nov 18 18:04:47 nbp8-mds1 kernel: INFO: task ldlm_cn_00:5870 blocked for more than 120 seconds.&lt;br/&gt;
Nov 18 18:04:47 nbp8-mds1 kernel: &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.&lt;br/&gt;
Nov 18 18:04:51 nbp8-mds1 kernel: ldlm_cn_00    D 0000000000000003     0  5870      2 0x00000080&lt;br/&gt;
Nov 18 18:04:51 nbp8-mds1 kernel: ffff881e8189fb30 0000000000000046 0000000000000000 ffffffffa0925cd0&lt;br/&gt;
Nov 18 18:04:51 nbp8-mds1 kernel: ffff881e8189fad0 000000004a6d8511 ffff882000000000 ffff881e65f5bc27&lt;br/&gt;
Nov 18 18:04:51 nbp8-mds1 kernel: ffff881fac6d5098 ffff881e8189ffd8 000000000000fc40 ffff881fac6d5098&lt;br/&gt;
Nov 18 18:04:51 nbp8-mds1 kernel: Call Trace:&lt;br/&gt;
Nov 18 18:04:51 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0925cd0&amp;gt;&amp;#93;&lt;/span&gt; ? kiblnd_send+0x2a0/0x9e0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ko2iblnd&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 18 18:04:51 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0c3214a&amp;gt;&amp;#93;&lt;/span&gt; start_this_handle+0x27a/0x4a0 &lt;span class=&quot;error&quot;&gt;&amp;#91;jbd2&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 18 18:04:51 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8108ff00&amp;gt;&amp;#93;&lt;/span&gt; ? autoremove_wake_function+0x0/0x40&lt;br/&gt;
Nov 18 18:04:51 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0c32570&amp;gt;&amp;#93;&lt;/span&gt; jbd2_journal_start+0xd0/0x110 &lt;span class=&quot;error&quot;&gt;&amp;#91;jbd2&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 18 18:04:51 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0c8d338&amp;gt;&amp;#93;&lt;/span&gt; ldiskfs_journal_start_sb+0x58/0x90 &lt;span class=&quot;error&quot;&gt;&amp;#91;ldiskfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 18 18:04:52 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0d30017&amp;gt;&amp;#93;&lt;/span&gt; fsfilt_ldiskfs_start+0x77/0x5e0 &lt;span class=&quot;error&quot;&gt;&amp;#91;fsfilt_ldiskfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 18 18:04:52 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa07a9ac0&amp;gt;&amp;#93;&lt;/span&gt; llog_origin_handle_cancel+0x4b0/0xd70 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 18 18:04:52 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa04f9923&amp;gt;&amp;#93;&lt;/span&gt; ? cfs_alloc+0x63/0x90 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 18 18:04:52 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0630ebf&amp;gt;&amp;#93;&lt;/span&gt; ? keys_fill+0x6f/0x1a0 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 18 18:04:52 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa076f71f&amp;gt;&amp;#93;&lt;/span&gt; ldlm_cancel_handler+0x1bf/0x5e0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 18 18:04:52 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa079fb4e&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_main+0xc4e/0x1a40 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 18 18:04:52 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa079ef00&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_main+0x0/0x1a40 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 18 18:04:52 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c0ca&amp;gt;&amp;#93;&lt;/span&gt; child_rip+0xa/0x20&lt;br/&gt;
Nov 18 18:04:52 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa079ef00&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_main+0x0/0x1a40 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 18 18:04:52 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa079ef00&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_main+0x0/0x1a40 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Nov 18 18:04:52 nbp8-mds1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c0c0&amp;gt;&amp;#93;&lt;/span&gt; ? child_rip+0x0/0x20&lt;/p&gt;
</comment>
                            <comment id="71984" author="mhanafi" created="Wed, 20 Nov 2013 19:40:12 +0000"  >&lt;p&gt;What do these error mean..&lt;br/&gt;
LustreError: 5906:0:(obd_class.h:503:obd_set_info_async()) obd_set_info_async: dev 0 no operation&lt;/p&gt;

&lt;p&gt;We see a lot of these.&lt;/p&gt;</comment>
                            <comment id="72232" author="bfaccini" created="Mon, 25 Nov 2013 15:42:27 +0000"  >&lt;p&gt;The &quot;? kiblnd_send+0x2a0/0x9e0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ko2iblnd&amp;#93;&lt;/span&gt;&quot; entry in the stack is likely to be an old+stale reference found by the unwinder.&lt;/p&gt;

&lt;p&gt;The obd_set_info_async() messages should have been printed upon your MDS+MGS restart and are likely to occur during the massive MGS re-connect from all clients. It probably just means that target_handle_connect() generically calls obd_set_info_async() for the MGS device (when some cleanup need in the import/export is detected), but that device has no o_set_info_async method/operation available to proceed. So the message can safely be considered harmless.&lt;/p&gt;
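
&lt;p&gt;To illustrate the dispatch pattern (a rough, simplified userspace sketch of the idea, not the actual Lustre code): the call goes through the device&apos;s method table, and when the method pointer is absent the generic wrapper just logs &quot;no operation&quot;:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;/* Hypothetical, simplified analogue of the obd_set_info_async()
 * wrapper: dispatch through an ops table; a device (here, MGS-like)
 * that lacks the method produces the console line
 * &quot;obd_set_info_async: dev 0 no operation&quot;. */
#include &amp;lt;stdio.h&amp;gt;

struct obd_ops {
    int (*o_set_info_async)(void *data);    /* may be NULL */
};

struct obd_device {
    int             obd_minor;
    struct obd_ops *obd_ops;
};

static int obd_set_info_async(struct obd_device *obd, void *data)
{
    if (obd-&amp;gt;obd_ops-&amp;gt;o_set_info_async == NULL) {
        fprintf(stderr, &quot;obd_set_info_async: dev %d no operation\n&quot;,
                obd-&amp;gt;obd_minor);
        return -1;      /* the kernel returns a -ENOTSUPP-style error */
    }
    return obd-&amp;gt;obd_ops-&amp;gt;o_set_info_async(data);
}

int main(void)
{
    struct obd_ops mgs_ops = { .o_set_info_async = NULL };
    struct obd_device mgs = { .obd_minor = 0, .obd_ops = &amp;amp;mgs_ops };

    obd_set_info_async(&amp;amp;mgs, NULL);     /* prints the harmless message */
    return 0;
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>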
                            <comment id="72234" author="bfaccini" created="Mon, 25 Nov 2013 15:47:32 +0000"  >&lt;p&gt;I downgraded priority to Major, is it ok ?? I don&apos;t think there is a lot more to be done for this ticket, my feeling is that we faced a JBD2 issue upon MDS restart+reconnection, and it is unlikely to re-occur but if this happen, we will need a crash-dump.&lt;/p&gt;</comment>
                            <comment id="79299" author="jfc" created="Fri, 14 Mar 2014 01:14:50 +0000"  >&lt;p&gt;Hello Mahmoud,&lt;br/&gt;
Do you want us to keep this issue open?&lt;br/&gt;
Have you seen any recurrence of the problem?&lt;br/&gt;
Thanks,&lt;br/&gt;
~ jfc.&lt;/p&gt;</comment>
                            <comment id="80271" author="mhanafi" created="Wed, 26 Mar 2014 01:51:06 +0000"  >&lt;p&gt;We have hit this bug again. Here is the details of the stack&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Lustre: 7240:0:(ldlm_lib.c:952:target_handle_connect()) Skipped 1 previous similar message
Lustre: nbp8-MDT0000: haven&apos;t heard from client fae2954d-57b1-3826-45b9-252e2516d6f0 (at 10.151.55.93@o2ib) in 227 seconds. I think it&apos;s dead, and I am evicting it. exp ffff883f62fef400, cur 1395779273 expire 1395779123 last 1395779046
Lustre: 7228:0:(ldlm_lib.c:952:target_handle_connect()) MGS: connection from e3993dc5-8b59-86ee-9304-faf298c767b4@10.151.23.158@o2ib t0 exp (null) cur 1395779280 last 0
Lustre: 7228:0:(ldlm_lib.c:952:target_handle_connect()) Skipped 1 previous similar message
Lustre: 7398:0:(quota_interface.c:543:quota_chk_acq_common()) still haven&apos;t managed to acquire quota space from the quota master after 10 retries (err=0, rc=0)
INFO: task ldlm_cn_01:6369 blocked for more than 120 seconds.
&quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
ldlm_cn_01    D 000000000000001d     0  6369      2 0x00000080
 ffff881e9b8cdb30 0000000000000046 0000000000000000 ffffffffa0925cd0
 ffff881e9b8cdad0 00000000fdc0d821 ffff883feeb58e00 ffff883e4235124d
 ffff881ff32bf098 ffff881e9b8cdfd8 000000000000fc40 ffff881ff32bf098
Call Trace:
 [&amp;lt;ffffffffa0925cd0&amp;gt;] ? kiblnd_send+0x2a0/0x9e0 [ko2iblnd]
 [&amp;lt;ffffffffa055c14a&amp;gt;] start_this_handle+0x27a/0x4a0 [jbd2]
 [&amp;lt;ffffffff8108ff00&amp;gt;] ? autoremove_wake_function+0x0/0x40
 [&amp;lt;ffffffffa055c570&amp;gt;] jbd2_journal_start+0xd0/0x110 [jbd2]
 [&amp;lt;ffffffffa08e6338&amp;gt;] ldiskfs_journal_start_sb+0x58/0x90 [ldiskfs]
 [&amp;lt;ffffffffa072c017&amp;gt;] fsfilt_ldiskfs_start+0x77/0x5e0 [fsfilt_ldiskfs]
 [&amp;lt;ffffffffa07a9ac0&amp;gt;] llog_origin_handle_cancel+0x4b0/0xd70 [ptlrpc]
 [&amp;lt;ffffffffa04f9923&amp;gt;] ? cfs_alloc+0x63/0x90 [libcfs]
 [&amp;lt;ffffffffa0630ebf&amp;gt;] ? keys_fill+0x6f/0x1a0 [obdclass]
 [&amp;lt;ffffffffa076f71f&amp;gt;] ldlm_cancel_handler+0x1bf/0x5e0 [ptlrpc]
 [&amp;lt;ffffffffa079fb4e&amp;gt;] ptlrpc_main+0xc4e/0x1a40 [ptlrpc]
 [&amp;lt;ffffffffa079ef00&amp;gt;] ? ptlrpc_main+0x0/0x1a40 [ptlrpc]
 [&amp;lt;ffffffff8100c0ca&amp;gt;] child_rip+0xa/0x20
 [&amp;lt;ffffffffa079ef00&amp;gt;] ? ptlrpc_main+0x0/0x1a40 [ptlrpc]
 [&amp;lt;ffffffffa079ef00&amp;gt;] ? ptlrpc_main+0x0/0x1a40 [ptlrpc]
 [&amp;lt;ffffffff8100c0c0&amp;gt;] ? child_rip+0x0/0x20
INFO: task ldlm_cb_00:6370 blocked for more than 120 seconds.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I attached the full dmesg and full process list to the FTP site (nbp8-mds1-dmesg.mar25.gz).&lt;/p&gt;
</comment>
                            <comment id="80361" author="bfaccini" created="Thu, 27 Mar 2014 10:05:02 +0000"  >&lt;p&gt;Did you also successfully take a crash-dump of your MDS, as I requested in my update/comment dated &quot;25/Nov/13 4:47 PM&quot; for this ticket, and as seem to indicate the end of the dmesg you uploaded for this latest occurrence ?? If yes, can you upload it with associated vmlinux (or kernel-debuginfo* RPMs) and Lustre modules.&lt;/p&gt;

&lt;p&gt;BTW, at the time you dumped all thread stacks with the Alt+SysRq+T keystroke, there were 502 threads stuck in jbd2_journal_start()/start_this_handle() and 129 threads stuck in dqacq_handler()/... because one of them is itself stuck in the journal/jbd2 layer via lustre_commit_dquot(), so the load should have been greater than 630 at that time ... But I am unable to find who is responsible for blocking all of these threads from the full stack listing alone.&lt;/p&gt;

</comment>
                            <comment id="80413" author="mhanafi" created="Thu, 27 Mar 2014 20:39:00 +0000"  >&lt;p&gt;We did take a crash dump. I will encrypt and uploaded. It will have to be accessed by US citizens only. &lt;/p&gt;</comment>
                            <comment id="80649" author="mhanafi" created="Mon, 31 Mar 2014 18:59:24 +0000"  >&lt;p&gt;crash dump was uploaded and email was sent to cliff white.&lt;/p&gt;</comment>
                            <comment id="81595" author="bfaccini" created="Tue, 15 Apr 2014 10:09:58 +0000"  >&lt;p&gt;Cliff, I wonder if you found the same situation in the crash-dump than the one I described from the all threads stacks log ??&lt;/p&gt;</comment>
                            <comment id="81613" author="cliffw" created="Tue, 15 Apr 2014 15:11:43 +0000"  >&lt;p&gt;Mahmoud, apparently I missed your email, very sorry, can you re-send the information on the dump?&lt;/p&gt;</comment>
                            <comment id="81658" author="mhanafi" created="Tue, 15 Apr 2014 18:19:46 +0000"  >&lt;p&gt;Cliff,&lt;/p&gt;

&lt;p&gt;Email sent&lt;/p&gt;

&lt;p&gt;-Mahmoud&lt;/p&gt;</comment>
                            <comment id="81770" author="cliffw" created="Wed, 16 Apr 2014 20:42:46 +0000"  >&lt;p&gt;Mahmoud, you including a large number of kernel RPMs in the tarball, can you please specify exactly which version corresponds to the kernel running on the node where the vmcore was taken?&lt;/p&gt;</comment>
                            <comment id="82087" author="bogl" created="Mon, 21 Apr 2014 19:17:43 +0000"  >&lt;p&gt;Have been looking over the crash dump from the customer. As noted in previous comments there is quite a large number of threads all stuck in jbd2_journal_start with the common calling sequence ... osd_trans_start -&amp;gt; ldiskfs_journal_start_sb -&amp;gt; jbd2_journal_start -&amp;gt; start_this_handle.  It seems very likely there&apos;s a single other thread holding a lock or otherwise blocking all these callers of jbd2_journal_start, but so far I haven&apos;t been able to find the culprit.&lt;/p&gt;</comment>
                            <comment id="82088" author="bogl" created="Mon, 21 Apr 2014 19:29:26 +0000"  >&lt;p&gt;I note that in addition to the 500+ threads stuck in jbd2_journal_start, there&apos;s one thread stuck elsewhere it jbd2 code.  The stack trace of this thread looks like:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;PID: 7885   TASK: ffff881e506e2ae0  CPU: 6   COMMAND: &quot;mdt_439&quot;
 #0 [ffff881e506f38c0] schedule at ffffffff8151c712
 #1 [ffff881e506f3988] jbd2_log_wait_commit at ffffffffa0564325 [jbd2]
 #2 [ffff881e506f3a18] jbd2_journal_stop at ffffffffa055bacb [jbd2]
 #3 [ffff881e506f3a78] __ldiskfs_journal_stop at ffffffffa08e62a8 [ldiskfs]
 #4 [ffff881e506f3aa8] osd_trans_stop at ffffffffa0d26476 [osd_ldiskfs]
 #5 [ffff881e506f3ad8] mdd_trans_stop at ffffffffa0c2d4aa [mdd]
 #6 [ffff881e506f3ae8] mdd_attr_set at ffffffffa0c0ca5f [mdd]
 #7 [ffff881e506f3bc8] cml_attr_set at ffffffffa0d4fa76 [cmm]
 #8 [ffff881e506f3bf8] mdt_attr_set at ffffffffa0c9ec68 [mdt]
 #9 [ffff881e506f3c48] mdt_reint_setattr at ffffffffa0c9f2b5 [mdt]
#10 [ffff881e506f3cd8] mdt_reint_rec at ffffffffa0c98c81 [mdt]
#11 [ffff881e506f3cf8] mdt_reint_internal at ffffffffa0c8fed4 [mdt]
#12 [ffff881e506f3d48] mdt_reint at ffffffffa0c902b4 [mdt]
#13 [ffff881e506f3d68] mdt_handle_common at ffffffffa0c84772 [mdt]
#14 [ffff881e506f3db8] mdt_regular_handle at ffffffffa0c85665 [mdt]
#15 [ffff881e506f3dc8] ptlrpc_main at ffffffffa079fb4e [ptlrpc]
#16 [ffff881e506f3f48] kernel_thread at ffffffff8100c0ca
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;I can&apos;t see how this thread might be blocking all the others, but it is interesting that it is the only one I have found so far that is in similar code yet different from all the others.&lt;/p&gt;</comment>
                            <comment id="82089" author="bogl" created="Mon, 21 Apr 2014 19:58:50 +0000"  >&lt;p&gt;another odd outlier thread:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;PID: 6425   TASK: ffff883fdc6c9500  CPU: 5   COMMAND: &quot;jbd2/dm-3-8&quot;
 #0 [ffff883fec013c60] schedule at ffffffff8151c712
 #1 [ffff883fec013d28] jbd2_journal_commit_transaction at ffffffffa055d8df [jbd2]
 #2 [ffff883fec013e68] kjournald2 at ffffffffa05646c8 [jbd2]
 #3 [ffff883fec013ee8] kthread at ffffffff8108fb96
 #4 [ffff883fec013f48] kernel_thread at ffffffff8100c0ca
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I&apos;m wondering if this might be a dup of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4794&quot; title=&quot;MDS threads all stuck in jbd2_journal_start&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4794&quot;&gt;&lt;del&gt;LU-4794&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="82092" author="bfaccini" created="Mon, 21 Apr 2014 20:42:03 +0000"  >&lt;p&gt;Sure Bob, this last thread should be the one blocking all the others in JBD2 layer! And again, and like in one of my original updates on 19/Nov/2013, the concerned device is &quot;dm-3-8&quot; that Mahmoud confirmed to be the MDT.&lt;/p&gt;

&lt;p&gt;And yes, it looks like a dup of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4794&quot; title=&quot;MDS threads all stuck in jbd2_journal_start&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4794&quot;&gt;&lt;del&gt;LU-4794&lt;/del&gt;&lt;/a&gt;, but also earlier tickets that have been simply closed due to no new occurrence ...&lt;/p&gt;

&lt;p&gt;What would be cool now would be to identify if this last thread has been scheduled recently, if not why or if yes, why it is looping+re-schedule()ing (t_updates != NULL?) ...&lt;/p&gt;
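
&lt;p&gt;For reference, here is a tiny userspace analogue (an illustrative sketch of mine, not the actual jbd2 code) of the t_updates accounting involved: the commit thread sleeps until the count of open handles drops to 0, so a single stuck handle owner stalls the commit, and every new handle then queues behind it:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;/* Userspace analogue of jbd2&apos;s t_updates accounting (illustrative
 * sketch only, not kernel code): each open handle bumps t_updates;
 * the commit thread sleeps until t_updates drops to 0, so one stuck
 * handle owner blocks the whole commit. */
#include &amp;lt;pthread.h&amp;gt;
#include &amp;lt;stdio.h&amp;gt;

static pthread_mutex_t j_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  j_wait = PTHREAD_COND_INITIALIZER;
static int t_updates;           /* open handles on the running transaction */

static void journal_start(void)
{
    pthread_mutex_lock(&amp;amp;j_lock);
    t_updates++;                /* like start_this_handle() */
    pthread_mutex_unlock(&amp;amp;j_lock);
}

static void journal_stop(void)
{
    pthread_mutex_lock(&amp;amp;j_lock);
    if (--t_updates == 0)       /* like jbd2_journal_stop() */
        pthread_cond_signal(&amp;amp;j_wait);
    pthread_mutex_unlock(&amp;amp;j_lock);
}

static void commit_transaction(void)
{
    pthread_mutex_lock(&amp;amp;j_lock);
    while (t_updates != 0)      /* jbd2_journal_commit_transaction() waits */
        pthread_cond_wait(&amp;amp;j_wait, &amp;amp;j_lock);
    pthread_mutex_unlock(&amp;amp;j_lock);
    printf(&quot;transaction committed\n&quot;);
}

int main(void)
{
    journal_start();            /* a handle is opened ... */
    journal_stop();             /* ... and cleanly closed again */
    commit_transaction();       /* so the commit can complete */
    return 0;
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;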
</comment>
                            <comment id="83068" author="bfaccini" created="Fri, 2 May 2014 13:37:33 +0000"  >&lt;p&gt;Bob, can you check if you find the same kind of dead-lock scenario/stacks that Bobi detailled in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4794&quot; title=&quot;MDS threads all stuck in jbd2_journal_start&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4794&quot;&gt;&lt;del&gt;LU-4794&lt;/del&gt;&lt;/a&gt;? BTW, I did not find it in the dmesg with full stacks trace attached to this ticket.&lt;/p&gt;</comment>
                            <comment id="84833" author="bfaccini" created="Sat, 24 May 2014 01:23:43 +0000"  >&lt;p&gt;I have been working &quot;blindly&quot; with Bob having direct access to the crash-dump, and here is the analysis of the deadlock out from the crash-dump you provided.&lt;br/&gt;
Bob already posted before 2 threads stacks, were one is stuck in jbd2_journal_stop()/jbd2_log_wait_commit() synchronously waiting for the current transaction to be committed, and the other started the commit process, in jbd2_journal_commit_transaction(), but schedule()&apos;d.&lt;br/&gt;
The reason the committing thread schedule()&apos;d is because it is waiting for the last handle to complete (ie, we miss a last journal_stop() to have journal_t-&amp;gt;j_running_transaction-&amp;gt;t_updates to become 0 when it is still 1).&lt;br/&gt;
It took us some time to find who owns this last handle, but finally we found it and it is stuck with the following stack :&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;PID: 19803  TASK: ffff883ff3754080  CPU: 1   COMMAND: &quot;mdt_504&quot;
 #0 [ffff883dc31f76b0] schedule at ffffffff8151c712
 #1 [ffff883dc31f7778] cfs_waitq_wait at ffffffffa04f960e [libcfs]
 #2 [ffff883dc31f7788] qctxt_wait_pending_dqacq at ffffffffa09bfe1b [lquota]
 #3 [ffff883dc31f7878] qctxt_adjust_qunit at ffffffffa09c5d41 [lquota]
 #4 [ffff883dc31f7908] quota_acquire_common at ffffffffa09c9653 [lquota]
 #5 [ffff883dc31f7938] quota_chk_acq_common at ffffffffa09cb722 [lquota]
 #6 [ffff883dc31f7a78] lquota_chkquota.clone.3 at ffffffffa0c0300b [mdd]
 #7 [ffff883dc31f7ae8] mdd_attr_set at ffffffffa0c0dcf6 [mdd]
 #8 [ffff883dc31f7bc8] cml_attr_set at ffffffffa0d4fa76 [cmm]
 #9 [ffff883dc31f7bf8] mdt_attr_set at ffffffffa0c9ec68 [mdt]
#10 [ffff883dc31f7c48] mdt_reint_setattr at ffffffffa0c9f2b5 [mdt]
#11 [ffff883dc31f7cd8] mdt_reint_rec at ffffffffa0c98c81 [mdt]
#12 [ffff883dc31f7cf8] mdt_reint_internal at ffffffffa0c8fed4 [mdt]
#13 [ffff883dc31f7d48] mdt_reint at ffffffffa0c902b4 [mdt]
#14 [ffff883dc31f7d68] mdt_handle_common at ffffffffa0c84772 [mdt]
#15 [ffff883dc31f7db8] mdt_regular_handle at ffffffffa0c85665 [mdt]
#16 [ffff883dc31f7dc8] ptlrpc_main at ffffffffa079fb4e [ptlrpc]
#17 [ffff883dc31f7f48] kernel_thread at ffffffff8100c0ca
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;This is confirmed by looking into the source code of mdd_attr_set(), where we can see the mdd_trans_start()/lquota_chkquota()/mdd_trans_stop() call sequence.&lt;/p&gt;

&lt;p&gt;The reason this thread is waiting is that the qunit it wants to process is presently hashed/&quot;inflight&quot;.&lt;/p&gt;

&lt;p&gt;Looking further into the other threads for a reason to explain this blocked situation, we found a thread that is working on this same qunit but is itself stuck trying to start a new journal/transaction handle (like hundreds of other threads ...), with the following stack:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;PID: 7460   TASK: ffff883f65bc9500  CPU: 25  COMMAND: &quot;mdt_308&quot;
 #0 [ffff883f65bcb2b0] schedule at ffffffff8151c712
 #1 [ffff883f65bcb378] start_this_handle at ffffffffa055c14a [jbd2]
 #2 [ffff883f65bcb438] jbd2_journal_start at ffffffffa055c570 [jbd2]
 #3 [ffff883f65bcb488] ldiskfs_journal_start_sb at ffffffffa08e6338 [ldiskfs]
 #4 [ffff883f65bcb498] lustre_quota_journal_start at ffffffffa072c61f [fsfilt_ldiskfs]
 #5 [ffff883f65bcb4a8] lustre_commit_dquot at ffffffffa0734968 [fsfilt_ldiskfs]
 #6 [ffff883f65bcb548] fsfilt_ldiskfs_dquot at ffffffffa072b604 [fsfilt_ldiskfs]
 #7 [ffff883f65bcb568] dqacq_handler at ffffffffa09d59fe [lquota]
 #8 [ffff883f65bcb628] schedule_dqacq at ffffffffa09c2529 [lquota]
 #9 [ffff883f65bcb728] qctxt_adjust_qunit at ffffffffa09c5cdc [lquota]
#10 [ffff883f65bcb7b8] quota_acquire_common at ffffffffa09c9653 [lquota]
#11 [ffff883f65bcb7e8] quota_chk_acq_common at ffffffffa09cb722 [lquota]
#12 [ffff883f65bcb928] lquota_chkquota.clone.1 at ffffffffa0c1c28b [mdd]
#13 [ffff883f65bcb998] mdd_rename at ffffffffa0c22c0a [mdd]
#14 [ffff883f65bcbb08] cml_rename at ffffffffa0d52b04 [cmm]
#15 [ffff883f65bcbb88] mdt_reint_rename at ffffffffa0c9da73 [mdt]
#16 [ffff883f65bcbcd8] mdt_reint_rec at ffffffffa0c98c81 [mdt]
#17 [ffff883f65bcbcf8] mdt_reint_internal at ffffffffa0c8fed4 [mdt]
#18 [ffff883f65bcbd48] mdt_reint at ffffffffa0c902b4 [mdt]
#19 [ffff883f65bcbd68] mdt_handle_common at ffffffffa0c84772 [mdt]
#20 [ffff883f65bcbdb8] mdt_regular_handle at ffffffffa0c85665 [mdt]
#21 [ffff883f65bcbdc8] ptlrpc_main at ffffffffa079fb4e [ptlrpc]
#22 [ffff883f65bcbf48] kernel_thread at ffffffff8100c0ca
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;and which is presently blocked because the current/running transaction is in the commit process!&lt;br/&gt;
So: a deadlock.&lt;/p&gt;

&lt;p&gt;It seems this was possible because, in mdd_attr_set(), a journal_start()/mdd_trans_start() was done before starting a quota operation, which is never the case in the same layer&apos;s other routines, like mdd_rename()/mdd_&lt;span class=&quot;error&quot;&gt;&amp;#91;un&amp;#93;&lt;/span&gt;link()/mdd_create()/..., where all &quot;heavy&quot; quota operations are performed outside the mdd_trans_start()/mdd_trans_stop() boundaries, to prevent exactly such deadlock situations.&lt;/p&gt;

&lt;p&gt;So, a possible fix should be to change the code of mdd_attr_set() by simply moving the #ifdef HAVE_QUOTA_SUPPORT/#endif block out of the mdd_trans_start()/mdd_trans_stop() boundaries. I gave it a try at &lt;a href=&quot;http://review.whamcloud.com/10443&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/10443&lt;/a&gt;.&lt;/p&gt;
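
&lt;p&gt;Schematically, the idea behind the fix looks like this (hypothetical stub functions of mine, not the actual patch; see the review above for the real change):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;/* Sketch of the ordering change (hypothetical stubs, not the actual
 * Lustre functions): never sleep on a quota acquisition while holding
 * an open journal handle, otherwise the commit thread can be blocked
 * behind us while the quota path itself waits for a new handle. */

static int  quota_chk(void)   { return 0; }  /* may block on the quota master */
static void trans_start(void) { }            /* journal_start() analogue */
static void trans_stop(void)  { }            /* journal_stop() analogue */
static int  do_attr_set(void) { return 0; }

/* Deadlock-prone ordering (what the b2_1 mdd_attr_set() effectively did) */
static int attr_set_before_fix(void)
{
    int rc;

    trans_start();
    rc = quota_chk();           /* can sleep with the handle still held! */
    if (rc == 0)
        rc = do_attr_set();
    trans_stop();
    return rc;
}

/* Safe ordering, as in mdd_rename()/mdd_create(): the quota work is
 * done outside the trans_start()/trans_stop() boundaries */
static int attr_set_after_fix(void)
{
    int rc = quota_chk();       /* block here, with no handle open yet */

    if (rc)
        return rc;
    trans_start();
    rc = do_attr_set();
    trans_stop();
    return rc;
}

int main(void)
{
    return attr_set_before_fix() | attr_set_after_fix();
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;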

&lt;p&gt;BTW, this problem is purely a b2_1/2.1.x one, since all the code concerned was rewritten in 2.4.&lt;/p&gt;</comment>
                            <comment id="90351" author="jfc" created="Tue, 29 Jul 2014 18:36:58 +0000"  >&lt;p&gt;Hello Bruno and Mahmoud,&lt;/p&gt;

&lt;p&gt;Can I ask if there is going to be more progress on this ticket?&lt;/p&gt;

&lt;p&gt;Many thanks,&lt;br/&gt;
~ jfc.&lt;/p&gt;</comment>
                            <comment id="91664" author="pjones" created="Thu, 14 Aug 2014 21:18:44 +0000"  >&lt;p&gt;As per NASA ok to close ticket&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="22294">LU-4335</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="14601" name="nbp8-mds1-dmesg.mar25.gz" size="228248" author="mhanafi" created="Wed, 26 Mar 2014 01:50:49 +0000"/>
                            <attachment id="13842" name="service200-20131119" size="1591867" author="mhanafi" created="Tue, 19 Nov 2013 09:07:02 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzw9of:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>11731</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10023"><![CDATA[4]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>