<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:40:31 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-4195] MDT Slow with ptlrpcd using 100% cpu.</title>
                <link>https://jira.whamcloud.com/browse/LU-4195</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;mdt response very slow. Top showed ptlrpcd running at 100% cpu. Console showed errors. Was able to run debug trace. See attached files.&lt;/p&gt;

&lt;p&gt;Lustre: Service thread pid 7065 was inactive for 200.00s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:&lt;br/&gt;
Pid: 7065, comm: mdt_93&lt;/p&gt;

&lt;p&gt;Call Trace:&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff810539b2&amp;gt;&amp;#93;&lt;/span&gt; ? enqueue_task_fair+0x142/0x490&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81096f5d&amp;gt;&amp;#93;&lt;/span&gt; ? sched_clock_cpu+0xcd/0x110&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa04f960e&amp;gt;&amp;#93;&lt;/span&gt; cfs_waitq_wait+0xe/0x10 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0a9f6de&amp;gt;&amp;#93;&lt;/span&gt; qos_statfs_update+0x7fe/0xa70 &lt;span class=&quot;error&quot;&gt;&amp;#91;lov&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8105fab0&amp;gt;&amp;#93;&lt;/span&gt; ? default_wake_function+0x0/0x20&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0aa00fd&amp;gt;&amp;#93;&lt;/span&gt; alloc_qos+0x1ad/0x21a0 &lt;span class=&quot;error&quot;&gt;&amp;#91;lov&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0aa306c&amp;gt;&amp;#93;&lt;/span&gt; qos_prep_create+0x1ec/0x2380 &lt;span class=&quot;error&quot;&gt;&amp;#91;lov&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa04f98be&amp;gt;&amp;#93;&lt;/span&gt; ? cfs_free+0xe/0x10 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0a9c63a&amp;gt;&amp;#93;&lt;/span&gt; lov_prep_create_set+0xea/0x390 &lt;span class=&quot;error&quot;&gt;&amp;#91;lov&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0a84b0c&amp;gt;&amp;#93;&lt;/span&gt; lov_create+0x1ac/0x1400 &lt;span class=&quot;error&quot;&gt;&amp;#91;lov&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0d8a50f&amp;gt;&amp;#93;&lt;/span&gt; ? obd_iocontrol+0xef/0x390 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdd&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0d8f90e&amp;gt;&amp;#93;&lt;/span&gt; mdd_lov_create+0x9ee/0x1ba0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdd&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0da1871&amp;gt;&amp;#93;&lt;/span&gt; mdd_create+0xf81/0x1a90 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdd&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0ea9df3&amp;gt;&amp;#93;&lt;/span&gt; ? osd_oi_lookup+0x83/0x110 &lt;span class=&quot;error&quot;&gt;&amp;#91;osd_ldiskfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0ea456c&amp;gt;&amp;#93;&lt;/span&gt; ? osd_object_init+0xdc/0x3e0 &lt;span class=&quot;error&quot;&gt;&amp;#91;osd_ldiskfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0eda3f7&amp;gt;&amp;#93;&lt;/span&gt; cml_create+0x97/0x250 &lt;span class=&quot;error&quot;&gt;&amp;#91;cmm&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e165e1&amp;gt;&amp;#93;&lt;/span&gt; ? mdt_version_get_save+0x91/0xd0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e2c06e&amp;gt;&amp;#93;&lt;/span&gt; mdt_reint_open+0x1aae/0x28a0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa078f724&amp;gt;&amp;#93;&lt;/span&gt; ? lustre_msg_add_version+0x74/0xd0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0da456e&amp;gt;&amp;#93;&lt;/span&gt; ? md_ucred+0x1e/0x60 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdd&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e14c81&amp;gt;&amp;#93;&lt;/span&gt; mdt_reint_rec+0x41/0xe0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e0bed4&amp;gt;&amp;#93;&lt;/span&gt; mdt_reint_internal+0x544/0x8e0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e0c53d&amp;gt;&amp;#93;&lt;/span&gt; mdt_intent_reint+0x1ed/0x530 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e0ac09&amp;gt;&amp;#93;&lt;/span&gt; mdt_intent_policy+0x379/0x690 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa074b351&amp;gt;&amp;#93;&lt;/span&gt; ldlm_lock_enqueue+0x361/0x8f0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa07711ad&amp;gt;&amp;#93;&lt;/span&gt; ldlm_handle_enqueue0+0x48d/0xf50 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e0b586&amp;gt;&amp;#93;&lt;/span&gt; mdt_enqueue+0x46/0x130 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e00772&amp;gt;&amp;#93;&lt;/span&gt; mdt_handle_common+0x932/0x1750 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e01665&amp;gt;&amp;#93;&lt;/span&gt; mdt_regular_handle+0x15/0x20 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa079fb4e&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_main+0xc4e/0x1a40 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa079ef00&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_main+0x0/0x1a40 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c0ca&amp;gt;&amp;#93;&lt;/span&gt; child_rip+0xa/0x20&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa079ef00&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_main+0x0/0x1a40 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa079ef00&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_main+0x0/0x1a40 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c0c0&amp;gt;&amp;#93;&lt;/span&gt; ? child_rip+0x0/0x20&lt;/p&gt;</description>
                <environment>server running 2.1.5-2nas&lt;br/&gt;
our source is at &lt;a href=&quot;git://github.com/jlan/lustre-nas.git&quot;&gt;git://github.com/jlan/lustre-nas.git&lt;/a&gt;</environment>
        <key id="21768">LU-4195</key>
            <summary>MDT Slow with ptlrpcd using 100% cpu.</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="5">Cannot Reproduce</resolution>
                                        <assignee username="ashehata">Amir Shehata</assignee>
                                    <reporter username="mhanafi">Mahmoud Hanafi</reporter>
                        <labels>
                    </labels>
                <created>Fri, 1 Nov 2013 18:01:13 +0000</created>
                <updated>Thu, 15 Sep 2016 22:11:56 +0000</updated>
                            <resolved>Wed, 29 Oct 2014 16:30:59 +0000</resolved>
                                    <version>Lustre 2.1.5</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>8</watches>
                                                                            <comments>
                            <comment id="70519" author="mhanafi" created="Fri, 1 Nov 2013 18:57:47 +0000"  >&lt;p&gt;Uploaded the following files to ftp site.&lt;/p&gt;

&lt;p&gt;LU4195.lustre-log.dump.selected.tgz&lt;br/&gt;
LU4195.ptlrpcd.debug2.out.gz&lt;/p&gt;

&lt;p&gt;I also have crash dump that can be uploaded if needed.&lt;/p&gt;</comment>
                            <comment id="70561" author="pjones" created="Sat, 2 Nov 2013 13:43:42 +0000"  >&lt;p&gt;Lai&lt;/p&gt;

&lt;p&gt;What do you advise here?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="70950" author="niu" created="Thu, 7 Nov 2013 04:44:19 +0000"  >&lt;p&gt;Mahmoud, do you know what kind of application/operation caused the problem? I see there are quite a lot of transaction commits in the log, looks like heavy load was on mds.&lt;/p&gt;

&lt;p&gt;Is it possible to get a full stack trace (especially the stack trace for ptlrpcd) by sysrq when this happened again? Thanks.&lt;/p&gt;</comment>
                            <comment id="71016" author="mhanafi" created="Thu, 7 Nov 2013 20:49:31 +0000"  >&lt;p&gt;Hope this helps&lt;/p&gt;


&lt;p&gt;crash&amp;gt; bt 5560&lt;br/&gt;
PID: 5560   TASK: ffff881fb5e87540  CPU: 8   COMMAND: &quot;ptlrpcd&quot;&lt;br/&gt;
 #0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8820b0c07e90&amp;#93;&lt;/span&gt; crash_nmi_callback at ffffffff81029956&lt;br/&gt;
 #1 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8820b0c07ea0&amp;#93;&lt;/span&gt; notifier_call_chain at ffffffff815222f5&lt;br/&gt;
 #2 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8820b0c07ee0&amp;#93;&lt;/span&gt; atomic_notifier_call_chain at ffffffff8152235a&lt;br/&gt;
 #3 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8820b0c07ef0&amp;#93;&lt;/span&gt; notify_die at ffffffff81095f1e&lt;br/&gt;
 #4 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8820b0c07f20&amp;#93;&lt;/span&gt; do_nmi at ffffffff8151ff2f&lt;br/&gt;
 #5 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8820b0c07f50&amp;#93;&lt;/span&gt; nmi at ffffffff8151f7e0&lt;br/&gt;
    &lt;span class=&quot;error&quot;&gt;&amp;#91;exception RIP: _spin_lock+33&amp;#93;&lt;/span&gt;&lt;br/&gt;
    RIP: ffffffff8151f051  RSP: ffff881fac2b3bf0  RFLAGS: 00000297&lt;br/&gt;
    RAX: 0000000000000d28  RBX: 0000000000000004  RCX: 0000000000000000&lt;br/&gt;
    RDX: 0000000000000d27  RSI: 0000000000000007  RDI: ffffffffa05ada40&lt;br/&gt;
    RBP: ffff881fac2b3bf0   R8: 0000000000000246   R9: 0000000000000001&lt;br/&gt;
    R10: ffff883343e44cc0  R11: 0000000000000000  R12: 000500000a971b57&lt;br/&gt;
    R13: 0000000000003039  R14: 0000000000000000  R15: 000527015f9078c7&lt;br/&gt;
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018&lt;br/&gt;
&amp;#8212; &amp;lt;NMI exception stack&amp;gt; &amp;#8212;&lt;br/&gt;
 #6 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff881fac2b3bf0&amp;#93;&lt;/span&gt; _spin_lock at ffffffff8151f051&lt;br/&gt;
 #7 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff881fac2b3bf8&amp;#93;&lt;/span&gt; LNetMEAttach at ffffffffa057f99f &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #8 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff881fac2b3c68&amp;#93;&lt;/span&gt; ptl_send_rpc at ffffffffa078b532 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #9 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff881fac2b3d28&amp;#93;&lt;/span&gt; ptlrpc_send_new_req at ffffffffa0781993 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
#10 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff881fac2b3d98&amp;#93;&lt;/span&gt; ptlrpc_check_set at ffffffffa07840a8 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
#11 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff881fac2b3e38&amp;#93;&lt;/span&gt; ptlrpcd_check at ffffffffa07b4ed0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
#12 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff881fac2b3e68&amp;#93;&lt;/span&gt; ptlrpcd at ffffffffa07b516e &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
#13 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff881fac2b3f48&amp;#93;&lt;/span&gt; kernel_thread at ffffffff8100c0ca&lt;/p&gt;


&lt;p&gt;crash&amp;gt; bt 5560 -l&lt;br/&gt;
PID: 5560   TASK: ffff881fb5e87540  CPU: 8   COMMAND: &quot;ptlrpcd&quot;&lt;br/&gt;
 #0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8820b0c07e90&amp;#93;&lt;/span&gt; crash_nmi_callback at ffffffff81029956&lt;br/&gt;
    /usr/src/debug/kernel-lustre215-2.6.32-279.19.1.el6/linux-2.6.32-279.19.1.el6.x86_64/arch/x86/include/asm/paravirt.h: 115&lt;br/&gt;
 #1 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8820b0c07ea0&amp;#93;&lt;/span&gt; notifier_call_chain at ffffffff815222f5&lt;br/&gt;
    /usr/src/debug/kernel-lustre215-2.6.32-279.19.1.el6/linux-2.6.32-279.19.1.el6.x86_64/kernel/notifier.c: 95&lt;br/&gt;
 #2 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8820b0c07ee0&amp;#93;&lt;/span&gt; atomic_notifier_call_chain at ffffffff8152235a&lt;br/&gt;
    /usr/src/debug/kernel-lustre215-2.6.32-279.19.1.el6/linux-2.6.32-279.19.1.el6.x86_64/kernel/notifier.c: 192&lt;br/&gt;
 #3 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8820b0c07ef0&amp;#93;&lt;/span&gt; notify_die at ffffffff81095f1e&lt;br/&gt;
    /usr/src/debug/kernel-lustre215-2.6.32-279.19.1.el6/linux-2.6.32-279.19.1.el6.x86_64/kernel/notifier.c: 573&lt;br/&gt;
 #4 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8820b0c07f20&amp;#93;&lt;/span&gt; do_nmi at ffffffff8151ff2f&lt;br/&gt;
    /usr/src/debug/kernel-lustre215-2.6.32-279.19.1.el6/linux-2.6.32-279.19.1.el6.x86_64/arch/x86/kernel/traps.c: 513&lt;br/&gt;
 #5 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8820b0c07f50&amp;#93;&lt;/span&gt; nmi at ffffffff8151f7e0&lt;br/&gt;
    /usr/src/debug/kernel-lustre215-2.6.32-279.19.1.el6/linux-2.6.32-279.19.1.el6.x86_64/arch/x86_64/kernel/entry.S&lt;br/&gt;
    &lt;span class=&quot;error&quot;&gt;&amp;#91;exception RIP: _spin_lock+33&amp;#93;&lt;/span&gt;&lt;br/&gt;
    RIP: ffffffff8151f051  RSP: ffff881fac2b3bf0  RFLAGS: 00000297&lt;br/&gt;
    RAX: 0000000000000d28  RBX: 0000000000000004  RCX: 0000000000000000&lt;br/&gt;
    RDX: 0000000000000d27  RSI: 0000000000000007  RDI: ffffffffa05ada40&lt;br/&gt;
    RBP: ffff881fac2b3bf0   R8: 0000000000000246   R9: 0000000000000001&lt;br/&gt;
    R10: ffff883343e44cc0  R11: 0000000000000000  R12: 000500000a971b57&lt;br/&gt;
    R13: 0000000000003039  R14: 0000000000000000  R15: 000527015f9078c7&lt;br/&gt;
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018&lt;br/&gt;
&amp;#8212; &amp;lt;NMI exception stack&amp;gt; &amp;#8212;&lt;br/&gt;
 #6 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff881fac2b3bf0&amp;#93;&lt;/span&gt; _spin_lock at ffffffff8151f051&lt;br/&gt;
    /usr/src/debug/kernel-lustre215-2.6.32-279.19.1.el6/linux-2.6.32-279.19.1.el6.x86_64/arch/x86/include/asm/spinlock.h: 127&lt;br/&gt;
 #7 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff881fac2b3bf8&amp;#93;&lt;/span&gt; LNetMEAttach at ffffffffa057f99f &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;br/&gt;
    /usr/src/redhat/BUILD/lustre-2.1.5/lnet/lnet/lib-me.c: 158&lt;br/&gt;
 #8 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff881fac2b3c68&amp;#93;&lt;/span&gt; ptl_send_rpc at ffffffffa078b532 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
    /usr/src/redhat/BUILD/lustre-2.1.5/lustre/ptlrpc/niobuf.c: 614&lt;br/&gt;
 #9 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff881fac2b3d28&amp;#93;&lt;/span&gt; ptlrpc_send_new_req at ffffffffa0781993 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
    /usr/src/redhat/BUILD/lustre-2.1.5/lustre/ptlrpc/client.c: 1437&lt;br/&gt;
#10 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff881fac2b3d98&amp;#93;&lt;/span&gt; ptlrpc_check_set at ffffffffa07840a8 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
    /usr/src/redhat/BUILD/lustre-2.1.5/lustre/ptlrpc/client.c: 1468&lt;br/&gt;
#11 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff881fac2b3e38&amp;#93;&lt;/span&gt; ptlrpcd_check at ffffffffa07b4ed0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
    /usr/src/redhat/BUILD/lustre-2.1.5/lustre/ptlrpc/ptlrpcd.c: 240&lt;br/&gt;
#12 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff881fac2b3e68&amp;#93;&lt;/span&gt; ptlrpcd at ffffffffa07b516e &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
    /usr/src/redhat/BUILD/lustre-2.1.5/lustre/ptlrpc/ptlrpcd.c: 326&lt;br/&gt;
#13 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff881fac2b3f48&amp;#93;&lt;/span&gt; kernel_thread at ffffffff8100c0ca&lt;br/&gt;
    /usr/src/debug////////kernel-lustre215-2.6.32-279.19.1.el6/linux-2.6.32-279.19.1.el6.x86_64/arch/x86/kernel/entry_64.S: 1213&lt;/p&gt;</comment>
                            <comment id="71069" author="niu" created="Fri, 8 Nov 2013 01:52:51 +0000"  >&lt;p&gt;Could you provide the full stack trace (for all tasks on all CPUs) as well? What about the memory usage? Can this situation be recovered?&lt;/p&gt;</comment>
                            <comment id="71113" author="laisiyao" created="Fri, 8 Nov 2013 09:48:31 +0000"  >&lt;p&gt;In debug logs there are lots of messages like this:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000400:02000400:2.0:1383201992.510338:0:7620:0:(lib-move.c:1454:lnet_send()) No route to 12345-10.153.1.199@o2ib233 via 10.151.27.60@o2ib (all routers down)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;It looks like routers are down, and all connections are down too, and clients kept reconnecting to MDS (but failed), so ptlrpcd are always 100% busy.&lt;/p&gt;</comment>
                            <comment id="71131" author="mhanafi" created="Fri, 8 Nov 2013 16:21:05 +0000"  >&lt;p&gt;We were not able to recover from this and had to dump the system.&lt;br/&gt;
See attached files for memory and stack trace info&lt;/p&gt;

&lt;p&gt;We have 7 other filesystems that use the same routers and clients and they were not experiencing this issue. &lt;/p&gt;

&lt;p&gt;There is &quot;some&quot; evidence that this may have been triggered by running a &quot;lfs setquota&quot; command.&lt;/p&gt;
</comment>
                            <comment id="71219" author="niu" created="Mon, 11 Nov 2013 04:58:46 +0000"  >&lt;p&gt;I agree with Lai, the log shows that no router for o2ib to o2ib233, and the stack trace shows all ptlrpcd threads were busy on acquiring LNET_LOCK (which I think is caused by the router problem).&lt;/p&gt;

&lt;p&gt;Mahmoud, I didn&apos;t see anything related to quota in the log, what kind of evidence indicating it&apos;s triggered by &apos;lfs setquota&apos; command?&lt;/p&gt;</comment>
                            <comment id="71484" author="jlevi" created="Wed, 13 Nov 2013 23:28:18 +0000"  >&lt;p&gt;Amir,&lt;br/&gt;
Could you take a look at this one?&lt;br/&gt;
Thank you!&lt;/p&gt;</comment>
                            <comment id="71599" author="ashehata" created="Thu, 14 Nov 2013 23:49:57 +0000"  >&lt;p&gt;Would it be possible to grab the route configuration for one of the nodes that have the problem?  Also if you could please highlight the problematic routes.&lt;/p&gt;

&lt;p&gt;As basic sanity check, please make sure that routers are actually configured as routers, IE: forwarding=&quot;enabled&quot;.  If not, then the node will drop all messages not destined to itself.&lt;/p&gt;

&lt;p&gt;This error message is hit whenever LNET tries to send a message to a final destination that exists on a net that does not have a route from the net the current node is on.  Thus an appropriate route is chosen.&lt;/p&gt;

&lt;p&gt;Furthermore, the routes can exist but they might be down because the NID specified is not reachable.  Is it possible to try and do an &lt;b&gt;lctl ping &amp;lt;nid&amp;gt;&lt;/b&gt; from the node that&apos;s having the problem?  That should return a list of NIDs of the target router and their status (up/down).  If one of these nids is down and &lt;b&gt;avoid_asym_router_failure&lt;/b&gt; is 1 (which it is by default), then the entire router is considered down and when sending messages we&apos;ll hit the above error.&lt;/p&gt;

&lt;p&gt;NOTE: if you have a router that has multiple nids, but one of the nids is &quot;unused&quot; (IE, it sends/receives no messages), this would cause that NID to be considered down, and will lead to the above described scenario.&lt;/p&gt;</comment>
                            <comment id="79077" author="jfc" created="Wed, 12 Mar 2014 00:56:28 +0000"  >&lt;p&gt;Hello Mahmoud,&lt;br/&gt;
Just want to check in to see if this is still a &apos;live&apos; issue for you?&lt;br/&gt;
If the problem has been dealt with, can we mark this issue as &apos;resolved&apos;?&lt;br/&gt;
Thanks,&lt;br/&gt;
~ jfc.&lt;/p&gt;</comment>
                            <comment id="90347" author="jfc" created="Tue, 29 Jul 2014 18:17:30 +0000"  >&lt;p&gt;Hello again Mahmoud,&lt;br/&gt;
Please advise if you want us to keep this ticket open?&lt;/p&gt;

&lt;p&gt;If we don&apos;t hear back we&apos;ll mark it as resolved, with no fix, and we can re-open it if requested to do that.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
~ jfc.&lt;/p&gt;</comment>
                            <comment id="97847" author="mhanafi" created="Wed, 29 Oct 2014 16:25:48 +0000"  >&lt;p&gt;please close.&lt;/p&gt;</comment>
                            <comment id="97853" author="pjones" created="Wed, 29 Oct 2014 16:30:59 +0000"  >&lt;p&gt;ok thanks Mahmoud&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="13810" name="bt.LU4195" size="1126881" author="mhanafi" created="Fri, 8 Nov 2013 16:22:01 +0000"/>
                            <attachment id="13811" name="meminfo.LU4195" size="33832" author="mhanafi" created="Fri, 8 Nov 2013 16:22:01 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10040" key="com.atlassian.jira.plugin.system.customfieldtypes:labels">
                        <customfieldname>Epic</customfieldname>
                        <customfieldvalues>
                                        <label>hang</label>
            <label>server</label>
    
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzw7jr:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>11369</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>