<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:35:16 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-3596] deadlock in kiblnd</title>
                <link>https://jira.whamcloud.com/browse/LU-3596</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;We had a few OSSes crash at NOAA recently due to an NMI deadlock detected error. I was able to get a vmcore from one and analyze it, and it looks like we are hitting &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-78&quot; title=&quot;kiblnd_check_conns can deadlock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-78&quot;&gt;&lt;del&gt;LU-78&lt;/del&gt;&lt;/a&gt;:&lt;br/&gt;
crash&amp;gt; bt&lt;br/&gt;
PID: 15515  TASK: ffff810c39c237e0  CPU: 0   COMMAND: &quot;kiblnd_connd&quot;&lt;br/&gt;
 #0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffffffff804b8dc0&amp;#93;&lt;/span&gt; crash_kexec at ffffffff800b1192&lt;br/&gt;
 #1 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffffffff804b8e80&amp;#93;&lt;/span&gt; die_nmi at ffffffff80065285&lt;br/&gt;
 #2 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffffffff804b8ea0&amp;#93;&lt;/span&gt; nmi_watchdog_tick at ffffffff80065a66&lt;br/&gt;
 #3 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffffffff804b8ef0&amp;#93;&lt;/span&gt; default_do_nmi at ffffffff80065609&lt;br/&gt;
 #4 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffffffff804b8f40&amp;#93;&lt;/span&gt; do_nmi at ffffffff800658f1&lt;br/&gt;
 #5 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffffffff804b8f50&amp;#93;&lt;/span&gt; nmi at ffffffff80064ecf&lt;br/&gt;
    &lt;span class=&quot;error&quot;&gt;&amp;#91;exception RIP: __write_lock_failed+15&amp;#93;&lt;/span&gt;&lt;br/&gt;
    RIP: ffffffff80062197  RSP: ffff81061978dc90  RFLAGS: 00000087&lt;br/&gt;
    RAX: ffffc20000000000  RBX: ffff8107eb940140  RCX: 0000000000000001&lt;br/&gt;
    RDX: 0000000000006000  RSI: 0000000000000003  RDI: ffffffff8032d42c&lt;br/&gt;
    RBP: ffffc20000000000   R8: 0000000000000000   R9: ffff810c39c237e0&lt;br/&gt;
    R10: ffff8105645d2000  R11: 0000000000002000  R12: ffffffffffffffff&lt;br/&gt;
    R13: 0000000000007000  R14: 0000000000000001  R15: 0000000000000002&lt;br/&gt;
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018&lt;br/&gt;
&amp;#8212; &amp;lt;NMI exception stack&amp;gt; &amp;#8212;&lt;br/&gt;
 #6 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff81061978dc90&amp;#93;&lt;/span&gt; __write_lock_failed at ffffffff80062197&lt;br/&gt;
 #7 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff81061978dc90&amp;#93;&lt;/span&gt; _write_lock at ffffffff80064a7d&lt;br/&gt;
 #8 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff81061978dc98&amp;#93;&lt;/span&gt; __get_vm_area_node at ffffffff800d51c1&lt;br/&gt;
 #9 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff81061978dcd8&amp;#93;&lt;/span&gt; __vmalloc_node at ffffffff800d5952&lt;br/&gt;
#10 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff81061978dcf8&amp;#93;&lt;/span&gt; kiblnd_create_tx_pool at ffffffff8b127e0e&lt;br/&gt;
#11 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff81061978dd68&amp;#93;&lt;/span&gt; kiblnd_pool_alloc_node at ffffffff8b1247a9&lt;br/&gt;
#12 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff81061978ddc8&amp;#93;&lt;/span&gt; kiblnd_get_idle_tx at ffffffff8b12d8d0&lt;br/&gt;
#13 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff81061978ddd8&amp;#93;&lt;/span&gt; kiblnd_check_sends at ffffffff8b12e9b7&lt;br/&gt;
#14 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff81061978dde8&amp;#93;&lt;/span&gt; kiblnd_check_txs at ffffffff8b12c22c&lt;br/&gt;
#15 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff81061978de48&amp;#93;&lt;/span&gt; kiblnd_check_conns at ffffffff8b12ea68&lt;br/&gt;
#16 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff81061978dea8&amp;#93;&lt;/span&gt; kiblnd_connd at ffffffff8b136063&lt;br/&gt;
#17 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff81061978df48&amp;#93;&lt;/span&gt; kernel_thread at ffffffff8005dfc1&lt;/p&gt;

&lt;p&gt;It appears that &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-78&quot; title=&quot;kiblnd_check_conns can deadlock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-78&quot;&gt;&lt;del&gt;LU-78&lt;/del&gt;&lt;/a&gt; was identified for 1.8.x, but never landed on it. If this crash is related to that bug, would it be possible to get an updated patch for potential inclusion on the next 1.8?&lt;/p&gt;

&lt;p&gt;Thanks.&lt;/p&gt;</description>
                <environment></environment>
        <key id="19839">LU-3596</key>
            <summary>deadlock in kiblnd</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="bfaccini">Bruno Faccini</assignee>
                                    <reporter username="orentas">Oz Rentas</reporter>
                        <labels>
                    </labels>
                <created>Tue, 16 Jul 2013 15:32:39 +0000</created>
                <updated>Fri, 1 Sep 2017 14:57:59 +0000</updated>
                            <resolved>Fri, 1 Sep 2017 14:57:59 +0000</resolved>
                                    <version>Lustre 1.8.9</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="62395" author="bfaccini" created="Tue, 16 Jul 2013 17:31:39 +0000"  >&lt;p&gt;Hello Kit,&lt;br/&gt;
I take ownership of this ticket, since it looks like a new occurence/dup of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-78&quot; title=&quot;kiblnd_check_conns can deadlock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-78&quot;&gt;&lt;del&gt;LU-78&lt;/del&gt;&lt;/a&gt; where I have been somewhat involved with !!...&lt;br/&gt;
Since you have a crash-dump for this dead-lock, and just to double-check if it is still the same problem, can you check if there is a thread beeing runnable but not scheduled since a long time and get its stack-trace for me ?&lt;/p&gt;</comment>
                            <comment id="62406" author="kitwestneat" created="Tue, 16 Jul 2013 18:36:42 +0000"  >&lt;p&gt;Hi Bruno,&lt;/p&gt;

&lt;p&gt;All the last_rans from the runnables seem similar:&lt;br/&gt;
  last_ran = 2233784280351367, &lt;br/&gt;
  last_ran = 2233784307286260, &lt;br/&gt;
  last_ran = 2233784366546510, &lt;br/&gt;
  last_ran = 2233784494221153, &lt;br/&gt;
  last_ran = 2233784499282262, &lt;br/&gt;
  last_ran = 2233784500452337, &lt;br/&gt;
  last_ran = 2233784508413095, &lt;br/&gt;
  last_ran = 2233784520565286, &lt;br/&gt;
  last_ran = 2233784548748409, &lt;br/&gt;
  last_ran = 2233784548788509, &lt;br/&gt;
  last_ran = 2233784588049211, &lt;br/&gt;
  last_ran = 2233795686537779, &lt;br/&gt;
  last_ran = 2233796503806272, &lt;br/&gt;
  last_ran = 2233796699153227, &lt;br/&gt;
  last_ran = 2233805294134249, &lt;br/&gt;
  last_ran = 2233805294247424, &lt;br/&gt;
  last_ran = 2233805294287834, &lt;/p&gt;

&lt;p&gt;Is that the correct field in task to be looking at?&lt;/p&gt;

&lt;p&gt;This is the bt for the oldest last_ran:&lt;br/&gt;
crash&amp;gt; bt 28352&lt;br/&gt;
PID: 28352  TASK: ffff810bd8f01860  CPU: 0   COMMAND: &quot;ll_ost_137&quot;&lt;br/&gt;
 #0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff810bb7ebbd50&amp;#93;&lt;/span&gt; schedule at ffffffff80062fa0&lt;br/&gt;
 #1 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff810bb7ebbe28&amp;#93;&lt;/span&gt; ptlrpc_wait_event at ffffffff8b0afee5&lt;br/&gt;
 #2 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff810bb7ebbeb8&amp;#93;&lt;/span&gt; ptlrpc_main at ffffffff8b0b03eb&lt;br/&gt;
 #3 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff810bb7ebbf48&amp;#93;&lt;/span&gt; kernel_thread at ffffffff8005dfc1&lt;/p&gt;

&lt;p&gt;I looked through the runnable kiblnd_sd_* tasks (there were only 2), and saw this, which looks like &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-78&quot; title=&quot;kiblnd_check_conns can deadlock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-78&quot;&gt;&lt;del&gt;LU-78&lt;/del&gt;&lt;/a&gt;:&lt;br/&gt;
crash&amp;gt; bt 15504&lt;br/&gt;
PID: 15504  TASK: ffff810c390da0c0  CPU: 17  COMMAND: &quot;kiblnd_sd_13&quot;&lt;br/&gt;
 #0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff810c3fd3af20&amp;#93;&lt;/span&gt; crash_nmi_callback at ffffffff8007c396&lt;br/&gt;
 #1 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff810c3fd3af40&amp;#93;&lt;/span&gt; do_nmi at ffffffff800658e5&lt;br/&gt;
 #2 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff810c3fd3af50&amp;#93;&lt;/span&gt; nmi at ffffffff80064ecf&lt;br/&gt;
    &lt;span class=&quot;error&quot;&gt;&amp;#91;exception RIP: __write_lock_failed+9&amp;#93;&lt;/span&gt;&lt;br/&gt;
    RIP: ffffffff80062191  RSP: ffff810619ad7db0  RFLAGS: 00000087&lt;br/&gt;
    RAX: 0000000000000286  RBX: ffff8105db4da0c0  RCX: 0000000000000100&lt;br/&gt;
    RDX: 000000000000029a  RSI: 00000000fffffffb  RDI: ffffffff8b14d39c&lt;br/&gt;
    RBP: ffff8105db4da0c0   R8: 0000000000000001   R9: 00000000ffffffff&lt;br/&gt;
    R10: 0000000000000000  R11: 000000000000000a  R12: 00000000fffffffb&lt;br/&gt;
    R13: 0000000000000001  R14: ffffc20010d5fd10  R15: 000000000000000c&lt;br/&gt;
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018&lt;br/&gt;
&amp;#8212; &amp;lt;NMI exception stack&amp;gt; &amp;#8212;&lt;br/&gt;
 #3 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff810619ad7db0&amp;#93;&lt;/span&gt; __write_lock_failed at ffffffff80062191&lt;br/&gt;
 #4 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff810619ad7db0&amp;#93;&lt;/span&gt; _write_lock_irqsave at ffffffff80064b83&lt;br/&gt;
 #5 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff810619ad7db8&amp;#93;&lt;/span&gt; kiblnd_close_conn at ffffffff8b12c844&lt;br/&gt;
 #6 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff810619ad7dd8&amp;#93;&lt;/span&gt; kiblnd_tx_complete at ffffffff8b12f20d&lt;br/&gt;
 #7 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff810619ad7e58&amp;#93;&lt;/span&gt; kiblnd_scheduler at ffffffff8b136769&lt;br/&gt;
 #8 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff810619ad7f48&amp;#93;&lt;/span&gt; kernel_thread at ffffffff8005dfc1&lt;/p&gt;

&lt;p&gt;Here is the only other runnable IB related thread:&lt;br/&gt;
PID: 9331   TASK: ffff81063f2707a0  CPU: 12  COMMAND: &quot;ipoib&quot;&lt;br/&gt;
 #0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff810c3fc43f20&amp;#93;&lt;/span&gt; crash_nmi_callback at ffffffff8007c396&lt;br/&gt;
 #1 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff810c3fc43f40&amp;#93;&lt;/span&gt; do_nmi at ffffffff800658e5&lt;br/&gt;
 #2 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff810c3fc43f50&amp;#93;&lt;/span&gt; nmi at ffffffff80064ecf&lt;br/&gt;
    &lt;span class=&quot;error&quot;&gt;&amp;#91;exception RIP: __smp_call_function_many+150&amp;#93;&lt;/span&gt;&lt;br/&gt;
    RIP: ffffffff80077925  RSP: ffff810c3dde7c30  RFLAGS: 00000297&lt;br/&gt;
    RAX: 0000000000000016  RBX: ffff810c3dde7cb0  RCX: 0000000000000000&lt;br/&gt;
    RDX: 00000000000000ff  RSI: 00000000000000ff  RDI: 00000000000000c0&lt;br/&gt;
    RBP: 0000000000000017   R8: 0000000000000018   R9: 0000000000000028&lt;br/&gt;
    R10: ffff810c3dde7bd0  R11: ffff8106293e7600  R12: 0000000000000001&lt;br/&gt;
    R13: 0000000000000000  R14: ffffffff800777f9  R15: ffff810c3dde7cb0&lt;br/&gt;
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018&lt;br/&gt;
&amp;#8212; &amp;lt;NMI exception stack&amp;gt; &amp;#8212;&lt;br/&gt;
 #3 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff810c3dde7c30&amp;#93;&lt;/span&gt; __smp_call_function_many at ffffffff80077925&lt;br/&gt;
 #4 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff810c3dde7c78&amp;#93;&lt;/span&gt; smp_call_function_many at ffffffff80077a27&lt;br/&gt;
 #5 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff810c3dde7ca8&amp;#93;&lt;/span&gt; smp_call_function at ffffffff80077b18&lt;br/&gt;
 #6 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff810c3dde7cf8&amp;#93;&lt;/span&gt; on_each_cpu at ffffffff80096bcd&lt;br/&gt;
 #7 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff810c3dde7d18&amp;#93;&lt;/span&gt; __remove_vm_area at ffffffff800d55f8&lt;br/&gt;
 #8 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff810c3dde7d38&amp;#93;&lt;/span&gt; remove_vm_area at ffffffff800d5627&lt;br/&gt;
 #9 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff810c3dde7d48&amp;#93;&lt;/span&gt; __vunmap at ffffffff800d5673&lt;br/&gt;
#10 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff810c3dde7d68&amp;#93;&lt;/span&gt; mlx4_buf_free at ffffffff884f8352&lt;br/&gt;
#11 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff810c3dde7d88&amp;#93;&lt;/span&gt; mlx4_ib_destroy_qp at ffffffff88542219&lt;br/&gt;
#12 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff810c3dde7dc8&amp;#93;&lt;/span&gt; ib_destroy_qp at ffffffff884a1bb0&lt;br/&gt;
#13 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff810c3dde7df8&amp;#93;&lt;/span&gt; ipoib_cm_tx_reap at ffffffff88741999&lt;br/&gt;
#14 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff810c3dde7e38&amp;#93;&lt;/span&gt; run_workqueue at ffffffff8004d8ac&lt;br/&gt;
#15 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff810c3dde7e78&amp;#93;&lt;/span&gt; worker_thread at ffffffff8004a1e5&lt;br/&gt;
#16 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff810c3dde7ee8&amp;#93;&lt;/span&gt; kthread at ffffffff80032c45&lt;br/&gt;
#17 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff810c3dde7f48&amp;#93;&lt;/span&gt; kernel_thread at ffffffff8005dfc1&lt;/p&gt;

&lt;p&gt;Thanks.&lt;/p&gt;</comment>
                            <comment id="62467" author="bfaccini" created="Wed, 17 Jul 2013 09:20:38 +0000"  >&lt;p&gt;Hello Cory,&lt;br/&gt;
I see only 17 runnable (running?) threads, how many core is your OSS ? I presume 16, and all are in NMI context with the 17th being pid 28352?&lt;br/&gt;
Can you check if you get the same order/threads with &quot;ps -l&quot; output in crash ? &lt;br/&gt;
Also can you provide the &quot;dmesg/log&quot;&lt;ins&gt;&quot;ps -l&quot;&lt;/ins&gt;&quot;foreach bt&quot; outputs from crash too ? Or may be the crash-dump itself if site policy is ok ?&lt;/p&gt;

</comment>
                            <comment id="62487" author="kitwestneat" created="Wed, 17 Jul 2013 14:57:41 +0000"  >&lt;p&gt;Hi Bruno, &lt;/p&gt;

&lt;p&gt;There are actually 24 CPUs. I may have misinterpreted running vs runnable. I looked for all the tasks with RU in ps, but is that running? I&apos;ve attached the requested output. &lt;/p&gt;

&lt;p&gt;The vmcore is 2GB, is there a server I could put it on? &lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
Kit&lt;/p&gt;</comment>
                            <comment id="62497" author="bfaccini" created="Wed, 17 Jul 2013 17:32:08 +0000"  >&lt;p&gt;RU in ps stands for RUnnable and not only running. That was my wish to get all runnable vs only the running ones. &lt;br/&gt;
Humm, but if there are 24 cores you should see at least 24 RUs including some swapper/idle threads. Thus, I definitely need the 3 crash sub-commands outputs I earlier requested, you said you attached them but I don&apos;t see them ? &lt;br/&gt;
Concerning the crash-dump upload I have sent you the instructions privately.&lt;/p&gt;</comment>
                            <comment id="62532" author="bfaccini" created="Thu, 18 Jul 2013 09:13:06 +0000"  >&lt;p&gt;Having a look to crash-dump extracts you posted in lu-3596.out file, I am less and less convinced it is a dup of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-78&quot; title=&quot;kiblnd_check_conns can deadlock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-78&quot;&gt;&lt;del&gt;LU-78&lt;/del&gt;&lt;/a&gt;. This is because in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-78&quot; title=&quot;kiblnd_check_conns can deadlock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-78&quot;&gt;&lt;del&gt;LU-78&lt;/del&gt;&lt;/a&gt; dead-lock occurred due to lock owner re-schedule and lock competitor elected on CPU, but here lock owner is stuck waiting for a TLB-flush IPI broadcast completion ...&lt;/p&gt;

&lt;p&gt;Also, is the OSS running wit /proc/sys/kernel/nmi_watchdog set ?&lt;/p&gt;</comment>
                            <comment id="62784" author="bfaccini" created="Tue, 23 Jul 2013 10:38:34 +0000"  >&lt;p&gt;Ok, from the crash-dump, scenario is different than for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-78&quot; title=&quot;kiblnd_check_conns can deadlock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-78&quot;&gt;&lt;del&gt;LU-78&lt;/del&gt;&lt;/a&gt;, there are 3 threads involved and all 3 spinning on their CPUs with the following stack-traces at the time of the NMI/watchdog :&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;PID: 9331   TASK: ffff81063f2707a0  CPU: 12  COMMAND: &quot;ipoib&quot;
 #0 [ffff810c3fc43f20] crash_nmi_callback at ffffffff8007c396
 #1 [ffff810c3fc43f40] do_nmi at ffffffff800658e5
 #2 [ffff810c3fc43f50] nmi at ffffffff80064ecf
    [exception RIP: __smp_call_function_many+150]
    RIP: ffffffff80077925  RSP: ffff810c3dde7c30  RFLAGS: 00000297
    RAX: 0000000000000016  RBX: ffff810c3dde7cb0  RCX: 0000000000000000
    RDX: 00000000000000ff  RSI: 00000000000000ff  RDI: 00000000000000c0
    RBP: 0000000000000017   R8: 0000000000000018   R9: 0000000000000028
    R10: ffff810c3dde7bd0  R11: ffff8106293e7600  R12: 0000000000000001
    R13: 0000000000000000  R14: ffffffff800777f9  R15: ffff810c3dde7cb0
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
--- &amp;lt;NMI exception stack&amp;gt; ---
 #3 [ffff810c3dde7c30] __smp_call_function_many at ffffffff80077925
 #4 [ffff810c3dde7c78] smp_call_function_many at ffffffff80077a27
 #5 [ffff810c3dde7ca8] smp_call_function at ffffffff80077b18
 #6 [ffff810c3dde7cf8] on_each_cpu at ffffffff80096bcd
 #7 [ffff810c3dde7d18] __remove_vm_area at ffffffff800d55f8
 #8 [ffff810c3dde7d38] remove_vm_area at ffffffff800d5627
 #9 [ffff810c3dde7d48] __vunmap at ffffffff800d5673
#10 [ffff810c3dde7d68] mlx4_buf_free at ffffffff884f8352 [mlx4_core]
#11 [ffff810c3dde7d88] mlx4_ib_destroy_qp at ffffffff88542219 [mlx4_ib]
#12 [ffff810c3dde7dc8] ib_destroy_qp at ffffffff884a1bb0 [ib_core]
#13 [ffff810c3dde7df8] ipoib_cm_tx_reap at ffffffff88741999 [ib_ipoib]
#14 [ffff810c3dde7e38] run_workqueue at ffffffff8004d8ac
#15 [ffff810c3dde7e78] worker_thread at ffffffff8004a1e5
#16 [ffff810c3dde7ee8] kthread at ffffffff80032c45
#17 [ffff810c3dde7f48] kernel_thread at ffffffff8005dfc1

PID: 15504  TASK: ffff810c390da0c0  CPU: 17  COMMAND: &quot;kiblnd_sd_13&quot;
 #0 [ffff810c3fd3af20] crash_nmi_callback at ffffffff8007c396
 #1 [ffff810c3fd3af40] do_nmi at ffffffff800658e5
 #2 [ffff810c3fd3af50] nmi at ffffffff80064ecf
    [exception RIP: __write_lock_failed+9]
    RIP: ffffffff80062191  RSP: ffff810619ad7db0  RFLAGS: 00000087
    RAX: 0000000000000286  RBX: ffff8105db4da0c0  RCX: 0000000000000100
    RDX: 000000000000029a  RSI: 00000000fffffffb  RDI: ffffffff8b14d39c
    RBP: ffff8105db4da0c0   R8: 0000000000000001   R9: 00000000ffffffff
    R10: 0000000000000000  R11: 000000000000000a  R12: 00000000fffffffb
    R13: 0000000000000001  R14: ffffc20010d5fd10  R15: 000000000000000c
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
--- &amp;lt;NMI exception stack&amp;gt; ---
 #3 [ffff810619ad7db0] __write_lock_failed at ffffffff80062191
 #4 [ffff810619ad7db0] _write_lock_irqsave at ffffffff80064b83
 #5 [ffff810619ad7db8] kiblnd_close_conn at ffffffff8b12c844 [ko2iblnd]
 #6 [ffff810619ad7dd8] kiblnd_tx_complete at ffffffff8b12f20d [ko2iblnd]
 #7 [ffff810619ad7e58] kiblnd_scheduler at ffffffff8b136769 [ko2iblnd]
 #8 [ffff810619ad7f48] kernel_thread at ffffffff8005dfc1

PID: 15515  TASK: ffff810c39c237e0  CPU: 0   COMMAND: &quot;kiblnd_connd&quot;
 #0 [ffffffff804b8dc0] crash_kexec at ffffffff800b1192
 #1 [ffffffff804b8e80] die_nmi at ffffffff80065285
 #2 [ffffffff804b8ea0] nmi_watchdog_tick at ffffffff80065a66
 #3 [ffffffff804b8ef0] default_do_nmi at ffffffff80065609
 #4 [ffffffff804b8f40] do_nmi at ffffffff800658f1
 #5 [ffffffff804b8f50] nmi at ffffffff80064ecf
    [exception RIP: __write_lock_failed+15]
    RIP: ffffffff80062197  RSP: ffff81061978dc90  RFLAGS: 00000087
    RAX: ffffc20000000000  RBX: ffff8107eb940140  RCX: 0000000000000001
    RDX: 0000000000006000  RSI: 0000000000000003  RDI: ffffffff8032d42c
    RBP: ffffc20000000000   R8: 0000000000000000   R9: ffff810c39c237e0
    R10: ffff8105645d2000  R11: 0000000000002000  R12: ffffffffffffffff
    R13: 0000000000007000  R14: 0000000000000001  R15: 0000000000000002
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
--- &amp;lt;NMI exception stack&amp;gt; ---
 #6 [ffff81061978dc90] __write_lock_failed at ffffffff80062197
 #7 [ffff81061978dc90] _write_lock at ffffffff80064a7d
 #8 [ffff81061978dc98] __get_vm_area_node at ffffffff800d51c1
 #9 [ffff81061978dcd8] __vmalloc_node at ffffffff800d5952
#10 [ffff81061978dcf8] kiblnd_create_tx_pool at ffffffff8b127e0e [ko2iblnd]
#11 [ffff81061978dd68] kiblnd_pool_alloc_node at ffffffff8b1247a9 [ko2iblnd]
#12 [ffff81061978ddc8] kiblnd_get_idle_tx at ffffffff8b12d8d0 [ko2iblnd]
#13 [ffff81061978ddd8] kiblnd_check_sends at ffffffff8b12e9b7 [ko2iblnd]
#14 [ffff81061978dde8] kiblnd_check_txs at ffffffff8b12c22c [ko2iblnd]
#15 [ffff81061978de48] kiblnd_check_conns at ffffffff8b12ea68 [ko2iblnd]
#16 [ffff81061978dea8] kiblnd_connd at ffffffff8b136063 [ko2iblnd]
#17 [ffff81061978df48] kernel_thread at ffffffff8005dfc1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;where :&lt;/p&gt;

&lt;p&gt;         _ 9331/&quot;ipoib&quot; thread owns vmlist_lock, during mlx4 driver buffer unmap, but then wants all others CPUs to flush their TLB and thus sent an IPI to each of them and it is now busy waiting for one reluctant peer ...&lt;/p&gt;

&lt;p&gt;         _ 15504/&quot;kiblnd_sd_13&quot; thread is spinning/waiting on kiblnd_data.kib_global_lock in kiblnd_close_conn() but with interrupts disabled, and thus is likely to be the one still missing the IPI and to flush its TLB !!&lt;/p&gt;

&lt;p&gt;         _ 15515/&quot;kiblnd_connd&quot; thread owns kiblnd_data.kib_global_lock since kiblnd_check_conns() but is now spinning/waiting for vmlist_lock due to kmem alloc in kiblnd_create_tx_pool().&lt;/p&gt;


&lt;p&gt;So 9331 blocks 15515 who blocks 15504 who blocks 9331 !!&lt;/p&gt;

&lt;p&gt;But even if it is a new scenario than the 2 already described in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-78&quot; title=&quot;kiblnd_check_conns can deadlock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-78&quot;&gt;&lt;del&gt;LU-78&lt;/del&gt;&lt;/a&gt;, I can now agree that it would also be fixed by the same patch since with it thread 15515 will not spin on vmlist_lock when already owning kiblnd_data.kib_global_lock !!&lt;/p&gt;

</comment>
                            <comment id="62845" author="kitwestneat" created="Tue, 23 Jul 2013 20:47:11 +0000"  >&lt;p&gt;Thanks for the analysis! I have ported the patch from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-78&quot; title=&quot;kiblnd_check_conns can deadlock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-78&quot;&gt;&lt;del&gt;LU-78&lt;/del&gt;&lt;/a&gt; to b1_8:&lt;br/&gt;
&lt;a href=&quot;http://review.whamcloud.com/7092&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/7092&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Does that look correct?&lt;/p&gt;</comment>
                            <comment id="62879" author="bfaccini" created="Wed, 24 Jul 2013 08:16:03 +0000"  >&lt;p&gt;Yes, patch/back-port looks ok. Need to wait Liang&apos;s feed-back to confirm. Also I had to re-trigger auto-tests due to un-related failure.&lt;/p&gt;</comment>
                            <comment id="207207" author="orentas" created="Fri, 1 Sep 2017 14:55:02 +0000"  >&lt;p&gt;This is resolved. Please close.&lt;/p&gt;</comment>
                            <comment id="207209" author="pjones" created="Fri, 1 Sep 2017 14:57:59 +0000"  >&lt;p&gt;Thanks Oz&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="13183" name="lu-3596.out" size="1140228" author="kitwestneat" created="Wed, 17 Jul 2013 18:25:12 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10490" key="com.atlassian.jira.plugin.system.customfieldtypes:datepicker">
                        <customfieldname>End date</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>Fri, 9 May 2014 15:32:39 +0000</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                            <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvvhb:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9116</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                        <customfield id="customfield_10493" key="com.atlassian.jira.plugin.system.customfieldtypes:datepicker">
                        <customfieldname>Start date</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>Tue, 16 Jul 2013 15:32:39 +0000</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                    </customfields>
    </item>
</channel>
</rss>