<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:15:31 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-1311] Debugging deadlock in lnet</title>
                <link>https://jira.whamcloud.com/browse/LU-1311</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;ORNL reported this lockup&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[  446.217881] Kernel panic - not syncing: Watchdog detected hard LOCKUP on cpu 2
[  446.217881] Pid: 9709, comm: kiblnd_sd_05 Not tainted 2.6.32-220.el6.wc.x86_64 #1
[  446.217881] Call Trace:
[  446.217881]  &amp;lt;NMI&amp;gt;  [&amp;lt;ffffffff814ec681&amp;gt;] ? panic+0x78/0x143
[  446.217881]  [&amp;lt;ffffffff810d8fad&amp;gt;] ? watchdog_overflow_callback+0xcd/0xd0
[  446.217881]  [&amp;lt;ffffffff8110a89d&amp;gt;] ? __perf_event_overflow+0x9d/0x230
[  446.217881]  [&amp;lt;ffffffff8110ae54&amp;gt;] ? perf_event_overflow+0x14/0x20
[  446.217881]  [&amp;lt;ffffffff8101e096&amp;gt;] ? intel_pmu_handle_irq+0x336/0x550
[  446.217881]  [&amp;lt;ffffffff814f2256&amp;gt;] ? kprobe_exceptions_notify+0x16/0x430
[  446.217881]  [&amp;lt;ffffffff814f0d39&amp;gt;] ? perf_event_nmi_handler+0x39/0xb0
[  446.217881]  [&amp;lt;ffffffff814f2885&amp;gt;] ? notifier_call_chain+0x55/0x80
[  446.217881]  [&amp;lt;ffffffff814f28ea&amp;gt;] ? atomic_notifier_call_chain+0x1a/0x20
[  446.217881]  [&amp;lt;ffffffff81096bce&amp;gt;] ? notify_die+0x2e/0x30
[  446.217881]  [&amp;lt;ffffffff814f0503&amp;gt;] ? do_nmi+0x173/0x2b0
[  446.217881]  [&amp;lt;ffffffff814efe10&amp;gt;] ? nmi+0x20/0x30
[  446.217881]  [&amp;lt;ffffffff814ef67c&amp;gt;] ? _spin_lock+0x1c/0x30
[  446.217881]  &amp;lt;&amp;lt;EOE&amp;gt;&amp;gt;  [&amp;lt;ffffffffa07473bf&amp;gt;] ? cfs_trace_lock_tcd+0x2f/0x90 [libcfs]
[  446.217881]  [&amp;lt;ffffffffa0750b0a&amp;gt;] ? libcfs_debug_vmsg2+0xda/0xb60 [libcfs]
[  446.217881]  [&amp;lt;ffffffffa07515d1&amp;gt;] ? libcfs_debug_msg+0x41/0x50 [libcfs]
[  446.217881]  [&amp;lt;ffffffffa07515d1&amp;gt;] ? libcfs_debug_msg+0x41/0x50 [libcfs]
[  446.217881]  [&amp;lt;ffffffffa0b5768d&amp;gt;] ? kiblnd_scheduler+0x16d/0x620 [ko2iblnd]
[  446.217881]  [&amp;lt;ffffffff8105fa50&amp;gt;] ? default_wake_function+0x0/0x20
[  446.217881]  [&amp;lt;ffffffffa0b57520&amp;gt;] ? kiblnd_scheduler+0x0/0x620 [ko2iblnd]
[  446.217881]  [&amp;lt;ffffffff8100c14a&amp;gt;] ? child_rip+0xa/0x20
[  446.217881]  [&amp;lt;ffffffffa0b57520&amp;gt;] ? kiblnd_scheduler+0x0/0x620 [ko2iblnd]
[  446.217881]  [&amp;lt;ffffffffa0b57520&amp;gt;] ? kiblnd_scheduler+0x0/0x620 [ko2iblnd]
[  446.217881]  [&amp;lt;ffffffff8100c140&amp;gt;] ? child_rip+0x0/0x20
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;the other thread is in irq context:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;PID: 9855   TASK: ffff880431898b40  CPU: 0   COMMAND: &lt;span class=&quot;code-quote&quot;&gt;&quot;ktracefiled&quot;&lt;/span&gt;
 #0 [ffff880028207e90] crash_nmi_callback at ffffffff81029716
 #1 [ffff880028207ea0] notifier_call_chain at ffffffff814f2885
 #2 [ffff880028207ee0] atomic_notifier_call_chain at ffffffff814f28ea
 #3 [ffff880028207ef0] notify_die at ffffffff81096bce
 #4 [ffff880028207f20] do_nmi at ffffffff814f0503
 #5 [ffff880028207f50] nmi at ffffffff814efe10
    [exception RIP: _spin_lock_irqsave+47]
    RIP: ffffffff814ef56f  RSP: ffff880028203de8  RFLAGS: 00000097
    RAX: 00000000000055c2  RBX: ffff88040df2b800  RCX: 00000000000055c1
    RDX: 0000000000000046  RSI: ffff88040df2b800  RDI: ffffffffa0b71c30
    RBP: ffff880028203de8   R8: ffff8803fbb04000   R9: ffff8803fbb05de0
    R10: ffff880028403a40  R11: 0000000000000000  R12: 0000000000000087
    R13: 0000000000000000  R14: ffff8804318da000  R15: ffff8803fbb05f00
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
--- &amp;lt;NMI exception stack&amp;gt; ---
 #6 [ffff880028203de8] _spin_lock_irqsave at ffffffff814ef56f
 #7 [ffff880028203df0] kiblnd_cq_completion at ffffffffa0b51548 [ko2iblnd]   &amp;lt;== &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; is line 3277 (cfs_spin_lock_irqsave(&amp;amp;kiblnd_data.kib_sched_lock, flags);) in o2iblnd_cb.c
 #8 [ffff880028203e10] mthca_cq_completion at ffffffffa019ca4a [ib_mthca]
 #9 [ffff880028203e30] mthca_eq_int at ffffffffa019c422 [ib_mthca]
#10 [ffff880028203eb0] mthca_arbel_msi_x_interrupt at ffffffffa019c694 [ib_mthca]
#11 [ffff880028203ed0] handle_IRQ_event at ffffffff810d94f0
#12 [ffff880028203f20] handle_edge_irq at ffffffff810dbc2e
#13 [ffff880028203f60] handle_irq at ffffffff8100df09
#14 [ffff880028203f80] do_IRQ at ffffffff814f504c
--- &amp;lt;IRQ stack&amp;gt; ---
#15 [ffff8803fbb05d38] ret_from_intr at ffffffff8100ba53
    [exception RIP: put_pages_on_tcd_daemon_list+255]
    RIP: ffffffffa075072f  RSP: ffff8803fbb05de0  RFLAGS: 00000246
    RAX: ffffea000c9a82a8  RBX: ffff8803fbb05e20  RCX: ffff880390407b88
    RDX: ffff88040d080128  RSI: ffff880390407b48  RDI: ffff880390407b68
    RBP: ffffffff8100ba4e   R8: ffff88040d080128   R9: 0000000000000000
    R10: ffff880028403a40  R11: 0000000000000000  R12: ffff8803fbb05d70
    R13: ffff8803fbb05f00  R14: ffff8803fbb05ef0  R15: 0000000000000286
    ORIG_RAX: ffffffffffffff46  CS: 0010  SS: 0018
#16 [ffff8803fbb05e28] put_pages_on_daemon_list at ffffffffa075094b [libcfs]  &amp;lt;== &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; is line 575 (cfs_tcd_for_each_type_lock) in tracefile.c
#17 [ffff8803fbb05e78] tracefiled at ffffffffa0751d4f [libcfs]
#18 [ffff8803fbb05f48] kernel_thread at ffffffff8100c14a
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="13968">LU-1311</key>
            <summary>Debugging deadlock in lnet</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="green">Oleg Drokin</assignee>
                                    <reporter username="green">Oleg Drokin</reporter>
                        <labels>
                    </labels>
                <created>Thu, 12 Apr 2012 11:12:55 +0000</created>
                <updated>Thu, 31 May 2012 16:36:36 +0000</updated>
                            <resolved>Thu, 31 May 2012 16:36:36 +0000</resolved>
                                    <version>Lustre 2.2.0</version>
                    <version>Lustre 2.3.0</version>
                                    <fixVersion>Lustre 2.3.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>3</watches>
                                                                            <comments>
                            <comment id="34608" author="green" created="Thu, 12 Apr 2012 11:17:03 +0000"  >&lt;p&gt;tracefiled thread: lock some random tcd (nonirq type)&lt;br/&gt;
ibscheduler thread: get kiblnd_data.kib_sched_lock&lt;br/&gt;
ibscheduler thread: try to print debug message, need the lock in the tcd that tracefiled holds&lt;br/&gt;
tracefiled: interrupt hits, needs the kiblnd_data.kib_sched_lock == deadlock&lt;/p&gt;</comment>
                            <comment id="34711" author="liang" created="Fri, 13 Apr 2012 10:25:12 +0000"  >&lt;p&gt;Oleg, according to your analysis, any call of CDEBUG between spin_lock/unlock_irqsave, or spin_lock/unlock_bh could trigger this deadlock. I think a fix could be like this:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;
#define cfs_trace_lock_tcd_safe(tcd, flags)
({
    int __ret;
    spin_lock_irqsave(&amp;amp;cfs_trace_glock, flags);
    __ret = cfs_trace_lock_tcd(tcd);
    __ret;
})

#define cfs_trace_unlock_tcd_safe(tcd, flags)
({
    cfs_trace_unlock_tcd(tcd);
    spin_unlock_irqrestore(&amp;amp;cfs_trace_glock, flags);
})

#define cfs_tcd_for_each_type_lock(..., flags)
......
cfs_trace_lock_tcd_safe(tcd, flags); cfs_trace_unlock_tcd_safe(tcd, flags), i++)

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;What do you think?&lt;/p&gt;

&lt;p&gt;Liang&lt;/p&gt;
</comment>
                            <comment id="34836" author="pjones" created="Mon, 16 Apr 2012 16:12:35 +0000"  >&lt;p&gt;Oleg will construct a patch&lt;/p&gt;</comment>
                            <comment id="35542" author="green" created="Fri, 27 Apr 2012 00:40:46 +0000"  >&lt;p&gt;Patch at &lt;a href=&quot;http://review.whamcloud.com/2605&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/2605&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="35871" author="simmonsja" created="Mon, 30 Apr 2012 08:40:59 +0000"  >&lt;p&gt;I now get this error when testing with the patch.&lt;/p&gt;

&lt;p&gt;Apr 30 08:19:54 barry-oss4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;247732.876333&amp;#93;&lt;/span&gt; Lustre: debug daemon will attempt to start writing to /chexport/lustre/logs/lustre-log.barry-oss4 (2097152kB max)&lt;br/&gt;
Apr 30 08:21:41 barry-oss4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;247839.213070&amp;#93;&lt;/span&gt; wanted to write 1008 but wrote -5&lt;br/&gt;
Apr 30 08:21:41 barry-oss4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;247839.225887&amp;#93;&lt;/span&gt; LNetError: 20124:0:(/data/buildsystem/jsimmons-wc/rpmbuild/BUILD/lustre-2.2.0/libcfs/libcfs/tracefile.c:1038:tracefiled()) ASSERTION(tage-&amp;gt;page != NULL) failed   &lt;br/&gt;
Apr 30 08:21:41 barry-oss4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;247839.249433&amp;#93;&lt;/span&gt; Kernel panic - not syncing: Lustre debug assertion failure&lt;br/&gt;
Apr 30 08:21:41 barry-oss4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;247839.249434&amp;#93;&lt;/span&gt; &lt;br/&gt;
Apr 30 08:21:41 barry-oss4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;247839.282160&amp;#93;&lt;/span&gt; Pid: 20124, comm: ktracefiled Not tainted 2.6.32-220.el6.wc.x86_64 #1&lt;br/&gt;
Apr 30 08:21:41 barry-oss4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;247839.297951&amp;#93;&lt;/span&gt; Call Trace:&lt;br/&gt;
Apr 30 08:21:41 barry-oss4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;247839.308593&amp;#93;&lt;/span&gt;  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff814ec681&amp;gt;&amp;#93;&lt;/span&gt; ? panic+0x78/0x143&lt;br/&gt;
Apr 30 08:21:41 barry-oss4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;247839.321844&amp;#93;&lt;/span&gt;  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8109b56a&amp;gt;&amp;#93;&lt;/span&gt; ? do_gettimeofday+0x1a/0x50&lt;br/&gt;
Apr 30 08:21:41 barry-oss4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;247839.335818&amp;#93;&lt;/span&gt;  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa07ff004&amp;gt;&amp;#93;&lt;/span&gt; ? cfs_trace_assertion_failed+0x74/0x80 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Apr 30 08:21:41 barry-oss4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;247839.351479&amp;#93;&lt;/span&gt;  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0800edb&amp;gt;&amp;#93;&lt;/span&gt; ? tracefiled+0x43b/0x530 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Apr 30 08:21:41 barry-oss4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;247839.365829&amp;#93;&lt;/span&gt;  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff811245b9&amp;gt;&amp;#93;&lt;/span&gt; ? free_pages+0x49/0x50&lt;br/&gt;
Apr 30 08:21:41 barry-oss4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;247839.379090&amp;#93;&lt;/span&gt;  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8105fa50&amp;gt;&amp;#93;&lt;/span&gt; ? default_wake_function+0x0/0x20&lt;br/&gt;
Apr 30 08:21:41 barry-oss4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;247839.393366&amp;#93;&lt;/span&gt;  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0800aa0&amp;gt;&amp;#93;&lt;/span&gt; ? tracefiled+0x0/0x530 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Apr 30 08:21:41 barry-oss4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;247839.407430&amp;#93;&lt;/span&gt;  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c14a&amp;gt;&amp;#93;&lt;/span&gt; ? child_rip+0xa/0x20&lt;br/&gt;
Apr 30 08:21:41 barry-oss4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;247839.420375&amp;#93;&lt;/span&gt;  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0800aa0&amp;gt;&amp;#93;&lt;/span&gt; ? tracefiled+0x0/0x530 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Apr 30 08:21:41 barry-oss4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;247839.434357&amp;#93;&lt;/span&gt;  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0800aa0&amp;gt;&amp;#93;&lt;/span&gt; ? tracefiled+0x0/0x530 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Apr 30 08:21:41 barry-oss4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;247839.448139&amp;#93;&lt;/span&gt;  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c140&amp;gt;&amp;#93;&lt;/span&gt; ? child_rip+0x0/0x20&lt;/p&gt;

&lt;p&gt;The load on all OSS goes to 300+.&lt;/p&gt;</comment>
                            <comment id="39761" author="pjones" created="Thu, 31 May 2012 16:36:36 +0000"  >&lt;p&gt;Landed for 2.3&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzv6rr:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>4631</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>