<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:12:47 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-7886] Hard lockup from debug logging</title>
                <link>https://jira.whamcloud.com/browse/LU-7886</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;An OSS panic&apos;ed because a watchdog detected a hard lockup.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[1643703.194240] Kernel panic - not syncing: Watchdog detected hard LOCKUP on cpu 2
[1643703.194243] Pid: 42640, comm: libcfs_debug_du Tainted: G           --L------------    2.6.32-573.12.1.el6.atlas.x86_64 #1
[1643703.194245] Call Trace:
[1643703.194246]  &amp;lt;NMI&amp;gt;  [&amp;lt;ffffffff81538ae1&amp;gt;] ? panic+0xa7/0x16f
[1643703.194257]  [&amp;lt;ffffffff810149c9&amp;gt;] ? sched_clock+0x9/0x10
[1643703.194261]  [&amp;lt;ffffffff810ed8ad&amp;gt;] ? watchdog_overflow_callback+0xcd/0xd0
[1643703.194264]  [&amp;lt;ffffffff81124427&amp;gt;] ? __perf_event_overflow+0xa7/0x240
[1643703.194267]  [&amp;lt;ffffffff8101dc54&amp;gt;] ? x86_perf_event_set_period+0xf4/0x180
[1643703.194270]  [&amp;lt;ffffffff81124a74&amp;gt;] ? perf_event_overflow+0x14/0x20
[1643703.194272]  [&amp;lt;ffffffff81024a02&amp;gt;] ? intel_pmu_handle_irq+0x202/0x3f0
[1643703.194276]  [&amp;lt;ffffffff8153dde9&amp;gt;] ? perf_event_nmi_handler+0x39/0xb0
[1643703.194278]  [&amp;lt;ffffffff8153f8a5&amp;gt;] ? notifier_call_chain+0x55/0x80
[1643703.194284]  [&amp;lt;ffffffff8153f90a&amp;gt;] ? atomic_notifier_call_chain+0x1a/0x20
[1643703.194287]  [&amp;lt;ffffffff810a783e&amp;gt;] ? notify_die+0x2e/0x30
[1643703.194289]  [&amp;lt;ffffffff8153d563&amp;gt;] ? do_nmi+0x1c3/0x350
[1643703.194292]  [&amp;lt;ffffffff8153ce20&amp;gt;] ? nmi+0x20/0x30
[1643703.194295]  [&amp;lt;ffffffff8153c5b8&amp;gt;] ? _spin_lock_irq+0x28/0x40
[1643703.194296]  &amp;lt;&amp;lt;EOE&amp;gt;&amp;gt;  [&amp;lt;ffffffffa0461445&amp;gt;] ? cfs_trace_lock_tcd+0x95/0xa0 [libcfs]
[1643703.194318]  [&amp;lt;ffffffffa047202b&amp;gt;] ? collect_pages+0x25b/0x290 [libcfs]
[1643703.194327]  [&amp;lt;ffffffffa047270c&amp;gt;] ? cfs_tracefile_dump_all_pages+0x5c/0x2e0 [libcfs]
[1643703.194330]  [&amp;lt;ffffffff81538bea&amp;gt;] ? printk+0x41/0x47
[1643703.194338]  [&amp;lt;ffffffffa046c720&amp;gt;] ? libcfs_debug_dumplog_thread+0x0/0x30 [libcfs]
[1643703.194347]  [&amp;lt;ffffffffa046c705&amp;gt;] ? libcfs_debug_dumplog_internal+0xb5/0xd0 [libcfs]
[1643703.194356]  [&amp;lt;ffffffffa046c72e&amp;gt;] ? libcfs_debug_dumplog_thread+0xe/0x30 [libcfs]
[1643703.194360]  [&amp;lt;ffffffff810a0fce&amp;gt;] ? kthread+0x9e/0xc0
[1643703.194363]  [&amp;lt;ffffffff8100c28a&amp;gt;] ? child_rip+0xa/0x20
[1643703.194365]  [&amp;lt;ffffffff810a0f30&amp;gt;] ? kthread+0x0/0xc0
[1643703.194367]  [&amp;lt;ffffffff8100c280&amp;gt;] ? child_rip+0x0/0x20
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;CPU2 was spinning on a tcd spinlock.  We got a crashdump, here&apos;s the register dump:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;#12 [ffff880044646f50] nmi at ffffffff8153ce20
    [exception RIP: _spin_lock_irq+40]
    RIP: ffffffff8153c5b8  RSP: ffff8803e8187da0  RFLAGS: 00000097
    RAX: 00000000000003b1  RBX: ffff88080ce90600  RCX: 0000000000000600
    RDX: 00000000000003b0  RSI: 0000000000000001  RDI: ffff88080ce90600
    RBP: ffff8803e8187da0   R8: ffffffff81c17c00   R9: 0000000000000000
    R10: 0000000000000010  R11: 0000000000000001  R12: 0000000000000001
    R13: ffff88080ce90000  R14: ffffffffa0497e88  R15: ffff88080ce90600
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
--- &amp;lt;NMI exception stack&amp;gt; ---
#13 [ffff8803e8187da0] _spin_lock_irq at ffffffff8153c5b8
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Which tcd lock was it spinning on?  It&apos;s in spin_lock_irq so we know the TCD type was 0. Let&apos;s grab our 16 lock addresses&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;crash&amp;gt; p &amp;amp;cfs_trace_data[0][0][0]-&amp;gt;tcd.tcd_lock
$21 = (spinlock_t *) 0xffff88080ce90000
crash&amp;gt; p &amp;amp;cfs_trace_data[0][0][1]-&amp;gt;tcd.tcd_lock
$22 = (spinlock_t *) 0xffff88080ce90080
crash&amp;gt; p &amp;amp;cfs_trace_data[0][0][2]-&amp;gt;tcd.tcd_lock
$23 = (spinlock_t *) 0xffff88080ce90100
crash&amp;gt; p &amp;amp;cfs_trace_data[0][0][3]-&amp;gt;tcd.tcd_lock
$24 = (spinlock_t *) 0xffff88080ce90180
crash&amp;gt; p &amp;amp;cfs_trace_data[0][0][4]-&amp;gt;tcd.tcd_lock
$25 = (spinlock_t *) 0xffff88080ce90200
crash&amp;gt; p &amp;amp;cfs_trace_data[0][0][5]-&amp;gt;tcd.tcd_lock
$26 = (spinlock_t *) 0xffff88080ce90280
crash&amp;gt; p &amp;amp;cfs_trace_data[0][0][6]-&amp;gt;tcd.tcd_lock
$27 = (spinlock_t *) 0xffff88080ce90300
crash&amp;gt; p &amp;amp;cfs_trace_data[0][0][7]-&amp;gt;tcd.tcd_lock
$28 = (spinlock_t *) 0xffff88080ce90380
crash&amp;gt; p &amp;amp;cfs_trace_data[0][0][8]-&amp;gt;tcd.tcd_lock
$29 = (spinlock_t *) 0xffff88080ce90400
crash&amp;gt; p &amp;amp;cfs_trace_data[0][0][9]-&amp;gt;tcd.tcd_lock
$30 = (spinlock_t *) 0xffff88080ce90480
crash&amp;gt; p &amp;amp;cfs_trace_data[0][0][10]-&amp;gt;tcd.tcd_lock
$31 = (spinlock_t *) 0xffff88080ce90500
crash&amp;gt; p &amp;amp;cfs_trace_data[0][0][11]-&amp;gt;tcd.tcd_lock
$32 = (spinlock_t *) 0xffff88080ce90580
crash&amp;gt; p &amp;amp;cfs_trace_data[0][0][12]-&amp;gt;tcd.tcd_lock
$33 = (spinlock_t *) 0xffff88080ce90600
crash&amp;gt; p &amp;amp;cfs_trace_data[0][0][13]-&amp;gt;tcd.tcd_lock
$34 = (spinlock_t *) 0xffff88080ce90680
crash&amp;gt; p &amp;amp;cfs_trace_data[0][0][14]-&amp;gt;tcd.tcd_lock
$35 = (spinlock_t *) 0xffff88080ce90700
crash&amp;gt; p &amp;amp;cfs_trace_data[0][0][15]-&amp;gt;tcd.tcd_lock
$36 = (spinlock_t *) 0xffff88080ce90780
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The lock address for CPU12 (0xffff88080ce90600) is in several registers, so it seems a likely culprit.  What&apos;s he doing?&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;PID: 13581  TASK: ffff880e348baab0  CPU: 12  COMMAND: &quot;ll_ost_io03_058&quot;
 #0 [ffff88089c486e90] crash_nmi_callback at ffffffff81033cf6
 #1 [ffff88089c486ea0] notifier_call_chain at ffffffff8153f8a5
 #2 [ffff88089c486ee0] atomic_notifier_call_chain at ffffffff8153f90a
 #3 [ffff88089c486ef0] notify_die at ffffffff810a783e
 #4 [ffff88089c486f20] do_nmi at ffffffff8153d563
 #5 [ffff88089c486f50] nmi at ffffffff8153ce20
    [exception RIP: native_read_tsc+6]
    RIP: ffffffff810156e6  RSP: ffff880e332b3720  RFLAGS: 00000046
    RAX: 00000000433eaf0e  RBX: 000000000000000c  RCX: 00000000433eaed6
    RDX: 000000000020692a  RSI: 000000000027aacd  RDI: 0000000000000a28
    RBP: ffff880e332b3720   R8: 0000000000eb986a   R9: ffff8800000bd000
    R10: 0000000000000000  R11: 0000000000000198  R12: 00000000433ea79a
    R13: 000000000000000c  R14: 0000000000000a28  R15: ffffffff8136aec0
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
--- &amp;lt;NMI exception stack&amp;gt; ---
 #6 [ffff880e332b3720] native_read_tsc at ffffffff810156e6
 #7 [ffff880e332b3728] delay_tsc at ffffffff8129d2ba
 #8 [ffff880e332b3758] __const_udelay at ffffffff8129d266
 #9 [ffff880e332b3768] wait_for_xmitr at ffffffff8136ae42
#10 [ffff880e332b3798] serial8250_console_putchar at ffffffff8136aee6
#11 [ffff880e332b37b8] uart_console_write at ffffffff8136688e
#12 [ffff880e332b37f8] serial8250_console_write at ffffffff8136b24d
#13 [ffff880e332b3848] __call_console_drivers at ffffffff810775f5
#14 [ffff880e332b3878] _call_console_drivers at ffffffff8107765a
#15 [ffff880e332b3898] release_console_sem at ffffffff81077ca8
#16 [ffff880e332b38d8] vprintk at ffffffff810783a8
#17 [ffff880e332b3978] printk at ffffffff81538bea
#18 [ffff880e332b39d8] cfs_print_to_console at ffffffffa0461114 [libcfs]
#19 [ffff880e332b3a08] libcfs_debug_vmsg2 at ffffffffa04716ae [libcfs]
#20 [ffff880e332b3b78] _debug_req at ffffffffa0753823 [ptlrpc]
#21 [ffff880e332b3da8] ptlrpc_main at ffffffffa0764b42 [ptlrpc]
#22 [ffff880e332b3ee8] kthread at ffffffff810a0fce
#23 [ffff880e332b3f48] kernel_thread at ffffffff8100c28a
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;CPU12 is running &lt;em&gt;libcfs_debug_vmsg2&lt;/em&gt; so he would be holding a local tcd lock.  He&apos;s sending data out on the serial console (it&apos;s serial-over-lan through the BMC).  Running &apos;dmesg&apos; inside crash shows the panic happened at 1643703, but there were 1611 messages logged at time 1643640. That&apos;s the last entry we see on our central logging server.  Was the log volume so high, or the serial console so slow, that it was able to deny access to the spinlock for 10s?&lt;/p&gt;

&lt;p&gt;Note, we had &lt;em&gt;console_ratelimit&lt;/em&gt; disabled due to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7867&quot; title=&quot;OI scrubber causing performance issues&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7867&quot;&gt;&lt;del&gt;LU-7867&lt;/del&gt;&lt;/a&gt; to make sure we see all the problematic objects.  If my above hypothesis is true, then enabling &lt;em&gt;console_ratelimit&lt;/em&gt; would have likely prevented the crash.  Is that just an unsafe option, or is there something we can do to improve it?&lt;/p&gt;</description>
                <environment>2.5.5-g1241c21-CHANGED-2.6.32-573.12.1.el6.atlas.x86_64</environment>
        <key id="35444">LU-7886</key>
            <summary>Hard lockup from debug logging</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="2">Won&apos;t Fix</resolution>
                                        <assignee username="yujian">Jian Yu</assignee>
                                    <reporter username="ezell">Matt Ezell</reporter>
                        <labels>
                    </labels>
                <created>Fri, 18 Mar 2016 16:18:12 +0000</created>
                <updated>Mon, 18 Apr 2016 18:25:32 +0000</updated>
                            <resolved>Mon, 18 Apr 2016 18:25:32 +0000</resolved>
                                    <version>Lustre 2.5.5</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>7</watches>
                                                                            <comments>
                            <comment id="146145" author="adilger" created="Fri, 18 Mar 2016 17:30:31 +0000"  >&lt;p&gt;When logging to a slow serial console this can disable interrupts on the whole system while the output is ongoing, and can definitely cause major system hangs.  Disabling the console message rate limiting is making this problem worse, and is not recommended.  If the OSS is not crashing then it would be much better to run &lt;tt&gt;lctl debug_daemon&lt;/tt&gt; (with appropriate options for max file size) to log all of the debug messages to disk instead of trying to pipe this via serial console.&lt;/p&gt;</comment>
                            <comment id="146432" author="bfaccini" created="Tue, 22 Mar 2016 08:50:35 +0000"  >&lt;p&gt;I fully agree with Andreas&apos; comment that disabling Console message rate limiting is a bad idea, but based on my previous experiences with similar problems, I also want to add that this is much more dangerous when using some slow/buggy Console data-path (I2C/GPIO HW, BMC/IPMI FW, ...), so you may also want to check about this issue with your HW provider and verify there is no FW update to fix related issues.&lt;/p&gt;</comment>
                            <comment id="147495" author="yujian" created="Thu, 31 Mar 2016 20:19:15 +0000"  >&lt;p&gt;Hi Matt,&lt;/p&gt;

&lt;p&gt;I saw the following meeting note about this ticket:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;It would be better if we use the debug daemon, but it doesn&apos;t work.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Could you please comment on what problem you hit while using &lt;tt&gt;lctl debug_daemon&lt;/tt&gt;? Thank you.&lt;/p&gt;

&lt;p&gt;FYI, here are the instructions for using &lt;tt&gt;lctl debug_daemon&lt;/tt&gt;: &lt;a href=&quot;https://build.hpdd.intel.com/job/lustre-manual/lastSuccessfulBuild/artifact/lustre_manual.html#idm140466054410624&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://build.hpdd.intel.com/job/lustre-manual/lastSuccessfulBuild/artifact/lustre_manual.html#idm140466054410624&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="149312" author="yujian" created="Mon, 18 Apr 2016 18:25:32 +0000"  >&lt;p&gt;Let&apos;s close this ticket because the issue is expected behavior. New tickets will be created when new problems occur next time.&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzy4vr:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>