<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:17:57 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-1587] watchdog detected hard LOCKUP on OSS under heavy client reconnects</title>
                <link>https://jira.whamcloud.com/browse/LU-1587</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;We are experiencing a severe disruption of service on our classified network.  Beginning around 5pm yesterday (Friday Jun 29) we began to have many evictions and reconnections across multiple servers and across multiple lustre clusters on our classified network.  Some sort of network event may have triggered this, but we&apos;re not sure at this point.  Lustre servers started getting STONITH&apos;d by their failover partners.&lt;/p&gt;

&lt;p&gt;Many servers have been getting panics due to CPU hard lockup.  The stack  traces aren&apos;t helpful; they just show the swapper task handling the watchdog NMI.  We&apos;re trying to quiesce things by stopping lnet on the compute cluster routers, getting lustre started on all the servers, then bringing client clusters back online one at a time.  However, we are still having sporadic CPU hard lockups on OSS&apos;s as clients reconnect.  Typically the lockup follows several thousand client reconnects.&lt;/p&gt;

&lt;p&gt;We have some complete crash dumps to look at, but there&apos;s not much to go on.  Some process must be leaving interrupts disabled, but I&apos;m not sure how to go about identifying with the data we have.&lt;/p&gt;</description>
                <environment>&lt;a href=&quot;https://github.com/chaos/lustre/commits/2.1.1-4chaos&quot;&gt;https://github.com/chaos/lustre/commits/2.1.1-4chaos&lt;/a&gt;</environment>
        <key id="15100">LU-1587</key>
            <summary>watchdog detected hard LOCKUP on OSS under heavy client reconnects</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="5">Cannot Reproduce</resolution>
                                        <assignee username="niu">Niu Yawei</assignee>
                                    <reporter username="nedbass">Ned Bass</reporter>
                        <labels>
                    </labels>
                <created>Sat, 30 Jun 2012 14:53:12 +0000</created>
                <updated>Sun, 13 Apr 2014 17:24:40 +0000</updated>
                            <resolved>Sun, 13 Apr 2014 17:24:40 +0000</resolved>
                                    <version>Lustre 2.1.1</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="41347" author="pjones" created="Sun, 1 Jul 2012 02:35:29 +0000"  >&lt;p&gt;Niu could you please look into this one Peter&lt;/p&gt;</comment>
                            <comment id="41357" author="niu" created="Mon, 2 Jul 2012 00:35:31 +0000"  >&lt;p&gt;Could you post the stack trace here as well? Though it might not be helpful. Thanks.&lt;/p&gt;</comment>
                            <comment id="41377" author="prakash" created="Mon, 2 Jul 2012 13:02:45 +0000"  >&lt;p&gt;Hi Niu. I&apos;ll post the stack trace as soon as I can. Although, I briefly talked to Ned and he didn&apos;t think the stack trace would be helpful. &lt;/p&gt;</comment>
                            <comment id="41379" author="prakash" created="Mon, 2 Jul 2012 16:50:38 +0000"  >&lt;p&gt;Here&apos;s a stack trace from one of the nodes which crashed. Please excuse any typo&apos;s, I had to copy it by hand:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Kernel panic - not syncing: Watchdog detected hard LOCKUP on cpu 4
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;PID: 0   TASK: ...   CPU: 4   COMMAND: &quot;swapper&quot;
0.  machine_kexec
1.  crash_kexec
2.  panic
3.  watchdog_overflow_callback
4.  __perf_event_overflow
5.  perf_event_overlfow
6.  x86_pmu_handle_irq
7.  perf_event_nmi_handler
8.  notifier_call_chain
9.  atomic_notifier_call_chain
10. notify_die
11. do_nmi
12. nmi 
13. ktime_get
14. tick_check_idle
15. irq_enter
16. smp_apic_timer_interrupt
17. apic_timer_interrupt
18. apic_timer_interrupt
19. notifier_call_chain
20. cpu_idle
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="41381" author="prakash" created="Mon, 2 Jul 2012 17:04:38 +0000"  >&lt;p&gt;This curiously lines up with many &apos;leap second&apos; issues I&apos;ve read about over the weekend. Although, it&apos;s pure speculation if this is issue is related.&lt;/p&gt;</comment>
                            <comment id="81500" author="adilger" created="Sun, 13 Apr 2014 17:24:40 +0000"  >&lt;p&gt;Haven&apos;t seen any more information about this bug, and unable to make any progress based on the description here.  Based on the later comments, this might have been caused by the leap second bug.&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzv2z3:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>3981</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>