<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:56:52 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
<language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-12927] OSS hit Kernel panic - not syncing: Fatal machine check</title>
                <link>https://jira.whamcloud.com/browse/LU-12927</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;In OSS failover testing, 1 OSS hit following error caused the system hung&lt;/p&gt;

&lt;p&gt;soak-6&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[2019-10-28T23:49:44+00:00] INFO: Running report handlers
[2019-10-28T23:49:44+00:00] INFO: Creating JSON run report
[2019-10-28T23:49:44+00:00] INFO: Report handlers complete
[  130.975016] LNet: HW NUMA nodes: 2, HW CPU cores: 32, npartitions: 2
[  130.985785] alg: No test for adler32 (adler32-zlib)
[  131.829968] Lustre: Lustre: Build Version: 2.12.58_160_g2b90574
[  132.018974] LNet: Using FMR for registration
[  132.035678] LNet: Added LNI 192.168.1.106@o2ib [8/256/0/180]
[  133.724994] Lustre: soaked-OST0002: Imperative Recovery enabled, recovery window shrunk from 300-900 down to 150-900
[  138.968619] Lustre: soaked-OST0002: Will be in recovery for at least 2:30, or until 27 clients reconnect
[  138.980123] Lustre: soaked-OST0002: Connection restored to fab0c63f-3fdb-4 (at 192.168.1.138@o2ib)
[  139.647610] Lustre: soaked-OST0002: Connection restored to 0e08c972-f5eb-4 (at 192.168.1.120@o2ib)
[  139.657651] Lustre: Skipped 3 previous similar messages
[  140.934492] Lustre: soaked-OST0002: Connection restored to f5344847-d291-4 (at 192.168.1.135@o2ib)
[  140.944523] Lustre: Skipped 7 previous similar messages
[  141.497059] Lustre: soaked-OST000a: Imperative Recovery enabled, recovery window shrunk from 300-900 down to 150-900
[  141.525919] Lustre: soaked-OST000a: Will be in recovery for at least 2:30, or until 27 clients reconnect
[  143.049075] Lustre: soaked-OST000a: Connection restored to 1a036cd1-6dcf-4 (at 192.168.1.141@o2ib)
[  143.059107] Lustre: Skipped 21 previous similar messages
[  143.713996] Lustre: soaked-OST0002: Recovery over after 0:05, of 27 clients 27 recovered and 0 were evicted.
[  143.733171] Lustre: soaked-OST0002: deleting orphan objects from 0x0:6964042 to 0x0:6964083
[  143.735241] Lustre: soaked-OST0002: deleting orphan objects from 0x380000401:5635234 to 0x380000401:5647269
[  143.753817] Lustre: soaked-OST0002: deleting orphan objects from 0x380000400:5074927 to 0x380000400:5080690
[  143.820779] Lustre: soaked-OST0002: deleting orphan objects from 0x380000402:8806871 to 0x380000402:8812296
[  147.362231] Lustre: soaked-OST000a: Connection restored to 3b6c98a5-fe70-4 (at 192.168.1.131@o2ib)
[  147.372271] Lustre: Skipped 5 previous similar messages
[  148.926072] Lustre: soaked-OST0006: Imperative Recovery enabled, recovery window shrunk from 300-900 down to 150-900
[  149.800735] Lustre: soaked-OST0006: Will be in recovery for at least 2:30, or until 27 clients reconnect
[  151.643017] Lustre: soaked-OST000a: Recovery over after 0:10, of 27 clients 27 recovered and 0 were evicted.
[  151.651808] Lustre: soaked-OST000a: deleting orphan objects from 0x580000400:5654332 to 0x580000400:5656923
[  151.653781] Lustre: soaked-OST000a: deleting orphan objects from 0x0:6949857 to 0x0:6949898
[  151.663992] Lustre: soaked-OST000a: deleting orphan objects from 0x580000402:8821479 to 0x580000402:8827114
[  151.665251] Lustre: soaked-OST000a: deleting orphan objects from 0x580000401:5099258 to 0x580000401:5105344
[  155.393063] Lustre: soaked-OST0006: Connection restored to soaked-MDT0002-mdtlov_UUID (at 192.168.1.110@o2ib)
[  155.404202] Lustre: Skipped 26 previous similar messages
[  157.016144] Lustre: soaked-OST000e: Imperative Recovery enabled, recovery window shrunk from 300-900 down to 150-900
[  157.129006] Lustre: soaked-OST000e: Will be in recovery for at least 2:30, or until 27 clients reconnect
[  158.836265] Lustre: soaked-OST0006: Recovery over after 0:09, of 27 clients 27 recovered and 0 were evicted.
[  158.847384] Lustre: soaked-OST0006: deleting orphan objects from 0x0:6965153 to 0x0:6965199
[  158.866283] Lustre: soaked-OST0006: deleting orphan objects from 0x480000402:5102899 to 0x480000402:5104678
[  158.866936] Lustre: soaked-OST0006: deleting orphan objects from 0x480000401:5643903 to 0x480000401:5651951
[  158.874936] Lustre: soaked-OST0006: deleting orphan objects from 0x480000400:8787734 to 0x480000400:8793891
[  167.036317] Lustre: soaked-OST000e: Recovery over after 0:10, of 27 clients 27 recovered and 0 were evicted.
[  167.051945] Lustre: soaked-OST000e: deleting orphan objects from 0x680000402:4916845 to 0x680000402:4918647
[  167.052271] Lustre: soaked-OST000e: deleting orphan objects from 0x0:6939032 to 0x0:6939072
[  167.055485] Lustre: soaked-OST000e: deleting orphan objects from 0x680000401:8720221 to 0x680000401:8723771
[  167.062501] Lustre: soaked-OST000e: deleting orphan objects from 0x680000400:5548226 to 0x680000400:5552635
[  271.398262] Lustre: soaked-OST000a: Connection restored to 4270d3b8-8785-4 (at 192.168.1.122@o2ib)
[  271.408347] Lustre: Skipped 42 previous similar messages
[  355.688632] Lustre: soaked-OST0006: Connection restored to 4270d3b8-8785-4 (at 192.168.1.122@o2ib)
[  355.698685] Lustre: Skipped 6 previous similar messages
[  487.617829] Lustre: soaked-OST000e: Connection restored to 0a14b91b-c6a9-4 (at 192.168.1.119@o2ib)
[  487.627863] Lustre: Skipped 1 previous similar message
[  871.326165] Lustre: soaked-OST000a: Connection restored to 667ea088-477b-4 (at 192.168.1.118@o2ib)
[  871.336185] Lustre: Skipped 15 previous similar messages
[ 1194.625969] Lustre: soaked-OST0006: Connection restored to 4270d3b8-8785-4 (at 192.168.1.122@o2ib)
[ 1194.635991] Lustre: Skipped 21 previous similar messages
[ 1742.196450] Lustre: soaked-OST0006: Connection restored to 4270d3b8-8785-4 (at 192.168.1.122@o2ib)
[ 1742.206486] Lustre: Skipped 168 previous similar messages
[ 2512.885378] Lustre: soaked-OST000a: Connection restored to 0e6b88eb-ca9a-4 (at 192.168.1.117@o2ib)
[ 2512.885380] Lustre: soaked-OST0002: Connection restored to 0e6b88eb-ca9a-4 (at 192.168.1.117@o2ib)
[ 2512.885383] Lustre: soaked-OST000e: Connection restored to 0e6b88eb-ca9a-4 (at 192.168.1.117@o2ib)
[ 2512.885385] Lustre: Skipped 64 previous similar messages
[ 2512.885392] Lustre: Skipped 65 previous similar messages
[ 3141.275984] Lustre: soaked-OST0002: Connection restored to 2f1eb1c4-6276-4 (at 192.168.1.126@o2ib)
[ 3141.275986] Lustre: soaked-OST000e: Connection restored to 2f1eb1c4-6276-4 (at 192.168.1.126@o2ib)
[ 3141.275992] Lustre: Skipped 128 previous similar messages
[ 3141.302076] Lustre: Skipped 1 previous similar message
[ 3738.703359] LustreError: 137-5: soaked-OST0007_UUID: not available for connect from 192.168.1.110@o2ib (no target). If you are running an HA pair check that the target is mounted on the other server.
[ 3738.723185] LustreError: Skipped 3 previous similar messages
[ 3740.041350] LustreError: 137-5: soaked-OST000f_UUID: not available for connect from 192.168.1.142@o2ib (no target). If you are running an HA pair check that the target is mounted on the other server.
[ 3741.149040] LustreError: 137-5: soaked-OST000f_UUID: not available for connect from 192.168.1.127@o2ib (no target). If you are running an HA pair check that the target is mounted on the other server.
[ 3741.168874] LustreError: Skipped 3 previous similar messages
[ 3743.506904] LustreError: 137-5: soaked-OST0003_UUID: not available for connect from 192.168.1.111@o2ib (no target). If you are running an HA pair check that the target is mounted on the other server.
[ 3743.526763] LustreError: Skipped 7 previous similar messages
[ 3749.244322] LustreError: 137-5: soaked-OST000f_UUID: not available for connect from 192.168.1.120@o2ib (no target). If you are running an HA pair check that the target is mounted on the other server.
[ 3749.264152] LustreError: Skipped 9 previous similar messages
[ 3757.891545] LustreError: 137-5: soaked-OST000f_UUID: not available for connect from 192.168.1.122@o2ib (no target). If you are running an HA pair check that the target is mounted on the other server.
[ 3757.911389] LustreError: Skipped 3 previous similar messages
[ 3788.883107] LustreError: 137-5: soaked-OST0003_UUID: not available for connect from 192.168.1.110@o2ib (no target). If you are running an HA pair check that the target is mounted on the other server.
[ 3788.883110] LustreError: 137-5: soaked-OST0007_UUID: not available for connect from 192.168.1.110@o2ib (no target). If you are running an HA pair check that the target is mounted on the other server.
[ 3788.883116] LustreError: Skipped 5 previous similar messages
[ 3789.539742] mce: [Hardware Error]: CPU 0: Machine Check Exception: 5 Bank 5: be00000000010093
[ 3789.539748] mce: [Hardware Error]: Machine check events logged
[ 3789.555773] mce: [Hardware Error]: RIP !INEXACT! 10:&amp;lt;ffffffff877815b4&amp;gt; {intel_idle+0xd4/0x225}
[ 3789.565421] mce: [Hardware Error]: TSC 9ce54e28818 ADDR 42ec5acc0 MISC 14076f686 
[ 3789.573817] mce: [Hardware Error]: PROCESSOR 0:206d7 TIME 1572310251 SOCKET 0 APIC 0 microcode 718
[ 3789.583829] mce: [Hardware Error]: Run the above through &apos;mcelog --ascii&apos;
[ 3789.597693] mce: [Hardware Error]: Machine check: Processor context corrupt
[ 3789.605480] Kernel panic - not syncing: Fatal machine check
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment>lustre-master-ib build #334 EL7.7</environment>
        <key id="57292">LU-12927</key>
            <summary>OSS hit Kernel panic - not syncing: Fatal machine check</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="wc-triage">WC Triage</assignee>
                                    <reporter username="sarah">Sarah Liu</reporter>
                        <labels>
                            <label>soak</label>
                    </labels>
                <created>Fri, 1 Nov 2019 17:53:56 +0000</created>
                <updated>Mon, 11 Nov 2019 22:39:35 +0000</updated>
                                            <version>Lustre 2.13.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>3</watches>
                                                                            <comments>
                            <comment id="257535" author="green" created="Fri, 1 Nov 2019 21:50:27 +0000"  >&lt;p&gt;decoded it says&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Hardware event. This is not a software error.
CPU 0 BANK 0 TSC 9ce54e28818 
RIP !INEXACT! 10:ffffffff877815b4
TIME 1572310251 Mon Oct 28 20:50:51 2019
MCG status:
MCi status:
Machine check not valid
Corrected error
MCA: No Error
STATUS 0 MCGSTATUS 0
CPUID Vendor Intel Family 6 Model 45
RIP: intel_idle+0xd4/0x225}
SOCKET 0 APIC 0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;hardware error. Not a Lustre bug.&lt;/p&gt;</comment>
                            <comment id="258104" author="sarah" created="Mon, 11 Nov 2019 22:39:35 +0000"  >&lt;p&gt;Searching the log and found same node(soak-6) hit the same error on an earlier date&lt;/p&gt;

&lt;p&gt;in soak-6.log-20191013&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[2019-10-09T21:51:51+00:00] INFO: Processing cookbook_file[/localhome/mpiuser/.ssh/authorized_keys] action create (openmpi::test_node line 78)
[2019-10-09T21:51:51+00:00] INFO: Chef Run complete in 39.357959406 seconds
[2019-10-09T21:51:51+00:00] INFO: Running report handlers
[2019-10-09T21:51:51+00:00] INFO: Creating JSON run report
[2019-10-09T21:51:51+00:00] INFO: Report handlers complete
[ 3986.406532] SPL: Loaded module v0.7.13-1
[ 3986.411302] znvpair: module license &apos;CDDL&apos; taints kernel.
...
[37937.108045] Lustre: soaked-OST0006: Connection restored to 6e40322e-52b9-a537-9d48-5e011f32062e (at 192.168.1.123@o2ib)
[37937.120139] Lustre: Skipped 149 previous similar messages
[69475.002518] Lustre: soaked-OST0002: Connection restored to 11cee62d-c08d-dabb-d0b3-a3bf7c075589 (at 192.168.1.126@o2ib)
[69475.014593] Lustre: Skipped 13 previous similar messages
[69475.037096] mce: [Hardware Error]: CPU 0: Machine Check Exception: 5 Bank 5: be00000000010093
[69475.046616] mce: [Hardware Error]: RIP !INEXACT! 10:&amp;lt;ffffffffa05815b4&amp;gt; {intel_idle+0xd4/0x225}
[69475.056268] mce: [Hardware Error]: TSC aab571fd8496 ADDR 394b11bc0 MISC 214030b086 
[69475.064864] mce: [Hardware Error]: PROCESSOR 0:206d7 TIME 1570727231 SOCKET 0 APIC 0 microcode 718
[69475.074879] mce: [Hardware Error]: Run the above through &apos;mcelog --ascii&apos;
[69475.090486] mce: [Hardware Error]: Machine check: Processor context corrupt
[69475.098275] Kernel panic - not syncing: Fatal machine check
ESC[2JESC[1;1HCopyright(c) 2009 - 2012 Intel Corporation.All rights reserved. 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00ovz:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>