<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:18:56 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-15510] Soft locks on OSS servers with fail over with MDS.</title>
                <link>https://jira.whamcloud.com/browse/LU-15510</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Communication between the MDS and OSS servers failed, so recovery started (IR is disabled). The recovery on the OSS server then failed with a soft lockup:&lt;/p&gt;

&lt;p&gt;NMI watchdog: BUG: soft lockup - CPU#1 stuck for 22s! &lt;span class=&quot;error&quot;&gt;&amp;#91;ll_ost_io01_074:30838&amp;#93;&lt;/span&gt;&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &lt;/p&gt;

&lt;p&gt;With stack trace:&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;95181.855433&amp;#93;&lt;/span&gt; CPU: 3 PID: 30807 Comm: ll_ost_io01_070 Kdump: loaded Tainted: P &#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;OE &#160;------------ T 3.10.0-1160.49.1.el7.x86_64 #1 &lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;95181.855433&amp;#93;&lt;/span&gt; Hardware name: Dell Inc. PowerEdge R640/0W23H8, BIOS 1.6.13 12/17/2018 &lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;95181.855434&amp;#93;&lt;/span&gt; task: ffff93abd41c6300 ti: ffff93abdc84c000 task.ti: ffff93abdc84c000 &lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;95181.855435&amp;#93;&lt;/span&gt; RIP: 0010:&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff85f17aa2&amp;gt;&amp;#93;&lt;/span&gt; &#160;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;95181.855441&amp;#93;&lt;/span&gt; &#160;&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff85f17aa2&amp;gt;&amp;#93;&lt;/span&gt; native_queued_spin_lock_slowpath+0x122/0x200 &lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;95181.855441&amp;#93;&lt;/span&gt; RSP: 0018:ffff93abdc84fcb8 &#160;EFLAGS: 00000246 &lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;95181.855442&amp;#93;&lt;/span&gt; RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000190000 &lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;95181.855443&amp;#93;&lt;/span&gt; RDX: ffff93c4dd69b8c0 RSI: 0000000000290000 RDI: ffff93c367157830 &lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;95181.855443&amp;#93;&lt;/span&gt; RBP: ffff93abdc84fcb8 R08: ffff93c4dd65b8c0 R09: 0000000000000000 &lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;95181.855444&amp;#93;&lt;/span&gt; R10: ffff93c4dd65f160 R11: fffff40dfe6a9200 R12: ffff93abdc84fc58 &lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;95181.855444&amp;#93;&lt;/span&gt; R13: ffff93c3b5c66000 R14: ffff93b8aaa89850 R15: ffffffffc17a7b96 &lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;95181.855445&amp;#93;&lt;/span&gt; FS: &#160;0000000000000000(0000) GS:ffff93c4dd640000(0000) knlGS:0000000000000000 &lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;95181.855446&amp;#93;&lt;/span&gt; CS: &#160;0010 DS: 0000 ES: 0000 CR0: 0000000080050033 &lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;95181.855447&amp;#93;&lt;/span&gt; CR2: 000000c002f51000 CR3: 0000002fd1d42000 CR4: 00000000007607e0 &lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;95181.855448&amp;#93;&lt;/span&gt; DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 &lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;95181.855448&amp;#93;&lt;/span&gt; DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 &lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;95181.855449&amp;#93;&lt;/span&gt; PKRU: 00000000 &lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;95181.855449&amp;#93;&lt;/span&gt; Call Trace: &lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;95181.855454&amp;#93;&lt;/span&gt; &#160;&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8657dcf3&amp;gt;&amp;#93;&lt;/span&gt; queued_spin_lock_slowpath+0xb/0xf &lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;95181.855459&amp;#93;&lt;/span&gt; &#160;&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8658baa0&amp;gt;&amp;#93;&lt;/span&gt; _raw_spin_lock+0x20/0x30 &lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;95181.855518&amp;#93;&lt;/span&gt; &#160;&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffc1463232&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_server_drop_request+0x1c2/0x6d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt; &lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;95181.855545&amp;#93;&lt;/span&gt; &#160;&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffc14637d2&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_server_finish_active_request+0x92/0x140 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt; &lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;95181.855572&amp;#93;&lt;/span&gt; &#160;&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffc1465a41&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_server_handle_request+0x401/0xab0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt; &lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;95181.855597&amp;#93;&lt;/span&gt; &#160;&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffc14626a5&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_wait_event+0xa5/0x360 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt; &lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;95181.855600&amp;#93;&lt;/span&gt; &#160;&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff85ed3233&amp;gt;&amp;#93;&lt;/span&gt; ? __wake_up+0x13/0x20 &lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;95181.855625&amp;#93;&lt;/span&gt; &#160;&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffc14691f4&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_main+0xb34/0x1470 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt; &lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;95181.855650&amp;#93;&lt;/span&gt; &#160;&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffc14686c0&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_register_service+0xf80/0xf80 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt; &lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;95181.855653&amp;#93;&lt;/span&gt; &#160;&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff85ec5e61&amp;gt;&amp;#93;&lt;/span&gt; kthread+0xd1/0xe0 &lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;95181.855655&amp;#93;&lt;/span&gt; &#160;&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff85ec5d90&amp;gt;&amp;#93;&lt;/span&gt; ? insert_kthread_work+0x40/0x40 &lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;95181.855657&amp;#93;&lt;/span&gt; &#160;&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff86595ddd&amp;gt;&amp;#93;&lt;/span&gt; ret_from_fork_nospec_begin+0x7/0x21 &lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;95181.855659&amp;#93;&lt;/span&gt; &#160;&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff85ec5d90&amp;gt;&amp;#93;&lt;/span&gt; ? insert_kthread_work+0x40/0x40&lt;/p&gt;</description>
                <environment>OSS server running RHEL7 3.10.0-1160.49.1.el7.x86_64 with ZFS 2.0.7.&lt;br/&gt;
Lustre version 2.12.6</environment>
        <key id="68445">LU-15510</key>
            <summary>Soft locks on OSS servers with fail over with MDS.</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="pjones">Peter Jones</assignee>
                                    <reporter username="simmonsja">James A Simmons</reporter>
                        <labels>
                            <label>ORNL</label>
                            <label>ornl</label>
                    </labels>
                <created>Tue, 1 Feb 2022 19:51:06 +0000</created>
                <updated>Mon, 11 Jul 2022 14:47:22 +0000</updated>
                                            <version>Lustre 2.12.6</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>3</watches>
                                                                            <comments>
                            <comment id="324987" author="pjones" created="Wed, 2 Feb 2022 20:38:07 +0000"  >&lt;p&gt;James&lt;/p&gt;

&lt;p&gt;Is this something that you are working on or an operational issue? If the latter, are there any logs available? Are any patches applied to the vanilla 2.12.6 release?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="325621" author="simmonsja" created="Tue, 8 Feb 2022 18:18:24 +0000"  >&lt;p&gt;I have never seen this bug before, so I was hoping you had run into it before.&lt;/p&gt;</comment>
                            <comment id="338389" author="simmonsja" created="Wed, 22 Jun 2022 16:54:17 +0000"  >&lt;p&gt;We just hit this bug again.&#160; Currently our CPT looks like&lt;/p&gt;

&lt;p&gt;0:&#160; &#160; 0 2 4&#160; 6 8 10 12 14 16 18 20 22&lt;/p&gt;

&lt;p&gt;1:&#160; &#160;1 3 5 7 9 11 13 15 17 19 21 23&lt;/p&gt;

&lt;p&gt;I wonder if doubling the CPT count would lower the lock contention.&lt;/p&gt;</comment>
                            <comment id="340065" author="dustb100" created="Mon, 11 Jul 2022 14:47:22 +0000"  >&lt;p&gt;Peter,&#160;&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160; &#160;To answer your question from above, this is an operational issue that impacted production. We reverted the code change so things are now stable. I know that there were patches applied to 2.12.6, but I&apos;m not sure what they are. James would know the details.&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;Thanks,&lt;/p&gt;

&lt;p&gt;Dustin&#160;&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i02gzj:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>