<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:09:06 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-14364] Switching QoS from tbf uid to fifo caused soft lockup</title>
                <link>https://jira.whamcloud.com/browse/LU-14364</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Switching back from &quot;tbf uid&quot; to fifo caused a soft lockup. A backtrace of all threads from the crash dump is attached (bt.all).&lt;/p&gt;

&lt;p&gt;From dmesg&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 [-- MARK -- Mon Jan 25 15:00:00 2021]
[15694977.724675] NMI watchdog: BUG: soft lockup - CPU#2 stuck &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 23s! [mdt00_088:11264]
[15694977.724677] NMI watchdog: BUG: soft lockup - CPU#6 stuck &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 23s! [mdt00_080:11250]
[15694977.724679] NMI watchdog: BUG: soft lockup - CPU#1 stuck &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 23s! [mdt00_109:11297]
[15694977.724681] NMI watchdog: BUG: soft lockup - CPU#3 stuck &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 23s! [mdt00_102:11285]
[15694977.724683] NMI watchdog: BUG: soft lockup - CPU#5 stuck &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 23s! [mdt00_034:11187]
[15694977.724685] NMI watchdog: BUG: soft lockup - CPU#7 stuck &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 23s! [mdt00_016:11166]
[15694977.724687] NMI watchdog: BUG: soft lockup - CPU#4 stuck &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 23s! [mdt00_046:11201]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;I was able to get a crash dump.&lt;br/&gt;
 All the hung threads are in the same state:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
crash&amp;gt; bt 11285
PID: 11285  TASK: ffffa137e72d9070  CPU: 3   COMMAND: &lt;span class=&quot;code-quote&quot;&gt;&quot;mdt00_102&quot;&lt;/span&gt;
 #0 [ffffa117fecc8e48] crash_nmi_callback at ffffffffb7658017
 #1 [ffffa117fecc8e58] nmi_handle at ffffffffb7d8593c
 #2 [ffffa117fecc8eb0] do_nmi at ffffffffb7d85b5d
 #3 [ffffa117fecc8ef0] end_repeat_nmi at ffffffffb7d84d9c
    [exception RIP: native_queued_spin_lock_slowpath+344]
    RIP: ffffffffb7717478  RSP: ffffa1375e5e3d38  RFLAGS: 00000202
    RAX: 0000000000000101  RBX: ffffa117fb5e1108  RCX: 0000000000190000
    RDX: 0000000000590101  RSI: 0000000000000101  RDI: ffffa117fb5e1108
    RBP: ffffa1375e5e3d38   R8: ffffa117fecdb880   R9: 0000000000000000
    R10: ffffffffc0d37e40  R11: ffffa117fb5e1108  R12: 0000000000000000
    R13: ffffa0f8eb8a3b80  R14: ffffa0f8eb8a3b80  R15: 0000000000000000
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
--- &amp;lt;NMI exception stack&amp;gt; ---
 #4 [ffffa1375e5e3d38] native_queued_spin_lock_slowpath at ffffffffb7717478
 #5 [ffffa1375e5e3d40] queued_spin_lock_slowpath at ffffffffb7d7546a
 #6 [ffffa1375e5e3d50] _raw_spin_lock at ffffffffb7d83350
 #7 [ffffa1375e5e3d60] nrs_resource_get_safe at ffffffffc1039402 [ptlrpc]
 #8 [ffffa1375e5e3d98] ptlrpc_nrs_req_initialize at ffffffffc1039f13 [ptlrpc]
 #9 [ffffa1375e5e3db0] ptlrpc_server_handle_req_in at ffffffffc1004c21 [ptlrpc]
#10 [ffffa1375e5e3df8] ptlrpc_main at ffffffffc1008d65 [ptlrpc]
#11 [ffffa1375e5e3ec8] kthread at ffffffffb76c61f1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="62482">LU-14364</key>
            <summary>Switching QoS from tbf uid to fifo caused soft lockup</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="lixi_wc">Li Xi</assignee>
                                    <reporter username="mhanafi">Mahmoud Hanafi</reporter>
                        <labels>
                    </labels>
                <created>Tue, 26 Jan 2021 05:49:08 +0000</created>
                <updated>Fri, 12 Nov 2021 21:16:54 +0000</updated>
                                            <version>Lustre 2.12.4</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>9</watches>
                                                                            <comments>
                            <comment id="290375" author="pjones" created="Tue, 26 Jan 2021 15:47:57 +0000"  >&lt;p&gt;Li Xi&lt;/p&gt;

&lt;p&gt;Could you please advise?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="317567" author="bzzz" created="Fri, 5 Nov 2021 17:58:09 +0000"  >&lt;p&gt;I see this very often in my testing:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
[ 1090.821759] Lustre: DEBUG MARKER: == sanityn test 78: Enable policy and specify tunings right away ========================================================== 17:47:37 (1636134457)
[ 1091.415104] BUG: sleeping function called from invalid context at /home/lustre/master-mine/libcfs/libcfs/hash.c:1155
[ 1091.415287] in_atomic(): 1, irqs_disabled(): 0, pid: 48457, name: lctl
[ 1091.415326] INFO: lockdep is turned off.
[ 1091.415348] CPU: 0 PID: 48457 Comm: lctl Tainted: P        W  O     --------- ---  4.18.0 #43
[ 1091.415398] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[ 1091.415431] Call Trace:
[ 1091.415451]  dump_stack+0x85/0xc0
[ 1091.415479]  ___might_sleep.cold.15+0xac/0xbc
[ 1091.415522]  cfs_hash_putref+0x277/0x4d0 [libcfs]
[ 1091.415712]  nrs_orr_stop+0x60/0x240 [ptlrpc]
[ 1091.415858]  nrs_policy_stop0+0x30/0x1b0 [ptlrpc]
[ 1091.415945]  nrs_policy_stop_primary.isra.3+0x17c/0x1c0 [ptlrpc]
[ 1091.416035]  nrs_policy_start_locked+0x466/0x670 [ptlrpc]
[ 1091.416211]  nrs_policy_ctl+0x1e9/0x2c0 [ptlrpc]
[ 1091.416360]  ptlrpc_nrs_policy_control+0x110/0x2f0 [ptlrpc]
[ 1091.416509]  ptlrpc_lprocfs_nrs_policies_seq_write+0x43c/0x590 [ptlrpc]
[ 1091.416647]  full_proxy_write+0x4b/0x70
[ 1091.416728]  __vfs_write+0x1e/0x160
[ 1091.416807]  ? rcu_sync_lockdep_assert+0x9/0x50
[ 1091.429057]  ? __sb_start_write+0x147/0x1b0
[ 1091.429136]  ? vfs_write+0x182/0x1b0
[ 1091.429208]  vfs_write+0xb9/0x1b0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="317570" author="bzzz" created="Fri, 5 Nov 2021 19:28:37 +0000"  >&lt;p&gt;and even with cond_resched() removed it still fails:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
[ 1171.884805] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:748
[ 1171.885071] in_atomic(): 1, irqs_disabled(): 0, pid: 87261, name: lctl
[ 1171.885187] INFO: lockdep is turned off.
[ 1171.885260] CPU: 1 PID: 87261 Comm: lctl Tainted: P        W  O     --------- ---  4.18.0 #44
[ 1171.885413] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[ 1171.885523] Call Trace:
[ 1171.885577]  dump_stack+0x85/0xc0
[ 1171.885650]  ___might_sleep.cold.15+0xac/0xbc
[ 1171.885742]  __mutex_lock+0x42/0x9e0
[ 1171.885814]  ? ___cache_free+0x323/0x4a0
[ 1171.885886]  ? kmem_cache_destroy+0x1b/0xf0
[ 1171.885961]  kmem_cache_destroy+0x1b/0xf0
[ 1171.886094]  nrs_orr_stop+0x69/0x240 [ptlrpc]
[ 1171.886236]  nrs_policy_stop0+0x30/0x1b0 [ptlrpc]
[ 1171.886376]  nrs_policy_stop_primary.isra.3+0x17c/0x1c0 [ptlrpc]
[ 1171.886536]  nrs_policy_start_locked+0x466/0x670 [ptlrpc]
[ 1171.886676]  nrs_policy_ctl+0x1e9/0x2c0 [ptlrpc]
[ 1171.886815]  ptlrpc_nrs_policy_control+0x110/0x2f0 [ptlrpc]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I think we should start to use CONFIG_DEBUG_* things in AT and reject changes breaking obvious rules like this.&lt;/p&gt;</comment>
                            <comment id="318091" author="eaujames" created="Fri, 12 Nov 2021 16:04:54 +0000"  >&lt;p&gt;nrs_policy_stop0() should not be called with the &quot;nrs-&amp;gt;nrs_lock&quot; spinlock held because nrs_orr_stop()/nrs_tbf_stop() etc. free memory.&lt;/p&gt;</comment>
                            <comment id="318107" author="lixi_wc" created="Fri, 12 Nov 2021 16:49:32 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=green&quot; class=&quot;user-hover&quot; rel=&quot;green&quot;&gt;green&lt;/a&gt; Thanks for the dump stack. Yes, rhashtable_free_and_destroy() indeed can sleep. I will check how to fix it.&lt;/p&gt;</comment>
                            <comment id="318120" author="gerrit" created="Fri, 12 Nov 2021 18:06:40 +0000"  >&lt;p&gt;&quot;Li Xi &amp;lt;lixi@ddn.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/45554&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/45554&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-14364&quot; title=&quot;Switching QoS from tbf uid to fifo caused soft lockup&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-14364&quot;&gt;LU-14364&lt;/a&gt; nrs: release spinlock when stopping policy&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 9d367f269b7ac20294f5499a5ff2edc22fa6eb45&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="64368">LU-14698</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="37251" name="bt.all" size="893082" author="mhanafi" created="Tue, 26 Jan 2021 05:48:12 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i01k6f:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>