<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:23:44 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-2263] CPU Soft Lockups due to many threads spinning on import lock on Sequoia IO nodes</title>
                <link>https://jira.whamcloud.com/browse/LU-2263</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;The IO nodes on Sequoia are going unresponsive with many CPU soft lockup messages in the logs. For example, a couple messages from a node in this state are as follows:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2012-11-01 23:44:47.681648 {DefaultControlEventListener} [mmcs]{161}.0.2: BUG: soft lockup - CPU#2 stuck for 67s! [ptlrpcd_56:3326]
2012-11-01 23:44:47.720983 {DefaultControlEventListener} [mmcs]{161}.0.2: Modules linked in: lmv(U) mgc(U) lustre(U) mdc(U) fid(U) fld(U) lov(U) osc(U) ko2iblnd(U) ptlrpc(U) obdclass(U) lnet(U) lvfs(U) libcfs(U) bgvrnic bgmudm
2012-11-01 23:44:47.760955 {DefaultControlEventListener} [mmcs]{161}.0.2: NIP: c00000000042e194 LR: 8000000003a674bc CTR: c00000000042e160
2012-11-01 23:44:47.801187 {DefaultControlEventListener} [mmcs]{161}.0.2: REGS: c0000003ca2e7900 TRAP: 0901   Not tainted  (2.6.32-220.23.3.bgq.13llnl.V1R1M2.bgq62_16.ppc64)
2012-11-01 23:44:47.841287 {DefaultControlEventListener} [mmcs]{161}.0.2: MSR: 0000000080029000 &amp;lt;EE,ME,CE&amp;gt;  CR: 84284444  XER: 20000000
2012-11-01 23:44:47.880979 {DefaultControlEventListener} [mmcs]{161}.0.2: TASK = c0000003ca1c7980[3326] &apos;ptlrpcd_56&apos; THREAD: c0000003ca2e4000 CPU: 2
2012-11-01 23:44:47.921379 {DefaultControlEventListener} [mmcs]{161}.0.2: GPR00: 0000000080000038 c0000003ca2e7b80 c0000000006de510 c0000003ceca6278
2012-11-01 23:44:47.960900 {DefaultControlEventListener} [mmcs]{161}.0.2: GPR04: 0000000000000001 0000000000000000 0000000000000000 0000000000000000
2012-11-01 23:44:48.000948 {DefaultControlEventListener} [mmcs]{161}.0.2: GPR08: 0000000000200200 0000000080000002 c0000003cefddc68 c00000000042e160
2012-11-01 23:44:48.105877 {DefaultControlEventListener} [mmcs]{161}.0.2: GPR12: 8000000003aebdb8 c000000000743f00
2012-11-01 23:44:48.106188 {DefaultControlEventListener} [mmcs]{161}.0.2: NIP [c00000000042e194] ._spin_lock+0x34/0x44
2012-11-01 23:44:48.121006 {DefaultControlEventListener} [mmcs]{161}.0.2: LR [8000000003a674bc] .ptlrpc_check_set+0xeac/0x4e80 [ptlrpc]
2012-11-01 23:44:48.161100 {DefaultControlEventListener} [mmcs]{161}.0.2: Call Trace:
2012-11-01 23:44:48.201258 {DefaultControlEventListener} [mmcs]{161}.0.2: [c0000003ca2e7b80] [8000000003a674ac] .ptlrpc_check_set+0xe9c/0x4e80 [ptlrpc] (unreliable)
2012-11-01 23:44:48.240907 {DefaultControlEventListener} [mmcs]{161}.0.2: [c0000003ca2e7d20] [8000000003abd1dc] .ptlrpcd_check+0x66c/0x8a0 [ptlrpc]
2012-11-01 23:44:48.280969 {DefaultControlEventListener} [mmcs]{161}.0.2: &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The line it&apos;s stuck on:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;(gdb) l *ptlrpc_check_set+0xeac
0x474bc is in ptlrpc_check_set (/builddir/build/BUILD/lustre-2.3.54/lustre/ptlrpc/client.c:1666).
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;And the source level info:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;1663                                 if (!ptlrpc_unregister_reply(req, 1))
1664                                         continue;
1665 
1666                                 cfs_spin_lock(&amp;amp;imp-&amp;gt;imp_lock);
1667                                 if (ptlrpc_import_delay_req(imp, req, &amp;amp;status)){
1668                                         /* put on delay list - only if we wait
1669                                          * recovery finished - before send */
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Details of the next message on the console:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;BUG: soft lockup - CPU#16 stuck for 67s! [ptlrpcd_rcv:3269]
2012-11-01 23:44:48.320953 {DefaultControlEventListener} [mmcs]{161}.0.2: Modules linked in: lmv(U) mgc(U) lustre(U) mdc(U) fid(U) fld(U) lov(U) osc(U) ko2iblnd(U) ptlrpc(U) obdclass(U) lnet(U) lvfs(U) libcfs(U) bgvrnic bgmudm
2012-11-01 23:44:48.361045 {DefaultControlEventListener} [mmcs]{161}.0.2: NIP: c00000000042e18c LR: 8000000003a66b04 CTR: c00000000042e160
2012-11-01 23:44:48.400942 {DefaultControlEventListener} [mmcs]{161}.0.2: REGS: c0000003ca203900 TRAP: 0901   Not tainted  (2.6.32-220.23.3.bgq.13llnl.V1R1M2.bgq62_16.ppc64)
2012-11-01 23:44:48.440963 {DefaultControlEventListener} [mmcs]{161}.0.2: MSR: 0000000080029000 &amp;lt;EE,ME,CE&amp;gt;  CR: 84228484  XER: 20000000
2012-11-01 23:44:48.480905 {DefaultControlEventListener} [mmcs]{161}.0.2: TASK = c0000003c5d432a0[3269] &apos;ptlrpcd_rcv&apos; THREAD: c0000003ca200000 CPU: 16
2012-11-01 23:44:48.520898 {DefaultControlEventListener} [mmcs]{161}.0.2: GPR00: 0000000080000038 c0000003ca203b80 c0000000006de510 c0000003ceca6278
2012-11-01 23:44:48.560925 {DefaultControlEventListener} [mmcs]{161}.0.2: GPR04: 0000000000000001 0000000000000000 0000000000000000 0000000000000000
2012-11-01 23:44:48.600920 {DefaultControlEventListener} [mmcs]{161}.0.2: GPR08: c0000003ceca6260 0000000080000010 c0000003ce673420 c00000000042e160
2012-11-01 23:44:48.640881 {DefaultControlEventListener} [mmcs]{161}.0.2: GPR12: 8000000003aebdb8 c00000000074f500
2012-11-01 23:44:48.680904 {DefaultControlEventListener} [mmcs]{161}.0.2: NIP [c00000000042e18c] ._spin_lock+0x2c/0x44
2012-11-01 23:44:48.720885 {DefaultControlEventListener} [mmcs]{161}.0.2: LR [8000000003a66b04] .ptlrpc_check_set+0x4f4/0x4e80 [ptlrpc]
2012-11-01 23:44:48.761372 {DefaultControlEventListener} [mmcs]{161}.0.2: Call Trace:
2012-11-01 23:44:48.800923 {DefaultControlEventListener} [mmcs]{161}.0.2: [c0000003ca203b80] [8000000003a66974] .ptlrpc_check_set+0x364/0x4e80 [ptlrpc] (unreliable)
2012-11-01 23:44:48.840892 {DefaultControlEventListener} [mmcs]{161}.0.2: [c0000003ca203d20] [8000000003abd1dc] .ptlrpcd_check+0x66c/0x8a0 [ptlrpc]
2012-11-01 23:44:48.880936 {DefaultControlEventListener} [mmcs]{161}.0.2: [c0000003ca203e40] [8000000003abd76c] .ptlrpcd+0x35c/0x510 [ptlrpc]
2012-11-01 23:44:48.920887 {DefaultControlEventListener} [mmcs]{161}.0.2: [c0000003ca203f90] [c00000000001a9e0] .kernel_thread+0x54/0x70
2012-11-01 23:44:48.961113 {DefaultControlEventListener} [mmcs]{161}.0.2: Instruction dump:
2012-11-01 23:44:49.000886 {DefaultControlEventListener} [mmcs]{161}.0.2: 4bffffc8 38000000 980d0c94 812d0000 7c001829 2c000000 40c20010 7d20192d
2012-11-01 23:44:49.040902 {DefaultControlEventListener} [mmcs]{161}.0.2: 40c2fff0 4c00012c 2fa00000 4dfe0020 &amp;lt;7c210b78&amp;gt; 80030000 2fa00000 40defff4
2012-11-01 23:44:49.121348 {DefaultControlEventListener} [mmcs]{161}.0.2: [c0000003ca2e7e40] [8000000003abd76c] .ptlrpcd+0x35c/0x510 [ptlrpc]
2012-11-01 23:44:49.160915 {DefaultControlEventListener} [mmcs]{161}.0.2: [c0000003ca2e7f90] [c00000000001a9e0] .kernel_thread+0x54/0x70
2012-11-01 23:44:49.201038 {DefaultControlEventListener} [mmcs]{161}.0.2: Instruction dump:
2012-11-01 23:44:49.241229 {DefaultControlEventListener} [mmcs]{161}.0.2: 980d0c94 812d0000 7c001829 2c000000 40c20010 7d20192d 40c2fff0 4c00012c
2012-11-01 23:44:49.281578 {DefaultControlEventListener} [mmcs]{161}.0.2: 2fa00000 4dfe0020 7c210b78 80030000 &amp;lt;2fa00000&amp;gt; 40defff4 7c421378 4bffffc8
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The line it&apos;s stuck on:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;(gdb) l *ptlrpc_check_set+0x4f4
0x46b04 is in ptlrpc_check_set (/builddir/build/BUILD/lustre-2.3.54/lustre/ptlrpc/client.c:1852).
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;And the source level info:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;
1849 &amp;gt;-------&amp;gt;-------&amp;gt;-------libcfs_nid2str(imp-&amp;gt;imp_connection-&amp;gt;c_peer.nid),
1850 &amp;gt;-------&amp;gt;-------&amp;gt;-------lustre_msg_get_opc(req-&amp;gt;rq_reqmsg));
1851 
1852                 cfs_spin_lock(&amp;amp;imp-&amp;gt;imp_lock);
1853                 /* Request already may be not on sending or delaying list. This
1854                  * may happen in the case of marking it erroneous for the case
1855                  * ptlrpc_import_delay_req(req, status) find it impossible to
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The symptoms look very similar to what we saw on the old Orion code base, which boiled down to &quot;too many&quot; ptlrpc threads contending with each other while interrupts were disabled. To get past that previous issue we pulled in a patch to limit the number of threads created, thus reducing the contention on the lock in question.&lt;/p&gt;</description>
                <environment></environment>
        <key id="16548">LU-2263</key>
            <summary>CPU Soft Lockups due to many threads spinning on import lock on Sequoia IO nodes</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="liang">Liang Zhen</assignee>
                                    <reporter username="prakash">Prakash Surya</reporter>
                        <labels>
                            <label>MB</label>
                            <label>sequoia</label>
                            <label>topsequoia</label>
                    </labels>
                <created>Fri, 2 Nov 2012 14:01:27 +0000</created>
                <updated>Fri, 19 Apr 2013 20:45:03 +0000</updated>
                            <resolved>Fri, 15 Feb 2013 14:08:33 +0000</resolved>
                                    <version>Lustre 2.4.0</version>
                                    <fixVersion>Lustre 2.4.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>9</watches>
                                                                            <comments>
                            <comment id="47305" author="ian" created="Fri, 2 Nov 2012 14:26:43 +0000"  >&lt;p&gt;Prakash - what was the patch you pulled in previously?&lt;/p&gt;</comment>
                            <comment id="47307" author="prakash" created="Fri, 2 Nov 2012 14:33:50 +0000"  >&lt;p&gt;And here is the patch we pulled in for the issue we saw with the old Orion code: &lt;a href=&quot;http://review.whamcloud.com/3047&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/3047&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="47308" author="morrone" created="Fri, 2 Nov 2012 14:39:16 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1164&quot; title=&quot;ko2iblnd schedulers&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1164&quot;&gt;&lt;del&gt;LU-1164&lt;/del&gt;&lt;/a&gt;, master version of patch is &lt;a href=&quot;http://review.whamcloud.com/2246&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;2246&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="47310" author="prakash" created="Fri, 2 Nov 2012 14:48:29 +0000"  >&lt;p&gt;And a link to the previous issue we saw: ORI-768&lt;/p&gt;</comment>
                            <comment id="47312" author="ian" created="Fri, 2 Nov 2012 14:59:33 +0000"  >&lt;p&gt;Liang, can you please revisit this?&lt;/p&gt;</comment>
                            <comment id="47466" author="adilger" created="Tue, 6 Nov 2012 14:36:19 +0000"  >&lt;p&gt;Liang,&lt;br/&gt;
I don&apos;t think change 2246 to add a tunable for the number of kib threads is really a &quot;fix&quot; for this problem on master (nor 2.1.x either, IMHO).&lt;/p&gt;

&lt;p&gt;There should be some default upper limit of kib threads (possibly based on the number of IB HCAs) if there are too many cores on the system.  Given that the number of cores on each node is continually increasing, we don&apos;t want every user to have to specify the module parameter to limit the number of threads, but rather have Lustre pick a sane default that can be configured differently if someone needs it.&lt;/p&gt;</comment>
                            <comment id="47489" author="liang" created="Tue, 6 Nov 2012 20:29:59 +0000"  >&lt;p&gt;Andreas, 2.3 and later version already have upper limit of o2iblnd threads number. I think this probably is a different issue because threads are contending on import::imp_lock, not any lock of LNet/LND, I&apos;m looking into it right now.&lt;/p&gt;</comment>
                            <comment id="47503" author="liang" created="Wed, 7 Nov 2012 00:22:37 +0000"  >&lt;p&gt;I just posted a patch: &lt;a href=&quot;http://review.whamcloud.com/#change,4486&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#change,4486&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="48540" author="prakash" created="Thu, 29 Nov 2012 13:59:06 +0000"  >&lt;p&gt;We hit what I think is this issue again when testing at scale yesterday on Sequoia using our &lt;a href=&quot;https://github.com/chaos/lustre/commits/2.3.56-3chaos&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;2.3.56-3chaos&lt;/a&gt; tag (which has Liang&apos;s &lt;a href=&quot;https://github.com/chaos/lustre/commit/167f3a9feeadf43b571b0a592d87353fc2be6db2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;ptlrpc patch&lt;/a&gt;).&lt;/p&gt;

&lt;p&gt;Some example stacks are:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;R01-ID-J06.log:2012-11-28 11:08:34.720079 {DefaultControlEventListener} [mmcs]{22}.13.2: NIP [c00000000042e18c] ._spin_lock+0x2c/0x44
R01-ID-J06.log:2012-11-28 11:08:34.720623 {DefaultControlEventListener} [mmcs]{22}.13.2: LR [8000000003a29a48] .ptlrpc_set_import_discon+0x58/0xa10 [ptlrpc]
R01-ID-J06.log:2012-11-28 11:08:34.721117 {DefaultControlEventListener} [mmcs]{22}.13.2: Call Trace:
R01-ID-J06.log:2012-11-28 11:08:34.721767 {DefaultControlEventListener} [mmcs]{22}.13.2: [c0000003e51f3490] [8000000000a8c8f8] cfs_fail_loc+0x0/0xfffffffffffe7220 [libcfs] (unreliable)
R01-ID-J06.log:2012-11-28 11:08:34.722427 {DefaultControlEventListener} [mmcs]{22}.13.2: [c0000003e51f3580] [8000000003a2a71c] .ptlrpc_fail_import+0x8c/0x4d0 [ptlrpc]
R01-ID-J06.log:2012-11-28 11:08:34.722944 {DefaultControlEventListener} [mmcs]{22}.13.2: [c0000003e51f3630] [80000000039d1024] .ptlrpc_expire_one_request+0x584/0x8b0 [ptlrpc]
R01-ID-J06.log:2012-11-28 11:08:34.723459 {DefaultControlEventListener} [mmcs]{22}.13.2: [c0000003e51f3700] [80000000039d1454] .ptlrpc_expired_set+0x104/0x2a0 [ptlrpc]
R01-ID-J06.log:2012-11-28 11:08:34.723890 {DefaultControlEventListener} [mmcs]{22}.13.2: [c0000003e51f37b0] [80000000039dbd20] .ptlrpc_set_wait+0x4c0/0xcb0 [ptlrpc]
R01-ID-J06.log:2012-11-28 11:08:34.724364 {DefaultControlEventListener} [mmcs]{22}.13.2: [c0000003e51f3920] [80000000039dcc64] .ptlrpc_queue_wait+0xd4/0x380 [ptlrpc]
R01-ID-J06.log:2012-11-28 11:08:34.725489 {DefaultControlEventListener} [mmcs]{22}.13.2: [c0000003e51f39e0] [8000000005ae3594] .mdc_sync+0x104/0x340 [mdc]
R01-ID-J06.log:2012-11-28 11:08:34.726591 {DefaultControlEventListener} [mmcs]{22}.13.2: [c0000003e51f3a90] [800000000714c228] .lmv_sync+0x2c8/0x820 [lmv]
R01-ID-J06.log:2012-11-28 11:08:34.727515 {DefaultControlEventListener} [mmcs]{22}.13.2: [c0000003e51f3b80] [800000000695381c] .ll_fsync+0x23c/0xc50 [lustre]
R01-ID-J06.log:2012-11-28 11:08:34.728440 {DefaultControlEventListener} [mmcs]{22}.13.2: [c0000003e51f3c80] [c0000000000fc094] .vfs_fsync_range+0xb0/0x104
R01-ID-J06.log:2012-11-28 11:08:34.729866 {DefaultControlEventListener} [mmcs]{22}.13.2: [c0000003e51f3d30] [c0000000000fc18c] .do_fsync+0x3c/0x6c
R01-ID-J06.log:2012-11-28 11:08:34.731028 {DefaultControlEventListener} [mmcs]{22}.13.2: [c0000003e51f3dc0] [c0000000000fc1fc] .SyS_fsync+0x18/0x28
R01-ID-J06.log:2012-11-28 11:08:34.732396 {DefaultControlEventListener} [mmcs]{22}.13.2: [c0000003e51f3e30] [c000000000000580] syscall_exit+0x0/0x2c
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;R01-ID-J06.log:2012-11-28 11:06:22.804122 {DefaultControlEventListener} [mmcs]{22}.13.0: NIP [c00000000042e190] ._spin_lock+0x30/0x44
R01-ID-J06.log:2012-11-28 11:06:22.805011 {DefaultControlEventListener} [mmcs]{22}.13.0: LR [80000000039d97c4] .ptlrpc_check_set+0x2de4/0x4e80 [ptlrpc]
R01-ID-J06.log:2012-11-28 11:06:22.806007 {DefaultControlEventListener} [mmcs]{22}.13.0: Call Trace:
R01-ID-J06.log:2012-11-28 11:06:22.806443 {DefaultControlEventListener} [mmcs]{22}.13.0: [c0000003e93ab610] [80000000039d96f4] .ptlrpc_check_set+0x2d14/0x4e80 [ptlrpc] (unreliable)
R01-ID-J06.log:2012-11-28 11:06:22.806969 {DefaultControlEventListener} [mmcs]{22}.13.0: [c0000003e93ab7b0] [80000000039dbd4c] .ptlrpc_set_wait+0x4ec/0xcb0 [ptlrpc]
R01-ID-J06.log:2012-11-28 11:06:22.807475 {DefaultControlEventListener} [mmcs]{22}.13.0: [c0000003e93ab920] [80000000039dcc64] .ptlrpc_queue_wait+0xd4/0x380 [ptlrpc]
R01-ID-J06.log:2012-11-28 11:06:22.807955 {DefaultControlEventListener} [mmcs]{22}.13.0: [c0000003e93ab9e0] [8000000005ae3594] .mdc_sync+0x104/0x340 [mdc]
R01-ID-J06.log:2012-11-28 11:06:22.808408 {DefaultControlEventListener} [mmcs]{22}.13.0: [c0000003e93aba90] [800000000714c228] .lmv_sync+0x2c8/0x820 [lmv]
R01-ID-J06.log:2012-11-28 11:06:22.808870 {DefaultControlEventListener} [mmcs]{22}.13.0: [c0000003e93abb80] [800000000695381c] .ll_fsync+0x23c/0xc50 [lustre]
R01-ID-J06.log:2012-11-28 11:06:22.809322 {DefaultControlEventListener} [mmcs]{22}.13.0: [c0000003e93abc80] [c0000000000fc094] .vfs_fsync_range+0xb0/0x104
R01-ID-J06.log:2012-11-28 11:06:22.809723 {DefaultControlEventListener} [mmcs]{22}.13.0: [c0000003e93abd30] [c0000000000fc18c] .do_fsync+0x3c/0x6c
R01-ID-J06.log:2012-11-28 11:06:22.810190 {DefaultControlEventListener} [mmcs]{22}.13.0: [c0000003e93abdc0] [c0000000000fc1fc] .SyS_fsync+0x18/0x28
R01-ID-J06.log:2012-11-28 11:06:22.810643 {DefaultControlEventListener} [mmcs]{22}.13.0: [c0000003e93abe30] [c000000000000580] syscall_exit+0x0/0x2c
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;R01-ID-J06.log:2012-11-28 11:46:33.519703 {DefaultControlEventListener} [mmcs]{22}.13.3: NIP [c00000000042e198] ._spin_lock+0x38/0x44
R01-ID-J06.log:2012-11-28 11:46:33.520501 {DefaultControlEventListener} [mmcs]{22}.13.3: LR [8000000003a34f6c] .sptlrpc_import_sec_ref+0x1c/0x80 [ptlrpc]
R01-ID-J06.log:2012-11-28 11:46:33.521330 {DefaultControlEventListener} [mmcs]{22}.13.3: Call Trace:
R01-ID-J06.log:2012-11-28 11:46:33.522190 {DefaultControlEventListener} [mmcs]{22}.13.3: [c0000003e0932f20] [c000000000687a20] svc_rdma_ops+0xa078/0x198c0 (unreliable)
R01-ID-J06.log:2012-11-28 11:46:33.522897 {DefaultControlEventListener} [mmcs]{22}.13.3: [c0000003e0932fa0] [8000000003a3eb10] .import_sec_validate_get+0x50/0x410 [ptlrpc]
R01-ID-J06.log:2012-11-28 11:46:33.523723 {DefaultControlEventListener} [mmcs]{22}.13.3: [c0000003e0933050] [8000000003a3ef6c] .sptlrpc_req_get_ctx+0x9c/0x3b0 [ptlrpc]
R01-ID-J06.log:2012-11-28 11:46:33.524606 {DefaultControlEventListener} [mmcs]{22}.13.3: [c0000003e0933120] [80000000039d3d74] .__ptlrpc_request_bufs_pack+0xa4/0x440 [ptlrpc]
R01-ID-J06.log:2012-11-28 11:46:33.525420 {DefaultControlEventListener} [mmcs]{22}.13.3: [c0000003e0933200] [80000000039d4764] .ptlrpc_request_pack+0x34/0xb0 [ptlrpc]
R01-ID-J06.log:2012-11-28 11:46:33.526228 {DefaultControlEventListener} [mmcs]{22}.13.3: [c0000003e0933290] [8000000005aefca0] .mdc_close+0xf0/0xee0 [mdc]
R01-ID-J06.log:2012-11-28 11:46:33.527004 {DefaultControlEventListener} [mmcs]{22}.13.3: [c0000003e0933380] [8000000007152024] .lmv_close+0x2f4/0x920 [lmv]
R01-ID-J06.log:2012-11-28 11:46:33.527865 {DefaultControlEventListener} [mmcs]{22}.13.3: [c0000003e0933470] [8000000006958e50] .ll_close_inode_openhandle+0x400/0x1940 [lustre]
R01-ID-J06.log:2012-11-28 11:46:33.537376 {DefaultControlEventListener} [mmcs]{22}.13.3: [c0000003e0933590] [800000000695d38c] .ll_md_real_close+0x23c/0x310 [lustre]
R01-ID-J06.log:2012-11-28 11:46:33.538321 {DefaultControlEventListener} [mmcs]{22}.13.3: [c0000003e0933640] [8000000006961350] .ll_md_close+0x3c0/0xa10 [lustre]
R01-ID-J06.log:2012-11-28 11:46:33.539116 {DefaultControlEventListener} [mmcs]{22}.13.3: [c0000003e0933770] [8000000006961af8] .ll_file_release+0x158/0x510 [lustre]
R01-ID-J06.log:2012-11-28 11:46:33.539860 {DefaultControlEventListener} [mmcs]{22}.13.3: [c0000003e0933830] [c0000000000d2db8] .__fput+0x174/0x25c
R01-ID-J06.log:2012-11-28 11:46:33.540788 {DefaultControlEventListener} [mmcs]{22}.13.3: [c0000003e09338d0] [c0000000000cf580] .filp_close+0xb0/0xd8
R01-ID-J06.log:2012-11-28 11:46:33.541649 {DefaultControlEventListener} [mmcs]{22}.13.3: [c0000003e0933960] [c000000000038858] .put_files_struct+0xb8/0x144
R01-ID-J06.log:2012-11-28 11:46:33.542517 {DefaultControlEventListener} [mmcs]{22}.13.3: [c0000003e0933a10] [c00000000003a674] .do_exit+0x210/0x6ac
R01-ID-J06.log:2012-11-28 11:46:33.543291 {DefaultControlEventListener} [mmcs]{22}.13.3: [c0000003e0933af0] [c00000000003abc4] .do_group_exit+0xb4/0xe8
R01-ID-J06.log:2012-11-28 11:46:33.544067 {DefaultControlEventListener} [mmcs]{22}.13.3: [c0000003e0933b80] [c000000000047d60] .get_signal_to_deliver+0x3f0/0x478
R01-ID-J06.log:2012-11-28 11:46:33.544867 {DefaultControlEventListener} [mmcs]{22}.13.3: [c0000003e0933c70] [c00000000000a7a4] .do_signal_pending.clone.0+0x5c/0x310
R01-ID-J06.log:2012-11-28 11:46:33.545601 {DefaultControlEventListener} [mmcs]{22}.13.3: [c0000003e0933db0] [c00000000000aa78] .do_signal+0x20/0x60
R01-ID-J06.log:2012-11-28 11:46:33.546365 {DefaultControlEventListener} [mmcs]{22}.13.3: [c0000003e0933e30] [c000000000000a2c] do_work+0x20/0x24
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I believe Chris&apos;s IOR test was using the &quot;fsync on close&quot; option, so it looks like it completed the write phase and was now trying to fsync and close the open files. The fact that the stacks are all down in fsync call paths, and then 40+ minutes later I see some stacks down the close call path lead me to believe the node wasn&apos;t deadlocked (per se), and would have &quot;eventually&quot; cleared up. Either way, it looks like there is far too much contention on an obd_import::imp_lock lock which needs to be addressed. About five minutes after the &quot;close&quot; stacks were printed the node was rebooted. &lt;/p&gt;

&lt;p&gt;It is also worth noting that we were unable to login to the node(s) during this time. Are the threads spinning with interrupts disabled? That would unfortunately disable interrupts for the entire system for a really long time in our configuration.&lt;/p&gt;</comment>
                            <comment id="48563" author="liang" created="Thu, 29 Nov 2012 22:47:33 +0000"  >&lt;p&gt;I think it&apos;s just because imp_lock is under high contention if it&apos;s eventually cleared up &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/sad.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;, I don&apos;t have a very good idea to improve this so far, but I will see what I can do.&lt;br/&gt;
this spinlock will not disable interrupts, but soft lockup will dump stack to console which is extremely slow and the system might not be responding.  &lt;/p&gt;</comment>
                            <comment id="48620" author="morrone" created="Fri, 30 Nov 2012 14:59:42 +0000"  >&lt;p&gt;Well, careful, I&apos;m not sure that I believe that seeing the processes soft lockup in close() a few minutes before the reboot means what you think it means.&lt;/p&gt;

&lt;p&gt;This was a 98,304 process ior running on Sequoia.  That means 128 MPI tasks writing to lustre through one I/O Node (ION).  Each ION has 68 &quot;cpus&quot; from Linux&apos;s perspective, 17 real cores, and 4-way SMP threading.&lt;/p&gt;

&lt;p&gt;In the incident on ION R01-ID-J06, between the times of 2012-11-28 11:06:10.548123 and 2012-11-28 11:08:34.717051 (just over two minutes of time), I count at least 51 tasks that trigger soft lockup messages.&lt;/p&gt;

&lt;p&gt;Then, nearly 40 minutes later and only 5 minutes before the node is shut down, I see only two new processes that show soft lockups under the file close.  But you&apos;ll note that these are not the same processes that were stuck in the fsync() earlier.  Also note that these processes that get hung under close have their backtraces start at &quot;do_signal&quot;.&lt;/p&gt;

&lt;p&gt;I believe what is happening is that some, but not all, of the 128 processes get stuck in fsync.  Perhaps around 50 in this instance.  Some of the processes actually completed their fsync and are just sitting with the file descriptor open, doing nothing.&lt;/p&gt;

&lt;p&gt;Then IBM&apos;s control system comes along and tries to shutdown the ION at the system administrator&apos;s request.  One thing it likely did was send a signal to the sysiod tasks to shut them down.  During the signal handling to kill the process, the kernel is going to clean up open file descriptors.  When it tries to close the open lustre file descriptor, some processes now get stuck on the same lock as the others, but in the close() path rather than the fsync() path.&lt;/p&gt;

&lt;p&gt;After some timeout (5 minutes probably), IBM&apos;s system stops waiting for their daemons to exit, and starts a shutdown of the whole node.&lt;/p&gt;

&lt;p&gt;So I suspect that the processes really are stuck, and will never make progress.&lt;/p&gt;</comment>
                            <comment id="48624" author="morrone" created="Fri, 30 Nov 2012 15:08:57 +0000"  >&lt;p&gt;I attached an excerpt from the console log, file R01-ID-J06.log, for the time in question.  Mostly there are sysiod processes stuck, but there is one ptlrpcd_rcv process in there at the beginning of the incident as well.  Unfortunately, the backtrace of ptlrpcd_rcv is interleaved with a couple of other sysiod backtraces, so it is difficult to see where it was.  But perhaps we can find the culprit in there somewhere.&lt;/p&gt;</comment>
                            <comment id="48647" author="liang" created="Sun, 2 Dec 2012 00:02:19 +0000"  >&lt;p&gt;I have checked code and log again, but still didn&apos;t find any possible deadlock. Lustre only has very few cases that will disable interrupts (I remember we used to take spin_lock_bh for imp_lock, but we have changed that a few years ago). However, CDEBUG will definitely disable interrupt, and we did call CDEBUG_REQ many times with hold of imp_lock, CDEBUG_REQ has complex format which makes it a little expensive, so one thing might be worth a try is, disable D_HA and D_IOCTL on clients to see if it can make any difference; in the meantime, I will check code again.&lt;/p&gt;</comment>
                            <comment id="48729" author="liang" created="Tue, 4 Dec 2012 09:20:49 +0000"  >&lt;p&gt;I was wrong in previous comment, CDEBUG will not disable interrupt if it&apos;s in thread context, but I still suggest to disable D_HA and D_IOCTL to see if it can help before we find any other clue, I remember a couple of years ago I saw some tests hit soft lockup while enabling D_INFO.&lt;/p&gt;</comment>
                            <comment id="49008" author="bzzz" created="Mon, 10 Dec 2012 15:55:59 +0000"  >&lt;p&gt;Christopher, Prakash, could you describe the workload and/or estimate it in RPC/second please ?&lt;/p&gt;

&lt;p&gt;we usually limit number of RPCs in flight to 8 (per import). to achieve 1GB/s per OST with 1MB RPC we&apos;d need 1024 RPC/second. say, we&apos;re grabbing imp_lock roughly 10 times for every processed RPC, which gives us ~10K/sec lock instances. but this is in the case of 1GB/s going to a single OST.&lt;/p&gt;

&lt;p&gt;I think it&apos;d be helpful to collect data on CONFIG_LOCK_STAT-enabled kernel.&lt;/p&gt;

&lt;p&gt;if this is not a deadlock, then I&apos;d consider few possible contributors:&lt;br/&gt;
1) CDEBUG() overhead while imp_lock is held&lt;br/&gt;
2) cache ping-pong if OST_WRITE go to different partitions&lt;br/&gt;
3) a lot of non-committed requests making ptlrpc_free_committed() slow (which is running with imp_lock held)&lt;br/&gt;
4) lack of need_resched() in ptlrpc_check_set() (though in all listed cases it was in _spin_lock())&lt;/p&gt;</comment>
                            <comment id="49016" author="morrone" created="Mon, 10 Dec 2012 20:13:50 +0000"  >&lt;p&gt;When I had problems, I was running an ior something like this:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;  ior -F -e -g -C -t 1m -b 512m -o /p/ls1/morrone/foo&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The ior was 98,304 MPI tasks in size.&lt;/p&gt;

&lt;p&gt;Under normal conditions, each Sequoia I/O Node (ION) handles I/O for 128 compute nodes.  So a given ION will handle 128 files.  We have currently left the filesystem default at 1-stripe per file, so each ION will be writing to up to 128 different OSTs at more-or-less the same time.&lt;/p&gt;</comment>
                            <comment id="49017" author="morrone" created="Mon, 10 Dec 2012 20:15:25 +0000"  >&lt;p&gt;On writes, we currently see about 850 GB/s, so with 768 IONs that is roughly 1.1 GB/s per ION (the ION is the lustre client).&lt;/p&gt;</comment>
                            <comment id="49024" author="bzzz" created="Mon, 10 Dec 2012 23:23:40 +0000"  >&lt;p&gt;so, it&apos;s roughly 1.1/128 or ~9MB/s per OST on an average ION and actually very few RPC/second/import...&lt;/p&gt;</comment>
                            <comment id="49511" author="morrone" created="Thu, 20 Dec 2012 18:22:35 +0000"  >&lt;p&gt;And just to be clear, I believe that this WAS a deadlock situation, as detailed in &lt;a href=&quot;http://jira.whamcloud.com/browse/LU-2263?focusedCommentId=48620&amp;amp;page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-48620&quot; class=&quot;external-link&quot; rel=&quot;nofollow&quot;&gt;this comment&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="49525" author="liang" created="Fri, 21 Dec 2012 01:39:31 +0000"  >&lt;p&gt;I did find a deadlock....&lt;/p&gt;

&lt;p&gt;ptlrpc_at_recv_early_reply() calls ptlrpc_at_set_req_timeout() with hold of ptlrpc_request::rq_lock, and ptlrpc_at_set_req_timeout()-&amp;gt;import_at_get_index() requires imp_lock, it violates locking-order rule of Lustre, which assumes rq_lock can nest in imp_lock.&lt;br/&gt;
This is introduced by BZ16999&lt;br/&gt;
Patch is here: &lt;a href=&quot;http://review.whamcloud.com/#change,4880&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#change,4880&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="49527" author="bzzz" created="Fri, 21 Dec 2012 01:55:39 +0000"  >&lt;p&gt;interesting.. in the traces above can you find one holding rq_lock ?&lt;/p&gt;</comment>
                            <comment id="49529" author="liang" created="Fri, 21 Dec 2012 02:18:41 +0000"  >&lt;p&gt;No I can&apos;t, those traces look like someone forgot to unlock imp_lock, but I have already looked through all use-cases of imp_lock three times and can&apos;t find any case of that. This is the only buggy code I can find so far.&lt;/p&gt;</comment>
                            <comment id="49530" author="bzzz" created="Fri, 21 Dec 2012 02:24:06 +0000"  >&lt;p&gt;yes... OK. lets see how it does at LLNL.&lt;/p&gt;</comment>
                            <comment id="52466" author="jlevi" created="Fri, 15 Feb 2013 14:08:33 +0000"  >&lt;p&gt;Patch landed to master.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="16318">LU-2141</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                                        </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="17126">LU-2597</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="16736">LU-2366</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="17086">LU-2572</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="12076" name="R01-ID-J06.log" size="236821" author="morrone" created="Fri, 30 Nov 2012 15:08:57 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvbjr:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>5413</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>