<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:50:43 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-5349] Deadlock in mdc_close()</title>
                <link>https://jira.whamcloud.com/browse/LU-5349</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;We had to crash/dump one of our Lustre clients because of a deadlock issue in mdc_close(). The PID 5231 was waiting for a lock that it already owned. BTW, we had a lot of process waiting for this lock.&lt;/p&gt;

&lt;p&gt;In the backtrace of the process, we can see two calls to mdc_close(). The second is due to the system reclaiming memory.&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;crash&amp;gt; bt 5231
PID: 5231   TASK: ffff881518308b00  CPU: 2   COMMAND: &lt;span class=&quot;code-quote&quot;&gt;&quot;code2&quot;&lt;/span&gt;
 #0 [ffff88171cb43188] schedule at ffffffff81528a52
 #1 [ffff88171cb43250] __mutex_lock_slowpath at ffffffff8152a20e
 #2 [ffff88171cb432c0] mutex_lock at ffffffff8152a0ab                  &amp;lt;=== Requires a &lt;span class=&quot;code-keyword&quot;&gt;new&lt;/span&gt; lock
 #3 [ffff88171cb432e0] mdc_close at ffffffffa09176db [mdc]
 #4 [ffff88171cb43330] lmv_close at ffffffffa0b9bcb8 [lmv]
 #5 [ffff88171cb43380] ll_close_inode_openhandle at ffffffffa0a80c1e [lustre]
 #6 [ffff88171cb43400] ll_md_real_close at ffffffffa0a81afa [lustre]
 #7 [ffff88171cb43430] ll_clear_inode at ffffffffa0a92dee [lustre]
 #8 [ffff88171cb43470] clear_inode at ffffffff811a626c
 #9 [ffff88171cb43490] dispose_list at ffffffff811a6340
#10 [ffff88171cb434d0] shrink_icache_memory at ffffffff811a6694
#11 [ffff88171cb43530] shrink_slab at ffffffff81138b7a
#12 [ffff88171cb43590] zone_reclaim at ffffffff8113b77e
#13 [ffff88171cb436b0] get_page_from_freelist at ffffffff8112d8dc
#14 [ffff88171cb437e0] __alloc_pages_nodemask at ffffffff8112f443
#15 [ffff88171cb43920] alloc_pages_current at ffffffff811680ca
#16 [ffff88171cb43950] __vmalloc_area_node at ffffffff81159696
#17 [ffff88171cb439b0] __vmalloc_node at ffffffff8115953d
#18 [ffff88171cb43a10] vmalloc at ffffffff8115985c
#19 [ffff88171cb43a20] cfs_alloc_large at ffffffffa03b4b1e [libcfs]
#20 [ffff88171cb43a30] null_alloc_repbuf at ffffffffa06c4961 [ptlrpc]
#21 [ffff88171cb43a60] sptlrpc_cli_alloc_repbuf at ffffffffa06b2355 [ptlrpc]
#22 [ffff88171cb43a90] ptl_send_rpc at ffffffffa068432c [ptlrpc]
#23 [ffff88171cb43b50] ptlrpc_send_new_req at ffffffffa067879b [ptlrpc]
#24 [ffff88171cb43bc0] ptlrpc_set_wait at ffffffffa067ddb6 [ptlrpc]
#25 [ffff88171cb43c60] ptlrpc_queue_wait at ffffffffa067e0df [ptlrpc]   &amp;lt;=== PID has the lock
#26 [ffff88171cb43c80] mdc_close at ffffffffa0917714 [mdc]
#27 [ffff88171cb43cd0] lmv_close at ffffffffa0b9bcb8 [lmv]
#28 [ffff88171cb43d20] ll_close_inode_openhandle at ffffffffa0a80c1e [lustre]
#29 [ffff88171cb43da0] ll_md_real_close at ffffffffa0a81afa [lustre]
#30 [ffff88171cb43dd0] ll_md_close at ffffffffa0a81d8a [lustre]
#31 [ffff88171cb43e80] ll_file_release at ffffffffa0a8233b [lustre]
#32 [ffff88171cb43ec0] __fput at ffffffff8118ad55
#33 [ffff88171cb43f10] fput at ffffffff8118ae95
#34 [ffff88171cb43f20] filp_close at ffffffff811861bd
#35 [ffff88171cb43f50] sys_close at ffffffff81186295
#36 [ffff88171cb43f80] system_call_fastpath at ffffffff8100b072
    RIP: 00002adaacdf26d0  RSP: 00007fff9665e238  RFLAGS: 00010246
    RAX: 0000000000000003  RBX: ffffffff8100b072  RCX: 0000000000002261
    RDX: 00000000044a24b0  RSI: 0000000000000001  RDI: 0000000000000005
    RBP: 0000000000000000   R8: 00002adaad0ac560   R9: 0000000000000001
    R10: 00000000000004fd  R11: 0000000000000246  R12: 00000000000004fc
    R13: 00000000ffffffff  R14: 00000000044a23d0  R15: 00000000ffffffff
    ORIG_RAX: 0000000000000003  CS: 0033  SS: 002b
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;We have a recursive locking here, which is not permitted.&lt;/p&gt;</description>
                <environment>RHEL6 w/ patched kernel</environment>
        <key id="25581">LU-5349</key>
            <summary>Deadlock in mdc_close()</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="bfaccini">Bruno Faccini</assignee>
                                    <reporter username="bruno.travouillon">Bruno Travouillon</reporter>
                        <labels>
                    </labels>
                <created>Tue, 15 Jul 2014 06:49:39 +0000</created>
                <updated>Mon, 29 Sep 2014 13:06:25 +0000</updated>
                            <resolved>Mon, 29 Sep 2014 13:06:25 +0000</resolved>
                                    <version>Lustre 2.4.3</version>
                                    <fixVersion>Lustre 2.7.0</fixVersion>
                    <fixVersion>Lustre 2.5.4</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>9</watches>
                                                                            <comments>
                            <comment id="89011" author="bfaccini" created="Tue, 15 Jul 2014 11:12:59 +0000"  >&lt;p&gt;May be we can avoid this self dead-lock by detecting in prolog of ll_clear_inode() routine that current thread is already owner of class_exp2obd(ll_i2sbi(inode)-&amp;gt;ll_md_exp)-&amp;gt;u.cli.cl_close_lock-&amp;gt;rpcl_mutex ?&lt;/p&gt;</comment>
                            <comment id="89066" author="jlevi" created="Tue, 15 Jul 2014 17:20:06 +0000"  >&lt;p&gt;Bruno,&lt;br/&gt;
Can you comment if this is happening in 2.6?&lt;/p&gt;</comment>
                            <comment id="89756" author="bfaccini" created="Tue, 22 Jul 2014 17:14:59 +0000"  >&lt;p&gt;Jodi: Looks like yes.&lt;/p&gt;

&lt;p&gt;I try to implement my fix idea in &lt;a href=&quot;http://review.whamcloud.com/11183&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/11183&lt;/a&gt;. But Bruno, any help to find a way to reproduce the same situation is welcome ...&lt;/p&gt;</comment>
                            <comment id="89765" author="green" created="Tue, 22 Jul 2014 17:57:03 +0000"  >&lt;p&gt;The most important part of the failure here is that I just realized vmalloc does not take GFP mask so we cannot tell it not to dive into filesystems for memory reclaiming.&lt;br/&gt;
As such it opens endless possibilities for deadlocks and such.&lt;/p&gt;

&lt;p&gt;The real fix here would be to change OBD_VMALLOC to use __vmalloc instead with GFP_NOFS flags like we do with regular vmalloc. (we also need to add GFP_ZERO of course to prezero the memory for us).&lt;/p&gt;</comment>
                            <comment id="89773" author="adilger" created="Tue, 22 Jul 2014 18:36:54 +0000"  >&lt;p&gt;How long has __vmalloc() existed?  I&apos;ve never seen that before, and we&apos;ve had similar problems to this in the past that could have been fixed in a similar manner.&lt;/p&gt;</comment>
                            <comment id="89797" author="green" created="Tue, 22 Jul 2014 22:11:19 +0000"  >&lt;p&gt;I just did some git research and it has been there since forever, basically.&lt;br/&gt;
2.6.12-rc2 (when Linus repo at github starts) already has it.&lt;/p&gt;</comment>
                            <comment id="89802" author="bfaccini" created="Tue, 22 Jul 2014 23:14:24 +0000"  >&lt;p&gt;Oleg: thanks for this hint! I agree it looks more elegant than my own idea, and since it is much more restrictive it should handle all other dead-lock possibilities than only this inflight RPC serialization mutex ...&lt;/p&gt;

&lt;p&gt;But concerning the fix detail now, I don&apos;t see any current way to specify any combination of allocation flag within our vmalloc()/vzalloc() based set of macros, should we think to implement it now ?&lt;/p&gt;</comment>
                            <comment id="89812" author="green" created="Wed, 23 Jul 2014 01:37:58 +0000"  >&lt;p&gt;Basically we need to do the same thing we do for the OBD_ALLOC code - just use GFP_NOFS by default (since we are a filesystem, that&apos;s a safe bet).&lt;br/&gt;
So far there was no need for OBD_VMALLOC_GFP so I am not sure now is the time to introduce it.&lt;/p&gt;</comment>
                            <comment id="89835" author="bfaccini" created="Wed, 23 Jul 2014 09:18:20 +0000"  >&lt;p&gt;I just pushed &lt;a href=&quot;http://review.whamcloud.com/11190&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/11190&lt;/a&gt; in order to have vmalloc&lt;span class=&quot;error&quot;&gt;&amp;#91;_node&amp;#93;&lt;/span&gt;() based allocations to no longer use __GFP_FS by default. I also found that this will enable for a real NUMA node parameter setting!&lt;/p&gt;</comment>
                            <comment id="89849" author="bfaccini" created="Wed, 23 Jul 2014 15:00:47 +0000"  >&lt;p&gt;Humm too bad, __vmalloc_node() is not exported by Kernels ... So I am stuck if I want to also fix cfs_cpt_vzalloc() about the fact that __GFP_FS is used by default by vzalloc_node() and continue to forward a NUMA node specification, since I think only __vmalloc_node() would permit ... &lt;br/&gt;
So what should I do in cfs_cpt_vzalloc(), call __vmalloc() and forget about the node specified (but this may imply performance issues with NUMA aware ptlrpcds which use this...) or leave it like this until Kernel exports an accurate entry-point and assume that at the moment no cfs_cpt_vzalloc() call occurs during any File-System operations ?&lt;/p&gt;

&lt;p&gt;I just pushed patch-set #2 of &lt;a href=&quot;http://review.whamcloud.com/11190&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/11190&lt;/a&gt; assuming this last case.&lt;/p&gt;</comment>
                            <comment id="89861" author="adilger" created="Wed, 23 Jul 2014 17:49:37 +0000"  >&lt;p&gt;The other question here is how many OSTs are in this filesystem, and if you are using wide striping?  I&apos;m trying to figure out why this was using vmalloc() instead of kmalloc(), and if there is a separate bug to be addressed to reduce the allocation size. &lt;/p&gt;</comment>
                            <comment id="89862" author="bruno.travouillon" created="Wed, 23 Jul 2014 18:48:14 +0000"  >&lt;p&gt;There are several Lustre filesystems mounted on this client:&lt;/p&gt;
&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;a Lustre 2.4 filesystem on the same LNET with 1 MDT and 480 OSTs. We do not use wide striping.&lt;/li&gt;
	&lt;li&gt;a Lustre 2.1 filesystem on another LNET with 1 MDT and 224 OSTs.&lt;/li&gt;
	&lt;li&gt;a Lustre 2.1 filesystem on another LNET with 1 MDT and 56 OSTs.&lt;/li&gt;
	&lt;li&gt;a Lustre 2.1 filesystem on another LNET with 1 MDT and 48 OSTs.&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;This Lustre client is a login node, with many users working interactively.&lt;/p&gt;

&lt;p&gt;You can find in the attached file the outputs of &lt;tt&gt;sar -B&lt;/tt&gt; and &lt;tt&gt;sar -R&lt;/tt&gt;.&lt;/p&gt;

&lt;p&gt;Hope this helps.&lt;/p&gt;</comment>
                            <comment id="91057" author="bfaccini" created="Thu, 7 Aug 2014 14:33:00 +0000"  >&lt;p&gt;Patch &lt;a href=&quot;http://review.whamcloud.com/#/c/11183/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/11183/&lt;/a&gt; has been abandoned.&lt;/p&gt;</comment>
                            <comment id="93094" author="bfaccini" created="Wed, 3 Sep 2014 09:24:18 +0000"  >&lt;p&gt;Master patch &lt;a href=&quot;http://review.whamcloud.com/11190&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/11190&lt;/a&gt; has landed.&lt;br/&gt;
b2_5 version is now at &lt;a href=&quot;http://review.whamcloud.com/11739&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/11739&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="93099" author="sebastien.buisson" created="Wed, 3 Sep 2014 10:45:54 +0000"  >&lt;p&gt;Bruno, why does b2_5 version lack __GFP_ZERO flag in call to __vmalloc() (__OBD_VMALLOC_VERBOSE macro)?&lt;/p&gt;</comment>
                            <comment id="93101" author="bfaccini" created="Wed, 3 Sep 2014 12:29:40 +0000"  >&lt;p&gt;I forgot to mention it/why, nice catch! It is because b2_5 uses vmalloc() when master used vzalloc(), and I wanted to challenge my future reviewers about this ...&lt;/p&gt;</comment>
                            <comment id="94801" author="green" created="Wed, 24 Sep 2014 03:14:01 +0000"  >&lt;p&gt;also GFP_ZERO is not really needed in b2_5 patch because we explicitly zero the allocation with memset() afterwards anyway.&lt;/p&gt;</comment>
                            <comment id="95178" author="pjones" created="Mon, 29 Sep 2014 13:06:25 +0000"  >&lt;p&gt;Landed for 2.5.4 and 2.7&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="15375" name="report_for_support" size="55957" author="bruno.travouillon" created="Tue, 15 Jul 2014 06:49:39 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzwrg7:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>14915</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>