<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:47:29 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary, append 'field=key&field=summary' to the URL of your request.
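For example (illustrative URL; the exact issue-xml view path depends on the JIRA configuration):
https://jira.whamcloud.com/si/jira.issueviews:issue-xml/LU-4977/LU-4977.xml?field=key&field=summary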
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-4977] Deadlock in balance_dirty_pages()</title>
                <link>https://jira.whamcloud.com/browse/LU-4977</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;I can occasionally see this issue in machines with less memory. The deadlock has the following call stack:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;dd            D 0000000000000000     0  2158      1 0x00000004
 ffff88010ecc10f8 0000000000000086 ffff8801ffffffff 0000000042a8c635
 ffff88010ecc1078 ffff88009ccb68a0 0000000000047e6a ffffffffaca103a3
 ffff8800d7bd5058 ffff88010ecc1fd8 000000000000fb88 ffff8800d7bd5058
Call Trace:
 [&amp;lt;ffffffff810a2431&amp;gt;] ? ktime_get_ts+0xb1/0xf0
 [&amp;lt;ffffffff81119e10&amp;gt;] ? sync_page+0x0/0x50
 [&amp;lt;ffffffff8150ed93&amp;gt;] io_schedule+0x73/0xc0
 [&amp;lt;ffffffff81119e4d&amp;gt;] sync_page+0x3d/0x50
 [&amp;lt;ffffffff8150f5fa&amp;gt;] __wait_on_bit_lock+0x5a/0xc0
 [&amp;lt;ffffffff81119de7&amp;gt;] __lock_page+0x67/0x70
 [&amp;lt;ffffffff81096de0&amp;gt;] ? wake_bit_function+0x0/0x50
 [&amp;lt;ffffffffa0f60101&amp;gt;] vvp_page_make_ready+0x271/0x280 [lustre]
 [&amp;lt;ffffffffa0542999&amp;gt;] cl_page_make_ready+0x89/0x370 [obdclass]
 [&amp;lt;ffffffffa03b45a1&amp;gt;] ? libcfs_debug_msg+0x41/0x50 [libcfs]
 [&amp;lt;ffffffffa0a323b7&amp;gt;] osc_extent_make_ready+0x3b7/0xe50 [osc]
 [&amp;lt;ffffffff81055ad3&amp;gt;] ? __wake_up+0x53/0x70
 [&amp;lt;ffffffffa0a36af6&amp;gt;] osc_io_unplug0+0x1736/0x2130 [osc]
 [&amp;lt;ffffffff8103c7d8&amp;gt;] ? pvclock_clocksource_read+0x58/0xd0
 [&amp;lt;ffffffffa03b45a1&amp;gt;] ? libcfs_debug_msg+0x41/0x50 [libcfs]
 [&amp;lt;ffffffffa0a39681&amp;gt;] osc_io_unplug+0x11/0x20 [osc]
 [&amp;lt;ffffffffa0a3bc86&amp;gt;] osc_cache_writeback_range+0xdb6/0x1290 [osc]
 [&amp;lt;ffffffffa03b9d47&amp;gt;] ? cfs_hash_bd_lookup_intent+0x37/0x130 [libcfs]
 [&amp;lt;ffffffffa03b9d47&amp;gt;] ? cfs_hash_bd_lookup_intent+0x37/0x130 [libcfs]
 [&amp;lt;ffffffffa03b9362&amp;gt;] ? cfs_hash_bd_add_locked+0x62/0x90 [libcfs]
 [&amp;lt;ffffffffa054a45d&amp;gt;] ? cl_io_sub_init+0x5d/0xc0 [obdclass]
 [&amp;lt;ffffffffa0a29fd0&amp;gt;] osc_io_fsync_start+0x90/0x360 [osc]
 [&amp;lt;ffffffffa0547640&amp;gt;] ? cl_io_start+0x0/0x140 [obdclass]
 [&amp;lt;ffffffffa05476aa&amp;gt;] cl_io_start+0x6a/0x140 [obdclass]
 [&amp;lt;ffffffffa0a8f18e&amp;gt;] lov_io_call+0x8e/0x130 [lov]
 [&amp;lt;ffffffffa0a9324c&amp;gt;] lov_io_start+0x10c/0x180 [lov]
 [&amp;lt;ffffffffa05476aa&amp;gt;] cl_io_start+0x6a/0x140 [obdclass]
 [&amp;lt;ffffffffa054aea4&amp;gt;] cl_io_loop+0xb4/0x1b0 [obdclass]
 [&amp;lt;ffffffffa0f02acb&amp;gt;] cl_sync_file_range+0x31b/0x500 [lustre]
 [&amp;lt;ffffffffa0f2fe7c&amp;gt;] ll_writepages+0x9c/0x220 [lustre]
 [&amp;lt;ffffffff8112e1b1&amp;gt;] do_writepages+0x21/0x40
 [&amp;lt;ffffffff811aca9d&amp;gt;] writeback_single_inode+0xdd/0x290
 [&amp;lt;ffffffff811aceae&amp;gt;] writeback_sb_inodes+0xce/0x180
 [&amp;lt;ffffffff811ad00b&amp;gt;] writeback_inodes_wb+0xab/0x1b0
 [&amp;lt;ffffffff8112d60d&amp;gt;] balance_dirty_pages+0x23d/0x4d0
 [&amp;lt;ffffffffa0541768&amp;gt;] ? cl_page_invoid+0x68/0x160 [obdclass]
 [&amp;lt;ffffffff8112d904&amp;gt;] balance_dirty_pages_ratelimited_nr+0x64/0x70
 [&amp;lt;ffffffff8111a86a&amp;gt;] generic_file_buffered_write+0x1da/0x2e0
 [&amp;lt;ffffffff81075887&amp;gt;] ? current_fs_time+0x27/0x30
 [&amp;lt;ffffffff8111c210&amp;gt;] __generic_file_aio_write+0x260/0x490
 [&amp;lt;ffffffffa0a93d9c&amp;gt;] ? lov_lock_enqueue+0xbc/0x170 [lov]
 [&amp;lt;ffffffff8111c4c8&amp;gt;] generic_file_aio_write+0x88/0x100
 [&amp;lt;ffffffffa0f634a2&amp;gt;] vvp_io_write_start+0x102/0x3f0 [lustre]
 [&amp;lt;ffffffffa05476aa&amp;gt;] cl_io_start+0x6a/0x140 [obdclass]
 [&amp;lt;ffffffffa054aea4&amp;gt;] cl_io_loop+0xb4/0x1b0 [obdclass]
 [&amp;lt;ffffffffa0f00297&amp;gt;] ll_file_io_generic+0x407/0x8d0 [lustre]
 [&amp;lt;ffffffffa05406c9&amp;gt;] ? cl_env_get+0x29/0x350 [obdclass]
 [&amp;lt;ffffffffa0f00fa3&amp;gt;] ll_file_aio_write+0x133/0x2b0 [lustre]
 [&amp;lt;ffffffffa0f01279&amp;gt;] ll_file_write+0x159/0x290 [lustre]
 [&amp;lt;ffffffff81181398&amp;gt;] vfs_write+0xb8/0x1a0
 [&amp;lt;ffffffff81181c91&amp;gt;] sys_write+0x51/0x90
 [&amp;lt;ffffffff8100b072&amp;gt;] system_call_fastpath+0x16/0x1b
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;In balance_dirty_pages(), the kernel tries to write back some dirty pages after write_end(). However, ll_write_end() can keep the page locked in order to add it to the commit queue, so the subsequent writeback blocks in __lock_page() on a lock that the writing thread itself still holds, which causes the deadlock.&lt;/p&gt;

&lt;p&gt;We can fix the problem by releasing the page in ll_write_end() if the page is already dirty.&lt;/p&gt;
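
&lt;p&gt;For illustration, a minimal sketch of that idea, assuming the standard kernel write_end() hook signature (vvp_queue_for_commit() is a hypothetical stand-in for the real commit-queue helper, not an actual Lustre function):&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;#include &amp;lt;linux/fs.h&amp;gt;
#include &amp;lt;linux/mm.h&amp;gt;
#include &amp;lt;linux/pagemap.h&amp;gt;

/*
 * Sketch only, not the landed patch. If the page is already dirty, a
 * previous write has queued it for commit, so keeping it locked here
 * is unnecessary. Unlocking it lets the writeback started later by
 * balance_dirty_pages() take the page lock instead of blocking on a
 * lock this thread still holds.
 */
static int ll_write_end(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
                        struct page *vmpage, void *fsdata)
{
        int rc = 0;

        if (PageDirty(vmpage)) {
                /* already dirty: release the page lock right away */
                unlock_page(vmpage);
        } else {
                /* hypothetical helper: keep the page locked and add
                 * it to the commit queue, as before */
                rc = vvp_queue_for_commit(vmpage);
        }

        page_cache_release(vmpage);
        return rc ? rc : copied;
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;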

&lt;p&gt;Patch is coming.&lt;/p&gt;</description>
                <environment></environment>
        <key id="24487">LU-4977</key>
            <summary>Deadlock in balance_dirty_pages()</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                <statusCategory id="3" key="done" colorName="success"/>
                <resolution id="1">Fixed</resolution>
                <assignee username="jay">Jinshan Xiong</assignee>
                <reporter username="jay">Jinshan Xiong</reporter>
                <labels>
                </labels>
                <created>Tue, 29 Apr 2014 15:41:08 +0000</created>
                <updated>Thu, 15 May 2014 18:23:54 +0000</updated>
                <resolved>Wed, 14 May 2014 15:16:39 +0000</resolved>
                <version>Lustre 2.6.0</version>
                <fixVersion>Lustre 2.6.0</fixVersion>
                <due></due>
                <votes>0</votes>
                <watches>5</watches>
                <comments>
                            <comment id="82747" author="jay" created="Tue, 29 Apr 2014 15:57:30 +0000"  >&lt;p&gt;patch is located at &lt;a href=&quot;http://review.whamcloud.com/10149&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/10149&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="84088" author="pjones" created="Wed, 14 May 2014 15:16:39 +0000"  >&lt;p&gt;Landed for 2.6&lt;/p&gt;</comment>
                            <comment id="84122" author="paf" created="Wed, 14 May 2014 20:44:43 +0000"  >&lt;p&gt;The patch for this issue removes the change made to fix &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4873&quot; title=&quot;Lustre client hangs in vvp_page_make_ready&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4873&quot;&gt;&lt;del&gt;LU-4873&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;Can you explain why this change means it&apos;s safe to do:&lt;br/&gt;
vmpage = grab_cache_page_nowait(mapping, index);&lt;/p&gt;

&lt;p&gt;for partial page writes?&lt;/p&gt;</comment>
                            <comment id="84184" author="paf" created="Thu, 15 May 2014 15:45:23 +0000"  >&lt;p&gt;With &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4977&quot; title=&quot;Deadlock in balance_dirty_pages()&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4977&quot;&gt;&lt;del&gt;LU-4977&lt;/del&gt;&lt;/a&gt;, we see many tasks failing to exit, stuck in various pieces of the memory management code.  While we no longer see &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4873&quot; title=&quot;Lustre client hangs in vvp_page_make_ready&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4873&quot;&gt;&lt;del&gt;LU-4873&lt;/del&gt;&lt;/a&gt;, we see many tasks stuck in other locations:&lt;/p&gt;

&lt;p&gt;Here&apos;s a sampling...  We saw a small number of this stack trace:&lt;br/&gt;
2014-05-14T23:40:01.626699-05:00 c3-0c0s5n1 &amp;lt;node_health:5.1&amp;gt; APID:13971031 (Application_Exited_Check) STACK: call_rwsem_down_write_failed+0x13/0x20; ll_setattr_raw+0x19a/0x1030 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;; ll_setattr+0x57/0xf0 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;; notify_change+0x120/0x310; file_remove_suid+0x81/0xb0; __generic_file_aio_write+0x209/0x450; generic_file_aio_write+0x59/0xc0; vvp_io_write_start+0xe0/0x3c0 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;; cl_io_start+0x72/0x140 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;; cl_io_loop+0xac/0x1a0 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;; ll_file_io_generic+0x452/0x700 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;; ll_file_aio_write+0x23e/0x290 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;; ll_file_write+0x1f2/0x280 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;; vfs_write+0xcb/0x180; sys_write+0x55/0x90; system_call_fastpath+0x16/0x1b; 0x2aaaad4f6730; 0xffffffffffffffff;&lt;/p&gt;

&lt;p&gt;A similar number with this stack trace:&lt;br/&gt;
2014-05-14T23:24:19.890744-05:00 c6-0c2s2n0 &amp;lt;node_health:5.1&amp;gt; APID:13969099 (Application_Exited_Check) STACK: call_rwsem_down_write_failed+0x13/0x20; do_coredump+0x14e/0xbe0; get_signal_to_deliver+0x23b/0x480; do_notify_resume+0xe0/0x7f0; retint_signal+0x46/0x83; 0x2000feb6; 0xffffffffffffffff;&lt;/p&gt;

&lt;p&gt;A lot with this stack trace:&lt;br/&gt;
2014-05-14T23:24:20.280631-05:00 c6-0c1s4n0 &amp;lt;node_health:5.1&amp;gt; APID:13969099 (Application_Exited_Check) STACK: call_rwsem_down_write_failed+0x13/0x20; kgni_mm_fini+0x82/0x3e0 &lt;span class=&quot;error&quot;&gt;&amp;#91;kgni_gem&amp;#93;&lt;/span&gt;; kgni_nic_destroy+0x37/0x70 &lt;span class=&quot;error&quot;&gt;&amp;#91;kgni_gem&amp;#93;&lt;/span&gt;; kgni_close+0x164/0x230 &lt;span class=&quot;error&quot;&gt;&amp;#91;kgni_gem&amp;#93;&lt;/span&gt;; fput+0xda/0x200; filp_close+0x63/0x90; put_files_struct+0x84/0xe0; exit_files+0x53/0x70; do_exit+0x1ec/0x980; do_group_exit+0x48/0xc0; get_signal_to_deliver+0x243/0x480; do_notify_resume+0xe0/0x7f0; int_signal+0x12/0x17; 0x2001c111; 0xffffffffffffffff;&lt;/p&gt;

&lt;p&gt;And the largest number were stuck here:&lt;br/&gt;
2014-05-14T22:44:36.488369-05:00 c4-0c2s4n0 &amp;lt;node_health:5.1&amp;gt; APID:13969066 (Application_Exited_Check) STACK: lu_cache_shrink+0x56/0x2a0 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;; shrink_slab+0xae/0x2c0; do_try_to_free_pages+0x593/0x6e0; try_to_free_pages+0x116/0x3c0; __alloc_pages_nodemask+0x58c/0x950; alloc_pages_current+0xbe/0x130; alloc_buddy_huge_page+0xf4/0x130; hugetlb_acct_memory+0xf0/0x330; hugetlb_reserve_pages+0x10c/0x1e0; hugetlbfs_file_mmap+0xec/0x150; mmap_region+0x4d2/0x6b0; do_mmap_pgoff+0x367/0x390; sys_mmap_pgoff+0x1fe/0x220; sys_mmap+0x29/0x30; system_call_fastpath+0x16/0x1b; 0x201b5599; 0xffffffffffffffff;&lt;/p&gt;</comment>
                            <comment id="84202" author="jay" created="Thu, 15 May 2014 17:30:38 +0000"  >&lt;p&gt;Hi Patrick,&lt;/p&gt;

&lt;p&gt;I don&apos;t think the problem you&apos;ve seen is related to the patch here. Is this the first time you&apos;ve seen this issue? Please file a separate ticket.&lt;/p&gt;</comment>

&lt;p&gt;Jinshan&lt;/p&gt;</comment>
                            <comment id="84207" author="paf" created="Thu, 15 May 2014 18:23:54 +0000"  >&lt;p&gt;Ah, sorry, the setattr_raw issue is a known one.  For some reason that fix wasn&apos;t on this tree.&lt;/p&gt;

&lt;p&gt;The others are, as far as I can tell, new instances.  I&apos;m going to test again with the latest master...&lt;br/&gt;
The processes stuck in lu_cache_shrink eventually completed, but a significant amount of time was spent there first.&lt;/p&gt;</comment>
                </comments>
                <issuelinks>
                    <issuelinktype id="10011">
                        <name>Related</name>
                        <outwardlinks description="is related to ">
                            <issuelink>
                                <issuekey id="24121">LU-4873</issuekey>
                            </issuelink>
                        </outwardlinks>
                    </issuelinktype>
                </issuelinks>
                <attachments>
                </attachments>
                <subtasks>
                </subtasks>
                <customfields>
                    <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                        </customfieldvalues>
                    </customfield>
                    <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzwle7:</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                    <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>13790</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                    <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                </customfields>
    </item>
</channel>
</rss>