<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:23:30 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-16044] osd: truncate vs write deadlock</title>
                <link>https://jira.whamcloud.com/browse/LU-16044</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;blockquote&gt;
&lt;p&gt;PID: 12333  TASK: ffff8d84a294c200  CPU: 8   COMMAND: &quot;ll_ost_io02_086&quot;&lt;br/&gt;
 #0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8d84a29937e0&amp;#93;&lt;/span&gt; __schedule at ffffffffa8988e18&lt;br/&gt;
 #1 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8d84a2993848&amp;#93;&lt;/span&gt; schedule at ffffffffa89891e9&lt;br/&gt;
 #2 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8d84a2993858&amp;#93;&lt;/span&gt; schedule_timeout at ffffffffa8986eb1&lt;br/&gt;
 #3 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8d84a2993908&amp;#93;&lt;/span&gt; io_schedule_timeout at ffffffffa8988a9d&lt;br/&gt;
 #4 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8d84a2993938&amp;#93;&lt;/span&gt; io_schedule at ffffffffa8988b38&lt;br/&gt;
 #5 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8d84a2993948&amp;#93;&lt;/span&gt; bit_wait_io at ffffffffa8987501&lt;br/&gt;
 #6 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8d84a2993960&amp;#93;&lt;/span&gt; __wait_on_bit_lock at ffffffffa89870b1&lt;br/&gt;
 #7 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8d84a29939a0&amp;#93;&lt;/span&gt; __lock_page at ffffffffa83bd2a4&lt;br/&gt;
 #8 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8d84a29939f8&amp;#93;&lt;/span&gt; truncate_inode_pages_range at ffffffffa83cf2fb&lt;br/&gt;
 #9 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8d84a2993b50&amp;#93;&lt;/span&gt; truncate_pagecache at ffffffffa83cf3f7&lt;br/&gt;
#10 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8d84a2993b78&amp;#93;&lt;/span&gt; osd_punch at ffffffffc14beecc &lt;span class=&quot;error&quot;&gt;&amp;#91;osd_ldiskfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
#11 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8d84a2993bd0&amp;#93;&lt;/span&gt; ofd_object_punch at ffffffffc15e7e26 &lt;span class=&quot;error&quot;&gt;&amp;#91;ofd&amp;#93;&lt;/span&gt;&lt;br/&gt;
#12 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8d84a2993c48&amp;#93;&lt;/span&gt; ofd_punch_hdl at ffffffffc15d442f &lt;span class=&quot;error&quot;&gt;&amp;#91;ofd&amp;#93;&lt;/span&gt;&lt;br/&gt;
#13 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8d84a2993cd0&amp;#93;&lt;/span&gt; tgt_checksum_niobuf_t10pi at ffffffffc0fe909e &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
#14 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8d84a2993d58&amp;#93;&lt;/span&gt; ptlrpc_server_handle_request at ffffffffc0f9090b &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
#15 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8d84a2993df8&amp;#93;&lt;/span&gt; ptlrpc_main at ffffffffc0f94274 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
#16 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8d84a2993ec8&amp;#93;&lt;/span&gt; kthread at ffffffffa82c5e31&lt;br/&gt;
......&lt;/p&gt;

&lt;p&gt;PID: 12603  TASK: ffff8d8490db0000  CPU: 14  COMMAND: &quot;ll_ost_io05_068&quot;&lt;br/&gt;
 #0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8d8490daf8a8&amp;#93;&lt;/span&gt; __schedule at ffffffffa8988e18&lt;br/&gt;
 #1 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8d8490daf910&amp;#93;&lt;/span&gt; schedule at ffffffffa89891e9&lt;br/&gt;
 #2 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8d8490daf920&amp;#93;&lt;/span&gt; rwsem_down_read_failed at ffffffffa898abd5&lt;br/&gt;
 #3 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8d8490daf9a0&amp;#93;&lt;/span&gt; call_rwsem_down_read_failed at ffffffffa8598068&lt;br/&gt;
 #4 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8d8490daf9f0&amp;#93;&lt;/span&gt; down_read at ffffffffa89886b0&lt;br/&gt;
 #5 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8d8490dafa08&amp;#93;&lt;/span&gt; osd_read_lock at ffffffffc148e03c &lt;span class=&quot;error&quot;&gt;&amp;#91;osd_ldiskfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #6 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8d8490dafa30&amp;#93;&lt;/span&gt; ofd_commitrw_write at ffffffffc15eb76c &lt;span class=&quot;error&quot;&gt;&amp;#91;ofd&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #7 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8d8490dafac0&amp;#93;&lt;/span&gt; ofd_commitrw at ffffffffc15efe4f &lt;span class=&quot;error&quot;&gt;&amp;#91;ofd&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #8 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8d8490dafb58&amp;#93;&lt;/span&gt; tgt_request_preprocess at ffffffffc0fee11b &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #9 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8d8490dafcd0&amp;#93;&lt;/span&gt; tgt_checksum_niobuf_t10pi at ffffffffc0fe909e &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
#10 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8d8490dafd58&amp;#93;&lt;/span&gt; ptlrpc_server_handle_request at ffffffffc0f9090b &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
#11 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8d8490dafdf8&amp;#93;&lt;/span&gt; ptlrpc_main at ffffffffc0f94274 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
#12 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8d8490dafec8&amp;#93;&lt;/span&gt; kthread at ffffffffa82c5e31&lt;/p&gt;&lt;/blockquote&gt;</description>
                <environment></environment>
        <key id="71484">LU-16044</key>
            <summary>osd: truncate vs write deadlock</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="bzzz">Alex Zhuravlev</assignee>
                                    <reporter username="bzzz">Alex Zhuravlev</reporter>
                        <labels>
                    </labels>
                <created>Mon, 25 Jul 2022 13:25:49 +0000</created>
                <updated>Mon, 19 Jun 2023 12:09:24 +0000</updated>
                            <resolved>Sun, 16 Oct 2022 01:05:18 +0000</resolved>
                                                    <fixVersion>Lustre 2.16.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>5</watches>
                                                                            <comments>
                            <comment id="341432" author="gerrit" created="Mon, 25 Jul 2022 13:30:25 +0000"  >&lt;p&gt;&quot;Alex Zhuravlev &amp;lt;bzzz@whamcloud.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/48033&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/48033&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16044&quot; title=&quot;osd: truncate vs write deadlock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16044&quot;&gt;&lt;del&gt;LU-16044&lt;/del&gt;&lt;/a&gt; osd: discard pagecache in truncate&apos;s declaration&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: f7f107f3f0144b5f15309506dc6bc3509c1d8d70&lt;/p&gt;</comment>
                            <comment id="345433" author="gerrit" created="Thu, 1 Sep 2022 16:34:00 +0000"  >&lt;p&gt;&quot;Stephane Thiell &amp;lt;sthiell@stanford.edu&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/48410&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/48410&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16044&quot; title=&quot;osd: truncate vs write deadlock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16044&quot;&gt;&lt;del&gt;LU-16044&lt;/del&gt;&lt;/a&gt; osd: discard pagecache in truncate&apos;s declaration&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: de4c30e20f4d474ec363835f2ce2456d23896cc4&lt;/p&gt;</comment>
                            <comment id="345435" author="sthiell" created="Thu, 1 Sep 2022 16:37:00 +0000"  >&lt;p&gt;Alex, this is a backport of your patch to b2_12. Basically I just removed the encryption part that is not available in 2.12. Can you please double check this looks OK to you? When I get your go, we&apos;ll try it in production. Thanks!&lt;/p&gt;</comment>
                            <comment id="346191" author="sthiell" created="Fri, 9 Sep 2022 15:43:38 +0000"  >&lt;p&gt;Unfortunately, even with 2.12.9 with both patches:&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16044&quot; title=&quot;osd: truncate vs write deadlock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16044&quot;&gt;&lt;del&gt;LU-16044&lt;/del&gt;&lt;/a&gt; osd: discard pagecache in truncate&apos;s declaration (&lt;a href=&quot;https://review.whamcloud.com/48410&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/48410&lt;/a&gt;)&lt;/li&gt;
	&lt;li&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-15117&quot; title=&quot;ofd_read_lock vs transaction deadlock while allocating buffers	&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-15117&quot;&gt;&lt;del&gt;LU-15117&lt;/del&gt;&lt;/a&gt; ofd: don&apos;t take lock for dt_bufs_get() (&lt;a href=&quot;https://review.whamcloud.com/47925&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/47925&lt;/a&gt;)&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;We hit a deadlock situation last night. Attaching &quot;foreach bt&quot; of the new crash dump as &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/45632/45632_foreach_bt_fir-io2-s2_20220909.txt&quot; title=&quot;foreach_bt_fir-io2-s2_20220909.txt attached to LU-16044&quot;&gt;foreach_bt_fir-io2-s2_20220909.txt&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;&lt;/p&gt;</comment>
                            <comment id="346218" author="sthiell" created="Fri, 9 Sep 2022 17:34:08 +0000"  >&lt;p&gt;We may have identified the source of the deadlock. A group of users had jobs using GNU parallel with --tmpdir set to Lustre, which apparently uses unlinked tmp files that are kept open and frequently calls ftruncate(0) on them.&lt;/p&gt;

&lt;p&gt;The command used is:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;parallel --tmpdir $folder/tmp --delay 2 -j $threads &amp;lt; $folder/calls.$cmd.txt
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;with &lt;tt&gt;$folder&lt;/tt&gt; set to Lustre&lt;/p&gt;

&lt;p&gt;We have asked the users to change their scripts and avoid Lustre to store such temporary files, and we&apos;ll see if that reduces the number of OSS deadlock.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="346298" author="bzzz" created="Sat, 10 Sep 2022 07:55:13 +0000"  >&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
schedule,io_schedule,bit_wait_io,__wait_on_bit_lock,__lock_page,mpage_prepare_extent_to_map,ldiskfs_writepages,do_writepages,__writeback_single_inode,writeback_sb_inodes,__writeback_inodes_wb,wb_writeback,bdi_writeback_workfn,process_one_work,worker_thread
	PIDs(1): &lt;span class=&quot;code-quote&quot;&gt;&quot;kworker/u259:2&quot;&lt;/span&gt;:94708 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;this is not the problem I tried to fix. probably better to say it&apos;s a related issue. need to think a bit more.. sorry for the inconvenience.&lt;/p&gt;</comment>
                            <comment id="346299" author="bzzz" created="Sat, 10 Sep 2022 08:16:26 +0000"  >&lt;blockquote&gt;&lt;p&gt;We have asked the users to change their scripts and avoid Lustre to store such temporary files, and we&apos;ll see if that reduces the number of OSS deadlock.&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;please ask for sysctl -a | grep vm.dirty from OSTs&lt;/p&gt;</comment>
                            <comment id="346303" author="sthiell" created="Sat, 10 Sep 2022 16:31:29 +0000"  >&lt;p&gt;Thanks Alex. This is the result from all our OSS on this system. I believe we use the default settings for CentOS 7.9.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;---------------
fir-io[1-8]-s[1-2] (16)
---------------
vm.dirty_background_bytes = 0
vm.dirty_background_ratio = 3
vm.dirty_bytes = 0
vm.dirty_expire_centisecs = 3000
vm.dirty_ratio = 10
vm.dirty_writeback_centisecs = 500
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;What we change is the following:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;vm.min_free_kbytes = 2097152
vm.swappiness = 1
vm.zone_reclaim_mode = 1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;OSS are based on AMD EPYC Naples, single socket 7401P. 512GB of RAM each.&lt;/p&gt;</comment>
                            <comment id="349780" author="gerrit" created="Sat, 15 Oct 2022 05:56:26 +0000"  >&lt;p&gt;&quot;Oleg Drokin &amp;lt;green@whamcloud.com&amp;gt;&quot; merged in patch &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/48033/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/48033/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16044&quot; title=&quot;osd: truncate vs write deadlock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16044&quot;&gt;&lt;del&gt;LU-16044&lt;/del&gt;&lt;/a&gt; osd: discard pagecache in truncate&apos;s declaration&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 0bb491b2ecf494c3f78fa08a101af8af7853a0fe&lt;/p&gt;</comment>
                            <comment id="349805" author="pjones" created="Sun, 16 Oct 2022 01:05:18 +0000"  >&lt;p&gt;Landed for 2.16&lt;/p&gt;</comment>
                            <comment id="375830" author="gerrit" created="Mon, 19 Jun 2023 12:09:24 +0000"  >&lt;p&gt;&quot;Etienne AUJAMES &amp;lt;eaujames@ddn.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/51360&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/51360&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16044&quot; title=&quot;osd: truncate vs write deadlock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16044&quot;&gt;&lt;del&gt;LU-16044&lt;/del&gt;&lt;/a&gt; osd: discard pagecache in truncate&apos;s declaration&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_15&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 536d362534f37e53bae1868b4ea1a044306b69a4&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                                        </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="45632" name="foreach_bt_fir-io2-s2_20220909.txt" size="1556459" author="sthiell" created="Fri, 9 Sep 2022 15:43:14 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i02ve7:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>