<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:31:24 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-16958] migrate vs regular ops deadlock</title>
                <link>https://jira.whamcloud.com/browse/LU-16958</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
PID: 350193  TASK: ffff9bd65af446c0  CPU: 0   COMMAND: &lt;span class=&quot;code-quote&quot;&gt;&quot;getfattr&quot;&lt;/span&gt;
 #0 [ffff9bd63ffb7950] __schedule at ffffffffba5a232d
    /tmp/kernel/kernel/sched/core.c: 3109
 #1 [ffff9bd63ffb79d8] schedule at ffffffffba5a2748
    /tmp/kernel/./arch/x86/include/asm/preempt.h: 84
 #2 [ffff9bd63ffb79e8] rwsem_down_write_slowpath at ffffffffba0f41a7
    /tmp/kernel/./arch/x86/include/asm/current.h: 15
 #3 [ffff9bd63ffb7a88] down_write at ffffffffba5a691a
    /tmp/kernel/./include/linux/err.h: 36
 #4 [ffff9bd63ffb7ac0] vvp_inode_ops at ffffffffc116d57f [lustre]
    /home/lustre/linux-4.18.0-305.25.1.el8_4/./arch/x86/include/asm/current.h: 15
 #5 [ffff9bd63ffb7ae0] cl_object_inode_ops at ffffffffc0454a50 [obdclass]
    /home/lustre/master-mine/lustre/obdclass/cl_object.c: 442
 #6 [ffff9bd63ffb7b18] lov_conf_set at ffffffffc0aa36c4 [lov]
    /home/lustre/master-mine/lustre/lov/lov_object.c: 1465
 #7 [ffff9bd63ffb7b88] cl_conf_set at ffffffffc04542d8 [obdclass]
    /home/lustre/master-mine/lustre/obdclass/cl_object.c: 299
 #8 [ffff9bd63ffb7bb8] ll_layout_conf at ffffffffc111d110 [lustre]
    /home/lustre/master-mine/lustre/llite/file.c: 5995
 #9 [ffff9bd63ffb7c28] ll_layout_refresh at ffffffffc111dad3 [lustre]
    /home/lustre/master-mine/libcfs/include/libcfs/libcfs_debug.h: 155
#10 [ffff9bd63ffb7cf0] vvp_io_init at ffffffffc116d019 [lustre]
    /home/lustre/master-mine/lustre/llite/vvp_io.c: 1870
#11 [ffff9bd63ffb7d20] __cl_io_init at ffffffffc045e66f [obdclass]
    /home/lustre/master-mine/lustre/obdclass/cl_io.c: 134
#12 [ffff9bd63ffb7d58] cl_glimpse_size0 at ffffffffc11642ca [lustre]
    /home/lustre/master-mine/lustre/llite/glimpse.c: 204
#13 [ffff9bd63ffb7da0] ll_getattr_dentry at ffffffffc111c65d [lustre]
    /home/lustre/master-mine/lustre/llite/llite_internal.h: 1677
#14 [ffff9bd63ffb7e50] vfs_statx at ffffffffba1d4be9
    /tmp/kernel/fs/stat.c: 204
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;checking the stack on the process above inode was found at 0xffff9bd60367d350:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
crash&amp;gt; p *(struct ll_inode_info *)(0xffff9bd60367d350-0x150)
  lli_inode_magic = 287116773,
...
  lli_inode_lock_owner = 0xffff9bd68f51d380
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;now check task 0xffff9bd68f51d380:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
crash&amp;gt; p *(struct task_struct *)0xffff9bd68f51d380|more
...
  pid = 348428,
...
PID: 348428  TASK: ffff9bd68f51d380  CPU: 1   COMMAND: &lt;span class=&quot;code-quote&quot;&gt;&quot;lfs&quot;&lt;/span&gt;
 #0 [ffff9bd613c37968] __schedule at ffffffffba5a232d
    /tmp/kernel/kernel/sched/core.c: 3109
 #1 [ffff9bd613c379f0] schedule at ffffffffba5a2748
    /tmp/kernel/./arch/x86/include/asm/preempt.h: 84
 #2 [ffff9bd613c37a00] schedule_preempt_disabled at ffffffffba5a2a6c
    /tmp/kernel/./arch/x86/include/asm/preempt.h: 79
 #3 [ffff9bd613c37a08] __mutex_lock at ffffffffba5a3a40
    /tmp/kernel/kernel/locking/mutex.c: 1038
 #4 [ffff9bd613c37ac8] ll_layout_refresh at ffffffffc111d577 [lustre]
    /home/lustre/master-mine/lustre/llite/llite_internal.h: 1536
 #5 [ffff9bd613c37b88] vvp_io_init at ffffffffc116d019 [lustre]
    /home/lustre/master-mine/lustre/llite/vvp_io.c: 1870
 #6 [ffff9bd613c37bb8] __cl_io_init at ffffffffc045e66f [obdclass]
    /home/lustre/master-mine/lustre/obdclass/cl_io.c: 134
 #7 [ffff9bd613c37bf0] ll_ioc_data_version at ffffffffc110c665 [lustre]
    /home/lustre/master-mine/lustre/llite/file.c: 3193
 #8 [ffff9bd613c37c28] ll_migrate at ffffffffc111b244 [lustre]
    /home/lustre/master-mine/lustre/llite/file.c: 3227
 #9 [ffff9bd613c37ca8] ll_dir_ioctl at ffffffffc1105563 [lustre]
    /home/lustre/master-mine/lustre/llite/dir.c: 2277
#10 [ffff9bd613c37e88] do_vfs_ioctl at ffffffffba1e3199
    /tmp/kernel/fs/ioctl.c: 48
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;it seems this is a locking order issue:&lt;br/&gt;
ll_migrate() takes inode lock, then lli_layout_mutex (in ll_layout_refresh()) while other ops (like getfattr) use the reversed order.&lt;/p&gt;

</description>
                <environment></environment>
        <key id="76968">LU-16958</key>
            <summary>migrate vs regular ops deadlock</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="bobijam">Zhenyu Xu</assignee>
                                    <reporter username="bzzz">Alex Zhuravlev</reporter>
                        <labels>
                    </labels>
                <created>Wed, 12 Jul 2023 13:16:16 +0000</created>
                <updated>Mon, 20 Nov 2023 14:23:10 +0000</updated>
                            <resolved>Sat, 18 Nov 2023 21:54:26 +0000</resolved>
                                                    <fixVersion>Lustre 2.16.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="378423" author="gerrit" created="Wed, 12 Jul 2023 15:27:03 +0000"  >&lt;p&gt;&quot;Zhenyu Xu &amp;lt;bobijam@hotmail.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/51641&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/51641&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16958&quot; title=&quot;migrate vs regular ops deadlock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16958&quot;&gt;&lt;del&gt;LU-16958&lt;/del&gt;&lt;/a&gt; llite: migrate vs regular ops deadlock&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 8de4a374979a37d09a057cbcdfd9914775cfc59b&lt;/p&gt;</comment>
                            <comment id="380842" author="gerrit" created="Tue, 1 Aug 2023 06:15:28 +0000"  >&lt;p&gt;&quot;Oleg Drokin &amp;lt;green@whamcloud.com&amp;gt;&quot; merged in patch &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/51641/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/51641/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16958&quot; title=&quot;migrate vs regular ops deadlock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16958&quot;&gt;&lt;del&gt;LU-16958&lt;/del&gt;&lt;/a&gt; llite: migrate vs regular ops deadlock&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 8f2c1592c3bbd0351ab3984a88a3eed7075690c8&lt;/p&gt;</comment>
                            <comment id="380906" author="pjones" created="Tue, 1 Aug 2023 13:54:10 +0000"  >&lt;p&gt;Landed for 2.16&lt;/p&gt;</comment>
                            <comment id="386666" author="bobijam" created="Thu, 21 Sep 2023 12:35:11 +0000"  >&lt;p&gt;another deadlock found &lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;T1:
vvp_io_init()
  -&amp;gt;ll_layout_refresh() &amp;lt;= take lli_layout_mutex
  -&amp;gt;ll_layout_intent()
  -&amp;gt;ll_take_md_lock()  &amp;lt;= take the CR layout lock ref
  -&amp;gt;ll_layout_conf()
    -&amp;gt;vvp_prune()
    -&amp;gt;vvp_inode_ops() &amp;lt;= release lli_layout_mutex
    -&amp;gt;vvp_inode_ops() &amp;lt;= try to acquire lli_layout_mutex
    -&amp;gt; racer wait here
T2:
-&amp;gt;ll_file_write_iter()
  -&amp;gt;vvp_io_init()
    -&amp;gt;ll_layout_refresh() &amp;lt;= take lli_layout_mutex
    -&amp;gt;ll_layout_intent() &amp;lt;= Request layout from MDT
    -&amp;gt; racer wait ...

T3: occurs in PCC-RO attach; it can happen in the normal case without PCC-RO.
-&amp;gt;pcc_readonly_attach()
  -&amp;gt;ll_layout_intent_write()
  -&amp;gt;ll_intent_lock()
     -&amp;gt; on MDT, it will try to obtain EX layout lock to change layout.
        but the client T1 holds the CR layout lock, and T2&apos;s lock request is in the lock waiting list to wait for T3 to finish, thus causing a deadlock...
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="387602" author="bobijam" created="Thu, 28 Sep 2023 15:18:57 +0000"  >&lt;p&gt;I thought deadlock due to this patch , but I reverted the essential part of this patch at &lt;a href=&quot;https://review.whamcloud.com/52388&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/52388&lt;/a&gt;, and the racer still hang at the server, looks more like &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-15491&quot; title=&quot;rename deadlock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-15491&quot;&gt;LU-15491&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="388809" author="qian_wc" created="Wed, 11 Oct 2023 06:48:21 +0000"  >&lt;p&gt;Found another deadlock for parallel DIO:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
T1: writer
Obtain DLM extent lock: L1=PW[0, EOF]

T2: DIO reader: 50M data, iosize=64M, max_pages_per_rpc=1024 (4M) max_rpcs_in_flight=8
ll_direct_IO_impl()
use all available RPC slots: number of read RPC in flight is 9
on the server side:
-&amp;gt;tgt_brw_read()
-&amp;gt;tgt_brw_lock() # server side locking
-&amp;gt; Try to cancel the conflict locks on client: L1=PW[0, EOF]

T3: reader
take DLM lock ref on L1=PW[0, EOF]
Read-ahead pages (prepare pages);
wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; RPC slots to send the read RPCs to OST

deadlock: T2-&amp;gt;T3: T2 is waiting &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; T3 to release DLM extent lock L1;
          T3-&amp;gt;T2: T3 is waiting &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; T2 finished to free RPC slots...&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;The possible solution is that when found all RPC slots are used by srvlock DIO, and there are urgent I/O, force to send the I/O RPC to OST?&#160;&lt;/p&gt;</comment>
                            <comment id="390899" author="adilger" created="Fri, 27 Oct 2023 21:33:33 +0000"  >&lt;p&gt;Another patch was pushed under this ticket.&lt;/p&gt;</comment>
                            <comment id="390901" author="adilger" created="Fri, 27 Oct 2023 21:35:39 +0000"  >&lt;blockquote&gt;
&lt;p&gt;I thought deadlock due to this patch , but I reverted the essential part of this patch at &lt;a href=&quot;https://review.whamcloud.com/52388&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/52388&lt;/a&gt;, and the racer still hang at the server, looks more like &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-15491&quot; title=&quot;rename deadlock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-15491&quot;&gt;LU-15491&lt;/a&gt;&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;There could definitely be multiple different issues affecting racer testing, so that doesn&apos;t mean the above patch is not fixing a problem.&lt;/p&gt;</comment>
                            <comment id="393491" author="gerrit" created="Sat, 18 Nov 2023 21:40:52 +0000"  >&lt;p&gt;&quot;Oleg Drokin &amp;lt;green@whamcloud.com&amp;gt;&quot; merged in patch &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/52388/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/52388/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16958&quot; title=&quot;migrate vs regular ops deadlock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16958&quot;&gt;&lt;del&gt;LU-16958&lt;/del&gt;&lt;/a&gt; llite: migrate deadlock on not responding lock cancel&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 37646c74bf884c535149d530af840d728814792b&lt;/p&gt;</comment>
                            <comment id="393523" author="pjones" created="Sat, 18 Nov 2023 21:54:26 +0000"  >&lt;p&gt;Landed for 2.16&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="75023">LU-16637</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                                        </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i03qcf:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>