<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:35:19 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-3601] HSM release causes running restore to hang, hangs itself</title>
                <link>https://jira.whamcloud.com/browse/LU-3601</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Running the HSM stack as of July 15 2013, I see a hang when a release is issued while a restore is still running. To reproduce I run the following:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;#!/bin/bash

export MOUNT_2=n
export MDSCOUNT=1
export PTLDEBUG=&quot;super inode ioctl warning dlmtrace error emerg ha rpctrace vfstrace config console&quot;
export DEBUG_SIZE=512

hsm_root=/tmp/hsm_root

rm -rf $hsm_root
mkdir $hsm_root

llmount.sh

lctl conf_param lustre-MDT0000.mdt.hsm_control=enabled
# lctl conf_param lustre-MDT0001.mdt.hsm_control=enabled
sleep 10
lhsmtool_posix --verbose --hsm_root=$hsm_root --bandwidth 1 lustre

lctl dk &amp;gt; ~/hsm-0-mount.dk

set -x
cd /mnt/lustre
lfs setstripe -c2 f0
dd if=/dev/urandom of=f0 bs=1M count=100
lctl dk &amp;gt; ~/hsm-1-dd.dk

lfs hsm_archive f0
sleep 10
echo &amp;gt; /proc/fs/lustre/ldlm/dump_namespaces
lctl dk &amp;gt; ~/hsm-2-archive.dk

lfs hsm_release f0
echo &amp;gt; /proc/fs/lustre/ldlm/dump_namespaces
lctl dk &amp;gt; ~/hsm-3-release.dk

lfs hsm_restore f0
echo &amp;gt; /proc/fs/lustre/ldlm/dump_namespaces
lctl dk &amp;gt; ~/hsm-4-restore.dk

lfs hsm_release f0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;with the last command never returning. The MDS_CLOSE handler looks like&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;10070
[&amp;lt;ffffffffa0f9866e&amp;gt;] cfs_waitq_wait+0xe/0x10 [libcfs]
[&amp;lt;ffffffffa124826a&amp;gt;] ldlm_completion_ast+0x57a/0x960 [ptlrpc]
[&amp;lt;ffffffffa1247920&amp;gt;] ldlm_cli_enqueue_local+0x1f0/0x5c0 [ptlrpc]
[&amp;lt;ffffffffa08cee3b&amp;gt;] mdt_object_lock0+0x33b/0xaf0 [mdt]
[&amp;lt;ffffffffa08cf6b4&amp;gt;] mdt_object_lock+0x14/0x20 [mdt]
[&amp;lt;ffffffffa08f9551&amp;gt;] mdt_mfd_close+0x351/0xde0 [mdt]
[&amp;lt;ffffffffa08fb372&amp;gt;] mdt_close+0x662/0xa60 [mdt]
[&amp;lt;ffffffffa08d2c07&amp;gt;] mdt_handle_common+0x647/0x16d0 [mdt]
[&amp;lt;ffffffffa090c9e5&amp;gt;] mds_readpage_handle+0x15/0x20 [mdt]
[&amp;lt;ffffffffa12813d8&amp;gt;] ptlrpc_server_handle_request+0x398/0xc60 [ptlrpc]
[&amp;lt;ffffffffa128275d&amp;gt;] ptlrpc_main+0xabd/0x1700 [ptlrpc]
[&amp;lt;ffffffff81096936&amp;gt;] kthread+0x96/0xa0
[&amp;lt;ffffffff8100c0ca&amp;gt;] child_rip+0xa/0x20
[&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;while the MDS_HSM_PROGRESS handler looks like:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;10065
[&amp;lt;ffffffffa0f9866e&amp;gt;] cfs_waitq_wait+0xe/0x10 [libcfs]
[&amp;lt;ffffffffa124826a&amp;gt;] ldlm_completion_ast+0x57a/0x960 [ptlrpc]
[&amp;lt;ffffffffa1247920&amp;gt;] ldlm_cli_enqueue_local+0x1f0/0x5c0 [ptlrpc]
[&amp;lt;ffffffffa08cee3b&amp;gt;] mdt_object_lock0+0x33b/0xaf0 [mdt]
[&amp;lt;ffffffffa08cf6b4&amp;gt;] mdt_object_lock+0x14/0x20 [mdt]
[&amp;lt;ffffffffa08cf721&amp;gt;] mdt_object_find_lock+0x61/0x170 [mdt]
[&amp;lt;ffffffffa091dc22&amp;gt;] hsm_get_md_attr+0x62/0x270 [mdt]
[&amp;lt;ffffffffa0923253&amp;gt;] mdt_hsm_update_request_state+0x4d3/0x1c20 [mdt]
[&amp;lt;ffffffffa091ae6e&amp;gt;] mdt_hsm_coordinator_update+0x3e/0xe0 [mdt]
[&amp;lt;ffffffffa090931b&amp;gt;] mdt_hsm_progress+0x21b/0x330 [mdt]
[&amp;lt;ffffffffa08d2c07&amp;gt;] mdt_handle_common+0x647/0x16d0 [mdt]
[&amp;lt;ffffffffa090ca05&amp;gt;] mds_regular_handle+0x15/0x20 [mdt]
[&amp;lt;ffffffffa12813d8&amp;gt;] ptlrpc_server_handle_request+0x398/0xc60 [ptlrpc]
[&amp;lt;ffffffffa128275d&amp;gt;] ptlrpc_main+0xabd/0x1700 [ptlrpc]
[&amp;lt;ffffffff81096936&amp;gt;] kthread+0x96/0xa0
[&amp;lt;ffffffff8100c0ca&amp;gt;] child_rip+0xa/0x20
[&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The close handler is waiting on an EX layout lock on f0. While the &lt;br/&gt;
progress handler is waiting on PW update lock on f0. dump_namespaces does not show that the UPDATE lock is granted.&lt;/p&gt;

&lt;p&gt;For reference I&apos;m using the following changes:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# LU-2919 hsm: Implementation of exclusive open
# http://review.whamcloud.com/#/c/6730
git fetch http://review.whamcloud.com/fs/lustre-release refs/changes/30/6730/13 &amp;amp;&amp;amp; git cherry-pick FETCH_HEAD
 
# LU-1333 hsm: Add hsm_release feature.
# http://review.whamcloud.com/#/c/6526
git fetch http://review.whamcloud.com/fs/lustre-release refs/changes/26/6526/9 &amp;amp;&amp;amp; git cherry-pick FETCH_HEAD
 
# LU-3339 mdt: HSM on disk actions record
# http://review.whamcloud.com/#/c/6529
# MERGED
 
# LU-3340 mdt: HSM memory requests management
# http://review.whamcloud.com/#/c/6530
git fetch http://review.whamcloud.com/fs/lustre-release refs/changes/30/6530/8 &amp;amp;&amp;amp; git cherry-pick FETCH_HEAD
 
# LU-3341 mdt: HSM coordinator client interface
# http://review.whamcloud.com/#/c/6532
git fetch http://review.whamcloud.com/fs/lustre-release refs/changes/32/6532/13 &amp;amp;&amp;amp; git cherry-pick FETCH_HEAD
# Needs rebase in sanity-hsm.sh
 
# LU-3342 mdt: HSM coordinator agent interface
# http://review.whamcloud.com/#/c/6534
git fetch http://review.whamcloud.com/fs/lustre-release refs/changes/34/6534/8 &amp;amp;&amp;amp; git cherry-pick FETCH_HEAD
 
# LU-3343 mdt: HSM coordinator main thread
# http://review.whamcloud.com/#/c/6912
git fetch http://review.whamcloud.com/fs/lustre-release refs/changes/12/6912/3 &amp;amp;&amp;amp; git cherry-pick FETCH_HEAD
# lustre/mdt/mdt_internal.h
 
# LU-3561 tests: HSM sanity test suite
# http://review.whamcloud.com/#/c/6913/
git fetch http://review.whamcloud.com/fs/lustre-release refs/changes/13/6913/4 &amp;amp;&amp;amp; git cherry-pick FETCH_HEAD
# lustre/tests/sanity-hsm.sh
 
# LU-3432 llite: Access to released file trigs a restore
# http://review.whamcloud.com/#/c/6537
git fetch http://review.whamcloud.com/fs/lustre-release refs/changes/37/6537/11 &amp;amp;&amp;amp; git cherry-pick FETCH_HEAD
 
# LU-3363 api: HSM import uses new released pattern
# http://review.whamcloud.com/#/c/6536
git fetch http://review.whamcloud.com/fs/lustre-release refs/changes/36/6536/8 &amp;amp;&amp;amp; git cherry-pick FETCH_HEAD
 
# LU-2062 utils: HSM Posix CopyTool
# http://review.whamcloud.com/#/c/4737
git fetch http://review.whamcloud.com/fs/lustre-release refs/changes/37/4737/18 &amp;amp;&amp;amp; git cherry-pick FETCH_HEAD
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="19860">LU-3601</key>
            <summary>HSM release causes running restore to hang, hangs itself</summary>
                <type id="7" iconUrl="https://jira.whamcloud.com/images/icons/issuetypes/task_agile.png">Technical task</type>
                            <parent id="20020">LU-3647</parent>
                                    <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="jay">Jinshan Xiong</assignee>
                                    <reporter username="jhammond">John Hammond</reporter>
                        <labels>
                            <label>HSM</label>
                    </labels>
                <created>Tue, 16 Jul 2013 22:34:11 +0000</created>
                <updated>Fri, 14 Feb 2014 17:16:22 +0000</updated>
                            <resolved>Fri, 14 Feb 2014 17:16:22 +0000</resolved>
                                    <version>Lustre 2.5.0</version>
                                    <fixVersion>Lustre 2.6.0</fixVersion>
                    <fixVersion>Lustre 2.5.1</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>17</watches>
                                                                            <comments>
                            <comment id="62455" author="jay" created="Wed, 17 Jul 2013 06:00:32 +0000"  >&lt;p&gt;Hi JC,&lt;/p&gt;

&lt;p&gt;Can you please refresh your coordinator patches based on release patches? I tried to make a build but unfortunately met some merge errors.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
Jinshan&lt;/p&gt;</comment>
                            <comment id="62481" author="jcl" created="Wed, 17 Jul 2013 14:18:13 +0000"  >&lt;p&gt;Not sure a coordinator refresh will be enough. I can push our last full branch based on LU-1333-v10. Is it ok for you?&lt;/p&gt;</comment>
                            <comment id="62515" author="jhammond" created="Wed, 17 Jul 2013 21:43:42 +0000"  >&lt;p&gt;I believe that this situation exposes a limitation of LDLM for inodebits locks. All locks below are on f0.&lt;/p&gt;

&lt;ol&gt;
	&lt;li&gt;Starting the restore takes EX LAYOUT lock on the server.&lt;/li&gt;
	&lt;li&gt;When the releasing close RPC is sent the client has a PR LOOKUP|UPDATE|PERM lock.&lt;/li&gt;
	&lt;li&gt;The release handler on the server blocks attempting to take an EX LAYOUT lock.&lt;/li&gt;
	&lt;li&gt;When restore complete, the update progress handler blocks attempting to take an PW UPDATE lock.&lt;/li&gt;
	&lt;li&gt;The client releases the PR LOOKUP|UPDATE|PERM lock.&lt;/li&gt;
	&lt;li&gt;The resource (f0) gets reprocessed, but the first waiting lock (EX LAYOUT) cannot be granted, so ldlm_process_inodebits_lock() returns LDLM_ITER_STOP causing ldlm_reprocess_queue() to stop processing the resource. In particular it does not check that the PW UPDATE lock is compatible with all of the granted locks and all of the locks before it in the waiting list.&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;It also appears that the skip list optimizations in ldlm_inodebits_compat_queue() could be extended/improved by computing compatibility one mode-bits-bunch at a time and by granting locks in bunches.&lt;/p&gt;</comment>
                            <comment id="63049" author="jhammond" created="Fri, 26 Jul 2013 16:01:20 +0000"  >&lt;p&gt;Here is a simpler situation where we can get stuck. (It is also more likely to occur.) Consider the following release vs open race. Assume the file F has already been archived.&lt;/p&gt;

&lt;ol&gt;
	&lt;li&gt;Client R starts HSM release on file F.&lt;/li&gt;
	&lt;li&gt;In lfs_hsm_request, R stats F, the MDT returns a PR LOOKUP,UPDATE,LAYOUT,PERM lock on F.&lt;/li&gt;
	&lt;li&gt;In lfs_hsm_request, R opens F for path2fid, the MDT returns a CR LOOKUP,LAYOUT lock on F.&lt;/li&gt;
	&lt;li&gt;In ll_hsm_release/ll_lease_open, R leases F, the MDT returns an EX OPEN lock on F.&lt;/li&gt;
	&lt;li&gt;Client W tries to open F with MDS_OPEN_LOCK set, the MDT adds a CW OPEN lock to the waiting list.&lt;/li&gt;
	&lt;li&gt;In ll_hsm_release, client R closes F.&lt;/li&gt;
	&lt;li&gt;In mdt_hsm_release, the MDT requests a local EX LAYOUT on F. This conflicts with the PR and CR locks already held by R, the server sends blocking ASTs to R for these locks.&lt;/li&gt;
	&lt;li&gt;The MDT reprocesses the waiting queue for F. Granted list contains the EX OPEN lock. The waiting list contains the CW OPEN, followed by the EX LAYOUT.&lt;/li&gt;
	&lt;li&gt;As responses to the blocking ASTs come in the F is reprocessed but since there is a blocked CW OPEN lock at the head of the waiting list, the following locks (including the EX LAYOUT) are not considered.&lt;/li&gt;
	&lt;li&gt;The EX OPEN lock times out and client R is evicted.&lt;/li&gt;
&lt;/ol&gt;
</comment>
                            <comment id="63067" author="jhammond" created="Fri, 26 Jul 2013 19:56:55 +0000"  >&lt;p&gt;Another issue here is that it may be unsafe to access the mount point being used by the copytool. Especially to perform manual HSM requests, since the MDC&apos;s cl_close_lock will prevent multiple concurrent closes. In particular we can have a releasing close block (on EX LAYOUT) because a restore is running, which prevents the restore from being completed, because any close will block on cl_close_lock.&lt;/p&gt;</comment>
                            <comment id="63082" author="jay" created="Fri, 26 Jul 2013 21:31:23 +0000"  >&lt;p&gt;I will fix the lock issue above.&lt;/p&gt;

&lt;p&gt;The close sounds like a real issue here, we shouldn&apos;t block close REQ to finish. Let&apos;s use try version of mdt_object_lock() in close.&lt;/p&gt;</comment>
                            <comment id="63092" author="jhammond" created="Sat, 27 Jul 2013 14:56:22 +0000"  >&lt;p&gt;Please see &lt;a href=&quot;http://review.whamcloud.com/7148&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/7148&lt;/a&gt; for the LDLM patch we discussed.&lt;/p&gt;</comment>
                            <comment id="63319" author="jhammond" created="Tue, 30 Jul 2013 22:35:54 +0000"  >&lt;p&gt;A similar hang can be triggered by trying to read a file while a restore is still running. To see this add --bandwidth=1 to the copytool options and do:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# cd /mnt/lustre
# dd if=/dev/urandom of=f0 bs=1M count=10
# lfs hsm_archive f0
# # Wait for archive to complete.
# sleep 15
# lfs hsm_release f0
# lfs hsm_restore f0
# cat f0 &amp;gt; /dev/null
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;This is addressed by the &lt;a href=&quot;http://review.whamcloud.com/#/c/7148/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/7148/&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;However even with the latest version (patch set 9) of &lt;a href=&quot;http://review.whamcloud.com/#/c/6912/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/6912/&lt;/a&gt; we have an easily exploited race between restore and rename which is not addressed by the change in 7148. Rename onto during restore will hang:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;cd /mnt/lustre
dd if=/dev/urandom of=f0 bs=1M count=10
lfs hsm_archive f0
# Wait for archive to complete.
sleep 15
lfs hsm_state f0
lfs hsm_release f0
lfs hsm_restore f0; touch f1; sys_rename f1 f0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Since this rename takes MDS_INODELOCK_FULL on f0, I doubt that the choice of using LAYOUT, UPDATE, or other in hsm_get_md_attr() matters very much. But I could be wrong.&lt;/p&gt;</comment>
                            <comment id="63479" author="jhammond" created="Thu, 1 Aug 2013 16:10:56 +0000"  >&lt;p&gt;Since the removal of UPDATE lock use from the coordinator, I can no longer reproduce these issues.&lt;/p&gt;</comment>
                            <comment id="63606" author="jcl" created="Fri, 2 Aug 2013 22:16:34 +0000"  >&lt;p&gt;We will add sanity-hsm tests for the 2 simple use cases. Will be safer for futures changes.&lt;/p&gt;</comment>
                            <comment id="63611" author="adegremont" created="Sat, 3 Aug 2013 09:00:20 +0000"  >&lt;p&gt;We already have such test. sanity-hsm #33 deadlock was hitting this bug. John&apos;s patch was fixing it. I will confirm that the latest coordinator, without John&apos;s patch, does not trigger this deadlock anymore on Monday, but I&apos;m confident.&lt;/p&gt;</comment>
                            <comment id="63613" author="jcl" created="Sun, 4 Aug 2013 10:08:18 +0000"  >&lt;p&gt;sanity-hsm #33 hits the same bug, but was not designed to test concurrent access to file during the restore phase. We also today do not test rename/rm during restore.&lt;/p&gt;</comment>
                            <comment id="69444" author="jlevi" created="Mon, 21 Oct 2013 20:15:31 +0000"  >&lt;p&gt;Should Change, 7148 be landed or abandoned?&lt;/p&gt;</comment>
                            <comment id="69455" author="jhammond" created="Mon, 21 Oct 2013 21:25:21 +0000"  >&lt;p&gt;Landed after being improved per comments on gerrit.&lt;/p&gt;</comment>
                            <comment id="69456" author="jhammond" created="Mon, 21 Oct 2013 21:27:05 +0000"  >&lt;p&gt;This issue was fixed for 2.5.0 and can be closed now.&lt;/p&gt;</comment>
                            <comment id="70023" author="adilger" created="Mon, 28 Oct 2013 16:28:16 +0000"  >&lt;p&gt;Andriy wrote in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4152&quot; title=&quot; layout locks can cause deadlock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4152&quot;&gt;&lt;del&gt;LU-4152&lt;/del&gt;&lt;/a&gt;:&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1876&quot; title=&quot;Layout Lock Server Patch Landings to Master&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1876&quot;&gt;&lt;del&gt;LU-1876&lt;/del&gt;&lt;/a&gt; adds mdt_object_open_lock() which acquires lock in 2 steps for layout locks.&lt;br/&gt;
A deadlock is possible since it isn&apos;t atomic and ibits locks are reprocessed until first blocking lock found.&lt;/p&gt;

&lt;p&gt;Such situation was hit with mdt_reint_open() &amp;amp; mdt_intent_getattr()&lt;/p&gt;

&lt;p&gt;mdt_reint_open()-&amp;gt;mdt_open_by_fid_lock() takes first part of the lock (ibits=5),&lt;br/&gt;
mdt_intent_getattr() tries to obtain lock (ibits=17)&lt;br/&gt;
mdt_open_by_fid_lock() tries to obtain second part but fails due to some conflict with another layout lock2. During cancellation of lock2 only getattr lock is reprocessed.&lt;br/&gt;
&lt;a href=&quot;http://review.whamcloud.com/#/c/7148/1&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/7148/1&lt;/a&gt; can help, but it is better to fix mdt_open_by_fid_lock()&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;Andriy, was this problem actually hit during testing, or was this problem found by code inspection?&lt;/p&gt;</comment>
                            <comment id="70047" author="vitaly_fertman" created="Mon, 28 Oct 2013 18:00:07 +0000"  >&lt;p&gt;Andreas, it was hit during testing.&lt;/p&gt;

&lt;p&gt;process1.lock1: open|lookup, granted&lt;br/&gt;
process2.lock1: layout | XXX, granted&lt;br/&gt;
process3.lock1: lookup | XXX, waiting process1.lock1&lt;br/&gt;
process1.lock2: layout, waiting process2.lock1&lt;br/&gt;
process2.lock1: cancelled, reprocessing does not reach process1.lock2&lt;/p&gt;

&lt;p&gt;process1 is open by fid&lt;br/&gt;
process3 is getattr&lt;/p&gt;

&lt;p&gt;in other words, as 2 locks are taken not atomically, you must guarantee nobody can take a conflict for 1st lock in between. otherwise you need either:&lt;/p&gt;
&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;full reprocess&lt;/li&gt;
	&lt;li&gt;reordering on waiting list&lt;/li&gt;
	&lt;li&gt;make these 2 enqueue atomic&lt;/li&gt;
	&lt;li&gt;take 1 common lock with all the ibits&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;btw, why the last option was not done originally ?&lt;/p&gt;

&lt;p&gt;as it can deadlock without HSM, I would consider it as a blocker.&lt;/p&gt;</comment>
                            <comment id="70069" author="jay" created="Mon, 28 Oct 2013 19:21:43 +0000"  >&lt;p&gt;Indeed, this is a live lock case.&lt;/p&gt;

&lt;p&gt;To clarify, the process1 must be writing an empty file without layout, so writing will cause new layout to be created.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;btw, why the last option was not done originally ?&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;The reason for me to not acquire one common lock is that we have to acquire EX mode for layout lock which will be too strong for lookup and open lock since they have to share the same DLM lock.&lt;/p&gt;

&lt;p&gt;Though patch 7148 can fix this problem, acquiring 2 locks in a row is generally bad. Therefore, I&apos;ll fix by acquiring one lock with EX mode for the above case, however, this lock won&apos;t be returned to client side. As a result, the process will not cache this specific open. This is good as it will happen rarely.&lt;/p&gt;

&lt;p&gt;How do you guys think?&lt;/p&gt;</comment>
                            <comment id="70073" author="paf" created="Mon, 28 Oct 2013 19:52:22 +0000"  >&lt;p&gt;Jinshan - This was originally a Cray bug (thank you Andriy and Vitaly for bringing this up), which I&apos;ve been tracking.&lt;/p&gt;

&lt;p&gt;I think eliminating the case where 2 locks are taken non-atomically is key long term.  If you&apos;re planning to do that, then that sounds good.  &lt;br/&gt;
If you&apos;re planning to only do it in certain cases, are you completely sure we don&apos;t have another possible live lock?&lt;/p&gt;

&lt;p&gt;I&apos;d back Vitaly&apos;s suggestion that it be a blocker.  We&apos;re able to trigger it during testing of NFS export, presumably because of the open_by_fid operations caused by NFS export.&lt;/p&gt;</comment>
                            <comment id="70083" author="jay" created="Mon, 28 Oct 2013 21:38:28 +0000"  >&lt;p&gt;just an update - Oleg is creating a patch for this issue.&lt;/p&gt;</comment>
                            <comment id="70104" author="jhammond" created="Tue, 29 Oct 2013 12:52:35 +0000"  >&lt;p&gt;Links to Oleg&apos;s patches (which all reference this issue) may be found in the comments on &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4152&quot; title=&quot; layout locks can cause deadlock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4152&quot;&gt;&lt;del&gt;LU-4152&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="70141" author="green" created="Tue, 29 Oct 2013 17:49:44 +0000"  >&lt;p&gt;Patrick: what&apos;s your exact reproducer to hit this? We are so far unable to hit it ourselves&lt;/p&gt;</comment>
                            <comment id="70150" author="paf" created="Tue, 29 Oct 2013 18:14:16 +0000"  >&lt;p&gt;Oleg - We hit it while testing NFS exported Lustre during a large-ish test run, with tests drawn primarily from the Linux Test Project.  The problem is we don&apos;t always hit it with the same test.&lt;/p&gt;

&lt;p&gt;The test engineer who&apos;s been handling it thinks a way to hit it is concurrent runs of fsx-linux with different command line options.  Those are being run against an NFS export of Lustre.&lt;br/&gt;
He&apos;s going to try to pin that down this afternoon, I&apos;ll update if he&apos;s able to be more specific.&lt;/p&gt;</comment>
                            <comment id="70189" author="paf" created="Tue, 29 Oct 2013 21:28:49 +0000"  >&lt;p&gt;Moving conversation about patches to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4152&quot; title=&quot; layout locks can cause deadlock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4152&quot;&gt;&lt;del&gt;LU-4152&lt;/del&gt;&lt;/a&gt;; latest is there.&lt;/p&gt;</comment>
                            <comment id="71431" author="jay" created="Wed, 13 Nov 2013 16:23:49 +0000"  >&lt;p&gt;this is fixed in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4152&quot; title=&quot; layout locks can cause deadlock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4152&quot;&gt;&lt;del&gt;LU-4152&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="76660" author="adilger" created="Mon, 10 Feb 2014 21:40:23 +0000"  >&lt;p&gt;Patch &lt;a href=&quot;http://review.whamcloud.com/8084&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/8084&lt;/a&gt; was landed under this bug, but is not reported here.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                            <outwardlinks description="duplicates">
                                        <issuelink>
            <issuekey id="21658">LU-4152</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="21658">LU-4152</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="19879">LU-3608</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvvjj:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9136</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                </customfields>
    </item>
</channel>
</rss>