<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:41:29 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-4300] ptlrpcd threads deadlocked in cl_lock_mutex_get</title>
                <link>https://jira.whamcloud.com/browse/LU-4300</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;TraveltimeLaunc thread held cl_lock and cause call of cl_lock_cancel. Lock cancel causes rpc OST_WRITE. But OST_WRITE can&apos;t be sent because all ptlrpcd are waiting for cl_lock.&lt;/p&gt;

&lt;p&gt;So, OST_WRITE rpc sticks at new requests queue and can&apos;t be sent because ptlrpcd_19 is blocked by osc lock up call.&lt;/p&gt;

&lt;p&gt;dump from ptlrpcd threads - each thread has a recovery set, and two &quot;production&quot; new and processed.&lt;br/&gt;
each rpc is dumped in format&lt;br/&gt;
$rpc_ptr : $opc : $rq_status&lt;/p&gt;

&lt;p&gt;thread 19 have queue&lt;br/&gt;
print tread 19 - 0xffff881fe70fae70&lt;br/&gt;
new requests&lt;br/&gt;
rpc 0xffff880faf27e000 : 101 - 0&lt;br/&gt;
rpc 0xffff881feff5d800 : 4 - 0&lt;br/&gt;
request in processing&lt;/p&gt;

&lt;p&gt;All 32 ptlrpcd threads are waiting with this stack trace:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;&amp;gt; 2013-10-15T15:14:18.766544-05:00 c0-0c0s5n3 ptlrpcd_0       D 0000000000000000     0  4183      2 0x00000000
&amp;gt; 2013-10-15T15:14:18.791825-05:00 c0-0c0s5n3 ffff881fe7085aa0 0000000000000046 ffff881fe7084010 ffff881fea497040
&amp;gt; 2013-10-15T15:14:18.792017-05:00 c0-0c0s5n3 0000000000010b00 ffff881fe7085fd8 ffff881fe7085fd8 0000000000010b00
&amp;gt; 2013-10-15T15:14:18.817016-05:00 c0-0c0s5n3 0000000000000013 ffff881fea497040 0000000000000000 ffff881fe9800040
&amp;gt; 2013-10-15T15:14:18.817082-05:00 c0-0c0s5n3 Call Trace:
&amp;gt; 2013-10-15T15:14:18.817244-05:00 c0-0c0s5n3 [&amp;lt;ffffffff8138311f&amp;gt;] schedule+0x3f/0x60
&amp;gt; 2013-10-15T15:14:18.817388-05:00 c0-0c0s5n3 [&amp;lt;ffffffff81383f92&amp;gt;] __mutex_lock_slowpath+0x102/0x180
&amp;gt; 2013-10-15T15:14:18.842380-05:00 c0-0c0s5n3 [&amp;lt;ffffffff81383993&amp;gt;] mutex_lock+0x23/0x40
&amp;gt; 2013-10-15T15:14:18.842582-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa0375806&amp;gt;] cl_lock_mutex_get+0x76/0xd0 [obdclass]
&amp;gt; 2013-10-15T15:14:18.867656-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa07bbc08&amp;gt;] lovsub_parent_lock+0x48/0x120 [lov]
&amp;gt; 2013-10-15T15:14:18.867709-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa07bc368&amp;gt;] lovsub_lock_state+0x68/0x1a0 [lov]
&amp;gt; 2013-10-15T15:14:18.867880-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa0373558&amp;gt;] cl_lock_state_signal+0x68/0x160 [obdclass]
&amp;gt; 2013-10-15T15:14:18.893131-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa0373e7a&amp;gt;] cl_lock_signal+0x5a/0x130 [obdclass]
&amp;gt; 2013-10-15T15:14:18.893285-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa072279d&amp;gt;] osc_lock_upcall+0x25d/0x610 [osc]
&amp;gt; 2013-10-15T15:14:18.893419-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa07041bd&amp;gt;] osc_enqueue_fini+0x9d/0x240 [osc]
&amp;gt; 2013-10-15T15:14:18.918520-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa0706eb7&amp;gt;] osc_enqueue_interpret+0xe7/0x1d0 [osc]
&amp;gt; 2013-10-15T15:14:18.918702-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa049733c&amp;gt;] ptlrpc_check_set+0x52c/0x1e20 [ptlrpc]
&amp;gt; 2013-10-15T15:14:18.943647-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa04c4acb&amp;gt;] ptlrpcd_check+0x53b/0x560 [ptlrpc]
&amp;gt; 2013-10-15T15:14:18.943885-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa04c504b&amp;gt;] ptlrpcd+0x29b/0x3b0 [ptlrpc]
&amp;gt; 2013-10-15T15:14:18.943979-05:00 c0-0c0s5n3 [&amp;lt;ffffffff8138de94&amp;gt;] kernel_thread_helper+0x4/0x10
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The following application process is waiting for an RPC to be sent:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;&amp;gt; 2013-10-15T15:14:31.200904-05:00 c0-0c0s5n3 TraveltimeLaunc S 00000001050e94cc     0 16793  16777 0x00000000
&amp;gt; 2013-10-15T15:14:31.200927-05:00 c0-0c0s5n3 ffff880fe8979308 0000000000000082 ffff880fe8978010 ffff880faca8e040
&amp;gt; 2013-10-15T15:14:31.200967-05:00 c0-0c0s5n3 0000000000010b00 ffff880fe8979fd8 ffff880fe8979fd8 0000000000010b00
&amp;gt; 2013-10-15T15:14:31.226179-05:00 c0-0c0s5n3 0000000000000000 ffff880faca8e040 0000000000000000 ffffffff81667020
&amp;gt; 2013-10-15T15:14:31.226195-05:00 c0-0c0s5n3 Call Trace:
&amp;gt; 2013-10-15T15:14:31.226219-05:00 c0-0c0s5n3 [&amp;lt;ffffffff8138311f&amp;gt;] schedule+0x3f/0x60
&amp;gt; 2013-10-15T15:14:31.251476-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa021d74e&amp;gt;] cfs_waitq_wait+0xe/0x10 [libcfs]
&amp;gt; 2013-10-15T15:14:31.251533-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa0731e56&amp;gt;] osc_extent_wait+0x576/0x630 [osc]
&amp;gt; 2013-10-15T15:14:31.251563-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa0732447&amp;gt;] osc_cache_wait_range+0x537/0x820 [osc]
&amp;gt; 2013-10-15T15:14:31.276843-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa0733199&amp;gt;] osc_cache_writeback_range+0xa69/0x1030 [osc]
&amp;gt; 2013-10-15T15:14:31.276897-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa071febe&amp;gt;] osc_lock_flush+0x7e/0x260 [osc]
&amp;gt; 2013-10-15T15:14:31.302146-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa0720181&amp;gt;] osc_lock_cancel+0xe1/0x1c0 [osc]
&amp;gt; 2013-10-15T15:14:31.302187-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa03730d5&amp;gt;] cl_lock_cancel0+0x75/0x160 [obdclass]
&amp;gt; 2013-10-15T15:14:31.302229-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa0373e1b&amp;gt;] cl_lock_cancel+0x13b/0x140 [obdclass]
&amp;gt; 2013-10-15T15:14:31.327498-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa072162c&amp;gt;] osc_ldlm_blocking_ast+0x20c/0x330 [osc]
&amp;gt; 2013-10-15T15:14:31.327566-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa046baab&amp;gt;] ldlm_cancel_callback+0x6b/0x190 [ptlrpc]
&amp;gt; 2013-10-15T15:14:31.352863-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa0479d1a&amp;gt;] ldlm_cli_cancel_local+0x8a/0x470 [ptlrpc]
&amp;gt; 2013-10-15T15:14:31.352917-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa047cfec&amp;gt;] ldlm_cli_cancel_list_local+0xec/0x290 [ptlrpc]
&amp;gt; 2013-10-15T15:14:31.378183-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa047dee5&amp;gt;] ldlm_cancel_lru_local+0x35/0x40 [ptlrpc]
&amp;gt; 2013-10-15T15:14:31.378216-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa047f36f&amp;gt;] ldlm_prep_elc_req+0x3df/0x490 [ptlrpc]
&amp;gt; 2013-10-15T15:14:31.403528-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa047f448&amp;gt;] ldlm_prep_enqueue_req+0x28/0x30 [ptlrpc]
&amp;gt; 2013-10-15T15:14:31.403581-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa0706963&amp;gt;] osc_enqueue_base+0x103/0x570 [osc]
&amp;gt; 2013-10-15T15:14:31.403617-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa0720e36&amp;gt;] osc_lock_enqueue+0x506/0x900 [osc]
&amp;gt; 2013-10-15T15:14:31.403617-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa0720e36&amp;gt;] osc_lock_enqueue+0x506/0x900 [osc]
&amp;gt; 2013-10-15T15:14:31.428880-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa03777bb&amp;gt;] cl_enqueue_try+0xfb/0x320 [obdclass]
&amp;gt; 2013-10-15T15:14:31.428980-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa07b4a54&amp;gt;] lov_lock_enqueue+0x1f4/0x890 [lov]
&amp;gt; 2013-10-15T15:14:31.429041-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa03777bb&amp;gt;] cl_enqueue_try+0xfb/0x320 [obdclass]
&amp;gt; 2013-10-15T15:14:31.454182-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa037869f&amp;gt;] cl_enqueue_locked+0x7f/0x1f0 [obdclass]
&amp;gt; 2013-10-15T15:14:31.454296-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa03792ce&amp;gt;] cl_lock_request+0x7e/0x270 [obdclass]
&amp;gt; 2013-10-15T15:14:31.454354-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa0883e9f&amp;gt;] cl_glimpse_lock+0x17f/0x490 [lustre]
&amp;gt; 2013-10-15T15:14:31.479540-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa088434d&amp;gt;] cl_glimpse_size0+0x19d/0x1c0 [lustre]
&amp;gt; 2013-10-15T15:14:31.504845-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa0837650&amp;gt;] ll_inode_revalidate_it+0x1b0/0x1d0 [lustre]
&amp;gt; 2013-10-15T15:14:31.504912-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa08376b6&amp;gt;] ll_getattr_it+0x46/0x180 [lustre]
&amp;gt; 2013-10-15T15:14:31.504990-05:00 c0-0c0s5n3 [&amp;lt;ffffffffa083782c&amp;gt;] ll_getattr+0x3c/0x40 [lustre]
&amp;gt; 2013-10-15T15:14:31.505021-05:00 c0-0c0s5n3 [&amp;lt;ffffffff81147355&amp;gt;] vfs_getattr+0x25/0x50
&amp;gt; 2013-10-15T15:14:31.530127-05:00 c0-0c0s5n3 [&amp;lt;ffffffff81147920&amp;gt;] vfs_fstatat+0x80/0x90
&amp;gt; 2013-10-15T15:14:31.530209-05:00 c0-0c0s5n3 [&amp;lt;ffffffff81147a5b&amp;gt;] vfs_stat+0x1b/0x20
&amp;gt; 2013-10-15T15:14:31.530264-05:00 c0-0c0s5n3 [&amp;lt;ffffffff81147a84&amp;gt;] sys_newstat+0x24/0x50
&amp;gt; 2013-10-15T15:14:31.530321-05:00 c0-0c0s5n3 [&amp;lt;ffffffff8138cdab&amp;gt;] system_call_fastpath+0x16/0x1b
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="22219">LU-4300</key>
            <summary>ptlrpcd threads deadlocked in cl_lock_mutex_get</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="jay">Jinshan Xiong</assignee>
                                    <reporter username="scherementsev">Sergey Cheremencev</reporter>
                        <labels>
                            <label>MB</label>
                    </labels>
                <created>Fri, 22 Nov 2013 18:38:12 +0000</created>
                <updated>Fri, 6 Sep 2019 22:44:08 +0000</updated>
                            <resolved>Fri, 7 Mar 2014 19:11:39 +0000</resolved>
                                    <version>Lustre 2.5.0</version>
                                    <fixVersion>Lustre 2.6.0</fixVersion>
                    <fixVersion>Lustre 2.5.2</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>31</watches>
                                                                            <comments>
                            <comment id="72162" author="shadow" created="Fri, 22 Nov 2013 19:16:46 +0000"  >&lt;p&gt;problem is simple.&lt;/p&gt;

&lt;p&gt;GLIMPSE lock is in NEW state, to switch into ENQUEUE state it need to send requests for each stripe assigned to the file.. fine.&lt;br/&gt;
but when we send any lock enqueue request we are cancel unused + aged locks on same namespace, so we need to cancel an extent lock with dirty data. Before 2.4 (or better say) ptlrpcd &quot;optimisation&quot; lock enqueue send via normal thread but lock cancel send an BRW request via special brw thread and we have none dependencies between these requests, so brw finished and processed in own thread.. but now..&lt;br/&gt;
glimpse reply tried to take an top level cl_lock, but it&apos;s blocked by wanting an brw reply.. OK.. same as before.. but one difference.&lt;br/&gt;
brw request now in set_new_requests queue in same thread as tried to take top level mutex... and don&apos;t have ability to the send request over network.. so.. deadlocked.&lt;/p&gt;

&lt;p&gt;Xyratex have a several ideas how it&apos;s need fixed.&lt;br/&gt;
1) simplest - restore an _brw threads. so that situation was resolved as designed in CLIO. but it&apos;s will be block an ptlrpcd threads probably for a long time and may bad in case one of OST&apos;s in down.&lt;br/&gt;
2) change a ELC/LRU policy to avoid cancelling an extent locks while taking an glimpse lock, or some tuning with ability to cancel locks only if they don&apos;t have dirty data and may be canceled locally without sending RPC&apos;s.&lt;br/&gt;
3) change CLIO to be closed to the older LOV design.. prepare a full set of requests before sending any request over wire. in that case,&lt;br/&gt;
glimpse replies will be never received while we hold a top level lock and flush a dirty data will be separated from and sending glimpse reply. so deadlock isn&apos;t possible.&lt;/p&gt;

&lt;p&gt;We want to discuss with Intel before we will be prepare a patch for that issue. &lt;/p&gt;


</comment>
                            <comment id="72430" author="adilger" created="Wed, 27 Nov 2013 19:57:12 +0000"  >&lt;p&gt;I think Jinshan and Bobijam are already working on simplifying cl_lock as part of the CLIO cleanup project.  Have added them to the CC list of this bug to discuss possible solutions.&lt;/p&gt;</comment>
                            <comment id="72467" author="shadow" created="Thu, 28 Nov 2013 07:57:11 +0000"  >&lt;p&gt;from my point view - we need implement an something like to 2+3.&lt;br/&gt;
it&apos;s don&apos;t need a large changes in cl_lock design - but need divide a any CLIO operation into two parts - preparing and executing.&lt;br/&gt;
it&apos;s will be close to the transaction on server side where we have declare and execute phases also.&lt;/p&gt;

&lt;p&gt;as about 2 - it&apos;s changes just to avoid an any recursive dependence in preparing operation. Other solution in that area will be separate lock namespace completely. FLOCK&apos;s from IBITS, Extents from Glimpse ... &lt;/p&gt;

&lt;p&gt;I think we need invite an Alex and Tappro to that discussion but i don&apos;t able to add new watches to that ticket.&lt;/p&gt;</comment>
                            <comment id="72646" author="jay" created="Mon, 2 Dec 2013 20:39:36 +0000"  >&lt;p&gt;It seems that the 2nd option is a good bet. The glimpse requests should be sent out asap so it makes no sense to write some data back before doing that.&lt;/p&gt;

&lt;p&gt;A rule of thumb is never to block ptlrpc threads and we have taken this into account in the cl_lock rebuild project.&lt;/p&gt;</comment>
                            <comment id="72678" author="shadow" created="Tue, 3 Dec 2013 05:45:01 +0000"  >&lt;p&gt;Jay,&lt;/p&gt;

&lt;p&gt;what about separate a declare and execute phases for a any CLIO (and may be MD) operations ?&lt;/p&gt;</comment>
                            <comment id="72852" author="vitaly_fertman" created="Wed, 4 Dec 2013 21:08:52 +0000"  >&lt;p&gt;What occurred with glimpse rpc may happen with regular enqueue rpc, so let&apos;s separate these problems:&lt;br/&gt;
1 - the current issue&lt;br/&gt;
2 - glimpse vs. lru_resize optimisation.&lt;/p&gt;

&lt;p&gt;regarding 2 - we may want to drop lru cancellation on enqueue rpc or only on glimpse enqueue rpc or even leave it as is. the original idea was that shrinker needs to cancel aged locks from lru and instead of creating new rpc from time to time, we may distribute this job among existing enqueue rpc.&lt;/p&gt;</comment>
                            <comment id="72892" author="shadow" created="Thu, 5 Dec 2013 12:19:01 +0000"  >&lt;p&gt;Vitaly,&lt;/p&gt;

&lt;p&gt;i think we need to align LRU policy to avoid cancelling a locks with dirty data. It&apos;s will solve both issues - just make glimpse faster and avoid any waiting in lock enqueue. what you think about ?&lt;/p&gt;</comment>
                            <comment id="72909" author="jay" created="Thu, 5 Dec 2013 18:15:25 +0000"  >&lt;p&gt;That sounds perfect in my opinion, or we can only cancel PR locks. Please take a look at osc_ldlm_weigh_ast() which can be revised to figure out if there is any dirty pages covered by a specific lock.&lt;/p&gt;</comment>
                            <comment id="73215" author="cheng_shao" created="Tue, 10 Dec 2013 18:28:01 +0000"  >&lt;p&gt;Hi Jinshan, just to follow up with you - will you be working on it, or it is ok for Xyratex to provide a patch for you to review? Thanks.&lt;/p&gt;</comment>
                            <comment id="73218" author="jay" created="Tue, 10 Dec 2013 19:08:00 +0000"  >&lt;p&gt;I&apos;m overwhelmed so I will appreciate if you guys can create a patch for this.&lt;/p&gt;</comment>
                            <comment id="73285" author="cheng_shao" created="Wed, 11 Dec 2013 15:47:51 +0000"  >&lt;p&gt;OK, thanks Jinshan.&lt;/p&gt;</comment>
                            <comment id="74766" author="shadow" created="Sat, 11 Jan 2014 12:35:29 +0000"  >&lt;p&gt;That ticket affect 2.6 also.&lt;/p&gt;</comment>
                            <comment id="75871" author="jay" created="Wed, 29 Jan 2014 17:43:42 +0000"  >&lt;p&gt;Hi, do you guys have a patch for this issue yet?&lt;/p&gt;</comment>
                            <comment id="76179" author="simmonsja" created="Tue, 4 Feb 2014 15:21:24 +0000"  >&lt;p&gt;This problem is also impacting ORNL. Does a reproducer exist?&lt;/p&gt;</comment>
                            <comment id="76184" author="simmonsja" created="Tue, 4 Feb 2014 16:53:57 +0000"  >&lt;p&gt;This is critical bug for us. Alexey if you have a patch can you push it so it can be tested and inspected.&lt;/p&gt;</comment>
                            <comment id="76185" author="paf" created="Tue, 4 Feb 2014 17:02:42 +0000"  >&lt;p&gt;James, Jinshan,&lt;/p&gt;

&lt;p&gt;Cray&apos;s been working with Xyratex on this.  Unfortunately, they haven&apos;t been able to come up with a patch for this issue yet.  If someone from Intel is able to look at it, we can ask them to speak to you.&lt;/p&gt;</comment>
                            <comment id="76190" author="shadow" created="Tue, 4 Feb 2014 17:34:17 +0000"  >&lt;p&gt;Ann think simple fix should be enough as workaround, until we will have complex fix. I will push a patch in next two days.&lt;/p&gt;</comment>
                            <comment id="76228" author="ihara" created="Tue, 4 Feb 2014 22:27:01 +0000"  >&lt;p&gt;James, agreed with you. This is critical and we also hit random client crashes due to this issue. As a quick workaround, &quot;echo 0 &amp;gt; /proc/fs/lustre/ldlm/namespaces/*/early_lock_cancel&quot; on clients helped to avoid this issue in our case. Hope this helps in your case too.&lt;/p&gt;</comment>
                            <comment id="76333" author="bfaccini" created="Thu, 6 Feb 2014 10:37:45 +0000"  >&lt;p&gt;Yes, it would be interesting to know if disabling ELC, the work-around we found for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4552&quot; title=&quot;osc_cache.c:899:osc_extent_wait() timeout quite often&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4552&quot;&gt;&lt;del&gt;LU-4552&lt;/del&gt;&lt;/a&gt;, also works there. This is highly possible, since Lustre version, stack-traces, time-out situation are very similar, so, since the very beginning, reading both tickets I had quickly the strong assumption that they could be  duplicates.&lt;/p&gt;</comment>
                            <comment id="76334" author="shadow" created="Thu, 6 Feb 2014 11:59:41 +0000"  >&lt;p&gt;please try attached fix and say how it help.&lt;/p&gt;</comment>
                            <comment id="76343" author="simmonsja" created="Thu, 6 Feb 2014 14:33:33 +0000"  >&lt;p&gt;Pushed the patch to gerrit so it can run through maloo.&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/#/c/9156&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/9156&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="76418" author="simmonsja" created="Thu, 6 Feb 2014 23:52:00 +0000"  >&lt;p&gt;We also disabled ELC and our problems did stop.&lt;/p&gt;</comment>
                            <comment id="76419" author="jay" created="Fri, 7 Feb 2014 00:02:36 +0000"  >&lt;p&gt;That patch is a good workaround but it will affect performance. I&apos;m going to work out another solution to not do early cancel for glimpse lock ENQ.&lt;/p&gt;</comment>
                            <comment id="76443" author="jay" created="Fri, 7 Feb 2014 07:07:44 +0000"  >&lt;p&gt;I pushed a new patch at: &lt;a href=&quot;http://review.whamcloud.com/9175&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/9175&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;I don&apos;t run any tests so please run sanity test and then see if it can fix the problem as well.&lt;/p&gt;

&lt;p&gt;I will run test tomorrow.&lt;/p&gt;</comment>
                            <comment id="76472" author="simmonsja" created="Fri, 7 Feb 2014 17:03:23 +0000"  >&lt;p&gt;I&apos;m seeing jobs failing to run with patch 9156.&lt;/p&gt;</comment>
                            <comment id="76483" author="jay" created="Fri, 7 Feb 2014 18:10:56 +0000"  >&lt;p&gt;failed due to exact the same reason, or a new one?&lt;/p&gt;</comment>
                            <comment id="77177" author="simmonsja" created="Mon, 17 Feb 2014 13:09:55 +0000"  >&lt;p&gt;Looks like new one. I never saw failures of job before the xyratex patch at small scale. Only large scale did we see problems. Either way I will be try the intel patch next&lt;/p&gt;</comment>
                            <comment id="78745" author="jlevi" created="Fri, 7 Mar 2014 19:11:39 +0000"  >&lt;p&gt;Patch has landed to Master.&lt;/p&gt;</comment>
                            <comment id="82391" author="jamesanunez" created="Thu, 24 Apr 2014 15:10:34 +0000"  >&lt;p&gt;b2_5 patch at &lt;a href=&quot;http://review.whamcloud.com/#/c/9451/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/9451/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="82405" author="bfaccini" created="Thu, 24 Apr 2014 16:23:42 +0000"  >&lt;p&gt;James and all, I wonder why the patch-set #2/#3 of the b2_5 patch version mentioned (&lt;a href=&quot;http://review.whamcloud.com/#/c/9451/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/9451/&lt;/a&gt;) only affect osc/osc_request.c file vs much more in its patch-set #1 like in the master version (&lt;a href=&quot;http://review.whamcloud.com/9175&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/9175&lt;/a&gt;) ?&lt;/p&gt;</comment>
                            <comment id="82771" author="simmonsja" created="Tue, 29 Apr 2014 18:00:31 +0000"  >&lt;p&gt;The reason is Jinshan pointed the other part of the fix in master is related to the clio cleanup work. For b2_5 we only need the simpler fix.&lt;/p&gt;</comment>
                            <comment id="97306" author="shadow" created="Thu, 23 Oct 2014 18:44:39 +0000"  >&lt;p&gt;For the record - Jay changes in osc_cancel_for_recovery() re-introduced deadlock during recovery.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                            <outwardlinks description="duplicates">
                                        <issuelink>
            <issuekey id="22903">LU-4552</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="11243">LU-465</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="26974">LU-5727</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="53598">LU-11518</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="13858" name="ptlrpcd-rpc.info" size="239071" author="scherementsev" created="Fri, 22 Nov 2013 18:38:12 +0000"/>
                            <attachment id="14060" name="quick-fix.diff" size="2058" author="shadow" created="Thu, 6 Feb 2014 11:59:41 +0000"/>
                            <attachment id="13859" name="rpcs.log" size="138814" author="scherementsev" created="Fri, 22 Nov 2013 18:38:12 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzw9zb:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>11787</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>