<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:41:12 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-4269] ldlm_lock_put()) ASSERTION( (((( lock))-&gt;l_flags &amp; (1ULL &lt;&lt; 50)) != 0) ) failed</title>
                <link>https://jira.whamcloud.com/browse/LU-4269</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Running SWL - miranda IO test. &lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;2013-11-16 17:27:14 LustreError: 105183:0:(ldlm_lock.c:222:ldlm_lock_put()) ASSERTION( (((( lock))-&amp;gt;l_flags &amp;amp; (1ULL &amp;lt;&amp;lt; 50)) != 0) ) failed:
2013-11-16 17:27:14 LustreError: 105183:0:(ldlm_lock.c:222:ldlm_lock_put()) LBUG
2013-11-16 17:27:14 Pid: 105183, comm: miranda_io
2013-11-16 17:27:14 Nov 16 17:27:14
2013-11-16 17:27:14 Call Trace:
2013-11-16 17:27:14 iwc48 kernel: LustreError: 10518 [&amp;lt;ffffffffa056b895&amp;gt;] libcfs_debug_dumpstack+0x55/0x80 [libcfs]
2013-11-16 17:27:14 3:0:(ldlm_lock.c:222:ldlm_lock_p [&amp;lt;ffffffffa056be97&amp;gt;] lbug_with_loc+0x47/0xb0 [libcfs]
2013-11-16 17:27:14 ut()) ASSERTION( (((( lock))-&amp;gt;l_flags &amp;amp; (1ULL &amp;lt;&amp;lt; [&amp;lt;ffffffffa080f94d&amp;gt;] ldlm_lock_put+0x44d/0x560 [ptlrpc]
2013-11-16 17:27:14  50)) != 0) ) failed:
2013-11-16 17:27:14 Nov 16 17:27:14 iwc48 kernel: LustreError [&amp;lt;ffffffffa0822d22&amp;gt;] ldlm_cli_cancel_list+0xf2/0x3e0 [ptlrpc]
2013-11-16 17:27:14 : 105183:0:(ldlm_lock.c:222:ldlm_lock_put()) LBUG
2013-11-16 17:27:14  [&amp;lt;ffffffffa0824952&amp;gt;] ldlm_cli_cancel+0x132/0x360 [ptlrpc]
2013-11-16 17:27:14  [&amp;lt;ffffffffa09fdbce&amp;gt;] osc_lock_cancel+0xfe/0x1c0 [osc]
2013-11-16 17:27:14  [&amp;lt;ffffffffa0708525&amp;gt;] cl_lock_cancel0+0x75/0x160 [obdclass]
2013-11-16 17:27:14  [&amp;lt;ffffffffa07090db&amp;gt;] cl_lock_cancel+0x13b/0x140 [obdclass]
2013-11-16 17:27:14  [&amp;lt;ffffffffa070d7e4&amp;gt;] cl_lock_enqueue_wait+0xc4/0x2d0 [obdclass]
2013-11-16 17:27:14  [&amp;lt;ffffffffa070aa17&amp;gt;] ? cl_lock_mutex_put+0x77/0x90 [obdclass]
2013-11-16 17:27:14  [&amp;lt;ffffffffa0a9126b&amp;gt;] lov_lock_enqueue+0x3fb/0x850 [lov]
2013-11-16 17:27:14  [&amp;lt;ffffffffa070c65c&amp;gt;] cl_enqueue_try+0xfc/0x300 [obdclass]
2013-11-16 17:27:14  [&amp;lt;ffffffffa070da5f&amp;gt;] cl_enqueue_locked+0x6f/0x1f0 [obdclass]
2013-11-16 17:27:14  [&amp;lt;ffffffffa070e6ae&amp;gt;] cl_lock_request+0x7e/0x270 [obdclass]
2013-11-16 17:27:14  [&amp;lt;ffffffffa071391c&amp;gt;] cl_io_lock+0x3cc/0x560 [obdclass]
2013-11-16 17:27:14  [&amp;lt;ffffffffa0713b52&amp;gt;] cl_io_loop+0xa2/0x1b0 [obdclass]
2013-11-16 17:27:14  [&amp;lt;ffffffffa0b0e290&amp;gt;] ll_file_io_generic+0x460/0x610 [lustre]
2013-11-16 17:27:14  [&amp;lt;ffffffffa0701e39&amp;gt;] ? cl_env_get+0x29/0x350 [obdclass]
2013-11-16 17:27:14  [&amp;lt;ffffffffa0b0ecb2&amp;gt;] ll_file_aio_write+0x142/0x2c0 [lustre]
2013-11-16 17:27:14  [&amp;lt;ffffffffa0b0ef9c&amp;gt;] ll_file_write+0x16c/0x2a0 [lustre]
2013-11-16 17:27:14  [&amp;lt;ffffffff81181398&amp;gt;] vfs_write+0xb8/0x1a0
2013-11-16 17:27:14  [&amp;lt;ffffffff81181c91&amp;gt;] sys_write+0x51/0x90
2013-11-16 17:27:14  [&amp;lt;ffffffff8100b072&amp;gt;] system_call_fastpath+0x16/0x1b
2013-11-16 17:27:14
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Lctl dump attached&lt;/p&gt;</description>
                <environment>Hyperion/LLNL</environment>
        <key id="22151">LU-4269</key>
            <summary>ldlm_lock_put()) ASSERTION( (((( lock))-&gt;l_flags &amp; (1ULL &lt;&lt; 50)) != 0) ) failed</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="bobijam">Zhenyu Xu</assignee>
                                    <reporter username="cliffw">Cliff White</reporter>
                        <labels>
                            <label>MB</label>
                            <label>mn4</label>
                    </labels>
                <created>Mon, 18 Nov 2013 19:17:58 +0000</created>
                <updated>Thu, 27 Feb 2014 14:21:03 +0000</updated>
                            <resolved>Fri, 21 Feb 2014 13:23:09 +0000</resolved>
                                    <version>Lustre 2.6.0</version>
                                    <fixVersion>Lustre 2.6.0</fixVersion>
                    <fixVersion>Lustre 2.5.1</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>11</watches>
                                                                            <comments>
                            <comment id="73261" author="bobijam" created="Wed, 11 Dec 2013 02:55:07 +0000"  >&lt;p&gt;What I observed in the log&lt;/p&gt;

&lt;div class=&quot;panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;panelContent&quot;&gt;
&lt;p&gt;line 140698, ldlm_cli_cancel()&lt;br/&gt;
line 140704, ldlm_cli_cancel_local() ldlm lock ffff880793e54180/0xf128b59d838c9d81 res: &lt;span class=&quot;error&quot;&gt;&amp;#91;0x9edf8f:0x0:0x0&amp;#93;&lt;/span&gt;.0 flags: 0x284&apos;0000&apos;0000 (LVB_READY|CANCELING|CBPENDING)&lt;br/&gt;
line 140756, the ldlm lock is destroyed ldlm_lock_destroy_internal(), which set the destroy flag for the ldlm lock &lt;font color=&quot;blue&quot;&gt;(0x0004000000000000ULL)&lt;/font&gt;&lt;br/&gt;
line 140771, prepare to issue ldlm lock cancel request&lt;br/&gt;
line 140781, ldlm_cancel_pack() lock: ffff880793e54180/0xf128b59d838c9d81 flags: 0x8694&apos;0000&apos;0000 (BL_DONE|KMS_IGNORE|LVB_READY|CANCELING|CANCEL|CBPENDING) &lt;font color=&quot;red&quot;&gt;the destroy flag disappeared&lt;/font&gt;&lt;/p&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Cliff, what&apos;s the latest git commit of this code? Since I&apos;ve seen cl_lock_discard_pages() while current master branch does not have this function in it, I need to know the exact code base you were using when you hit this problem.&lt;/p&gt;</comment>
                            <comment id="73560" author="ihara" created="Mon, 16 Dec 2013 10:37:53 +0000"  >&lt;p&gt;We saw exactly same problem on multiple clients during mdtest is running. The client was running with the latest master branch.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;&amp;lt;0&amp;gt;LustreError: 10212:0:(ldlm_lock.c:222:ldlm_lock_put()) ASSERTION( (((( lock))-&amp;gt;l_flags &amp;amp; (1ULL &amp;lt;&amp;lt; 50)) != 0) ) failed: 
&amp;lt;0&amp;gt;LustreError: 10212:0:(ldlm_lock.c:222:ldlm_lock_put()) LBUG
&amp;lt;4&amp;gt;Pid: 10212, comm: mdtest
&amp;lt;4&amp;gt;
&amp;lt;4&amp;gt;Call Trace:
&amp;lt;4&amp;gt; [&amp;lt;ffffffffa05c5895&amp;gt;] libcfs_debug_dumpstack+0x55/0x80 [libcfs]
&amp;lt;4&amp;gt; [&amp;lt;ffffffffa05c5e97&amp;gt;] lbug_with_loc+0x47/0xb0 [libcfs]
&amp;lt;4&amp;gt; [&amp;lt;ffffffffa0951e27&amp;gt;] ldlm_lock_put+0x547/0x630 [ptlrpc]
&amp;lt;4&amp;gt; [&amp;lt;ffffffffa09708f2&amp;gt;] ldlm_cli_cancel_list+0xf2/0x3e0 [ptlrpc]
&amp;lt;4&amp;gt; [&amp;lt;ffffffffa0972e0a&amp;gt;] ldlm_prep_elc_req+0x21a/0x4b0 [ptlrpc]
&amp;lt;4&amp;gt; [&amp;lt;ffffffffa1043637&amp;gt;] mdc_unlink+0xe7/0x500 [mdc]
&amp;lt;4&amp;gt; [&amp;lt;ffffffffa12ade7b&amp;gt;] lmv_unlink+0x1db/0x7a0 [lmv]
&amp;lt;4&amp;gt; [&amp;lt;ffffffffa11c4a1e&amp;gt;] ? ll_i2gids+0x2e/0xd0 [lustre]
&amp;lt;4&amp;gt; [&amp;lt;ffffffffa11cb228&amp;gt;] ll_unlink+0x158/0x610 [lustre]
&amp;lt;4&amp;gt; [&amp;lt;ffffffff8118fec0&amp;gt;] vfs_unlink+0xa0/0xf0
&amp;lt;4&amp;gt; [&amp;lt;ffffffff8118ebfa&amp;gt;] ? lookup_hash+0x3a/0x50
&amp;lt;4&amp;gt; [&amp;lt;ffffffff81192265&amp;gt;] do_unlinkat+0xf5/0x1b0
&amp;lt;4&amp;gt; [&amp;lt;ffffffff810d94e2&amp;gt;] ? unroll_tree_refs+0xe2/0x120
&amp;lt;4&amp;gt; [&amp;lt;ffffffff8100c535&amp;gt;] ? math_state_restore+0x45/0x60
&amp;lt;4&amp;gt; [&amp;lt;ffffffff81192336&amp;gt;] sys_unlink+0x16/0x20
&amp;lt;4&amp;gt; [&amp;lt;ffffffff8100b072&amp;gt;] system_call_fastpath+0x16/0x1b
&amp;lt;4&amp;gt;
&amp;lt;0&amp;gt;Kernel panic - not syncing: LBUG
&amp;lt;4&amp;gt;Pid: 10212, comm: mdtest Not tainted 2.6.32-358.23.2.el6_lustre.ge975b1c.x86_64 #1
&amp;lt;4&amp;gt;Call Trace:
&amp;lt;4&amp;gt; [&amp;lt;ffffffff8150deec&amp;gt;] ? panic+0xa7/0x16f
&amp;lt;4&amp;gt; [&amp;lt;ffffffffa05c5eeb&amp;gt;] ? lbug_with_loc+0x9b/0xb0 [libcfs]
&amp;lt;4&amp;gt; [&amp;lt;ffffffffa0951e27&amp;gt;] ? ldlm_lock_put+0x547/0x630 [ptlrpc]
&amp;lt;4&amp;gt; [&amp;lt;ffffffffa09708f2&amp;gt;] ? ldlm_cli_cancel_list+0xf2/0x3e0 [ptlrpc]
&amp;lt;4&amp;gt; [&amp;lt;ffffffffa0972e0a&amp;gt;] ? ldlm_prep_elc_req+0x21a/0x4b0 [ptlrpc]
&amp;lt;4&amp;gt; [&amp;lt;ffffffffa1043637&amp;gt;] ? mdc_unlink+0xe7/0x500 [mdc]
&amp;lt;4&amp;gt; [&amp;lt;ffffffffa12ade7b&amp;gt;] ? lmv_unlink+0x1db/0x7a0 [lmv]
&amp;lt;4&amp;gt; [&amp;lt;ffffffffa11c4a1e&amp;gt;] ? ll_i2gids+0x2e/0xd0 [lustre]
&amp;lt;4&amp;gt; [&amp;lt;ffffffffa11cb228&amp;gt;] ? ll_unlink+0x158/0x610 [lustre]
&amp;lt;4&amp;gt; [&amp;lt;ffffffff8118fec0&amp;gt;] ? vfs_unlink+0xa0/0xf0
&amp;lt;4&amp;gt; [&amp;lt;ffffffff8118ebfa&amp;gt;] ? lookup_hash+0x3a/0x50
&amp;lt;4&amp;gt; [&amp;lt;ffffffff81192265&amp;gt;] ? do_unlinkat+0xf5/0x1b0
&amp;lt;4&amp;gt; [&amp;lt;ffffffff810d94e2&amp;gt;] ? unroll_tree_refs+0xe2/0x120
&amp;lt;4&amp;gt; [&amp;lt;ffffffff8100c535&amp;gt;] ? math_state_restore+0x45/0x60
&amp;lt;4&amp;gt; [&amp;lt;ffffffff81192336&amp;gt;] ? sys_unlink+0x16/0x20
&amp;lt;4&amp;gt; [&amp;lt;ffffffff8100b072&amp;gt;] ? system_call_fastpath+0x16/0x1b
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;We have a crashdump as well. Please let me know if it helps for debugging.&lt;/p&gt;</comment>
                            <comment id="73561" author="bobijam" created="Mon, 16 Dec 2013 10:44:03 +0000"  >&lt;p&gt;Hi Ihara-San,&lt;/p&gt;

&lt;p&gt;Do you have debug log of it?&lt;/p&gt;</comment>
                            <comment id="73564" author="ihara" created="Mon, 16 Dec 2013 13:21:32 +0000"  >&lt;p&gt;Hi Bobijam,&lt;/p&gt;

&lt;p&gt;vmware-dmesg.txt attached. Please let me know if you need vmware itself. I can upload it as well.&lt;/p&gt;</comment>
                            <comment id="73566" author="bobijam" created="Mon, 16 Dec 2013 13:31:12 +0000"  >&lt;p&gt;besides dmesg, do you have debug log? (lctl dk)&lt;/p&gt;</comment>
                            <comment id="73859" author="cliffw" created="Thu, 19 Dec 2013 16:20:26 +0000"  >&lt;p&gt;The test was run Nov 18th, build information was not in the version: build:  2.5.51--PRISTINE-2.6.32-358.23.2.el6_lustre.x86_64 - so it was likely the 2.5.51 tag. &lt;/p&gt;</comment>
                            <comment id="74251" author="cliffw" created="Thu, 2 Jan 2014 18:04:56 +0000"  >&lt;p&gt;Sysrq -t from hung client.&lt;/p&gt;</comment>
                            <comment id="74312" author="cliffw" created="Fri, 3 Jan 2014 18:01:44 +0000"  >&lt;p&gt;Attempting to attach file again&lt;/p&gt;</comment>
                            <comment id="74468" author="jay" created="Tue, 7 Jan 2014 08:37:42 +0000"  >&lt;p&gt;From the feedback from Ihara, it looks like there exists race condition to access the flags of dlm lock.&lt;/p&gt;

&lt;p&gt;Let&apos;s create a debug patch to add a line into ldlm_lock_destroy_internal() to print out what&apos;s exact state of the lock&lt;/p&gt;</comment>
                            <comment id="74471" author="bobijam" created="Tue, 7 Jan 2014 09:09:41 +0000"  >&lt;p&gt;debug patch at &lt;a href=&quot;http://review.whamcloud.com/8759&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/8759&lt;/a&gt;, please unset panic_on_lbug (echo 0 &amp;gt; /proc/sys/lnet/panic_on_lbug) and run this debug patched version so that we can collect the debug log when this assertion is hit.&lt;/p&gt;</comment>
                            <comment id="74800" author="lixi" created="Mon, 13 Jan 2014 03:25:14 +0000"  >&lt;p&gt;I&apos;ve posted a patch for this. However, I am not sure whether it fixes this problem, since we don&apos;t have a reliable way to reproduce this problem yet.&lt;br/&gt;
&lt;a href=&quot;http://review.whamcloud.com/#/c/8772/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/8772/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="74804" author="bobijam" created="Mon, 13 Jan 2014 04:08:51 +0000"  >&lt;p&gt;Thank you LiXi, but the iwc48.lctl.log.txt shows in line 140770&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00010000:00000001:0.0:1384651634.269628:0:105183:0:(ldlm_request.c:1760:ldlm_prepare_lru_list()) Process leaving (rc=0 : 0 : 0)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;which shows that when we canceled the problematic lock, ldlm_prepare_lru_list() didn&apos;t find other lru locks, so I&apos;d think your patch would possibly not affect this issue.&lt;/p&gt;</comment>
                            <comment id="74806" author="lixi" created="Mon, 13 Jan 2014 04:23:01 +0000"  >&lt;p&gt;Ah, thank you for pointing that out, Zhenyu. I was just wondering why we can call ldlm_lock_remove_from_lru_nolock() without calling lock_res_and_lock() first. Is there anything I missed?&lt;/p&gt;</comment>
                            <comment id="74810" author="bobijam" created="Mon, 13 Jan 2014 05:18:51 +0000"  >&lt;p&gt;ldlm_prepare_lru_list() is to collect unused lock from namespace::ns_unused_list, where only unused locks are linked into it (please refer to ldlm_lock_decref_internal()-&amp;gt;ldlm_lock_add_to_lru()), so in ldlm_prepare_lru_list() we can call ldlm_lock_remove_from_lru_nolock() w/o locking the res and lock since the lock has no readers nor writers referring to it.&lt;/p&gt;</comment>
                            <comment id="74812" author="lixi" created="Mon, 13 Jan 2014 07:12:03 +0000"  >&lt;p&gt;Ah, thank you for the explanation! Now I understand. &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;</comment>
                            <comment id="74878" author="jay" created="Tue, 14 Jan 2014 00:54:10 +0000"  >&lt;p&gt;how often do you guys see this problem? If it&apos;s quite often, please apply Bobijam&apos;s patch and get some logs; otherwise, I&apos;d like to drop the priority of this ticket.&lt;/p&gt;

&lt;p&gt;Jinshan&lt;/p&gt;</comment>
                            <comment id="74879" author="ihara" created="Tue, 14 Jan 2014 01:06:30 +0000"  >&lt;p&gt;Jinshan, Bobijam,&lt;/p&gt;

&lt;p&gt;Yes, we applied Bobijam&apos;s debug patch at the customer site, but didn&apos;t reproduce problem yet. I will post logs once we can hit same issue.&lt;/p&gt;
</comment>
                            <comment id="74896" author="ihara" created="Tue, 14 Jan 2014 05:43:31 +0000"  >&lt;p&gt;We got a crash again with the debug patch. Attached is diagnostic information from the crash. &lt;/p&gt;</comment>
                            <comment id="74897" author="bobijam" created="Tue, 14 Jan 2014 05:54:47 +0000"  >&lt;p&gt;do you collect any lctl log from it?&lt;/p&gt;</comment>
                            <comment id="74909" author="ihara" created="Tue, 14 Jan 2014 08:21:31 +0000"  >&lt;p&gt;no, unfortunately.. client got panic even we have panic_on_lbug=0 setting.. I don&apos;t know why we got panic after LBUG..&lt;/p&gt;</comment>
                            <comment id="75530" author="paf" created="Thu, 23 Jan 2014 23:35:47 +0000"  >&lt;p&gt;We reproduced this running master + a client lock debug patch on SLES11SP3 while reproducing a client lock bug.  I believe we hit this with a special debug flag for cl_lock_trace turned on (So all calls to cl_lock_trace are being logged, but little or nothing else.)&lt;/p&gt;

&lt;p&gt;I should be able to make the dump and logs available if you think it would be helpful.  I&apos;m not sure about reproducing it, with different debug or at all - We run the test I&apos;m using as a reproducer a lot and this is the first we&apos;ve seen this particular bug.  (The test in question is mmstress from the Linux Test Project.)&lt;/p&gt;</comment>
                            <comment id="75537" author="bobijam" created="Fri, 24 Jan 2014 02:15:50 +0000"  >&lt;p&gt;yes please upload logs Patrick, thank you.&lt;/p&gt;</comment>
                            <comment id="75715" author="paf" created="Mon, 27 Jan 2014 19:34:44 +0000"  >&lt;p&gt;Console log &amp;amp; Lustre log.&lt;br/&gt;
System is master from January 23rd on SLES11SP3, with a special debug patch that enables all calls to cl_lock_trace.&lt;/p&gt;</comment>
                            <comment id="75716" author="paf" created="Mon, 27 Jan 2014 19:36:38 +0000"  >&lt;p&gt;Logs attached as noted.  I&apos;ve also uploaded the dump (with logs, vmlinuz, ko files, etc) to ftp.whamcloud.com at uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4269&quot; title=&quot;ldlm_lock_put()) ASSERTION( (((( lock))-&amp;gt;l_flags &amp;amp; (1ULL &amp;lt;&amp;lt; 50)) != 0) ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4269&quot;&gt;&lt;del&gt;LU-4269&lt;/del&gt;&lt;/a&gt;/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4269&quot; title=&quot;ldlm_lock_put()) ASSERTION( (((( lock))-&amp;gt;l_flags &amp;amp; (1ULL &amp;lt;&amp;lt; 50)) != 0) ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4269&quot;&gt;&lt;del&gt;LU-4269&lt;/del&gt;&lt;/a&gt;-140123.tar.gz&lt;/p&gt;

&lt;p&gt;System is master from January 23rd on SLES11SP3, with a special debug patch that enables all calls to cl_lock_trace.&lt;/p&gt;

&lt;p&gt;Also, we reproduced this by running mmstress from the Linux test project.  We ran multiple copies on multiple nodes.  (This also hits several cl_lock related bugs, so that may make this harder to find.  Those problems are more common than this one when running mmstress.)&lt;/p&gt;</comment>
                            <comment id="75840" author="bobijam" created="Wed, 29 Jan 2014 10:14:26 +0000"  >&lt;p&gt;Hi Jinshan,&lt;/p&gt;

&lt;p&gt;When a top lock enqueue failed, top lock will go to cl_lock_hold_release() and then unhold a lovsub lock, which will go to osc_lock_cancel(), if at this time the osc lock is used by another IO, then the ldlm lock is not canceled.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeHeader panelHeader&quot; style=&quot;border-bottom-width: 1px;&quot;&gt;&lt;b&gt;osc_lock_cancel()&lt;/b&gt;&lt;/div&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;                do_cancel = (dlmlock-&amp;gt;l_readers == 0 &amp;amp;&amp;amp;
                             dlmlock-&amp;gt;l_writers == 0);
                ldlm_set_cbpending(dlmlock);
                unlock_res_and_lock(dlmlock);
                &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (do_cancel)
                        result = ldlm_cli_cancel(&amp;amp;olck-&amp;gt;ols_handle, LCF_ASYNC);
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;and the osc lock is detached from the lovsub lock in cl_lock_hold_release()&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (lock-&amp;gt;cll_holds == 0) {
                CL_LOCK_ASSERT(lock-&amp;gt;cll_state != CLS_HELD, env, lock);
                &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (lock-&amp;gt;cll_descr.cld_mode == CLM_PHANTOM ||
                    lock-&amp;gt;cll_descr.cld_mode == CLM_GROUP ||
                    lock-&amp;gt;cll_state != CLS_CACHED)
                        /*
                         * If lock is still phantom or grouplock when user is
                         * done with it---destroy the lock.
                         */
                        lock-&amp;gt;cll_flags |= CLF_CANCELPEND|CLF_DOOMED;
                &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (lock-&amp;gt;cll_flags &amp;amp; CLF_CANCELPEND) {
                        lock-&amp;gt;cll_flags &amp;amp;= ~CLF_CANCELPEND;
                        cl_lock_cancel0(env, lock);
                }
                &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (lock-&amp;gt;cll_flags &amp;amp; CLF_DOOMED) {
                        &lt;span class=&quot;code-comment&quot;&gt;/* no longer doomed: it&apos;s dead... Jim. */&lt;/span&gt;
                        lock-&amp;gt;cll_flags &amp;amp;= ~CLF_DOOMED;
                        cl_lock_delete0(env, lock);
                }
        }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Is it possible that the ldlm lock somehow missed the cancel process somewhere so that its last dereference is still missing the destroy flag?&lt;/p&gt;</comment>
                            <comment id="76011" author="jay" created="Fri, 31 Jan 2014 22:43:14 +0000"  >&lt;p&gt;I happen to be able to reproduce this issue steadily while I was working another one.&lt;/p&gt;

&lt;p&gt;After I applied the following patch:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c
index 214ee8c..2e69ad0 100644
--- a/lustre/ldlm/ldlm_lock.c
+++ b/lustre/ldlm/ldlm_lock.c
@@ -372,12 +372,14 @@ &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; ldlm_lock_destroy_internal(struct ldlm_lock *lock)
                 LBUG();
         }
 
+       LDLM_DEBUG(lock, &lt;span class=&quot;code-quote&quot;&gt;&quot;destroy&quot;&lt;/span&gt;);
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (ldlm_is_destroyed(lock)) {
                LASSERT(cfs_list_empty(&amp;amp;lock-&amp;gt;l_lru));
                EXIT;
                &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; 0;
        }
        ldlm_set_destroyed(lock);
+       LDLM_DEBUG(lock, &lt;span class=&quot;code-quote&quot;&gt;&quot;destroy&quot;&lt;/span&gt;);
 
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (lock-&amp;gt;l_export &amp;amp;&amp;amp; lock-&amp;gt;l_export-&amp;gt;exp_lock_hash) {
                /* NB: it&lt;span class=&quot;code-quote&quot;&gt;&apos;s safe to call cfs_hash_del() even lock isn&apos;&lt;/span&gt;t
@@ -2169,6 +2171,7 @@ void ldlm_lock_cancel(struct ldlm_lock *lock)
 
         ldlm_resource_unlink_lock(lock);
         ldlm_lock_destroy_nolock(lock);
+       LDLM_DEBUG(lock, &lt;span class=&quot;code-quote&quot;&gt;&quot;destroyed&quot;&lt;/span&gt;);
 
         &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (lock-&amp;gt;l_granted_mode == lock-&amp;gt;l_req_mode)
                 ldlm_pool_del(&amp;amp;ns-&amp;gt;ns_pool, lock);
diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c
index e140e0f..def3ee5 100644
--- a/lustre/ldlm/ldlm_request.c
+++ b/lustre/ldlm/ldlm_request.c
@@ -1141,6 +1141,7 @@ &lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; __u64 ldlm_cli_cancel_local(struct ldlm_lock *lock)
                         rc = LDLM_FL_LOCAL_ONLY;
                 }
                 ldlm_lock_cancel(lock);
+               LDLM_DEBUG(lock, &lt;span class=&quot;code-quote&quot;&gt;&quot;client-side cancel completed, rc = %lld&quot;&lt;/span&gt;, rc);
         } &lt;span class=&quot;code-keyword&quot;&gt;else&lt;/span&gt; {
                 &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (ns_is_client(ldlm_lock_to_ns(lock))) {
                         LDLM_ERROR(lock, &lt;span class=&quot;code-quote&quot;&gt;&quot;Trying to cancel local lock&quot;&lt;/span&gt;);
diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h
index 92b278e..e9e7053 100644
--- a/lustre/llite/llite_internal.h
+++ b/lustre/llite/llite_internal.h
@@ -1469,7 +1469,7 @@ &lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; inline void cl_isize_unlock(struct inode *inode)
 
 &lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; inline void cl_isize_write_nolock(struct inode *inode, loff_t kms)
 {
-       LASSERT(down_trylock(&amp;amp;ll_i2info(inode)-&amp;gt;lli_size_sem) != 0);
+       &lt;span class=&quot;code-comment&quot;&gt;//LASSERT(down_trylock(&amp;amp;ll_i2info(inode)-&amp;gt;lli_size_sem) != 0);
&lt;/span&gt;        i_size_write(inode, kms);
 }

 diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c
index df421d6..ae53f3d 100644
--- a/lustre/llite/llite_lib.c
+++ b/lustre/llite/llite_lib.c
@@ -1700,25 +1700,29 @@ &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; ll_statfs(struct dentry *de, struct kstatfs *sfs)
 
 void ll_inode_size_lock(struct inode *inode)
 {
+#&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; 0
         struct ll_inode_info *lli;
 
         LASSERT(!S_ISDIR(inode-&amp;gt;i_mode));
 
         lli = ll_i2info(inode);
         LASSERT(lli-&amp;gt;lli_size_sem_owner != current);
-       down(&amp;amp;lli-&amp;gt;lli_size_sem);
+       &lt;span class=&quot;code-comment&quot;&gt;//down(&amp;amp;lli-&amp;gt;lli_size_sem);
&lt;/span&gt;         LASSERT(lli-&amp;gt;lli_size_sem_owner == NULL);
         lli-&amp;gt;lli_size_sem_owner = current;
+#endif
 }
 
 void ll_inode_size_unlock(struct inode *inode)
 {
+#&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; 0
         struct ll_inode_info *lli;
 
         lli = ll_i2info(inode);
         LASSERT(lli-&amp;gt;lli_size_sem_owner == current);
         lli-&amp;gt;lli_size_sem_owner = NULL;
-       up(&amp;amp;lli-&amp;gt;lli_size_sem);
+       &lt;span class=&quot;code-comment&quot;&gt;//up(&amp;amp;lli-&amp;gt;lli_size_sem);
&lt;/span&gt;+#endif
 }
 
 void ll_update_inode(struct inode *inode, struct lustre_md *md)
diff --git a/lustre/obdclass/cl_lock.c b/lustre/obdclass/cl_lock.c
index d440da9..b8c2e04 100644
--- a/lustre/obdclass/cl_lock.c
+++ b/lustre/obdclass/cl_lock.c
@@ -493,6 +493,7 @@ &lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; struct cl_lock *cl_lock_lookup(&lt;span class=&quot;code-keyword&quot;&gt;const&lt;/span&gt; struct lu_env *env,
                                       &lt;span class=&quot;code-keyword&quot;&gt;const&lt;/span&gt; struct cl_io *io,
                                       &lt;span class=&quot;code-keyword&quot;&gt;const&lt;/span&gt; struct cl_lock_descr *need)
 {
+#&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; 0
         struct cl_lock          *lock;
         struct cl_object_header *head;
 
@@ -518,6 +519,7 @@ &lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; struct cl_lock *cl_lock_lookup(&lt;span class=&quot;code-keyword&quot;&gt;const&lt;/span&gt; struct lu_env *env,
                         RETURN(lock);
                 }
         }
+#endif
         RETURN(NULL);
 }
 
@@ -1964,6 +1966,8 @@ &lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; struct cl_lock *cl_lock_hold_mutex(&lt;span class=&quot;code-keyword&quot;&gt;const&lt;/span&gt; struct lu_env *env,
                         cl_lock_hold_mod(env, lock, +1);
                         lu_ref_add(&amp;amp;lock-&amp;gt;cll_holders, scope, source);
                         lu_ref_add(&amp;amp;lock-&amp;gt;cll_reference, scope, source);
+
+                       cl_lock_cancel(env, lock);
                         &lt;span class=&quot;code-keyword&quot;&gt;break&lt;/span&gt;;
                 }
                 cl_lock_mutex_put(env, lock);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;And uses 32 processes to read the same file at the same time. The reading process hit the LBUG at the same place:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Call Trace:
 [&amp;lt;ffffffffa03fd895&amp;gt;] libcfs_debug_dumpstack+0x55/0x80 [libcfs]
 [&amp;lt;ffffffffa03fde97&amp;gt;] lbug_with_loc+0x47/0xb0 [libcfs]
 [&amp;lt;ffffffffa0774fb7&amp;gt;] ldlm_lock_put+0x547/0x630 [ptlrpc]
 [&amp;lt;ffffffffa0793df2&amp;gt;] ldlm_cli_cancel_list+0xf2/0x3e0 [ptlrpc]
 [&amp;lt;ffffffffa0795a22&amp;gt;] ldlm_cli_cancel+0x132/0x360 [ptlrpc]
 [&amp;lt;ffffffffa0a8e72e&amp;gt;] osc_lock_cancel+0x10e/0x1d0 [osc]
 [&amp;lt;ffffffffa059c045&amp;gt;] cl_lock_cancel0+0x75/0x160 [obdclass]
 [&amp;lt;ffffffffa059d9d6&amp;gt;] cl_lock_hold_release+0x1c6/0x2a0 [obdclass]
 [&amp;lt;ffffffffa059ecc7&amp;gt;] cl_lock_unhold+0x37/0x130 [obdclass]
 [&amp;lt;ffffffffa0af8a68&amp;gt;] lov_sublock_release+0x1a8/0x280 [lov]
 [&amp;lt;ffffffffa0afa4fe&amp;gt;] lov_lock_unuse+0x1be/0x290 [lov]
 [&amp;lt;ffffffffa059c205&amp;gt;] cl_unuse_try_internal+0x55/0xe0 [obdclass]
 [&amp;lt;ffffffffa059f0a9&amp;gt;] cl_unuse_try+0x199/0x320 [obdclass]
 [&amp;lt;ffffffffa059f267&amp;gt;] cl_unuse_locked+0x37/0x1a0 [obdclass]
 [&amp;lt;ffffffffa059f40e&amp;gt;] cl_unuse+0x3e/0x120 [obdclass]
 [&amp;lt;ffffffffa05a5e57&amp;gt;] cl_io_unlock+0x127/0x2b0 [obdclass]
 [&amp;lt;ffffffffa05a6aed&amp;gt;] cl_io_loop+0xcd/0x1b0 [obdclass]
 [&amp;lt;ffffffffa0f7c9d6&amp;gt;] ll_file_io_generic+0x2b6/0x710 [lustre]
 [&amp;lt;ffffffffa0596db9&amp;gt;] ? cl_env_get+0x29/0x350 [obdclass]
 [&amp;lt;ffffffffa0f7cf6f&amp;gt;] ll_file_aio_read+0x13f/0x2c0 [lustre]
 [&amp;lt;ffffffffa0f7d42c&amp;gt;] ll_file_read+0x16c/0x2a0 [lustre]
 [&amp;lt;ffffffff81181a95&amp;gt;] vfs_read+0xb5/0x1a0
 [&amp;lt;ffffffff81181bd1&amp;gt;] sys_read+0x51/0x90
 [&amp;lt;ffffffff810dc685&amp;gt;] ? __audit_syscall_exit+0x265/0x290
 [&amp;lt;ffffffff8100b072&amp;gt;] system_call_fastpath+0x16/0x1b

LustreError: dumping log to /tmp/lustre-log.1391207221.6283
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;However, from the log, I can see this:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000020:00010000:5.0:1391207220.451167:0:6294:0:(cl_lock.c:151:cl_lock_trace0()) unuse lock: ffff88081acf48e8@(2 ffff8807e6fd0080 1 3 0 1 1 2)(ffff88080d4a0db8/0/1) at cl_unuse_try():1363
00000020:00010000:5.0:1391207220.451172:0:6294:0:(cl_lock.c:151:cl_lock_trace0()) enclosure lock: ffff88081acf4738@(1 (null) 0 3 0 1 1 2)(ffff88080d4a0db8/1/1) at cl_lock_enclosure():1691
00000020:00010000:5.0:1391207220.451175:0:6294:0:(cl_lock.c:151:cl_lock_trace0()) enclosure lock: ffff88081acf48e8@(3 ffff8807e6fd0080 1 4 0 2 1 2)(ffff88080d4a0db8/0/2) at cl_lock_enclosure():1691
00000020:00010000:5.0:1391207220.451178:0:6294:0:(cl_lock.c:151:cl_lock_trace0()) unuse lock: ffff88081acf4738@(2 ffff8807e6fd0080 1 3 0 1 1 2)(ffff88080d4a0c68/1/0) at cl_unuse_try():1363
00010000:00010000:5.0:1391207220.451185:0:6294:0:(ldlm_lock.c:852:ldlm_lock_decref_internal_nolock()) ### ldlm_lock_decref(PR) ns: lustre-OST0000-osc-ffff8808198eb000 lock: ffff8808152de340/0x6d44b209b7d2fe79 lrc: 4/1,0 mode: PR/PR res: [0x2:0x0:0x0].0 rrc: 15 type: EXT [0-&amp;gt;18446744073709551615] (req 1146880-&amp;gt;1150975) flags: 0x10020000000000 nid: local remote: 0x6d44b209b7d2fe95 expref: -99 pid: 6294 timeout: 0 lvb_type: 1
00010000:00010000:5.0:1391207220.451191:0:6294:0:(ldlm_lock.c:920:ldlm_lock_decref_internal()) ### add lock into lru list ns: lustre-OST0000-osc-ffff8808198eb000 lock: ffff8808152de340/0x6d44b209b7d2fe79 lrc: 3/0,0 mode: PR/PR res: [0x2:0x0:0x0].0 rrc: 15 type: EXT [0-&amp;gt;18446744073709551615] (req 1146880-&amp;gt;1150975) flags: 0x10020000000000 nid: local remote: 0x6d44b209b7d2fe95 expref: -99 pid: 6294 timeout: 0 lvb_type: 1
00000020:00010000:5.0:1391207220.451197:0:6294:0:(cl_lock.c:151:cl_lock_trace0()) hold release lock: ffff88081acf4738@(3 ffff8807e6fd0080 1 5 0 2 0 2)(ffff88080d4a0c68/1/0) at cl_lock_hold_release():903
00000020:00010000:5.0:1391207220.451200:0:6294:0:(cl_lock.c:151:cl_lock_trace0()) hold release lock: ffff88081acf4738@(2 ffff8807e6fd0080 1 5 0 1 0 2)(ffff88080d4a0db8/1/2) at cl_lock_hold_release():903
00000080:00200000:5.0:1391207220.451207:0:6294:0:(vvp_io.c:1163:vvp_io_init()) [0x200000400:0x2:0x0] ignore/verify layout 1/0, layout version 0 restore needed 0
00000080:00200000:5.0:1391207220.452460:0:6294:0:(vvp_io.c:153:vvp_io_fini()) [0x200000400:0x2:0x0] ignore/verify layout 1/0, layout version 0 restore needed 0
00010000:00010000:5.0:1391207220.452485:0:6294:0:(ldlm_request.c:1127:ldlm_cli_cancel_local()) ### client-side cancel ns: lustre-OST0000-osc-ffff8808198eb000 lock: ffff8808152de340/0x6d44b209b7d2fe79 lrc: 3/0,0 mode: PR/PR res: [0x2:0x0:0x0].0 rrc: 15 type: EXT [0-&amp;gt;18446744073709551615] (req 1146880-&amp;gt;1150975) flags: 0x28400000000 nid: local remote: 0x6d44b209b7d2fe95 expref: -99 pid: 6294 timeout: 0 lvb_type: 1
00000020:00010000:5.0:1391207220.452493:0:6294:0:(cl_lock.c:151:cl_lock_trace0()) cancel lock: ffff88081acf4738@(3 ffff8807e6fd0080 2 5 0 0 0 1)(ffff8807e0547920/1/1) at cl_lock_cancel():1836
00000020:00010000:5.0:1391207220.452496:0:6294:0:(cl_lock.c:151:cl_lock_trace0()) delete lock: ffff88081acf4738@(3 ffff8807e6fd0080 2 5 0 0 0 1)(ffff8807e0547920/1/1) at cl_lock_delete():1783
00010000:00010000:5.0:1391207220.452523:0:6294:0:(ldlm_lock.c:375:ldlm_lock_destroy_internal()) ### destroy ns: lustre-OST0000-osc-ffff8808198eb000 lock: ffff8808152de340/0x6d44b209b7d2fe79 lrc: 2/0,0 mode: PR/PR res: [0x2:0x0:0x0].0 rrc: 15 type: EXT [0-&amp;gt;18446744073709551615] (req 1146880-&amp;gt;1150975) flags: 0x10869400000000 nid: local remote: 0x6d44b209b7d2fe95 expref: -99 pid: 6294 timeout: 0 lvb_type: 1
00010000:00010000:5.0:1391207220.452529:0:6294:0:(ldlm_lock.c:382:ldlm_lock_destroy_internal()) ### destroy ns: lustre-OST0000-osc-ffff8808198eb000 lock: ffff8808152de340/0x6d44b209b7d2fe79 lrc: 2/0,0 mode: PR/PR res: [0x2:0x0:0x0].0 rrc: 15 type: EXT [0-&amp;gt;18446744073709551615] (req 1146880-&amp;gt;1150975) flags: 0x10869400000000 nid: local remote: 0x6d44b209b7d2fe95 expref: -99 pid: 6294 timeout: 0 lvb_type: 1
00010000:00010000:5.0:1391207220.452534:0:6294:0:(ldlm_lock.c:2174:ldlm_lock_cancel()) ### destroyed ns: lustre-OST0000-osc-ffff8808198eb000 lock: ffff8808152de340/0x6d44b209b7d2fe79 lrc: 1/0,0 mode: PR/PR res: [0x2:0x0:0x0].0 rrc: 15 type: EXT [0-&amp;gt;18446744073709551615] (req 1146880-&amp;gt;1150975) flags: 0x10869400000000 nid: local remote: 0x6d44b209b7d2fe95 expref: -99 pid: 6294 timeout: 0 lvb_type: 1
00010000:00010000:5.0:1391207220.452544:0:6294:0:(ldlm_request.c:1144:ldlm_cli_cancel_local()) ### client-side cancel completed, rc = 549755813888 ns: lustre-OST0000-osc-ffff8808198eb000 lock: ffff8808152de340/0x6d44b209b7d2fe79 lrc: 1/0,0 mode: --/PR res: [0x2:0x0:0x0].0 rrc: 15 type: EXT [0-&amp;gt;18446744073709551615] (req 1146880-&amp;gt;1150975) flags: 0x869400000000 nid: local remote: 0x6d44b209b7d2fe95 expref: -99 pid: 6294 timeout: 0 lvb_type: 1
00010000:00010000:5.0:1391207220.452554:0:6294:0:(ldlm_request.c:1187:ldlm_cancel_pack()) ### packing ns: lustre-OST0000-osc-ffff8808198eb000 lock: ffff8808152de340/0x6d44b209b7d2fe79 lrc: 1/0,0 mode: --/PR res: [0x2:0x0:0x0].0 rrc: 14 type: EXT [0-&amp;gt;18446744073709551615] (req 1146880-&amp;gt;1150975) flags: 0x869400000000 nid: local remote: 0x6d44b209b7d2fe95 expref: -99 pid: 6294 timeout: 0 lvb_type: 1
00010000:00010000:5.0:1391207220.452559:0:6294:0:(ldlm_request.c:1191:ldlm_cancel_pack()) 1 locks packed
00010000:00010000:5.0:1391207220.452565:0:6294:0:(ldlm_lock.c:219:ldlm_lock_put()) ### final lock_put on destroyed lock, freeing it. ns: lustre-OST0000-osc-ffff8808198eb000 lock: ffff8808152de340/0x6d44b209b7d2fe79 lrc: 0/0,0 mode: --/PR res: [0x2:0x0:0x0].0 rrc: 14 type: EXT [0-&amp;gt;18446744073709551615] (req 1146880-&amp;gt;1150975) flags: 0x869400000000 nid: local remote: 0x6d44b209b7d2fe95 expref: -99 pid: 6294 timeout: 0 lvb_type: 1
00010000:00040000:5.0:1391207220.452570:0:6294:0:(ldlm_lock.c:222:ldlm_lock_put()) ASSERTION( (((( lock))-&amp;gt;l_flags &amp;amp; (1ULL &amp;lt;&amp;lt; 50)) != 0) ) failed: 
00010000:00040000:5.0:1391207220.577549:0:6294:0:(ldlm_lock.c:222:ldlm_lock_put()) LBUG
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Both lines 375 and 382 in ldlm_lock.c were printed, and the source is as follows:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt; 361 &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; ldlm_lock_destroy_internal(struct ldlm_lock *lock)
 362 {
 363         ENTRY;
 364 
 365         &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (lock-&amp;gt;l_readers || lock-&amp;gt;l_writers) {
 366                 LDLM_ERROR(lock, &lt;span class=&quot;code-quote&quot;&gt;&quot;lock still has references&quot;&lt;/span&gt;);
 367                 LBUG();
 368         }
 369 
 370         &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (!cfs_list_empty(&amp;amp;lock-&amp;gt;l_res_link)) {
 371                 LDLM_ERROR(lock, &lt;span class=&quot;code-quote&quot;&gt;&quot;lock still on resource&quot;&lt;/span&gt;);
 372                 LBUG();
 373         }
 374 
 375         LDLM_DEBUG(lock, &lt;span class=&quot;code-quote&quot;&gt;&quot;destroy&quot;&lt;/span&gt;);
 376         &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (ldlm_is_destroyed(lock)) {
 377                 LASSERT(cfs_list_empty(&amp;amp;lock-&amp;gt;l_lru));
 378                 EXIT;
 379                 &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; 0;
 380         }
 381         ldlm_set_destroyed(lock);
 382         LDLM_DEBUG(lock, &lt;span class=&quot;code-quote&quot;&gt;&quot;destroy&quot;&lt;/span&gt;);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;However, the destroyed flag failed to be set. This is beyond my understanding, and we should escalate this issue.&lt;/p&gt;</comment>
                            <comment id="76012" author="jay" created="Fri, 31 Jan 2014 22:52:00 +0000"  >&lt;p&gt;Full log is here&lt;/p&gt;</comment>
                            <comment id="76013" author="green" created="Fri, 31 Jan 2014 23:37:28 +0000"  >&lt;p&gt;I think the prime suspect would be some place that updates lock flags without taking the lock spinlock. There are several such places.&lt;/p&gt;</comment>
                            <comment id="76018" author="jay" created="Sat, 1 Feb 2014 00:09:15 +0000"  >&lt;p&gt;I confirmed that changing the dlm flag operation to bitops fixed the problem.&lt;/p&gt;

&lt;p&gt;Check the attachment for the concept patch. The real patch should be worked out by modifying lustre/contrib/bit-masks/lustre_dlm_flags.tpl.&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;diff --git a/lustre/include/lustre_dlm.h b/lustre/include/lustre_dlm.h
index 98e03c9..bf4fb65 100644
--- a/lustre/include/lustre_dlm.h
+++ b/lustre/include/lustre_dlm.h
@@ -772,7 +772,7 @@ struct ldlm_lock {
         * Lock state flags. Protected by lr_lock.
         * \see lustre_dlm_flags.h where the bits are defined.
         */
-       __u64                   l_flags;
+       &lt;span class=&quot;code-keyword&quot;&gt;volatile&lt;/span&gt; __u64          l_flags;
 
        /**
         * Lock r/w usage counters.
diff --git a/lustre/include/lustre_dlm_flags.h b/lustre/include/lustre_dlm_flags.h
index 283546d..4be532f 100644
--- a/lustre/include/lustre_dlm_flags.h
+++ b/lustre/include/lustre_dlm_flags.h
@@ -57,91 +57,91 @@
 
 &lt;span class=&quot;code-comment&quot;&gt;/** extent, mode, or resource changed */&lt;/span&gt;
 #define LDLM_FL_LOCK_CHANGED            0x0000000000000001ULL &lt;span class=&quot;code-comment&quot;&gt;// bit   0
&lt;/span&gt;-#define ldlm_is_lock_changed(_l)        LDLM_TEST_FLAG(( _l), 1ULL &amp;lt;&amp;lt;  0)
-#define ldlm_set_lock_changed(_l)       LDLM_SET_FLAG((  _l), 1ULL &amp;lt;&amp;lt;  0)
-#define ldlm_clear_lock_changed(_l)     LDLM_CLEAR_FLAG((_l), 1ULL &amp;lt;&amp;lt;  0)
+#define ldlm_is_lock_changed(_l)        LDLM_TEST_FLAG(( _l),  0)
+#define ldlm_set_lock_changed(_l)       LDLM_SET_FLAG((  _l),  0)
+#define ldlm_clear_lock_changed(_l)     LDLM_CLEAR_FLAG((_l),  0)

......
......

 &lt;span class=&quot;code-comment&quot;&gt;/** test &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; ldlm_lock flag bit set */&lt;/span&gt;
-#define LDLM_TEST_FLAG(_l, _b)    (((_l)-&amp;gt;l_flags &amp;amp; (_b)) != 0)
+#define LDLM_TEST_FLAG(_l, _b)    test_bit(_b, (&lt;span class=&quot;code-keyword&quot;&gt;volatile&lt;/span&gt; unsigned &lt;span class=&quot;code-object&quot;&gt;long&lt;/span&gt; *)&amp;amp;(_l)-&amp;gt;l_flags)
 
 &lt;span class=&quot;code-comment&quot;&gt;/** multi-bit test: are any of mask bits set? */&lt;/span&gt;
 #define LDLM_HAVE_MASK(_l, _m)    (((_l)-&amp;gt;l_flags &amp;amp; LDLM_FL_##_m##_MASK) != 0)
 
 &lt;span class=&quot;code-comment&quot;&gt;/** set a ldlm_lock flag bit */&lt;/span&gt;
-#define LDLM_SET_FLAG(_l, _b)     ((_l)-&amp;gt;l_flags |= (_b))
+#define LDLM_SET_FLAG(_l, _b)     set_bit(_b, (&lt;span class=&quot;code-keyword&quot;&gt;volatile&lt;/span&gt; unsigned &lt;span class=&quot;code-object&quot;&gt;long&lt;/span&gt; *)&amp;amp;(_l)-&amp;gt;l_flags)
 
 &lt;span class=&quot;code-comment&quot;&gt;/** clear a ldlm_lock flag bit */&lt;/span&gt;
-#define LDLM_CLEAR_FLAG(_l, _b)   ((_l)-&amp;gt;l_flags &amp;amp;= ~(_b))
+#define LDLM_CLEAR_FLAG(_l, _b)   clear_bit(_b, (&lt;span class=&quot;code-keyword&quot;&gt;volatile&lt;/span&gt; unsigned &lt;span class=&quot;code-object&quot;&gt;long&lt;/span&gt; *)&amp;amp;(_l)-&amp;gt;l_flags)
 
 &lt;span class=&quot;code-comment&quot;&gt;/** @} subgroup */&lt;/span&gt;
 &lt;span class=&quot;code-comment&quot;&gt;/** @} group */&lt;/span&gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt; </comment>
                            <comment id="76041" author="adilger" created="Sat, 1 Feb 2014 08:57:46 +0000"  >&lt;p&gt;Jinshan, according to the comment, the flags should be protected by lr_lock. Are there flag changes not protected by this lock?&lt;/p&gt;

&lt;p&gt;Also, I have some concern that using set_bit() will not set the same bits as the Lustre network protocol on all architectures. Do you know if there is a specific bit ordering used in all cases?&lt;/p&gt;</comment>
                            <comment id="76043" author="jay" created="Sat, 1 Feb 2014 19:16:09 +0000"  >&lt;p&gt;The flags should be protected by ldlm_lock::l_lock. When I saw this issue, my first reaction was to check if there are unprotected writes to l_flags and the only place is:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;/**
 * Removes LDLM lock \a lock from LRU. Assumes LRU is already locked.
 */
&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; ldlm_lock_remove_from_lru_nolock(struct ldlm_lock *lock)
{
        &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; rc = 0;
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (!cfs_list_empty(&amp;amp;lock-&amp;gt;l_lru)) {
                struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);

                LASSERT(lock-&amp;gt;l_resource-&amp;gt;lr_type != LDLM_FLOCK);
                cfs_list_del_init(&amp;amp;lock-&amp;gt;l_lru);
                ldlm_clear_skipped(lock);
                LASSERT(ns-&amp;gt;ns_nr_unused &amp;gt; 0);
                ns-&amp;gt;ns_nr_unused--;
                rc = 1;
        }
        &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; rc;
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;However, it was protected by ns_lock and if the lock is being destroyed, it must have been taken out of LRU list.&lt;/p&gt;</comment>
                            <comment id="76048" author="lixi" created="Sun, 2 Feb 2014 01:23:59 +0000"  >&lt;p&gt;Hi Jinshan,&lt;/p&gt;

&lt;p&gt;According to your test results, this problem really looks like a race problem. Would you please try following patch to check that the lock is always held when clearing/setting the flag?&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/#/c/8772/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/8772/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="76052" author="jay" created="Sun, 2 Feb 2014 05:28:30 +0000"  >&lt;p&gt;Hi Li Xi,&lt;/p&gt;

&lt;p&gt;That looks true. Somehow I misread the code. The destroyed flag was set on the dlm lock before it was taken out of the LRU, so ldlm_clear_skipped() in ldlm_lock_remove_from_lru_nolock() is indeed the root cause of this problem.&lt;/p&gt;

&lt;p&gt;Your patch will cause a deadlock because we should take the res lock first and then ns_lock. My idea to fix this problem is to move ldlm_clear_skipped() out of ldlm_lock_remove_from_lru_nolock() into ldlm_lock_add_to_lru_nolock(). I think this will fix the problem as well.&lt;/p&gt;
</comment>
                            <comment id="76059" author="lixi" created="Sun, 2 Feb 2014 16:15:59 +0000"  >&lt;p&gt;Ah, yeah, the right lock order is lock-&amp;gt;l_lock, res-&amp;gt;lr_lock, ns-&amp;gt;ns_lock. Thanks for pointing that error out. &lt;/p&gt;

&lt;p&gt;We are able to reproduce the problem steadily too. If you could push a patch, we can check whether it helps. I feel that there is more than one place where lock-&amp;gt;l_flags is changed without the protection of lock-&amp;gt;l_lock.&lt;/p&gt;</comment>
                            <comment id="76845" author="paf" created="Wed, 12 Feb 2014 16:37:06 +0000"  >&lt;p&gt;Li, Jinshan - Are either of you able to share how you&apos;re reproducing this?&lt;/p&gt;</comment>
                            <comment id="76847" author="jay" created="Wed, 12 Feb 2014 16:51:56 +0000"  >&lt;p&gt;The root cause of this issue is pretty clear.&lt;/p&gt;

&lt;p&gt;Bobijam, can you please create a patch for this?&lt;/p&gt;</comment>
                            <comment id="76939" author="bobijam" created="Thu, 13 Feb 2014 05:18:17 +0000"  >&lt;p&gt;updated patch &lt;a href=&quot;http://review.whamcloud.com/#/c/8772/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/8772/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="77586" author="jlevi" created="Fri, 21 Feb 2014 13:23:10 +0000"  >&lt;p&gt;Patch landed to Master. Please reopen ticket if more work is needed.&lt;/p&gt;</comment>
                            <comment id="77596" author="bogl" created="Fri, 21 Feb 2014 14:45:05 +0000"  >&lt;p&gt;backport to b2_5:&lt;br/&gt;
&lt;a href=&quot;http://review.whamcloud.com/9346&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/9346&lt;/a&gt;&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="14021" name="LU-4268-140123-log.tar.gz" size="1834987" author="paf" created="Mon, 27 Jan 2014 19:34:44 +0000"/>
                            <attachment id="13973" name="analysis.txt" size="342441" author="ihara" created="Tue, 14 Jan 2014 05:43:31 +0000"/>
                            <attachment id="13960" name="iwc.console.gz" size="29759" author="cliffw" created="Fri, 3 Jan 2014 18:01:44 +0000"/>
                            <attachment id="13957" name="iwc106.sysrq.txt" size="383805" author="cliffw" created="Fri, 3 Jan 2014 17:59:15 +0000"/>
                            <attachment id="13955" name="iwc106.sysrq.txt" size="383805" author="cliffw" created="Thu, 2 Jan 2014 18:04:56 +0000"/>
                            <attachment id="13958" name="iwc106.sysrq.txt.gz" size="29759" author="cliffw" created="Fri, 3 Jan 2014 18:00:02 +0000"/>
                            <attachment id="13959" name="iwc106.sysrq.txt.gz" size="29759" author="cliffw" created="Fri, 3 Jan 2014 18:00:02 +0000"/>
                            <attachment id="13841" name="iwc48.lctl.log.txt.gz" size="237" author="cliffw" created="Mon, 18 Nov 2013 19:17:58 +0000"/>
                            <attachment id="14044" name="log.bz2" size="537422" author="jay" created="Fri, 31 Jan 2014 22:52:00 +0000"/>
                            <attachment id="13921" name="vmcore-dmesg.txt" size="80273" author="ihara" created="Mon, 16 Dec 2013 13:21:32 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzw9mv:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>11723</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>