<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:58:17 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-13089] ASSERTION( (((( lock))-&gt;l_flags &amp; (1ULL &lt;&lt; 50)) != 0) ) failed</title>
                <link>https://jira.whamcloud.com/browse/LU-13089</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;I hit this in b2_12-next testing, but there&apos;s nothing I see in there that appears related so I think this is just super rare race that&apos;s pretty similar in nature to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4269&quot; title=&quot;ldlm_lock_put()) ASSERTION( (((( lock))-&amp;gt;l_flags &amp;amp; (1ULL &amp;lt;&amp;lt; 50)) != 0) ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4269&quot;&gt;&lt;del&gt;LU-4269&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;except this time it&apos;s glimpse cb vs cancel cb race&lt;/p&gt;

&lt;p&gt;console gives us:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[19625.122592] LustreError: 8347:0:(ldlm_lock.c:213:ldlm_lock_put()) ASSERTION( (((( lock))-&amp;gt;l_flags &amp;amp; (1ULL &amp;lt;&amp;lt; 50)) != 0) ) failed: 
[19625.124370] LustreError: 8347:0:(ldlm_lock.c:213:ldlm_lock_put()) LBUG
[19625.124704] LustreError: 8367:0:(ldlm_lock.c:205:ldlm_lock_put()) ASSERTION( atomic_read(&amp;amp;lock-&amp;gt;l_refc) &amp;gt; 0 ) failed: 
[19625.124705] LustreError: 8367:0:(ldlm_lock.c:205:ldlm_lock_put()) LBUG
[19625.124706] Pid: 8367, comm: ldlm_bl_02 3.10.0-7.7-debug #1 SMP Wed Oct 30 09:43:00 EDT 2019
[19625.124707] Call Trace:
[19625.124739]  [&amp;lt;ffffffffa01f97dc&amp;gt;] libcfs_call_trace+0x8c/0xc0 [libcfs]
[19625.124744]  [&amp;lt;ffffffffa01f988c&amp;gt;] lbug_with_loc+0x4c/0xa0 [libcfs]
[19625.124796]  [&amp;lt;ffffffffa0638197&amp;gt;] ldlm_lock_put+0x557/0x730 [ptlrpc]
[19625.124821]  [&amp;lt;ffffffffa0639980&amp;gt;] ldlm_lock_destroy_nolock+0x50/0x110 [ptlrpc]
[19625.124839]  [&amp;lt;ffffffffa063febf&amp;gt;] ldlm_lock_cancel+0x6f/0x1f0 [ptlrpc]
[19625.124866]  [&amp;lt;ffffffffa0656a3a&amp;gt;] ldlm_cli_cancel_local+0xca/0x3f0 [ptlrpc]
[19625.124887]  [&amp;lt;ffffffffa065c767&amp;gt;] ldlm_cli_cancel+0x157/0x620 [ptlrpc]
[19625.124917]  [&amp;lt;ffffffffa08a8b1a&amp;gt;] osc_ldlm_blocking_ast+0x17a/0x3a0 [osc]
[19625.124949]  [&amp;lt;ffffffffa0668835&amp;gt;] ldlm_handle_bl_callback+0xf5/0x4f0 [ptlrpc]
[19625.124977]  [&amp;lt;ffffffffa06693e8&amp;gt;] ldlm_bl_thread_main+0x7b8/0x980 [ptlrpc]
[19625.125040]  [&amp;lt;ffffffff810b8254&amp;gt;] kthread+0xe4/0xf0
[19625.125053]  [&amp;lt;ffffffff817e0df7&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
[19625.125063]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
[19625.125067] Kernel panic - not syncing: LBUG
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;and the pid 8347 backtrace is&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt; #4 [ffff88003e313820] vt_console_print at ffffffff814dbe54
 #5 [ffff88003e313880] call_console_drivers.constprop.17 at ffffffff8108cf00
 #6 [ffff88003e3138a8] console_unlock at ffffffff8108e559
 #7 [ffff88003e3138e8] vprintk_emit at ffffffff8108e984
 #8 [ffff88003e313958] vprintk_default at ffffffff8108ed79
 #9 [ffff88003e313968] printk at ffffffff817c67d1
#10 [ffff88003e3139c8] cfs_print_to_console at ffffffffa01f958a [libcfs]
#11 [ffff88003e3139f8] libcfs_debug_vmsg2 at ffffffffa01ff9e3 [libcfs]
#12 [ffff88003e313b48] libcfs_debug_msg at ffffffffa0200077 [libcfs]
#13 [ffff88003e313ba8] lbug_with_loc at ffffffffa01f9866 [libcfs]
#14 [ffff88003e313bc8] ldlm_lock_put at ffffffffa0638266 [ptlrpc]
#15 [ffff88003e313be8] osc_ldlm_glimpse_ast at ffffffffa08a8f28 [osc]
#16 [ffff88003e313ca0] ldlm_callback_handler at ffffffffa066a0a8 [ptlrpc]
#17 [ffff88003e313d18] ldlm_callback_handler at ffffffffa066b3c7 [ptlrpc]
#18 [ffff88003e313d30] ptlrpc_server_handle_request at ffffffffa0699266 [ptlrpc]
#19 [ffff88003e313dd0] ptlrpc_main at ffffffffa069d261 [ptlrpc]
#20 [ffff88003e313ea8] kthread at ffffffff810b8254
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
</description>
                <environment></environment>
        <key id="57666">LU-13089</key>
            <summary>ASSERTION( (((( lock))-&gt;l_flags &amp; (1ULL &lt;&lt; 50)) != 0) ) failed</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="3">Duplicate</resolution>
                                        <assignee username="wc-triage">WC Triage</assignee>
                                    <reporter username="green">Oleg Drokin</reporter>
                        <labels>
                            <label>llnl</label>
                    </labels>
                <created>Thu, 19 Dec 2019 06:13:12 +0000</created>
                <updated>Fri, 21 Oct 2022 17:33:09 +0000</updated>
                            <resolved>Mon, 26 Oct 2020 13:36:50 +0000</resolved>
                                    <version>Lustre 2.12.4</version>
                                                        <due></due>
                            <votes>2</votes>
                                    <watches>9</watches>
                                                                            <comments>
                            <comment id="260138" author="green" created="Thu, 19 Dec 2019 06:14:08 +0000"  >&lt;p&gt;attached is lustre.txt that&apos;s lustre debug log extracted from the core file where you clearly can see the race unfolding.&lt;/p&gt;</comment>
                            <comment id="260174" author="bobijam" created="Fri, 20 Dec 2019 03:20:20 +0000"  >&lt;p&gt;This looks more like a lock reference-count issue than a race in setting the lock&apos;s flags.&lt;/p&gt;

&lt;p&gt;From the log, the lock was requested to be cancelled, and process 8367 handled it and was trying to flush data covered by it. The lock&apos;s reference account looks ok.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;428786 00010000:00010000:3.0:1576584703.467448:0:8367:0:(ldlm_lockd.c:1730:ldlm_handle_bl_callback()) ### client blocking AST callback handler ns: ?? lock: ffff880012bd5d80/0x232daeb61003014b lrc: 3/0,0 mode: PW/PW res: ?? rrc=?? type: ??? flags: 0x420000020000 nid: local remote: 0x232daeb610030278 expref: -99 pid: 22687 timeout: 0 lvb_type: 1
428787 00010000:00010000:3.0:1576584703.467450:0:8367:0:(ldlm_lockd.c:1761:ldlm_handle_bl_callback()) Lock ffff880012bd5d80 already unused, calling callback (ffffffffa08a89a0)
428788 00010000:00010000:3.0:1576584703.467452:0:8367:0:(ldlm_request.c:1268:ldlm_cli_cancel_local()) ### client-side cancel ns: ?? lock: ffff880012bd5d80/0x232daeb61003014b lrc: 4/0,0 mode: PW/PW res:?? rrc=?? type: ??? flags: 0x428400020000 nid: local remote: 0x232daeb610030278 expref: -99 pid: 22687 timeout: 0 lvb_type: 1
428790 00000008:00000020:3.0:1576584703.467457:0:8367:0:(osc_cache.c:3008:osc_cache_wait_range()) obj ffff88002b179e60 ready 0|-|- wr 0|-|- rd 0|- sync file range.
428791 00000008:00000020:3.0:1576584703.467459:0:8367:0:(osc_cache.c:3140:osc_cache_writeback_range()) obj ffff88002b179e60 ready 0|-|- wr 0|-|- rd 0|- pageout [224, 479], 0.                            
428792 00000008:00000020:3.0:1576584703.467461:0:8367:0:(osc_lock.c:375:osc_lock_flush()) object ffff88002b179e60: [224 -&amp;gt; 479] 0 pages were written back.
428794 00000080:00200000:3.0:1576584703.467465:0:8367:0:(vvp_io.c:1469:vvp_io_init()) [0x200000402:0x70c0:0x0] ignore/verify layout 1/0, layout version 0 restore needed 0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Meanwhile process 8347, handling the glimpse callback, tripped over the lock before the flush finished; its ref count had strangely become 0 by then, and the lock certainly had not had its destroy flag set yet.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;428898 00010000:00010000:1.0:1576584703.468909:0:8347:0:(ldlm_lock.c:210:ldlm_lock_put()) ### final lock_put on destroyed lock, freeing it. ns: lustre-OST0001-osc-ffff88003f2f8800 lock: ffff880012bd5d80/0x232daeb61003014b lrc: 0/0,0 mode: PW/PW res: [0x12cb:0x0:0x0].0x0 rrc: 5 type: EXT [917504-&amp;gt;1966079] (req 917504-&amp;gt;983039) flags: 0x429400020000 nid: local remote: 0x232daeb610030278 expref: -99 pid: 22687 timeout: 0 lvb_type: 1
428899 00010000:00040000:1.0:1576584703.468914:0:8347:0:(ldlm_lock.c:213:ldlm_lock_put()) ASSERTION( (((( lock))-&amp;gt;l_flags &amp;amp; (1ULL &amp;lt;&amp;lt; 50)) != 0) ) failed: 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;process 8367 picked up the flush IO and found out the lock&apos;s erroneous ref count.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;428913 00000080:00200000:3.0:1576584703.471016:0:8367:0:(vvp_io.c:312:vvp_io_fini()) [0x200000402:0x70c0:0x0] ignore/verify layout 1/0, layout version 0 need write layout 0, restore needed 0
428914 00000008:00000020:3.0:1576584703.471021:0:8367:0:(osc_object.c:183:osc_attr_update()) set kms from 1966080to 0 
428915 00010000:00040000:3.0:1576584703.471027:0:8367:0:(ldlm_lock.c:205:ldlm_lock_put()) ASSERTION( atomic_read(&amp;amp;lock-&amp;gt;l_refc) &amp;gt; 0 ) failed: 
428916 00010000:00040000:3.0:1576584703.471030:0:8367:0:(ldlm_lock.c:205:ldlm_lock_put()) LBUG
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="275801" author="lflis" created="Mon, 20 Jul 2020 19:22:43 +0000"  >&lt;p&gt;CYFRONET here,&lt;/p&gt;

&lt;p&gt;Since migrating to 2.12 we have noticed that some types of HPC jobs are crashing the nodes (a few crashes a month / 2200 compute nodes). Until today we were not able to reproduce the issue - now we believe we have a workload which does it every time it&apos;s run.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;2195888.180647&amp;#93;&lt;/span&gt; LustreError: 3966:0:(ldlm_lock.c:213:ldlm_lock_put()) ASSERTION( (((( lock))-&amp;gt;l_flags &amp;amp; (1ULL &amp;lt;&amp;lt; 50)) != 0) ) failed:&lt;/p&gt;

&lt;p&gt;Client version is 2.12.4&lt;/p&gt;

&lt;p&gt;If you need a debug log for any subsystem or other debug data, please let us know. We&apos;d be happy to provide more info which could help with diagnosis.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;Best Regards&lt;/p&gt;

&lt;p&gt;&amp;#8211;&lt;/p&gt;

&lt;p&gt;Lukasz Flis&lt;/p&gt;</comment>
                            <comment id="277399" author="ofaaland" created="Thu, 13 Aug 2020 01:25:43 +0000"  >&lt;p&gt;We are seeing this regularly at LLNL with Lustre&#160;2.12.4_5.chaos.&lt;/p&gt;</comment>
                            <comment id="278769" author="gerrit" created="Thu, 3 Sep 2020 16:20:22 +0000"  >&lt;p&gt;Bobi Jam (bobijam@hotmail.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/39819&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/39819&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13089&quot; title=&quot;ASSERTION( (((( lock))-&amp;gt;l_flags &amp;amp; (1ULL &amp;lt;&amp;lt; 50)) != 0) ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13089&quot;&gt;&lt;del&gt;LU-13089&lt;/del&gt;&lt;/a&gt; osc: revert &quot;glimpse - search for active lock&quot;&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 37205c60bb2d99363a0c9dbf29d8f4fd684b6fab&lt;/p&gt;</comment>
                            <comment id="279333" author="lflis" created="Fri, 11 Sep 2020 13:52:49 +0000"  >&lt;p&gt;Applying the &lt;a href=&quot;https://review.whamcloud.com/39819&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/39819&lt;/a&gt;&#160;fixed the problem in CYFRONET&lt;br/&gt;
 Before going to production with this fix we need to test if &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11670&quot; title=&quot;Incorrect size when using lockahead&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11670&quot;&gt;&lt;del&gt;LU-11670&lt;/del&gt;&lt;/a&gt;&#160;is a problem for us.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="279469" author="kim.sebo" created="Sun, 13 Sep 2020 23:49:36 +0000"  >&lt;p&gt;We at ANU/NCI are also seeing this assert, ~10 times per month on 3000 clients. Clients are Centos 8 / 2.12.5 + some backports.&lt;/p&gt;</comment>
                            <comment id="282509" author="gerrit" created="Sat, 17 Oct 2020 18:31:55 +0000"  >&lt;p&gt;Bobi Jam (bobijam@hotmail.com) uploaded a patch port: &lt;a href=&quot;https://review.whamcloud.com/#/c/40399&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/40399&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;The search_itree was glitched and messed up data-&amp;gt;lmd_lock.&lt;/p&gt;</comment>
                            <comment id="282515" author="gerrit" created="Sun, 18 Oct 2020 13:16:14 +0000"  >&lt;p&gt;Bobi Jam (bobijam@hotmail.com) abandoned the patch: &lt;a href=&quot;https://review.whamcloud.com/40286&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/40286&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="283227" author="pjones" created="Mon, 26 Oct 2020 13:36:50 +0000"  >&lt;p&gt;Believed to be a duplicate of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11719&quot; title=&quot;Refactor search_itree&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11719&quot;&gt;&lt;del&gt;LU-11719&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="54037">LU-11670</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="60416">LU-13908</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="34042" name="lustre.txt" size="81331814" author="green" created="Thu, 19 Dec 2019 06:13:44 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00r73:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>