<?xml version="1.0" encoding="UTF-8"?>
<!--
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:53:57 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-12592] sanity 134a hit panic time to time.</title>
                <link>https://jira.whamcloud.com/browse/LU-12592</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;It looks patch aa82cc8361 (&quot;obdclass: put all service&apos;s env on the list&quot;) introduce one more regression.&lt;br/&gt;
sanity test hit a panic sometimes. &lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
[84431.019155] Lustre: DEBUG MARKER: == sanity test 134a: Server reclaims locks when reaching lock_reclaim_threshold ====================== 12:42:06
 (1564134126)
[84431.770714] Lustre: *** cfs_fail_loc=327, val=0***
[84431.787957] LustreError: 10728:0:(ofd_internal.h:410:ofd_info()) ASSERTION( info ) failed:
[84431.788969] LustreError: 10728:0:(ofd_internal.h:410:ofd_info()) LBUG
[84431.790039] Pid: 10728, comm: mdt00_005 3.10.0-neo7.4.x86_64 #0 SMP Thu Nov 15 06:30:59 EST 2018
[84431.791221] Call Trace:
[84431.792331]  [&amp;lt;ffffffff810434f2&amp;gt;] save_stack_trace_tsk+0x22/0x40
[84431.793382]  [&amp;lt;ffffffffc06c47ec&amp;gt;] libcfs_call_trace+0x8c/0xc0 [libcfs]
[84431.794333]  [&amp;lt;ffffffffc06c489c&amp;gt;] lbug_with_loc+0x4c/0xa0 [libcfs]
[84431.795260]  [&amp;lt;ffffffffc1394e71&amp;gt;] ofd_exit+0x0/0x18f [ofd]
[84431.796169]  [&amp;lt;ffffffffc1393a3b&amp;gt;] ofd_lvbo_update+0xd5b/0xe60 [ofd]
[84431.797248]  [&amp;lt;ffffffffc0b1d6b5&amp;gt;] ldlm_handle_ast_error+0x475/0x860 [ptlrpc]
[84431.798655]  [&amp;lt;ffffffffc0b1f32a&amp;gt;] ldlm_cb_interpret+0x19a/0x750 [ptlrpc]
[84431.800193]  [&amp;lt;ffffffffc0b3a954&amp;gt;] ptlrpc_check_set.part.22+0x494/0x1e90 [ptlrpc]
[84431.801494]  [&amp;lt;ffffffffc0b3c3ab&amp;gt;] ptlrpc_check_set+0x5b/0xe0 [ptlrpc]
[84431.802831]  [&amp;lt;ffffffffc0b3c774&amp;gt;] ptlrpc_set_wait+0x344/0x7c0 [ptlrpc]
[84431.803902]  [&amp;lt;ffffffffc0af8475&amp;gt;] ldlm_run_ast_work+0xd5/0x3a0 [ptlrpc]
[84431.805074]  [&amp;lt;ffffffffc0b31235&amp;gt;] ldlm_reclaim_full+0x425/0x7a0 [ptlrpc]
[84431.806060]  [&amp;lt;ffffffffc0b2243b&amp;gt;] ldlm_handle_enqueue0+0x13b/0x1650 [ptlrpc]
[84431.807019]  [&amp;lt;ffffffffc0bae3a2&amp;gt;] tgt_enqueue+0x62/0x210 [ptlrpc]
[84431.807903]  [&amp;lt;ffffffffc0bb70a8&amp;gt;] tgt_request_handle+0x998/0x1610 [ptlrpc]
[84431.808731]  [&amp;lt;ffffffffc0b56966&amp;gt;] ptlrpc_server_handle_request+0x266/0xb30 [ptlrpc]
[84431.809561]  [&amp;lt;ffffffffc0b5afc0&amp;gt;] ptlrpc_main+0xd20/0x1cf0 [ptlrpc]
[84431.810355]  [&amp;lt;ffffffff810ce2df&amp;gt;] kthread+0xef/0x100
[84431.811077]  [&amp;lt;ffffffff8178bedd&amp;gt;] ret_from_fork+0x5d/0xb0
[84431.811755]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="56497">LU-12592</key>
            <summary>sanity 134a hit panic time to time.</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="wc-triage">WC Triage</assignee>
                                    <reporter username="shadow">Alexey Lyashkov</reporter>
                        <labels>
                    </labels>
                <created>Fri, 26 Jul 2019 10:37:41 +0000</created>
                <updated>Mon, 29 Jul 2019 07:34:01 +0000</updated>
                                            <version>Lustre 2.13.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>4</watches>
                                                                            <comments>
                            <comment id="252067" author="bzzz" created="Fri, 26 Jul 2019 10:42:58 +0000"  >&lt;p&gt;it would be good to have logs.&lt;br/&gt;
the very first &quot;confusion&quot; is that mdt thread is handling extents locks (ofd_lvbo_update())&lt;/p&gt;

&lt;p&gt;there is already ticket for this issue - &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12570&quot; title=&quot;sanity test 134a crash with SSK in use&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12570&quot;&gt;&lt;del&gt;LU-12570&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="252068" author="shadow" created="Fri, 26 Jul 2019 11:04:12 +0000"  >&lt;p&gt;I have several crash dumps (two ?) but both with default sanity debug logs.&lt;br/&gt;
based on logs, thread started to cancel a sort of extent locks.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
00000100:00100000:0.0:1564134127.527799:0:10728:0:(service.c:2227:ptlrpc_server_handle_request()) Handling RPC req@ffff8800369e2c00 pname:cluuid+ref
:pid:xid:nid:opc:job mdt00_005:29092c9d-0c24-a754-6f34-44253c4359c6+3009:9120:x1640112644356192:12345-0@lo:101:jobid_name=touch.0
00010000:00010000:0.0:1564134127.527807:0:10728:0:(ldlm_lockd.c:1186:ldlm_handle_enqueue0()) ### server-side enqueue handler START
00010000:02000000:0.0:1564134127.527809:0:10728:0:(libcfs_fail.h:92:cfs_fail_check_set()) *** cfs_fail_loc=327, val=0***
00010000:00010000:0.0:1564134127.531157:0:10728:0:(ldlm_reclaim.c:214:ldlm_reclaim_res()) NS(filter-lustre-OST0000_UUID): 512 locks to be reclaimed, found 512/512 locks.
00000100:00100000:0.0:1564134127.544998:0:10728:0:(client.c:1675:ptlrpc_send_new_req()) Sending RPC req@ffff88010fc91b80 pname:cluuid:pid:xid:nid:opc:job mdt00_005:lustre-OST0000_UUID:10728:1640112644364384:0@lo:104:(&lt;span class=&quot;code-keyword&quot;&gt;null&lt;/span&gt;)
00000100:00100000:0.0:1564134127.545004:0:10728:0:(events.c:354:request_in_callback()) peer: 12345-0@lo (source: 12345-0@lo)
00000100:00100000:0.0:1564134127.545036:0:10728:0:(client.c:2408:ptlrpc_set_wait()) set ffff880008306300 going to sleep &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 6 seconds
00010000:00010000:0.0:1564134127.545045:0:10728:0:(ldlm_lockd.c:696:ldlm_handle_ast_error()) ### client (nid 0@lo) returned -22 from blocking AST (req@ffff8800369e3180 x1640112644356208) - normal race ns: filter-lustre-OST0000_UUID lock: ffff8800506a1800/0xe918d2a075fe22a8 lrc: 4/0,0 mode: PR/PR res: [0xc279:0x0:0x0].0x0 rrc: 2 type: EXT [0-&amp;gt;18446744073709551615] (req 0-&amp;gt;18446744073709551615) flags: 0x60000400000020 nid: 0@lo remote: 0xe918d2a075fe229a expref: 583 pid: 14303 timeout: 84529 lvb_type: 1
00002000:00040000:0.0:1564134127.545051:0:10728:0:(ofd_internal.h:410:ofd_info()) ASSERTION( info ) failed:
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
                thr ffff8800763dde00 - pid 10728
                        req ffff8800369e2c00
request ffff8800369e2c00
from 0.0.0.0@9:0
to 0.0.0.0@9:0
xid 1640112644356192:0
transno 0
time 18446612134297984512:1564134133:0
flags 3
lens 184 104 8 136 0 0 2 240 17 37 0
ptlrpc body(v3) ffff880137d56598
        type 4711
        opc 101 LDLM_ENQUEUE
Intent: OPEN+CREATE
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;So an open+create request triggered ldlm_reclaim_full, which also flushes extent locks, but there is no DT_THREAD env for the mdt thread.&lt;/p&gt;

&lt;p&gt;It looks like this bug can be triggered even in a single-node setup.&lt;/p&gt;</comment>
                            <comment id="252069" author="shadow" created="Fri, 26 Jul 2019 11:17:11 +0000"  >&lt;p&gt;so regression from&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
commit fe60e0135ee2334440247cde167b707b223cf11d
Author: Niu Yawei &amp;lt;yawei.niu@intel.com&amp;gt;
Date:   Thu May 21 11:07:54 2015 -0400

    LU-6529 ldlm: reclaim granted locks defensively
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I don&apos;t understand this patch at all. server lock volume is created for same purpose, so reduce a SLV will reduce a ldlm locks memory consumption. So my suggestion - just revert this patch and use SLV for same purpose.&lt;/p&gt;</comment>
                            <comment id="252091" author="green" created="Fri, 26 Jul 2019 20:01:50 +0000"  >&lt;p&gt;same crashes with SSK at 100% rate&lt;/p&gt;</comment>
                            <comment id="252160" author="shadow" created="Mon, 29 Jul 2019 07:34:01 +0000"  >&lt;p&gt;Based on discussion with Oleg, it looks like two ways to fix this exist.&lt;br/&gt;
1) fast - just limit the flush to the same ldlm namespace type the request originated from.&lt;br/&gt;
2) long - but better.&lt;/p&gt;

&lt;p&gt;The current patch needs to interact with LRU resize so that it does not allow setting the hard limit above the SLV; otherwise an absurd situation exists:&lt;br/&gt;
this tunable can be set lower than the LRU resize limit, which blocks LRU resize from working at all.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="56459">LU-12570</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00k7r:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>