<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:00:20 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-13328] sanityn test 35 is broken</title>
                <link>https://jira.whamcloud.com/browse/LU-13328</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;It looks like the race condition is rarely hit. Here&apos;s the typical run:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[  751.745601] Lustre: DEBUG MARKER: == sanityn test 35: -EINTR cp_ast vs. bl_ast race does not evict client ============================== 04:48:12 (1583315292)
[  752.624986] Lustre: DEBUG MARKER: Race attempt 0
[  754.244032] Lustre: DEBUG MARKER: Wait for 16439 16445 for 60 sec...
[  759.855738] Lustre: lustre-OST0001-osc-ffff8800a08ce800: disconnect after 21s idle
[  816.516229] Lustre: DEBUG MARKER: == sanityn test 36: handle ESTALE/open-unlink correctly 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;But sometimes it does hit, and in that case the thread hangs and remains hung forever, making cleanup time out.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[  699.235637] Lustre: DEBUG MARKER: == sanityn test 35: -EINTR cp_ast vs. bl_ast race does not evict client ============================== 17:30:03 (1583274603)
[  700.065742] Lustre: DEBUG MARKER: Race attempt 0
[  701.154904] Lustre: *** cfs_fail_loc=317, val=2147484440***
[  701.155724] LustreError: 17592:0:(libcfs_fail.h:169:cfs_race()) cfs_race id 318 sleeping
[  701.579880] Lustre: DEBUG MARKER: Wait for 17549 17565 for 60 sec...
[  763.731071] Lustre: DEBUG MARKER: == sanityn test 36: handle ESTALE/open-unlink correctly 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;See how the race was hit, but the sleeping thread never actually woke.&lt;/p&gt;

&lt;p&gt;We can confirm this by analyzing the core file:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;PID: 17592  TASK: ffff88009dfc8440  CPU: 2   COMMAND: &quot;ll_sa_17565&quot;
 #0 [ffff8800c25276e8] __schedule+0x3ee at ffffffff817d889e
 #1 [ffff8800c2527778] schedule+0x29 at ffffffff817d8ea9
 #2 [ffff8800c2527788] ldlm_lock_decref_internal+0x96e at ffffffffa05b09ce [ptlrpc]
 #3 [ffff8800c25277e8] failed_lock_cleanup+0x98 at ffffffffa05cb928 [ptlrpc]
 #4 [ffff8800c2527808] ldlm_cli_enqueue_fini+0x16c at ffffffffa05cddac [ptlrpc]
 #5 [ffff8800c25278b0] ldlm_cli_enqueue+0x441 at ffffffffa05d1ad1 [ptlrpc]

crash&amp;gt; l *0xffffffffa05b09ce
0xffffffffa05b09ce is in ldlm_lock_decref_internal (/home/green/git/lustre-release/libcfs/include/libcfs/libcfs_fail.h:170).
165		if (CFS_FAIL_PRECHECK(id)) {
166			if (unlikely(__cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET))) {
167				int rc;
168				cfs_race_state = 0;
169				CERROR(&quot;cfs_race id %x sleeping\n&quot;, id);
170				rc = wait_event_interruptible(cfs_race_waitq,
171							      cfs_race_state != 0);
172				CERROR(&quot;cfs_fail_race id %x awake: rc=%d\n&quot;, id, rc);
173			} else {
174				CERROR(&quot;cfs_fail_race id %x waking\n&quot;, id);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;so in addition to fixing this, I wonder if it makes sense to clear the active race with a very visible dmesg message?&lt;/p&gt;

&lt;p&gt;Here&apos;s an example of the failure case: &lt;a href=&quot;http://testing.linuxhacker.ru:3333/lustre-reports/7130/testresults/sanityn-ldiskfs-DNE-centos7_x86_64-centos7_x86_64/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://testing.linuxhacker.ru:3333/lustre-reports/7130/testresults/sanityn-ldiskfs-DNE-centos7_x86_64-centos7_x86_64/&lt;/a&gt;&lt;/p&gt;</description>
                <environment></environment>
        <key id="58269">LU-13328</key>
            <summary>sanityn test 35 is broken</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="wc-triage">WC Triage</assignee>
                                    <reporter username="green">Oleg Drokin</reporter>
                        <labels>
                    </labels>
                <created>Wed, 4 Mar 2020 22:36:07 +0000</created>
                <updated>Thu, 5 Mar 2020 00:23:14 +0000</updated>
                                            <version>Lustre 2.14.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>1</watches>
                                                                                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="58271">LU-13329</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00uuv:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>