<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:50:10 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-5287] (ldlm_lib.c:2253:target_queue_recovery_request()) ASSERTION( req-&gt;rq_export-&gt;exp_lock_replay_needed ) failed</title>
                <link>https://jira.whamcloud.com/browse/LU-5287</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Running racer with 2 clients MDSCOUNT=1 and 2.5.60-90-g37432a8 + &lt;a href=&quot;http://review.whamcloud.com/#/c/5936/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/5936/&lt;/a&gt; I see this when restarting a crashed OST with some clients still mounted.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[  230.089707] Lustre: Skipped 75 previous similar messages
[  231.775205] Lustre: 2151:0:(client.c:1924:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1404323793/real 1404323793]  req@ffff8801f78fc110 x1472540086110788/t0(0) o400-&amp;gt;lustre-OST0001-osc-MDT0000@0@lo:28/4 lens 224/224 e 1 to 1 dl 1404323837 ref 1 fl Rpc:X/c0/ffffffff rc 0/-1
[  237.775938] Lustre: lustre-OST0001: Denying connection for new client cc64d6dc-4180-e700-9f7e-ce147524a8f0 (at 0@lo), waiting for all 4 known clients (2 recovered, 1 in progress, and 1 evicted) to recover in 0:36
[  237.781858] Lustre: Skipped 3 previous similar messages
[  242.801254] LustreError: 2880:0:(ldlm_lib.c:2253:target_queue_recovery_request()) ASSERTION( req-&amp;gt;rq_export-&amp;gt;exp_lock_replay_needed ) failed: 
[  242.805102] LustreError: 2880:0:(ldlm_lib.c:2253:target_queue_recovery_request()) LBUG
[  242.807953] Pid: 2880, comm: ll_ost00_007
[  242.809274] 
[  242.809276] Call Trace:
[  242.810585]  [&amp;lt;ffffffffa02b98c5&amp;gt;] libcfs_debug_dumpstack+0x55/0x80 [libcfs]
[  242.812764]  [&amp;lt;ffffffffa02b9ec7&amp;gt;] lbug_with_loc+0x47/0xb0 [libcfs]
[  242.814689]  [&amp;lt;ffffffffa064ea0c&amp;gt;] target_queue_recovery_request+0xbac/0xc10 [ptlrpc]
[  242.816347]  [&amp;lt;ffffffffa06e122f&amp;gt;] tgt_handle_recovery+0x38f/0x520 [ptlrpc]
[  242.817666]  [&amp;lt;ffffffffa06e6b8d&amp;gt;] tgt_request_handle+0x18d/0xad0 [ptlrpc]
[  242.818987]  [&amp;lt;ffffffffa0699e31&amp;gt;] ptlrpc_main+0xcf1/0x1880 [ptlrpc]
[  242.820261]  [&amp;lt;ffffffffa0699140&amp;gt;] ? ptlrpc_main+0x0/0x1880 [ptlrpc]
[  242.821440]  [&amp;lt;ffffffff8109eab6&amp;gt;] kthread+0x96/0xa0
[  242.822360]  [&amp;lt;ffffffff8100c30a&amp;gt;] child_rip+0xa/0x20
[  242.823303]  [&amp;lt;ffffffff81554710&amp;gt;] ? _spin_unlock_irq+0x30/0x40
[  242.824390]  [&amp;lt;ffffffff8100bb10&amp;gt;] ? restore_args+0x0/0x30
[  242.825391]  [&amp;lt;ffffffff8109ea20&amp;gt;] ? kthread+0x0/0xa0
[  242.826315]  [&amp;lt;ffffffff8100c300&amp;gt;] ? child_rip+0x0/0x20
[  242.827283] 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="25409">LU-5287</key>
            <summary>(ldlm_lib.c:2253:target_queue_recovery_request()) ASSERTION( req-&gt;rq_export-&gt;exp_lock_replay_needed ) failed</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="niu">Niu Yawei</assignee>
                                    <reporter username="jhammond">John Hammond</reporter>
                        <labels>
                            <label>llnl</label>
                            <label>ost</label>
                    </labels>
                <created>Wed, 2 Jul 2014 18:06:06 +0000</created>
                <updated>Mon, 19 Sep 2016 02:51:03 +0000</updated>
                            <resolved>Thu, 6 Nov 2014 21:49:55 +0000</resolved>
                                    <version>Lustre 2.6.0</version>
                    <version>Lustre 2.7.0</version>
                                    <fixVersion>Lustre 2.7.0</fixVersion>
                    <fixVersion>Lustre 2.5.4</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>11</watches>
                                                                            <comments>
                            <comment id="92554" author="morrone" created="Tue, 26 Aug 2014 23:02:05 +0000"  >&lt;p&gt;We hit this same assertion with Lustre version 2.4.2-14.1chaos (see github.com/chaos/lustre) while the OSTs were in recovery.&lt;/p&gt;</comment>
                            <comment id="93360" author="pjones" created="Fri, 5 Sep 2014 18:30:11 +0000"  >&lt;p&gt;Niu&lt;/p&gt;

&lt;p&gt;Could you please advise on this issue?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="93447" author="utopiabound" created="Mon, 8 Sep 2014 17:30:38 +0000"  >&lt;p&gt;replay-single/73c on review-dne-part-2 on master:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/20dd03c0-365e-11e4-bd53-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/20dd03c0-365e-11e4-bd53-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="93527" author="niu" created="Tue, 9 Sep 2014 14:34:22 +0000"  >&lt;p&gt;The replay-single/73c test seems to have never really been tested, since the fail_loc OBD_FAIL_TGT_LAST_REPLAY has never been used (from the day it was introduced)... Not sure if this test can trigger the bug more easily once it&apos;s fixed. I&apos;m going to investigate it further.&lt;/p&gt;</comment>
                            <comment id="93773" author="niu" created="Thu, 11 Sep 2014 11:04:05 +0000"  >&lt;p&gt;Well, I found that there are two places which modify the exp_flags without holding exp_lock, which could result in concurrent exp_flags updates overwriting each other.&lt;/p&gt;

&lt;p&gt;patch for master: &lt;a href=&quot;http://review.whamcloud.com/11871&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/11871&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="95037" author="niu" created="Fri, 26 Sep 2014 12:10:26 +0000"  >&lt;p&gt;Andriy discovered another path to trigger this assertion. (see &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5651&quot; title=&quot;ASSERTION( req-&amp;gt;rq_export-&amp;gt;exp_lock_replay_needed ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5651&quot;&gt;&lt;del&gt;LU-5651&lt;/del&gt;&lt;/a&gt;), patch is being reviewed on: &lt;a href=&quot;http://review.whamcloud.com/#/c/12015/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/12015/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="95252" author="sarah" created="Mon, 29 Sep 2014 23:41:55 +0000"  >&lt;p&gt;Hit this bug on master branch build #2671&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/d986a3a2-472c-11e4-a9ec-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/d986a3a2-472c-11e4-a9ec-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="95445" author="morrone" created="Wed, 1 Oct 2014 18:24:29 +0000"  >&lt;p&gt;We just had 181 servers crash with this assertion when starting up 2.4.2-16chaos (see github.com/chaos/lustre) on the servers for the first time.  We need a patch for our branch as well.&lt;/p&gt;</comment>
                            <comment id="95447" author="bogl" created="Wed, 1 Oct 2014 18:45:20 +0000"  >&lt;p&gt;Christopher, which patch(es) do you need back ported to b2_4?  Don&apos;t want to do too much or too little.&lt;/p&gt;
</comment>
                            <comment id="95459" author="morrone" created="Wed, 1 Oct 2014 19:48:58 +0000"  >&lt;p&gt;All of the patches needed to fix this ticket&apos;s assertion.&lt;/p&gt;

&lt;p&gt;But actually, we are moving to 2.5 soon, so patches for b2_5 should be sufficient.&lt;/p&gt;</comment>
                            <comment id="95477" author="bogl" created="Wed, 1 Oct 2014 21:06:50 +0000"  >&lt;p&gt;backports to b2_5:&lt;br/&gt;
&lt;a href=&quot;http://review.whamcloud.com/#/c/12162&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/12162&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;http://review.whamcloud.com/#/c/12163&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/12163&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="98608" author="pjones" created="Thu, 6 Nov 2014 21:49:55 +0000"  >&lt;p&gt;Landed for 2.5.4 and 2.7&lt;/p&gt;</comment>
                            <comment id="165059" author="vinayakh" created="Wed, 7 Sep 2016 08:46:47 +0000"  >&lt;p&gt;Hello Niu,&lt;/p&gt;

&lt;p&gt;I am trying to understand &lt;a href=&quot;http://review.whamcloud.com/#/c/12162&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/12162&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;Can you please help me to understand how it affects &lt;b&gt;req-&amp;gt;rq_export-&amp;gt;exp_lock_replay_needed&lt;/b&gt; flag ?&lt;/p&gt;

&lt;p&gt;Thanks in advance,&lt;/p&gt;</comment>
                            <comment id="165085" author="niu" created="Wed, 7 Sep 2016 13:00:32 +0000"  >&lt;p&gt;All export flags share the same &quot;unsigned long&quot; and flag set operation &quot;export-&amp;gt;exp_xxx = 1&quot; isn&apos;t atomic (load the whole unsigned long, change a bit, write the whole unsigned long), so flag change operation needs to take lock. &lt;/p&gt;</comment>
                            <comment id="165264" author="vinayakh" created="Thu, 8 Sep 2016 06:47:09 +0000"  >&lt;p&gt;Thanks Niu.&lt;/p&gt;

&lt;p&gt;Please correct me if wrong.&lt;/p&gt;

&lt;blockquote&gt;&lt;p&gt;so flag change operation needs to take lock&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;If the operations are not concurrent then it does not matter whether the bit field change happens with the lock or without the lock. The whole unsigned long will not be affected if we change any of the bits. Am I right?&lt;/p&gt;

&lt;blockquote&gt;&lt;p&gt;load the whole unsigned long, change a bit, write the whole unsigned long&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;what is the real importance of lock here ?&lt;/p&gt;

&lt;p&gt;Thanks in advance,&lt;/p&gt;




</comment>
                            <comment id="166314" author="niu" created="Sun, 18 Sep 2016 06:57:10 +0000"  >&lt;p&gt;Right, it&apos;s safe if there isn&apos;t any concurrent changes.&lt;/p&gt;

&lt;p&gt;The importance of the lock here is that there could be concurrent changes. I didn&apos;t do a thorough retrospective of the code changes, but at first glance it looks like concurrency is possible. Even if there are no concurrent changes in the present code, I don&apos;t think we can assume that concurrent changes will never happen; taking the lock is the safe way to avoid nasty bugs.&lt;/p&gt;</comment>
                            <comment id="166336" author="vinayakh" created="Mon, 19 Sep 2016 02:51:03 +0000"  >&lt;p&gt;Thanks Niu.&lt;/p&gt;

&lt;p&gt;We have faced a similar issue on our setup. I will post updates from my further investigation here.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                            <outwardlinks description="duplicates">
                                        <issuelink>
            <issuekey id="26268">LU-5572</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                                        </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="26687">LU-5651</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzwqg7:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>14750</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>