<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:04:30 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-6928] Version mismatch during DNE replay</title>
                <link>https://jira.whamcloud.com/browse/LU-6928</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;During a 24-hour DNE failover test, one of the clients fails because of a version mismatch during replay.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Lustre: 7977:0:(client.c:2828:ptlrpc_replay_interpret()) @@@ Version mismatch during replay
  req@ffff8806698209c0 x1508081941826144/t21475879788(21475879788) o36-&amp;gt;lustre-MDT0001-mdc-ffff880821cfdc00@192.168.2.126@o2ib:12/10 lens 608/424 e 1 to 0 dl 1438262329 ref 2 fl Interpret:R/4/0 rc -75/-75
Lustre: 7977:0:(import.c:1301:completed_replay_interpret()) lustre-MDT0001-mdc-ffff880821cfdc00: version recovery fails, reconnecting
LustreError: 167-0: lustre-MDT0001-mdc-ffff880821cfdc00: This client was evicted by lustre-MDT0001; in progress operations using this service will fail.
LustreError: 9213:0:(vvp_io.c:1475:vvp_io_init()) lustre: refresh file layout [0x240002341:0x1d74:0x0] error -5.
LustreError: 29913:0:(lmv_obd.c:1332:lmv_fid_alloc()) Can&apos;t alloc new fid, rc -19
Lustre: lustre-MDT0001-mdc-ffff880821cfdc00: Connection restored to lustre-MDT0001 (at 192.168.2.126@o2ib)
Lustre: DEBUG MARKER: ==== Checking the clients loads BEFORE failover -- failure NOT OK ELAPSED=43433 DURATION=86400 PERIOD=1800
Lustre: DEBUG MARKER: Client load failed on node c01, rc=1
Lustre: DEBUG MARKER: Duration: 86400
Lustre: Unmounted lustre-client
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="31284">LU-6928</key>
            <summary>Version mismatch during DNE replay</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="3">Duplicate</resolution>
                                        <assignee username="niu">Niu Yawei</assignee>
                                    <reporter username="di.wang">Di Wang</reporter>
                        <labels>
                    </labels>
                <created>Thu, 30 Jul 2015 15:03:17 +0000</created>
                <updated>Thu, 27 Aug 2015 16:41:25 +0000</updated>
                            <resolved>Thu, 27 Aug 2015 16:41:25 +0000</resolved>
                                    <version>Lustre 2.8.0</version>
                                    <fixVersion>Lustre 2.8.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>5</watches>
                                                                            <comments>
                            <comment id="122733" author="jamesanunez" created="Thu, 30 Jul 2015 17:27:48 +0000"  >&lt;p&gt;Di - I&apos;m seeing a very similar error in replay-single test 48 in review-dne-part-2. Do you think this is the same issue? The logs are at: &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/8281843c-365f-11e5-830b-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/8281843c-365f-11e5-830b-5254006e85c2&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The client console shows:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;18:10:31:Lustre: DEBUG MARKER: == replay-single test 48: MDS-&amp;gt;OSC failure during precreate cleanup (2824) == 18:08:48 (1438193328)
18:10:31:Lustre: DEBUG MARKER: mcreate /mnt/lustre/fsa-$(hostname); rm /mnt/lustre/fsa-$(hostname)
18:10:31:Lustre: DEBUG MARKER: if [ -d /mnt/lustre2 ]; then mcreate /mnt/lustre2/fsa-$(hostname); rm /mnt/lustre2/fsa-$(hostname); fi
18:10:31:Lustre: DEBUG MARKER: local REPLAY BARRIER on lustre-MDT0000
18:10:31:LustreError: 7733:0:(client.c:2816:ptlrpc_replay_interpret()) request replay timed out, restarting recovery
18:10:31:Lustre: 7733:0:(client.c:2828:ptlrpc_replay_interpret()) @@@ Version mismatch during replay
18:10:31:  req@ffff8800379e8c80 x1508053244555028/t197568495717(197568495717) o36-&amp;gt;lustre-MDT0000-mdc-ffff88007b65b000@10.1.4.62@tcp:12/10 lens 640/416 e 2 to 0 dl 1438193365 ref 1 fl Interpret:R/6/0 rc -75/-75
18:10:31:Lustre: 7733:0:(import.c:1306:completed_replay_interpret()) lustre-MDT0000-mdc-ffff88007b65b000: version recovery fails, reconnecting
18:10:31:LustreError: 167-0: lustre-MDT0000-mdc-ffff88007b65b000: This client was evicted by lustre-MDT0000; in progress operations using this service will fail.
18:10:31:LustreError: Skipped 1 previous similar message
18:10:31:LustreError: 13949:0:(lmv_obd.c:1473:lmv_statfs()) can&apos;t stat MDS #0 (lustre-MDT0000-mdc-ffff88007b65b000), error -5
18:10:31:LustreError: 13949:0:(llite_lib.c:1707:ll_statfs_internal()) md_statfs fails: rc = -5
18:10:31:Lustre: DEBUG MARKER: /usr/sbin/lctl mark  replay-single test_48: @@@@@@ FAIL: client_up failed 
18:10:31:Lustre: DEBUG MARKER: replay-single test_48: @@@@@@ FAIL: client_up failed
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="122774" author="di.wang" created="Thu, 30 Jul 2015 20:20:05 +0000"  >&lt;p&gt;Yes, it looks like the same issue. I believe it is a bug caused by the multiple slot patch. I will cook a patch.&lt;/p&gt;</comment>
                            <comment id="122778" author="di.wang" created="Thu, 30 Jul 2015 20:58:44 +0000"  >&lt;p&gt;Hmm, I think this is what happened&lt;/p&gt;

&lt;p&gt;During replay&lt;/p&gt;

&lt;p&gt;1. The client sends a replay request to MDS01, and MDS01 does not handle it in time. &lt;br/&gt;
2. The client does not get a reply, so the request expires and the client reconnects the import. While sending this CONNECT request, it first looks for the min XID, &lt;br/&gt;
but it only tries to find such an XID in the sending and replay lists, which is wrong, because the replay request (which just timed out) is not in these lists (see ptlrpc_check_set()). So usually, it will get the XID of this connect request, which is obviously larger than the XID of the replay request.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;        /* find the lowest unreplied XID */
        list_for_each(tmp, &amp;amp;imp-&amp;gt;imp_delayed_list) {
                struct ptlrpc_request *r;
                r = list_entry(tmp, struct ptlrpc_request, rq_list);
                if (r-&amp;gt;rq_xid &amp;lt; min_xid)
                        min_xid = r-&amp;gt;rq_xid;
        }
        list_for_each(tmp, &amp;amp;imp-&amp;gt;imp_sending_list) {
                struct ptlrpc_request *r;
                r = list_entry(tmp, struct ptlrpc_request, rq_list);
                if (r-&amp;gt;rq_xid &amp;lt; min_xid)
                        min_xid = r-&amp;gt;rq_xid;
        }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Delete req from sending list after req failure.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;   /* Request already may be not on sending or delaying list. This
                 * may happen in the case of marking it erroneous for the case
                 * ptlrpc_import_delay_req(req, status) find it impossible to
                 * allow sending this rpc and returns *status != 0. */
                if (!list_empty(&amp;amp;req-&amp;gt;rq_list)) {
                        list_del_init(&amp;amp;req-&amp;gt;rq_list);
                        atomic_dec(&amp;amp;imp-&amp;gt;imp_inflight);
                }
                spin_unlock(&amp;amp;imp-&amp;gt;imp_lock);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;3. At this time the MDT handles that replay request (mentioned in step 1), which adds the lrd of this req to the reply_list.&lt;br/&gt;
4. Then the connect request arrives, and it has a bigger reply XID, which then deletes the lrd created in step 3.&lt;br/&gt;
5. After connecting, the client resends the replay request, but the server cannot identify it as an already received request, because the lrd was deleted in step 4. So the replay request is executed twice, which causes this Version mismatch issue. &lt;/p&gt;

&lt;p&gt;So the fix might be in step 2, i.e. we need to consider the expired request when finding this min XID, or we simply do not pack this min XID for the CONNECT request. &lt;/p&gt;

</comment>
                            <comment id="122790" author="gerrit" created="Thu, 30 Jul 2015 22:00:07 +0000"  >&lt;p&gt;wangdi (di.wang@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/15812&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/15812&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6928&quot; title=&quot;Version mismatch during DNE replay&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6928&quot;&gt;&lt;del&gt;LU-6928&lt;/del&gt;&lt;/a&gt; ptlrpc: Do not pack min XID for connect req&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: acb892354ac119dfe80edf60f5f2a2923cdcc980&lt;/p&gt;</comment>
                            <comment id="124194" author="di.wang" created="Fri, 14 Aug 2015 21:16:43 +0000"  >&lt;p&gt;Niu: could you please confirm if your patch can fix this problem? Thanks&lt;/p&gt;</comment>
                            <comment id="125338" author="niu" created="Thu, 27 Aug 2015 07:01:10 +0000"  >&lt;p&gt;Yes, the patch for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5951&quot; title=&quot;sanity test_39k: mtime is lost on close&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5951&quot;&gt;&lt;del&gt;LU-5951&lt;/del&gt;&lt;/a&gt; (&lt;a href=&quot;http://review.whamcloud.com/#/c/15473/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/15473/&lt;/a&gt;) will fix this problem.&lt;/p&gt;</comment>
                            <comment id="125402" author="di.wang" created="Thu, 27 Aug 2015 16:41:25 +0000"  >&lt;p&gt;Duplicate of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5951&quot; title=&quot;sanity test_39k: mtime is lost on close&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5951&quot;&gt;&lt;del&gt;LU-5951&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="31033">LU-6831</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzxjcv:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>