<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:54:21 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-5768] replay-single test_52: Restart of mds1 failed: EIO</title>
                <link>https://jira.whamcloud.com/browse/LU-5768</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This issue was created by maloo for Li Wei &amp;lt;liwei@whamcloud.com&amp;gt;&lt;/p&gt;

&lt;p&gt;This issue relates to the following test suite run: &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/a370c858-56b6-11e4-851f-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/a370c858-56b6-11e4-851f-5254006e85c2&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;The sub-test test_52 failed with the following error:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Restart of mds1 failed!
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;== replay-single test 52: time out lock replay (3764) == 00:55:51 (1413593751)
CMD: shadow-19vm12 sync; sync; sync
Filesystem           1K-blocks    Used Available Use% Mounted on
shadow-19vm12@tcp:/lustre
                      22169560 1069324  19973984   6% /mnt/lustre
CMD: shadow-19vm10.shadow.whamcloud.com,shadow-19vm9 mcreate /mnt/lustre/fsa-\$(hostname); rm /mnt/lustre/fsa-\$(hostname)
CMD: shadow-19vm10.shadow.whamcloud.com,shadow-19vm9 if [ -d /mnt/lustre2 ]; then mcreate /mnt/lustre2/fsa-\$(hostname); rm /mnt/lustre2/fsa-\$(hostname); fi
CMD: shadow-19vm12 /usr/sbin/lctl --device lustre-MDT0000 notransno
CMD: shadow-19vm12 /usr/sbin/lctl --device lustre-MDT0000 readonly
CMD: shadow-19vm12 /usr/sbin/lctl mark mds1 REPLAY BARRIER on lustre-MDT0000
CMD: shadow-19vm12 lctl set_param fail_loc=0x8000030c
fail_loc=0x8000030c
Failing mds1 on shadow-19vm12
CMD: shadow-19vm12 grep -c /mnt/mds1&apos; &apos; /proc/mounts
Stopping /mnt/mds1 (opts:) on shadow-19vm12
CMD: shadow-19vm12 umount -d /mnt/mds1
CMD: shadow-19vm12 lsmod | grep lnet &amp;gt; /dev/null &amp;amp;&amp;amp; lctl dl | grep &apos; ST &apos;
reboot facets: mds1
Failover mds1 to shadow-19vm12
00:56:11 (1413593771) waiting for shadow-19vm12 network 900 secs ...
00:56:11 (1413593771) network interface is UP
CMD: shadow-19vm12 hostname
mount facets: mds1
CMD: shadow-19vm12 test -b /dev/lvm-Role_MDS/P1
Starting mds1:   /dev/lvm-Role_MDS/P1 /mnt/mds1
CMD: shadow-19vm12 mkdir -p /mnt/mds1; mount -t lustre   		                   /dev/lvm-Role_MDS/P1 /mnt/mds1
shadow-19vm12: mount.lustre: mount /dev/mapper/lvm--Role_MDS-P1 at /mnt/mds1 failed: Input/output error
shadow-19vm12: Is the MGS running?
Start of /dev/lvm-Role_MDS/P1 on mds1 failed 5
 replay-single test_52: @@@@@@ FAIL: Restart of mds1 failed! 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Info required for matching: replay-single 52&lt;/p&gt;</description>
                <environment></environment>
        <key id="27096">LU-5768</key>
            <summary>replay-single test_52: Restart of mds1 failed: EIO</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="liwei">Li Wei</assignee>
                                    <reporter username="maloo">Maloo</reporter>
                        <labels>
                    </labels>
                <created>Mon, 20 Oct 2014 01:26:41 +0000</created>
                <updated>Tue, 4 Nov 2014 03:56:41 +0000</updated>
                            <resolved>Tue, 4 Nov 2014 03:56:30 +0000</resolved>
                                    <version>Lustre 2.5.4</version>
                                    <fixVersion>Lustre 2.5.4</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>7</watches>
                                                                            <comments>
                            <comment id="96787" author="liwei" created="Tue, 21 Oct 2014 03:33:46 +0000"  >&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000100:00100000:1.0:1413801598.098879:0:27934:0:(service.c:1874:ptlrpc_server_handle_req_in()) got req x1482469562502264
00000100:00100000:1.0:1413801598.098950:0:27934:0:(nrs_fifo.c:182:nrs_fifo_req_get()) NRS start fifo request from 12345-0@lo, seq: 2
00000100:00100000:1.0:1413801598.098961:0:27934:0:(service.c:2023:ptlrpc_server_handle_request()) Handling RPC pname:cluuid+ref:pid:xid:nid:opc ll_mgs_0002:4a248d11-7d6c-a58c-f094-139fe352b1dc+4:27909:x1482469562502264:12345-0@lo:101
00010000:00010000:1.0:1413801598.098973:0:27934:0:(ldlm_lockd.c:1179:ldlm_handle_enqueue0()) ### server-side enqueue handler START
00010000:00010000:1.0:1413801598.098987:0:27934:0:(ldlm_lockd.c:1267:ldlm_handle_enqueue0()) ### server-side enqueue handler, new lock created ns: MGS lock: ffff88005d9e6dc0/0x465b324ce97ba2f6 lrc: 2/0,0 mode: --/CR res: [0x65727473756c:0x0:0x0].0 rrc: 1 type: PLN flags: 0x40000000000000 nid: local remote: 0x465b324ce97ba2ef expref: -99 pid: 27934 timeout: 0 lvb_type: 0
00010000:00010000:1.0:1413801598.099017:0:27934:0:(ldlm_lock.c:1089:ldlm_granted_list_add_lock()) ### About to add lock: ns: MGS lock: ffff88005d9e6dc0/0x465b324ce97ba2f6 lrc: 3/0,0 mode: CR/CR res: [0x65727473756c:0x0:0x0].0 rrc: 1 type: PLN flags: 0x50000000000000 nid: 0@lo remote: 0x465b324ce97ba2ef expref: 5 pid: 27934 timeout: 0 lvb_type: 0
00010000:00010000:1.0:1413801598.099031:0:27934:0:(ldlm_lockd.c:1405:ldlm_handle_enqueue0()) ### server-side enqueue handler, sending reply(err=0, rc=0) ns: MGS lock: ffff88005d9e6dc0/0x465b324ce97ba2f6 lrc: 3/0,0 mode: CR/CR res: [0x65727473756c:0x0:0x0].0 rrc: 1 type: PLN flags: 0x40000000000000 nid: 0@lo remote: 0x465b324ce97ba2ef expref: 5 pid: 27934 timeout: 0 lvb_type: 0
00010000:00010000:1.0:1413801598.099040:0:27934:0:(ldlm_lockd.c:1448:ldlm_handle_enqueue0()) ### server-side enqueue handler END (lock ffff88005d9e6dc0, rc 0)
00010000:02000000:1.0:1413801598.099046:0:27934:0:(libcfs_fail.h:89:cfs_fail_check_set()) *** cfs_fail_loc=30c, val=2147483648***
00010000:00020000:1.0:1413801598.100035:0:27934:0:(ldlm_lib.c:2415:target_send_reply_msg()) @@@ dropping reply  req@ffff88007d2f2050 x1482469562502264/t0(0) o101-&amp;gt;4a248d11-7d6c-a58c-f094-139fe352b1dc@0@lo:0/0 lens 328/344 e 0 to 0 dl 1413801604 ref 1 fl Interpret:/0/0 rc 0/0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The request dropped was actually an attempt to acquire an MGS lock.  Regardless of how this suddenly showed up (d29c0438 could be a factor), the test is not working properly at the moment.  I suggest adding it to the exception list until a fix is ready.&lt;/p&gt;</comment>
                            <comment id="96795" author="liwei" created="Tue, 21 Oct 2014 06:24:42 +0000"  >&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/12355&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/12355&lt;/a&gt; (Only disables the test; not a fix)&lt;/p&gt;</comment>
                            <comment id="96831" author="bogl" created="Tue, 21 Oct 2014 15:03:57 +0000"  >&lt;p&gt;another seen in b2_5:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/3bbdec78-590d-11e4-9a49-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/3bbdec78-590d-11e4-9a49-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="96931" author="yujian" created="Tue, 21 Oct 2014 21:11:42 +0000"  >&lt;p&gt;This is a regression failure introduced by the following commit on Lustre b2_5 branch:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;commit d29c0438bdf38e89d5638030b3770d7740121f8d
Author: Vitaly Fertman &amp;lt;vitaly_fertman@xyratex.com&amp;gt;
Date:   Mon Sep 29 19:42:32 2014 -0400

    LU-5579 ldlm: re-sent enqueue vs lock destroy race
    
    upon lock enqueue re-send, lock is pinned by ldlm_handle_enqueue0,
    however it may race with client eviction or even lcok cancel (if
    a reply for the original RPC finally reached the client) and the
    lock cann be found by cookie anymore:
    
     ASSERTION( lock != NULL ) failed: Invalid lock handle
    
    Signed-off-by: Vitaly Fertman &amp;lt;vitaly_fertman@xyratex.com&amp;gt;
    Change-Id: I9d8156bf78a1b83ac22ffaa1148feb43bef37b1a
    Xyratex-bug-id: MRP-2094
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This is blocking patch review testing on Lustre b2_5 branch. Oleg, could you please revert it? Thanks!&lt;/p&gt;</comment>
                            <comment id="96936" author="simmonsja" created="Tue, 21 Oct 2014 21:33:30 +0000"  >&lt;p&gt;Please don&apos;t revert. This patch fixes real issues for us at ORNL. Could we figure out a proper fix instead?&lt;/p&gt;</comment>
                            <comment id="96960" author="liwei" created="Wed, 22 Oct 2014 01:25:26 +0000"  >&lt;p&gt;Indeed.  Another temporary workaround could be just reverting the test part of the patch, including the change to tgt_enqueue().&lt;/p&gt;

&lt;p&gt;In addition to this test, replay-single 73b suffers from the same problem on b2_5.&lt;/p&gt;</comment>
                            <comment id="97001" author="green" created="Wed, 22 Oct 2014 12:58:14 +0000"  >&lt;p&gt;I will try to revert just the test. This is my fault, I removed the master test, but did not notice b2_5 patch also had this.&lt;/p&gt;

&lt;p&gt;My internet connection right now is super far from being good, so it might take a few days until I get to a good enough one.&lt;/p&gt;</comment>
                            <comment id="97005" author="liwei" created="Wed, 22 Oct 2014 13:33:57 +0000"  >&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/12390&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/12390&lt;/a&gt; (Revert the test part of d29c0438)&lt;/p&gt;</comment>
                            <comment id="97398" author="bogl" created="Fri, 24 Oct 2014 14:54:48 +0000"  >&lt;p&gt;more seen on b2_5:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/8f810a18-5b86-11e4-8b14-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/8f810a18-5b86-11e4-8b14-5254006e85c2&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/1d38451e-5b7e-11e4-95e9-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/1d38451e-5b7e-11e4-95e9-5254006e85c2&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Does seem to be blocking in current b2_5 test runs.&lt;/p&gt;</comment>
                            <comment id="98260" author="pjones" created="Tue, 4 Nov 2014 03:56:30 +0000"  >&lt;p&gt;This has been resolved by &lt;a href=&quot;http://git.whamcloud.com/fs/lustre-release.git/commit/93423cc9114721f32e5c36e21a8b56d2a463125b&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://git.whamcloud.com/fs/lustre-release.git/commit/93423cc9114721f32e5c36e21a8b56d2a463125b&lt;/a&gt;&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzwyyf:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>16190</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>