<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:06:35 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-7169] conf-sanity 84 restart mds1 failed</title>
                <link>https://jira.whamcloud.com/browse/LU-7169</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;&lt;a href=&quot;https://testing.hpdd.intel.com/test_sessions/e089506c-5bf0-11e5-9dac-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sessions/e089506c-5bf0-11e5-9dac-5254006e85c2&lt;/a&gt;&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;LDISKFS-fs (dm-0): mounted filesystem with ordered data mode. quota=on. Opts:
LDISKFS-fs (dm-0): mounted filesystem with ordered data mode. quota=on. Opts:
Lustre: Setting parameter lustre-MDT0000-mdtlov.lov.stripesize in log lustre-MDT0000
Lustre: Skipped 79 previous similar messages
Lustre: ctl-lustre-MDT0000: &lt;span class=&quot;code-keyword&quot;&gt;super&lt;/span&gt;-sequence allocation rc = 0 [0x0000000200000400-0x0000000240000400):0:mdt
Lustre: Skipped 26 previous similar messages
Lustre: DEBUG MARKER: PATH=/usr/lib64/lustre/tests:/usr/lib/lustre/tests:/usr/lib64/lustre/tests:/opt/iozone/bin:/usr/lib64/lustre/tests&lt;span class=&quot;code-comment&quot;&gt;//usr/lib64/lustre/tests:/usr/lib64/lustre/tests:/usr/lib64/lustre/tests/../utils:/opt/iozone/bin:/usr/lib64/lustre/tests/mpi:/usr/lib64/lust
&lt;/span&gt;Lustre: DEBUG MARKER: lctl set_param -n mdt.lustre*.enable_remote_dir=1
Lustre: DEBUG MARKER: e2label /dev/lvm-Role_MDS/P1 2&amp;gt;/dev/&lt;span class=&quot;code-keyword&quot;&gt;null&lt;/span&gt;
Lustre: DEBUG MARKER: lctl set_param -n mdt.lustre*.enable_remote_dir=1
Lustre: DEBUG MARKER: sync; sync; sync
Lustre: DEBUG MARKER: /usr/sbin/lctl --device lustre-MDT0000 notransno
Lustre: DEBUG MARKER: /usr/sbin/lctl --device lustre-MDT0000 readonly
LustreError: 14276:0:(osd_handler.c:1380:osd_ro()) *** setting lustre-MDT0000 read-only ***
Turning device dm-0 (0xfd00000) read-only
Lustre: DEBUG MARKER: /usr/sbin/lctl mark mds1 REPLAY BARRIER on lustre-MDT0000
Lustre: DEBUG MARKER: mds1 REPLAY BARRIER on lustre-MDT0000
Lustre: DEBUG MARKER: lctl set_param fail_loc=0x20000709 fail_val=5
Lustre: DEBUG MARKER: grep -c /mnt/mds1&lt;span class=&quot;code-quote&quot;&gt;&apos; &apos;&lt;/span&gt; /proc/mounts
Lustre: DEBUG MARKER: umount -d /mnt/mds1
Lustre: Failing over lustre-MDT0000
Removing read-only on unknown block (0xfd00000)
Lustre: DEBUG MARKER: lsmod | grep lnet &amp;gt; /dev/&lt;span class=&quot;code-keyword&quot;&gt;null&lt;/span&gt; &amp;amp;&amp;amp; lctl dl | grep &lt;span class=&quot;code-quote&quot;&gt;&apos; ST &apos;&lt;/span&gt;
Lustre: DEBUG MARKER: hostname
Lustre: DEBUG MARKER: test -b /dev/lvm-Role_MDS/P1
Lustre: DEBUG MARKER: mkdir -p /mnt/mds1; mount -t lustre -o recovery_time_hard=60,recovery_time_soft=60                                   /dev/lvm-Role_MDS/P1 /mnt/mds1
LDISKFS-fs (dm-0): recovery complete
LDISKFS-fs (dm-0): mounted filesystem with ordered data mode. quota=on. Opts:
LDISKFS-fs error (device dm-0): ldiskfs_lookup: deleted inode referenced: 75023
Aborting journal on device dm-0-8.
LDISKFS-fs (dm-0): Remounting filesystem read-only
LDISKFS-fs error (device dm-0): ldiskfs_put_super: Couldn&apos;t clean up the journal
LustreError: 14732:0:(obd_config.c:575:class_setup()) setup lustre-MDT0000-osd failed (-30)
LustreError: 14732:0:(obd_mount.c:203:lustre_start_simple()) lustre-MDT0000-osd setup error -30
LustreError: 14732:0:(obd_mount_server.c:1760:server_fill_super()) Unable to start osd on /dev/mapper/lvm--Role_MDS-P1: -30
LustreError: 14732:0:(obd_mount.c:1342:lustre_fill_super()) Unable to mount  (-30)
Lustre: DEBUG MARKER: /usr/sbin/lctl mark  conf-sanity test_84: @@@@@@ FAIL: Restart of mds1 failed!
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Looks the filesystem is corrupted somehow.&lt;/p&gt;</description>
                <environment></environment>
        <key id="32154">LU-7169</key>
            <summary>conf-sanity 84 restart mds1 failed</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="3">Duplicate</resolution>
                                        <assignee username="yong.fan">nasf</assignee>
                                    <reporter username="niu">Niu Yawei</reporter>
                        <labels>
                    </labels>
                <created>Wed, 16 Sep 2015 03:58:01 +0000</created>
                <updated>Mon, 30 Nov 2015 18:49:42 +0000</updated>
                            <resolved>Mon, 30 Nov 2015 18:49:42 +0000</resolved>
                                    <version>Lustre 2.8.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>7</watches>
                                                                            <comments>
                            <comment id="127536" author="adilger" created="Wed, 16 Sep 2015 18:50:06 +0000"  >&lt;p&gt;The corruption here may be related to the problem seen in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6895&quot; title=&quot;sanity-lfsck test 4 hung: bad entry in directory: rec_len is smaller than minimal - inode=3925999616&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6895&quot;&gt;&lt;del&gt;LU-6895&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="127537" author="jamesanunez" created="Wed, 16 Sep 2015 18:52:05 +0000"  >&lt;p&gt;Another occurrence&lt;br/&gt;
2015-09-15 14:42:13 - &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/f3da22c2-5bf0-11e5-9dac-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/f3da22c2-5bf0-11e5-9dac-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="127538" author="jgmitter" created="Wed, 16 Sep 2015 18:52:59 +0000"  >&lt;p&gt;Fan Yong,&lt;br/&gt;
Can you look at this issue and see if it is a dup of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6895&quot; title=&quot;sanity-lfsck test 4 hung: bad entry in directory: rec_len is smaller than minimal - inode=3925999616&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6895&quot;&gt;&lt;del&gt;LU-6895&lt;/del&gt;&lt;/a&gt;?&lt;br/&gt;
Thanks.&lt;br/&gt;
Joe&lt;/p&gt;</comment>
                            <comment id="127540" author="adilger" created="Wed, 16 Sep 2015 19:03:49 +0000"  >&lt;p&gt;The MDS console log reports:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;18:38:45:LDISKFS-fs (dm-0): mounted filesystem with ordered data mode. quota=on. Opts: 
18:38:45:LDISKFS-fs error (device dm-0): ldiskfs_lookup: deleted inode referenced: 75023
18:38:45:Aborting journal on device dm-0-8.
18:38:45:LDISKFS-fs (dm-0): Remounting filesystem read-only
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;On a later test when the MDS was remounted it crashed:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;18:38:45:LDISKFS-fs error (device dm-0): ldiskfs_lookup: deleted inode referenced: 75023
18:38:45:Aborting journal on device dm-0-8.
18:38:45:LDISKFS-fs (dm-0): Remounting filesystem read-only
18:38:45:BUG: unable to handle kernel NULL pointer dereference at 0000000000000004
18:38:45:IP: [&amp;lt;ffffffffa0b9833d&amp;gt;] osd_scrub_refresh_mapping+0x39d/0x410 [osd_ldiskfs]
18:38:45:Oops: 0000 [#1] SMP 
18:38:56:Pid: 15532, comm: mount.lustre Not tainted 2.6.32-573.3.1.el6_lustre.g3eb333f.x86_64 #1 Red Hat KVM
18:38:56:Call Trace:
18:38:56: [&amp;lt;ffffffffa0b9c616&amp;gt;] osd_scrub_setup+0xfb6/0x1170 [osd_ldiskfs]
18:38:56: [&amp;lt;ffffffffa0b707e9&amp;gt;] osd_device_alloc+0x6c9/0x960 [osd_ldiskfs]
18:38:56: [&amp;lt;ffffffffa05bf8df&amp;gt;] obd_setup+0x1bf/0x290 [obdclass]
18:38:56: [&amp;lt;ffffffffa05bfc0f&amp;gt;] class_setup+0x25f/0x940 [obdclass]
18:38:56: [&amp;lt;ffffffffa05c6df1&amp;gt;] class_process_config+0x1151/0x2840 [obdclass]
18:38:56: [&amp;lt;ffffffffa05d0abb&amp;gt;] do_lcfg+0x2cb/0x640 [obdclass]
18:38:56: [&amp;lt;ffffffffa05d0ec4&amp;gt;] lustre_start_simple+0x94/0x200 [obdclass]
18:38:56: [&amp;lt;ffffffffa0605ce1&amp;gt;] server_fill_super+0xfd1/0x1a70 [obdclass]
18:38:56: [&amp;lt;ffffffffa05d33e8&amp;gt;] lustre_fill_super+0x348/0x990 [obdclass]
18:38:56: [&amp;lt;ffffffff8119532f&amp;gt;] get_sb_nodev+0x5f/0xa0
18:38:56: [&amp;lt;ffffffffa05cb8a5&amp;gt;] lustre_get_sb+0x25/0x30 [obdclass]
18:38:56: [&amp;lt;ffffffff8119496b&amp;gt;] vfs_kern_mount+0x7b/0x1b0
18:38:56: [&amp;lt;ffffffff81194b12&amp;gt;] do_kern_mount+0x52/0x130
18:38:56: [&amp;lt;ffffffff811b697b&amp;gt;] do_mount+0x2fb/0x930
18:38:56: [&amp;lt;ffffffff811b7040&amp;gt;] sys_mount+0x90/0xe0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="127604" author="yong.fan" created="Thu, 17 Sep 2015 05:08:47 +0000"  >&lt;p&gt;The MDS crash at &quot;osd_scrub_refresh_mapping+0x39d&quot; is a duplication of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7061&quot; title=&quot;BUG: unable to handle kernel NULL pointer dereference at 0000000000000004 IP: osd_scrub_refresh_mapping+0x39d/0x410&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7061&quot;&gt;&lt;del&gt;LU-7061&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="127654" author="jgmitter" created="Thu, 17 Sep 2015 17:14:09 +0000"  >&lt;p&gt;Fan Yong,&lt;br/&gt;
Can you have a look at the corruption issue as well?&lt;br/&gt;
Thanks.&lt;br/&gt;
Joe&lt;/p&gt;</comment>
                            <comment id="127918" author="yong.fan" created="Sat, 19 Sep 2015 16:11:26 +0000"  >&lt;p&gt;It is NOT the same at &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6895&quot; title=&quot;sanity-lfsck test 4 hung: bad entry in directory: rec_len is smaller than minimal - inode=3925999616&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6895&quot;&gt;&lt;del&gt;LU-6895&lt;/del&gt;&lt;/a&gt;. The direct reason for the failure is as following:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;LDISKFS-fs error (device dm-0): ldiskfs_lookup: deleted inode referenced: 75023
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;ldiskfs_lookup() found a deleted inode, then its error handler set the system as read-only, and then caused cascaded failures for subsequent operations.&lt;/p&gt;

&lt;p&gt;In fact, before the ldiskfs_lookup() failure, the system already crashed. Here is the MDS debug log:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00100000:00000001:0.0:1442342253.462893:0:14732:0:(osd_scrub.c:2423:osd_scrub_setup()) Process entered
00080000:00000001:0.0:1442342253.462936:0:14732:0:(osd_handler.c:2401:osd_ea_fid_set()) Process entered
00080000:00000001:0.0:1442342253.462944:0:14732:0:(osd_handler.c:2429:osd_ea_fid_set()) Process leaving (rc=0 : 0 : 0)
00100000:10000000:0.0:1442342253.463278:0:14732:0:(osd_scrub.c:277:osd_scrub_file_reset()) lustre-MDT0000: reset OI scrub file, old flags = 0x0, add flags = 0x2
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Related source code is as following:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; osd_scrub_setup(&lt;span class=&quot;code-keyword&quot;&gt;const&lt;/span&gt; struct lu_env *env, struct osd_device *dev)
{
...
        } &lt;span class=&quot;code-keyword&quot;&gt;else&lt;/span&gt; {
                &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (memcmp(sf-&amp;gt;sf_uuid, es-&amp;gt;s_uuid, 16) != 0) {
                        osd_scrub_file_reset(scrub, es-&amp;gt;s_uuid,SF_INCONSISTENT);
                        dirty = 1;
...
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;The logs shows that during the MDT0000 mount, the osd_scrub_setup() found the super block&apos;s uuid has been changed. Usually, such case only happens when MDT file-level backup/restore. But in our conf-sanity test cases, there were no backup/restore operations. So the local file-system should have been crashed during the MDT failover. As for what caused the super block corruption, I have no idea yet.&lt;/p&gt;</comment>
                            <comment id="128727" author="gerrit" created="Tue, 29 Sep 2015 09:06:46 +0000"  >&lt;p&gt;Fan Yong (fan.yong@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/16664&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/16664&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7169&quot; title=&quot;conf-sanity 84 restart mds1 failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7169&quot;&gt;&lt;del&gt;LU-7169&lt;/del&gt;&lt;/a&gt; tests: check disk corruption during failover&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: aa1a896b1fddadce8b95b2c9af6c4a8509535a19&lt;/p&gt;</comment>
                            <comment id="131906" author="pjones" created="Wed, 28 Oct 2015 18:56:03 +0000"  >&lt;p&gt;Fan Yong&lt;/p&gt;

&lt;p&gt;What are the next steps here? It looks like the debug patch did not trigger the failure. Should we land the debug patch or does it just need to be run with a higher number of runs to improve the chances of hitting it?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="131951" author="yong.fan" created="Thu, 29 Oct 2015 01:15:09 +0000"  >&lt;p&gt;The issues cannot be reproduced any longer after the debug patch, even through several repeat the failed test case. So I suggest to land the patch to master and give the chance to normal Maloo run. I will update the patch to drop &quot;fortestonly&quot; and ask for landing permission.&lt;/p&gt;</comment>
                            <comment id="132029" author="pjones" created="Thu, 29 Oct 2015 16:47:34 +0000"  >&lt;p&gt;Excellent. Thanks Fan Yong.&lt;/p&gt;</comment>
                            <comment id="133088" author="yong.fan" created="Tue, 10 Nov 2015 00:54:03 +0000"  >&lt;p&gt;We hit the trouble with the patch applied, the log shows that  there is really some disk inconsistency as following:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/e6f060ac-8707-11e5-bf92-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/e6f060ac-8707-11e5-bf92-5254006e85c2&lt;/a&gt;&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;CMD: onyx-44vm3 e2fsck -d -v -t -t -f -n /dev/lvm-Role_MDS/P1
onyx-44vm3: e2fsck 1.42.13.wc3 (28-Aug-2015)
Warning: skipping journal recovery because doing a read-only filesystem check.
Pass 1: Checking inodes, blocks, and sizes
Pass 1: Memory used: 292k/0k (87k/206k), time:  0.00/ 0.00/ 0.00
Pass 1: I/O read: 1MB, write: 0MB, rate: 553.40MB/s
Pass 2: Checking directory structure
Pass 2: Memory used: 292k/0k (98k/195k), time:  0.00/ 0.00/ 0.00
Pass 2: I/O read: 1MB, write: 0MB, rate: 655.74MB/s
Pass 3: Checking directory connectivity
Peak memory: Memory used: 292k/0k (98k/195k), time:  0.01/ 0.00/ 0.00
Pass 3: Memory used: 292k/0k (96k/197k), time:  0.00/ 0.00/ 0.00
Pass 3: I/O read: 0MB, write: 0MB, rate: 0.00MB/s
Pass 4: Checking reference counts
Pass 4: Memory used: 292k/0k (62k/231k), time:  0.00/ 0.00/ 0.00
Pass 4: I/O read: 0MB, write: 0MB, rate: 0.00MB/s
Pass 5: Checking group summary information
Free blocks count wrong (33296, counted=32947).
Fix? no

Free inodes count wrong (99987, counted=99750).
Fix? no

Pass 5: Memory used: 292k/0k (62k/231k), time:  0.00/ 0.00/ 0.00
Pass 5: I/O read: 1MB, write: 0MB, rate: 333.78MB/s

          13 inodes used (0.01%, out of 100000)
           6 non-contiguous files (46.2%)
           0 non-contiguous directories (0.0%)
             # of inodes with ind/dind/tind blocks: 0/0/0
       16704 blocks used (33.41%, out of 50000)
           0 bad blocks
           1 large file

         125 regular files
         116 directories
           0 character device files
           0 block device files
           0 fifos
           0 links
           0 symbolic links (0 fast symbolic links)
           0 sockets
------------
         241 files
Memory used: 292k/0k (61k/232k), time:  0.01/ 0.01/ 0.00
I/O read: 1MB, write: 0MB, rate: 92.42MB/s
reboot facets: mds1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Such inconsistency does not mean the super block crashed. Because without disk checksum, the e2fsck cannot detect per-block based data corruption. Only with above logs, we cannot say it is the root reason for test_84 failure. I will update the patch with more debug information.&lt;/p&gt;</comment>
                            <comment id="133702" author="gerrit" created="Tue, 17 Nov 2015 15:56:30 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;http://review.whamcloud.com/16664/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/16664/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7169&quot; title=&quot;conf-sanity 84 restart mds1 failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7169&quot;&gt;&lt;del&gt;LU-7169&lt;/del&gt;&lt;/a&gt; tests: check disk corruption during failover&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: f84e06eead85de5cd7832855bab5ff72a542e971&lt;/p&gt;</comment>
                            <comment id="134276" author="jgmitter" created="Mon, 23 Nov 2015 18:58:56 +0000"  >&lt;p&gt;Landed for 2.8&lt;/p&gt;</comment>
                            <comment id="134633" author="adilger" created="Fri, 27 Nov 2015 05:47:00 +0000"  >&lt;p&gt;The landed patch was only for debugging.  This issue is not resolved.&lt;/p&gt;</comment>
                            <comment id="134650" author="yong.fan" created="Fri, 27 Nov 2015 15:52:15 +0000"  >&lt;p&gt;Right, we are still waiting for more failure instances after landing the patch.&lt;/p&gt;</comment>
                            <comment id="134653" author="adilger" created="Fri, 27 Nov 2015 17:30:22 +0000"  >&lt;p&gt;There are many, many failures of this test, but unfortunately they have all been assigned different bugs because the error messages are different.  &lt;/p&gt;

&lt;p&gt;In the tests I&apos;ve seen, the e2fsck run is clean, except for the superblock inode and block counts, which is expected. &lt;/p&gt;

&lt;p&gt;I pushed a patch under &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7428&quot; title=&quot;conf-sanity test_84, replay-dual 0a: /dev/lvm-Role_MDS/P1 failed to initialize!&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7428&quot;&gt;&lt;del&gt;LU-7428&lt;/del&gt;&lt;/a&gt; that may fix the problem, which I think is caused by test_84() setting the MDS read-only right after mount, and that is causing some of the recently written data to be discarded (e.g. superblock label, llog records, etc). Unfortunately, it will take a few days to be tested. &lt;/p&gt;</comment>
                            <comment id="134662" author="yong.fan" created="Sat, 28 Nov 2015 04:31:44 +0000"  >&lt;p&gt;You are right. &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7428&quot; title=&quot;conf-sanity test_84, replay-dual 0a: /dev/lvm-Role_MDS/P1 failed to initialize!&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7428&quot;&gt;&lt;del&gt;LU-7428&lt;/del&gt;&lt;/a&gt; is the duplication of this ticket. According to the latest test logs (marked as &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7428&quot; title=&quot;conf-sanity test_84, replay-dual 0a: /dev/lvm-Role_MDS/P1 failed to initialize!&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7428&quot;&gt;&lt;del&gt;LU-7428&lt;/del&gt;&lt;/a&gt;)&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/sub_tests/15602932-9477-11e5-a5ac-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/sub_tests/15602932-9477-11e5-a5ac-5254006e85c2&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The former test_83 reformat the system after test_83 done, then move to test_84, after the replay_barrier(), the e2fsck found some inconsistency:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Warning: skipping journal recovery because doing a read-only filesystem check.
Pass 1: Checking inodes, blocks, and sizes
Pass 1: Memory used: 292k/0k (86k/207k), time:  0.01/ 0.00/ 0.00
Pass 1: I/O read: 1MB, write: 0MB, rate: 104.44MB/s
Pass 2: Checking directory structure
Pass 2: Memory used: 292k/0k (98k/195k), time:  0.02/ 0.00/ 0.01
Pass 2: I/O read: 1MB, write: 0MB, rate: 47.50MB/s
Pass 3: Checking directory connectivity
Peak memory: Memory used: 292k/0k (98k/195k), time:  0.04/ 0.01/ 0.01
Pass 3: Memory used: 292k/0k (96k/197k), time:  0.00/ 0.00/ 0.00
Pass 3: I/O read: 0MB, write: 0MB, rate: 0.00MB/s
Pass 4: Checking reference counts
Pass 4: Memory used: 292k/0k (62k/231k), time:  0.00/ 0.00/ 0.00
Pass 4: I/O read: 0MB, write: 0MB, rate: 0.00MB/s
Pass 5: Checking group summary information
Free blocks count wrong (33296, counted=32960).
Fix? no

Free inodes count wrong (99987, counted=99754).
Fix? no

Pass 5: Memory used: 292k/0k (61k/232k), time:  0.01/ 0.01/ 0.00
...
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;It is reasonable to suspect that the replay_barrier() marked the system as readonly too early before super block synced to disk. But seems the replay_barrier() has considered that:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;replay_barrier() {
        local facet=$1
        do_facet $facet &quot;sync; sync; sync&quot;
        df $MOUNT
..
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;So seems the &quot;sync&quot; does not work. As for your patch &lt;a href=&quot;http://review.whamcloud.com/#/c/17371/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/17371/&lt;/a&gt;, it should be helpful. But why not make the sync_all_data to be inside replay_barrier() to replace the &apos;do_facet $facet &quot;sync; sync; sync&quot;&apos;, then it can help others.&lt;/p&gt;</comment>
                            <comment id="134664" author="yong.fan" created="Sat, 28 Nov 2015 04:43:25 +0000"  >&lt;p&gt;The patch from Andreas:&lt;br/&gt;
Gerrit Updater added a comment - Yesterday&lt;br/&gt;
Andreas Dilger (andreas.dilger@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/17371&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/17371&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7428&quot; title=&quot;conf-sanity test_84, replay-dual 0a: /dev/lvm-Role_MDS/P1 failed to initialize!&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7428&quot;&gt;&lt;del&gt;LU-7428&lt;/del&gt;&lt;/a&gt; tests: write superblock in conf-sanity test_84&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: d3e516e59697f7c48e5cb97054ee04cf97dc7132&lt;/p&gt;</comment>
                            <comment id="134668" author="adilger" created="Sat, 28 Nov 2015 07:20:13 +0000"  >&lt;p&gt;The &lt;tt&gt;Free blocks/inodes count wrong&lt;/tt&gt; errors do not indicate any kind of problem.  These fields in the superblock are not updated during normal usage, so after &lt;tt&gt;replay_barrier()&lt;/tt&gt; they will be out of date.&lt;/p&gt;

&lt;p&gt;I thought about putting the &lt;tt&gt;sync_all_data(); sleep 5;&lt;/tt&gt; call inside &lt;tt&gt;replay_barrier()&lt;/tt&gt; but I didn&apos;t want to slow down any of the other tests that use this function that don&apos;t immediately follow reformatting the filesystem.&lt;/p&gt;

&lt;p&gt;We&apos;ll see if this patch actually fixes the problem.&lt;/p&gt;</comment>
                            <comment id="134771" author="pjones" created="Mon, 30 Nov 2015 18:49:42 +0000"  >&lt;p&gt;believed to be a duplicate of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7428&quot; title=&quot;conf-sanity test_84, replay-dual 0a: /dev/lvm-Role_MDS/P1 failed to initialize!&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7428&quot;&gt;&lt;del&gt;LU-7428&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="31187">LU-6895</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="33136">LU-7428</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="30892">LU-6789</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzxnrj:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>