<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:45:47 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary, append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-4781] lustre-rsync-test test_2b: Replication of operation failed(-17)</title>
                <link>https://jira.whamcloud.com/browse/LU-4781</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This issue was created by maloo for Andreas Dilger &amp;lt;andreas.dilger@intel.com&amp;gt;&lt;/p&gt;

&lt;p&gt;This issue relates to the following test suite run: &lt;a href=&quot;http://maloo.whamcloud.com/test_sets/7edd3618-90d0-11e3-91ee-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://maloo.whamcloud.com/test_sets/7edd3618-90d0-11e3-91ee-52540035b04c&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;The sub-test test_2b failed with the following error in the test output:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;Replication of operation failed(-17): 4274 CREAT (1) &lt;span class=&quot;error&quot;&gt;&amp;#91;0x400000400:0x7f4:0x0&amp;#93;&lt;/span&gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;0x400000400:0x7f3:0x0&amp;#93;&lt;/span&gt; SMALL.FIL&lt;br/&gt;
Replication of operation failed(-17): 4275 CREAT (1) &lt;span class=&quot;error&quot;&gt;&amp;#91;0x400000400:0x7f5:0x0&amp;#93;&lt;/span&gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;0x400000400:0x7ef:0x0&amp;#93;&lt;/span&gt; LARGE.FIL&lt;br/&gt;
Replication of operation failed(-17): 4276 CREAT (1) &lt;span class=&quot;error&quot;&gt;&amp;#91;0x400000400:0x7f6:0x0&amp;#93;&lt;/span&gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;0x400000400:0x7f3:0x0&amp;#93;&lt;/span&gt; MEDIUM.FIL&lt;br/&gt;
Replication of operation failed(-17): 4277 CREAT (1) &lt;span class=&quot;error&quot;&gt;&amp;#91;0x400000400:0x7f7:0x0&amp;#93;&lt;/span&gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;0x400000400:0x7f3:0x0&amp;#93;&lt;/span&gt; LARGE.FIL&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;Info required for matching: lustre-rsync-test 2b&lt;/p&gt;</description>
                <environment></environment>
        <key id="23703">LU-4781</key>
            <summary>lustre-rsync-test test_2b: Replication of operation failed(-17)</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="jhammond">John Hammond</assignee>
                                    <reporter username="maloo">Maloo</reporter>
                        <labels>
                    </labels>
                <created>Tue, 18 Mar 2014 19:03:18 +0000</created>
                <updated>Fri, 28 Oct 2016 16:59:55 +0000</updated>
                            <resolved>Tue, 14 Jun 2016 22:49:50 +0000</resolved>
                                    <version>Lustre 2.6.0</version>
                    <version>Lustre 2.5.3</version>
                    <version>Lustre 2.9.0</version>
                                    <fixVersion>Lustre 2.9.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>15</watches>
                                                                            <comments>
                            <comment id="92066" author="yujian" created="Wed, 20 Aug 2014 18:41:33 +0000"  >&lt;p&gt;Lustre build: &lt;a href=&quot;https://build.hpdd.intel.com/job/lustre-b2_5/80/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://build.hpdd.intel.com/job/lustre-b2_5/80/&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL6.5/x86_64&lt;br/&gt;
FSTYPE=ldiskfs&lt;/p&gt;

&lt;p&gt;lustre-rsync-test test 2b failed as follows:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Starting replication
Replication of operation failed(-17): 3808 CREAT (1) [0x200000400:0x6de:0x0] [0x200000400:0x6dd:0x0] client.txt
Replication of operation failed(-17): 3809 CREAT (1) [0x200000400:0x6df:0x0] [0x200000400:0x6dd:0x0] dbench
Replication of operation failed(-17): 3810 MKDIR (2) [0x200000400:0x6e0:0x0] [0x200000400:0x6dd:0x0] lib64
Replication of operation failed(-17): 3811 CREAT (1) [0x200000400:0x6e1:0x0] [0x200000400:0x6e0:0x0] libpopt.so.0
--------8&amp;lt;--------
Replication of operation failed(-17): 3954 CREAT (1) [0x200000400:0x76b:0x0] [0x200000400:0x766:0x0] LARGE.FIL
ioctl err -22: Invalid argument (22)
fid2path error: (/mnt/lustre, [0x200000400:0x903:0x0]) 22 Invalid argument
Replication of operation failed(-22): 4819 CREAT (1) [0x200000400:0x903:0x0] [0x200000400:0x70e:0x0] GRAPH1.CDR
Lustre filesystem: lustre
MDT device: lustre-MDT0000
Source: /mnt/lustre
Target: /tmp/target
Target: /tmp/target2
Statuslog: /tmp/lustre_rsync.log
Changelog registration: cl4
Starting changelog record: 0
Clear changelog after use: no
Errors: 142
lustre_rsync took 8 seconds
Changelog records consumed: 1515
Only in /mnt/lustre/d2b.lustre-rsync-test/clients/client1/~dmtmp/WORDPRO: NEWS1_1B.PRN
 lustre-rsync-test test_2b: @@@@@@ FAIL: Failure in replication; differences found. 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Maloo report: &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/de4f4760-26aa-11e4-9de1-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/de4f4760-26aa-11e4-9de1-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="107490" author="jamesanunez" created="Fri, 20 Feb 2015 15:16:40 +0000"  >&lt;p&gt;I&apos;ve run into the same test failure on lustre-master tag 2.6.94 in review-dne-part-1. Logs are at &lt;a href=&quot;https://testing.hpdd.intel.com/test_sessions/3d80b6cc-b898-11e4-84be-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sessions/3d80b6cc-b898-11e4-84be-5254006e85c2&lt;/a&gt; .&lt;/p&gt;

&lt;p&gt;Other review-dne-part-1 failures:&lt;br/&gt;
2015-07-09 15:01:36 - &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/a6bf7468-26cd-11e5-8cf5-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/a6bf7468-26cd-11e5-8cf5-5254006e85c2&lt;/a&gt;&lt;br/&gt;
2015-07-10 03:03:45 - &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/4df16926-26c4-11e5-8cf5-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/4df16926-26c4-11e5-8cf5-5254006e85c2&lt;/a&gt;&lt;br/&gt;
2015-07-11 21:05:09 - &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/8d39b85e-282c-11e5-b280-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/8d39b85e-282c-11e5-b280-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="124901" author="jamesanunez" created="Mon, 24 Aug 2015 16:18:38 +0000"  >&lt;p&gt;Another instance on master (pre-2.8) during review-dne-part-1:&lt;br/&gt;
2015-08-21 17:55:32 - &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/8973091c-483b-11e5-813b-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/8973091c-483b-11e5-813b-5254006e85c2&lt;/a&gt;&lt;br/&gt;
2015-08-24 11:09:22 - &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/a1d65e12-4a5e-11e5-88e8-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/a1d65e12-4a5e-11e5-88e8-5254006e85c2&lt;/a&gt;&lt;br/&gt;
2015-08-26 21:21:35 - &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/f981d950-4c45-11e5-8a6a-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/f981d950-4c45-11e5-8a6a-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="136025" author="jamesanunez" created="Fri, 11 Dec 2015 15:33:01 +0000"  >&lt;p&gt;Another failure on master:&lt;br/&gt;
2015-12-10 14:51:22 - &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/5c0fb6a8-9f72-11e5-8c89-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/5c0fb6a8-9f72-11e5-8c89-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="148654" author="yujian" created="Tue, 12 Apr 2016 20:44:11 +0000"  >&lt;p&gt;I did the following experiments on master branch and hit the similar failures:&lt;br/&gt;
1) format and setup the Lustre filesystem&lt;br/&gt;
2) on MDS, register a changelog user for MDT:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# lctl --device lustre-MDT0000 changelog_register
lustre-MDT0000: Registered changelog userid &apos;cl1&apos;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;3) on Client, copy /etc to /mnt/lustre:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# cp -a /etc /mnt/lustre/
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;4) on Client, format an ext4 filesystem as the target backup filesystem:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# mkfs.ext4 /dev/vda3
# mount -t ext4 /dev/vda3 /mnt/backup/
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;5) on Client, perform rsync to make the source and target filesystems identical:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# rsync -avh  /mnt/lustre/ /mnt/backup/
sending incremental file list
--------8&amp;lt;--------
sent 27.47M bytes  received 23.02K bytes  4.23M bytes/sec
total size is 27.38M  speedup is 1.00
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;6) on Client, perform lustre_rsync:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# lustre_rsync --source=/mnt/lustre/ --target=/mnt/backup/ --mdt=lustre-MDT0000 --user=cl1 --statuslog lustre_rsync_statuslog --verbose
Replication of operation failed(-17): 1 MKDIR (2) [0x200000401:0x1:0x0] [0x200000007:0x1:0x0] etc
Replication of operation failed(-17): 2 CREAT (1) [0x200000401:0x2:0x0] [0x200000401:0x1:0x0] issue.net
Replication of operation failed(-17): 4 CREAT (1) [0x200000401:0x3:0x0] [0x200000401:0x1:0x0] yum.conf
Replication of operation failed(-17): 6 MKDIR (2) [0x200000401:0x4:0x0] [0x200000401:0x1:0x0] gnupg
Replication of operation failed(-17): 9 MKDIR (2) [0x200000401:0x5:0x0] [0x200000401:0x1:0x0] X11
Replication of operation failed(-17): 10 CREAT (1) [0x200000401:0x6:0x0] [0x200000401:0x5:0x0] prefdm
Replication of operation failed(-17): 12 MKDIR (2) [0x200000401:0x7:0x0] [0x200000401:0x5:0x0] fontpath.d
Replication of operation failed(-17): 13 SLINK (4) [0x200000401:0x8:0x0] [0x200000401:0x7:0x0] fonts-default
Replication of operation failed(-17): 15 SLINK (4) [0x200000401:0x9:0x0] [0x200000401:0x7:0x0] default-ghostscript
--------8&amp;lt;--------
Lustre filesystem: lustre
MDT device: lustre-MDT0000
Source: /mnt/lustre/
Target: /mnt/backup/
Statuslog: lustre_rsync_statuslog
Changelog registration: cl1
Starting changelog record: 0
Errors: 1782
lustre_rsync took 14 seconds
Changelog records consumed: 3735
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="148659" author="yujian" created="Tue, 12 Apr 2016 21:07:53 +0000"  >&lt;p&gt;It turns out registering the changelog user needs to be done after running rsync. I did the experiment again and succeeded in running lustre_rsync:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# lustre_rsync --source=/mnt/lustre/ --target=/mnt/backup/ --mdt=lustre-MDT0000 --user=cl1 --statuslog lustre_rsync_statuslog --verbose
Lustre filesystem: lustre
MDT device: lustre-MDT0000
Source: /mnt/lustre/
Target: /mnt/backup/
Statuslog: lustre_rsync_statuslog
Changelog registration: cl1
Starting changelog record: 0
Errors: 0
lustre_rsync took 0 seconds
Changelog records consumed: 0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="153396" author="jhammond" created="Tue, 24 May 2016 19:38:06 +0000"  >&lt;p&gt;I think that this is because we use SIGSTOP and SIGCONT to control dbench.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;    echo Stopping dbench
    $KILL -SIGSTOP $child_pid

    echo Starting replication
    $LRSYNC -l $LREPL_LOG -D $LRSYNC_LOG
    check_diff $DIR/$tdir $TGT/$tdir
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Even though &lt;tt&gt;kill()&lt;/tt&gt; has returned, the SIGSTOP may still be pending (for example, if the dbench task is executing in the kernel, doing I/O). We should sleep for some time after sending SIGSTOP or, even better, wait until all dbench tasks are in a stopped state.&lt;/p&gt;</comment>
                            <comment id="153731" author="gerrit" created="Thu, 26 May 2016 19:40:46 +0000"  >&lt;p&gt;John L. Hammond (john.hammond@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/20471&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/20471&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4781&quot; title=&quot;lustre-rsync-test test_2b: Replication of operation failed(-17)&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4781&quot;&gt;&lt;del&gt;LU-4781&lt;/del&gt;&lt;/a&gt; test: wait for dbench to stop&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: d5fe8aeb0b6ea00165ddc98e13c5fb270fad8e7d&lt;/p&gt;</comment>
                            <comment id="154001" author="gerrit" created="Mon, 30 May 2016 23:44:05 +0000"  >&lt;p&gt;Andreas Dilger (andreas.dilger@intel.com) merged in patch &lt;a href=&quot;http://review.whamcloud.com/20471/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/20471/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4781&quot; title=&quot;lustre-rsync-test test_2b: Replication of operation failed(-17)&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4781&quot;&gt;&lt;del&gt;LU-4781&lt;/del&gt;&lt;/a&gt; test: wait for dbench to stop&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: f7452bafb79d94c88b5fb390e431b06e79325f99&lt;/p&gt;</comment>
                            <comment id="154754" author="jamesanunez" created="Mon, 6 Jun 2016 15:30:38 +0000"  >&lt;p&gt;lustre-rsync-test test 2b is still failing even after patch 20471 landed to master. Here are some recent failures:&lt;br/&gt;
2016-06-05  - &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/f3692b30-2b07-11e6-80b9-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/f3692b30-2b07-11e6-80b9-5254006e85c2&lt;/a&gt;&lt;br/&gt;
2016-06-05 - &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/0485f5aa-2b5e-11e6-80b9-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/0485f5aa-2b5e-11e6-80b9-5254006e85c2&lt;/a&gt;&lt;br/&gt;
2016-06-05 - &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/9874c1e0-2b60-11e6-80b9-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/9874c1e0-2b60-11e6-80b9-5254006e85c2&lt;/a&gt;&lt;br/&gt;
2016-06-06 - &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/36032394-2ba6-11e6-acf3-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/36032394-2ba6-11e6-acf3-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="154779" author="adilger" created="Mon, 6 Jun 2016 17:01:08 +0000"  >&lt;p&gt;I think it is strange that the error -17 = &lt;tt&gt;-EEXIST&lt;/tt&gt; is the only one being hit.  Is the MDS not properly registering or processing unlink records?  Is lustre_rsync returning an error for &lt;tt&gt;CREAT&lt;/tt&gt; records (e.g. using &lt;tt&gt;O_EXCL&lt;/tt&gt;) when they should be allowed to open(O_CREAT) files that already exist?  Is lustre_rsync not applying the CREAT records in the right parent directory?&lt;/p&gt;

&lt;p&gt;Is it worthwhile to submit a test patch to collect the full changelog from the dbench run so that it is possible to trace the complete lifetime of a parent directory and file so that this problem can be debugged more effectively?&lt;/p&gt;

&lt;p&gt;Looking at the lustre_rsync log it appears that we are processing the ChangeLog entries twice?&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/sub_tests/1a5aee48-2be6-11e6-a0ce-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/sub_tests/1a5aee48-2be6-11e6-a0ce-5254006e85c2&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_logs/1a89dee2-2be6-11e6-a0ce-5254006e85c2/show_text&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_logs/1a89dee2-2be6-11e6-a0ce-5254006e85c2/show_text&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_logs/1aa09a56-2be6-11e6-a0ce-5254006e85c2/show_text&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_logs/1aa09a56-2be6-11e6-a0ce-5254006e85c2/show_text&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;For example, the file &quot;STUDENTS.TV&quot; FID &lt;span class=&quot;error&quot;&gt;&amp;#91;0x200006990:0x414:0x0&amp;#93;&lt;/span&gt; is created in record &lt;tt&gt;5056&lt;/tt&gt; but that record appears twice in the log and the second time produces the &lt;tt&gt;-EEXIST&lt;/tt&gt; error:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;**** Start 5056 CREAT (1) [0x200006990:0x414:0x0] [0x200006990:0x3f0:0x0] STUDENTS.TV *****
dest = d2b.lustre-rsync-test/clients/client1/~dmtmp/PARADOX/STUDENTS.TV; savedpath = d2b.lustre-rsync-test/clients/client1/~dmtmp/PARADOX/STUDENTS.TV
mkfile(1) /tmp/target/d2b.lustre-rsync-test/clients/client1/~dmtmp/PARADOX/STUDENTS.TV 
Syncing data and attributes [0x200006990:0x414:0x0]
llistxattr(/mnt/lustre/.lustre/fid/[0x200006990:0x414:0x0],(nil)) returned 64, errno=0
llistxattr /mnt/lustre/.lustre/fid/[0x200006990:0x414:0x0] returned 64, errno=0
	(trusted.version,10469904) rc=0x8
	lsetxattr(), rc=0, errno=0
	(trusted.link,10469904) rc=0x35
	lsetxattr(), rc=0, errno=0
	(trusted.lov,10469904) rc=0x38
	lsetxattr(), rc=0, errno=0
	(trusted.lma,10469904) rc=0x18
	lsetxattr(), rc=0, errno=0
	(lustre.lov,10469904) rc=0x38
	lsetxattr(), rc=-1, errno=95
setxattr: /mnt/lustre/.lustre/fid/[0x200006990:0x414:0x0] /tmp/target/d2b.lustre-rsync-test/clients/client1/~dmtmp/PARADOX/STUDENTS.TV
mkfile(1) /tmp/target2/d2b.lustre-rsync-test/clients/client1/~dmtmp/PARADOX/STUDENTS.TV 
Syncing data and attributes [0x200006990:0x414:0x0]
llistxattr(/mnt/lustre/.lustre/fid/[0x200006990:0x414:0x0],0x9fb200) returned 64, errno=0
	(trusted.version,10469904) rc=0x8
	lsetxattr(), rc=0, errno=0
	(trusted.link,10469904) rc=0x35
	lsetxattr(), rc=0, errno=0
	(trusted.lov,10469904) rc=0x38
	lsetxattr(), rc=0, errno=0
	(trusted.lma,10469904) rc=0x18
	lsetxattr(), rc=0, errno=0
	(lustre.lov,10469904) rc=0x38
	lsetxattr(), rc=-1, errno=95
setxattr: /mnt/lustre/.lustre/fid/[0x200006990:0x414:0x0] /tmp/target2/d2b.lustre-rsync-test/clients/client1/~dmtmp/PARADOX/STUDENTS.TV
##### End 5056 CREAT (1) [0x200006990:0x414:0x0] [0x200006990:0x3f0:0x0] STUDENTS.TV rc=0 #####
:
:
***** Start 5058 CLOSE (11) [0x200006990:0x414:0x0] [0:0x0:0x0]  *****
##### End 5058 CLOSE (11) [0x200006990:0x414:0x0] [0:0x0:0x0]  rc=0 #####
:
:
:
***** Start 5056 CREAT (1) [0x200006990:0x414:0x0] [0x200006990:0x3f0:0x0] STUDENTS.TV *****
dest = d2b.lustre-rsync-test/clients/client1/~dmtmp/PARADOX/STUDENTS.TV; savedpath = d2b.lustre-rsync-test/clients/client1/~dmtmp/PARADOX/STUDENTS.TV
mkfile(1) /tmp/target/d2b.lustre-rsync-test/clients/client1/~dmtmp/PARADOX/STUDENTS.TV 
mkfile(1) /tmp/target2/d2b.lustre-rsync-test/clients/client1/~dmtmp/PARADOX/STUDENTS.TV 
##### End 5056 CREAT (1) [0x200006990:0x414:0x0] [0x200006990:0x3f0:0x0] STUDENTS.TV rc=-17 #####
***** Start 5058 CLOSE (11) [0x200006990:0x414:0x0] [0:0x0:0x0]  *****
##### End 5058 CLOSE (11) [0x200006990:0x414:0x0] [0:0x0:0x0]  rc=0 #####
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;It appears that the log processing restarts at some point, but I don&apos;t know what the cause is.  It might be ChangeLog wrapping in the kernel?&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;:
:
***** Start 5519 CLOSE (11) [0x200006990:0x460:0x0] [0:0x0:0x0]  *****
##### End 5519 CLOSE (11) [0x200006990:0x460:0x0] [0:0x0:0x0]  rc=0 #####
***** Start 5520 CLOSE (11) [0x200006990:0x460:0x0] [0:0x0:0x0]  *****
##### End 5520 CLOSE (11) [0x200006990:0x460:0x0] [0:0x0:0x0]  rc=0 #####
***** Start 4823 CREAT (1) [0x200006990:0x397:0x0] [0x200006990:0x395:0x0] client.txt *****
dest = d2b.lustre-rsync-test/client.txt; savedpath = d2b.lustre-rsync-test/client.txt
mkfile(1) /tmp/target/d2b.lustre-rsync-test/client.txt 
mkfile(1) /tmp/target2/d2b.lustre-rsync-test/client.txt 
##### End 4823 CREAT (1) [0x200006990:0x397:0x0] [0x200006990:0x395:0x0] client.txt rc=-17 #####
***** Start 4824 CLOSE (11) [0x200006990:0x397:0x0] [0:0x0:0x0]  *****
##### End 4824 CLOSE (11) [0x200006990:0x397:0x0] [0:0x0:0x0]  rc=0 #####
:
:
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="154788" author="jgmitter" created="Mon, 6 Jun 2016 17:14:39 +0000"  >&lt;p&gt;Hi John,&lt;/p&gt;

&lt;p&gt;Can you have another look at this one?  It seems even with the landing of &lt;a href=&quot;http://review.whamcloud.com/#/c/20471/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/20471/&lt;/a&gt; that the problem is still occurring.&lt;/p&gt;

&lt;p&gt;Thanks.&lt;br/&gt;
Joe&lt;/p&gt;</comment>
                            <comment id="154800" author="jhammond" created="Mon, 6 Jun 2016 18:35:34 +0000"  >&lt;p&gt;&amp;gt; I think it is strange that the error -17 = -EEXIST is the only one being hit. Is the MDS not properly registering or processing unlink records? Is lustre_rsync returning an error for CREAT records (e.g. using O_EXCL) when they should be allowed to open(O_CREAT) files that already exist? Is lustre_rsync not applying the CREAT records in the right parent directory?&lt;/p&gt;

&lt;p&gt;It uses &lt;tt&gt;mknod()&lt;/tt&gt;. The &lt;tt&gt;-EEXIST&lt;/tt&gt; errors are also seen on successful runs. But it&apos;s conceivable that there is a race where the unhandled &lt;tt&gt;-EEXIST&lt;/tt&gt; causes the files to end up with different contents. I&apos;ll push a small patch.&lt;/p&gt;</comment>
                            <comment id="154803" author="adilger" created="Mon, 6 Jun 2016 18:55:46 +0000"  >&lt;p&gt;I think the more important problem is that it appears lustre_rsync is processing the same ChangeLog records twice.  Is that because lustre_rsync is processing the logs incorrectly, because the &quot;clear logs before record X&quot; request isn&apos;t handled or isn&apos;t persistent (ala &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7428&quot; title=&quot;conf-sanity test_84, replay-dual 0a: /dev/lvm-Role_MDS/P1 failed to initialize!&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7428&quot;&gt;&lt;del&gt;LU-7428&lt;/del&gt;&lt;/a&gt;) across restarts, or because of an internal ChangeLog bug that is returning bogus duplicate records due to ChangeLog index wrap, bad llog handling, etc?&lt;/p&gt;</comment>
                            <comment id="154804" author="jhammond" created="Mon, 6 Jun 2016 18:59:31 +0000"  >&lt;p&gt;&amp;gt; For example, the file &quot;STUDENTS.TV&quot; FID &lt;span class=&quot;error&quot;&gt;&amp;#91;0x200006990:0x414:0x0&amp;#93;&lt;/span&gt; is created in record 5056 but that record appears twice in the log and the second time produces the -EEXIST error:&lt;/p&gt;

&lt;p&gt;This is because &lt;tt&gt;lustre_rsync&lt;/tt&gt; is invoked with &lt;tt&gt;-c no&lt;/tt&gt;, which tells it not to clear the changelog when done.&lt;/p&gt;</comment>
                            <comment id="154814" author="gerrit" created="Mon, 6 Jun 2016 20:10:00 +0000"  >&lt;p&gt;John L. Hammond (john.hammond@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/20649&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/20649&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4781&quot; title=&quot;lustre-rsync-test test_2b: Replication of operation failed(-17)&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4781&quot;&gt;&lt;del&gt;LU-4781&lt;/del&gt;&lt;/a&gt; utils: handle EEXIST in lustre_rsync&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: bb459c9631361cb28bd8fe367de99603e3d67ff0&lt;/p&gt;</comment>
                            <comment id="155465" author="yong.fan" created="Mon, 13 Jun 2016 01:29:14 +0000"  >&lt;p&gt;Another failure instance on master:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/f7240550-308a-11e6-a0ce-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/f7240550-308a-11e6-a0ce-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="155632" author="gerrit" created="Tue, 14 Jun 2016 03:56:21 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;http://review.whamcloud.com/20649/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/20649/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4781&quot; title=&quot;lustre-rsync-test test_2b: Replication of operation failed(-17)&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4781&quot;&gt;&lt;del&gt;LU-4781&lt;/del&gt;&lt;/a&gt; utils: handle EEXIST in lustre_rsync&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 2a55f34bd5514e905f6049f4e293352bd09ffe91&lt;/p&gt;</comment>
                            <comment id="155707" author="jgmitter" created="Tue, 14 Jun 2016 22:49:50 +0000"  >&lt;p&gt;2nd patch has landed to master for 2.9&lt;/p&gt;</comment>
                            <comment id="166650" author="bogl" created="Wed, 21 Sep 2016 02:28:42 +0000"  >&lt;p&gt;more on b2_8_fe:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/885f908c-7f8e-11e6-8a8c-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/885f908c-7f8e-11e6-8a8c-5254006e85c2&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/25fa9bc8-7ec9-11e6-8afd-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/25fa9bc8-7ec9-11e6-8afd-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                                        </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="16293">LU-2121</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="22040">LU-4256</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="14657">LU-1458</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="30368">LU-6644</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzwhsf:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>13146</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>