<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:45:51 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-11663] corrupt data after page-unaligned write with zfs backend lustre 2.10</title>
                <link>https://jira.whamcloud.com/browse/LU-11663</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;The apparent contents of a file change after dropping caches:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@catalyst110:toss-4371.umm1t]# ./proc6.olaf
+ dd if=/dev/urandom of=testfile20K.in bs=10240 count=2
2+0 records in
2+0 records out
20480 bytes (20 kB) copied, 0.024565 s, 834 kB/s
+ dd if=testfile20K.in of=testfile20K.out bs=10240 count=2
2+0 records in
2+0 records out
20480 bytes (20 kB) copied, 0.0451045 s, 454 kB/s
++ md5sum testfile20K.out
+ original_md5sum=&apos;1060a4c01a415d7c38bdd00dcf09dd22  testfile20K.out&apos;
+ echo 3
++ md5sum testfile20K.out
+ echo after drop_caches 1060a4c01a415d7c38bdd00dcf09dd22 testfile20K.out 717122f4dd25f2e75834a8b21c79ce50 testfile20K.out
after drop_caches 1060a4c01a415d7c38bdd00dcf09dd22 testfile20K.out 717122f4dd25f2e75834a8b21c79ce50 testfile20K.out                                                                        

[root@catalyst110:toss-4371.umm1t]# cat proc6.olaf
#!/bin/bash

set -x

dd if=/dev/urandom of=testfile.in bs=10240 count=2
dd if=testfile.in of=testfile.out bs=10240 count=2

#dd if=/dev/urandom of=testfile.in bs=102400 count=2
#dd if=testfile.in of=testfile.out bs=102400 count=2
original_md5sum=$(md5sum testfile.out)
echo 3 &amp;gt;/proc/sys/vm/drop_caches

echo after drop_caches $original_md5sum $(md5sum testfile.out)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment>client catalyst: lustre-2.8.2_5.chaos-1.ch6.x86_64&lt;br/&gt;
server: porter lustre-2.10.5_2.chaos-3.ch6.x86_64&lt;br/&gt;
kernel-3.10.0-862.14.4.1chaos.ch6.x86_64 (RHEL 7.5 derivative)&lt;br/&gt;
</environment>
        <key id="54018">LU-11663</key>
            <summary>corrupt data after page-unaligned write with zfs backend lustre 2.10</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="bzzz">Alex Zhuravlev</assignee>
                                    <reporter username="ofaaland">Olaf Faaland</reporter>
                        <labels>
                            <label>llnl</label>
                    </labels>
                <created>Wed, 14 Nov 2018 01:24:15 +0000</created>
                <updated>Mon, 17 Dec 2018 23:34:21 +0000</updated>
                            <resolved>Fri, 30 Nov 2018 18:30:52 +0000</resolved>
                                    <version>Lustre 2.12.0</version>
                    <version>Lustre 2.10.5</version>
                    <version>Lustre 2.10.6</version>
                                    <fixVersion>Lustre 2.12.0</fixVersion>
                    <fixVersion>Lustre 2.10.6</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>16</watches>
                                                                            <comments>
                            <comment id="236960" author="ofaaland" created="Wed, 14 Nov 2018 01:32:21 +0000"  >&lt;p&gt;Console log reports no errors.&#160; Only that one lustre file system is mounted, and there are no issues with it or the network in between.&#160; No servers were in recovery, starting, or stopping at the time of the example above.&lt;/p&gt;

&lt;p&gt;With this file, the symptoms are 100% reproducible, so I can gather debug logs as required.&#160;&#160; What would you like - rpctrace? vfstrace?&#160; dlmtrace?&lt;/p&gt;</comment>
                            <comment id="236961" author="ofaaland" created="Wed, 14 Nov 2018 01:32:36 +0000"  >&lt;p&gt;This is a production file system.&lt;/p&gt;</comment>
                            <comment id="236964" author="adilger" created="Wed, 14 Nov 2018 01:51:44 +0000"  >&lt;p&gt;My first suggestion would be to check the strace output of &quot;cp&quot; to see if it is over-optimizing the file copy based on the stat() or FIEMAP output?  There was a bug in cp that it wouldn&apos;t try to copy data if stat reported blocks = 0. We fixed that in Lustre by always reporting blocks = 1 if the file had dirty data, but maybe that patch is not in your version?  &lt;/p&gt;

&lt;p&gt;Next might be that the peer client doing the cp is not getting any block count from the glimpse request, so the workaround was working on the local node that originally wrote the file but not the other clients. We &lt;em&gt;should&lt;/em&gt; be returning some block count estimate from the original writing client to the peer doing the cp, but it is possible that is missing/broken?&lt;/p&gt;</comment>
                            <comment id="236971" author="adilger" created="Wed, 14 Nov 2018 07:25:25 +0000"  >&lt;p&gt;Also, what version of &lt;tt&gt;coreutils&lt;/tt&gt; are you running? &lt;/p&gt;</comment>
                            <comment id="236986" author="ofaaland" created="Wed, 14 Nov 2018 16:58:54 +0000"  >&lt;p&gt;I assume this was the product of a spell checker:&lt;/p&gt;
&lt;blockquote&gt;&lt;p&gt;Also, what version of &lt;tt&gt;Corey told&lt;/tt&gt; are you running?&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;But if not, tell me what it is &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;.  I added the kernel version to the environment section above.&lt;/p&gt;</comment>
                            <comment id="236993" author="ofaaland" created="Wed, 14 Nov 2018 18:20:08 +0000"  >&lt;p&gt;more versions:&lt;/p&gt;

&lt;p&gt;tar-1.26-34.el7.x86_64&lt;br/&gt;
coreutils-8.22-21.el7.x86_64&lt;br/&gt;
(aka Corey told)&lt;/p&gt;</comment>
                            <comment id="236994" author="ofaaland" created="Wed, 14 Nov 2018 18:24:37 +0000"  >&lt;p&gt;It looks like cp in the description is a red herring, I&apos;ll update the description with a simpler reproducer.  tar and md5sum are enough to see the issue, but it does take two nodes.  tar does not issue a fiemap according to strace.&lt;/p&gt;</comment>
                            <comment id="237001" author="ofaaland" created="Wed, 14 Nov 2018 21:44:28 +0000"  >&lt;p&gt;It depends on the block size used when writing.  bs=10240 triggers the problem and the checksums do not match, but bs=102400 does not trigger the problem.&lt;/p&gt;
</comment>
                            <comment id="237003" author="green" created="Wed, 14 Nov 2018 22:36:12 +0000"  >&lt;p&gt;I see you are not calculatign the checksum of the out file before drop caches? why?&lt;/p&gt;

&lt;p&gt;Also I do wonder if you insert a sync between the two dds, would it make any difference? (this is mostly playing into the same idea Andreas has about the fiemap). Also please consider capturing both files so we can examine what&apos;s different about them.&lt;/p&gt;</comment>
                            <comment id="237005" author="ofaaland" created="Wed, 14 Nov 2018 22:49:37 +0000"  >&lt;p&gt;Before the drop caches, the md5sum of testfile.in and testfile.out are the same.  It&apos;s not in that particular example, but it&apos;s been verified.    We&apos;ve tried the sync you proposed, and that did not alter the behavior.&lt;/p&gt;

&lt;p&gt;I have altered my test to create the file on NFS originally, which is not exhibiting this behavior.  I checksum it there, and create a hexdump of it, and then use dd to copy its data to a file on the lustre 2.10 file system, and hexdump and checksum it there.&lt;/p&gt;

&lt;p&gt;Before the drop_caches, the md5sum and hexdump match that of the version on NFS.  After the drop caches, they do not.&lt;/p&gt;

&lt;p&gt;Looking at the diffs of the hexdumps, the differences are not the same WRT location in the file or in the contents.  Sometimes the damaged file has all 0&apos;s, sometimes it has visible structure,  and sometimes the new data does not have visible structure.&lt;/p&gt;</comment>
                            <comment id="237006" author="ofaaland" created="Wed, 14 Nov 2018 23:03:48 +0000"  >&lt;p&gt;My earlier comment is not quite right.  There is a pattern to the location when I test with the same file size and read/write request size.  Using bs=10240 and count=2, the first difference always appears at offset 0x2800 or 10240 (ie at the boundary of the requests)&lt;/p&gt;

&lt;p&gt;The data that is found at offset 0x2800 is different every time I issue dd to create a new file, but that offset is where the difference starts.&lt;/p&gt;</comment>
                            <comment id="237007" author="ofaaland" created="Wed, 14 Nov 2018 23:14:18 +0000"  >&lt;blockquote&gt;&lt;p&gt;I have altered my test to create the file on NFS originally, which is not exhibiting this behavior. I checksum it there, and create a hexdump of it, and then use dd to copy its data to a file on the lustre 2.10 file system, and hexdump and checksum it there.&lt;/p&gt;

&lt;p&gt;Before the drop_caches, the md5sum and hexdump match that of the version on NFS. After the drop caches, they do not.&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;This means that the data cached when the writes occurred is the good data, but that was sent back by the OST is bad, correct?  I&apos;ll go look at the data on the OST to see what it looks like.&lt;/p&gt;</comment>
                            <comment id="237011" author="green" created="Thu, 15 Nov 2018 00:24:41 +0000"  >&lt;p&gt;btw the script you are providing appears to be single node, but in the comment you say this requires two nodes. What&apos;s the second node for?&lt;/p&gt;</comment>
                            <comment id="237017" author="ofaaland" created="Thu, 15 Nov 2018 06:06:00 +0000"  >&lt;blockquote&gt;&lt;p&gt;btw the script you are providing appears to be single node, but in the comment you say this requires two nodes. What&apos;s the second node for?&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;Originally we reproduced the problem using two nodes; one to write the data and another to read and checksum it, to detect the problem.&#160; &#160;Once we started dropping caches, we did not need a second node.&lt;/p&gt;</comment>
                            <comment id="237041" author="ofaaland" created="Thu, 15 Nov 2018 17:38:18 +0000"  >&lt;p&gt;I haven&apos;t found the objects on disk, going back to that in a minute.  But from the client, with a sample 100k test file, copies made via dd with bs=10240 always have damage in the following extents (offsets, in hex).  The actual content of the damaged areas is different every time.&lt;br/&gt;
0002800 - 0002fff&lt;br/&gt;
0007800 - 0007fff&lt;br/&gt;
000c800 - 000cfff&lt;br/&gt;
0011800 - 0011fff&lt;br/&gt;
0016800 - 0016fff&lt;/p&gt;

&lt;p&gt;The rest of the file is correct.&lt;br/&gt;
PAGESIZE is 4096&lt;/p&gt;</comment>
                            <comment id="237050" author="ofaaland" created="Thu, 15 Nov 2018 18:35:42 +0000"  >&lt;p&gt;I mounted the file system from one of the OSS nodes (porter), so that the client is the same version (lustre-2.10.5_2.chaos-3.ch6.x86_64) as all the servers and the client communicates directly with the servers, not through routers.&lt;br/&gt;
On catalyst, the lustre 2.8 compute cluster, I created a file using dd and bs=10240 as described above.&lt;/p&gt;

&lt;p&gt;When I read the file from the client mounted on the OSS, I see the corrupted data.&lt;/p&gt;

&lt;p&gt;This seems to me to indicate that the problem is occurring in the write path, not the read path.  Does that make sense?&lt;/p&gt;</comment>
                            <comment id="237067" author="sarah" created="Thu, 15 Nov 2018 22:16:26 +0000"  >&lt;p&gt;cannot reproduce it with tip of master (build 3826 el7.5 . kernel-3.10.0-862.14.4.el7_lustre.x86_64) server and 2.8.0 client &lt;br/&gt;
 2 MDS with 1 MDT on each; 1 OSS with 2 OSTs, ldiskfs&lt;br/&gt;
 1 client&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@trevis-60vm4 lustre]# ./rp.sh 
+ dd if=/dev/urandom of=testfile.in bs=10240 count=2
2+0 records in
2+0 records out
20480 bytes (20 kB) copied, 0.00276562 s, 7.4 MB/s
+ dd if=testfile.in of=testfile.out bs=10240 count=2
2+0 records in
2+0 records out
20480 bytes (20 kB) copied, 0.00142726 s, 14.3 MB/s
++ md5sum testfile.out
+ original_md5sum=&apos;f6bcdb9f1b674d29cd313a46a1c0cedb  testfile.out&apos;
+ echo 3
[ 1748.385888] rp.sh (21490): drop_caches: 3
++ md5sum testfile.out
+ echo after drop_caches f6bcdb9f1b674d29cd313a46a1c0cedb testfile.out f6bcdb9f1b674d29cd313a46a1c0cedb testfile.out
after drop_caches f6bcdb9f1b674d29cd313a46a1c0cedb testfile.out f6bcdb9f1b674d29cd313a46a1c0cedb testfile.out
[root@trevis-60vm4 lustre]# ls
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="237070" author="ofaaland" created="Thu, 15 Nov 2018 23:12:25 +0000"  >&lt;p&gt;Sarah,&lt;br/&gt;
If there&apos;s any information I can provide let me know.  Thanks.&lt;/p&gt;</comment>
                            <comment id="237080" author="adilger" created="Fri, 16 Nov 2018 03:29:35 +0000"  >&lt;p&gt;Olaf, as Sarah is having trouble to reproduce this, can you please run a test with -1 debug on the client?  My first guess is that this is somehow related to the client IO stack. Given that there would only be a handful of operations in the log it shouldn&apos;t be too bad to look through. &lt;/p&gt;</comment>
                            <comment id="237081" author="adilger" created="Fri, 16 Nov 2018 03:34:43 +0000"  >&lt;p&gt;I guess the other question is whether you tried running the reproducer on some previous version on the client?  Is it possible that this is a newly introduced problem?  It seems a bit strange that there would be a problem like this going unnoticed since 2.8 was released.  &lt;/p&gt;</comment>
                            <comment id="237100" author="ofaaland" created="Fri, 16 Nov 2018 17:41:55 +0000"  >&lt;p&gt;In testing since yesterday I&apos;m sometimes finding the corruption does not occur - that is, if I run the same reproduce 60 times in a row on the same client, for example, it may show corruption 50 times in a row and then show no corruption for the last 10.&lt;/p&gt;

&lt;p&gt;I attached&#160;for-upload-lu-11663.tar.bz2 which has -1 debug logs for 3 attempts, along with the terminal output when I ran the reproducer and an index matching the results to the log files.&#160; I run lctl dk before each attempt and after, so there are 6 log files.&lt;/p&gt;

&lt;p&gt;After the first attempt, which shows the corruption, I umount all the lustre file systems and then mount them again.&#160; I then run the same reproducer twice and no corruption occurs.&#160; I&apos;m not sure whether that&apos;s due to the umount/remount or not.&lt;/p&gt;</comment>
                            <comment id="237101" author="ofaaland" created="Fri, 16 Nov 2018 17:43:15 +0000"  >&lt;blockquote&gt;&lt;p&gt;I guess the other question is whether you tried running the reproducer on some previous version on the client? Is it possible that this is a newly introduced problem? It seems a bit strange that there would be a problem like this going unnoticed since 2.8 was released.&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;I agree.&#160; I&apos;ll try that.&lt;/p&gt;</comment>
                            <comment id="237216" author="ofaaland" created="Mon, 19 Nov 2018 22:24:42 +0000"  >&lt;p&gt;I&apos;m still working on trying previous client versions.  I should have at least one other version tested today.&lt;/p&gt;

&lt;p&gt; For context, this issue has been observed on client cluster catalyst, which mounts three lustre file systems.&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;lustre3 hosted on porter.  This is lustre 2.10.5 based.&lt;/li&gt;
	&lt;li&gt;lustre1 hosted on copper.  This is lustre 2.8.2 based.&lt;/li&gt;
	&lt;li&gt;lscratchh hosted on zinc.  This is lustre 2.8.2 based.&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;Connections are through routers.  The routers in catalyst are the same version as the clients.  All nodes are x86_64.  I don&apos;t recall the IB-to-IP router nodes lustre or kernel versions but can find out.&lt;/p&gt;

&lt;p&gt;catalyst-compute &amp;lt;&lt;del&gt;&amp;gt; catalyst-router &amp;lt;&lt;/del&gt;&amp;gt; lustre3&lt;br/&gt;
catalyst-compute &amp;lt;&lt;del&gt;&amp;gt; catalyst-router &amp;lt;&lt;/del&gt;&amp;gt; IB-to-IP-router &amp;lt;&lt;del&gt;&amp;gt; IP-to-IB-router &amp;lt;&lt;/del&gt;&amp;gt; (lustre1 and lscratchh)&lt;/p&gt;

&lt;p&gt;We have observed this issue only on lustre3 so far.&lt;/p&gt;

&lt;p&gt;During testing this weekend I ran two 1000-iteration test sets on 20 dedicated catalyst nodes.  During both sets:&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;one node, catalyst110, reproduced the problem &amp;gt; 95% of the time&lt;/li&gt;
	&lt;li&gt;a different node reproduced the problem about 15% of the time&lt;/li&gt;
	&lt;li&gt;fifteen nodes never reproduced the problem&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;In the first test set, I only ran the reproducer against lustre3, where the issue was first identified last week.&lt;br/&gt;
In the second test set, I ran the reproducer first against lustre3 and then against lustre1.  The problem was reproduced only with lustre3, the 2.10 file system.  It was never reproduced with lustre1.&lt;/p&gt;</comment>
                            <comment id="237228" author="sarah" created="Tue, 20 Nov 2018 03:41:05 +0000"  >&lt;p&gt;I downloaded lustre-2.10.5_2.chaos.tar.gz  and lustre-2.8.2_1.chaos.tar.gz (cannot find 2.8.2_5) from &lt;a href=&quot;https://github.com/LLNL/lustre/releases&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/LLNL/lustre/releases&lt;/a&gt;, compile them and cannot reproduce.&lt;/p&gt;

&lt;p&gt;Server: compile lustre-2.10.5_2.chaos on kernel 3.10.0-862.14.4.el7_lustre.x86_64&lt;br/&gt;
1 MDT, 1 OST on single node&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@trevis-60vm1 utils]# uname -a
Linux trevis-60vm1.trevis.whamcloud.com 3.10.0-862.14.4.el7_lustre.x86_64 #1 SMP Thu Nov 8 07:41:43 UTC 2018 x86_64 x86_64 x86_64 GNU/Linux
[root@trevis-60vm1 utils]#
[root@trevis-60vm1 ~]# lu-11663/lustre-2.10.5_2.chaos/lustre/utils/lctl get_param version
version=2.10.5
[root@trevis-60vm1 ~]#

[root@trevis-60vm1 ~]# rpm -qa|grep zfs
libzfs2-0.7.9-1.el7.x86_64
kmod-lustre-osd-zfs-2.11.56_140_g2339e1b-1.el7.x86_64
libzfs2-devel-0.7.9-1.el7.x86_64
lustre-osd-zfs-mount-2.11.56_140_g2339e1b-1.el7.x86_64
zfs-0.7.9-1.el7.x86_64
...
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Single client, compile lustre-2.8.2_1.chaos on kernel 3.10.0-327.3.1.el7.x86_64(2.8.0 kernel)&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@trevis-60vm4 ~]# lu-11663/lustre-2.8.2_1.chaos/lustre/utils/lctl get_param version
version=lustre: 2.8.2
kernel: patchless_client
build:  2.8.2
[root@trevis-60vm4 ~]#
[root@trevis-60vm4 ~]# uname -a
Linux trevis-60vm4.trevis.whamcloud.com 3.10.0-327.3.1.el7.x86_64 #1 SMP Fri Nov 20 05:40:26 EST 2015 x86_64 x86_64 x86_64 GNU/Linux
[root@trevis-60vm4 ~]#
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;result&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@trevis-60vm4 ~]# cd /mnt/lustre/
[root@trevis-60vm4 lustre]# sh foo.sh 
+ dd if=/dev/urandom of=testfile.in bs=10240 count=2
2+0 records in
2+0 records out
20480 bytes (20 kB) copied, 0.00236147 s, 8.7 MB/s
+ dd if=testfile.in of=testfile.out bs=10240 count=2
2+0 records in
2+0 records out
20480 bytes (20 kB) copied, 0.00111802 s, 18.3 MB/s
++ md5sum testfile.out
+ original_md5sum=&apos;20dd24fb015feb7de67bbdc12f2c16bf  testfile.out&apos;
+ echo 3
++ md5sum testfile.out
+ echo after drop_caches 20dd24fb015feb7de67bbdc12f2c16bf testfile.out 20dd24fb015feb7de67bbdc12f2c16bf testfile.out
after drop_caches 20dd24fb015feb7de67bbdc12f2c16bf testfile.out 20dd24fb015feb7de67bbdc12f2c16bf testfile.out
[root@trevis-60vm4 lustre]#
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Also tried client with branch: origin/2.8.2-llnl from fs/lustre-release-fe-llnl&lt;br/&gt;
top commit 8356dd88e2e59edd1462bb4647f61d5a210d4262&lt;br/&gt;
run reproducer 10 times, cannot reproduce.&lt;/p&gt;</comment>
                            <comment id="237271" author="ofaaland" created="Tue, 20 Nov 2018 20:42:59 +0000"  >&lt;p&gt;Thanks, for trying those, Sarah.  Can you suggest information to capture from my clients where the problem is reproducing?  As I mentioned, even on my cluster during testing over the weekend, some nodes reproduced reliably, but many never did.&lt;/p&gt;</comment>
                            <comment id="237320" author="ofaaland" created="Wed, 21 Nov 2018 07:06:39 +0000"  >&lt;p&gt;I&apos;ve had issues getting other versions of the client to work due to changes in IB with map_on_demand, peer_credits, etc. in recent versions, but I think I&apos;m past that.&lt;/p&gt;

&lt;p&gt;Today I reproduced the issue with client version&#160;2.8.2_2.chaos.&#160; I&apos;ll try with earlier and later clients tomorrow.&lt;/p&gt;</comment>
                            <comment id="237425" author="pjones" created="Sun, 25 Nov 2018 15:19:05 +0000"  >&lt;p&gt;Olaf&lt;/p&gt;

&lt;p&gt;Is there some pattern around which nodes can hit this issue vs those that don&apos;t?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="237427" author="ofaaland" created="Sun, 25 Nov 2018 19:41:22 +0000"  >&lt;blockquote&gt;&lt;p&gt;Is there some pattern around which nodes can hit this issue vs those that don&apos;t?&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;Not that I&apos;ve been able to find.&lt;/p&gt;</comment>
                            <comment id="237461" author="ofaaland" created="Mon, 26 Nov 2018 17:47:33 +0000"  >&lt;blockquote&gt;&lt;p&gt; Can you suggest information to capture from my clients where the problem is reproducing? As I mentioned, even on my cluster during testing over the weekend, some nodes reproduced reliably, but many never did.&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;Poke&lt;/p&gt;</comment>
                            <comment id="237477" author="green" created="Mon, 26 Nov 2018 21:40:48 +0000"  >&lt;p&gt;I reviewed the -1 logs.&lt;/p&gt;

&lt;p&gt;Interesting observations I have:&lt;br/&gt;
1. The before unmount, you had some strange grant shortage, what this means is every write was actually synchronous. You can track this by messages that say this:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000008:00000020:37.0:1542388236.271202:0:6433:0:(osc_cache.c:1608:osc_enter_cache()) lustre3-OST002a-osc-ffff8a1c6ed62800: grant { dirty: 0/8192 dirty_pages: 0/16449536 dropped: 0 avail: 0, reserved: 0, flight: 0 }lru {in list: 0, left: 3, waiters: 0 }no grant space, fall back to sync i/o
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;2. so this leads to a sync write in a middle of a page, twice.&lt;br/&gt;
3. This is actually got both .in and .out file, but only .out file is somehow damaged, huh?&lt;br/&gt;
4. We know that we are writing the correct data to the server because we can observe both write requests, to .in and .out files and the checksum comes the same, see the &quot;checksum at write origin&quot; message repeated twice for the same request.&lt;br/&gt;
We cannot see if it&apos;s what was read, though, because the final read comes after readahead so all 4 pages are read in one go and the checksum is not comparable (interesting experiment would have been to disable readahead or do directio reads or some such to see if the bad data comes straight from the server, which I think it does, but we cannot be 100% sure).&lt;/p&gt;

&lt;p&gt;Now looking at the successful iterations after remounts, there are two major differences there:&lt;br/&gt;
1. There&apos;s plenty of grant so no sync writes are happening.&lt;br/&gt;
2. The drop caches does nothing, there are NO write RPCs in those locks (grep for &apos;o4-&apos; to confirm). There are no reads either (grep for &apos;checksum .* confirmed&apos; you see only two requests with fffffff checksum, that&apos;s the empty read at EOF).&lt;/p&gt;

&lt;p&gt;these two thigns combined mean that whatever corruption you had, even if it&apos;s happening, would not be seen.&lt;/p&gt;

&lt;p&gt;Anyway my current conclusion is the corruption is actually happening on the server, it could be the disk or Lustre somewhere, I don&apos;t know about that, but the client seems to be doing everything ok.&lt;/p&gt;

&lt;p&gt;As such I suspect we would need client+server logs of a reproduced case. Also please include both .in and .out files so we can compare them. It looks like to facilitate better reproducing you might want to dramatically shrink grant availability somehow (is the fs more prone to this is mostly full? quotas that are getting lowish in place?). I do wonder if the same thing happens when you use directio straight from dd, but since it&apos;s not page-aligned, that cannot happen and we have no easy way of triggering the sync io otherwise, huh.&lt;/p&gt;

&lt;p&gt;I&apos;ll see if I can find a way to trigger sync io deliberately.&lt;/p&gt;</comment>
                            <comment id="237492" author="ofaaland" created="Tue, 27 Nov 2018 01:14:37 +0000"  >&lt;p&gt;Oleg, I&apos;ve uploaded lu-11663-2018-11-26.tgz which contains the test files and debug logs on both client and server during two tests; one iteration that reproduces the issue, on client catalyst101, and one where the corruption does not occur, on client catalyst106.  There&apos;s a typescript file that shows the output of the test as it ran.  In both cases the stripe index of the files is 0.&lt;/p&gt;

&lt;p&gt;The node which fails the test takes much longer to write the data, consistent with the sync writes you saw in the last debug logs.&lt;/p&gt;

&lt;p&gt;The file system where this is occurring is 28% full, with individual OSTs ranging from 25% full to 31% full.&lt;br/&gt;
The amount of data I personally have stored on each OST ranges from 23M to 308M; there are 80 OSTs.  My total usage is 5.37G and total quota is 18T. lfs quota says total allocated block limit is 5T, and each OST reports a limit of 64G.&lt;/p&gt;
</comment>
                            <comment id="237493" author="adilger" created="Tue, 27 Nov 2018 01:42:36 +0000"  >&lt;p&gt;Olaf, I think Oleg was referring to the space grant, which can be seen on the OSS with &quot;&lt;tt&gt;lctl get_param obdfilter.&amp;#42;.tot_granted&lt;/tt&gt;&quot; and the amount granted to the client with &quot;&lt;tt&gt;lctl get_param osc.&amp;#42;.cur_grant_bytes&lt;/tt&gt;&quot; (probably only for the OST the file was striped over. Also useful would be &quot;&lt;tt&gt;lctl get_param osc.&amp;#42;.max_dirty_mb&lt;/tt&gt;&quot;. &lt;/p&gt;</comment>
                            <comment id="237494" author="green" created="Tue, 27 Nov 2018 03:30:40 +0000"  >&lt;p&gt;Ok, I can reproduce this on master now too. There are two requirements: sync writes due to lack of grant/quota and ZFS. ldiskfs works fine.&lt;/p&gt;

&lt;p&gt;In order to force the lack of quota codepath we can use the 0x411 failloc on the client like this: lctl set_param fail_loc=0x411&lt;/p&gt;

&lt;p&gt;Then run the original inspired script in a lustre dir:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
dd &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt;=/dev/urandom of=testfile.in bs=10240 count=2 
dd &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt;=testfile.in of=testfile.out bs=10240 count=2
original_md5sum=$(md5sum testfile.in)
echo 3 | sudo tee /proc/sys/vm/drop_caches ; sleep 2
md5sum=$(md5sum testfile.out)
echo after drop_caches $md5sum before $original_md5sum
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Set this and you&apos;ll see the problem 100% of the time. What&apos;s interesting is doing oflags=sync to dd does not help as it still results in full page writes in RPC for partial page writes on VFS side.&lt;/p&gt;

&lt;p&gt;It appears that the problem is either in ZFS or more likely, in osd-zfs, where when a partial page write happens, the previous content of the page is not read from disk and so we just update the partial content we got in the RPC, but overwrite whatever was supposed to be there in the part that we are not overwriting.&lt;/p&gt;

&lt;p&gt;Comparing osd_write_prep, we can see it&apos;s a noop in osd_zfs, but in osd_zfs it actually prereads all partial pages. On the other hand osd_write in osd_zfs uses dmu_write(by_node) with offset so perhaps it&apos;s expected that zfs is expected to do this?&lt;/p&gt;

&lt;p&gt;Either way at least it&apos;s clear what&apos;s going on now, hence this update.&lt;/p&gt;</comment>
                            <comment id="237495" author="green" created="Tue, 27 Nov 2018 03:50:32 +0000"  >&lt;p&gt;Shortest reproducer:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lctl set_param fail_loc=0x411
dd if=/dev/urandom of=testfile.in bs=10240 count=2
md5sum testfile.in
lctl set_param ldlm.namespaces.*osc*.lru_size=clear
md5sum testfile.in
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="237496" author="pjones" created="Tue, 27 Nov 2018 04:55:40 +0000"  >&lt;p&gt;Alex&lt;/p&gt;

&lt;p&gt;Can you please investigate?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="237517" author="green" created="Tue, 27 Nov 2018 06:18:02 +0000"  >&lt;p&gt;btw, since we are concentrating this ticket on the data corruption, if you want to pursue why some nodes are stuck with no grant and do not appear to be getting any more grant until remount, you probably should open another ticket for this.&lt;/p&gt;</comment>
                            <comment id="237535" author="paf" created="Tue, 27 Nov 2018 15:43:46 +0000"  >&lt;p&gt;Olaf,&lt;/p&gt;

&lt;p&gt;If a bug is opened for the grant issue, could you tag me on it?&#160; Thx.&lt;/p&gt;</comment>
                            <comment id="237634" author="pjones" created="Thu, 29 Nov 2018 05:37:26 +0000"  >&lt;p&gt;Strange. Alex&apos;s patch did not get an auto comment - &lt;a href=&quot;https://review.whamcloud.com/#/c/33726/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/33726/&lt;/a&gt;.&#160;As I understand it, this patch seems to be holding up well &#160;against the reproducer but the test cases need some refinement. Are we now at the point when LLNL can use a b2_10 port of this patch on their affected filesystem?&#160;&lt;/p&gt;</comment>
                            <comment id="237635" author="gerrit" created="Thu, 29 Nov 2018 05:46:07 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/33748&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/33748&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11663&quot; title=&quot;corrupt data after page-unaligned write with zfs backend lustre 2.10&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11663&quot;&gt;&lt;del&gt;LU-11663&lt;/del&gt;&lt;/a&gt; osd-zfs: write partial pages with correct offset&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_10&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 6f9a0292eacb0d603b14cc03290a574cb7f0c846&lt;/p&gt;</comment>
                            <comment id="237688" author="bzzz" created="Thu, 29 Nov 2018 18:07:51 +0000"  >&lt;p&gt;there are two options here: 1) revert &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10683&quot; title=&quot;write checksum errors&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10683&quot;&gt;&lt;del&gt;LU-10683&lt;/del&gt;&lt;/a&gt; (but potentially get bad RPC checksum messages back) 2) apply &lt;a href=&quot;https://review.whamcloud.com/#/c/33726/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/33726/&lt;/a&gt; patch which is still under testing.&lt;br/&gt;
both options have worked against our reproducer (see in option #2 patch) on b2_10&lt;br/&gt;
we are still investigating the root cause for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10683&quot; title=&quot;write checksum errors&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10683&quot;&gt;&lt;del&gt;LU-10683&lt;/del&gt;&lt;/a&gt; (bad checksums)&lt;/p&gt;</comment>
                            <comment id="237741" author="lixi_wc" created="Fri, 30 Nov 2018 16:37:13 +0000"  >&lt;p&gt;I feel between the two options that Alex pointed out, reverting the patch of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10683&quot; title=&quot;write checksum errors&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10683&quot;&gt;&lt;del&gt;LU-10683&lt;/del&gt;&lt;/a&gt; is not a good one. The lnb_page_offset should be the same with the client side page offset in &apos;struct brw_page&apos;, shouldn&apos;t it? It doesn&apos;t feel right to move the data to the offset 0 of a page when the data has an offset in the page.&lt;/p&gt;</comment>
                            <comment id="237742" author="bzzz" created="Fri, 30 Nov 2018 16:40:30 +0000"  >&lt;p&gt;well, from filesystem point of view, there is no requirement to use same page offset. moreover, client and server may have different pagesize, which makes it impossible to match offset, right?&lt;/p&gt;</comment>
                            <comment id="237746" author="lixi_wc" created="Fri, 30 Nov 2018 16:55:27 +0000"  >&lt;p&gt;As commented in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11697&quot; title=&quot;BAD WRITE CHECKSUM with t10ip4K and t10ip512 checksums&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11697&quot;&gt;&lt;del&gt;LU-11697&lt;/del&gt;&lt;/a&gt;, the correct page offset in lnb_page_offset is the reason why &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10683&quot; title=&quot;write checksum errors&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10683&quot;&gt;&lt;del&gt;LU-10683&lt;/del&gt;&lt;/a&gt; patch fixed the RPC checksum error.  Both osc_checksum_bulk() and tgt_checksum_niobuf() assume the page offsets are properly inited and should be equal to each other.&lt;/p&gt;</comment>
                            <comment id="237756" author="gerrit" created="Fri, 30 Nov 2018 18:28:55 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/33726/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/33726/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11663&quot; title=&quot;corrupt data after page-unaligned write with zfs backend lustre 2.10&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11663&quot;&gt;&lt;del&gt;LU-11663&lt;/del&gt;&lt;/a&gt; osd-zfs: write partial pages with correct offset&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: c038909fbc2b3b14763dd731e9c877d11d338f04&lt;/p&gt;</comment>
                            <comment id="237757" author="gerrit" created="Fri, 30 Nov 2018 18:29:04 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/33748/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/33748/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11663&quot; title=&quot;corrupt data after page-unaligned write with zfs backend lustre 2.10&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11663&quot;&gt;&lt;del&gt;LU-11663&lt;/del&gt;&lt;/a&gt; osd-zfs: write partial pages with correct offset&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_10&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 18d6b8fb2c359431a6da57b178ec0845925ea89c&lt;/p&gt;</comment>
                            <comment id="237758" author="pjones" created="Fri, 30 Nov 2018 18:30:52 +0000"  >&lt;p&gt;Fix landed for 2.12 and 2.10.6. Checksum issues for master will be covered under &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11697&quot; title=&quot;BAD WRITE CHECKSUM with t10ip4K and t10ip512 checksums&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11697&quot;&gt;&lt;del&gt;LU-11697&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="238719" author="ofaaland" created="Mon, 17 Dec 2018 23:34:21 +0000"  >&lt;p&gt;Patrick, I opened a ticket re: grant going to 0, it is &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11798&quot; class=&quot;external-link&quot; rel=&quot;nofollow&quot;&gt;https://jira.whamcloud.com/browse/LU-11798&lt;/a&gt;&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="54097">LU-11697</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="54189">LU-11729</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="50865">LU-10683</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="54327">LU-11798</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="31483" name="for-upload-lu-11663.tar.bz2" size="3057381" author="ofaaland" created="Fri, 16 Nov 2018 17:34:44 +0000"/>
                            <attachment id="31528" name="lu-11663-2018-11-26.tgz" size="3721790" author="ofaaland" created="Tue, 27 Nov 2018 00:53:48 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i006c7:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10021"><![CDATA[2]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>