<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:13:22 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-7956] Over softquota file corruption </title>
                <link>https://jira.whamcloud.com/browse/LU-7956</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;We have a confirmed case of over softquota causing file corruptions. We had other reports in the past but couldn&apos;t confirm until now. The test job writes 418 hdf4 files and about 10-15 will be corrupted. We confirmed this using 2 different filesystems. The files are single striped and are written from a single mpi rank.&lt;/p&gt;

&lt;p&gt;In the attached debug file one of the corrupted files was written to nocache-OST0047 at 1459304302&lt;br/&gt;
Also attaching strace of the corrupted file IO.&lt;/p&gt;</description>
                <environment></environment>
        <key id="35715">LU-7956</key>
            <summary>Over softquota file corruption </summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="niu">Niu Yawei</assignee>
                                    <reporter username="mhanafi">Mahmoud Hanafi</reporter>
                        <labels>
                    </labels>
                <created>Wed, 30 Mar 2016 06:24:35 +0000</created>
                <updated>Thu, 14 Jul 2016 22:04:09 +0000</updated>
                            <resolved>Thu, 14 Jul 2016 22:04:09 +0000</resolved>
                                    <version>Lustre 2.5.3</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="147317" author="niu" created="Wed, 30 Mar 2016 07:23:29 +0000"  >&lt;p&gt;Could you verify that if the grace expired and some writes failed with -EDQUOT? I didn&apos;t find anything abnormal in the provided log, could you collect logs on client, MDT and other OSS as well? Thanks.&lt;/p&gt;</comment>
                            <comment id="147361" author="mhanafi" created="Wed, 30 Mar 2016 17:35:51 +0000"  >&lt;p&gt;The grace is not expired, during my testing I was setting softquota and it had 2 week left. The code doesn&apos;t get a -EDQUOT. I know this to be the case, because I tested lowering my hardquota and the code did fail to write and exited with an Error.&lt;/p&gt;
</comment>
                            <comment id="147367" author="mhanafi" created="Wed, 30 Mar 2016 19:31:45 +0000"  >&lt;p&gt;These tests were run on 2.5.3 clients and a 2.7.1 server.&lt;/p&gt;

&lt;p&gt;Attaching  logs from all job clients and mds and oss. There is only one OSS with 6 osts. The corrupted file is called vr014.hdf&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;mhanafi@pfe24:/fsnocache/mhanafi/mas_fstest_template&amp;#93;&lt;/span&gt;$ lfs getstripe vr014.hdf &lt;br/&gt;
vr014.hdf&lt;br/&gt;
lmm_stripe_count:   1&lt;br/&gt;
lmm_stripe_size:    1048576&lt;br/&gt;
lmm_pattern:        1&lt;br/&gt;
lmm_layout_gen:     0&lt;br/&gt;
lmm_stripe_offset:  47&lt;br/&gt;
	obdidx		 objid		 objid		 group&lt;br/&gt;
	    47	         71081	      0x115a9	             0&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;mhanafi@pfe24:/fsnocache/mhanafi/mas_fstest_template&amp;#93;&lt;/span&gt;$ lfs path2fid vr014.hdf &lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;0x200000bd1:0x4df:0x0&amp;#93;&lt;/span&gt;&lt;/p&gt;
</comment>
                            <comment id="147418" author="niu" created="Thu, 31 Mar 2016 07:43:31 +0000"  >&lt;p&gt;When over soft quota, the client may turn to sync write. I guess there could be some defect in the sync write path (either in Lustre or the app) that may cause corruption?&lt;/p&gt;

&lt;p&gt;Could you unset the soft limit but set fail_loc as 0x411 (that will force sync write on client) on all clients to see if the problem can be reproduced as well? Thanks.&lt;/p&gt;</comment>
                            <comment id="147498" author="mhanafi" created="Thu, 31 Mar 2016 20:49:20 +0000"  >&lt;p&gt;setting fail_loc didn&apos;t produce any corrupted files. We have narrowed down the write that is not making it to disk. From the attached strace it would be the second-to-last write.&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;53385 lseek(14, -4194300, SEEK_CUR)     = 4
53385 write(14, &lt;span class=&quot;code-quote&quot;&gt;&quot;\0\20\0\0\0\0\0\36\0\1\0\0\0\312\0\0\0\\\2\276\0\2\0\0\1&amp;amp;\3\230\207P\0j&quot;&lt;/span&gt;..., 198) = 198   &amp;lt;&amp;lt;&amp;lt;&amp;lt;&amp;lt;&amp;lt;------ THIS ONE
53385 lseek(14, 58720256, SEEK_SET)     = 58720256
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The code tries to re-write/update the data header in the file and it is not making it to the disk.&lt;/p&gt;

&lt;p&gt;And from the attached debug it is this section &lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;00000080:00200000:0.0:1459365869.218185:0:62879:0:(file.c:2754:ll_file_seek()) VFS Op:inode=144115238826935519/33554443(ffff880707b8c138), to=4=0x4(1)
00000080:00200000:0.0:1459365869.218189:0:62879:0:(file.c:1142:ll_file_io_generic()) file: vr014.hdf, type: 1 ppos: 4, count: 198
00000020:00200000:0.0:1459365869.218190:0:62879:0:(cl_io.c:236:cl_io_rw_init()) header@ffff8802aa599070[0x0, 14766, [0x200000bd1:0x4df:0x0] hash]
00000020:00200000:0.0:1459365869.218191:0:62879:0:(cl_io.c:236:cl_io_rw_init()) io range: 1 [4, 202) 0 0
00000080:00200000:0.0:1459365869.218192:0:62879:0:(vvp_io.c:1177:vvp_io_init()) [0x200000bd1:0x4df:0x0] ignore/verify layout 0/0, layout version 0 restore needed 0
00020000:00000002:0.0:1459365869.218198:0:62879:0:(lov_offset.c:233:lov_stripe_intersects()) [4-&amp;gt;201] -&amp;gt; [(0) 4-&amp;gt;201 (0)]
00020000:00200000:0.0:1459365869.218201:0:62879:0:(lov_io.c:429:lov_io_iter_init()) shrink: 0 [4, 202)
00000080:00200000:0.0:1459365869.218202:0:62879:0:(lcommon_cl.c:733:ccc_io_one_lock_index()) lock: 2 [0, 0]
00000080:00200000:0.0:1459365869.218205:0:62879:0:(vvp_io.c:615:vvp_io_write_start()) write: [4, 202)
00000080:00200000:0.0:1459365869.218206:0:62879:0:(xattr.c:461:ll_getxattr()) VFS Op:inode=144115238826935519/33554443(ffff880707b8c138), xattr security.capability
00000080:00200000:0.0:1459365869.218207:0:62879:0:(xattr.c:302:ll_getxattr_common()) VFS Op:inode=144115238826935519/33554443(ffff880707b8c138)
00000080:00200000:0.0:1459365869.218209:0:62879:0:(rw.c:202:ll_cl_init()) 0@[0x200000bd1:0x4df:0x0] -&amp;gt; 0 ffff880207b47530 ffff88080f594768
00000080:00000002:0.0:1459365869.218212:0:62879:0:(vvp_io.c:1015:vvp_io_commit_write()) header@ffff8802aa599070[0x0, 14766, [0x200000bd1:0x4df:0x0] hash]
00000080:00000002:0.0:1459365869.218212:0:62879:0:(vvp_io.c:1015:vvp_io_commit_write()) commiting page write
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="147544" author="niu" created="Fri, 1 Apr 2016 04:16:07 +0000"  >&lt;p&gt;From the trace we can see the header [4, 202) was updated twice, first time it&apos;s written into cache (showed in above log you posted), second time it&apos;s synced written back. Perhaps the second write overwrote the first one? Is there anything wrong that prevented the first write from making it to disk?&lt;/p&gt;</comment>
                            <comment id="147551" author="mhanafi" created="Fri, 1 Apr 2016 06:51:59 +0000"  >&lt;p&gt;The file is created and an empty header is written and the file is closed. The file is reopened, the data is written, and finally the header is updated with a pointer to the written data. But we end up with an empty header and data in the file. I don&apos;t see a second write when the file is re-opened the second time.&lt;/p&gt;

</comment>
                            <comment id="147559" author="niu" created="Fri, 1 Apr 2016 09:17:18 +0000"  >&lt;p&gt;This is the first write to [4, 202):&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;00000080:00200000:0.0:1459365868.977428:0:62879:0:(file.c:2754:ll_file_seek()) VFS Op:inode=144115238826935519/33554443(ffff880707b8c138), to=4=0x4(1)
00000080:00200000:0.0:1459365868.977431:0:62879:0:(file.c:1142:ll_file_io_generic()) file: vr014.hdf, type: 1 ppos: 4, count: 198
00000020:00200000:0.0:1459365868.977433:0:62879:0:(cl_io.c:236:cl_io_rw_init()) header@ffff8802aa599070[0x0, 3, [0x200000bd1:0x4df:0x0] hash]
00000020:00200000:0.0:1459365868.977433:0:62879:0:(cl_io.c:236:cl_io_rw_init()) io range: 1 [4, 202) 0 0
00000080:00200000:0.0:1459365868.977434:0:62879:0:(vvp_io.c:1177:vvp_io_init()) [0x200000bd1:0x4df:0x0] ignore/verify layout 0/0, layout version 0 restore needed 0
00020000:00000002:0.0:1459365868.977439:0:62879:0:(lov_offset.c:233:lov_stripe_intersects()) [4-&amp;gt;201] -&amp;gt; [(0) 4-&amp;gt;201 (0)]
00020000:00200000:0.0:1459365868.977441:0:62879:0:(lov_io.c:429:lov_io_iter_init()) shrink: 0 [4, 202)
00000080:00200000:0.0:1459365868.977442:0:62879:0:(lcommon_cl.c:733:ccc_io_one_lock_index()) lock: 2 [0, 0]
00000080:00200000:0.0:1459365868.977446:0:62879:0:(vvp_io.c:615:vvp_io_write_start()) write: [4, 202)
00000080:00200000:0.0:1459365868.977446:0:62879:0:(xattr.c:461:ll_getxattr()) VFS Op:inode=144115238826935519/33554443(ffff880707b8c138), xattr security.capability
00000080:00200000:0.0:1459365868.977447:0:62879:0:(xattr.c:302:ll_getxattr_common()) VFS Op:inode=144115238826935519/33554443(ffff880707b8c138)
00000080:00200000:0.0:1459365868.977449:0:62879:0:(rw.c:202:ll_cl_init()) 0@[0x200000bd1:0x4df:0x0] -&amp;gt; 0 ffff880207b47530 ffff88080f594768
00000080:00000002:0.0:1459365868.977452:0:62879:0:(vvp_io.c:1015:vvp_io_commit_write()) header@ffff8802aa599070[0x0, 3, [0x200000bd1:0x4df:0x0] hash]
00000080:00000002:0.0:1459365868.977452:0:62879:0:(vvp_io.c:1015:vvp_io_commit_write()) commiting page write
00000008:00000002:0.0:1459365868.977453:0:62879:0:(osc_io.c:211:osc_page_touch_at()) stripe KMS not increasing 294-&amp;gt;202 294
00000080:00200000:0.0:1459365868.977459:0:62879:0:(vvp_io.c:133:vvp_io_fini()) [0x200000bd1:0x4df:0x0] ignore/verify layout 0/0, layout version 0 restore needed 0
00000080:00200000:0.0:1459365868.977460:0:62879:0:(file.c:1242:ll_file_io_generic()) iotype: 1, result: 198
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;We can see it&apos;s a cached write.&lt;/p&gt;

&lt;p&gt;This is the second write to [4, 202):&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;00000080:00200000:0.0:1459365869.218185:0:62879:0:(file.c:2754:ll_file_seek()) VFS Op:inode=144115238826935519/33554443(ffff880707b8c138), to=4=0x4(1)
00000080:00200000:0.0:1459365869.218189:0:62879:0:(file.c:1142:ll_file_io_generic()) file: vr014.hdf, type: 1 ppos: 4, count: 198
00000020:00200000:0.0:1459365869.218190:0:62879:0:(cl_io.c:236:cl_io_rw_init()) header@ffff8802aa599070[0x0, 14766, [0x200000bd1:0x4df:0x0] hash]
00000020:00200000:0.0:1459365869.218191:0:62879:0:(cl_io.c:236:cl_io_rw_init()) io range: 1 [4, 202) 0 0
00000080:00200000:0.0:1459365869.218192:0:62879:0:(vvp_io.c:1177:vvp_io_init()) [0x200000bd1:0x4df:0x0] ignore/verify layout 0/0, layout version 0 restore needed 0
00020000:00000002:0.0:1459365869.218198:0:62879:0:(lov_offset.c:233:lov_stripe_intersects()) [4-&amp;gt;201] -&amp;gt; [(0) 4-&amp;gt;201 (0)]
00020000:00200000:0.0:1459365869.218201:0:62879:0:(lov_io.c:429:lov_io_iter_init()) shrink: 0 [4, 202)
00000080:00200000:0.0:1459365869.218202:0:62879:0:(lcommon_cl.c:733:ccc_io_one_lock_index()) lock: 2 [0, 0]
00000080:00200000:0.0:1459365869.218205:0:62879:0:(vvp_io.c:615:vvp_io_write_start()) write: [4, 202)
00000080:00200000:0.0:1459365869.218206:0:62879:0:(xattr.c:461:ll_getxattr()) VFS Op:inode=144115238826935519/33554443(ffff880707b8c138), xattr security.capability
00000080:00200000:0.0:1459365869.218207:0:62879:0:(xattr.c:302:ll_getxattr_common()) VFS Op:inode=144115238826935519/33554443(ffff880707b8c138)
00000080:00200000:0.0:1459365869.218209:0:62879:0:(rw.c:202:ll_cl_init()) 0@[0x200000bd1:0x4df:0x0] -&amp;gt; 0 ffff880207b47530 ffff88080f594768
00000080:00000002:0.0:1459365869.218212:0:62879:0:(vvp_io.c:1015:vvp_io_commit_write()) header@ffff8802aa599070[0x0, 14766, [0x200000bd1:0x4df:0x0] hash]
00000080:00000002:0.0:1459365869.218212:0:62879:0:(vvp_io.c:1015:vvp_io_commit_write()) commiting page write
00020000:00000002:0.0:1459365869.218214:0:62879:0:(lov_merge.c:75:lov_merge_lvb_kms()) MDT ID 0x4df:3025 initial value: s=0 m=9223372036854775808 a=9223372036854775808 c=9223372036854775808 b=0
00020000:00000002:0.0:1459365869.218215:0:62879:0:(lov_merge.c:109:lov_merge_lvb_kms()) MDT ID 0x4df:3025 on OST[47]: s=60436399 m=1459365869 a=1459365869 c=1459365869 b=88072
00000001:04000000:0.0:1459365869.218216:0:62879:0:(osc_quota.c:64:osc_quota_chkdq()) chkdq found noquota &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; user 11312
00000008:04000000:20.0:1459365869.219182:0:5838:0:(osc_request.c:1533:osc_brw_fini_request()) setdq &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; [11312 1125] with valid 0x6f184fb9, flags 1100
00000008:00000002:20.0:1459365869.219185:0:5838:0:(osc_request.c:1960:brw_interpret()) request ffff880f230c2400 aa ffff880f230c26e0 rc 0
00000020:00000002:20.0:1459365869.219335:0:5838:0:(obdo.c:70:obdo_from_inode()) valid 70c, &lt;span class=&quot;code-keyword&quot;&gt;new&lt;/span&gt; time 1459365869/1459365869
00000008:00000002:20.0:1459365869.219346:0:5838:0:(osc_request.c:2225:osc_build_rpc()) @@@ 1 pages, aa ffff880bdb641ae0. now 0r/9w in flight  req@ffff880bdb641800 x1528598526163360/t0(0) o4-&amp;gt;nocache-OST002f-osc-ffff88082f125000@10.151.26.123@o2ib:6/4 lens 488/448 e 0 to 0 dl 0 ref 2 fl New:/0/ffffffff rc 0/-1
00000100:00100000:20.0:1459365869.219356:0:5838:0:(client.c:1872:ptlrpc_check_set()) Completed RPC pname:cluuid:pid:xid:nid:opc ptlrpcd_29:09656603-58f6-edb5-effe-53c13e5a5741:5838:1528598526163252:10.151.26.123@o2ib:4
00000100:00100000:24.0:1459365869.219357:0:5829:0:(client.c:1489:ptlrpc_send_new_req()) Sending RPC pname:cluuid:pid:xid:nid:opc ptlrpcd_20:09656603-58f6-edb5-effe-53c13e5a5741:5829:1528598526163360:10.151.26.123@o2ib:4
00000008:04000000:37.0:1459365869.219641:0:5837:0:(osc_request.c:1533:osc_brw_fini_request()) setdq &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; [11312 1125] with valid 0x6f184fb9, flags 1000
00000001:04000000:37.0:1459365869.219644:0:5837:0:(osc_quota.c:133:osc_quota_setdq()) nocache-OST002f-osc-ffff88082f125000: setdq to remove &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; user 11312 (00000000deadbeef)
00000008:00000002:37.0:1459365869.219646:0:5837:0:(osc_request.c:1960:brw_interpret()) request ffff881031f65000 aa ffff881031f652e0 rc 0
00000020:00000002:37.0:1459365869.220048:0:5837:0:(obdo.c:70:obdo_from_inode()) valid 70c, &lt;span class=&quot;code-keyword&quot;&gt;new&lt;/span&gt; time 1459365869/1459365869
00000008:04000000:22.0:1459365869.220489:0:5830:0:(osc_request.c:1533:osc_brw_fini_request()) setdq &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; [11312 1125] with valid 0x6f184fb9, flags 1000
00000008:00000002:22.0:1459365869.220493:0:5830:0:(osc_request.c:1960:brw_interpret()) request ffff8810461abc00 aa ffff8810461abee0 rc 0
00000100:00100000:22.0:1459365869.220696:0:5830:0:(client.c:1872:ptlrpc_check_set()) Completed RPC pname:cluuid:pid:xid:nid:opc ptlrpcd_21:09656603-58f6-edb5-effe-53c13e5a5741:5830:1528598526163268:10.151.26.123@o2ib:4
00000008:04000000:22.0:1459365869.220702:0:5830:0:(osc_request.c:1533:osc_brw_fini_request()) setdq &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; [11312 1125] with valid 0x6f184fb9, flags 1000
00000008:00000002:22.0:1459365869.220704:0:5830:0:(osc_request.c:1960:brw_interpret()) request ffff8810461ab400 aa ffff8810461ab6e0 rc 0
00000008:00000002:37.0:1459365869.220712:0:5837:0:(osc_request.c:2225:osc_build_rpc()) @@@ 254 pages, aa ffff880f230c26e0. now 0r/8w in flight  req@ffff880f230c2400 x1528598526163364/t0(0) o4-&amp;gt;nocache-OST002f-osc-ffff88082f125000@10.151.26.123@o2ib:6/4 lens 504/448 e 0 to 0 dl 0 ref 2 fl New:/0/ffffffff rc 0/-1
00000100:00100000:33.0:1459365869.220722:0:5846:0:(client.c:1489:ptlrpc_send_new_req()) Sending RPC pname:cluuid:pid:xid:nid:opc ptlrpcd_37:09656603-58f6-edb5-effe-53c13e5a5741:5846:1528598526163364:10.151.26.123@o2ib:4
00000100:00100000:24.0:1459365869.220734:0:5829:0:(client.c:2523:ptlrpc_free_committed()) nocache-OST002f-osc-ffff88082f125000: committing &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; last_committed 8589936700 gen 1
00000008:04000000:24.0:1459365869.220751:0:5829:0:(osc_request.c:1533:osc_brw_fini_request()) setdq &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; [11312 1125] with valid 0x6f184fb9, flags 1000
00000008:00000002:24.0:1459365869.220754:0:5829:0:(osc_request.c:1960:brw_interpret()) request ffff880bdb641800 aa ffff880bdb641ae0 rc 0
00000008:00000002:0.0:1459365869.220764:0:62879:0:(osc_io.c:211:osc_page_touch_at()) stripe KMS not increasing 60436399-&amp;gt;202 60436399
00000100:00100000:24.0:1459365869.220765:0:5829:0:(client.c:1872:ptlrpc_check_set()) Completed RPC pname:cluuid:pid:xid:nid:opc ptlrpcd_20:09656603-58f6-edb5-effe-53c13e5a5741:5829:1528598526163360:10.151.26.123@o2ib:4
00000080:00200000:0.0:1459365869.220777:0:62879:0:(vvp_io.c:133:vvp_io_fini()) [0x200000bd1:0x4df:0x0] ignore/verify layout 0/0, layout version 0 restore needed 0
00000080:00200000:0.0:1459365869.220778:0:62879:0:(file.c:1242:ll_file_io_generic()) iotype: 1, result: 198
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;We can see NOQUOTA flags is set and client turned to sync write (see the 1 page request &quot;req@ffff880bdb641800&quot;). I didn&apos;t see any error on both client and OST logs.&lt;/p&gt;</comment>
                            <comment id="147629" author="mhanafi" created="Fri, 1 Apr 2016 18:38:40 +0000"  >&lt;p&gt;You can see that after the first write the file is closed at&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;00800000:00000002:0.0:1459365868.977548:0:62879:0:(lmv_obd.c:1725:lmv_close()) CLOSE [0x200000bd1:0x4df:0x0]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;then reopened at&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;00000080:00200000:0.0:1459365868.977899:0:62879:0:(namei.c:527:ll_lookup_it()) VFS Op:name=vr014.hdf,dir=144115238826934470/33554443(ffff88061af76678),intent=open
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;If the file is close after the first write, I wouldn&apos;t think the 2 writes could clobber each other. &lt;/p&gt;</comment>
                            <comment id="148021" author="mhanafi" created="Wed, 6 Apr 2016 18:13:18 +0000"  >&lt;p&gt;It was pointed out to me that when the file is closed it doesn&apos;t necessarily mean the file is synced. So the file could still be sitting in cache. Maybe the second sync write is not invalidating the cached data, and when the file is closed it overwrites the header from cache.&lt;/p&gt;
</comment>
                            <comment id="148074" author="niu" created="Thu, 7 Apr 2016 03:04:43 +0000"  >&lt;blockquote&gt;
&lt;p&gt;It was pointed out to me that when the file is closed it doesn&apos;t necessarily mean the file is synced. So the file could still be sitting in cache. Maybe the second sync write is not invalidating the cached data, and when the file is closed it overwrites the header from cache.&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;The second sync write will update the page cache first, then write back, so the second write won&apos;t be overwritten like you described.&lt;/p&gt;

&lt;p&gt;By reading code, I suspect that second sync write didn&apos;t write the correct range of the page, but I failed to reproduce it. I posted a tentative patch and asked clio experts to take a look. Thank you.&lt;/p&gt;</comment>
                            <comment id="148271" author="jaylan" created="Fri, 8 Apr 2016 18:51:52 +0000"  >&lt;p&gt;This note is for documentation purpose.&lt;/p&gt;

&lt;p&gt;Digged from an email from Peter Jones pointing to a potential fix:&lt;br/&gt;
&quot; there is a candidate fix (&lt;a href=&quot;http://review.whamcloud.com/#/c/19329/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/19329/&lt;/a&gt;) undergoing testing and reviews&quot;&lt;/p&gt;</comment>
                            <comment id="148322" author="jay" created="Sun, 10 Apr 2016 05:49:51 +0000"  >&lt;p&gt;Hi Mahmoud,&lt;/p&gt;

&lt;p&gt;When you saw an empty file header, is the file range [4, 202), or [0, 4095) empty?&lt;/p&gt;

&lt;p&gt;Now that you have a reproducer, I would suggest to dump the file header, say a few bytes from object [4, 202) in the ptlrpc layer to make sure that correct content is in the page.&lt;/p&gt;</comment>
                            <comment id="148357" author="niu" created="Mon, 11 Apr 2016 06:46:49 +0000"  >&lt;p&gt;I reproduced the problem locally, and the fix on &lt;a href=&quot;http://review.whamcloud.com/#/c/19329/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/19329/&lt;/a&gt; has been updated according to Jinshan&apos;s suggestion.&lt;/p&gt;</comment>
                            <comment id="148500" author="mhanafi" created="Mon, 11 Apr 2016 20:41:17 +0000"  >&lt;p&gt;Is there a possibility of this corruption happening outside of the soft quota limit? If for example the user does a sync write in their code.  &lt;/p&gt;</comment>
                            <comment id="148528" author="jay" created="Tue, 12 Apr 2016 01:07:45 +0000"  >&lt;p&gt;That wouldn&apos;t be possible from what I have seen. The bug is that once a sync write occurred due to running out of quota, the later cached write on the same page would use stale page information; therefore partial data could be written.&lt;/p&gt;</comment>
                            <comment id="148537" author="niu" created="Tue, 12 Apr 2016 02:03:11 +0000"  >&lt;blockquote&gt;
&lt;p&gt;Is there a possibility of this corruption happing outside of the soft quota limit? If for example the user does a sync write in their code.&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;Mahmoud, you needn&apos;t worry about sync write from applications, that goes different code path. This kind of corruption could happen when over quota or run out of grant on client, and the io pattern to trigger the corruption is (we can see it exactly from the log):&lt;/p&gt;

&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;A Write to extend file, because over quota (or out of grant), the write is turned into &apos;sync&apos; write internally;&lt;/li&gt;
	&lt;li&gt;More writes to bump the file size further; (no matter sync or cached write)&lt;/li&gt;
	&lt;li&gt;Write to the same page of the first write, and this write is turned into &apos;sync&apos; write too. Because of the bug, stale page offset from the first write will be used, and corruption happens.&lt;/li&gt;
&lt;/ul&gt;
</comment>
                            <comment id="149470" author="jaylan" created="Tue, 19 Apr 2016 19:32:54 +0000"  >&lt;p&gt;Niu,&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/#/c/19329/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/19329/&lt;/a&gt;&lt;br/&gt;
was written for b2_5_fe. The mod affected only two lines, but the function the changes fell on was vvp_io_commit_write() and that function does not exist in b2_7_fe. We are moving to b2_7_fe next month. Could you provide a b2_7_fe back port? Thanks!&lt;/p&gt;</comment>
                            <comment id="149791" author="niu" created="Fri, 22 Apr 2016 02:20:34 +0000"  >&lt;p&gt;The problem is in b2_5_fe only. That part of code has been heavily changed by &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3321&quot; title=&quot;2.x single thread/process throughput degraded from 1.8&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3321&quot;&gt;&lt;del&gt;LU-3321&lt;/del&gt;&lt;/a&gt;, and the bug was fixed by those changes. The bug doesn&apos;t exist in b2_7_fe, EE2.4, EE3.0 and master.&lt;/p&gt;</comment>
                            <comment id="150587" author="amk" created="Fri, 29 Apr 2016 16:32:39 +0000"  >&lt;p&gt;We may have hit this same corruption on 2.5.2. Can &lt;a href=&quot;http://review.whamcloud.com/#/c/19329&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/19329&lt;/a&gt; be made public so I can verify? Thanks.&lt;/p&gt;</comment>
                            <comment id="158899" author="mhanafi" created="Thu, 14 Jul 2016 22:00:23 +0000"  >&lt;p&gt;You can close this ticket&lt;/p&gt;</comment>
                            <comment id="158902" author="pjones" created="Thu, 14 Jul 2016 22:04:09 +0000"  >&lt;p&gt;Thanks Mahmoud&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                                        </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="20957" name="debug.corruption.1.found.gz" size="849039" author="mhanafi" created="Wed, 30 Mar 2016 06:24:35 +0000"/>
                            <attachment id="20958" name="strace.corruptedfile" size="4154" author="mhanafi" created="Wed, 30 Mar 2016 06:24:35 +0000"/>
                            <attachment id="20965" name="vr014.corruption.tgz" size="236" author="mhanafi" created="Wed, 30 Mar 2016 19:31:45 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10490" key="com.atlassian.jira.plugin.system.customfieldtypes:datepicker">
                        <customfieldname>End date</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>Fri, 29 Apr 2016 06:24:35 +0000</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                            <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzy6af:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10023"><![CDATA[4]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                        <customfield id="customfield_10493" key="com.atlassian.jira.plugin.system.customfieldtypes:datepicker">
                        <customfieldname>Start date</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>Wed, 30 Mar 2016 06:24:35 +0000</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                    </customfields>
    </item>
</channel>
</rss>