<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:41:32 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-11169] Data corruption during IOR testing with network error simulation</title>
                <link>https://jira.whamcloud.com/browse/LU-11169</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;During large-scale IOR testing with injected network failures, Cray found two data corruption issues.&lt;br/&gt;
The first issue is related to the 4MB BRW patchset and has existed for a long time. A bulk transfer is marked as failed only on a real network error; if part of the data is lost and the request times out, the transfer is still treated as done.&lt;/p&gt;

&lt;p&gt;The second issue is related to a cleanup landed as commit 49d8a7ccd73, where the &quot;rc&quot; parameter of the obd_commit function was replaced with local data, so it ignores any errors that occurred before it.&lt;/p&gt;</description>
                <environment></environment>
        <key id="52797">LU-11169</key>
            <summary>Data corruption during IOR testing with network error simulation</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="shadow">Alexey Lyashkov</assignee>
                                    <reporter username="shadow">Alexey Lyashkov</reporter>
                        <labels>
                    </labels>
                <created>Tue, 24 Jul 2018 11:33:41 +0000</created>
                <updated>Fri, 19 Jul 2019 11:14:11 +0000</updated>
                            <resolved>Wed, 19 Dec 2018 21:08:40 +0000</resolved>
                                    <version>Lustre 2.10.0</version>
                    <version>Lustre 2.11.0</version>
                                    <fixVersion>Lustre 2.12.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="231142" author="gerrit" created="Tue, 31 Jul 2018 08:48:05 +0000"  >&lt;p&gt;Alexey Lyashkov (c17817@cray.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/32905&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/32905&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11169&quot; title=&quot;Data corruption during IOR testing with network error simulation&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11169&quot;&gt;&lt;del&gt;LU-11169&lt;/del&gt;&lt;/a&gt; ptlrpc: don&apos;t treat bulk is ok&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 3c26d98e15582b9fc529e220c47e92be0acec5c8&lt;/p&gt;</comment>
                            <comment id="231143" author="gerrit" created="Tue, 31 Jul 2018 08:48:06 +0000"  >&lt;p&gt;Alexey Lyashkov (c17817@cray.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/32906&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/32906&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11169&quot; title=&quot;Data corruption during IOR testing with network error simulation&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11169&quot;&gt;&lt;del&gt;LU-11169&lt;/del&gt;&lt;/a&gt; obdclass: fix old return code usage&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 028c5661270d922130d80074727c99b9292bd250&lt;/p&gt;</comment>
                            <comment id="231722" author="adilger" created="Thu, 9 Aug 2018 16:53:54 +0000"  >&lt;p&gt;In the case of 4MB RPCs that drop some part of the RDMA, doesn&apos;t the bulk RPC checksum detect this case and cause the RPC to be resent?&lt;/p&gt;</comment>
                            <comment id="231730" author="shadow" created="Thu, 9 Aug 2018 17:48:35 +0000"  >&lt;p&gt;Andreas,&lt;/p&gt;

&lt;p&gt;you are right. OSC checksum can detect client side problem, and &quot;read&quot; part of server side problem.&lt;/p&gt;</comment>
                            <comment id="231799" author="ihara" created="Fri, 10 Aug 2018 14:17:17 +0000"  >&lt;p&gt;Interesting. How is the file corrupted? Could you share an example of a corrupted file?&lt;/p&gt;</comment>
                            <comment id="231819" author="shadow" created="Fri, 10 Aug 2018 19:14:23 +0000"  >&lt;p&gt;It depends on what you are asking. The server-side bug was found as zeros being read from the file after a bulk transfer error.&lt;/p&gt;</comment>
                            <comment id="231821" author="paf" created="Fri, 10 Aug 2018 19:21:04 +0000"  >&lt;p&gt;Ihara,&lt;/p&gt;



&lt;p&gt;Basically, it sometimes fails to notice and resend when there&apos;s a transfer error.&#160; On reads, this shows up as zeroes read by the client, when the data on disk is correct.&#160; On writes, the result would be whatever data was present before the write (or, I believe, zeroes if the write is to a new region of the file).&lt;/p&gt;</comment>
                            <comment id="231823" author="shadow" created="Fri, 10 Aug 2018 19:34:50 +0000"  >&lt;p&gt;Patric, &lt;/p&gt;

&lt;p&gt;In general, the file ends up with random data, because the pages are left unchanged on a failed bulk transfer while the client and server both think all is OK. Zeros are just luck.&lt;/p&gt;</comment>
                            <comment id="231826" author="adilger" created="Fri, 10 Aug 2018 22:42:07 +0000"  >&lt;blockquote&gt;
&lt;p&gt;OSC checksum can detect client side problem, and &quot;read&quot; part of server side problem.&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;It should also cause the client to resend a write if one of the RDMAs was missing data (up to the 10x retry limit).&lt;/p&gt;</comment>
                            <comment id="231833" author="shadow" created="Sat, 11 Aug 2018 13:14:44 +0000"  >&lt;p&gt;No. The server-side part ignores any error on the server side, so the commit write (which actually performs the write) doesn&apos;t know about errors before it.&lt;/p&gt;</comment>
                            <comment id="234162" author="gerrit" created="Mon, 1 Oct 2018 14:00:40 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/32906/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/32906/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11169&quot; title=&quot;Data corruption during IOR testing with network error simulation&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11169&quot;&gt;&lt;del&gt;LU-11169&lt;/del&gt;&lt;/a&gt; obdclass: fix old return code usage&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 1db258b57e5669e07934fe848861817f88102475&lt;/p&gt;</comment>
                            <comment id="237995" author="shadow" created="Wed, 5 Dec 2018 14:18:13 +0000"  >&lt;p&gt;The last patch addresses a theoretical problem, which is impossible now.&lt;/p&gt;</comment>
                            <comment id="251702" author="gerrit" created="Fri, 19 Jul 2019 11:14:11 +0000"  >&lt;p&gt;Alexandr Boyko (c17825@cray.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/35571&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/35571&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11169&quot; title=&quot;Data corruption during IOR testing with network error simulation&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11169&quot;&gt;&lt;del&gt;LU-11169&lt;/del&gt;&lt;/a&gt; ptlrpc: handle reply and resend reorder&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 88c2a9a0c840d01d50762a71f04319e58c9affef&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzzznr:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>