<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:04:28 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
<language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-6925] oss buffer cache corruption</title>
                <link>https://jira.whamcloud.com/browse/LU-6925</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;A user reported file corruption, as shown below. The file is striped across 4 OSTs at 1MB.&lt;/p&gt;

&lt;p&gt;The corruption is 4KB in size, and its end aligns with an OST stripe boundary. The corrupted data comes from a process that runs on the OSS, writing and reading data to the local OSS filesystem.&lt;/p&gt;
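
&lt;p&gt;For illustration, the alignment can be checked against the file&apos;s layout like so (a sketch; the path and end offset are hypothetical, while the 4-OST, 1MB layout is as described above):&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;# show the stripe layout (stripe count and size) of the affected file
lfs getstripe /mnt/lustre/path/to/userfile

# with a 1MB (1048576-byte) stripe size, a corrupt region whose end sits
# on a stripe boundary satisfies end_offset % stripe_size == 0
echo $((415236096 % 1048576))   # prints 0 for a stripe-aligned end offset
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;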

&lt;p&gt;We have a cron job that dumps OST metadata once a day, like so:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;/sbin/dumpe2fs /dev/ostdevice &amp;gt; /root/ostdevice.meta 2&amp;gt;/dev/null
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;The output file is read every 15 minutes, so its inodes stay cached on the OSS.&lt;/p&gt;
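
&lt;p&gt;A sketch of what those crontab entries might look like, assuming the daily dump above plus an illustrative 15-minute reader (the schedule and the md5sum consumer are assumptions, not the exact jobs):&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;# daily metadata dump, as shown above (the time of day is an assumption)
0 3 * * * /sbin/dumpe2fs /dev/ostdevice &amp;gt; /root/ostdevice.meta 2&amp;gt;/dev/null

# illustrative reader: every 15 minutes something re-reads the dump,
# keeping its pages and inode hot in the OSS page/buffer cache
*/15 * * * * /usr/bin/md5sum /root/ostdevice.meta &amp;gt;/dev/null 2&amp;gt;&amp;amp;1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The corrupted region of the user&apos;s file is shown below; the dumpe2fs output appears spliced into the middle of the numeric data:&lt;/p&gt;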

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;  0.1926E-04  0.8636E-05 -0.5430E-05 -0.1747E-04 -0.2318E-04
 -0.2108E-04 -0.1270E-04 -0.1492E-05  0.8965E-05  0.1638E-04
  0.2025E-04  0.2143E-04  0.2111E-04  0.2007E-04  0.1847E-04
  0.1629E-04  0.1384E-04  0.1204E-04  0.1206E-
^@^@T^A^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^A^@^@^@^@^@^@^@^@^@^@^@^B^@^@^@^@^@^@^@^@^@^@^@^D^@^@^@^@^@^@^@^@^@^@^@^H^@^@^@^@^@^@^@^@^@^@^@  ^@^@^@^A^@^@^@E~L~G^@   ^@^@^@^A^@^@^@E~L~G^@^L^@^@^@^@^@^@^@^@^@^@^@^L^@^@^@^@^@^@^@^@^@^@^@^L^@^@^@^@^@^@^@^@^@^@^^
@^L^@^@^@^@^@^@^@^@^@^@^@^L^@^@^@^@^@^@^@^@^@^@k bitmap at 2181038173 (bg #66560 + 93), Inode bitmap at 2181038429 (bg #66560 + 349)
  Inode table at 2181039336-2181039343 (bg #66560 + 1256)
  1780 free blocks, 128 free inodes, 0 directories, 128 unused inodes
  Free blocks: 2184085760-2184086271, 2184094208-2184094451, 2184097792-2184098815
  Free inodes: 8531585-8531712
Group 66654: (Blocks 2184118272-2184151039) [INODE_UNINIT, ITABLE_ZEROED]
  Checksum 0xdc38, unused inodes 128
  Block bitmap at 2181038174 (bg #66560 + 94), Inode bitmap at 2181038430 (bg #66560 + 350)
  Inode table at 2181039344-2181039351 (bg #66560 + 1264)
  239 free blocks, 128 free inodes, 0 directories, 128 unused inodes
  Free blocks: 2184121088-2184121321, 2184121339-2184121343
  Free inodes: 8531713-8531840
Group 66655: (Blocks 2184151040-2184183807) [INODE_UNINIT, ITABLE_ZEROED]
  Checksum 0xc8b0, unused inodes 128
  Block bitmap at 2181038175 (bg #66560 + 95), Inode bitmap at 2181038431 (bg #66560 + 351)
  Inode table at 2181039352-2181039359 (bg #66560 + 1272)
  5119 free blocks, 128 free inodes, 0 directories, 128 unused inodes
  Free blocks: 2184151297-2184151551, 2184154624-2184155135, 2184165376-2184166399, 2184167168-2184167423, 2184171520-2184172543, 2184179712-2184181759
  Free inodes: 8531841-8531968
Group 66656: (Blocks 2184183808-2184216575) [INODE_UNINIT, ITABLE_ZEROED]
  Checksum 0x7ce0, unused inodes 128
  Block bitmap at 2181038176 (bg #66560 + 96), Inode bitmap at 2181038432 (bg #66560 + 352)
  Inode table at 2181039360-2181039367 (bg #66560 + 1280)
  2816 free blocks, 128 free inodes, 0 directories, 128 unused inodes
  Free blocks: 2184184832-2184185855, 2184198144-2184198911, 2184205312-2184206335
  Free inodes: 8531969-8532096
Group 66657: (Blocks 2184216576-2184249343) [INODE_UNINIT, ITABLE_ZEROED]
  Checksum 0xe3a2, unused inodes 128
  Block bitmap at 2181038177 (bg #66560 + 97), Inode bitmap at 2181038433 (bg #66560 + 353)
  Inode table at 2181039368-2181039375 (bg #66560 + 1288)
  2574 free blocks, 128 free inodes, 0 directories, 128 unused inodes
  Free blocks: 2184217600-2184218623, 2184221416-2184221437, 2184236544-2184237045, 2184237054-2184237055, 2184240128-2184241151
  Free inodes: 8532097-8532224
Group 66658: (Blocks 2184249344-2184282111) [INODE_UNINIT, ITABLE_ZEROED]
  Checksum 0xe5c6, unused inodes 128
  Block bitmap at 2181038178 (bg #66560 + 98), Inode bitmap at 2181038434 (bg #66560 + 354)
  Inode table at 2181039376-2181039383 (bg #66560 + 1296)
  5426 free blocks, 128 free inodes, 0 directories, 128 unused inodes
  Free blocks: 2184251392-2184251647, 2184252160-2184252407, 2184252413-2184252415, 2184253440-2184254463, 2184255488-2184255743, 2184255924-2184256511, 2184259584-2184260095, 2184260352-2184260602, 2184260608-2184261631, 2184272896-2184273919, 2184276992-2184277229, 2184277246-2184277247
  Free inodes: 8532225-8532352
Group 66659: (Blocks 2184282112-2184314879) [INODE_UNINIT, ITABLE_ZEROED]
  Checksum 0x16f0, unused inodes 128
  Block bitmap at 2181038179 (bg #66560 + 99), Inode bitmap at 2181038435 (bg #66560 + 355)
  Inode table at 2181039384-2181039391 (bg #66560 + 1304)
  3751 free blocks, 128 free inodes, 0 directories, 128 unused inodes
  Free blocks: 2184288256-2184289279, 2184292355-2184292607, 2184293376-2184294000, 2184294867-2184294911, 2184297216-2184298495, 2184299491-2184299519, 2184302848-2184303094, 2184303870-2184304116, 2184304127
  Free inodes: 8532353-8532480
Group 66660: (Blocks 2184314880-2184347647) [INODE_UNINIT, ITABLE_ZEROED]
  Checksum 0x7c1a, unused inodes 128
  Block bitmap at 2181038180 (bg #66560 + 100), Inode bitmap at 2181038436 (bg #66560 + 356)
  Inode table at 2181039392-2181039399 (bg #66560 + 1312)
  9197 free blocks, 128 free inodes, 0 directories, 128 unused inodes
  Free blocks: 2184320256-2184321023, 2184322048-2184323071, 2184323585-2184324055, 2184324057, 2184324074-2184324095, 2184324354-2184325119, 2184325632-2184326143, 2184326655-2184355
 .1385E-04
  0.2720E-04  0.3428E-04  0.3470E-04  0.3125E-04  0.2717E-04
  0.2375E-04  0.1968E-04  0.1258E-04  0.1537E-05 -0.1135E-04
 -0.2146E-04
  0.2531E-04  0.2365E-04  0.2503E-04  0.2827E-04  0.2984E-04
  0.2598E-04  0.1521E-04 -0.2827E-06 -0.1534E-04 -0.2416E-04
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;No errors are logged on the OSS.&lt;/p&gt;</description>
                <environment></environment>
        <key id="31268">LU-6925</key>
            <summary>oss buffer cache corruption</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="3">Duplicate</resolution>
                                        <assignee username="green">Oleg Drokin</assignee>
                                    <reporter username="mhanafi">Mahmoud Hanafi</reporter>
                        <labels>
                    </labels>
                <created>Wed, 29 Jul 2015 18:17:16 +0000</created>
                <updated>Thu, 15 Oct 2015 23:05:36 +0000</updated>
                            <resolved>Thu, 15 Oct 2015 23:05:36 +0000</resolved>
                                    <version>Lustre 2.5.3</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>5</watches>
                                                                            <comments>
                            <comment id="122591" author="mhanafi" created="Wed, 29 Jul 2015 18:20:32 +0000"  >&lt;p&gt;Please fix type in the title.&lt;/p&gt;</comment>
                            <comment id="122597" author="pjones" created="Wed, 29 Jul 2015 18:54:18 +0000"  >&lt;p&gt;Oleg&lt;/p&gt;

&lt;p&gt;Please can you advise?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="122612" author="green" created="Wed, 29 Jul 2015 20:14:49 +0000"  >&lt;p&gt;What do you mean by &quot;process that run on the oss writing and reading data to the local oss filesystem.&quot; - directly writing to the ldiskfs?&lt;/p&gt;


&lt;p&gt;Or is this dumpe2fs is what you run on the ost device, but /root/ostdevice.meta is a filesystem that has nothing to do with lustre whatsoever?&lt;/p&gt;</comment>
                            <comment id="122615" author="mhanafi" created="Wed, 29 Jul 2015 21:38:09 +0000"  >&lt;p&gt;&apos;local oss filesyste&apos; is the root drive for the oss. We only read from the ldiskfs and any data written is to the local filesystem. So the corruption must be occurring in the page cache of the OSSes. I think the dumpe2fs output may be just a  coincidence because that data is written and read a lot.&lt;/p&gt;
</comment>
                            <comment id="122629" author="green" created="Wed, 29 Jul 2015 23:27:37 +0000"  >&lt;p&gt;I guess I am just confused - if the write target is local filesystem - then there could not be any &quot;ost stripe boundary&quot; in there?&lt;br/&gt;
Or do you also see corruptions in the files on Lustre itself?&lt;/p&gt;</comment>
                            <comment id="122631" author="mhanafi" created="Thu, 30 Jul 2015 00:06:07 +0000"  >&lt;p&gt;Sorry may I am not explaining well. This is a very strange issue....&lt;/p&gt;

&lt;p&gt;The user was running a job on a Lustre client, writing the file to Lustre. The corruption is in the user&apos;s file on Lustre, but the data that was inserted into the user&apos;s file is data that is read and written on the local filesystem of the OSS. So somehow, data being read and written on the OSS root filesystem corrupted part of the user&apos;s file on Lustre. The corruption was exactly 4KB, and it was at the end of an OST stripe.&lt;/p&gt;


</comment>
                            <comment id="122679" author="green" created="Thu, 30 Jul 2015 14:24:56 +0000"  >&lt;p&gt;Hm. This is quite a mystery indeed.&lt;/p&gt;

&lt;p&gt;The OST where this occurred (the one holding the corrupted stripe) - did it happen to be low on space? There&apos;s &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6768&quot; title=&quot;Data corruption when write and truncate in parallel in a almost-full file system&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6768&quot;&gt;&lt;del&gt;LU-6768&lt;/del&gt;&lt;/a&gt;, which I think could lead to what you describe: a dirty page from the page cache being appropriated.&lt;/p&gt;</comment>
                            <comment id="123069" author="mhanafi" created="Mon, 3 Aug 2015 19:01:26 +0000"  >&lt;p&gt;&apos;low space?&quot; do you mean ost disk space? I don&apos;t think we where  low on disk space but there was a large spike in load and most of the memory was consumed in page/buffer cache.&lt;/p&gt;
</comment>
                            <comment id="123268" author="green" created="Tue, 4 Aug 2015 22:07:18 +0000"  >&lt;p&gt;Yes, I did mean disk space since this is what was reported as one of preconditions in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6768&quot; title=&quot;Data corruption when write and truncate in parallel in a almost-full file system&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6768&quot;&gt;&lt;del&gt;LU-6768&lt;/del&gt;&lt;/a&gt; that looks pretty similar to what you seems to have experienced, but I guess it&apos;s just something that makes the condition to trigger more easy to trigger?&lt;/p&gt;</comment>
                            <comment id="123275" author="jaylan" created="Tue, 4 Aug 2015 23:38:32 +0000"  >&lt;p&gt;I posted a request of b2_5 port of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6768&quot; title=&quot;Data corruption when write and truncate in parallel in a almost-full file system&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6768&quot;&gt;&lt;del&gt;LU-6768&lt;/del&gt;&lt;/a&gt; patch in that LU.&lt;/p&gt;</comment>
                            <comment id="126054" author="mhanafi" created="Wed, 2 Sep 2015 16:50:14 +0000"  >&lt;p&gt;Could enabling quota enforcement increase the likely hood of hitting this bug?&lt;/p&gt;</comment>
                            <comment id="126499" author="green" created="Sat, 5 Sep 2015 15:26:30 +0000"  >&lt;p&gt;Alex, what do you think on this? I imagine quota might cause writes to fail at times too even if otherwise there&apos;s plenty of space?&lt;/p&gt;</comment>
                            <comment id="126507" author="bzzz" created="Sat, 5 Sep 2015 20:50:57 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6768&quot; title=&quot;Data corruption when write and truncate in parallel in a almost-full file system&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6768&quot;&gt;&lt;del&gt;LU-6768&lt;/del&gt;&lt;/a&gt; can happen to an empty filesystem as well. it&apos;s just easier to hit when a filesystem is nearly full (blocks are reallocated quickly). I&apos;d think this can be a result of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6758&quot; title=&quot;racer test_1: test failed to respond and timed out&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6758&quot;&gt;&lt;del&gt;LU-6758&lt;/del&gt;&lt;/a&gt;. truncate is required to hit that though. probably it makes sense to trace the application to verify this.&lt;/p&gt;</comment>
                            <comment id="130560" author="pjones" created="Thu, 15 Oct 2015 23:05:36 +0000"  >&lt;p&gt;As per NASA fix worked&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="30836">LU-6768</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzxja7:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10020"><![CDATA[1]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                </customfields>
    </item>
</channel>
</rss>