<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:22:06 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-8967] directory entries for non existing files</title>
                <link>https://jira.whamcloud.com/browse/LU-8967</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;We have several directories with entries for non existing files. For example:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@quartz2311:~]# ls -l /p/lscratchh/casses1/quartz-zinc_3/19519/dbench/quartz2322/clients/client0                                                                                 
ls: cannot access /p/lscratchh/casses1/quartz-zinc_3/19519/dbench/quartz2322/clients/client0/filler.003: No such file or directory
total 3154
-rw------- 1 casses1 casses1 1048576 Dec 21 16:43 filler.000
-rw------- 1 casses1 casses1 1048576 Dec 21 16:43 filler.001
-rw------- 1 casses1 casses1 1048576 Dec 21 16:43 filler.002
-????????? ? ?       ?             ?            ? filler.003
drwx------ 2 casses1 casses1   25600 Dec 21 16:43 ~dmtmp
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The directory itself is a remote directory on one MDT:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@quartz2311:~]# lfs getdirstripe -d /p/lscratchh/casses1/quartz-zinc_3/19519/dbench/quartz2322/clients/client0
lmv_stripe_count: 0 lmv_stripe_offset: 3
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;We are able to get striping information for this file:&lt;/p&gt;


&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@quartz2311:~]# lfs getstripe /p/lscratchh/casses1/quartz-zinc_3/19519/dbench/quartz2322/clients/client0/filler.003
/p/lscratchh/casses1/quartz-zinc_3/19519/dbench/quartz2322/clients/client0/filler.003
lmm_stripe_count:   1
lmm_stripe_size:    1048576
lmm_pattern:        1
lmm_layout_gen:     0
lmm_stripe_offset:  27
        obdidx           objid           objid           group
            27        20538776      0x1396598      0xcc0000402
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;


&lt;p&gt;It looks like the OSS serving that OST was rebooted and the OST went through recovery around the time the missing file was created. In particular, we note that the object number falls in the range of orphan objects that were deleted:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@zinci:~]# grep 0xcc0000402 /var/log/conman/console.zinc*
/var/log/conman/console.zinc43:2016-12-21 16:30:56 [189484.767900] Lustre: lsh-OST001b: deleting orphan objects from 0xcc0000402:20538706 to 0xcc0000402:20541649
/var/log/conman/console.zinc43:2016-12-21 16:33:30 [189639.110247] Lustre: lsh-OST001b: deleting orphan objects from 0xcc0000402:20538766 to 0xcc0000402:20541649
/var/log/conman/console.zinc43:2016-12-21 16:35:41 [189769.704490] Lustre: lsh-OST001b: deleting orphan objects from 0xcc0000402:20538766 to 0xcc0000402:20541649
/var/log/conman/console.zinc43:2016-12-21 16:40:19 [190047.449320] Lustre: lsh-OST001b: deleting orphan objects from 0xcc0000402:20538766 to 0xcc0000402:20541649
/var/log/conman/console.zinc43:2016-12-21 16:44:45 [190313.751155] Lustre: lsh-OST001b: deleting orphan objects from 0xcc0000402:20538820 to 0xcc0000402:20541649
/var/log/conman/console.zinc44:2016-12-21 16:49:27 [  159.838420] Lustre: lsh-OST001b: deleting orphan objects from 0xcc0000402:20538820 to 0xcc0000402:20541649
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I will attach server console logs separately.&lt;/p&gt;</description>
                <environment>&lt;a href=&quot;ssh://review.whamcloud.com/fs/lustre-release-fe-llnl&quot;&gt;ssh://review.whamcloud.com/fs/lustre-release-fe-llnl&lt;/a&gt;</environment>
        <key id="42639">LU-8967</key>
            <summary>directory entries for non existing files</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="tappro">Mikhail Pershin</assignee>
                                    <reporter username="nedbass">Ned Bass</reporter>
                        <labels>
                            <label>llnl</label>
                    </labels>
                <created>Fri, 23 Dec 2016 01:20:06 +0000</created>
                <updated>Thu, 10 Aug 2017 23:36:53 +0000</updated>
                            <resolved>Mon, 27 Feb 2017 21:56:57 +0000</resolved>
                                    <version>Lustre 2.8.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>9</watches>
                                                                            <comments>
                            <comment id="178900" author="nedbass" created="Fri, 23 Dec 2016 01:32:14 +0000"  >&lt;p&gt;Also please note that we first observed this problem after our most recent Lustre update to 2.8.0_6chaos last Friday (December 16). The patches added in that update were:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;* 353716b (tag: 2.8.0_6.chaos, llnl/2.8.0-llnl) LU-8753 llog: add some debug patch
* 17d469a LU-8936 llite: use percpu env correctly in ll_invalidatepage          
* a15b2ef LU-8361 lfsck: detect Lustre device automatically                     
* 0220e0b LU-7648 man: new man pages for LFSCK commands                         
* 1638a07 LU-7256 tests: wait current LFSCK to exit before next test            
* 1d8cfaa LU-8407 recovery: more clear message about recovery failure           
* fdea0d2 LU-7732 ldlm: silence verbose &quot;waking for gap&quot; log messages           
* 82e924c LU-8753 llog: remove lgh_write_offset                                 
* 3a8db9a LU-8493 osp: Do not set stale for new osp obj                         
* 38c062b LU-7660 dne: support fs default stripe                                
* bc3df36 Revert &quot;LU-8422 update: add more debug info for the ticket&quot;           
* 10170a0 Revert &quot;LU-8422 llog: extended debug info&quot;                            
* 490414a Revert &quot;LU-6635 lfsck: more debug message for sanity-lfsck test_18e&quot;  

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="178987" author="pjones" created="Fri, 23 Dec 2016 18:34:05 +0000"  >&lt;p&gt;Mike&lt;/p&gt;

&lt;p&gt;Could you please assist with this issue?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="179084" author="nedbass" created="Tue, 27 Dec 2016 23:33:16 +0000"  >&lt;p&gt;I suspect this is related to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8562&quot; title=&quot;osp_precreate_cleanup_orphans/osp_precreate_reserve race may cause data loss&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8562&quot;&gt;&lt;del&gt;LU-8562&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="179228" author="tappro" created="Fri, 30 Dec 2016 13:35:30 +0000"  >&lt;p&gt;Ned, did these entries occur only once when the OST was failed over, or do they still continue to occur? Is it possible to remove them?&lt;/p&gt;

&lt;p&gt;I am checking the patches you&apos;ve mentioned.&lt;/p&gt;</comment>
                            <comment id="179255" author="nedbass" created="Fri, 30 Dec 2016 19:53:20 +0000"  >&lt;p&gt;Hi Mikhail, Each occurrence that I&apos;ve investigated happened immediately after the OST completed recovery. The object numbers of the missing files all fall at the beginning of the range of deleted orphans. It does not continue to occur when all OSTs are up.&lt;/p&gt;

&lt;p&gt;I can remove the files as root. The rm command fails for an unprivileged user because stat() returns ENOENT and rm treats that as fatal unless you&apos;re root.&lt;/p&gt;

&lt;p&gt;I have confirmed that I can reproduce &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8562&quot; title=&quot;osp_precreate_cleanup_orphans/osp_precreate_reserve race may cause data loss&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8562&quot;&gt;&lt;del&gt;LU-8562&lt;/del&gt;&lt;/a&gt; on our system using the test case from that patch and it looks just like this issue. I tested&#160;&lt;a href=&quot;https://review.whamcloud.com/#/c/22211/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/22211/&lt;/a&gt;&#160;on a single node setup and wasn&apos;t able to reproduce the bug. However I ran into a defect with that patch that causes the osp_precreate thread to hang as I described in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8562&quot; title=&quot;osp_precreate_cleanup_orphans/osp_precreate_reserve race may cause data loss&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8562&quot;&gt;&lt;del&gt;LU-8562&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="180025" author="tappro" created="Mon, 9 Jan 2017 09:36:30 +0000"  >&lt;p&gt;Ned, so this issue is solved by &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8562&quot; title=&quot;osp_precreate_cleanup_orphans/osp_precreate_reserve race may cause data loss&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8562&quot;&gt;&lt;del&gt;LU-8562&lt;/del&gt;&lt;/a&gt; in general, but the patch itself contains a defect. I checked your patch; does it solve your problem? Or is more work required in that area?&lt;/p&gt;

&lt;p&gt;Interesting that &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8562&quot; title=&quot;osp_precreate_cleanup_orphans/osp_precreate_reserve race may cause data loss&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8562&quot;&gt;&lt;del&gt;LU-8562&lt;/del&gt;&lt;/a&gt; itself is a quite recent change and we didn&apos;t observe a lot of issues similar to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8967&quot; title=&quot;directory entries for non existing files&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8967&quot;&gt;&lt;del&gt;LU-8967&lt;/del&gt;&lt;/a&gt; without it. I wonder what was changed in your system when you started seeing it. Was it just a software update or hardware as well?&lt;/p&gt;</comment>
                            <comment id="180122" author="nedbass" created="Mon, 9 Jan 2017 19:09:12 +0000"  >&lt;blockquote&gt;
&lt;p&gt;Ned, so this issue is solved by &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8562&quot; title=&quot;osp_precreate_cleanup_orphans/osp_precreate_reserve race may cause data loss&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8562&quot;&gt;&lt;del&gt;LU-8562&lt;/del&gt;&lt;/a&gt; in general, but patch itself contains defect. I checked your patch, does it solves your problem? Or more work is required in that area?&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;I have tested the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8562&quot; title=&quot;osp_precreate_cleanup_orphans/osp_precreate_reserve race may cause data loss&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8562&quot;&gt;&lt;del&gt;LU-8562&lt;/del&gt;&lt;/a&gt; patch and my one-line follow-on patch&#160;&lt;a href=&quot;https://review.whamcloud.com/24758&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/24758&lt;/a&gt; on a single-node test setup. I am no longer able to reproduce the data loss bug with those patches applied. Without the patches I can reproduce it almost immediately using the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8562&quot; title=&quot;osp_precreate_cleanup_orphans/osp_precreate_reserve race may cause data loss&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8562&quot;&gt;&lt;del&gt;LU-8562&lt;/del&gt;&lt;/a&gt; test case.&lt;/p&gt;

&lt;p&gt;The remaining work to do in that area is as follows.&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;An explanation is needed as to why conf-sanity test_101 is still failing as per &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8972&quot; title=&quot;conf-sanity test_101: File hasn&amp;#39;t object on OST&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8972&quot;&gt;&lt;del&gt;LU-8972&lt;/del&gt;&lt;/a&gt;. The ongoing test case failure suggests the data loss bug is not completely resolved by the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8562&quot; title=&quot;osp_precreate_cleanup_orphans/osp_precreate_reserve race may cause data loss&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8562&quot;&gt;&lt;del&gt;LU-8562&lt;/del&gt;&lt;/a&gt; patch. We need high confidence that this bug is resolved before putting user data on Lustre 2.8 FE.&lt;/li&gt;
	&lt;li&gt;Patch &lt;a href=&quot;https://review.whamcloud.com/24758&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/24758&lt;/a&gt; needs review by someone who understands the&#160;&lt;tt&gt;osp_precreate_thread&lt;/tt&gt; state machine better than me.&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;&#160;&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;Interesting that &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8562&quot; title=&quot;osp_precreate_cleanup_orphans/osp_precreate_reserve race may cause data loss&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8562&quot;&gt;&lt;del&gt;LU-8562&lt;/del&gt;&lt;/a&gt; itself is quite recent change and we did&apos;t observe a lot of issues similar to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8967&quot; title=&quot;directory entries for non existing files&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8967&quot;&gt;&lt;del&gt;LU-8967&lt;/del&gt;&lt;/a&gt; without it. I wonder what was changed in your system when you start seeing it. Was it just a software update or hardware as well?&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;My best guess as to why we started seeing &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8562&quot; title=&quot;osp_precreate_cleanup_orphans/osp_precreate_reserve race may cause data loss&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8562&quot;&gt;&lt;del&gt;LU-8562&lt;/del&gt;&lt;/a&gt; is that we made changes to our pacemaker/corosync HA system. We recently optimized the configuration so Lustre services are started with much less delay than before. This makes it very likely that OST orphan cleanup will be interrupted by the HA partner coming up and failing back the OST. As I understand, that is the race window for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8562&quot; title=&quot;osp_precreate_cleanup_orphans/osp_precreate_reserve race may cause data loss&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8562&quot;&gt;&lt;del&gt;LU-8562&lt;/del&gt;&lt;/a&gt; to occur. Before, it took a long time for services to start up, so orphan cleanup was almost always done by the time the partner failed back the OST.&lt;/p&gt;</comment>
                            <comment id="180257" author="bzzz" created="Tue, 10 Jan 2017 15:15:15 +0000"  >&lt;p&gt;with &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8562&quot; title=&quot;osp_precreate_cleanup_orphans/osp_precreate_reserve race may cause data loss&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8562&quot;&gt;&lt;del&gt;LU-8562&lt;/del&gt;&lt;/a&gt; I still see the same symptoms rarely.. there is another patch addressing the same issue, it&apos;s doing a bit better, but still possible to reproduce within few hours. I&apos;ve been looking for the root cause.&lt;/p&gt;</comment>
                            <comment id="181747" author="bzzz" created="Mon, 23 Jan 2017 15:45:02 +0000"  >&lt;p&gt;A prototype is under testing; I&apos;m going to pass it through Maloo a few more times.&lt;/p&gt;</comment>
                            <comment id="182892" author="pjones" created="Wed, 1 Feb 2017 15:40:06 +0000"  >&lt;p&gt;This is now confirmed as a duplicate of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8562&quot; title=&quot;osp_precreate_cleanup_orphans/osp_precreate_reserve race may cause data loss&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8562&quot;&gt;&lt;del&gt;LU-8562&lt;/del&gt;&lt;/a&gt; and so you should proceed with using Ned&apos;s ports of those patches to 2.8 FE. In addition it is recommended that you pick up the fix for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8367&quot; title=&quot;delete orphan phase isn&amp;#39;t stated for multistriped file&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8367&quot;&gt;&lt;del&gt;LU-8367&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="186359" author="pjones" created="Mon, 27 Feb 2017 21:56:57 +0000"  >&lt;p&gt;AFAIK items tracked under this ticket are complete&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="39178">LU-8562</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="24629" name="LU-8967.console.zinc4.mds" size="8643" author="nedbass" created="Fri, 23 Dec 2016 01:26:28 +0000"/>
                            <attachment id="24630" name="LU-8967.console.zinc43" size="27116" author="nedbass" created="Fri, 23 Dec 2016 01:26:28 +0000"/>
                            <attachment id="24631" name="LU-8967.console.zinc44" size="107125" author="nedbass" created="Fri, 23 Dec 2016 01:26:28 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzyz7b:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>