<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:23:56 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary, append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-2285] Test failure on replay-ost-single test_3: write page inode failed -2</title>
                <link>https://jira.whamcloud.com/browse/LU-2285</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This issue was created by maloo for Oleg Drokin &amp;lt;green@whamcloud.com&amp;gt;&lt;/p&gt;

&lt;p&gt;This issue relates to the following test suite run: &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/70b83f68-2797-11e2-9e20-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/70b83f68-2797-11e2-9e20-52540035b04c&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;The sub-test test_3 failed with the following error:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;test_3 failed with 1&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;On the client we can see in dmesg that the write actually failed.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Lustre: DEBUG MARKER: /usr/sbin/lctl mark == replay-ost-single test 3: Fail OST during write, with verification ================================ 12:42:29 \(1352148149\)
Lustre: DEBUG MARKER: == replay-ost-single test 3: Fail OST during write, with verification ================================ 12:42:29 (1352148149)
LustreError: 17578:0:(vvp_io.c:1037:vvp_io_commit_write()) Write page 11 of inode ffff81031f41fb28 failed -2
Lustre: DEBUG MARKER: /usr/sbin/lctl mark  replay-ost-single test_3: @@@@@@ FAIL: test_3 failed with 1 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Info required for matching: replay-ost-single 3&lt;/p&gt;</description>
                <environment></environment>
        <key id="16599">LU-2285</key>
            <summary>Test failure on replay-ost-single test_3: write page inode failed -2</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="5">Cannot Reproduce</resolution>
                                        <assignee username="liwei">Li Wei</assignee>
                                    <reporter username="maloo">Maloo</reporter>
                        <labels>
                    </labels>
                <created>Mon, 5 Nov 2012 23:45:34 +0000</created>
                <updated>Fri, 22 Dec 2017 10:29:38 +0000</updated>
                <resolved>Fri, 22 Dec 2017 10:29:33 +0000</resolved>
                <version>Lustre 2.4.0</version>
                <due></due>
                <votes>0</votes>
                <watches>7</watches>
                <comments>
                            <comment id="47673" author="liwei" created="Mon, 12 Nov 2012 03:14:41 +0000"  >&lt;p&gt;&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/a2a98882-2865-11e2-9c12-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/a2a98882-2865-11e2-9c12-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="47674" author="liwei" created="Mon, 12 Nov 2012 03:34:46 +0000"  >&lt;p&gt;In both reports above, the processes writing to Lustre got ENOENT:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;== replay-ost-single test 3: Fail OST during write, with verification ================================ 12:42:29 (1352148149)
Failing ost1 on node client-21-ib
CMD: client-21-ib grep -c /mnt/ost1&apos; &apos; /proc/mounts
tee: standard output: No such file or directory
[...]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I&apos;m able to reproduce this ENOENT locally on a single-node VM setup with ZFS targets:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;== replay-ost-single test 2: |x| 10 open(O_CREAT)s == 12:38:36 (1352695116)
replay-ost-single.sh: line 72: echo: write error: No such file or directory
 replay-ost-single test_2: @@@@@@ FAIL: create /mnt/lustre/d0.replay-ost-single/f.replay-ost-single.2-2 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;From my syslog:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Nov 12 12:38:36 h221g kernel: LustreError: 22747:0:(osp_precreate.c:382:osp_precreate_cleanup_orphans()) lustre-OST0000-osc-MDT0000: going to cleanup orphans since 3
Nov 12 12:38:37 h221g kernel: LustreError: 22605:0:(ofd_obd.c:1069:ofd_orphans_destroy()) lustre-OST0000: deleting orphan objects from 5 to 67
Nov 12 12:38:37 h221g kernel: Lustre: DEBUG MARKER: == replay-ost-single test 2: |x| 10 open(O_CREAT)s == 12:38:36 (1352695116)
Nov 12 12:38:37 h221g kernel: LustreError: 22501:0:(osp_precreate.c:721:osp_precreate_reserve()) lustre-OST0000-osc-MDT0000: precreated 33: opd_pre_last_created 67opd_pre_used_id 34 opd_pre_reserved 0
Nov 12 12:38:37 h221g kernel: LustreError: 22501:0:(osp_precreate.c:780:osp_precreate_get_id()) Incremented opd_pre_used_id for OSP 0: 35
Nov 12 12:38:37 h221g kernel: LustreError: 22501:0:(osp_object.c:291:osp_object_create()) Written last used ID for OSP 0: 35: 0
Nov 12 12:38:41 h221g kernel: LustreError: 22605:0:(ldlm_resource.c:1107:ldlm_resource_get()) lvbo_init failed for resource 34: rc -2
Nov 12 12:38:42 h221g kernel: LustreError: 22605:0:(ldlm_resource.c:1107:ldlm_resource_get()) lvbo_init failed for resource 33: rc -2
Nov 12 12:38:42 h221g kernel: LustreError: 22501:0:(osp_precreate.c:721:osp_precreate_reserve()) lustre-OST0000-osc-MDT0000: precreated 32: opd_pre_last_created 67opd_pre_used_id 35 opd_pre_reserved 0
Nov 12 12:38:42 h221g kernel: LustreError: 22501:0:(osp_precreate.c:780:osp_precreate_get_id()) Incremented opd_pre_used_id for OSP 0: 36
Nov 12 12:38:42 h221g kernel: LustreError: 22501:0:(osp_object.c:291:osp_object_create()) Written last used ID for OSP 0: 36: 0
Nov 12 12:38:42 h221g kernel: Lustre: DEBUG MARKER: replay-ost-single test_2: @@@@@@ FAIL: create /mnt/lustre/d0.replay-ost-single/f.replay-ost-single.2-2
Nov 12 12:38:42 h221g kernel: LustreError: 22605:0:(ldlm_resource.c:1107:ldlm_resource_get()) Skipped 1 previous similar message
Nov 12 12:38:42 h221g kernel: LustreError: 22605:0:(ldlm_resource.c:1107:ldlm_resource_get()) lvbo_init failed for resource 32: rc -2
Nov 12 12:38:43 h221g kernel: LustreError: 22747:0:(osp_precreate.c:447:osp_precreate_cleanup_orphans()) lustre-OST0000-osc-MDT0000: Got last_id 67 from OST, last_used is 36, next 67
Nov 12 12:38:43 h221g kernel: LustreError: 22747:0:(osp_precreate.c:284:osp_precreate_send()) lustre-OST0000-osc-MDT0000: new last_created 100
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The file creation in question happened after the OSP had sent the OBD_FL_DELORPHAN request but before the OSP got the reply and updated its counters.  I think the file creation should wait until the last created ID is known from the OST.&lt;/p&gt;
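
&lt;p&gt;Roughly, the reservation path would then look like this (a sketch only; opd_pre_recovering is a hypothetical flag set before the OBD_FL_DELORPHAN request goes out and cleared once the reply has refreshed the counters):&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;static int osp_precreate_reserve(struct osp_device *d)
{
	int rc;

	/* do not hand out IDs derived from pre-failure counters; wait
	 * until orphan cleanup has refreshed them from the OST */
	rc = wait_event_interruptible(d-&amp;gt;opd_pre_waitq,
				      !d-&amp;gt;opd_pre_recovering);
	if (rc != 0)
		return rc;

	/* safe now: the last created ID is known from the OST */
	return osp_precreate_get_id(d);
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;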

&lt;p&gt;Another problem observed during the investigation is that the OSP sends the last used ID plus one in OBD_FL_DELORPHAN requests, but OFD (as well as OBDFilter) expects just the last used ID.&lt;/p&gt;
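
&lt;p&gt;In other words (names abbreviated for illustration):&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;/* OSP side, building the orphan cleanup request: */
body-&amp;gt;oa.o_id    = last_used_id + 1;	/* one past the last used ID */
body-&amp;gt;oa.o_flags = OBD_FL_DELORPHAN;

/* OFD/OBDFilter side already starts deleting one past o_id, so the
 * object with ID last_used_id + 1 escapes the cleanup: */
for (id = oa-&amp;gt;o_id + 1; id &amp;lt;= last_created_id; id++)
	destroy_orphan(id);	/* destroy_orphan() is an invented helper */
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;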

&lt;p&gt;I&apos;ll submit patches to fix these issues.&lt;/p&gt;</comment>
                            <comment id="47675" author="bzzz" created="Mon, 12 Nov 2012 05:05:08 +0000"  >&lt;p&gt;&amp;gt; The file creation in question happened after the OSP had sent the OBD_FL_DELORPHAN request but before the OSP got the reply and updated its counters. I think the file creation should wait until the last created ID was known from the OST.&lt;/p&gt;

&lt;p&gt;Correct, though this is supposed to already work that way in the code; look at osp_precreate_thread():&lt;/p&gt;

&lt;p&gt;Upon reconnect we restart the main loop and do not get to osp_precreate_send() before the synchronous osp_precreate_cleanup_orphans() has completed.&lt;/p&gt;
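
&lt;p&gt;i.e. the intended ordering is roughly this (a simplified sketch; wait_for_connection() and pool_low() are invented names):&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;static int osp_precreate_thread(void *arg)
{
	struct osp_device *d = arg;

	while (osp_precreate_running(d)) {
		/* wait for (re)connection to the OST */
		wait_for_connection(d);

		/* synchronous; restart the main loop on failure */
		if (osp_precreate_cleanup_orphans(d) != 0)
			continue;

		/* reached only after orphan cleanup has completed */
		while (osp_precreate_running(d) &amp;amp;&amp;amp; pool_low(d))
			osp_precreate_send(d);
	}
	return 0;
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;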

&lt;p&gt;Probably there is a race between reconnect and ptlrpc_queue_wait()?&lt;/p&gt;
</comment>
                            <comment id="47716" author="liwei" created="Mon, 12 Nov 2012 23:33:46 +0000"  >&lt;p&gt;The file creation was satisfied from the pre-created pool without waiting for a pre-creation RPC.  The race I was trying to point out is between osp_precreate_get_id() and osp_precreate_cleanup_orphans().  The latter sends on-disk last used IDs to OSTs, while osp_precreate_get_id() increments in-memory last used IDs.  Hence, the race may cause the object IDs assigned based on pre-OST-failure counters to be destroyed by the concurrent orphan deletion request.  The log shows that this happens.&lt;/p&gt;

&lt;p&gt;If osp_precreate_cleanup_orphans() sent in-memory last used IDs instead, this problem would go away, I think.  Of course, we need to make sure the in-memory last used IDs are properly initialized from their on-disk counterparts before the initial orphan deletions.  I haven&apos;t tried this yet.  Any objections?&lt;/p&gt;</comment>
                            <comment id="47717" author="bzzz" created="Tue, 13 Nov 2012 01:44:37 +0000"  >&lt;p&gt;I&apos;d suggest to develop a test which block orphan cleanup procedure on OST and force MDS to allocate pre-created object.  the race you described seem to be a subset of general case where any object allocation should be blocked during orphan cleanup procedure.&lt;/p&gt;</comment>
                            <comment id="47718" author="liwei" created="Tue, 13 Nov 2012 03:17:34 +0000"  >&lt;p&gt;Alex, are you suggesting there is no problem needs to be fixed with regard to the race?  But I don&apos;t get how existing code block allocations during orphan deletions.  &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/sad.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;</comment>
                            <comment id="47719" author="bzzz" created="Tue, 13 Nov 2012 03:21:50 +0000"  >&lt;p&gt;no, I think the problem is there and it&apos;s more serious than you described. basically there should be no reservations/allocations done during orphan cleanup procedure. so I&apos;m suggesting to develop a test reproducing this at first.&lt;/p&gt;</comment>
                            <comment id="47837" author="liwei" created="Thu, 15 Nov 2012 08:55:55 +0000"  >&lt;p&gt;The new test is a bit tricky to get right, but here is a proof-of-concept fix for the major issue: &lt;a href=&quot;http://review.whamcloud.com/4590&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/4590&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="47838" author="bzzz" created="Thu, 15 Nov 2012 08:58:34 +0000"  >&lt;p&gt;hmm, why is it tricky? basically we want non-empty pool in a specific OSP, then stop corresponded OST, wait till MDS got disconnected and try to create on using that specific OST ?&lt;/p&gt;</comment>
                            <comment id="47984" author="liwei" created="Mon, 19 Nov 2012 01:13:45 +0000"  >&lt;p&gt;It is tricky to make sure the file creation (osp_precreate_reserve() and osp_precreate_get_id()) happens after osp_precreate_cleanup_orphans() has read opd_last_used_id and before the orphan cleanup RPC is replied.&lt;/p&gt;</comment>
                            <comment id="47987" author="bzzz" created="Mon, 19 Nov 2012 02:28:42 +0000"  >&lt;p&gt;look at something like:&lt;/p&gt;

&lt;p&gt;	/* This will trigger a watchdog timeout */&lt;br/&gt;
	OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_STATFS_LCW_SLEEP,&lt;br/&gt;
			 (MDT_SERVICE_WATCHDOG_FACTOR *&lt;br/&gt;
			  at_get(&amp;amp;svcpt-&amp;gt;scp_at_estimate)) + 1);&lt;/p&gt;
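
&lt;p&gt;For the OSP case that could look like the following (the fail_loc name here is made up; the hook would sit in osp_precreate_cleanup_orphans(), after the OBD_FL_DELORPHAN request is sent and before its reply is processed):&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;	/* hold the cleanup here so the test can race a file creation
	 * against the still-pending orphan cleanup reply;
	 * OBD_FAIL_OSP_ORPHAN_CLEANUP_PAUSE is a made-up fail_loc */
	OBD_FAIL_TIMEOUT(OBD_FAIL_OSP_ORPHAN_CLEANUP_PAUSE,
			 cfs_fail_val ? cfs_fail_val : 10);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;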
</comment>
                            <comment id="47991" author="liwei" created="Mon, 19 Nov 2012 05:44:13 +0000"  >&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/4610&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/4610&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The new test.  It is still a bit real-time dependent.&lt;/p&gt;</comment>
                            <comment id="47992" author="bzzz" created="Mon, 19 Nov 2012 05:46:23 +0000"  >&lt;p&gt;&amp;gt; The new test. Still a bit real-time dependent.&lt;/p&gt;

&lt;p&gt;There are a lot of tests depending on time already; tests 40-46 in sanityn, for example.&lt;/p&gt;</comment>
                            <comment id="48058" author="liwei" created="Tue, 20 Nov 2012 03:18:40 +0000"  >&lt;p&gt;An update on the patches as of Dec 22:&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/4668&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/4668&lt;/a&gt; (Debug messages) Landed.&lt;br/&gt;
&lt;a href=&quot;http://review.whamcloud.com/4511&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/4511&lt;/a&gt; (Orphan cleanup from correct ID) Landed.&lt;br/&gt;
&lt;a href=&quot;http://review.whamcloud.com/4625&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/4625&lt;/a&gt; (opd_pre_last_created) Landed.&lt;br/&gt;
&lt;a href=&quot;http://review.whamcloud.com/4590&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/4590&lt;/a&gt; (Block allocation during orphan cleanups) Landed.&lt;br/&gt;
&lt;a href=&quot;http://review.whamcloud.com/4610&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/4610&lt;/a&gt; (New regression test) Being refreshed.&lt;/p&gt;</comment>
                            <comment id="50241" author="liwei" created="Wed, 9 Jan 2013 22:09:44 +0000"  >&lt;p&gt;Only one test patch is left; I&apos;d suggest lowering the priority from &quot;blocker&quot;.&lt;/p&gt;</comment>
                            <comment id="50411" author="liwei" created="Mon, 14 Jan 2013 09:28:34 +0000"  >&lt;p&gt;I have lowered the priority from &quot;blocker&quot; to &quot;major&quot;.  The last patch should be worked out before the release, but could wait after the feature freeze.&lt;/p&gt;</comment>
                            <comment id="77606" author="bogl" created="Fri, 21 Feb 2014 16:32:49 +0000"  >&lt;p&gt;another one:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/7c9d76dc-9b12-11e3-8ad7-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/7c9d76dc-9b12-11e3-8ad7-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="77676" author="bogl" created="Fri, 21 Feb 2014 23:34:06 +0000"  >&lt;p&gt;another one:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sessions/5121f06e-9b49-11e3-8a4e-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sessions/5121f06e-9b49-11e3-8a4e-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="77696" author="bogl" created="Sun, 23 Feb 2014 19:47:23 +0000"  >&lt;p&gt;another one in b2_5:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/b3d5a116-9c60-11e3-8f3e-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/b3d5a116-9c60-11e3-8f3e-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="217102" author="adilger" created="Fri, 22 Dec 2017 10:29:33 +0000"  >&lt;p&gt;Close old bug that has not been seen in a long time.&lt;/p&gt;</comment>
                </comments>
                <issuelinks>
                    <issuelinktype id="10010">
                        <name>Duplicate</name>
                        <inwardlinks description="is duplicated by">
                            <issuelink>
                                <issuekey id="16930">LU-2493</issuekey>
                            </issuelink>
                        </inwardlinks>
                    </issuelinktype>
                    <issuelinktype id="10011">
                        <name>Related</name>
                        <outwardlinks description="is related to">
                            <issuelink>
                                <issuekey id="17620">LU-2832</issuekey>
                            </issuelink>
                        </outwardlinks>
                        <inwardlinks description="is related to">
                            <issuelink>
                                <issuekey id="46429">LU-9588</issuekey>
                            </issuelink>
                        </inwardlinks>
                    </issuelinktype>
                </issuelinks>
                <attachments>
                </attachments>
                <subtasks>
                </subtasks>
                <customfields>
                    <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                        </customfieldvalues>
                    </customfield>
                    <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvbrb:</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                    <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>5469</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                    <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                </customfields>
    </item>
</channel>
</rss>