<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:00:52 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-13389] aborted file create may leave unattached inodes on MDS.</title>
                <link>https://jira.whamcloud.com/browse/LU-13389</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;aborted mdtest job leaves unattached inodes:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Job Script: command started at Mon Mar 2 16:40:31 CST 2020

-- started at 03/02/2020 16:40:32 --

 

mdtest-1.9.3 was launched with 32 total task(s) on 1 node(s)

Command line used: /cray/css/ostest/binaries/xt/rel.70.aries.cray/xtcnl/ostest/ROOT.latest/tests/gold/ioperf/mdtest/mdtest -f 32 -l 32 -n10000 -i1 -d /lus/snx11281/disk/ostest.vers/alsorun.20200302130005.752.saturn-p4/CL_mdtest_4s_fo.4.tjgw4U.1583188829/CL_mdtest_4s_fo -tuv

V-1: main: Setting create/stat/read/remove_only to True

V-1: Entering valid_tests...

barriers                : True

collective_creates      : False

create_only             : True

dirpath(s):

    /lus/snx11281/disk/ostest.vers/alsorun.20200302130005.752.saturn-p4/CL_mdtest_4s_fo.4.tjgw4U.1583188829/CL_mdtest_4s_fo

dirs_only               : True

read_bytes              : 0

read_only               : True

first                   : 32

files_only              : True

iterations              : 1

items_per_dir           : 0

last                    : 32

leaf_only               : False

items                   : 10000

nstride                 : 0

pre_delay               : 0

remove_only             : False

random_seed             : 0

stride                  : 1

shared_file             : False

time_unique_dir_overhead: True

stat_only               : True

unique_dir_per_task     : True

write_bytes             : 0

sync_file               : False

depth                   : 0

V-1: Entering display_freespace...

V-1: Entering show_file_system_size...

Path: /lus/snx11281/disk/ostest.vers/alsorun.20200302130005.752.saturn-p4/CL_mdtest_4s_fo.4.tjgw4U.1583188829

FS: 483.4 TiB   Used FS: 1.5%   Inodes: 487.0 Mi   Used Inodes: 0.1%

 

32 tasks, 320000 files/directories

 

   Operation               Duration              Rate

   ---------               --------              ----

V-1: main: * iteration 1 *

V-1: Entering create_remove_directory_tree, currDepth = 0...

V-1: Entering create_remove_directory_tree, currDepth = 1...

V-1: main:   Tree creation     :          0.011 sec,         88.658 ops/sec

V-1: Entering directory_test...

V-1: Entering unique_dir_access...

V-1: Entering create_remove_items, currDepth = 0...

V-1: Entering create_remove_items_helper...

V-1: Entering unique_dir_access...

V-1: Entering mdtest_stat...

V-1: Entering unique_dir_access...

V-1: Entering unique_dir_access...

V-1: Entering create_remove_items, currDepth = 0...

V-1: Entering create_remove_items_helper...

V-1: Entering unique_dir_access...

V-1:   Directory creation:         25.644 sec,      12478.741 ops/sec

V-1:   Directory stat    :          4.194 sec,      76299.371 ops/sec

V-1:   Directory removal :          8.918 sec,      35883.016 ops/sec

V-1: Entering file_test...

V-1: Entering unique_dir_access...

V-1: Entering create_remove_items, currDepth = 0...

V-1: Entering create_remove_items_helper...

aprun: Apid 5441901: Caught signal Terminated, sending to application

_pmiu_daemon(SIGCHLD): [NID 00545] [c0-1c2s8n1] [Mon Mar  2 18:15:52 2020] PE RANK 25 exit signal Terminated

Application 5441901 exit codes: 143

Application 5441901 resources: utime ~0s, stime ~6s, Rss ~11768, inblocks ~0, outblocks ~0

Job Script: command stopped at Mon Mar 2 18:16:38 CST 2020

Job Script: command runtime was 5767 seconds

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;e2fsck logs:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@snx11281n000 ~]# grep Unattached /home/admin/e2fsck.*
/home/admin/e2fsck.snx11281n002.3.4-010.81.202003091157.out:Unattached inode 3367056499
/home/admin/e2fsck.snx11281n002.3.4-010.81.202003091157.out:Unattached inode 3367056500
/home/admin/e2fsck.snx11281n002.3.4-010.81.202003091157.out:Unattached inode 3367056501
/home/admin/e2fsck.snx11281n002.3.4-010.81.202003091157.out:Unattached inode 3367056507
/home/admin/e2fsck.snx11281n002.3.4-010.81.202003091157.out:Unattached inode 3367056509
/home/admin/e2fsck.snx11281n002.3.4-010.81.202003091157.out:Unattached inode 3367056510
[root@snx11281n000 ~]#
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;unattached inode stat:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;debugfs:  stat &amp;lt;3367056499&amp;gt;

Inode: 3367056499   Type: regular    Mode:  0644   Flags: 0x0

Generation: 3271840411    Version: 0x000001ee:00bad136

User:  1356   Group: 11121   Project:     0   Size: 0

File ACL: 0

Links: 1   Blockcount: 0

Fragment:  Address: 0    Number: 0    Size: 0

 ctime: 0x5e5d9f91:00000000 -- Mon Mar  2 18:06:41 2020

 atime: 0x5e5d9f91:00000000 -- Mon Mar  2 18:06:41 2020

 mtime: 0x5e5d9f91:00000000 -- Mon Mar  2 18:06:41 2020

crtime: 0x5e5da1e3:c1b98cf8 -- Mon Mar  2 18:16:35 2020

Size of extra inode fields: 32

Extended attributes:

  trusted.lma (24) = 00 00 00 00 00 00 00 00 6e 11 10 00 02 00 00 00 84 46 01 00 00 00 00 00

  lma: fid=[0x20010116e:0x14684:0x0] compat=0 incompat=0

  trusted.lov (144)

  trusted.link (60)

BLOCKS:



debugfs:  stat &amp;lt;3367056500&amp;gt;

Inode: 3367056500   Type: regular    Mode:  0644   Flags: 0x0

Generation: 3271840414    Version: 0x000001ee:00bad13a

User:  1356   Group: 11121   Project:     0   Size: 0

File ACL: 0

Links: 1   Blockcount: 0

Fragment:  Address: 0    Number: 0    Size: 0

 ctime: 0x5e5d9f91:00000000 -- Mon Mar  2 18:06:41 2020

 atime: 0x5e5d9f91:00000000 -- Mon Mar  2 18:06:41 2020

 mtime: 0x5e5d9f91:00000000 -- Mon Mar  2 18:06:41 2020

crtime: 0x5e5da1e3:c1b98cf8 -- Mon Mar  2 18:16:35 2020

Size of extra inode fields: 32

Extended attributes:

  trusted.lma (24) = 00 00 00 00 00 00 00 00 6e 11 10 00 02 00 00 00 85 46 01 00 00 00 00 00

  lma: fid=[0x20010116e:0x14685:0x0] compat=0 incompat=0

  trusted.lov (144)

  trusted.link (61)

BLOCKS:

 


debugfs:  stat &amp;lt;3367056501&amp;gt;

Inode: 3367056501   Type: regular    Mode:  0644   Flags: 0x0

Generation: 3271840413    Version: 0x000001ee:00bad143

User:  1356   Group: 11121   Project:     0   Size: 0

File ACL: 0

Links: 1   Blockcount: 0

Fragment:  Address: 0    Number: 0    Size: 0

 ctime: 0x5e5d9f91:00000000 -- Mon Mar  2 18:06:41 2020

 atime: 0x5e5d9f91:00000000 -- Mon Mar  2 18:06:41 2020

 mtime: 0x5e5d9f91:00000000 -- Mon Mar  2 18:06:41 2020

crtime: 0x5e5da1e3:c1b98cf8 -- Mon Mar  2 18:16:35 2020

Size of extra inode fields: 32

Extended attributes:

  trusted.lma (24) = 00 00 00 00 00 00 00 00 6e 11 10 00 02 00 00 00 83 46 01 00 00 00 00 00

  lma: fid=[0x20010116e:0x14683:0x0] compat=0 incompat=0

  trusted.lov (96)

  trusted.link (61)

BLOCKS:

 


debugfs:  stat &amp;lt;3367056507&amp;gt;

Inode: 3367056507   Type: regular    Mode:  0644   Flags: 0x0

Generation: 3271840428    Version: 0x000001ee:00bad146

User:  1356   Group: 11121   Project:     0   Size: 0

File ACL: 0

Links: 1   Blockcount: 0

Fragment:  Address: 0    Number: 0    Size: 0

 ctime: 0x5e5d9f96:00000000 -- Mon Mar  2 18:06:46 2020

 atime: 0x5e5d9f96:00000000 -- Mon Mar  2 18:06:46 2020

 mtime: 0x5e5d9f96:00000000 -- Mon Mar  2 18:06:46 2020

crtime: 0x5e5da1e3:c1f69600 -- Mon Mar  2 18:16:35 2020

Size of extra inode fields: 32

Extended attributes:

  trusted.lma (24) = 00 00 00 00 00 00 00 00 6e 11 10 00 02 00 00 00 98 46 01 00 00 00 00 00

  lma: fid=[0x20010116e:0x14698:0x0] compat=0 incompat=0

  trusted.lov (144)

  trusted.link (61)

BLOCKS:

 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;


&lt;p&gt;dmesg contains the following Lustre error message at the moment of inode creation:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Mar  2 18:16:35 snx11281n002 kernel: LustreError: 2640:0:(osd_handler.c:2009:osd_trans_stop()) snx11281-MDT0000: failed in transaction hook: rc = -114
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;LFSCK was able to successfully re-attach the inodes:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00100000:10000000:1.0:1583880053.552740:0:31921:0:(lfsck_namespace.c:1243:lfsck_namespace_insert_normal()) snx11281-MDT0000-osd: namespace LFSCK insert object [0x20010116e:0x14682:0x0] with the name file.mdtest.17.4708 and type 100000 to the parent [0x200101166:0x1731:0x0]: rc = 1
00100000:10000000:1.0:1583880053.626588:0:31921:0:(lfsck_namespace.c:1243:lfsck_namespace_insert_normal()) snx11281-MDT0000-osd: namespace LFSCK insert object [0x20010116e:0x14683:0x0] with the name file.mdtest.11.4748 and type 100000 to the parent [0x200101166:0x1727:0x0]: rc = 1
00100000:10000000:9.0:1583880053.699071:0:31921:0:(lfsck_namespace.c:1243:lfsck_namespace_insert_normal()) snx11281-MDT0000-osd: namespace LFSCK insert object [0x20010116e:0x14684:0x0] with the name file.mdtest.0.4749 and type 100000 to the parent [0x200101166:0x1730:0x0]: rc = 1
00100000:10000000:6.0F:1583880053.764322:0:31921:0:(lfsck_namespace.c:1243:lfsck_namespace_insert_normal()) snx11281-MDT0000-osd: namespace LFSCK insert object [0x20010116e:0x14685:0x0] with the name file.mdtest.25.4668 and type 100000 to the parent [0x200101166:0x1721:0x0]: rc = 1
00100000:10000000:7.0:1583880053.830815:0:31921:0:(lfsck_namespace.c:1243:lfsck_namespace_insert_normal()) snx11281-MDT0000-osd: namespace LFSCK insert object [0x20010116e:0x14686:0x0] with the name file.mdtest.26.4699 and type 100000 to the parent [0x200101166:0x1725:0x0]: rc = 1
00100000:10000000:9.0:1583880053.963978:0:31921:0:(lfsck_namespace.c:1243:lfsck_namespace_insert_normal()) snx11281-MDT0000-osd: namespace LFSCK insert object [0x20010116e:0x14698:0x0] with the name file.mdtest.22.4696 and type 100000 to the parent [0x200101166:0x1716:0x0]: rc = 1

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="58506">LU-13389</key>
            <summary>aborted file create may leave unattached inodes on MDS.</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="zam">Alexander Zarochentsev</assignee>
                                    <reporter username="zam">Alexander Zarochentsev</reporter>
                        <labels>
                    </labels>
                <created>Wed, 25 Mar 2020 17:00:29 +0000</created>
                <updated>Thu, 7 May 2020 13:39:19 +0000</updated>
                            <resolved>Thu, 7 May 2020 13:39:19 +0000</resolved>
                                                    <fixVersion>Lustre 2.14.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>2</watches>
                                                                            <comments>
                            <comment id="266105" author="zam" created="Wed, 25 Mar 2020 17:11:53 +0000"  >&lt;p&gt;The issue is a regression from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11444&quot; title=&quot;RPC resend may corrupt the data&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11444&quot;&gt;&lt;del&gt;LU-11444&lt;/del&gt;&lt;/a&gt;:&lt;/p&gt;

&lt;p&gt;A check for &quot;obsolete&quot; requests was introduced in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11444&quot; title=&quot;RPC resend may corrupt the data&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11444&quot;&gt;&lt;del&gt;LU-11444&lt;/del&gt;&lt;/a&gt;.&lt;br/&gt;
The server code detects such requests by searching for in-progress requests for the same multimod RPC slot. The ones with lower XIDs occupying the same slot get marked &quot;obsolete&quot; because the application that sent the request was interrupted and the slot was reused.&lt;br/&gt;
Processing of &quot;obsolete&quot; requests is done as usual, but the last_rcvd file is not updated.&lt;/p&gt;

&lt;p&gt;The &quot;rq_obsolete&quot; flag causes osd_trans_stop() to return an error:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
tgt_add_reply_data():
...
                &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (req-&amp;gt;rq_obsolete) {
                        mutex_unlock(&amp;amp;ted-&amp;gt;ted_lcd_lock);
                        RETURN(-EALREADY);
                }

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt; 

&lt;p&gt;The full call stack looks like this:&lt;/p&gt;

&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;osd_trans_stop()&lt;/li&gt;
	&lt;li&gt;dt_txn_hook_stop()&lt;/li&gt;
	&lt;li&gt;tgt_txn_stop_cb()&lt;/li&gt;
	&lt;li&gt;tgt_last_rcvd_update()&lt;/li&gt;
	&lt;li&gt;tgt_mk_reply_data()&lt;/li&gt;
	&lt;li&gt;tgt_add_reply_data()&lt;/li&gt;
	&lt;li&gt;RETURN(-EALREADY);&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;and mdd_create() attempts to roll back the transaction:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
out_stop:
        rc2 = mdd_trans_stop(env, mdd, rc, handle);
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (rc == 0) {
                /* If creation fails, it is most likely due to the remote update
                 * failure, because local transaction will mostly succeed at
                 * &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; stage. There is no easy way to rollback all of previous
                 * updates, so let&apos;s remove the object from namespace, and
                 * LFSCK should handle the orphan object. */
                &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (rc2 &amp;lt; 0 &amp;amp;&amp;amp; !mdd_object_remote(mdd_pobj))
                        mdd_index_delete(env, mdd_pobj, attr, lname);
                rc = rc2;
        }

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Unfortunately, the rollback is only partial: only the name is removed. This has two bad side effects: filesystem inconsistency in the form of unattached inodes, and LFSCK being able to re-attach the inodes to the namespace, which is somewhat unexpected.&lt;/p&gt;

&lt;p&gt;I should note that those &quot;obsolete&quot; requests may not survive a failover, as the client has already dropped the request from all of the import&apos;s queues. This means the request cannot participate in recovery and can be lost after a failover. So transaction rollback doesn&apos;t look like a wrong approach; it reduces the chance of seeing filesystem object loss after failover. Also, nobody depends on successful completion of the file create, since the application has already aborted.&lt;/p&gt;

&lt;p&gt;I see several possible solutions to the problem:&lt;/p&gt;

&lt;p&gt;1. Roll back the transaction fully, which includes inode deletion and changelog record update or deletion.&lt;br/&gt;
2. Leave the file in the namespace, accepting that it can be lost after a failover. It worked that way before LUS-7339 / LUS-6272 and caused no problems, but I might be wrong here.&lt;br/&gt;
3. Leave the inodes unattached and somehow prevent LFSCK from re-attaching them (by removing the LinkEA record?).&lt;/p&gt;</comment>
                            <comment id="267547" author="gerrit" created="Tue, 14 Apr 2020 13:03:01 +0000"  >&lt;p&gt;Alexander Zarochentsev (alexander.zarochentsev@hpe.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/38221&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/38221&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13389&quot; title=&quot;aborted file create may leave unattached inodes on MDS.&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13389&quot;&gt;&lt;del&gt;LU-13389&lt;/del&gt;&lt;/a&gt; tgt: not rollback obsolete rq&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 5c494fb023b934c24863dd76f3d2dc316c9fc687&lt;/p&gt;</comment>
                            <comment id="269493" author="gerrit" created="Thu, 7 May 2020 05:42:48 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/38221/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/38221/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13389&quot; title=&quot;aborted file create may leave unattached inodes on MDS.&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13389&quot;&gt;&lt;del&gt;LU-13389&lt;/del&gt;&lt;/a&gt; tgt: not rollback obsolete rq&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: e87e30460b18e73d1dbf2627e59485ae3d670e60&lt;/p&gt;</comment>
                            <comment id="269541" author="pjones" created="Thu, 7 May 2020 13:39:19 +0000"  >&lt;p&gt;Landed for 2.14&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00wbj:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>