<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:50:47 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-5357] GPF in lod_trans_stop()</title>
                <link>https://jira.whamcloud.com/browse/LU-5357</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;While removing a striped directory, if out_create_update_req() cannot allocate an update request or its 8K buffer, we have the following GPF (likely use after free).&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;export MDSCOUNT=4
llmount.sh
cd /mnt/lustre
lfs mkdir -c4 d0
echo /root/lustre-release/lustre/ptlrpc/../../lustre/target/out_lib.c:70 &amp;gt; /proc/fs/lustre/alloc_fail # fail to allocate dt_update in out_create_update_req()
rmdir d0

[   85.080526] LustreError: 4395:0:(class_obd.c:198:obd_alloc_fail()) force kmalloc of dt_update (72 bytes) failed at /root/lustre-release/lustre/ptlrpc/../../lustre/target/out_lib.c:70
[   85.085387] LustreError: 4395:0:(class_obd.c:205:obd_alloc_fail()) 63673246 total bytes and 1048576 total pages (256 bytes) allocated by Lustre, 407071412 total bytes by LNET
[   94.798945] Lustre: ctl-lustre-MDT0000: super-sequence allocation rc = 0 [0x0000000300000400-0x0000000340000400):2:mdt
[   94.801799] Lustre: Skipped 1 previous similar message
[   94.803637] Lustre: cli-ctl-lustre-MDT0002: Allocated super-sequence [0x0000000300000400-0x0000000340000400):2:mdt]
[  103.921859] LustreError: 4804:0:(class_obd.c:198:obd_alloc_fail()) force kmalloc of dt_update (72 bytes) failed at /root/lustre-release/lustre/ptlrpc/../../lustre/target/out_lib.c:70
[  103.927270] LustreError: 4804:0:(class_obd.c:205:obd_alloc_fail()) 63959198 total bytes and 1048576 total pages (256 bytes) allocated by Lustre, 407352740 total bytes by LNET
[  103.932656] LustreError: 4804:0:(osp_md_object.c:237:osp_md_declare_attr_set()) lustre-MDT0001-osp-MDT0000: Get OSP update buf failed: -12
[  103.936893] LustreError: 4804:0:(lod_object.c:1081:lod_declare_attr_set()) failed declaration: -12
[  103.939281] general protection fault: 0000 [#1] SMP
[  103.940247] last sysfs file: /sys/devices/system/cpu/possible
[  103.940247] CPU 0
[  103.940247] Modules linked in: lustre(U) ofd(U) osp(U) lod(U) ost(U) mdt(U) mdd(U) mgs(U) nodemap(U) osd_ldiskfs(U) ldiskfs(U) exportfs lquota(U) lfsck(U) jbd obdecho(U) mgc(U) lov(U) osc(U) mdc(U) lmv(U) fid(U) fld(U) ptlrpc(U) obdclass(U) ksocklnd(U) lnet(U) sha512_generic sha256_generic libcfs(U) autofs4 nfs lockd fscache auth_rpcgss nfs_acl sunrpc ipv6 microcode virtio_balloon virtio_net i2c_piix4 i2c_core ext4 jbd2 mbcache virtio_blk virtio_pci virtio_ring virtio pata_acpi ata_generic ata_piix dm_mirror dm_region_hash dm_log dm_mod [last unloaded: speedstep_lib]
[  103.940247]
[  103.940247] Pid: 4804, comm: mdt00_004 Not tainted 2.6.32-431.5.1.el6.lustre.x86_64 #1 Bochs Bochs
[  103.940247] RIP: 0010:[&amp;lt;ffffffffa0d3d890&amp;gt;]  [&amp;lt;ffffffffa0d3d890&amp;gt;] lod_trans_stop+0x110/0x210 [lod]
[  103.940247] RSP: 0018:ffff8801e244bab0  EFLAGS: 00010292
[  103.940247] RAX: 6b6b6b6b6b6b6b6b RBX: ffff8801e85d6080 RCX: 0000000000000000
[  103.940247] RDX: 0000000000000000 RSI: ffff8801e244cf58 RDI: ffff880219e58000
[  103.940247] RBP: ffff8801e244bae0 R08: 0000000000000000 R09: 0000000000000000
[  103.940247] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
[  103.940247] R13: ffff8802199fcab0 R14: ffff8801fbac8ae0 R15: ffff8801fe3fdba8
[  103.940247] FS:  0000000000000000(0000) GS:ffff88002f800000(0000) knlGS:0000000000000000
[  103.940247] CS:  0010 DS: 0018 ES: 0018 CR0: 000000008005003b
[  103.940247] CR2: 000000377fedc920 CR3: 00000002162cd000 CR4: 00000000000006f0
[  103.940247] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  103.940247] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[  103.940247] Process mdt00_004 (pid: 4804, threadinfo ffff8801e244a000, task ffff8801e97a8580)
[  103.940247] Stack:
[  103.940247]  ffff8801e244bad0 00000000fffffff4 ffff8801fbac8ae0 ffff8801eda3c820
[  103.940247] &amp;lt;d&amp;gt; ffff8801fe3b2c30 ffff8801fe3fdba8 ffff8801e244baf0 ffffffffa081f08d
[  103.940247] &amp;lt;d&amp;gt; ffff8801e244bbb0 ffffffffa08097e9 ffffffffa0cb47ba ffff8801eda3a7b0
[  103.940247] Call Trace:
[  103.940247]  [&amp;lt;ffffffffa081f08d&amp;gt;] mdd_trans_stop+0x1d/0x20 [mdd]
[  103.940247]  [&amp;lt;ffffffffa08097e9&amp;gt;] mdd_unlink+0x4b9/0xcc0 [mdd]
[  103.940247]  [&amp;lt;ffffffffa0cb47ba&amp;gt;] ? mdt_reint_unlink+0x9ca/0x10b0 [mdt]
[  103.940247]  [&amp;lt;ffffffffa0cab968&amp;gt;] mdo_unlink+0x18/0x50 [mdt]
[  103.940247]  [&amp;lt;ffffffffa0cb47f4&amp;gt;] mdt_reint_unlink+0xa04/0x10b0 [mdt]
[  103.940247]  [&amp;lt;ffffffffa0c8ee45&amp;gt;] ? mdt_ucred+0x15/0x20 [mdt]
[  103.940247]  [&amp;lt;ffffffffa0cab701&amp;gt;] mdt_reint_rec+0x41/0xe0 [mdt]
[  103.940247]  [&amp;lt;ffffffffa0c96a63&amp;gt;] mdt_reint_internal+0x4c3/0x7c0 [mdt]
[  103.940247]  [&amp;lt;ffffffffa0c972eb&amp;gt;] mdt_reint+0x6b/0x120 [mdt]
[  103.940247]  [&amp;lt;ffffffffa06e9675&amp;gt;] tgt_request_handle+0x245/0xad0 [ptlrpc]
[  103.940247]  [&amp;lt;ffffffffa069c921&amp;gt;] ptlrpc_main+0xcf1/0x1870 [ptlrpc]
[  103.940247]  [&amp;lt;ffffffffa069bc30&amp;gt;] ? ptlrpc_main+0x0/0x1870 [ptlrpc]
[  103.940247]  [&amp;lt;ffffffff8109eab6&amp;gt;] kthread+0x96/0xa0
[  103.940247]  [&amp;lt;ffffffff8100c30a&amp;gt;] child_rip+0xa/0x20
[  103.940247]  [&amp;lt;ffffffff81554710&amp;gt;] ? _spin_unlock_irq+0x30/0x40
[  103.940247]  [&amp;lt;ffffffff8100bb10&amp;gt;] ? restore_args+0x0/0x30
[  103.940247]  [&amp;lt;ffffffff8109ea20&amp;gt;] ? kthread+0x0/0xa0
[  103.940247]  [&amp;lt;ffffffff8100c300&amp;gt;] ? child_rip+0x0/0x20
[  103.940247] Code: 00 bb 01 00 00 48 c7 05 eb b4 03 00 00 00 00 00 c7 05 d9 b4 03 00 01 00 00 00 e8 3c 17 59 ff e9 34 ff ff ff 49 8b 45 00 49 39 c5 &amp;lt;4c&amp;gt; 8b 38 74 4d 48 8b 70 f8 48 8b 46 40 48 8b 40 18 48 85 c0 0f
[  103.940247] RIP  [&amp;lt;ffffffffa0d3d890&amp;gt;] lod_trans_stop+0x110/0x210 [lod]
[  103.940247]  RSP &amp;lt;ffff8801e244bab0&amp;gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The fault is in the list_for_each_entry_safe() block. It seems likely that there were no updates in the list and that the first dt_trans_stop() freed the thandle.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;&lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; lod_trans_stop(&lt;span class=&quot;code-keyword&quot;&gt;const&lt;/span&gt; struct lu_env *env, struct dt_device *dt,
                          struct thandle *th)
{
        struct thandle_update           *tu = th-&amp;gt;th_update;
        struct dt_update_request        *update;
        struct dt_update_request        *tmp;
        &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;                             rc2 = 0;
        &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;                             rc;
        ENTRY;

        CERROR(&lt;span class=&quot;code-quote&quot;&gt;&quot;dt = %p, th = %p\n&quot;&lt;/span&gt;, dt, th);

        rc = dt_trans_stop(env, th-&amp;gt;th_dev, th);
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (likely(tu == NULL))
                RETURN(rc);

        list_for_each_entry_safe(update, tmp,
                                 &amp;amp;tu-&amp;gt;tu_remote_update_list,
                                 dur_list) {
                &lt;span class=&quot;code-comment&quot;&gt;/* update will be freed inside dt_trans_stop */&lt;/span&gt;
                rc2 = dt_trans_stop(env, update-&amp;gt;dur_dt, th);
                &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (unlikely(rc2 != 0 &amp;amp;&amp;amp; rc == 0))
                        rc = rc2;
        }

        RETURN(rc);
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This was found via memory allocation fault injection.&lt;/p&gt;</description>
                <environment></environment>
        <key id="25607">LU-5357</key>
            <summary>GPF in lod_trans_stop()</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="di.wang">Di Wang</assignee>
                                    <reporter username="jhammond">John Hammond</reporter>
                        <labels>
                            <label>lod</label>
                            <label>malloc</label>
                            <label>mdd</label>
                    </labels>
                <created>Tue, 15 Jul 2014 20:50:04 +0000</created>
                <updated>Mon, 27 Apr 2015 20:31:20 +0000</updated>
                            <resolved>Tue, 3 Feb 2015 18:38:56 +0000</resolved>
                                    <version>Lustre 2.6.0</version>
                    <version>Lustre 2.7.0</version>
                                    <fixVersion>Lustre 2.7.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>5</watches>
                                                                            <comments>
                            <comment id="98603" author="jhammond" created="Thu, 6 Nov 2014 21:23:50 +0000"  >&lt;p&gt;Hi Di, do we need to stop the local transaction before we stop the transactions in the remote update list?&lt;/p&gt;

&lt;p&gt;Also I see that the transactions in the remote update list do not always take references on the local transaction. Is that intended?&lt;/p&gt;</comment>
                            <comment id="98623" author="di.wang" created="Thu, 6 Nov 2014 23:52:52 +0000"  >&lt;p&gt;Yes, we need to stop the local transaction before send remote RPC, because we want to avoid sending RPC in the middle of transaction. what do you mean take references?&lt;/p&gt;

&lt;p&gt;Yes, we only hold the reference for real cross-MDT operations; in other cases, like LFSCK, we do not hold it, and LFSCK maintains it by itself. Actually this method is a bit messy, which is why we came up with a new way to separate the thandle in the next phase of DNE.&lt;/p&gt;</comment>
                            <comment id="98653" author="jhammond" created="Fri, 7 Nov 2014 14:20:40 +0000"  >&lt;p&gt;&amp;gt; Yes, we need to stop the local transaction before send remote RPC, because we want to avoid sending RPC in the middle of transaction. what do you mean take references?&lt;/p&gt;

&lt;p&gt;I mean call thandle_get().&lt;/p&gt;

&lt;p&gt;Or perhaps it&apos;s enough to call thandle_get() at the start of lod_trans_stop() and thandle_put() at the end.&lt;/p&gt;</comment>
                            <comment id="100858" author="adilger" created="Fri, 5 Dec 2014 18:56:47 +0000"  >&lt;p&gt;Di, is this code being fixed with your DNE2 transaction handle changes?&lt;/p&gt;</comment>
                            <comment id="100879" author="di.wang" created="Fri, 5 Dec 2014 21:52:38 +0000"  >&lt;p&gt;Andreas: yes, this process will be changed in DNE2 transaction handles.  John: I think you are right, adding thandle_get/put to lod_trans_stop should fix this problem in 2.7. Thanks. Sorry for the delayed response.&lt;/p&gt;</comment>
                            <comment id="103630" author="gerrit" created="Thu, 15 Jan 2015 18:42:08 +0000"  >&lt;p&gt;wangdi (di.wang@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/13420&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/13420&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5357&quot; title=&quot;GPF in lod_trans_stop()&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5357&quot;&gt;&lt;del&gt;LU-5357&lt;/del&gt;&lt;/a&gt; lod: hold thandle during lod_trans_stop&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: b05355cb44e2cc059627548be55f8a5592753978&lt;/p&gt;</comment>
                            <comment id="105545" author="gerrit" created="Tue, 3 Feb 2015 18:28:57 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;http://review.whamcloud.com/13420/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/13420/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5357&quot; title=&quot;GPF in lod_trans_stop()&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5357&quot;&gt;&lt;del&gt;LU-5357&lt;/del&gt;&lt;/a&gt; lod: hold thandle during lod_trans_stop&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: c236efcd5ed186c5813cc6b11d0b1b12d8ec0734&lt;/p&gt;</comment>
                            <comment id="105551" author="pjones" created="Tue, 3 Feb 2015 18:38:56 +0000"  >&lt;p&gt;Landed for 2.7&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzwrl3:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>14940</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>