<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:37:45 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-3884] hsm_release hang at local root finding with quota enabled</title>
                <link>https://jira.whamcloud.com/browse/LU-3884</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;I did quota test today and found a problem with hsm_release. The test script is as follows:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;#!/bin/bash

setup() {
        ( cd srcs/lustre/lustre/tests; sh llmount.sh )

        lctl set_param mdt.*.hsm_control=enabled

        rm -rf /tmp/arc
        mkdir /tmp/arc
        ~/srcs/lustre/lustre/utils/lhsmtool_posix --daemon --hsm-root /tmp/arc /mnt/lustre

        lctl conf_param lustre.quota.ost=u
        lctl conf_param lustre.quota.mdt=u
}

LFS=~/srcs/lustre/lustre/utils/lfs
file=/mnt/lustre/testfile

setup

rm -f $file
dd &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt;=/dev/zero of=$file bs=1M count=30
chown tstusr.tstusr $file

set -x

$LFS hsm_archive $file
&lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; $LFS hsm_state $file |grep -qv archived; &lt;span class=&quot;code-keyword&quot;&gt;do&lt;/span&gt;
        sleep 1
done
$LFS hsm_state $file

lctl set_param debug=-1
lctl set_param debug_mb=500
lctl dk &amp;gt; /dev/&lt;span class=&quot;code-keyword&quot;&gt;null&lt;/span&gt;

count=0
&lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; :; &lt;span class=&quot;code-keyword&quot;&gt;do&lt;/span&gt;
        lctl mark &lt;span class=&quot;code-quote&quot;&gt;&quot;############# $count&quot;&lt;/span&gt;
        count=$((count+1))

        $LFS hsm_release $file
        $LFS hsm_state $file

        $LFS hsm_restore $file
        $LFS hsm_state $file

        sleep 1
done
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The output on the console before the script hung:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;+ /Users/jinxiong/srcs/lustre/lustre/utils/lfs hsm_state /mnt/lustre/testfile
+ grep -qv archived
+ /Users/jinxiong/srcs/lustre/lustre/utils/lfs hsm_state /mnt/lustre/testfile
/mnt/lustre/testfile: (0x00000009) exists archived, archive_id:1
+ lctl set_param debug=-1
debug=-1
+ lctl set_param debug_mb=500
debug_mb=500
+ lctl dk
+ count=0
+ :
+ lctl mark &apos;############# 0&apos;
+ count=1
+ /Users/jinxiong/srcs/lustre/lustre/utils/lfs hsm_release /mnt/lustre/testfile
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;It looks like the mdt thread hung while finding the local root object; for some unknown reason, the local root object was being deleted. This sounds impossible, but it happened:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;LNet: Service thread pid 2945 was inactive for 40.00s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:
Pid: 2945, comm: mdt_rdpg00_001

Call Trace:
 [&amp;lt;ffffffffa03c466e&amp;gt;] cfs_waitq_wait+0xe/0x10 [libcfs]
 [&amp;lt;ffffffffa056ffa7&amp;gt;] lu_object_find_at+0xb7/0x360 [obdclass]
 [&amp;lt;ffffffff81063410&amp;gt;] ? default_wake_function+0x0/0x20
 [&amp;lt;ffffffffa0570266&amp;gt;] lu_object_find+0x16/0x20 [obdclass]
 [&amp;lt;ffffffffa0bf5b16&amp;gt;] mdt_object_find+0x56/0x170 [mdt]
 [&amp;lt;ffffffffa0c264ef&amp;gt;] mdt_mfd_close+0x15ef/0x1b60 [mdt]
 [&amp;lt;ffffffffa03d3900&amp;gt;] ? libcfs_debug_vmsg2+0xba0/0xbb0 [libcfs]
 [&amp;lt;ffffffffa0c27e32&amp;gt;] mdt_close+0x682/0xac0 [mdt]
 [&amp;lt;ffffffffa0bffa4a&amp;gt;] mdt_handle_common+0x52a/0x1470 [mdt]
 [&amp;lt;ffffffffa0c39365&amp;gt;] mds_readpage_handle+0x15/0x20 [mdt]
 [&amp;lt;ffffffffa0709a55&amp;gt;] ptlrpc_server_handle_request+0x385/0xc00 [ptlrpc]
 [&amp;lt;ffffffffa03c454e&amp;gt;] ? cfs_timer_arm+0xe/0x10 [libcfs]
 [&amp;lt;ffffffffa03d540f&amp;gt;] ? lc_watchdog_touch+0x6f/0x170 [libcfs]
 [&amp;lt;ffffffffa03d3951&amp;gt;] ? libcfs_debug_msg+0x41/0x50 [libcfs]
 [&amp;lt;ffffffff81055ad3&amp;gt;] ? __wake_up+0x53/0x70
 [&amp;lt;ffffffffa070ad9d&amp;gt;] ptlrpc_main+0xacd/0x1710 [ptlrpc]
 [&amp;lt;ffffffffa070a2d0&amp;gt;] ? ptlrpc_main+0x0/0x1710 [ptlrpc]
 [&amp;lt;ffffffff81096a36&amp;gt;] kthread+0x96/0xa0
 [&amp;lt;ffffffff8100c0ca&amp;gt;] child_rip+0xa/0x20
 [&amp;lt;ffffffff810969a0&amp;gt;] ? kthread+0x0/0xa0
 [&amp;lt;ffffffff8100c0c0&amp;gt;] ? child_rip+0x0/0x20
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I suspect this issue is related to quota because if I turned quota off everything became all right.&lt;/p&gt;</description>
                <environment></environment>
        <key id="20783">LU-3884</key>
            <summary>hsm_release hang at local root finding with quota enabled</summary>
                <type id="7" iconUrl="https://jira.whamcloud.com/images/icons/issuetypes/task_agile.png">Technical task</type>
                            <parent id="20020">LU-3647</parent>
                                    <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="niu">Niu Yawei</assignee>
                                    <reporter username="jay">Jinshan Xiong</reporter>
                        <labels>
                            <label>HSM</label>
                            <label>MB</label>
                    </labels>
                <created>Thu, 5 Sep 2013 06:04:01 +0000</created>
                <updated>Fri, 20 Sep 2013 21:34:51 +0000</updated>
                            <resolved>Fri, 20 Sep 2013 21:34:51 +0000</resolved>
                                                    <fixVersion>Lustre 2.5.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>7</watches>
                                                                            <comments>
                            <comment id="65864" author="pjones" created="Thu, 5 Sep 2013 17:54:37 +0000"  >&lt;p&gt;Niu&lt;/p&gt;

&lt;p&gt;Could you please comment on this one?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="66050" author="niu" created="Mon, 9 Sep 2013 11:36:23 +0000"  >&lt;p&gt;This can be reproduced even without the chown operation, and I&apos;m not sure how quota code can affect this test, since there is almost no quota code involved. I&apos;ll investigate it further.&lt;/p&gt;</comment>
                            <comment id="66176" author="niu" created="Tue, 10 Sep 2013 13:56:55 +0000"  >&lt;p&gt;I think I see the problem:&lt;/p&gt;

&lt;p&gt;lu_object_put_nocache(obj) marks an object as dying, so when lu_object_find_at() finds the dying object, it waits for the dying object to be freed and then tries the lookup again. The problem with this logic is that if the object is held by somebody (and so never gets freed), lu_object_find_at() will wait on the dying object forever.&lt;/p&gt;

&lt;p&gt;In this specific case: the root object is held by lfsck (lfsck-&amp;gt;li_local_root), and the quota code calls the local storage API to create quota files: local_oid_storage_init() -&amp;gt; lastid_compat_check() -&amp;gt; lu_object_put_nocache(root), so the root object is marked as dying but is never freed.&lt;/p&gt;

&lt;p&gt;Given the mechanism of lu_object_put_nocache(), I think nobody should hold any object. nasf, what do you think? Could we just remove the li_local_root and get the object on demand?&lt;/p&gt;
</comment>
                            <comment id="66224" author="jay" created="Tue, 10 Sep 2013 18:20:34 +0000"  >&lt;p&gt;probably we shouldn&apos;t use nocache version of lu_object_put() at all.&lt;/p&gt;</comment>
                            <comment id="66298" author="niu" created="Wed, 11 Sep 2013 03:08:28 +0000"  >&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/7604&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/7604&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="66303" author="jay" created="Wed, 11 Sep 2013 04:30:39 +0000"  >&lt;p&gt;why does it call lu_object_put_nocache() in the first place?&lt;/p&gt;</comment>
                            <comment id="66307" author="bzzz" created="Wed, 11 Sep 2013 05:04:56 +0000"  >&lt;p&gt;&amp;gt; why does it call lu_object_put_nocache() in the first place?&lt;/p&gt;

&lt;p&gt;because different stacks may want to use this object and expect own slices (at different time).&lt;br/&gt;
this is a known issue..&lt;/p&gt;

&lt;p&gt;I don&apos;t think NOCACHE can really help here, because it just postpones potential issues.&lt;/p&gt;</comment>
                            <comment id="66308" author="jay" created="Wed, 11 Sep 2013 05:31:58 +0000"  >&lt;p&gt;in that case, it seems like it&apos;s not allowed to hold object before the stack is fully initialized.&lt;/p&gt;</comment>
                            <comment id="66309" author="bzzz" created="Wed, 11 Sep 2013 05:41:03 +0000"  >&lt;p&gt;it&apos;s not just stack initialization.. / has been accessed by a few components during runtime. I&apos;m trying to recall all the details.&lt;/p&gt;</comment>
                            <comment id="66563" author="niu" created="Fri, 13 Sep 2013 02:34:27 +0000"  >&lt;p&gt;Hi, Mike&lt;/p&gt;

&lt;p&gt;Any input on this? Is it possible to get rid of lu_object_put_nocache() for local storage? Thanks.&lt;/p&gt;</comment>
                            <comment id="66571" author="bzzz" created="Fri, 13 Sep 2013 07:03:39 +0000"  >&lt;p&gt;as a short term fix I&apos;d suggest to fix LFSCK - there is no real need to hold local root object.&lt;/p&gt;</comment>
                            <comment id="66572" author="jay" created="Fri, 13 Sep 2013 07:15:04 +0000"  >&lt;p&gt;nasf, can you comment?&lt;/p&gt;</comment>
                            <comment id="66582" author="yong.fan" created="Fri, 13 Sep 2013 13:01:01 +0000"  >&lt;p&gt;This is the patch to release the root reference held by LFSCK:&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/#/c/7643/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/7643/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="67174" author="pjones" created="Fri, 20 Sep 2013 21:34:51 +0000"  >&lt;p&gt;Landed for 2.5.0&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzw047:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>10098</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                </customfields>
    </item>
</channel>
</rss>