<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:25:51 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-16308] cl_object_put_last() is stuck in wait_event(atomic_read(&amp;header-&gt;loh_ref) == 1)</title>
                <link>https://jira.whamcloud.com/browse/LU-16308</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;client may get stuck in cl_object_put_last():&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[465342.626191] INFO: task nwchem:108679 blocked for more than 120 seconds.
[465342.632934]       Tainted: G           OE    --------- -t - 4.18.0-193.el8.x86_64 #1
[465342.640783] &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
[465342.648720] nwchem          D    0 108679 108659 0x00004082
[465342.655115] Call Trace:
[465342.658245]  ? __schedule+0x24f/0x650
[465342.662607]  schedule+0x2f/0xa0
[465342.666446]  cl_inode_fini+0x137/0x1e0 [lustre]
[465342.671705]  ? wake_up_q+0x70/0x70
[465342.675813]  ll_clear_inode+0x1b3/0x570 [lustre]
[465342.681197]  ll_delete_inode+0x58/0x220 [lustre]
[465342.686571]  evict+0xd2/0x1a0
[465342.690291]  do_unlinkat+0x250/0x2e0
[465342.694604]  do_syscall_64+0x5b/0x1a0
[465342.698910]  entry_SYSCALL_64_after_hwframe+0x65/0xca
[465342.704672] RIP: 0033:0x7fd72cf373cb
[465342.708989] Code: Bad RIP value.
[465342.712929] RSP: 002b:00007ffe0b19b5d8 EFLAGS: 00000246 ORIG_RAX: 0000000000000057
[465342.721236] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fd72cf373cb
[465342.729119] RDX: 0000000000000010 RSI: 0000000000000000 RDI: 00007ffe0b19ba10
[465342.736982] RBP: 00007ffe0b19ba10 R08: 0000000000000000 R09: 0000000000000000
[465342.744820] R10: 0000000000000011 R11: 0000000000000246 R12: 0000000000000000
[465342.752638] R13: 00007ffe0b19dd70 R14: 00007ffe0b19beb0 R15: 00000000010a9e9d
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;It should be woken up by lu_object_free():&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (waitqueue_active(wq))
                wake_up(wq);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;But according to description of waitqueue_active(), a smp_mb()/spinlock is needed to wake up reliably.&lt;/p&gt;</description>
                <environment></environment>
        <key id="73227">LU-16308</key>
            <summary>cl_object_put_last() is stuck in wait_event(atomic_read(&amp;header-&gt;loh_ref) == 1)</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="laisiyao">Lai Siyao</assignee>
                                    <reporter username="laisiyao">Lai Siyao</reporter>
                        <labels>
                    </labels>
                <created>Fri, 11 Nov 2022 03:48:45 +0000</created>
                <updated>Wed, 4 Jan 2023 02:24:26 +0000</updated>
                            <resolved>Tue, 13 Dec 2022 23:35:44 +0000</resolved>
                                                    <fixVersion>Lustre 2.16.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>8</watches>
                                                                            <comments>
                            <comment id="352720" author="gerrit" created="Fri, 11 Nov 2022 04:01:24 +0000"  >&lt;p&gt;&quot;Lai Siyao &amp;lt;lai.siyao@whamcloud.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/49130&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/49130&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16308&quot; title=&quot;cl_object_put_last() is stuck in wait_event(atomic_read(&amp;amp;header-&amp;gt;loh_ref) == 1)&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16308&quot;&gt;&lt;del&gt;LU-16308&lt;/del&gt;&lt;/a&gt; obdclass: wakeup cl_inode_fini() reliably&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 7d7ed230de0de7a3499a5f393300fae64e7846ec&lt;/p&gt;</comment>
                            <comment id="353427" author="neilb" created="Thu, 17 Nov 2022 22:15:36 +0000"  >&lt;p&gt;I think the problem here is not related to that waitqueue_active().&lt;/p&gt;

&lt;p&gt;The code that is blocking is waiting for -&amp;gt;loh_ref to be one.&#160; All code that decrements -&amp;gt;loh_ref goes on to call wake_up() unconditionally, so -&amp;gt;loh_ref must still be at least two.&lt;/p&gt;

&lt;p&gt;Just before cl_inode_fini() calls cl_object_put_last(), it calls cl_object_kill() which only sets LU_OBJECT_HEARD_BANSHEE.&#160; However it does NOT then wake up the wq.&#160; So some other thread may be holding a reference and waiting for LU_OBJECT_HEARD_BANSHEE to be set.&#160; It will wait indefinitely.&lt;/p&gt;

&lt;p&gt;All other callers of cl_object_kill() then call cl_object_put() which will do the wake_up().&#160; cl_object_put_last() does NOT do a wakeup, and hence the hang.&lt;/p&gt;

&lt;p&gt;To fix this you need to add a wakeup call, either in cl_object_kill or cl_object_put_last, or in cl_inode_fini between the two calls.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="354385" author="ys" created="Mon, 28 Nov 2022 21:29:55 +0000"  >&lt;p&gt;Hi, Neil,&lt;/p&gt;

&lt;p&gt;wake_up is called every time in lu_object_put() while loh_ref != 1, so I don&apos;t think the issue you pointed out exists in the master branch. But it does exist in other branches that still use the old code from before your wait_event patches. In the old code, lu_object_put() uses a local variable is_dying to save the value of lu_object_is_dying(), then decrements loh_ref and wakes up only when is_dying is true. So it may lose the chance to wake up the waiter in cl_object_put_last().&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
YangSheng&lt;/p&gt;</comment>
                            <comment id="354397" author="neilb" created="Mon, 28 Nov 2022 23:13:59 +0000"  >&lt;p&gt;I&apos;m sorry but I don&apos;t understand what point you are trying to make.&lt;/p&gt;

&lt;p&gt;Yes, wake_up is called&#160; in lu_object_put().&#160; Every time loh_ref is decremented wake_up is called.&#160; That is good, but that is not the problem.&lt;/p&gt;

&lt;p&gt;The problem relates to LU_OBJECT_HEARD_BANSHEE.&#160; wake_up must be called whenever that is set.&lt;/p&gt;

&lt;p&gt;Some code holds a reference (and so keeps loh_ref elevated) while waiting for LU_OBJECT_HEARD_BANSHEE to be set.&lt;/p&gt;

&lt;p&gt;Specifically lu_object_find_at waits either for that flag to be set, or for LU_OBJECT_INITED to be set.&#160; If HEARD_BANSHEE is set, then it will call lu_object_put() which will decrement loh_ref and then call wake_up().&#160; But if the flag is set but no wake_up happens, then it can block indefinitely holding a reference.&lt;/p&gt;

&lt;p&gt;So if cl_inode_fini() runs while lu_object_find_at() is waiting, then a deadlock will happen because cl_inode_fini()-&amp;gt;cl_object_put_last() will wait for all other references to the object to be dropped, but lu_object_find_at() is still holding a reference which it won&apos;t drop until it gets a wakeup.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;As you say, back in 2.13 and earlier the local is_dying variable in lu_object_put() was racy.&#160; If anyone is still using 2.13 or earlier, it should probably be fixed.&#160; Remove the variable and call wake_up unconditionally.&#160; But that is a separate problem.&lt;/p&gt;</comment>
                            <comment id="354447" author="ys" created="Tue, 29 Nov 2022 07:17:20 +0000"  >&lt;p&gt;Hi, Neil,&lt;/p&gt;

&lt;p&gt;As you said, lu_object_find_at() will wait for LU_OBJECT_HEARD_BANSHEE to be set. In fact, it is the only place that does so. It waits for the object to have either LU_OBJECT_HEARD_BANSHEE or LU_OBJECT_INITED set. LU_OBJECT_INITED is set when lu_object_start() succeeds, and LU_OBJECT_HEARD_BANSHEE is set when lu_object_start() fails. In other words, the wait_event call will have ended by the time lu_object_find_at() returns. It cannot interact with cl_object_kill() and cl_object_put_last().&lt;/p&gt;


&lt;p&gt;Thanks,&lt;br/&gt;
YangSheng&lt;/p&gt;</comment>
                            <comment id="354583" author="neilb" created="Tue, 29 Nov 2022 21:10:24 +0000"  >&lt;p&gt;Hi YangSheng,&lt;/p&gt;

&lt;p&gt;do you have a crash-dump from a time when this hang was happening, or are you able to reproduce it?&lt;/p&gt;

&lt;p&gt;It would be helpful to find out what other tasks are blocked - to confirm that lu_object_find_at() isn&apos;t waiting, and to see if anything else is.&lt;/p&gt;

&lt;p&gt;I think there must be something else holding onto a reference to the object.&#160; Maybe we just need to audit all code that takes a reference to ensure that it drops it in a timely fashion.&lt;/p&gt;

&lt;p&gt;In which version of Lustre has this problem been seen?&lt;/p&gt;</comment>
                            <comment id="354629" author="ys" created="Wed, 30 Nov 2022 05:44:45 +0000"  >&lt;p&gt;Hi, Neil,&lt;/p&gt;

&lt;p&gt;Yes, We have two instances for this issue. All of them stuck on:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;static void cl_object_put_last(struct lu_env *env, struct cl_object *obj)
{
        struct lu_object_header *header = obj-&amp;gt;co_lu.lo_header;

        if (unlikely(atomic_read(&amp;amp;header-&amp;gt;loh_ref) != 1)) {
                struct lu_site *site = obj-&amp;gt;co_lu.lo_dev-&amp;gt;ld_site;
                wait_queue_head_t *wq;

                wq = lu_site_wq_from_fid(site, &amp;amp;header-&amp;gt;loh_fid);

                wait_event(*wq, atomic_read(&amp;amp;header-&amp;gt;loh_ref) == 1); &amp;lt;&amp;lt;&amp;lt;-----
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;And the header is :&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;
struct lu_object_header {
  loh_fid = {
    f_seq = 0x20005e972,
    f_oid = 0x189fc,
    f_ver = 0x0
  },
  loh_flags = 0x5,
  loh_ref = {
    counter = 0x1
  },

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;So loh_ref has already reached 1 but the thread is still waiting for it; it has obviously lost the wakeup. Also, the stuck thread is the only one in the waitqueue.&lt;/p&gt;

&lt;p&gt;This issue was only hit on the es52 branch, which uses the old code. We don&apos;t encounter this issue on the master branch since your patch invokes wakeup in lu_object_put() unconditionally.&lt;/p&gt;

&lt;p&gt;What issue you point out still exists in 2.13 code.  Many thanks for the insight. I think we should backport your patches to the branch to fix the issue.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
YangSheng&lt;/p&gt;</comment>
                            <comment id="355222" author="gerrit" created="Tue, 6 Dec 2022 02:44:07 +0000"  >&lt;p&gt;&quot;Oleg Drokin &amp;lt;green@whamcloud.com&amp;gt;&quot; merged in patch &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/49130/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/49130/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16308&quot; title=&quot;cl_object_put_last() is stuck in wait_event(atomic_read(&amp;amp;header-&amp;gt;loh_ref) == 1)&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16308&quot;&gt;&lt;del&gt;LU-16308&lt;/del&gt;&lt;/a&gt; llite: wake_up after cl_object_kill&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 77107d8e78ffd952af7882a160c84012aea0e22b&lt;/p&gt;</comment>
                            <comment id="356331" author="pjones" created="Tue, 13 Dec 2022 23:35:44 +0000"  >&lt;p&gt;Landed for 2.16&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                                        </outwardlinks>
                                                                <inwardlinks description="is related to">
                                                        </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i035iv:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>