<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:13:36 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-14883] umount blocked due to remaining statahead threads</title>
                <link>https://jira.whamcloud.com/browse/LU-14883</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;We noticed today on a cluster running 2.12.6 clients that a large number of them were blocked at umount when applying a security patch. No jobs were running anymore on these clients, but umount doesn&apos;t finish.&lt;/p&gt;

&lt;p&gt;Dumping the tasks on a client has revealed that 3 ll_sa threads are still running:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;PID: 25857  TASK: ffff9a78cd32b180  CPU: 36  COMMAND: &quot;ll_sa_24325&quot;
 #0 [ffff9a7519997c98] __schedule at ffffffffa2f8899f
 #1 [ffff9a7519997d28] schedule at ffffffffa2f88eb9
 #2 [ffff9a7519997d38] ll_statahead_thread at ffffffffc133787e [lustre]
 #3 [ffff9a7519997ec8] kthread at ffffffffa28c5da1
 #4 [ffff9a7519997f50] ret_from_fork_nospec_begin at ffffffffa2f95de4

PID: 34631  TASK: ffff9b754a283180  CPU: 10  COMMAND: &quot;ll_sa_30018&quot;
 #0 [ffff99fd4ebd7c98] __schedule at ffffffffa2f8899f
 #1 [ffff99fd4ebd7d28] schedule at ffffffffa2f88eb9
 #2 [ffff99fd4ebd7d38] ll_statahead_thread at ffffffffc133787e [lustre]
 #3 [ffff99fd4ebd7ec8] kthread at ffffffffa28c5da1
 #4 [ffff99fd4ebd7f50] ret_from_fork_nospec_begin at ffffffffa2f95de4

PID: 41809  TASK: ffff9afeeeb8b180  CPU: 40  COMMAND: &quot;ll_sa_40406&quot;
 #0 [ffff99fe617c7c98] __schedule at ffffffffa2f8899f
 #1 [ffff99fe617c7d28] schedule at ffffffffa2f88eb9
 #2 [ffff99fe617c7d38] ll_statahead_thread at ffffffffc133787e [lustre]
 #3 [ffff99fe617c7ec8] kthread at ffffffffa28c5da1
 #4 [ffff99fe617c7f50] ret_from_fork_nospec_begin at ffffffffa2f95de4
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;This is blocking umount/&lt;tt&gt;ll_kill_super()&lt;/tt&gt;:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;crash&amp;gt; bt -f 19912
PID: 19912  TASK: ffff9b7532fae300  CPU: 44  COMMAND: &quot;umount&quot;
 #0 [ffff9a002139bd08] __schedule at ffffffffa2f8899f
    ffff9a002139bd10: 0000000000000082 ffff9a002139bfd8 
    ffff9a002139bd20: ffff9a002139bfd8 ffff9a002139bfd8 
    ffff9a002139bd30: 000000000001acc0 ffff9afb94b4e300 
    ffff9a002139bd40: ffff9b797f2d3940 00000001e05d9d9f 
    ffff9a002139bd50: 0000000000000282 000000006a88e28b 
    ffff9a002139bd60: ffff9a002139bdc0 ffff9b797f313940 
    ffff9a002139bd70: ffff9a002139bdc0 00000001e05d9e1c 
    ffff9a002139bd80: ffff9b797f313940 0000000000015980 
    ffff9a002139bd90: ffff9a002139bda0 ffffffffa2f88eb9 
 #1 [ffff9a002139bd98] schedule at ffffffffa2f88eb9
    ffff9a002139bda0: ffff9a002139be48 ffffffffa2f868d8 
 #2 [ffff9a002139bda8] schedule_timeout at ffffffffa2f868d8
    ffff9a002139bdb0: ffff9a0073ffd888 0000000000000282 
    ffff9a002139bdc0: ffff9b797f314b48 ffff9b797f314b48 
    ffff9a002139bdd0: 00000001e05d9e1c ffff9b797f313940 
    ffff9a002139bde0: ffffffffa28ad7f0 ffff9b7532fae300 
    ffff9a002139bdf0: ffffffffffffffff 0000000000000000 
    ffff9a002139be00: 0000000000000000 0000000000000000 
    ffff9a002139be10: 0000000000000282 000000006a88e28b 
    ffff9a002139be20: ffff9b7532fae300 ffff9a0030bf0000 
    ffff9a002139be30: ffff9b7532fae300 ffffffffa3907570 
    ffff9a002139be40: ffff9b7532faead0 ffff9a002139be70 
    ffff9a002139be50: ffffffffc130a449 
 #3 [ffff9a002139be50] ll_kill_super at ffffffffc130a449 [lustre]
    ffff9a002139be58: 0000000000000000 ffff9b75f7eba000 
    ffff9a002139be68: ffffffffc0fa8ae0 ffff9a002139be88 
    ffff9a002139be78: ffffffffc0f599ad 
 #4 [ffff9a002139be78] lustre_kill_super at ffffffffc0f599ad [obdclass]
    ffff9a002139be80: ffff9b75f7eba000 ffff9a002139bea8 
    ffff9a002139be90: ffffffffa2a50fae 
 #5 [ffff9a002139be90] deactivate_locked_super at ffffffffa2a50fae
    ffff9a002139be98: ffff9b75f7eba000 0000000000000000 
    ffff9a002139bea8: ffff9a002139bec0 ffffffffa2a51736 
 #6 [ffff9a002139beb0] deactivate_super at ffffffffa2a51736
    ffff9a002139beb8: ffff9a0073ffd680 ffff9a002139bed8 
    ffff9a002139bec8: ffffffffa2a70dbf 
 #7 [ffff9a002139bec8] cleanup_mnt at ffffffffa2a70dbf
    ffff9a002139bed0: ffff9a0073ffd6b8 ffff9a002139bee8 
    ffff9a002139bee0: ffffffffa2a70e52 
 #8 [ffff9a002139bee0] __cleanup_mnt at ffffffffa2a70e52
    ffff9a002139bee8: ffff9a002139bf28 ffffffffa28c28db 
 #9 [ffff9a002139bef0] task_work_run at ffffffffa28c28db
    ffff9a002139bef8: ffff9b7532faeb04 0000000000000002 
    ffff9a002139bf08: ffff9a002139bf58 0000000000000000 
    ffff9a002139bf18: 0000000000000000 0000000000000000 
    ffff9a002139bf28: ffff9a002139bf48 ffffffffa282cc65 
#10 [ffff9a002139bf30] do_notify_resume at ffffffffa282cc65
    ffff9a002139bf38: 0000000000000000 0000000000000000 
    ffff9a002139bf48: 0000000000000000 ffffffffa2f962ef 
#11 [ffff9a002139bf50] int_signal at ffffffffa2f962ef
    RIP: 00002b88f55a5a97  RSP: 00007fffaf522e48  RFLAGS: 00000246
    RAX: 0000000000000000  RBX: 00005603f7e53040  RCX: ffffffffffffffff
    RDX: 0000000000000001  RSI: 0000000000000000  RDI: 00005603f7e54840
    RBP: 00005603f7e54840   R8: 0000000000000000   R9: 0000000000000005
    R10: 00007fffaf5228a0  R11: 0000000000000246  R12: 00002b88f4e3ad78
    R13: 0000000000000000  R14: 00005603f7e54960  R15: 00005603f7e53040
    ORIG_RAX: 00000000000000a6  CS: 0033  SS: 002b
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;sb = 0xffff9b75f7eba000

crash&amp;gt; struct super_block.s_fs_info ffff9b75f7eba000
  s_fs_info = 0xffff9b75f7eb9000
crash&amp;gt; struct lustre_sb_info.lsi_llsbi 0xffff9b75f7eb9000
  lsi_llsbi = 0xffff9a0030bf0000
crash&amp;gt; struct ll_sb_info.ll_umounting 0xffff9a0030bf0000
  ll_umounting = 1
crash&amp;gt; struct ll_sb_info.ll_sa_running 0xffff9a0030bf0000
  ll_sa_running = {
    counter = 3
  }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Here on this cluster clients are all 2.12.6, and server is Oak which is running 2.12.6 too.&lt;br/&gt;
 We have one client that is available for live troubleshooting (it&apos;s still blocked at umount).&lt;/p&gt;

&lt;p&gt;Is there another known bug with such problem in 2.12? Happy to provide more info.&lt;/p&gt;</description>
                <environment>CentOS 7.9 clients (3.10.0-1160.24.1.el7.x86_64)</environment>
        <key id="65362">LU-14883</key>
            <summary>umount blocked due to remaining statahead threads</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="qian_wc">Qian Yingjin</assignee>
                                    <reporter username="sthiell">Stephane Thiell</reporter>
                        <labels>
                    </labels>
                <created>Fri, 23 Jul 2021 04:27:59 +0000</created>
                <updated>Tue, 1 Nov 2022 14:52:29 +0000</updated>
                                            <version>Lustre 2.12.6</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>4</watches>
                                                                            <comments>
                            <comment id="308251" author="pjones" created="Fri, 23 Jul 2021 17:46:58 +0000"  >&lt;p&gt;Yingjin&lt;/p&gt;

&lt;p&gt;Could you please assist&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="308296" author="qian_wc" created="Sat, 24 Jul 2021 13:46:35 +0000"  >&lt;p&gt;Sure, Peter.&lt;/p&gt;

&lt;p&gt;Hi Stephane,&lt;br/&gt;
Is this bug  easy to reproduce?&lt;br/&gt;
Did your installed Lustre include the reverted patch in master: &lt;a href=&quot;https://review.whamcloud.com/#/c/44371/?&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/44371/?&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
Qian&lt;/p&gt;</comment>
                            <comment id="308455" author="sthiell" created="Mon, 26 Jul 2021 17:57:30 +0000"  >&lt;p&gt;Hi Qian,&lt;/p&gt;

&lt;p&gt;It looks like it&apos;s relatively easy to reproduce on this cluster, yes. I checked and as we are based on Lustre 2.12.6 on clients and servers, we don&apos;t have the patch that was recently reverted in master for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-14868&quot; title=&quot;sanity: all subtests pass but test suite fails with TIMEOUT&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-14868&quot;&gt;&lt;del&gt;LU-14868&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;Thanks!&lt;/p&gt;</comment>
                            <comment id="308478" author="qian_wc" created="Tue, 27 Jul 2021 01:34:31 +0000"  >&lt;p&gt;Hi Stephane,&lt;/p&gt;

&lt;p&gt;Could you please provide the reproducer program?&lt;br/&gt;
Or could you please provide the Lustre debug log?&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
lctl set_param debug=reada
lctl set_param debug=+vfstrace
lctl set_param subsystem_debug=llite
lctl clear
run the reproducer
lctl dk &amp;gt; log

Get the unfinalized stat ahead thread:
ps -el | grep ll_sa
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
Qian&lt;/p&gt;
</comment>
                            <comment id="308480" author="laisiyao" created="Tue, 27 Jul 2021 02:47:07 +0000"  >&lt;p&gt;For statahead thread name &quot;ll_sa_24325&quot;, &quot;24325&quot; is the process that triggered statahead, can you check whether these processes are still there?&lt;/p&gt;</comment>
                            <comment id="308560" author="sthiell" created="Tue, 27 Jul 2021 19:05:49 +0000"  >&lt;p&gt;Hi Lai,&lt;/p&gt;

&lt;p&gt;Unfortunately, the process is gone, same for the other ones.&lt;/p&gt;

&lt;p&gt;However for 30018, there is also a ll_agl_30018:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;USER &#160; &#160; &#160; PID %CPU %MEM&#160; &#160; VSZ &#160; RSS TTY&#160; &#160; &#160; STAT START &#160; TIME COMMAND
root 34631 0.0 0.0 0 0 ? S Jun12 0:00 [ll_sa_30018]
root 34632 0.0 0.0 0 0 ? S Jun12 0:00 [ll_agl_30018]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;But I don&apos;t think it&apos;s revealing anything interesting:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;PID: 34632 TASK: ffff9b754a280000 CPU: 31 COMMAND: &quot;ll_agl_30018&quot;
 #0 [ffff9a685cb9bd88] __schedule at ffffffffa2f8899f
 #1 [ffff9a685cb9be18] schedule at ffffffffa2f88eb9
 #2 [ffff9a685cb9be28] ll_agl_thread at ffffffffc1336d9e [lustre]
 #3 [ffff9a685cb9bec8] kthread at ffffffffa28c5da1
 #4 [ffff9a685cb9bf50] ret_from_fork_nospec_begin at ffffffffa2f95de4
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;br/&gt;
Attaching the output of foreach bt just in case as  &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/39917/39917_foreach_bt.log&quot; title=&quot;foreach_bt.log attached to LU-14883&quot;&gt;foreach_bt.log&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt; &lt;/p&gt;

&lt;p&gt;Qian, thanks, I will try to get more debug data but I don&apos;t have a perfect reproducer yet, we&apos;re just seeing that problem often when we restart nodes.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="69173">LU-15660</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="39917" name="foreach_bt.log" size="462756" author="sthiell" created="Tue, 27 Jul 2021 19:05:39 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i0201b:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>