<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:36:01 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-10543] racer: /mnt/lustre2 is still busy, wait one second</title>
                <link>https://jira.whamcloud.com/browse/LU-10543</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This issue was created by maloo for Bob Glossman &amp;lt;bob.glossman@intel.com&amp;gt;&lt;/p&gt;

&lt;p&gt;This issue relates to the following test suite run: &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/9df61cce-fe48-11e7-bd00-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/9df61cce-fe48-11e7-bd00-52540065bddc&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;This failure occurs after racer is complete, while test script is exiting.&lt;br/&gt;
Can&apos;t see any evidence of panics or oops in logs.&lt;br/&gt;
ConMan just disconnects from all the nodes and then all get rebooted.&lt;/p&gt;</description>
                <environment></environment>
        <key id="50334">LU-10543</key>
            <summary>racer: /mnt/lustre2 is still busy, wait one second</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="wc-triage">WC Triage</assignee>
                                    <reporter username="maloo">Maloo</reporter>
                        <labels>
                    </labels>
                <created>Sun, 21 Jan 2018 15:54:44 +0000</created>
                <updated>Mon, 30 Oct 2023 20:13:45 +0000</updated>
                                            <version>Lustre 2.11.0</version>
                    <version>Lustre 2.12.0</version>
                    <version>Lustre 2.10.4</version>
                    <version>Lustre 2.13.0</version>
                    <version>Lustre 2.12.1</version>
                    <version>Lustre 2.14.0</version>
                    <version>Lustre 2.12.4</version>
                    <version>Lustre 2.12.6</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>8</watches>
                                                                            <comments>
                            <comment id="219789" author="jamesanunez" created="Fri, 2 Feb 2018 00:00:10 +0000"  >&lt;p&gt;Recent racer hangs:&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/test_sets/7f83ac30-9b18-11e8-b0aa-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/7f83ac30-9b18-11e8-b0aa-52540065bddc&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/test_sets/2ff5cab8-9ae6-11e8-8ee3-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/2ff5cab8-9ae6-11e8-8ee3-52540065bddc&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/test_sets/83e235b6-9a74-11e8-b0aa-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/83e235b6-9a74-11e8-b0aa-52540065bddc&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;racer test 1 looks like it completes successfully, but we aren&apos;t able to umount the second Lustre mount&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;while umount  /mnt/lustre2 2&amp;gt;&amp;amp;1 | grep -q busy; do
    echo /mnt/lustre2 is still busy, wait one second &amp;amp;&amp;amp; sleep 1;
done;
fi
Stopping client trevis-33vm6.trevis.whamcloud.com /mnt/lustre2 opts:
Stopping client trevis-33vm7.trevis.whamcloud.com /mnt/lustre2 opts:
/mnt/lustre2 is still busy, wait one second
/mnt/lustre2 is still busy, wait one second
/mnt/lustre2 is still busy, wait one second
/mnt/lustre2 is still busy, wait one second
/mnt/lustre2 is still busy, wait one second
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;It looks like there are no client process still running&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;there should be NO racer processes:
USER       PID %CPU %MEM    VSZ   RSS TTY      STAT START   TIME COMMAND
Filesystem             1K-blocks   Used Available Use% Mounted on
10.9.4.180@tcp:/lustre  13532932 202516  12463840   2% /mnt/lustre
We survived /usr/lib64/lustre/tests/racer/racer.sh for 900 seconds.
there should be NO racer processes:
USER       PID %CPU %MEM    VSZ   RSS TTY      STAT START   TIME COMMAND
Filesystem             1K-blocks   Used Available Use% Mounted on
10.9.4.180@tcp:/lustre  13532932 198284  12471324   2% /mnt/lustre
We survived /usr/lib64/lustre/tests/racer/racer.sh for 900 seconds.
pid=17959 rc=0
pid=17960 rc=0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="231727" author="adilger" created="Thu, 9 Aug 2018 17:39:27 +0000"  >&lt;p&gt;I checked the stack traces, and &lt;tt&gt;unmount&lt;/tt&gt; does not appear. That is not surprising, given that the unmount command is returning an error instead of hanging during cleanup. This implies some kind of leak in the vfsmnt or inode refcount. That is likely due to some kind of change in the dcache or inode handling in llite. &lt;/p&gt;</comment>
                            <comment id="231864" author="gerrit" created="Mon, 13 Aug 2018 15:03:42 +0000"  >&lt;p&gt;&lt;del&gt;James Nunez (jnunez@whamcloud.com) uploaded a new patch:&lt;/del&gt; &lt;a href=&quot;https://review.whamcloud.com/32986&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/32986&lt;/a&gt;&lt;br/&gt;
&lt;del&gt;Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10543&quot; title=&quot;racer: /mnt/lustre2 is still busy, wait one second&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10543&quot;&gt;LU-10543&lt;/a&gt; tests: determine racer hang&lt;/del&gt;&lt;br/&gt;
&lt;del&gt;Project: fs/lustre-release&lt;/del&gt;&lt;br/&gt;
&lt;del&gt;Branch: master&lt;/del&gt;&lt;br/&gt;
&lt;del&gt;Current Patch Set: 1&lt;/del&gt;&lt;br/&gt;
&lt;del&gt;Commit: c6b7177dd126812f0f4a0535b65f6b42b282f797&lt;/del&gt;&lt;/p&gt;</comment>
                            <comment id="231880" author="jamesanunez" created="Mon, 13 Aug 2018 19:43:07 +0000"  >&lt;p&gt;In some cases, we see that racer completes and it is marked as PASS, but the next test suite is not able to start and timeout due to mount2 busy. &lt;/p&gt;

&lt;p&gt;See &lt;a href=&quot;https://testing.whamcloud.com/test_sessions/91c9307e-4e4c-421f-a197-5e9827f00438&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sessions/91c9307e-4e4c-421f-a197-5e9827f00438&lt;/a&gt;; racer is marked as PASS, but replay-single hangs with, looking at the suite_log&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;-----============= acceptance-small: replay-single ============----- Fri Aug 10 23:26:13 UTC 2018
Running: bash /usr/lib64/lustre/tests/replay-single.sh
== replay-single test complete, duration -o sec ====================================================== 23:26:13 (1533943573)
excepting tests: 
Stopping clients: trevis-6vm10,trevis-6vm9.trevis.whamcloud.com /mnt/lustre2 (opts:)
CMD: trevis-6vm10,trevis-6vm9.trevis.whamcloud.com running=\$(grep -c /mnt/lustre2&apos; &apos; /proc/mounts);
if [ \$running -ne 0 ] ; then
echo Stopping client \$(hostname) /mnt/lustre2 opts:;
lsof /mnt/lustre2 || need_kill=no;
if [ x != x -a x\$need_kill != xno ]; then
    pids=\$(lsof -t /mnt/lustre2 | sort -u);
    if [ -n \&quot;\$pids\&quot; ]; then
             kill -9 \$pids;
    fi
fi;
while umount  /mnt/lustre2 2&amp;gt;&amp;amp;1 | grep -q busy; do
    echo /mnt/lustre2 is still busy, wait one second &amp;amp;&amp;amp; sleep 1;
done;
fi
Stopping client trevis-6vm9.trevis.whamcloud.com /mnt/lustre2 opts:
Stopping client trevis-6vm10.trevis.whamcloud.com /mnt/lustre2 opts:
/mnt/lustre2 is still busy, wait one second
/mnt/lustre2 is still busy, wait one second
/mnt/lustre2 is still busy, wait one second
/mnt/lustre2 is still busy, wait one second
...
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="290936" author="adilger" created="Tue, 2 Feb 2021 08:38:41 +0000"  >&lt;p&gt;Looking at the stack traces from several recent racer timeouts (run with &lt;tt&gt;env=DURATION=3600&lt;/tt&gt;) shows that there is a lingering &quot;&lt;tt&gt;cp&lt;/tt&gt;&quot; process running on the stuck client:&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/test_sets/ad623a1d-dd28-4b4f-a323-f3893798c7d7&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/ad623a1d-dd28-4b4f-a323-f3893798c7d7&lt;/a&gt;&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[ 4688.804205] cp              S ffff9ca9258aa0e0     0  6454      1 0x00000080
[ 4688.805787] Call Trace:
[ 4688.806328]  [&amp;lt;ffffffff83585da9&amp;gt;] schedule+0x29/0x70
[ 4688.807351]  [&amp;lt;ffffffffc0e0f867&amp;gt;] ? vvp_io_init+0x347/0x480 [lustre]
[ 4688.808579]  [&amp;lt;ffffffffc08ec15b&amp;gt;] ? cl_io_init0.isra.13+0x8b/0x160 [obdclass]
[ 4688.809924]  [&amp;lt;ffffffffc0e04fd1&amp;gt;] ? cl_glimpse_lock+0x311/0x370 [lustre]
[ 4688.811325]  [&amp;lt;ffffffffc0e0537d&amp;gt;] ? cl_glimpse_size0+0x22d/0x260 [lustre]
[ 4688.812576]  [&amp;lt;ffffffffc0dc607e&amp;gt;] ? ll_getattr_dentry+0x53e/0x8e0 [lustre]
[ 4688.813933]  [&amp;lt;ffffffff83060392&amp;gt;] ? user_path_at_empty+0x72/0xc0
[ 4688.815083]  [&amp;lt;ffffffffc0dc643b&amp;gt;] ? ll_getattr+0x1b/0x20 [lustre]
[ 4688.816266]  [&amp;lt;ffffffff83052b49&amp;gt;] ? vfs_getattr+0x49/0x80
[ 4688.817200]  [&amp;lt;ffffffff83052c75&amp;gt;] ? vfs_fstatat+0x75/0xc0
[ 4688.818213]  [&amp;lt;ffffffff8305301e&amp;gt;] ? SYSC_newstat+0x2e/0x60
[ 4688.819180]  [&amp;lt;ffffffff83592e15&amp;gt;] ? system_call_after_swapgs+0xa2/0x13a
[ 4688.820398]  [&amp;lt;ffffffff83592e09&amp;gt;] ? system_call_after_swapgs+0x96/0x13a
[ 4688.821538]  [&amp;lt;ffffffff83592e15&amp;gt;] ? system_call_after_swapgs+0xa2/0x13a
[ 4688.822755]  [&amp;lt;ffffffff83592e09&amp;gt;] ? system_call_after_swapgs+0x96/0x13a
[ 4688.823886]  [&amp;lt;ffffffff83592e15&amp;gt;] ? system_call_after_swapgs+0xa2/0x13a
[ 4688.825089]  [&amp;lt;ffffffff83592e09&amp;gt;] ? system_call_after_swapgs+0x96/0x13a
[ 4688.826220]  [&amp;lt;ffffffff83592e15&amp;gt;] ? system_call_after_swapgs+0xa2/0x13a
[ 4688.827464]  [&amp;lt;ffffffff83592e09&amp;gt;] ? system_call_after_swapgs+0x96/0x13a
[ 4688.828684]  [&amp;lt;ffffffff83592e15&amp;gt;] ? system_call_after_swapgs+0xa2/0x13a
[ 4688.830021]  [&amp;lt;ffffffff83592e09&amp;gt;] ? system_call_after_swapgs+0x96/0x13a
[ 4688.831226]  [&amp;lt;ffffffff83592e15&amp;gt;] ? system_call_after_swapgs+0xa2/0x13a
[ 4688.832570]  [&amp;lt;ffffffff830534de&amp;gt;] ? SyS_newstat+0xe/0x10
[ 4688.833591]  [&amp;lt;ffffffff83592ed2&amp;gt;] ? system_call_fastpath+0x25/0x2a
[ 4688.834768]  [&amp;lt;ffffffff83592e15&amp;gt;] ? system_call_after_swapgs+0xa2/0x13a
:
[ 4748.927655] Lustre: 6454:0:(client.c:2282:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1612251797/real 1612251797]  req@ffff9ca9157cd680 x1690564340618304/t0(0) o49-&amp;gt;lustre-MDT0000-mdc-ffff9ca93876b800@10.9.6.224@tcp:12/10 lens 472/1040 e 0 to 1 dl 1612251804 ref 2 fl Rpc:XQr/0/ffffffff rc 0/-1 job:&apos;cp.0&apos;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;A second better-formed stack is on the other client:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[ 4684.466221] cp              S ffffa0ea19d48000     0 16037      1 0x00000080
[ 4684.467522] Call Trace:
[ 4684.467975]  [&amp;lt;ffffffff9b785da9&amp;gt;] schedule+0x29/0x70
[ 4684.468962]  [&amp;lt;ffffffffc0bc9c85&amp;gt;] ldlm_completion_ast+0x1e5/0x9d0 [ptlrpc]
[ 4684.471257]  [&amp;lt;ffffffffc0bb85ed&amp;gt;] ldlm_lock_match_with_skip+0x1ad/0x830 [ptlrpc]
[ 4684.474583]  [&amp;lt;ffffffffc0dbbd70&amp;gt;] osc_enqueue_base+0x100/0x530 [osc]
[ 4684.477063]  [&amp;lt;ffffffffc0dc64b9&amp;gt;] osc_lock_enqueue+0x359/0x830 [osc]
[ 4684.478339]  [&amp;lt;ffffffffc09755a5&amp;gt;] cl_lock_enqueue+0x65/0x120 [obdclass]
[ 4684.479542]  [&amp;lt;ffffffffc0e21a45&amp;gt;] lov_lock_enqueue+0x95/0x150 [lov]
[ 4684.480727]  [&amp;lt;ffffffffc09755a5&amp;gt;] cl_lock_enqueue+0x65/0x120 [obdclass]
[ 4684.481887]  [&amp;lt;ffffffffc0975b37&amp;gt;] cl_lock_request+0x67/0x1f0 [obdclass]
[ 4684.483127]  [&amp;lt;ffffffffc097961b&amp;gt;] cl_io_lock+0xfb/0x280 [obdclass]
[ 4684.484222]  [&amp;lt;ffffffffc097a06d&amp;gt;] cl_io_loop+0x8d/0x200 [obdclass]
[ 4684.485523]  [&amp;lt;ffffffffc0ec563f&amp;gt;] cl_setattr_ost+0x28f/0x3e0 [lustre]
[ 4684.486645]  [&amp;lt;ffffffffc0e9da5a&amp;gt;] ll_setattr_raw+0x102a/0x1140 [lustre]
[ 4684.487880]  [&amp;lt;ffffffffc0e9dbd3&amp;gt;] ll_setattr+0x63/0xc0 [lustre]
[ 4684.488931]  [&amp;lt;ffffffff9b26cefc&amp;gt;] notify_change+0x30c/0x4d0
[ 4684.489978]  [&amp;lt;ffffffff9b24af35&amp;gt;] do_truncate+0x75/0xc0
[ 4684.490898]  [&amp;lt;ffffffff9b25d487&amp;gt;] do_last+0x627/0x1340
[ 4684.493054]  [&amp;lt;ffffffff9b25e26d&amp;gt;] path_openat+0xcd/0x5a0
[ 4684.496260]  [&amp;lt;ffffffff9b2604bd&amp;gt;] do_filp_open+0x4d/0xb0
[ 4684.498210]  [&amp;lt;ffffffff9b24c094&amp;gt;] do_sys_open+0x124/0x220
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="290937" author="adilger" created="Tue, 2 Feb 2021 08:43:24 +0000"  >&lt;p&gt;A different test run also showed the same stack:&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/test_sets/eb16868e-613f-40fb-a820-18e0e627f694&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/eb16868e-613f-40fb-a820-18e0e627f694&lt;/a&gt;&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[ 4022.294274] cp              S ffff97d81fe99900     0 22278      1 0x00000080
[ 4022.295585] Call Trace:
[ 4022.296011]  [&amp;lt;ffffffff9dd85da9&amp;gt;] schedule+0x29/0x70
[ 4022.296868]  [&amp;lt;ffffffffc0a9dc85&amp;gt;] ldlm_completion_ast+0x1e5/0x9d0 [ptlrpc]
[ 4022.299128]  [&amp;lt;ffffffffc0a8c5ed&amp;gt;] ldlm_lock_match_with_skip+0x1ad/0x830 [ptlrpc]
[ 4022.302394]  [&amp;lt;ffffffffc0c8fd70&amp;gt;] osc_enqueue_base+0x100/0x530 [osc]
[ 4022.304799]  [&amp;lt;ffffffffc0c9a4b9&amp;gt;] osc_lock_enqueue+0x359/0x830 [osc]
[ 4022.305932]  [&amp;lt;ffffffffc08495a5&amp;gt;] cl_lock_enqueue+0x65/0x120 [obdclass]
[ 4022.307161]  [&amp;lt;ffffffffc0cf5a45&amp;gt;] lov_lock_enqueue+0x95/0x150 [lov]
[ 4022.308241]  [&amp;lt;ffffffffc08495a5&amp;gt;] cl_lock_enqueue+0x65/0x120 [obdclass]
[ 4022.309428]  [&amp;lt;ffffffffc0849b37&amp;gt;] cl_lock_request+0x67/0x1f0 [obdclass]
[ 4022.310562]  [&amp;lt;ffffffffc084d61b&amp;gt;] cl_io_lock+0xfb/0x280 [obdclass]
[ 4022.311687]  [&amp;lt;ffffffffc084e06d&amp;gt;] cl_io_loop+0x8d/0x200 [obdclass]
[ 4022.312868]  [&amp;lt;ffffffffc0d9963f&amp;gt;] cl_setattr_ost+0x28f/0x3e0 [lustre]
[ 4022.314033]  [&amp;lt;ffffffffc0d71a5a&amp;gt;] ll_setattr_raw+0x102a/0x1140 [lustre]
[ 4022.315162]  [&amp;lt;ffffffffc0d71bd3&amp;gt;] ll_setattr+0x63/0xc0 [lustre]
[ 4022.316231]  [&amp;lt;ffffffff9d86cefc&amp;gt;] notify_change+0x30c/0x4d0
[ 4022.317174]  [&amp;lt;ffffffff9d84af35&amp;gt;] do_truncate+0x75/0xc0
[ 4022.318122]  [&amp;lt;ffffffff9d85d487&amp;gt;] do_last+0x627/0x1340
[ 4022.320212]  [&amp;lt;ffffffff9d85e26d&amp;gt;] path_openat+0xcd/0x5a0
[ 4022.322229]  [&amp;lt;ffffffff9d8604bd&amp;gt;] do_filp_open+0x4d/0xb0
[ 4022.324109]  [&amp;lt;ffffffff9d84c094&amp;gt;] do_sys_open+0x124/0x220
[ 4022.326209]  [&amp;lt;ffffffff9d84c1ae&amp;gt;] SyS_open+0x1e/0x20
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="385963" author="xinliang" created="Thu, 14 Sep 2023 09:43:19 +0000"  >&lt;p&gt;Can easily hit this issue on the Arm cluster by running the racer test suite several times.&lt;/p&gt;

&lt;p&gt;With similar cp stack:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
[jenkins@lustre-rec5k7yz-01 ~]$ ps -efH|grep lustre2/
jenkins &#160; 873969&#160; 873837&#160; 0 09:36 pts/1&#160; &#160; 00:00:00 &#160; &#160; &#160; &#160; &#160; grep --color=auto lustre2/
root&#160; &#160; &#160; 426463 &#160; &#160; &#160; 1&#160; 0 Sep13 ?&#160; &#160; &#160; &#160; 00:00:00 &#160; cp -p /bin/sleep /mnt/lustre2/racer/4
[jenkins@lustre-rec5k7yz-01 ~]$ sudo cat /proc/426463/stack 
[&amp;lt;0&amp;gt;] __switch_to+0x7c/0xbc
[&amp;lt;0&amp;gt;] ldlm_completion_ast+0x7a4/0xdbc [ptlrpc]
[&amp;lt;0&amp;gt;] ldlm_lock_match_with_skip+0x1c8/0x854 [ptlrpc]
[&amp;lt;0&amp;gt;] osc_enqueue_base+0x100/0x550 [osc]
[&amp;lt;0&amp;gt;] osc_lock_enqueue+0x434/0xc20 [osc]
[&amp;lt;0&amp;gt;] cl_lock_enqueue+0x98/0x1e0 [obdclass]
[&amp;lt;0&amp;gt;] lov_lock_enqueue+0x90/0x22c [lov]
[&amp;lt;0&amp;gt;] cl_lock_enqueue+0x98/0x1e0 [obdclass]
[&amp;lt;0&amp;gt;] cl_lock_request+0xa0/0x270 [obdclass]
[&amp;lt;0&amp;gt;] cl_lockset_lock+0x12c/0x240 [obdclass]
[&amp;lt;0&amp;gt;] cl_io_lock+0xd0/0x204 [obdclass]
[&amp;lt;0&amp;gt;] cl_io_loop+0xb8/0x280 [obdclass]
[&amp;lt;0&amp;gt;] cl_setattr_ost+0x290/0x364 [lustre]
[&amp;lt;0&amp;gt;] ll_setattr_raw+0x908/0x1160 [lustre]
[&amp;lt;0&amp;gt;] ll_setattr+0x8c/0x130 [lustre]
[&amp;lt;0&amp;gt;] notify_change+0x25c/0x43c
[&amp;lt;0&amp;gt;] do_truncate+0x84/0xe0
[&amp;lt;0&amp;gt;] handle_truncate+0xd4/0x124
[&amp;lt;0&amp;gt;] do_open+0xc4/0x30c
[&amp;lt;0&amp;gt;] path_openat+0x10c/0x1d0
[&amp;lt;0&amp;gt;] do_filp_open+0x84/0x134
[&amp;lt;0&amp;gt;] do_sys_openat2+0x208/0x2f0
[&amp;lt;0&amp;gt;] __arm64_sys_openat+0x6c/0xb4
[&amp;lt;0&amp;gt;] invoke_syscall+0x50/0x11c
[&amp;lt;0&amp;gt;] el0_svc_common.constprop.0+0x158/0x164
[&amp;lt;0&amp;gt;] do_el0_svc+0x2c/0x9c
[&amp;lt;0&amp;gt;] el0_svc+0x20/0x30
[&amp;lt;0&amp;gt;] el0_sync_handler+0xb0/0xb4
[&amp;lt;0&amp;gt;] el0_sync+0x160/0x180
[jenkins@lustre-rec5k7yz-01 ~]$  &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;And the related test script should be:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
lustre-release$ cat lustre/tests/racer/file_exec.sh 
#!/bin/bash
trap &lt;span class=&quot;code-quote&quot;&gt;&apos;kill $(jobs -p)&apos;&lt;/span&gt; EXIT


org_LANG=$LANG
export LANG=C


DIR=$1
MAX=$2
PROG=/bin/sleep


&lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; /bin/&lt;span class=&quot;code-keyword&quot;&gt;true&lt;/span&gt; ; &lt;span class=&quot;code-keyword&quot;&gt;do&lt;/span&gt;
&#160; &#160; &#160; &#160; file=$((RANDOM % MAX))
&#160; &#160; &#160; &#160; cp -p $PROG $DIR/$file &amp;gt; /dev/&lt;span class=&quot;code-keyword&quot;&gt;null&lt;/span&gt; 2&amp;gt;&amp;amp;1
&#160; &#160; &#160; &#160; $DIR/$file 0.$((RANDOM % 5 + 1)) 2&amp;gt; /dev/&lt;span class=&quot;code-keyword&quot;&gt;null&lt;/span&gt;
&#160; &#160; &#160; &#160; sleep $((RANDOM % 3))
done 2&amp;gt;&amp;amp;1 | egrep -v &lt;span class=&quot;code-quote&quot;&gt;&quot;Segmentation fault|Bus error&quot;&lt;/span&gt;


export LANG=$org_LANG &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="385966" author="xinliang" created="Thu, 14 Sep 2023 10:28:24 +0000"  >&lt;p&gt;It seems that the problem is here:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 cat lustre/tests/racer/racer.sh
...
 15 RACER_PROGS=${RACER_PROGS:-&quot;file_create dir_create file_rm file_rename \
 16 file_link file_symlink file_list file_concat file_exec file_chown \
 17 file_chmod file_mknod file_truncate file_delxattr file_getxattr \
 18 file_setxattr&quot;}
...
 37 racer_cleanup()
 38 {
 39 &#160; &#160; &#160; &#160; echo &lt;span class=&quot;code-quote&quot;&gt;&quot;racer cleanup&quot;&lt;/span&gt;
 40 &#160; &#160; &#160; &#160; &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; P in $RACER_PROGS; &lt;span class=&quot;code-keyword&quot;&gt;do&lt;/span&gt;
 41 &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; killall -q $P.sh
 42 &#160; &#160; &#160; &#160; done 
...&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;It should also kill all the child processes of the sh scripts. Because if you force to kill all the file test sh scripts, you can&apos;t expect their child processes in a normal state, these child processes should be killed/cleaned up as well, because they could be in an abnormal state.&lt;/p&gt;</comment>
                            <comment id="386095" author="adilger" created="Fri, 15 Sep 2023 04:06:03 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=xinliang&quot; class=&quot;user-hover&quot; rel=&quot;xinliang&quot;&gt;xinliang&lt;/a&gt; please try &lt;a href=&quot;https://review.whamcloud.com/46384&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/46384&lt;/a&gt; &quot;&lt;tt&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-15248&quot; title=&quot;Improve racer cleanup&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-15248&quot;&gt;LU-15248&lt;/a&gt; tests: kill all fs users at the end of racer&lt;/tt&gt;&quot; to see if this fixes the problem for you. &lt;/p&gt;</comment>
                            <comment id="386109" author="xinliang" created="Fri, 15 Sep 2023 08:07:27 +0000"  >&lt;p&gt;Hi &lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=adilger&quot; class=&quot;user-hover&quot; rel=&quot;adilger&quot;&gt;adilger&lt;/a&gt;, with patch 46384 it still gets stuck. These cp processes seem can&apos;t be killed by fuser cmd.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
stuck logs
...
/mnt/lustre2 is still busy, wait one second
/mnt/lustre2 is still busy, wait one second
/mnt/lustre2 is still busy, wait one second
/mnt/lustre2 is still busy, wait one second
/mnt/lustre2 is still busy, wait one second
/mnt/lustre2 is still busy, wait one second
/mnt/lustre2 is still busy, wait one second
/mnt/lustre2 is still busy, wait one second
...

[root@lustre-pdbirbyf-02 ~]# ps -efH|grep &lt;span class=&quot;code-quote&quot;&gt;&quot;cp -p&quot;&lt;/span&gt;
root &#160; &#160; 3362672 3360694&#160; 0 08:01 pts/0&#160; &#160; 00:00:00 &#160; &#160; &#160; &#160; &#160; grep --color=auto cp -p
root &#160; &#160; 2975599 &#160; &#160; &#160; 1&#160; 0 05:54 ?&#160; &#160; &#160; &#160; 00:00:00 &#160; cp -p /bin/sleep /mnt/lustre/racer/1
root &#160; &#160; 2982999 &#160; &#160; &#160; 1&#160; 0 05:54 ?&#160; &#160; &#160; &#160; 00:00:00 &#160; cp -p /bin/sleep /mnt/lustre/racer/1
root &#160; &#160; 1553902 &#160; &#160; &#160; 1&#160; 0 06:37 ?&#160; &#160; &#160; &#160; 00:00:00 &#160; cp -p /bin/sleep /mnt/lustre2/racer3/11
[root@lustre-pdbirbyf-02 ~]# fuser -k -m&#160; /mnt/lustre2/racer3/
[root@lustre-pdbirbyf-02 ~]# ps -efH|grep &lt;span class=&quot;code-quote&quot;&gt;&quot;cp -p&quot;&lt;/span&gt;
root &#160; &#160; 3362795 3360694&#160; 0 08:01 pts/0&#160; &#160; 00:00:00 &#160; &#160; &#160; &#160; &#160; grep --color=auto cp -p
root &#160; &#160; 2975599 &#160; &#160; &#160; 1&#160; 0 05:54 ?&#160; &#160; &#160; &#160; 00:00:00 &#160; cp -p /bin/sleep /mnt/lustre/racer/1
root &#160; &#160; 2982999 &#160; &#160; &#160; 1&#160; 0 05:54 ?&#160; &#160; &#160; &#160; 00:00:00 &#160; cp -p /bin/sleep /mnt/lustre/racer/1
root &#160; &#160; 1553902 &#160; &#160; &#160; 1&#160; 0 06:37 ?&#160; &#160; &#160; &#160; 00:00:00 &#160; cp -p /bin/sleep /mnt/lustre2/racer3/11 &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;And some cp processes in D(disk sleep) state are unkillable:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
[root@lustre-pdbirbyf-02 ~]# kill -9 1553902&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160;
[root@lustre-pdbirbyf-02 ~]# ps --ppid 1 -fH|grep cp&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160;
root &#160; &#160; 1553902 &#160; &#160; &#160; 1&#160; 0 06:37 ?&#160; &#160; &#160; &#160; 00:00:00 cp -p /bin/sleep /mnt/lustre2/racer3/11 &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160;
root &#160; &#160; 2982999 &#160; &#160; &#160; 1&#160; 0 05:54 ?&#160; &#160; &#160; &#160; 00:00:00 cp -p /bin/sleep /mnt/lustre/racer/1&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160;
root &#160; &#160; 2975599 &#160; &#160; &#160; 1&#160; 0 05:54 ?&#160; &#160; &#160; &#160; 00:00:00 cp -p /bin/sleep /mnt/lustre/racer/1  
[root@lustre-pdbirbyf-02 ~]# killall -v cp&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160;
Killed cp(1553902) with signal 15 &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160;
Killed cp(2975599) with signal 15 &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160;
Killed cp(2982999) with signal 15 &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160;
[root@lustre-pdbirbyf-02 ~]# ps --ppid 1 -fH|grep cp&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160;
root &#160; &#160; 1553902 &#160; &#160; &#160; 1&#160; 0 06:37 ?&#160; &#160; &#160; &#160; 00:00:00 cp -p /bin/sleep /mnt/lustre2/racer3/11&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="386121" author="adilger" created="Fri, 15 Sep 2023 10:30:38 +0000"  >&lt;p&gt;There is a second patch &lt;a href=&quot;https://review.whamcloud.com/45605&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/45605&lt;/a&gt; &quot;&lt;tt&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-15248&quot; title=&quot;Improve racer cleanup&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-15248&quot;&gt;LU-15248&lt;/a&gt; tests: improve racer cleanup&lt;/tt&gt;&quot; for that same ticket.  However, it isn&apos;t clear to me if that patch is just hiding the test failure or if it could potentially be close to a real fix for the problem? &lt;/p&gt;

&lt;p&gt;It might be enough to add the hunks from that patch in &lt;tt&gt;obd_get_request_slot()&lt;/tt&gt;:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
	&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (rc == -EINTR)
		rc = -ERESTARTSYS;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;and then add a check in &lt;tt&gt;ptlrpc_interrupted_set()&lt;/tt&gt; to see if the request is a lock enqueue and don&apos;t interrupt it in this case.  Something like the following (which I haven&apos;t tested at all and is just a guess at what might work):&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
	list_for_each_entry(req, &amp;amp;set-&amp;gt;set_requests, rq_set_chain) {
		&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (req-&amp;gt;rq_intr)
			&lt;span class=&quot;code-keyword&quot;&gt;continue&lt;/span&gt;;

		&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (req-&amp;gt;rq_phase != RQ_PHASE_RPC &amp;amp;&amp;amp;
		    req-&amp;gt;rq_phase != RQ_PHASE_UNREG_RPC &amp;amp;&amp;amp;
		    !req-&amp;gt;rq_allow_intr)
			&lt;span class=&quot;code-keyword&quot;&gt;continue&lt;/span&gt;;

               &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (req-&amp;gt;rq_reqmsg &amp;amp;&amp;amp; lustre_msg_get_opc(req-&amp;gt;rq_reqmsg) == LDLM_ENQUEUE &amp;amp;&amp;amp;
                   !__fatal_signal_pending(current))
                        &lt;span class=&quot;code-keyword&quot;&gt;continue&lt;/span&gt;;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This would have the drawback that &lt;tt&gt;LDLM_ENQUEUE&lt;/tt&gt; RPCs could not normally be interrupted, but at least the threads wouldn&apos;t be stuck permanently...&lt;/p&gt;

&lt;p&gt;The alternative might be a check in &lt;tt&gt;ldlm_completion_ast()&lt;/tt&gt; if the task can be interrupted?  It &lt;em&gt;looks&lt;/em&gt; like this function is already using &lt;tt&gt;l_wait_event_abortable()&lt;/tt&gt;, but there have been a number of changes in this area and it may be that the interrupt handling is not working properly (or it is interrupted but restarts immediately; the lustre client debug logs should show one way or the other).&lt;/p&gt;</comment>
                            <comment id="387054" author="xinliang" created="Mon, 25 Sep 2023 02:04:05 +0000"  >&lt;p&gt;Thanks for the explanation, Andreas. I&apos;m trying to understand the problem. Will try patch 45605 later.&lt;/p&gt;

&lt;p&gt;Hmm, it looks like the problem is a little complicated. The racer test suite runs racing test scripts for a DURATION of time and then kills all the test scripts.&lt;/p&gt;

&lt;p&gt;It seems Lustre threads should be killed gracefully by a signal like SIGTERM, which doesn&apos;t yet, right?&#160; Lustre should handle such a signal to exit properly.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="52911">LU-11219</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="67233">LU-15248</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="31828">LU-7073</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="51781">LU-10904</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzzrg7:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>