<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:10:31 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-14525] parallel-scale-nfsv4 hangs before any tests can run</title>
                <link>https://jira.whamcloud.com/browse/LU-14525</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;parallel-scale-nfsv4 hangs before any tests can run. We&#8217;ve only seen this issue a handful of times such as for Lustre 2.12.4.61 (SLES12.4 clients); &lt;a href=&quot;https://testing.whamcloud.com/test_sets/12a1d345-1f01-45de-9d30-21cbd88ea61f&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/12a1d345-1f01-45de-9d30-21cbd88ea61f&lt;/a&gt;. Testing hangs when we try to unmount the clients. Looking at the suite_log, we see&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Stopping clients: onyx-46vm1,onyx-46vm2 /mnt/lustre (opts:)
CMD: onyx-46vm1,onyx-46vm2 running=\$(grep -c /mnt/lustre&apos; &apos; /proc/mounts);
if [ \$running -ne 0 ] ; then
echo Stopping client \$(hostname) /mnt/lustre opts:;
lsof /mnt/lustre || need_kill=no;
if [ x != x -a x\$need_kill != xno ]; then
    pids=\$(lsof -t /mnt/lustre | sort -u);
    if [ -n \&quot;\$pids\&quot; ]; then
             kill -9 \$pids;
    fi
fi;
while umount  /mnt/lustre 2&amp;gt;&amp;amp;1 | grep -q busy; do
    echo /mnt/lustre is still busy, wait one second &amp;amp;&amp;amp; sleep 1;
done;
fi
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Looking at the OSS (vm3) console log, we see client evictions&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[95316.867814] Lustre: DEBUG MARKER: /usr/sbin/lctl mark -----============= acceptance-small: parallel-scale-nfsv4 ============----- Wed May 20 19:03:21 PDT 2020
[95317.080668] Lustre: DEBUG MARKER: -----============= acceptance-small: parallel-scale-nfsv4 ============----- Wed May 20 19:03:21 PDT 2020
[95317.585046] LustreError: 8592:0:(ldlm_lockd.c:681:ldlm_handle_ast_error()) ### client (nid 10.2.8.200@tcp) returned error from blocking AST (req@ffff8dab96ebb600 x1667201457741568 status -107 rc -107), evict it ns: filter-lustre-OST0005_UUID lock: ffff8dabe0c89680/0xb23e2fc63d741a2a lrc: 4/0,0 mode: PW/PW res: [0x80a5b:0x0:0x0].0x0 rrc: 3 type: EXT [0-&amp;gt;18446744073709551615] (req 0-&amp;gt;18446744073709551615) flags: 0x60000400030020 nid: 10.2.8.200@tcp remote: 0xcc7d8b5f5da656a4 expref: 250 pid: 8745 timeout: 95422 lvb_type: 0
[95317.585053] LustreError: 138-a: lustre-OST0005: A client on nid 10.2.8.200@tcp was evicted due to a lock blocking callback time out: rc -107
[95317.585084] LustreError: 24558:0:(ldlm_lockd.c:256:expired_lock_main()) ### lock callback timer expired after 0s: evicting client at 10.2.8.200@tcp  ns: filter-lustre-OST0005_UUID lock: ffff8dabbc77d680/0xb23e2fc63d741aaf lrc: 3/0,0 mode: PW/PW res: [0x80a5c:0x0:0x0].0x0 rrc: 3 type: EXT [0-&amp;gt;18446744073709551615] (req 1048576-&amp;gt;1179647) flags: 0x60000400030020 nid: 10.2.8.200@tcp remote: 0xcc7d8b5f5da6615c expref: 251 pid: 8623 timeout: 0 lvb_type: 0
[95317.595471] LustreError: 8592:0:(ldlm_lockd.c:681:ldlm_handle_ast_error()) Skipped 6 previous similar messages
[95318.000138] Lustre: DEBUG MARKER: /usr/sbin/lctl mark onyx-46vm1: executing check_config_client \/mnt\/lustre
[95318.006487] Lustre: DEBUG MARKER: /usr/sbin/lctl mark onyx-46vm2: executing check_config_client \/mnt\/lustre
[95318.224807] Lustre: DEBUG MARKER: onyx-46vm1: executing check_config_client /mnt/lustre
[95318.229146] Lustre: DEBUG MARKER: onyx-46vm2: executing check_config_client /mnt/lustre
[95318.452944] Lustre: DEBUG MARKER: running=$(grep -c /mnt/lustre-ost1&apos; &apos; /proc/mounts);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;On the client (vm2) console log, we see some nfs server issues&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[95486.336436] echo Stopping client $(hostname) /mnt/lustre opts:;
[95486.336436] lsof /mnt/lustre || need_kill=no;
[95486.336436] if [ x != x -a x$need_kill != xno ]; then
[95486.336436]     pids=$(lsof -t /mnt/lustre | sort -u);
[95486.336436]     if 
[95626.965822] nfs: server onyx-46vm4 not responding, still trying
[95666.899579] nfs: server onyx-46vm4 not responding, still trying
[98788.703830] sysrq: SysRq : Changing Loglevel
[98788.704447] sysrq: Loglevel set to 8
[98788.705933] sysrq: SysRq : Show State
[98788.706414]   task                        PC stack   pid father
[98788.707056] systemd         S    0     1      0 0x00000000
[98788.707654] Call Trace:
[98788.708001]  ? __schedule+0x292/0x880
[98788.708437]  schedule+0x32/0x80
[98788.708805]  schedule_hrtimeout_range_clock+0x106/0x110
[98788.709400]  ? ep_scan_ready_list.isra.14+0x1af/0x1d0
[98788.709952]  ep_poll+0x2b6/0x360
[98788.710378]  ? wake_up_q+0x70/0x70
[98788.710958]  SyS_epoll_wait+0xb2/0xe0
[98788.710968]  do_syscall_64+0x74/0x160
[98788.710979]  entry_SYSCALL_64_after_hwframe+0x3d/0xa2
[98788.710987] RIP: 0033:0x7f8cb7669523
[98788.712823] RSP: 002b:00007ffc5ae0d4a8 EFLAGS: 00000246 ORIG_RAX: 00000000000000e8
[98788.712830] RAX: ffffffffffffffda RBX: 0000561cf19dfde0 RCX: 00007f8cb7669523
[98788.712830] RDX: 0000000000000027 RSI: 00007ffc5ae0d4b0 RDI: 0000000000000004
[98788.712831] RBP: 00007ffc5ae0d790 R08: 00000000000ddb68 R09: 0000561cf09489c0
[98788.712833] R10: 00000000ffffffff R11: 0000000000000246 R12: 0000000000000001
[98788.716708] R13: 00007ffc5ae0d4b0 R14: ffffffffffffffff R15: 0000000000000000
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Another hang:&lt;br/&gt;
Lustre 2.12.4.47 - CentOS7.7 - &lt;a href=&quot;https://testing.whamcloud.com/test_sets/790dab9e-22d4-4c0b-8f58-41cff56c5810&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/790dab9e-22d4-4c0b-8f58-41cff56c5810&lt;/a&gt;&lt;/p&gt;</description>
                <environment></environment>
        <key id="63359">LU-14525</key>
            <summary>parallel-scale-nfsv4 hangs before any tests can run</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="wc-triage">WC Triage</assignee>
                                    <reporter username="jamesanunez">James Nunez</reporter>
                        <labels>
                    </labels>
                <created>Tue, 16 Mar 2021 15:11:01 +0000</created>
                <updated>Tue, 6 Jul 2021 21:41:02 +0000</updated>
                                            <version>Lustre 2.12.5</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>2</watches>
                                                                            <comments>
                            <comment id="295256" author="green" created="Wed, 17 Mar 2021 17:53:10 +0000"  >&lt;p&gt;the eviction on the oss is normal - if you check the log up you see the local client is force-umounted so when server sends an AST in - it gets rejected and client is evicted even though client does not exist.&lt;/p&gt;

&lt;p&gt;now what I also see in the logs is nfs server is stopped and never restarted. So in these circumstances it probably is normal that clients cannot contact nfs server. I see they try to do umount /mnt/lustre which is nfs mountpoint? If there&apos;s a race where nfs server is down before the client - that probably is the expected result?&lt;/p&gt;

&lt;p&gt;perhaps umount -f on nfs client would make it more resilient?&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i01pjz:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>