<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:18:49 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-8582] Interop: master&lt;-&gt;b2_8  - sanity test_255a: test failed to respond and timed out</title>
                <link>https://jira.whamcloud.com/browse/LU-8582</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This issue was created by maloo for Saurabh Tandan &amp;lt;saurabh.tandan@intel.com&amp;gt;&lt;/p&gt;

&lt;p&gt;This issue relates to the following test suite run: &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/a3f680b4-7276-11e6-8afd-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/a3f680b4-7276-11e6-8afd-5254006e85c2&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;The sub-test test_255a failed with the following error:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;test failed to respond and timed out
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Client dmesg:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[ 7909.928568] Lustre: DEBUG MARKER: == sanity test 255a: check &apos;lfs ladvise -a willread&apos; ================================================= 19:40:57 (1472870457)
[ 7916.983080] Lustre: 13272:0:(client.c:2113:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1472870457/real 1472870457]  req@ffff880043d21b00 x1544415137983184/t0(0) o21-&amp;gt;lustre-OST0003-osc-ffff880000b5c800@10.2.4.226@tcp:6/4 lens 504/432 e 0 to 1 dl 1472870464 ref 1 fl Rpc:X/0/ffffffff rc 0/-1
[ 7916.993670] Lustre: lustre-OST0003-osc-ffff880000b5c800: Connection to lustre-OST0003 (at 10.2.4.226@tcp) was lost; in progress operations using this service will wait for recovery to complete
[ 7917.002143] Lustre: lustre-OST0003-osc-ffff880000b5c800: Connection restored to 10.2.4.226@tcp (at 10.2.4.226@tcp)
[ 7917.004970] Lustre: Skipped 1 previous similar message
[ 7938.019143] Lustre: lustre-OST0003-osc-ffff880000b5c800: Connection to lustre-OST0003 (at 10.2.4.226@tcp) was lost; in progress operations using this service will wait for recovery to complete
[ 7938.030107] Lustre: Skipped 2 previous similar messages
[ 7973.060145] Lustre: lustre-OST0003-osc-ffff880000b5c800: Connection to lustre-OST0003 (at 10.2.4.226@tcp) was lost; in progress operations using this service will wait for recovery to complete
[ 7973.070547] Lustre: Skipped 10 previous similar messages
[ 7987.075859] Lustre: lustre-OST0003-osc-ffff880000b5c800: Connection restored to 10.2.4.226@tcp (at 10.2.4.226@tcp)
[ 7987.082573] Lustre: Skipped 15 previous similar messages
[ 8040.574146] INFO: task lfs:24449 blocked for more than 120 seconds.
[ 8040.579853] &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
[ 8040.582026] lfs             D ffff88003cb47b10     0 24449  24265 0x00000080
[ 8040.584167]  ffff88005462fbe0 0000000000000082 ffff880036479700 ffff88005462ffd8
[ 8040.586392]  ffff88005462ffd8 ffff88005462ffd8 ffff880036479700 ffff8800378a5790
[ 8040.588540]  ffff8800378a5798 7fffffffffffffff ffff880036479700 ffff88003cb47b10
[ 8040.590683] Call Trace:
[ 8040.592422]  [&amp;lt;ffffffff8163bc59&amp;gt;] schedule+0x29/0x70
[ 8040.594318]  [&amp;lt;ffffffff81639949&amp;gt;] schedule_timeout+0x209/0x2d0
[ 8040.596362]  [&amp;lt;ffffffffa0945c5c&amp;gt;] ? ptlrpcd_add_req+0x1ec/0x2e0 [ptlrpc]
[ 8040.598446]  [&amp;lt;ffffffffa0923c30&amp;gt;] ? lustre_swab_ladvise_hdr+0x40/0x40 [ptlrpc]
[ 8040.600571]  [&amp;lt;ffffffff8163c026&amp;gt;] wait_for_completion+0x116/0x170
[ 8040.602607]  [&amp;lt;ffffffff810b8910&amp;gt;] ? wake_up_state+0x20/0x20
[ 8040.604615]  [&amp;lt;ffffffffa0b4f3b0&amp;gt;] ? lov_io_iter_fini_wrapper+0x50/0x50 [lov]
[ 8040.606715]  [&amp;lt;ffffffffa0aeee68&amp;gt;] osc_io_ladvise_end+0x38/0x50 [osc]
[ 8040.608765]  [&amp;lt;ffffffffa075a95d&amp;gt;] cl_io_end+0x5d/0x150 [obdclass]
[ 8040.610725]  [&amp;lt;ffffffffa0b4f48b&amp;gt;] lov_io_end_wrapper+0xdb/0xe0 [lov]
[ 8040.612715]  [&amp;lt;ffffffffa0b4f9a6&amp;gt;] lov_io_call.isra.9+0x86/0x140 [lov]
[ 8040.614647]  [&amp;lt;ffffffffa0b4fa96&amp;gt;] lov_io_end+0x36/0xb0 [lov]
[ 8040.616502]  [&amp;lt;ffffffffa075a95d&amp;gt;] cl_io_end+0x5d/0x150 [obdclass]
[ 8040.618371]  [&amp;lt;ffffffffa075d103&amp;gt;] cl_io_loop+0xb3/0x190 [obdclass]
[ 8040.620279]  [&amp;lt;ffffffffa0bd1c29&amp;gt;] ll_file_ioctl+0x3069/0x37f0 [lustre]
[ 8040.622168]  [&amp;lt;ffffffff811f2585&amp;gt;] do_vfs_ioctl+0x2e5/0x4c0
[ 8040.624012]  [&amp;lt;ffffffff811fd057&amp;gt;] ? __alloc_fd+0xa7/0x130
[ 8040.625813]  [&amp;lt;ffffffff811fd167&amp;gt;] ? __fd_install+0x47/0x60
[ 8040.627680]  [&amp;lt;ffffffff811f2801&amp;gt;] SyS_ioctl+0xa1/0xc0
[ 8040.629464]  [&amp;lt;ffffffff81646c49&amp;gt;] system_call_fastpath+0x16/0x1b
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;OST console:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;19:43:40:[ 7912.118286] Lustre: DEBUG MARKER: == sanity test 255a: check &apos;lfs ladvise -a willread&apos; ================================================= 19:40:57 (1472870457)
19:43:40:[ 7912.161108] LustreError: 12896:0:(tgt_handler.c:593:tgt_handler_find_check()) lustre-OST0002: no handlers for opcode 0x15
19:43:40:[ 7912.166191] LustreError: 12896:0:(tgt_handler.c:593:tgt_handler_find_check()) Skipped 6 previous similar messages
19:43:40:[ 7919.176071] Lustre: lustre-OST0003: Client 95455275-3e21-dec7-8ff1-6b8c07c6d061 (at 10.2.4.219@tcp) reconnecting
19:43:40:[ 7919.185035] LustreError: 12896:0:(tgt_handler.c:593:tgt_handler_find_check()) lustre-OST0003: no handlers for opcode 0x15
19:43:40:[ 7926.185274] Lustre: lustre-OST0003: Client 95455275-3e21-dec7-8ff1-6b8c07c6d061 (at 10.2.4.219@tcp) reconnecting
19:43:40:[ 7926.189136] LustreError: 12896:0:(tgt_handler.c:593:tgt_handler_find_check()) lustre-OST0003: no handlers for opcode 0x15
19:43:40:[ 7933.189463] Lustre: lustre-OST0003: Client 95455275-3e21-dec7-8ff1-6b8c07c6d061 (at 10.2.4.219@tcp) reconnecting
19:43:40:[ 7933.196824] LustreError: 12896:0:(tgt_handler.c:593:tgt_handler_find_check()) lustre-OST0003: no handlers for opcode 0x15
19:43:40:[ 7940.211153] LustreError: 12896:0:(tgt_handler.c:593:tgt_handler_find_check()) lustre-OST0003: no handlers for opcode 0x15
19:43:40:[ 7947.211321] Lustre: lustre-OST0003: Client 95455275-3e21-dec7-8ff1-6b8c07c6d061 (at 10.2.4.219@tcp) reconnecting
19:43:40:[ 7947.216921] Lustre: Skipped 1 previous similar message
19:43:40:[ 7954.222110] LustreError: 12896:0:(tgt_handler.c:593:tgt_handler_find_check()) lustre-OST0003: no handlers for opcode 0x15
19:43:40:[ 7954.228583] LustreError: 12896:0:(tgt_handler.c:593:tgt_handler_find_check()) Skipped 1 previous similar message
19:43:40:[ 7968.227179] Lustre: lustre-OST0003: Client 95455275-3e21-dec7-8ff1-6b8c07c6d061 (at 10.2.4.219@tcp) reconnecting
19:43:40:[ 7968.233536] Lustre: Skipped 8 previous similar messages
19:43:40:[ 7975.251445] LustreError: 23546:0:(tgt_handler.c:593:tgt_handler_find_check()) lustre-OST0003: no handlers for opcode 0x15
19:43:40:[ 7975.257113] LustreError: 23546:0:(tgt_handler.c:593:tgt_handler_find_check()) Skipped 8 previous similar messages
19:43:40:[ 8000.268676] Lustre: lustre-OST0000: Client 95455275-3e21-dec7-8ff1-6b8c07c6d061 (at 10.2.4.219@tcp) reconnecting
19:43:40:[ 8000.275601] Lustre: Skipped 5 previous similar messages
19:43:40:[ 8010.282726] LustreError: 12896:0:(tgt_handler.c:593:tgt_handler_find_check()) lustre-OST0003: no handlers for opcode 0x15
19:43:40:[ 8010.287623] LustreError: 12896:0:(tgt_handler.c:593:tgt_handler_find_check()) Skipped 10 previous similar messages
19:43:40:[ 8066.316121] Lustre: lustre-OST0003: Client 95455275-3e21-dec7-8ff1-6b8c07c6d061 (at 10.2.4.219@tcp) reconnecting
20:41:39:********** Timeout by autotest system **********
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;This issue was first seen on 08/16/2016&lt;/p&gt;</description>
                <environment>Interop: 2.8.0 EL7.2 Server/EL7.2 Client&lt;br/&gt;
Server: b2_8, build# 12&lt;br/&gt;
Client: master, build# 3431</environment>
        <key id="39642">LU-8582</key>
            <summary>Interop: master&lt;-&gt;b2_8  - sanity test_255a: test failed to respond and timed out</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="lixi_wc">Li Xi</assignee>
                                    <reporter username="maloo">Maloo</reporter>
                        <labels>
                            <label>MON</label>
                    </labels>
                <created>Tue, 6 Sep 2016 19:00:15 +0000</created>
                <updated>Fri, 14 Jul 2023 03:11:07 +0000</updated>
                            <resolved>Mon, 18 Jul 2022 13:52:18 +0000</resolved>
                                    <version>Lustre 2.9.0</version>
                                    <fixVersion>Lustre 2.16.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="184149" author="adilger" created="Thu, 9 Feb 2017 17:21:51 +0000"  >&lt;p&gt;There is a version check in this test that should avoid this interop issue:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;        [ $(lustre_version_code ost1) -lt $(version_code 2.8.54) ] &amp;amp;&amp;amp;
                skip &lt;span class=&quot;code-quote&quot;&gt;&quot;lustre &amp;lt; 2.8.54 does not support ladvise &quot;&lt;/span&gt; &amp;amp;&amp;amp; &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;If this problem is hit again, please investigate why the interop version check is not working.&lt;/p&gt;

&lt;p&gt;For now I&apos;m going to close this issue, since I believe it is fixed, and there is a different failure being hit on sanity test_255a that is being annotated as this one.&lt;/p&gt;</comment>
                            <comment id="184262" author="adilger" created="Fri, 10 Feb 2017 00:44:49 +0000"  >&lt;p&gt;The version check is not at the start of the test, so some of the &lt;tt&gt;lfs ladvise&lt;/tt&gt; code was still being run before the version was checked.&lt;/p&gt;</comment>
                            <comment id="184263" author="adilger" created="Fri, 10 Feb 2017 00:49:00 +0000"  >&lt;p&gt;Li Xi, could you please take a look at this issue.  While it is possible to skip this test during interop testing (which I will push a patch for), the real problem is that the &lt;tt&gt;-ENOTSUPP&lt;/tt&gt; error is not being returned to &lt;tt&gt;lfs ladvise&lt;/tt&gt; and instead the client is hanging forever.  The same would be true for regular users if they have a new client with &lt;tt&gt;lfs ladvise&lt;/tt&gt; support connecting to an older server, so this isn&apos;t just a testing problem.&lt;/p&gt;</comment>
                            <comment id="184264" author="gerrit" created="Fri, 10 Feb 2017 00:50:43 +0000"  >&lt;p&gt;Andreas Dilger (andreas.dilger@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/25362&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/25362&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8582&quot; title=&quot;Interop: master&amp;lt;-&amp;gt;b2_8  - sanity test_255a: test failed to respond and timed out&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8582&quot;&gt;&lt;del&gt;LU-8582&lt;/del&gt;&lt;/a&gt; tests: skip sanity test_255a for interop&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: a5fa2a6f2f2ee7138d9bdd3560fd73dd8c6c040d&lt;/p&gt;</comment>
                            <comment id="187603" author="gerrit" created="Thu, 9 Mar 2017 06:13:15 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/25362/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/25362/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8582&quot; title=&quot;Interop: master&amp;lt;-&amp;gt;b2_8  - sanity test_255a: test failed to respond and timed out&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8582&quot;&gt;&lt;del&gt;LU-8582&lt;/del&gt;&lt;/a&gt; tests: skip sanity test_255a for interop&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 35958f8c07beb7cff0763232c1aa16f0080ec38f&lt;/p&gt;</comment>
                            <comment id="187620" author="pjones" created="Thu, 9 Mar 2017 07:26:10 +0000"  >&lt;p&gt;Closing again&lt;/p&gt;</comment>
                            <comment id="215159" author="adilger" created="Fri, 1 Dec 2017 20:19:26 +0000"  >&lt;p&gt;This was not actually fixed, just the test was disabled.&lt;/p&gt;</comment>
                            <comment id="215163" author="adilger" created="Fri, 1 Dec 2017 20:22:18 +0000"  >&lt;p&gt;James wrote:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;For sanity test 255a, an ladvise test, we have&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;test_255a() {
	[ $(lustre_version_code ost1) -lt $(version_code 2.8.54) ] &amp;amp;&amp;amp;
		skip &quot;lustre &amp;lt; 2.8.54 does not support ladvise &quot; &amp;amp;&amp;amp; return
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;We need to add the same check to test 255b.&lt;/p&gt;&lt;/blockquote&gt;</comment>
                            <comment id="215164" author="adilger" created="Fri, 1 Dec 2017 20:24:28 +0000"  >&lt;p&gt;Hi Li Xi,&lt;br/&gt;
could you please look into this problem.  Running &lt;tt&gt;lfs ladvise&lt;/tt&gt; with a new (2.9+) client causes it to hang when using an old (2.8-) server.  It should just return an &lt;tt&gt;-EOPNOTSUPP&lt;/tt&gt; error to the caller.&lt;/p&gt;</comment>
                            <comment id="215166" author="adilger" created="Fri, 1 Dec 2017 20:38:35 +0000"  >&lt;p&gt;A quick look at the code shows it is stuck in &lt;tt&gt;wait_for_completion()&lt;/tt&gt; in:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;&lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; void osc_io_ladvise_end(&lt;span class=&quot;code-keyword&quot;&gt;const&lt;/span&gt; struct lu_env *env, 
                               &lt;span class=&quot;code-keyword&quot;&gt;const&lt;/span&gt; struct cl_io_slice *slice)
{               
        struct cl_io            *io = slice-&amp;gt;cis_io;
        struct osc_io           *oio = cl2osc_io(env, slice);
        struct osc_async_cbargs *cbargs = &amp;amp;oio-&amp;gt;oi_cbarg;
        &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;                      result = 0;
        struct cl_ladvise_io    *lio = &amp;amp;io-&amp;gt;u.ci_ladvise;
        
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; ((!(lio-&amp;gt;li_flags &amp;amp; LF_ASYNC)) &amp;amp;&amp;amp; cbargs-&amp;gt;opc_rpc_sent) {
                wait_for_completion(&amp;amp;cbargs-&amp;gt;opc_sync);
                result = cbargs-&amp;gt;opc_rc;
        }
        slice-&amp;gt;cis_io-&amp;gt;ci_result = result;
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;but my knowledge of CLIO isn&apos;t strong enough to know if it is waiting there indefinitely, or just repeating this wait for each failed RPC that it is sending?  In any case, the OSS is returning &lt;tt&gt;-EOPNOTSUPP&lt;/tt&gt; to the client, but the client is not handling the RPC error properly and returning it to the client.&lt;/p&gt;

&lt;p&gt;Also, it would seem to me that these handlers should be using &lt;tt&gt;wait_for_completion_interruptible()&lt;/tt&gt; so that user processes can be killed if the server is not responsive.  Ideally, there would be a &lt;tt&gt;wait_for_completion_io_interruptible()&lt;/tt&gt; so that the time could be accounted as IO wait instead of just sleeping, but that doesn&apos;t seem to exist.&lt;/p&gt;</comment>
                            <comment id="323414" author="adilger" created="Fri, 21 Jan 2022 02:07:40 +0000"  >&lt;p&gt;This is going to become an issue with patch &lt;a href=&quot;https://review.whamcloud.com/43170&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/43170&lt;/a&gt; &quot;&lt;tt&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-14380&quot; title=&quot;Make statahead better support Breadth First Search (BFS) or Depth First Search (DFS)&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-14380&quot;&gt;LU-14380&lt;/a&gt; statahead: divide hash space evenly among the ranks&lt;/tt&gt;&quot; adding a new &quot;&lt;tt&gt;LU_LADVISE_STATAHEAD&lt;/tt&gt;&quot; method.&lt;/p&gt;

&lt;p&gt;The need for &lt;tt&gt;wait_for_completion_interruptible()&lt;/tt&gt; in &lt;tt&gt;osc_io_ladvise_end()&lt;/tt&gt; still exists also.&lt;/p&gt;</comment>
                            <comment id="338210" author="lixi_wc" created="Tue, 21 Jun 2022 01:18:06 +0000"  >&lt;p&gt;The server of Lustre 2.8 does not have ladvise support, thus is not able to handle ladvise RPC, so it complains:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;19:43:40:[ 7954.222110] LustreError: 12896:0:(tgt_handler.c:593:tgt_handler_find_check()) lustre-OST0003: no handlers for opcode 0x15
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;opcode 0x15 is OST_LADVISE.&lt;/p&gt;

&lt;p&gt;It seems that no error message about the wrong opcode RPC will be sent back to the client, thus the client hangs there waiting forever. Is there no mechanism for sending an error reply back if an RPC has a wrong opcode? If so, adding a new opcode to the protocol might hit a similar problem.&lt;/p&gt;

&lt;p&gt;I don&apos;t think this will cause any problems with new ladvise types. &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-14380&quot; title=&quot;Make statahead better support Breadth First Search (BFS) or Depth First Search (DFS)&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-14380&quot;&gt;LU-14380&lt;/a&gt; should work fine when testing interop with old servers (not &amp;lt; 2.9). But it is worth confirming that.&lt;/p&gt;

&lt;p&gt;I will push a patch to use wait_for_completion_interruptible() in osc_io_ladvise_end().&lt;/p&gt;

</comment>
                            <comment id="338211" author="lixi_wc" created="Tue, 21 Jun 2022 01:36:05 +0000"  >&lt;p&gt;Hmm, seems like changing to wait_for_completion_interruptible() does not fix the problem. It just enables users to interrupt the process.&lt;/p&gt;

&lt;p&gt;Client was not able to get reply, so it stuck there and got disconnected.&lt;/p&gt;

&lt;p&gt;In the case where a new client connects to an old server, what would be the mechanism to check whether the server supports a feature? The client could add another OBD_CONNECT_ flag, e.g. OBD_CONNECT2_LADVISE, to check whether the feature is supported on the server. But is there any way for the server to send an error reply to an RPC with an unknown opcode?&lt;/p&gt;</comment>
                            <comment id="338212" author="adilger" created="Tue, 21 Jun 2022 01:41:44 +0000"  >&lt;p&gt;We had a similar issue with fallocate - the bad opcode caused clients to retry forever. It would be better to fix the server to return -EOPNOTSUPP for bad opcodes (which will be returned to userspace), but also to allow clients to handle -EPROTO and abort instead of retry. &lt;/p&gt;

&lt;p&gt;You are correct that wait_event_idle_interruptible() is only to allow users to interrupt the process (which should preferably be used in places that user applications are waiting), but that will only help the &lt;b&gt;next&lt;/b&gt; problem that is hit in this code. The above two changes should prevent old/new client/server from getting stuck at all.  &lt;/p&gt;</comment>
                            <comment id="338649" author="lixi_wc" created="Fri, 24 Jun 2022 10:27:36 +0000"  >&lt;p&gt;A fix patch is coming. If we apply the following debug change:&lt;/p&gt;

&lt;p&gt;osc_ladvise_base()&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt; - rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_LADVISE);
 + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_QUOTACHECK);
 &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Without the fix patch:&lt;br/&gt;
The ladvise command will be stuck there forever&lt;/p&gt;

&lt;p&gt;With the fix patch, command quit immediately:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# lfs ladvise -a willread a
lfs ladvise: cannot give advice: Operation not supported (95)
ladvise: cannot give advice &apos;willread&apos; to file &apos;a&apos;: Operation not supported
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="338650" author="gerrit" created="Fri, 24 Jun 2022 10:33:39 +0000"  >&lt;p&gt;&quot;Li Xi &amp;lt;lixi@ddn.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/47761&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/47761&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8582&quot; title=&quot;Interop: master&amp;lt;-&amp;gt;b2_8  - sanity test_255a: test failed to respond and timed out&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8582&quot;&gt;&lt;del&gt;LU-8582&lt;/del&gt;&lt;/a&gt; target: send error reply on wrong opcode&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: b089334b46d19b5efe4e2197ffdca9f3dbc70916&lt;/p&gt;</comment>
                            <comment id="340651" author="gerrit" created="Mon, 18 Jul 2022 05:36:11 +0000"  >&lt;p&gt;&quot;Oleg Drokin &amp;lt;green@whamcloud.com&amp;gt;&quot; merged in patch &lt;a href=&quot;https://review.whamcloud.com/47761/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/47761/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8582&quot; title=&quot;Interop: master&amp;lt;-&amp;gt;b2_8  - sanity test_255a: test failed to respond and timed out&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8582&quot;&gt;&lt;del&gt;LU-8582&lt;/del&gt;&lt;/a&gt; target: send error reply on wrong opcode&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 566edb8c43e65ed4cb99614c7c753965bb201366&lt;/p&gt;</comment>
                            <comment id="340674" author="pjones" created="Mon, 18 Jul 2022 13:52:18 +0000"  >&lt;p&gt;Landed for 2.16&lt;/p&gt;</comment>
                            <comment id="377408" author="gerrit" created="Wed, 5 Jul 2023 01:04:49 +0000"  >&lt;p&gt;&quot;Andreas Dilger &amp;lt;adilger@whamcloud.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/51568&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/51568&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8582&quot; title=&quot;Interop: master&amp;lt;-&amp;gt;b2_8  - sanity test_255a: test failed to respond and timed out&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8582&quot;&gt;&lt;del&gt;LU-8582&lt;/del&gt;&lt;/a&gt; tests: skip sanity/905 for old OSTs&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 2ced1e0898aacd741c95c25d44350dfefa953853&lt;/p&gt;</comment>
                            <comment id="378650" author="gerrit" created="Fri, 14 Jul 2023 03:11:07 +0000"  >&lt;p&gt;&quot;Oleg Drokin &amp;lt;green@whamcloud.com&amp;gt;&quot; merged in patch &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/51568/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/51568/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8582&quot; title=&quot;Interop: master&amp;lt;-&amp;gt;b2_8  - sanity test_255a: test failed to respond and timed out&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8582&quot;&gt;&lt;del&gt;LU-8582&lt;/del&gt;&lt;/a&gt; tests: skip sanity/905 for old OSTs&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 839b43515004828dec212ed0183fe51929c91b5b&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                                        </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="61685">LU-14139</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="62550">LU-14380</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="41406">LU-8811</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="43695">LU-9097</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzyniv:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>