<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:41:59 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-4355] recovery-double-scale test_pairwise_fail hung: operation ost_connect failed with -16</title>
                <link>https://jira.whamcloud.com/browse/LU-4355</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;recovery-double-scale test_pairwise_fail hung as follows:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;==== START === test 5: failover OST, then 2 clients ====
&amp;lt;~snip~&amp;gt;
Starting client: superfat-intel-1vm5: -o user_xattr,acl,flock superfat-intel-1vm7:superfat-intel-1vm3:/lustre /mnt/lustre
CMD: superfat-intel-1vm5 mkdir -p /mnt/lustre
CMD: superfat-intel-1vm5 mount -t lustre -o user_xattr,acl,flock superfat-intel-1vm7:superfat-intel-1vm3:/lustre /mnt/lustre
CMD: superfat-intel-1vm5 PATH=/usr/lib64/lustre/tests:/usr/lib/lustre/tests:/usr/lib64/lustre/tests:/opt/iozone/bin:/opt/iozone/bin:/usr/lib64/lustre/tests/mpi:/usr/lib64/lustre/tests/racer:/usr/lib64/lustre/../lustre-iokit/sgpdd-survey:/usr/lib64/lustre/tests:/usr/lib64/lustre/utils/gss:/usr/lib64/lustre/utils:/usr/lib64/openmpi/bin:/usr/bin:/bin:/sbin:/usr/sbin::/sbin:/bin:/usr/sbin: NAME=autotest_config sh rpc.sh set_default_debug \&quot;0x33f0404\&quot; \&quot; 0xffb7e3ff\&quot; 32 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Console log on client superfat-intel-1vm5 showed that:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;19:25:31:Lustre: lustre-OST0004-osc-ffff8800370c7c00: Connection restored to lustre-OST0004 (at 10.10.4.95@tcp)
19:25:31:Lustre: Skipped 3 previous similar messages
19:25:31:Lustre: DEBUG MARKER: /usr/sbin/lctl mark                             Failing type2=clients item2=superfat-intel-1vm5,superfat-intel-1vm6 ... 
19:25:31:Lustre: DEBUG MARKER: Failing type2=clients item2=superfat-intel-1vm5,superfat-intel-1vm6 ...
19:25:31:Lustre: DEBUG MARKER: test -f /tmp/client-load.pid &amp;amp;&amp;amp;
19:25:31:        { kill -s TERM $(cat /tmp/client-load.pid); rm -f /tmp/client-load.pid; }
19:25:31:
19:25:31:&amp;lt;ConMan&amp;gt; Console [superfat-intel-1vm5] disconnected from &amp;lt;superfat-intel-1:6004&amp;gt; at 12-04 19:24.
19:25:31:
19:25:31:&amp;lt;ConMan&amp;gt; Console [superfat-intel-1vm5] connected to &amp;lt;superfat-intel-1:6004&amp;gt; at 12-04 19:25.
19:25:31:
Press any key to continue.

&amp;lt;~snip~&amp;gt;

19:30:33:LNet: Added LNI 10.10.4.84@tcp [8/256/0/180]
19:30:33:LNet: Accept all, port 7988
19:30:33:LustreError: 152-6: Ignoring deprecated mount option &apos;acl&apos;.
19:30:33:Lustre: Layout lock feature supported.
19:30:33:LustreError: 11-0: lustre-OST0006-osc-ffff880037b0a400: Communicating with 10.10.4.95@tcp, operation ost_connect failed with -16.
19:30:33:LustreError: 11-0: lustre-OST0005-osc-ffff880037b0a400: Communicating with 10.10.4.95@tcp, operation ost_connect failed with -16.
19:30:33:Lustre: Mounted lustre-client
19:30:33:Lustre: DEBUG MARKER: PATH=/usr/lib64/lustre/tests:/usr/lib/lustre/tests:/usr/lib64/lustre/tests:/opt/iozone/bin:/opt/iozone/bin:/usr/lib64/lustre/tests/mpi:/usr/lib64/lustre/tests/racer:/usr/lib64/lustre/../lustre-iokit/sgpdd-survey:/usr/lib64/lustre/tests:/usr/lib64/lustre/u
19:30:33:LNet: 2176:0:(debug.c:324:libcfs_debug_str2mask()) You are trying to use a numerical value for the mask - this will be deprecated in a future release.
19:30:33:LNet: 2177:0:(debug.c:324:libcfs_debug_str2mask()) You are trying to use a numerical value for the mask - this will be deprecated in a future release.
19:30:33:LustreError: 11-0: lustre-OST0006-osc-ffff880037b0a400: Communicating with 10.10.4.95@tcp, operation ost_connect failed with -16.
19:30:33:LustreError: 11-0: lustre-OST0005-osc-ffff880037b0a400: Communicating with 10.10.4.95@tcp, operation ost_connect failed with -16.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Console log on OSS superfat-intel-1vm8 showed that:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;19:25:47:Lustre: DEBUG MARKER: Failing type2=clients item2=superfat-intel-1vm5,superfat-intel-1vm6 ...
19:25:47:Lustre: lustre-OST0005: Client 44f4f2de-f2b9-e381-1912-31a482522e3d (at 10.10.4.85@tcp) reconnecting, waiting for 3 clients in recovery for 0:15
19:25:47:Lustre: lustre-OST0005: Client 44f4f2de-f2b9-e381-1912-31a482522e3d (at 10.10.4.85@tcp) refused reconnection, still busy with 1 active RPCs
19:25:47:Lustre: lustre-OST0006: Will be in recovery for at least 1:00, or until 3 clients reconnect
19:25:47:Lustre: lustre-OST0005: Client 44f4f2de-f2b9-e381-1912-31a482522e3d (at 10.10.4.85@tcp) reconnecting, waiting for 3 clients in recovery for 0:05
19:25:47:Lustre: lustre-OST0005: Client 44f4f2de-f2b9-e381-1912-31a482522e3d (at 10.10.4.85@tcp) refused reconnection, still busy with 1 active RPCs
19:25:47:Lustre: lustre-OST0005: recovery is timed out, evict stale exports
19:25:47:LustreError: 3880:0:(ldlm_resource.c:1165:ldlm_resource_get()) lustre-OST0005: lvbo_init failed for resource 0x402:0x0: rc = -2
19:25:47:LustreError: 3880:0:(ldlm_resource.c:1165:ldlm_resource_get()) Skipped 597 previous similar messages
19:25:47:Lustre: lustre-OST0005: Client 44f4f2de-f2b9-e381-1912-31a482522e3d (at 10.10.4.85@tcp) refused reconnection, still busy with 1 active RPCs
19:25:47:LustreError: 3880:0:(ldlm_resource.c:1165:ldlm_resource_get()) lustre-OST0005: lvbo_init failed for resource 0x405:0x0: rc = -2
19:25:47:LustreError: 3880:0:(ldlm_resource.c:1165:ldlm_resource_get()) Skipped 3452 previous similar messages
19:25:47:Lustre: lustre-OST0005: Client 44f4f2de-f2b9-e381-1912-31a482522e3d (at 10.10.4.85@tcp) reconnecting, waiting for 3 clients in recovery for 0:36
19:25:47:Lustre: Skipped 1044 previous similar messages
19:25:47:Lustre: lustre-OST0005: Client 44f4f2de-f2b9-e381-1912-31a482522e3d (at 10.10.4.85@tcp) refused reconnection, still busy with 1 active RPCs
19:25:47:Lustre: Skipped 2 previous similar messages
19:25:47:Lustre: lustre-OST0005: Client 44f4f2de-f2b9-e381-1912-31a482522e3d (at 10.10.4.85@tcp) reconnecting, waiting for 3 clients in recovery for 0:16
19:25:47:Lustre: Skipped 3 previous similar messages
19:27:28:Lustre: lustre-OST0005: Client 44f4f2de-f2b9-e381-1912-31a482522e3d (at 10.10.4.85@tcp) refused reconnection, still busy with 1 active RPCs
19:27:28:Lustre: Skipped 3 previous similar messages
19:27:28:Lustre: lustre-OST0002: haven&apos;t heard from client f23f578c-ed7b-9fa4-df6c-fa1ce4040fbf (at 10.10.4.84@tcp) in 50 seconds. I think it&apos;s dead, and I am evicting it. exp ffff88007af59c00, cur 1386213947 expire 1386213917 last 1386213897
19:27:28:Lustre: lustre-OST0004: haven&apos;t heard from client f23f578c-ed7b-9fa4-df6c-fa1ce4040fbf (at 10.10.4.84@tcp) in 50 seconds. I think it&apos;s dead, and I am evicting it. exp ffff88007ef97400, cur 1386213947 expire 1386213917 last 1386213897
19:27:28:Lustre: lustre-OST0005: Denying connection for new client c95a9301-62de-7453-fcdc-472b6eddb32c (at 10.10.4.84@tcp), waiting for all 3 known clients (2 recovered, 1 in progress, and 0 evicted) to recover in 0:13
19:27:28:Lustre: lustre-OST0006: Denying connection for new client c95a9301-62de-7453-fcdc-472b6eddb32c (at 10.10.4.84@tcp), waiting for all 3 known clients (1 recovered, 1 in progress, and 0 evicted) to recover in 0:11
19:27:28:Lustre: lustre-OST0005: Denying connection for new client c95a9301-62de-7453-fcdc-472b6eddb32c (at 10.10.4.84@tcp), waiting for all 3 known clients (2 recovered, 1 in progress, and 0 evicted) to recover in 0:08
19:27:28:Lustre: Skipped 1 previous similar message
19:27:28:Lustre: lustre-OST0005: Denying connection for new client c95a9301-62de-7453-fcdc-472b6eddb32c (at 10.10.4.84@tcp), waiting for all 3 known clients (2 recovered, 1 in progress, and 0 evicted) to recover in 0:03
19:27:28:Lustre: Skipped 1 previous similar message
19:27:28:Lustre: lustre-OST0006: recovery is timed out, evict stale exports
19:27:28:Lustre: lustre-OST0006: disconnecting 1 stale clients
19:27:28:Lustre: lustre-OST0006: Recovery over after 1:00, of 3 clients 2 recovered and 1 was evicted.
19:27:28:Lustre: lustre-OST0006: deleting orphan objects from 0x0:1078 to 0x0:1107
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Maloo report: &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/b748b488-5ddf-11e3-aed2-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/b748b488-5ddf-11e3-aed2-52540035b04c&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;IP addresses:&lt;br/&gt;
superfat-intel-1vm1: 10.10.4.80&lt;br/&gt;
superfat-intel-1vm2: 10.10.4.81&lt;br/&gt;
superfat-intel-1vm3: 10.10.4.82&lt;br/&gt;
superfat-intel-1vm4: 10.10.4.83&lt;br/&gt;
superfat-intel-1vm5: 10.10.4.84&lt;br/&gt;
superfat-intel-1vm6: 10.10.4.85&lt;br/&gt;
superfat-intel-1vm7: 10.10.4.94&lt;br/&gt;
superfat-intel-1vm8: 10.10.4.95&lt;/p&gt;

&lt;p&gt;From the above logs, we can see:&lt;br/&gt;
After oss superfat-intel-1vm4 failed over to oss superfat-intel-1vm8, before client osc connections were restored to lustre-OST0005 and lustre-OST0006, client superfat-intel-1vm5 was powered off and on. After the client node was up and tried to connect to lustre-OST0005, the ost kept denying the connection:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;19:27:28:Lustre: lustre-OST0005: Denying connection for new client c95a9301-62de-7453-fcdc-472b6eddb32c (at 10.10.4.84@tcp), waiting for all 3 known clients (2 recovered, 1 in progress, and 0 evicted) to recover in 0:13
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment>&lt;br/&gt;
Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_4/63/&quot;&gt;http://build.whamcloud.com/job/lustre-b2_4/63/&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL6.4/x86_64&lt;br/&gt;
TEST_GROUP=failover&lt;br/&gt;
</environment>
        <key id="22365">LU-4355</key>
            <summary>recovery-double-scale test_pairwise_fail hung: operation ost_connect failed with -16</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="5">Cannot Reproduce</resolution>
                                        <assignee username="wc-triage">WC Triage</assignee>
                                    <reporter username="yujian">Jian Yu</reporter>
                        <labels>
                            <label>yuc2</label>
                    </labels>
                <created>Fri, 6 Dec 2013 05:51:17 +0000</created>
                <updated>Tue, 14 Dec 2021 22:06:45 +0000</updated>
                            <resolved>Tue, 14 Dec 2021 22:06:45 +0000</resolved>
                                    <version>Lustre 2.4.1</version>
                    <version>Lustre 2.4.2</version>
                    <version>Lustre 2.5.1</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>3</watches>
                                                                            <comments>
                            <comment id="72961" author="yujian" created="Fri, 6 Dec 2013 06:06:57 +0000"  >&lt;p&gt;The same test &lt;b&gt;passed&lt;/b&gt; on Lustre 2.4.1 RC2 (build #45):&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/f0cc9f6c-194c-11e3-bb73-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/f0cc9f6c-194c-11e3-bb73-52540035b04c&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;After searching on Maloo, I found the same failure occurred before on Lustre b2_4 build #40:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/fd792344-0f4b-11e3-af0c-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/fd792344-0f4b-11e3-af0c-52540035b04c&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;So, this does not look like a regression.&lt;/p&gt;</comment>
                            <comment id="73984" author="yujian" created="Sat, 21 Dec 2013 13:33:48 +0000"  >&lt;p&gt;Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_4/69/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b2_4/69/&lt;/a&gt; (2.4.2 RC1)&lt;br/&gt;
Distro/Arch: RHEL6.4/x86_64&lt;br/&gt;
TEST_GROUP=failover&lt;br/&gt;
FSTYPE=zfs&lt;/p&gt;

&lt;p&gt;The same failure occurred:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/ed36352a-6a41-11e3-8e21-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/ed36352a-6a41-11e3-8e21-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="74331" author="yujian" created="Sat, 4 Jan 2014 12:00:52 +0000"  >&lt;p&gt;Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_5/5/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b2_5/5/&lt;/a&gt;&lt;br/&gt;
TEST_GROUP=failover&lt;/p&gt;

&lt;p&gt;The same failure occurred:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/e2af0f04-7505-11e3-95ae-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/e2af0f04-7505-11e3-95ae-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="76445" author="yujian" created="Fri, 7 Feb 2014 08:23:00 +0000"  >&lt;p&gt;More instance on Lustre b2_5 branch:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/e70577b8-8f32-11e3-b8e1-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/e70577b8-8f32-11e3-b8e1-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="78831" author="yujian" created="Sun, 9 Mar 2014 09:46:11 +0000"  >&lt;p&gt;Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_5/39/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b2_5/39/&lt;/a&gt; (2.5.1 RC1)&lt;br/&gt;
Distro/Arch: RHEL6.5/x86_64&lt;br/&gt;
FSTYPE=zfs&lt;br/&gt;
Test Group: failover&lt;/p&gt;

&lt;p&gt;The same failure occurred:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/50529cf6-a657-11e3-a191-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/50529cf6-a657-11e3-a191-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzwatr:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>11929</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>