<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:42:30 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-4413] Test failure on test suite conf-sanity, subtest test_56</title>
                <link>https://jira.whamcloud.com/browse/LU-4413</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This issue was created by maloo for Nathaniel Clark &amp;lt;nathaniel.l.clark@intel.com&amp;gt;&lt;/p&gt;

&lt;p&gt;This issue relates to the following test suite run:&lt;br/&gt;
&lt;a href=&quot;http://maloo.whamcloud.com/test_sets/d5f1f7c2-6a01-11e3-9248-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://maloo.whamcloud.com/test_sets/d5f1f7c2-6a01-11e3-9248-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/e4cbde82-6a35-11e3-8e21-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/e4cbde82-6a35-11e3-8e21-52540035b04c&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The sub-test test_56 failed with the following error:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;test failed to respond and timed out&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;Info required for matching: conf-sanity 56&lt;/p&gt;</description>
                <environment></environment>
        <key id="22563">LU-4413</key>
            <summary>Test failure on test suite conf-sanity, subtest test_56</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="di.wang">Di Wang</assignee>
                                    <reporter username="maloo">Maloo</reporter>
                        <labels>
                            <label>HB</label>
                            <label>mn4</label>
                    </labels>
                <created>Mon, 23 Dec 2013 20:08:19 +0000</created>
                <updated>Thu, 29 May 2014 22:25:24 +0000</updated>
                            <resolved>Thu, 20 Mar 2014 17:51:12 +0000</resolved>
                                    <version>Lustre 2.6.0</version>
                                    <fixVersion>Lustre 2.6.0</fixVersion>
                    <fixVersion>Lustre 2.5.2</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>7</watches>
                                                                            <comments>
                            <comment id="74045" author="utopiabound" created="Mon, 23 Dec 2013 20:09:59 +0000"  >&lt;p&gt;Seems most prevalent during review-zfs but also occurred during review-dne:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/57ec0c52-6874-11e3-9675-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/57ec0c52-6874-11e3-9675-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="74581" author="jlevi" created="Wed, 8 Jan 2014 18:43:43 +0000"  >&lt;p&gt;Nathaniel,&lt;br/&gt;
Are you still seeing this issue?&lt;/p&gt;</comment>
                            <comment id="75057" author="bobijam" created="Thu, 16 Jan 2014 02:13:46 +0000"  >&lt;p&gt;another hit: &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/dcc8e2b2-7e48-11e3-908b-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/dcc8e2b2-7e48-11e3-908b-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="75218" author="adilger" created="Fri, 17 Jan 2014 20:44:54 +0000"  >&lt;p&gt;This was hit 15 times in the past 4 weeks.&lt;/p&gt;</comment>
                            <comment id="75559" author="utopiabound" created="Fri, 24 Jan 2014 16:07:48 +0000"  >&lt;p&gt;Seeing this on review-dne a lot (like &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4508&quot; title=&quot;conf-sanity test_58 test_72 (umount /mnt/mds2 hangs there).&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4508&quot;&gt;&lt;del&gt;LU-4508&lt;/del&gt;&lt;/a&gt;)&lt;br/&gt;
conf-sanity/72 &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/e70aef8c-8464-11e3-bab5-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/e70aef8c-8464-11e3-bab5-52540035b04c&lt;/a&gt;&lt;br/&gt;
conf-sanity/58 &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/990ad45a-84ff-11e3-86af-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/990ad45a-84ff-11e3-86af-52540035b04c&lt;/a&gt;&lt;br/&gt;
conf-sanity/56 &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/6498f574-84cf-11e3-8da9-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/6498f574-84cf-11e3-8da9-52540035b04c&lt;/a&gt;&lt;br/&gt;
               &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/6498f574-84cf-11e3-8da9-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/6498f574-84cf-11e3-8da9-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="75571" author="adilger" created="Fri, 24 Jan 2014 18:13:54 +0000"  >&lt;p&gt;If you do a search for TIMEOUT on conf-sanity for master, there are problems in conf-sanity on many different tests - 58, 160a, 17n, 72, 80f, 56, 12, 28a, 70b, 90, 161a, 47, 0, 2b, 12, 50h, 230d, 133f, just on a single page of results. &lt;/p&gt;

&lt;p&gt;This is a major problem and needs to be investigated as the highest priority. &lt;/p&gt;</comment>
                            <comment id="75608" author="adilger" created="Fri, 24 Jan 2014 22:29:59 +0000"  >&lt;p&gt;Update from Di on this topic:&lt;/p&gt;

&lt;blockquote&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[1/24/2014, 2:08:11 PM] Wang Di:  here is the umount process stack
[1/24/2014, 2:08:13 PM] Wang Di: umount        S 0000000000000000     0 25182  25181 0x00000080
 ffff8800647cdc68 0000000000000086 0000000000000000 ffffffff8150f3fa
 00000000ffffff0a 0000000000000282 0000000000000000 dead000000200200
 ffff880037eb2638 ffff8800647cdfd8 000000000000fb88 ffff880037eb2638
Call Trace:
 [&amp;lt;ffffffff8150f3fa&amp;gt;] ? schedule_timeout+0x19a/0x2e0
 [&amp;lt;ffffffffa0b6326d&amp;gt;] ptlrpc_disconnect_import+0x77d/0x7a0 [ptlrpc]
 [&amp;lt;ffffffff81063990&amp;gt;] ? default_wake_function+0x0/0x20
 [&amp;lt;ffffffffa06ea4e8&amp;gt;] ? libcfs_log_return+0x28/0x40 [libcfs]
 [&amp;lt;ffffffffa0df94e9&amp;gt;] lwp_disconnect+0x69/0x160 [osp]
 [&amp;lt;ffffffffa0dfb250&amp;gt;] lwp_process_config+0x150/0x260 [osp]
 [&amp;lt;ffffffffa0937e85&amp;gt;] lustre_disconnect_lwp+0x1c5/0xe40 [obdclass]
 [&amp;lt;ffffffffa093ec40&amp;gt;] server_put_super+0x630/0xf60 [obdclass]
 [&amp;lt;ffffffff8119d906&amp;gt;] ? invalidate_inodes+0xf6/0x190
 [&amp;lt;ffffffff8118366b&amp;gt;] generic_shutdown_super+0x5b/0xe0
 [&amp;lt;ffffffff81183756&amp;gt;] kill_anon_super+0x16/0x60
 [&amp;lt;ffffffffa0906506&amp;gt;] lustre_kill_super+0x36/0x60 [obdclass]
 [&amp;lt;ffffffff81183ef7&amp;gt;] deactivate_super+0x57/0x80
 [&amp;lt;ffffffff811a21ef&amp;gt;] mntput_no_expire+0xbf/0x110
 [&amp;lt;ffffffff811a2c5b&amp;gt;] sys_umount+0x7b/0x3a0
 [&amp;lt;ffffffff8100b072&amp;gt;] system_call_fastpath+0x16/0x1b
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;It seems lwp is waiting for recovery, but lwp should not be in recovery states at all.  I think the problem is the state of LWP should only be FULL, CLOSED, DISCONN, which will make sure the lwp will not be in recovery states, but somehow some one change the state of LWP to some other states. trying to figure out why.&lt;/p&gt;&lt;/blockquote&gt;</comment>
                            <comment id="75621" author="adilger" created="Sat, 25 Jan 2014 01:28:52 +0000"  >&lt;p&gt;Di, would it be possible to just change ptlrpc_disconnect_import() to not wait for recovery if obd_no_recov is set (this is done in ptlrpc_pinger_del_import())?  Otherwise, it seems LWP connections will never reconnect because the import was removed from the pinger via ptlrpc_pinger_del_import() in lwp_disconnect().  That means nothing is going to try and reconnect the import and it will just wait there forever.&lt;/p&gt;

&lt;p&gt;I&apos;ve pushed &lt;a href=&quot;http://review.whamcloud.com/8996&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/8996&lt;/a&gt; which may or may not fix this problem.  I &lt;em&gt;think&lt;/em&gt; what is happening is that the MDS is trying to unmount while it still has a failed LWP connection to another MDS that was recently stopped.  It is racy if that connection has recovered yet or not, but if it hasn&apos;t then lwp_disconnect-&amp;gt;ptlrpc_pinger_del_import() mean that the connection will never be recovered, and ptlrpc_disconnect_import() will wait forever for that to happen.&lt;/p&gt;</comment>
                            <comment id="75623" author="di.wang" created="Sat, 25 Jan 2014 02:09:20 +0000"  >&lt;p&gt;Yes, I can see the from the debug log that it is a racy condition, i.e. MDS2 is doing umount, while its lwp connection is not been setup MDS1 (import is in CONNECTING state), at the mean time MDS1 has been umounted.  But here is one thing I do not understand.&lt;/p&gt;

&lt;p&gt;1. MDS2 is trying to setup connection with MDS1, which mean there must be a connecting RPC inflight or somewhere in ptlrpcd.&lt;br/&gt;
2. and before lwp_disconnect call ptlrpc_disconnect_import, it suppose to do ptlrpc_deactivate_import, which will drain all of RPC related with this import IMHO, then the connect interpret callback should set the import to be DISCONNECTED. Then ptlrpc_disconnect_import should be wakeup instead of waiting for non-recovery state. &lt;/p&gt;

&lt;p&gt;If what I understand is correct, then we probably need find out why the connecting RPC is not being interrupted during the umount process. &lt;/p&gt;</comment>
                            <comment id="75625" author="di.wang" created="Sat, 25 Jan 2014 02:30:07 +0000"  >&lt;p&gt;Ah, the connecting RPC is being sent to the ptlrpcd_rcv thread, but ptlrpcd_rcv is stuck at&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;ptlrpcd_rcv   S 0000000000000001     0 15409      2 0x00000080
 ffff88005ef2b8d0 0000000000000046 0000000000000000 00000000fffffffe
 ffff88005ef2b890 0000000000000000 ffff880079ddea80 ffff880079ddeab0
 ffff8800377a7af8 ffff88005ef2bfd8 000000000000fb88 ffff8800377a7af8
Call Trace:
 [&amp;lt;ffffffff8150f3f2&amp;gt;] schedule_timeout+0x192/0x2e0
 [&amp;lt;ffffffff810811e0&amp;gt;] ? process_timeout+0x0/0x10
 [&amp;lt;ffffffffa0b3f99a&amp;gt;] ptlrpc_set_wait+0x2da/0x860 [ptlrpc]
 [&amp;lt;ffffffff81063990&amp;gt;] ? default_wake_function+0x0/0x20
 [&amp;lt;ffffffffa0b49c06&amp;gt;] ? lustre_msg_set_jobid+0xb6/0x140 [ptlrpc]
 [&amp;lt;ffffffffa0b3ffa7&amp;gt;] ptlrpc_queue_wait+0x87/0x220 [ptlrpc]
 [&amp;lt;ffffffffa0292bea&amp;gt;] seq_client_rpc+0x12a/0x910 [fid]
 [&amp;lt;ffffffffa06efa81&amp;gt;] ? libcfs_debug_msg+0x41/0x50 [libcfs]
 [&amp;lt;ffffffffa02937cf&amp;gt;] seq_client_alloc_seq+0x3ff/0x480 [fid]
 [&amp;lt;ffffffffa02929c3&amp;gt;] ? seq_fid_alloc_prep+0x43/0xc0 [fid]
 [&amp;lt;ffffffffa02938c5&amp;gt;] seq_client_get_seq+0x75/0x1f0 [fid]
 [&amp;lt;ffffffff81063990&amp;gt;] ? default_wake_function+0x0/0x20
 [&amp;lt;ffffffffa0914263&amp;gt;] ? lu_context_init+0xa3/0x240 [obdclass]
 [&amp;lt;ffffffffa0debe12&amp;gt;] osp_init_pre_fid+0x3c2/0x500 [osp]
 [&amp;lt;ffffffffa0de1b3f&amp;gt;] osp_import_event+0xaf/0x4e0 [osp]
 [&amp;lt;ffffffffa0b63524&amp;gt;] ptlrpc_activate_import+0xb4/0x260 [ptlrpc]
 [&amp;lt;ffffffffa0b68d22&amp;gt;] ptlrpc_connect_interpret+0x1912/0x2160 [ptlrpc]
 [&amp;lt;ffffffffa0b3de44&amp;gt;] ptlrpc_check_set+0x2c4/0x1b40 [ptlrpc]
 [&amp;lt;ffffffffa0b6a4cb&amp;gt;] ptlrpcd_check+0x53b/0x560 [ptlrpc]
 [&amp;lt;ffffffffa0b6ab1b&amp;gt;] ptlrpcd+0x33b/0x3f0 [ptlrpc]
 [&amp;lt;ffffffff81063990&amp;gt;] ? default_wake_function+0x0/0x20
 [&amp;lt;ffffffffa0b6a7e0&amp;gt;] ? ptlrpcd+0x0/0x3f0 [ptlrpc]
 [&amp;lt;ffffffff81096a36&amp;gt;] kthread+0x96/0xa0
 [&amp;lt;ffffffff8100c0ca&amp;gt;] child_rip+0xa/0x20
 [&amp;lt;ffffffff810969a0&amp;gt;] ? kthread+0x0/0xa0
 [&amp;lt;ffffffff8100c0c0&amp;gt;] ? child_rip+0x0/0x20
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;So the whole process looks like a deadlock to me&lt;/p&gt;

&lt;p&gt;1. MDS1 is being umounted&lt;br/&gt;
2. LWP on MDS2 is trying to re-connect to MDS1, which will add Connecting RPC to ptlrpcd_rcv, and set import to CONNECTING.&lt;br/&gt;
3. umount MDS2, LWP will try to abort all of RPC related to the import. But the ptlrpcd_rcv is stuck as above, i.e. MDS2 is trying to allocate sequence from one of OST, but the OST is not able to get super sequence from MDT0, because MDT0 has been umounted, so the RPC is stuck there as above. &lt;/p&gt;

&lt;p&gt;4. Since the connecting RPC does not get the chance to be handled in ptlrpcd_rcv, then the import of LWP will be connecting for ever. &lt;/p&gt;

&lt;p&gt;So ptlrpc_disconnect_import will wait the recovering import forever. &lt;/p&gt;


</comment>
                            <comment id="75627" author="di.wang" created="Sat, 25 Jan 2014 04:38:40 +0000"  >&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/8997&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/8997&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="75637" author="yong.fan" created="Sun, 26 Jan 2014 01:55:26 +0000"  >&lt;p&gt;What happened if the OST try to get sequence from MDT0 which has been umounted? It will block or return failure. I think the latter is reasonable. If OST got failure from MDT0, it should forward the failure back to the MDS2, so the MDS2 also should not be blocked for ever. Missed anything?&lt;/p&gt;</comment>
                            <comment id="75638" author="di.wang" created="Sun, 26 Jan 2014 04:16:34 +0000"  >&lt;p&gt;Hmm, right now, OST can not tell whether MDT0 is being umounted or not being setup yet. So OST will retry if the request return -ENOTREADY or EBUSY. But I agree something needs to be fixed here, probably not in this patch.&lt;/p&gt;</comment>
                            <comment id="79904" author="jlevi" created="Thu, 20 Mar 2014 17:51:12 +0000"  >&lt;p&gt;Patches landed to Master. New tickets will be opened if additional fixes are required.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                            <outwardlinks description="duplicates">
                                        <issuelink>
            <issuekey id="22796">LU-4508</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="22348">LU-4349</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="22348">LU-4349</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="24832">LU-5107</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzwbtb:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>12112</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>