<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:32:05 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-3230] conf-sanity fails to start run: umount of OST fails</title>
                <link>https://jira.whamcloud.com/browse/LU-3230</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This issue was created by maloo for Nathaniel Clark &amp;lt;nathaniel.l.clark@intel.com&amp;gt;&lt;/p&gt;

&lt;p&gt;This issue relates to the following test suite runs:&lt;br/&gt;
&lt;a href=&quot;http://maloo.whamcloud.com/test_sets/bbe080da-ad17-11e2-bd7c-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://maloo.whamcloud.com/test_sets/bbe080da-ad17-11e2-bd7c-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;http://maloo.whamcloud.com/test_sets/51e42416-ad76-11e2-b72d-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://maloo.whamcloud.com/test_sets/51e42416-ad76-11e2-b72d-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;http://maloo.whamcloud.com/test_sets/842709fa-ad73-11e2-b72d-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://maloo.whamcloud.com/test_sets/842709fa-ad73-11e2-b72d-52540035b04c&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The sub-test conf-sanity failed with the following error:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;test failed to respond and timed out&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;Info required for matching: conf-sanity conf-sanity&lt;br/&gt;
Info required for matching: replay-single test_90&lt;/p&gt;</description>
                <environment></environment>
        <key id="18543">LU-3230</key>
            <summary>conf-sanity fails to start run: umount of OST fails</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="utopiabound">Nathaniel Clark</assignee>
                                    <reporter username="maloo">Maloo</reporter>
                        <labels>
                            <label>mn4</label>
                            <label>zfs</label>
                    </labels>
                <created>Thu, 25 Apr 2013 17:15:03 +0000</created>
                <updated>Tue, 10 Jun 2014 17:28:53 +0000</updated>
                            <resolved>Fri, 21 Feb 2014 17:14:34 +0000</resolved>
                                    <version>Lustre 2.4.0</version>
                    <version>Lustre 2.4.1</version>
                    <version>Lustre 2.5.0</version>
                    <version>Lustre 2.4.2</version>
                    <version>Lustre 2.5.1</version>
                                    <fixVersion>Lustre 2.6.0</fixVersion>
                    <fixVersion>Lustre 2.5.1</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>13</watches>
                                                                            <comments>
                            <comment id="57120" author="niu" created="Fri, 26 Apr 2013 13:11:30 +0000"  >&lt;p&gt;&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/2ad1712a-ae62-11e2-a8d0-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/2ad1712a-ae62-11e2-a8d0-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="57137" author="chris" created="Fri, 26 Apr 2013 16:06:01 +0000"  >&lt;p&gt;Niu: Apparently you believe this is a test environment issue, could you possibly indicate why.&lt;/p&gt;

&lt;p&gt;I&apos;m open to the idea but if you thoughts will help us in our work.&lt;/p&gt;</comment>
                            <comment id="57155" author="adilger" created="Fri, 26 Apr 2013 18:54:30 +0000"  >&lt;p&gt;It seems this problem is only intermittently being hit, though it seems more common on review-zfs tests than regular review tests.  I don&apos;t know if it is a test environment problem or not, but there isn&apos;t any data in the Maloo results to try and diagnose what the problem really is.&lt;/p&gt;</comment>
                            <comment id="57172" author="niu" created="Sat, 27 Apr 2013 02:01:02 +0000"  >&lt;p&gt;Chris, the maloo shows &quot;Failure Rate: 100.00% of last 100 executions &lt;span class=&quot;error&quot;&gt;&amp;#91;all branches&amp;#93;&lt;/span&gt;&quot;, and there isn&apos;t any log in Maloo result, so I think we&apos;d keep you in the loop to see if there is any problem in the testing environment, I&apos;m not sure what the problem really is.&lt;/p&gt;</comment>
                            <comment id="57188" author="yujian" created="Sat, 27 Apr 2013 15:08:23 +0000"  >&lt;p&gt;Another instance while testing patch &lt;a href=&quot;http://review.whamcloud.com/6154&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/6154&lt;/a&gt; :&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/8927b874-af1b-11e2-901b-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/8927b874-af1b-11e2-901b-52540035b04c&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;There were no console logs on Maloo. After looking into the console log file of client-14vm2 on brent node, I found:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Lustre: DEBUG MARKER: -----============= acceptance-small: conf-sanity ============----- Fri Apr 26 20:27:00 PDT 2013^M
Lustre: DEBUG MARKER: /usr/sbin/lctl mark excepting tests: 32newtarball 59 64^M
Lustre: DEBUG MARKER: excepting tests: 32newtarball 59 64^M
Lustre: DEBUG MARKER: /usr/sbin/lctl mark skipping tests SLOW=no: 30a 31 45^M
Lustre: DEBUG MARKER: skipping tests SLOW=no: 30a 31 45^M
Lustre: DEBUG MARKER: running=$(grep -c /mnt/lustre&apos; &apos; /proc/mounts);^M
if [ $running -ne 0 ] ; then^M
echo Stopping client $(hostname) /mnt/lustre opts:;^M
lsof /mnt/lustre || need_kill=no;^M
if [ x != x -a x$need_kill != xno ]; then^M
    pids=$(lsof -t /mnt/lustre | sort -u);^M
    if ^M
LustreError: 4077:0:(osc_cache.c:2381:osc_teardown_async_page()) extent ffff88006c4317c8@{[0 -&amp;gt; 127/255], [2|0|-|cache|wi|ffff88006c8953c8], [524288|128|+|-|ffff88005e101978|256|(null)]} trunc at 0.^M
LustreError: 4077:0:(osc_page.c:430:osc_page_delete()) page@ffff88007a7e6600[2 ffff88007ac06a50:0 ^(null)_ffff8800597fce00 4 0 1 (null) (null) 0x0]^M
LustreError: 4077:0:(osc_page.c:430:osc_page_delete()) page@ffff8800597fce00[2 ffff88007c9a5c48:0 ^ffff88007a7e6600_(null) 4 0 1 (null) (null) 0x0]^M
LustreError: 4077:0:(osc_page.c:430:osc_page_delete()) vvp-page@ffff88007a7e66c0(0:0:0) vm@ffffea0000f5be88 20000000000075 3:0 0 0 lru^M
LustreError: 4077:0:(osc_page.c:430:osc_page_delete()) lov-page@ffff88007a7e6710^M
LustreError: 4077:0:(osc_page.c:430:osc_page_delete()) osc-page@ffff8800597fcee8: 1&amp;lt; 0x845fed 258 0 + - &amp;gt; 2&amp;lt; 0 0 4096 0x0 0x520 | (null) ffff88006e6bc7c0 ffff88006c8953c8 &amp;gt; 3&amp;lt; + ffff88007a141540 0 0 0 &amp;gt; 4&amp;lt; 0 0 8 0 - | - - + - &amp;gt; 5&amp;lt; - - + - | 0 - | 128 - -&amp;gt;^M
LustreError: 4077:0:(osc_page.c:430:osc_page_delete()) end page@ffff88007a7e6600^M
LustreError: 4077:0:(osc_page.c:430:osc_page_delete()) Trying to teardown failed: -16^M
LustreError: 4077:0:(osc_page.c:431:osc_page_delete()) ASSERTION( 0 ) failed: ^M
LustreError: 4077:0:(osc_page.c:431:osc_page_delete()) LBUG^M
Pid: 4077, comm: umount^M
^M
Call Trace:^M
 [&amp;lt;ffffffffa0b74895&amp;gt;] libcfs_debug_dumpstack+0x55/0x80 [libcfs]^M
 [&amp;lt;ffffffffa0b74e97&amp;gt;] lbug_with_loc+0x47/0xb0 [libcfs]^M
 [&amp;lt;ffffffffa0f7fbf1&amp;gt;] osc_page_delete+0x311/0x320 [osc]^M
 [&amp;lt;ffffffffa0c4feb5&amp;gt;] cl_page_delete0+0xc5/0x4e0 [obdclass]^M
 [&amp;lt;ffffffffa0c50312&amp;gt;] cl_page_delete+0x42/0x120 [obdclass]^M
 [&amp;lt;ffffffffa10a556d&amp;gt;] ll_invalidatepage+0x8d/0x160 [lustre]^M
 [&amp;lt;ffffffff811304e5&amp;gt;] do_invalidatepage+0x25/0x30^M
 [&amp;lt;ffffffff81130802&amp;gt;] truncate_inode_page+0xa2/0xc0^M
 [&amp;lt;ffffffff81130baf&amp;gt;] truncate_inode_pages_range+0x16f/0x500^M
 [&amp;lt;ffffffff81130fd5&amp;gt;] truncate_inode_pages+0x15/0x20^M
 [&amp;lt;ffffffff8119d188&amp;gt;] dispose_list+0xe8/0x120^M
 [&amp;lt;ffffffff8119d58a&amp;gt;] invalidate_inodes+0xea/0x190^M
 [&amp;lt;ffffffff8118333c&amp;gt;] generic_shutdown_super+0x4c/0xe0^M
 [&amp;lt;ffffffff81183436&amp;gt;] kill_anon_super+0x16/0x60^M
 [&amp;lt;ffffffffa0c35b2a&amp;gt;] lustre_kill_super+0x4a/0x60 [obdclass]^M
 [&amp;lt;ffffffff81183bd7&amp;gt;] deactivate_super+0x57/0x80^M
 [&amp;lt;ffffffff811a1c4f&amp;gt;] mntput_no_expire+0xbf/0x110^M
 [&amp;lt;ffffffff811a26bb&amp;gt;] sys_umount+0x7b/0x3a0^M
 [&amp;lt;ffffffff8100b072&amp;gt;] system_call_fastpath+0x16/0x1b^M
^M
Kernel panic - not syncing: LBUG^M
Pid: 4077, comm: umount Not tainted 2.6.32-358.2.1.el6.x86_64 #1^M
Call Trace:^M
 [&amp;lt;ffffffff8150d248&amp;gt;] ? panic+0xa7/0x16f^M
 [&amp;lt;ffffffffa0b74eeb&amp;gt;] ? lbug_with_loc+0x9b/0xb0 [libcfs]^M
 [&amp;lt;ffffffffa0f7fbf1&amp;gt;] ? osc_page_delete+0x311/0x320 [osc]^M
 [&amp;lt;ffffffffa0c4feb5&amp;gt;] ? cl_page_delete0+0xc5/0x4e0 [obdclass]^M
 [&amp;lt;ffffffffa0c50312&amp;gt;] ? cl_page_delete+0x42/0x120 [obdclass]^M
 [&amp;lt;ffffffffa10a556d&amp;gt;] ? ll_invalidatepage+0x8d/0x160 [lustre]^M
 [&amp;lt;ffffffff811304e5&amp;gt;] ? do_invalidatepage+0x25/0x30^M
 [&amp;lt;ffffffff81130802&amp;gt;] ? truncate_inode_page+0xa2/0xc0^M
 [&amp;lt;ffffffff81130baf&amp;gt;] ? truncate_inode_pages_range+0x16f/0x500^M
 [&amp;lt;ffffffff81130fd5&amp;gt;] ? truncate_inode_pages+0x15/0x20^M
 [&amp;lt;ffffffff8119d188&amp;gt;] ? dispose_list+0xe8/0x120^M
 [&amp;lt;ffffffff8119d58a&amp;gt;] ? invalidate_inodes+0xea/0x190^M
 [&amp;lt;ffffffff8118333c&amp;gt;] ? generic_shutdown_super+0x4c/0xe0^M
 [&amp;lt;ffffffff81183436&amp;gt;] ? kill_anon_super+0x16/0x60^M
 [&amp;lt;ffffffffa0c35b2a&amp;gt;] ? lustre_kill_super+0x4a/0x60 [obdclass]^M
 [&amp;lt;ffffffff81183bd7&amp;gt;] ? deactivate_super+0x57/0x80^M
 [&amp;lt;ffffffff811a1c4f&amp;gt;] ? mntput_no_expire+0xbf/0x110^M
 [&amp;lt;ffffffff811a26bb&amp;gt;] ? sys_umount+0x7b/0x3a0^M
 [&amp;lt;ffffffff8100b072&amp;gt;] ? system_call_fastpath+0x16/0x1b^M
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="57201" author="niu" created="Sun, 28 Apr 2013 06:50:53 +0000"  >&lt;p&gt;Thanks, Yujian. This failure is caused by the patch. I&apos;ll update the patch.&lt;/p&gt;</comment>
                            <comment id="57203" author="pjones" created="Sun, 28 Apr 2013 07:44:22 +0000"  >&lt;p&gt;Dropping priority because failures seen testing &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3160&quot; title=&quot;recovery-random-scale test_fail_client_mds: RIP: cl_object_top+0xe/0x150 [obdclass]&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3160&quot;&gt;&lt;del&gt;LU-3160&lt;/del&gt;&lt;/a&gt; patch relate to the patch itself&lt;/p&gt;</comment>
                            <comment id="59768" author="utopiabound" created="Fri, 31 May 2013 16:26:07 +0000"  >&lt;p&gt;&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/84ae39b2-c942-11e2-97fe-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/84ae39b2-c942-11e2-97fe-52540035b04c&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Pulled console logs from Rosso directly:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Lustre: DEBUG MARKER: /usr/sbin/lctl mark -----============= acceptance-small: conf-sanity ============----- Wed May 29 21:46:54 PDT 2013
Lustre: DEBUG MARKER: -----============= acceptance-small: conf-sanity ============----- Wed May 29 21:46:54 PDT 2013
Lustre: DEBUG MARKER: /usr/sbin/lctl mark excepting tests: 32newtarball 59 64 57b 50h
Lustre: DEBUG MARKER: excepting tests: 32newtarball 59 64 57b 50h
Lustre: DEBUG MARKER: /usr/sbin/lctl mark skipping tests SLOW=no: 30a 31 45
Lustre: DEBUG MARKER: skipping tests SLOW=no: 30a 31 45
Lustre: DEBUG MARKER: grep -c /mnt/ost1&apos; &apos; /proc/mounts
Lustre: DEBUG MARKER: umount -d -f /mnt/ost1
Lustre: lustre-OST0000 is waiting for obd_unlinked_exports more than 8 seconds. The obd refcount = 3. Is it stuck?
Lustre: server umount lustre-OST0000 complete
Lustre: DEBUG MARKER: lsmod | grep lnet &amp;gt; /dev/null &amp;amp;&amp;amp; lctl dl | grep &apos; ST &apos;
Lustre: DEBUG MARKER: grep -c /mnt/ost2&apos; &apos; /proc/mounts
Lustre: DEBUG MARKER: umount -d -f /mnt/ost2
Lustre: server umount lustre-OST0001 complete
Lustre: DEBUG MARKER: lsmod | grep lnet &amp;gt; /dev/null &amp;amp;&amp;amp; lctl dl | grep &apos; ST &apos;
Lustre: DEBUG MARKER: grep -c /mnt/ost3&apos; &apos; /proc/mounts
Lustre: DEBUG MARKER: umount -d -f /mnt/ost3
Lustre: lustre-OST0002 is waiting for obd_unlinked_exports more than 8 seconds. The obd refcount = 5. Is it stuck?
Lustre: 5845:0:(client.c:1868:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1369889304/real 1369889304]  req@ffff8800633a8000 x1436410772180168/t0(0) o38-&amp;gt;lustre-MDT0000-lwp-OST0004@10.10.17.6@tcp:12/10 lens 400/544 e 0 to 1 dl 1369889329 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1
Lustre: 5845:0:(client.c:1868:ptlrpc_expire_one_request()) Skipped 39 previous similar messages
Lustre: lustre-OST0002 is waiting for obd_unlinked_exports more than 16 seconds. The obd refcount = 5. Is it stuck?
Lustre: lustre-OST0002 is waiting for obd_unlinked_exports more than 32 seconds. The obd refcount = 5. Is it stuck?
Lustre: lustre-OST0002 is waiting for obd_unlinked_exports more than 64 seconds. The obd refcount = 5. Is it stuck?
INFO: task umount:21142 blocked for more than 120 seconds.
&quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
umount        D 0000000000000000     0 21142  21141 0x00000080
 ffff88006bb17aa8 0000000000000086 ffffffff00000010 ffff88006bb17a58
 ffff88006bb17a18 ffff88004943e800 ffffffffa078c44c 0000000000000000
 ffff880067569098 ffff88006bb17fd8 000000000000fb88 ffff880067569098
Call Trace:
 [&amp;lt;ffffffff8150ed82&amp;gt;] schedule_timeout+0x192/0x2e0
 [&amp;lt;ffffffff810810e0&amp;gt;] ? process_timeout+0x0/0x10
 [&amp;lt;ffffffffa05d56bd&amp;gt;] cfs_schedule_timeout_and_set_state+0x1d/0x20 [libcfs]
 [&amp;lt;ffffffffa070c848&amp;gt;] obd_exports_barrier+0x98/0x170 [obdclass]
 [&amp;lt;ffffffffa0e44a72&amp;gt;] ofd_device_fini+0x42/0x230 [ofd]
 [&amp;lt;ffffffffa0739b57&amp;gt;] class_cleanup+0x577/0xda0 [obdclass]
 [&amp;lt;ffffffffa070eae6&amp;gt;] ? class_name2dev+0x56/0xe0 [obdclass]
 [&amp;lt;ffffffffa073b43c&amp;gt;] class_process_config+0x10bc/0x1c80 [obdclass]
 [&amp;lt;ffffffffa0734c63&amp;gt;] ? lustre_cfg_new+0x353/0x7e0 [obdclass]
 [&amp;lt;ffffffffa073c179&amp;gt;] class_manual_cleanup+0x179/0x6f0 [obdclass]
 [&amp;lt;ffffffffa070eae6&amp;gt;] ? class_name2dev+0x56/0xe0 [obdclass]
 [&amp;lt;ffffffffa0770e7c&amp;gt;] server_put_super+0x5bc/0xf00 [obdclass]
 [&amp;lt;ffffffff8118334b&amp;gt;] generic_shutdown_super+0x5b/0xe0
 [&amp;lt;ffffffff81183436&amp;gt;] kill_anon_super+0x16/0x60
 [&amp;lt;ffffffffa073dfd6&amp;gt;] lustre_kill_super+0x36/0x60 [obdclass]
 [&amp;lt;ffffffff81183bd7&amp;gt;] deactivate_super+0x57/0x80
 [&amp;lt;ffffffff811a1c4f&amp;gt;] mntput_no_expire+0xbf/0x110
 [&amp;lt;ffffffff811a26bb&amp;gt;] sys_umount+0x7b/0x3a0
 [&amp;lt;ffffffff8100b072&amp;gt;] system_call_fastpath+0x16/0x1b
Lustre: lustre-OST0002 is waiting for obd_unlinked_exports more than 128 seconds. The obd refcount = 5. Is it stuck?
INFO: task umount:21142 blocked for more than 120 seconds.
&quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
umount        D 0000000000000000     0 21142  21141 0x00000080
 ffff88006bb17aa8 0000000000000086 ffffffff00000010 ffff88006bb17a58
 ffff88006bb17a18 ffff88004943e800 ffffffffa078c44c 0000000000000000
 ffff880067569098 ffff88006bb17fd8 000000000000fb88 ffff880067569098
Call Trace:
 [&amp;lt;ffffffff8150ed82&amp;gt;] schedule_timeout+0x192/0x2e0
 [&amp;lt;ffffffff810810e0&amp;gt;] ? process_timeout+0x0/0x10
 [&amp;lt;ffffffffa05d56bd&amp;gt;] cfs_schedule_timeout_and_set_state+0x1d/0x20 [libcfs]
 [&amp;lt;ffffffffa070c848&amp;gt;] obd_exports_barrier+0x98/0x170 [obdclass]
 [&amp;lt;ffffffffa0e44a72&amp;gt;] ofd_device_fini+0x42/0x230 [ofd]
 [&amp;lt;ffffffffa0739b57&amp;gt;] class_cleanup+0x577/0xda0 [obdclass]
 [&amp;lt;ffffffffa070eae6&amp;gt;] ? class_name2dev+0x56/0xe0 [obdclass]
 [&amp;lt;ffffffffa073b43c&amp;gt;] class_process_config+0x10bc/0x1c80 [obdclass]
 [&amp;lt;ffffffffa0734c63&amp;gt;] ? lustre_cfg_new+0x353/0x7e0 [obdclass]
 [&amp;lt;ffffffffa073c179&amp;gt;] class_manual_cleanup+0x179/0x6f0 [obdclass]
 [&amp;lt;ffffffffa070eae6&amp;gt;] ? class_name2dev+0x56/0xe0 [obdclass]
 [&amp;lt;ffffffffa0770e7c&amp;gt;] server_put_super+0x5bc/0xf00 [obdclass]
 [&amp;lt;ffffffff8118334b&amp;gt;] generic_shutdown_super+0x5b/0xe0
 [&amp;lt;ffffffff81183436&amp;gt;] kill_anon_super+0x16/0x60
 [&amp;lt;ffffffffa073dfd6&amp;gt;] lustre_kill_super+0x36/0x60 [obdclass]
 [&amp;lt;ffffffff81183bd7&amp;gt;] deactivate_super+0x57/0x80
 [&amp;lt;ffffffff811a1c4f&amp;gt;] mntput_no_expire+0xbf/0x110
 [&amp;lt;ffffffff811a26bb&amp;gt;] sys_umount+0x7b/0x3a0
 [&amp;lt;ffffffff8100b072&amp;gt;] system_call_fastpath+0x16/0x1b
Lustre: lustre-OST0002 is waiting for obd_unlinked_exports more than 256 seconds. The obd refcount = 5. Is it stuck?
Lustre: 5845:0:(client.c:1868:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1369889919/real 1369889919]  req@ffff88006eacd000 x1436410772180668/t0(0) o38-&amp;gt;lustre-MDT0000-lwp-OST0003@10.10.17.6@tcp:12/10 lens 400/544 e 0 to 1 dl 1369889945
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="59770" author="utopiabound" created="Fri, 31 May 2013 16:37:11 +0000"  >&lt;p&gt;This bug seems related to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2015&quot; title=&quot;Test failure on test suite obdfilter-survey, subtest test_3a&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2015&quot;&gt;&lt;del&gt;LU-2015&lt;/del&gt;&lt;/a&gt; and &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2939&quot; title=&quot;Lustre: MGS is waiting for obd_unlinked_exports more than 256 seconds. The obd refcount = 5. Is it stuck?&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2939&quot;&gt;&lt;del&gt;LU-2939&lt;/del&gt;&lt;/a&gt; but the patch for those MGS ref count issues has landed without fixing this issue.&lt;/p&gt;</comment>
                            <comment id="59911" author="keith" created="Mon, 3 Jun 2013 18:26:49 +0000"  >&lt;p&gt;&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/e980c046-caee-11e2-95b5-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/e980c046-caee-11e2-95b5-52540035b04c&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Still an issue as of the 1st. &lt;/p&gt;

&lt;p&gt;conf-sanity has no logs and reports 100/100 failure rate.  This was on zfs. &lt;/p&gt;</comment>
                            <comment id="60050" author="utopiabound" created="Wed, 5 Jun 2013 18:12:01 +0000"  >&lt;p&gt;&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/86686766-cd5f-11e2-a1e0-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/86686766-cd5f-11e2-a1e0-52540035b04c&lt;/a&gt;&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;04:38:20:Lustre: DEBUG MARKER: umount -d /mnt/ost4
04:38:20:Lustre: Failing over lustre-OST0003
04:38:20:Lustre: lustre-OST0003: Not available for connect from 10.10.16.173@tcp (stopping)
04:38:20:Lustre: Skipped 1 previous similar message
04:38:20:Lustre: lustre-OST0003 is waiting for obd_unlinked_exports more than 8 seconds. The obd refcount = 5. Is it stuck?
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;trimmed OST debug log:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;(genops.c:1581:obd_exports_barrier()) lustre-OST0003 is waiting for obd_unlinked_exports more than 8 seconds. The obd refcount = 5. Is it stuck?
(genops.c:1541:print_export_data()) lustre-OST0003: UNLINKED ffff88003890ac00 lustre-MDT0000-mdtlov_UUID 10.10.16.170@tcp 1 (0 0 0) 1 0 1 0: (null)  0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="61633" author="bfaccini" created="Tue, 2 Jul 2013 11:57:22 +0000"  >&lt;p&gt;Just got an other occurence of such hang/loop in obd_exports_barrier() on OSS causing auto-test at &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/67d6eaea-e28e-11e2-b3c9-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/67d6eaea-e28e-11e2-b3c9-52540035b04c&lt;/a&gt; to timeout.&lt;/p&gt;

&lt;p&gt;Looks like an export waits for ever to be unlinked from obd_unlinked_exports object&apos;s list. BTW, looking in the source it&apos;s not obvious for me where this should happen. Any advice ?&lt;/p&gt;

&lt;p&gt;Here a some logs/traces extracts from the failing test.&lt;/p&gt;

&lt;p&gt;Test log :&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;== replay-single test 90: lfs find identifies the missing striped file segments == 08:45:48 (1372693548)
Create the files
CMD: wtm-13vm4 lctl get_param -n obdfilter.lustre-OST0004.uuid
Fail ost5 lustre-OST0004_UUID, display the list of affected files
CMD: wtm-13vm4 grep -c /mnt/ost5&apos; &apos; /proc/mounts
Stopping /mnt/ost5 (opts:) on wtm-13vm4
CMD: wtm-13vm4 umount -d /mnt/ost5  &amp;lt;&amp;lt;&amp;lt;---- Stuck here
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;OSS debug_log :&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000020:00080000:0.0:1372693555.591347:0:9638:0:(genops.c:1535:print_export_data()) lustre-OST0004: ACTIVE ffff8800720a9800 lustre-MDT0000-mdtlov_UUID 10.10.16.143@tcp 3 (0 0 0) 0 0 0 0: (null)  0
00000020:00080000:0.0:1372693555.591350:0:9638:0:(genops.c:1535:print_export_data()) lustre-OST0004: ACTIVE ffff880068d27400 480e6903-ca7b-ddb8-25aa-8c792a44efa7 10.10.16.145@tcp 3 (0 0 0) 0 0 0 0: (null)  0
00000020:00080000:0.0:1372693555.591352:0:9638:0:(genops.c:1535:print_export_data()) lustre-OST0004: ACTIVE ffff88004462d800 87ca4efc-b558-d054-8613-d9adf4621433 10.10.16.146@tcp 5 (0 0 2) 0 0 0 0: (null)  0
00000020:00080000:0.0:1372693555.591354:0:9638:0:(genops.c:1535:print_export_data()) lustre-OST0004: ACTIVE ffff88006ab4d800 lustre-OST0004_UUID (no nid) 1 (0 0 0) 0 0 0 0: (null)  0
00000020:00080000:0.0:1372693555.591356:0:9638:0:(genops.c:1535:print_export_data()) lustre-OST0004: UNLINKED ffff88002030d800 lustre-MDT0000-mdtlov_UUID 10.10.16.143@tcp 1 (0 0 0) 1 0 1 0: (null)  0
00000020:00080000:0.0:1372693555.591357:0:9638:0:(genops.c:1308:class_disconnect_exports()) OBD device 15 (ffff88006aa343f8) has exports, disconnecting them
00000020:00080000:0.0:1372693555.591359:0:9638:0:(genops.c:1283:class_disconnect_export_list()) lustre-OST0004: disconnecting export at 10.10.16.143@tcp (ffff8800720a9800), last request at 1372693545
00000100:00080000:0.0:1372693555.591366:0:9638:0:(import.c:1512:ptlrpc_cleanup_imp()) ffff88001973f000 �: changing import state from FULL to CLOSED
00000020:00080000:0.0:1372693555.591377:0:9638:0:(genops.c:1288:class_disconnect_export_list()) disconnected export at 10.10.16.143@tcp (ffff8800720a9800): rc 0
00000020:00080000:0.0:1372693555.591379:0:9638:0:(genops.c:1283:class_disconnect_export_list()) lustre-OST0004: disconnecting export at 10.10.16.145@tcp (ffff880068d27400), last request at 1372693549
00000100:00080000:0.0:1372693555.591381:0:9638:0:(import.c:1512:ptlrpc_cleanup_imp()) ffff8800190a4000 �: changing import state from FULL to CLOSED
00000020:00080000:0.0:1372693555.591386:0:9638:0:(genops.c:1288:class_disconnect_export_list()) disconnected export at 10.10.16.145@tcp (ffff880068d27400): rc 0
00000020:00080000:0.0:1372693555.591388:0:9638:0:(genops.c:1283:class_disconnect_export_list()) lustre-OST0004: disconnecting export at 10.10.16.146@tcp (ffff88004462d800), last request at 1372693549
00000100:00080000:0.0:1372693555.591389:0:9638:0:(import.c:1512:ptlrpc_cleanup_imp()) ffff880019151800 �: changing import state from FULL to CLOSED
00010000:00010000:0.0:1372693555.591393:0:9638:0:(ldlm_lock.c:2218:ldlm_cancel_locks_for_export_cb()) ### export ffff88004462d800 ns: filter-lustre-OST0004_UUID lock: ffff88007a1969c0/0xf03433df160b73d8 lrc: 4/0,0 mode: PR/PR res: [0x2d02:0x0:0x0].0 rrc: 2 type: EXT [0-&amp;gt;18446744073709551615] (req 0-&amp;gt;18446744073709551615) flags: 0x40000000000000 nid: 10.10.16.146@tcp remote: 0xe2b9ddc0f9be730c expref: 4 pid: 10014 timeout: 0 lvb_type: 1
00010000:00010000:0.0:1372693555.591417:0:9638:0:(ldlm_lock.c:219:ldlm_lock_put()) ### final lock_put on destroyed lock, freeing it. ns: filter-lustre-OST0004_UUID lock: ffff88007a1969c0/0xf03433df160b73d8 lrc: 0/0,0 mode: --/PR res: [0x2d02:0x0:0x0].0 rrc: 1 type: EXT [0-&amp;gt;18446744073709551615] (req 0-&amp;gt;18446744073709551615) flags: 0x44801000000000 nid: 10.10.16.146@tcp remote: 0xe2b9ddc0f9be730c expref: 4 pid: 10014 timeout: 0 lvb_type: 1
00010000:00010000:0.0:1372693555.591424:0:9638:0:(ldlm_lock.c:2218:ldlm_cancel_locks_for_export_cb()) ### export ffff88004462d800 ns: filter-lustre-OST0004_UUID lock: ffff880024fe3280/0xf03433df160b741e lrc: 4/0,0 mode: PR/PR res: [0x2d03:0x0:0x0].0 rrc: 2 type: EXT [0-&amp;gt;18446744073709551615] (req 0-&amp;gt;18446744073709551615) flags: 0x40000000000000 nid: 10.10.16.146@tcp remote: 0xe2b9ddc0f9be74c5 expref: 3 pid: 7587 timeout: 0 lvb_type: 1
00010000:00010000:0.0:1372693555.591434:0:9638:0:(ldlm_lock.c:219:ldlm_lock_put()) ### final lock_put on destroyed lock, freeing it. ns: filter-lustre-OST0004_UUID lock: ffff880024fe3280/0xf03433df160b741e lrc: 0/0,0 mode: --/PR res: [0x2d03:0x0:0x0].0 rrc: 1 type: EXT [0-&amp;gt;18446744073709551615] (req 0-&amp;gt;18446744073709551615) flags: 0x44801000000000 nid: 10.10.16.146@tcp remote: 0xe2b9ddc0f9be74c5 expref: 3 pid: 7587 timeout: 0 lvb_type: 1
00000020:00080000:0.0:1372693555.591440:0:9638:0:(genops.c:1288:class_disconnect_export_list()) disconnected export at 10.10.16.146@tcp (ffff88004462d800): rc 0
00000020:00080000:0.0:1372693555.591442:0:9638:0:(genops.c:1271:class_disconnect_export_list()) exp ffff88006ab4d800 export uuid == obd uuid, don&apos;t discon
00000020:00080000:0.0:1372693555.591444:0:9638:0:(genops.c:1313:class_disconnect_exports()) OBD device 15 (ffff88006aa343f8) has no exports

and then several of the following with increasing time

00000020:02000400:0.0:1372693569.591340:0:9638:0:(genops.c:1575:obd_exports_barrier()) lustre-OST0004 is waiting for obd_unlinked_exports more than 8 seconds. The obd refcount = 5. Is it stuck?
00000020:00080000:0.0:1372693569.594019:0:9638:0:(genops.c:1535:print_export_data()) lustre-OST0004: UNLINKED ffff88002030d800 lustre-MDT0000-mdtlov_UUID 10.10.16.143@tcp 1 (0 0 0) 1 0 1 0: (null)  0

.....

00000020:02000400:0.0:1372693585.593287:0:9638:0:(genops.c:1575:obd_exports_barrier()) lustre-OST0004 is waiting for obd_unlinked_exports more than 16 seconds. The obd refcount = 5. Is it stuck?
00000020:00080000:0.0:1372693585.594530:0:9638:0:(genops.c:1535:print_export_data()) lustre-OST0004: UNLINKED ffff88002030d800 lustre-MDT0000-mdtlov_UUID 10.10.16.143@tcp 1 (0 0 0) 1 0 1 0: (null)  0

.....

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;OSS syslog :&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;.....
Jul  1 08:32:53 wtm-13vm4 mrshd[31217]: root@wtm-13vm6.rosso.whamcloud.com as root: cmd=&apos;/usr/sbin/lctl mark &quot;/usr/sbin/lctl mark test_70b fail mds1 1 times&quot;;echo XXRETCODE:$?&apos;
Jul  1 08:32:53 wtm-13vm4 kernel: Lustre: DEBUG MARKER: /usr/sbin/lctl mark test_70b fail mds1 1 times
Jul  1 08:32:53 wtm-13vm4 xinetd[1761]: EXIT: mshell status=0 pid=31216 duration=0(sec)
Jul  1 08:32:53 wtm-13vm4 xinetd[1761]: START: mshell pid=31239 from=::ffff:10.10.16.146
Jul  1 08:32:53 wtm-13vm4 mrshd[31240]: root@wtm-13vm6.rosso.whamcloud.com as root: cmd=&apos;(PATH=$PATH:/usr/lib64/lustre/utils:/usr/lib64/lustre/tests:/sbin:/usr/sbin; cd /usr/lib64/lustre/tests; LUSTRE=&quot;/usr/lib64/lustre&quot;  MGSFSTYPE=zfs MDSFSTYPE=zfs OSTFSTYPE=zfs FSTYPE=zfs sh -c &quot;/usr/sbin/lctl mark test_70b fail mds1 1 times&quot;);echo XXRETCODE:$?&apos;
Jul  1 08:32:53 wtm-13vm4 kernel: Lustre: DEBUG MARKER: test_70b fail mds1 1 times
Jul  1 08:32:53 wtm-13vm4 xinetd[1761]: EXIT: mshell status=0 pid=31239 duration=0(sec)
Jul  1 08:32:56 wtm-13vm4 /usr/sbin/gmond[1777]: Error 1 sending the modular data for cpu_wio#012
Jul  1 08:33:10 wtm-13vm4 /usr/sbin/gmond[1777]: Error 1 sending the modular data for proc_run#012
Jul  1 08:33:11 wtm-13vm4 /usr/sbin/gmond[1777]: Error 1 sending the modular data for bytes_out#012
Jul  1 08:33:11 wtm-13vm4 /usr/sbin/gmond[1777]: Error 1 sending the modular data for pkts_out#012
Jul  1 08:33:16 wtm-13vm4 /usr/sbin/gmond[1777]: Error 1 sending the modular data for cpu_user#012
Jul  1 08:33:16 wtm-13vm4 /usr/sbin/gmond[1777]: Error 1 sending the modular data for cpu_idle#012
Jul  1 08:33:16 wtm-13vm4 /usr/sbin/gmond[1777]: Error 1 sending the modular data for cpu_aidle#012
Jul  1 08:33:36 wtm-13vm4 /usr/sbin/gmond[1777]: Error 1 sending the modular data for cpu_user#012
Jul  1 08:33:36 wtm-13vm4 /usr/sbin/gmond[1777]: Error 1 sending the modular data for cpu_idle#012
Jul  1 08:33:36 wtm-13vm4 /usr/sbin/gmond[1777]: Error 1 sending the modular data for load_one#012
Jul  1 08:33:36 wtm-13vm4 kernel: Lustre: lustre-OST0004: Export ffff88002030d800 already connecting from 10.10.16.143@tcp
Jul  1 08:33:36 wtm-13vm4 kernel: Lustre: lustre-OST0005: Export ffff88006c972400 already connecting from 10.10.16.143@tcp
Jul  1 08:33:39 wtm-13vm4 kernel: Lustre: 5417:0:(client.c:1868:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1372692794/real 1372692794]  req@ffff880070a3bc00 x1439352804834872/t0(0) o38-&amp;gt;lustre-MDT0000-lwp-OST0001@10.10.16.143@tcp:12/10 lens 400/544 e 0 to 1 dl 1372692819 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1
Jul  1 08:33:39 wtm-13vm4 kernel: Lustre: 5417:0:(client.c:1868:ptlrpc_expire_one_request()) Skipped 34 previous similar messages
Jul  1 08:33:41 wtm-13vm4 kernel: Lustre: lustre-OST0004: Export ffff88002030d800 already connecting from 10.10.16.143@tcp
Jul  1 08:33:46 wtm-13vm4 kernel: Lustre: lustre-OST0004: Export ffff88002030d800 already connecting from 10.10.16.143@tcp
Jul  1 08:33:46 wtm-13vm4 kernel: Lustre: Skipped 1 previous similar message
.....
Jul  1 08:33:56 wtm-13vm4 kernel: Lustre: lustre-OST0004: Export ffff88002030d800 already connecting from 10.10.16.143@tcp
Jul  1 08:33:56 wtm-13vm4 kernel: Lustre: Skipped 1 previous similar message
Jul  1 08:34:07 wtm-13vm4 kernel: Lustre: 7568:0:(service.c:2031:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (20:16s); client may timeout.  req@ffff880044924800 x1439352772828224/t0(0) o8-&amp;gt;lustre-MDT0000-mdtlov_UUID@10.10.16.143@tcp:0/0 lens 400/264 e 0 to 0 dl 1372692831 ref 1 fl Complete:/0/0 rc 0/0
Jul  1 08:34:16 wtm-13vm4 /usr/sbin/gmond[1777]: Error 1 sending the modular data for cpu_nice#012
Jul  1 08:34:20 wtm-13vm4 kernel: Lustre: lustre-OST0004: haven&apos;t heard from client lustre-MDT0000-mdtlov_UUID (at (no nid)) in 49 seconds. I think it&apos;s dead, and I am evicting it. exp ffff88002030d800, cur 1372692860 expire 1372692830 last 1372692811
.....
Jul  1 08:45:50 wtm-13vm4 mrshd[9593]: root@wtm-13vm6.rosso.whamcloud.com as root: cmd=&apos;/usr/sbin/lctl mark &quot;umount -d /mnt/ost5&quot;;echo XXRETCODE:$?&apos;
Jul  1 08:45:50 wtm-13vm4 kernel: Lustre: DEBUG MARKER: umount -d /mnt/ost5
Jul  1 08:45:50 wtm-13vm4 xinetd[1761]: EXIT: mshell status=0 pid=9592 duration=0(sec)
Jul  1 08:45:50 wtm-13vm4 xinetd[1761]: START: mshell pid=9615 from=::ffff:10.10.16.146
Jul  1 08:45:50 wtm-13vm4 mrshd[9616]: root@wtm-13vm6.rosso.whamcloud.com as root: cmd=&apos;(PATH=$PATH:/usr/lib64/lustre/utils:/usr/lib64/lustre/tests:/sbin:/usr/sbin; cd /usr/lib64/lustre/tests; LUSTRE=&quot;/usr/lib64/lustre&quot; sh -c &quot;umount -d /mnt/ost5&quot;);echo XXRETCODE:$?&apos;
Jul  1 08:45:50 wtm-13vm4 kernel: Lustre: Failing over lustre-OST0004
Jul  1 08:45:51 wtm-13vm4 /usr/sbin/gmond[1777]: Error 1 sending the modular data for mem_free#012
Jul  1 08:45:51 wtm-13vm4 /usr/sbin/gmond[1777]: Error 1 sending the modular data for mem_shared#012
Jul  1 08:45:51 wtm-13vm4 /usr/sbin/gmond[1777]: Error 1 sending the modular data for mem_cached#012
Jul  1 08:45:51 wtm-13vm4 /usr/sbin/gmond[1777]: Error 1 sending the modular data for swap_free#012
Jul  1 08:45:51 wtm-13vm4 /usr/sbin/gmond[1777]: Error 1 sending the modular data for bytes_in#012
Jul  1 08:45:56 wtm-13vm4 /usr/sbin/gmond[1777]: Error 1 sending the modular data for load_five#012
Jul  1 08:45:56 wtm-13vm4 /usr/sbin/gmond[1777]: Error 1 sending the modular data for load_fifteen#012
Jul  1 08:45:56 wtm-13vm4 /usr/sbin/gmond[1777]: Error 1 sending the modular data for cpu_user#012
Jul  1 08:45:58 wtm-13vm4 kernel: Lustre: lustre-OST0004: Not available for connect from 10.10.16.143@tcp (stopping)
Jul  1 08:45:58 wtm-13vm4 kernel: Lustre: Skipped 1 previous similar message
Jul  1 08:46:03 wtm-13vm4 kernel: Lustre: lustre-OST0004: Not available for connect from 10.10.16.143@tcp (stopping)
Jul  1 08:46:03 wtm-13vm4 kernel: Lustre: Skipped 2 previous similar messages
Jul  1 08:46:08 wtm-13vm4 kernel: Lustre: lustre-OST0004: Not available for connect from 10.10.16.143@tcp (stopping)
Jul  1 08:46:08 wtm-13vm4 kernel: Lustre: Skipped 2 previous similar messages
Jul  1 08:46:09 wtm-13vm4 kernel: Lustre: lustre-OST0004 is waiting for obd_unlinked_exports more than 8 seconds. The obd refcount = 5. Is it stuck?
Jul  1 08:46:16 wtm-13vm4 /usr/sbin/gmond[1777]: Error 1 sending the modular data for cpu_user#012
Jul  1 08:46:16 wtm-13vm4 /usr/sbin/gmond[1777]: Error 1 sending the modular data for cpu_system#012
Jul  1 08:46:16 wtm-13vm4 /usr/sbin/gmond[1777]: Error 1 sending the modular data for cpu_idle#012
Jul  1 08:46:18 wtm-13vm4 kernel: Lustre: lustre-OST0004: Not available for connect from 10.10.16.143@tcp (stopping)
Jul  1 08:46:18 wtm-13vm4 kernel: Lustre: Skipped 5 previous similar messages
Jul  1 08:46:25 wtm-13vm4 kernel: Lustre: lustre-OST0004 is waiting for obd_unlinked_exports more than 16 seconds. The obd refcount = 5. Is it stuck?
....

then several stacks are dumped for the same hung process umount/9638, like the following:

Jul  1 08:52:18 wtm-13vm4 kernel: INFO: task umount:9638 blocked for more than 120 seconds.
Jul  1 08:52:18 wtm-13vm4 kernel: &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
Jul  1 08:52:18 wtm-13vm4 kernel: umount        D 0000000000000000     0  9638   9637 0x00000080
Jul  1 08:52:18 wtm-13vm4 kernel: ffff8800194b3aa8 0000000000000082 ffffffff00000010 ffff8800194b3a58
Jul  1 08:52:18 wtm-13vm4 kernel: ffff8800194b3a18 ffff88002030d800 ffffffffa078e0ff 0000000000000000
Jul  1 08:52:18 wtm-13vm4 kernel: ffff880019e91ab8 ffff8800194b3fd8 000000000000fb88 ffff880019e91ab8
Jul  1 08:52:18 wtm-13vm4 kernel: Call Trace:
Jul  1 08:52:18 wtm-13vm4 kernel: [&amp;lt;ffffffff8150ee02&amp;gt;] schedule_timeout+0x192/0x2e0
Jul  1 08:52:18 wtm-13vm4 kernel: [&amp;lt;ffffffff810810e0&amp;gt;] ? process_timeout+0x0/0x10
Jul  1 08:52:18 wtm-13vm4 kernel: [&amp;lt;ffffffffa05d662d&amp;gt;] cfs_schedule_timeout_and_set_state+0x1d/0x20 [libcfs]
Jul  1 08:52:18 wtm-13vm4 kernel: [&amp;lt;ffffffffa070f4d8&amp;gt;] obd_exports_barrier+0x98/0x170 [obdclass]
Jul  1 08:52:18 wtm-13vm4 kernel: [&amp;lt;ffffffffa0e42962&amp;gt;] ofd_device_fini+0x42/0x230 [ofd]
Jul  1 08:52:18 wtm-13vm4 kernel: [&amp;lt;ffffffffa073c2e7&amp;gt;] class_cleanup+0x577/0xda0 [obdclass]
Jul  1 08:52:18 wtm-13vm4 kernel: [&amp;lt;ffffffffa0711686&amp;gt;] ? class_name2dev+0x56/0xe0 [obdclass]
Jul  1 08:52:18 wtm-13vm4 kernel: [&amp;lt;ffffffffa073dbcc&amp;gt;] class_process_config+0x10bc/0x1c80 [obdclass]
Jul  1 08:52:18 wtm-13vm4 kernel: [&amp;lt;ffffffffa07375b3&amp;gt;] ? lustre_cfg_new+0x2d3/0x6e0 [obdclass]
Jul  1 08:52:18 wtm-13vm4 kernel: [&amp;lt;ffffffffa073e909&amp;gt;] class_manual_cleanup+0x179/0x6f0 [obdclass]
Jul  1 08:52:18 wtm-13vm4 kernel: [&amp;lt;ffffffffa0711686&amp;gt;] ? class_name2dev+0x56/0xe0 [obdclass]
Jul  1 08:52:18 wtm-13vm4 kernel: [&amp;lt;ffffffffa077845c&amp;gt;] server_put_super+0x5ec/0xf60 [obdclass]
Jul  1 08:52:18 wtm-13vm4 kernel: [&amp;lt;ffffffff811833ab&amp;gt;] generic_shutdown_super+0x5b/0xe0
Jul  1 08:52:18 wtm-13vm4 kernel: [&amp;lt;ffffffff81183496&amp;gt;] kill_anon_super+0x16/0x60
Jul  1 08:52:18 wtm-13vm4 kernel: [&amp;lt;ffffffffa0740756&amp;gt;] lustre_kill_super+0x36/0x60 [obdclass]
Jul  1 08:52:18 wtm-13vm4 kernel: [&amp;lt;ffffffff81183c37&amp;gt;] deactivate_super+0x57/0x80
Jul  1 08:52:18 wtm-13vm4 kernel: [&amp;lt;ffffffff811a1c8f&amp;gt;] mntput_no_expire+0xbf/0x110
Jul  1 08:52:18 wtm-13vm4 kernel: [&amp;lt;ffffffff811a26fb&amp;gt;] sys_umount+0x7b/0x3a0
Jul  1 08:52:18 wtm-13vm4 kernel: [&amp;lt;ffffffff8100b072&amp;gt;] system_call_fastpath+0x16/0x1b

and until auto-test detects a time-out and requests all threads to dump their stacks, where nothing interesting is found, only the concerned export address (ffff88002030d800) being present in the ll_evictor thread context/stack:

Jul  1 09:50:26 wtm-13vm4 kernel: ll_evictor    S 0000000000000000     0  5227      2 0x00000080
Jul  1 09:50:26 wtm-13vm4 kernel: ffff88006d29fe00 0000000000000046 0000000000000000 0000000000000002
Jul  1 09:50:26 wtm-13vm4 kernel: ffff88006d29fd80 ffff88002030d800 000000000000000a 0000000000000000
Jul  1 09:50:26 wtm-13vm4 kernel: ffff880069e9e638 ffff88006d29ffd8 000000000000fb88 ffff880069e9e638
Jul  1 09:50:26 wtm-13vm4 kernel: Call Trace:
Jul  1 09:50:26 wtm-13vm4 kernel: [&amp;lt;ffffffffa05d666e&amp;gt;] cfs_waitq_wait+0xe/0x10 [libcfs]
Jul  1 09:50:26 wtm-13vm4 kernel: [&amp;lt;ffffffffa08ea98b&amp;gt;] ping_evictor_main+0x4ab/0x640 [ptlrpc]
Jul  1 09:50:26 wtm-13vm4 kernel: [&amp;lt;ffffffff81063310&amp;gt;] ? default_wake_function+0x0/0x20
Jul  1 09:50:26 wtm-13vm4 kernel: [&amp;lt;ffffffffa08ea4e0&amp;gt;] ? ping_evictor_main+0x0/0x640 [ptlrpc]
Jul  1 09:50:26 wtm-13vm4 kernel: [&amp;lt;ffffffff81096936&amp;gt;] kthread+0x96/0xa0
Jul  1 09:50:26 wtm-13vm4 kernel: [&amp;lt;ffffffff8100c0ca&amp;gt;] child_rip+0xa/0x20
Jul  1 09:50:26 wtm-13vm4 kernel: [&amp;lt;ffffffff810968a0&amp;gt;] ? kthread+0x0/0xa0
Jul  1 09:50:26 wtm-13vm4 kernel: [&amp;lt;ffffffff8100c0c0&amp;gt;] ? child_rip+0x0/0x20

and still with the hung umount/9638 process !!

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;So it seems that the orphan export still on the unlinked list and preventing OST umount comes from an MDS eviction during a previous test (test_70b).&lt;/p&gt;</comment>
                            <comment id="61896" author="bfaccini" created="Mon, 8 Jul 2013 16:18:09 +0000"  >&lt;p&gt;Same at &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/229cc1ca-e5d6-11e2-a8a4-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/229cc1ca-e5d6-11e2-a8a4-52540035b04c&lt;/a&gt;, again for review-zfs/replay-single/test_90 where the orphan/unlink export seems to be the one for the MDS-lov that was evicted from OSS in earlier test_70b.&lt;br/&gt;
Could be interesting to isolate test_70b and get its OSS full debug_log ...&lt;/p&gt;</comment>
                            <comment id="62127" author="utopiabound" created="Thu, 11 Jul 2013 16:37:50 +0000"  >&lt;p&gt;Bruno, I agree it&apos;s definitely the lustre-MDT0000-mdtlov export that has an extra reference.&lt;/p&gt;

&lt;p&gt;oss debug log from one of the replay-single/90 failures:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000020:00080000:0.0:1373044595.526429:0:6742:0:(genops.c:1535:print_export_data()) lustre-OST0002: ACTIVE ffff880071708000 lustre-MDT0000-mdtlov_UUID 10.10.16.107@tcp 3 (0 0 0) 0 0 0 0: (null)  0
00000020:00080000:0.0:1373044595.526433:0:6742:0:(genops.c:1535:print_export_data()) lustre-OST0002: ACTIVE ffff8800705e1400 75dd01f6-8a7e-1e4e-7e18-9ea0580f4dfc 10.10.16.109@tcp 3 (0 0 0) 0 0 0 0: (null)  0
00000020:00080000:0.0:1373044595.526435:0:6742:0:(genops.c:1535:print_export_data()) lustre-OST0002: ACTIVE ffff8800692a9000 08b7a486-2e63-5cd2-f09f-fd1b2503a9b4 10.10.16.110@tcp 5 (0 0 2) 0 0 0 0: (null)  0
00000020:00080000:0.0:1373044595.526437:0:6742:0:(genops.c:1535:print_export_data()) lustre-OST0002: ACTIVE ffff88006e732400 lustre-OST0002_UUID (no nid) 1 (0 0 0) 0 0 0 0: (null)  0
00000020:00080000:0.0:1373044595.526440:0:6742:0:(genops.c:1535:print_export_data()) lustre-OST0002: UNLINKED ffff88004d2cfc00 lustre-MDT0000-mdtlov_UUID 10.10.16.107@tcp 1 (0 0 0) 1 0 1 0: (null)  0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This is while the ost is being unmounted.&lt;/p&gt;</comment>
                            <comment id="62151" author="utopiabound" created="Thu, 11 Jul 2013 19:26:29 +0000"  >&lt;p&gt;OSS Console log from replay-single/70b&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;10:06:06:LNet: Service thread pid 5468 was inactive for 62.00s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:
10:06:06:Pid: 5468, comm: ll_ost_io00_000
10:06:06:
10:06:06:Call Trace:
10:06:06: [&amp;lt;ffffffff81096eae&amp;gt;] ? prepare_to_wait_exclusive+0x4e/0x80
10:06:06: [&amp;lt;ffffffffa03e6dad&amp;gt;] cv_wait_common+0xed/0x100 [spl]
10:06:06: [&amp;lt;ffffffff81096ca0&amp;gt;] ? autoremove_wake_function+0x0/0x40
10:06:06: [&amp;lt;ffffffffa03e6e15&amp;gt;] __cv_wait+0x15/0x20 [spl]
10:06:06: [&amp;lt;ffffffffa04e518b&amp;gt;] txg_wait_open+0x7b/0xa0 [zfs]
10:06:06: [&amp;lt;ffffffffa04aa6ad&amp;gt;] dmu_tx_wait+0xed/0xf0 [zfs]
10:06:06: [&amp;lt;ffffffffa04aa736&amp;gt;] dmu_tx_assign+0x86/0x480 [zfs]
10:06:06: [&amp;lt;ffffffffa0dab56c&amp;gt;] osd_trans_start+0x9c/0x410 [osd_zfs]
10:06:06: [&amp;lt;ffffffffa0e58cbd&amp;gt;] ofd_commitrw_write+0x92d/0x11b0 [ofd]
10:06:06: [&amp;lt;ffffffffa0e5b77d&amp;gt;] ofd_commitrw+0x5cd/0xbb0 [ofd]
10:06:06: [&amp;lt;ffffffffa06847e5&amp;gt;] ? lprocfs_counter_add+0x125/0x182 [lvfs]
10:06:06: [&amp;lt;ffffffffa0e101d8&amp;gt;] obd_commitrw+0x128/0x3d0 [ost]
10:06:06: [&amp;lt;ffffffffa0e1a0f1&amp;gt;] ost_brw_write+0xea1/0x15d0 [ost]
10:06:06: [&amp;lt;ffffffff81281536&amp;gt;] ? vsnprintf+0x336/0x5e0
10:06:06: [&amp;lt;ffffffff8100b9ce&amp;gt;] ? common_interrupt+0xe/0x13
10:06:06: [&amp;lt;ffffffffa0897350&amp;gt;] ? target_bulk_timeout+0x0/0xc0 [ptlrpc]
10:06:06: [&amp;lt;ffffffffa0e2033b&amp;gt;] ost_handle+0x3ecb/0x48e0 [ost]
10:06:06: [&amp;lt;ffffffffa05e1d94&amp;gt;] ? libcfs_id2str+0x74/0xb0 [libcfs]
10:06:07: [&amp;lt;ffffffffa08e7378&amp;gt;] ptlrpc_server_handle_request+0x398/0xc60 [ptlrpc]
10:06:07: [&amp;lt;ffffffffa05d654e&amp;gt;] ? cfs_timer_arm+0xe/0x10 [libcfs]
10:06:07: [&amp;lt;ffffffffa05e7a9f&amp;gt;] ? lc_watchdog_touch+0x6f/0x170 [libcfs]
10:06:07: [&amp;lt;ffffffffa08de799&amp;gt;] ? ptlrpc_wait_event+0xa9/0x290 [ptlrpc]
10:06:07: [&amp;lt;ffffffff81055ab3&amp;gt;] ? __wake_up+0x53/0x70
10:06:07: [&amp;lt;ffffffffa08e86fd&amp;gt;] ptlrpc_main+0xabd/0x1700 [ptlrpc]
10:06:07: [&amp;lt;ffffffffa08e7c40&amp;gt;] ? ptlrpc_main+0x0/0x1700 [ptlrpc]
10:06:07: [&amp;lt;ffffffff81096936&amp;gt;] kthread+0x96/0xa0
10:06:07: [&amp;lt;ffffffff8100c0ca&amp;gt;] child_rip+0xa/0x20
10:06:07: [&amp;lt;ffffffff810968a0&amp;gt;] ? kthread+0x0/0xa0
10:06:07: [&amp;lt;ffffffff8100c0c0&amp;gt;] ? child_rip+0x0/0x20
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="62171" author="bfaccini" created="Fri, 12 Jul 2013 08:31:11 +0000"  >&lt;p&gt;Yes, these stacks are suspect, but they are no longer present in the stack listing at the time the umount of the OST in test_90 becomes stuck ...&lt;/p&gt;

&lt;p&gt;Also, the OSS debug_log is not big enough at the time of test_90 failure to also contain the necessary traces from test_70b to debug further ...&lt;/p&gt;

&lt;p&gt;Do you think like me it could be of interest to isolate test_70b in a loop and detect its failure using all OSTs umounts ??&lt;/p&gt;</comment>
                            <comment id="62322" author="utopiabound" created="Mon, 15 Jul 2013 19:52:08 +0000"  >&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/6988&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/6988&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="62951" author="liwei" created="Thu, 25 Jul 2013 01:55:18 +0000"  >&lt;p&gt;&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/9bd8497c-f46f-11e2-b8a2-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/9bd8497c-f46f-11e2-b8a2-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="64076" author="yujian" created="Mon, 12 Aug 2013 12:29:15 +0000"  >&lt;p&gt;The failure occurs consistently on Lustre b2_4 branch while running obdfilter-survey test 3a with FSTYPE=zfs:&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/1c69df6c-025b-11e3-b384-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/1c69df6c-025b-11e3-b384-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/6b8226b2-fd16-11e2-9fdb-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/6b8226b2-fd16-11e2-9fdb-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/e945932a-e0da-11e2-b3fd-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/e945932a-e0da-11e2-b3fd-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/29d52920-dc10-11e2-9082-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/29d52920-dc10-11e2-9082-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/baf535a0-d097-11e2-8fd9-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/baf535a0-d097-11e2-8fd9-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/7a15f554-ce1e-11e2-96ef-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/7a15f554-ce1e-11e2-96ef-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="64623" author="utopiabound" created="Tue, 20 Aug 2013 16:44:41 +0000"  >&lt;p&gt;Landed on master&lt;/p&gt;</comment>
                            <comment id="64625" author="adilger" created="Tue, 20 Aug 2013 16:46:38 +0000"  >&lt;p&gt;Cherry-picked to b2_4 as commit d4532fbf06d392212463355e2ba1525e9aea97eb.&lt;/p&gt;</comment>
                            <comment id="67310" author="yujian" created="Tue, 24 Sep 2013 02:46:15 +0000"  >&lt;p&gt;While testing patch &lt;a href=&quot;http://review.whamcloud.com/7644&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/7644&lt;/a&gt; on master branch, I hit the failure on replay-single test 53e: &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/ddb85db2-208b-11e3-b9bc-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/ddb85db2-208b-11e3-b9bc-52540035b04c&lt;/a&gt;&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;03:55:06:Lustre: DEBUG MARKER: umount -d /mnt/mds1
03:55:06:LustreError: 166-1: MGC10.10.4.154@tcp: Connection to MGS (at 0@lo) was lost; in progress operations using this service will fail
03:55:07:Lustre: MGS is waiting for obd_unlinked_exports more than 8 seconds. The obd refcount = 5. Is it stuck?
03:55:31:Lustre: MGS is waiting for obd_unlinked_exports more than 16 seconds. The obd refcount = 5. Is it stuck?
03:56:05:Lustre: MGS is waiting for obd_unlinked_exports more than 32 seconds. The obd refcount = 5. Is it stuck?
03:57:09:Lustre: MGS is waiting for obd_unlinked_exports more than 64 seconds. The obd refcount = 5. Is it stuck?
03:59:15:Lustre: MGS is waiting for obd_unlinked_exports more than 128 seconds. The obd refcount = 5. Is it stuck?
04:02:31:INFO: task umount:2412 blocked for more than 120 seconds.
04:02:31:&quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
04:02:32:umount        D 0000000000000000     0  2412   2411 0x00000080
04:02:32: ffff88005ead3aa8 0000000000000086 ffff88005ead3a08 ffff880079da1400
04:02:32: ffffffffa062f3e0 0000000000000000 ffff88006684f084 ffffffffa062f3e0
04:02:32: ffff8800567645f8 ffff88005ead3fd8 000000000000fb88 ffff8800567645f8
04:02:32:Call Trace:
04:02:33: [&amp;lt;ffffffff8150f362&amp;gt;] schedule_timeout+0x192/0x2e0
04:02:33: [&amp;lt;ffffffff810811e0&amp;gt;] ? process_timeout+0x0/0x10
04:02:33: [&amp;lt;ffffffffa05b06fb&amp;gt;] obd_exports_barrier+0xab/0x180 [obdclass]
04:02:34: [&amp;lt;ffffffffa0d2852e&amp;gt;] mgs_device_fini+0xfe/0x580 [mgs]
04:02:35: [&amp;lt;ffffffffa05dbc93&amp;gt;] class_cleanup+0x573/0xd30 [obdclass]
04:02:35: [&amp;lt;ffffffffa05b2896&amp;gt;] ? class_name2dev+0x56/0xe0 [obdclass]
04:02:35: [&amp;lt;ffffffffa05dd9ba&amp;gt;] class_process_config+0x156a/0x1ad0 [obdclass]
04:02:35: [&amp;lt;ffffffffa05d6b13&amp;gt;] ? lustre_cfg_new+0x2d3/0x6e0 [obdclass]
04:02:36: [&amp;lt;ffffffffa05de099&amp;gt;] class_manual_cleanup+0x179/0x6f0 [obdclass]
04:02:36: [&amp;lt;ffffffffa05b2896&amp;gt;] ? class_name2dev+0x56/0xe0 [obdclass]
04:02:37: [&amp;lt;ffffffffa061955d&amp;gt;] server_put_super+0x45d/0xf60 [obdclass]
04:02:38: [&amp;lt;ffffffff8118363b&amp;gt;] generic_shutdown_super+0x5b/0xe0
04:02:38: [&amp;lt;ffffffff81183726&amp;gt;] kill_anon_super+0x16/0x60
04:02:39: [&amp;lt;ffffffffa05dff46&amp;gt;] lustre_kill_super+0x36/0x60 [obdclass]
04:02:39: [&amp;lt;ffffffff81183ec7&amp;gt;] deactivate_super+0x57/0x80
04:02:39: [&amp;lt;ffffffff811a21bf&amp;gt;] mntput_no_expire+0xbf/0x110
04:02:42: [&amp;lt;ffffffff811a2c2b&amp;gt;] sys_umount+0x7b/0x3a0
04:02:42: [&amp;lt;ffffffff8100b072&amp;gt;] system_call_fastpath+0x16/0x1b
04:03:25:Lustre: MGS is waiting for obd_unlinked_exports more than 256 seconds. The obd refcount = 5. Is it stuck?
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="68054" author="utopiabound" created="Tue, 1 Oct 2013 15:20:47 +0000"  >&lt;p&gt;This seems to have cropped up several times recently with sanity/132&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/47f3dcd2-2a88-11e3-864b-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/47f3dcd2-2a88-11e3-864b-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/0e00ade6-29db-11e3-8527-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/0e00ade6-29db-11e3-8527-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/2b4da1fa-291e-11e3-b598-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/2b4da1fa-291e-11e3-b598-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="68093" author="utopiabound" created="Thu, 1 Oct 2013 18:07:42 +0000"  >&lt;p&gt;sanity/132 failures seem to share the following OST logs:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;15:51:18:Lustre: DEBUG MARKER: == sanity test 132: som avoids glimpse rpc == 15:50:26 (1380581426)
15:51:18:LustreError: 23533:0:(ost_handler.c:1775:ost_blocking_ast()) Error -2 syncing data on lock cancel
15:51:18:Lustre: lustre-OST0006: Client lustre-MDT0000-mdtlov_UUID (at 10.10.16.120@tcp) reconnecting
15:51:18:Lustre: lustre-OST0006: Client lustre-MDT0000-mdtlov_UUID (at 10.10.16.120@tcp) refused reconnection, still busy with 1 active RPCs
15:51:18:Lustre: DEBUG MARKER: /usr/sbin/lctl get_param -n ost.OSS.ost.stats
15:51:18:Lustre: DEBUG MARKER: /usr/sbin/lctl get_param -n ost.OSS.ost.stats
15:51:18:Lustre: lustre-OST0006: Client lustre-MDT0000-mdtlov_UUID (at 10.10.16.120@tcp) reconnecting
15:51:18:Lustre: lustre-OST0006: Client lustre-MDT0000-mdtlov_UUID (at 10.10.16.120@tcp) refused reconnection, still busy with 1 active RPCs
15:51:18:LustreError: 11-0: lustre-MDT0000-lwp-OST0001: Communicating with 10.10.16.120@tcp, operation obd_ping failed with -107.
15:51:18:Lustre: lustre-MDT0000-lwp-OST0001: Connection to lustre-MDT0000 (at 10.10.16.120@tcp) was lost; in progress operations using this service will wait for recovery to complete
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Then a umount of OST0006 which never completes:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;15:52:09:Lustre: 7404:0:(client.c:1897:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1380581484/real 1380581484]  req@ffff8800634d5800 x1447637766224616/t0(0) o250-&amp;gt;MGC10.10.16.120@tcp@10.10.16.120@tcp:26/25 lens 400/544 e 0 to 1 dl 1380581500 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1
15:52:09:Lustre: lustre-OST0006 is waiting for obd_unlinked_exports more than 8 seconds. The obd refcount = 5. Is it stuck?
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;From the MDT console log:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;16:51:27:Lustre: DEBUG MARKER: == sanity test 132: som avoids glimpse rpc == 15:50:26 (1380581426)
16:51:27:LustreError: 11-0: lustre-OST0006-osc-MDT0000: Communicating with 10.10.16.121@tcp, operation ost_connect failed with -16.
16:51:27:Lustre: DEBUG MARKER: /usr/sbin/lctl get_param mdt.*.som
16:51:27:LustreError: 11-0: lustre-OST0006-osc-MDT0000: Communicating with 10.10.16.121@tcp, operation ost_connect failed with -16.
16:51:27:Lustre: DEBUG MARKER: /usr/sbin/lctl conf_param lustre.mdt.som=enabled
16:51:27:Lustre: Setting parameter lustre-MDT0000.mdt.som in log lustre-MDT0000
16:51:27:Lustre: Skipped 5 previous similar messages
16:51:27:Lustre: DEBUG MARKER: grep -c /mnt/mds1&apos; &apos; /proc/mounts
16:51:27:Lustre: DEBUG MARKER: umount -d -f /mnt/mds1
16:51:27:LustreError: 3509:0:(client.c:1076:ptlrpc_import_delay_req()) @@@ IMP_CLOSED   req@ffff88004efcd000 x1447637735940204/t0(0) o13-&amp;gt;lustre-OST0000-osc-MDT0000@10.10.16.121@tcp:7/4 lens 224/368 e 0 to 0 dl 0 ref 1 fl Rpc:/0/ffffffff rc 0/-1
16:51:27:LustreError: 3509:0:(client.c:1076:ptlrpc_import_delay_req()) @@@ IMP_CLOSED   req@ffff88004efcd000 x1447637735940208/t0(0) o13-&amp;gt;lustre-OST0002-osc-MDT0000@10.10.16.121@tcp:7/4 lens 224/368 e 0 to 0 dl 0 ref 1 fl Rpc:/0/ffffffff rc 0/-1
16:51:27:LustreError: 3509:0:(client.c:1076:ptlrpc_import_delay_req()) @@@ IMP_CLOSED   req@ffff88004efcd000 x1447637735940216/t0(0) o6-&amp;gt;lustre-OST0003-osc-MDT0000@10.10.16.121@tcp:28/4 lens 664/432 e 0 to 0 dl 0 ref 1 fl Rpc:/0/ffffffff rc 0/-1
16:51:27:LustreError: 3509:0:(client.c:1076:ptlrpc_import_delay_req()) Skipped 1 previous similar message
16:51:27:Lustre: lustre-MDT0000: Not available for connect from 10.10.16.121@tcp (stopping)
16:51:27:Lustre: lustre-MDT0000: Not available for connect from 10.10.16.121@tcp (stopping)
16:51:27:LustreError: 3508:0:(client.c:1076:ptlrpc_import_delay_req()) @@@ IMP_CLOSED   req@ffff8800569b5400 x1447637735940228/t0(0) o13-&amp;gt;lustre-OST0004-osc-MDT0000@10.10.16.121@tcp:7/4 lens 224/368 e 0 to 0 dl 0 ref 1 fl Rpc:/0/ffffffff rc 0/-1
16:51:27:Lustre: 15981:0:(client.c:1897:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1380581444/real 1380581444]  req@ffff8800569b5400 x1447637735940248/t0(0) o251-&amp;gt;MGC10.10.16.120@tcp@0@lo:26/25 lens 224/224 e 0 to 1 dl 1380581450 ref 2 fl Rpc:XN/0/ffffffff rc 0/-1
16:51:27:LustreError: 137-5: lustre-MDT0000_UUID: not available for connect from 10.10.16.121@tcp (no target). If you are running an HA pair check that the target is mounted on the other server.
16:51:27:Lustre: server umount lustre-MDT0000 complete
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;From debug log on OST:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;...
1380410772.384659:(ldlm_lock.c:454:lock_handle_free()) slab-freed &apos;lock&apos;: 504 at ffff880025067c80.
1380410772.386661:(ldlm_lock.c:454:lock_handle_free()) slab-freed &apos;lock&apos;: 504 at ffff88002583e380.
1380410831.744886:(ofd_objects.c:563:ofd_attr_get()) Process entered
1380410831.744887:(ofd_objects.c:588:ofd_attr_get()) Process leaving (rc=18446744073709551614 : -2 : fffffffffffffffe)
1380410831.744889:(lprocfs_jobstats.c:217:lprocfs_job_stats_log()) Process entered
1380410831.744890:(lprocfs_jobstats.c:224:lprocfs_job_stats_log()) Process leaving (rc=18446744073709551594 : -22 : ffffffffffffffea)
1380410831.744891:(ofd_obd.c:1456:ofd_sync()) Process leaving
1380410831.744892:(lustre_fid.h:719:fid_flatten32()) Process leaving (rc=4279240389 : 4279240389 : ff1006c5)
1380410831.744893:(lustre_fid.h:719:fid_flatten32()) Process leaving (rc=4279240389 : 4279240389 : ff1006c5)
1380410831.744897:(ofd_dev.c:285:ofd_object_free()) Process entered
1380410831.744897:(ofd_dev.c:289:ofd_object_free()) object free, fid = [0x100000000:0x17c5:0x0]
1380410831.744898:(ofd_dev.c:293:ofd_object_free()) slab-freed &apos;(of)&apos;: 160 at ffff880026e3e9f0.
1380410831.744899:(ofd_dev.c:294:ofd_object_free()) Process leaving
1380410831.744899:(obd_class.h:1326:obd_sync()) Process leaving (rc=18446744073709551614 : -2 : fffffffffffffffe)
1380410831.744900:(ost_handler.c:1775:ost_blocking_ast()) Error -2 syncing data on lock cancel
1380410831.745806:(ost_handler.c:1777:ost_blocking_ast()) slab-freed &apos;((oa))&apos;: 208 at ffff88002690ca40.
1380410831.745808:(ost_handler.c:1778:ost_blocking_ast()) kfreed &apos;oinfo&apos;: 112 at ffff880026b61140.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="68280" author="utopiabound" created="Thu, 3 Oct 2013 17:58:06 +0000"  >&lt;p&gt;sanity/132 failures appear to be &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4019&quot; title=&quot;today&amp;#39;s master stick on shutdown on test == sanity test 132: on lu_object_find_at&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4019&quot;&gt;&lt;del&gt;LU-4019&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="69212" author="utopiabound" created="Thu, 17 Oct 2013 15:19:10 +0000"  >&lt;p&gt;There have been two &quot;recent&quot; (Sept 2013) non conf-sanity/- failures (both in replay-single):&lt;/p&gt;

&lt;p&gt;replay-single/74 &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/f441c460-227f-11e3-af6a-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/f441c460-227f-11e3-af6a-52540035b04c&lt;/a&gt;&lt;br/&gt;
A review-dne-zfs failure on OST0000&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;21:28:53:Lustre: DEBUG MARKER: umount -d /mnt/ost1
21:28:53:Lustre: Failing over lustre-OST0000
21:28:53:LustreError: 15640:0:(ost_handler.c:1782:ost_blocking_ast()) Error -2 syncing data on lock cancel
21:28:53:Lustre: 15640:0:(service.c:2030:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (50:74s); client may timeout.  req@ffff880046d72c00 x1446662193136696/t0(0) o103-&amp;gt;cea0ffc2-1873-4321-a1a2-348391764373@10.10.16.253@tcp:0/0 lens 328/192 e 0 to 0 dl 1379651120 ref 1 fl Complete:H/0/0 rc -19/-19
21:28:53:LustreError: 7671:0:(ost_handler.c:1782:ost_blocking_ast()) Error -2 syncing data on lock cancel
21:28:53:Lustre: lustre-OST0000: Not available for connect from 10.10.17.1@tcp (stopping)
21:28:53:Lustre: Skipped 5 previous similar messages
21:28:53:Lustre: lustre-OST0000 is waiting for obd_unlinked_exports more than 8 seconds. The obd refcount = 7. Is it stuck?
21:28:53:Lustre: lustre-OST0000 is waiting for obd_unlinked_exports more than 16 seconds. The obd refcount = 7. Is it stuck?
21:28:53:Lustre: lustre-OST0000 is waiting for obd_unlinked_exports more than 32 seconds. The obd refcount = 7. Is it stuck?
21:40:22:Lustre: lustre-OST0000 is waiting for obd_unlinked_exports more than 64 seconds. The obd refcount = 7. Is it stuck?
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The other is review run replay-single/53e &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/ddb85db2-208b-11e3-b9bc-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/ddb85db2-208b-11e3-b9bc-52540035b04c&lt;/a&gt; (NOT ZFS)&lt;br/&gt;
The MGS fails:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;03:55:06:Lustre: DEBUG MARKER: umount -d /mnt/mds1
03:55:06:LustreError: 166-1: MGC10.10.4.154@tcp: Connection to MGS (at 0@lo) was lost; in progress operations using this service will fail
03:55:07:Lustre: MGS is waiting for obd_unlinked_exports more than 8 seconds. The obd refcount = 5. Is it stuck?
03:55:31:Lustre: MGS is waiting for obd_unlinked_exports more than 16 seconds. The obd refcount = 5. Is it stuck?
03:56:05:Lustre: MGS is waiting for obd_unlinked_exports more than 32 seconds. The obd refcount = 5. Is it stuck?
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="69224" author="utopiabound" created="Thu, 17 Oct 2013 16:41:43 +0000"  >&lt;p&gt;Debugging patch to try to see if 6988 was on the right track but not broad enough.&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/7995&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/7995&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="70588" author="yujian" created="Mon, 4 Nov 2013 05:09:03 +0000"  >&lt;p&gt;Lustre build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_4/47/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b2_4/47/&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL6.4/x86_64&lt;/p&gt;

&lt;p&gt;FSTYPE=zfs&lt;br/&gt;
MDSCOUNT=1&lt;br/&gt;
MDSSIZE=2097152&lt;br/&gt;
OSTCOUNT=2&lt;br/&gt;
OSTSIZE=2097152&lt;/p&gt;

&lt;p&gt;obdfilter-survey test 3a hit the same failure:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/a488f632-4453-11e3-8472-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/a488f632-4453-11e3-8472-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="71117" author="utopiabound" created="Fri, 8 Nov 2013 13:42:21 +0000"  >&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/7995&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/7995&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="72292" author="yujian" created="Tue, 26 Nov 2013 08:18:26 +0000"  >&lt;p&gt;Lustre build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_4/58/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b2_4/58/&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL6.4/x86_64&lt;/p&gt;

&lt;p&gt;FSTYPE=zfs&lt;br/&gt;
MDSCOUNT=1&lt;br/&gt;
MDSSIZE=2097152&lt;br/&gt;
OSTCOUNT=2&lt;br/&gt;
OSTSIZE=2097152&lt;/p&gt;

&lt;p&gt;obdfilter-survey test 3a hit the same failure:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/19556f3e-5608-11e3-8e94-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/19556f3e-5608-11e3-8e94-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="72901" author="yujian" created="Thu, 5 Dec 2013 15:18:29 +0000"  >&lt;p&gt;Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_4/63/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b2_4/63/&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL6.4/x86_64 (server), SLES11SP2/x86_64 (client)&lt;/p&gt;

&lt;p&gt;replay-dual test 3 hit this failure:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/20b3d072-5c98-11e3-956b-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/20b3d072-5c98-11e3-956b-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="73455" author="yujian" created="Fri, 13 Dec 2013 08:32:10 +0000"  >&lt;p&gt;More instances on Lustre b2_4 branch:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/dcb5daa6-6579-11e3-8518-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/dcb5daa6-6579-11e3-8518-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/6c3ab5e4-6358-11e3-8c76-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/6c3ab5e4-6358-11e3-8c76-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/d4b0f714-6281-11e3-a8fd-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/d4b0f714-6281-11e3-a8fd-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="73613" author="utopiabound" created="Mon, 16 Dec 2013 19:48:07 +0000"  >&lt;p&gt;It looks like this bug is fixed with the landing of #7995.  Should I create gerrit patch to port to b2_4 and b2_5?&lt;br/&gt;
It will cherry-pick cleanly to the current heads of both b2_4 and b2_5.&lt;/p&gt;</comment>
                            <comment id="73616" author="utopiabound" created="Mon, 16 Dec 2013 20:22:00 +0000"  >&lt;p&gt;back-port to b2_4 &lt;a href=&quot;http://review.whamcloud.com/8591&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/8591&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="73640" author="adilger" created="Tue, 17 Dec 2013 01:49:40 +0000"  >&lt;p&gt;Typically, if a patch can be cherry-picked cleanly to the older branches there is no need for a separate patch.  No harm in doing this, but it is also possible to ask Oleg to do the cherry-pick into the maintenance branch(es).&lt;/p&gt;</comment>
                            <comment id="73988" author="yujian" created="Sat, 21 Dec 2013 15:05:15 +0000"  >&lt;p&gt;Lustre build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_4/69/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b2_4/69/&lt;/a&gt; (2.4.2 RC1)&lt;br/&gt;
Distro/Arch: RHEL6.4/x86_64&lt;br/&gt;
FSTYPE=zfs&lt;/p&gt;

&lt;p&gt;obdfilter-survey hit this failure again:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/f0db9456-6981-11e3-aabe-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/f0db9456-6981-11e3-aabe-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="74298" author="yujian" created="Fri, 3 Jan 2014 11:13:10 +0000"  >&lt;p&gt;Although patch &lt;a href=&quot;http://review.whamcloud.com/7995&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/7995&lt;/a&gt; was cherry-picked to Lustre b2_5 branch, the failure still occurred while running obdfilter-survey test 3a with FSTYPE=zfs on Lustre b2_5 build #5:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;01:44:19:Lustre: DEBUG MARKER: umount -d -f /mnt/ost1
01:44:19:Lustre: lustre-OST0000: Not available for connect from 10.10.4.37@tcp (stopping)
01:44:19:Lustre: lustre-OST0000: Not available for connect from 10.10.4.37@tcp (stopping)
01:44:19:Lustre: lustre-OST0000 is waiting for obd_unlinked_exports more than 8 seconds. The obd refcount = 4. Is it stuck?
01:44:19:Lustre: 4092:0:(client.c:1897:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1388655826/real 1388655826]  req@ffff88006090dc00 x1456104749935752/t0(0) o38-&amp;gt;lustre-MDT0000-lwp-OST0001@10.10.4.39@tcp:12/10 lens 400/544 e 0 to 1 dl 1388655842 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1
01:44:19:Lustre: lustre-OST0000: Not available for connect from 10.10.4.37@tcp (stopping)
01:44:19:Lustre: lustre-OST0000: Not available for connect from 10.10.4.37@tcp (stopping)
01:46:34:Lustre: lustre-OST0000: Not available for connect from 10.10.4.37@tcp (stopping)
01:46:34:Lustre: lustre-OST0000 is waiting for obd_unlinked_exports more than 16 seconds. The obd refcount = 4. Is it stuck?
&amp;lt;~snip~&amp;gt;
01:50:58:INFO: task umount:11612 blocked for more than 120 seconds.
01:50:58:&quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
01:50:58:umount        D 0000000000000001     0 11612  11611 0x00000080
01:50:58: ffff88002be63ab8 0000000000000082 0000000000000000 000000006e5e0af6
01:50:58: ffffffffa07ca7f0 ffff88006e68236f ffff88005a64d184 ffffffffa0788975
01:50:58: ffff8800638d7098 ffff88002be63fd8 000000000000fb88 ffff8800638d7098
01:50:58:Call Trace:
01:50:58: [&amp;lt;ffffffff8150f362&amp;gt;] schedule_timeout+0x192/0x2e0
01:50:58: [&amp;lt;ffffffff810811e0&amp;gt;] ? process_timeout+0x0/0x10
01:50:58: [&amp;lt;ffffffffa070aeab&amp;gt;] obd_exports_barrier+0xab/0x180 [obdclass]
01:50:58: [&amp;lt;ffffffffa0e8194f&amp;gt;] ofd_device_fini+0x5f/0x240 [ofd]
01:50:58: [&amp;lt;ffffffffa0736493&amp;gt;] class_cleanup+0x573/0xd30 [obdclass]
01:50:58: [&amp;lt;ffffffffa070d046&amp;gt;] ? class_name2dev+0x56/0xe0 [obdclass]
01:50:58: [&amp;lt;ffffffffa07381ba&amp;gt;] class_process_config+0x156a/0x1ad0 [obdclass]
01:50:58: [&amp;lt;ffffffffa0731313&amp;gt;] ? lustre_cfg_new+0x2d3/0x6e0 [obdclass]
01:50:58: [&amp;lt;ffffffffa0738899&amp;gt;] class_manual_cleanup+0x179/0x6f0 [obdclass]
01:50:58: [&amp;lt;ffffffffa070d046&amp;gt;] ? class_name2dev+0x56/0xe0 [obdclass]
01:50:59: [&amp;lt;ffffffffa0773edc&amp;gt;] server_put_super+0x5ec/0xf60 [obdclass]
01:50:59: [&amp;lt;ffffffff8118363b&amp;gt;] generic_shutdown_super+0x5b/0xe0
01:50:59: [&amp;lt;ffffffff81183726&amp;gt;] kill_anon_super+0x16/0x60
01:50:59: [&amp;lt;ffffffffa073a746&amp;gt;] lustre_kill_super+0x36/0x60 [obdclass]
01:50:59: [&amp;lt;ffffffff81183ec7&amp;gt;] deactivate_super+0x57/0x80
01:50:59: [&amp;lt;ffffffff811a21bf&amp;gt;] mntput_no_expire+0xbf/0x110
01:50:59: [&amp;lt;ffffffff811a2c2b&amp;gt;] sys_umount+0x7b/0x3a0
01:50:59: [&amp;lt;ffffffff8100b072&amp;gt;] system_call_fastpath+0x16/0x1b
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Maloo report: &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/8b620634-73c5-11e3-b4ff-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/8b620634-73c5-11e3-b4ff-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="74299" author="yujian" created="Fri, 3 Jan 2014 11:14:13 +0000"  >&lt;p&gt;I have to reopen the ticket because the failure still occurs.&lt;/p&gt;</comment>
                            <comment id="74383" author="yujian" created="Mon, 6 Jan 2014 10:08:29 +0000"  >&lt;p&gt;More instance on Lustre b2_5 build #5 with FSTYPE=zfs:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/a1f6e73a-7671-11e3-a7a8-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/a1f6e73a-7671-11e3-a7a8-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="75642" author="yujian" created="Sun, 26 Jan 2014 09:09:26 +0000"  >&lt;p&gt;More instances on Lustre b2_5 branch:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/91c9c6da-861a-11e3-a2cb-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/91c9c6da-861a-11e3-a2cb-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/09ebb164-8477-11e3-bab5-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/09ebb164-8477-11e3-bab5-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/2f51a8fa-8477-11e3-bab5-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/2f51a8fa-8477-11e3-bab5-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/2cbbedf4-8ecb-11e3-b036-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/2cbbedf4-8ecb-11e3-b036-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="77614" author="utopiabound" created="Fri, 21 Feb 2014 17:06:22 +0000"  >&lt;p&gt;All the b2_5 TIMEOUTs happened in obdfilter-survey/3a, but for each of them, there were errors in test 1c or 2a that I believe left echo-client on the OST that then caused the umount to TIMEOUT.&lt;/p&gt;</comment>
                            <comment id="77616" author="utopiabound" created="Fri, 21 Feb 2014 17:14:34 +0000"  >&lt;p&gt;b2_5 issues are different than original issue and better handled by &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3665&quot; title=&quot;obdfilter-survey test_3a: unmount stuck in obd_exports_barrier()&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3665&quot;&gt;&lt;del&gt;LU-3665&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="20090">LU-3665</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="19988">LU-3632</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="22653">LU-4449</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="25073">LU-5166</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="23522">LU-4734</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="21160">LU-4019</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="16088">LU-2015</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="17814">LU-2939</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="21262">LU-4062</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="23414">LU-4695</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="23522">LU-4734</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvp2v:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>7893</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>