<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:20:31 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-1882] conf-sanity test_45: umount2: Device or resource busy</title>
                <link>https://jira.whamcloud.com/browse/LU-1882</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This issue was created by maloo for yujian &amp;lt;yujian@whamcloud.com&amp;gt;&lt;/p&gt;

&lt;p&gt;This issue relates to the following test suite run: &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/aaac51d6-fa70-11e1-887d-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/aaac51d6-fa70-11e1-887d-52540035b04c&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_3/16&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b2_3/16&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The sub-test test_45 failed with the following error:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Stopping /mnt/mds1 (opts:-f) on client-30vm3
CMD: client-30vm3 umount -d -f /mnt/mds1
CMD: client-30vm3 lsmod | grep lnet &amp;gt; /dev/null &amp;amp;&amp;amp; lctl dl | grep &apos; ST &apos;
sleep 60 sec
CMD: client-30vm6.lab.whamcloud.com lctl set_param fail_loc=0x50f
fail_loc=0x50f
sleep 10 sec
manual umount lustre on /mnt/lustre....
CMD: client-30vm6.lab.whamcloud.com umount -d --force /mnt/lustre
umount2: Device or resource busy
umount: /mnt/lustre: device is busy.
        (In some cases useful info about processes that use
         the device is found by lsof(8) or fuser(1))
df: umount2: Device or resource busy
 conf-sanity test_45: @@@@@@ FAIL: test_45 failed with 3 
`/mnt/lustre&apos;: Cannot send after transport endpoint shutdown
df: no file systems processed
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Info required for matching: conf-sanity 45&lt;/p&gt;</description>
                <environment></environment>
        <key id="15870">LU-1882</key>
            <summary>conf-sanity test_45: umount2: Device or resource busy</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="bogl">Bob Glossman</assignee>
                                    <reporter username="maloo">Maloo</reporter>
                        <labels>
                            <label>patch</label>
                    </labels>
                <created>Tue, 11 Sep 2012 01:51:46 +0000</created>
                <updated>Thu, 18 Nov 2021 16:47:28 +0000</updated>
                            <resolved>Sat, 8 Oct 2016 19:07:03 +0000</resolved>
                                    <version>Lustre 2.3.0</version>
                    <version>Lustre 2.4.0</version>
                    <version>Lustre 2.4.1</version>
                    <version>Lustre 2.5.0</version>
                    <version>Lustre 2.6.0</version>
                    <version>Lustre 2.5.1</version>
                    <version>Lustre 2.7.0</version>
                    <version>Lustre 2.5.3</version>
                                    <fixVersion>Lustre 2.9.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>15</watches>
                                                                            <comments>
                            <comment id="44564" author="yujian" created="Tue, 11 Sep 2012 01:55:18 +0000"  >&lt;p&gt;This test passed in another test session against the same distro/arch and Lustre build:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/464dffc2-fa71-11e1-a03c-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/464dffc2-fa71-11e1-a03c-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="63964" author="yujian" created="Fri, 9 Aug 2013 14:11:55 +0000"  >&lt;p&gt;More instances on Lustre b2_4 branch:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/0f084944-dba5-11e2-9a76-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/0f084944-dba5-11e2-9a76-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/1806e9a2-fcd8-11e2-b90c-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/1806e9a2-fcd8-11e2-b90c-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/e6c4dc52-e074-11e2-97bd-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/e6c4dc52-e074-11e2-97bd-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/8dedf09e-e049-11e2-aaec-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/8dedf09e-e049-11e2-aaec-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/e1a42764-df2f-11e2-855b-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/e1a42764-df2f-11e2-855b-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="64117" author="pjones" created="Mon, 12 Aug 2013 18:53:35 +0000"  >&lt;p&gt;Bob is looking at this one&lt;/p&gt;</comment>
                            <comment id="64121" author="bogl" created="Mon, 12 Aug 2013 19:18:30 +0000"  >&lt;p&gt;Seems like all the reported instances happen in &apos;full&apos; tests, never in review tests. Suspecting something in the environment gets used up in full tests only.&lt;/p&gt;

&lt;p&gt;Even in full tests the failure is quite intermittent.&lt;/p&gt;</comment>
                            <comment id="64393" author="pjones" created="Fri, 16 Aug 2013 12:31:12 +0000"  >&lt;p&gt;Thanks Bob. It sounds like this is lower priority for now&lt;/p&gt;</comment>
                            <comment id="66042" author="yujian" created="Mon, 9 Sep 2013 08:51:11 +0000"  >&lt;p&gt;Lustre client: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_1/215/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b2_1/215/&lt;/a&gt; (2.1.6)&lt;br/&gt;
Lustre server: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_4/45/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b2_4/45/&lt;/a&gt;  (2.4.1 RC2)&lt;/p&gt;

&lt;p&gt;conf-sanity test 45 hit the same failure:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/4572ac80-18aa-11e3-bf95-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/4572ac80-18aa-11e3-bf95-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="72672" author="yujian" created="Tue, 3 Dec 2013 02:26:43 +0000"  >&lt;p&gt;More instances on Lustre b2_4 branch:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/e69b3088-5814-11e3-b8c3-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/e69b3088-5814-11e3-b8c3-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/2587b2ea-5aee-11e3-85e2-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/2587b2ea-5aee-11e3-85e2-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="72692" author="yujian" created="Tue, 3 Dec 2013 13:20:32 +0000"  >&lt;p&gt;Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_5/4/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b2_5/4/&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The same failure occurred:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/426bc358-4cc8-11e3-826a-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/426bc358-4cc8-11e3-826a-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="73538" author="yujian" created="Sat, 14 Dec 2013 12:38:39 +0000"  >&lt;p&gt;More instance on Lustre b2_4 branch:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/3dcbcb48-6485-11e3-bc94-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/3dcbcb48-6485-11e3-bc94-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="74380" author="yujian" created="Mon, 6 Jan 2014 09:58:24 +0000"  >&lt;p&gt;Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_5/5/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b2_5/5/&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL6.4/x86_64&lt;br/&gt;
FSTYPE=zfs&lt;/p&gt;

&lt;p&gt;The same failure occurred:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/567f4f66-766e-11e3-a7a8-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/567f4f66-766e-11e3-a7a8-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="74785" author="yujian" created="Sun, 12 Jan 2014 14:07:33 +0000"  >&lt;p&gt;More instances on Lustre b2_5 branch:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/43137e1c-7a00-11e3-86c8-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/43137e1c-7a00-11e3-86c8-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/8546fb42-7e1f-11e3-91f7-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/8546fb42-7e1f-11e3-91f7-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/7aa6dbe8-8618-11e3-a2cb-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/7aa6dbe8-8618-11e3-a2cb-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/2c8687f8-8472-11e3-bab5-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/2c8687f8-8472-11e3-bab5-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="76516" author="sarah" created="Fri, 7 Feb 2014 22:18:35 +0000"  >&lt;p&gt;Hit this error in master tag-2.5.55 DNE testing. &lt;br/&gt;
server and client: lustre-master build # 1876 RHEL6 ldiskfs&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/b84701bc-8e06-11e3-b27d-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/b84701bc-8e06-11e3-b27d-52540035b04c&lt;/a&gt;&lt;/p&gt;


&lt;p&gt;client console shows&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;21:29:42:Lustre: Skipped 127 previous similar messages
21:29:42:LustreError: 31078:0:(import.c:323:ptlrpc_invalidate_import()) lustre-MDT0000_UUID: rc = -110 waiting for callback (3 != 0)
21:29:42:LustreError: 31078:0:(import.c:323:ptlrpc_invalidate_import()) Skipped 127 previous similar messages
21:29:42:LustreError: 31078:0:(import.c:349:ptlrpc_invalidate_import()) @@@ still on sending list  req@ffff88007a8e3800 x1459084469469476/t0(0) o38-&amp;gt;lustre-MDT0000-mdc-ffff8800737cec00@10.10.4.198@tcp:12/10 lens 400/544 e 0 to 0 dl 1391491497 ref 1 fl Unregistering:EN/0/ffffffff rc -5/-1
21:29:42:LustreError: 31078:0:(import.c:349:ptlrpc_invalidate_import()) Skipped 127 previous similar messages
21:29:42:LustreError: 31078:0:(import.c:365:ptlrpc_invalidate_import()) lustre-MDT0000_UUID: RPCs in &quot;Unregistering&quot; phase found (3). Network is sluggish? Waiting them to error out.
21:29:42:LustreError: 31078:0:(import.c:365:ptlrpc_invalidate_import()) Skipped 127 previous similar messages
21:29:42:LustreError: 30653:0:(mdc_locks.c:917:mdc_enqueue()) ldlm_cli_enqueue: -5
21:29:42:LustreError: 30653:0:(vvp_io.c:1230:vvp_io_init()) lustre: refresh file layout [0x200002b10:0x1:0x0] error -5.
21:29:42:LustreError: 30730:0:(mdc_locks.c:917:mdc_enqueue()) ldlm_cli_enqueue: -5
21:29:42:LustreError: 30730:0:(file.c:3087:ll_inode_revalidate_fini()) lustre: revalidate FID [0x200000007:0x1:0x0] error: rc = -5
21:29:42:Lustre: setting import lustre-MDT0001_UUID INACTIVE by administrator request
21:29:42:Lustre: 29951:0:(llite_lib.c:2512:ll_dirty_page_discard_warn()) lustre: dirty page discard: 10.10.4.198@tcp:/lustre/fid: [0x200002b10:0x1:0x0]/ may get corrupted (rc -108)
21:29:42:LustreError: 30730:0:(file.c:3087:ll_inode_revalidate_fini()) lustre: revalidate FID [0x200000007:0x1:0x0] error: rc = -108
21:29:42:Lustre: DEBUG MARKER: /usr/sbin/lctl mark  conf-sanity test_45: @@@@@@ FAIL: test_45 failed with 3 
21:29:42:Lustre: DEBUG MARKER: conf-sanity test_45: @@@@@@ FAIL: test_45 failed with 3
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="78682" author="yujian" created="Fri, 7 Mar 2014 08:24:52 +0000"  >&lt;p&gt;Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_5/39/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b2_5/39/&lt;/a&gt; (2.5.1 RC1)&lt;br/&gt;
Distro/Arch: RHEL6.5/x86_64&lt;br/&gt;
MDSCOUNT=2&lt;/p&gt;

&lt;p&gt;The same failure occurred:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/9617b500-a553-11e3-a61d-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/9617b500-a553-11e3-a61d-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="80194" author="sarah" created="Tue, 25 Mar 2014 05:40:05 +0000"  >&lt;p&gt;&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/18b5b81c-b243-11e3-a93f-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/18b5b81c-b243-11e3-a93f-52540035b04c&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Also hit in an interop test between a 2.5.1 server and a master client (build # 1945).&lt;/p&gt;</comment>
                            <comment id="92897" author="yujian" created="Sun, 31 Aug 2014 07:22:58 +0000"  >&lt;p&gt;Lustre Build: &lt;a href=&quot;https://build.hpdd.intel.com/job/lustre-b2_5/86/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://build.hpdd.intel.com/job/lustre-b2_5/86/&lt;/a&gt; (2.5.3 RC1)&lt;/p&gt;

&lt;p&gt;The same failure occurred: &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/1c5fee1c-3086-11e4-a3d9-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/1c5fee1c-3086-11e4-a3d9-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="100473" author="yujian" created="Tue, 2 Dec 2014 19:32:26 +0000"  >&lt;p&gt;More instance on Lustre b2_5 branch:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/49f1b300-7a01-11e4-be53-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/49f1b300-7a01-11e4-be53-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="101730" author="sarah" created="Tue, 16 Dec 2014 18:18:39 +0000"  >&lt;p&gt;saw this failure again in interop testing between 2.6.0 server and master client:&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/76457df4-7fb2-11e4-a9c0-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/76457df4-7fb2-11e4-a9c0-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="106563" author="jamesanunez" created="Wed, 11 Feb 2015 00:15:09 +0000"  >&lt;p&gt;I see this failure with lustre-master tag 2.6.93. Results are at &lt;a href=&quot;https://testing.hpdd.intel.com/test_sessions/fff27cc4-addd-11e4-a0b6-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sessions/fff27cc4-addd-11e4-a0b6-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="109111" author="jamesanunez" created="Fri, 6 Mar 2015 19:20:49 +0000"  >&lt;p&gt;I hit this issue again with 2.7.0-RC4. Results are at &lt;a href=&quot;https://testing.hpdd.intel.com/test_sessions/193dce6a-c42f-11e4-a0ef-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sessions/193dce6a-c42f-11e4-a0ef-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="143834" author="sushantmane" created="Thu, 25 Feb 2016 19:45:40 +0000"  >&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;	umount2: Device or resource busy
	umount: /mnt/lustre: device is busy.
			(In some cases useful info about processes that use
			 the device is found by lsof(8) or fuser(1))
	umount2: Device or resource busy
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The reason the umount fails with EBUSY (-16) is a race with &lt;tt&gt;df &amp;#45;h $MOUNT&lt;/tt&gt;.&lt;/p&gt;


&lt;p&gt;When &lt;tt&gt;df &amp;#45;h $MOUNT&lt;/tt&gt; is executed, mnt-&amp;gt;mnt_count of $MOUNT is incremented by 1, so mnt-&amp;gt;mnt_count becomes 2. df calls fstat(), which internally calls ll_getattr(). As the MDS is down, ll_getattr() keeps waiting for a reply from the MDS.&lt;/p&gt;

&lt;p&gt;Further, when &lt;tt&gt;umount &amp;#45;df $MOUNT&lt;/tt&gt; is executed, mnt-&amp;gt;mnt_count becomes 3. As this is a forced umount, ll_umount_begin() is called.&lt;/p&gt;

&lt;p&gt;By the time ll_umount_begin() completes its execution:&lt;/p&gt;

&lt;p&gt;&lt;b&gt;PASS case&lt;/b&gt;&lt;br/&gt;
ll_getattr() has already failed because all the requests were aborted, so mnt-&amp;gt;mnt_count has been decremented to 2.&lt;/p&gt;

&lt;p&gt;&lt;b&gt;FAIL case&lt;/b&gt;&lt;br/&gt;
ll_getattr() has not completed yet, so mnt-&amp;gt;mnt_count is not decremented and remains 3.&lt;/p&gt;

&lt;p&gt;In order to unmount, mnt-&amp;gt;mnt_count should be &amp;lt;= 2. In the failure case mnt-&amp;gt;mnt_count is 3, so propagate_mount_busy() returns 1 and retval is never reset to 0.&lt;/p&gt;


&lt;p&gt;&lt;cite&gt;fs/namespace.c:do_umount()&lt;/cite&gt;&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;         retval = -EBUSY;
         &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (flags &amp;amp; MNT_DETACH || !propagate_mount_busy(mnt, 2)) {
                 &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (!list_empty(&amp;amp;mnt-&amp;gt;mnt_list))
                         umount_tree(mnt, 1, &amp;amp;umount_list);
                 retval = 0;
         }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;As a result, sys_umount() returns EBUSY.&lt;/p&gt;</comment>
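<!--
A minimal sketch of the timing described in the comment above, assuming the MDS is already
stopped and /mnt/lustre is a Lustre client mountpoint (both are assumptions; this program is
illustrative only and is not part of the ticket's patches). One process blocks in stat(),
holding a reference on the vfsmount just as "df" does, while a second process issues a forced
umount and can observe EBUSY when it checks the mount count before the blocked stat() returns.

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mount.h>
    #include <sys/stat.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void)
    {
            const char *mnt = "/mnt/lustre";        /* assumed client mountpoint */
            struct stat st;
            pid_t pid = fork();

            if (pid == 0) {
                    /* Child: like "df", take a reference on the mount during path
                     * lookup and then block in ll_getattr() while the MDS is down. */
                    stat(mnt, &st);
                    _exit(0);
            }

            sleep(2);       /* give the child time to block first */

            /* Parent: the analogue of "umount -f".  If this runs before the child
             * drops its reference, do_umount() sees mnt_count > 2 and returns EBUSY,
             * matching the failure analysed above.  Needs root privileges. */
            if (umount2(mnt, MNT_FORCE) != 0)
                    printf("umount2: %s\n", strerror(errno));

            waitpid(pid, NULL, 0);
            return 0;
    }
-->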
                            <comment id="147547" author="520557" created="Fri, 1 Apr 2016 05:46:19 +0000"  >&lt;p&gt;About the problem:&lt;/p&gt;

&lt;p&gt;When I started investigating this issue, it appeared that there was some kind of race between ll_umount and ll_getattr.&lt;br/&gt;
After digging further, I found that there is no race here; it is just a timing issue. Let me explain this in more detail.&lt;/p&gt;

&lt;p&gt;Here we stop the MDS and then run &quot;df -h $MOUNT &amp;amp;&quot;, which makes the VFS increment mnt_count and call ll_getattr. We then issue a command to forcefully umount the client. Now, if ll_getattr finishes earlier than&lt;br/&gt;
ll_umount_begin (triggered by manual_umount_client --force), things work fine because the VFS decrements mnt_count.&lt;br/&gt;
But if ll_umount_begin finishes earlier than ll_getattr, the umount fails because of the additional mnt_count reference.&lt;/p&gt;

&lt;p&gt;If we try to umount the client again after some time, ll_getattr has completed by then and the client can be unmounted.&lt;/p&gt;

&lt;p&gt;I was able to reproduce this bug on my local setup and tried the following script change to verify it.&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;-        manual_umount_client --force || &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; 3
+       manual_umount_client --force
+       rc=$?
+       # Try again as getattr might have not done yet 
+       &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; [ $rc != 0 ]; then
+               echo &lt;span class=&quot;code-quote&quot;&gt;&quot;Client umount fails, &lt;span class=&quot;code-keyword&quot;&gt;try&lt;/span&gt; again !!&quot;&lt;/span&gt;
+               log &lt;span class=&quot;code-quote&quot;&gt;&quot;&lt;span class=&quot;code-keyword&quot;&gt;try&lt;/span&gt; again: sleep 10 sec&quot;&lt;/span&gt;
+               sleep 10
+               manual_umount_client --force || &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; 3
+       fi
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Output collected from our test board.&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;sleep 10 sec
manual umount lustre on /mnt/lustre....
umount2: Device or resource busy &amp;lt;======================= EBUSY: as getattr not done yet and umount reached first
umount: /mnt/lustre: device is busy.
        (In some cases useful info about processes that use
         the device is found by lsof(8) or fuser(1))
umount2: Device or resource busy
Client umount fails, &lt;span class=&quot;code-keyword&quot;&gt;try&lt;/span&gt; again !! &amp;lt;============================ Tried again and success, TEST PASS
&lt;span class=&quot;code-keyword&quot;&gt;try&lt;/span&gt; again: sleep 10 sec
df: `/mnt/lustre&apos;: Cannot send after transport endpoint shutdown
df: no file systems processed
manual umount lustre on /mnt/lustre....
fail_loc=0x0
start mds service on fre0205
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;


&lt;p&gt;Now about the fix:&lt;/p&gt;

&lt;p&gt;1. We can fix this at the test-case level as described above, i.e. the admin needs to handle it. OR&lt;/p&gt;

&lt;p&gt;2. Handle it somewhere in ll_umount_begin()? Or any other thoughts?&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;ll_umount_begin()
{
:
:
        /* Really, we&apos;d like to wait until there are no requests outstanding,
         * and then &lt;span class=&quot;code-keyword&quot;&gt;continue&lt;/span&gt;.  For now, we just invalidate the requests,
         * schedule() and sleep one second &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; needed, and hope.
         */
        schedule(); &amp;lt;===========
        EXIT;
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="148763" author="adilger" created="Wed, 13 Apr 2016 17:16:57 +0000"  >&lt;p&gt;My preference would be to fix this internal to Lustre rather than at the test script level, so that users who are calling &quot;umount -f&quot; will get what they expect (unmounted filesystem) instead of having to retry.  This test was made to verify that &quot;umount -f&quot; with a blocked RPC would properly kill the RPCs in flight and unmount, so changing the test is just hiding the problem.&lt;/p&gt;

&lt;p&gt;It would seem that the problem is in &lt;tt&gt;obd_iocontrol(IOC_OSC_SET_ACTIVE)&amp;#45;&amp;gt;ptlrpc_set_import_active&amp;#45;&amp;gt;ptlrpc_invalidate_import()&lt;/tt&gt; not cleaning up the blocked RPC properly, since the comment for that function states:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;/**
 * This function will invalidate the import, if necessary, then block
 * for all the RPC completions, and finally notify the obd to
 * invalidate its state (ie cancel locks, clear pending requests,
 * etc).
 */
void ptlrpc_invalidate_import(struct obd_import *imp)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;which clearly is not happening properly in this case.&lt;/p&gt;

&lt;p&gt;Minor note - any patch for this ticket should fix conf-sanity.sh test_45() to reference &lt;tt&gt;OBD_FAIL_PTLRPC_LONG_REPL_UNLINK&lt;/tt&gt; instead of &lt;tt&gt;OBD_FAIL_PTLRPC_LONG_UNLINK&lt;/tt&gt;, which no longer exists.&lt;/p&gt;</comment>
                            <comment id="148788" author="520557" created="Wed, 13 Apr 2016 19:01:47 +0000"  >&lt;p&gt;Andreas thanks for looking into it and suggestions.&lt;/p&gt;

&lt;p&gt;&amp;gt; It would seem that the problem is in obd_iocontrol(IOC_OSC_SET_ACTIVE)-&amp;gt;ptlrpc_set_import_active-&amp;gt;ptlrpc_invalidate_import() not &lt;br/&gt;
&amp;gt; cleaning up the blocked RPC properly&lt;/p&gt;

&lt;p&gt;Actually, I have investigated that portion and there is no problem with ptlrpc_invalidate_import(); in other words, the race is not with the RPC, which has already been killed. The problem is that vfs_stat (not ll_getattr) has not exited yet and still holds its mnt_count reference.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;vfs_stat
    \__ vfs_fstatat
            \__ user_path_at --------&amp;gt; mntget i.e. atomic_inc(&amp;amp;mnt-&amp;gt;mnt_count);
            | 
            |\__ vfs_getattr
            |        \__  inode-&amp;gt;i_op-&amp;gt;getattr --&amp;gt; ll_getattr
            |
            \__ path_put  ----------&amp;gt; mntput i.e. atomic_dec_and_lock(&amp;amp;mnt-&amp;gt;mnt_count..
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;do_umount()
{
:
:
        /*
         * If we may have to abort operations to get out of &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt;
         * mount, and they will themselves hold resources we must
         * allow the fs to &lt;span class=&quot;code-keyword&quot;&gt;do&lt;/span&gt; things. In the Unix tradition of
         * &lt;span class=&quot;code-quote&quot;&gt;&apos;Gee thats tricky lets &lt;span class=&quot;code-keyword&quot;&gt;do&lt;/span&gt; it in userspace&apos;&lt;/span&gt; the umount_begin
         * might fail to complete on the first run through as other tasks
         * must &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt;, and the like. Thats &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; the mount program to worry
         * about &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; the moment.
         */
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (flags &amp;amp; MNT_FORCE &amp;amp;&amp;amp; sb-&amp;gt;s_op-&amp;gt;umount_begin) {
                sb-&amp;gt;s_op-&amp;gt;umount_begin(sb); ------&amp;gt; ll_umount_begin()
        }
:
:
        retval = -EBUSY;
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (flags &amp;amp; MNT_DETACH || !propagate_mount_busy(mnt, 2)) { &amp;lt;----- If mnt_count not dec then retval = -EBUSY
                &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (!list_empty(&amp;amp;mnt-&amp;gt;mnt_list))
                        umount_tree(mnt, 1, &amp;amp;umount_list);
                retval = 0;
        }
:
&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; retval;
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;So one possible place to handle this is ll_umount_begin(), where we could wait?&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;ll_umount_begin()
{
:
:
        /* Really, we&apos;d like to wait until there are no requests outstanding,
         * and then &lt;span class=&quot;code-keyword&quot;&gt;continue&lt;/span&gt;.  For now, we just invalidate the requests,
         * schedule() and sleep one second &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; needed, and hope.
         */
        schedule(); &amp;lt;===========
        EXIT;
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="149191" author="adilger" created="Sun, 17 Apr 2016 07:37:17 +0000"  >&lt;p&gt;That seems reasonable. The question is what should be waited on here?  Is it safe to have an interruptible wait on the mount count?  If the statfs is going to finish in some reasonable time we could just wait. The question is if it is some other reason why the mount count is elevated (e.g. open file in the filesystem used by a process) then the unmount could hang indefinitely, so we don&apos;t want to wait forever. Maybe 5-10s, checking every 0.5s should be enough?. &lt;/p&gt;</comment>
                            <comment id="151442" author="gerrit" created="Mon, 9 May 2016 05:20:48 +0000"  >&lt;p&gt;Rahul Deshmukh (rahul.deshmukh@seagate.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/20061&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/20061&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1882&quot; title=&quot;conf-sanity test_45: umount2: Device or resource busy&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1882&quot;&gt;&lt;del&gt;LU-1882&lt;/del&gt;&lt;/a&gt; llite: Adding timed wait in ll_umount_begin&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: c288090d562c89ebb55815efc27f04362e49ddbc&lt;/p&gt;</comment>
                            <comment id="158585" author="520557" created="Wed, 13 Jul 2016 05:49:34 +0000"  >&lt;p&gt;Can some one review the patch ?&lt;/p&gt;</comment>
                            <comment id="158766" author="yujian" created="Thu, 14 Jul 2016 00:04:10 +0000"  >&lt;p&gt;Hi Rahul,&lt;/p&gt;

&lt;p&gt;While you are updating the patch to address the review comments from Andreas, could you please incorporate the changes from &lt;a href=&quot;http://review.whamcloud.com/15702&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/15702&lt;/a&gt; into the patch? Thank you.&lt;/p&gt;</comment>
                            <comment id="159103" author="520557" created="Mon, 18 Jul 2016 16:39:30 +0000"  >&lt;blockquote&gt;
&lt;p&gt;While you are updating the patch to address the review comments from Andreas, could you please incorporate the changes from &lt;a href=&quot;http://review.whamcloud.com/15702&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/15702&lt;/a&gt; into the patch? Thank you.&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;Sure,  will do that.&lt;/p&gt;</comment>
                            <comment id="165398" author="lokesh.jaliminche" created="Thu, 8 Sep 2016 19:47:45 +0000"  >&lt;p&gt;Problem with previous patch is it was unable to get vfsmount from client_common_fill_super&lt;br/&gt;
because in newer kernels mount_nodev is used while fs initialization  (2eba19731) which does not pass vfsmount,&lt;br/&gt;
so client was getting crashed while umount -f. pushed new  patch to get correct vfsmount.&lt;/p&gt;</comment>
                            <comment id="168804" author="gerrit" created="Sat, 8 Oct 2016 16:38:11 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;http://review.whamcloud.com/20061/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/20061/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1882&quot; title=&quot;conf-sanity test_45: umount2: Device or resource busy&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1882&quot;&gt;&lt;del&gt;LU-1882&lt;/del&gt;&lt;/a&gt; llite: Adding timed wait in ll_umount_begin&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 205a57df278733d1c733defe5c80de0d745e1bd5&lt;/p&gt;</comment>
                            <comment id="168821" author="pjones" created="Sat, 8 Oct 2016 19:07:03 +0000"  >&lt;p&gt;Landed for 2.9&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="30684">LU-6730</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="23201">LU-4647</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="50088">LU-10467</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="51409">LU-10824</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="25189">LU-5213</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvaev:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>5222</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>