<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:32:57 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-3326] recovery-mds-scale test_failover_ost: tar: Cannot open: No space left on device</title>
                <link>https://jira.whamcloud.com/browse/LU-3326</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;After running recovery-mds-scale test_failover_ost for 1.5 hours (OST failed over 6 times), client load on one of the clients failed as follows:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;&amp;lt;snip&amp;gt;
tar: etc/mail/submit.cf: Cannot open: No space left on device
tar: etc/mail/trusted-users: Cannot open: No space left on device
tar: etc/mail/virtusertable: Cannot open: No space left on device
tar: etc/mail/access: Cannot open: No space left on device
tar: etc/mail/aliasesdb-stamp: Cannot open: No space left on device
tar: etc/gssapi_mech.conf: Cannot open: No space left on device
tar: Exiting with failure status due to previous errors
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The console log on the client (client-32vm6) showed the following:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;19:40:31:INFO: task tar:2790 blocked for more than 120 seconds.
19:40:31:&quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
19:40:31:tar           D 0000000000000000     0  2790   2788 0x00000080
19:40:31: ffff88004eb73a28 0000000000000082 ffff88004eb739d8 ffff88007c24fe50
19:40:31: 0000000000000286 0000000000000003 0000000000000001 0000000000000286
19:40:31: ffff88007bcb3ab8 ffff88004eb73fd8 000000000000fb88 ffff88007bcb3ab8
19:40:31:Call Trace:
19:40:31: [&amp;lt;ffffffffa03d775a&amp;gt;] ? cfs_waitq_signal+0x1a/0x20 [libcfs]
19:40:31: [&amp;lt;ffffffff8150ea05&amp;gt;] schedule_timeout+0x215/0x2e0
19:40:31: [&amp;lt;ffffffffa068517c&amp;gt;] ? ptlrpc_request_bufs_pack+0x5c/0x80 [ptlrpc]
19:40:31: [&amp;lt;ffffffffa069a770&amp;gt;] ? lustre_swab_ost_body+0x0/0x10 [ptlrpc]
19:40:31: [&amp;lt;ffffffff8150e683&amp;gt;] wait_for_common+0x123/0x180
19:40:31: [&amp;lt;ffffffff81063310&amp;gt;] ? default_wake_function+0x0/0x20
19:40:31: [&amp;lt;ffffffff8150e79d&amp;gt;] wait_for_completion+0x1d/0x20
19:40:31: [&amp;lt;ffffffffa08cbf6c&amp;gt;] osc_io_setattr_end+0xbc/0x190 [osc]
19:40:31: [&amp;lt;ffffffffa095cde0&amp;gt;] ? lov_io_end_wrapper+0x0/0x100 [lov]
19:40:31: [&amp;lt;ffffffffa055cf30&amp;gt;] cl_io_end+0x60/0x150 [obdclass]
19:40:31: [&amp;lt;ffffffffa055d7e0&amp;gt;] ? cl_io_start+0x0/0x140 [obdclass]
19:40:31: [&amp;lt;ffffffffa095ced1&amp;gt;] lov_io_end_wrapper+0xf1/0x100 [lov]
19:40:31: [&amp;lt;ffffffffa095c86e&amp;gt;] lov_io_call+0x8e/0x130 [lov]
19:40:31: [&amp;lt;ffffffffa095e3bc&amp;gt;] lov_io_end+0x4c/0xf0 [lov]
19:40:31: [&amp;lt;ffffffffa055cf30&amp;gt;] cl_io_end+0x60/0x150 [obdclass]
19:40:31: [&amp;lt;ffffffffa0561f92&amp;gt;] cl_io_loop+0xc2/0x1b0 [obdclass]
19:40:31: [&amp;lt;ffffffffa0a2aa08&amp;gt;] cl_setattr_ost+0x208/0x2c0 [lustre]
19:40:31: [&amp;lt;ffffffffa09f8b0e&amp;gt;] ll_setattr_raw+0x9ce/0x1000 [lustre]
19:40:31: [&amp;lt;ffffffffa09f919b&amp;gt;] ll_setattr+0x5b/0xf0 [lustre]
19:40:31: [&amp;lt;ffffffff8119e708&amp;gt;] notify_change+0x168/0x340
19:40:31: [&amp;lt;ffffffff811b284c&amp;gt;] utimes_common+0xdc/0x1b0
19:40:31: [&amp;lt;ffffffff811828d1&amp;gt;] ? __fput+0x1a1/0x210
19:40:31: [&amp;lt;ffffffff811b29fe&amp;gt;] do_utimes+0xde/0xf0
19:40:31: [&amp;lt;ffffffff811b2b12&amp;gt;] sys_utimensat+0x32/0x90
19:40:31: [&amp;lt;ffffffff8100b072&amp;gt;] system_call_fastpath+0x16/0x1b
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Maloo report: &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/053120d2-bb19-11e2-8824-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/053120d2-bb19-11e2-8824-52540035b04c&lt;/a&gt;&lt;/p&gt;</description>
                <environment>&lt;br/&gt;
Lustre Branch: master&lt;br/&gt;
Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-master/1486&quot;&gt;http://build.whamcloud.com/job/lustre-master/1486&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL6.4/x86_64&lt;br/&gt;
Test Group: failover&lt;br/&gt;
</environment>
        <key id="18911">LU-3326</key>
            <summary>recovery-mds-scale test_failover_ost: tar: Cannot open: No space left on device</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="hongchao.zhang">Hongchao Zhang</assignee>
                                    <reporter username="yujian">Jian Yu</reporter>
                        <labels>
                            <label>mn1</label>
                            <label>mn4</label>
                    </labels>
                <created>Mon, 13 May 2013 05:09:25 +0000</created>
                <updated>Thu, 26 Feb 2015 21:58:13 +0000</updated>
                            <resolved>Wed, 20 Aug 2014 15:19:10 +0000</resolved>
                                    <version>Lustre 2.4.0</version>
                    <version>Lustre 2.4.1</version>
                    <version>Lustre 2.6.0</version>
                    <version>Lustre 2.4.2</version>
                    <version>Lustre 2.5.1</version>
                                    <fixVersion>Lustre 2.6.0</fixVersion>
                    <fixVersion>Lustre 2.5.3</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>8</watches>
                                                                            <comments>
                            <comment id="58255" author="yujian" created="Mon, 13 May 2013 05:20:32 +0000"  >&lt;p&gt;Lustre Branch: master&lt;br/&gt;
Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-master/1481&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-master/1481&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL6.4/x86_64&lt;br/&gt;
Test Group: failover&lt;/p&gt;

&lt;p&gt;After running recovery-mds-scale test_failover_ost for more than 1 hour (OST failed over 5 times), dd operation on one of the clients failed as follows:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2013-05-08 23:10:39: dd run starting
+ mkdir -p /mnt/lustre/d0.dd-client-26vm5.lab.whamcloud.com
+ /usr/bin/lfs setstripe -c -1 /mnt/lustre/d0.dd-client-26vm5.lab.whamcloud.com
+ cd /mnt/lustre/d0.dd-client-26vm5.lab.whamcloud.com
++ /usr/bin/lfs df /mnt/lustre/d0.dd-client-26vm5.lab.whamcloud.com
+ FREE_SPACE=4050128
+ BLKS=911278
+ echoerr &apos;Free disk space is 4050128, 4k blocks to dd is 911278&apos;
+ echo &apos;Free disk space is 4050128, 4k blocks to dd is 911278&apos;
Free disk space is 4050128, 4k blocks to dd is 911278
+ load_pid=27789
+ wait 27789
+ dd bs=4k count=911278 status=noxfer if=/dev/zero of=/mnt/lustre/d0.dd-client-26vm5.lab.whamcloud.com/dd-file
dd: writing `/mnt/lustre/d0.dd-client-26vm5.lab.whamcloud.com/dd-file&apos;: No space left on device
409345+0 records in
409344+0 records out
+ &apos;[&apos; 1 -eq 0 &apos;]&apos;
++ date &apos;+%F %H:%M:%S&apos;
+ echoerr &apos;2013-05-08 23:23:03: dd failed&apos;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The console log on the client (client-26vm5) showed the following:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;23:23:08:LustreError: 27789:0:(vvp_io.c:1086:vvp_io_commit_write()) Write page 409344 of inode ffff880037eda1b8 failed -28
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Maloo report: &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/52c5d99a-b8f9-11e2-891d-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/52c5d99a-b8f9-11e2-891d-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="58256" author="yujian" created="Mon, 13 May 2013 05:22:46 +0000"  >&lt;p&gt;Hi Hongchao,&lt;/p&gt;

&lt;p&gt;It seems the above out-of-space issue on the dd operation is related to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-657&quot; title=&quot;recovery-mds-scale (FLAVOR=MDS): client load failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-657&quot;&gt;&lt;del&gt;LU-657&lt;/del&gt;&lt;/a&gt;. Could you please check whether the original out-of-space issue on the tar operation in this ticket has the same cause? Thanks.&lt;/p&gt;</comment>
                            <comment id="58305" author="adilger" created="Mon, 13 May 2013 18:58:32 +0000"  >&lt;p&gt;Yu Jian,&lt;br/&gt;
is there an expectation that the OST would &lt;em&gt;not&lt;/em&gt; run out of space after writing to it for an hour?  Are the files written earlier in the test being deleted, or will the test fill up all of the OST space?  Does it check the free space on the OST that will be written on, or is this the free space for the whole filesystem and the one file may run out of space?&lt;/p&gt;</comment>
                            <comment id="58469" author="yujian" created="Tue, 14 May 2013 17:35:35 +0000"  >&lt;blockquote&gt;&lt;p&gt;is there an expectation that the OST would not run out of space after writing to it for an hour?&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;Yes, the OST should not run out of space during the whole test run, because in run_{dd,tar,dbench,iozone}.sh each application keeps running inside its own $TESTDIR, which is removed and recreated on each successful loop.&lt;/p&gt;

&lt;blockquote&gt;&lt;p&gt;Are the files written earlier in the test being deleted, or will the test fill up all of the OST space?&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;If the previous failover_mds test passed, then the applications on the clients are just terminated without deleting $TESTDIR and the corresponding files. If the test failed, then the failed application also leaves $TESTDIR and its files undeleted. We need to fix run_*.sh to delete the test dirs in both situations.&lt;/p&gt;

&lt;p&gt;However, it seems the above issue is not the cause of this ticket. Let&apos;s analyze the first Maloo report in this ticket: the failover_mds test failed with &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3325&quot; title=&quot;recovery-mds-scale test_failover_mds: tar: Cannot write: Input/output error&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3325&quot;&gt;&lt;del&gt;LU-3325&lt;/del&gt;&lt;/a&gt; and left the /mnt/lustre/d0.tar-client-32vm6.lab.whamcloud.com dir on client-32vm6. Then the failover_ost test started, and the tar application again ran inside the /mnt/lustre/d0.tar-client-32vm6.lab.whamcloud.com dir on client-32vm6. Since there were several successful loops, that dir was removed and recreated several times, so the dir and files left by failover_mds did not affect failover_ost.&lt;/p&gt;

&lt;blockquote&gt;&lt;p&gt;Does it check the free space on the OST that will be written on, or is this the free space for the whole filesystem and the one file may run out of space?&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;Currently, only run_dd.sh checks the free space before performing the dd operation. We need to improve the other run_*.sh scripts to add this check. Running out of space on an OST is not expected during the failover_ost test.&lt;/p&gt;

&lt;p&gt;Besides improving the run_*.sh scripts, we still need to figure out the real cause of this ticket.&lt;/p&gt;</comment>
                            <comment id="58485" author="jlevi" created="Tue, 14 May 2013 18:32:38 +0000"  >&lt;p&gt;Reducing from blocker but want to continue digging into this to find the cause.&lt;/p&gt;</comment>
                            <comment id="58486" author="adilger" created="Tue, 14 May 2013 18:34:35 +0000"  >&lt;p&gt;One comment is that the previous data was in &lt;tt&gt;/mnt/lustre/d0.tar-client-32vm6.lab.whamcloud.com&lt;/tt&gt; but the failed test was writing to &lt;tt&gt;/mnt/lustre/d0.dd-client-26vm5.lab.whamcloud.com/dd-file&lt;/tt&gt; (i.e. &lt;b&gt;dd&lt;/b&gt; vs &lt;b&gt;tar&lt;/b&gt;), so it is possible that the old directory was not cleaned up?&lt;/p&gt;

&lt;p&gt;It might be useful to print &lt;tt&gt;lfs df&lt;/tt&gt; output in the test script before each &quot;dd&quot; so that we know what the space distribution is.&lt;/p&gt;</comment>
                            <comment id="58546" author="yujian" created="Wed, 15 May 2013 06:28:54 +0000"  >&lt;blockquote&gt;&lt;p&gt;One comment is that the previous data was in /mnt/lustre/d0.tar-client-32vm6.lab.whamcloud.com but the failed test was writing to /mnt/lustre/d0.dd-client-26vm5.lab.whamcloud.com/dd-file (i.e. dd vs tar), so it is possible that the old directory was not cleaned up?&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;For the first Maloo report (in the Description section above), both the failover_mds and failover_ost tests failed at the tar operation on client-32vm6; the dd operation on client-32vm5 did not fail. Per the analysis in the above comment, the dir and files left by failover_mds did not affect failover_ost.&lt;/p&gt;

&lt;p&gt;For the second Maloo report above, failover_mds failed at the tar operation on client-26vm6 and left data in /mnt/lustre/d0.tar-client-26vm6.lab.whamcloud.com, while the dd operation kept succeeding on client-26vm5 and cleaned up its dir before failover_mds stopped. Then failover_ost started: the tar operation still ran on client-26vm6 in /mnt/lustre/d0.tar-client-26vm6.lab.whamcloud.com and did not fail, while the dd operation still ran on client-26vm5 in /mnt/lustre/d0.dd-client-26vm5.lab.whamcloud.com but failed. Again, since there was one successful dd loop before the failure, the /mnt/lustre/d0.dd-client-26vm5.lab.whamcloud.com dir had been removed and recreated, so the dir and files left by failover_mds did not affect failover_ost either.&lt;/p&gt;

&lt;blockquote&gt;&lt;p&gt;It might be useful to run lfs df output in the test script before each &quot;dd&quot; so that we know what the space distribution is.&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;We can see this information in the &quot;run_dd_debug&quot; log in the Maloo reports:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2013-05-08 23:10:39: dd run starting
+ mkdir -p /mnt/lustre/d0.dd-client-26vm5.lab.whamcloud.com
+ /usr/bin/lfs setstripe -c -1 /mnt/lustre/d0.dd-client-26vm5.lab.whamcloud.com
+ cd /mnt/lustre/d0.dd-client-26vm5.lab.whamcloud.com
++ /usr/bin/lfs df /mnt/lustre/d0.dd-client-26vm5.lab.whamcloud.com
+ FREE_SPACE=4050128
+ BLKS=911278
+ echoerr &apos;Free disk space is 4050128, 4k blocks to dd is 911278&apos;
+ echo &apos;Free disk space is 4050128, 4k blocks to dd is 911278&apos;
Free disk space is 4050128, 4k blocks to dd is 911278
+ load_pid=27789
+ wait 27789
+ dd bs=4k count=911278 status=noxfer if=/dev/zero of=/mnt/lustre/d0.dd-client-26vm5.lab.whamcloud.com/dd-file
dd: writing `/mnt/lustre/d0.dd-client-26vm5.lab.whamcloud.com/dd-file&apos;: No space left on device
409345+0 records in
409344+0 records out
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The corresponding code in run_dd.sh:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;while [ ! -e &quot;$END_RUN_FILE&quot; ] &amp;amp;&amp;amp; $CONTINUE; do
        # ......
        FREE_SPACE=$($LFS df $TESTDIR|awk &apos;/filesystem summary:/ {print $5}&apos;)
        BLKS=$((FREE_SPACE * 9 / 40))
        echoerr &quot;Free disk space is $FREE_SPACE, 4k blocks to dd is $BLKS&quot;
        # ......
done
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;It looks like the dd operation keeps occupying too much of the free space (90%), which is likely to make the operations on other clients run out of space, or to make dd itself run out of space, since all of the operations compete for the free space concurrently.&lt;/p&gt;

&lt;p&gt;Since the goal of the recovery-*-scale tests is not to fill up the space but to keep the applications running on different clients, I&apos;d suggest reducing the dd size (for example, using only 50% of the free space, or dynamically calculating the percentage based on the number of clients) and leaving enough free space for the other applications; a sketch of such a calculation is shown below.&lt;/p&gt;
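&lt;p&gt;For illustration only, here is a minimal sketch of that suggestion against the run_dd.sh snippet above; NUM_CLIENTS is a hypothetical variable, and the real scripts may compute this differently:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# Use only 50% of the free space instead of 90%, and split it among
# the concurrent client loads (NUM_CLIENTS is an assumed variable).
FREE_SPACE=$($LFS df $TESTDIR|awk &apos;/filesystem summary:/ {print $5}&apos;)
NUM_CLIENTS=${NUM_CLIENTS:-2}
# FREE_SPACE is in 1k blocks, so FREE_SPACE / 8 is 50% in 4k blocks
BLKS=$((FREE_SPACE / 8 / NUM_CLIENTS))
echoerr &quot;Free disk space is $FREE_SPACE, 4k blocks to dd is $BLKS&quot;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>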
                            <comment id="58775" author="hongchao.zhang" created="Fri, 17 May 2013 16:49:17 +0000"  >&lt;p&gt;the patch is tracked at &lt;a href=&quot;http://review.whamcloud.com/#change,6376&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#change,6376&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="59399" author="yujian" created="Tue, 28 May 2013 03:12:17 +0000"  >&lt;p&gt;Lustre Tag: v2_4_0_RC2&lt;br/&gt;
Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_4/12/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b2_4/12/&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL6.4/x86_64&lt;/p&gt;

&lt;p&gt;The issue occurred on recovery-mds-scale test_failover_ost:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/2e35d81c-c6c5-11e2-ae4e-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/2e35d81c-c6c5-11e2-ae4e-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="64080" author="yujian" created="Mon, 12 Aug 2013 14:11:03 +0000"  >&lt;p&gt;This is blocking the recovery-mds-scale test running on Lustre b2_4 branch:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/ed4a088e-02db-11e3-a4b4-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/ed4a088e-02db-11e3-a4b4-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/edec7172-fd14-11e2-b90c-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/edec7172-fd14-11e2-b90c-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="64453" author="hongchao.zhang" created="Mon, 19 Aug 2013 09:26:36 +0000"  >&lt;p&gt;the patch has been merged into master&lt;/p&gt;</comment>
                            <comment id="64462" author="yujian" created="Mon, 19 Aug 2013 13:52:45 +0000"  >&lt;p&gt;The patch was also cherry-picked to Lustre b2_4 branch.&lt;/p&gt;</comment>
                            <comment id="66060" author="yujian" created="Mon, 9 Sep 2013 14:57:03 +0000"  >&lt;p&gt;Lustre Tag: v2_4_1_RC2&lt;br/&gt;
Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_4/45/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b2_4/45/&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL6.4/x86_64&lt;/p&gt;

&lt;p&gt;The issue still occurred on recovery-mds-scale test_failover_ost:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;tar: etc/yum.repos.d/cobbler-config.repo: Cannot open: No space left on device
tar: etc/yum.repos.d/lustre-build.repo: Cannot open: No space left on device
tar: Exiting with failure status due to previous errors
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The console log on the MDS showed the following:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;11:46:04:Lustre: lustre-OST0004-osc-MDT0000: slow creates, last=[0x100040000:0xbba1:0x0], next=[0x100040000:0xbba1:0x0], reserved=0, syn_changes=54, syn_rpc_in_progress=36, status=-28
11:46:26:LNet: Service thread pid 2233 was inactive for 40.00s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:
11:46:26:Pid: 2233, comm: mdt00_001
11:46:27:
11:46:27:Call Trace:
11:46:27: [&amp;lt;ffffffff810810cc&amp;gt;] ? lock_timer_base+0x3c/0x70
11:46:28: [&amp;lt;ffffffff8150f362&amp;gt;] schedule_timeout+0x192/0x2e0
11:46:28: [&amp;lt;ffffffff810811e0&amp;gt;] ? process_timeout+0x0/0x10
11:46:28: [&amp;lt;ffffffffa04a56d1&amp;gt;] cfs_waitq_timedwait+0x11/0x20 [libcfs]
11:46:28: [&amp;lt;ffffffffa0fb2174&amp;gt;] osp_precreate_reserve+0x5b4/0x1ed0 [osp]
11:46:28: [&amp;lt;ffffffff81063410&amp;gt;] ? default_wake_function+0x0/0x20
11:46:28: [&amp;lt;ffffffffa0fabc75&amp;gt;] osp_declare_object_create+0x155/0x4f0 [osp]
11:46:29: [&amp;lt;ffffffffa0f6b6ed&amp;gt;] lod_qos_declare_object_on+0xed/0x480 [lod]
11:46:29: [&amp;lt;ffffffffa0f6c569&amp;gt;] lod_alloc_qos.clone.0+0xae9/0x1180 [lod]
11:46:30: [&amp;lt;ffffffffa0cbff1f&amp;gt;] ? qsd_op_begin+0x5f/0xb40 [lquota]
11:46:30: [&amp;lt;ffffffffa0f6e65a&amp;gt;] lod_qos_prep_create+0x74a/0x1b14 [lod]
11:46:30: [&amp;lt;ffffffffa097ead2&amp;gt;] ? fld_server_lookup+0x72/0x3d0 [fld]
11:46:30: [&amp;lt;ffffffffa0f6902b&amp;gt;] lod_declare_striped_object+0x14b/0x880 [lod]
11:46:31: [&amp;lt;ffffffff81096d8f&amp;gt;] ? wake_up_bit+0x2f/0x40
11:46:31: [&amp;lt;ffffffffa0f69c71&amp;gt;] lod_declare_object_create+0x511/0x7a0 [lod]
11:46:31: [&amp;lt;ffffffffa0c198cf&amp;gt;] mdd_declare_object_create_internal+0xbf/0x1f0 [mdd]
11:46:31: [&amp;lt;ffffffffa0c28fee&amp;gt;] mdd_declare_create+0x4e/0x870 [mdd]
11:46:31: [&amp;lt;ffffffffa0c277ef&amp;gt;] ? mdd_linkea_prepare+0x23f/0x430 [mdd]
11:46:31: [&amp;lt;ffffffffa0c29fd5&amp;gt;] mdd_create+0x7c5/0x1790 [mdd]
11:46:32: [&amp;lt;ffffffffa0d9b927&amp;gt;] ? osd_xattr_get+0x97/0x2d0 [osd_ldiskfs]
11:46:35: [&amp;lt;ffffffffa0ec808f&amp;gt;] mdt_reint_open+0x135f/0x20c0 [mdt]
11:46:35: [&amp;lt;ffffffffa04c182e&amp;gt;] ? upcall_cache_get_entry+0x28e/0x860 [libcfs]
11:46:35: [&amp;lt;ffffffffa07abdcc&amp;gt;] ? lustre_msg_add_version+0x6c/0xc0 [ptlrpc]
11:46:36: [&amp;lt;ffffffffa063ff50&amp;gt;] ? lu_ucred+0x20/0x30 [obdclass]
11:46:36: [&amp;lt;ffffffffa0eb2911&amp;gt;] mdt_reint_rec+0x41/0xe0 [mdt]
11:46:36: [&amp;lt;ffffffffa0e97ae3&amp;gt;] mdt_reint_internal+0x4c3/0x780 [mdt]
11:46:36: [&amp;lt;ffffffffa0e9806d&amp;gt;] mdt_intent_reint+0x1ed/0x520 [mdt]
11:46:38: [&amp;lt;ffffffffa0e95f1e&amp;gt;] mdt_intent_policy+0x39e/0x720 [mdt]
11:46:38: [&amp;lt;ffffffffa0763831&amp;gt;] ldlm_lock_enqueue+0x361/0x8d0 [ptlrpc]
11:46:38: [&amp;lt;ffffffffa078a1ef&amp;gt;] ldlm_handle_enqueue0+0x4ef/0x10b0 [ptlrpc]
11:46:39: [&amp;lt;ffffffffa0e963a6&amp;gt;] mdt_enqueue+0x46/0xe0 [mdt]
11:46:39: [&amp;lt;ffffffffa0e9ca97&amp;gt;] mdt_handle_common+0x647/0x16d0 [mdt]
11:46:39: [&amp;lt;ffffffffa0ed63f5&amp;gt;] mds_regular_handle+0x15/0x20 [mdt]
11:46:41: [&amp;lt;ffffffffa07bc3c8&amp;gt;] ptlrpc_server_handle_request+0x398/0xc60 [ptlrpc]
11:46:42: [&amp;lt;ffffffffa04a55de&amp;gt;] ? cfs_timer_arm+0xe/0x10 [libcfs]
11:46:42: [&amp;lt;ffffffffa04b6d9f&amp;gt;] ? lc_watchdog_touch+0x6f/0x170 [libcfs]
11:46:43: [&amp;lt;ffffffffa07b3729&amp;gt;] ? ptlrpc_wait_event+0xa9/0x290 [ptlrpc]
11:46:44: [&amp;lt;ffffffff81055ad3&amp;gt;] ? __wake_up+0x53/0x70
11:46:44: [&amp;lt;ffffffffa07bd75e&amp;gt;] ptlrpc_main+0xace/0x1700 [ptlrpc]
11:46:44: [&amp;lt;ffffffffa07bcc90&amp;gt;] ? ptlrpc_main+0x0/0x1700 [ptlrpc]
11:46:44: [&amp;lt;ffffffff8100c0ca&amp;gt;] child_rip+0xa/0x20
11:46:45: [&amp;lt;ffffffffa07bcc90&amp;gt;] ? ptlrpc_main+0x0/0x1700 [ptlrpc]
11:46:45: [&amp;lt;ffffffffa07bcc90&amp;gt;] ? ptlrpc_main+0x0/0x1700 [ptlrpc]
11:46:45: [&amp;lt;ffffffff8100c0c0&amp;gt;] ? child_rip+0x0/0x20
11:46:45:
11:46:46:LustreError: dumping log to /tmp/lustre-log.1378579577.2233
11:46:46:Lustre: lustre-OST0005-osc-MDT0000: slow creates, last=[0x100050000:0xbc41:0x0], next=[0x100050000:0xbc41:0x0], reserved=0, syn_changes=0, syn_rpc_in_progress=10, status=-28
11:46:48:LNet: Service thread pid 2233 completed after 40.02s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources).
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Maloo report: &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/cc263682-194c-11e3-bb73-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/cc263682-194c-11e3-bb73-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="66070" author="yujian" created="Mon, 9 Sep 2013 15:55:05 +0000"  >&lt;p&gt;More instances on Lustre b2_4 branch:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/76525a9c-14fb-11e3-ac48-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/76525a9c-14fb-11e3-ac48-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/89a20084-14fb-11e3-ac48-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/89a20084-14fb-11e3-ac48-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/da5b6604-142f-11e3-80c8-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/da5b6604-142f-11e3-80c8-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/ec68c8a0-142f-11e3-80c8-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/ec68c8a0-142f-11e3-80c8-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="66476" author="hongchao.zhang" created="Thu, 12 Sep 2013 10:44:30 +0000"  >&lt;p&gt;the issue is reproduced locally, and it should be caused by the deletion of the large file created by &quot;dd&quot;, which could contain many journal transactions&lt;br/&gt;
preventing the freed disk space to be used by the following write operations.&lt;/p&gt;

&lt;p&gt;the patch is under creation&amp;amp;test.&lt;/p&gt;</comment>
                            <comment id="66707" author="hongchao.zhang" created="Mon, 16 Sep 2013 07:38:33 +0000"  >&lt;p&gt;the patch is tracked at &lt;a href=&quot;http://review.whamcloud.com/#/c/7662/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/7662/&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;local reproducer will fail about 10 minutes without the patch, and run more than 6 hours successfully with the patch applied.&lt;/p&gt;</comment>
                            <comment id="67384" author="jlevi" created="Tue, 24 Sep 2013 16:48:07 +0000"  >&lt;p&gt;Please complete the patch for Master before completing b2_4.&lt;/p&gt;</comment>
                            <comment id="67385" author="adilger" created="Tue, 24 Sep 2013 16:48:21 +0000"  >&lt;p&gt;Please submit the patch for master first.&lt;/p&gt;</comment>
                            <comment id="69875" author="hongchao.zhang" created="Fri, 25 Oct 2013 11:13:32 +0000"  >&lt;p&gt;the patch for master is tracked at &lt;a href=&quot;http://review.whamcloud.com/#/c/8071/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/8071/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="72559" author="hongchao.zhang" created="Mon, 2 Dec 2013 07:18:22 +0000"  >&lt;p&gt;there are two options to fix the issue,&lt;br/&gt;
1, modify the test script &quot;run_dd.sh&quot; to do some extra work to wait for the completion of the destroy of the object created by &quot;dd&quot; (preferred)&lt;br/&gt;
2, modify Lustre itself to wait for the destroy of the objects when it encounters the -ENOSPC during writing.&lt;/p&gt;

&lt;p&gt;Hi Yujian, &lt;br/&gt;
could you please take care of it during my leave for vacation, thanks!&lt;/p&gt;</comment>
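&lt;p&gt;A minimal sketch of the first option, assuming a polling helper added to run_dd.sh; the helper name, the use of $MOUNT, and the timeouts are illustrative only, and the actual patch may differ:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# After &quot;rm -rf $TESTDIR&quot;, poll the free space until it stops
# changing, i.e. the OST object destroys (and the journal commits
# that release the blocks) have completed.
wait_delete_completed() {
    local prev=-1
    local cur i
    for i in $(seq 20); do
        sync
        cur=$($LFS df $MOUNT | awk &apos;/filesystem summary:/ {print $5}&apos;)
        [ &quot;$cur&quot; -eq &quot;$prev&quot; ] &amp;amp;&amp;amp; return 0
        prev=$cur
        sleep 3
    done
    return 1
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>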
                            <comment id="72564" author="yujian" created="Mon, 2 Dec 2013 10:21:17 +0000"  >&lt;blockquote&gt;&lt;p&gt;could you please take care of it during my leave for vacation, thanks!&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;Sure, I just updated the patch with the first method.&lt;/p&gt;

&lt;p&gt;Patch for Lustre b2_4 branch is in &lt;a href=&quot;http://review.whamcloud.com/8447&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/8447&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="72887" author="adilger" created="Thu, 5 Dec 2013 11:05:43 +0000"  >&lt;p&gt;I don&apos;t see how fixing the test script avoids problems that users might also hit?&lt;/p&gt;

&lt;p&gt;I recall that there are some fixes in patch 8071 which are useful to fix the OST-side lock cancellation and client-side page discard, though I haven&apos;t looked at the latest patch version.  It would be useful to verify whether this correctly causes unwritten pages to be discarded on the client.&lt;/p&gt;</comment>
                            <comment id="72889" author="yujian" created="Thu, 5 Dec 2013 11:59:50 +0000"  >&lt;blockquote&gt;&lt;p&gt;I recall that there are some fixes in patch 8071 which are useful to fix the OST-side lock cancellation and client-side page discard&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;The non-script part of the changes in &lt;a href=&quot;http://review.whamcloud.com/8071&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/8071&lt;/a&gt; is in the file lustre/include/lustre_dlm_flags.h:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;-#define LDLM_FL_AST_MASK 0x0000000080008000ULL
+#define LDLM_FL_AST_MASK                0x0000000000018000ULL
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Should I keep this change and drop the script changes?&lt;/p&gt;</comment>
                            <comment id="72890" author="yujian" created="Thu, 5 Dec 2013 12:09:33 +0000"  >&lt;p&gt;In addition, lustre/include/lustre_dlm_flags.h was added by &lt;a href=&quot;http://review.whamcloud.com/5312&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5312&lt;/a&gt;, which does not exist on Lustre b2_4 branch.&lt;/p&gt;</comment>
                            <comment id="72962" author="adilger" created="Fri, 6 Dec 2013 07:54:36 +0000"  >&lt;p&gt;Yu Jian, I think the code change in 8071 is potentially in the right area, but not necessarily the right fix.  I &lt;em&gt;think&lt;/em&gt; this will change the wire protocol to use a different flag (LDLM_FL_DISCARD_DATA vs. LDLM_FL_AST_DISCARD_DATA) in the wire protocol, but I&apos;m not positive. It would be great to test this properly - write a large file in the background, delete it, and then check the kernel debug logs to see if more pages are being written or if they are being discarded.&lt;/p&gt;</comment>
                            <comment id="72965" author="yujian" created="Fri, 6 Dec 2013 09:30:45 +0000"  >&lt;p&gt;I applied the LDLM_FL_AST_MASK change from &lt;a href=&quot;http://review.whamcloud.com/8071&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/8071&lt;/a&gt; to the latest master branch and created &lt;a href=&quot;http://review.whamcloud.com/8495&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/8495&lt;/a&gt; to get a new build.&lt;/p&gt;

&lt;p&gt;The following recovery-mds-scale test_failover_mds was performed on that build:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/sub_tests/9752d892-5e8a-11e3-a925-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/sub_tests/9752d892-5e8a-11e3-a925-52540035b04c&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The &quot;run_dd_debug&quot; log on Client 2 (wtm-68) showed the following:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Total free disk space is 10589812, 4k blocks to dd is 2382707
+ pdsh -t 300 -S -w &apos;wtm-[67,68,71,72]&apos; &apos;export PATH=$PATH:/sbin:/usr/sbin; lctl set_param debug=-1&apos;
+ pdsh -t 300 -S -w &apos;wtm-[67,68,71,72]&apos; &apos;export PATH=$PATH:/sbin:/usr/sbin; lctl set_param debug_mb=150&apos;
+ pdsh -t 300 -S -w &apos;wtm-[67,68,71,72]&apos; &apos;export PATH=$PATH:/sbin:/usr/sbin; lctl dk &amp;gt; /dev/null&apos;
+ load_pid=42589
+ wait 42589
+ dd bs=4k count=2382707 status=noxfer if=/dev/zero of=/mnt/lustre/d0.dd-wtm-68/dd-file
2382707+0 records in
2382707+0 records out
+ &apos;[&apos; 0 -eq 0 &apos;]&apos;
++ date &apos;+%F %H:%M:%S&apos;
+ echoerr &apos;2013-12-06 06:19:18: dd succeeded&apos;
+ echo &apos;2013-12-06 06:19:18: dd succeeded&apos;
2013-12-06 06:19:18: dd succeeded
+ cd /tmp
+ rm -rf /mnt/lustre/d0.dd-wtm-68
+ pdsh -t 300 -S -w &apos;wtm-[67,68,71,72]&apos; &apos;export PATH=$PATH:/sbin:/usr/sbin;
			lctl dk &amp;gt; /scratch/jianyu/test_logs/2013-12-06/060035/recovery-mds-scale.test_failover_mds.run_dd_dk.$(hostname -s)_$(date +%s).log&apos;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The lctl debug logs are in &quot;run_dd_dk_1386339559&quot; and &quot;run_dd_dk_1386339667&quot; (there are two separate dd runs) in the above Maloo report.&lt;/p&gt;

&lt;p&gt;I found ofd_destroy_by_fid() in the OSS (wtm-72) lctl debug logs. We need to look into the logs more deeply.&lt;/p&gt;</comment>
                            <comment id="73188" author="yujian" created="Tue, 10 Dec 2013 12:55:18 +0000"  >&lt;p&gt;In &quot;run_dd_dk_1386339667&quot; debug logs on OSS (wtm-72), I found that:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;00010000:00010000:18.0:1386339564.644518:0:33665:0:(ldlm_lockd.c:848:ldlm_server_blocking_ast()) ### server preparing blocking AST ns: filter-lustre-OST0003_UUID lock: ffff880416972cc0/0xe8fc4c150d5421b0 lrc: 3/0,0 mode: PW/PW res: [0x2:0x0:0x0].0 rrc: 2 type: EXT [0-&amp;gt;18446744073709551615] (req 0-&amp;gt;4095) flags: 0x50000000010020 nid: 10.10.18.127@tcp remote: 0xe0b36e7f4f2635f3 expref: 7 pid: 38018 timeout: 0 lvb_type: 0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;A blocking AST RPC was sent to the client.&lt;/p&gt;

&lt;p&gt;And on the client (wtm-68), in &quot;run_dd_dk_1386339667&quot;:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;00000020:00000001:14.0:1386339585.987516:0:40723:0:(cl_page.c:420:cl_page_put()) Process entered
00000020:00000001:14.0:1386339585.987517:0:40723:0:(cl_page.c:422:cl_page_put()) page@ffff880746ae8600[1 ffff8805e1f6ac30:397170 ^(null)_(null) 4 0 1 (null) (null) 0x0]
00000020:00000001:14.0:1386339585.987518:0:40723:0:(cl_page.c:422:cl_page_put()) 1
00000020:00000001:14.0:1386339585.987518:0:40723:0:(cl_page.c:160:cl_page_free()) Process entered
00000020:00000001:14.0:1386339585.987519:0:40723:0:(lustre_fid.h:719:fid_flatten32()) Process leaving (rc=788528898 : 788528898 : 2effff02)
00000020:00000010:14.0:1386339585.987519:0:40723:0:(cl_page.c:174:cl_page_free()) kfreed &apos;page&apos;: 440 at ffff880746ae8600.
00000020:00000001:14.0:1386339585.987520:0:40723:0:(cl_page.c:175:cl_page_free()) Process leaving
00000020:00000001:14.0:1386339585.987520:0:40723:0:(cl_page.c:437:cl_page_put()) Process leaving
00020000:00000001:14.0:1386339585.987520:0:40723:0:(lov_page.c:82:lov_page_fini()) Process leaving
00000020:00000001:14.0:1386339585.987521:0:40723:0:(lustre_fid.h:719:fid_flatten32()) Process leaving (rc=4194307 : 4194307 : 400003)
00000020:00000010:14.0:1386339585.987521:0:40723:0:(cl_page.c:174:cl_page_free()) kfreed &apos;page&apos;: 304 at ffff880746ae8800.
00000020:00000001:14.0:1386339585.987522:0:40723:0:(cl_page.c:175:cl_page_free()) Process leaving
00000020:00000001:14.0:1386339585.987522:0:40723:0:(cl_page.c:437:cl_page_put()) Process leaving
00000008:00000001:14.0:1386339585.987523:0:40723:0:(osc_cache.c:3137:osc_page_gang_lookup()) Process leaving (rc=0 : 0 : 0)

00000008:00000001:14.0:1386339585.987534:0:40723:0:(osc_cache.c:3246:osc_lock_discard_pages()) Process leaving (rc=0 : 0 : 0)
00000020:00001000:14.0:1386339585.987535:0:40723:0:(cl_object.c:971:cl_env_put()) 2@ffff88081d244a68
00000008:00000001:14.0:1386339585.987536:0:40723:0:(osc_lock.c:1376:osc_lock_flush()) Process leaving (rc=0 : 0 : 0)
00010000:00000001:14.0:1386339585.987538:0:40723:0:(ldlm_request.c:1353:ldlm_cli_cancel()) Process entered
00010000:00000001:14.0:1386339585.987543:0:40723:0:(ldlm_request.c:1122:ldlm_cli_cancel_local()) Process entered
00010000:00010000:14.0:1386339585.987543:0:40723:0:(ldlm_request.c:1127:ldlm_cli_cancel_local()) ### client-side cancel ns: lustre-OST0003-osc-ffff880435738000 lock: ffff88083821d980/0xe0b36e7f4f2635f3 lrc: 4/0,0 mode: PW/PW res: [0x2:0x0:0x0].0 rrc: 1 type: EXT [0-&amp;gt;18446744073709551615] (req 0-&amp;gt;4095) flags: 0x428400010000 nid: local remote: 0xe8fc4c150d5421b0 expref: -99 pid: 42589 timeout: 0 lvb_type: 1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;osc_lock_discard_pages() was called in osc_lock_flush(); however, the following code path was not covered in the debug logs:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;                if (descr-&amp;gt;cld_mode &amp;gt;= CLM_WRITE) {
                        result = osc_cache_writeback_range(env, obj,
                                        descr-&amp;gt;cld_start, descr-&amp;gt;cld_end,
                                        1, discard);
                        LDLM_DEBUG(ols-&amp;gt;ols_lock,
                                &quot;lock %p: %d pages were %s.\n&quot;, lock, result,
                                discard ? &quot;discarded&quot; : &quot;written&quot;);
                        if (result &amp;gt; 0)
                                result = 0;
                }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I just started the testing again to repeat the &quot;write a large file in the background and delete it&quot; operations more times.&lt;/p&gt;</comment>
                            <comment id="73283" author="yujian" created="Wed, 11 Dec 2013 15:12:26 +0000"  >&lt;p&gt;I performed the test to dd a large file in the background and then delete the file from another client. Unfortunately, the debug logs did not show &quot;pages were discarded&quot; info on the clients, and there were also no &quot;server preparing blocking AST&quot; info on the OSS node. Still digging.&lt;/p&gt;</comment>
                            <comment id="74029" author="yujian" created="Mon, 23 Dec 2013 15:19:29 +0000"  >&lt;p&gt;Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_4/70/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b2_4/70/&lt;/a&gt; (2.4.2 RC2)&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/34a4c608-6be4-11e3-a73e-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/34a4c608-6be4-11e3-a73e-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="74147" author="hongchao.zhang" created="Sun, 29 Dec 2013 13:41:41 +0000"  >&lt;p&gt;local test with two nodes, the server node runs master and client runs b2_1, b2_3, b2_4 or master, the pages were not discarded after the lock&apos;s blocking AST&lt;br/&gt;
is received, and will discard the pages after adding &quot;LDLM_FL_DISCARD_DATA&quot; instead of &quot;LDLM_FL_AST_DISCARD_DATA&quot; into &quot;LDLM_FL_AST_MASK&quot;.&lt;/p&gt;

&lt;p&gt;LDLM_AST_DISCARD_DATA (changed to LDLM_FL_AST_DISCARD_DATA in master) is used to indicate ldlm to sent blocking AST with LDLM_FL_DISCARD_DATA flag,&lt;br/&gt;
but it self is not on the wire, then the LDLM_FL_DISCARD_DATA will be ignored for it will be masked by &quot;LDLM_FL_AST_MASK&quot;.&lt;/p&gt;

&lt;p&gt;code snippet in master&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;void ldlm_add_bl_work_item(struct ldlm_lock *lock, struct ldlm_lock *new,
                           cfs_list_t *work_list)
{
        if (!ldlm_is_ast_sent(lock)) {
                LDLM_DEBUG(lock, &quot;lock incompatible; sending blocking AST.&quot;);
                ldlm_set_ast_sent(lock);
                /* If the enqueuing client said so, tell the AST recipient to
                 * discard dirty data, rather than writing back. */
                if (ldlm_is_ast_discard_data(new))         &amp;lt;---- check LDLM_FL_AST_DISCARD_DATA
                        ldlm_set_discard_data(lock);       &amp;lt;---- set LDLM_FL_DISCARD_DATA
                LASSERT(cfs_list_empty(&amp;amp;lock-&amp;gt;l_bl_ast));
                cfs_list_add(&amp;amp;lock-&amp;gt;l_bl_ast, work_list);
                LDLM_LOCK_GET(lock);
                LASSERT(lock-&amp;gt;l_blocking_lock == NULL);
                lock-&amp;gt;l_blocking_lock = LDLM_LOCK_GET(new);
        }
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The corresponding patch is tracked at &lt;a href=&quot;http://review.whamcloud.com/#/c/8671/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/8671/&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="74330" author="yujian" created="Sat, 4 Jan 2014 11:52:46 +0000"  >&lt;p&gt;Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_5/5/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b2_5/5/&lt;/a&gt;&lt;br/&gt;
Test Group: failover&lt;/p&gt;

&lt;p&gt;The same failure occurred:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/c86198c4-7505-11e3-95ae-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/c86198c4-7505-11e3-95ae-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="74472" author="yujian" created="Tue, 7 Jan 2014 10:04:10 +0000"  >&lt;p&gt;More instances on Lustre b2_5 branch:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/fcbcabd0-770e-11e3-b181-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/fcbcabd0-770e-11e3-b181-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/e669c6a8-8643-11e3-9f3f-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/e669c6a8-8643-11e3-9f3f-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/fc5c9556-8505-11e3-8da9-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/fc5c9556-8505-11e3-8da9-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/f16b8720-9922-11e3-83d7-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/f16b8720-9922-11e3-83d7-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/7d4fc910-956b-11e3-936f-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/7d4fc910-956b-11e3-936f-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="78830" author="yujian" created="Sun, 9 Mar 2014 09:43:47 +0000"  >&lt;p&gt;Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_5/39/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b2_5/39/&lt;/a&gt; (2.5.1 RC1)&lt;br/&gt;
Distro/Arch: RHEL6.5/x86_64&lt;br/&gt;
Test Group: failover&lt;/p&gt;

&lt;p&gt;The same failure occurred:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/36c8c4fe-a657-11e3-a191-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/36c8c4fe-a657-11e3-a191-52540035b04c&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Many sub-tests of replay-single in the failover test group also hit the &quot;No space left on device&quot; failure:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/c55e55ee-a657-11e3-a191-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/c55e55ee-a657-11e3-a191-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="80674" author="yujian" created="Tue, 1 Apr 2014 08:32:48 +0000"  >&lt;p&gt;This is blocking recovery-mds-scale testing.&lt;/p&gt;</comment>
                            <comment id="84357" author="yujian" created="Mon, 19 May 2014 05:47:34 +0000"  >&lt;p&gt;Hi Hongchao,&lt;/p&gt;

&lt;p&gt;Could you please drive the patches &lt;a href=&quot;http://review.whamcloud.com/8671&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/8671&lt;/a&gt; (which needs to address the comment from Andreas) and &lt;a href=&quot;http://review.whamcloud.com/8071&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/8071&lt;/a&gt; to land? Thanks. &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3326&quot; title=&quot;recovery-mds-scale test_failover_ost: tar: Cannot open: No space left on device&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3326&quot;&gt;&lt;del&gt;LU-3326&lt;/del&gt;&lt;/a&gt; has been blocking MDS/OST hard failover testing in Lustre release testing cycles.&lt;/p&gt;</comment>
                            <comment id="86410" author="hongchao.zhang" created="Thu, 12 Jun 2014 09:02:44 +0000"  >&lt;p&gt;updated the two patches of this ticket&lt;/p&gt;

&lt;p&gt;(1) &lt;a href=&quot;http://review.whamcloud.com/8671&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/8671&lt;/a&gt;&lt;br/&gt;
as per the comments in the Lustre source, LDLM_FL_AST_DISCARD_DATA is also designed to add the &quot;FL_DISCARD&quot; flag to blocking ASTs:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;/**
 * These are flags that are mapped into the flags and ASTs of blocking
 * locks */
/** Add FL_DISCARD to blocking ASTs */
#define LDLM_FL_AST_DISCARD_DATA        0x0000000080000000ULL // bit  31
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt; 

&lt;p&gt;(2) &lt;a href=&quot;http://review.whamcloud.com/8071&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/8071&lt;/a&gt;&lt;br/&gt;
waits for the deletion of objects to complete before running the various tests on the client node.&lt;/p&gt;</comment>
                            <comment id="91475" author="yujian" created="Tue, 12 Aug 2014 21:05:22 +0000"  >&lt;p&gt;The back-ported patch for &lt;a href=&quot;http://review.whamcloud.com/8071&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/8071&lt;/a&gt; on Lustre b2_5 branch is in &lt;a href=&quot;http://review.whamcloud.com/11425&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/11425&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="92055" author="pjones" created="Wed, 20 Aug 2014 15:19:10 +0000"  >&lt;p&gt;Test fixes landed for 2.5.3 and 2.7. Residual patch landing will be tracked under a new ticket.&lt;/p&gt;</comment>
                            <comment id="92090" author="yujian" created="Wed, 20 Aug 2014 23:36:08 +0000"  >&lt;p&gt;The new ticket is &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5526&quot; title=&quot;recovery-mds-scale test failover_mds: dd: No space left on device&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5526&quot;&gt;&lt;del&gt;LU-5526&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="25235">LU-5235</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="26094">LU-5526</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvqov:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>8208</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>