<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:31:41 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary, append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-10059] sanityn test_32a: wrong file size</title>
                <link>https://jira.whamcloud.com/browse/LU-10059</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This issue was created by maloo for Bob Glossman &amp;lt;bob.glossman@intel.com&amp;gt;&lt;/p&gt;

&lt;p&gt;This issue relates to the following test suite run: &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/12f0d24e-a732-11e7-b786-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/12f0d24e-a732-11e7-b786-5254006e85c2&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;The sub-test test_32a failed with the following error:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;wrong file size
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Please provide additional information about the failure here.&lt;/p&gt;

&lt;p&gt;Info required for matching: sanityn 32a&lt;/p&gt;</description>
                <environment></environment>
        <key id="48556">LU-10059</key>
            <summary>sanityn test_32a: wrong file size</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="wc-triage">WC Triage</assignee>
                                    <reporter username="maloo">Maloo</reporter>
                        <labels>
                            <label>DNE</label>
                    </labels>
                <created>Mon, 2 Oct 2017 20:37:47 +0000</created>
                <updated>Mon, 16 May 2022 21:51:26 +0000</updated>
                            <resolved>Thu, 17 Mar 2022 16:06:33 +0000</resolved>
                                    <version>Lustre 2.11.0</version>
                    <version>Lustre 2.12.0</version>
                    <version>Lustre 2.13.0</version>
                    <version>Lustre 2.12.2</version>
                    <version>Lustre 2.14.0</version>
                    <version>Lustre 2.12.6</version>
                                                        <due></due>
                            <votes>1</votes>
                                    <watches>11</watches>
                                                                            <comments>
                            <comment id="211646" author="yujian" created="Mon, 23 Oct 2017 08:05:40 +0000"  >&lt;p&gt;More failure instance:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/937fd378-b3f4-11e7-a282-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/937fd378-b3f4-11e7-a282-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="214104" author="yong.fan" created="Mon, 20 Nov 2017 03:02:03 +0000"  >&lt;p&gt;+1 on master:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/4b6b05f2-cd8b-11e7-a066-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/4b6b05f2-cd8b-11e7-a066-52540065bddc&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="215527" author="tappro" created="Thu, 7 Dec 2017 08:27:25 +0000"  >&lt;p&gt;&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/7fef1724-db22-11e7-9c63-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/7fef1724-db22-11e7-9c63-52540065bddc&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="227781" author="adilger" created="Sun, 13 May 2018 14:52:58 +0000"  >&lt;p&gt;+1 on master&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/45ffb738-569e-11e8-b303-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/45ffb738-569e-11e8-b303-52540065bddc&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="240408" author="pfarrell" created="Sun, 20 Jan 2019 15:24:17 +0000"  >&lt;p&gt;This test does not disable lockless truncate on failure, which is not good, since it&apos;s not generally safe.&#160; I realized this from:&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/test_sets/b5edcc08-1b7c-11e9-8388-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/b5edcc08-1b7c-11e9-8388-52540065bddc&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Which panicked with an error on truncate.&lt;/p&gt;

&lt;p&gt;From 32a:&lt;br/&gt;
&quot; log &quot;checking cached lockless truncate&quot;&lt;br/&gt;
 $TRUNCATE $DIR1/$tfile 8000000&lt;br/&gt;
 $CHECKSTAT -s 8000000 $DIR2/$tfile || error &quot;wrong file size&quot;&quot;&lt;/p&gt;

&lt;p&gt;That&apos;s almost certainly why &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10891&quot; title=&quot;sanityn test 77a, 77b, 77c, 77d, 77e, 77f, 77j and 77k all fail after 32a with &amp;#39;dd at *MB on client failed (2)&amp;#39;&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10891&quot;&gt;&lt;del&gt;LU-10891&lt;/del&gt;&lt;/a&gt; is happening after this failure.&lt;/p&gt;</comment>
                            <comment id="240409" author="pfarrell" created="Sun, 20 Jan 2019 15:25:39 +0000"  >&lt;p&gt;Two thoughts.&lt;/p&gt;
&lt;ol&gt;
	&lt;li&gt;Fix the test.&#160; That&apos;s easy.&#160; I&apos;ll push a patch.&lt;/li&gt;
	&lt;li&gt;Should we remove lockless truncate?&#160; It&apos;s not safe, so it&apos;s off, so it&apos;s not used AFAIK...&#160; I&apos;ll dig in to this a little more before I go further, but I think we should consider just removing it.&#160; (Especially since its own test isn&apos;t 100% reliable.&#160; That may be a bug in lockless truncate too...)&lt;/li&gt;
&lt;/ol&gt;
</comment>
                            <comment id="240411" author="gerrit" created="Sun, 20 Jan 2019 15:55:30 +0000"  >&lt;p&gt;Patrick Farrell (pfarrell@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/34070&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/34070&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10059&quot; title=&quot;sanityn test_32a: wrong file size&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10059&quot;&gt;&lt;del&gt;LU-10059&lt;/del&gt;&lt;/a&gt; tests: sanityn 32a restore parameters&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 7dc5954230ef2933242bfcb3312f3e761021f3b2&lt;/p&gt;</comment>
                            <comment id="240421" author="adilger" created="Sun, 20 Jan 2019 22:59:52 +0000"  >&lt;p&gt;We introduced lockless truncate to handle cases where lots of clients were (incorrectly) opening the same file with &lt;tt&gt;O_TRUNC&lt;/tt&gt;, but it caused the server to thrash.  I don&apos;t know that lockless truncate has a wide applicability, but it would be good (if practical) to fix it.  I think more important than lockless truncate is lockless small/unaligned write to have low latency IOPS for poorly-formed IO.&lt;/p&gt;</comment>
                            <comment id="240458" author="pfarrell" created="Mon, 21 Jan 2019 16:27:28 +0000"  >&lt;p&gt;I dug through it a bit, and the proximate cause of the failures in 32a seems to be an eviction.&#160; Can&apos;t find it in the server logs currently...&#160; If I &lt;b&gt;can&lt;/b&gt; manage that (or reproduce it locally, I&apos;ve got the test looping...), I&apos;ll take at least a quick look.&lt;/p&gt;

&lt;p&gt;I took a look at lockless truncate and convinced myself I mostly understand the basic design, but the implementation has clearly bit rotted a bit.&#160; It never worked fully in 2.x (and in fact, maybe not even in 1.8 - there&apos;s comments about it being disabled there too).&#160; I&apos;m not sure about the lockless i/o situation more generally - That design is a bit murkier to me without investing more time.&lt;/p&gt;

&lt;p&gt;It&apos;s compelling stuff, but unless we want to try to do a quick-ish fix for &lt;b&gt;just&lt;/b&gt; lockless truncate, I think it might be a lot of work to fix.&#160; And maybe even then.&lt;/p&gt;</comment>
                            <comment id="240471" author="pfarrell" created="Mon, 21 Jan 2019 18:16:07 +0000"  >&lt;p&gt;OK, I reproduced and have mostly figured out the lockless truncate issue.&lt;/p&gt;

&lt;p&gt;At least the issue I&apos;m seeing here happens 100% of the time in this situation:&lt;/p&gt;

&lt;p&gt;Truncate on the same client which did the writing, and truncate to a size in the middle of an extent (for example, in the test, write 8 MiB and truncate to decimal 8 million.&#160; This happens if you have 8 or more OSTs).&lt;/p&gt;


&lt;p&gt;This results in a partial extent remaining on the client after the truncate:&lt;/p&gt;

&lt;p&gt;in osc_cache_truncate_start:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;} else {
    /* this must be an overlapped extent which means only
     * part of pages in this extent have been truncated.
     */
    EASSERTF(ext-&amp;gt;oe_start &amp;lt;= index, ext,
             &quot;trunc index = %lu/%d.\n&quot;, index, partial);
    /* fix index to skip this partially truncated extent */
    index = ext-&amp;gt;oe_end + 1;
    partial = false;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;


&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;...&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;That extent is then placed in the&#160;&amp;amp;oio-&amp;gt;oi_trunc pointer, to be added back to the cache at the end of the i/o (osc_io_setattr_end)&#160;:&lt;/p&gt;

&lt;p&gt;osc_cache_truncate_end(env, oio-&amp;gt;oi_trunc);&lt;/p&gt;



&lt;p&gt;The key thing is this:&lt;br/&gt;
That extent still exists and is attached to the relevant LDLM lock (the one used to write it out).&#160; But since we&apos;re doing a lockless truncate, we send the punch request to the server without any LDLM locking locally, so the server tries to take the lock, and tries to call back the client write lock so it can do the truncate.&lt;/p&gt;

&lt;p&gt;It looks like we also have to avoid writing back this extent:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;/* we need to hold this extent in OES_TRUNC state so
 * that no writeback will happen. This is to avoid
 * BUG 17397.
 * Only partial truncate can reach here, if @size is
 * not zero, the caller should provide a valid @extp. */
LASSERT(*extp == NULL);
*extp = osc_extent_get(ext);
OSC_EXTENT_DUMP(D_CACHE, ext,
                &quot;trunc at %llu\n&quot;, size);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;




&lt;p&gt;This does suggest one possible route forward - This bug at least would be avoided by limiting lockless truncate to &quot;truncate to zero&quot;.&lt;/p&gt;

&lt;p&gt;That&apos;s probably valuable, as you noted the O_TRUNC case is of significant interest.&lt;/p&gt;</comment>
                            <comment id="240477" author="adilger" created="Mon, 21 Jan 2019 19:16:36 +0000"  >&lt;p&gt;It also doesn&apos;t make sense to do &quot;lockless&quot; truncate for an extent that is already cached locally under a DLM lock. In that case the client should just send the truncate and advertise that it already has the lock for that extent. &lt;/p&gt;</comment>
                            <comment id="240485" author="pfarrell" created="Mon, 21 Jan 2019 21:02:18 +0000"  >&lt;p&gt;Right, but the locking has already been done by the time we learn this (cl_io_lock vs cl_io_start), and there&apos;s no guarantee the lock the client holds is sufficient for the truncate.&#160; We&apos;d have to restart the i/o here to get the locking, and after looking at it...&#160; It&apos;s a decent bit of work.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;I would consider reviving lockless truncate, but only for truncate to zero.&#160; I think that would work as is, with minimal effort.&#160; I&apos;m going to push a patch to enable lockless truncate but only for truncate to zero, along with a quick test, just to see if anything interesting happens in autotest.&lt;/p&gt;</comment>
                            <comment id="240492" author="pfarrell" created="Mon, 21 Jan 2019 21:50:39 +0000"  >&lt;p&gt;Alternately, it could both deadlock and panic in local testing.&lt;/p&gt;

&lt;p&gt;I&apos;m going to table this and push a patch to add lockless truncate to ALWAYS_EXCEPT.&lt;/p&gt;</comment>
                            <comment id="240494" author="gerrit" created="Mon, 21 Jan 2019 21:53:17 +0000"  >&lt;p&gt;Patrick Farrell (pfarrell@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/34081&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/34081&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10059&quot; title=&quot;sanityn test_32a: wrong file size&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10059&quot;&gt;&lt;del&gt;LU-10059&lt;/del&gt;&lt;/a&gt; tests: Disable lockless truncate test&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: d7f4f322514b522d2a23ce3a698e56a768e4bfbb&lt;/p&gt;</comment>
                            <comment id="240938" author="gerrit" created="Wed, 30 Jan 2019 02:41:38 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/34070/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/34070/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10059&quot; title=&quot;sanityn test_32a: wrong file size&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10059&quot;&gt;&lt;del&gt;LU-10059&lt;/del&gt;&lt;/a&gt; tests: sanityn 32a restore parameters&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 62b57e34d9a0df1ce4b82650d7e328db5d048b39&lt;/p&gt;</comment>
                            <comment id="256390" author="adilger" created="Tue, 15 Oct 2019 00:52:13 +0000"  >&lt;p&gt;+1 on master &lt;a href=&quot;https://testing.whamcloud.com/test_sets/e9e439c2-eedd-11e9-add9-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/e9e439c2-eedd-11e9-add9-52540065bddc&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="257304" author="adilger" created="Wed, 30 Oct 2019 01:45:51 +0000"  >&lt;p&gt;This hit 6x in the past week.&lt;/p&gt;</comment>
                            <comment id="264135" author="emoly.liu" created="Thu, 27 Feb 2020 04:32:36 +0000"  >&lt;p&gt;+1 on master: &lt;a href=&quot;https://testing.whamcloud.com/test_sets/a345510a-c777-4bc1-8c30-2413be63a24a&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/a345510a-c777-4bc1-8c30-2413be63a24a&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="283851" author="jamesanunez" created="Fri, 30 Oct 2020 17:34:52 +0000"  >&lt;p&gt;It looks like we are still seeing this issue. We see two different errors/situations in the suite log.&lt;/p&gt;

&lt;p&gt;One of the errors is &#8216;can&#8217;t lstat&#8217; with no complaint from truncate&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;== sanityn test 32a: lockless truncate =============================================================== 17:20:44 (1603992044)
CMD: trevis-9vm6 /usr/sbin/lctl get_param -n lod.lustre-MDT0000*.stripesize
CMD: trevis-9vm3.trevis.whamcloud.com params=\$(/usr/sbin/lctl get_param osc.*.lockless_truncate);
			 [[ -z \&quot;\&quot; ]] &amp;amp;&amp;amp; param= ||
			 param=\$(grep  &amp;lt;&amp;lt;&amp;lt; \&quot;\$params\&quot;);
			 [[ -z \$param ]] &amp;amp;&amp;amp; param=\&quot;\$params\&quot;;
			 while read s; do echo client \$s;
			 done &amp;lt;&amp;lt;&amp;lt; \&quot;\$param\&quot;
checking cached lockless truncate
Can&apos;t lstat /mnt/lustre2/f32a.sanityn: Input/output error
 sanityn test_32a: @@@@@@ FAIL: wrong file size 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;


&lt;p&gt;We see this error for&lt;br/&gt;
2.12.5.67 - &lt;a href=&quot;https://testing.whamcloud.com/test_sets/9881eb9e-1130-4da5-9312-a4451d67c59c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/9881eb9e-1130-4da5-9312-a4451d67c59c&lt;/a&gt;&lt;br/&gt;
2.13.55.104 - &lt;a href=&quot;https://testing.whamcloud.com/test_sets/7ad28649-b4a5-458a-8b3f-a08820a4b85c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/7ad28649-b4a5-458a-8b3f-a08820a4b85c&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The other error we are seeing is a truncate error and a report of a different file size&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;== sanityn test 32a: lockless truncate =============================================================== 18:43:28 (1601491408)
CMD: trevis-65vm4 /usr/sbin/lctl get_param -n lod.lustre-MDT0000*.stripesize
CMD: trevis-65vm1.trevis.whamcloud.com params=\$(/usr/sbin/lctl get_param osc.*.lockless_truncate);
			 [[ -z \&quot;\&quot; ]] &amp;amp;&amp;amp; param= ||
			 param=\$(grep  &amp;lt;&amp;lt;&amp;lt; \&quot;\$params\&quot;);
			 [[ -z \$param ]] &amp;amp;&amp;amp; param=\&quot;\$params\&quot;;
			 while read s; do echo client \$s;
			 done &amp;lt;&amp;lt;&amp;lt; \&quot;\$param\&quot;
checking cached lockless truncate
truncate: cannot truncate &apos;/mnt/lustre/f32a.sanityn&apos; to length 8000000: Input/output error
/mnt/lustre2/f32a.sanityn has size 7340032, not 8000000
 sanityn test_32a: @@@@@@ FAIL: wrong file size 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;We see this error for&lt;br/&gt;
2.12.5.50 - &lt;a href=&quot;https://testing.whamcloud.com/test_sets/7c82a5a3-67f9-4d9e-996b-e6584cbad2d3&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/7c82a5a3-67f9-4d9e-996b-e6584cbad2d3&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="283859" author="gerrit" created="Fri, 30 Oct 2020 17:48:10 +0000"  >&lt;p&gt;James Nunez (jnunez@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/40496&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/40496&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10059&quot; title=&quot;sanityn test_32a: wrong file size&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10059&quot;&gt;&lt;del&gt;LU-10059&lt;/del&gt;&lt;/a&gt; tests: sanityn 32a error messages&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 1ec9dafe19fe31b5a19151a33cdb388f359fa7c1&lt;/p&gt;</comment>
                            <comment id="284610" author="gerrit" created="Sat, 7 Nov 2020 08:15:32 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/40496/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/40496/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10059&quot; title=&quot;sanityn test_32a: wrong file size&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10059&quot;&gt;&lt;del&gt;LU-10059&lt;/del&gt;&lt;/a&gt; tests: sanityn 32a error messages&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 3274e573957e8b8a067ae28c3f7d7788d40f310e&lt;/p&gt;</comment>
                            <comment id="323089" author="xiaolinzang" created="Tue, 18 Jan 2022 20:54:58 +0000"  >&lt;p&gt;We see the failure occasionally. &#160;Behind the error message&lt;/p&gt;

&lt;p&gt;Input/output error&lt;br/&gt;
&#160;sanityn test_32a: @@@@@@ FAIL: cached truncate - wrong file size &#160; &#160;&lt;/p&gt;

&lt;p&gt;is the failure of lstat(&quot;/mnt/lustre2/f32a.sanityn&quot;). &#160;But the file&apos;s size, stat and layout look normal, observed after the test failure.&lt;/p&gt;

&lt;p&gt;From the debug logs on mds0 (test driver), ldlm fails to connect to oss0: (-107 is no-conn)&lt;/p&gt;

&lt;p&gt;00000100:02020000:11.0:1642177889.168635:0:9731:0:(client.c:1371:ptlrpc_check_status()) 11-0: lustre-OST0000-osc-ffff895ad1180800: operation ldlm_enqueue to node 10.6.4.23@tcp failed: rc = -107&lt;/p&gt;

&lt;p&gt;Slightly earlier by timestamp, the oss0 debug log has the following (op 101 is ldlm_enqueue):&lt;/p&gt;

&lt;p&gt;00000020:00080000:10.0:1642177889.168118:0:14028:0:(tgt_handler.c:770:tgt_request_handle()) operation 101 on unconnected OST from 12345-10.6.4.19@tcp&lt;/p&gt;

&lt;p&gt;Also from the dmesg on mds0:&lt;/p&gt;

&lt;p&gt;[ 1346.778759] LustreError: 11-0: lustre-OST0000-osc-ffff895ad1180800: operation ldlm_enqueue to node 10.6.4.23@tcp failed: rc = -107&lt;br/&gt;
[ 1346.785968] Lustre: lustre-OST0000-osc-ffff895ad1180800: Connection to lustre-OST0000 (at 10.6.4.23@tcp) was lost; in progress operations using this service will wait for recovery to complete&lt;br/&gt;
[ 1346.787088] LustreError: 167-0: lustre-OST0000-osc-ffff895ad1180800: This client was evicted by lustre-OST0000; in progress operations using this service will fail.&lt;br/&gt;
[ 1346.796872] Lustre: lustre-OST0000-osc-ffff895ad1180800: Connection restored to 10.6.4.23@tcp (at 10.6.4.23@tcp)&lt;br/&gt;
[ 1346.951394] Lustre: DEBUG MARKER: sanityn test_32a: @@@@@@ FAIL: cached truncate - wrong file size&lt;/p&gt;

&lt;p&gt;It seems the mds0 and oss0 have a temporary connection error. It&apos;s unlikely to be due to a random network issue because other tests are OK when test_32a fails, as we have observed many times.&lt;/p&gt;

&lt;p&gt;Uploaded the following files. &#160;The debug logs are denoted &quot;xxxxx&quot;.&lt;/p&gt;

&lt;p&gt;sanityn.test_32a.debug_log.mds0.32a_only &#160; &#160;sanityn.test_32a.dmesg.mds0&lt;br/&gt;
sanityn.test_32a.debug_log.oss0.32a_only &#160; &#160;sanityn.test_32a.test_log.mds0&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="329507" author="paf0186" created="Thu, 17 Mar 2022 16:06:33 +0000"  >&lt;p&gt;Test removed in&#160;&lt;/p&gt;
&lt;div class=&apos;table-wrap&apos;&gt;
&lt;table class=&apos;confluenceTable&apos;&gt;&lt;tbody&gt;
&lt;tr&gt;
&lt;td class=&apos;confluenceTd&apos;&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-14838&quot; title=&quot;Remove old lockless code: Truncate &amp;amp; contention based lockless i/o&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-14838&quot;&gt;&lt;del&gt;LU-14838&lt;/del&gt;&lt;/a&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;&lt;/table&gt;
&lt;/div&gt;
</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="51717">LU-10891</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="41926" name="sanityn.test_32a.debug_log.mds0.32a_only" size="2110437" author="xiaolinzang" created="Tue, 18 Jan 2022 20:50:48 +0000"/>
                            <attachment id="41927" name="sanityn.test_32a.debug_log.oss0.32a_only" size="135286" author="xiaolinzang" created="Tue, 18 Jan 2022 20:50:48 +0000"/>
                            <attachment id="41928" name="sanityn.test_32a.dmesg.mds0" size="64274" author="xiaolinzang" created="Tue, 18 Jan 2022 20:50:47 +0000"/>
                            <attachment id="41929" name="sanityn.test_32a.test_log.mds0" size="6006" author="xiaolinzang" created="Tue, 18 Jan 2022 20:50:47 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzzl4n:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>