<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:21:12 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-15776] 2.15 RC3: lost writes during server fofb by forced panics</title>
                <link>https://jira.whamcloud.com/browse/LU-15776</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;data miscompare seen during server failover by forced panics for an IOR job (ill formed IO).&#160;&#160;&lt;/p&gt;



&lt;p&gt;Error Output :&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;13&amp;#93;&lt;/span&gt; Expected: 0x0000000000029ff8&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;13&amp;#93;&lt;/span&gt; Actual: &#160; 0x0000000000000000&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;13&amp;#93;&lt;/span&gt; FAILED comparison of buffer containing 8-byte ints:&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;13&amp;#93;&lt;/span&gt; &#160; File name = /lus/snx11922/ost2/ostest.vers/alsorun.20220421145202.27664.saturn-p2/CL_IOR_dom_sel_all_wr_10iter_834K_rand_noreuse.1.qSxF7f.1650570751/CL_IOR_dom_sel_all_wr_10iter_834K_rand_noreuse/IORfile.00000021&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;13&amp;#93;&lt;/span&gt; &#160; In transfer 152, 21504 errors between buffer indices 0 and 21503.&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;13&amp;#93;&lt;/span&gt; &#160; File byte offset = 37576704:&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;13&amp;#93;&lt;/span&gt; &#160; &#160; Expected: 0x000000156261b63a 0000000000000008 000000156261b63a 0000000000000018&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;13&amp;#93;&lt;/span&gt; &#160; &#160; Actual: &#160; 0x0000000000000000 0000000000000000 0000000000000000 0000000000000000&lt;/p&gt;

&lt;ul&gt;
	&lt;li&gt;
	&lt;ul&gt;
		&lt;li&gt;error **&lt;/li&gt;
	&lt;/ul&gt;
	&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;ERROR in IOR.c (line 448): data check error, aborting execution.&lt;/p&gt;

&lt;p&gt;ERROR: No such file or directory&lt;/p&gt;

&lt;ul&gt;
	&lt;li&gt;
	&lt;ul&gt;
		&lt;li&gt;exiting **&lt;/li&gt;
	&lt;/ul&gt;
	&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;Client involved during this corruption:&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;task 13 on nid00097&lt;/p&gt;

&lt;p&gt;task 21 on nid00099&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;saturn-p2:/ # lfs getstripe -r -y /lus/snx11922/ost2/ostest.vers/alsorun.20220421145202.27664.saturn-p2/CL_IOR_dom_sel_all_wr_10iter_834K_rand_noreuse.1.qSxF7f.1650570751/CL_IOR_dom_sel_all_wr_10iter_834K_rand_noreuse/IORfile.00000021&lt;/p&gt;

&lt;p&gt;&#160; lcm_layout_gen:&#160; &#160; 7&lt;/p&gt;

&lt;p&gt;&#160; lcm_mirror_count:&#160; 1&lt;/p&gt;

&lt;p&gt;&#160; lcm_entry_count: &#160; 5&lt;/p&gt;

&lt;p&gt;&#160; component0:&lt;/p&gt;

&lt;p&gt;&#160; &#160; lcme_id: &#160; &#160; &#160; &#160; &#160; &#160; 1&lt;/p&gt;

&lt;p&gt;&#160; &#160; lcme_mirror_id:&#160; &#160; &#160; 0&lt;/p&gt;

&lt;p&gt;&#160; &#160; lcme_flags:&#160; &#160; &#160; &#160; &#160; init&lt;/p&gt;

&lt;p&gt;&#160; &#160; lcme_extent.e_start: 0&lt;/p&gt;

&lt;p&gt;&#160; &#160; lcme_extent.e_end: &#160; 1048576&lt;/p&gt;

&lt;p&gt;&#160; &#160; sub_layout:&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160; lmm_stripe_count:&#160; 0&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160; lmm_stripe_size: &#160; 1048576&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160; lmm_pattern: &#160; &#160; &#160; mdt&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160; lmm_layout_gen:&#160; &#160; 0&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160; lmm_stripe_offset: 0&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&#160; component1:&lt;/p&gt;

&lt;p&gt;&#160; &#160; lcme_id: &#160; &#160; &#160; &#160; &#160; &#160; 2&lt;/p&gt;

&lt;p&gt;&#160; &#160; lcme_mirror_id:&#160; &#160; &#160; 0&lt;/p&gt;

&lt;p&gt;&#160; &#160; lcme_flags:&#160; &#160; &#160; &#160; &#160; init&lt;/p&gt;

&lt;p&gt;&#160; &#160; lcme_extent.e_start: 1048576&lt;/p&gt;

&lt;p&gt;&#160; &#160; lcme_extent.e_end: &#160; 269484032&lt;/p&gt;

&lt;p&gt;&#160; &#160; sub_layout:&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160; lmm_stripe_count:&#160; 1&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160; lmm_stripe_size: &#160; 1048576&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160; lmm_pattern: &#160; &#160; &#160; raid0&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160; lmm_layout_gen:&#160; &#160; 0&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160; lmm_stripe_offset: 3&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160; lmm_pool:&#160; &#160; &#160; &#160; &#160; ost2&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160; lmm_objects:&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160; - l_ost_idx: 3&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160; &#160; l_fid: &#160; &#160; 0x100030000:0x83fae34:0x0&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&#160; component2:&lt;/p&gt;

&lt;p&gt;&#160; &#160; lcme_id: &#160; &#160; &#160; &#160; &#160; &#160; 3&lt;/p&gt;

&lt;p&gt;&#160; &#160; lcme_mirror_id:&#160; &#160; &#160; 0&lt;/p&gt;

&lt;p&gt;&#160; &#160; lcme_flags:&#160; &#160; &#160; &#160; &#160; extension&lt;/p&gt;

&lt;p&gt;&#160; &#160; lcme_extent.e_start: 269484032&lt;/p&gt;

&lt;p&gt;&#160; &#160; lcme_extent.e_end: &#160; 536870912&lt;/p&gt;

&lt;p&gt;&#160; &#160; sub_layout:&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160; lmm_stripe_count:&#160; 0&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160; lmm_extension_size: 134217728&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160; lmm_pattern: &#160; &#160; &#160; raid0&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160; lmm_layout_gen:&#160; &#160; 0&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160; lmm_stripe_offset: -1&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160; lmm_pool:&#160; &#160; &#160; &#160; &#160; ost2&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&#160; component3:&lt;/p&gt;

&lt;p&gt;&#160; &#160; lcme_id: &#160; &#160; &#160; &#160; &#160; &#160; 4&lt;/p&gt;

&lt;p&gt;&#160; &#160; lcme_mirror_id:&#160; &#160; &#160; 0&lt;/p&gt;

&lt;p&gt;&#160; &#160; lcme_flags:&#160; &#160; &#160; &#160; &#160; 0&lt;/p&gt;

&lt;p&gt;&#160; &#160; lcme_extent.e_start: 536870912&lt;/p&gt;

&lt;p&gt;&#160; &#160; lcme_extent.e_end: &#160; 536870912&lt;/p&gt;

&lt;p&gt;&#160; &#160; sub_layout:&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160; lmm_stripe_count:&#160; -1&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160; lmm_stripe_size: &#160; 1048576&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160; lmm_pattern: &#160; &#160; &#160; raid0&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160; lmm_layout_gen:&#160; &#160; 0&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160; lmm_stripe_offset: -1&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160; lmm_pool:&#160; &#160; &#160; &#160; &#160; ost2&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&#160; component4:&lt;/p&gt;

&lt;p&gt;&#160; &#160; lcme_id: &#160; &#160; &#160; &#160; &#160; &#160; 5&lt;/p&gt;

&lt;p&gt;&#160; &#160; lcme_mirror_id:&#160; &#160; &#160; 0&lt;/p&gt;

&lt;p&gt;&#160; &#160; lcme_flags:&#160; &#160; &#160; &#160; &#160; extension&lt;/p&gt;

&lt;p&gt;&#160; &#160; lcme_extent.e_start: 536870912&lt;/p&gt;

&lt;p&gt;&#160; &#160; lcme_extent.e_end: &#160; EOF&lt;/p&gt;

&lt;p&gt;&#160; &#160; sub_layout:&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160; lmm_stripe_count:&#160; 0&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160; lmm_extension_size: 268435456&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160; lmm_pattern: &#160; &#160; &#160; raid0&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160; lmm_layout_gen:&#160; &#160; 0&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160; lmm_stripe_offset: -1&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160; lmm_pool:&#160; &#160; &#160; &#160; &#160; ost2&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;Console Logs:&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;2022-04-21T14:55:08.086885-05:00 c0-0c1s8n0 Lustre: 9811:0:(client.c:2295:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: &lt;span class=&quot;error&quot;&gt;&amp;#91;sent 1650570887/real 1650570887&amp;#93;&lt;/span&gt;&#160; req@000000007bfb28c1 x1730745432703552/t0(0) o400-&amp;gt;snx11922-OST0003-osc-ffff88878818e800@10.12.2.53@o2ib4001:28/4 lens 224/224 e 0 to 1 dl 1650570903 ref 1 fl Rpc:XNQr/0/ffffffff rc 0/-1 job:&apos;&apos;&lt;/p&gt;

&lt;p&gt;2022-04-21T14:55:08.086967-05:00 c0-0c1s8n0 Lustre: snx11922-OST0003-osc-ffff88878818e800: Connection to snx11922-OST0003 (at 10.12.2.53@o2ib4001) was lost; in progress operations using this service will wait for recovery to complete&lt;/p&gt;

&lt;p&gt;2022-04-21T14:55:08.086988-05:00 c0-0c1s8n0 Lustre: 9809:0:(client.c:2295:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: &lt;span class=&quot;error&quot;&gt;&amp;#91;sent 1650570887/real 1650570887&amp;#93;&lt;/span&gt;&#160; req@000000008ee911d3 x1730745432703488/t0(0) o400-&amp;gt;snx11922-OST0002-osc-ffff88878818e800@10.12.2.53@o2ib4001:28/4 lens 224/224 e 0 to 1 dl 1650570903 ref 1 fl Rpc:XNQr/0/ffffffff rc 0/-1 job:&apos;&apos;&lt;/p&gt;

&lt;p&gt;2022-04-21T14:55:08.087004-05:00 c0-0c1s8n0 Lustre: snx11922-OST0002-osc-ffff88878818e800: Connection to snx11922-OST0002 (at 10.12.2.53@o2ib4001) was lost; in progress operations using this service will wait for recovery to complete&lt;/p&gt;

&lt;p&gt;2022-04-21T14:55:08.087023-05:00 c0-0c1s8n2 Lustre: 9808:0:(client.c:2295:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: &lt;span class=&quot;error&quot;&gt;&amp;#91;sent 1650570887/real 1650570887&amp;#93;&lt;/span&gt;&#160; req@00000000e7ca3f51 x1730745432704128/t0(0) o400-&amp;gt;snx11922-OST0003-osc-ffff8887854c2000@10.12.2.53@o2ib4001:28/4 lens 224/224 e 0 to 1 dl 1650570903 ref 1 fl Rpc:XNQr/0/ffffffff rc 0/-1 job:&apos;&apos;&lt;/p&gt;

&lt;p&gt;2022-04-21T14:55:08.087040-05:00 c0-0c1s8n2 Lustre: snx11922-OST0003-osc-ffff8887854c2000: Connection to snx11922-OST0003 (at 10.12.2.53@o2ib4001) was lost; in progress operations using this service will wait for recovery to complete&lt;/p&gt;

&lt;p&gt;2022-04-21T14:55:08.087076-05:00 c0-0c1s8n1 Lustre: 9797:0:(client.c:2295:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: &lt;span class=&quot;error&quot;&gt;&amp;#91;sent 1650570889/real 1650570889&amp;#93;&lt;/span&gt;&#160; req@0000000039643501 x1730745432705728/t0(0) o400-&amp;gt;snx11922-OST0003-osc-ffff88838195c000@10.12.2.53@o2ib4001:28/4 lens 224/224 e 0 to 1 dl 1650570905 ref 1 fl Rpc:XNQr/0/ffffffff rc 0/-1 job:&apos;&apos;&lt;/p&gt;

&lt;p&gt;2022-04-21T14:55:08.087086-05:00 c0-0c1s8n1 Lustre: snx11922-OST0003-osc-ffff88838195c000: Connection to snx11922-OST0003 (at 10.12.2.53@o2ib4001) was lost; in progress operations using this service will wait for recovery to complete&lt;/p&gt;

&lt;p&gt;2022-04-21T14:55:08.087114-05:00 c0-0c1s8n3 Lustre: 9786:0:(client.c:2295:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: &lt;span class=&quot;error&quot;&gt;&amp;#91;sent 1650570889/real 1650570889&amp;#93;&lt;/span&gt;&#160; req@000000008cb8442c x1730745432700352/t0(0) o400-&amp;gt;snx11922-OST0002-osc-ffff888386f33000@10.12.2.53@o2ib4001:28/4 lens 224/224 e 0 to 1 dl 1650570905 ref 1 fl Rpc:XNQr/0/ffffffff rc 0/-1 job:&apos;&apos;&lt;/p&gt;

&lt;p&gt;2022-04-21T14:55:08.087129-05:00 c0-0c1s8n3 Lustre: snx11922-OST0003-osc-ffff888386f33000: Connection to snx11922-OST0003 (at 10.12.2.53@o2ib4001) was lost; in progress operations using this service will wait for recovery to complete&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;Kern logs :&lt;/p&gt;

&lt;p&gt;Apr 21 14:55:25 snx11922n004 kernel: md: md3: resync interrupted.&lt;/p&gt;

&lt;p&gt;Apr 21 14:55:26 snx11922n004 kernel: LDISKFS-fs (md3): recovery complete&lt;/p&gt;

&lt;p&gt;Apr 21 14:55:26 snx11922n004 kernel: LDISKFS-fs (md3): mounted filesystem with ordered data mode. Opts: user_xattr,errors=panic,errors=panic,journal_checksum,no_mbcache,nodelalloc&lt;/p&gt;

&lt;p&gt;Apr 21 14:55:27 snx11922n003 kernel: Lustre: snx11922-MDT0001: Received LWP connection from 10.12.0.52@o2ib4, removing former export from 10.12.0.53@o2ib4&lt;/p&gt;

&lt;p&gt;Apr 21 14:55:27 snx11922n004 kernel: Lustre: snx11922-OST0003: Imperative Recovery enabled, recovery window shrunk from 900-2700 down to 450-2700&lt;/p&gt;

&lt;p&gt;Apr 21 14:55:27 snx11922n004 kernel: Lustre: snx11922-OST0003: in recovery but waiting for the first client to connect&lt;/p&gt;

&lt;p&gt;Apr 21 14:55:27 snx11922n004 kernel: md: resync of RAID array md3&lt;/p&gt;

&lt;p&gt;Apr 21 14:55:27 snx11922n004 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;204833&amp;#93;&lt;/span&gt;:md.c:10632:md_do_sync(): md: md3: resync range: [0, 30316376064)&lt;/p&gt;

&lt;p&gt;Apr 21 14:55:27 snx11922n002 kernel: Lustre: snx11922-MDT0000: Received LWP connection from 10.12.0.52@o2ib4, removing former export from 10.12.0.53@o2ib4&lt;/p&gt;

&lt;p&gt;Apr 21 14:55:27 snx11922n004 kernel: Lustre: snx11922-OST0003: Will be in recovery for at least 7:30, or until 16 clients reconnect&lt;/p&gt;

&lt;p&gt;Apr 21 14:55:40 snx11922n001 kernel: LNet: 2575200:0:(o2iblnd_cb.c:3369:kiblnd_check_conns()) Timed out tx for 10.12.0.53@o2ib4: 2 seconds&lt;/p&gt;

&lt;p&gt;Apr 21 14:55:40 snx11922n001 kernel: LNet: 2575200:0:(o2iblnd_cb.c:3369:kiblnd_check_conns()) Skipped 1 previous similar message&lt;/p&gt;

&lt;p&gt;Apr 21 14:55:48 snx11922n003 kernel: Lustre: snx11922-OST0002-osc-MDT0001: Connection restored to 10.12.0.52@o2ib4 (at 10.12.0.52@o2ib4)&lt;/p&gt;

&lt;p&gt;Apr 21 14:55:48 snx11922n001 kernel: Lustre: snx11922-OST0002-osc-ffff998fa4d8d000: Connection restored to 10.12.0.52@o2ib4 (at 10.12.0.52@o2ib4)&lt;/p&gt;

&lt;p&gt;Apr 21 14:55:48 snx11922n004 kernel: Lustre: snx11922-OST0002: Recovery over after 0:32, of 16 clients 16 recovered and 0 were evicted.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</description>
                <environment>Server Details:&lt;br/&gt;
&lt;br/&gt;
[&lt;a href=&apos;mailto:root@snx11922n000&apos;&gt;root@snx11922n000&lt;/a&gt; admin]# cscli fs_info&lt;br/&gt;
----------------------------------------------------------------------------------------------------&lt;br/&gt;
OST Redundancy style: Declustered Parity  gridraid&lt;br/&gt;
Disk I/O Integrity guard (ANSI T10-PI) is in use &#10004;&lt;br/&gt;
----------------------------------------------------------------------------------------------------&lt;br/&gt;
Information about &amp;quot;snx11922&amp;quot; file system:&lt;br/&gt;
----------------------------------------------------------------------------------------------------&lt;br/&gt;
Node           	Role      	Targets   	Failover partner	Devices&lt;br/&gt;
----------------------------------------------------------------------------------------------------&lt;br/&gt;
snx11922n000   	mgmt      	0 / 0     	snx11922n001&lt;br/&gt;
snx11922n001   	mgmt      	0 / 0     	snx11922n000&lt;br/&gt;
snx11922n002   	mgs       	1 / 1     	snx11922n003   	          /dev/md65&lt;br/&gt;
snx11922n003   	mds       	1 / 1     	snx11922n002   	          /dev/md66&lt;br/&gt;
snx11922n004   	oss       	2 / 2     	snx11922n005   	      /dev/md0, /dev/md2&lt;br/&gt;
snx11922n005   	oss       	2 / 2     	snx11922n004   	      /dev/md1, /dev/md3&lt;br/&gt;
&lt;br/&gt;
[&lt;a href=&apos;mailto:root@snx11922n000&apos;&gt;root@snx11922n000&lt;/a&gt; admin]# cscli show_node_versions -g all&lt;br/&gt;
snx11922n000 6.1-010.39&lt;br/&gt;
snx11922n001 6.1-010.39&lt;br/&gt;
snx11922n002 6.1-010.39-cm-22.04.19-g3cedd6d&lt;br/&gt;
snx11922n003 6.1-010.39-cm-22.04.19-g3cedd6d&lt;br/&gt;
snx11922n004 6.1-010.39-cm-22.04.19-g3cedd6d&lt;br/&gt;
snx11922n005 6.1-010.39-cm-22.04.19-g3cedd6d&lt;br/&gt;
&lt;br/&gt;
Lustre Version :&lt;br/&gt;
version=2.15.0_RC3_14_g3cedd6d&#8232;&#8232;Debug Level:&#8232;debug=ioctl neterror warning error emerg ha config console lfsck&lt;br/&gt;
&lt;br/&gt;
Debug_mb Size :&lt;br/&gt;
debug_mb=101&lt;br/&gt;
&lt;br/&gt;
Subsystem_debug:&lt;br/&gt;
undefined mdc mds osc ost class log llite rpc mgmt lnet lnd pinger filter libcfs echo ldlm lov lquota osd lfsck snapshot  lmv  sec gss  mgc mgs fid fld&lt;br/&gt;
&lt;br/&gt;
&lt;br/&gt;
&lt;br/&gt;
saturn-p2:/ # pcmd -s -n ALL_COMPUTE &amp;quot;mount -t lustre&amp;quot;&lt;br/&gt;
Output from 56-63,96-127:&lt;br/&gt;
&lt;a href=&apos;mailto:10.12.0.50@o2ib4000&apos;&gt;10.12.0.50@o2ib4000&lt;/a&gt;:&lt;a href=&apos;mailto:10.12.2.51@o2ib4001&apos;&gt;10.12.2.51@o2ib4001&lt;/a&gt;:/snx11922 on /lus/snx11922 type lustre (rw,nochecksum,flock,nouser_xattr,lruresize,lazystatfs,nouser_fid2path,verbose,encrypt)&lt;br/&gt;
Node(s) 56-63,96-127 had exit code 0&lt;br/&gt;
saturn-p2:/ # pcmd -s -n ALL_COMPUTE &amp;quot;lctl get_param version&amp;quot;&lt;br/&gt;
Output from 56-63,96-127:&lt;br/&gt;
version=2.15.0_RC3_14_g3cedd6d&lt;br/&gt;
Node(s) 56-63,96-127 had exit code 0&lt;br/&gt;
saturn-p2:/ # pcmd -s -n ALL_COMPUTE &amp;quot;lctl get_param debug&amp;quot;&lt;br/&gt;
Output from 56-63,96-127:&lt;br/&gt;
debug=ioctl neterror warning error emerg ha config console lfsck&lt;br/&gt;
Node(s) 56-63,96-127 had exit code 0&lt;br/&gt;
saturn-p2:/ # pcmd -s -n ALL_COMPUTE &amp;quot;lctl get_param debug_mb&amp;quot;&lt;br/&gt;
Output from 60-63,112-115,120-127:&lt;br/&gt;
debug_mb=201&lt;br/&gt;
Output from 104-111:&lt;br/&gt;
debug_mb=321&lt;br/&gt;
Output from 56-59,96-103,116-119:&lt;br/&gt;
debug_mb=161&lt;br/&gt;
Node(s) 56-63,96-127 had exit code 0&lt;br/&gt;
saturn-p2:/ # pcmd -s -n ALL_COMPUTE &amp;quot;lctl get_param subsystem_debug&amp;quot;&lt;br/&gt;
Output from 56-63,96-127:&lt;br/&gt;
subsystem_debug=&lt;br/&gt;
undefined mdc mds osc ost class log llite rpc mgmt lnet lnd pinger filter libcfs echo ldlm lov lquota osd lfsck snapshot  lmv  sec gss  mgc mgs fid fld&lt;br/&gt;
Node(s) 56-63,96-127 had exit code&lt;br/&gt;
&lt;br/&gt;
saturn-p2:/ # xtprocadmin| grep compute |wc&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;40     240    2360&lt;br/&gt;
saturn-p2:/ # apstat&lt;br/&gt;
Compute node summary&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;arch config     up   resv    use  avail   down rebootq&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;XT     40     40     36     36      4      0       0&lt;br/&gt;
&lt;br/&gt;
No pending applications are present&lt;br/&gt;
&lt;br/&gt;
Total placed applications: 6&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;Apid    ResId User PEs Nodes   Age State Command&lt;br/&gt;
3152339 51624314 vers  24     6 0h14m   run IOR&lt;br/&gt;
3152346 51624321 vers  24     6 0h10m   run IOR&lt;br/&gt;
3152348 51624323 vers  24     6 0h09m   run IOR&lt;br/&gt;
3152352 51624327 vers  24     6 0h04m   run IOR&lt;br/&gt;
3152353 51624328 vers  24     6 0h04m   run IOR&lt;br/&gt;
3152354 51624329 vers  24     6 0h02m   run IOR&lt;br/&gt;
&lt;br/&gt;
No applications or reservations are being cleaned up</environment>
        <key id="69862">LU-15776</key>
            <summary>2.15 RC3: lost writes during server fofb by forced panics</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="shadow">Alexey Lyashkov</assignee>
                                    <reporter username="m,prabu">Prabu Manoharan</reporter>
                        <labels>
                            <label>corruption</label>
                    </labels>
                <created>Fri, 22 Apr 2022 06:12:26 +0000</created>
                <updated>Wed, 1 Jun 2022 21:23:13 +0000</updated>
                            <resolved>Wed, 1 Jun 2022 21:23:13 +0000</resolved>
                                    <version>Lustre 2.15.0</version>
                                    <fixVersion>Lustre 2.15.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>16</watches>
                                                                            <comments>
                            <comment id="332683" author="JIRAUSER17312" created="Fri, 22 Apr 2022 17:15:50 +0000"  >&lt;p&gt;Are you guys working on this bug now?&lt;/p&gt;</comment>
                            <comment id="332871" author="pjones" created="Mon, 25 Apr 2022 17:35:42 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=spitzcor&quot; class=&quot;user-hover&quot; rel=&quot;spitzcor&quot;&gt;spitzcor&lt;/a&gt; could you please advise if someone from your team is working on this? Is this a blocker for 2.15?&lt;/p&gt;</comment>
                            <comment id="332880" author="spitzcor" created="Mon, 25 Apr 2022 17:47:53 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=shadow&quot; class=&quot;user-hover&quot; rel=&quot;shadow&quot;&gt;shadow&lt;/a&gt; is working this for HPE and so I&apos;ve assigned it to him.&lt;br/&gt;
I guess it wasn&apos;t evident in the report, but this instance reported here is seen with 2.15.0-RC3 (clients &amp;amp; servers).&lt;/p&gt;</comment>
                            <comment id="333472" author="spitzcor" created="Sat, 30 Apr 2022 01:03:44 +0000"  >&lt;p&gt;At this week&apos;s LWG call I reported that our attention was in debugging the client.  New information has been brought to light, which suggests that it is a server-side issue.  Quoted from Peggy G. at HPE:&lt;/p&gt;

&lt;p&gt;In an attempt to get more complete debug log data, a change was made to the test run to force panic on all computes as soon as IOR detected data corruption, rather than dump debug logs.  This change was made during the middle of the run mentioned in the previous comment by Prabu, under the alsorun.20220426124703.28248.saturn-p2 test dirs.&lt;br/&gt;
There were 2 IOR jobs that hit corruption around the same time:&lt;br/&gt;
CL_IOR_dom_sel_all_wr_10iter_834K_rand_noreuse.1.lDTLio.1651070204, 2 corrupt files each with 1 hole.&lt;br/&gt;
CL_IOR_dom_sel_all_wr_20iter_834K_rand_noreuse.1.V88PUz.1651069606, 4 corrupt files, 2 with 2 holes, 2 with 1 hole.&lt;br/&gt;
Taking a closer look at the data today, I noticed that in some cases, the gap between physical blocks was the same size as the gap between logical blocks, which seemed unusual. For example, for the 10iter job above, with lDTLio string:&lt;br/&gt;
IORfile.00000009 has 1 hole in the OST object starting at logical block 2919&lt;br/&gt;
(256-2918):1119371520-1119374182, (3072-16383):1119374336-1119387647,&lt;br/&gt;
the physical blocks have exactly the same gap as the logical blocks, 3072-2918=154, 1119374336-1119374182=154 That seemed strange. Checking the physical blocks in that gap, they do have the expected IOR data. The timestamp in IOR data confirms this. Here is output from debugfs blockdump for blocks 111937418&lt;span class=&quot;error&quot;&gt;&amp;#91;2-3&amp;#93;&lt;/span&gt;, 111937433&lt;span class=&quot;error&quot;&gt;&amp;#91;5-6&amp;#93;&lt;/span&gt;, showing the first 4 and last 3 lines of the bd output for each of those 4 blocks:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;1119374182 :
0000  2755 6962 0900 0000 08f8 0c00 0000 0000  &apos;Uib............
0020  2755 6962 0900 0000 18f8 0c00 0000 0000  &apos;Uib............
0040  2755 6962 0900 0000 28f8 0c00 0000 0000  &apos;Uib....(.......
0060  2755 6962 0900 0000 38f8 0c00 0000 0000  &apos;Uib....8.......

7720  2755 6962 0900 0000 d807 0d00 0000 0000  &apos;Uib............
7740  2755 6962 0900 0000 e807 0d00 0000 0000  &apos;Uib............
7760  2755 6962 0900 0000 f807 0d00 0000 0000  &apos;Uib............

1119374183 :
0000  2755 6962 0900 0000 0800 0000 0000 0000  &apos;Uib............
0020  2755 6962 0900 0000 1800 0000 0000 0000  &apos;Uib............
0040  2755 6962 0900 0000 2800 0000 0000 0000  &apos;Uib....(.......
0060  2755 6962 0900 0000 3800 0000 0000 0000  &apos;Uib....8.......
...
7720  2755 6962 0900 0000 d80f 0000 0000 0000  &apos;Uib............
7740  2755 6962 0900 0000 e80f 0000 0000 0000  &apos;Uib............
7760  2755 6962 0900 0000 f80f 0000 0000 0000  &apos;Uib............

1119374335 :
0000  2755 6962 0900 0000 0880 0900 0000 0000  &apos;Uib............
0020  2755 6962 0900 0000 1880 0900 0000 0000  &apos;Uib............
0040  2755 6962 0900 0000 2880 0900 0000 0000  &apos;Uib....(.......
0060  2755 6962 0900 0000 3880 0900 0000 0000  &apos;Uib....8.......
...
7720  2755 6962 0900 0000 d88f 0900 0000 0000  &apos;Uib............
7740  2755 6962 0900 0000 e88f 0900 0000 0000  &apos;Uib............
7760  2755 6962 0900 0000 f88f 0900 0000 0000  &apos;Uib............

1119374336 :
0000  2755 6962 0900 0000 0890 0900 0000 0000  &apos;Uib............
0020  2755 6962 0900 0000 1890 0900 0000 0000  &apos;Uib............
0040  2755 6962 0900 0000 2890 0900 0000 0000  &apos;Uib....(.......
0060  2755 6962 0900 0000 3890 0900 0000 0000  &apos;Uib....8.......
...
7720  2755 6962 0900 0000 d89f 0900 0000 0000  &apos;Uib............
7740  2755 6962 0900 0000 e89f 0900 0000 0000  &apos;Uib............
7760  2755 6962 0900 0000 f89f 0900 0000 0000  &apos;Uib............
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;All the intervening physical disk blocks show the same, correct timestamp for this iteration of the job (i.e. 0x62695527). None of the other IOR jobs used that same timestamp, so the data must have come from this one. The &apos;hole&apos; is present according to the inode extents, but the actual data did land on disk.&lt;br/&gt;
The other problem file from this job, IORfile.00000003, shows a similar signature: 1 hole in object at logical block 14178&lt;br/&gt;
...(256-14177):813097216-813111137, (14336-16383):813111296-813113343...&lt;br/&gt;
The gap in physical blocks is the same size as gap in logical blocks. This time though there is a twist. The first &apos;gap&apos; physical block, 813111138, has the correct, expected IOR data, the next logical data in sequence from the previous physical block.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;813111137 :
0000  2755 6962 0300 0000 08f8 0c00 0000 0000  &apos;Uib............
0020  2755 6962 0300 0000 18f8 0c00 0000 0000  &apos;Uib............
0040  2755 6962 0300 0000 28f8 0c00 0000 0000  &apos;Uib....(.......
0060  2755 6962 0300 0000 38f8 0c00 0000 0000  &apos;Uib....8.......
...
7720  2755 6962 0300 0000 d807 0d00 0000 0000  &apos;Uib............
7740  2755 6962 0300 0000 e807 0d00 0000 0000  &apos;Uib............
7760  2755 6962 0300 0000 f807 0d00 0000 0000  &apos;Uib............

813111138 :
0000  2755 6962 0300 0000 0800 0000 0000 0000  &apos;Uib............
0020  2755 6962 0300 0000 1800 0000 0000 0000  &apos;Uib............
0040  2755 6962 0300 0000 2800 0000 0000 0000  &apos;Uib....(.......
0060  2755 6962 0300 0000 3800 0000 0000 0000  &apos;Uib....8.......
...
7720  2755 6962 0300 0000 d80f 0000 0000 0000  &apos;Uib............
7740  2755 6962 0300 0000 e80f 0000 0000 0000  &apos;Uib............
7760  2755 6962 0300 0000 f80f 0000 0000 0000  &apos;Uib............
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;but this only continues through block 813111167:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;813111167 :
0000  2755 6962 0300 0000 08d0 0100 0000 0000  &apos;Uib............
0020  2755 6962 0300 0000 18d0 0100 0000 0000  &apos;Uib............
0040  2755 6962 0300 0000 28d0 0100 0000 0000  &apos;Uib....(.......
0060  2755 6962 0300 0000 38d0 0100 0000 0000  &apos;Uib....8.......
...
7720  2755 6962 0300 0000 d8df 0100 0000 0000  &apos;Uib............
7740  2755 6962 0300 0000 e8df 0100 0000 0000  &apos;Uib............
7760  2755 6962 0300 0000 f8df 0100 0000 0000  &apos;Uib............

813111168 :
0000  1c55 6962 0f00 0000 0800 0000 0000 0000  .Uib............
0020  1c55 6962 0f00 0000 1800 0000 0000 0000  .Uib............
0040  1c55 6962 0f00 0000 2800 0000 0000 0000  .Uib....(.......
0060  1c55 6962 0f00 0000 3800 0000 0000 0000  .Uib....8.......
...
7720  1c55 6962 0f00 0000 d80f 0000 0000 0000  .Uib............
7740  1c55 6962 0f00 0000 e80f 0000 0000 0000  .Uib............
7760  1c55 6962 0f00 0000 f80f 0000 0000 0000  .Uib............
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Starting at physical block 813111168 within the gap in the object, the data signature is different. It is still obviously IOR data, but with a different timestamp. As it happens, that timestamp, 0x6269551c, is the timestamp used by the 2nd failed IOR job, CL_IOR_dom_sel_all_wr_20iter_834K_rand_noreuse.1.V88PUz.1651069606. That job had 4 corrupt files, one of which was IORfile.00000015, which would have been the writer for this data (3rd 2-byte field in the debugfs blockdump output is the IOR write task #, task #15 writes IORfile.00000015). Since those blocks were free (i.e. not associated with the object for IORfile.00000003 from the other job), they were allocated to this file. IORfile.00000015 object inode does show an extent covering this range:&lt;br/&gt;
(80481-80608):813111168-813111295.&lt;br/&gt;
From the V88PUz job, IORfile.00000015 has 1 hole:&lt;br/&gt;
...(41700-41908):812573412-812573620, (42117-45439):812573829-812577151, ...&lt;br/&gt;
Again, the logical/physical gaps are the same size. And like IORfile.00000009 from the other job, the &quot;missing&quot; IOR data is present on those physical blocks. From debugfs &apos;bd&apos; output:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;812573620 :
0000  1c55 6962 0f00 0000 0800 0d00 0000 0000  .Uib............
0020  1c55 6962 0f00 0000 1800 0d00 0000 0000  .Uib............
0040  1c55 6962 0f00 0000 2800 0d00 0000 0000  .Uib....(.......
0060  1c55 6962 0f00 0000 3800 0d00 0000 0000  .Uib....8.......
...
7720  1c55 6962 0f00 0000 d807 0000 0000 0000  .Uib............
7740  1c55 6962 0f00 0000 e807 0000 0000 0000  .Uib............
7760  1c55 6962 0f00 0000 f807 0000 0000 0000  .Uib............

812573621 :
0000  1c55 6962 0f00 0000 0808 0000 0000 0000  .Uib............
0020  1c55 6962 0f00 0000 1808 0000 0000 0000  .Uib............
0040  1c55 6962 0f00 0000 2808 0000 0000 0000  .Uib....(.......
0060  1c55 6962 0f00 0000 3808 0000 0000 0000  .Uib....8.......
...
7720  1c55 6962 0f00 0000 d817 0000 0000 0000  .Uib............
7740  1c55 6962 0f00 0000 e817 0000 0000 0000  .Uib............
7760  1c55 6962 0f00 0000 f817 0000 0000 0000  .Uib............

812573828 :
0000  1c55 6962 0f00 0000 08f8 0c00 0000 0000  .Uib............
0020  1c55 6962 0f00 0000 18f8 0c00 0000 0000  .Uib............
0040  1c55 6962 0f00 0000 28f8 0c00 0000 0000  .Uib....(.......
0060  1c55 6962 0f00 0000 38f8 0c00 0000 0000  .Uib....8.......
...
7720  1c55 6962 0f00 0000 d807 0d00 0000 0000  .Uib............
7740  1c55 6962 0f00 0000 e807 0d00 0000 0000  .Uib............
7760  1c55 6962 0f00 0000 f807 0d00 0000 0000  .Uib............

812573829 :
0000  1c55 6962 0f00 0000 0800 0000 0000 0000  .Uib............
0020  1c55 6962 0f00 0000 1800 0000 0000 0000  .Uib............
0040  1c55 6962 0f00 0000 2800 0000 0000 0000  .Uib....(.......
0060  1c55 6962 0f00 0000 3800 0000 0000 0000  .Uib....8.......
...
7720  1c55 6962 0f00 0000 d80f 0000 0000 0000  .Uib............
7740  1c55 6962 0f00 0000 e80f 0000 0000 0000  .Uib............
7760  1c55 6962 0f00 0000 f80f 0000 0000 0000  .Uib............
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Every physical block in that gap has the expected, valid IOR data, matching timestamp, matching task #.&lt;/p&gt;</comment>
                            <comment id="333520" author="peggy" created="Mon, 2 May 2022 12:36:41 +0000"  >&lt;p&gt;There&apos;s one correction to the previous comment, about the file IORfile.00000015 from job CL_IOR_dom_sel_all_wr_20iter_834K_rand_noreuse.1.V88PUz.1651069606. The hole in that file:&lt;br/&gt;
&lt;font color=&quot;#de350b&quot;&gt;(41700-41908):812573412-812573620, (42117-45439):812573829-812577151&lt;/font&gt;&lt;br/&gt;
does not completely contain the expected data from the hole. I ran icheck on all the blocks in that range, found some blocks in the middle of the hole that are in use, by this same inode, extent &lt;font color=&quot;#de350b&quot;&gt;(14848-14975):812573696-812573823&lt;/font&gt;. physical blocks 812573621-812573695 have valid IOR data for that hole. Blocks 812573696-812573823 have data in the middle of xfer started at file byte offset 60635136. Blocks 812573824-812573828 contain data that matches the expected last 5 blks of the &apos;hole&apos;.&lt;/p&gt;

&lt;p&gt;For reference, here&apos;s debugfs &apos;stat&apos; output for IORfile.00000015 ost object:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@snx11922n004 ~]# debugfs -c -R &quot;stat /O/0/d4/127076516&quot; /dev/md2
debugfs 1.46.2.cr1 (13-Jan-2022)
/dev/md2: catastrophic mode - not reading inode or group bitmaps
Inode: 1146884   Type: regular    Mode:  0666   Flags: 0x80000
Generation: 1271779983    Version: 0x00000141:00000ba2
User:  1356   Group: 11121   Project:     0   Size: 479102976
File ACL: 0
Links: 1   Blockcount: 932048
Fragment:  Address: 0    Number: 0    Size: 0
 ctime: 0x626955dd:00000000 -- Wed Apr 27 09:40:29 2022
 atime: 0x00000000:00000000 -- Wed Dec 31 18:00:00 1969
 mtime: 0x626955dd:00000000 -- Wed Apr 27 09:40:29 2022
crtime: 0x626954e4:ede07d30 -- Wed Apr 27 09:36:20 2022
Size of extra inode fields: 32
Extended attributes:
  lma: fid=[0x100010000:0x79308a4:0x0] compat=8 incompat=0
  fid: parent=[0x200019284:0xf3a:0x0] stripe=0 stripe_size=1048576 stripe_count=1 component_id=2 component_start=1048576 component_end=536870912 layout_version=0 range=0
EXTENTS:
(ETB0):293636096, (256-1250):811688192-811689186, (1251-1535):814882449-814882733, (1536-1663):812577152-812577279, (1664-1919):811673856-811674111, (1920-2047):815736064-81573619
1, (2048-2292):812651520-812651764, (2293-2559):813565312-813565578, (2560-2687):815838976-815839103, (2688-2918):813935756-813935986, (2919-3335):811690855-811691271, (3336-3543)
:815174805-815175012, (3544-3839):814450656-814450951, (3840-3960):813526339-813526459, (3961-7807):811691897-811695743, (7808-8130):811671552-811671874, (8131-8191):811696067-811
696127, (8192-8339):811704320-811704467, (8340-8447):815992960-815993067, (8448-8756):815736192-815736500, (8757-9727):811704885-811705855, (9728-9798):811687936-811688006, (9799-
13552):811705927-811709680, (13553-13760):812029184-812029391, (13761-14847):811709889-811710975, (14848-14975):812573696-812573823, (14976-15219):812633581-812633824, (15220-1542
8):811711348-811711556, (15429-15636):815843328-815843
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="333808" author="adilger" created="Wed, 4 May 2022 20:53:50 +0000"  >&lt;p&gt;Peggy, not a solution to the problem, but some background info on the OST IO path that may help track down a fix:&lt;/p&gt;
&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;on bulk write, the OST will allocate blocks for the inode in an in-memory journal transaction&lt;/li&gt;
	&lt;li&gt;if the write is &quot;large&quot; (&lt;tt&gt;osd-ldiskfs.testfs-OST0001.writethrough_max_io_mb=8&lt;/tt&gt; or over by default) then preallocated pages are used from a per-thread stash, otherwise they are allocated in the inode&apos;s mapping in page cache&lt;/li&gt;
	&lt;li&gt;the RDMA from the client will land in these pages and be submitted directly to disk&lt;/li&gt;
	&lt;li&gt;the client RPC will complete after the RDMA data is written to disk and is assigned a Lustre transno&lt;/li&gt;
	&lt;li&gt;the journal transaction with the metadata changes to allocate the blocks for the inode will commit asynchronously (5s later by default)&lt;/li&gt;
	&lt;li&gt;the client is supposed to keep these dirty pages in RAM for RPC replay until it gets a last_committed value &amp;gt;= transno&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;If there is a crash between the RDMA+write completion, but the journal transaction is not committed, then you will see exactly what you describe - the data is present in the right spot on disk, but not &quot;allocated&quot; to the inode.  The client is &lt;b&gt;supposed&lt;/b&gt; to replay those RPCs if the server crashed before the allocation transaction was committed to disk, but it doesn&apos;t appear that this is happening in this case.  Initially I was wondering if this was a case of uninitialized ext4 extents (where the blocks are allocated, but have a flag that tells readers to just report &quot;all zero&quot; for those blocks), but it looks like this is a case where the blocks are not allocated to the file at all.&lt;/p&gt;

&lt;p&gt;Note that if the clients &lt;b&gt;do&lt;/b&gt; replay their BRW RPCs to the OST as they should, then there is no guarantee that the filesystem is going to allocate exactly the same blocks to each inode.  I would hope they should, given the proximity of the block numbers, but that might be a reason why the same data is on disk in two places and/or why some of it is allocated to different files.&lt;/p&gt;

&lt;p&gt;IMHO, this looks like the problem is in the BRW RPC replay mechanism (or lack thereof), and is unlikely to be in the ldiskfs level, unless there is a failing in the commit callbacks that somehow break &lt;tt&gt;last_committed&lt;/tt&gt; and imply the BRW RPC was committed to disk when it is not.&lt;/p&gt;</comment>
                            <comment id="334074" author="shadow" created="Fri, 6 May 2022 20:38:42 +0000"  >&lt;p&gt;Andreas, picture much strange when simple request lost / fail during recovery.&lt;br/&gt;
lost region don&apos;t mapped over any extent sends from client to server and request isn&apos;t finished on server during crash time.&lt;br/&gt;
so it&apos;s simple resend over recovery case. in example it&apos;s&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000008:00200020:3.0:1651709243.919899:0:28609:0:(osc_cache.c:1084:osc_extent_make_ready()) extent 00000000c6c59345@{[19456 -&amp;gt; 20015/20479], [1|0|+|locking|wiuY|00000000a3d0090d], [2318336|560|+|-|0000000047a524c6|1024|000000004ef973c1]} make ready


00000008:00100000:3.0:1651709243.920226:0:28609:0:(osc_request.c:1804:osc_brw_prep_request()) brw rpc ffff8887c10f3680/1731924107812288 - object 0x0:137211497 offset 62914560&amp;lt;&amp;gt;83886080

00000100:00100000:6.0:1651709243.920235:0:17045:0:(client.c:1733:ptlrpc_send_new_req()) Sending RPC req@ffff8887c10f3680 pname:cluuid:pid:xid:nid:opc:job ptlrpcd_00_06:3a768119-6203-4fb5-a916-2d4dabdd8e84:17045:1731924107812288:10.12.2.53@o2ib4001:4:
00000100:00000400:19.0:1651709261.180872:0:17045:0:(client.c:2295:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1651709243/real 1651709243]  req@ffff8887c10f3680 x1731924107812288/t0(0) o4-&amp;gt;snx11922-OST0002-osc-ffff888f8ae7c800@10.12.2.53@o2ib4001:6/4 lens 536/456 e 0 to 1 dl 1651709261 ref 2 fl Rpc:XQr/0/ffffffff rc 0/-1 job:&apos;&apos;
00000100:00080000:18.0:1651709866.209843:0:17038:0:(recover.c:218:ptlrpc_wake_delayed()) @@@ waking (set ffff888788289200):  req@ffff8887c10f3680 x1731924107812288/t0(0) o4-&amp;gt;snx11922-OST0002-osc-ffff888f8ae7c800@10.12.0.52@o2ib4000:6/4 lens 536/456 e 0 to 1 dl 1651709261 ref 2 fl Rpc:XQU/0/ffffffff rc 0/-1 job:&apos;&apos;
00000100:00100000:7.0:1651709867.863582:0:17045:0:(client.c:2215:ptlrpc_check_set()) Completed RPC req@ffff8887c10f3680 pname:cluuid:pid:xid:nid:opc:job ptlrpcd_00_06:3a768119-6203-4fb5-a916-2d4dabdd8e84:17045:1731924107812288:10.12.0.52@o2ib4000:4:
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;


&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;thr ffff8aebb2b88600 - pid 82928
                        req ffff8ad91927f080
request ffff8ad91927f080
from 0.0.0.117@13:4
to 10.12.2.53@5:4001
xid 1731924107812288:1731924107812291
transno 1262720737376
time 923905344:1651709256:0
flags 3
lens 184 208 24 64 0 0
ptlrpc body(v3) ffff8aed93688698
        type 4711
        tag 0 - last_xid 1731924107811071
        conn 22
        opc 4 OST_WRITE
num ios 1 (ffff8aed93688820)
obj [0] : 0:0x82dae69 bufs:4
data segments
0 &amp;lt;&amp;gt; 62914560 : 282624 : 460
1 &amp;lt;&amp;gt; 75153408 : 856064 : 460
2 &amp;lt;&amp;gt; 79691776 : 2293760 : 460
3 &amp;lt;&amp;gt; 83693568 : 192512 : 460
not replied

-----
crash&amp;gt; bt 82928
PID: 82928  TASK: ffff8aeb524097c0  CPU: 15  COMMAND: &quot;ll_ost_io00_442&quot;
 #0 [ffffa62e6504f880] __schedule at ffffffff8774e1d4
 #1 [ffffa62e6504f918] schedule at ffffffff8774e648
 #2 [ffffa62e6504f928] osd_trans_stop at ffffffffc1acce24 [osd_ldiskfs]
 #3 [ffffa62e6504fa00] ofd_commitrw_write at ffffffffc18f4453 [ofd]
 #4 [ffffa62e6504faa0] ofd_commitrw at ffffffffc18f97b1 [ofd]
 #5 [ffffa62e6504fb60] finish_wait at ffffffff86f2e5ac
 #6 [ffffa62e6504fbd8] tgt_brw_write at ffffffffc1389bb5 [ptlrpc]
 #7 [ffffa62e6504fd50] tgt_request_handle at ffffffffc138ade3 [ptlrpc]
 #8 [ffffa62e6504fdd0] ptlrpc_server_handle_request at ffffffffc1337953 [ptlrpc]
 #9 [ffffa62e6504fe38] ptlrpc_main at ffffffffc13393c6 [ptlrpc]
#10 [ffffa62e6504ff10] kthread at ffffffff86f043a6
#11 [ffffa62e6504ff50] ret_from_fork at ffffffff8780023f
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;but data lost a part of the segment 2.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[15]   File byte offset = 79425536:
...
file has 1 hole at byte offset 79425536

(15429-19390):680934469-680938430, (19456-25645):680938496-680944685,

hole is 65 blks.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;And lost extent have a write &quot;written&quot; state in extent cache.&lt;/p&gt;

&lt;p&gt;it might be possible if some data was written in the extents allocated earlier (IOR uses a random point to write) and the rpc resend was lost, but client logs say the rpc completed without errors.&lt;/p&gt;</comment>
                            <comment id="334178" author="pjones" created="Mon, 9 May 2022 17:37:33 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=spitzcor&quot; class=&quot;user-hover&quot; rel=&quot;spitzcor&quot;&gt;spitzcor&lt;/a&gt; &lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=peggy&quot; class=&quot;user-hover&quot; rel=&quot;peggy&quot;&gt;peggy&lt;/a&gt; &lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=shadow&quot; class=&quot;user-hover&quot; rel=&quot;shadow&quot;&gt;shadow&lt;/a&gt; have you managed to narrow down when this problem was introduced?&lt;/p&gt;</comment>
                            <comment id="334180" author="shadow" created="Mon, 9 May 2022 17:42:19 +0000"  >&lt;p&gt;Peter, we don&apos;t have such info. Currently we have just confirmation - this request isn&apos;t executed after recovery, while the client thinks execution was OK. Peggy is working hard to collect logs to see this, as this issue is not reproduced on each run. &lt;/p&gt;</comment>
                            <comment id="334302" author="shadow" created="Tue, 10 May 2022 17:22:07 +0000"  >&lt;p&gt;I have small update. From logs obtained yesterday - It looks server skips a request execution with single error/warning like:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000001:00080000:3.0:1652110683.220558:0:1080451:0:(tgt_lastrcvd.c:2000:tgt_txn_stop_cb()) More than one transaction 1408749330374
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

</comment>
                            <comment id="334409" author="tappro" created="Wed, 11 May 2022 16:17:47 +0000"  >&lt;p&gt;I assume that multiple transaction during write can occur due to &apos;restart/retry&apos; logic inside &lt;tt&gt;ofd_commitrw_write().&lt;/tt&gt; To resolve that it is enough to set &lt;tt&gt;tti_mult_trans&lt;/tt&gt; flag, so last_rcvd will be updated properly. I think the better place for that in &lt;tt&gt;tgt_brw_write()&lt;/tt&gt; so both MDT and OFD codepaths are covered:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
diff --git a/lustre/target/tgt_handler.c b/lustre/target/tgt_handler.c
index 3198cc8fc1..d81aff759a 100644
--- a/lustre/target/tgt_handler.c
+++ b/lustre/target/tgt_handler.c
@@ -2605,6 +2605,7 @@ &lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; void tgt_warn_on_cksum(struct ptlrpc_request *req,
&#160;&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; tgt_brw_write(struct tgt_session_info *tsi)
&#160;{
&#160; &#160; &#160; &#160; struct ptlrpc_request &#160; *req = tgt_ses_req(tsi);
+ &#160; &#160; &#160; struct tgt_thread_info *tti = tgt_th_info(tsi-&amp;gt;tsi_env);
&#160; &#160; &#160; &#160; struct ptlrpc_bulk_desc *desc = NULL;
&#160; &#160; &#160; &#160; struct obd_export &#160; &#160; &#160; *exp = req-&amp;gt;rq_export;
&#160; &#160; &#160; &#160; struct niobuf_remote &#160; &#160;*remote_nb;
@@ -2846,6 +2847,9 @@ out_commitrw:
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; nob += len;
&#160; &#160; &#160; &#160; }
&#160;
+ &#160; &#160; &#160; &lt;span class=&quot;code-comment&quot;&gt;/* allow multiple transactions to be assigned during write commit */&lt;/span&gt;
+ &#160; &#160; &#160; tti-&amp;gt;tti_mult_trans = 1;
+
&#160; &#160; &#160; &#160; &lt;span class=&quot;code-comment&quot;&gt;/* Must commit after prep above in all cases */&lt;/span&gt;
&#160; &#160; &#160; &#160; rc = obd_commitrw(tsi-&amp;gt;tsi_env, OBD_BRW_WRITE, exp, &amp;amp;repbody-&amp;gt;oa,
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; objcount, ioo, remote_nb, npages, local_nb, rc, nob,
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="334432" author="shadow" created="Wed, 11 May 2022 18:28:37 +0000"  >&lt;p&gt;Mike, it looks you right about source of that messages for write.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00002000:00000002:11.0:1652209931.209295:0:82589:0:(ofd_io.c:1391:ofd_commitrw_write()) retry transaction, retries:2
00080000:00000002:11.0:1652209931.209342:0:82589:0:(osd_io.c:1544:osd_declare_write_commit()) snx11922-OST0002/: inode #1638412 extent_bytes 851393 extents 2 credits 33
00000001:00080000:11.0:1652209931.209400:0:82589:0:(tgt_lastrcvd.c:2000:tgt_txn_stop_cb()) More than one transaction 1451699035998
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;it looks  &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-14134&quot; title=&quot;reduce credits for new writing potentially&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-14134&quot;&gt;&lt;del&gt;LU-14134&lt;/del&gt;&lt;/a&gt; osd-ldiskfs: reduce credits for new writing - not so good as expected.&lt;/p&gt;
&lt;ol&gt;
	&lt;li&gt;grep ofd_commitrw_write lustre.txt | grep -c retry&lt;br/&gt;
21543&lt;br/&gt;
..&lt;br/&gt;
sometimes it needs an 4 retry &lt;br/&gt;
00002000:00000002:18.0:1652209930.195716:0:82181:0:(ofd_io.c:1391:ofd_commitrw_write()) retry transaction, retries:4&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;not so good from performance view :/&lt;/p&gt;</comment>
                            <comment id="334451" author="shadow" created="Wed, 11 May 2022 19:58:43 +0000"  >&lt;p&gt;it looks like this bug is addressed for today&apos;s corruption hit - commit hits during transaction restart but the old assigned transno is used for the reply (highly likely).&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;
00000100:00100000:13.0:1652209930.000194:0:82206:0:(service.c:2134:ptlrpc_server_handle_req_in()) got req x1731924859869568
00002000:00100000:13.0:1652209930.000202:0:82206:0:(ofd_dev.c:2619:ofd_rw_hpreq_check()) @@@ snx11922-OST0002 ll_ost_io00_288: refresh rw locks for [0x2c0000400:0x86ece4e:0x0] (9392128-&amp;gt;40992767)  req@00000000bd622443 x1731924859869568/t0(0) o4-&amp;gt;ea32ef78-799e-4d3c-a8e4-6a210cb165c2@102@gni4:440/0 lens 536/0 e 0 to 0 dl 1652209950 ref 1 fl New:/0/ffffffff rc 0/-1 job:&apos;&apos;
00002000:00100000:13.0:1652209930.000205:0:82206:0:(ofd_dev.c:2478:ofd_prolong_extent_locks()) Prolong locks for req 00000000bd622443 with x1731924859869568 ext(9392128-&amp;gt;40992767)
00000100:00100000:13.0:1652209930.000214:0:82206:0:(service.c:2303:ptlrpc_server_handle_request()) Handling RPC req@00000000bd622443 pname:cluuid+ref:pid:xid:nid:opc:job ll_ost_io00_288:ea32ef78-799e-4d3c-a8e4-6a210cb165c2+191:16994:x1731924859869568:12345-102@gni4:4:
00080000:00000002:13.0:1652209930.519952:0:82206:0:(osd_io.c:1544:osd_declare_write_commit()) snx11922-OST0002/: inode #1638560 extent_bytes 899332 extents 4 credits 65
00080000:00000002:13.0:1652209930.520032:0:82206:0:(osd_handler.c:4814:osd_xattr_set()) Set version 0x15200015f47 (old 0x15200015f44) for inode 1638560
00000001:00000002:13.0:1652209930.520033:0:82206:0:(tgt_lastrcvd.c:1424:tgt_last_rcvd_update()) transno = 1451699035975, last_committed = 1451699035951
00002000:00000002:13.0:1652209930.638804:0:82206:0:(ofd_io.c:1391:ofd_commitrw_write()) retry transaction, retries:1
00080000:00000002:13.0:1652209930.638833:0:82206:0:(osd_io.c:1544:osd_declare_write_commit()) snx11922-OST0002/: inode #1638560 extent_bytes 871323 extents 3 credits 49
00002000:00000002:13.0:1652209930.694482:0:82206:0:(ofd_io.c:1391:ofd_commitrw_write()) retry transaction, retries:2
00080000:00000002:13.0:1652209930.694508:0:82206:0:(osd_io.c:1544:osd_declare_write_commit()) snx11922-OST0002/: inode #1638560 extent_bytes 867716 extents 1 credits 11

&amp;gt;&amp;gt;&amp;gt; 00000001:00000002:8.0:1652209930.697117:0:82431:0:(tgt_lastrcvd.c:1424:tgt_last_rcvd_update()) transno = 1451699035985, last_committed = 1451699035983 &amp;lt;&amp;lt;&amp;lt;&amp;lt; commit hit.

00002000:00000002:13.0:1652209930.727496:0:82206:0:(ofd_io.c:1391:ofd_commitrw_write()) retry transaction, retries:3
00080000:00000002:13.0:1652209930.727526:0:82206:0:(osd_io.c:1544:osd_declare_write_commit()) snx11922-OST0002/: inode #1638560 extent_bytes 854149 extents 1 credits 11
00000100:00100000:13.0:1652209930.832622:0:82206:0:(service.c:2352:ptlrpc_server_handle_request()) Handled RPC req@00000000bd622443 pname:cluuid+ref:pid:xid:nid:opc:job ll_ost_io00_288:ea32ef78-799e-4d3c-a8e4-6a210cb165c2+11:16994:x1731924859869568:12345-102@gni4:4: Request processed in 832406us (832433us total) trans 1451699035975 rc 0/0
00002000:00100000:13.0:1652209930.832630:0:82206:0:(ofd_dev.c:2619:ofd_rw_hpreq_check()) @@@ snx11922-OST0002 ll_ost_io00_288: refresh rw locks for [0x2c0000400:0x86ece4e:0x0] (9392128-&amp;gt;40992767)  req@00000000bd622443 x1731924859869568/t1451699035975(0) o4-&amp;gt;ea32ef78-799e-4d3c-a8e4-6a210cb165c2@102@gni4:440/0 lens 536/456 e 0 to 0 dl 1652209950 ref 1 fl Complete:/0/0 rc 0/0 job:&apos;&apos;
00002000:00100000:13.0:1652209930.832650:0:82206:0:(ofd_dev.c:2478:ofd_prolong_extent_locks()) Prolong locks for req 00000000bd622443 with x1731924859869568 ext(9392128-&amp;gt;40992767)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="335497" author="adilger" created="Thu, 19 May 2022 14:50:37 +0000"  >&lt;p&gt;&quot;Mike Pershin &amp;lt;mpershin@whamcloud.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/47371&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/47371&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-15776&quot; title=&quot;2.15 RC3: lost writes during server fofb by forced panics&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-15776&quot;&gt;&lt;del&gt;LU-15776&lt;/del&gt;&lt;/a&gt; tgt: fix transaction handling in tgt_brw_write()&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: e3329972df95f788929dec15846bc00593c64bdf&lt;/p&gt;</comment>
                            <comment id="335500" author="JIRAUSER17312" created="Thu, 19 May 2022 14:57:40 +0000"  >&lt;p&gt;Hi &lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=spitzcor&quot; class=&quot;user-hover&quot; rel=&quot;spitzcor&quot;&gt;spitzcor&lt;/a&gt;,&lt;/p&gt;

&lt;p&gt;We have a potential fix &lt;a href=&quot;https://review.whamcloud.com/#/c/47371/,&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/47371/&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Can you please try this patch with your reproducer and let us know if it addresses the problem?&lt;/p&gt;

&lt;p&gt;Thank you!&lt;/p&gt;</comment>
                            <comment id="335509" author="spitzcor" created="Thu, 19 May 2022 16:15:55 +0000"  >&lt;p&gt;Yes, we will test it and report back next week.&lt;/p&gt;</comment>
                            <comment id="335914" author="spitzcor" created="Tue, 24 May 2022 17:12:53 +0000"  >&lt;p&gt;The reverts that Alexey suggested here and in the review of &lt;a href=&quot;https://review.whamcloud.com/47371/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/47371/&lt;/a&gt; has survived 72 hours of the reproducer (also noted in the review) whereas it would fail within 30 minutes otherwise.  Performance testing is in-progress.&lt;/p&gt;

&lt;p&gt;The fix from &lt;a href=&quot;https://review.whamcloud.com/47371/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/47371/&lt;/a&gt; has also survived an initial level of tests.  We&apos;re looking to complete at least 24 hours here as well.  We can provide another update later this week.  In the meantime, the discussion in review of &lt;a href=&quot;https://review.whamcloud.com/47371&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/47371&lt;/a&gt; should continue.&lt;/p&gt;</comment>
                            <comment id="336091" author="JIRAUSER17312" created="Thu, 26 May 2022 14:09:22 +0000"  >&lt;p&gt;Hi &lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=spitzcor&quot; class=&quot;user-hover&quot; rel=&quot;spitzcor&quot;&gt;spitzcor&lt;/a&gt;&#160;&lt;/p&gt;

&lt;p&gt;Any further updates?&lt;/p&gt;</comment>
                            <comment id="336120" author="spitzcor" created="Thu, 26 May 2022 16:02:14 +0000"  >&lt;p&gt;45 hours of test of the reproducer with &lt;a href=&quot;https://review.whamcloud.com/47371/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/47371/&lt;/a&gt; has completed successfully, without failure.&lt;/p&gt;</comment>
                            <comment id="336449" author="spitzcor" created="Tue, 31 May 2022 17:06:13 +0000"  >&lt;p&gt;Status update from HPE: additional regression tests of &lt;a href=&quot;https://review.whamcloud.com/47371/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/47371/&lt;/a&gt; have completed without issue.&lt;/p&gt;</comment>
                            <comment id="336454" author="JIRAUSER17312" created="Tue, 31 May 2022 17:30:24 +0000"  >&lt;p&gt;Thanks for the update &lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=spitzcor&quot; class=&quot;user-hover&quot; rel=&quot;spitzcor&quot;&gt;spitzcor&lt;/a&gt;, I&apos;m assuming based on this, the performance testing came back without issue?&lt;/p&gt;</comment>
                            <comment id="336487" author="gerrit" created="Wed, 1 Jun 2022 03:29:51 +0000"  >&lt;p&gt;&quot;Oleg Drokin &amp;lt;green@whamcloud.com&amp;gt;&quot; merged in patch &lt;a href=&quot;https://review.whamcloud.com/47371/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/47371/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-15776&quot; title=&quot;2.15 RC3: lost writes during server fofb by forced panics&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-15776&quot;&gt;&lt;del&gt;LU-15776&lt;/del&gt;&lt;/a&gt; tgt: fix transaction handling in tgt_brw_write()&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 10a29ad7616aeaadabf6d96146f52ef348a3a6f1&lt;/p&gt;</comment>
                            <comment id="336550" author="spitzcor" created="Wed, 1 Jun 2022 17:53:10 +0000"  >&lt;p&gt;HPE did not complete performance testing of &lt;a href=&quot;https://review.whamcloud.com/47371/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/47371/&lt;/a&gt; in isolation.  We did however test the reverts that Alexey mentioned before, which checked out without a perf delta.  Further, we did measure a workload mix job throughput rate with &lt;a href=&quot;https://review.whamcloud.com/47371/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/47371/&lt;/a&gt; and it does not deviate from those reverts or a baseline.&lt;/p&gt;</comment>
                            <comment id="336567" author="pjones" created="Wed, 1 Jun 2022 21:23:13 +0000"  >&lt;p&gt;That seems thorough enough to consider this issue as resolved in 2.15 RC5 to me - thanks Cory!&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="70265">LU-15847</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i02nqv:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>