<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:59:12 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-13195] replay-single test_118: dt_declare_record_write() ASSERTION( dt-&gt;do_body_ops ) failed</title>
                <link>https://jira.whamcloud.com/browse/LU-13195</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This issue was created by maloo for liuying &amp;lt;emoly.liu@intel.com&amp;gt;&lt;/p&gt;

&lt;p&gt;This issue relates to the following test suite run: &lt;a href=&quot;https://testing.whamcloud.com/test_sets/ca28353a-46ca-11ea-91a9-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/ca28353a-46ca-11ea-91a9-52540065bddc&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;test_118 failed with the following error:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;== replay-single test 118: invalidate osp update will not cause update log corruption ================ 17:21:23 (1580750483)
CMD: trevis-19vm4 lctl set_param fail_loc=0x1705
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;and the following stack trace on the console:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[10146.524712] LustreError: 19994:0:(dt_object.h:2191:dt_declare_record_write()) ASSERTION( dt-&amp;gt;do_body_ops ) failed: 
[10146.525745] LustreError: 19994:0:(dt_object.h:2191:dt_declare_record_write()) LBUG
[10146.526542] Pid: 19994, comm: mdt_out00_000 3.10.0-957.27.2.el7_lustre.x86_64 #1 SMP Sat Jan 18 23:01:59 UTC 2020
[10146.527632] Call Trace:
[10146.527905]  [&amp;lt;ffffffffc0c348ac&amp;gt;] libcfs_call_trace+0x8c/0xc0 [libcfs]
[10146.528605]  [&amp;lt;ffffffffc0c3495c&amp;gt;] lbug_with_loc+0x4c/0xa0 [libcfs]
[10146.529291]  [&amp;lt;ffffffffc10f65eb&amp;gt;] out_write_add_exec+0x13b/0x1b0 [ptlrpc]
[10146.530275]  [&amp;lt;ffffffffc10eed43&amp;gt;] out_write+0x333/0x370 [ptlrpc]
[10146.530971]  [&amp;lt;ffffffffc10f1086&amp;gt;] out_handle+0x1566/0x1bb0 [ptlrpc]
[10146.531652]  [&amp;lt;ffffffffc10e7eca&amp;gt;] tgt_request_handle+0x95a/0x1610 [ptlrpc]
[10146.532417]  [&amp;lt;ffffffffc108b816&amp;gt;] ptlrpc_server_handle_request+0x256/0xb10 [ptlrpc]
[10146.533234]  [&amp;lt;ffffffffc108f8a4&amp;gt;] ptlrpc_main+0xbb4/0x1550 [ptlrpc]

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This issue happened several times in Maloo testing but no more logs were collected.&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/sub_tests/121c5288-447b-11ea-bffa-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/sub_tests/121c5288-447b-11ea-bffa-52540065bddc&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/sub_tests/7cefb86c-4362-11ea-86b2-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/sub_tests/7cefb86c-4362-11ea-86b2-52540065bddc&lt;/a&gt;&lt;/p&gt;


&lt;p&gt;VVVVVVV DO NOT REMOVE LINES BELOW, Added by Maloo for auto-association VVVVVVV&lt;br/&gt;
replay-single test_118 - trevis-19vm4 crashed during replay-single test_118&lt;/p&gt;</description>
                <environment></environment>
        <key id="57980">LU-13195</key>
            <summary>replay-single test_118: dt_declare_record_write() ASSERTION( dt-&gt;do_body_ops ) failed</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="bzzz">Alex Zhuravlev</assignee>
                                    <reporter username="maloo">Maloo</reporter>
                        <labels>
                    </labels>
                <created>Tue, 4 Feb 2020 01:12:36 +0000</created>
                <updated>Fri, 6 Jan 2023 00:36:12 +0000</updated>
                            <resolved>Tue, 9 Nov 2021 17:19:38 +0000</resolved>
                                    <version>Lustre 2.14.0</version>
                                    <fixVersion>Lustre 2.15.0</fixVersion>
                    <fixVersion>Lustre 2.12.10</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>12</watches>
                                                                            <comments>
                            <comment id="262862" author="adilger" created="Fri, 7 Feb 2020 22:35:25 +0000"  >&lt;p&gt;This looks like it is the same as &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10143&quot; title=&quot;LBUG dt_object.h:2166:dt_declare_record_write&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10143&quot;&gt;&lt;del&gt;LU-10143&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="265837" author="guzheng" created="Mon, 23 Mar 2020 00:50:12 +0000"  >&lt;p&gt;Another instance:&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/test_sets/2ee75f70-b67b-447d-ba2a-db2f67c0bd0f&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/2ee75f70-b67b-447d-ba2a-db2f67c0bd0f&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="267298" author="bzzz" created="Thu, 9 Apr 2020 18:35:39 +0000"  >&lt;p&gt;looking at the logs from sanity with the same symptom it looks like sometimes an llog cookie is shared by transactions:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
1586485804.112492:0:13975:0:(update_trans.c:75:top_multiple_thandle_dump()) lustre-MDT0001-osd tmt 000000004e86531f refcount 1 committed 1 result -5 batchid 17179870236
1586485804.112493:0:13975:0:(update_trans.c:85:top_multiple_thandle_dump()) st 000000001da13291 obd lustre-MDT0000-osp-MDT0001 committed 1 started 1 stopped 1 result -5 sub_th 000000003da75353
1586485804.112494:0:13975:0:(update_trans.c:92:top_multiple_thandle_dump())  cookie [0x2:0x2341:0x2].873
1586485804.112495:0:13975:0:(update_trans.c:85:top_multiple_thandle_dump()) st 000000000cc313ae obd lustre-MDT0001-osd committed 1 started 1 stopped 1 result 0 sub_th 000000000764ad4c
1586485804.112497:0:13975:0:(update_trans.c:92:top_multiple_thandle_dump())  cookie [0x2:0x40002340:0x2].921
1586485804.112497:0:13975:0:(update_trans.c:92:top_multiple_thandle_dump())  cookie [0x2:0x40002340:0x2].921

...
1586485804.112723:0:13975:0:(update_trans.c:75:top_multiple_thandle_dump()) lustre-MDT0001-osd tmt 0000000046070051 refcount 1 committed 1 result 0 batchid 17179870237
1586485804.112725:0:13975:0:(update_trans.c:85:top_multiple_thandle_dump()) st 00000000a7f7ee4f obd lustre-MDT0000-osp-MDT0001 committed 1 started 1 stopped 1 result 0 sub_th 00000000222fbaf6
1586485804.112726:0:13975:0:(update_trans.c:92:top_multiple_thandle_dump())  cookie [0x2:0x2341:0x2].873
1586485804.112727:0:13975:0:(update_trans.c:85:top_multiple_thandle_dump()) st 00000000adeff453 obd lustre-MDT0001-osd committed 1 started 1 stopped 1 result 0 sub_th 0000000090f0b368
1586485804.112728:0:13975:0:(update_trans.c:92:top_multiple_thandle_dump())  cookie [0x2:0x40002340:0x2].922
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;notice cookie &lt;b&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;0x2:0x2341:0x2&amp;#93;&lt;/span&gt;.873&lt;/b&gt;&lt;/p&gt;

&lt;p&gt;the consequence is that cancellation of the first record (873) causes destroy of corresponding llog file and it&apos;s not supposed to be referenced any more, but then recovery process finds cookie duplicate and tries to repeat cancellation/destroy.&lt;br/&gt;
while it&apos;s possible to handle this (checking whether the llog does exist) I think it&apos;s important to understand the root cause for duplicates.&lt;/p&gt;</comment>
                            <comment id="267354" author="bzzz" created="Fri, 10 Apr 2020 09:10:44 +0000"  >&lt;p&gt;in another case it was the same transaction being cancelled twice in two different threads:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
.121703:0:5542:0:(update_trans.c:75:top_multiple_thandle_dump()) lustre-MDT0001-osd tmt 00000000cb6dafd8 refcount 2 committed 1 result 0 batchid 17179870235
.121705:0:5542:0:(update_trans.c:85:top_multiple_thandle_dump()) st 0000000084a8ca02 obd lustre-MDT0000-osp-MDT0001 committed 1 started 0 stopped 0 result 0 sub_th           (&lt;span class=&quot;code-keyword&quot;&gt;null&lt;/span&gt;)
.121706:0:5542:0:(update_trans.c:92:top_multiple_thandle_dump())  cookie [0x2:0x2341:0x2].871
.121707:0:5542:0:(update_trans.c:85:top_multiple_thandle_dump()) st 00000000a1d7d236 obd lustre-MDT0001-osd committed 1 started 0 stopped 0 result 0 sub_th           (&lt;span class=&quot;code-keyword&quot;&gt;null&lt;/span&gt;)
.121708:0:5542:0:(update_trans.c:92:top_multiple_thandle_dump())  cookie [0x2:0x40002340:0x2].902

.121919:0:5539:0:(update_trans.c:75:top_multiple_thandle_dump()) lustre-MDT0001-osd tmt 00000000cb6dafd8 refcount 1 committed 1 result 0 batchid 17179870235
.121920:0:5539:0:(update_trans.c:85:top_multiple_thandle_dump()) st 0000000084a8ca02 obd lustre-MDT0000-osp-MDT0001 committed 1 started 0 stopped 0 result 0 sub_th           (&lt;span class=&quot;code-keyword&quot;&gt;null&lt;/span&gt;)
.121922:0:5539:0:(update_trans.c:92:top_multiple_thandle_dump())  cookie [0x2:0x2341:0x2].871
.121923:0:5539:0:(update_trans.c:85:top_multiple_thandle_dump()) st 00000000a1d7d236 obd lustre-MDT0001-osd committed 1 started 0 stopped 0 result 0 sub_th           (&lt;span class=&quot;code-keyword&quot;&gt;null&lt;/span&gt;)
.121924:0:5539:0:(update_trans.c:92:top_multiple_thandle_dump())  cookie [0x2:0x40002340:0x2].902
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;dist_txn-1      D    0  5539      2 0x80000000&lt;br/&gt;
Call Trace:&lt;br/&gt;
 ? __schedule+0x2ad/0xb00&lt;br/&gt;
 schedule+0x34/0x80&lt;br/&gt;
 lbug_with_loc+0x79/0x80 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 ? osp_md_write+0x4c3/0x540 &lt;span class=&quot;error&quot;&gt;&amp;#91;osp&amp;#93;&lt;/span&gt;&lt;br/&gt;
 ? dt_record_write+0x2a/0x110 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
 ? llog_osd_write_rec+0x717/0x1dd0 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
 ? llog_write_rec+0x36a/0x4e0 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
 ? llog_cancel_arr_rec+0x746/0xb20 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
 ? llog_cat_cancel_arr_rec+0x143/0x400 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
 ? llog_cat_cancel_records+0x4f/0x1a0 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
 ? distribute_txn_commit_thread+0x580/0x11d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 ? __kthread_parkme+0x39/0x80&lt;/p&gt;

&lt;p&gt;00000100:00100000:1.0:1586499582.124714:0:5542:0:(client.c:1697:ptlrpc_send_new_req()) Sending RPC req@0000000053ab4ec6 pname:cluuid:pid:xid:nid:opc:job tgt_recover_1:lustre-MDT0001-mdtlov_UUID:5542:1663564960200128:192.168.122.97@tcp:1000:tgt_recover_1.0&lt;/p&gt;
</comment>
                            <comment id="267356" author="adilger" created="Fri, 10 Apr 2020 10:03:58 +0000"  >&lt;p&gt;Is the mechanism for generating this number properly locked?  Are you sure that it is being printed consistently?&lt;/p&gt;</comment>
                            <comment id="267418" author="bzzz" created="Sun, 12 Apr 2020 09:08:18 +0000"  >&lt;p&gt;sorry, that was a wrong understanding. I&apos;ve got a couple more logs and it&apos;s actually not a duplicated id, but rather &lt;b&gt;correct&lt;/b&gt; (in some sense) logic:&lt;br/&gt;
llog is destroyed because the last record is just cancelled. normally this doesn&apos;t happen because this log is &lt;em&gt;current&lt;/em&gt;, but after recovery it&apos;s not &lt;em&gt;current&lt;/em&gt; anymore as we usually create a new llog (iirc).&lt;br/&gt;
there is no current llog during recovery as we don&apos;t generate new llog records (though we can write records from update log), so current llog is NULL at recovery, this can be used to skip llog destroy, but then tricky thing is when to destroy those. will try to catch this..&lt;/p&gt;</comment>
                            <comment id="267464" author="bzzz" created="Mon, 13 Apr 2020 09:41:15 +0000"  >&lt;p&gt;so far the change (postponing llog destroy) has been running for many hours and I saw this condition happening sometimes.&lt;br/&gt;
trying to construct a scripted test for the case..&lt;/p&gt;</comment>
                            <comment id="267874" author="bzzz" created="Fri, 17 Apr 2020 08:32:56 +0000"  >&lt;p&gt;one identified issue is a race between llog append/cancel and invalidation. basically any change to llog involves two changes at least: llh_count and bitmap (plus new record in case of append). to save on RPC size we modify llh_count and bitmap using two separate dt_write(). sometimes invalidation comes in between these two writes and one of them doesn&apos;t get into the transaction, but the transaction itself proceeds with just the first write.&lt;br/&gt;
I think in this case llog should invalidate internal state (bitmap, counters) and re-initialize it (in this call or in the next one), the transaction then must be marked properly and aborted as a whole. working on a patch..&lt;br/&gt;
this is not the only root cause for the issue, AFAICS. there is another one (or few) leading to similar symptoms.&lt;/p&gt;

</comment>
                            <comment id="268074" author="bzzz" created="Mon, 20 Apr 2020 17:07:11 +0000"  >&lt;p&gt;can&apos;t reproduce reliably, but this is my reconstruction: for some reason OSP invalidates a failed request and all subsequent requests in the sending queue (&lt;em&gt;osp_send_update_thread()&lt;/em&gt; -&amp;gt; &lt;em&gt;osp_invalidate_request()&lt;/em&gt;), which basically should invalidate all cached internal states like llog&apos;s bitmaps.&lt;br/&gt;
all &lt;em&gt;new&lt;/em&gt; transactions will notice that and refresh llog&apos;s data, but all &lt;em&gt;already-declared&lt;/em&gt; transactions will still proceed. this is quite short window given everything is running in cache, all locks have been acquired already, etc, but still possible.&lt;br/&gt;
then depending on luck the llog can be corrupted different ways (in majority of cases - llh_count is less than actual number of records, leading to early llog destroy)&lt;br/&gt;
I&apos;m trying to construct a test, but not sure how to bind two events: request invalidation and resuming a suspended-after-declaration transaction.&lt;/p&gt;
</comment>
                            <comment id="268590" author="bzzz" created="Sun, 26 Apr 2020 18:57:41 +0000"  >&lt;p&gt;few different issues have been identified so far:&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;llog_cat_refresh() causes duplicated llog ids - if some write failed, then to reinitialize llog state the master MDS re-reads remote llog and gets old lgh_last_idx, which has been already returned to another transaction&lt;/li&gt;
	&lt;li&gt;object invalidation doesn&apos;t work in one case at least - osp_md_write() doesn&apos;t put the object on the invalidation list for subsequent transactions, so a race is possible, which in turn results in a lost -ESTALE, and then lgh_count won&apos;t match the actual number of records&lt;/li&gt;
	&lt;li&gt;partial writes - I&apos;ve seen this in the logs, but not sure about exact sequence leading to this&lt;/li&gt;
&lt;/ul&gt;
</comment>
                            <comment id="268616" author="gerrit" created="Mon, 27 Apr 2020 06:29:05 +0000"  >&lt;p&gt;Alex Zhuravlev (bzzz@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/38385&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/38385&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13195&quot; title=&quot;replay-single test_118: dt_declare_record_write() ASSERTION( dt-&amp;gt;do_body_ops ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13195&quot;&gt;&lt;del&gt;LU-13195&lt;/del&gt;&lt;/a&gt; osp: track destroyed OSP object&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 52bc4e0824a185e05551a3d95bb721c61baa6b40&lt;/p&gt;</comment>
                            <comment id="268617" author="aboyko" created="Mon, 27 Apr 2020 06:46:40 +0000"  >&lt;p&gt;Alex, I have a draft fix for invalidation. I&apos;m pushing it to gerrit today.&lt;/p&gt;</comment>
                            <comment id="268618" author="bzzz" created="Mon, 27 Apr 2020 06:48:31 +0000"  >&lt;blockquote&gt;&lt;p&gt;Alex, I have a draft fix for invalidation. I&apos;m pushing it to gerrit today.&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;for xattr_set or for more ops? write path misses invalidation as well, I did  that by adding an interpreter for osp_md_write() which doesn&apos;t depend on invalidation list.&lt;/p&gt;</comment>
                            <comment id="268619" author="aboyko" created="Mon, 27 Apr 2020 06:52:58 +0000"  >&lt;p&gt;Fix for any operation.&lt;/p&gt;</comment>
                            <comment id="268620" author="gerrit" created="Mon, 27 Apr 2020 06:57:59 +0000"  >&lt;p&gt;Alexander Boyko (alexander.boyko@hpe.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/38386&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/38386&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13195&quot; title=&quot;replay-single test_118: dt_declare_record_write() ASSERTION( dt-&amp;gt;do_body_ops ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13195&quot;&gt;&lt;del&gt;LU-13195&lt;/del&gt;&lt;/a&gt; osp: invalidate objects for error&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 1107a36f44fd2d8c195c9bee5ad3db565efe766e&lt;/p&gt;</comment>
                            <comment id="268624" author="gerrit" created="Mon, 27 Apr 2020 07:26:23 +0000"  >&lt;p&gt;Alex Zhuravlev (bzzz@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/38387&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/38387&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13195&quot; title=&quot;replay-single test_118: dt_declare_record_write() ASSERTION( dt-&amp;gt;do_body_ops ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13195&quot;&gt;&lt;del&gt;LU-13195&lt;/del&gt;&lt;/a&gt; osp: invalidate object on write error&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 40eb50739b4f836cc6161e0efc866060cb267ac8&lt;/p&gt;</comment>
                            <comment id="268634" author="aboyko" created="Mon, 27 Apr 2020 09:48:48 +0000"  >&lt;p&gt;Alex, I&apos;m using the next debug patch to catch llog corruption&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;diff --git a/lustre/target/out_lib.c b/lustre/target/out_lib.c
index e8bdfd1..9eac332 100644
--- a/lustre/target/out_lib.c
+++ b/lustre/target/out_lib.c
@@ -697,6 +697,20 @@ static int out_tx_write_exec(const struct lu_env *env, struct thandle *th,
        if (OBD_FAIL_CHECK(OBD_FAIL_OUT_ENOSPC)) {
                rc = -ENOSPC;
        } else {
+               struct lu_attr la;
+               unsigned int *buf = (unsigned int*) arg-&amp;gt;u.write.buf.lb_buf;
+               dt_attr_get(env, dt_obj, &amp;amp;la);
+
+               if (arg-&amp;gt;u.write.buf.lb_len &amp;gt; LLOG_MIN_REC_SIZE &amp;amp;&amp;amp;
+                   __le32_to_cpu(buf[2]) == UPDATE_REC &amp;amp;&amp;amp;
+                   arg-&amp;gt;u.write.pos &amp;gt; la.la_size) {
+               CDEBUG(D_HA, &quot;write with sparse &quot;DFID&quot; pos %llu buf %p, len %lu, size %llu\n&quot;,
+               PFID(lu_object_fid(&amp;amp;dt_obj-&amp;gt;do_lu)), arg-&amp;gt;u.write.pos,
+               arg-&amp;gt;u.write.buf.lb_buf, (unsigned long)arg-&amp;gt;u.write.buf.lb_len,
+               la.la_size);
+
+                       LBUG();
+               }
                dt_write_lock(env, dt_obj, MOR_TGT_CHILD);
                rc = dt_record_write(env, dt_obj, &amp;amp;arg-&amp;gt;u.write.buf,
                                     &amp;amp;arg-&amp;gt;u.write.pos, th);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="268652" author="bzzz" created="Mon, 27 Apr 2020 15:08:14 +0000"  >&lt;p&gt;thanks for sharing. I used this one (to catch on remote side):&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
diff --git a/lustre/osd-ldiskfs/osd_handler.c b/lustre/osd-ldiskfs/osd_handler.c
index 8d8d8b4d7e..318e5aa4dc 100644
--- a/lustre/osd-ldiskfs/osd_handler.c
+++ b/lustre/osd-ldiskfs/osd_handler.c
@@ -1788,6 +1788,8 @@ &lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; struct thandle *osd_trans_create(&lt;span class=&quot;code-keyword&quot;&gt;const&lt;/span&gt; struct lu_env *env,
 
 	oti-&amp;gt;oti_ins_cache_depth++;
 
+	oh-&amp;gt;ot_bh = NULL;
+
 	RETURN(th);
 }
 
@@ -1967,6 +1969,32 @@ &lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; void osd_trans_stop_cb(struct osd_thandle *oth, &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; result)
 	}
 }
 
+void osd_llog_hdr_check(&lt;span class=&quot;code-keyword&quot;&gt;const&lt;/span&gt; struct lu_fid *fid, void *buf)
+{
+	struct llog_log_hdr *llh = buf;
+	&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; found = 0;
+	&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; i = 0;
+	&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; max;
+	&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; last = -1, first = -1;
+
+	max = 128 * 8;
+	&lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; (i &amp;lt; max) {
+		i = find_next_bit((unsigned  &lt;span class=&quot;code-object&quot;&gt;long&lt;/span&gt; *)LLOG_HDR_BITMAP(llh), max, i);
+		&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (i &amp;gt;= max)
+			&lt;span class=&quot;code-keyword&quot;&gt;break&lt;/span&gt;;
+		&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (first &amp;lt;= 0)
+			first = i;
+		last = i;
+		i++;
+		found++;
+	}
+	CDEBUG(D_HA, &lt;span class=&quot;code-quote&quot;&gt;&quot;llog osd_write &quot;&lt;/span&gt;DFID&lt;span class=&quot;code-quote&quot;&gt;&quot;, llh_count=%d, found %d, first %d, last %d\n&quot;&lt;/span&gt;,
+		PFID(fid), (&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;)llh-&amp;gt;llh_count, found, first, last);
+	LASSERTF(llh-&amp;gt;llh_count == found,
+		&lt;span class=&quot;code-quote&quot;&gt;&quot;llog osd_write &quot;&lt;/span&gt;DFID&lt;span class=&quot;code-quote&quot;&gt;&quot;, llh_count=%d, found %d, first %d, last %d\n&quot;&lt;/span&gt;,
+		PFID(fid), (&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;)llh-&amp;gt;llh_count, found, first, last);
+}
+
 /*
  * Concurrency: shouldn&apos;t matter.
  */
@@ -1986,6 +2014,12 @@ &lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; osd_trans_stop(&lt;span class=&quot;code-keyword&quot;&gt;const&lt;/span&gt; struct lu_env *env, struct dt_device *dt,
 
 	oh = container_of0(th, struct osd_thandle, ot_super);
 
+	&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (oh-&amp;gt;ot_bh) {
+		osd_llog_hdr_check(&amp;amp;oh-&amp;gt;ot_fid, oh-&amp;gt;ot_bh-&amp;gt;b_data);
+		brelse(oh-&amp;gt;ot_bh);
+		oh-&amp;gt;ot_bh = NULL;
+	}
+
 	remove_agents = oh-&amp;gt;ot_remove_agents;
 
 	qtrans = oh-&amp;gt;ot_quota_trans;
diff --git a/lustre/osd-ldiskfs/osd_internal.h b/lustre/osd-ldiskfs/osd_internal.h
index 6c62ca450b..8aedc5e9ff 100644
--- a/lustre/osd-ldiskfs/osd_internal.h
+++ b/lustre/osd-ldiskfs/osd_internal.h
@@ -426,6 +426,8 @@ struct osd_thandle {
 	ktime_t oth_started;
 #endif
 	struct list_head	ot_trunc_locks;
+	struct buffer_head	*ot_bh;
+	struct lu_fid		ot_fid;
 };
 
 /**
diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c
index ce2af335ad..6563e8bd2b 100644
--- a/lustre/osd-ldiskfs/osd_io.c
+++ b/lustre/osd-ldiskfs/osd_io.c
@@ -1680,9 +1680,11 @@ &lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; osd_ldiskfs_writelink(struct inode *inode, &lt;span class=&quot;code-object&quot;&gt;char&lt;/span&gt; *buffer, &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; buflen)
 	&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; 0;
 }
 
+void osd_llog_hdr_check(&lt;span class=&quot;code-keyword&quot;&gt;const&lt;/span&gt; struct lu_fid *fid, void *buf);
+
 &lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; osd_ldiskfs_write_record(struct dt_object *dt, void *buf,
 				    &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; bufsize, &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; write_NUL, loff_t *offs,
-				    handle_t *handle)
+				    handle_t *handle, struct osd_thandle *oth)
 {
 	struct inode *inode = osd_dt_obj(dt)-&amp;gt;oo_inode;
         struct buffer_head *bh        = NULL;
@@ -1786,6 +1788,25 @@ &lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; osd_ldiskfs_write_record(struct dt_object *dt, void *buf,
 				sync = &lt;span class=&quot;code-keyword&quot;&gt;false&lt;/span&gt;;
 			}
 		}
+		&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (block == 0 &amp;amp;&amp;amp; bh &amp;amp;&amp;amp; !create) {
+			&lt;span class=&quot;code-keyword&quot;&gt;const&lt;/span&gt; struct lu_fid *fid = lu_object_fid(&amp;amp;dt-&amp;gt;do_lu);
+			&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (fid_seq(fid) == 0x200002341 || fid_seq(fid) == 0x200002342) {
+				&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (size == 4) {
+					unsigned &lt;span class=&quot;code-object&quot;&gt;char&lt;/span&gt; *bb = buf;
+					CDEBUG(D_HA, &lt;span class=&quot;code-quote&quot;&gt;&quot;write %d @ %d: %02x %02x %02x %02x\n&quot;&lt;/span&gt;,
+						(&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;)size, boffs, bb[0], bb[1], bb[2], bb[3]);
+				}
+
+				&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (oth-&amp;gt;ot_bh == NULL) {
+					get_bh(bh);
+					oth-&amp;gt;ot_fid = *fid;
+					oth-&amp;gt;ot_bh = bh;
+					osd_llog_hdr_check(fid, bh-&amp;gt;b_data);
+				} &lt;span class=&quot;code-keyword&quot;&gt;else&lt;/span&gt; {
+					LASSERT(oth-&amp;gt;ot_bh == bh);
+				}
+			}
+		}
 		memcpy(bh-&amp;gt;b_data + boffs, buf, size);
 		err = ldiskfs_handle_dirty_metadata(handle, NULL, bh);
                 &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (err)
@@ -1856,7 +1878,7 @@ &lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; ssize_t osd_write(&lt;span class=&quot;code-keyword&quot;&gt;const&lt;/span&gt; struct lu_env *env, struct dt_object *dt,
 		result = osd_ldiskfs_writelink(inode, buf-&amp;gt;lb_buf, buf-&amp;gt;lb_len);
 	&lt;span class=&quot;code-keyword&quot;&gt;else&lt;/span&gt;
 		result = osd_ldiskfs_write_record(dt, buf-&amp;gt;lb_buf, buf-&amp;gt;lb_len,
-						  is_link, pos, oh-&amp;gt;ot_handle);
+						  is_link, pos, oh-&amp;gt;ot_handle, oh);
 	&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (result == 0)
 		result = buf-&amp;gt;lb_len;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="272526" author="gerrit" created="Wed, 10 Jun 2020 20:51:26 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/38387/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/38387/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13195&quot; title=&quot;replay-single test_118: dt_declare_record_write() ASSERTION( dt-&amp;gt;do_body_ops ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13195&quot;&gt;&lt;del&gt;LU-13195&lt;/del&gt;&lt;/a&gt; osp: invalidate object on write error&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 9e1071b517578ed3752efb1412017c8f93cd333b&lt;/p&gt;</comment>
                            <comment id="273176" author="gerrit" created="Thu, 18 Jun 2020 07:01:43 +0000"  >&lt;p&gt;Alex Zhuravlev (bzzz@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/38977&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/38977&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13195&quot; title=&quot;replay-single test_118: dt_declare_record_write() ASSERTION( dt-&amp;gt;do_body_ops ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13195&quot;&gt;&lt;del&gt;LU-13195&lt;/del&gt;&lt;/a&gt; obdclass: show FID for corrupted llog&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: b381c6b220c888b7eacd0abf947add60bbf11231&lt;/p&gt;</comment>
                            <comment id="274398" author="adilger" created="Fri, 3 Jul 2020 23:43:13 +0000"  >&lt;p&gt;+1 on master with this patch applied &lt;a href=&quot;https://review.whamcloud.com/38387&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/38387&lt;/a&gt; &quot;&lt;tt&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13195&quot; title=&quot;replay-single test_118: dt_declare_record_write() ASSERTION( dt-&amp;gt;do_body_ops ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13195&quot;&gt;&lt;del&gt;LU-13195&lt;/del&gt;&lt;/a&gt; osp: invalidate object on write error&lt;/tt&gt;&quot;&lt;br/&gt;
 &lt;a href=&quot;https://testing.whamcloud.com/test_sets/16bbb988-9976-40c3-86d3-e673eec6e820&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/16bbb988-9976-40c3-86d3-e673eec6e820&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="277334" author="hornc" created="Wed, 12 Aug 2020 16:38:24 +0000"  >&lt;p&gt;+1 on master &lt;a href=&quot;https://testing.whamcloud.com/test_sessions/3d28761f-80fe-402e-9d21-53f977d88d40&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sessions/3d28761f-80fe-402e-9d21-53f977d88d40&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="285552" author="gerrit" created="Thu, 19 Nov 2020 11:01:09 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/38977/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/38977/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13195&quot; title=&quot;replay-single test_118: dt_declare_record_write() ASSERTION( dt-&amp;gt;do_body_ops ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13195&quot;&gt;&lt;del&gt;LU-13195&lt;/del&gt;&lt;/a&gt; obdclass: show FID for corrupted llog&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 5b74e0466edbfbbf9f336171de1adc5e583e9475&lt;/p&gt;</comment>
                            <comment id="287297" author="artem_blagodarenko" created="Fri, 11 Dec 2020 08:39:55 +0000"  >&lt;p&gt;+1&#160;&lt;a href=&quot;https://testing.whamcloud.com/test_sets/eca8222e-939a-437a-be41-0df8325c411d&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/eca8222e-939a-437a-be41-0df8325c411d&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="288964" author="jamesanunez" created="Thu, 7 Jan 2021 17:32:55 +0000"  >&lt;p&gt;Alex - &lt;/p&gt;

&lt;p&gt;Is this the same issue &lt;a href=&quot;https://testing.whamcloud.com/test_sets/dfff72b1-060d-4d53-b15b-b8a989d5c8d5&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/dfff72b1-060d-4d53-b15b-b8a989d5c8d5&lt;/a&gt; ?&lt;/p&gt;
</comment>
                            <comment id="289911" author="emoly.liu" created="Wed, 20 Jan 2021 09:07:04 +0000"  >&lt;p&gt;+1 on master:&#160;&lt;a href=&quot;https://testing.whamcloud.com/test_sets/757f2536-f148-46a7-8992-2ddef45293bf&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/757f2536-f148-46a7-8992-2ddef45293bf&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="290466" author="adilger" created="Wed, 27 Jan 2021 18:57:12 +0000"  >&lt;p&gt;+1 on master &lt;a href=&quot;https://testing.whamcloud.com/test_sets/27e54b38-cf46-466f-8fec-4cf25e9bb57a&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/27e54b38-cf46-466f-8fec-4cf25e9bb57a&lt;/a&gt;&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Lustre: DEBUG MARKER: == replay-single test 118: invalidate osp update will not cause update log corruption
Lustre: DEBUG MARKER: lctl set_param fail_loc=0x1705
Lustre: *** cfs_fail_loc=1705, val=0***
Lustre: Skipped 1 previous similar message
LustreError: 760439:0:(dt_object.h:2274:dt_declare_record_write()) ASSERTION( dt-&amp;gt;do_body_ops ) failed: [0x20003ccc2:0x1:0x0] doesn&apos;t exit
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="309209" author="adilger" created="Tue, 3 Aug 2021 23:11:08 +0000"  >&lt;p&gt;+1 on master &lt;a href=&quot;https://testing.whamcloud.com/test_sets/cb680395-862a-482a-98bb-c7b3da79cf68&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/cb680395-862a-482a-98bb-c7b3da79cf68&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="309210" author="adilger" created="Tue, 3 Aug 2021 23:17:47 +0000"  >&lt;p&gt;Alex, is patch &lt;a href=&quot;https://review.whamcloud.com/38385&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/38385&lt;/a&gt; &quot;&lt;tt&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13195&quot; title=&quot;replay-single test_118: dt_declare_record_write() ASSERTION( dt-&amp;gt;do_body_ops ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13195&quot;&gt;&lt;del&gt;LU-13195&lt;/del&gt;&lt;/a&gt; osp: track destroyed OSP object&lt;/tt&gt;&quot; expected to fix this problem, or is that more of a diagnostic patch?&lt;/p&gt;

&lt;p&gt;This subtest crashed about 10 of 200 runs in the past week.&lt;/p&gt;</comment>
                            <comment id="312072" author="eaujames" created="Fri, 3 Sep 2021 15:13:01 +0000"  >&lt;p&gt;It seems also to happen on runtest test_1:&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/test_sets/842d914c-2cce-427d-a5a7-4aadb1a36d21&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/842d914c-2cce-427d-a5a7-4aadb1a36d21&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="312120" author="adilger" created="Sat, 4 Sep 2021 17:31:21 +0000"  >&lt;p&gt;This seems to be failing fairly often on master recently - &quot;15.15% of most recent 66 runs, 34 skipped (all branches)&quot;&lt;/p&gt;</comment>
                            <comment id="312121" author="bzzz" created="Sat, 4 Sep 2021 17:42:43 +0000"  >&lt;p&gt;yes, looking at this one in background..&lt;/p&gt;</comment>
                            <comment id="312178" author="gerrit" created="Mon, 6 Sep 2021 16:01:16 +0000"  >&lt;p&gt;&quot;Alex Zhuravlev &amp;lt;bzzz@whamcloud.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/44853&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/44853&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13195&quot; title=&quot;replay-single test_118: dt_declare_record_write() ASSERTION( dt-&amp;gt;do_body_ops ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13195&quot;&gt;&lt;del&gt;LU-13195&lt;/del&gt;&lt;/a&gt; obdclass: catch llog corruption&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 513265c887b42247b93c23efe8e8d17c84013841&lt;/p&gt;</comment>
                            <comment id="313171" author="gerrit" created="Fri, 17 Sep 2021 14:06:44 +0000"  >&lt;p&gt;&quot;Oleg Drokin &amp;lt;green@whamcloud.com&amp;gt;&quot; merged in patch &lt;a href=&quot;https://review.whamcloud.com/38385/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/38385/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13195&quot; title=&quot;replay-single test_118: dt_declare_record_write() ASSERTION( dt-&amp;gt;do_body_ops ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13195&quot;&gt;&lt;del&gt;LU-13195&lt;/del&gt;&lt;/a&gt; osp: track destroyed OSP object&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: f5a8f1bcf5563f96cf6ba0e5de5a99a1ea524cc6&lt;/p&gt;</comment>
                            <comment id="313196" author="pjones" created="Fri, 17 Sep 2021 14:33:21 +0000"  >&lt;p&gt;Landed for 2.15&lt;/p&gt;</comment>
                            <comment id="313425" author="gerrit" created="Mon, 20 Sep 2021 17:15:26 +0000"  >&lt;p&gt;&lt;del&gt;&quot;Alex Zhuravlev &amp;lt;bzzz@whamcloud.com&amp;gt;&quot; uploaded a new patch:&lt;/del&gt; &lt;a href=&quot;https://review.whamcloud.com/44993&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/44993&lt;/a&gt;&lt;br/&gt;
&lt;del&gt;Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13195&quot; title=&quot;replay-single test_118: dt_declare_record_write() ASSERTION( dt-&amp;gt;do_body_ops ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13195&quot;&gt;&lt;del&gt;LU-13195&lt;/del&gt;&lt;/a&gt; osp: object creation needs a version&lt;/del&gt;&lt;br/&gt;
&lt;del&gt;Project: fs/lustre-release&lt;/del&gt;&lt;br/&gt;
&lt;del&gt;Branch: master&lt;/del&gt;&lt;br/&gt;
&lt;del&gt;Current Patch Set: 1&lt;/del&gt;&lt;br/&gt;
&lt;del&gt;Commit: a79be4c9521b4c35f5efcf813350e7097cfd4210&lt;/del&gt;&lt;/p&gt;</comment>
                            <comment id="313681" author="bzzz" created="Wed, 22 Sep 2021 14:19:11 +0000"  >&lt;p&gt;this still happens on master. my understanding is that the eviction in test 118 can cause different operations to abort. the last few times I checked, it was the llog object creation that got invalidated. I&apos;m working on a reliable reproducer.&lt;/p&gt;</comment>
                            <comment id="313906" author="gerrit" created="Fri, 24 Sep 2021 14:31:15 +0000"  >&lt;p&gt;&quot;Alex Zhuravlev &amp;lt;bzzz@whamcloud.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/45042&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/45042&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13195&quot; title=&quot;replay-single test_118: dt_declare_record_write() ASSERTION( dt-&amp;gt;do_body_ops ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13195&quot;&gt;&lt;del&gt;LU-13195&lt;/del&gt;&lt;/a&gt; obdclass: check another theory&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 3affa067b1ff7a5a4577eadbd269ae300ccb22cd&lt;/p&gt;</comment>
                            <comment id="314145" author="bzzz" created="Tue, 28 Sep 2021 14:07:55 +0000"  >&lt;p&gt;with the latest &lt;a href=&quot;https://review.whamcloud.com/45042&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/45042&lt;/a&gt; I can&apos;t reproduce the LBUG() anymore. basically it&apos;s a race - a create that failed (by intention) doesn&apos;t invalidate the request-in-progress properly, then that surviving request (containing a write to the object that just failed to create) flies to the remote MDT and we get the LBUG().&lt;br/&gt;
with the LBUG resolved I observed another issue - a few OSP structures from that improperly invalidated request can leak. I &lt;em&gt;think&lt;/em&gt; this is a slightly different issue and plan to fix that with another patch.&lt;/p&gt;</comment>
                            <comment id="315984" author="gerrit" created="Tue, 19 Oct 2021 17:08:48 +0000"  >&lt;p&gt;&quot;Oleg Drokin &amp;lt;green@whamcloud.com&amp;gt;&quot; merged in patch &lt;a href=&quot;https://review.whamcloud.com/45042/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/45042/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13195&quot; title=&quot;replay-single test_118: dt_declare_record_write() ASSERTION( dt-&amp;gt;do_body_ops ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13195&quot;&gt;&lt;del&gt;LU-13195&lt;/del&gt;&lt;/a&gt; osp: osp_send_update_req() should check generation&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: dff1e0d21c8c6bb20d63669252190795198bc49f&lt;/p&gt;</comment>
                            <comment id="315989" author="pjones" created="Tue, 19 Oct 2021 17:28:54 +0000"  >&lt;p&gt;So... complete it seems&lt;/p&gt;</comment>
                            <comment id="316459" author="eaujames" created="Mon, 25 Oct 2021 11:54:35 +0000"  >&lt;p&gt;+1 in runtest (with all the fix patches):&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/test_sets/7110dd40-b540-469d-a773-874769fe527b&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/7110dd40-b540-469d-a773-874769fe527b&lt;/a&gt;&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
[ 4327.451873] Lustre: DEBUG MARKER: copying 607 files from /etc /bin to /mnt/lustre/d1.runtests/etc /bin at Fri Oct 22 22:51:39 UTC 2021
[ 4334.578564] LustreError: 12949:0:(dt_object.h:2310:dt_declare_record_write()) ASSERTION( dt-&amp;gt;do_body_ops ) failed: [0x200011571:0x1:0x0] doesn&apos;t exit
[ 4334.581584] LustreError: 12949:0:(dt_object.h:2310:dt_declare_record_write()) LBUG
[ 4334.583011] Pid: 12949, comm: mdt_out00_001 4.18.0-240.22.1.el8_lustre.x86_64 #1 SMP Mon Oct 4 16:46:22 UTC 2021
[ 4334.585079] Call Trace TBD:
[ 4334.585913] [&amp;lt;0&amp;gt;] libcfs_call_trace+0x6f/0x90 [libcfs]
[ 4334.586882] [&amp;lt;0&amp;gt;] lbug_with_loc+0x43/0x80 [libcfs]
[ 4334.588281] [&amp;lt;0&amp;gt;] out_write_add_exec+0x17d/0x1e0 [ptlrpc]
[ 4334.589374] [&amp;lt;0&amp;gt;] out_write+0x166/0x380 [ptlrpc]
[ 4334.590282] [&amp;lt;0&amp;gt;] out_handle+0x16af/0x20e0 [ptlrpc]
[ 4334.591293] [&amp;lt;0&amp;gt;] tgt_request_handle+0xc93/0x1a00 [ptlrpc]
[ 4334.592391] [&amp;lt;0&amp;gt;] ptlrpc_server_handle_request+0x323/0xbd0 [ptlrpc]
[ 4334.593680] [&amp;lt;0&amp;gt;] ptlrpc_main+0xc06/0x1550 [ptlrpc]
[ 4334.594667] [&amp;lt;0&amp;gt;] kthread+0x112/0x130
[ 4334.595389] [&amp;lt;0&amp;gt;] ret_from_fork+0x35/0x40
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="316460" author="bzzz" created="Mon, 25 Oct 2021 12:34:50 +0000"  >&lt;p&gt;damn... looking&lt;/p&gt;</comment>
                            <comment id="317757" author="adilger" created="Tue, 9 Nov 2021 17:19:38 +0000"  >&lt;p&gt;Sorry, new failure was &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-15139&quot; title=&quot;sanity test_160h: dt_record_write() ASSERTION( dt-&amp;gt;do_body_ops-&amp;gt;dbo_write ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-15139&quot;&gt;&lt;del&gt;LU-15139&lt;/del&gt;&lt;/a&gt;, which is similar, but not identical. &lt;/p&gt;</comment>
                            <comment id="329562" author="gerrit" created="Fri, 18 Mar 2022 06:48:14 +0000"  >&lt;p&gt;&quot;Mike Pershin &amp;lt;mpershin@whamcloud.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/46863&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/46863&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13195&quot; title=&quot;replay-single test_118: dt_declare_record_write() ASSERTION( dt-&amp;gt;do_body_ops ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13195&quot;&gt;&lt;del&gt;LU-13195&lt;/del&gt;&lt;/a&gt; osp: invalidate object on write error&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: a6555d5b7f9a76250e8460adcb3d8a089356f490&lt;/p&gt;</comment>
                            <comment id="331288" author="gerrit" created="Thu, 7 Apr 2022 08:47:24 +0000"  >&lt;p&gt;&quot;Mike Pershin &amp;lt;mpershin@whamcloud.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/47010&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/47010&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13195&quot; title=&quot;replay-single test_118: dt_declare_record_write() ASSERTION( dt-&amp;gt;do_body_ops ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13195&quot;&gt;&lt;del&gt;LU-13195&lt;/del&gt;&lt;/a&gt; osp: osp_send_update_req() should check generation&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: e555f223a6ad340e7f326478cd09ba36b4b8bbb2&lt;/p&gt;</comment>
                            <comment id="333842" author="gerrit" created="Thu, 5 May 2022 06:10:09 +0000"  >&lt;p&gt;&quot;Oleg Drokin &amp;lt;green@whamcloud.com&amp;gt;&quot; merged in patch &lt;a href=&quot;https://review.whamcloud.com/46863/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/46863/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13195&quot; title=&quot;replay-single test_118: dt_declare_record_write() ASSERTION( dt-&amp;gt;do_body_ops ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13195&quot;&gt;&lt;del&gt;LU-13195&lt;/del&gt;&lt;/a&gt; osp: invalidate object on write error&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: be237a523e1208888f8f7d10e2a88709ea823a74&lt;/p&gt;</comment>
                            <comment id="347145" author="gerrit" created="Tue, 20 Sep 2022 03:35:27 +0000"  >&lt;p&gt;&quot;Oleg Drokin &amp;lt;green@whamcloud.com&amp;gt;&quot; merged in patch &lt;a href=&quot;https://review.whamcloud.com/47010/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/47010/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13195&quot; title=&quot;replay-single test_118: dt_declare_record_write() ASSERTION( dt-&amp;gt;do_body_ops ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13195&quot;&gt;&lt;del&gt;LU-13195&lt;/del&gt;&lt;/a&gt; osp: osp_send_update_req() should check generation&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: b18246d8b78a308c32a5f78eee581f16dae5dc44&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                            <outwardlinks description="duplicates">
                                        <issuelink>
            <issuekey id="48828">LU-10143</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="58183">LU-13295</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="58602">LU-13411</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="69839">LU-15769</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="58852">LU-13469</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="63086">LU-15139</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00t47:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>