<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:32:25 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-17075] replay-dual test_16 ZFS  MDS crash: osd_declare_destroy() ASSERTION(dt_object_exists)</title>
                <link>https://jira.whamcloud.com/browse/LU-17075</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This issue was created by maloo for Andreas Dilger &amp;lt;adilger@whamcloud.com&amp;gt;&lt;/p&gt;

&lt;p&gt;This issue relates to the following test suite run: &lt;a href=&quot;https://testing.whamcloud.com/test_sets/33a011e7-eb8f-4b7c-a3d7-b77e6350ffb6&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/33a011e7-eb8f-4b7c-a3d7-b77e6350ffb6&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;test_16 failed with the following crash on master 10 times out of 123 runs:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;trevis-24vm4 MDS crashed during replay-dual test_16

[ 2046.669261] Lustre: lustre-MDT0000: recovery is timed out, evict stale exports
[ 2046.670797] Lustre: 77088:0:(genops.c:1579:class_disconnect_stale_exports()) lustre-MDT0000: disconnect stale client 95a79532-73af-49b8-a4c8-d22c598e5614@&amp;lt;unknown&amp;gt;
[ 2046.673531] Lustre: lustre-MDT0000: disconnecting 1 stale clients
[ 2046.676737] LustreError: 77088:0:(osd_object.c:765:osd_declare_destroy()) ASSERTION( dt_object_exists(dt) ) failed: 
[ 2046.678755] LustreError: 77088:0:(osd_object.c:765:osd_declare_destroy()) LBUG
[ 2046.680139] Pid: 77088, comm: tgt_recover_0 4.18.0-477.15.1.el8_lustre.x86_64 #1 SMP Tue Aug 1 06:59:39 UTC 2023
[ 2046.682036] Call Trace TBD:
[ 2046.682768] [&amp;lt;0&amp;gt;] libcfs_call_trace+0x6f/0xa0 [libcfs]
[ 2046.683842] [&amp;lt;0&amp;gt;] lbug_with_loc+0x3f/0x70 [libcfs]
[ 2046.684807] [&amp;lt;0&amp;gt;] osd_declare_destroy+0x313/0x470 [osd_zfs]
[ 2046.685989] [&amp;lt;0&amp;gt;] out_destroy_add_exec+0x60/0x250 [ptlrpc]
[ 2046.687594] [&amp;lt;0&amp;gt;] update_recovery_exec.isra.22+0x1017/0x1f00 [ptlrpc]
[ 2046.688937] [&amp;lt;0&amp;gt;] distribute_txn_replay_handle+0x332/0xd60 [ptlrpc]
[ 2046.690247] [&amp;lt;0&amp;gt;] replay_request_or_update.isra.32+0x3dc/0xa20 [ptlrpc]
[ 2046.691612] [&amp;lt;0&amp;gt;] target_recovery_thread+0x742/0x1330 [ptlrpc]
[ 2046.692838] [&amp;lt;0&amp;gt;] kthread+0x134/0x150
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Test session details:&lt;br/&gt;
clients: &lt;a href=&quot;https://build.whamcloud.com/job/lustre-master-next/729&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://build.whamcloud.com/job/lustre-master-next/729&lt;/a&gt; - 4.18.0-372.32.1.el8_6.x86_64&lt;br/&gt;
servers: &lt;a href=&quot;https://build.whamcloud.com/job/lustre-master-next/729&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://build.whamcloud.com/job/lustre-master-next/729&lt;/a&gt; - 4.18.0-372.32.1.el8_lustre.x86_64&lt;/p&gt;

&lt;p&gt;This started failing on 2023-08-30 after a large set of patch landings:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;40c404129b LU-17038 tests: remove unused compile.sh script
57a671d991 LU-17038 tests: remove mlink utility
a0d8023d07 LU-12678 lnet: discard lnet_kvaddr_to_page
e16d5d7d6f LU-17043 enc: fix osd lookup cache for long encrypted names
b01f58e3a9 LU-15367 llite: iotrace standardization
a897803cc7 LU-17039 build: cleanup ib_dma_map_sg
4774d1fab1 LU-17038 tests: remove munlink utility
a52545afed LU-16510 build: check if CONFIG_FORTIFY_SOURCE is defined
479f9585b0 LU-17036 utils: make sure resize option is legit
f4c336ad93 LU-17031 build: fix refefine __compiletime_strlen error
e40be98400 LU-17030 llite: allow setting max_cached_mb to a %
b557fb21c8 LU-10885 docs: note flock now being enabled by default
9758129177 LU-17015 gss: support large kerberos token on client
7f60b2b558 LU-17006 lnet: set up routes for going across subnets
8a9c503c00 LU-16766 obdclass: trim kernel thread names in jobids
39df815cd6 LU-17020 kernel: update RHEL 9.2 [5.14.0-284.25.1.el9_2]
b1739ba3fa LU-17013 lov: fill FIEMAP_EXTENT_LAST flag
f75ac594a2 LU-17011 utils: monotonic clock in lfs mirror
77b133766b LU-17009 tests: fix runtests to read file name with backslash
1db59b7b60 LU-17000 lnet: remove redundant errno check in liblnetconfig.c
8e53a0ea59 LU-16866 tests: Use wait_update to check LNet recovery state
05b289450b LU-17000 misc: remove Coverity annotations
3c0b1fbf34 LU-16984 tests: replay-dual/31 checks file from DIR2
d684885098 LU-16961 clang: plugins and build system integration
a1bc7a84ce LU-16605 lfs: Add -n option to fid2path
74140e5df4 LU-16943 tests: fix replay-single/135 under hard failure mode
0439aaadb1 LU-16936 auster: add --client-only option
7ff1a88126 LU-16883 ldiskfs: update for ext4-delayed-iput for RHEL9.0
b9ce342ee1 LU-16896 flr: resync should not change file size
c9e752c141 LU-16906 build: Server for newer SUSE 15 SP3 kernels
2f0ebff4ad LU-16477 ldiskfs: Add ext4-enc-flag patch for SUSE 15 SP5
d6673e5456 LU-16821 llite: report 1MiB directory blocksize
8d24aa6b8e LU-16816 obdclass: make import_event more robust
99144a595b LU-16232 script: fix the argument parse
88141538c4 LU-9859 libcfs: discard cfs_gettok and cfs_str2num_check
8785f25b05 LU-16552 test: add new lnet test for Multi-Rail setups
d0a722cb8f LU-16374 ldiskfs: implement security.encdata xattr
fe5706e0c1 LU-16235 hsm: check CDT state before adding actions llog
7270e16fcb LU-15526 mdt: enable remote PDO lock
324aa79eb5 LU-13730 tests: add file mirroring to racer
1288681bb8 LU-14361 statahead: add statahead advise IOCTL
e656cccbdc LU-14156 utils: mirror split to check for last in-sync early
e2738d294d LU-12645 llite: Move readahead debug before exit
1f4151ba71 LU-6142 lov: cleanup unneeded macros from lov_request.c
ef414ce6e7 LU-6142 ptlrpc: Fix style issues for layout.c
017cb44d26 LU-6142 ptlrpc: Fix style issues for events.c
00c74b245f LU-16847 ldiskfs: refactor code.
afe5813a48 LU-16077 ptlrpc: Fix ptlrpc_body_v2 with pb_uid/pb_gid
91a3b286ba LU-16827 obdfilter: Fix obdfilter-survery/1a
5c59f8551a LU-11457 osd-ldiskfs: scrub FID reuse
d69b511af6 LU-16097 tests: skip quota subtests in interop
576928b2cd LU-13306 mgs: support large NID for mgs_write_log_osc_to_lov
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;







&lt;p&gt;VVVVVVV DO NOT REMOVE LINES BELOW, Added by Maloo for auto-association VVVVVVV&lt;br/&gt;
replay-dual test_16 - trevis-24vm4 crashed during replay-dual test_16&lt;/p&gt;</description>
                <environment></environment>
        <key id="77709">LU-17075</key>
            <summary>replay-dual test_16 ZFS  MDS crash: osd_declare_destroy() ASSERTION(dt_object_exists)</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="bzzz">Alex Zhuravlev</assignee>
                                    <reporter username="maloo">Maloo</reporter>
                        <labels>
                    </labels>
                <created>Fri, 1 Sep 2023 06:19:29 +0000</created>
                <updated>Tue, 16 Jan 2024 04:46:31 +0000</updated>
                            <resolved>Sat, 18 Nov 2023 22:08:32 +0000</resolved>
                                                    <fixVersion>Lustre 2.16.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>5</watches>
                                                                            <comments>
                            <comment id="384507" author="bzzz" created="Fri, 1 Sep 2023 06:24:17 +0000"  >&lt;p&gt;I&apos;m having a similar problem locally, bisected down to:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;7270e16fcb	5	3	0		LU-15526 mdt: enable remote PDO lock&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="385261" author="laisiyao" created="Fri, 8 Sep 2023 09:36:48 +0000"  >&lt;p&gt;It&apos;s a bit weird: replay-dual test_16() doesn&apos;t call mkdir, and all the test files are located on MDT0, but the backtrace shows it&apos;s update log based replay.&lt;/p&gt;</comment>
                            <comment id="385295" author="laisiyao" created="Fri, 8 Sep 2023 14:58:36 +0000"  >&lt;p&gt;It may be a directory created in an earlier test; let&apos;s revert this change.&lt;/p&gt;</comment>
                            <comment id="387076" author="gerrit" created="Mon, 25 Sep 2023 10:12:11 +0000"  >&lt;p&gt;&quot;Alex Zhuravlev &amp;lt;bzzz@whamcloud.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/52496&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/52496&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-17075&quot; title=&quot;replay-dual test_16 ZFS  MDS crash: osd_declare_destroy() ASSERTION(dt_object_exists)&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-17075&quot;&gt;&lt;del&gt;LU-17075&lt;/del&gt;&lt;/a&gt; osd: destroy declare shouldn&apos;t panic&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 7fcefa44274dd3754053f1323695e4c6e6290ea4&lt;/p&gt;</comment>
                            <comment id="391812" author="adilger" created="Sun, 5 Nov 2023 03:13:33 +0000"  >&lt;p&gt;Hit the same crash:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[ 1560.700808] Lustre: lustre-MDT0000: recovery is timed out, evict stale exports
[ 1560.702249] Lustre: 85844:0:(genops.c:1481:class_disconnect_stale_exports()) lustre-MDT0000: disconnect stale client fb74e030-180b-4830-b914-49fe8734c041@&amp;lt;unknown&amp;gt;
[ 1560.704865] Lustre: lustre-MDT0000: disconnecting 1 stale clients
[ 1560.708933] LustreError: 85844:0:(osd_object.c:765:osd_declare_destroy()) ASSERTION( dt_object_exists(dt) ) failed: 
[ 1560.710896] LustreError: 85844:0:(osd_object.c:765:osd_declare_destroy()) LBUG
[ 1560.712243] Pid: 85844, comm: tgt_recover_0 4.18.0-477.21.1.el8_lustre.x86_64 #1 SMP Sat Sep 23 17:41:02 UTC 2023
[ 1560.714111] Call Trace TBD:
[ 1560.714734] [&amp;lt;0&amp;gt;] libcfs_call_trace+0x63/0x90 [libcfs]
[ 1560.715734] [&amp;lt;0&amp;gt;] lbug_with_loc+0x3f/0x70 [libcfs]
[ 1560.716641] [&amp;lt;0&amp;gt;] osd_declare_destroy+0x313/0x470 [osd_zfs]
[ 1560.717711] [&amp;lt;0&amp;gt;] out_destroy_add_exec+0x64/0x260 [ptlrpc]
[ 1560.718927] [&amp;lt;0&amp;gt;] update_recovery_exec.isra.22+0x1017/0x1f00 [ptlrpc]
[ 1560.720204] [&amp;lt;0&amp;gt;] distribute_txn_replay_handle+0x332/0xd60 [ptlrpc]
[ 1560.721441] [&amp;lt;0&amp;gt;] replay_request_or_update.isra.30+0x3d8/0xa50 [ptlrpc]
[ 1560.722746] [&amp;lt;0&amp;gt;] target_recovery_thread+0x742/0x1340 [ptlrpc]
[ 1560.723913] [&amp;lt;0&amp;gt;] kthread+0x134/0x150
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;It makes me wonder if we should do something on the server to identify which OST/MDT target a thread is working on, to help debug issues like this?   Currently we name threads like &quot;&lt;tt&gt;mdt_io01_001&lt;/tt&gt;&quot; which is &quot;&lt;tt&gt;svc_type_cpt_thr&lt;/tt&gt;&quot;, so we are consuming 6 characters of the thread name for CPT identification, but it might be more useful to shrink the CPT identification (1-base64 char for CPT and 2-base64 char for thread number, using &lt;tt&gt;base64url_table&lt;/tt&gt;) and change the thread name to include the target index or something, like &quot;&lt;tt&gt;mdt_ioBAB0001&lt;/tt&gt;&quot;?&lt;/p&gt;

&lt;p&gt;I don&apos;t know if there is a cost to changing &lt;tt&gt;current-&amp;gt;comm&lt;/tt&gt;, but it might be more than just &quot;&lt;tt&gt;snprintf(current-&amp;gt;comm, ...)&lt;/tt&gt;&quot;, as I see there are &quot;&lt;tt&gt;__get_task_comm()&lt;/tt&gt;&quot; and &quot;&lt;tt&gt;__set_task_comm&lt;/tt&gt;&quot;.  I&apos;m not sure if there is something else we could do to get extra information printed into a stack trace?  Possibly something in &lt;tt&gt;LASSERT/LBUG&lt;/tt&gt; that could determine the current target, maybe print if the thread is holding a DLM lock, etc...&lt;/p&gt;</comment>
                            <comment id="393507" author="gerrit" created="Sat, 18 Nov 2023 21:44:28 +0000"  >&lt;p&gt;&quot;Oleg Drokin &amp;lt;green@whamcloud.com&amp;gt;&quot; merged in patch &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/52496/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/52496/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-17075&quot; title=&quot;replay-dual test_16 ZFS  MDS crash: osd_declare_destroy() ASSERTION(dt_object_exists)&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-17075&quot;&gt;&lt;del&gt;LU-17075&lt;/del&gt;&lt;/a&gt; osd: destroy declare shouldn&apos;t panic&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: fa370c0d724b5a90e2d739e5d3c67066facf550b&lt;/p&gt;</comment>
                            <comment id="393531" author="pjones" created="Sat, 18 Nov 2023 22:08:32 +0000"  >&lt;p&gt;Landed for 2.16&lt;/p&gt;</comment>
                            <comment id="399734" author="adilger" created="Mon, 15 Jan 2024 18:52:58 +0000"  >&lt;p&gt;It looks like this crash is still hit on master:&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/test_sets/6ed7a4d7-bdc8-4ec1-9c40-d153616b520e&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/6ed7a4d7-bdc8-4ec1-9c40-d153616b520e&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/test_sets/9d19737b-46e0-4b5e-8f09-a58c39713ae0&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/9d19737b-46e0-4b5e-8f09-a58c39713ae0&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="399788" author="bzzz" created="Tue, 16 Jan 2024 04:32:15 +0000"  >&lt;blockquote&gt;&lt;p&gt;It looks like this crash is still hit on master:&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;not sure how this is possible - there is no such assertion in the code:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
762)  ENTRY;
763) 
764)  LASSERT(th != NULL);
765)  &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (unlikely(obj-&amp;gt;oo_dn == NULL))
766)          RETURN(-ENOENT);
767) 
768)  oh = container_of(th, struct osd_thandle, ot_super);
769)  LASSERT(oh-&amp;gt;ot_tx != NULL);
770) 
771)  dmu_tx_mark_netfree(oh-&amp;gt;ot_tx);
772) 
773)  &lt;span class=&quot;code-comment&quot;&gt;/* declare that we&apos;ll remove object from fid-dnode mapping */&lt;/span&gt;
774)  zapid = osd_get_name_n_idx(env, osd, fid, NULL, 0, &amp;amp;dn);
775)  osd_tx_hold_zap(oh-&amp;gt;ot_tx, zapid, dn, FALSE, NULL);
776) 
777)  osd_declare_xattrs_destroy(env, obj, oh);

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="399791" author="adilger" created="Tue, 16 Jan 2024 04:46:31 +0000"  >&lt;p&gt;It looks like even though the tests were run recently, they had old parents w/o the patch  - 2.15.58.96 and 2.15.58.187, while the patch was landed as 2.15.59.18 two months ago.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="68526">LU-15526</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="78657">LU-17242</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i03uen:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>