<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:21:04 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary, append 'field=key&field=summary' to the URL of your request.
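For instance, https://jira.whamcloud.com/si/jira.issueviews:issue-xml/LU-15761/LU-15761.xml?field=key&field=summary (the standard JIRA XML issue view URL for this issue, given here only as an illustration) would return just those two fields.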
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-15761] cannot finish MDS recovery</title>
                <link>https://jira.whamcloud.com/browse/LU-15761</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Hit the same issue twice during soak testing on the same MDS node&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;trying to connect to soak-10 ...
2022-04-15 00:35:34,865:fsmgmt.fsmgmt:INFO     soak-10 is up!!!
2022-04-15 00:35:45,877:fsmgmt.fsmgmt:INFO     Failing over soaked-MDT0002 ...
2022-04-15 00:35:45,877:fsmgmt.fsmgmt:INFO     Mounting soaked-MDT0002 on soak-11 ...
2022-04-15 00:36:42,914:fsmgmt.fsmgmt:INFO     ... soaked-MDT0002 mounted successfully on soak-11
2022-04-15 00:36:42,914:fsmgmt.fsmgmt:INFO     ... soaked-MDT0002 failed over
2022-04-15 00:36:42,914:fsmgmt.fsmgmt:INFO     Wait for recovery to complete
2022-04-15 00:36:43,502:fsmgmt.fsmgmt:DEBUG    Recovery Result Record: {&apos;soak-11&apos;: {&apos;soaked-MDT0003&apos;: &apos;COMPLETE&apos;, &apos;soaked-MDT0002&apos;: &apos;WAITING&apos;}}
2022-04-15 00:36:43,503:fsmgmt.fsmgmt:INFO     soaked-MDT0002 in status &apos;WAITING&apos;.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;on soak-11&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[136090.466173] Lustre: 43762:0:(ldlm_lib.c:1965:extend_recovery_timer()) Skipped 29 previous similar messages
[136538.516253] Lustre: soaked-MDT0003: Client ec0f61fb-4e34-4a0b-b633-b84f3e77a68b (at 192.168.1.138@o2ib) reconnecting
[136538.528122] Lustre: Skipped 8 previous similar messages
[136570.792064] LustreError: 43760:0:(llog.c:781:llog_process_thread()) soaked-MDT0001-osp-MDT0002 retry remote llog process
[136570.804339] LustreError: 43760:0:(llog.c:781:llog_process_thread()) Skipped 36771 previous similar messages
[136690.448382] Lustre: 43762:0:(ldlm_lib.c:1965:extend_recovery_timer()) soaked-MDT0002: extended recovery timer reached hard limit: 900, extend: 1
[136690.462972] Lustre: 43762:0:(ldlm_lib.c:1965:extend_recovery_timer()) Skipped 29 previous similar messages
[137139.504191] Lustre: soaked-MDT0003: Client 6734fd00-3e57-422d-a9ca-e8a57bcfd8f1 (at 192.168.1.135@o2ib) reconnecting
[137139.516055] Lustre: Skipped 10 previous similar messages
[137170.777941] LustreError: 43760:0:(llog.c:781:llog_process_thread()) soaked-MDT0001-osp-MDT0002 retry remote llog process
[137170.790212] LustreError: 43760:0:(llog.c:781:llog_process_thread()) Skipped 36789 previous similar messages
[137180.761466] Lustre: mdt00_016: service thread pid 12073 was inactive for 200.281 seconds. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:
[137180.782540] Pid: 12073, comm: mdt00_016 3.10.0-1160.49.1.el7_lustre.x86_64 #1 SMP Mon Dec 13 22:00:59 UTC 2021
[137180.793815] Call Trace:
[137180.796783] [&amp;lt;0&amp;gt;] top_trans_stop+0x882/0xfa0 [ptlrpc]
[137180.802560] [&amp;lt;0&amp;gt;] lod_trans_stop+0x25c/0x340 [lod]
[137180.808065] [&amp;lt;0&amp;gt;] mdd_trans_stop+0x2e/0x174 [mdd]
[137180.813433] [&amp;lt;0&amp;gt;] mdd_create+0x154a/0x1cd0 [mdd]
[137180.818740] [&amp;lt;0&amp;gt;] mdo_create+0x46/0x48 [mdt]
[137180.823632] [&amp;lt;0&amp;gt;] mdt_create+0xab1/0xe40 [mdt]
[137180.828720] [&amp;lt;0&amp;gt;] mdt_reint_create+0x3a0/0x460 [mdt]
[137180.834388] [&amp;lt;0&amp;gt;] mdt_reint_rec+0x8a/0x240 [mdt]
[137180.839662] [&amp;lt;0&amp;gt;] mdt_reint_internal+0x76c/0xb50 [mdt]
[137180.845516] [&amp;lt;0&amp;gt;] mdt_reint+0x67/0x150 [mdt]
[137180.850456] [&amp;lt;0&amp;gt;] tgt_request_handle+0x92f/0x19c0 [ptlrpc]
[137180.856729] [&amp;lt;0&amp;gt;] ptlrpc_server_handle_request+0x253/0xc30 [ptlrpc]
[137180.863875] [&amp;lt;0&amp;gt;] ptlrpc_main+0xbf4/0x15e0 [ptlrpc]
[137180.869420] [&amp;lt;0&amp;gt;] kthread+0xd1/0xe0
[137180.873423] [&amp;lt;0&amp;gt;] ret_from_fork_nospec_begin+0x21/0x21
[137180.879286] [&amp;lt;0&amp;gt;] 0xfffffffffffffffe
[137290.446156] Lustre: 43762:0:(ldlm_lib.c:1965:extend_recovery_timer()) soaked-MDT0002: extended recovery timer reached hard limit: 900, extend: 1
[137290.460730] Lustre: 43762:0:(ldlm_lib.c:1965:extend_recovery_timer()) Skipped 29 previous similar messages
[137575.430810] Lustre: 7111:0:(service.c:1436:ptlrpc_at_send_early_reply()) @@@ Could not add any time (5/5), not sending early reply  req@ffff8f5f6cb94380 x1729527267995904/t176094400030(0) o36-&amp;gt;f5a20605-7e7a-45f1-bf01-daf0664944ec@192.168.1.118@o2ib:514/0 lens 552/448 e 24 to 0 dl 1649907299 ref 2 fl Interpret:/0/0 rc 0/0 job:&apos;&apos;
[137740.477658] Lustre: soaked-MDT0003: Client ec0f61fb-4e34-4a0b-b633-b84f3e77a68b (at 192.168.1.138@o2ib) reconnecting
[137740.489550] Lustre: Skipped 9 previous similar messages
[137770.755234] LustreError: 43760:0:(llog.c:781:llog_process_thread()) soaked-MDT0001-osp-MDT0002 retry remote llog process
[137770.767489] LustreError: 43760:0:(llog.c:781:llog_process_thread()) Skipped 36738 previous similar messages
[137890.442958] Lustre: 43762:0:(ldlm_lib.c:1965:extend_recovery_timer()) soaked-MDT0002: extended recovery timer reached hard limit: 900, extend: 1
[137890.457533] Lustre: 43762:0:(ldlm_lib.c:1965:extend_recovery_timer()) Skipped 29 previous similar messages
[138341.465763] Lustre: soaked-MDT0003: Client 6734fd00-3e57-422d-a9ca-e8a57bcfd8f1 (at 192.168.1.135@o2ib) reconnecting
[138341.477619] Lustre: Skipped 11 previous similar messages
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Had to run abort_recovery&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[188290.202475] Lustre: 43762:0:(ldlm_lib.c:1965:extend_recovery_timer()) Skipped 29 previous similar messages
[188327.367445] LustreError: 37182:0:(mdt_handler.c:7419:mdt_iocontrol()) soaked-MDT0002: Aborting recovery for device
[188327.379129] LustreError: 37182:0:(ldlm_lib.c:2876:target_stop_recovery_thread()) soaked-MDT0002: Aborting recovery
[188327.390814] Lustre: 43762:0:(ldlm_lib.c:2283:target_recovery_overseer()) recovery is aborted, evict exports in recovery
[188327.392289] LustreError: 43760:0:(lod_dev.c:424:lod_sub_recovery_thread()) soaked-MDT0001-osp-MDT0002 get update log failed: rc = -11
[188327.418009] Lustre: 43762:0:(ldlm_lib.c:2273:target_recovery_overseer()) soaked-MDT0002 recovery is aborted by hard timeout
[188327.486462] Lustre: soaked-MDT0002-osp-MDT0003: Connection restored to 192.168.1.111@o2ib (at 0@lo)
[188327.487906] Lustre: soaked-MDT0002: Recovery over after 2739:44, of 22 clients 22 recovered and 0 were evicted.
[188345.146528] Lustre: Failing over soaked-MDT0002
[188345.224943] LustreError: 3633:0:(mdt_reint.c:2741:mdt_reint_rename()) soaked-MDT0002: cannot lock for rename: rc = -5
[188345.236924] LustreError: 3633:0:(mdt_reint.c:2741:mdt_reint_rename()) Skipped 1 previous similar message
[188345.238945] Lustre: soaked-MDT0002: Not available for connect from 192.168.1.126@o2ib (stopping)
[188345.238946] Lustre: Skipped 2 previous similar messages
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment>2.15.0-rc3</environment>
        <key id="69804">LU-15761</key>
            <summary>cannot finish MDS recovery</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="tappro">Mikhail Pershin</assignee>
                                    <reporter username="sarah">Sarah Liu</reporter>
                        <labels>
                            <label>soak</label>
                    </labels>
                <created>Tue, 19 Apr 2022 17:18:29 +0000</created>
                <updated>Wed, 21 Dec 2022 12:08:49 +0000</updated>
                            <resolved>Thu, 5 May 2022 19:14:35 +0000</resolved>
                                    <version>Lustre 2.15.0</version>
                                    <fixVersion>Lustre 2.15.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>13</watches>
                                                                            <comments>
                            <comment id="332373" author="pjones" created="Tue, 19 Apr 2022 22:22:25 +0000"  >&lt;p&gt;Mike&lt;/p&gt;

&lt;p&gt;Could you please investigate?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="332374" author="adilger" created="Tue, 19 Apr 2022 22:32:08 +0000"  >&lt;p&gt;Sarah, it would be useful to know which was the last 2.14.5x version where this test was still passing?   Have you restarted soak with the current build to see if this is repeatable, or if it was a one-time failure?&lt;/p&gt;

&lt;p&gt;According to &lt;a href=&quot;https://wiki.whamcloud.com/display/Releases/Soak+Testing+on+Soak&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://wiki.whamcloud.com/display/Releases/Soak+Testing+on+Soak&lt;/a&gt; it looks like 2.14.56 was run on 20211223, and 2.15.0-RC2-2-g94f4e1f was run on 20220302.  Did those builds work without error?  &lt;/p&gt;

&lt;p&gt;The number of patches that landed since  2.15.0-RC2-2-g94f4e1f is quite small (excluding test patches):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;03e4b451b0dc New RC 2.15.0-RC3
649d638467c0 LU-15670 clio: Disable lockless for DIO with O_APPEND
22de0bd145b6 LU-15616 lnet: ln_api_mutex deadlocks
3e3f70eb1ec9 LU-13714 lnet: only update gateway NI status on discovery
0090b6f6f6cf LU-15692 lmv: change default hash back to fnv_1a_64
42a6d1fdb681 LU-15702 lov: remove lo_trunc_stripeno
ef826db1f43b LU-15548 osd-ldiskfs: hide virtual projid xattr
9884f37985c1 LU-15637 llite: Fix use of uninitialized fields
d8012811cc6f LU-15435 ptlrpc: unregister reply buffer on rq_err
4d93fd79e8b0 LU-15661 nodemap: fix map mode value for &apos;both&apos;
2f496148c31d LU-15551 ofd: Return EOPNOTSUPP instead of EPROTO
9dcbf8b3d44f LU-15634 ptlrpc: Use after free of &apos;conn&apos; in rhashtable retry
f14090e56c9d LU-15546 mdt: mdt_reint_open lookup before locking
966ca46e4aa2 LU-15608 sec: fix DIO for encrypted files
1dbcbd70f87b LU-15021 quota: protect lqe_glbl_data in lqe
5da859e262dd LU-15576 osp: Interop skip sanity test 823 for MDS &amp;lt; 2.14.56
06134ff9adde LU-15601 osd-ldiskfs: handle read_inode_bitmap() error
591a990c617f LU-15513 lod: skip uninit component in lod_fill_mirrors
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="332377" author="adilger" created="Tue, 19 Apr 2022 22:44:50 +0000"  >&lt;p&gt;Sarah, I didn&apos;t notice that you have already hit this problem twice.  Could you please test with patch &lt;a href=&quot;https://review.whamcloud.com/46837&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/46837&lt;/a&gt; &quot;&lt;tt&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-15645&quot; title=&quot;gap in recovery llog should not be a fatal error&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-15645&quot;&gt;&lt;del&gt;LU-15645&lt;/del&gt;&lt;/a&gt; obdclass: llog to handle gaps&lt;/tt&gt;&quot; (already based on 2.15.0-RC3) to see if this solves the recovery problem?&lt;/p&gt;</comment>
                            <comment id="332381" author="sarah" created="Wed, 20 Apr 2022 01:23:35 +0000"  >&lt;p&gt;Hi Andreas, I didn&apos;t notice this problem on the previous master tags, I will reload soak tomorrow to try the patch, thank you.&lt;/p&gt;</comment>
                            <comment id="332583" author="JIRAUSER17312" created="Thu, 21 Apr 2022 19:55:06 +0000"  >&lt;p&gt;Hi &lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=sarah&quot; class=&quot;user-hover&quot; rel=&quot;sarah&quot;&gt;sarah&lt;/a&gt;&#160;&lt;/p&gt;

&lt;p&gt;Any luck reproducing?&lt;/p&gt;

&lt;p&gt;-cf&lt;/p&gt;</comment>
                            <comment id="332855" author="sarah" created="Mon, 25 Apr 2022 15:53:05 +0000"  >&lt;p&gt;Hi Colin,&lt;/p&gt;

&lt;p&gt;soak doesn&apos;t hit the same issue; instead it has another issue on the other pair of MDSs (soak-8 and soak-9) during failback.&lt;/p&gt;

&lt;p&gt;on soak-8, umount hung when it tried to failback MDT0001&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[13201.893886] Lustre: Skipped 3 previous similar messages
[13226.408821] LDISKFS-fs warning (device dm-1): ldiskfs_multi_mount_protect:321: MMP interval 42 higher than expected, please wait.

[13241.225041] LustreError: 137-5: soaked-MDT0001_UUID: not available for connect from 192.168.1.128@o2ib (no target). If you are running an HA pair check that the target is mounted on the other server.
[13241.244882] LustreError: Skipped 929 previous similar messages
[13272.213200] LDISKFS-fs (dm-1): recovery complete
[13272.218869] LDISKFS-fs (dm-1): mounted filesystem with ordered data mode. Opts: user_xattr,errors=remount-ro,user_xattr,no_mbcache,nodelalloc
[13274.329038] Lustre: soaked-MDT0001: Not available for connect from 192.168.1.111@o2ib (not set up)
[13274.339066] Lustre: Skipped 1 previous similar message
[13274.402654] Lustre: soaked-MDT0000: Received MDS connection from 0@lo, removing former export from 192.168.1.109@o2ib
[13274.615826] Lustre: soaked-MDT0001: Imperative Recovery enabled, recovery window shrunk from 300-900 down to 150-900
[13274.633101] Lustre: soaked-MDT0001: in recovery but waiting for the first client to connect
[13274.686553] Lustre: soaked-MDT0001: Will be in recovery for at least 2:30, or until 22 clients reconnect
[13283.173712] Lustre: soaked-MDT0001-osp-MDT0000: Connection restored to 192.168.1.108@o2ib (at 0@lo)
[13283.174024] Lustre: soaked-MDT0001: Recovery over after 0:08, of 22 clients 22 recovered and 0 were evicted.
[13283.187022] Lustre: 3818:0:(mdt_recovery.c:200:mdt_req_from_lrd()) @@@ restoring transno  req@ffff9849b4361680 x1730744898968000/t38655174014(0) o36-&amp;gt;88b0180e-0530-4e8d-8af5-9a48dbccd57d@192.168.1.119@o2ib:271/0 lens 496/2888 e 0 to 0 dl 1650719436 ref 1 fl Interpret:/2/0 rc 0/0 job:&apos;&apos;
[13283.187026] Lustre: 3818:0:(mdt_recovery.c:200:mdt_req_from_lrd()) Skipped 3 previous similar messages
[13291.941059] Lustre: Failing over soaked-MDT0001
[13291.949360] LustreError: 5845:0:(mdt_reint.c:2741:mdt_reint_rename()) soaked-MDT0001: cannot lock for rename: rc = -5
[13291.957834] LustreError: 3666:0:(ldlm_lockd.c:2500:ldlm_cancel_handler()) ldlm_cancel from 0@lo arrived at 1650719399 with bad export cookie 17448559810846504576
[13291.957883] Lustre: soaked-MDT0001-osp-MDT0000: Connection to soaked-MDT0001 (at 0@lo) was lost; in progress operations using this service will wait for recovery to complete
[13291.957978] LustreError: 11-0: soaked-MDT0001-osp-MDT0000: operation out_update to node 0@lo failed: rc = -107
[13291.957984] Lustre: soaked-MDT0001: Not available for connect from 0@lo (stopping)
[13291.996099] LustreError: 3684:0:(osp_object.c:629:osp_attr_get()) soaked-MDT0002-osp-MDT0001: osp_attr_get update error [0x280004a85:0x3:0x0]: rc = -108
[13292.029469] LustreError: 5845:0:(mdt_reint.c:2741:mdt_reint_rename()) Skipped 2 previous similar messages
[13293.222589] Lustre: soaked-MDT0001: Not available for connect from 192.168.1.111@o2ib (stopping)
[13293.232426] Lustre: Skipped 7 previous similar messages
[13294.448007] LustreError: 3645:0:(client.c:1256:ptlrpc_import_delay_req()) @@@ IMP_CLOSED  req@ffff9849b28df500 x1730891251304896/t0(0) o41-&amp;gt;soaked-MDT0002-osp-MDT0001@192.168.1.110@o2ib:24/4 lens 224/368 e 0 to 0 dl 0 ref 1 fl Rpc:QU/0/ffffffff rc 0/-1 job:&apos;&apos;
[13294.473664] LustreError: 3645:0:(client.c:1256:ptlrpc_import_delay_req()) Skipped 2 previous similar messages
[13296.155550] Lustre: soaked-MDT0001: Not available for connect from 192.168.1.129@o2ib (stopping)
[13296.165402] Lustre: Skipped 1 previous similar message
[13300.253260] Lustre: soaked-MDT0001: Not available for connect from 192.168.1.121@o2ib (stopping)
[13300.263095] Lustre: Skipped 13 previous similar messages
[13308.262410] Lustre: soaked-MDT0001: Not available for connect from 192.168.1.111@o2ib (stopping)
[13308.272238] Lustre: Skipped 36 previous similar messages
[13324.545237] Lustre: soaked-MDT0001: Not available for connect from 192.168.1.127@o2ib (stopping)
[13324.555060] Lustre: Skipped 122 previous similar messages
[13357.071204] Lustre: soaked-MDT0001: Not available for connect from 0@lo (stopping)
[13357.079677] Lustre: Skipped 226 previous similar messages
[13421.084164] Lustre: soaked-MDT0001: Not available for connect from 192.168.1.123@o2ib (stopping)
[13421.094016] Lustre: Skipped 458 previous similar messages
[13441.068783] INFO: task umount:25201 blocked for more than 120 seconds.
[13441.076096] &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
[13441.084855] umount          D ffff9845eaf8d280     0 25201  25197 0x00000080
[13441.092782] Call Trace:
[13441.095539]  [&amp;lt;ffffffff91b89179&amp;gt;] schedule+0x29/0x70
[13441.101100]  [&amp;lt;ffffffff91b8a8b5&amp;gt;] rwsem_down_write_failed+0x215/0x3c0
[13441.108344]  [&amp;lt;ffffffffc1ac50e0&amp;gt;] ? osp_trans_update_request_create+0x30/0x50 [osp]
[13441.116905]  [&amp;lt;ffffffff91798327&amp;gt;] call_rwsem_down_write_failed+0x17/0x30
[13441.124415]  [&amp;lt;ffffffff91b8868d&amp;gt;] down_write+0x2d/0x3d
[13441.130225]  [&amp;lt;ffffffffc0df0bbe&amp;gt;] llog_cancel_arr_rec+0x14e/0xa40 [obdclass]
[13441.138142]  [&amp;lt;ffffffffc0df14ce&amp;gt;] llog_cancel_rec+0x1e/0x20 [obdclass]
[13441.145474]  [&amp;lt;ffffffffc0df813e&amp;gt;] llog_cat_cleanup+0xee/0x3f0 [obdclass]
[13441.153015]  [&amp;lt;ffffffffc0df8556&amp;gt;] llog_cat_close+0x116/0x240 [obdclass]
[13441.160452]  [&amp;lt;ffffffffc19979e6&amp;gt;] lod_sub_fini_llog+0xa6/0x2e0 [lod]
[13441.167568]  [&amp;lt;ffffffffc1998ccc&amp;gt;] lod_process_config+0x76c/0x1340 [lod]
[13441.174981]  [&amp;lt;ffffffffc1a292fe&amp;gt;] mdd_process_config+0x8e/0x640 [mdd]
[13441.182214]  [&amp;lt;ffffffffc1870921&amp;gt;] mdt_stack_pre_fini+0x271/0x6f0 [mdt]
[13441.189538]  [&amp;lt;ffffffffc1871b37&amp;gt;] mdt_device_fini+0x87/0x990 [mdt]
[13441.196489]  [&amp;lt;ffffffffc0e2de91&amp;gt;] class_cleanup+0xa61/0xd20 [obdclass]
[13441.203849]  [&amp;lt;ffffffffc0e2ee97&amp;gt;] class_process_config+0x6a7/0x2b20 [obdclass]
[13441.211958]  [&amp;lt;ffffffffc0e314d6&amp;gt;] class_manual_cleanup+0x1c6/0x760 [obdclass]
[13441.219967]  [&amp;lt;ffffffffc0e6b6f5&amp;gt;] server_put_super+0xa25/0xf80 [obdclass]
[13441.227555]  [&amp;lt;ffffffff916977f6&amp;gt;] ? fsnotify_unmount_inodes+0x46/0x1d0
[13441.234856]  [&amp;lt;ffffffff91650f5d&amp;gt;] generic_shutdown_super+0x6d/0x100
[13441.241866]  [&amp;lt;ffffffff91651362&amp;gt;] kill_anon_super+0x12/0x20
[13441.248134]  [&amp;lt;ffffffffc14d2d7b&amp;gt;] lustre_kill_super+0x2b/0x30 [lustre]
[13441.255455]  [&amp;lt;ffffffff9165173e&amp;gt;] deactivate_locked_super+0x4e/0x70
[13441.262458]  [&amp;lt;ffffffff91651ec6&amp;gt;] deactivate_super+0x46/0x60
[13441.268792]  [&amp;lt;ffffffff9167130f&amp;gt;] cleanup_mnt+0x3f/0x80
[13441.274634]  [&amp;lt;ffffffff916713a2&amp;gt;] __cleanup_mnt+0x12/0x20
[13441.280692]  [&amp;lt;ffffffff914c299b&amp;gt;] task_work_run+0xbb/0xe0
[13441.286737]  [&amp;lt;ffffffff9142cc65&amp;gt;] do_notify_resume+0xa5/0xc0
[13441.293085]  [&amp;lt;ffffffff91b962ef&amp;gt;] int_signal+0x12/0x17
[13549.118275] Lustre: soaked-MDT0001: Not available for connect from 192.168.1.102@o2ib (stopping)
[13549.128106] Lustre: Skipped 917 previous similar messages
[13561.296954] INFO: task umount:25201 blocked for more than 120 seconds.
[13561.304262] &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
[13561.313021] umount          D ffff9845eaf8d280     0 25201  25197 0x00000080
[13561.320948] Call Trace:
[13561.323722]  [&amp;lt;ffffffff91b89179&amp;gt;] schedule+0x29/0x70
[13561.329295]  [&amp;lt;ffffffff91b8a8b5&amp;gt;] rwsem_down_write_failed+0x215/0x3c0
[13561.336541]  [&amp;lt;ffffffffc1ac50e0&amp;gt;] ? osp_trans_update_request_create+0x30/0x50 [osp]
[13561.345118]  [&amp;lt;ffffffff91798327&amp;gt;] call_rwsem_down_write_failed+0x17/0x30
[13561.352614]  [&amp;lt;ffffffff91b8868d&amp;gt;] down_write+0x2d/0x3d
[13561.358425]  [&amp;lt;ffffffffc0df0bbe&amp;gt;] llog_cancel_arr_rec+0x14e/0xa40 [obdclass]
[13561.366349]  [&amp;lt;ffffffffc0df14ce&amp;gt;] llog_cancel_rec+0x1e/0x20 [obdclass]
[13561.373688]  [&amp;lt;ffffffffc0df813e&amp;gt;] llog_cat_cleanup+0xee/0x3f0 [obdclass]
[13561.381198]  [&amp;lt;ffffffffc0df8556&amp;gt;] llog_cat_close+0x116/0x240 [obdclass]
[13561.388628]  [&amp;lt;ffffffffc19979e6&amp;gt;] lod_sub_fini_llog+0xa6/0x2e0 [lod]
[13561.395764]  [&amp;lt;ffffffffc1998ccc&amp;gt;] lod_process_config+0x76c/0x1340 [lod]
[13561.403174]  [&amp;lt;ffffffffc1a292fe&amp;gt;] mdd_process_config+0x8e/0x640 [mdd]
[13561.410429]  [&amp;lt;ffffffffc1870921&amp;gt;] mdt_stack_pre_fini+0x271/0x6f0 [mdt]
[13561.417740]  [&amp;lt;ffffffffc1871b37&amp;gt;] mdt_device_fini+0x87/0x990 [mdt]
[13561.424679]  [&amp;lt;ffffffffc0e2de91&amp;gt;] class_cleanup+0xa61/0xd20 [obdclass]
[13561.432019]  [&amp;lt;ffffffffc0e2ee97&amp;gt;] class_process_config+0x6a7/0x2b20 [obdclass]
[13561.440115]  [&amp;lt;ffffffffc0e314d6&amp;gt;] class_manual_cleanup+0x1c6/0x760 [obdclass]
[13561.448130]  [&amp;lt;ffffffffc0e6b6f5&amp;gt;] server_put_super+0xa25/0xf80 [obdclass]
[13561.455738]  [&amp;lt;ffffffff916977f6&amp;gt;] ? fsnotify_unmount_inodes+0x46/0x1d0
[13561.463042]  [&amp;lt;ffffffff91650f5d&amp;gt;] generic_shutdown_super+0x6d/0x100
[13561.470059]  [&amp;lt;ffffffff91651362&amp;gt;] kill_anon_super+0x12/0x20
[13561.476333]  [&amp;lt;ffffffffc14d2d7b&amp;gt;] lustre_kill_super+0x2b/0x30 [lustre]
[13561.483635]  [&amp;lt;ffffffff9165173e&amp;gt;] deactivate_locked_super+0x4e/0x70
[13561.490656]  [&amp;lt;ffffffff91651ec6&amp;gt;] deactivate_super+0x46/0x60
[13561.497002]  [&amp;lt;ffffffff9167130f&amp;gt;] cleanup_mnt+0x3f/0x80
[13561.502856]  [&amp;lt;ffffffff916713a2&amp;gt;] __cleanup_mnt+0x12/0x20
[13561.508890]  [&amp;lt;ffffffff914c299b&amp;gt;] task_work_run+0xbb/0xe0
[13561.514946]  [&amp;lt;ffffffff9142cc65&amp;gt;] do_notify_resume+0xa5/0xc0
[13561.521284]  [&amp;lt;ffffffff91b962ef&amp;gt;] int_signal+0x12/0x17
[13563.147828] Lustre: mdt01_009: service thread pid 3882 was inactive for 236.694 seconds. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:
[13563.168704] Pid: 3882, comm: mdt01_009 3.10.0-1160.49.1.el7_lustre.x86_64 #1 SMP Mon Dec 13 22:00:59 UTC 2021
[13563.179794] Call Trace:
[13563.182673] [&amp;lt;0&amp;gt;] top_trans_stop+0x882/0xfa0 [ptlrpc]
[13563.188353] [&amp;lt;0&amp;gt;] lod_trans_stop+0x25c/0x340 [lod]
[13563.193761] [&amp;lt;0&amp;gt;] mdd_trans_stop+0x2e/0x174 [mdd]
[13563.199052] [&amp;lt;0&amp;gt;] mdd_create+0x154a/0x1cd0 [mdd]
[13563.204259] [&amp;lt;0&amp;gt;] mdo_create+0x46/0x48 [mdt]
[13563.209061] [&amp;lt;0&amp;gt;] mdt_create+0xab1/0xe40 [mdt]
[13563.214062] [&amp;lt;0&amp;gt;] mdt_reint_create+0x3a0/0x460 [mdt]
[13563.219660] [&amp;lt;0&amp;gt;] mdt_reint_rec+0x8a/0x240 [mdt]
[13563.224847] [&amp;lt;0&amp;gt;] mdt_reint_internal+0x76c/0xb50 [mdt]
[13563.230632] [&amp;lt;0&amp;gt;] mdt_reint+0x67/0x150 [mdt]
[13563.235465] [&amp;lt;0&amp;gt;] tgt_request_handle+0x92f/0x19c0 [ptlrpc]
[13563.241663] [&amp;lt;0&amp;gt;] ptlrpc_server_handle_request+0x253/0xc30 [ptlrpc]
[13563.248722] [&amp;lt;0&amp;gt;] ptlrpc_main+0xbf4/0x15e0 [ptlrpc]
[13563.254185] [&amp;lt;0&amp;gt;] kthread+0xd1/0xe0
[13563.258089] [&amp;lt;0&amp;gt;] ret_from_fork_nospec_begin+0x21/0x21
[13563.263866] [&amp;lt;0&amp;gt;] 0xfffffffffffffffe
[13591.927826] Lustre: soaked-MDT0000: Received new MDS connection from 192.168.1.110@o2ib, keep former export from same NID
[13591.940130] Lustre: Skipped 1 previous similar message
[13681.525125] INFO: task umount:25201 blocked for more than 120 seconds.
[13681.532427] &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
[13681.541183] umount          D ffff9845eaf8d280     0 25201  25197 0x00000080
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="332888" author="bzzz" created="Mon, 25 Apr 2022 18:22:45 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=sarah&quot; class=&quot;user-hover&quot; rel=&quot;sarah&quot;&gt;sarah&lt;/a&gt; would it be possible to get all traces for the case?&lt;/p&gt;</comment>
                            <comment id="332900" author="sarah" created="Mon, 25 Apr 2022 21:06:50 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=bzzz&quot; class=&quot;user-hover&quot; rel=&quot;bzzz&quot;&gt;bzzz&lt;/a&gt;, please check the attached. &lt;/p&gt;</comment>
                            <comment id="333399" author="bzzz" created="Fri, 29 Apr 2022 08:42:11 +0000"  >&lt;p&gt;still looking at the logs, but haven&apos;t got a good idea yet. is it reproducable?&lt;/p&gt;</comment>
                            <comment id="333547" author="sarah" created="Mon, 2 May 2022 16:12:16 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=bzzz&quot; class=&quot;user-hover&quot; rel=&quot;bzzz&quot;&gt;bzzz&lt;/a&gt;, the same umount hang issue happened again on a different node,  if you want to check the status, please let me know. I will also upload the trace from soak-10 which the hang happened&lt;/p&gt;</comment>
                            <comment id="333551" author="bzzz" created="Mon, 2 May 2022 16:36:36 +0000"  >&lt;p&gt;can you grab all the traces with echo t &amp;gt;/proc/sysrq-trigger please?&lt;/p&gt;</comment>
                            <comment id="333553" author="bzzz" created="Mon, 2 May 2022 16:44:24 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=sarah&quot; class=&quot;user-hover&quot; rel=&quot;sarah&quot;&gt;sarah&lt;/a&gt; thank you, checking..&lt;/p&gt;</comment>
                            <comment id="333555" author="bzzz" created="Mon, 2 May 2022 16:45:53 +0000"  >&lt;p&gt;and the kernel logs from the servers please&lt;/p&gt;</comment>
                            <comment id="333576" author="gerrit" created="Mon, 2 May 2022 18:39:11 +0000"  >&lt;p&gt;&quot;Alex Zhuravlev &amp;lt;bzzz@whamcloud.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/47185&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/47185&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-15761&quot; title=&quot;cannot finish MDS recovery&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-15761&quot;&gt;&lt;del&gt;LU-15761&lt;/del&gt;&lt;/a&gt; obdclass: fix locking in llog_cat_refresh()&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: f161c9dd6fcc11dbb044225a05d6d283d665424f&lt;/p&gt;</comment>
                            <comment id="333584" author="adilger" created="Mon, 2 May 2022 20:09:56 +0000"  >&lt;p&gt;It looks like this locking imbalance was introduced by patch &lt;a href=&quot;https://review.whamcloud.com/33401&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/33401&lt;/a&gt; &quot;&lt;tt&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11418&quot; title=&quot;hung threads on MDT and MDT won&amp;#39;t umount&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11418&quot;&gt;&lt;del&gt;LU-11418&lt;/del&gt;&lt;/a&gt; llog: refresh remote llog upon -ESTALE&lt;/tt&gt;&quot;.&lt;/p&gt;

&lt;p&gt;The very strange thing is that the patch landed in 2.12.0-RC1, but this was not noticed until now. It is possible that some other recent landing has made this codepath more likely to be hit, because otherwise it seems like the code would always have been broken.&lt;/p&gt;</comment>
                            <comment id="333589" author="adilger" created="Mon, 2 May 2022 20:35:49 +0000"  >&lt;p&gt;This may be related to several llog-related test failures that were previously reported, all of them with the &quot;&lt;tt&gt;retry remote llog process&lt;/tt&gt;&quot; message printed.&lt;/p&gt;

&lt;p&gt;Looking at the callers of &lt;tt&gt;llog_cat_refresh()&lt;/tt&gt;, it looks like this could happen in combination with the patch &lt;a href=&quot;https://review.whamcloud.com/40742&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/40742&lt;/a&gt; &quot;&lt;tt&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13974&quot; title=&quot;Wrong behavior for out operations create+write especially for update log&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13974&quot;&gt;&lt;del&gt;LU-13974&lt;/del&gt;&lt;/a&gt; llog: check stale osp object&lt;/tt&gt;&quot; landed in 2.15.0 that returns &lt;tt&gt;-ESTALE&lt;/tt&gt; more frequently:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; llog_cat_declare_add_rec()
{
        :
start:
        down_read_nested(&amp;amp;cathandle-&amp;gt;lgh_lock, LLOGH_CAT);
        :  
        rc = llog_declare_write_rec(env, cathandle-&amp;gt;u.chd.chd_current_log,
                                    rec, -1, th);
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (rc == -ESTALE &amp;amp;&amp;amp; dt_object_remote(cathandle-&amp;gt;lgh_obj)) {
                up_read(&amp;amp;cathandle-&amp;gt;lgh_lock);
                rc = llog_cat_refresh(env, cathandle);
                &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (rc)
                        RETURN(rc);
                &lt;span class=&quot;code-keyword&quot;&gt;goto&lt;/span&gt; start;
        }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="333612" author="adilger" created="Tue, 3 May 2022 03:40:27 +0000"  >&lt;p&gt;Sarah, are you able to restart Soak with Alex&apos;s patch on master?  You were previously able to hit this issue easily, so hopefully it will tell us quickly if the patch fixes it.&lt;/p&gt;</comment>
                            <comment id="333654" author="bzzz" created="Tue, 3 May 2022 15:03:54 +0000"  >&lt;p&gt;Sarah replied on Skype: I have loaded soak with it yesterday, will check the status a little later and let you know &lt;/p&gt;</comment>
                            <comment id="333678" author="sarah" created="Tue, 3 May 2022 17:11:58 +0000"  >&lt;p&gt;Hi Andreas, last time the failure occurred after 6/7 hours running, so far soak has been up over 10 hours and it is doing well.&lt;/p&gt;</comment>
                            <comment id="333781" author="sarah" created="Wed, 4 May 2022 16:25:32 +0000"  >&lt;p&gt;A quick update on soak, it doesn&apos;t hit the umount issue during failover, but it did hit &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-14495&quot; title=&quot;(llog_osd.c:622:llog_osd_write_rec()) LBUG&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-14495&quot;&gt;&lt;del&gt;LU-14495&lt;/del&gt;&lt;/a&gt; on MDS0,  test is still ongoing since disabled panic-on-lbug. I will upload the trace and debug log here since &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-14495&quot; title=&quot;(llog_osd.c:622:llog_osd_write_rec()) LBUG&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-14495&quot;&gt;&lt;del&gt;LU-14495&lt;/del&gt;&lt;/a&gt; is closed.&lt;/p&gt;</comment>
                            <comment id="333787" author="bzzz" created="Wed, 4 May 2022 17:31:33 +0000"  >&lt;p&gt;thanks for the update. I&apos;m going to check the logs. in the mean time, can you please restart?&lt;/p&gt;</comment>
                            <comment id="333796" author="bzzz" created="Wed, 4 May 2022 19:18:30 +0000"  >&lt;p&gt;I don&apos;t understand where this problem comes from:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[ 5814.198337] LustreError: 12981:0:(out_handler.c:910:out_tx_end()) soaked-MDT0000-osd: undo &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; /tmp/rpmbuild-lustre-jenkins-ZEMyr9OQ/BUILD/lustre-2.15.0_RC3_3_gf161c9d/lustre/ptlrpc/../../lustre/target/out_handler.c:445: rc = -524&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;I want to debug this.&lt;br/&gt;
Would it be possible to run the test with full debug enabled and dump the Lustre debug log upon this message?&lt;/p&gt;</comment>
                            <comment id="333800" author="sarah" created="Wed, 4 May 2022 19:33:03 +0000"  >&lt;p&gt;okay, I will restart with full debug&lt;/p&gt;</comment>
                            <comment id="333897" author="sarah" created="Thu, 5 May 2022 16:01:47 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=bzzz&quot; class=&quot;user-hover&quot; rel=&quot;bzzz&quot;&gt;bzzz&lt;/a&gt;, please check the log lustre-log.1651732904.9736, this is with full debug, hope it helps&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[33368.708107] LustreError: 8542:0:(llog_cat.c:789:llog_cat_cancel_records()) Skipped 343 previous similar messages
[33384.336498] Lustre: soaked-MDT0003: haven&apos;t heard from client soaked-MDT0003-lwp-OST0005_UUID (at 192.168.1.105@o2ib) in 227 seconds. I think it&apos;s dead, and I am evicting it. exp ffff916a71261400, cur 1651732900 expire 1651732750 last 1651732673
[33384.360772] Lustre: Skipped 3 previous similar messages
[33388.209816] LustreError: 9736:0:(llog_osd.c:626:llog_osd_write_rec()) soaked-MDT0001-osp-MDT0003: index 40453 already set in llog bitmap [0x240000403:0x3:0x0]
[33388.225651] LustreError: 9736:0:(llog_osd.c:628:llog_osd_write_rec()) LBUG
[33388.233370] Pid: 9736, comm: mdt00_011 3.10.0-1160.49.1.el7_lustre.x86_64 #1 SMP Sun Apr 3 16:20:30 UTC 2022
[33388.244363] Call Trace:
[33388.247123] [&amp;lt;0&amp;gt;] libcfs_call_trace+0x90/0xf0 [libcfs]
[33388.252872] [&amp;lt;0&amp;gt;] lbug_with_loc+0x4c/0xa0 [libcfs]
[33388.258268] [&amp;lt;0&amp;gt;] llog_osd_write_rec+0x1a3a/0x1a70 [obdclass]
[33388.264714] [&amp;lt;0&amp;gt;] llog_write_rec+0x293/0x590 [obdclass]
[33388.270573] [&amp;lt;0&amp;gt;] llog_cat_add_rec+0x1d9/0x980 [obdclass]
[33388.276617] [&amp;lt;0&amp;gt;] llog_add+0x182/0x1f0 [obdclass]
[33388.281964] [&amp;lt;0&amp;gt;] sub_updates_write+0x302/0xe3b [ptlrpc]
[33388.287970] [&amp;lt;0&amp;gt;] top_trans_stop+0x4a2/0xfa0 [ptlrpc]
[33388.293652] [&amp;lt;0&amp;gt;] lod_trans_stop+0x25c/0x340 [lod]
[33388.299032] [&amp;lt;0&amp;gt;] mdd_trans_stop+0x2e/0x174 [mdd]
[33388.304308] [&amp;lt;0&amp;gt;] mdd_create+0x154a/0x1cd0 [mdd]
[33388.309491] [&amp;lt;0&amp;gt;] mdo_create+0x46/0x48 [mdt]
[33388.314301] [&amp;lt;0&amp;gt;] mdt_create+0xab1/0xe40 [mdt]
[33388.319269] [&amp;lt;0&amp;gt;] mdt_reint_create+0x3a0/0x460 [mdt]
[33388.324829] [&amp;lt;0&amp;gt;] mdt_reint_rec+0x8a/0x240 [mdt]
[33388.329993] [&amp;lt;0&amp;gt;] mdt_reint_internal+0x76c/0xb50 [mdt]
[33388.335733] [&amp;lt;0&amp;gt;] mdt_reint+0x67/0x150 [mdt]
[33388.340562] [&amp;lt;0&amp;gt;] tgt_request_handle+0x92f/0x19c0 [ptlrpc]
[33388.346728] [&amp;lt;0&amp;gt;] ptlrpc_server_handle_request+0x253/0xc30 [ptlrpc]
[33388.353774] [&amp;lt;0&amp;gt;] ptlrpc_main+0xbf4/0x15e0 [ptlrpc]
[33388.359226] [&amp;lt;0&amp;gt;] kthread+0xd1/0xe0
[33388.363125] [&amp;lt;0&amp;gt;] ret_from_fork_nospec_begin+0x21/0x21
[33388.368893] [&amp;lt;0&amp;gt;] 0xfffffffffffffffe
[33388.373074] LustreError: dumping log to /tmp/lustre-log.1651732904.9736
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="333940" author="gerrit" created="Thu, 5 May 2022 18:48:11 +0000"  >&lt;p&gt;&quot;Oleg Drokin &amp;lt;green@whamcloud.com&amp;gt;&quot; merged in patch &lt;a href=&quot;https://review.whamcloud.com/47185/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/47185/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-15761&quot; title=&quot;cannot finish MDS recovery&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-15761&quot;&gt;&lt;del&gt;LU-15761&lt;/del&gt;&lt;/a&gt; obdclass: fix locking in llog_cat_refresh()&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 69cdcbe3cf732124b3b5be1a1f235aa1b78f6c85&lt;/p&gt;</comment>
                            <comment id="333952" author="pjones" created="Thu, 5 May 2022 19:14:35 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-15761&quot; title=&quot;cannot finish MDS recovery&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-15761&quot;&gt;&lt;del&gt;LU-15761&lt;/del&gt;&lt;/a&gt; fix has landed for 2.15. I have reopened &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-14495&quot; title=&quot;(llog_osd.c:622:llog_osd_write_rec()) LBUG&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-14495&quot;&gt;&lt;del&gt;LU-14495&lt;/del&gt;&lt;/a&gt; to track the remaining issue&lt;/p&gt;</comment>
                            <comment id="334022" author="bzzz" created="Fri, 6 May 2022 14:38:43 +0000"  >&lt;blockquote&gt;&lt;p&gt;Alex Zhuravlev, please check the log lustre-log.1651732904.9736, this is with full debug, hope it helps&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;Yes, it did, thanks a lot. I&apos;ve got a rough understanding and am trying to construct a reproducer.&lt;/p&gt;</comment>
                            <comment id="357108" author="gerrit" created="Wed, 21 Dec 2022 12:08:49 +0000"  >&lt;p&gt;&quot;Etienne AUJAMES &amp;lt;eaujames@ddn.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/49470&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/49470&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-15761&quot; title=&quot;cannot finish MDS recovery&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-15761&quot;&gt;&lt;del&gt;LU-15761&lt;/del&gt;&lt;/a&gt; obdclass: fix locking in llog_cat_refresh()&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: cb2086c2f9268db6f65ed2d31384806057a0ee30&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="69839">LU-15769</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="53392">LU-11418</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="70715">LU-15937</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="63216">LU-14495</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="60872">LU-13974</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="69092">LU-15645</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="70717">LU-15938</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="43524" name="lu-14495-soak-8debug" size="15878872" author="sarah" created="Wed, 4 May 2022 16:28:14 +0000"/>
                            <attachment id="43525" name="lu-14495-soak-8dmesg" size="390347" author="sarah" created="Wed, 4 May 2022 16:28:11 +0000"/>
                            <attachment id="43526" name="lu-14495-soak-8trace" size="1017811" author="sarah" created="Wed, 4 May 2022 16:28:11 +0000"/>
                            <attachment id="43481" name="lustre-log.1651263727.22864" size="107853690" author="sarah" created="Mon, 2 May 2022 17:45:50 +0000"/>
                            <attachment id="43548" name="lustre-log.1651732904.9736" size="153993721" author="sarah" created="Thu, 5 May 2022 16:01:09 +0000"/>
                            <attachment id="43482" name="soak-10-debuglog" size="139533420" author="sarah" created="Mon, 2 May 2022 17:45:59 +0000"/>
                            <attachment id="43479" name="soak-10-umount-hang-trace" size="1040012" author="sarah" created="Mon, 2 May 2022 16:30:37 +0000"/>
                            <attachment id="43405" name="soak-8dmesg" size="345259" author="sarah" created="Mon, 25 Apr 2022 21:06:19 +0000"/>
                            <attachment id="43406" name="soak-8trace" size="1019132" author="sarah" created="Mon, 25 Apr 2022 21:06:19 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i02ne7:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>