<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:35:53 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-10527] LustreError: 7830:0:(llog_cat.c:313:llog_cat_current_log()) ASSERTION( llh )</title>
                <link>https://jira.whamcloud.com/browse/LU-10527</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[894175.107791] LustreError: 21232:0:(mdd_dir.c:835:mdd_changelog_ns_store()) Skipped 71 previous similar messages
[894200.594288] Lustre: 21803:0:(llog_cat.c:91:llog_cat_new_log()) snx11253-MDD0000: there are no more free slots in catalog
[894200.606214] Lustre: 21803:0:(llog_cat.c:91:llog_cat_new_log()) Skipped 15668 previous similar messages
[894207.538039] LustreError: 21789:0:(mdd_dir.c:835:mdd_changelog_ns_store()) snx11253-MDD0000: cannot store changelog record: type = 2, name = &apos;LTPfstat01.9.PxDvLp.1496380725&apos;, t = [0x2800484d7:0xca6c:0x0], p = [0x280048380:0x1:0x0]: rc = -28
[894207.561210] LustreError: 21789:0:(mdd_dir.c:835:mdd_changelog_ns_store()) Skipped 166 previous similar messages
[894264.576106] Lustre: 19178:0:(llog_cat.c:91:llog_cat_new_log()) snx11253-MDD0000: there are no more free slots in catalog
[894264.588022] Lustre: 19178:0:(llog_cat.c:91:llog_cat_new_log()) Skipped 341 previous similar messages
[894271.531429] LustreError: 18644:0:(mdd_dir.c:835:mdd_changelog_ns_store()) snx11253-MDD0000: cannot store changelog record: type = 2, name = &apos;CL_LTPfcntl04.3.o1pIM1.1496380789&apos;, t = [0x2800484d7:0xcae9:0x0], p = [0x280048380:0x1:0x0]: rc = -28
[894271.554925] LustreError: 18644:0:(mdd_dir.c:835:mdd_changelog_ns_store()) Skipped 312 previous similar messages
[894333.383425] LustreError: 17029:0:(llog_cat.c:329:llog_cat_current_log()) snx11253-MDD0000: next log does not exist!
[894333.395645] LustreError: 7830:0:(llog_cat.c:313:llog_cat_current_log()) ASSERTION( llh ) failed:
[894333.405845] LustreError: 7830:0:(llog_cat.c:313:llog_cat_current_log()) LBUG
[894333.414189] Pid: 7830, comm: mdt_rdpg00_016
[894333.419667]
Call Trace:
[894333.426088]  [&amp;lt;ffffffffa0912853&amp;gt;] libcfs_debug_dumpstack+0x53/0x80 [libcfs]
[894333.434292]  [&amp;lt;ffffffffa0912f95&amp;gt;] lbug_with_loc+0x45/0xc0 [libcfs]
[894333.441728]  [&amp;lt;ffffffffa0a288c0&amp;gt;] llog_cat_add_rec+0xea0/0xf10 [obdclass]
[894333.449715]  [&amp;lt;ffffffff811c18ae&amp;gt;] ? kmem_cache_alloc_trace+0x1ce/0x1f0
[894333.457475]  [&amp;lt;ffffffffa0a2109a&amp;gt;] llog_add+0x7a/0x1a0 [obdclass]
[894333.464707]  [&amp;lt;ffffffffa114c8f2&amp;gt;] mdd_changelog_store+0xf2/0x270 [mdd]
[894333.472425]  [&amp;lt;ffffffffa115ea1a&amp;gt;] mdd_changelog_data_store+0x23a/0x330 [mdd]
[894333.480641]  [&amp;lt;ffffffffa115f95d&amp;gt;] mdd_close+0x29d/0xc90 [mdd]
[894333.487534]  [&amp;lt;ffffffffa1241500&amp;gt;] ? mdt_reint_create+0xcd0/0xd10 [mdt]
[894333.495203]  [&amp;lt;ffffffffa124be29&amp;gt;] mdt_mfd_close+0x2a9/0xa30 [mdt]
[894333.502390]  [&amp;lt;ffffffffa0a4f6d3&amp;gt;] ? class_handle2object+0xb3/0x1b0 [obdclass]
[894333.510584]  [&amp;lt;ffffffffa1251abb&amp;gt;] mdt_close+0x30b/0xd30 [mdt]
[894333.517578]  [&amp;lt;ffffffffa0d48dfb&amp;gt;] tgt_request_handle+0x8fb/0x11f0 [ptlrpc]
[894333.525370]  [&amp;lt;ffffffffa0ceb54b&amp;gt;] ptlrpc_server_handle_request+0x21b/0xa90 [ptlrpc]
[894333.533906]  [&amp;lt;ffffffffa0924848&amp;gt;] ? lc_watchdog_touch+0x68/0x180 [libcfs]
[894333.541595]  [&amp;lt;ffffffffa0ce8618&amp;gt;] ? ptlrpc_wait_event+0x98/0x330 [ptlrpc]
[894333.549282]  [&amp;lt;ffffffffa0ceee78&amp;gt;] ptlrpc_main+0xc08/0x1f40 [ptlrpc]
[894333.556423]  [&amp;lt;ffffffffa0cee270&amp;gt;] ? ptlrpc_main+0x0/0x1f40 [ptlrpc]
[894333.563524]  [&amp;lt;ffffffff810a5b8f&amp;gt;] kthread+0xcf/0xe0
[894333.569214]  [&amp;lt;ffffffff810a5ac0&amp;gt;] ? kthread+0x0/0xe0
[894333.574986]  [&amp;lt;ffffffff81644e18&amp;gt;] ret_from_fork+0x58/0x90
[894333.581176]  [&amp;lt;ffffffff810a5ac0&amp;gt;] ? kthread+0x0/0xe0
[894333.586921]
[894333.589184] Kernel panic - not syncing: LBUG
[894333.594215] CPU: 0 PID: 7830 Comm: mdt_rdpg00_016 Tainted: P           OE  ------------   3.10.0-327.36.3.x3.0.15.x86_64 #1
[894333.606087] Hardware name: Seagate Laguna Seca/Laguna Seca, BIOS v02.0024 02/26/2016
[894333.614589]  ffffffffa0937ca3 00000000efc7dcf7 ffff880d5fc6b9c8 ffffffff816349d1
[894333.622804]  ffff880d5fc6ba48 ffffffff8162e260 ffffffff00000008 ffff880d5fc6ba58
[894333.631015]  ffff880d5fc6b9f8 00000000efc7dcf7 ffffffffa0aa9d00 ffffffffa0943525
[894333.639218] Call Trace:
[894333.642397]  [&amp;lt;ffffffff816349d1&amp;gt;] dump_stack+0x19/0x1b
[894333.648253]  [&amp;lt;ffffffff8162e260&amp;gt;] panic+0xd8/0x1e7
[894333.653749]  [&amp;lt;ffffffffa0912ffb&amp;gt;] lbug_with_loc+0xab/0xc0 [libcfs]
[894333.660640]  [&amp;lt;ffffffffa0a288c0&amp;gt;] llog_cat_add_rec+0xea0/0xf10 [obdclass]
[894333.668112]  [&amp;lt;ffffffff811c18ae&amp;gt;] ? kmem_cache_alloc_trace+0x1ce/0x1f0
[894333.675322]  [&amp;lt;ffffffffa0a2109a&amp;gt;] llog_add+0x7a/0x1a0 [obdclass]
[894333.681990]  [&amp;lt;ffffffffa114c8f2&amp;gt;] mdd_changelog_store+0xf2/0x270 [mdd]
[894333.689168]  [&amp;lt;ffffffffa115ea1a&amp;gt;] mdd_changelog_data_store+0x23a/0x330 [mdd]
[894333.696864]  [&amp;lt;ffffffffa115f95d&amp;gt;] mdd_close+0x29d/0xc90 [mdd]
[894333.703245]  [&amp;lt;ffffffffa1241500&amp;gt;] ? mdt_reint_create+0xcd0/0xd10 [mdt]
[894333.710397]  [&amp;lt;ffffffffa124be29&amp;gt;] mdt_mfd_close+0x2a9/0xa30 [mdt]
[894333.717101]  [&amp;lt;ffffffffa0a4f6d3&amp;gt;] ? class_handle2object+0xb3/0x1b0 [obdclass]
[894333.724834]  [&amp;lt;ffffffffa1251abb&amp;gt;] mdt_close+0x30b/0xd30 [mdt]
[894333.731188]  [&amp;lt;ffffffffa0d48dfb&amp;gt;] tgt_request_handle+0x8fb/0x11f0 [ptlrpc]
[894333.738669]  [&amp;lt;ffffffffa0ceb54b&amp;gt;] ptlrpc_server_handle_request+0x21b/0xa90 [ptlrpc]
[894333.746918]  [&amp;lt;ffffffffa0924848&amp;gt;] ? lc_watchdog_touch+0x68/0x180 [libcfs]
[894333.754319]  [&amp;lt;ffffffffa0ce8618&amp;gt;] ? ptlrpc_wait_event+0x98/0x330 [ptlrpc]
[894333.761770]  [&amp;lt;ffffffffa0ceee78&amp;gt;] ptlrpc_main+0xc08/0x1f40 [ptlrpc]


&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="50286">LU-10527</key>
            <summary>LustreError: 7830:0:(llog_cat.c:313:llog_cat_current_log()) ASSERTION( llh )</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="pfarrell">Patrick Farrell</assignee>
                                    <reporter username="bfaccini">Bruno Faccini</reporter>
                        <labels>
                    </labels>
                <created>Wed, 17 Jan 2018 14:42:21 +0000</created>
                <updated>Thu, 26 Sep 2019 07:12:03 +0000</updated>
                            <resolved>Tue, 5 Mar 2019 15:56:02 +0000</resolved>
                                                    <fixVersion>Lustre 2.12.0</fixVersion>
                    <fixVersion>Lustre 2.10.7</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>5</watches>
                                                                            <comments>
                            <comment id="218413" author="bfaccini" created="Wed, 17 Jan 2018 15:09:09 +0000"  >&lt;p&gt;Original problem has been reported by a customer (error msgs and LBUG with stack in Description for this ticket), but I have been able to reproduce this LBUG on a recent master build, by simulating/injecting the catalog full/ENOSPC situation in llog_cat_new_log(), with fail_loc=0x137/OBD_FAIL_MDS_LLOG_CREATE_FAILED (i.e., permanent failure/-ENOSPC), and then simply forcing multiple+concurrent attempts to create ChangeLog records by running just 6x occurrences of the &quot;while true; do touch /mnt/lustre/foodir/&amp;lt;file&amp;gt;; done &amp;amp;&quot; script, each using a different file (to avoid locking and thus concurrency) and this on a Client+Server single node setup.&lt;/p&gt;

&lt;p&gt;And as a possible fix, I wonder if in llog_cat_add_rec() we really need to reset &quot;cathandle-&amp;gt;u.chd.chd_current_log&quot; to NULL, upon -ENOSPC error returned from llog_cat_new_log().&lt;br/&gt;
Not doing so should avoid to have llog_cat_declare_add_rec() repeatedly and unnecessarily create new+partially initialized LLOGs/llog_handle and assigned to &quot;cathandle-&amp;gt;u.chd.chd_current_log&quot;, this without llog_init_handle() never being called to initialize &quot;loghandle-&amp;gt;lgh_hdr&quot;.&lt;/p&gt;</comment>
                            <comment id="218418" author="gerrit" created="Wed, 17 Jan 2018 15:24:44 +0000"  >&lt;p&gt;Faccini Bruno (bruno.faccini@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/30897&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/30897&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10527&quot; title=&quot;LustreError: 7830:0:(llog_cat.c:313:llog_cat_current_log()) ASSERTION( llh )&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10527&quot;&gt;&lt;del&gt;LU-10527&lt;/del&gt;&lt;/a&gt; obdclass: don&apos;t recycle loghandle upon ENOSPC&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 693e2e9e33302b49b08dd1054e6418945121f81c&lt;/p&gt;</comment>
                            <comment id="219635" author="bfaccini" created="Thu, 1 Feb 2018 10:09:42 +0000"  >&lt;p&gt;With my patch, I have not been able to get any occurrence of the crash even running the known reproducer during hours.&lt;/p&gt;</comment>
                            <comment id="228319" author="sergey" created="Tue, 22 May 2018 13:36:15 +0000"  >&lt;p&gt;Just for history.&lt;/p&gt;

&lt;p&gt;I also faced similar problem but with a little bit different stack trace:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt; 7620028.883060] Lustre: 130784:0:(llog_cat.c:93:llog_cat_new_log()) snx11035-MDD0000: there are no more free slots in catalog
[7620028.895696] LustreError: 130969:0:(llog_cat.c:329:llog_cat_current_log()) ASSERTION( llh ) failed: 
[7620028.906765] LustreError: 130969:0:(llog_cat.c:329:llog_cat_current_log()) LBUG
[7620028.915204] Pid: 130969, comm: mdt00_056
[7620028.919888] 
[7620028.919888] Call Trace:
[7620028.924859]&#160; [&amp;lt;ffffffffa0503895&amp;gt;] libcfs_debug_dumpstack+0x55/0x80 [libcfs]
[7620028.932940]&#160; [&amp;lt;ffffffffa0504007&amp;gt;] lbug_with_loc+0x47/0xb0 [libcfs]
[7620028.940167]&#160; [&amp;lt;ffffffffa0644a02&amp;gt;] llog_cat_current_log+0x2c2/0x2d0 [obdclass]
[7620028.948527]&#160; [&amp;lt;ffffffffa0647d81&amp;gt;] llog_cat_add_rec+0x61/0x450 [obdclass]
[7620028.956320]&#160; [&amp;lt;ffffffffa063f169&amp;gt;] llog_add+0x89/0x1c0 [obdclass]
[7620028.963317]&#160; [&amp;lt;ffffffffa0f80ab6&amp;gt;] ? osd_attr_set+0x166/0x460 [osd_ldiskfs]
[7620028.971293]&#160; [&amp;lt;ffffffffa0dc54e2&amp;gt;] mdd_changelog_store+0xf2/0x260 [mdd]
[7620028.978887]&#160; [&amp;lt;ffffffffa0dd2b96&amp;gt;] mdd_changelog_data_store+0x1a6/0x280 [mdd]
[7620028.987145]&#160; [&amp;lt;ffffffffa0ddb171&amp;gt;] mdd_attr_set+0x1081/0x17a0 [mdd]
[7620028.994357]&#160; [&amp;lt;ffffffffa0e430dd&amp;gt;] mdt_attr_set+0x25d/0x5c0 [mdt]
[7620029.001373]&#160; [&amp;lt;ffffffffa0e43a3d&amp;gt;] mdt_reint_setattr+0x5fd/0xe40 [mdt]
[7620029.008868]&#160; [&amp;lt;ffffffffa0e3c6dd&amp;gt;] mdt_reint_rec+0x5d/0x200 [mdt]
[7620029.015878]&#160; [&amp;lt;ffffffffa0e22abb&amp;gt;] mdt_reint_internal+0x4cb/0x760 [mdt]
[7620029.023468]&#160; [&amp;lt;ffffffffa0e231eb&amp;gt;] mdt_reint+0x6b/0x120 [mdt]
[7620029.031496]&#160; [&amp;lt;ffffffffa09285de&amp;gt;] tgt_request_handle+0x6fe/0xaf0 [ptlrpc]
[7620029.039414]&#160; [&amp;lt;ffffffffa08d7979&amp;gt;] ptlrpc_main+0xe49/0x1930 [ptlrpc]
[7620029.046727]&#160; [&amp;lt;ffffffff810096f0&amp;gt;] ? __switch_to+0xd0/0x320
[7620029.053162]&#160; [&amp;lt;ffffffff81525eee&amp;gt;] ? thread_return+0x4e/0x760
[7620029.059805]&#160; [&amp;lt;ffffffffa08d6b30&amp;gt;] ? ptlrpc_main+0x0/0x1930 [ptlrpc]
[7620029.067103]&#160; [&amp;lt;ffffffff8109ac66&amp;gt;] kthread+0x96/0xa0
[7620029.072842]&#160; [&amp;lt;ffffffff8100c20a&amp;gt;] child_rip+0xa/0x20
[7620029.078677]&#160; [&amp;lt;ffffffff8109abd0&amp;gt;] ? kthread+0x0/0xa0
[7620029.084519]&#160; [&amp;lt;ffffffff8100c200&amp;gt;] ? child_rip+0x0/0x20&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;The problem exists due to race between 3 threads doing: t1:llog_cat_current_log, t2:llog_cat_current_log and&#160; t3:&#160;llog_cat_declare_add_rec.&lt;/p&gt;
&lt;ol&gt;
	&lt;li&gt;t3: llog_cat_declare_add_rec ::&#160;chd_next_log = loghandle /* loghandle is not initialized yet */&lt;/li&gt;
	&lt;li&gt;t1: llog_cat_current_log :: llog is full thus up_read&#160;LLOGH_CAT and try to use next log with write sem&lt;/li&gt;
	&lt;li&gt;t2: llog_cat_current_log ::&#160;llog is full thus up_read&#160;LLOGH_CAT and try to use next log with write sem&lt;/li&gt;
	&lt;li&gt;t1: llog_cat_current_log :: holds write LOGH_CAT semaphore /* llog is still full -&amp;gt; try to use next */&lt;/li&gt;
	&lt;li&gt;t1: llog_cat_current_log ::&#160;chd_current_log = chd_next_log /* chd_next_log is not initialized yet */&lt;/li&gt;
	&lt;li&gt;t1: llog_cat_current_log :: up_write LOGH_CAT sem&lt;/li&gt;
	&lt;li&gt;t2: llog_cat_current_log :: holds write LOGH_CAT sem&lt;/li&gt;
	&lt;li&gt;t2: llog_cat_current_log ::&#160;LASSERT(llh);&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;I have successfully reproduced the problem on the latest master with a small delay in llog_cat_current_log:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;diff --git a/lustre/obdclass/llog_cat.c b/lustre/obdclass/llog_cat.c
index 5f8b9d1..4587d94 100644
--- a/lustre/obdclass/llog_cat.c
+++ b/lustre/obdclass/llog_cat.c
@@ -377,6 +377,7 @@ static struct llog_handle *llog_cat_current_log(struct llog_handle *cathandle,
        up_read(&amp;amp;cathandle-&amp;gt;lgh_lock);
 
        /* time to use next log */
+       schedule_timeout_interruptible(msecs_to_jiffies(2000));
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Example of reproducer:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lctl --device lustre-MDT0000 changelog_register
#define OBD_FAIL_CAT_RECORDS                        0x1312
lctl set_param fail_loc=0x1312
lctl set_param fail_val=2

for i in $(seq 0 50); do mkdir /mnt/lustre/dir$i; done

for i in $(seq 0 32768);
do
	for j in $(seq 0 50);
	do
		touch /mnt/lustre/dir$j/f$i&amp;amp;
	done
done
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;I also have reproduced it on master + &lt;a href=&quot;https://review.whamcloud.com/30897&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/30897&lt;/a&gt;.&lt;br/&gt;
 I believe LASSERT(llh) is not needed anymore and can be removed. In combination with the existing 30897 patch this should totally solve the issue.&lt;/p&gt;

&lt;p&gt;The problem also exists in lustre 2.x.&lt;/p&gt;</comment>
                            <comment id="229378" author="bfaccini" created="Sat, 9 Jun 2018 13:50:30 +0000"  >&lt;p&gt;Thanks Sergey,&lt;br/&gt;
And as we have already discussed in #30897, I have also removed the LASSERT(llh) in llog_cat_current_log() in my original patch for this ticket, and successfully tested it against your and my reproducers.&lt;/p&gt;</comment>
                            <comment id="229885" author="gerrit" created="Tue, 3 Jul 2018 18:05:12 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/30897/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/30897/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10527&quot; title=&quot;LustreError: 7830:0:(llog_cat.c:313:llog_cat_current_log()) ASSERTION( llh )&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10527&quot;&gt;&lt;del&gt;LU-10527&lt;/del&gt;&lt;/a&gt; obdclass: don&apos;t recycle loghandle upon ENOSPC&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 5761b9576d39c44c02455b86eb86ce1276930e60&lt;/p&gt;</comment>
                            <comment id="229889" author="pjones" created="Tue, 3 Jul 2018 21:06:37 +0000"  >&lt;p&gt;Landed for 2.12&lt;/p&gt;</comment>
                            <comment id="238534" author="gerrit" created="Thu, 13 Dec 2018 14:55:27 +0000"  >&lt;p&gt;Sebastien Piechurski (sebastien.piechurski@atos.net) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/33850&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/33850&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10527&quot; title=&quot;LustreError: 7830:0:(llog_cat.c:313:llog_cat_current_log()) ASSERTION( llh )&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10527&quot;&gt;&lt;del&gt;LU-10527&lt;/del&gt;&lt;/a&gt; obdclass: don&apos;t recycle loghandle upon ENOSPC&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_10&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 8e4d631e08409e7e42d92453c1117cdd8fc6f374&lt;/p&gt;</comment>
                            <comment id="239208" author="sthiell" created="Sat, 29 Dec 2018 17:35:09 +0000"  >&lt;p&gt;We just hit this MDT crash with 2.10, and MDT wouldn&apos;t start. I had to clear changelog_users/catalog to be able to start MDT again. Would be nice to backport this patch to b2_10 ASAP thanks. Our average changelogs rate on our oak-MDT0000 is 6800/sec. We still also have tons of occurrence of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11205&quot; title=&quot;Failure to clear the changelog for user 1 on MDT&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11205&quot;&gt;&lt;del&gt;LU-11205&lt;/del&gt;&lt;/a&gt; &quot;Failure to clear the changelog for user 1&quot;, this might be related. Peter, can this be considered as critical/urgent?&lt;/p&gt;</comment>
                            <comment id="239425" author="gerrit" created="Sat, 5 Jan 2019 06:51:54 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/33850/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/33850/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10527&quot; title=&quot;LustreError: 7830:0:(llog_cat.c:313:llog_cat_current_log()) ASSERTION( llh )&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10527&quot;&gt;&lt;del&gt;LU-10527&lt;/del&gt;&lt;/a&gt; obdclass: don&apos;t recycle loghandle upon ENOSPC&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_10&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 51e962be60cf599ecf154ea3a6b1c0f9882daac2&lt;/p&gt;</comment>
                            <comment id="239432" author="sthiell" created="Sat, 5 Jan 2019 14:35:01 +0000"  >&lt;p&gt;Thanks!&#160;&lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;</comment>
                            <comment id="242872" author="gerrit" created="Wed, 27 Feb 2019 00:58:11 +0000"  >&lt;p&gt;Patrick Farrell (pfarrell@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/34335&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/34335&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10527&quot; title=&quot;LustreError: 7830:0:(llog_cat.c:313:llog_cat_current_log()) ASSERTION( llh )&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10527&quot;&gt;&lt;del&gt;LU-10527&lt;/del&gt;&lt;/a&gt; obdclass: don&apos;t recycle loghandle upon ENOSPC&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_10&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 1307d8af3536514781e0a87c8d0ff3b19caae50d&lt;/p&gt;</comment>
                            <comment id="242873" author="pfarrell" created="Wed, 27 Feb 2019 01:03:01 +0000"  >&lt;p&gt;This patch has come up as a possible culprit for dramatically increased MDT failover issues in soak testing - See &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11943&quot; title=&quot;many input/output error after soak running for couple of hours&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11943&quot;&gt;&lt;del&gt;LU-11943&lt;/del&gt;&lt;/a&gt;.&#160; Revert is just for testing at the moment, not actually proposing reverting it.&lt;/p&gt;</comment>
                            <comment id="243056" author="pfarrell" created="Thu, 28 Feb 2019 17:25:48 +0000"  >&lt;p&gt;Removing the patch from the tip of b2_10 causes the abort rate on MDT failover to go 50% to 0%, so it&apos;s pretty clearly the culprit.&lt;/p&gt;

&lt;p&gt;So:&lt;br/&gt;
The &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10527&quot; title=&quot;LustreError: 7830:0:(llog_cat.c:313:llog_cat_current_log()) ASSERTION( llh )&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10527&quot;&gt;&lt;del&gt;LU-10527&lt;/del&gt;&lt;/a&gt; patch is causing failures on MDT failover (recovery aborting), reporting apparent llog corruption.&lt;/p&gt;

&lt;p&gt;Messages vary, but this is a common example:&lt;br/&gt;
Feb 22 16:03:22 soak-10 kernel: LustreError: 12847:0:(llog.c:588:llog_process_thread()) soaked-MDT0003-osp-MDT0002: &lt;span class=&quot;error&quot;&gt;&amp;#91;0x1:0xc002abb4:0x2&amp;#93;&lt;/span&gt; Invalid record: index 4 but expected 2&lt;/p&gt;</comment>
                            <comment id="243057" author="pfarrell" created="Thu, 28 Feb 2019 17:26:30 +0000"  >&lt;p&gt;Unless someone has a quick suggestion for a fix, intention is to revert the patch while we look for a solution.&lt;/p&gt;</comment>
                            <comment id="243061" author="gerrit" created="Thu, 28 Feb 2019 17:56:53 +0000"  >&lt;p&gt;Patrick Farrell (pfarrell@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/34346&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/34346&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10527&quot; title=&quot;LustreError: 7830:0:(llog_cat.c:313:llog_cat_current_log()) ASSERTION( llh )&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10527&quot;&gt;&lt;del&gt;LU-10527&lt;/del&gt;&lt;/a&gt; llog: Reset current log on ENOSPC&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_10&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 155f4e866fc8f14da2b082898c00c0643c8bc919&lt;/p&gt;</comment>
                            <comment id="243063" author="gerrit" created="Thu, 28 Feb 2019 18:05:09 +0000"  >&lt;p&gt;Patrick Farrell (pfarrell@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/34347&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/34347&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10527&quot; title=&quot;LustreError: 7830:0:(llog_cat.c:313:llog_cat_current_log()) ASSERTION( llh )&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10527&quot;&gt;&lt;del&gt;LU-10527&lt;/del&gt;&lt;/a&gt; llog: Reset current log on ENOSPC&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 6ab95bb7d53a7c6831bcfc56ce0c29481fa39189&lt;/p&gt;</comment>
                            <comment id="243064" author="pfarrell" created="Thu, 28 Feb 2019 18:08:26 +0000"  >&lt;p&gt;Switched from revert to hopeful fix...&lt;/p&gt;</comment>
                            <comment id="243342" author="pjones" created="Tue, 5 Mar 2019 15:56:02 +0000"  >&lt;p&gt;So I think that this ticket can be closed - the fixes to the original issue were tracked under &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11943&quot; title=&quot;many input/output error after soak running for couple of hours&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11943&quot;&gt;&lt;del&gt;LU-11943&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10324">
                    <name>Cloners</name>
                                            <outwardlinks description="Clones">
                                                        </outwardlinks>
                                                        </issuelinktype>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                                        </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="54819">LU-11943</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="32827">LU-7340</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzzr9j:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>