<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:17:23 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-1522] ASSERTION(cfs_atomic_read(&amp;obd-&gt;obd_req_replay_clients) == 0) failed</title>
                <link>https://jira.whamcloud.com/browse/LU-1522</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Our lustre server crashed multiple times a day. This is one of the failures:&lt;/p&gt;

&lt;p&gt;&amp;lt;3&amp;gt;LustreError: 7606:0:(ldlm_lib.c:1259:abort_lock_replay_queue()) @@@ aborted:  req@ffff8806f8a9a000 x1404476495656183/t0(0) o-1-&amp;gt;da67355c-78b9-3337-cb94-359b564bc4aa@NET_0x500000a972885_UUID:0/0 lens 296/0 e 26 to 0 dl 1339654050 ref 1 fl Complete:/ffffffff/ffffffff rc 0/-1&lt;br/&gt;
&amp;lt;3&amp;gt;LustreError: 7606:0:(ldlm_lib.c:1259:abort_lock_replay_queue()) @@@ aborted:  req@ffff8806f3996000 x1404476930188631/t0(0) o-1-&amp;gt;736da151-8a99-44ed-0646-bb0e3daa974e@NET_0x500000a970f63_UUID:0/0 lens 296/0 e 26 to 0 dl 1339654056 ref 1 fl Complete:/ffffffff/ffffffff rc 0/-1&lt;br/&gt;
&amp;lt;3&amp;gt;LustreError: 7606:0:(ldlm_lib.c:1259:abort_lock_replay_queue()) Skipped 147 previous similar messages&lt;br/&gt;
&amp;lt;4&amp;gt;Lustre: 7606:0:(ldlm_lib.c:1562:target_recovery_overseer()) recovery is aborted, evict exports in recovery&lt;br/&gt;
&amp;lt;0&amp;gt;LustreError: 7606:0:(ldlm_lib.c:1612:target_next_replay_req()) ASSERTION(cfs_atomic_read(&amp;amp;obd-&amp;gt;obd_req_replay_clients) == 0) failed&lt;br/&gt;
&amp;lt;0&amp;gt;LustreError: 7606:0:(ldlm_lib.c:1612:target_next_replay_req()) LBUG&lt;br/&gt;
&amp;lt;4&amp;gt;Pid: 7606, comm: tgt_recov&lt;br/&gt;
&amp;lt;4&amp;gt;&lt;br/&gt;
&amp;lt;4&amp;gt;Call Trace:&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0578855&amp;gt;&amp;#93;&lt;/span&gt; libcfs_debug_dumpstack+0x55/0x80 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0578e95&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0x75/0xe0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0583da6&amp;gt;&amp;#93;&lt;/span&gt; libcfs_assertion_failed+0x66/0x70 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0732d53&amp;gt;&amp;#93;&lt;/span&gt; target_recovery_thread+0xed3/0xf50 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0731e80&amp;gt;&amp;#93;&lt;/span&gt; ? target_recovery_thread+0x0/0xf50 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c14a&amp;gt;&amp;#93;&lt;/span&gt; child_rip+0xa/0x20&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0731e80&amp;gt;&amp;#93;&lt;/span&gt; ? target_recovery_thread+0x0/0xf50 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0731e80&amp;gt;&amp;#93;&lt;/span&gt; ? target_recovery_thread+0x0/0xf50 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c140&amp;gt;&amp;#93;&lt;/span&gt; ? child_rip+0x0/0x20&lt;br/&gt;
&amp;lt;4&amp;gt;&lt;br/&gt;
&amp;lt;0&amp;gt;Kernel panic - not syncing: LBUG&lt;br/&gt;
&amp;lt;4&amp;gt;Pid: 7606, comm: tgt_recov Not tainted 2.6.32-220.4.1.el6.20120130.x86_64.lustre211 #1&lt;br/&gt;
&amp;lt;4&amp;gt;Call Trace:&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81520c76&amp;gt;&amp;#93;&lt;/span&gt; ? panic+0x78/0x164&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0578eeb&amp;gt;&amp;#93;&lt;/span&gt; ? lbug_with_loc+0xcb/0xe0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0583da6&amp;gt;&amp;#93;&lt;/span&gt; ? libcfs_assertion_failed+0x66/0x70 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0732d53&amp;gt;&amp;#93;&lt;/span&gt; ? target_recovery_thread+0xed3/0xf50 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0731e80&amp;gt;&amp;#93;&lt;/span&gt; ? target_recovery_thread+0x0/0xf50 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c14a&amp;gt;&amp;#93;&lt;/span&gt; ? child_rip+0xa/0x20&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0731e80&amp;gt;&amp;#93;&lt;/span&gt; ? target_recovery_thread+0x0/0xf50 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0731e80&amp;gt;&amp;#93;&lt;/span&gt; ? target_recovery_thread+0x0/0xf50 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c140&amp;gt;&amp;#93;&lt;/span&gt; ? child_rip+0x0/0x20&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;22&amp;#93;&lt;/span&gt;kdb&amp;gt; &lt;/p&gt;


&lt;p&gt;Here is the line that LBUG&apos;ed&lt;br/&gt;
      LASSERT(cfs_atomic_read(&amp;amp;obd-&amp;gt;obd_req_replay_clients) == 0);&lt;br/&gt;
in target_next_replay_req():&lt;/p&gt;

&lt;p&gt;static struct ptlrpc_request *target_next_replay_req(struct obd_device *obd)&lt;br/&gt;
{&lt;br/&gt;
        struct ptlrpc_request *req = NULL;&lt;br/&gt;
        ENTRY;&lt;/p&gt;

&lt;p&gt;        CDEBUG(D_HA, &quot;Waiting for transno &quot;LPD64&quot;\n&quot;,&lt;br/&gt;
               obd-&amp;gt;obd_next_recovery_transno);&lt;/p&gt;

&lt;p&gt;        if (target_recovery_overseer(obd, check_for_next_transno,&lt;br/&gt;
                                     exp_req_replay_healthy)) {&lt;br/&gt;
                abort_req_replay_queue(obd);&lt;br/&gt;
                abort_lock_replay_queue(obd);&lt;br/&gt;
        }&lt;/p&gt;

&lt;p&gt;        cfs_spin_lock(&amp;amp;obd-&amp;gt;obd_recovery_task_lock);&lt;br/&gt;
        if (!cfs_list_empty(&amp;amp;obd-&amp;gt;obd_req_replay_queue)) {&lt;br/&gt;
                req = cfs_list_entry(obd-&amp;gt;obd_req_replay_queue.next,&lt;br/&gt;
                                     struct ptlrpc_request, rq_list);&lt;br/&gt;
                cfs_list_del_init(&amp;amp;req-&amp;gt;rq_list);&lt;br/&gt;
                obd-&amp;gt;obd_requests_queued_for_recovery--;&lt;br/&gt;
                cfs_spin_unlock(&amp;amp;obd-&amp;gt;obd_recovery_task_lock);&lt;br/&gt;
        } else {&lt;br/&gt;
                cfs_spin_unlock(&amp;amp;obd-&amp;gt;obd_recovery_task_lock);&lt;br/&gt;
                LASSERT(cfs_list_empty(&amp;amp;obd-&amp;gt;obd_req_replay_queue));&lt;br/&gt;
                LASSERT(cfs_atomic_read(&amp;amp;obd-&amp;gt;obd_req_replay_clients) == 0);  &amp;lt;=======&lt;br/&gt;
                /** evict exports failed VBR */&lt;br/&gt;
                class_disconnect_stale_exports(obd, exp_vbr_healthy);&lt;br/&gt;
        }&lt;/p&gt;
&lt;p&gt;        RETURN(req);&lt;br/&gt;
}&lt;/p&gt;
</description>
                <environment>Server: rhel6.2, lustre-2.1.1, ofed-1.5.3.1&lt;br/&gt;
Client: sles11sp1, lustre-2.1.1, ofed-1.5.3.1&lt;br/&gt;
Git repo at &lt;a href=&quot;https://github.com/jlan/lustre-nas/commits/nas-2.1.1&quot;&gt;https://github.com/jlan/lustre-nas/commits/nas-2.1.1&lt;/a&gt;</environment>
        <key id="14927">LU-1522</key>
            <summary>ASSERTION(cfs_atomic_read(&amp;obd-&gt;obd_req_replay_clients) == 0) failed</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="tappro">Mikhail Pershin</assignee>
                                    <reporter username="jaylan">Jay Lan</reporter>
                        <labels>
                    </labels>
                <created>Thu, 14 Jun 2012 18:05:33 +0000</created>
                <updated>Sat, 25 Aug 2012 22:32:52 +0000</updated>
                            <resolved>Sat, 25 Aug 2012 22:32:52 +0000</resolved>
                                    <version>Lustre 2.1.1</version>
                                    <fixVersion>Lustre 2.3.0</fixVersion>
                    <fixVersion>Lustre 2.1.3</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="40606" author="pjones" created="Thu, 14 Jun 2012 18:46:02 +0000"  >&lt;p&gt;Niu&lt;/p&gt;

&lt;p&gt;Could you please comment on this one?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="40608" author="bogl" created="Thu, 14 Jun 2012 19:07:42 +0000"  >&lt;p&gt;Niu,&lt;br/&gt;
  I think this may be a dup of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1166&quot; title=&quot;recovery never finished&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1166&quot;&gt;&lt;del&gt;LU-1166&lt;/del&gt;&lt;/a&gt;.  If I&apos;m correct it may already be fixed by commits 042980026c596ff08c97764bbcf7a1e710fd4f5a and abdd09fe58961fe071612b6884faeca2379ba341 to b2_1. Commits were done after 2.1.1, should be present in 2.1.2&lt;/p&gt;</comment>
                            <comment id="40611" author="jay" created="Thu, 14 Jun 2012 19:29:30 +0000"  >&lt;p&gt;Di talked about this problem several days before, but I don&apos;t know if he made any progress.&lt;/p&gt;</comment>
                            <comment id="40614" author="di.wang" created="Thu, 14 Jun 2012 19:50:31 +0000"  >&lt;p&gt;Ah, Yes. the problem is indeed brought in by this patch &lt;a href=&quot;http://review.whamcloud.com/#change,2255&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#change,2255&lt;/a&gt; (&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1166&quot; title=&quot;recovery never finished&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1166&quot;&gt;&lt;del&gt;LU-1166&lt;/del&gt;&lt;/a&gt;). The reason is that the obd_req_replay_clients and obd_replay_lock_clients are not being decreased during the recovery abort.&lt;/p&gt;</comment>
                            <comment id="40615" author="di.wang" created="Thu, 14 Jun 2012 19:51:39 +0000"  >&lt;p&gt;Here is a workaround fix. &lt;/p&gt;

&lt;p&gt;commit 427bcf9eff0a931f64c0986c062d2fea7f87f983&lt;br/&gt;
Author: Wang Di &amp;lt;di.wang@whamcloud.com&amp;gt;&lt;br/&gt;
Date:   Tue May 29 15:25:30 2012 -0700&lt;/p&gt;

&lt;p&gt;    &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1166&quot; title=&quot;recovery never finished&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1166&quot;&gt;&lt;del&gt;LU-1166&lt;/del&gt;&lt;/a&gt; ptlrpc: Do replay export cleanup during class_disconnect&lt;/p&gt;

&lt;p&gt;    Since the exports might be hold for some reason, so do&lt;br/&gt;
    replay export cleanup during class_disconnect, instead of&lt;br/&gt;
    final export put.&lt;/p&gt;

&lt;p&gt;    Change-Id: I048b66b9c645fa772c34096791a02b6c210cfc23&lt;br/&gt;
    Signed-off-by: Wang Di &amp;lt;di.wang@whamcloud.com&amp;gt;&lt;/p&gt;

&lt;p&gt;diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c&lt;br/&gt;
index a759eed..d1944a2 100644&lt;br/&gt;
--- a/lustre/obdclass/genops.c&lt;br/&gt;
+++ b/lustre/obdclass/genops.c&lt;br/&gt;
@@ -839,7 +839,7 @@ void class_export_put(struct obd_export *exp)&lt;/p&gt;

&lt;p&gt;                 /* release nid stat refererence */&lt;br/&gt;
                 lprocfs_exp_cleanup(exp);&lt;/p&gt;
&lt;p&gt;-                class_export_recovery_cleanup(exp);&lt;br/&gt;
+                //class_export_recovery_cleanup(exp);&lt;/p&gt;


&lt;p&gt;                 obd_zombie_export_add(exp);&lt;br/&gt;
         }&lt;br/&gt;
@@ -1200,6 +1200,7 @@ int class_disconnect(struct obd_export *export)&lt;br/&gt;
                              &amp;amp;export-&amp;gt;exp_nid_hash);&lt;/p&gt;

&lt;p&gt;         class_unlink_export(export);&lt;br/&gt;
+        class_export_recovery_cleanup(export);&lt;br/&gt;
 no_disconn:&lt;br/&gt;
         class_export_put(export);&lt;br/&gt;
         RETURN(0);&lt;/p&gt;


&lt;p&gt;Mike said he will have a new patch. &lt;/p&gt;</comment>
                            <comment id="40678" author="jaylan" created="Fri, 15 Jun 2012 15:43:06 +0000"  >&lt;p&gt;Is this WA safe to pick up? I need to rebuild lustre server for production to deal with a large number of LBUG crashes and freeze on our production systems.&lt;/p&gt;</comment>
                            <comment id="40679" author="jaylan" created="Fri, 15 Jun 2012 15:44:59 +0000"  >&lt;p&gt;Well, change my question a bit. It is unfair to ask you to say &quot;it is safe&quot; without going through sanity testing. I like to know if you believe the fix is supposed to be a right fix and would avoid some LBUG or freeze?&lt;/p&gt;</comment>
                            <comment id="40680" author="di.wang" created="Fri, 15 Jun 2012 16:00:03 +0000"  >&lt;p&gt;Well, I actually thought it is a right fix, and stable enough. At least in my local sanity test. Hmm, maybe I should submit it to Maloo, and review and test there.&lt;/p&gt;</comment>
                            <comment id="40681" author="niu" created="Fri, 15 Jun 2012 16:00:55 +0000"  >&lt;p&gt;Mike, are you working on a new patch? any comments to Jay&apos;s question? Thanks.&lt;/p&gt;</comment>
                            <comment id="40682" author="di.wang" created="Fri, 15 Jun 2012 16:05:48 +0000"  >&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/#change,3115&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#change,3115&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="40708" author="tappro" created="Sat, 16 Jun 2012 13:14:38 +0000"  >&lt;p&gt;This fix brings us back to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1166&quot; title=&quot;recovery never finished&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1166&quot;&gt;&lt;del&gt;LU-1166&lt;/del&gt;&lt;/a&gt;, the proper fix is just the reverting of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1166&quot; title=&quot;recovery never finished&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1166&quot;&gt;&lt;del&gt;LU-1166&lt;/del&gt;&lt;/a&gt;. Otherwise we will see both 1166 and 1522 are &quot;fixed&quot; but actually 1166 will be just returned back.&lt;/p&gt;

&lt;p&gt;I am not working on new patch now, but will think about proper fix.&lt;/p&gt;</comment>
                            <comment id="40727" author="tappro" created="Mon, 18 Jun 2012 03:42:40 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1166&quot; title=&quot;recovery never finished&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1166&quot;&gt;&lt;del&gt;LU-1166&lt;/del&gt;&lt;/a&gt; fix which should be safe from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1522&quot; title=&quot;ASSERTION(cfs_atomic_read(&amp;amp;obd-&amp;gt;obd_req_replay_clients) == 0) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1522&quot;&gt;&lt;del&gt;LU-1522&lt;/del&gt;&lt;/a&gt;:&lt;br/&gt;
&lt;a href=&quot;http://review.whamcloud.com/3122&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/3122&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="40728" author="tappro" created="Mon, 18 Jun 2012 04:27:35 +0000"  >&lt;p&gt;caused by &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1166&quot; title=&quot;recovery never finished&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1166&quot;&gt;&lt;del&gt;LU-1166&lt;/del&gt;&lt;/a&gt; fix&lt;/p&gt;</comment>
                            <comment id="40766" author="jaylan" created="Mon, 18 Jun 2012 13:24:14 +0000"  >&lt;p&gt;Is &lt;a href=&quot;http://review.whamcloud.com/3122&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/3122&lt;/a&gt; supposed to be on top of the two fixes committed in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1166&quot; title=&quot;recovery never finished&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1166&quot;&gt;&lt;del&gt;LU-1166&lt;/del&gt;&lt;/a&gt; (to b2_1 branch)?&lt;/p&gt;</comment>
                            <comment id="40769" author="jaylan" created="Mon, 18 Jun 2012 13:54:04 +0000"  >&lt;p&gt;It seems to be a replacement of one of the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1166&quot; title=&quot;recovery never finished&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1166&quot;&gt;&lt;del&gt;LU-1166&lt;/del&gt;&lt;/a&gt; commit 0429800?&lt;/p&gt;</comment>
                            <comment id="40774" author="jaylan" created="Mon, 18 Jun 2012 14:48:59 +0000"  >&lt;p&gt;Ah, OK, it was indeed supposed to be applied on top of the two commits in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1166&quot; title=&quot;recovery never finished&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1166&quot;&gt;&lt;del&gt;LU-1166&lt;/del&gt;&lt;/a&gt;! I resolved the conflicts.&lt;/p&gt;</comment>
                            <comment id="40776" author="tappro" created="Mon, 18 Jun 2012 15:11:24 +0000"  >&lt;p&gt;Should be just on top of previous patches&lt;/p&gt;</comment>
                            <comment id="40781" author="jaylan" created="Mon, 18 Jun 2012 15:58:02 +0000"  >&lt;p&gt;The patch was against master branch. Compilation failed on b2_1 branch on incompatible pointer type:&lt;/p&gt;

&lt;p&gt;/usr/src/redhat/BUILD/lustre-2.1.1/lustre/obdclass/genops.c: In function &apos;class_export_recovery_cleanup&apos;:&lt;br/&gt;
/usr/src/redhat/BUILD/lustre-2.1.1/lustre/obdclass/genops.c:1092: error: passing argument 1 of &apos;atomic_read&apos; from incompatible pointer type&lt;br/&gt;
/usr/src/kernels/2.6.32-220.4.1.el6.20120130.x86_64.lustre211/arch/x86/include/asm/atomic_64.h:21: note: expected &apos;const struct atomic_t *&apos; but argument is of type &apos;int *&apos;&lt;br/&gt;
/usr/src/redhat/BUILD/lustre-2.1.1/lustre/obdclass/genops.c:1092: error: passing argument 1 of &apos;atomic_read&apos; from incompatible pointer type&lt;br/&gt;
/usr/src/kernels/2.6.32-220.4.1.el6.20120130.x86_64.lustre211/arch/x86/include/asm/atomic_64.h:21: note: expected &apos;const struct atomic_t *&apos; but argument is of type &apos;int *&apos;&lt;br/&gt;
/usr/src/redhat/BUILD/lustre-2.1.1/lustre/obdclass/genops.c:1093: error: passing argument 1 of &apos;atomic_dec&apos; from incompatible pointer type&lt;br/&gt;
/usr/src/kernels/2.6.32-220.4.1.el6.20120130.x86_64.lustre211/arch/x86/include/asm/atomic_64.h:104: note: expected &apos;struct atomic_t *&apos; but argument is of type &apos;int *&apos;&lt;/p&gt;</comment>
                            <comment id="40783" author="jaylan" created="Mon, 18 Jun 2012 16:17:25 +0000"  >&lt;p&gt;OK, you simply moved the routine to a different location, so I can do the same to the b2_1 code.&lt;/p&gt;</comment>
                            <comment id="40827" author="tappro" created="Tue, 19 Jun 2012 01:55:55 +0000"  >&lt;p&gt;Jay, it is not just moved routine, the major part is also exp_failed setting/checking, btw, you can just keep class_export_recovery_cleanup() where it is and keep other code. I can prepare patch for b2_1 a bit later&lt;/p&gt;</comment>
                            <comment id="40831" author="niu" created="Tue, 19 Jun 2012 02:06:28 +0000"  >&lt;p&gt;Reassign to Mike.&lt;/p&gt;</comment>
                            <comment id="40863" author="tappro" created="Tue, 19 Jun 2012 13:19:09 +0000"  >&lt;p&gt;Jay, check this one: &lt;a href=&quot;http://review.whamcloud.com/3145&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/3145&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="40867" author="bogl" created="Tue, 19 Jun 2012 13:41:21 +0000"  >&lt;p&gt;Mikhail,&lt;br/&gt;
  Maybe I&apos;m wrong but it looks to me like your mod to ldlm_lib.c in &lt;a href=&quot;http://review.whamcloud.com/3145&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/3145&lt;/a&gt; now allows an error exit to the routine that leaves &amp;amp;target-&amp;gt;obd_recovery_task_lock still locked.  Did you mean to do that?&lt;/p&gt;</comment>
                            <comment id="40870" author="jaylan" created="Tue, 19 Jun 2012 14:48:52 +0000"  >&lt;p&gt;After applying &lt;a href=&quot;http://review.whamcloud.com/3122&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/3122&lt;/a&gt; &lt;br/&gt;
the mds LBUG&apos;ed:&lt;/p&gt;

&lt;p&gt;LustreError: 10878:0:(mdt_handler.c:5529:mdt_iocontrol()) Aborting recovery for device nbp2-MDT0000&lt;br/&gt;
LustreError: 11533:0:(lu_object.c:113:lu_object_put()) ASSERTION(cfs_list_empty(&amp;amp;top-&amp;gt;loh_lru)) failed&lt;br/&gt;
LustreError: 11533:0:(lu_object.c:113:lu_object_put()) LBUG&lt;br/&gt;
Pid: 11533, comm: mdt_rdpg_07&lt;br/&gt;
&lt;br/&gt;
Call Trace:&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa056b855&amp;gt;&amp;#93;&lt;/span&gt; libcfs_debug_dumpstack+0x55/0x80 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa056be95&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0x75/0xe0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0576da6&amp;gt;&amp;#93;&lt;/span&gt; libcfs_assertion_failed+0x66/0x70 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;/p&gt;</comment>
                            <comment id="40871" author="jaylan" created="Tue, 19 Jun 2012 14:58:08 +0000"  >&lt;p&gt;I compared my patch adjusted from review #3122 with #3145, they are essentially identical except my patch also moved class_export_recovery_cleanup() to new location as would do in #3122.&lt;/p&gt;</comment>
                            <comment id="40873" author="tappro" created="Tue, 19 Jun 2012 15:27:14 +0000"  >&lt;p&gt;Bob, you are right, that lock doesn&apos;t exist in master and I missed it for b2_1. I will update patch.&lt;/p&gt;</comment>
                            <comment id="40874" author="tappro" created="Tue, 19 Jun 2012 15:31:37 +0000"  >&lt;p&gt;Jay, that LBUG doesn&apos;t look related, do you see it always?&lt;/p&gt;</comment>
                            <comment id="40875" author="jaylan" created="Tue, 19 Jun 2012 15:36:08 +0000"  >&lt;p&gt;No, I do not remember seeing that. Not on ASSERTION(cfs_list_empty(&amp;amp;top-&amp;gt;loh_lru)).&lt;/p&gt;</comment>
                            <comment id="40877" author="jaylan" created="Tue, 19 Jun 2012 15:45:31 +0000"  >&lt;p&gt;We installed 2.1.1-2.1nasS build version to service160. It crashed on booting up. Since it is a production machine, control room put 2.1.1-2nasS version in and booted the service160 (an MDS) back up.&lt;/p&gt;

&lt;p&gt;The difference between 2nasS and 2.1nasS was that I replaced Di Wang&apos;s #3115 with #3122.&lt;/p&gt;</comment>
                            <comment id="42139" author="jaylan" created="Mon, 23 Jul 2012 15:43:42 +0000"  >&lt;p&gt;The patch set 2 of review #3145 was landed to b2_1, but not master.&lt;br/&gt;
The patch of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1432&quot; title=&quot;Race condition between lprocfs_exp_setup() and lprocfs_free_per_client_stats() causes LBUG&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1432&quot;&gt;&lt;del&gt;LU-1432&lt;/del&gt;&lt;/a&gt; was landed to master, but not b2_1. &lt;/p&gt;

&lt;p&gt;We had a mds crash after applying review #3122, which is essentially the same as patch set 1 of #3145. After the crash, I cherry-picked the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1432&quot; title=&quot;Race condition between lprocfs_exp_setup() and lprocfs_free_per_client_stats() causes LBUG&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1432&quot;&gt;&lt;del&gt;LU-1432&lt;/del&gt;&lt;/a&gt; patch to our b2_1 and is running in our production systems without a crash for several weeks now.&lt;/p&gt;

&lt;p&gt;So, please comment if I should have both &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1432&quot; title=&quot;Race condition between lprocfs_exp_setup() and lprocfs_free_per_client_stats() causes LBUG&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1432&quot;&gt;&lt;del&gt;LU-1432&lt;/del&gt;&lt;/a&gt; and patch set 2 of #3145? Thanks!&lt;/p&gt;</comment>
                            <comment id="43767" author="pjones" created="Sat, 25 Aug 2012 22:32:52 +0000"  >&lt;p&gt;Landed for 2.1.3 and 2.3&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="13417">LU-1166</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzv61r:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>4514</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>