<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:33:40 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-3410] mgc_copy_llog()) Failed to copy remote log routed1-OST00b3 (-2) encountered during bring up of a 2.4 file system</title>
                <link>https://jira.whamcloud.com/browse/LU-3410</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;At our first attempt to mount the file system we experienced the following assertion on about 40 OSS. The assertion we hit is as follows:&lt;/p&gt;

&lt;p&gt;May 28 10:36:43 widow-oss11c4 kernel: [ 2743.137026] LDISKFS-fs (dm-12): mounted filesystem with ordered data mode. quota=on. Opts:&lt;br/&gt;
May 28 10:37:07 widow-oss11c4 kernel: [ 2767.330069] LustreError: 24348:0:(ofd_obd.c:1535:ofd_iocontrol()) routed1-OST0113: aborting recovery&lt;br/&gt;
May 28 10:37:28 widow-oss11c4 kernel: [ 2788.099708] LustreError: 24283:0:(ofd_obd.c:1535:ofd_iocontrol()) routed1-OST00b3: aborting recovery&lt;br/&gt;
May 28 10:37:28 widow-oss11c4 kernel: [ 2788.145570] LustreError: 24562:0:(mgc_request.c:1693:mgc_copy_llog()) Failed to copy remote log routed1-OST00b3 (-2) &lt;br/&gt;
May 28 10:37:28 widow-oss11c4 kernel: [ 2788.166762] LustreError: 13a-8: Failed to get MGS log routed1-OST00b3 and no local copy.&lt;br/&gt;
May 28 10:37:28 widow-oss11c4 kernel: [ 2788.183045] LustreError: 24562:0:(lvfs_linux.c:175:pop_ctxt()) ASSERTION( cfs_fs_pwd(current-&amp;gt;fs) == new_ctx-&amp;gt;pwd ) failed: ffff8803bcea02c0 != ffff8803bcefcf00&lt;br/&gt;
May 28 10:37:28 widow-oss11c4 kernel: [ 2788.211874] LustreError: 24562:0:(lvfs_linux.c:175:pop_ctxt()) LBUG&lt;br/&gt;
May 28 10:37:28 widow-oss11c4 kernel: [ 2788.224479] Pid: 24562, comm: ll_cfg_requeue&lt;br/&gt;
May 28 10:37:28 widow-oss11c4 kernel: [ 2788.233060]   &lt;br/&gt;
May 28 10:37:28 widow-oss11c4 kernel: [ 2788.233061] Call Trace:&lt;br/&gt;
May 28 10:37:28 widow-oss11c4 kernel: [ 2788.240987]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa045e895&amp;gt;&amp;#93;&lt;/span&gt; libcfs_debug_dumpstack+0x55/0x80 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
May 28 10:37:28 widow-oss11c4 kernel: [ 2788.254989]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa045ee97&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0x47/0xb0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
May 28 10:37:28 widow-oss11c4 kernel: [ 2788.267434]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa03cecb5&amp;gt;&amp;#93;&lt;/span&gt; pop_ctxt+0x295/0x2e0 &lt;span class=&quot;error&quot;&gt;&amp;#91;lvfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
May 28 10:37:28 widow-oss11c4 kernel: [ 2788.278976]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa06d6225&amp;gt;&amp;#93;&lt;/span&gt; mgc_process_cfg_log+0x6a5/0x15c0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mgc&amp;#93;&lt;/span&gt;&lt;br/&gt;
May 28 10:37:28 widow-oss11c4 kernel: [ 2788.292448]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa06d9133&amp;gt;&amp;#93;&lt;/span&gt; mgc_process_log+0x463/0x1390 &lt;span class=&quot;error&quot;&gt;&amp;#91;mgc&amp;#93;&lt;/span&gt;&lt;br/&gt;
May 28 10:37:28 widow-oss11c4 kernel: [ 2788.305221]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8150ed8a&amp;gt;&amp;#93;&lt;/span&gt; ? schedule_timeout+0x19a/0x2e0&lt;br/&gt;
May 28 10:37:28 widow-oss11c4 kernel: [ 2788.317303]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa06d3a60&amp;gt;&amp;#93;&lt;/span&gt; ? mgc_blocking_ast+0x0/0x7e0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mgc&amp;#93;&lt;/span&gt;&lt;br/&gt;
May 28 10:37:28 widow-oss11c4 kernel: [ 2788.330171]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa071cb90&amp;gt;&amp;#93;&lt;/span&gt; ? ldlm_completion_ast+0x0/0x960 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
May 28 10:37:28 widow-oss11c4 kernel: [ 2788.343991]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8109705c&amp;gt;&amp;#93;&lt;/span&gt; ? remove_wait_queue+0x3c/0x50&lt;br/&gt;
May 28 10:37:28 widow-oss11c4 kernel: [ 2788.355882]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa06daa13&amp;gt;&amp;#93;&lt;/span&gt; mgc_requeue_thread+0x1a3/0x750 &lt;span class=&quot;error&quot;&gt;&amp;#91;mgc&amp;#93;&lt;/span&gt;&lt;br/&gt;
May 28 10:37:28 widow-oss11c4 kernel: [ 2788.368999]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81063310&amp;gt;&amp;#93;&lt;/span&gt; ? default_wake_function+0x0/0x20&lt;br/&gt;
May 28 10:37:28 widow-oss11c4 kernel: [ 2788.381448]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa06da870&amp;gt;&amp;#93;&lt;/span&gt; ? mgc_requeue_thread+0x0/0x750 &lt;span class=&quot;error&quot;&gt;&amp;#91;mgc&amp;#93;&lt;/span&gt;&lt;br/&gt;
May 28 10:37:28 widow-oss11c4 kernel: [ 2788.394565]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c0ca&amp;gt;&amp;#93;&lt;/span&gt; child_rip+0xa/0x20&lt;br/&gt;
May 28 10:37:28 widow-oss11c4 kernel: [ 2788.404563]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa06da870&amp;gt;&amp;#93;&lt;/span&gt; ? mgc_requeue_thread+0x0/0x750 &lt;span class=&quot;error&quot;&gt;&amp;#91;mgc&amp;#93;&lt;/span&gt;&lt;br/&gt;
May 28 10:37:28 widow-oss11c4 kernel: [ 2788.423777]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa06da870&amp;gt;&amp;#93;&lt;/span&gt; ? mgc_requeue_thread+0x0/0x750 &lt;span class=&quot;error&quot;&gt;&amp;#91;mgc&amp;#93;&lt;/span&gt;&lt;br/&gt;
May 28 10:37:28 widow-oss11c4 kernel: [ 2788.442884]  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c0c0&amp;gt;&amp;#93;&lt;/span&gt; ? child_rip+0x0/0x20&lt;br/&gt;
May 28 10:37:28 widow-oss11c4 kernel: [ 2788.459196]&lt;/p&gt;</description>
                <environment>RHEL6.4 running 2.4.0-RC2</environment>
        <key id="19172">LU-3410</key>
            <summary>mgc_copy_llog()) Failed to copy remote log routed1-OST00b3 (-2) encountered during bring up of a 2.4 file system</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="6" iconUrl="https://jira.whamcloud.com/images/icons/statuses/closed.png" description="The issue is considered finished, the resolution is correct. Issues which are closed can be reopened.">Closed</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="3">Duplicate</resolution>
                                        <assignee username="tappro">Mikhail Pershin</assignee>
                                    <reporter username="simmonsja">James A Simmons</reporter>
                        <labels>
                    </labels>
                <created>Tue, 28 May 2013 17:01:38 +0000</created>
                <updated>Fri, 11 Apr 2014 20:02:30 +0000</updated>
                            <resolved>Fri, 11 Apr 2014 20:02:13 +0000</resolved>
                                    <version>Lustre 2.4.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>9</watches>
                                                                            <comments>
                            <comment id="59453" author="adilger" created="Tue, 28 May 2013 17:27:06 +0000"  >&lt;p&gt;James, did the same problem happen on a second restart, or did it work on the next try?&lt;/p&gt;</comment>
                            <comment id="59457" author="adilger" created="Tue, 28 May 2013 17:29:26 +0000"  >&lt;p&gt;James, what is the origin of this filesystem?  Is this a newly formatted filesystem with 2.4.0 or is it upgraded from some previous version of Lustre?&lt;/p&gt;</comment>
                            <comment id="59465" author="adilger" created="Tue, 28 May 2013 17:55:24 +0000"  >&lt;p&gt;Mike rewrote the MGC llog backup code to use the OSD API recently in &lt;a href=&quot;http://review.whamcloud.com/5049&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5049&lt;/a&gt; (patch is ready to land, but missed &quot;feature freeze&quot; window), so any work in this area should take that into account.  I&apos;m not sure it will solve this problem, but it doesn&apos;t make sense to work on the old code when it is being replaced.&lt;/p&gt;</comment>
                            <comment id="59544" author="green" created="Wed, 29 May 2013 16:57:47 +0000"  >&lt;p&gt;This only happened once, second restart worked.&lt;br/&gt;
The filesystem was freshly formatted 2.4 system.&lt;/p&gt;</comment>
                            <comment id="59668" author="tappro" created="Thu, 30 May 2013 17:44:18 +0000"  >&lt;p&gt;the mgs_fs_setup() may rewrite the obd_lvfs_ctxt with new values and there is no protection that could prevent that between push/pop pair. That can be the reason for this bug. Note, that mgs_fs_setup() is called from obd_mount.c code to set new bottom fs and can be called multiple times with several OSTs on one node. Meanwhile this is old code and I am not sure why it happens now.&lt;/p&gt;</comment>
                            <comment id="63983" author="shadow" created="Fri, 9 Aug 2013 17:13:14 +0000"  >&lt;p&gt;Mike,&lt;/p&gt;

&lt;p&gt;looks you are wrong (or it&apos;s second bug with same assert).&lt;br/&gt;
mgc_requeue_thread - don&apos;t have an cfs_daemonize_ctxt() at top of function so share a current-&amp;gt;fs between several threads, as ldlm_elt, zombid, ldlm_poold (with vitaly patch with stale lock cancel) and some other.&lt;br/&gt;
in that case different thread may change a own current-&amp;gt;fs but it&apos;s affected to whole threads with same fs.&lt;br/&gt;
Xyratex hit that bug in ldlm_elt thread (don&apos;t have an cfs_daemonize_ctxt() call also.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;..
00002000:00040000:9.0:1375891841.290003:0:1448:0:(lvfs_linux.c:175:pop_ctxt()) ASSERTION( cfs_fs_pwd(current-&amp;gt;fs) == new_ctx-&amp;gt;pwd ) failed: ffff88076b4d68c0 != ffff88069df8fd80
..

crash&amp;gt; bt
PID: 1448   TASK: ffff880817656040  CPU: 9   COMMAND: &lt;span class=&quot;code-quote&quot;&gt;&quot;ldlm_elt&quot;&lt;/span&gt;
 #0 [ffff880774e7dba8] machine_kexec at ffffffff810310db
 #1 [ffff880774e7dc08] crash_kexec at ffffffff810b6332
 #2 [ffff880774e7dcd8] panic at ffffffff814d684f
 #3 [ffff880774e7dd58] lbug_with_loc at ffffffffa0380ecb [libcfs]
 #4 [ffff880774e7dd78] pop_ctxt at ffffffffa0427b77 [lvfs]
 #5 [ffff880774e7ddc8] filter_client_del at ffffffffa0ed5637 [obdfilter]
 #6 [ffff880774e7de78] filter_disconnect at ffffffffa0ed7820 [obdfilter]
 #7 [ffff880774e7dea8] class_fail_export at ffffffffa0648365 [obdclass]
 #8 [ffff880774e7dec8] expired_lock_main at ffffffffa07b81b1 [ptlrpc]
 #9 [ffff880774e7df48] kernel_thread at ffffffff8100c1ca
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;but in task fs is correct value&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;crash&amp;gt; p *((struct task_struct *)0xffff880817656040)-&amp;gt;fs
$119 = {
  users = 481, 
  lock = {
    raw_lock = {
      lock = 16777216
    }
  }, 
  umask = 0, 
  in_exec = 0, 
  root = {
    mnt = 0xffff880817b090c0, 
    dentry = 0xffff880818e4fa40
  }, 
  pwd = {
    mnt = 0xffff8808174b6180, 
    dentry = 0xffff88069df8fd80
  }
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;as you see - fs has a correct value and the assert should not be hit.&lt;br/&gt;
&lt;em&gt;BUT&lt;/em&gt; users is large number which should be impossible for a thread with disk IO.&lt;br/&gt;
so it&apos;s race between modification for same fs struct.&lt;/p&gt;

&lt;p&gt;additional bugs in that area&lt;br/&gt;
1) current-&amp;gt;fs.pwd accessed without lock, but readlock should be held. need look carefully about bug impact.&lt;br/&gt;
2) cfs_daemonize_ctxt have lost a return code from unshare_fs_struct() call so we have failed with ENOMEM and none will know about it&lt;br/&gt;
3) none error code checks for cfs_daemonize_ctxt() calls, so function may failed with ENOMEM but none know about it and uses a shared FS after fail.&lt;/p&gt;

</comment>
                            <comment id="74579" author="adilger" created="Wed, 8 Jan 2014 18:40:06 +0000"  >&lt;p&gt;Mike, I also see &quot;13a-8: Failed to get MGS log routed1-OST00b3 and no local copy.&quot; in many test logs.  This is a misleading error message and should probably be removed, or at least quieted in the common case when a new filesystem is first mounted.&lt;/p&gt;</comment>
                            <comment id="77892" author="emoly.liu" created="Wed, 26 Feb 2014 06:54:43 +0000"  >&lt;p&gt;I filed a new ticket &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4783&quot; title=&quot;start params llog automatically during mgs setup&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4783&quot;&gt;&lt;del&gt;LU-4783&lt;/del&gt;&lt;/a&gt; to fix the misleading error message &quot;LustreError: 13a-8: Failed to get MGS log params and no local copy.&quot;.&lt;/p&gt;

&lt;p&gt;So, can we close this one ?&lt;/p&gt;</comment>
                            <comment id="81451" author="tappro" created="Fri, 11 Apr 2014 20:02:13 +0000"  >&lt;p&gt;closing in favor of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4783&quot; title=&quot;start params llog automatically during mgs setup&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4783&quot;&gt;&lt;del&gt;LU-4783&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="23781">LU-4783</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvryn:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>8427</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>