<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:01:54 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-13511] MDS 2.12.4 ASSERTION( top-&gt;loh_hash.next == ((void *)0) &amp;&amp; top-&gt;loh_hash.pprev == ((void *)0) ) failed</title>
                <link>https://jira.whamcloud.com/browse/LU-13511</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;We have been running &lt;tt&gt;lfs migrate -m 1&lt;/tt&gt; on a client for several days now to free up inodes on a MDT, but when trying to launch multiple &lt;tt&gt;lfs migrate -m 1&lt;/tt&gt; (like more than 4) on different directory trees, at the same time, on a (single) client, we ended up crashing the MDS of fir-MDT0001 with the following assertion:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[Fri May  1 16:46:48 2020][3824911.223375] LustreError: 22403:0:(lod_dev.c:132:lod_fld_lookup()) fir-MDT0001-mdtlov: invalid FID [0x0:0x0:0x0]^M
[Fri May  1 16:46:48 2020][3824911.233641] LustreError: 22403:0:(lu_object.c:146:lu_object_put()) ASSERTION( top-&amp;gt;loh_hash.next == ((void *)0) &amp;amp;&amp;amp; top-&amp;gt;loh_hash.pprev == ((void *)0) ) failed: ^M
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;backtrace:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[3824898.399369] Lustre: fir-MDT0001: Connection restored to 7862f6c9-0098-4 (at 10.50.8.41@o2ib2)
[3824911.223375] LustreError: 22403:0:(lod_dev.c:132:lod_fld_lookup()) fir-MDT0001-mdtlov: invalid FID [0x0:0x0:0x0]
[3824911.233641] LustreError: 22403:0:(lu_object.c:146:lu_object_put()) ASSERTION( top-&amp;gt;loh_hash.next == ((void *)0) &amp;amp;&amp;amp; top-&amp;gt;loh_hash.pprev == ((void *)0) ) failed:
[3824911.248150] LustreError: 22403:0:(lu_object.c:146:lu_object_put()) LBUG
[3824911.254941] Pid: 22403, comm: mdt00_022 3.10.0-957.27.2.el7_lustre.pl2.x86_64 #1 SMP Thu Nov 7 15:26:16 PST 2019
[3824911.265305] Call Trace:
[3824911.267941]  [&amp;lt;ffffffffc0c9b7cc&amp;gt;] libcfs_call_trace+0x8c/0xc0 [libcfs]
[3824911.274687]  [&amp;lt;ffffffffc0c9b87c&amp;gt;] lbug_with_loc+0x4c/0xa0 [libcfs]
[3824911.281077]  [&amp;lt;ffffffffc0e9ff66&amp;gt;] lu_object_put+0x336/0x3e0 [obdclass]
[3824911.287838]  [&amp;lt;ffffffffc0ea0026&amp;gt;] lu_object_put_nocache+0x16/0x20 [obdclass]
[3824911.295127]  [&amp;lt;ffffffffc0ea022e&amp;gt;] lu_object_find_at+0x1fe/0xa60 [obdclass]
[3824911.302240]  [&amp;lt;ffffffffc0ea0aa6&amp;gt;] lu_object_find+0x16/0x20 [obdclass]
[3824911.308908]  [&amp;lt;ffffffffc17130db&amp;gt;] mdt_object_find+0x4b/0x170 [mdt]
[3824911.315306]  [&amp;lt;ffffffffc1728ab8&amp;gt;] mdt_migrate_lookup.isra.40+0x158/0xa60 [mdt]
[3824911.322778]  [&amp;lt;ffffffffc1732eba&amp;gt;] mdt_reint_migrate+0x8ea/0x1310 [mdt]
[3824911.329526]  [&amp;lt;ffffffffc1733963&amp;gt;] mdt_reint_rec+0x83/0x210 [mdt]
[3824911.335765]  [&amp;lt;ffffffffc1710273&amp;gt;] mdt_reint_internal+0x6e3/0xaf0 [mdt]
[3824911.342508]  [&amp;lt;ffffffffc171b6e7&amp;gt;] mdt_reint+0x67/0x140 [mdt]
[3824911.348401]  [&amp;lt;ffffffffc11e464a&amp;gt;] tgt_request_handle+0xada/0x1570 [ptlrpc]
[3824911.355537]  [&amp;lt;ffffffffc118743b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
[3824911.363446]  [&amp;lt;ffffffffc118ada4&amp;gt;] ptlrpc_main+0xb34/0x1470 [ptlrpc]
[3824911.369949]  [&amp;lt;ffffffffb04c2e81&amp;gt;] kthread+0xd1/0xe0
[3824911.375036]  [&amp;lt;ffffffffb0b77c24&amp;gt;] ret_from_fork_nospec_begin+0xe/0x21
[3824911.381683]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
[3824911.386893] Kernel panic - not syncing: LBUG
[3824911.391339] CPU: 28 PID: 22403 Comm: mdt00_022 Kdump: loaded Tainted: G           OE  ------------   3.10.0-957.27.2.el7_lustre.pl2.x86_64 #1
[3824911.404191] Hardware name: Dell Inc. PowerEdge R6415/065PKD, BIOS 1.10.6 08/15/2019
[3824911.412016] Call Trace:
[3824911.414649]  [&amp;lt;ffffffffb0b65147&amp;gt;] dump_stack+0x19/0x1b
[3824911.419967]  [&amp;lt;ffffffffb0b5e850&amp;gt;] panic+0xe8/0x21f
[3824911.424938]  [&amp;lt;ffffffffc0c9b8cb&amp;gt;] lbug_with_loc+0x9b/0xa0 [libcfs]
[3824911.431323]  [&amp;lt;ffffffffc0e9ff66&amp;gt;] lu_object_put+0x336/0x3e0 [obdclass]
[3824911.438044]  [&amp;lt;ffffffffc0e9c39b&amp;gt;] ? lu_object_start.isra.35+0x8b/0x120 [obdclass]
[3824911.445715]  [&amp;lt;ffffffffc0ea0026&amp;gt;] lu_object_put_nocache+0x16/0x20 [obdclass]
[3824911.452951]  [&amp;lt;ffffffffc0ea022e&amp;gt;] lu_object_find_at+0x1fe/0xa60 [obdclass]
[3824911.460011]  [&amp;lt;ffffffffc1830a7e&amp;gt;] ? lod_xattr_get+0xee/0x700 [lod]
[3824911.466387]  [&amp;lt;ffffffffc0ea0aa6&amp;gt;] lu_object_find+0x16/0x20 [obdclass]
[3824911.473014]  [&amp;lt;ffffffffc17130db&amp;gt;] mdt_object_find+0x4b/0x170 [mdt]
[3824911.479378]  [&amp;lt;ffffffffc1728ab8&amp;gt;] mdt_migrate_lookup.isra.40+0x158/0xa60 [mdt]
[3824911.486780]  [&amp;lt;ffffffffc1732eba&amp;gt;] mdt_reint_migrate+0x8ea/0x1310 [mdt]
[3824911.493499]  [&amp;lt;ffffffffc0eb3fa9&amp;gt;] ? check_unlink_entry+0x19/0xd0 [obdclass]
[3824911.500654]  [&amp;lt;ffffffffc0eb4bf8&amp;gt;] ? upcall_cache_get_entry+0x218/0x8b0 [obdclass]
[3824911.508318]  [&amp;lt;ffffffffc1733963&amp;gt;] mdt_reint_rec+0x83/0x210 [mdt]
[3824911.514503]  [&amp;lt;ffffffffc1710273&amp;gt;] mdt_reint_internal+0x6e3/0xaf0 [mdt]
[3824911.521213]  [&amp;lt;ffffffffc171b6e7&amp;gt;] mdt_reint+0x67/0x140 [mdt]
[3824911.527097]  [&amp;lt;ffffffffc11e464a&amp;gt;] tgt_request_handle+0xada/0x1570 [ptlrpc]
[3824911.534180]  [&amp;lt;ffffffffc11bc0b1&amp;gt;] ? ptlrpc_nrs_req_get_nolock0+0xd1/0x170 [ptlrpc]
[3824911.541925]  [&amp;lt;ffffffffc0c9bbde&amp;gt;] ? ktime_get_real_seconds+0xe/0x10 [libcfs]
[3824911.549176]  [&amp;lt;ffffffffc118743b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
[3824911.557038]  [&amp;lt;ffffffffc1183565&amp;gt;] ? ptlrpc_wait_event+0xa5/0x360 [ptlrpc]
[3824911.564004]  [&amp;lt;ffffffffb04cfeb4&amp;gt;] ? __wake_up+0x44/0x50
[3824911.569438]  [&amp;lt;ffffffffc118ada4&amp;gt;] ptlrpc_main+0xb34/0x1470 [ptlrpc]
[3824911.575911]  [&amp;lt;ffffffffc118a270&amp;gt;] ? ptlrpc_register_service+0xf80/0xf80 [ptlrpc]
[3824911.583478]  [&amp;lt;ffffffffb04c2e81&amp;gt;] kthread+0xd1/0xe0
[3824911.588528]  [&amp;lt;ffffffffb04c2db0&amp;gt;] ? insert_kthread_work+0x40/0x40
[3824911.594796]  [&amp;lt;ffffffffb0b77c24&amp;gt;] ret_from_fork_nospec_begin+0xe/0x21
[3824911.601407]  [&amp;lt;ffffffffb04c2db0&amp;gt;] ? insert_kthread_work+0x40/0x40
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Note: please ignore the following lines in the logs, they are not relevant, it&apos;s just a script that tried periodically to access some wrong sysfs files (i.e. it is not a backend device error):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;mpt3sas_cm0: log_info(0x31200205): originator(PL), code(0x20), sub_code(0x0205)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;Shortly before the crash, we can see the following lines in syslog:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[3824620.448341] LustreError: 22344:0:(mdd_object.c:3249:mdd_close()) fir-MDD0001: failed to get lu_attr of [0x2400576ec:0x149ae:0x0]: rc = -2

[3824640.928979] LustreError: 42016:0:(mdd_object.c:3249:mdd_close()) fir-MDD0001: failed to get lu_attr of [0x240057703:0xcf72:0x0]: rc = -2
[3824658.778134] LustreError: 22546:0:(mdd_object.c:3249:mdd_close()) fir-MDD0001: failed to get lu_attr of [0x240057703:0xe2f2:0x0]: rc = -2
[3824678.792561] LustreError: 42121:0:(mdd_object.c:3249:mdd_close()) fir-MDD0001: failed to get lu_attr of [0x240057703:0xf468:0x0]: rc = -2
[3824696.395767] LustreError: 22546:0:(mdd_object.c:3249:mdd_close()) fir-MDD0001: failed to get lu_attr of [0x240057703:0x10477:0x0]: rc = -2
[3824714.310806] LustreError: 42123:0:(mdd_object.c:3249:mdd_close()) fir-MDD0001: failed to get lu_attr of [0x240057703:0x11206:0x0]: rc = -2
[3824730.506605] LustreError: 42121:0:(mdd_object.c:3249:mdd_close()) fir-MDD0001: failed to get lu_attr of [0x240057703:0x120e7:0x0]: rc = -2

[3824768.104569] LustreError: 22344:0:(mdd_object.c:3249:mdd_close()) fir-MDD0001: failed to get lu_attr of [0x240057703:0x13f7d:0x0]: rc = -2
[3824768.117096] LustreError: 22344:0:(mdd_object.c:3249:mdd_close()) Skipped 1 previous similar message
[3824822.439361] LustreError: 28226:0:(mdd_object.c:400:mdd_xattr_get()) fir-MDD0001: object [0x240057703:0x16d59:0x0] not found: rc = -2
[3824840.123675] LustreError: 22344:0:(mdd_object.c:3249:mdd_close()) fir-MDD0001: failed to get lu_attr of [0x240057703:0x17b2e:0x0]: rc = -2
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;I tried a fid2path on those FIDs from a client, but they cannot be found.&lt;/p&gt;

&lt;p&gt;This issue has occurred only once, on May 1. I&apos;m attaching vmcore-dmesg.txt as &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/34819/34819_fir-md1-s2_20200501_vmcore-dmesg.txt&quot; title=&quot;fir-md1-s2_20200501_vmcore-dmesg.txt attached to LU-13511&quot;&gt;fir-md1-s2_20200501_vmcore-dmesg.txt&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;.&lt;/p&gt;

&lt;p&gt;vmcore:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;      KERNEL: /usr/lib/debug/lib/modules/3.10.0-957.27.2.el7_lustre.pl2.x86_64/vmlinux
    DUMPFILE: vmcore  [PARTIAL DUMP]
        CPUS: 48
        DATE: Fri May  1 16:46:48 2020
      UPTIME: 44 days, 06:26:43
LOAD AVERAGE: 1.98, 1.74, 1.43
       TASKS: 1919
    NODENAME: fir-md1-s2
     RELEASE: 3.10.0-957.27.2.el7_lustre.pl2.x86_64
     VERSION: #1 SMP Thu Nov 7 15:26:16 PST 2019
     MACHINE: x86_64  (1996 Mhz)
      MEMORY: 255.6 GB
       PANIC: &quot;Kernel panic - not syncing: LBUG&quot;
         PID: 22403
     COMMAND: &quot;mdt00_022&quot;
        TASK: ffff8b0d6f65d140  [THREAD_INFO: ffff8afd6cfc4000]
         CPU: 28
       STATE: TASK_RUNNING (PANIC)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;I have uploaded this vmcore to WC&apos;s FTP server as &lt;tt&gt;fir-md1-s2_20200501164658_vmcore&lt;/tt&gt;&lt;br/&gt;
 Also attached the output of &quot;foreach bt&quot; as &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/34818/34818_fir-md1-s2_crash_foreach_bt_20200501164658.txt&quot; title=&quot;fir-md1-s2_crash_foreach_bt_20200501164658.txt attached to LU-13511&quot;&gt;fir-md1-s2_crash_foreach_bt_20200501164658.txt&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;Let me know if you need anything else that could help in avoiding that in the future. Thanks!&lt;/p&gt;</description>
                <environment>CentOS 7.6 3.10.0-957.27.2.el7_lustre.pl2.x86_64</environment>
        <key id="59031">LU-13511</key>
            <summary>MDS 2.12.4 ASSERTION( top-&gt;loh_hash.next == ((void *)0) &amp;&amp; top-&gt;loh_hash.pprev == ((void *)0) ) failed</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="laisiyao">Lai Siyao</assignee>
                                    <reporter username="sthiell">Stephane Thiell</reporter>
                        <labels>
                    </labels>
                <created>Sun, 3 May 2020 20:49:13 +0000</created>
                <updated>Thu, 29 Oct 2020 15:10:01 +0000</updated>
                            <resolved>Fri, 2 Oct 2020 02:50:50 +0000</resolved>
                                    <version>Lustre 2.12.4</version>
                                    <fixVersion>Lustre 2.14.0</fixVersion>
                    <fixVersion>Lustre 2.12.6</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>5</watches>
                                                                            <comments>
                            <comment id="269245" author="pjones" created="Mon, 4 May 2020 17:31:56 +0000"  >&lt;p&gt;Hongchao&lt;/p&gt;

&lt;p&gt;Could you please investigate?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="269742" author="laisiyao" created="Sat, 9 May 2020 04:28:46 +0000"  >&lt;p&gt;This is due to a striped directory layout is broken, and some stripe FID is &lt;span class=&quot;error&quot;&gt;&amp;#91;0:0:0&amp;#93;&lt;/span&gt;, such FID is used internally, and should not be used normally, I&apos;ll add a patch to fix this.&lt;/p&gt;</comment>
                            <comment id="277026" author="sthiell" created="Sat, 8 Aug 2020 16:26:43 +0000"  >&lt;p&gt;Hi Lai,&lt;/p&gt;

&lt;p&gt;Argh, this just hit us again with 2.12.5 on an MDS. In &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/35550/35550_fir-md1-s4_vmcore-dmesg2020_08_08_05_13_40.txt&quot; title=&quot;fir-md1-s4_vmcore-dmesg2020_08_08_05_13_40.txt attached to LU-13511&quot;&gt;fir-md1-s4_vmcore-dmesg2020_08_08_05_13_40.txt&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;, we can see &quot;failed to get lu_attr&quot; errors just before this LBUG. I have two vmcore&apos;s if needed (the MDS crashed again just after recovery). I had to shut down all of our &lt;tt&gt;lfs migrate -m&lt;/tt&gt; and the robinhood server that was reading changelogs to be able to start again. A restart of MDT0 was also needed to fully clear all timeouts.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[2639627.935716] Lustre: fir-MDT0003: Client edb25609-39fb-4 (at 10.49.0.63@o2ib1) reconnecting
[2639627.944184] Lustre: fir-MDT0003: Connection restored to  (at 10.49.0.63@o2ib1)
[2663131.800332] LustreError: 68703:0:(mdd_object.c:3249:mdd_close()) fir-MDD0003: failed to get lu_attr of [0x280044520:0xc90f:0x0]: rc = -2
[2663136.343377] LustreError: 100694:0:(mdd_object.c:3249:mdd_close()) fir-MDD0003: failed to get lu_attr of [0x280044520:0x1697e:0x0]: rc = -2
[2663136.355987] LustreError: 100694:0:(mdd_object.c:3249:mdd_close()) Skipped 1 previous similar message
[2663136.905828] LustreError: 66683:0:(mdd_object.c:400:mdd_xattr_get()) fir-MDD0003: object [0x280044520:0x17604:0x0] not found: rc = -2
[2663137.608393] LustreError: 66810:0:(mdd_object.c:400:mdd_xattr_get()) fir-MDD0003: object [0x280044520:0x1764d:0x0] not found: rc = -2
[2663137.620611] LustreError: 66810:0:(mdd_object.c:400:mdd_xattr_get()) Skipped 3 previous similar messages
[2663138.330280] LustreError: 126913:0:(mdd_object.c:3249:mdd_close()) fir-MDD0003: failed to get lu_attr of [0x280044520:0x1771d:0x0]: rc = -2
[2663138.342891] LustreError: 126913:0:(mdd_object.c:3249:mdd_close()) Skipped 3 previous similar messages
[2663138.705822] LustreError: 67056:0:(mdd_object.c:400:mdd_xattr_get()) fir-MDD0003: object [0x280044520:0x17780:0x0] not found: rc = -2
[2663138.717962] LustreError: 67056:0:(mdd_object.c:400:mdd_xattr_get()) Skipped 6 previous similar messages
[2663140.353575] LustreError: 67074:0:(mdd_object.c:3249:mdd_close()) fir-MDD0003: failed to get lu_attr of [0x280044520:0x17904:0x0]: rc = -2
[2663140.366103] LustreError: 67074:0:(mdd_object.c:3249:mdd_close()) Skipped 5 previous similar messages
[2663140.950475] LustreError: 66692:0:(mdd_object.c:400:mdd_xattr_get()) fir-MDD0003: object [0x280044520:0x17943:0x0] not found: rc = -2
[2663140.962567] LustreError: 66692:0:(mdd_object.c:400:mdd_xattr_get()) Skipped 12 previous similar messages
[2663144.507278] LustreError: 67061:0:(mdd_object.c:3249:mdd_close()) fir-MDD0003: failed to get lu_attr of [0x280044520:0x17c2b:0x0]: rc = -2
[2663144.519822] LustreError: 67061:0:(mdd_object.c:3249:mdd_close()) Skipped 7 previous similar messages
[2663145.078058] LustreError: 66755:0:(mdd_object.c:400:mdd_xattr_get()) fir-MDD0003: object [0x280044520:0x17c57:0x0] not found: rc = -2
[2663145.090149] LustreError: 66755:0:(mdd_object.c:400:mdd_xattr_get()) Skipped 18 previous similar messages
[2663152.600233] LustreError: 66965:0:(mdd_object.c:3249:mdd_close()) fir-MDD0003: failed to get lu_attr of [0x280044520:0x182ec:0x0]: rc = -2
[2663152.612774] LustreError: 66965:0:(mdd_object.c:3249:mdd_close()) Skipped 37 previous similar messages
[2663153.134962] LustreError: 66753:0:(mdd_object.c:400:mdd_xattr_get()) fir-MDD0003: object [0x280044520:0x1836f:0x0] not found: rc = -2
[2663153.147065] LustreError: 66753:0:(mdd_object.c:400:mdd_xattr_get()) Skipped 38 previous similar messages
[2663175.740735] LustreError: 67051:0:(mdd_object.c:400:mdd_xattr_get()) fir-MDD0003: object [0x280044520:0x18bf2:0x0] not found: rc = -2
[2663175.752900] LustreError: 67051:0:(mdd_object.c:400:mdd_xattr_get()) Skipped 63 previous similar messages
[2663176.789503] LustreError: 66965:0:(mdd_object.c:3249:mdd_close()) fir-MDD0003: failed to get lu_attr of [0x280044520:0x19344:0x0]: rc = -2
[2663176.802031] LustreError: 66965:0:(mdd_object.c:3249:mdd_close()) Skipped 50 previous similar messages
[2663207.817889] LustreError: 66659:0:(mdd_object.c:400:mdd_xattr_get()) fir-MDD0003: object [0x280044520:0x1aa2a:0x0] not found: rc = -2
[2663207.829992] LustreError: 66659:0:(mdd_object.c:400:mdd_xattr_get()) Skipped 32 previous similar messages
[2663209.047366] LustreError: 67065:0:(mdd_object.c:3249:mdd_close()) fir-MDD0003: failed to get lu_attr of [0x280044520:0x1a8fc:0x0]: rc = -2
[2663209.059905] LustreError: 67065:0:(mdd_object.c:3249:mdd_close()) Skipped 22 previous similar messages
[2663271.909174] LustreError: 66627:0:(mdd_object.c:400:mdd_xattr_get()) fir-MDD0003: object [0x280044520:0x1db03:0x0] not found: rc = -2
[2663271.921256] LustreError: 66627:0:(mdd_object.c:400:mdd_xattr_get()) Skipped 113 previous similar messages
[2663273.161523] LustreError: 74686:0:(mdd_object.c:3249:mdd_close()) fir-MDD0003: failed to get lu_attr of [0x280044520:0x1dbf1:0x0]: rc = -2
[2663273.174067] LustreError: 74686:0:(mdd_object.c:3249:mdd_close()) Skipped 94 previous similar messages
[2663403.135529] LustreError: 66218:0:(mdd_object.c:400:mdd_xattr_get()) fir-MDD0003: object [0x280044522:0x4449:0x0] not found: rc = -2
[2663403.147555] LustreError: 66218:0:(mdd_object.c:400:mdd_xattr_get()) Skipped 378 previous similar messages
[2663403.645800] LustreError: 100695:0:(mdd_object.c:3249:mdd_close()) fir-MDD0003: failed to get lu_attr of [0x280044522:0x4bea:0x0]: rc = -2
[2663403.658315] LustreError: 100695:0:(mdd_object.c:3249:mdd_close()) Skipped 323 previous similar messages
[2663455.535626] LustreError: 66692:0:(lod_dev.c:132:lod_fld_lookup()) fir-MDT0003-mdtlov: invalid FID [0x0:0x0:0x0]
[2663455.545914] LustreError: 66692:0:(lu_object.c:146:lu_object_put()) ASSERTION( top-&amp;gt;loh_hash.next == ((void *)0) &amp;amp;&amp;amp; top-&amp;gt;loh_hash.pprev == ((void *)0) ) failed: 
[2663455.560424] LustreError: 66692:0:(lu_object.c:146:lu_object_put()) LBUG
[2663455.567250] Pid: 66692, comm: mdt02_035 3.10.0-957.27.2.el7_lustre.pl2.x86_64 #1 SMP Thu Nov 7 15:26:16 PST 2019
[2663455.577613] Call Trace:
[2663455.580267]  [&amp;lt;ffffffffc0bc87cc&amp;gt;] libcfs_call_trace+0x8c/0xc0 [libcfs]
[2663455.587021]  [&amp;lt;ffffffffc0bc887c&amp;gt;] lbug_with_loc+0x4c/0xa0 [libcfs]
[2663455.593439]  [&amp;lt;ffffffffc0d12fd6&amp;gt;] lu_object_put+0x336/0x3e0 [obdclass]
[2663455.600209]  [&amp;lt;ffffffffc0d13096&amp;gt;] lu_object_put_nocache+0x16/0x20 [obdclass]
[2663455.607502]  [&amp;lt;ffffffffc0d1329e&amp;gt;] lu_object_find_at+0x1fe/0xa60 [obdclass]
[2663455.614604]  [&amp;lt;ffffffffc0d13b16&amp;gt;] lu_object_find+0x16/0x20 [obdclass]
[2663455.621293]  [&amp;lt;ffffffffc15af2cb&amp;gt;] mdt_object_find+0x4b/0x170 [mdt]
[2663455.627727]  [&amp;lt;ffffffffc15c4c88&amp;gt;] mdt_migrate_lookup.isra.40+0x158/0xa60 [mdt]
[2663455.635193]  [&amp;lt;ffffffffc15cf1cd&amp;gt;] mdt_reint_migrate+0x8bd/0x11d0 [mdt]
[2663455.641939]  [&amp;lt;ffffffffc15cfb63&amp;gt;] mdt_reint_rec+0x83/0x210 [mdt]
[2663455.648176]  [&amp;lt;ffffffffc15ac273&amp;gt;] mdt_reint_internal+0x6e3/0xaf0 [mdt]
[2663455.654933]  [&amp;lt;ffffffffc15b78d7&amp;gt;] mdt_reint+0x67/0x140 [mdt]
[2663455.660810]  [&amp;lt;ffffffffc100666a&amp;gt;] tgt_request_handle+0xada/0x1570 [ptlrpc]
[2663455.667959]  [&amp;lt;ffffffffc0fa944b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
[2663455.675866]  [&amp;lt;ffffffffc0facdb4&amp;gt;] ptlrpc_main+0xb34/0x1470 [ptlrpc]
[2663455.682391]  [&amp;lt;ffffffffb5ec2e81&amp;gt;] kthread+0xd1/0xe0
[2663455.687479]  [&amp;lt;ffffffffb6577c24&amp;gt;] ret_from_fork_nospec_begin+0xe/0x21
[2663455.694143]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
[2663455.699346] Kernel panic - not syncing: LBUG
[2663455.703792] CPU: 22 PID: 66692 Comm: mdt02_035 Kdump: loaded Tainted: G           OE  ------------   3.10.0-957.27.2.el7_lustre.pl2.x86_64 #1
[2663455.716670] Hardware name: Dell Inc. PowerEdge R6415/07YXFK, BIOS 1.12.2 11/15/2019
[2663455.724504] Call Trace:
[2663455.727152]  [&amp;lt;ffffffffb6565147&amp;gt;] dump_stack+0x19/0x1b
[2663455.732474]  [&amp;lt;ffffffffb655e850&amp;gt;] panic+0xe8/0x21f
[2663455.737460]  [&amp;lt;ffffffffc0bc88cb&amp;gt;] lbug_with_loc+0x9b/0xa0 [libcfs]
[2663455.743849]  [&amp;lt;ffffffffc0d12fd6&amp;gt;] lu_object_put+0x336/0x3e0 [obdclass]
[2663455.750580]  [&amp;lt;ffffffffc0d0f42b&amp;gt;] ? lu_object_start.isra.35+0x8b/0x120 [obdclass]
[2663455.758261]  [&amp;lt;ffffffffc0d13096&amp;gt;] lu_object_put_nocache+0x16/0x20 [obdclass]
[2663455.765507]  [&amp;lt;ffffffffc0d1329e&amp;gt;] lu_object_find_at+0x1fe/0xa60 [obdclass]
[2663455.772569]  [&amp;lt;ffffffffc16ccbfe&amp;gt;] ? lod_xattr_get+0xee/0x700 [lod]
[2663455.778947]  [&amp;lt;ffffffffc0d13b16&amp;gt;] lu_object_find+0x16/0x20 [obdclass]
[2663455.785576]  [&amp;lt;ffffffffc15af2cb&amp;gt;] mdt_object_find+0x4b/0x170 [mdt]
[2663455.791945]  [&amp;lt;ffffffffc15c4c88&amp;gt;] mdt_migrate_lookup.isra.40+0x158/0xa60 [mdt]
[2663455.799349]  [&amp;lt;ffffffffc15cf1cd&amp;gt;] mdt_reint_migrate+0x8bd/0x11d0 [mdt]
[2663455.806077]  [&amp;lt;ffffffffc0d270a9&amp;gt;] ? check_unlink_entry+0x19/0xd0 [obdclass]
[2663455.813234]  [&amp;lt;ffffffffc0d27cf8&amp;gt;] ? upcall_cache_get_entry+0x218/0x8b0 [obdclass]
[2663455.820902]  [&amp;lt;ffffffffc15cfb63&amp;gt;] mdt_reint_rec+0x83/0x210 [mdt]
[2663455.827089]  [&amp;lt;ffffffffc15ac273&amp;gt;] mdt_reint_internal+0x6e3/0xaf0 [mdt]
[2663455.833797]  [&amp;lt;ffffffffc15b78d7&amp;gt;] mdt_reint+0x67/0x140 [mdt]
[2663455.839683]  [&amp;lt;ffffffffc100666a&amp;gt;] tgt_request_handle+0xada/0x1570 [ptlrpc]
[2663455.846777]  [&amp;lt;ffffffffc0fde0d1&amp;gt;] ? ptlrpc_nrs_req_get_nolock0+0xd1/0x170 [ptlrpc]
[2663455.854527]  [&amp;lt;ffffffffc0bc8bde&amp;gt;] ? ktime_get_real_seconds+0xe/0x10 [libcfs]
[2663455.861785]  [&amp;lt;ffffffffc0fa944b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
[2663455.869656]  [&amp;lt;ffffffffc0fa5575&amp;gt;] ? ptlrpc_wait_event+0xa5/0x360 [ptlrpc]
[2663455.876621]  [&amp;lt;ffffffffb5ecfeb4&amp;gt;] ? __wake_up+0x44/0x50
[2663455.882061]  [&amp;lt;ffffffffc0facdb4&amp;gt;] ptlrpc_main+0xb34/0x1470 [ptlrpc]
[2663455.888544]  [&amp;lt;ffffffffc0fac280&amp;gt;] ? ptlrpc_register_service+0xf80/0xf80 [ptlrpc]
[2663455.896114]  [&amp;lt;ffffffffb5ec2e81&amp;gt;] kthread+0xd1/0xe0
[2663455.901175]  [&amp;lt;ffffffffb5ec2db0&amp;gt;] ? insert_kthread_work+0x40/0x40
[2663455.907450]  [&amp;lt;ffffffffb6577c24&amp;gt;] ret_from_fork_nospec_begin+0xe/0x21
[2663455.914070]  [&amp;lt;ffffffffb5ec2db0&amp;gt;] ? insert_kthread_work+0x40/0x40
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;


&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="278579" author="sthiell" created="Wed, 2 Sep 2020 00:00:02 +0000"  >&lt;p&gt;Hi Lai,&lt;/p&gt;

&lt;p&gt;Do you have any idea on how to avoid this crash? I would be happy to try a patch.&#160;&lt;/p&gt;

&lt;p&gt;Thanks!&lt;/p&gt;</comment>
                            <comment id="278603" author="gerrit" created="Wed, 2 Sep 2020 09:57:49 +0000"  >&lt;p&gt;Lai Siyao (lai.siyao@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/39792&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/39792&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13511&quot; title=&quot;MDS 2.12.4 ASSERTION( top-&amp;gt;loh_hash.next == ((void *)0) &amp;amp;&amp;amp; top-&amp;gt;loh_hash.pprev == ((void *)0) ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13511&quot;&gt;&lt;del&gt;LU-13511&lt;/del&gt;&lt;/a&gt; obdclass: don&apos;t initialize obj for zero FID&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: e3e930b6ac01585e95e4ed561aa9e62c5e792f5c&lt;/p&gt;</comment>
                            <comment id="279290" author="sthiell" created="Thu, 10 Sep 2020 22:56:51 +0000"  >&lt;p&gt;Hi Lai,&lt;/p&gt;

&lt;p&gt;I think your patch fixed this problem. We have been running it on an MDS (the one receiving new files, that is, the MDS of the target MDT of lfs migrate -m) for 6 days now while migrations were running and no crash. We&apos;re not done yet with our migrations so I will let you know if we notice any issue.&lt;/p&gt;
                            <comment id="281299" author="gerrit" created="Fri, 2 Oct 2020 00:19:04 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/39792/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/39792/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13511&quot; title=&quot;MDS 2.12.4 ASSERTION( top-&amp;gt;loh_hash.next == ((void *)0) &amp;amp;&amp;amp; top-&amp;gt;loh_hash.pprev == ((void *)0) ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13511&quot;&gt;&lt;del&gt;LU-13511&lt;/del&gt;&lt;/a&gt; obdclass: don&apos;t initialize obj for zero FID&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 22ea9767956c89aa08ef6d80ad04aaccde647755&lt;/p&gt;</comment>
                            <comment id="281310" author="pjones" created="Fri, 2 Oct 2020 02:50:50 +0000"  >&lt;p&gt;Landed for 2.14&lt;/p&gt;</comment>
                            <comment id="282640" author="gerrit" created="Mon, 19 Oct 2020 19:11:36 +0000"  >&lt;p&gt;Jian Yu (yujian@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/40304&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/40304&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13511&quot; title=&quot;MDS 2.12.4 ASSERTION( top-&amp;gt;loh_hash.next == ((void *)0) &amp;amp;&amp;amp; top-&amp;gt;loh_hash.pprev == ((void *)0) ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13511&quot;&gt;&lt;del&gt;LU-13511&lt;/del&gt;&lt;/a&gt; obdclass: don&apos;t initialize obj for zero FID&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 050863780310ae1166fc5c00b083675ac16502c4&lt;/p&gt;</comment>
                            <comment id="283581" author="gerrit" created="Thu, 29 Oct 2020 07:49:49 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/40304/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/40304/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13511&quot; title=&quot;MDS 2.12.4 ASSERTION( top-&amp;gt;loh_hash.next == ((void *)0) &amp;amp;&amp;amp; top-&amp;gt;loh_hash.pprev == ((void *)0) ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13511&quot;&gt;&lt;del&gt;LU-13511&lt;/del&gt;&lt;/a&gt; obdclass: don&apos;t initialize obj for zero FID&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 44f354e53f42e25a4bfa98d50dcdc4397f06a9e2&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                                        </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="34819" name="fir-md1-s2_20200501_vmcore-dmesg.txt" size="825664" author="sthiell" created="Sun, 3 May 2020 20:40:05 +0000"/>
                            <attachment id="34818" name="fir-md1-s2_crash_foreach_bt_20200501164658.txt" size="894073" author="sthiell" created="Sun, 3 May 2020 20:46:18 +0000"/>
                            <attachment id="35550" name="fir-md1-s4_vmcore-dmesg2020_08_08_05_13_40.txt" size="574736" author="sthiell" created="Sat, 8 Aug 2020 16:25:43 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00zfb:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>