<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:39:41 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary, append 'field=key&field=summary' to the URL of your request.
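For instance, assuming the standard JIRA issue XML view path (illustrative only, not verified for this server):
https://jira.whamcloud.com/si/jira.issueviews:issue-xml/LU-10956/LU-10956.xml?field=key&field=summary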
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-10956] sanity-pfl test_3: Kernel panic - not syncing: Pool has encountered an uncorrectable I/O failure and the failure mode property for this pool is set to panic</title>
                <link>https://jira.whamcloud.com/browse/LU-10956</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This issue was created by maloo for sarah_lw &amp;lt;wei3.liu@intel.com&amp;gt;&lt;/p&gt;

&lt;p&gt;This issue relates to the following test suite run: &lt;a href=&quot;https://testing.whamcloud.com/test_sets/252abdaa-477b-11e8-95c0-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/252abdaa-477b-11e8-95c0-52540065bddc&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;test_3 failed with the following error:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Test crashed during sanity-pfl test_3
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;env: RHEL7 zfs DNE tag-2.11.51&lt;/p&gt;

&lt;p&gt;This is the trace found in kernel-crash.log:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;
[34408.762645] Lustre: DEBUG MARKER: dmesg
[34409.519801] Lustre: DEBUG MARKER: /usr/sbin/lctl mark == sanity-pfl test 3: Delete component from existing file ============================================ 04:43:50 \(1524545030\)
[34409.734904] Lustre: DEBUG MARKER: == sanity-pfl test 3: Delete component from existing file ============================================ 04:43:50 (1524545030)
[34434.509312] Lustre: lustre-OST0006: Client lustre-MDT0001-mdtlov_UUID (at 10.9.4.25@tcp) reconnecting
[34434.512144] Lustre: lustre-OST0006: Client lustre-MDT0003-mdtlov_UUID (at 10.9.4.25@tcp) reconnecting
[34434.512149] Lustre: Skipped 7 previous similar messages
[34434.516050] WARNING: MMP writes to pool &apos;lustre-ost2&apos; have not succeeded in over 20s; suspending pool
[34434.516059] Kernel panic - not syncing: Pool &apos;lustre-ost2&apos; has encountered an uncorrectable I/O failure and the failure mode property for this pool is set to panic.
[34434.516071] CPU: 0 PID: 16454 Comm: mmp Tainted: P OE ------------ 3.10.0-693.21.1.el7_lustre.x86_64 #1
[34434.516072] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2007
[34434.516077] Call Trace:
[34434.516133] [&amp;lt;ffffffff816ae7c8&amp;gt;] dump_stack+0x19/0x1b
[34434.516137] [&amp;lt;ffffffff816a8634&amp;gt;] panic+0xe8/0x21f
[34434.516443] [&amp;lt;ffffffffc05734a6&amp;gt;] zio_suspend+0x106/0x110 [zfs]
[34434.516470] [&amp;lt;ffffffffc04fa322&amp;gt;] mmp_thread+0x322/0x4a0 [zfs]
[34434.516491] [&amp;lt;ffffffffc04fa000&amp;gt;] ? mmp_write_done+0x1d0/0x1d0 [zfs]
[34434.516528] [&amp;lt;ffffffffc03aefc3&amp;gt;] thread_generic_wrapper+0x73/0x80 [spl]
[34434.516532] [&amp;lt;ffffffffc03aef50&amp;gt;] ? __thread_exit+0x20/0x20 [spl]
[34434.516555] [&amp;lt;ffffffff810b4031&amp;gt;] kthread+0xd1/0xe0
[34434.516558] [&amp;lt;ffffffff810b3f60&amp;gt;] ? insert_kthread_work+0x40/0x40
[34434.516574] [&amp;lt;ffffffff816c0577&amp;gt;] ret_from_fork+0x77/0xb0
[34434.516577] [&amp;lt;ffffffff810b3f60&amp;gt;] ? insert_kthread_work+0x40/0x40

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;VVVVVVV DO NOT REMOVE LINES BELOW, Added by Maloo for auto-association VVVVVVV&lt;br/&gt;
 sanity-pfl test_3 - Test crashed during sanity-pfl test_3&lt;/p&gt;</description>
                <environment></environment>
        <key id="52004">LU-10956</key>
            <summary>sanity-pfl test_3: Kernel panic - not syncing: Pool has encountered an uncorrectable I/O failure and the failure mode property for this pool is set to panic</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="wc-triage">WC Triage</assignee>
                                    <reporter username="maloo">Maloo</reporter>
                        <labels>
                            <label>zfs</label>
                    </labels>
                <created>Wed, 25 Apr 2018 20:15:53 +0000</created>
                <updated>Fri, 16 Jun 2023 22:35:32 +0000</updated>
                                            <version>Lustre 2.12.0</version>
                    <version>Lustre 2.14.0</version>
                    <version>Lustre 2.12.5</version>
                    <version>Lustre 2.12.8</version>
                    <version>Lustre 2.15.3</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>9</watches>
                                                                            <comments>
<comment id="272103" author="sarah" created="Fri, 5 Jun 2020 22:00:51 +0000"  >&lt;p&gt;Hit the problem in a rolling upgrade from 2.10.8 EL7.6 to 2.12.5 EL7.8 zfs.&lt;br/&gt;
After rolling-upgrading all servers and clients to 2.12.5, sanity tests 42e and 180c hit the same crash on the OSS.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[  907.517100] Lustre: DEBUG MARKER: == sanity test 42e: verify sub-RPC writes are not done synchronously ================================= 06:16:17 (1591337777)
[  910.460672] Lustre: lustre-OST0000: Connection restored to 47f23d53-3404-b4c5-e304-97df136e115c (at 10.9.6.157@tcp)
[  910.462673] Lustre: Skipped 1 previous similar message
[  922.630198] WARNING: MMP writes to pool &apos;lustre-ost1&apos; have not succeeded in over 5s; suspending pool
[  922.631786] Kernel panic - not syncing: Pool &apos;lustre-ost1&apos; has encountered an uncorrectable I/O failure and the failure mode property for this pool is set to panic.
[  922.634113] CPU: 1 PID: 2823 Comm: mmp Kdump: loaded Tainted: P           OE  ------------   3.10.0-1127.8.2.el7_lustre.x86_64 #1
[  922.635942] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
[  922.636859] Call Trace:
[  922.637279]  [&amp;lt;ffffffffa717ffa5&amp;gt;] dump_stack+0x19/0x1b
[  922.638104]  [&amp;lt;ffffffffa7179541&amp;gt;] panic+0xe8/0x21f
[  922.638963]  [&amp;lt;ffffffffc05b7446&amp;gt;] zio_suspend+0x116/0x120 [zfs]
[  922.639937]  [&amp;lt;ffffffffc053da4c&amp;gt;] mmp_thread+0x41c/0x4c0 [zfs]
[  922.640900]  [&amp;lt;ffffffffc053d630&amp;gt;] ? mmp_write_done+0x140/0x140 [zfs]
[  922.641928]  [&amp;lt;ffffffffc02f6063&amp;gt;] thread_generic_wrapper+0x73/0x80 [spl]
[  922.642999]  [&amp;lt;ffffffffc02f5ff0&amp;gt;] ? __thread_exit+0x20/0x20 [spl]
[  922.643977]  [&amp;lt;ffffffffa6ac6691&amp;gt;] kthread+0xd1/0xe0
[  922.644765]  [&amp;lt;ffffffffa6ac65c0&amp;gt;] ? insert_kthread_work+0x40/0x40
[  922.645735]  [&amp;lt;ffffffffa7192d37&amp;gt;] ret_from_fork_nospec_begin+0x21/0x21
[  922.646777]  [&amp;lt;ffffffffa6ac65c0&amp;gt;] ? insert_kthread_work+0x40/0x40
[    0.000000] Initializing cgroup subsys cpuset
[    0.000000] Initializing cgroup subsys cpu

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
<comment id="289525" author="sarah" created="Thu, 14 Jan 2021 19:36:01 +0000"  >&lt;p&gt;Hit the same issue in master zfs failover testing:&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/test_sets/7e12ed8b-b695-46d6-9103-a35b55b378a0&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/7e12ed8b-b695-46d6-9103-a35b55b378a0&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="289616" author="jamesanunez" created="Fri, 15 Jan 2021 16:23:20 +0000"  >&lt;p&gt;We&#8217;ve seen this crash for several test suites:&lt;br/&gt;
Lustre 2.13.56.23 - mds-survey test_1 - &lt;a href=&quot;https://testing.whamcloud.com/test_sets/edd10cda-ed14-4cd3-a1b7-6df26a6cc734&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/edd10cda-ed14-4cd3-a1b7-6df26a6cc734&lt;/a&gt;&lt;br/&gt;
Lustre 2.13.56.44 - parallel-scale-nfsv3 test_compilebench - &lt;a href=&quot;https://testing.whamcloud.com/test_sets/546e48a1-cac0-4c05-866b-5bbe4fd0925f&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/546e48a1-cac0-4c05-866b-5bbe4fd0925f&lt;/a&gt;&lt;br/&gt;
Lustre 2.13.57.53 - recovery-mds-scale test_failover_mds - &lt;a href=&quot;https://testing.whamcloud.com/test_sets/c3d24d75-de7f-497b-af46-452764362666&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/c3d24d75-de7f-497b-af46-452764362666&lt;/a&gt;&lt;br/&gt;
Lustre 2.13.57.53 - recovery-random-scale test_fail_client_mds - &lt;a href=&quot;https://testing.whamcloud.com/test_sets/b9f869f9-3b10-4d3e-9c43-98c5d333eb46&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/b9f869f9-3b10-4d3e-9c43-98c5d333eb46&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="317986" author="sergey" created="Thu, 11 Nov 2021 16:25:27 +0000"  >&lt;p&gt;replay-dual test_21a &lt;a href=&quot;https://testing.whamcloud.com/test_sets/6e31af1a-3eb5-484a-8aca-000eb45c172c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/6e31af1a-3eb5-484a-8aca-000eb45c172c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="318230" author="lixi_wc" created="Mon, 15 Nov 2021 02:44:10 +0000"  >&lt;p&gt;+1 for sanity:413a&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://testing.whamcloud.com/test_sets/91ccf494-32bd-4502-bff5-c9cba0cbc108&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/91ccf494-32bd-4502-bff5-c9cba0cbc108&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="318302" author="lixi_wc" created="Tue, 16 Nov 2021 00:06:35 +0000"  >&lt;p&gt;+1 &lt;a href=&quot;https://testing.whamcloud.com/test_sets/7494598f-5f66-4f53-99c2-a42fd4422b99&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/7494598f-5f66-4f53-99c2-a42fd4422b99&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="318396" author="sebastien" created="Wed, 17 Nov 2021 08:22:52 +0000"  >&lt;p&gt;+1 in recovery-small test_17a&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/test_sets/90191f3b-00b4-45e9-b6b7-be55b45bbedc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/90191f3b-00b4-45e9-b6b7-be55b45bbedc&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="319021" author="bzzz" created="Tue, 23 Nov 2021 19:37:33 +0000"  >&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[ 2868.632037] WARNING: MMP writes to pool &lt;span class=&quot;code-quote&quot;&gt;&apos;lustre-mdt3&apos;&lt;/span&gt; have not succeeded in over 60004 ms; suspending pool. Hrtime 2868632010566&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;this is 60 seconds, right?&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/test_sets/719b272d-26d8-491c-9853-1291a8d4ede6&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/719b272d-26d8-491c-9853-1291a8d4ede6&lt;/a&gt;&lt;/p&gt;</comment>
<comment id="320339" author="sarah" created="Wed, 8 Dec 2021 21:33:40 +0000"  >&lt;p&gt;Hit the same error when doing clean downgrade testing from 2.12.8 back to 2.12.7 zfs.&lt;br/&gt;
The system was first formatted as 2.12.7 zfs and cleanly upgraded to 2.12.8 with no issue; after a clean downgrade back to 2.12.7, the MDS showed the following error when doing some I/O:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[  819.523684] Lustre: MGS: Connection restored to c1610ca8-7c1f-280d-38e2-6e7a30dd0376 (at 10.240.22.6@tcp)
[  826.309413] LustreError: 11-0: lustre-OST0000-osc-MDT0000: operation ost_connect to node 10.240.22.6@tcp failed: rc = -16
[  828.924608] LustreError: 11-0: lustre-OST0000-osc-MDT0000: operation ost_connect to node 10.240.22.6@tcp failed: rc = -16
[  851.395455] LustreError: 11-0: lustre-OST0000-osc-MDT0000: operation ost_connect to node 10.240.22.6@tcp failed: rc = -16
[  876.483447] LustreError: 11-0: lustre-OST0000-osc-MDT0000: operation ost_connect to node 10.240.22.6@tcp failed: rc = -16
[  901.571491] LustreError: 11-0: lustre-OST0000-osc-MDT0000: operation ost_connect to node 10.240.22.6@tcp failed: rc = -16
[  926.659577] LustreError: 11-0: lustre-OST0000-osc-MDT0000: operation ost_connect to node 10.240.22.6@tcp failed: rc = -16
[  951.747496] LustreError: 11-0: lustre-OST0000-osc-MDT0000: operation ost_connect to node 10.240.22.6@tcp failed: rc = -16
[ 1001.924417] LustreError: 11-0: lustre-OST0000-osc-MDT0000: operation ost_connect to node 10.240.22.6@tcp failed: rc = -16
[ 1001.926432] LustreError: Skipped 1 previous similar message
[ 1077.188415] LustreError: 11-0: lustre-OST0000-osc-MDT0000: operation ost_connect to node 10.240.22.6@tcp failed: rc = -16
[ 1077.190448] LustreError: Skipped 2 previous similar messages
[ 1253.326971] Lustre: MGS: Connection restored to 6e2a8795-6a65-b3f1-be54-38d6f84216b5 (at 10.240.22.7@tcp)
[ 1253.335182] Lustre: Skipped 1 previous similar message
[ 1253.383388] Lustre: lustre-MDT0000: Will be in recovery for at least 5:00, or until 1 client reconnects
[ 1253.385279] Lustre: lustre-MDT0000: Denying connection for new client 09f93b4c-bd9c-dc16-257f-001e9b331e63 (at 10.240.22.7@tcp), waiting for 1 known clients (0 recovered, 0 in progress, and 0 evicted) to recover in 4:59
[ 1278.436952] Lustre: lustre-MDT0000: Denying connection for new client 09f93b4c-bd9c-dc16-257f-001e9b331e63 (at 10.240.22.7@tcp), waiting for 1 known clients (0 recovered, 0 in progress, and 0 evicted) to recover in 4:34
[ 1303.459633] Lustre: lustre-MDT0000: Denying connection for new client 09f93b4c-bd9c-dc16-257f-001e9b331e63 (at 10.240.22.7@tcp), waiting for 1 known clients (0 recovered, 0 in progress, and 0 evicted) to recover in 4:09
[ 1328.546430] Lustre: lustre-MDT0000: Denying connection for new client 09f93b4c-bd9c-dc16-257f-001e9b331e63 (at 10.240.22.7@tcp), waiting for 1 known clients (0 recovered, 0 in progress, and 0 evicted) to recover in 3:44
[ 1353.633076] Lustre: lustre-MDT0000: Denying connection for new client 09f93b4c-bd9c-dc16-257f-001e9b331e63 (at 10.240.22.7@tcp), waiting for 1 known clients (0 recovered, 0 in progress, and 0 evicted) to recover in 3:19
[ 1378.715611] Lustre: lustre-MDT0000: Denying connection for new client 09f93b4c-bd9c-dc16-257f-001e9b331e63 (at 10.240.22.7@tcp), waiting for 1 known clients (0 recovered, 0 in progress, and 0 evicted) to recover in 2:54
[ 1403.797922] Lustre: lustre-MDT0000: Denying connection for new client 09f93b4c-bd9c-dc16-257f-001e9b331e63 (at 10.240.22.7@tcp
[ 1529.217734] Lustre: Skipped 2 previous similar messages
[ 1553.300393] Lustre: lustre-MDT0000: recovery is timed out, evict stale exports
[ 1553.301864] Lustre: lustre-MDT0000: disconnecting 1 stale clients
[ 1553.443706] Lustre: lustre-MDT0000: Recovery over after 5:00, of 1 clients 0 recovered and 1 was evicted.
[ 1555.298604] Lustre: lustre-MDT0000: Connection restored to 6e2a8795-6a65-b3f1-be54-38d6f84216b5 (at 10.240.22.7@tcp)
[ 5227.763272] WARNING: MMP writes to pool &apos;lustre-mdt1&apos; have not succeeded in over 5s; suspending pool
[ 5227.768866] Kernel panic - not syncing: Pool &apos;lustre-mdt1&apos; has encountered an uncorrectable I/O failure and the failure mode property for this pool is set to panic.
[ 5227.771388] CPU: 0 PID: 20720 Comm: mmp Kdump: loaded Tainted: P           OE  ------------   3.10.0-1160.25.1.el7_lustre.x86_64 #1
[ 5227.773400] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
[ 5227.774396] Call Trace:
[ 5227.774911]  [&amp;lt;ffffffffb8b8311a&amp;gt;] dump_stack+0x19/0x1b
[ 5227.775815]  [&amp;lt;ffffffffb8b7c672&amp;gt;] panic+0xe8/0x21f
[ 5227.777042]  [&amp;lt;ffffffffc0902446&amp;gt;] zio_suspend+0x116/0x120 [zfs]
[ 5227.778109]  [&amp;lt;ffffffffc0888a4c&amp;gt;] mmp_thread+0x41c/0x4c0 [zfs]
[ 5227.779164]  [&amp;lt;ffffffffc0888630&amp;gt;] ? mmp_write_done+0x140/0x140 [zfs]
[ 5227.780316]  [&amp;lt;ffffffffc0755063&amp;gt;] thread_generic_wrapper+0x73/0x80 [spl]
[ 5227.781478]  [&amp;lt;ffffffffc0754ff0&amp;gt;] ? __thread_exit+0x20/0x20 [spl]
[ 5227.782565]  [&amp;lt;ffffffffb84c5da1&amp;gt;] kthread+0xd1/0xe0
[ 5227.783424]  [&amp;lt;ffffffffb84c5cd0&amp;gt;] ? insert_kthread_work+0x40/0x40
[ 5227.784487]  [&amp;lt;ffffffffb8b95df7&amp;gt;] ret_from_fork_nospec_begin+0x21/0x21
[ 5227.785622]  [&amp;lt;ffffffffb84c5cd0&amp;gt;] ? insert_kthread_work+0x40/0x40
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Tried a second time; this time, with no I/O, the MDS could not be mounted and dmesg showed &quot;pool suspended&quot;, similar to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9845&quot; title=&quot;ost-pools test_22 hangs with &#8216;WARNING: Pool &amp;#39;lustre-mdt1&amp;#39; has encountered an uncorrectable I/O failure and has been suspended.&#8217;&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9845&quot;&gt;&lt;del&gt;LU-9845&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[  100.065432] Key type id_resolver registered
[  100.066277] Key type id_legacy registered
[  358.013062] Rounding down aligned max_sectors from 4294967295 to 4294967288
[  358.039021] Loading iSCSI transport class v2.0-870.
[  358.050683] iscsi: registered transport (iser)
[  358.107129] RPC: Registered rdma transport module.
[  358.108114] RPC: Registered rdma backchannel transport module.
[  574.759980] spl: loading out-of-tree module taints kernel.
[  574.763735] spl: module verification failed: signature and/or required key missing - tainting kernel
[  574.770148] SPL: Loaded module v0.7.13-1
[  574.771122] znvpair: module license &apos;CDDL&apos; taints kernel.
[  574.772226] Disabling lock debugging due to kernel taint
[  576.704935] ZFS: Loaded module v0.7.13-1, ZFS pool version 5000, ZFS filesystem version 5
[ 3621.200468] WARNING: MMP writes to pool &apos;lustre-mdt1&apos; have not succeeded in over 5s; suspending pool
[ 3621.202323] WARNING: Pool &apos;lustre-mdt1&apos; has encountered an uncorrectable I/O failure and has been suspended.


[root@onyx-20vm6 ~]# zpool list
NAME          SIZE  ALLOC   FREE  EXPANDSZ   FRAG    CAP  DEDUP  HEALTH  ALTROOT
lustre-mdt1  5.97G  99.9M  5.87G         -    40%     1%  1.00x  SUSPENDED  -
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
<comment id="320952" author="sarah" created="Wed, 15 Dec 2021 17:28:10 +0000"  >&lt;p&gt;The same failure happened in a rolling upgrade from 2.10.8 zfs to 2.12.8 zfs (all servers and clients were upgraded). In sanity test_101d, the MDS and OSS both crashed. Crash dumps can be found at /scratch/dumps/onyx-20vm6.onyx.whamcloud.com/10.240.22.5-2021-12-15-00:43:14&lt;br/&gt;
and /scratch/dumps/onyx-20vm7.onyx.whamcloud.com/10.240.22.6-2021-12-15-00:43:14&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[ 6189.196087] Lustre: DEBUG MARKER: == sanity test 101d: file read with and without read-ahead enabled =================================== 00:42:59 (1639528979)
[ 6210.315899] Lustre: 31346:0:(client.c:2169:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1639528993/real 1639528993]  req@ffff94cc84058480 x1719168217710592/t0(0) o400-&amp;gt;MGC10.240.22.5@tcp@10.240.22.5@tcp:26/25 lens 224/224 e 0 to 1 dl 1639529000 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1
[ 6210.315901] Lustre: 31347:0:(client.c:2169:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1639528993/real 1639528993]  req@ffff94cc8405b180 x1719168217710656/t0(0) o400-&amp;gt;lustre-MDT0000-mdc-ffff94cc847e2800@10.240.22.5@tcp:12/10 lens 224/224 e 0 to 1 dl 1639529000 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1
[ 6210.320698] Lustre: lustre-MDT0000-mdc-ffff94cc847e2800: Connection to lustre-MDT0000 (at 10.240.22.5@tcp) was lost; in progress operations using this service will wait for recovery to complete
[ 6210.333787] Lustre: 31346:0:(client.c:2169:ptlrpc_expire_one_request()) Skipped 1 previous similar message
[ 6210.335541] LustreError: 166-1: MGC10.240.22.5@tcp: Connection to MGS (at 10.240.22.5@tcp) was lost; in progress operations using this service will fail
[ 6217.320480] Lustre: 31347:0:(client.c:2169:ptlrpc_expire_one_request()) @@@ Request sent has timed out for sent delay: [sent 1639529000/real 0]  req@ffff94cc8405b600 x1719168217710976/t0(0) o400-&amp;gt;lustre-OST0000-osc-ffff94cc847e2800@10.240.22.6@tcp:28/4 lens 224/224 e 0 to 1 dl 1639529007 ref 2 fl Rpc:XN/0/ffffffff rc 0/-1
[ 6217.325616] Lustre: 31347:0:(client.c:2169:ptlrpc_expire_one_request()) Skipped 1 previous similar message
[ 6223.326129] Lustre: 31347:0:(client.c:2169:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1639528981/real 1639528981]  req@ffff94cc849ad680 x1719168217708864/t0(0) o4-&amp;gt;lustre-OST0000-osc-ffff94cc847e2800@10.240.22.6@tcp:6/4 lens 488/448 e 1 to 1 dl 1639529013 ref 2 fl Rpc:X/0/ffffffff rc 0/-1
[ 6360.063126] INFO: task lctl:32059 blocked for more than 120 seconds.
[ 6360.064419] &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
[ 6360.065900] lctl            D ffff94ccfaae1080     0 32059  31925 0x00000080
[ 6360.067313] Call Trace:
[ 6360.067853]  [&amp;lt;ffffffffa3789179&amp;gt;] schedule+0x29/0x70
[ 6360.068778]  [&amp;lt;ffffffffa3786e41&amp;gt;] schedule_timeout+0x221/0x2d0
[ 6360.069878]  [&amp;lt;ffffffffa30dae02&amp;gt;] ? default_wake_function+0x12/0x20
[ 6360.071048]  [&amp;lt;ffffffffa30d30c2&amp;gt;] ? __wake_up_common+0x82/0x120
[ 6360.072146]  [&amp;lt;ffffffffa378952d&amp;gt;] wait_for_completion+0xfd/0x140
[ 6360.073285]  [&amp;lt;ffffffffa30dadf0&amp;gt;] ? wake_up_state+0x20/0x20
[ 6360.074527]  [&amp;lt;ffffffffc0a4b5fd&amp;gt;] __ldlm_bl_to_thread+0xad/0x150 [ptlrpc]
[ 6360.075790]  [&amp;lt;ffffffffc0a4bc2b&amp;gt;] ldlm_bl_to_thread+0x33b/0x510 [ptlrpc]
[ 6360.077046]  [&amp;lt;ffffffffc0a50689&amp;gt;] ldlm_bl_to_thread_list+0x19/0x20 [ptlrpc]
[ 6360.078325]  [&amp;lt;ffffffffc0a49cb6&amp;gt;] ldlm_cancel_lru+0x76/0x180 [ptlrpc]
[ 6360.079527]  [&amp;lt;ffffffffc0a3ab97&amp;gt;] lru_size_store+0x67/0x430 [ptlrpc]
[ 6360.080871]  [&amp;lt;ffffffffc0866e7a&amp;gt;] lustre_attr_store+0x1a/0x20 [obdclass]
[ 6360.082113]  [&amp;lt;ffffffffa32dba62&amp;gt;] sysfs_kf_write+0x42/0x50
[ 6360.083134]  [&amp;lt;ffffffffa32db04b&amp;gt;] kernfs_fop_write+0xeb/0x160
[ 6360.084200]  [&amp;lt;ffffffffa324e590&amp;gt;] vfs_write+0xc0/0x1f0
[ 6360.085161]  [&amp;lt;ffffffffa3795ed5&amp;gt;] ? system_call_after_swapgs+0xa2/0x13a
[ 6360.086367]  [&amp;lt;ffffffffa324f36f&amp;gt;] SyS_write+0x7f/0xf0
[ 6360.087314]  [&amp;lt;ffffffffa3795ed5&amp;gt;] ? system_call_after_swapgs+0xa2/0x13a
[ 6360.088535]  [&amp;lt;ffffffffa3795f92&amp;gt;] system_call_fastpath+0x25/0x2a
[ 6360.089640]  [&amp;lt;ffffffffa3795ed5&amp;gt;] ? system_call_after_swapgs+0xa2/0x13a
[ 6480.083025] INFO: task lctl:32059 blocked for more than 120 seconds.
[ 6480.084299] &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[14969.401446] Lustre: DEBUG MARKER: == sanity test 101d: file read with and without read-ahead enabled =================================== 00:42:59 (1639528979)
[14979.793364] WARNING: MMP writes to pool &apos;lustre-mdt1&apos; have not succeeded in over 5s; suspending pool
[14979.795218] Kernel panic - not syncing: Pool &apos;lustre-mdt1&apos; has encountered an uncorrectable I/O failure and the failure mode property for this pool is set to panic.
[14979.797904] CPU: 0 PID: 17374 Comm: mmp Kdump: loaded Tainted: P           OE  ------------   3.10.0-1160.45.1.el7_lustre.x86_64 #1
[14979.800067] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
[14979.801131] Call Trace:
[14979.801670]  [&amp;lt;ffffffffbab83539&amp;gt;] dump_stack+0x19/0x1b
[14979.802662]  [&amp;lt;ffffffffbab7d241&amp;gt;] panic+0xe8/0x21f
[14979.803818]  [&amp;lt;ffffffffc0829456&amp;gt;] zio_suspend+0x116/0x120 [zfs]
[14979.804952]  [&amp;lt;ffffffffc07afa4c&amp;gt;] mmp_thread+0x41c/0x4c0 [zfs]
[14979.806072]  [&amp;lt;ffffffffc07af630&amp;gt;] ? mmp_write_done+0x140/0x140 [zfs]
[14979.807287]  [&amp;lt;ffffffffc0672063&amp;gt;] thread_generic_wrapper+0x73/0x80 [spl]
[14979.808531]  [&amp;lt;ffffffffc0671ff0&amp;gt;] ? __thread_exit+0x20/0x20 [spl]
[14979.809705]  [&amp;lt;ffffffffba4c5e61&amp;gt;] kthread+0xd1/0xe0
[14979.810612]  [&amp;lt;ffffffffba4c5d90&amp;gt;] ? insert_kthread_work+0x40/0x40
[14979.811761]  [&amp;lt;ffffffffbab95df7&amp;gt;] ret_from_fork_nospec_begin+0x21/0x21
[14979.812969]  [&amp;lt;ffffffffba4c5d90&amp;gt;] ? insert_kthread_work+0x40/0x40
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="67285">LU-15261</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="52907">LU-11217</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="55605">LU-12281</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="55860">LU-12393</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="47700">LU-9845</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="58070">LU-13242</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzzwdr:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>