<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:29:47 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-9845] ost-pools test_22 hangs with &#8216;WARNING: Pool &apos;lustre-mdt1&apos; has encountered an uncorrectable I/O failure and has been suspended.&#8217;</title>
                <link>https://jira.whamcloud.com/browse/LU-9845</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;ost-pools test 22 hangs when testing with ZFS 0.7.0. The last lines in the test_log before the hang are:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;loop for testpool2 complete
total: 100 open/close in 0.09 seconds: 1061.34 ops/second
CMD: onyx-35vm9 lctl pool_remove lustre.testpool OST[0001-0001/1]
onyx-35vm9: OST lustre-OST0001_UUID removed from pool lustre.testpool
Pool testpool, iteration 2
CMD: onyx-35vm9 lctl pool_remove lustre.testpool OST[0001-0001/1]
onyx-35vm9: OST lustre-OST0001_UUID removed from pool lustre.testpool
Pool testpool, iteration 3
CMD: onyx-35vm9 lctl pool_remove lustre.testpool OST[0001-0001/1]
onyx-35vm9: OST lustre-OST0001_UUID removed from pool lustre.testpool
Pool testpool, iteration 4
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;


&lt;p&gt;In the MDS console logs, we see:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;04:17:47:[11669.022658] Lustre: DEBUG MARKER: lctl pool_remove lustre.testpool OST[0001-0001/1]
04:17:47:[11677.206802] Lustre: DEBUG MARKER: lctl pool_add lustre.testpool OST[0000-0001/1]
04:17:47:[11687.391683] Lustre: DEBUG MARKER: lctl pool_remove lustre.testpool OST[0001-0001/1]
04:17:47:[11693.557227] WARNING: Pool &apos;lustre-mdt1&apos; has encountered an uncorrectable I/O failure and has been suspended.
04:17:47:[11693.557227] 
04:17:47:[11697.575911] Lustre: DEBUG MARKER: lctl pool_add lustre.testpool OST[0000-0001/1]
04:17:47:[15221.223579] SysRq : Show State
04:17:47:[15221.224071]   task                        PC stack   pid father
04:17:47:[15221.224729] systemd         S 0000000000000000     0     1      0 0x00000000
04:17:47:[15221.225617]  ffff88007c7bbdb8 0000000000000082 ffff88007c7b0000 ffff88007c7bbfd8
04:17:47:[15221.226469]  ffff88007c7bbfd8 ffff88007c7bbfd8 ffff88007c7b0000 0000000000000000
04:17:47:[15221.227325]  0000000000000000 ffff8800369d1a20 ffff88007c7b0000 0000000000000000
04:17:47:[15221.228217] Call Trace:
04:17:47:[15221.228581]  [&amp;lt;ffffffff8168ca49&amp;gt;] schedule+0x29/0x70
04:17:47:[15221.229094]  [&amp;lt;ffffffff8168bb4d&amp;gt;] schedule_hrtimeout_range_clock+0x12d/0x150
04:17:47:[15221.229864]  [&amp;lt;ffffffff812486f9&amp;gt;] ? ep_scan_ready_list.isra.7+0x1b9/0x1f0
04:17:47:[15221.230669]  [&amp;lt;ffffffff8168bb83&amp;gt;] schedule_hrtimeout_range+0x13/0x20
04:17:47:[15221.231363]  [&amp;lt;ffffffff8124898e&amp;gt;] ep_poll+0x23e/0x360
04:17:47:[15221.231927]  [&amp;lt;ffffffff81200873&amp;gt;] ? __fput+0x183/0x260
04:17:47:[15221.232481]  [&amp;lt;ffffffff810c54e0&amp;gt;] ? wake_up_state+0x20/0x20
04:17:47:[15221.233121]  [&amp;lt;ffffffff81249bfd&amp;gt;] SyS_epoll_wait+0xed/0x120
04:17:47:[15221.233737]  [&amp;lt;ffffffff81697a49&amp;gt;] system_call_fastpath+0x16/0x1b
&#8230;
04:17:47:[15222.536161] cfs_rh_00       S ffff88007a8667b0     0  2617      2 0x00000080
04:17:47:[15222.536916]  ffff8800793ebe28 0000000000000046 ffff88000001bec0 ffff8800793ebfd8
04:17:47:[15222.537791]  ffff8800793ebfd8 ffff8800793ebfd8 ffff88000001bec0 ffff88007a866780
04:17:47:[15222.538646]  ffff88007a866798 ffff88007a866790 0000000000000000 ffff88007a8667b0
04:17:47:[15222.539495] Call Trace:
04:17:47:[15222.539749]  [&amp;lt;ffffffff8168ca49&amp;gt;] schedule+0x29/0x70
04:17:47:[15222.540295]  [&amp;lt;ffffffffa05cab58&amp;gt;] cfs_wi_scheduler+0x2f8/0x440 [libcfs]
04:17:47:[15222.541047]  [&amp;lt;ffffffff810b1b20&amp;gt;] ? wake_up_atomic_t+0x30/0x30
04:17:47:[15222.541647]  [&amp;lt;ffffffffa05ca860&amp;gt;] ? cfs_wi_schedule+0x190/0x190 [libcfs]
04:17:47:[15222.542384]  [&amp;lt;ffffffff810b0a4f&amp;gt;] kthread+0xcf/0xe0
04:17:47:[15222.542905]  [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread_create_on_node+0x140/0x140
04:17:47:[15222.543561]  [&amp;lt;ffffffff81697998&amp;gt;] ret_from_fork+0x58/0x90
04:17:47:[15222.544161]  [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread_create_on_node+0x140/0x140
04:17:47:[15222.544827] cfs_rh_01       S ffff88007a8667b0     0  2618      2 0x00000080
04:17:47:[15222.545646]  ffff88007b4ffe28 0000000000000046 ffff880000018fb0 ffff88007b4fffd8
04:17:47:[15222.546465]  ffff88007b4fffd8 ffff88007b4fffd8 ffff880000018fb0 ffff88007a866780
04:17:47:[15222.547284]  ffff88007a866798 ffff88007a866790 0000000000000000 ffff88007a8667b0
04:17:47:[15222.548152] Call Trace:
04:17:47:[15222.548437]  [&amp;lt;ffffffff8168ca49&amp;gt;] schedule+0x29/0x70
04:17:47:[15222.549020]  [&amp;lt;ffffffffa05cab58&amp;gt;] cfs_wi_scheduler+0x2f8/0x440 [libcfs]
04:17:47:[15222.549696]  [&amp;lt;ffffffff810b1b20&amp;gt;] ? wake_up_atomic_t+0x30/0x30
04:17:47:[15222.550348]  [&amp;lt;ffffffffa05ca860&amp;gt;] ? cfs_wi_schedule+0x190/0x190 [libcfs]
04:17:47:[15222.551049]  [&amp;lt;ffffffff810b0a4f&amp;gt;] kthread+0xcf/0xe0
04:17:47:[15222.551554]  [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread_create_on_node+0x140/0x140
04:17:47:[15222.552281]  [&amp;lt;ffffffff81697998&amp;gt;] ret_from_fork+0x58/0x90
04:17:47:[15222.552858]  [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread_create_on_node+0x140/0x140
04:17:47:[15222.553513] spl_system_task S ffff8800791e43c0     0  3362      2 0x00000080
04:17:47:[15222.554288]  ffff88007bae3e00 0000000000000046 ffff88007a24af10 ffff88007bae3fd8
04:17:47:[15222.555149]  ffff88007bae3fd8 ffff88007bae3fd8 ffff88007a24af10 ffff8800791e4300
04:17:47:[15222.556011]  ffff88007a9c6de0 ffff8800791e4390 ffff8800366fbd30 ffff8800791e43c0
04:17:47:[15222.556870] Call Trace:
04:17:47:[15222.557125]  [&amp;lt;ffffffff8168ca49&amp;gt;] schedule+0x29/0x70
04:17:47:[15222.557656]  [&amp;lt;ffffffffa061f0a9&amp;gt;] taskq_thread+0x429/0x470 [spl]
04:17:47:[15222.558369]  [&amp;lt;ffffffff810c54e0&amp;gt;] ? wake_up_state+0x20/0x20
04:17:47:[15222.558965]  [&amp;lt;ffffffffa061ec80&amp;gt;] ? taskq_thread_spawn+0x60/0x60 [spl]
04:17:47:[15222.559630]  [&amp;lt;ffffffff810b0a4f&amp;gt;] kthread+0xcf/0xe0
04:17:47:[15222.560181]  [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread_create_on_node+0x140/0x140
04:17:47:[15222.560842]  [&amp;lt;ffffffff81697998&amp;gt;] ret_from_fork+0x58/0x90
04:17:47:[15222.561451]  [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread_create_on_node+0x140/0x140
04:17:47:[15222.562133] spl_delay_taskq S ffff88007b8fe4c0     0  3363      2 0x00000080
04:17:47:[15222.562894]  ffff88007a907e00 0000000000000046 ffff88007a24bec0 ffff88007a907fd8
04:17:47:[15222.563748]  ffff88007a907fd8 ffff88007a907fd8 ffff88007a24bec0 ffff88007b8fe400
04:17:47:[15222.564605]  ffff88007a9c6f00 ffff88007b8fe490 0000000000000000 ffff88007b8fe4c0
04:17:47:[15222.565474] Call Trace:
04:17:47:[15222.565733]  [&amp;lt;ffffffff8168ca49&amp;gt;] schedule+0x29/0x70
04:17:47:[15222.566242]  [&amp;lt;ffffffffa061f0a9&amp;gt;] taskq_thread+0x429/0x470 [spl]
04:17:47:[15222.566941]  [&amp;lt;ffffffff810c54e0&amp;gt;] ? wake_up_state+0x20/0x20
04:17:47:[15222.567515]  [&amp;lt;ffffffffa061ec80&amp;gt;] ? taskq_thread_spawn+0x60/0x60 [spl]
04:17:47:[15222.568229]  [&amp;lt;ffffffff810b0a4f&amp;gt;] kthread+0xcf/0xe0
04:17:47:[15222.568761]  [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread_create_on_node+0x140/0x140
04:17:47:[15222.569429]  [&amp;lt;ffffffff81697998&amp;gt;] ret_from_fork+0x58/0x90
04:17:47:[15222.570037]  [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread_create_on_node+0x140/0x140
04:17:47:[15222.570703] spl_dynamic_tas S ffff88007b8fe2c0     0  3364      2 0x00000080
04:17:47:[15222.571482]  ffff88007b28fe00 0000000000000046 ffff88007a248000 ffff88007b28ffd8
04:17:47:[15222.572328]  ffff88007b28ffd8 ffff88007b28ffd8 ffff88007a248000 ffff88007b8fe200
04:17:47:[15222.573178]  ffff88007a9c6c60 ffff88007b8fe290 ffff8800668c9670 ffff88007b8fe2c0
04:17:47:[15222.574045] Call Trace:
04:17:47:[15222.574314]  [&amp;lt;ffffffff8168ca49&amp;gt;] schedule+0x29/0x70
04:17:47:[15222.574893]  [&amp;lt;ffffffffa061f0a9&amp;gt;] taskq_thread+0x429/0x470 [spl]
04:17:47:[15222.575505]  [&amp;lt;ffffffff810c54e0&amp;gt;] ? wake_up_state+0x20/0x20
04:17:47:[15222.576122]  [&amp;lt;ffffffffa061ec80&amp;gt;] ? taskq_thread_spawn+0x60/0x60 [spl]
04:17:47:[15222.576785]  [&amp;lt;ffffffff810b0a4f&amp;gt;] kthread+0xcf/0xe0
04:17:47:[15222.577352]  [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread_create_on_node+0x140/0x140
04:17:47:[15222.578036]  [&amp;lt;ffffffff81697998&amp;gt;] ret_from_fork+0x58/0x90
04:17:47:[15222.578596]  [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread_create_on_node+0x140/0x140
04:17:47:[15222.579309] spl_kmem_cache  S ffff88007b8feec0     0  3365      2 0x00000080
04:17:47:[15222.580047]  ffff88007ba3fe00 0000000000000046 ffff88007a24ce70 ffff88007ba3ffd8
04:17:47:[15222.580902]  ffff88007ba3ffd8 ffff88007ba3ffd8 ffff88007a24ce70 ffff88007b8fee00
04:17:47:[15222.581750]  ffff88007a9c6a20 ffff88007b8fee90 ffff8800387fe800 ffff88007b8feec0
04:17:47:[15222.582597] Call Trace:
04:17:47:[15222.582853]  [&amp;lt;ffffffff8168ca49&amp;gt;] schedule+0x29/0x70
04:17:47:[15222.583358]  [&amp;lt;ffffffffa061f0a9&amp;gt;] taskq_thread+0x429/0x470 [spl]
04:17:47:[15222.584057]  [&amp;lt;ffffffff810c54e0&amp;gt;] ? wake_up_state+0x20/0x20
04:17:47:[15222.584645]  [&amp;lt;ffffffffa061ec80&amp;gt;] ? taskq_thread_spawn+0x60/0x60 [spl]
04:17:47:[15222.585372]  [&amp;lt;ffffffff810b0a4f&amp;gt;] kthread+0xcf/0xe0
04:17:47:[15222.585891]  [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread_create_on_node+0x140/0x140
04:17:47:[15222.586549]  [&amp;lt;ffffffff81697998&amp;gt;] ret_from_fork+0x58/0x90
04:17:47:[15222.587153]  [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread_create_on_node+0x140/0x140
04:17:47:[15222.587820] zvol            S ffff88007b5abbc0     0  3366      2 0x00000080
04:17:47:[15222.588648]  ffff88007b29be00 0000000000000046 ffff88007a249f60 ffff88007b29bfd8
04:17:47:[15222.589473]  ffff88007b29bfd8 ffff88007b29bfd8 ffff88007a249f60 ffff88007b5abb00
04:17:47:[15222.590300]  ffff8800364fa6c0 ffff88007b5abb90 0000000000000000 ffff88007b5abbc0
04:17:47:[15222.591167] Call Trace:
04:17:47:[15222.591437]  [&amp;lt;ffffffff8168ca49&amp;gt;] schedule+0x29/0x70
04:17:47:[15222.592015]  [&amp;lt;ffffffffa061f0a9&amp;gt;] taskq_thread+0x429/0x470 [spl]
04:17:47:[15222.592643]  [&amp;lt;ffffffff810c54e0&amp;gt;] ? wake_up_state+0x20/0x20
04:17:47:[15222.593264]  [&amp;lt;ffffffffa061ec80&amp;gt;] ? taskq_thread_spawn+0x60/0x60 [spl]
04:17:47:[15222.593951]  [&amp;lt;ffffffff810b0a4f&amp;gt;] kthread+0xcf/0xe0
04:17:47:[15222.594450]  [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread_create_on_node+0x140/0x140
04:17:47:[15222.595156]  [&amp;lt;ffffffff81697998&amp;gt;] ret_from_fork+0x58/0x90
04:17:47:[15222.595709]  [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread_create_on_node+0x140/0x140
04:17:47:[15222.596425] arc_prune       S ffff880079bc1ac0     0  3372      2 0x00000080
04:17:47:[15222.597166]  ffff88007b8a7e00 0000000000000046 ffff88007a9d8000 ffff88007b8a7fd8
04:17:47:[15222.598033]  ffff88007b8a7fd8 ffff88007b8a7fd8 ffff88007a9d8000 ffff880079bc1a00
04:17:47:[15222.598901]  ffff8800796aad80 ffff880079bc1a90 0000000000000000 ffff880079bc1ac0
04:17:47:[15222.599755] Call Trace:
04:17:47:[15222.600008]  [&amp;lt;ffffffff8168ca49&amp;gt;] schedule+0x29/0x70
04:17:47:[15222.600537]  [&amp;lt;ffffffffa061f0a9&amp;gt;] taskq_thread+0x429/0x470 [spl]
04:17:47:[15222.601256]  [&amp;lt;ffffffff810c54e0&amp;gt;] ? wake_up_state+0x20/0x20
04:17:47:[15222.601857]  [&amp;lt;ffffffffa061ec80&amp;gt;] ? taskq_thread_spawn+0x60/0x60 [spl]
04:17:47:[15222.602514]  [&amp;lt;ffffffff810b0a4f&amp;gt;] kthread+0xcf/0xe0
04:17:47:[15222.603059]  [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread_create_on_node+0x140/0x140
04:17:47:[15222.603722]  [&amp;lt;ffffffff81697998&amp;gt;] ret_from_fork+0x58/0x90
04:17:47:[15222.604329]  [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread_create_on_node+0x140/0x140
04:17:47:[15222.605006] arc_reclaim     S ffffffffa0914a08     0  3373      2 0x00000080
04:17:47:[15222.605764]  ffff88007b933cf8 0000000000000046 ffff88007a9dedd0 ffff88007b933fd8
04:17:47:[15222.606622]  ffff88007b933fd8 ffff88007b933fd8 ffff88007a9dedd0 ffff88007b933dc8
04:17:47:[15222.607473]  00000000000186a0 0000000000000001 ffffffffa0914a88 ffffffffa0914a08
04:17:47:[15222.608292] Call Trace:
04:17:47:[15222.608635]  [&amp;lt;ffffffff8168ca49&amp;gt;] schedule+0x29/0x70
04:17:47:[15222.609129]  [&amp;lt;ffffffff8168bad2&amp;gt;] schedule_hrtimeout_range_clock+0xb2/0x150
04:17:47:[15222.609843]  [&amp;lt;ffffffff810b4640&amp;gt;] ? hrtimer_get_res+0x50/0x50
04:17:47:[15222.610414]  [&amp;lt;ffffffff8168bac6&amp;gt;] ? schedule_hrtimeout_range_clock+0xa6/0x150
04:17:47:[15222.611121]  [&amp;lt;ffffffff8168bb83&amp;gt;] schedule_hrtimeout_range+0x13/0x20
04:17:47:[15222.611769]  [&amp;lt;ffffffffa06236ab&amp;gt;] cv_timedwait_hires_common+0x12b/0x220 [spl]
04:17:47:[15222.612503]  [&amp;lt;ffffffff810b1b20&amp;gt;] ? wake_up_atomic_t+0x30/0x30
04:17:47:[15222.613080]  [&amp;lt;ffffffffa06237d4&amp;gt;] cv_timedwait_sig_hires+0x14/0x20 [spl]
04:17:47:[15222.613844]  [&amp;lt;ffffffffa071ad31&amp;gt;] arc_reclaim_thread+0x121/0x260 [zfs]
04:17:47:[15222.614529]  [&amp;lt;ffffffffa071ac10&amp;gt;] ? arc_shrink+0xc0/0xc0 [zfs]
04:17:47:[15222.615337]  [&amp;lt;ffffffffa061df91&amp;gt;] thread_generic_wrapper+0x71/0x80 [spl]
04:17:47:[15222.616077]  [&amp;lt;ffffffffa061df20&amp;gt;] ? __thread_exit+0x20/0x20 [spl]
04:17:47:[15222.616766]  [&amp;lt;ffffffff810b0a4f&amp;gt;] kthread+0xcf/0xe0
04:17:47:[15222.617319]  [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread_create_on_node+0x140/0x140
04:17:47:[15222.618001]  [&amp;lt;ffffffff81697998&amp;gt;] ret_from_fork+0x58/0x90
04:17:47:[15222.618548]  [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread_create_on_node+0x140/0x140
04:17:47:[15222.619262] dbu_evict       S ffff88003641fbc0     0  3374      2 0x00000080
04:17:47:[15222.620014]  ffff88000005be00 0000000000000046 ffff88007a9daf10 ffff88000005bfd8
04:17:47:[15222.620866]  ffff88000005bfd8 ffff88000005bfd8 ffff88007a9daf10 ffff88003641fb00
04:17:47:[15222.621722]  ffff8800796aad20 ffff88003641fb90 ffff88004caa2070 ffff88003641fbc0
04:17:47:[15222.622573] Call Trace:
04:17:47:[15222.622826]  [&amp;lt;ffffffff8168ca49&amp;gt;] schedule+0x29/0x70
04:17:47:[15222.623336]  [&amp;lt;ffffffffa061f0a9&amp;gt;] taskq_thread+0x429/0x470 [spl]
04:17:47:[15222.624031]  [&amp;lt;ffffffff810c54e0&amp;gt;] ? wake_up_state+0x20/0x20
04:17:47:[15222.624606]  [&amp;lt;ffffffffa061ec80&amp;gt;] ? taskq_thread_spawn+0x60/0x60 [spl]
04:17:47:[15222.625315]  [&amp;lt;ffffffff810b0a4f&amp;gt;] kthread+0xcf/0xe0
04:17:47:[15222.625832]  [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread_create_on_node+0x140/0x140
04:17:47:[15222.626491]  [&amp;lt;ffffffff81697998&amp;gt;] ret_from_fork+0x58/0x90
04:17:47:[15222.627083]  [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread_create_on_node+0x140/0x140
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;From ost-pools.sh, &lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;1196 test_22() {
1197         local POOL_ROOT=${POOL_ROOT:-$DIR/$tdir}
1198         [[ $OSTCOUNT -le 1 ]] &amp;amp;&amp;amp; skip_env &lt;span class=&quot;code-quote&quot;&gt;&quot;Need at least 2 OSTs&quot;&lt;/span&gt; &amp;amp;&amp;amp; &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt;
1199 
1200         local numfiles=100
1201 
1202         create_pool_nofail $POOL
1203         add_pool $POOL &lt;span class=&quot;code-quote&quot;&gt;&quot;OST0000&quot;&lt;/span&gt; &lt;span class=&quot;code-quote&quot;&gt;&quot;$FSNAME-OST0000_UUID &quot;&lt;/span&gt;
1204         create_pool_nofail $POOL2
1205         add_pool $POOL2 &lt;span class=&quot;code-quote&quot;&gt;&quot;OST0000&quot;&lt;/span&gt; &lt;span class=&quot;code-quote&quot;&gt;&quot;$FSNAME-OST0000_UUID &quot;&lt;/span&gt;
1206 
1207         add_loop $POOL 1 &amp;amp;
1208         add_loop $POOL2 2 &amp;amp;
1209         sleep 5
1210         create_dir $POOL_ROOT $POOL
1211         createmany -o $POOL_ROOT/${tfile} $numfiles ||
1212                 error &lt;span class=&quot;code-quote&quot;&gt;&quot;createmany $POOL_ROOT/${tfile} failed!&quot;&lt;/span&gt;
1213         wait
1214 
1215         &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; 0
1216 }
1217 run_test 22 &lt;span class=&quot;code-quote&quot;&gt;&quot;Simultaneous manipulation of a pool&quot;&lt;/span&gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;There&#8217;s only one instance of this failure. Logs are at:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/d0619996-799f-11e7-9b69-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/d0619996-799f-11e7-9b69-5254006e85c2&lt;/a&gt;&lt;/p&gt;</description>
                <environment>ZFS 0.7.x</environment>
        <key id="47700">LU-9845</key>
            <summary>ost-pools test_22 hangs with &#8216;WARNING: Pool &apos;lustre-mdt1&apos; has encountered an uncorrectable I/O failure and has been suspended.&#8217;</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="utopiabound">Nathaniel Clark</assignee>
                                    <reporter username="jamesanunez">James Nunez</reporter>
                        <labels>
                            <label>zfs</label>
                    </labels>
                <created>Tue, 8 Aug 2017 14:30:43 +0000</created>
                <updated>Fri, 11 Sep 2020 20:20:33 +0000</updated>
                            <resolved>Wed, 22 May 2019 20:16:29 +0000</resolved>
                                    <version>Lustre 2.11.0</version>
                    <version>Lustre 2.10.2</version>
                    <version>Lustre 2.10.3</version>
                    <version>Lustre 2.10.7</version>
                    <version>Lustre 2.12.3</version>
                                    <fixVersion>Lustre 2.11.0</fixVersion>
                    <fixVersion>Lustre 2.10.4</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>23</watches>
                                                                            <comments>
                            <comment id="205069" author="jay" created="Thu, 10 Aug 2017 18:05:01 +0000"  >&lt;p&gt;new failure at: &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/1831f49c-7d82-11e7-9d88-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/1831f49c-7d82-11e7-9d88-5254006e85c2&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;This time it was in recovery-small test.&lt;/p&gt;</comment>
                            <comment id="208230" author="utopiabound" created="Wed, 13 Sep 2017 12:42:58 +0000"  >&lt;p&gt;sanity/test_116a with ZFS 0.7.1&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/5c39ea02-980b-11e7-b775-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/5c39ea02-980b-11e7-b775-5254006e85c2&lt;/a&gt;&lt;br/&gt;
OSS (onyx-41vm11)&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;18:47:36:[ 2630.525032] Lustre: DEBUG MARKER: == sanity test 116a: stripe QOS: free space balance ================================================== 17:47:09 (1505238429)
18:47:36:[ 2664.291335] WARNING: Pool &apos;lustre-ost1&apos; has encountered an uncorrectable I/O failure and has been suspended.
18:47:36:[ 2664.291335] 
18:47:36:[ 2664.364291] WARNING: Pool &apos;lustre-ost2&apos; has encountered an uncorrectable I/O failure and has been suspended.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="208477" author="adilger" created="Fri, 15 Sep 2017 07:10:00 +0000"  >&lt;p&gt;Hit this again &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/426d4a64-99b0-11e7-ba20-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/426d4a64-99b0-11e7-ba20-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="210112" author="yujian" created="Mon, 2 Oct 2017 18:31:03 +0000"  >&lt;p&gt;sanity test_101d: &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/5e3dbaaa-a73c-11e7-b786-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/5e3dbaaa-a73c-11e7-b786-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="210155" author="yujian" created="Tue, 3 Oct 2017 01:26:44 +0000"  >&lt;p&gt;replay-single test_70c: &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/eb3eccaa-a7b4-11e7-bb19-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/eb3eccaa-a7b4-11e7-bb19-5254006e85c2&lt;/a&gt;&lt;br/&gt;
replay-single test_101: &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/6bcb137a-a958-11e7-bb19-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/6bcb137a-a958-11e7-bb19-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="211935" author="bogl" created="Wed, 25 Oct 2017 13:32:28 +0000"  >&lt;p&gt;similar error seen on master in conf-sanity, test_106:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/09fd20c4-b935-11e7-9d39-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/09fd20c4-b935-11e7-9d39-52540065bddc&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="212020" author="bogl" created="Wed, 25 Oct 2017 23:24:38 +0000"  >&lt;p&gt;another on master, this time in conf-sanity, test_50f:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/71e5d1e8-b9cd-11e7-84a9-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/71e5d1e8-b9cd-11e7-84a9-52540065bddc&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;seems to be happening a lot now.&lt;br/&gt;
I wonder if something recent went on master making this more frequent.&lt;/p&gt;</comment>
                            <comment id="213568" author="jhammond" created="Mon, 13 Nov 2017 15:58:16 +0000"  >&lt;p&gt;Nathaniel,&lt;/p&gt;

&lt;p&gt;It&apos;s really strange to see the &quot;Pool &apos;blahblah&apos; has encountered...&apos; message with no other messages indicating an I/O error before or after it. Is this a known phenomenon to the ZFS folks?&lt;/p&gt;</comment>
                            <comment id="213603" author="utopiabound" created="Mon, 13 Nov 2017 19:51:08 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=jhammond&quot; class=&quot;user-hover&quot; rel=&quot;jhammond&quot;&gt;jhammond&lt;/a&gt;,&lt;/p&gt;

&lt;p&gt;I&apos;m not seeing this as a known issue, but I also haven&apos;t gotten to dig into one of the fubar&apos;d zpools.&lt;/p&gt;</comment>
                            <comment id="213794" author="utopiabound" created="Wed, 15 Nov 2017 17:01:15 +0000"  >&lt;p&gt;Reproduced with Lustre 2.10.55 ZFS 0.7.3: (test conf-sanity/test_6)&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@onyx-121vm4 ~]# zpool status
  pool: lustre-ost1
 state: ONLINE
status: One or more devices are faulted in response to IO failures.
action: Make sure the affected devices are connected, then run &apos;zpool clear&apos;.
   see: http://zfsonlinux.org/msg/ZFS-8000-HC
  scan: none requested
config:

	NAME                STATE     READ WRITE CKSUM
	lustre-ost1         ONLINE       0     0     0
	  /tmp/lustre-ost1  ONLINE       0     0     0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Corresponding messages:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Nov 15 15:40:49 onyx-121vm4.onyx.hpdd.intel.com kernel: WARNING: Pool &apos;lustre-ost1&apos; has encountered an uncorrectable I/O failure and has been suspended.
Nov 15 15:40:54 onyx-121vm4.onyx.hpdd.intel.com kernel: Lustre: lustre-OST0000 is waiting for obd_unlinked_exports more than 8 seconds. The obd refcount = 6. Is it stuck?
Nov 15 15:40:54 onyx-121vm4.onyx.hpdd.intel.com kernel: Lustre: lustre-OST0000: UNLINKED ffff880017838c00 lustre-MDT0000-mdtlov_UUID 10.2.103.29@tcp 1 (0 1 0) 1 0 0 0:           (null)  0 stale:0
Nov 15 15:40:54 onyx-121vm4.onyx.hpdd.intel.com kernel: Lustre: lustre-OST0000: UNLINKED ffff88007bb6f400 eb0ec782-9708-eeb0-2cc9-b935c49ad433 10.2.103.27@tcp 1 (0 1 0) 1 0 0 0:           (null)  51539607553 stale:0
Nov 15 15:41:10 onyx-121vm4.onyx.hpdd.intel.com kernel: Lustre: lustre-OST0000 is waiting for obd_unlinked_exports more than 16 seconds. The obd refcount = 2. Is it stuck?
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;A ZDB Dump of the pool is at attached:  &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/28730/28730_zdb-lustre-ost1.log&quot; title=&quot;zdb-lustre-ost1.log attached to LU-9845&quot;&gt;zdb-lustre-ost1.log&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt; &lt;/p&gt;

&lt;p&gt;Doing a &lt;tt&gt;zpool clear&lt;/tt&gt; does remove error condition.  It&apos;s weird that there&apos;s no actual error count on any of the devices.&lt;/p&gt;</comment>
                            <comment id="214047" author="yujian" created="Fri, 17 Nov 2017 18:56:01 +0000"  >&lt;p&gt;replay-ost-single test 1 also hit this issue:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/f5106ff6-cbbb-11e7-a066-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/f5106ff6-cbbb-11e7-a066-52540065bddc&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;This is affecting patch review testing on master branch.&lt;/p&gt;</comment>
                            <comment id="214385" author="yujian" created="Wed, 22 Nov 2017 03:15:34 +0000"  >&lt;p&gt;sanity test 181 and conf-sanity test 106 in ZFS test sessions hit this issue:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/ffa41cec-cf21-11e7-8027-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/ffa41cec-cf21-11e7-8027-52540065bddc&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/1c200c8e-cf2a-11e7-a066-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/1c200c8e-cf2a-11e7-a066-52540065bddc&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="214962" author="sarah" created="Wed, 29 Nov 2017 20:46:54 +0000"  >&lt;p&gt;sanity-benchmark test_bonnie, ost-pools test_23b hit the similar issue&lt;br/&gt;
2.10.2 RC1 zfs&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/b94986ae-d4fa-11e7-9c63-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/b94986ae-d4fa-11e7-9c63-52540065bddc&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/bb79157a-d4fa-11e7-9c63-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/bb79157a-d4fa-11e7-9c63-52540065bddc&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;2.10.2 RC1 zfs DNE&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/1aabe594-d69b-11e7-9c63-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/1aabe594-d69b-11e7-9c63-52540065bddc&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="215655" author="jhammond" created="Fri, 8 Dec 2017 14:19:27 +0000"  >&lt;p&gt;&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/81c7aa0e-db9d-11e7-8027-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/81c7aa0e-db9d-11e7-8027-52540065bddc&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="215839" author="tappro" created="Fri, 8 Dec 2017 21:12:17 +0000"  >&lt;p&gt;sanity 101d this time:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/cbf6bcd8-dc2d-11e7-a066-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/cbf6bcd8-dc2d-11e7-a066-52540065bddc&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="215866" author="kim.sebo" created="Sat, 9 Dec 2017 06:57:15 +0000"  >&lt;p&gt;We&apos;re also seeing what seems to be this problem. It only seems to occur during very high MDS loads. It can be reproduced reliably within minutes during a mdtest run. It occurs with all the following software combinations, all with CentOS 7.4 3.10.0-693.2.2 kernel.&lt;/p&gt;

&lt;p&gt;Lustre 2.10.1 DNE, ZFS 0.7.3, Mellanox ofed 4.2.1&lt;br/&gt;
 Lustre 2.10.1 DNE, ZFS 0.7.2, Mellanox ofed 4.2.1&lt;br/&gt;
 Lustre 2.10.1 DNE, ZFS 0.7.2, Mellanox ofed 4.1.1 (locally patched)&lt;/p&gt;

&lt;p&gt;The pool that faults seems random and occurs on all four MDS/MDTs. I/O will resume after a &quot;zpool clear&quot;, until another random MDT pool faults soon after.&lt;/p&gt;

&lt;p&gt;Syslog has only a single entry:&lt;br/&gt;
 Dec 9 15:21:38 g1b-mds02 kernel: WARNING: Pool &apos;zmdt1&apos; has encountered an uncorrectable I/O failure and has been suspended.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;# zpool status
  pool: zmdt1
 state: ONLINE
status: One or more devices are faulted in response to IO failures.
action: Make sure the affected devices are connected, then run &lt;span class=&quot;code-quote&quot;&gt;&apos;zpool clear&apos;&lt;/span&gt;.
   see: http:&lt;span class=&quot;code-comment&quot;&gt;//zfsonlinux.org/msg/ZFS-8000-HC
&lt;/span&gt;  scan: none requested
config:

        NAME             STATE  READ WRITE CKSUM
        zmdt1            ONLINE    0     0     0
          mirror-0       ONLINE    0     0     0
             MDT_LUN_03  ONLINE    0     0     0
             MDT_LUN_04  ONLINE    0     0     0

errors: No known data errors
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt; Attached is an excerpt from /proc/spl/kstat/zfs/dbgmsg . The pool faults in between timestamps ...3292 and ...3298. Both logging and &quot;zpool status&quot; were hung for ~5 seconds whilst this occurred.&lt;br/&gt;
 &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/28897/28897_mds02_faulted.log&quot; title=&quot;mds02_faulted.log attached to LU-9845&quot;&gt;mds02_faulted.log&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;&lt;/p&gt;</comment>
                            <comment id="215877" author="jhammond" created="Sat, 9 Dec 2017 14:27:43 +0000"  >&lt;p&gt;&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/ee2fac14-dc7e-11e7-8027-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/ee2fac14-dc7e-11e7-8027-52540065bddc&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="216653" author="jhammond" created="Mon, 18 Dec 2017 21:13:52 +0000"  >&lt;p&gt;&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/185deada-e22b-11e7-8027-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/185deada-e22b-11e7-8027-52540065bddc&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="216957" author="jhammond" created="Thu, 21 Dec 2017 15:41:36 +0000"  >&lt;p&gt;&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/17be7004-e22b-11e7-8027-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/17be7004-e22b-11e7-8027-52540065bddc&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="216959" author="jhammond" created="Thu, 21 Dec 2017 16:45:22 +0000"  >&lt;p&gt;I was able to reproduce the &quot;Pool &apos;lustre-mdt1&apos; has encountered an uncorrectable I/O failure and has been suspended&quot; message on a local setup by running the root FS out of space.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;m:~# df -h
Filesystem                   Size  Used Avail Use% Mounted on
/dev/vda1                     24G   20G  2.7G  88% /
...
lustre-mdt1/mdt1              72M  3.4M   67M   5% /mnt/lustre-mds1
lustre-ost1/ost1             239M  4.0M  233M   2% /mnt/lustre-ost1
lustre-ost2/ost2             239M  8.0M  229M   4% /mnt/lustre-ost2
192.168.122.121@tcp:/lustre  478M   12M  462M   3% /mnt/lustre
m:~# du -hs /tmp/lustre-mdt1 
40M	/tmp/lustre-mdt1
m:~# ls -lh /tmp/lustre-mdt1
-rw-r--r-- 1 root root 196M Dec 21 10:12 /tmp/lustre-mdt1
m:~# dd if=/dev/zero of=/tmp/f0 bs=1M
dd: error writing &#8216;/tmp/f0&#8217;: No space left on device
3982+0 records in
3981+0 records out
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Consistent with what we see in AT test logs there were no other messages besides the uncorrectable I/O failure message:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[692069.367224] Lustre: DEBUG MARKER: == sanity-hsm test 13: Recursively import and restore a directory ==================================== 10:13:00 (1513872780)
[692095.705099] WARNING: Pool &apos;lustre-mdt1&apos; has encountered an uncorrectable I/O failure and has been suspended.

[692095.709069] WARNING: Pool &apos;lustre-ost1&apos; has encountered an uncorrectable I/O failure and has been suspended.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Perhaps we can avoid this issue by not using sparse backing files for ZFS.&lt;/p&gt;</comment>
                            <comment id="217004" author="jhammond" created="Thu, 21 Dec 2017 19:59:13 +0000"  >&lt;p&gt;Would it be useful to set the pool failure mode to panic to get more information here?&lt;/p&gt;</comment>
                            <comment id="217005" author="jhammond" created="Thu, 21 Dec 2017 20:04:59 +0000"  >&lt;p&gt;Or patch zfs to print more of the zio?&lt;/p&gt;</comment>
                            <comment id="217122" author="gerrit" created="Fri, 22 Dec 2017 16:24:57 +0000"  >&lt;p&gt;John L. Hammond (john.hammond@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/30649&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/30649&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9845&quot; title=&quot;ost-pools test_22 hangs with &#8216;WARNING: Pool &amp;#39;lustre-mdt1&amp;#39; has encountered an uncorrectable I/O failure and has been suspended.&#8217;&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9845&quot;&gt;&lt;del&gt;LU-9845&lt;/del&gt;&lt;/a&gt; test: add failmode=panic to zpool import options&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 8f3c50b516bfb82d91efc40efb1ce9b0fe832f76&lt;/p&gt;</comment>
                            <comment id="217126" author="jhammond" created="Fri, 22 Dec 2017 17:47:30 +0000"  >&lt;p&gt;This could also be due to a blocked mmp write. Running locally:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;m:~# export FSTYPE=zfs
m:~# bash $LUSTRE/tests/llmount.sh
...
m:~# cat /sys/module/zfs/parameters/zfs_multihost_interval 
1000
m:~# echo 100 &amp;gt; /sys/module/zfs/parameters/zfs_multihost_interval # set mmp interval to 100ms
m:~# chrt -f 20 dd if=/dev/zero of=/dev/null &amp;amp;
m:~# chrt -f 20 dd if=/dev/zero of=/dev/null &amp;amp;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;m:~# dmesg | tail
[  122.404321] Lustre: Skipped 1 previous similar message
[  122.406378] Lustre: Mounted lustre-client
[  123.540533] Lustre: DEBUG MARKER: Using TIMEOUT=20
[  171.949008] sched: RT throttling activated
[  308.950021] WARNING: Pool &apos;lustre-ost2&apos; has encountered an uncorrectable I/O failure and has been suspended.

[  308.950055] WARNING: Pool &apos;lustre-ost1&apos; has encountered an uncorrectable I/O failure and has been suspended.

[  308.950070] WARNING: Pool &apos;lustre-mdt1&apos; has encountered an uncorrectable I/O failure and has been suspended.

m:~# zpool status lustre-mdt1
  pool: lustre-mdt1
 state: ONLINE
status: One or more devices are faulted in response to IO failures.
action: Make sure the affected devices are connected, then run &apos;zpool clear&apos;.
   see: http://zfsonlinux.org/msg/ZFS-8000-HC
  scan: none requested
config:

	NAME                STATE     READ WRITE CKSUM
	lustre-mdt1         ONLINE       0     0     0
	  /tmp/lustre-mdt1  ONLINE       0     0     0

errors: No known data errors
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="217648" author="gerrit" created="Fri, 5 Jan 2018 22:53:12 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/30649/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/30649/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9845&quot; title=&quot;ost-pools test_22 hangs with &#8216;WARNING: Pool &amp;#39;lustre-mdt1&amp;#39; has encountered an uncorrectable I/O failure and has been suspended.&#8217;&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9845&quot;&gt;&lt;del&gt;LU-9845&lt;/del&gt;&lt;/a&gt; test: add failmode=panic to zpool import options&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 76afb10a76830396b1457f51eb16992da8a894a4&lt;/p&gt;</comment>
                            <comment id="217666" author="green" created="Sun, 7 Jan 2018 19:28:49 +0000"  >&lt;p&gt;I had two panic events with the landed patch, both very similar - a sanityn test 77/78 triggers a &quot;Sleep under spinlock&quot; warning and then shortly after the zpool panic hits.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[164923.664467] Lustre: DEBUG MARKER: == sanityn test 77c: check ORR NRS policy ============================================================ 23:24:29 (1515212669)
[164937.316648] BUG: sleeping function called from invalid context at /home/green/git/lustre-release/libcfs/libcfs/hash.c:1120
[164937.319085] in_atomic(): 1, irqs_disabled(): 0, pid: 9459, name: lctl
[164937.319812] INFO: lockdep is turned off.
[164937.320473] CPU: 0 PID: 9459 Comm: lctl Tainted: P        W  OE  ------------   3.10.0-debug #2
[164937.322225] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
[164937.322915]  ffff8802e79d4b40 000000007ad8fd8d ffff8802c0183d18 ffffffff816fd3e4
[164937.324369]  ffff8802c0183d30 ffffffff810b00e9 0000000000000018 ffff8802c0183d80
[164937.325689]  ffffffffa0295233 ffffffffa0299cf6 ffffc90015e68000 0000000000001000
[164937.336350] Call Trace:
[164937.337079]  [&amp;lt;ffffffff816fd3e4&amp;gt;] dump_stack+0x19/0x1b
[164937.337746]  [&amp;lt;ffffffff810b00e9&amp;gt;] __might_sleep+0xe9/0x110
[164937.338436]  [&amp;lt;ffffffffa0295233&amp;gt;] cfs_hash_putref+0x233/0x500 [libcfs]
[164937.339120]  [&amp;lt;ffffffffa0299cf6&amp;gt;] ? cfs_binheap_destroy+0x396/0x830 [libcfs]
[164937.339927]  [&amp;lt;ffffffffa11c1909&amp;gt;] nrs_orr_stop+0x69/0x270 [ptlrpc]
[164937.340647]  [&amp;lt;ffffffffa11b9d19&amp;gt;] nrs_policy_stop0+0x39/0x1c0 [ptlrpc]
[164937.354153]  [&amp;lt;ffffffffa11ba688&amp;gt;] nrs_policy_stop_primary.isra.9+0x78/0x1f0 [ptlrpc]
[164937.355884]  [&amp;lt;ffffffffa11bb9c4&amp;gt;] nrs_policy_start_locked+0x624/0x670 [ptlrpc]
[164937.357170]  [&amp;lt;ffffffffa11bbb3b&amp;gt;] nrs_policy_ctl+0x12b/0x2b0 [ptlrpc]
[164937.357910]  [&amp;lt;ffffffffa11be12f&amp;gt;] ptlrpc_nrs_policy_control+0x10f/0x2d0 [ptlrpc]
[164937.359173]  [&amp;lt;ffffffffa11a05b5&amp;gt;] ptlrpc_lprocfs_nrs_seq_write+0x3a5/0x560 [ptlrpc]
[164937.360594]  [&amp;lt;ffffffff8125d3cd&amp;gt;] proc_reg_write+0x3d/0x80
[164937.362287]  [&amp;lt;ffffffff811ed36d&amp;gt;] vfs_write+0xbd/0x1e0
[164937.362943]  [&amp;lt;ffffffff811fe3dd&amp;gt;] ? putname+0x3d/0x60
[164937.363595]  [&amp;lt;ffffffff811ede34&amp;gt;] SyS_write+0x84/0xf0
[164937.364292]  [&amp;lt;ffffffff8170fc49&amp;gt;] system_call_fastpath+0x16/0x1b
[164937.364958] BUG: scheduling while atomic: lctl/9459/0x10000002
[164937.365627] INFO: lockdep is turned off.
[164937.366239] Modules linked in: lustre(OE) ofd(OE) osp(OE) lod(OE) ost(OE) mdt(OE) mdd(OE) mgs(OE) osd_zfs(OE) lquota(OE) lfsck(OE) obdecho(OE) mgc(OE) lov(OE) mdc(OE) osc(OE) lmv(OE) fid(OE) fld(OE) ptlrpc_gss(OE) ptlrpc(OE) obdclass(OE) ksocklnd(OE) lnet(OE) libcfs(OE) ext4 mbcache loop zfs(PO) zunicode(PO) zavl(PO) icp(PO) zcommon(PO) znvpair(PO) spl(O) zlib_deflate jbd2 syscopyarea sysfillrect sysimgblt ttm ata_generic drm_kms_helper pata_acpi drm i2c_piix4 ata_piix pcspkr virtio_balloon serio_raw virtio_blk virtio_console i2c_core libata floppy nfsd ip_tables rpcsec_gss_krb5 [last unloaded: libcfs]
[164937.412757] CPU: 0 PID: 9459 Comm: lctl Tainted: P        W  OE  ------------   3.10.0-debug #2
[164937.414008] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
[164937.416003]  ffff8802e79d4b40 000000007ad8fd8d ffff8802c0183c90 ffffffff816fd3e4
[164937.418242]  ffff8802c0183ca8 ffffffff816f9218 ffff8802c0183fd8 ffff8802c0183d08
[164937.419596]  ffffffff817041a7 ffff8802e79d4b40 ffff8802c0183fd8 ffff8802c0183fd8
[164937.421025] Call Trace:
[164937.421640]  [&amp;lt;ffffffff816fd3e4&amp;gt;] dump_stack+0x19/0x1b
[164937.422370]  [&amp;lt;ffffffff816f9218&amp;gt;] __schedule_bug+0x59/0x68
[164937.423092]  [&amp;lt;ffffffff817041a7&amp;gt;] __schedule+0x807/0x950
[164937.423866]  [&amp;lt;ffffffff810b4936&amp;gt;] __cond_resched+0x26/0x30
[164937.424823]  [&amp;lt;ffffffff817045da&amp;gt;] _cond_resched+0x3a/0x50
[164937.425472]  [&amp;lt;ffffffffa0295238&amp;gt;] cfs_hash_putref+0x238/0x500 [libcfs]
[164937.426355]  [&amp;lt;ffffffffa0299cf6&amp;gt;] ? cfs_binheap_destroy+0x396/0x830 [libcfs]
[164937.427348]  [&amp;lt;ffffffffa11c1909&amp;gt;] nrs_orr_stop+0x69/0x270 [ptlrpc]
[164937.428483]  [&amp;lt;ffffffffa11b9d19&amp;gt;] nrs_policy_stop0+0x39/0x1c0 [ptlrpc]
[164937.429204]  [&amp;lt;ffffffffa11ba688&amp;gt;] nrs_policy_stop_primary.isra.9+0x78/0x1f0 [ptlrpc]
[164937.430526]  [&amp;lt;ffffffffa11bb9c4&amp;gt;] nrs_policy_start_locked+0x624/0x670 [ptlrpc]
[164937.440376]  [&amp;lt;ffffffffa11bbb3b&amp;gt;] nrs_policy_ctl+0x12b/0x2b0 [ptlrpc]
[164937.440915]  [&amp;lt;ffffffffa11be12f&amp;gt;] ptlrpc_nrs_policy_control+0x10f/0x2d0 [ptlrpc]
[164937.441796]  [&amp;lt;ffffffffa11a05b5&amp;gt;] ptlrpc_lprocfs_nrs_seq_write+0x3a5/0x560 [ptlrpc]
[164937.442696]  [&amp;lt;ffffffff8125d3cd&amp;gt;] proc_reg_write+0x3d/0x80
[164937.443151]  [&amp;lt;ffffffff811ed36d&amp;gt;] vfs_write+0xbd/0x1e0
[164937.443621]  [&amp;lt;ffffffff811fe3dd&amp;gt;] ? putname+0x3d/0x60
[164937.444235]  [&amp;lt;ffffffff811ede34&amp;gt;] SyS_write+0x84/0xf0
[164937.444903]  [&amp;lt;ffffffff8170fc49&amp;gt;] system_call_fastpath+0x16/0x1b
[164937.510313] BUG: scheduling while atomic: lctl/9459/0x10000002
[164937.511243] INFO: lockdep is turned off.
[164937.511901] Modules linked in: lustre(OE) ofd(OE) osp(OE) lod(OE) ost(OE) mdt(OE) mdd(OE) mgs(OE) osd_zfs(OE) lquota(OE) lfsck(OE) obdecho(OE) mgc(OE) lov(OE) mdc(OE) osc(OE) lmv(OE) fid(OE) fld(OE) ptlrpc_gss(OE) ptlrpc(OE) obdclass(OE) ksocklnd(OE) lnet(OE) libcfs(OE) ext4 mbcache loop zfs(PO) zunicode(PO) zavl(PO) icp(PO) zcommon(PO) znvpair(PO) spl(O) zlib_deflate jbd2 syscopyarea sysfillrect sysimgblt ttm ata_generic drm_kms_helper pata_acpi drm i2c_piix4 ata_piix pcspkr virtio_balloon serio_raw virtio_blk virtio_console i2c_core libata floppy nfsd ip_tables rpcsec_gss_krb5 [last unloaded: libcfs]
[164937.539224] CPU: 15 PID: 9459 Comm: lctl Tainted: P        W  OE  ------------   3.10.0-debug #2
[164937.540796] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
[164937.544056]  ffff8802e79d4b40 000000007ad8fd8d ffff8802c0183c90 ffffffff816fd3e4
[164937.578747]  ffff8802c0183ca8 ffffffff816f9218 ffff8802c0183fd8 ffff8802c0183d08
[164937.581213]  ffffffff817041a7 ffff8802e79d4b40 ffff8802c0183fd8 ffff8802c0183fd8
[164937.582630] Call Trace:
[164937.583224]  [&amp;lt;ffffffff816fd3e4&amp;gt;] dump_stack+0x19/0x1b
[164937.583905]  [&amp;lt;ffffffff816f9218&amp;gt;] __schedule_bug+0x59/0x68
[164937.584571]  [&amp;lt;ffffffff817041a7&amp;gt;] __schedule+0x807/0x950
[164937.585220]  [&amp;lt;ffffffff810b4936&amp;gt;] __cond_resched+0x26/0x30
[164937.585885]  [&amp;lt;ffffffff817045da&amp;gt;] _cond_resched+0x3a/0x50
[164937.586567]  [&amp;lt;ffffffffa0295238&amp;gt;] cfs_hash_putref+0x238/0x500 [libcfs]
[164937.587246]  [&amp;lt;ffffffffa0299cf6&amp;gt;] ? cfs_binheap_destroy+0x396/0x830 [libcfs]
[164937.588052]  [&amp;lt;ffffffffa11c1909&amp;gt;] nrs_orr_stop+0x69/0x270 [ptlrpc]
[164937.588774]  [&amp;lt;ffffffffa11b9d19&amp;gt;] nrs_policy_stop0+0x39/0x1c0 [ptlrpc]
[164937.589496]  [&amp;lt;ffffffffa11ba688&amp;gt;] nrs_policy_stop_primary.isra.9+0x78/0x1f0 [ptlrpc]
[164937.590765]  [&amp;lt;ffffffffa11bb9c4&amp;gt;] nrs_policy_start_locked+0x624/0x670 [ptlrpc]
[164937.592025]  [&amp;lt;ffffffffa11bbb3b&amp;gt;] nrs_policy_ctl+0x12b/0x2b0 [ptlrpc]
[164937.592729]  [&amp;lt;ffffffffa11be12f&amp;gt;] ptlrpc_nrs_policy_control+0x10f/0x2d0 [ptlrpc]
[164937.593993]  [&amp;lt;ffffffffa11a05b5&amp;gt;] ptlrpc_lprocfs_nrs_seq_write+0x3a5/0x560 [ptlrpc]
[164937.595191]  [&amp;lt;ffffffff8125d3cd&amp;gt;] proc_reg_write+0x3d/0x80
[164937.596143]  [&amp;lt;ffffffff811ed36d&amp;gt;] vfs_write+0xbd/0x1e0
[164937.596779]  [&amp;lt;ffffffff811fe3dd&amp;gt;] ? putname+0x3d/0x60
[164937.597413]  [&amp;lt;ffffffff811ede34&amp;gt;] SyS_write+0x84/0xf0
[164937.598038]  [&amp;lt;ffffffff8170fc49&amp;gt;] system_call_fastpath+0x16/0x1b
[164937.614894] BUG: scheduling while atomic: lctl/9459/0x10000002
[164937.615623] INFO: lockdep is turned off.
[164937.616240] Modules linked in: lustre(OE) ofd(OE) osp(OE) lod(OE) ost(OE) mdt(OE) mdd(OE) mgs(OE) osd_zfs(OE) lquota(OE) lfsck(OE) obdecho(OE) mgc(OE) lov(OE) mdc(OE) osc(OE) lmv(OE) fid(OE) fld(OE) ptlrpc_gss(OE) ptlrpc(OE) obdclass(OE) ksocklnd(OE) lnet(OE) libcfs(OE) ext4 mbcache loop zfs(PO) zunicode(PO) zavl(PO) icp(PO) zcommon(PO) znvpair(PO) spl(O) zlib_deflate jbd2 syscopyarea sysfillrect sysimgblt ttm ata_generic drm_kms_helper pata_acpi drm i2c_piix4 ata_piix pcspkr virtio_balloon serio_raw virtio_blk virtio_console i2c_core libata floppy nfsd ip_tables rpcsec_gss_krb5 [last unloaded: libcfs]
[164937.622741] CPU: 15 PID: 9459 Comm: lctl Tainted: P        W  OE  ------------   3.10.0-debug #2
[164937.623998] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
[164937.624663]  ffff8802e79d4b40 000000007ad8fd8d ffff8802c0183c90 ffffffff816fd3e4
[164937.626597]  ffff8802c0183ca8 ffffffff816f9218 ffff8802c0183fd8 ffff8802c0183d08
[164937.627859]  ffffffff817041a7 ffff8802e79d4b40 ffff8802c0183fd8 ffff8802c0183fd8
[164937.630788] Call Trace:
[164937.631507]  [&amp;lt;ffffffff816fd3e4&amp;gt;] dump_stack+0x19/0x1b
[164937.632306]  [&amp;lt;ffffffff816f9218&amp;gt;] __schedule_bug+0x59/0x68
[164937.633014]  [&amp;lt;ffffffff817041a7&amp;gt;] __schedule+0x807/0x950
[164937.634560]  [&amp;lt;ffffffff810b4936&amp;gt;] __cond_resched+0x26/0x30
[164937.635082]  [&amp;lt;ffffffff817045da&amp;gt;] _cond_resched+0x3a/0x50
[164937.635744]  [&amp;lt;ffffffffa0295238&amp;gt;] cfs_hash_putref+0x238/0x500 [libcfs]
[164937.636594]  [&amp;lt;ffffffffa0299cf6&amp;gt;] ? cfs_binheap_destroy+0x396/0x830 [libcfs]
[164937.643706]  [&amp;lt;ffffffffa11c1909&amp;gt;] nrs_orr_stop+0x69/0x270 [ptlrpc]
[164937.644453]  [&amp;lt;ffffffffa11b9d19&amp;gt;] nrs_policy_stop0+0x39/0x1c0 [ptlrpc]
[164937.645174]  [&amp;lt;ffffffffa11ba688&amp;gt;] nrs_policy_stop_primary.isra.9+0x78/0x1f0 [ptlrpc]
[164937.656192]  [&amp;lt;ffffffffa11bb9c4&amp;gt;] nrs_policy_start_locked+0x624/0x670 [ptlrpc]
[164937.657543]  [&amp;lt;ffffffffa11bbb3b&amp;gt;] nrs_policy_ctl+0x12b/0x2b0 [ptlrpc]
[164937.658320]  [&amp;lt;ffffffffa11be12f&amp;gt;] ptlrpc_nrs_policy_control+0x10f/0x2d0 [ptlrpc]
[164937.659782]  [&amp;lt;ffffffffa11a05b5&amp;gt;] ptlrpc_lprocfs_nrs_seq_write+0x3a5/0x560 [ptlrpc]
[164937.661211]  [&amp;lt;ffffffff8125d3cd&amp;gt;] proc_reg_write+0x3d/0x80
[164937.661885]  [&amp;lt;ffffffff811ed36d&amp;gt;] vfs_write+0xbd/0x1e0
[164937.663536]  [&amp;lt;ffffffff811fe3dd&amp;gt;] ? putname+0x3d/0x60
[164937.664179]  [&amp;lt;ffffffff811ede34&amp;gt;] SyS_write+0x84/0xf0
[164937.666440]  [&amp;lt;ffffffff8170fc49&amp;gt;] system_call_fastpath+0x16/0x1b
[164937.681884] BUG: scheduling while atomic: lctl/9459/0x10000002
[164937.682619] INFO: lockdep is turned off.
[164937.683241] Modules linked in: lustre(OE) ofd(OE) osp(OE) lod(OE) ost(OE) mdt(OE) mdd(OE) mgs(OE) osd_zfs(OE) lquota(OE) lfsck(OE) obdecho(OE) mgc(OE) lov(OE) mdc(OE) osc(OE) lmv(OE) fid(OE) fld(OE) ptlrpc_gss(OE) ptlrpc(OE) obdclass(OE) ksocklnd(OE) lnet(OE) libcfs(OE) ext4 mbcache loop zfs(PO) zunicode(PO) zavl(PO) icp(PO) zcommon(PO) znvpair(PO) spl(O) zlib_deflate jbd2 syscopyarea sysfillrect sysimgblt ttm ata_generic drm_kms_helper pata_acpi drm i2c_piix4 ata_piix pcspkr virtio_balloon serio_raw virtio_blk virtio_console i2c_core libata floppy nfsd ip_tables rpcsec_gss_krb5 [last unloaded: libcfs]
[164937.721046] CPU: 15 PID: 9459 Comm: lctl Tainted: P        W  OE  ------------   3.10.0-debug #2
[164937.722320] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
[164937.722980]  ffff8802e79d4b40 000000007ad8fd8d ffff8802c0183c90 ffffffff816fd3e4
[164937.724242]  ffff8802c0183ca8 ffffffff816f9218 ffff8802c0183fd8 ffff8802c0183d08
[164937.748057]  ffffffff817041a7 ffff8802e79d4b40 ffff8802c0183fd8 ffff8802c0183fd8
[164937.755299] Call Trace:
[164937.755938]  [&amp;lt;ffffffff816fd3e4&amp;gt;] dump_stack+0x19/0x1b
[164937.757446]  [&amp;lt;ffffffff816f9218&amp;gt;] __schedule_bug+0x59/0x68
[164937.764367]  [&amp;lt;ffffffff817041a7&amp;gt;] __schedule+0x807/0x950
[164937.765081]  [&amp;lt;ffffffff810b4936&amp;gt;] __cond_resched+0x26/0x30
[164937.765847]  [&amp;lt;ffffffff817045da&amp;gt;] _cond_resched+0x3a/0x50
[164937.766658]  [&amp;lt;ffffffffa0295238&amp;gt;] cfs_hash_putref+0x238/0x500 [libcfs]
[164937.767358]  [&amp;lt;ffffffffa0299cf6&amp;gt;] ? cfs_binheap_destroy+0x396/0x830 [libcfs]
[164937.768142]  [&amp;lt;ffffffffa11c1909&amp;gt;] nrs_orr_stop+0x69/0x270 [ptlrpc]
[164937.768970]  [&amp;lt;ffffffffa11b9d19&amp;gt;] nrs_policy_stop0+0x39/0x1c0 [ptlrpc]
[164937.769730]  [&amp;lt;ffffffffa11ba688&amp;gt;] nrs_policy_stop_primary.isra.9+0x78/0x1f0 [ptlrpc]
[164937.772095]  [&amp;lt;ffffffffa11bb9c4&amp;gt;] nrs_policy_start_locked+0x624/0x670 [ptlrpc]
[164937.775802]  [&amp;lt;ffffffffa11bbb3b&amp;gt;] nrs_policy_ctl+0x12b/0x2b0 [ptlrpc]
[164937.776892]  [&amp;lt;ffffffffa11be12f&amp;gt;] ptlrpc_nrs_policy_control+0x10f/0x2d0 [ptlrpc]
[164937.778593]  [&amp;lt;ffffffffa11a05b5&amp;gt;] ptlrpc_lprocfs_nrs_seq_write+0x3a5/0x560 [ptlrpc]
[164937.779895]  [&amp;lt;ffffffff8125d3cd&amp;gt;] proc_reg_write+0x3d/0x80
[164937.780630]  [&amp;lt;ffffffff811ed36d&amp;gt;] vfs_write+0xbd/0x1e0
[164937.781701]  [&amp;lt;ffffffff811fe3dd&amp;gt;] ? putname+0x3d/0x60
[164937.784249]  [&amp;lt;ffffffff811ede34&amp;gt;] SyS_write+0x84/0xf0
[164937.784959]  [&amp;lt;ffffffff8170fc49&amp;gt;] system_call_fastpath+0x16/0x1b
[164941.981909] Kernel panic - not syncing: Pool &apos;lustre-ost2&apos; has encountered an uncorrectable I/O failure and the failure mode property for this pool is set to panic.
[164941.985000] CPU: 0 PID: 13624 Comm: mmp Tainted: P        W  OE  ------------   3.10.0-debug #2
[164941.986459] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
[164941.987216]  ffffffffa0c844c8 00000000ff7aa3be ffff8802cf5a3cc0 ffffffff816fd3e4
[164941.988699]  ffff8802cf5a3d40 ffffffff816f8c34 ffff880200000010 ffff8802cf5a3d50
[164941.990160]  ffff8802cf5a3cf0 00000000ff7aa3be ffffffffa0565213 ffff8802266ec000
[164941.991644] Call Trace:
[164941.992359]  [&amp;lt;ffffffff816fd3e4&amp;gt;] dump_stack+0x19/0x1b
[164941.993104]  [&amp;lt;ffffffff816f8c34&amp;gt;] panic+0xd8/0x1e7
[164941.993863]  [&amp;lt;ffffffffa0565213&amp;gt;] ? __cv_timedwait_common+0x143/0x240 [spl]
[164942.007808]  [&amp;lt;ffffffffa0bdf279&amp;gt;] zio_suspend+0x1c9/0x1d0 [zfs]
[164942.008631]  [&amp;lt;ffffffffa0b56242&amp;gt;] mmp_thread+0x9c2/0xb30 [zfs]
[164942.009436]  [&amp;lt;ffffffffa0b55640&amp;gt;] ? metaslab_check_free+0x190/0x190 [zfs]
[164942.010251]  [&amp;lt;ffffffffa0b55880&amp;gt;] ? mmp_write_done+0x240/0x240 [zfs]
[164942.011057]  [&amp;lt;ffffffffa055f091&amp;gt;] thread_generic_wrapper+0x71/0x80 [spl]
[164942.011793]  [&amp;lt;ffffffffa055f020&amp;gt;] ? __thread_exit+0x20/0x20 [spl]
[164942.012514]  [&amp;lt;ffffffff810a2eba&amp;gt;] kthread+0xea/0xf0
[164942.013119]  [&amp;lt;ffffffff810a2dd0&amp;gt;] ? kthread_create_on_node+0x140/0x140
[164942.013767]  [&amp;lt;ffffffff8170fb98&amp;gt;] ret_from_fork+0x58/0x90
[164942.014394]  [&amp;lt;ffffffff810a2dd0&amp;gt;] ? kthread_create_on_node+0x140/0x140
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Hm, I was under the full impression there was a ticket for those schedule when atomic stuff in nrs_orr_stop but cannot find it.&lt;/p&gt;

&lt;p&gt;the other crash was in test 78 with a lot more warnings in between before hitting the panic, and Jira does not like such a long comment.&lt;/p&gt;</comment>
                            <comment id="217684" author="yong.fan" created="Mon, 8 Jan 2018 11:41:29 +0000"  >&lt;p&gt;+1 on master:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/15f738d8-f463-11e7-8c23-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/15f738d8-f463-11e7-8c23-52540065bddc&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="217696" author="jhammond" created="Mon, 8 Jan 2018 16:24:01 +0000"  >&lt;p&gt;The panic in  &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/15f738d8-f463-11e7-8c23-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/15f738d8-f463-11e7-8c23-52540065bddc&lt;/a&gt; is due to MMP:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[ 7799.830025] Kernel panic - not syncing: Pool &apos;lustre-mdt1&apos; has encountered an uncorrectable I/O failure and the failure mode property for this pool is set to panic.
[ 7799.831007] CPU: 1 PID: 15805 Comm: mmp Tainted: P           OE  ------------   3.10.0-693.11.1.el7_lustre.x86_64 #1
[ 7799.831007] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2007
[ 7799.831007]  ffff880041bda000 000000002c0bba5e ffff880056c73d28 ffffffff816a3e3d
[ 7799.831007]  ffff880056c73da8 ffffffff8169dd24 ffff880000000010 ffff880056c73db8
[ 7799.831007]  ffff880056c73d58 000000002c0bba5e ffff880056c73dc0 ffff880041bda000
[ 7799.831007] Call Trace:
[ 7799.831007]  [&amp;lt;ffffffff816a3e3d&amp;gt;] dump_stack+0x19/0x1b
[ 7799.831007]  [&amp;lt;ffffffff8169dd24&amp;gt;] panic+0xe8/0x20d
[ 7799.831007]  [&amp;lt;ffffffffc08bbcb6&amp;gt;] zio_suspend+0x106/0x110 [zfs]
[ 7799.831007]  [&amp;lt;ffffffffc08427ea&amp;gt;] mmp_thread+0x7ea/0x820 [zfs]
[ 7799.831007]  [&amp;lt;ffffffffc0841e90&amp;gt;] ? metaslab_check_free+0x190/0x190 [zfs]
[ 7799.831007]  [&amp;lt;ffffffffc0842000&amp;gt;] ? mmp_write_done+0x170/0x170 [zfs]
[ 7799.831007]  [&amp;lt;ffffffffc070cfa1&amp;gt;] thread_generic_wrapper+0x71/0x80 [spl]
[ 7799.831007]  [&amp;lt;ffffffffc070cf30&amp;gt;] ? __thread_exit+0x20/0x20 [spl]
[ 7799.831007]  [&amp;lt;ffffffff810b099f&amp;gt;] kthread+0xcf/0xe0
[ 7799.831007]  [&amp;lt;ffffffff810b08d0&amp;gt;] ? insert_kthread_work+0x40/0x40
[ 7799.831007]  [&amp;lt;ffffffff816b4fd8&amp;gt;] ret_from_fork+0x58/0x90
[ 7799.831007]  [&amp;lt;ffffffff810b08d0&amp;gt;] ? insert_kthread_work+0x40/0x40
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="217757" author="green" created="Tue, 9 Jan 2018 05:33:01 +0000"  >&lt;p&gt;aha, &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6849&quot; title=&quot;sanity-quota test 22 LBUG with &#8220;ASSERTION( orro-&amp;gt;oo_ref == 0 ) failed: Busy NRS TRR policy object for OST&#8221;&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6849&quot;&gt;LU-6849&lt;/a&gt; is where I was putting my reports&lt;/p&gt;</comment>
                            <comment id="217877" author="sbuisson" created="Wed, 10 Jan 2018 07:01:20 +0000"  >&lt;p&gt;Hit with sanity test_64d:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/696dfeac-f562-11e7-854b-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/696dfeac-f562-11e7-854b-52540065bddc&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Panic is due to MMP:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[ 2212.716735] Kernel panic - not syncing: Pool &apos;lustre-ost1&apos; has encountered an uncorrectable I/O failure and the failure mode property for this pool is set to panic.
[ 2212.718347] CPU: 0 PID: 777 Comm: mmp Tainted: P           OE  ------------   3.10.0-693.11.1.el7_lustre.x86_64 #1
[ 2212.719366] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
[ 2212.719946]  ffff880058852000 00000000554e0bda ffff88005335bd28 ffffffff816a3e3d
[ 2212.720772]  ffff88005335bda8 ffffffff8169dd24 ffff880000000010 ffff88005335bdb8
[ 2212.721598]  ffff88005335bd58 00000000554e0bda ffff88005335bdc0 ffff880058852000
[ 2212.722421] Call Trace:
[ 2212.722687]  [&amp;lt;ffffffff816a3e3d&amp;gt;] dump_stack+0x19/0x1b
[ 2212.723211]  [&amp;lt;ffffffff8169dd24&amp;gt;] panic+0xe8/0x20d
[ 2212.723744]  [&amp;lt;ffffffffc07facb6&amp;gt;] zio_suspend+0x106/0x110 [zfs]
[ 2212.724356]  [&amp;lt;ffffffffc07817ea&amp;gt;] mmp_thread+0x7ea/0x820 [zfs]
[ 2212.724969]  [&amp;lt;ffffffffc0780e90&amp;gt;] ? metaslab_check_free+0x190/0x190 [zfs]
[ 2212.725676]  [&amp;lt;ffffffffc0781000&amp;gt;] ? mmp_write_done+0x170/0x170 [zfs]
[ 2212.726312]  [&amp;lt;ffffffffc064bfa1&amp;gt;] thread_generic_wrapper+0x71/0x80 [spl]
[ 2212.726988]  [&amp;lt;ffffffffc064bf30&amp;gt;] ? __thread_exit+0x20/0x20 [spl]
[ 2212.727604]  [&amp;lt;ffffffff810b099f&amp;gt;] kthread+0xcf/0xe0
[ 2212.728092]  [&amp;lt;ffffffff810b08d0&amp;gt;] ? insert_kthread_work+0x40/0x40
[ 2212.728724]  [&amp;lt;ffffffff816b4fd8&amp;gt;] ret_from_fork+0x58/0x90
[ 2212.729261]  [&amp;lt;ffffffff810b08d0&amp;gt;] ? insert_kthread_work+0x40/0x40
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;ZFS 0.7.3&lt;/p&gt;</comment>
                            <comment id="217974" author="utopiabound" created="Thu, 11 Jan 2018 14:13:38 +0000"  >&lt;p&gt;vdev_random_leaf() (which is used to pick a vdev to write to in mmp_thread) was rewritten as mmp_random_leaf() (See &lt;a href=&quot;https://github.com/zfsonlinux/zfs/pull/6665&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;ZFS Pull 6665&lt;/a&gt;) for ZFS 0.7.5/0.7.4.  Looking at the code, I don&apos;t see this being the issue that was fixed, as this prevents an infinite loop if upper vdev is writeable with no writable children.&lt;/p&gt;</comment>
                            <comment id="218007" author="utopiabound" created="Thu, 11 Jan 2018 17:16:27 +0000"  >&lt;p&gt;b2_10, ZFS 0.7.5 &lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/db69c948-f5b5-11e7-94c7-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/db69c948-f5b5-11e7-94c7-52540065bddc&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;sanityn/test_40d (MMP trips during test 40c, but nothing hangs until beginning of test 40d)&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Jan  9 23:47:51 trevis-4vm7 kernel: WARNING: Pool &apos;lustre-ost2&apos; has encountered an uncorrectable I/O failure and has been suspended.
Jan  9 23:47:52 trevis-4vm7 kernel: WARNING: Pool &apos;lustre-ost1&apos; has encountered an uncorrectable I/O failure and has been suspended.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="218050" author="sarah" created="Thu, 11 Jan 2018 22:30:23 +0000"  >&lt;p&gt;another ones on 2.10.3&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/9e9b0404-f6f1-11e7-a7cd-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/9e9b0404-f6f1-11e7-a7cd-52540065bddc&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/a02edd7c-f6f1-11e7-a7cd-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/a02edd7c-f6f1-11e7-a7cd-52540065bddc&lt;/a&gt;&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[ 4280.251316] Lustre: DEBUG MARKER: == replay-ost-single test 5: Fail OST during iozone ================================================== 17:57:51 (1515607071)
[ 4288.629724] Lustre: DEBUG MARKER: grep -c /mnt/lustre-ost1&apos; &apos; /proc/mounts
[ 4289.211154] Lustre: DEBUG MARKER: umount -d /mnt/lustre-ost1
[ 4289.591211] Lustre: Failing over lustre-OST0000
[ 4292.839279] Lustre: server umount lustre-OST0000 complete
[ 4293.002308] Lustre: DEBUG MARKER: lsmod | grep lnet &amp;gt; /dev/null &amp;amp;&amp;amp; lctl dl | grep &apos; ST &apos;
[ 4293.343927] Lustre: DEBUG MARKER: ! zpool list -H lustre-ost1 &amp;gt;/dev/null 2&amp;gt;&amp;amp;1 ||
			grep -q ^lustre-ost1/ /proc/mounts ||
			zpool export  lustre-ost1
[ 4304.792431] Lustre: DEBUG MARKER: hostname
[ 4305.199104] Lustre: DEBUG MARKER: lsmod | grep zfs &amp;gt;&amp;amp;/dev/null || modprobe zfs;
			zpool list -H lustre-ost1 &amp;gt;/dev/null 2&amp;gt;&amp;amp;1 ||
			zpool import -f -o cachefile=none -d /dev/lvm-Role_OSS lustre-ost1
[ 4306.781168] Lustre: DEBUG MARKER: zfs get -H -o value 						lustre:svname lustre-ost1/ost1
[ 4307.103538] Lustre: DEBUG MARKER: mkdir -p /mnt/lustre-ost1; mount -t lustre   		                   lustre-ost1/ost1 /mnt/lustre-ost1
[ 4307.465621] Lustre: lustre-OST0000: Imperative Recovery enabled, recovery window shrunk from 60-180 down to 60-180
[ 4307.633766] Lustre: DEBUG MARKER: /usr/sbin/lctl get_param -n health_check
[ 4307.958667] Lustre: lustre-OST0000: Will be in recovery for at least 1:00, or until 6 clients reconnect
[ 4307.960237] Lustre: DEBUG MARKER: PATH=/usr/lib64/lustre/tests:/usr/lib/lustre/tests:/usr/lib64/lustre/tests:/opt/iozone/bin:/opt/iozone/bin:/usr/lib64/lustre/tests/mpi:/usr/lib64/lustre/tests/racer:/usr/lib64/lustre/../lustre-iokit/sgpdd-survey:/usr/lib64/lustre/tests:/usr/lib64/lustre/u
[ 4308.555493] Lustre: DEBUG MARKER: /usr/sbin/lctl mark onyx-40vm8.onyx.hpdd.intel.com: executing set_default_debug vfstrace rpctrace dlmtrace neterror ha config ioctl super lfsck all 4
[ 4308.567312] Lustre: DEBUG MARKER: /usr/sbin/lctl mark onyx-40vm8.onyx.hpdd.intel.com: executing set_default_debug vfstrace rpctrace dlmtrace neterror ha config ioctl super lfsck all 4
[ 4308.764786] Lustre: DEBUG MARKER: onyx-40vm8.onyx.hpdd.intel.com: executing set_default_debug vfstrace rpctrace dlmtrace neterror ha config ioctl super lfsck all 4
[ 4308.765360] Lustre: DEBUG MARKER: onyx-40vm8.onyx.hpdd.intel.com: executing set_default_debug vfstrace rpctrace dlmtrace neterror ha config ioctl super lfsck all 4
[ 4308.940402] Lustre: DEBUG MARKER: zfs get -H -o value 				lustre:svname lustre-ost1/ost1 2&amp;gt;/dev/null | 				grep -E &apos;:[a-zA-Z]{3}[0-9]{4}&apos;
[ 4309.261238] Lustre: DEBUG MARKER: zfs get -H -o value 				lustre:svname lustre-ost1/ost1 2&amp;gt;/dev/null | 				grep -E &apos;:[a-zA-Z]{3}[0-9]{4}&apos;
[ 4309.590687] Lustre: DEBUG MARKER: zfs get -H -o value lustre:svname 		                           lustre-ost1/ost1 2&amp;gt;/dev/null
[ 4310.483010] Lustre: DEBUG MARKER: /usr/sbin/lctl mark onyx-40vm7.onyx.hpdd.intel.com: executing wait_import_state_mount FULL osc.lustre-OST0000-osc-*.ost_server_uuid
[ 4310.555992] Lustre: DEBUG MARKER: /usr/sbin/lctl mark onyx-40vm6.onyx.hpdd.intel.com: executing wait_import_state_mount FULL osc.lustre-OST0000-osc-*.ost_server_uuid
[ 4310.686461] Lustre: DEBUG MARKER: onyx-40vm7.onyx.hpdd.intel.com: executing wait_import_state_mount FULL osc.lustre-OST0000-osc-*.ost_server_uuid
[ 4310.773159] Lustre: DEBUG MARKER: onyx-40vm6.onyx.hpdd.intel.com: executing wait_import_state_mount FULL osc.lustre-OST0000-osc-*.ost_server_uuid
[ 4310.799378] Lustre: lustre-OST0000: Recovery over after 0:03, of 6 clients 6 recovered and 0 were evicted.
[ 4310.803867] Lustre: lustre-OST0000: deleting orphan objects from 0x300000401:34 to 0x300000401:417
[ 4310.808957] Lustre: lustre-OST0000: deleting orphan objects from 0x0:2436 to 0x0:2465
[ 4310.810587] Lustre: lustre-OST0000: deleting orphan objects from 0x300000400:34 to 0x300000400:417
[ 4310.812828] Lustre: lustre-OST0000: deleting orphan objects from 0x300000402:34 to 0x300000402:417
[ 4311.105228] Lustre: DEBUG MARKER: /usr/sbin/lctl mark osc.lustre-OST0000-osc-*.ost_server_uuid in FULL state after 0 sec
[ 4311.497754] Lustre: DEBUG MARKER: osc.lustre-OST0000-osc-*.ost_server_uuid in FULL state after 0 sec
[ 4312.069870] Lustre: DEBUG MARKER: /usr/sbin/lctl mark osc.lustre-OST0000-osc-*.ost_server_uuid in FULL state after 1 sec
[ 4312.375301] Lustre: DEBUG MARKER: osc.lustre-OST0000-osc-*.ost_server_uuid in FULL state after 1 sec
[ 6614.243576] WARNING: Pool &apos;lustre-ost1&apos; has encountered an uncorrectable I/O failure and has been suspended.

[ 6614.285585] WARNING: Pool &apos;lustre-ost3&apos; has encountered an uncorrectable I/O failure and has been suspended.

[ 6614.397584] WARNING: Pool &apos;lustre-ost7&apos; has encountered an uncorrectable I/O failure and has been suspended.

[ 6614.501582] WARNING: Pool &apos;lustre-ost6&apos; has encountered an uncorrectable I/O failure and has been suspended.

[ 6614.504581] WARNING: Pool &apos;lustre-ost8&apos; has encountered an uncorrectable I/O failure and has been suspended.

[ 6614.510587] WARNING: Pool &apos;lustre-ost5&apos; has encountered an uncorrectable I/O failure and has been suspended.

[ 6614.796590] WARNING: Pool &apos;lustre-ost2&apos; has encountered an uncorrectable I/O failure and has been suspended.

[ 6615.011582] WARNING: Pool &apos;lustre-ost4&apos; has encountered an uncorrectable I/O failure and has been suspended.

[ 6649.202587] LNet: Service thread pid 30548 was inactive for 40.12s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:
[ 6649.204384] Pid: 30548, comm: ll_ost_io00_003
[ 6649.204827] 
Call Trace:
[ 6649.205292]  [&amp;lt;ffffffff816ab6b9&amp;gt;] schedule+0x29/0x70
[ 6649.205810]  [&amp;lt;ffffffffc0647515&amp;gt;] cv_wait_common+0x125/0x150 [spl]
[ 6649.206485]  [&amp;lt;ffffffff810b34b0&amp;gt;] ? autoremove_wake_function+0x0/0x40
[ 6649.207126]  [&amp;lt;ffffffffc0647555&amp;gt;] __cv_wait+0x15/0x20 [spl]
[ 6649.207776]  [&amp;lt;ffffffffc0796a2f&amp;gt;] txg_wait_synced+0xef/0x140 [zfs]
[ 6649.208409]  [&amp;lt;ffffffffc074bdf5&amp;gt;] dmu_tx_wait+0x275/0x3c0 [zfs]
[ 6649.209018]  [&amp;lt;ffffffffc074bfd1&amp;gt;] dmu_tx_assign+0x91/0x490 [zfs]
[ 6649.209628]  [&amp;lt;ffffffffc0fbcf3a&amp;gt;] osd_trans_start+0xaa/0x3c0 [osd_zfs]
[ 6649.210273]  [&amp;lt;ffffffffc11101cb&amp;gt;] ofd_trans_start+0x6b/0xe0 [ofd]
[ 6649.210880]  [&amp;lt;ffffffffc11165eb&amp;gt;] ofd_commitrw_write+0x94b/0x1c50 [ofd]
[ 6649.211528]  [&amp;lt;ffffffffc111a5a9&amp;gt;] ofd_commitrw+0x4b9/0xac0 [ofd]
[ 6649.212188]  [&amp;lt;ffffffffc0e12eb7&amp;gt;] obd_commitrw+0x2ed/0x330 [ptlrpc]
[ 6649.212847]  [&amp;lt;ffffffffc0de5d91&amp;gt;] tgt_brw_write+0xff1/0x17c0 [ptlrpc]
[ 6649.213489]  [&amp;lt;ffffffff813300bb&amp;gt;] ? string.isra.7+0x3b/0xf0
[ 6649.214075]  [&amp;lt;ffffffffc0d37e60&amp;gt;] ? target_bulk_timeout+0x0/0xb0 [ptlrpc]
[ 6649.214787]  [&amp;lt;ffffffffc0de1da5&amp;gt;] tgt_request_handle+0x925/0x1370 [ptlrpc]
[ 6649.215494]  [&amp;lt;ffffffffc0d8ab16&amp;gt;] ptlrpc_server_handle_request+0x236/0xa90 [ptlrpc]
[ 6649.216262]  [&amp;lt;ffffffff810bc0f8&amp;gt;] ? __wake_up_common+0x58/0x90
[ 6649.216871]  [&amp;lt;ffffffffc0d8e252&amp;gt;] ptlrpc_main+0xa92/0x1e40 [ptlrpc]
[ 6649.217527]  [&amp;lt;ffffffffc0d8d7c0&amp;gt;] ? ptlrpc_main+0x0/0x1e40 [ptlrpc]
[ 6649.218155]  [&amp;lt;ffffffff810b252f&amp;gt;] kthread+0xcf/0xe0
[ 6649.218647]  [&amp;lt;ffffffff810b2460&amp;gt;] ? kthread+0x0/0xe0
[ 6649.219151]  [&amp;lt;ffffffff816b8798&amp;gt;] ret_from_fork+0x58/0x90
[ 6649.219687]  [&amp;lt;ffffffff810b2460&amp;gt;] ? kthread+0x0/0xe0

[ 6649.220352] LustreError: dumping log to /tmp/lustre-log.1515609441.30548
[ 7204.082593] Lustre: 11314:0:(service.c:1346:ptlrpc_at_send_early_reply()) @@@ Couldn&apos;t add any time (5/5), not sending early reply
  req@ffff8800694ce050 x1589224788664784/t0(0) o4-&amp;gt;1a4123cb-0667-4fc1-a984-578de9d574bd@10.2.8.124@tcp:350/0 lens 784/488 e 24 to 0 dl 1515610000 ref 2 fl Interpret:/0/0 rc 0/0
[ 7209.956336] Lustre: lustre-OST0000: Client 1a4123cb-0667-4fc1-a984-578de9d574bd (at 10.2.8.124@tcp) reconnecting
[ 7209.957438] Lustre: Skipped 3 previous similar messages
[ 7209.957986] Lustre: lustre-OST0000: Connection restored to 1bcb3ced-3143-3f53-430e-80a587c85a02 (at 10.2.8.124@tcp)
[ 7209.959086] Lustre: Skipped 23 previous similar messages
[ 7806.066639] Lustre: 11899:0:(service.c:1346:ptlrpc_at_send_early_reply()) @@@ Couldn&apos;t add any time (4/4), not sending early reply
  req@ffff880069229c50 x1589224788689760/t0(0) o4-&amp;gt;1a4123cb-0667-4fc1-a984-578de9d574bd@10.2.8.124@tcp:196/0 lens 608/448 e 1 to 0 dl 1515610601 ref 2 fl Interpret:/2/0 rc 0/0
[ 7810.960502] Lustre: lustre-OST0000: Client 1a4123cb-0667-4fc1-a984-578de9d574bd (at 10.2.8.124@tcp) reconnecting
[ 7810.961613] Lustre: lustre-OST0000: Connection restored to 1bcb3ced-3143-3f53-430e-80a587c85a02 (at 10.2.8.124@tcp)

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="218100" author="bogl" created="Fri, 12 Jan 2018 14:44:29 +0000"  >&lt;p&gt;another on master:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/68134712-f759-11e7-a7cd-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/68134712-f759-11e7-a7cd-52540065bddc&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="218259" author="jhammond" created="Mon, 15 Jan 2018 17:00:05 +0000"  >&lt;p&gt;Upstream zfsonlinux issue to track message improvement: &lt;a href=&quot;https://github.com/zfsonlinux/zfs/issues/7045&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/zfsonlinux/zfs/issues/7045&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="218365" author="adilger" created="Tue, 16 Jan 2018 22:57:29 +0000"  >&lt;p&gt;Digging through the ZFS code a bit, I see there is a &lt;tt&gt;zfs_multihost_history&lt;/tt&gt; tunable (default = 0) that can be set to track the history of MMP &#252;berblock writes.  If the patches on &lt;a href=&quot;https://github.com/zfsonlinux/zfs/issues/7045&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/zfsonlinux/zfs/issues/7045&lt;/a&gt; do not solve the problem, it might be useful to collect the &lt;tt&gt;multihost&lt;/tt&gt; stats in case of a test timeout/failure to see if there is some useful information (e.g. whether &lt;tt&gt;mmp_delay&lt;/tt&gt; is increasing steadily or if it is a step function, what &lt;tt&gt;mmp_delay &amp;#42; vdev_count&lt;/tt&gt; is vs. &lt;tt&gt;mmp_interval&lt;/tt&gt;, etc).&lt;/p&gt;</comment>
                            <comment id="218366" author="behlendorf" created="Tue, 16 Jan 2018 23:42:21 +0000"  >&lt;p&gt;Yes, it would be great if you could enable the multihost history so we can get a clear idea of exactly what&apos;s happening with the delay. You&apos;ll want to set zfs_multihost_history=100, where 100 means keep a history of the last 100 mmp writes. You can then dump the log by reading /proc/spl/kstat/zfs/&amp;lt;pool&amp;gt;/multihost.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;echo 100 &amp;gt;/sys/module/zfs/parameters/zfs_multihost_history
cat /proc/spl/kstat/zfs/jet22-1/multihost
38 0 0x01 13 832 12020281026402 14414698950019
txg        timestamp  mmp_delay    vdev_guid                vdev_label vdev_path
...
28271769   1516145833 102524916    13587132990859797684     1          /dev/disk/by-vdev/L1
28271769   1516145833 102498562    18032937067254868643     0          /dev/disk/by-vdev/L56
28271769   1516145833 102470510    18032937067254868643     0          /dev/disk/by-vdev/L56
28271769   1516145833 102443525    18235850673591164513     1          /dev/disk/by-vdev/L15
28271769   1516145833 102435221    8353277438441160016      2          /dev/disk/by-vdev/L28


&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="218755" author="yujian" created="Sat, 20 Jan 2018 18:03:02 +0000"  >&lt;p&gt;+1 on master branch:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/72655ed0-fcef-11e7-bd00-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/72655ed0-fcef-11e7-bd00-52540065bddc&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="218830" author="mdiep" created="Mon, 22 Jan 2018 18:37:57 +0000"  >&lt;p&gt;+1 on b2_10&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sessions/d2abdea2-15b2-43c2-a1aa-d9e3a8d322c6&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sessions/d2abdea2-15b2-43c2-a1aa-d9e3a8d322c6&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="218928" author="mdiep" created="Tue, 23 Jan 2018 18:33:41 +0000"  >&lt;p&gt;FYI, we just added below params in our infrastructure.&lt;/p&gt;

&lt;p&gt;options zfs zfs_multihost_history=100&lt;br/&gt;
options zfs zfs_multihost_fail_intervals=20&lt;/p&gt;
</comment>
                            <comment id="219221" author="yong.fan" created="Fri, 26 Jan 2018 02:56:13 +0000"  >&lt;p&gt;+1 on master:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/df6be592-01e7-11e8-bd00-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/df6be592-01e7-11e8-bd00-52540065bddc&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="221068" author="yujian" created="Thu, 15 Feb 2018 07:55:44 +0000"  >&lt;p&gt;sanity test 124a hit this failure on master branch:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/7d550360-11f6-11e8-a7cd-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/7d550360-11f6-11e8-a7cd-52540065bddc&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="224081" author="yujian" created="Wed, 21 Mar 2018 04:32:17 +0000"  >&lt;p&gt;+1 on master branch:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/db7af22c-2c6a-11e8-b3c6-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/db7af22c-2c6a-11e8-b3c6-52540065bddc&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="229202" author="joe.grund" created="Wed, 6 Jun 2018 11:29:31 +0000"  >&lt;p&gt;Looks like we saw this in IML testing: &lt;a href=&quot;https://github.com/intel-hpdd/intel-manager-for-lustre/issues/645&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/intel-hpdd/intel-manager-for-lustre/issues/645&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="233485" author="utopiabound" created="Thu, 13 Sep 2018 21:33:26 +0000"  >&lt;p&gt;With the upgrade to ZFS 0.7.9 and the increase in the multihost fail interval, this is no longer an issue.&lt;/p&gt;</comment>
                            <comment id="243758" author="jamesanunez" created="Tue, 12 Mar 2019 19:37:51 +0000"  >&lt;p&gt;We&apos;re seeing this error message again with ZFS testing. Looking at a recent 2.10.7 RC1 test session using ZFS 0.7.9-1, &lt;a href=&quot;https://testing.whamcloud.com/test_sessions/a35ede3e-36a0-4a9f-92f6-e22a3cda1bec&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sessions/a35ede3e-36a0-4a9f-92f6-e22a3cda1bec&lt;/a&gt; , we see this error in the OST (vm3) console for ost-pools test 23b&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;============================================================= 23:14:20 \(1552259660\)
[14220.079856] Lustre: DEBUG MARKER: == ost-pools test 23b: OST pools and OOS ============================================================= 23:14:20 (1552259660)
[14414.872497] WARNING: MMP writes to pool &apos;lustre-ost4&apos; have not succeeded in over 20s; suspending pool
[14414.874142] WARNING: Pool &apos;lustre-ost4&apos; has encountered an uncorrectable I/O failure and has been suspended.
[14414.874142] 
[14414.878399] WARNING: MMP writes to pool &apos;lustre-ost8&apos; have not succeeded in over 20s; suspending pool
[14414.880115] WARNING: Pool &apos;lustre-ost8&apos; has encountered an uncorrectable I/O failure and has been suspended.
[14414.880115] 
[14415.175128] WARNING: MMP writes to pool &apos;lustre-ost7&apos; have not succeeded in over 20s; suspending pool
[14415.176759] WARNING: Pool &apos;lustre-ost7&apos; has encountered an uncorrectable I/O failure and has been suspended.
[14415.176759] 
[14415.251535] WARNING: MMP writes to pool &apos;lustre-ost2&apos; have not succeeded in over 20s; suspending pool
[14415.253116] WARNING: Pool &apos;lustre-ost2&apos; has encountered an uncorrectable I/O failure and has been suspended.
[14415.253116] 
[14465.182431] LNet: Service thread pid 16167 was inactive for 78.05s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:
[14465.185338] Pid: 16167, comm: ll_ost_io00_006 3.10.0-957.1.3.el7_lustre.x86_64 #1 SMP Fri Mar 8 06:17:38 UTC 2019
[14465.187046] Call Trace:
[14465.187545]  [&amp;lt;ffffffffc03ae2b5&amp;gt;] cv_wait_common+0x125/0x150 [spl]
[14465.188787]  [&amp;lt;ffffffffc03ae2f5&amp;gt;] __cv_wait+0x15/0x20 [spl]
[14465.189791]  [&amp;lt;ffffffffc0505c0f&amp;gt;] txg_wait_synced+0xef/0x140 [zfs]
[14465.190919]  [&amp;lt;ffffffffc04bba15&amp;gt;] dmu_tx_wait+0x275/0x3c0 [zfs]
[14465.191992]  [&amp;lt;ffffffffc04bbbf1&amp;gt;] dmu_tx_assign+0x91/0x490 [zfs]
[14465.193078]  [&amp;lt;ffffffffc1123f57&amp;gt;] osd_trans_start+0xa7/0x3a0 [osd_zfs]
[14465.194216]  [&amp;lt;ffffffffc124e31e&amp;gt;] ofd_trans_start+0x6e/0xf0 [ofd]
[14465.195327]  [&amp;lt;ffffffffc125482b&amp;gt;] ofd_commitrw_write+0x94b/0x1c90 [ofd]
[14465.196500]  [&amp;lt;ffffffffc1258879&amp;gt;] ofd_commitrw+0x4c9/0xae0 [ofd]
[14465.197563]  [&amp;lt;ffffffffc0f8fd84&amp;gt;] obd_commitrw+0x2f3/0x336 [ptlrpc]
[14465.198979]  [&amp;lt;ffffffffc0f627cd&amp;gt;] tgt_brw_write+0xffd/0x17d0 [ptlrpc]
[14465.200198]  [&amp;lt;ffffffffc0f5e7ca&amp;gt;] tgt_request_handle+0x92a/0x1370 [ptlrpc]
[14465.201458]  [&amp;lt;ffffffffc0f0705b&amp;gt;] ptlrpc_server_handle_request+0x23b/0xaa0 [ptlrpc]
[14465.202814]  [&amp;lt;ffffffffc0f0a7a2&amp;gt;] ptlrpc_main+0xa92/0x1e40 [ptlrpc]
[14465.203954]  [&amp;lt;ffffffff92ac1c31&amp;gt;] kthread+0xd1/0xe0
[14465.204831]  [&amp;lt;ffffffff93174c37&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
[14465.205926]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
[14465.206820] LustreError: dumping log to /tmp/lustre-log.1552259905.16167
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Do you think this is a new issue?&lt;/p&gt;

&lt;p&gt;Other failures for the same test, ost-pools test 23b hanging, this year are at&lt;br/&gt;
09 March - &lt;a href=&quot;https://testing.whamcloud.com/test_sets/cbf1b00c-4280-11e9-92fe-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/cbf1b00c-4280-11e9-92fe-52540065bddc&lt;/a&gt;&lt;br/&gt;
28 Feb - &lt;a href=&quot;https://testing.whamcloud.com/test_sets/ed47664a-3bc1-11e9-b98a-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/ed47664a-3bc1-11e9-b98a-52540065bddc&lt;/a&gt;&lt;br/&gt;
17 Jan - &lt;a href=&quot;https://testing.whamcloud.com/test_sets/e2791e6a-1a93-11e9-b7d4-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/e2791e6a-1a93-11e9-b7d4-52540065bddc&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="243832" author="adilger" created="Wed, 13 Mar 2019 10:23:51 +0000"  >&lt;p&gt;A patch has been submitted to upstream ZFS master that increases the MMP interval/retry interval, as well as improve other aspects of the code in &lt;a href=&quot;https://github.com/zfsonlinux/zfs/issues/7709&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/zfsonlinux/zfs/issues/7709&lt;/a&gt; and a smaller patch to just update the interval/retry count that seems suitable for inclusion into 0.7.x as &lt;a href=&quot;https://github.com/zfsonlinux/zfs/pull/8495&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/zfsonlinux/zfs/pull/8495&lt;/a&gt; .&lt;/p&gt;</comment>
                            <comment id="244056" author="adilger" created="Sat, 16 Mar 2019 01:33:36 +0000"  >&lt;p&gt;If this is becoming really problematic, we could disable MMP in our VM test configurations by setting &quot;&lt;tt&gt;zfs_multihost_fail_intervals=0&lt;/tt&gt;&quot;, but that should be a last resort as it will make our testing less representative of real deployments.  At least the MMP feature would still be enabled and running, but a timeout would not take the pool offline.&lt;/p&gt;</comment>
                            <comment id="244319" author="mdiep" created="Wed, 20 Mar 2019 15:59:52 +0000"  >&lt;p&gt;+1 on b2_12 &lt;a href=&quot;https://testing.whamcloud.com/test_sessions/e73dd201-fae1-4123-92ab-1961d681f34e&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sessions/e73dd201-fae1-4123-92ab-1961d681f34e&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="246110" author="adilger" created="Sat, 20 Apr 2019 15:34:13 +0000"  >&lt;p&gt;+1 on master conf-sanity test_32c:&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/test_sets/ef12e78e-6343-11e9-aeec-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/ef12e78e-6343-11e9-aeec-52540065bddc&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="247064" author="utopiabound" created="Mon, 13 May 2019 15:10:44 +0000"  >&lt;p&gt;I&apos;ve asked that Autotest bump &lt;tt&gt;zfs_multihost_fail_intervals&lt;/tt&gt; to 60 which is twice the default SCSI timeout in ATM-827&lt;/p&gt;</comment>
                            <comment id="247549" author="utopiabound" created="Wed, 22 May 2019 20:16:29 +0000"  >&lt;p&gt;&lt;tt&gt;zfs_multihost_fail_intervals&lt;/tt&gt; has been bumped to 60 seconds in autotest via ATM-827.&lt;/p&gt;

&lt;p&gt;This should address the zpool MMP timeouts.&lt;/p&gt;</comment>
                            <comment id="247565" author="ieelusername" created="Thu, 23 May 2019 01:54:33 +0000"  >&lt;p&gt;Hi Nathaniel&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160;Does it cause zpool import too slow in another pair OSS ?&#160;&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160;I mean HA switch have too wait a long time in 100+ SAS device env.&#160;&lt;/p&gt;</comment>
                            <comment id="247583" author="utopiabound" created="Thu, 23 May 2019 12:15:25 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=ieelusername&quot; class=&quot;user-hover&quot; rel=&quot;ieelusername&quot;&gt;ieelusername&lt;/a&gt;,&lt;/p&gt;

&lt;p&gt;This setting is for our test environment only.&#160; Since they are VMs they can be pretty heavily I/O contended on underlying host.&lt;/p&gt;</comment>
                            <comment id="247628" author="ieelusername" created="Fri, 24 May 2019 08:06:06 +0000"  >&lt;p&gt;Hi Nathaniel&lt;/p&gt;

&lt;p&gt;&#160; &#160; &#160;Oh, Got it.&lt;/p&gt;</comment>
                            <comment id="252676" author="sarah" created="Wed, 7 Aug 2019 16:25:25 +0000"  >&lt;p&gt;Hit the same problem when doing soak on b2_12-ib build #31&lt;/p&gt;

&lt;p&gt;soak-7 console shows&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;WARNING: MMP writes to pool &apos;soaked-ost11&apos; have not succeeded in over 5s; suspending pool
WARNING: Pool &apos;soaked-ost11&apos; has encountered an uncorrectable I/O failure and has been suspended.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="278314" author="adilger" created="Fri, 28 Aug 2020 20:06:04 +0000"  >&lt;blockquote&gt;
&lt;p&gt;Hit the same problem when doing soak on b2_12-ib build #31&lt;/p&gt;

&lt;p&gt;soak-7 console shows&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;WARNING: MMP writes to pool &apos;soaked-ost11&apos; have not succeeded in over 5s; suspending pool
WARNING: Pool &apos;soaked-ost11&apos; has encountered an uncorrectable I/O failure and has been suspended.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;&lt;/blockquote&gt;
&lt;p&gt;It looks like the &lt;tt&gt;zfs_multihost_fail_intervals=60&lt;/tt&gt; was not set, if the suspend happened after only 5s.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="49626">LU-10342</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="51462">LU-10832</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="52907">LU-11217</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="55605">LU-12281</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="48311">LU-9991</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="48710">LU-10109</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="50477">LU-10585</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="49997">LU-10441</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="39872">LU-8619</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="52004">LU-10956</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="55547">LU-12258</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="54107">LU-11704</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="55860">LU-12393</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="28897" name="mds02_faulted.log" size="20942" author="kim.sebo" created="Sat, 9 Dec 2017 06:48:56 +0000"/>
                            <attachment id="28730" name="zdb-lustre-ost1.log" size="76676" author="utopiabound" created="Wed, 15 Nov 2017 17:00:38 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzzi1z:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>