<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:31:14 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary, append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-10009] sanity-benchmark test_iozone: test failed to respond and timed out</title>
                <link>https://jira.whamcloud.com/browse/LU-10009</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This issue was created by maloo for sarah_lw &amp;lt;wei3.liu@intel.com&amp;gt;&lt;/p&gt;

&lt;p&gt;This issue relates to the following test suite run: &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/fd03f350-9c9e-11e7-ba27-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/fd03f350-9c9e-11e7-ba27-5254006e85c2&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;The sub-test test_iozone failed with the following error:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;test failed to respond and timed out
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;server and client: RHEL7.4 zfs&lt;/p&gt;

&lt;p&gt;MDS dmesg shows&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[25904.206622] osp-syn-0-0     D 0000000000000000     0 23127      2 0x00000080
[25904.208389]  ffff88005c5976a0 0000000000000046 ffff88004e28eeb0 ffff88005c597fd8
[25904.210163]  ffff88005c597fd8 ffff88005c597fd8 ffff88004e28eeb0 ffff8800608452f8
[25904.211921]  ffff880060845240 ffff880060845268 ffff880060845300 0000000000000000
[25904.213716] Call Trace:
[25904.214985]  [&amp;lt;ffffffff816a94c9&amp;gt;] schedule+0x29/0x70
[25904.216575]  [&amp;lt;ffffffffc07144d5&amp;gt;] cv_wait_common+0x125/0x150 [spl]
[25904.218157]  [&amp;lt;ffffffff810b1910&amp;gt;] ? wake_up_atomic_t+0x30/0x30
[25904.219805]  [&amp;lt;ffffffffc0714515&amp;gt;] __cv_wait+0x15/0x20 [spl]
[25904.221433]  [&amp;lt;ffffffffc086317f&amp;gt;] txg_wait_synced+0xef/0x140 [zfs]
[25904.223020]  [&amp;lt;ffffffffc0818a75&amp;gt;] dmu_tx_wait+0x275/0x3c0 [zfs]
[25904.224676]  [&amp;lt;ffffffffc0818c51&amp;gt;] dmu_tx_assign+0x91/0x490 [zfs]
[25904.226256]  [&amp;lt;ffffffffc0c3ad00&amp;gt;] ? llog_osd_declare_destroy+0x2f0/0x640 [obdclass]
[25904.228018]  [&amp;lt;ffffffffc1085efa&amp;gt;] osd_trans_start+0xaa/0x3c0 [osd_zfs]
[25904.229710]  [&amp;lt;ffffffffc0c278c7&amp;gt;] llog_cancel_rec+0x147/0x870 [obdclass]
[25904.231414]  [&amp;lt;ffffffffc0c2e33a&amp;gt;] llog_cat_cancel_records+0x13a/0x2e0 [obdclass]
[25904.233232]  [&amp;lt;ffffffffc0e4b8a0&amp;gt;] ? lustre_swab_niobuf_remote+0x30/0x30 [ptlrpc]
[25904.234923]  [&amp;lt;ffffffffc13c57f3&amp;gt;] osp_sync_process_committed+0x213/0x6c0 [osp]
[25904.236691]  [&amp;lt;ffffffffc13c6bd6&amp;gt;] osp_sync_process_queues+0x556/0x2010 [osp]
[25904.238371]  [&amp;lt;ffffffff810c4810&amp;gt;] ? wake_up_state+0x20/0x20
[25904.239978]  [&amp;lt;ffffffffc0c28595&amp;gt;] llog_process_thread+0x5a5/0x1180 [obdclass]
[25904.241718]  [&amp;lt;ffffffffc13c6680&amp;gt;] ? osp_sync_thread+0x9e0/0x9e0 [osp]
[25904.243404]  [&amp;lt;ffffffffc0c2922c&amp;gt;] llog_process_or_fork+0xbc/0x450 [obdclass]
[25904.245050]  [&amp;lt;ffffffffc0c2e91d&amp;gt;] llog_cat_process_cb+0x43d/0x4e0 [obdclass]
[25904.246794]  [&amp;lt;ffffffffc0c28595&amp;gt;] llog_process_thread+0x5a5/0x1180 [obdclass]
[25904.248521]  [&amp;lt;ffffffff810ce8d8&amp;gt;] ? check_preempt_wakeup+0x148/0x250
[25904.250109]  [&amp;lt;ffffffffc0c2e4e0&amp;gt;] ? llog_cat_cancel_records+0x2e0/0x2e0 [obdclass]
[25904.251906]  [&amp;lt;ffffffffc0c2922c&amp;gt;] llog_process_or_fork+0xbc/0x450 [obdclass]
[25904.253600]  [&amp;lt;ffffffffc0c2e4e0&amp;gt;] ? llog_cat_cancel_records+0x2e0/0x2e0 [obdclass]
[25904.255390]  [&amp;lt;ffffffffc0c2daa9&amp;gt;] llog_cat_process_or_fork+0x199/0x2a0 [obdclass]
[25904.257072]  [&amp;lt;ffffffff810c4822&amp;gt;] ? default_wake_function+0x12/0x20
[25904.258766]  [&amp;lt;ffffffff810ba588&amp;gt;] ? __wake_up_common+0x58/0x90
[25904.260355]  [&amp;lt;ffffffffc13c6680&amp;gt;] ? osp_sync_thread+0x9e0/0x9e0 [osp]
[25904.262039]  [&amp;lt;ffffffffc0c2dbde&amp;gt;] llog_cat_process+0x2e/0x30 [obdclass]
[25904.263683]  [&amp;lt;ffffffffc13c5ea8&amp;gt;] osp_sync_thread+0x208/0x9e0 [osp]
[25904.265347]  [&amp;lt;ffffffff81029557&amp;gt;] ? __switch_to+0xd7/0x510
[25904.266851]  [&amp;lt;ffffffffc13c5ca0&amp;gt;] ? osp_sync_process_committed+0x6c0/0x6c0 [osp]
[25904.268620]  [&amp;lt;ffffffff810b098f&amp;gt;] kthread+0xcf/0xe0
[25904.270093]  [&amp;lt;ffffffff810b08c0&amp;gt;] ? insert_kthread_work+0x40/0x40
[25904.271736]  [&amp;lt;ffffffff816b4f18&amp;gt;] ret_from_fork+0x58/0x90
[25904.273300]  [&amp;lt;ffffffff810b08c0&amp;gt;] ? insert_kthread_work+0x40/0x40

...

[25904.821292] mdt_rdpg00_002  D ffff880060845300     0 24274      2 0x00000080
[25904.822912]  ffff88005a573880 0000000000000046 ffff88006b1b8fd0 ffff88005a573fd8
[25904.824661]  ffff88005a573fd8 ffff88005a573fd8 ffff88006b1b8fd0 ffff8800608452f8
[25904.826407]  ffff880060845240 ffff880060845268 ffff880060845300 0000000000000000
[25904.828129] Call Trace:
[25904.829349]  [&amp;lt;ffffffff816a94c9&amp;gt;] schedule+0x29/0x70
[25904.830842]  [&amp;lt;ffffffffc07144d5&amp;gt;] cv_wait_common+0x125/0x150 [spl]
[25904.832434]  [&amp;lt;ffffffff810b1910&amp;gt;] ? wake_up_atomic_t+0x30/0x30
[25904.833928]  [&amp;lt;ffffffffc0714515&amp;gt;] __cv_wait+0x15/0x20 [spl]
[25904.835494]  [&amp;lt;ffffffffc086317f&amp;gt;] txg_wait_synced+0xef/0x140 [zfs]
[25904.837084]  [&amp;lt;ffffffffc0818a75&amp;gt;] dmu_tx_wait+0x275/0x3c0 [zfs]
[25904.838670]  [&amp;lt;ffffffffc0818c51&amp;gt;] dmu_tx_assign+0x91/0x490 [zfs]
[25904.840246]  [&amp;lt;ffffffffc1085efa&amp;gt;] osd_trans_start+0xaa/0x3c0 [osd_zfs]
[25904.841792]  [&amp;lt;ffffffffc103a128&amp;gt;] qmt_trans_start_with_slv+0x248/0x530 [lquota]
[25904.843472]  [&amp;lt;ffffffffc1033196&amp;gt;] qmt_dqacq0+0x1a6/0xf00 [lquota]
[25904.845069]  [&amp;lt;ffffffffc0e4a2df&amp;gt;] ? lustre_pack_reply_flags+0x6f/0x1e0 [ptlrpc]
[25904.846753]  [&amp;lt;ffffffffc1036b21&amp;gt;] qmt_intent_policy+0x831/0xe50 [lquota]
[25904.848387]  [&amp;lt;ffffffffc12207c2&amp;gt;] mdt_intent_policy+0x662/0xc70 [mdt]
[25904.849991]  [&amp;lt;ffffffffc0e0112f&amp;gt;] ? ldlm_resource_get+0x9f/0xa30 [ptlrpc]
[25904.851631]  [&amp;lt;ffffffffc0dfa2b7&amp;gt;] ldlm_lock_enqueue+0x387/0x970 [ptlrpc]
[25904.853270]  [&amp;lt;ffffffffc0e23c23&amp;gt;] ldlm_handle_enqueue0+0x9c3/0x1680 [ptlrpc]
[25904.854932]  [&amp;lt;ffffffffc0e4be90&amp;gt;] ? lustre_swab_ldlm_lock_desc+0x30/0x30 [ptlrpc]
[25904.856589]  [&amp;lt;ffffffffc0ea9182&amp;gt;] tgt_enqueue+0x62/0x210 [ptlrpc]
[25904.858208]  [&amp;lt;ffffffffc0ead085&amp;gt;] tgt_request_handle+0x925/0x1370 [ptlrpc]
[25904.859868]  [&amp;lt;ffffffffc0e55ec6&amp;gt;] ptlrpc_server_handle_request+0x236/0xa90 [ptlrpc]
[25904.861533]  [&amp;lt;ffffffff810ba588&amp;gt;] ? __wake_up_common+0x58/0x90
[25904.863131]  [&amp;lt;ffffffffc0e59602&amp;gt;] ptlrpc_main+0xa92/0x1e40 [ptlrpc]
[25904.864765]  [&amp;lt;ffffffffc0e58b70&amp;gt;] ? ptlrpc_register_service+0xe30/0xe30 [ptlrpc]
[25904.866473]  [&amp;lt;ffffffff810b098f&amp;gt;] kthread+0xcf/0xe0
[25904.867963]  [&amp;lt;ffffffff810b08c0&amp;gt;] ? insert_kthread_work+0x40/0x40
[25904.869514]  [&amp;lt;ffffffff816b4f18&amp;gt;] ret_from_fork+0x58/0x90
[25904.871056]  [&amp;lt;ffffffff810b08c0&amp;gt;] ? insert_kthread_work+0x40/0x40
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Info required for matching: sanity-benchmark iozone&lt;/p&gt;</description>
                <environment></environment>
        <key id="48383">LU-10009</key>
            <summary>sanity-benchmark test_iozone: test failed to respond and timed out</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="wc-triage">WC Triage</assignee>
                                    <reporter username="maloo">Maloo</reporter>
                        <labels>
                    </labels>
                <created>Tue, 19 Sep 2017 21:00:06 +0000</created>
                <updated>Thu, 16 Jan 2020 00:42:12 +0000</updated>
                                            <version>Lustre 2.10.1</version>
                    <version>Lustre 2.12.4</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>4</watches>
                                                                            <comments>
                            <comment id="208810" author="sarah" created="Tue, 19 Sep 2017 21:02:25 +0000"  >&lt;p&gt;This failure has been seen on multiple tests&lt;br/&gt;
sanity-hsm test_24a also shows similar trace on MDS&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/fdc369ec-9c9e-11e7-ba27-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/fdc369ec-9c9e-11e7-ba27-5254006e85c2&lt;/a&gt;&lt;br/&gt;
performance-sanity&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/0083a0c0-9c9f-11e7-ba27-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/0083a0c0-9c9f-11e7-ba27-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="209105" author="adilger" created="Thu, 21 Sep 2017 17:40:32 +0000"  >&lt;p&gt;It looks like these threads are all stuck in the ZFS transaction commit code, waiting for the txg_sync thread to finish committing the transaction.&lt;/p&gt;

&lt;p&gt;How consistently is this problem reproduced (e.g. 1 in 5 runs, 1 in 10, ...)?  If the problem can be reproduced fairly consistently, then I would recommend trying to see whether it can be reproduced with ZFS 0.6.5.9 instead of ZFS 0.7.1, since it may very well be a problem in the ZFS code rather than in Lustre, or at least a bad interaction with how Lustre is using ZFS.&lt;/p&gt;

&lt;p&gt;To test this, change &lt;tt&gt;SPLZFSVER&lt;/tt&gt; in the lbuild script to &quot;0.6.5.9&quot; and then submit the patch with a &lt;tt&gt;Test-Parameters:&lt;/tt&gt; line that runs the test 10 times (or however many runs are needed to reproduce it consistently).&lt;/p&gt;</comment>
                            <comment id="209131" author="sarah" created="Thu, 21 Sep 2017 20:34:46 +0000"  >&lt;p&gt;Does this relate to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9247&quot; title=&quot;replay-ost-single test_5: test failed to respond and timed out&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9247&quot;&gt;&lt;del&gt;LU-9247&lt;/del&gt;&lt;/a&gt;?  &lt;br/&gt;
Regarding consistency: taking the results from 2.10.1 RC1 (DNE and non-DNE ZFS), the sanity-benchmark, performance-sanity and obdfilter-survey runs each hit either &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9247&quot; title=&quot;replay-ost-single test_5: test failed to respond and timed out&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9247&quot;&gt;&lt;del&gt;LU-9247&lt;/del&gt;&lt;/a&gt; or this one.  Also, searching all sanity-benchmark runs over the last 4 weeks on b2_10, this one was hit in 2 of 7 runs and &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9247&quot; title=&quot;replay-ost-single test_5: test failed to respond and timed out&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9247&quot;&gt;&lt;del&gt;LU-9247&lt;/del&gt;&lt;/a&gt; was hit in 3 of 7 runs.&lt;/p&gt;

&lt;p&gt;I will make a patch to test with ZFS 0.6.5.9.&lt;/p&gt;

</comment>
                            <comment id="261280" author="jamesanunez" created="Wed, 15 Jan 2020 17:57:03 +0000"  >&lt;p&gt;This might require a new ticket, but ... I have a new case that looks like what is described here. For Lustre (future) 2.12.4 at &lt;a href=&quot;https://testing.whamcloud.com/test_sets/4c5e02d0-349e-11ea-b0f4-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/4c5e02d0-349e-11ea-b0f4-52540065bddc&lt;/a&gt; for ZFS with DNE, we see test_bonnie hang, but also errors during test_dbench. &lt;/p&gt;

&lt;p&gt;Looking at the console logs for client1 (vm1), we see the dbench processes hung, all with the same traces:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[28504.786302] Lustre: DEBUG MARKER: == sanity-benchmark test dbench: dbench ============================================================== 17:07:28 (1578676048)
[29239.794475] LNet: 13710:0:(debug.c:370:libcfs_debug_str2mask()) You are trying to use a numerical value for the mask - this will be deprecated in a future release.
[29510.651244] INFO: task dbench:13724 blocked for more than 120 seconds.
[29510.652635] &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
[29510.653990] dbench          D ffff9d4dbb0362a0     0 13724  13723 0x00000080
[29510.655393] Call Trace:
[29510.655905]  [&amp;lt;ffffffffa637f229&amp;gt;] schedule+0x29/0x70
[29510.656775]  [&amp;lt;ffffffffa637cbb1&amp;gt;] schedule_timeout+0x221/0x2d0
[29510.658160]  [&amp;lt;ffffffffc0b4ec60&amp;gt;] ? lustre_swab_niobuf_remote+0x30/0x30 [ptlrpc]
[29510.659430]  [&amp;lt;ffffffffa637f5dd&amp;gt;] wait_for_completion+0xfd/0x140
[29510.660590]  [&amp;lt;ffffffffa5cda0b0&amp;gt;] ? wake_up_state+0x20/0x20
[29510.661793]  [&amp;lt;ffffffffc0d1caf4&amp;gt;] osc_io_fsync_end+0x74/0xa0 [osc]
[29510.663143]  [&amp;lt;ffffffffc097bc80&amp;gt;] cl_io_end+0x60/0x150 [obdclass]
[29510.664277]  [&amp;lt;ffffffffc0d6e0ab&amp;gt;] lov_io_end_wrapper+0xdb/0xe0 [lov]
[29510.665413]  [&amp;lt;ffffffffc0d6e380&amp;gt;] lov_io_fsync_end+0x80/0x1b0 [lov]
[29510.666552]  [&amp;lt;ffffffffc097bc80&amp;gt;] cl_io_end+0x60/0x150 [obdclass]
[29510.667664]  [&amp;lt;ffffffffc097e39a&amp;gt;] cl_io_loop+0xda/0x1c0 [obdclass]
[29510.668887]  [&amp;lt;ffffffffc0ee4dfb&amp;gt;] cl_sync_file_range+0x2db/0x380 [lustre]
[29510.670094]  [&amp;lt;ffffffffc0ee5129&amp;gt;] ll_fsync+0x289/0x490 [lustre]
[29510.671181]  [&amp;lt;ffffffffa5e7d9f7&amp;gt;] do_fsync+0x67/0xb0
[29510.672102]  [&amp;lt;ffffffffa638ce15&amp;gt;] ? system_call_after_swapgs+0xa2/0x146
[29510.673232]  [&amp;lt;ffffffffa5e7dce0&amp;gt;] SyS_fsync+0x10/0x20
[29510.674238]  [&amp;lt;ffffffffa638cede&amp;gt;] system_call_fastpath+0x25/0x2a
[29510.675317]  [&amp;lt;ffffffffa638ce21&amp;gt;] ? system_call_after_swapgs+0xae/0x146
[29510.676491] INFO: task dbench:13731 blocked for more than 120 seconds.
[29510.677654] &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
[29510.679025] dbench          D ffff9d4dbbe341c0     0 13731  13723 0x00000080
[29510.680320] Call Trace:
[29510.680788]  [&amp;lt;ffffffffa637f229&amp;gt;] schedule+0x29/0x70
&#8230;
[29510.699317]  [&amp;lt;ffffffffa638ce21&amp;gt;] ? system_call_after_swapgs+0xae/0x146
[29510.700579] INFO: task dbench:13732 blocked for more than 120 seconds.
[29510.701739] &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
[29510.703114] dbench          D ffff9d4dbbe30000     0 13732  13723 0x00000080
[29510.704412] Call Trace:
[29510.704859]  [&amp;lt;ffffffffa637f229&amp;gt;] schedule+0x29/0x70
&#8230;
[29510.723527]  [&amp;lt;ffffffffa638ce21&amp;gt;] ? system_call_after_swapgs+0xae/0x146
[29510.724697] INFO: task dbench:13739 blocked for more than 120 seconds.
[29510.725860] &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
[29510.727220] dbench          D ffff9d4da6735230     0 13739  13723 0x00000080
[29510.728500] Call Trace:
[29510.729054]  [&amp;lt;ffffffffa637f229&amp;gt;] schedule+0x29/0x70
&#8230;
[29510.747529]  [&amp;lt;ffffffffa638ce21&amp;gt;] ? system_call_after_swapgs+0xae/0x146
[29510.748711] INFO: task dbench:13741 blocked for more than 120 seconds.
[29510.749927] &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
[29510.751293] dbench          D ffff9d4da67341c0     0 13741  13723 0x00000080
[29510.752583] Call Trace:
[29510.753102]  [&amp;lt;ffffffffa637f229&amp;gt;] schedule+0x29/0x70
&#8230;
[29510.771672]  [&amp;lt;ffffffffa638ce21&amp;gt;] ? system_call_after_swapgs+0xae/0x146
[30016.551909] LNet: 14222:0:(debug.c:370:libcfs_debug_str2mask()) You are trying to use a numerical value for the mask - this will be deprecated in a future release.
[30017.592350] Lustre: DEBUG MARKER: lctl set_param -n fail_loc=0 	    fail_val=0 2&amp;gt;/dev/null
[30017.997077] Lustre: DEBUG MARKER: rc=0; val=$(/usr/sbin/lctl get_param -n catastrophe 2&amp;gt;&amp;amp;1); if [[ $? -eq 0 &amp;amp;&amp;amp; $val -ne 0 ]]; then echo $(hostname -s): $val; rc=$val; fi; exit $rc
[30019.791258] Lustre: DEBUG MARKER: dmesg
[30020.607155] Lustre: DEBUG MARKER: /usr/sbin/lctl mark == sanity-benchmark test bonnie: bonnie++ ============================================================ 17:32:45 \(1578677565\)
[30020.885290] Lustre: DEBUG MARKER: == sanity-benchmark test bonnie: bonnie++ ============================================================ 17:32:45 (1578677565)
[30022.115531] Lustre: DEBUG MARKER: /usr/sbin/lctl mark min OST has 1889280kB available, using 3438440kB file size
[30022.397509] Lustre: DEBUG MARKER: min OST has 1889280kB available, using 3438440kB file size
[31417.232936] Lustre: 3163:0:(client.c:376:ptlrpc_at_adj_net_latency()) Reported service time 297 &amp;gt; total measured time 9
[31417.257892] Lustre: 3163:0:(client.c:376:ptlrpc_at_adj_net_latency()) Reported service time 297 &amp;gt; total measured time 9
[31418.333175] Lustre: 3163:0:(client.c:376:ptlrpc_at_adj_net_latency()) Reported service time 290 &amp;gt; total measured time 2
&#8230;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;In the console logs of the OSS, we see inactive threads while running both the dbench and bonnie tests:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[28373.743227] Lustre: DEBUG MARKER: == sanity-benchmark test dbench: dbench ============================================================== 17:07:28 (1578676048)
[29151.759167] LNet: Service thread pid 30219 was inactive for 40.01s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:
[29151.762111] Pid: 30219, comm: ll_ost_io00_009 3.10.0-1062.1.1.el7_lustre.x86_64 #1 SMP Fri Dec 13 20:09:43 UTC 2019
[29151.763845] Call Trace:
[29151.764347]  [&amp;lt;ffffffffc02442d5&amp;gt;] cv_wait_common+0x125/0x150 [spl]
[29151.765484]  [&amp;lt;ffffffffc0244315&amp;gt;] __cv_wait+0x15/0x20 [spl]
[29151.766544]  [&amp;lt;ffffffffc064a503&amp;gt;] txg_wait_open+0xc3/0x110 [zfs]
[29151.768064]  [&amp;lt;ffffffffc05ffdfa&amp;gt;] dmu_tx_wait+0x3aa/0x3c0 [zfs]
[29151.769162]  [&amp;lt;ffffffffc05ffea2&amp;gt;] dmu_tx_assign+0x92/0x490 [zfs]
[29151.770306]  [&amp;lt;ffffffffc10d1fd9&amp;gt;] osd_trans_start+0x199/0x440 [osd_zfs]
[29151.771576]  [&amp;lt;ffffffffc121bbf5&amp;gt;] ofd_trans_start+0x75/0xf0 [ofd]
[29151.772766]  [&amp;lt;ffffffffc1222cb1&amp;gt;] ofd_commitrw_write+0xa31/0x1d40 [ofd]
[29151.774017]  [&amp;lt;ffffffffc122714c&amp;gt;] ofd_commitrw+0x48c/0x9e0 [ofd]
[29151.775172]  [&amp;lt;ffffffffc0efc81c&amp;gt;] obd_commitrw+0x9c/0x370 [ptlrpc]
[29151.776625]  [&amp;lt;ffffffffc0f00cc2&amp;gt;] tgt_brw_write+0xf02/0x1ad0 [ptlrpc]
[29151.777896]  [&amp;lt;ffffffffc0f02b0a&amp;gt;] tgt_request_handle+0xada/0x1570 [ptlrpc]
[29151.779158]  [&amp;lt;ffffffffc0ea746b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
[29151.780513]  [&amp;lt;ffffffffc0eaadd4&amp;gt;] ptlrpc_main+0xb34/0x1470 [ptlrpc]
[29151.781657]  [&amp;lt;ffffffff8f2c50d1&amp;gt;] kthread+0xd1/0xe0
[29151.782580]  [&amp;lt;ffffffff8f98cd37&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
[29151.783703]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
[29151.784681] LustreError: dumping log to /tmp/lustre-log.1578676828.30219
[29170.341906] LNet: Service thread pid 30229 completed after 57.73s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources).
[29174.364066] LNet: Service thread pid 30238 was inactive for 60.85s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:
[29174.367218] Pid: 30238, comm: ll_ost_io00_023 3.10.0-1062.1.1.el7_lustre.x86_64 #1 SMP Fri Dec 13 20:09:43 UTC 2019
[29174.369133] Call Trace:
[29174.369646]  [&amp;lt;ffffffffc02442d5&amp;gt;] cv_wait_common+0x125/0x150 [spl]
&#8230;
[29174.385846]  [&amp;lt;ffffffff8f98cd37&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
[29174.386947]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
[29174.387861] Pid: 30237, comm: ll_ost_io00_022 3.10.0-1062.1.1.el7_lustre.x86_64 #1 SMP Fri Dec 13 20:09:43 UTC 2019
[29174.389604] Call Trace:
[29174.390059]  [&amp;lt;ffffffffc02442d5&amp;gt;] cv_wait_common+0x125/0x150 [spl]
&#8230;
[29174.405701]  [&amp;lt;ffffffff8f98cd37&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
[29174.406812]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
[29174.407694] Pid: 30232, comm: ll_ost_io00_021 3.10.0-1062.1.1.el7_lustre.x86_64 #1 SMP Fri Dec 13 20:09:43 UTC 2019
[29174.409403] Call Trace:
[29174.409865]  [&amp;lt;ffffffffc02442d5&amp;gt;] cv_wait_common+0x125/0x150 [spl]
&#8230;
[29174.425546]  [&amp;lt;ffffffff8f98cd37&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
[29174.426647]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
[29174.427546] Pid: 30230, comm: ll_ost_io00_019 3.10.0-1062.1.1.el7_lustre.x86_64 #1 SMP Fri Dec 13 20:09:43 UTC 2019
[29174.429258] Call Trace:
[29174.429709]  [&amp;lt;ffffffffc02442d5&amp;gt;] cv_wait_common+0x125/0x150 [spl]
&#8230;
[29174.445405]  [&amp;lt;ffffffff8f98cd37&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
[29174.446519]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
[29174.447422] LNet: Service thread pid 30231 was inactive for 61.71s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one.
[29175.811107] LNet: Service thread pid 30239 was inactive for 62.21s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one.
[29175.813367] LNet: Skipped 14 previous similar messages
[29175.814259] LustreError: dumping log to /tmp/lustre-log.1578676852.30239
[29180.860776] LNet: Service thread pid 30245 completed after 66.26s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources).
[29180.863656] LNet: Skipped 1 previous similar message
[29185.479226] LNet: Service thread pid 1697 was inactive for 71.57s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one.
&#8230;
[29250.371484] LNet: Service thread pid 30224 completed after 138.03s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources).
[29250.374371] LNet: Skipped 1 previous similar message
[29886.591943] Lustre: DEBUG MARKER: lctl set_param -n fail_loc=0 	    fail_val=0 2&amp;gt;/dev/null
[29887.500162] Lustre: DEBUG MARKER: rc=0;
[29887.500162] 			val=$(/usr/sbin/lctl get_param -n catastrophe 2&amp;gt;&amp;amp;1);
[29887.500162] 			if [[ $? -eq 0 &amp;amp;&amp;amp; $val -ne 0 ]]; then
[29887.500162] 				echo $(hostname -s): $val;
[29887.500162] 				rc=$val;
[29887.500162] 			fi;
[29887.500162] 			exit $rc
[29888.765447] Lustre: DEBUG MARKER: dmesg
[29889.610387] Lustre: DEBUG MARKER: /usr/sbin/lctl mark == sanity-benchmark test bonnie: bonnie++ ============================================================ 17:32:45 \(1578677565\)
[29889.875373] Lustre: DEBUG MARKER: == sanity-benchmark test bonnie: bonnie++ ============================================================ 17:32:45 (1578677565)
[29891.104509] Lustre: DEBUG MARKER: /usr/sbin/lctl mark min OST has 1889280kB available, using 3438440kB file size
[29891.374921] Lustre: DEBUG MARKER: min OST has 1889280kB available, using 3438440kB file size
[29937.030473] LNet: Service thread pid 30532 was inactive for 40.10s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:
[29937.033411] LNet: Skipped 3 previous similar messages
[29937.034276] Pid: 30532, comm: ll_ost_io00_039 3.10.0-1062.1.1.el7_lustre.x86_64 #1 SMP Fri Dec 13 20:09:43 UTC 2019
[29937.036006] Call Trace:
[29937.036737]  [&amp;lt;ffffffffc02442d5&amp;gt;] cv_wait_common+0x125/0x150 [spl]
[29937.037866]  [&amp;lt;ffffffffc0244315&amp;gt;] __cv_wait+0x15/0x20 [spl]
[29937.038927]  [&amp;lt;ffffffffc064a503&amp;gt;] txg_wait_open+0xc3/0x110 [zfs]
[29937.040315]  [&amp;lt;ffffffffc05ffdfa&amp;gt;] dmu_tx_wait+0x3aa/0x3c0 [zfs]
[29937.041495]  [&amp;lt;ffffffffc05ffea2&amp;gt;] dmu_tx_assign+0x92/0x490 [zfs]
[29937.042577]  [&amp;lt;ffffffffc10d1fd9&amp;gt;] osd_trans_start+0x199/0x440 [osd_zfs]
[29937.043860]  [&amp;lt;ffffffffc121bbf5&amp;gt;] ofd_trans_start+0x75/0xf0 [ofd]
[29937.045045]  [&amp;lt;ffffffffc1222cb1&amp;gt;] ofd_commitrw_write+0xa31/0x1d40 [ofd]
[29937.046283]  [&amp;lt;ffffffffc122714c&amp;gt;] ofd_commitrw+0x48c/0x9e0 [ofd]
[29937.047428]  [&amp;lt;ffffffffc0efc81c&amp;gt;] obd_commitrw+0x9c/0x370 [ptlrpc]
[29937.048870]  [&amp;lt;ffffffffc0f00cc2&amp;gt;] tgt_brw_write+0xf02/0x1ad0 [ptlrpc]
[29937.050125]  [&amp;lt;ffffffffc0f02b0a&amp;gt;] tgt_request_handle+0xada/0x1570 [ptlrpc]
[29937.051456]  [&amp;lt;ffffffffc0ea746b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
[29937.052819]  [&amp;lt;ffffffffc0eaadd4&amp;gt;] ptlrpc_main+0xb34/0x1470 [ptlrpc]
[29937.054037]  [&amp;lt;ffffffff8f2c50d1&amp;gt;] kthread+0xd1/0xe0
[29937.055092]  [&amp;lt;ffffffff8f98cd37&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
[29937.056210]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
[29937.057192] LustreError: dumping log to /tmp/lustre-log.1578677613.30532
[29939.358695] LNet: Service thread pid 30527 completed after 42.34s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources).
[29939.361926] LNet: Skipped 5 previous similar messages
[29944.592261] LNet: Service thread pid 30535 was inactive for 46.67s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:
[29944.592622] LNet: Service thread pid 30532 completed after 47.66s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources).
[29944.597937] Pid: 30535, comm: ll_ost_io00_042 3.10.0-1062.1.1.el7_lustre.x86_64 #1 SMP Fri Dec 13 20:09:43 UTC 2019
[29944.599693] Call Trace:
[29944.600416]  [&amp;lt;ffffffffc02442d5&amp;gt;] cv_wait_common+0x125/0x150 [spl]
&#8230;
[29944.631503]  [&amp;lt;ffffffff8f98cd37&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
[29944.633800]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
[29944.635849] Pid: 30538, comm: ll_ost_io00_045 3.10.0-1062.1.1.el7_lustre.x86_64 #1 SMP Fri Dec 13 20:09:43 UTC 2019
[29944.638729] Call Trace:
[29944.639574]  [&amp;lt;ffffffffc02442d5&amp;gt;] cv_wait_common+0x125/0x150 [spl]
&#8230;
[29944.669318]  [&amp;lt;ffffffff8f98cd37&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
[29944.671370]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
[29944.673425] Pid: 1703, comm: ll_ost_io00_002 3.10.0-1062.1.1.el7_lustre.x86_64 #1 SMP Fri Dec 13 20:09:43 UTC 2019
[29944.676410] Call Trace:
[29944.677279]  [&amp;lt;ffffffffc02442d5&amp;gt;] cv_wait_common+0x125/0x150 [spl]
&#8230;
[29944.709297]  [&amp;lt;ffffffff8f98cd37&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
[29944.711640]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
[29944.713831] Pid: 30554, comm: ll_ost_io00_061 3.10.0-1062.1.1.el7_lustre.x86_64 #1 SMP Fri Dec 13 20:09:43 UTC 2019
[29944.716912] Call Trace:
[29944.717817]  [&amp;lt;ffffffffc02442d5&amp;gt;] cv_wait_common+0x125/0x150 [spl]
&#8230;
 &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;On MDS1/3 (vm4), we see inactive threads while running dbench:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[28372.841352] Lustre: DEBUG MARKER: == sanity-benchmark test dbench: dbench ============================================================== 17:07:28 (1578676048)
[29178.025765] LNet: Service thread pid 19144 was inactive for 40.10s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:
[29178.028758] Pid: 19144, comm: mdt00_001 3.10.0-1062.1.1.el7_lustre.x86_64 #1 SMP Fri Dec 13 20:09:43 UTC 2019
[29178.030468] Call Trace:
[29178.031077]  [&amp;lt;ffffffffc1778c98&amp;gt;] osp_precreate_reserve+0x2e8/0x800 [osp]
[29178.032389]  [&amp;lt;ffffffffc176d8b9&amp;gt;] osp_declare_create+0x199/0x5b0 [osp]
[29178.033653]  [&amp;lt;ffffffffc16b569f&amp;gt;] lod_sub_declare_create+0xdf/0x210 [lod]
[29178.035062]  [&amp;lt;ffffffffc16ad86e&amp;gt;] lod_qos_declare_object_on+0xbe/0x3a0 [lod]
[29178.036345]  [&amp;lt;ffffffffc16b080e&amp;gt;] lod_alloc_rr.constprop.19+0xeee/0x1490 [lod]
[29178.037720]  [&amp;lt;ffffffffc16b492d&amp;gt;] lod_qos_prep_create+0x12fd/0x1890 [lod]
[29178.038944]  [&amp;lt;ffffffffc16b50d5&amp;gt;] lod_prepare_create+0x215/0x2e0 [lod]
[29178.040287]  [&amp;lt;ffffffffc16a4f3e&amp;gt;] lod_declare_striped_create+0x1ee/0x980 [lod]
[29178.041651]  [&amp;lt;ffffffffc16a9814&amp;gt;] lod_declare_create+0x204/0x590 [lod]
[29178.042920]  [&amp;lt;ffffffffc171faf2&amp;gt;] mdd_declare_create_object_internal+0xe2/0x2f0 [mdd]
[29178.044380]  [&amp;lt;ffffffffc170f75c&amp;gt;] mdd_declare_create+0x4c/0xdf0 [mdd]
[29178.045636]  [&amp;lt;ffffffffc1713247&amp;gt;] mdd_create+0x867/0x14a0 [mdd]
[29178.046819]  [&amp;lt;ffffffffc15af9ff&amp;gt;] mdt_reint_open+0x224f/0x3240 [mdt]
[29178.048314]  [&amp;lt;ffffffffc15a2a53&amp;gt;] mdt_reint_rec+0x83/0x210 [mdt]
[29178.049508]  [&amp;lt;ffffffffc157f213&amp;gt;] mdt_reint_internal+0x6e3/0xaf0 [mdt]
[29178.050820]  [&amp;lt;ffffffffc158c2e2&amp;gt;] mdt_intent_open+0x82/0x3a0 [mdt]
[29178.051980]  [&amp;lt;ffffffffc158a405&amp;gt;] mdt_intent_policy+0x435/0xd80 [mdt]
[29178.053325]  [&amp;lt;ffffffffc1148e06&amp;gt;] ldlm_lock_enqueue+0x356/0xa20 [ptlrpc]
[29178.054935]  [&amp;lt;ffffffffc1171506&amp;gt;] ldlm_handle_enqueue0+0xa56/0x15f0 [ptlrpc]
[29178.056274]  [&amp;lt;ffffffffc11f9cf2&amp;gt;] tgt_enqueue+0x62/0x210 [ptlrpc]
[29178.057669]  [&amp;lt;ffffffffc1200b0a&amp;gt;] tgt_request_handle+0xada/0x1570 [ptlrpc]
[29178.058956]  [&amp;lt;ffffffffc11a546b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
[29178.060423]  [&amp;lt;ffffffffc11a8dd4&amp;gt;] ptlrpc_main+0xb34/0x1470 [ptlrpc]
[29178.061674]  [&amp;lt;ffffffffb5ec50d1&amp;gt;] kthread+0xd1/0xe0
[29178.062605]  [&amp;lt;ffffffffb658cd37&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
[29178.063870]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
[29178.064808] LustreError: dumping log to /tmp/lustre-log.1578676854.19144
[29179.962488] LNet: Service thread pid 26259 was inactive for 42.04s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:
&#8230; 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                            <outwardlinks description="duplicates">
                                        <issuelink>
            <issuekey id="49351">LU-10250</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="55515">LU-12234</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzzkgf:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>