<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:50:05 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-5278] ZFS - many OST watchdogs with IOR</title>
                <link>https://jira.whamcloud.com/browse/LU-5278</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Running IOR with 100 clients. Performance is terrible. OSTs are wedging and dropping watchdogs. &lt;br/&gt;
Example:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;2014-07-01 08:22:47 LNet: Service thread pid 8308 was inactive &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 200.00s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; debugging purposes:
2014-07-01 08:22:47 Pid: 8308, comm: ll_ost_io00_014
2014-07-01 08:22:47
2014-07-01 08:22:47 Call Trace:
2014-07-01 08:22:47  [&amp;lt;ffffffffa05b34ba&amp;gt;] ? dmu_zfetch+0x51a/0xd70 [zfs]
2014-07-01 08:22:47  [&amp;lt;ffffffff810a6d01&amp;gt;] ? ktime_get_ts+0xb1/0xf0
2014-07-01 08:22:47  [&amp;lt;ffffffff815287f3&amp;gt;] io_schedule+0x73/0xc0
2014-07-01 08:22:47  [&amp;lt;ffffffffa04f841c&amp;gt;] cv_wait_common+0x8c/0x100 [spl]
2014-07-01 08:22:47  [&amp;lt;ffffffff8109af00&amp;gt;] ? autoremove_wake_function+0x0/0x40
2014-07-01 08:22:47  [&amp;lt;ffffffffa04f84a8&amp;gt;] __cv_wait_io+0x18/0x20 [spl]
2014-07-01 08:22:47  [&amp;lt;ffffffffa062f0ab&amp;gt;] zio_wait+0xfb/0x1b0 [zfs]
2014-07-01 08:22:47  [&amp;lt;ffffffffa05a503d&amp;gt;] dmu_buf_hold_array_by_dnode+0x19d/0x4c0 [zfs]
2014-07-01 08:22:47  [&amp;lt;ffffffffa05a5e68&amp;gt;] dmu_buf_hold_array_by_bonus+0x68/0x90 [zfs]
2014-07-01 08:22:47  [&amp;lt;ffffffffa0e3f1a3&amp;gt;] osd_bufs_get+0x493/0xb00 [osd_zfs]
2014-07-01 08:22:47  [&amp;lt;ffffffffa03be488&amp;gt;] ? libcfs_log_return+0x28/0x40 [libcfs]
2014-07-01 08:22:47  [&amp;lt;ffffffffa0f2e00b&amp;gt;] ofd_preprw_read+0x15b/0x890 [ofd]
2014-07-01 08:22:47  [&amp;lt;ffffffffa0f30709&amp;gt;] ofd_preprw+0x749/0x1650 [ofd]
2014-07-01 08:22:47  [&amp;lt;ffffffffa09d71b1&amp;gt;] obd_preprw.clone.3+0x121/0x390 [ptlrpc]
2014-07-01 08:22:47  [&amp;lt;ffffffffa09deb03&amp;gt;] tgt_brw_read+0x2d3/0x1150 [ptlrpc]
2014-07-01 08:22:47  [&amp;lt;ffffffffa03be488&amp;gt;] ? libcfs_log_return+0x28/0x40 [libcfs]
2014-07-01 08:22:47  [&amp;lt;ffffffffa097ab36&amp;gt;] ? lustre_pack_reply_v2+0x216/0x280 [ptlrpc]
2014-07-01 08:22:47  [&amp;lt;ffffffffa097ac4e&amp;gt;] ? lustre_pack_reply_flags+0xae/0x1f0 [ptlrpc]
2014-07-01 08:22:47  [&amp;lt;ffffffffa09dca7c&amp;gt;] tgt_request_handle+0x23c/0xac0 [ptlrpc]
2014-07-01 08:22:47  [&amp;lt;ffffffffa098c29a&amp;gt;] ptlrpc_main+0xd1a/0x1980 [ptlrpc]
2014-07-01 08:22:47  [&amp;lt;ffffffffa098b580&amp;gt;] ? ptlrpc_main+0x0/0x1980 [ptlrpc]
2014-07-01 08:22:47  [&amp;lt;ffffffff8109ab56&amp;gt;] kthread+0x96/0xa0
2014-07-01 08:22:47  [&amp;lt;ffffffff8100c20a&amp;gt;] child_rip+0xa/0x20
2014-07-01 08:22:47  [&amp;lt;ffffffff8109aac0&amp;gt;] ? kthread+0x0/0xa0
2014-07-01 08:22:47  [&amp;lt;ffffffff8100c200&amp;gt;] ? child_rip+0x0/0x20
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Lustre dump attached. &lt;/p&gt;

&lt;p&gt;Second example:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;2014-07-01 09:38:41 Pid: 9299, comm: ll_ost_io00_070
2014-07-01 09:38:41
2014-07-01 09:38:41 Call Trace:
2014-07-01 09:38:41  [&amp;lt;ffffffffa05b02f7&amp;gt;] ? dmu_zfetch+0x357/0xd70 [zfs]
2014-07-01 09:38:41  [&amp;lt;ffffffffa05957f2&amp;gt;] ? arc_read+0x572/0x8d0 [zfs]
2014-07-01 09:38:41  [&amp;lt;ffffffff810a6d01&amp;gt;] ? ktime_get_ts+0xb1/0xf0
2014-07-01 09:38:41  [&amp;lt;ffffffff815287f3&amp;gt;] io_schedule+0x73/0xc0
2014-07-01 09:38:41  [&amp;lt;ffffffffa04f841c&amp;gt;] cv_wait_common+0x8c/0x100 [spl]
2014-07-01 09:38:41  [&amp;lt;ffffffff8109af00&amp;gt;] ? autoremove_wake_function+0x0/0x40
2014-07-01 09:38:41  [&amp;lt;ffffffffa04f84a8&amp;gt;] __cv_wait_io+0x18/0x20 [spl]
2014-07-01 09:38:41  [&amp;lt;ffffffffa062c0ab&amp;gt;] zio_wait+0xfb/0x1b0 [zfs]
2014-07-01 09:38:41  [&amp;lt;ffffffffa05a203d&amp;gt;] dmu_buf_hold_array_by_dnode+0x19d/0x4c0 [zfs]
2014-07-01 09:38:41  [&amp;lt;ffffffffa05a2e68&amp;gt;] dmu_buf_hold_array_by_bonus+0x68/0x90 [zfs]
2014-07-01 09:38:41  [&amp;lt;ffffffffa0e441a3&amp;gt;] osd_bufs_get+0x493/0xb00 [osd_zfs]
2014-07-01 09:38:41  [&amp;lt;ffffffffa03be488&amp;gt;] ? libcfs_log_return+0x28/0x40 [libcfs]
2014-07-01 09:38:41  [&amp;lt;ffffffffa0f3700b&amp;gt;] ofd_preprw_read+0x15b/0x890 [ofd]
2014-07-01 09:38:41  [&amp;lt;ffffffffa0f39709&amp;gt;] ofd_preprw+0x749/0x1650 [ofd]
2014-07-01 09:38:41  [&amp;lt;ffffffffa09d41b1&amp;gt;] obd_preprw.clone.3+0x121/0x390 [ptlrpc]
2014-07-01 09:38:41  [&amp;lt;ffffffffa09dbb03&amp;gt;] tgt_brw_read+0x2d3/0x1150 [ptlrpc]
2014-07-01 09:38:41  [&amp;lt;ffffffffa03be488&amp;gt;] ? libcfs_log_return+0x28/0x40 [libcfs]
2014-07-01 09:38:41  [&amp;lt;ffffffffa0977b36&amp;gt;] ? lustre_pack_reply_v2+0x216/0x280 [ptlrpc]
2014-07-01 09:38:41  [&amp;lt;ffffffffa0977c4e&amp;gt;] ? lustre_pack_reply_flags+0xae/0x1f0 [ptlrpc]
2014-07-01 09:38:41  [&amp;lt;ffffffffa09d9a7c&amp;gt;] tgt_request_handle+0x23c/0xac0 [ptlrpc]
2014-07-01 09:38:41  [&amp;lt;ffffffffa098929a&amp;gt;] ptlrpc_main+0xd1a/0x1980 [ptlrpc]
2014-07-01 09:38:41  [&amp;lt;ffffffffa0988580&amp;gt;] ? ptlrpc_main+0x0/0x1980 [ptlrpc]
2014-07-01 09:38:41  [&amp;lt;ffffffff8109ab56&amp;gt;] kthread+0x96/0xa0
2014-07-01 09:38:41  [&amp;lt;ffffffff8100c20a&amp;gt;] child_rip+0xa/0x20
2014-07-01 09:38:41  [&amp;lt;ffffffff8109aac0&amp;gt;] ? kthread+0x0/0xa0
2014-07-01 09:38:41  [&amp;lt;ffffffff8100c200&amp;gt;] ? child_rip+0x0/0x20
2014-07-01 09:38:41
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment>Hyperion/LLNL</environment>
        <key id="25389">LU-5278</key>
            <summary>ZFS - many OST watchdogs with IOR</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="bzzz">Alex Zhuravlev</assignee>
                                    <reporter username="cliffw">Cliff White</reporter>
                        <labels>
                            <label>MB</label>
                            <label>llnl</label>
                            <label>prz</label>
                            <label>zfs</label>
                    </labels>
                <created>Tue, 1 Jul 2014 15:36:25 +0000</created>
                <updated>Wed, 23 Dec 2015 21:17:23 +0000</updated>
                            <resolved>Tue, 9 Jun 2015 12:44:26 +0000</resolved>
                                    <version>Lustre 2.6.0</version>
                                    <fixVersion>Lustre 2.8.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>15</watches>
                                                                            <comments>
                            <comment id="87898" author="cliffw" created="Tue, 1 Jul 2014 15:42:49 +0000"  >&lt;p&gt;Lustre log dumped by first watchdog&lt;/p&gt;</comment>
                            <comment id="87992" author="adilger" created="Wed, 2 Jul 2014 17:57:43 +0000"  >&lt;p&gt;Cliff, is this running ZFS 0.6.2 or 0.6.3?&lt;/p&gt;

&lt;p&gt;Also, did the IOR eventually complete or were the same threads stuck forever?&lt;/p&gt;</comment>
                            <comment id="88002" author="cliffw" created="Wed, 2 Jul 2014 18:38:35 +0000"  >&lt;p&gt;ZFS version was 0.6.3 - the IOR did eventually complete, results were terrible.&lt;/p&gt;</comment>
                            <comment id="88208" author="adilger" created="Fri, 4 Jul 2014 18:12:06 +0000"  >&lt;p&gt;Isaac, could you please take a look at this to see if there is something obvious?  We are trying to decide if this needs to be a blocker for 2.6.0.&lt;/p&gt;</comment>
                            <comment id="89962" author="isaac" created="Thu, 24 Jul 2014 16:36:38 +0000"  >&lt;p&gt;Sorry missed this bug during the travels.&lt;/p&gt;

&lt;p&gt;Cliff,&lt;/p&gt;

&lt;p&gt;1. Please disable ZFS prefetching on both OSS and MDS:&lt;br/&gt;
options zfs zfs_prefetch_disable=1&lt;br/&gt;
2. If it happens again, please also gather all files under /proc/spl/ on both the OSS and the MDS.&lt;/p&gt;</comment>
                            <comment id="98995" author="adilger" created="Wed, 12 Nov 2014 18:46:02 +0000"  >&lt;p&gt;Cliff, Isaac, could you please update on the status of this ticket.  Cliff, did you implement Isaac&apos;s suggestion? Are there still watchdogs being hit?&lt;/p&gt;</comment>
                            <comment id="99005" author="cliffw" created="Wed, 12 Nov 2014 20:24:51 +0000"  >&lt;p&gt;I can run some ZFS-specific tests. Due to LLNL&apos;s cfengine, I am not sure I can set that option. Should be able to do this in the next few days&lt;/p&gt;</comment>
                            <comment id="99198" author="cliffw" created="Fri, 14 Nov 2014 18:31:56 +0000"  >&lt;p&gt;We are still dumping watchdogs, I have not yet been able to set the option. Here&apos;s an example:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;2014-11-14 10:11:14 LNet: Service thread pid 8361 was inactive &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 222.00s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; debugging purposes:
2014-11-14 10:11:14 Pid: 8361, comm: ll_ost_io03_081
2014-11-14 10:11:14
2014-11-14 10:11:14 Call Trace:
2014-11-14 10:11:14  [&amp;lt;ffffffffa059a4ba&amp;gt;] ? dmu_zfetch+0x51a/0xd70 [zfs]
2014-11-14 10:11:14  [&amp;lt;ffffffff81529e83&amp;gt;] io_schedule+0x73/0xc0
2014-11-14 10:11:14  [&amp;lt;ffffffffa04b141c&amp;gt;] cv_wait_common+0x8c/0x100 [spl]
2014-11-14 10:11:14  [&amp;lt;ffffffff8109afa0&amp;gt;] ? autoremove_wake_function+0x0/0x40
2014-11-14 10:11:14  [&amp;lt;ffffffffa04b14a8&amp;gt;] __cv_wait_io+0x18/0x20 [spl]
2014-11-14 10:11:14  [&amp;lt;ffffffffa061611b&amp;gt;] zio_wait+0xfb/0x1b0 [zfs]
2014-11-14 10:11:14  [&amp;lt;ffffffffa058c03d&amp;gt;] dmu_buf_hold_array_by_dnode+0x19d/0x4c0 [zfs]
2014-11-14 10:11:14  [&amp;lt;ffffffffa058ce68&amp;gt;] dmu_buf_hold_array_by_bonus+0x68/0x90 [zfs]
2014-11-14 10:11:14  [&amp;lt;ffffffffa110f4b3&amp;gt;] osd_bufs_get+0x493/0xb00 [osd_zfs]
2014-11-14 10:11:14  [&amp;lt;ffffffffa03c71a1&amp;gt;] ? libcfs_debug_msg+0x41/0x50 [libcfs]
2014-11-14 10:11:14  [&amp;lt;ffffffffa03c17d8&amp;gt;] ? libcfs_log_return+0x28/0x40 [libcfs]
2014-11-14 10:11:14  [&amp;lt;ffffffffa1050f7b&amp;gt;] ofd_preprw_read+0x15b/0x930 [ofd]
2014-11-14 10:11:14  [&amp;lt;ffffffffa1051f99&amp;gt;] ofd_preprw+0x849/0x1680 [ofd]
2014-11-14 10:11:14  [&amp;lt;ffffffffa09a24b1&amp;gt;] obd_preprw.clone.3+0x121/0x390 [ptlrpc]
2014-11-14 10:11:14  [&amp;lt;ffffffffa09aa24e&amp;gt;] tgt_brw_read+0x67e/0x1160 [ptlrpc]
2014-11-14 10:11:14  [&amp;lt;ffffffffa03c17d8&amp;gt;] ? libcfs_log_return+0x28/0x40 [libcfs]
2014-11-14 10:11:14  [&amp;lt;ffffffffa0946446&amp;gt;] ? lustre_pack_reply_v2+0x226/0x290 [ptlrpc]
2014-11-14 10:11:14  [&amp;lt;ffffffffa094655e&amp;gt;] ? lustre_pack_reply_flags+0xae/0x1f0 [ptlrpc]
2014-11-14 10:11:14  [&amp;lt;ffffffffa09a82ae&amp;gt;] tgt_request_handle+0x71e/0xb10 [ptlrpc]
2014-11-14 10:11:14  [&amp;lt;ffffffffa0957a94&amp;gt;] ptlrpc_main+0xe64/0x1990 [ptlrpc]
2014-11-14 10:11:14  [&amp;lt;ffffffffa0956c30&amp;gt;] ? ptlrpc_main+0x0/0x1990 [ptlrpc]
2014-11-14 10:11:14  [&amp;lt;ffffffff8109abf6&amp;gt;] kthread+0x96/0xa0
2014-11-14 10:11:14  [&amp;lt;ffffffff8100c20a&amp;gt;] child_rip+0xa/0x20
2014-11-14 10:11:14  [&amp;lt;ffffffff8109ab60&amp;gt;] ? kthread+0x0/0xa0
2014-11-14 10:11:14  [&amp;lt;ffffffff8100c200&amp;gt;] ? child_rip+0x0/0x20
2014-11-14 10:11:14
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="99199" author="cliffw" created="Fri, 14 Nov 2014 18:36:51 +0000"  >&lt;p&gt;Log dump from OSS&lt;/p&gt;</comment>
                            <comment id="99207" author="cliffw" created="Fri, 14 Nov 2014 18:48:06 +0000"  >&lt;p&gt;I am making a second run now with the options set&lt;/p&gt;</comment>
                            <comment id="99388" author="cliffw" created="Mon, 17 Nov 2014 19:34:54 +0000"  >&lt;p&gt;Disabling prefetch seems to fix the watchdogs, performance is about the same or perhaps a bit worse, spreadsheet attached&lt;/p&gt;</comment>
                            <comment id="103023" author="isaac" created="Fri, 9 Jan 2015 17:17:00 +0000"  >&lt;p&gt;For some workloads, zprefetch actually helps a lot. But if performance went just a bit worse by disabling it, then zprefetch would mostly just incur unnecessary IOs and likely execute some flaky parts of the code. So I&apos;d suggest to keep it disabled as long as performance isn&apos;t adversely affected.&lt;/p&gt;</comment>
                            <comment id="103502" author="cliffw" created="Wed, 14 Jan 2015 18:35:01 +0000"  >&lt;p&gt;Okay, for now will leave prefetch always off. Left it on by mistake in last IEEL runs.&lt;/p&gt;</comment>
                            <comment id="105305" author="rpwagner" created="Sun, 1 Feb 2015 07:13:43 +0000"  >&lt;p&gt;I believe we are seeing the cause of this while testing streaming read IO using Lustre backed by ZFS, namely that &lt;tt&gt;osd_bufs_get&lt;/tt&gt; (and therefore &lt;tt&gt;osd_bufs_get_read&lt;/tt&gt;) is called inside of a for loop via &lt;tt&gt;ofd_preprw_read&lt;/tt&gt;, and &lt;tt&gt;osd_bufs_get_read&lt;/tt&gt; is hitting storage on each call due to its call to &lt;tt&gt;dmu_buf_hold_array_by_bonus&lt;/tt&gt;. My guess is that IOs are not being queued up to the storage as intended by &lt;tt&gt;dt_object_design&lt;/tt&gt; when using ZFS.&lt;/p&gt;</comment>
                            <comment id="105333" author="bzzz" created="Mon, 2 Feb 2015 06:23:25 +0000"  >&lt;p&gt;specifically with streaming reads this shouldn&apos;t be a problem - it&apos;d be a single call to dmu_buf_hold_array_by_bonus() as the whole range is described by a single niobuf.&lt;/p&gt;</comment>
                            <comment id="105338" author="rpwagner" created="Mon, 2 Feb 2015 10:05:47 +0000"  >&lt;p&gt;Thanks, Alex, that&apos;s reassuring. We&apos;re trying to figure out why we&apos;re getting 2.4 GB/s read bandwidth out of OSTs that will deliver over 9 GB/s when mounted as ZFS and using dd. During our profiling, I saw that &lt;tt&gt;dmu_buf_hold_array_by_bonus()&lt;/tt&gt; was going to the drive, and didn&apos;t know that &lt;tt&gt;nicount&lt;/tt&gt; in &lt;tt&gt;ofd_preprw_read()&lt;/tt&gt; would be set to 1 for streaming reads.&lt;/p&gt;</comment>
                            <comment id="105347" author="bzzz" created="Mon, 2 Feb 2015 14:08:26 +0000"  >&lt;p&gt;I think it makes sense to verify it&apos;s really 1 and I&apos;m not too optimistic &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;</comment>
                            <comment id="105446" author="rpwagner" created="Mon, 2 Feb 2015 23:27:35 +0000"  >&lt;p&gt;Alex, without having attached to a process to check, I don&apos;t think it is 1. We&apos;re testing using obdfilter-survey, which uses the echo client. Inside of obdecho/echo_client.c:echo_client_prep_commit() there&apos;s a loop with the following:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;                ioo.ioo_bufcnt = npages;

                lpages = npages;
		ret = obd_preprw(env, rw, exp, oa, 1, &amp;amp;ioo, rnb, &amp;amp;lpages,
                                 lnb, oti, NULL);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;where npages is the number of 4K pages in the request.&lt;/p&gt;

&lt;p&gt;Later, ofd_io.c:ofd_preprw() call ofd_preprw_read() where ioo is now obj&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;			rc = ofd_preprw_read(env, exp, ofd, fid,
					     &amp;amp;info-&amp;gt;fti_attr, oa,
					     obj-&amp;gt;ioo_bufcnt, rnb, nr_local,
					     lnb, jobid);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;and that&apos;s where ioo_bufcnt becomes niocount, and the loop is executed which eventually gets to osd-zfs:osd_io.c:osd_bufs_get_read() and dmu_buf_hold_array_by_bonus(). This points to dmu_buf_hold_array_by_bonus() being called for 4K buffers. I don&apos;t know if this is by design, but depending on how ZFS handles these requests, it could explain why the streaming bandwidth is so much lower than what we see with native ZFS file systems.&lt;/p&gt;

&lt;p&gt;The performance is similarly low when the Lustre file system is mounted, so I&apos;m guessing the regular code path has similar logic to what&apos;s in the echo client.&lt;/p&gt;</comment>
                            <comment id="105469" author="bzzz" created="Tue, 3 Feb 2015 02:06:34 +0000"  >&lt;p&gt;I&apos;d agree that echo_client_prep_commit() isn&apos;t a good approach, it should be filing rnb&apos;es and then make a single call to obd_preprw(). please try the regular I/O with dd, I&apos;m quite sure you&apos;ll be seeing mostly single niobuf requests.&lt;/p&gt;
</comment>
                            <comment id="105470" author="adilger" created="Tue, 3 Feb 2015 03:13:54 +0000"  >&lt;p&gt;Obdfilter-survey should behave the same as our network IO submission, so that it simulates real world performance reasonably accurately. It makes sense to fix echo_client_prep_commit() to build large niobufs for submission. &lt;/p&gt;</comment>
                            <comment id="105474" author="rpwagner" created="Tue, 3 Feb 2015 06:17:05 +0000"  >&lt;p&gt;Indeed, running single dds from 4 clients was sufficient to see 1.3 GB/s (occasional peaks from zpool iostat at 1.5 GB/s) from a single OST. I&apos;ll need more clients to see how that scales over more OSTs, but that&apos;s far better read performance than we were getting from a single OST via obdfilter-survey, and is much closer to the native ZFS results. If there&apos;s a method for me to verify the len passed to osd_bufs_get_read, I&apos;d be glad to try it.&lt;/p&gt;

&lt;p&gt;And, Andreas, I completely agree with your comment about the echo client. This has led to significant confusion on our side about the capabilities of Lustre with ZFS. A simple tool which can simulate dd-like workloads on the server is what we expected; what we got behaved more like fio.&lt;/p&gt;</comment>
                            <comment id="105502" author="gerrit" created="Tue, 3 Feb 2015 15:03:06 +0000"  >&lt;p&gt;Alex Zhuravlev (alexey.zhuravlev@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/13612&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/13612&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5278&quot; title=&quot;ZFS - many OST watchdogs with IOR&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5278&quot;&gt;&lt;del&gt;LU-5278&lt;/del&gt;&lt;/a&gt; echo: request pages in batches&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 87737ebe6cc4bb45148e0c86e3b3e05007ab80fb&lt;/p&gt;</comment>
                            <comment id="105509" author="rpwagner" created="Tue, 3 Feb 2015 16:13:16 +0000"  >&lt;p&gt;Alex, thanks for the patch. I will test this today and see how close it compares to the native ZFS performance. Now it looks we&apos;re bottlenecking on socknal_sd* threads when reading from more than one OST across the network. Different topic, I expect.&lt;/p&gt;</comment>
                            <comment id="105510" author="bzzz" created="Tue, 3 Feb 2015 16:18:09 +0000"  >&lt;p&gt;thanks in advance, Rick. I tested the patch locally and it seem to be OK, but the testing just started on Maloo, so I can&apos;t yet promise it&apos;s perfect.&lt;/p&gt;</comment>
                            <comment id="105646" author="rpwagner" created="Wed, 4 Feb 2015 05:10:57 +0000"  >&lt;p&gt;I&apos;m optimistic about your patch doing what you expect, Alex. I&apos;m seeing up to 1.5 GB/s reads from a single OST, which is what I got from a client using dd last night, and very close the 1.6 GB/s I measure on native ZFS. The real test will be how much it helps in performance tuning.&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;$  targets=edragon-OST0000 rszlo=4096 rszhi=4096 size=$((64*1024)) nobjlo=2 nobjhi=4 thrlo=2 thrhi=4  obdfilter-survey
Tue Feb  3 21:00:43 PST 2015 Obdfilter-survey &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;code-keyword&quot;&gt;case&lt;/span&gt;=disk from seahorse-oss-19-5.local
ost  1 sz 67108864K rsz 4096K obj    2 thr    2 write  842.72 [ 547.97, 947.91] read 1520.84 [1087.92,1835.91] 
ost  1 sz 67108864K rsz 4096K obj    2 thr    4 write  810.22 [  31.99,1303.90] read  846.02 [ 203.99,1691.92] 
ost  1 sz 67108864K rsz 4096K obj    4 thr    4 write  833.51 [ 307.98,1215.91] read 1518.76 [1187.94,2011.89] 
done!
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;$ zpool iostat 5
...
----------  -----  -----  -----  -----  -----  -----
mdt         27.2M  3.62T      0      0      0      0
ost0         141G  36.1T  1.51K     30  1.51G  30.0M
ost1        42.7G  36.2T      0      0      0      0
ost2        41.8G  36.2T      0      0      0      0
----------  -----  -----  -----  -----  -----  -----
mdt         27.2M  3.62T      0      0      0      0
ost0         141G  36.1T  1.48K     34  1.48G  34.8M
ost1        42.7G  36.2T      0      0      0      0
ost2        41.8G  36.2T      0      0      0      0
----------  -----  -----  -----  -----  -----  -----
mdt         27.2M  3.62T      0      0      0      0
ost0         141G  36.1T  1.49K     37  1.49G  37.0M
ost1        42.7G  36.2T      0      0      0      0
ost2        41.8G  36.2T      0      0      0      0&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="105648" author="bzzz" created="Wed, 4 Feb 2015 05:28:05 +0000"  >&lt;p&gt;thanks for the feedback, Rick.&lt;/p&gt;</comment>
                            <comment id="105658" author="adilger" created="Wed, 4 Feb 2015 12:19:18 +0000"  >&lt;p&gt;A significant amount of overhead has been observed in socknal threads due to TCP overhead from assembling packets on read.  In the past the CPU usage of this data copying was on the order of 1GHz CPU/1GB/s of network bandwidth, which is one reason why IB with RDMA is more attractive.  Another possibility (which we haven&apos;t tested much ourselves, but have heard some reports about) is to try RoCE via o2iblnd, if your ethernet hardware supports it.  That would likely reduce your CPU usage.&lt;/p&gt;</comment>
                            <comment id="106134" author="rpwagner" created="Sat, 7 Feb 2015 00:36:07 +0000"  >&lt;p&gt;Andreas, I&apos;ll hold off on blaming the socknal threads for the performance, since I think I&apos;ve found another place where 4K reads are being made. First, I can confirm that Alex&apos;s patch to the echo client was a huge improvement to echo client. The streaming bandwidth performance comparison between dd on remote clients to a server running obdfilter-survey is much more consistent. You can see this in these two &lt;a href=&quot;http://www.brendangregg.com/FlameGraphs/cpuflamegraphs.html&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;flame graphs&lt;/a&gt;: &lt;a href=&quot;http://users.sdsc.edu/~rpwagner/perf-kernel-6ost-obdfilter-read-nodbuf-raidz2.svg&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;before, 25% of time in osd_bufs_get&lt;/a&gt;, and &lt;a href=&quot;http://users.sdsc.edu/~rpwagner/perf-kernel-6ost-obdfilter-echo-fix-raidz2.svg&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;after, 1.5% of time in osd_bufs_get&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;Since the performance is now closer between client and server (between 4 to 5 GB/s for reads), but still much less than native ZFS (over 9 GB/s, scaling linearly from 1.5 GB/s on a single zpool), I looked at the portion of code still taking up a lot of time, osd_read, and its call to dmu_read. If I trace back the calls correctly, dmu_read is being called with a size of 4K.&lt;/p&gt;

&lt;p&gt;Like the earlier discussion, it starts in ofd_preprw_read with its call to dt_bufs_get, except this time, after the call to dmu_buf_hold_array_by_bonus, osd_bufs_get_read loops over lnb mapping (copying?) data.&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;			&lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; (tocpy &amp;gt; 0) {
				thispage = PAGE_CACHE_SIZE;
				thispage -= bufoff &amp;amp; (PAGE_CACHE_SIZE - 1);
				thispage = min(tocpy, thispage);

				lnb-&amp;gt;lnb_rc = 0;
				lnb-&amp;gt;lnb_file_offset = off;
				lnb-&amp;gt;lnb_page_offset = bufoff &amp;amp; ~CFS_PAGE_MASK;
				lnb-&amp;gt;lnb_len = thispage;
				lnb-&amp;gt;lnb_page = kmem_to_page(dbp[i]-&amp;gt;db_data +
							     bufoff);
				/* mark just a single slot: we need &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt;
				 * reference to dbuf to be release once */
				lnb-&amp;gt;lnb_data = dbf;
				dbf = NULL;

				tocpy -= thispage;
				len -= thispage;
				bufoff += thispage;
				off += thispage;

				npages++;
				lnb++;
			}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This is where lnb-&amp;gt;lnb_len gets set to 4K, and npages is incremented before being returned back to ofd_preprw_read. The page count is also incremented via the nr_local variable. Both lnb and nr_local are then passed to dt_read_prep. From there, osd_read_prep has a loop over the number of pages calling osd_read, which makes a single call to dmu_read.&lt;/p&gt;

&lt;p&gt;Would someone confirm that I&apos;ve got this correct? If so, it may be a more involved patch, unless the logic can stay in osd-zfs/osd_io.c. I do think this is consistent with reduced performance with additional OSTs, and the time spent in dmu_read.&lt;/p&gt;

&lt;div class=&apos;table-wrap&apos;&gt;
&lt;table class=&apos;confluenceTable&apos;&gt;&lt;tbody&gt;
&lt;tr&gt;
&lt;th class=&apos;confluenceTh&apos;&gt;&lt;ol&gt;
	&lt;li&gt;OSTs&lt;/li&gt;
&lt;/ol&gt;
&lt;/th&gt;
&lt;th class=&apos;confluenceTh&apos;&gt; read GB/S &lt;/th&gt;
&lt;th class=&apos;confluenceTh&apos;&gt; Perf &lt;/th&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td class=&apos;confluenceTd&apos;&gt; 1 &lt;/td&gt;
&lt;td class=&apos;confluenceTd&apos;&gt; 1357 &lt;/td&gt;
&lt;td class=&apos;confluenceTd&apos;&gt; &lt;a href=&quot;http://users.sdsc.edu/~rpwagner/perf-kernel-1ost-obfilter-echo-fix-raidz2.svg&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;flame graph&lt;/a&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td class=&apos;confluenceTd&apos;&gt; 2 &lt;/td&gt;
&lt;td class=&apos;confluenceTd&apos;&gt; 2261 &lt;/td&gt;
&lt;td class=&apos;confluenceTd&apos;&gt; &lt;a href=&quot;http://users.sdsc.edu/~rpwagner/perf-kernel-2ost-obfilter-echo-fix-raidz2.svg&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;flame graph&lt;/a&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td class=&apos;confluenceTd&apos;&gt; 3 &lt;/td&gt;
&lt;td class=&apos;confluenceTd&apos;&gt; 3236 &lt;/td&gt;
&lt;td class=&apos;confluenceTd&apos;&gt;&lt;a href=&quot;http://users.sdsc.edu/~rpwagner/perf-kernel-3ost-obfilter-echo-fix-raidz2.svg&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;flame graph&lt;/a&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td class=&apos;confluenceTd&apos;&gt; 4 &lt;/td&gt;
&lt;td class=&apos;confluenceTd&apos;&gt; 3729 &lt;/td&gt;
&lt;td class=&apos;confluenceTd&apos;&gt;&lt;a href=&quot;http://users.sdsc.edu/~rpwagner/perf-kernel-4ost-obfilter-echo-fix-raidz2.svg&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;flame graph&lt;/a&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td class=&apos;confluenceTd&apos;&gt; 5 &lt;/td&gt;
&lt;td class=&apos;confluenceTd&apos;&gt; 4369 &lt;/td&gt;
&lt;td class=&apos;confluenceTd&apos;&gt;&lt;a href=&quot;http://users.sdsc.edu/~rpwagner/perf-kernel-5ost-obfilter-echo-fix-raidz2.svg&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;flame graph&lt;/a&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td class=&apos;confluenceTd&apos;&gt; 6 &lt;/td&gt;
&lt;td class=&apos;confluenceTd&apos;&gt; 5130 &lt;/td&gt;
&lt;td class=&apos;confluenceTd&apos;&gt;&lt;a href=&quot;http://users.sdsc.edu/~rpwagner/perf-kernel-6ost-obfilter-echo-fix-raidz2.svg&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;flame graph&lt;/a&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;&lt;/table&gt;
&lt;/div&gt;
</comment>
                            <comment id="106145" author="bzzz" created="Sat, 7 Feb 2015 03:55:10 +0000"  >&lt;p&gt;Rick, you should try this patch - &lt;a href=&quot;http://review.whamcloud.com/#/c/12991/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/12991/&lt;/a&gt;&lt;br/&gt;
actual I/O is done in osd_bufs_get_read() and it&apos;s not in 4K except it was fragmented by the client. if client requests say 1M then it will be a single call to dmu_buf_hold_array_by_bonus() and then we just map returned ARC buffers to the pages. &lt;/p&gt;</comment>
                            <comment id="106146" author="rpwagner" created="Sat, 7 Feb 2015 05:19:30 +0000"  >&lt;p&gt;Hi Alex,&lt;/p&gt;

&lt;p&gt;I applied the patch, but things went backwards. My impression is that relying on the ARC only works when streaming data below zfs_arc_max. For this test, I set zfs_arc_max to 64 GB on a machine with 128 GB of RAM. The 2 GB/s writes are also much better than native ZFS, which makes me suspect caching.&lt;/p&gt;

&lt;p&gt;The numbers reported, and the observation of zpool iostat look like data is being pulled from a single drive as needed. Is it better to call dmu_read with a similar size argument to what was used for dmu_buf_hold_array_by_bonus?&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;$ targets=&lt;span class=&quot;code-quote&quot;&gt;&quot;ddragon-OST0000 ddragon-OST0001&quot;&lt;/span&gt;  rszlo=4096 rszhi=4096 size=$((64*1024)) nobjlo=1 nobjhi=1 thrlo=1 thrhi=1 obdfilter-survey  
Fri Feb  6 20:58:47 PST 2015 Obdfilter-survey &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;code-keyword&quot;&gt;case&lt;/span&gt;=disk from seahorse-oss-19-2.local
ost  2 sz 134217728K rsz 4096K obj    2 thr    2 write 2015.86 [ 387.91,5523.86] read  436.14 [  32.00,1051.92] 
done!
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;$ zpool iostat 5
               capacity     operations    bandwidth
pool        alloc   free   read  write   read  write
----------  -----  -----  -----  -----  -----  -----
...wait a &lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt;...
mdt         36.6M  59.5G      0      0    202      0
ost0         965G  35.3T    242      0   242M      0
ost1         918G  35.4T    203      0   203M      0
ost2         829G  35.4T      0      0    201      0
ost3         849G  35.4T      0      0    201      0
ost4         831G  35.4T      0      0    201      0
ost5         831G  35.4T      0      0    201      0
----------  -----  -----  -----  -----  -----  -----
mdt         36.6M  59.5G      0      0    201      0
ost0         965G  35.3T    235      0   235M      0
ost1         918G  35.4T    216      0   216M      0
ost2         829G  35.4T      0      0    203      0
ost3         849G  35.4T      0      0    203      0
ost4         831G  35.4T      0      0    203      0
ost5         831G  35.4T      0      0    203      0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="106149" author="bzzz" created="Sat, 7 Feb 2015 05:33:58 +0000"  >&lt;p&gt;Rick, ARC is &lt;b&gt;always&lt;/b&gt; used. dmu_read() shouldn&apos;t be used at all, instead we call dmu_buf_hold_array_by_bonus() to fill the buffers with size equal to one applied by the client (should be 1M).&lt;/p&gt;</comment>
                            <comment id="106150" author="rpwagner" created="Sat, 7 Feb 2015 05:42:29 +0000"  >&lt;p&gt;OK, I&apos;ll do another perf analysis and see where the time went. At some point, the data requested will surpass the ARC, and need to begin streaming off of disk efficiently. I do wonder (out of ignorance) if the DMU layer is more related to the spl_kmem_cache than the ARC. &lt;/p&gt;</comment>
                            <comment id="106151" author="bzzz" created="Sat, 7 Feb 2015 05:45:24 +0000"  >&lt;p&gt;in the meantime, I&apos;ll try to add more I/O stats to osd-zfs/.&lt;/p&gt;</comment>
                            <comment id="106152" author="rpwagner" created="Sat, 7 Feb 2015 05:48:48 +0000"  >&lt;p&gt;Any instructions on how to pull debugging information from the running tasks would be appreciated. I am willing to use gdb, but I don&apos;t know how much guidance that would require to introspect the correct function.&lt;/p&gt;</comment>
                            <comment id="106153" author="bzzz" created="Sat, 7 Feb 2015 05:58:42 +0000"  >&lt;p&gt;what kind of information? I&apos;m planning to add I/O stats available in /proc/fs/lustre/..&lt;/p&gt;</comment>
                            <comment id="106154" author="rpwagner" created="Sat, 7 Feb 2015 06:08:30 +0000"  >&lt;p&gt;Like rpcstats on the client, a report of the {{len}}s and {{offset}}s in the call to dmu_buf_hold_array_by_bonus seems good, since this is the interface to the ZFS layer. Basically, summary statistics of the calls to ZFS from Lustre.&lt;/p&gt;</comment>
                            <comment id="106155" author="rpwagner" created="Sat, 7 Feb 2015 06:13:44 +0000"  >&lt;p&gt;The patch for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4820&quot; title=&quot;extra memcpy in read path&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4820&quot;&gt;&lt;del&gt;LU-4820&lt;/del&gt;&lt;/a&gt; may have hit an unintended consequence. Something about relying on dmu_buf_hold_array_by_bonus and the ARC has triggered the swapper task, as seen in this &lt;a href=&quot;http://users.sdsc.edu/~rpwagner/perf-kernel-1ost-no-arc-no-dmu-read.svg&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;flame graph for 1 OST&lt;/a&gt;. I triggered this by zeroing out the ARC. This could also be done by requesting data over the size of the ARC.&lt;/p&gt;</comment>
                            <comment id="106156" author="bzzz" created="Sat, 7 Feb 2015 06:16:12 +0000"  >&lt;p&gt;actually all the patch does is removing unnecessary dmu_read(). I&apos;d think running swapper isn&apos;t a problem as long as this doesn&apos;t block reads. it should be releasing memory a bit ahead so that ARC always can get memory for new buffers immediately.&lt;/p&gt;</comment>
                            <comment id="106159" author="rpwagner" created="Sat, 7 Feb 2015 16:24:18 +0000"  >&lt;p&gt;I see your point, Alex, and it&apos;s shown that there must be another issue with the echo client. I went to 4 remote hosts, with each pulling a single file from the same OST using dd, and the zpool is happily streaming over 1 GB/s. I will move on to multiple OSTs, and if I see the same sub-linear scaling with additional OSTs I will do my perf analysis using remote clients.&lt;/p&gt;</comment>
                            <comment id="106163" author="adilger" created="Sun, 8 Feb 2015 01:42:05 +0000"  >&lt;blockquote&gt;
&lt;p&gt;I&apos;m planning to add I/O stats available in /proc/fs/lustre/..&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;The &lt;tt&gt;/proc/fs/lustre/osd&amp;#45;zfs/&amp;#42;/brw&amp;#95;stats&lt;/tt&gt; file was added to master for 2.7.0 via &lt;a href=&quot;http://review.whamcloud.com/11467&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/11467&lt;/a&gt; so this may be useful for your testing.  See also my comments in Alex&apos;s patch &lt;a href=&quot;http://review.whamcloud.com/12991&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/12991&lt;/a&gt; which should be fixed up to move the brw_stats accounting around the &lt;tt&gt;dmu_buf_hold_array_by_bonus()&lt;/tt&gt; loop so it accurately measures read times.  Unfortunately, measuring write times is much more difficult for ZFS, since the write completion only happens down in ZFS.&lt;/p&gt;</comment>
                            <comment id="106181" author="rpwagner" created="Sun, 8 Feb 2015 04:43:23 +0000"  >&lt;p&gt;Thanks, Andreas. Since I&apos;m working from the Git master with all the patches I&apos;ve needed to get this far (including large block support in ZFS), it turns out that&apos;s already in place. I can see that all of the IO requests are at least 1M in size. Very useful. For both reads and writes, I do find &lt;/p&gt;
{zpool iostat &amp;lt;seconds&amp;gt;}
&lt;p&gt; to be a good check of what&apos;s going in and out of the disks, and ignores reads from cache.&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;$ cat brw_stats 
snapshot_time:         1423367149.544364 (secs.usecs)

                           read      |     write
pages per bulk r/w     rpcs  % cum % |  rpcs        % cum %
1:		      4408   0   0   |  160  58  58
2:		        10   0   0   |  114  41 100
4:		         0   0   0   |    0   0 100
8:		         0   0   0   |    0   0 100
16:		         0   0   0   |    0   0 100
32:		         0   0   0   |    0   0 100
64:		         0   0   0   |    0   0 100
128:		         0   0   0   |    0   0 100
256:		    456654  89  89   |    0   0 100
512:		      8732   1  91   |    0   0 100
1K:		     42704   8 100   |    0   0 100

                           read      |     write
discontiguous pages    rpcs  % cum % |  rpcs        % cum %
0:		    512508 100 100   |  274 100 100

                           read      |     write
disk I/Os in flight    ios   % cum % |  ios         % cum %
1:		    511312  99  99   |  273  99  99
2:		      1123   0  99   |    1   0 100
3:		        62   0  99   |    0   0 100
4:		         9   0  99   |    0   0 100
5:		         1   0  99   |    0   0 100
6:		         1   0 100   |    0   0 100

                           read      |     write
I/O time (1/1000s)     ios   % cum % |  ios         % cum %
1:		       494  98  98   |    0   0   0
2:		         0   0  98   |    0   0   0
4:		         0   0  98   |    0   0   0
8:		         1   0  99   |    0   0   0
16:		         2   0  99   |    0   0   0
32:		         0   0  99   |    0   0   0
64:		         1   0  99   |    0   0   0
128:		         0   0  99   |    0   0   0
256:		         2   0 100   |    0   0   0

                           read      |     write
disk I/O size          ios   % cum % |  ios         % cum %
8:		         4   0   0   |    9   3   3
16:		         0   0   0   |    0   0   3
32:		         1   0   0   |    0   0   3
64:		         1   0   0   |    2   0   4
128:		        17   0   0   |   83  30  34
256:		         1   0   0   |   64  23  57
512:		         3   0   0   |    2   0  58
1K:		         2   0   0   |    0   0  58
2K:		         0   0   0   |    0   0  58
4K:		      4379   0   0   |    0   0  58
8K:		        10   0   0   |  114  41 100
16K:		         0   0   0   |    0   0 100
32K:		         0   0   0   |    0   0 100
64K:		         0   0   0   |    0   0 100
128K:		         0   0   0   |    0   0 100
256K:		         0   0   0   |    0   0 100
512K:		         0   0   0   |    0   0 100
1M:		    456654  89  89   |    0   0 100
2M:		      8732   1  91   |    0   0 100
4M:		     42704   8 100   |    0   0 100
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;At the moment, Alex&apos;s patch to eliminate the use of dmu_read apparently was the last thing that was needed to get the osd-zfs layer working at speed. The final piece seems to be balancing client requests across the &lt;tt&gt;socknal_sd*&lt;/tt&gt; tasks. If I understand what I&apos;ve read correctly and compare it to what I see, &lt;tt&gt;portal_rotor&lt;/tt&gt; using &lt;tt&gt;HASH_RT&lt;/tt&gt; is mapping client NIDs to a particular socknal_sd task. I&apos;d like to know how many clients each task is handling, so that I can tell how balanced the requests are. An LNET self test showed 9 GB/s with 16 clients, but trying with data from disk that falls to between 5.5 and 6 GB/s.&lt;/p&gt;

&lt;p&gt;A snippet from &lt;tt&gt;top&lt;/tt&gt;&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt; 12461 root      20   0     0    0    0 R 89.2  0.0  16:52.32 socknal_sd00_01                                      
 12471 root      20   0     0    0    0 R 80.6  0.0  12:09.99 socknal_sd03_02                                      
 12467 root      20   0     0    0    0 R 66.8  0.0   9:00.63 socknal_sd02_01                                      
 12464 root      20   0     0    0    0 R 43.4  0.0   8:09.87 socknal_sd01_01                                      
 12466 root      20   0     0    0    0 R 32.5  0.0   6:40.28 socknal_sd02_00                                      
 12465 root      20   0     0    0    0 R 31.4  0.0   5:18.51 socknal_sd01_02                                      
 12468 root      20   0     0    0    0 S 20.2  0.0   4:06.25 socknal_sd02_02                                      
 12463 root      20   0     0    0    0 R 18.4  0.0   3:43.80 socknal_sd01_00                                      
 12460 root      20   0     0    0    0 S 16.6  0.0   4:10.38 socknal_sd00_00                                      
 12470 root      20   0     0    0    0 S 10.1  0.0   1:17.31 socknal_sd03_01  
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I&apos;m also going to try setting the CPU pattern, since libcfs is breaking up a dual socket system (Intel E5-2640v2) into 4 partitions.&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;cat /proc/sys/lnet/cpu_partition_table 
0	: 0 1 2 3 
1	: 4 5 6 7 
2	: 8 9 10 11 
3	: 12 13 14 15 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Since this veers away from the original ticket by dealing with possible LNET and SMP tuning, I would be glad to open a new one.&lt;/p&gt;</comment>
                            <comment id="106206" author="adilger" created="Sun, 8 Feb 2015 08:26:41 +0000"  >&lt;p&gt;I was misled by the version field of this issue - I thought you were running 2.6.0, but I now see this wasn&apos;t originally your ticket.&lt;/p&gt;

&lt;p&gt;Good to hear that Alex&apos;s latest patch is helping out. It also would be a trivial change to move the IO stats as I&apos;d suggested in my review, so that you would get an accurate value for the IO times in brw_stats. They are all currently reporting 1/1000s, which isn&apos;t very likely. &lt;/p&gt;

&lt;p&gt;I would definitely ask that you file a separate ticket for the socklnd and cpu partition tunings. &lt;/p&gt;</comment>
                            <comment id="106207" author="adilger" created="Sun, 8 Feb 2015 08:32:00 +0000"  >&lt;p&gt;Rick, have you tried increasing the maximum RPC size from the clients?&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lctl set_param osc.*.max_pages_per_rpc=4M
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;That may increase your IO performance, it may not, depending on where the bottleneck is. &lt;/p&gt;

&lt;p&gt;Have you tried &lt;tt&gt;iostat -x -k -z 1&lt;/tt&gt; to see what the utilization of the disks is?  Presumably they are not hitting the peak, but I wonder if the IO load generated by Lustre RPCs is different than that from the local ZFS mount, or if the io submission at the SCSI level is getting chopped up?&lt;/p&gt;</comment>
                            <comment id="106208" author="rpwagner" created="Sun, 8 Feb 2015 08:58:07 +0000"  >&lt;p&gt;Andreas, after tracking &lt;tt&gt;rpc_stats&lt;/tt&gt; for a while, I have &lt;tt&gt;max_pages_per_rpc = 1024&lt;/tt&gt; and &lt;tt&gt;max_rpcs_in_flight = 32&lt;/tt&gt;. I believe this is why the disk I/O size reported by &lt;tt&gt;brw_stats&lt;/tt&gt; started showing 4M reads, since that was changed after several tests.&lt;/p&gt;

&lt;p&gt;I have been watching zpool iostat to see how much data is flowing from the pool. The number of operations agrees with a 1024K record size, and what I saw from collectl showed 128K reads from the individual drives (we&apos;re currently using raidz2 with 8+2 zpools).&lt;/p&gt;

&lt;p&gt;Below is a capture from zpool iostat during the test, after I changed the CPU partitioning:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;$ zpool iostat 10
               capacity     operations    bandwidth
pool        alloc   free   read  write   read  write
----------  -----  -----  -----  -----  -----  -----
...wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; test...
mdt         36.6M  59.5G      0      0      0      0
ost0         937G  35.3T  1.15K      0  1.15G      0
ost1         888G  35.4T  1.07K      0  1.07G      0
ost2         860G  35.4T  1.09K      0  1.08G      0
ost3         881G  35.4T  1.12K      0  1.11G      0
ost4         910G  35.4T  1.21K      0  1.21G      0
ost5         854G  35.4T  1.01K      0  1.01G      0
----------  -----  -----  -----  -----  -----  -----
mdt         36.6M  59.5G      0      0      0      0
ost0         937G  35.3T  1.05K      0  1.05G      0
ost1         888G  35.4T  1.14K      0  1.13G      0
ost2         860G  35.4T  1.18K      0  1.17G      0
ost3         881G  35.4T  1.16K      0  1.16G      0
ost4         910G  35.4T  1.15K      0  1.14G      0
ost5         854G  35.4T   1023      0  1017M      0
----------  -----  -----  -----  -----  -----  -----
mdt         36.6M  59.5G      0      0      0      0
ost0         937G  35.3T  1.16K      0  1.15G      0
ost1         888G  35.4T  1.06K      0  1.05G      0
ost2         860G  35.4T  1.11K      0  1.10G      0
ost3         881G  35.4T  1.06K      0  1.05G      0
ost4         910G  35.4T  1.16K      0  1.15G      0
ost5         854G  35.4T  1.02K      0  1.02G      0
----------  -----  -----  -----  -----  -----  -----
mdt         36.6M  59.5G      0      0      0      0
ost0         937G  35.3T  1.09K      0  1.08G      0
ost1         888G  35.4T  1.03K      0  1.02G      0
ost2         860G  35.4T  1.10K      0  1.09G      0
ost3         881G  35.4T  1.12K      0  1.11G      0
ost4         910G  35.4T  1.29K      0  1.29G      0
ost5         854G  35.4T  1.15K      0  1.14G      0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This still leaves about 2 GB/s of performance available when compared to native ZFS and the LNET self test. I expect that if I generate another perf analysis, it will show the time spent in handling the RPCs via &lt;tt&gt;socknal_sd*&lt;/tt&gt;.&lt;/p&gt;</comment>
                            <comment id="106210" author="gabriele.paciucci" created="Sun, 8 Feb 2015 10:32:36 +0000"  >&lt;p&gt;Hi Rick,&lt;br/&gt;
take a look at the /proc/sys/lnet/peers and see if your queue is big enough. If you find any negative values, please increase the peer_credits and credits value for LNET. &lt;br/&gt;
I can suggest as &quot;gold&quot; rule:&lt;br/&gt;
peer_credits=max_rpc_inflight&lt;br/&gt;
credits= 4x peer_credits&lt;/p&gt;

&lt;p&gt;remember to export this value to all the cluster&lt;/p&gt;</comment>
                            <comment id="106215" author="rpwagner" created="Sun, 8 Feb 2015 20:56:16 +0000"  >&lt;p&gt;Gabriele, thanks. There are negative numbers in &lt;tt&gt;/proc/sys/lnet/peers&lt;/tt&gt;, and even bumping up the credits on the server gave 10% or so improvement. I&apos;ll have to shift to another set of clients to test both sides, since I&apos;m using production system nodes as clients and can&apos;t reload the kernel modules. This would help explain the remaining bottleneck.&lt;/p&gt;</comment>
                            <comment id="106238" author="gabriele.paciucci" created="Mon, 9 Feb 2015 08:56:58 +0000"  >&lt;p&gt;If you are using Ethernet, you should also tune the sysctl.conf. Please refer to your Ethernet vendor. This is a good starting point from Mellanox but you can apply it to other vendors. &lt;br/&gt;
&lt;a href=&quot;http://www.mellanox.com/related-docs/prod_software/Performance_Tuning_Guide_for_Mellanox_Network_Adapters.pdf&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://www.mellanox.com/related-docs/prod_software/Performance_Tuning_Guide_for_Mellanox_Network_Adapters.pdf&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="106439" author="rpwagner" created="Tue, 10 Feb 2015 14:37:21 +0000"  >&lt;p&gt;Andreas &amp;amp; Gabriele, I have moved my network tuning questions over to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6228&quot; title=&quot;How to balance network connections across socknal_sd tasks?&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6228&quot;&gt;&lt;del&gt;LU-6228&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="112214" author="cliffw" created="Thu, 16 Apr 2015 15:55:01 +0000"  >&lt;p&gt;I am a bit confused by all the network tuning comments. Are there patches available that have not landed in 2.7.52?&lt;/p&gt;</comment>
                            <comment id="112215" author="cliffw" created="Thu, 16 Apr 2015 16:01:47 +0000"  >&lt;p&gt;Watchdogs continue with prefetch disabled.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;LNet: Service thread pid 64826 was inactive for 200.00s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:&lt;br/&gt;
Pid: 64826, comm: ll_ost03_025&lt;/p&gt;

&lt;p&gt;Call Trace:&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa054f790&amp;gt;&amp;#93;&lt;/span&gt; ? vdev_mirror_child_done+0x0/0x30 &lt;span class=&quot;error&quot;&gt;&amp;#91;zfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8152acee&amp;gt;&amp;#93;&lt;/span&gt; ? mutex_lock+0x1e/0x50&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8152acee&amp;gt;&amp;#93;&lt;/span&gt; ? mutex_lock+0x1e/0x50&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81529e83&amp;gt;&amp;#93;&lt;/span&gt; io_schedule+0x73/0xc0&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa044b596&amp;gt;&amp;#93;&lt;/span&gt; cv_wait_common+0xa6/0x120 &lt;span class=&quot;error&quot;&gt;&amp;#91;spl&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8109afa0&amp;gt;&amp;#93;&lt;/span&gt; ? autoremove_wake_function+0x0/0x40&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa044b628&amp;gt;&amp;#93;&lt;/span&gt; __cv_wait_io+0x18/0x20 &lt;span class=&quot;error&quot;&gt;&amp;#91;spl&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa058c81b&amp;gt;&amp;#93;&lt;/span&gt; zio_wait+0xfb/0x1c0 &lt;span class=&quot;error&quot;&gt;&amp;#91;zfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa04f673a&amp;gt;&amp;#93;&lt;/span&gt; dbuf_read+0x47a/0x7f0 &lt;span class=&quot;error&quot;&gt;&amp;#91;zfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa04fed98&amp;gt;&amp;#93;&lt;/span&gt; dmu_buf_hold+0x108/0x1d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;zfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0555ab2&amp;gt;&amp;#93;&lt;/span&gt; zap_get_leaf_byblk+0x52/0x300 &lt;span class=&quot;error&quot;&gt;&amp;#91;zfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0554584&amp;gt;&amp;#93;&lt;/span&gt; ? zap_idx_to_blk+0xe4/0x150 &lt;span class=&quot;error&quot;&gt;&amp;#91;zfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0555dca&amp;gt;&amp;#93;&lt;/span&gt; zap_deref_leaf+0x6a/0x80 &lt;span class=&quot;error&quot;&gt;&amp;#91;zfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0556430&amp;gt;&amp;#93;&lt;/span&gt; fzap_lookup+0x60/0x120 &lt;span class=&quot;error&quot;&gt;&amp;#91;zfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa05598f8&amp;gt;&amp;#93;&lt;/span&gt; ? zap_name_alloc+0x88/0xf0 &lt;span class=&quot;error&quot;&gt;&amp;#91;zfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa055ba21&amp;gt;&amp;#93;&lt;/span&gt; zap_lookup_norm+0xe1/0x190 &lt;span class=&quot;error&quot;&gt;&amp;#91;zfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa055bb63&amp;gt;&amp;#93;&lt;/span&gt; zap_lookup+0x33/0x40 &lt;span class=&quot;error&quot;&gt;&amp;#91;zfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa108afa5&amp;gt;&amp;#93;&lt;/span&gt; osd_fid_lookup+0xb5/0x2f0 &lt;span class=&quot;error&quot;&gt;&amp;#91;osd_zfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa1084a1c&amp;gt;&amp;#93;&lt;/span&gt; osd_object_init+0x19c/0x6c0 &lt;span class=&quot;error&quot;&gt;&amp;#91;osd_zfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa03bb798&amp;gt;&amp;#93;&lt;/span&gt; ? libcfs_log_return+0x28/0x40 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0fac9d9&amp;gt;&amp;#93;&lt;/span&gt; ? ofd_object_init+0x99/0x180 &lt;span class=&quot;error&quot;&gt;&amp;#91;ofd&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa07c6318&amp;gt;&amp;#93;&lt;/span&gt; lu_object_alloc+0xd8/0x320 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa07c7821&amp;gt;&amp;#93;&lt;/span&gt; lu_object_find_try+0x151/0x260 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa07c79e1&amp;gt;&amp;#93;&lt;/span&gt; lu_object_find_at+0xb1/0xe0 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa03bf161&amp;gt;&amp;#93;&lt;/span&gt; ? libcfs_debug_msg+0x41/0x50 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa07c7a26&amp;gt;&amp;#93;&lt;/span&gt; lu_object_find+0x16/0x20 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0fc3215&amp;gt;&amp;#93;&lt;/span&gt; ofd_object_find+0x35/0xf0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ofd&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0fc5b0b&amp;gt;&amp;#93;&lt;/span&gt; ofd_precreate_objects+0x1fb/0x19e0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ofd&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa03bf161&amp;gt;&amp;#93;&lt;/span&gt; ? libcfs_debug_msg+0x41/0x50 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0fd2928&amp;gt;&amp;#93;&lt;/span&gt; ? ofd_grant_create+0x2b8/0x450 &lt;span class=&quot;error&quot;&gt;&amp;#91;ofd&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0fb6ca6&amp;gt;&amp;#93;&lt;/span&gt; ofd_create_hdl+0x566/0x25c0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ofd&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa09e78c0&amp;gt;&amp;#93;&lt;/span&gt; ? lustre_pack_reply_v2+0x220/0x280 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0a4946e&amp;gt;&amp;#93;&lt;/span&gt; tgt_request_handle+0x8be/0x1000 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa09f8e61&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_main+0xe41/0x1960 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa09f8020&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_main+0x0/0x1960 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8109abf6&amp;gt;&amp;#93;&lt;/span&gt; kthread+0x96/0xa0&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c20a&amp;gt;&amp;#93;&lt;/span&gt; child_rip+0xa/0x20&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8109ab60&amp;gt;&amp;#93;&lt;/span&gt; ? kthread+0x0/0xa0&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c200&amp;gt;&amp;#93;&lt;/span&gt; ? child_rip+0x0/0x20&lt;/p&gt;

&lt;p&gt;LustreError: dumping log to /tmp/lustre-log.1429199475.64826&lt;/p&gt;</comment>
                            <comment id="112218" author="cliffw" created="Thu, 16 Apr 2015 16:06:13 +0000"  >&lt;p&gt;Lustre-log dumped by watchdog, all files under /proc/spl on OST&lt;/p&gt;</comment>
                            <comment id="112221" author="cliffw" created="Thu, 16 Apr 2015 16:10:55 +0000"  >&lt;p&gt;/proc/spl from the MDS&lt;/p&gt;</comment>
                            <comment id="112296" author="bzzz" created="Fri, 17 Apr 2015 07:07:49 +0000"  >&lt;p&gt;the following isn&apos;t exactly the same, but looks very similar:&lt;/p&gt;

&lt;p&gt;13:06:17:INFO: task txg_sync:16276 blocked for more than 120 seconds.&lt;br/&gt;
13:06:17:      Tainted: P           ---------------    2.6.32-504.12.2.el6_lustre.g036b949.x86_64 #1&lt;br/&gt;
13:06:17:&quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.&lt;br/&gt;
13:06:17:txg_sync      D 0000000000000001     0 16276      2 0x00000080&lt;br/&gt;
13:06:17: ffff88006de4b890 0000000000000046 ffff88006de4b820 ffffffff81041e98&lt;br/&gt;
13:06:17: 00000000ffffffff 000007051e05132a 0000000000000000 ffff88007918e980&lt;br/&gt;
13:06:17: 00000000002301ae ffffffffaad2f4da ffff88006fc5bab8 ffff88006de4bfd8&lt;br/&gt;
13:06:17:Call Trace:&lt;br/&gt;
13:06:17: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81041e98&amp;gt;&amp;#93;&lt;/span&gt; ? pvclock_clocksource_read+0x58/0xd0&lt;br/&gt;
13:06:17: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff810aaa21&amp;gt;&amp;#93;&lt;/span&gt; ? ktime_get_ts+0xb1/0xf0&lt;br/&gt;
13:06:17: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8152aad3&amp;gt;&amp;#93;&lt;/span&gt; io_schedule+0x73/0xc0&lt;br/&gt;
13:06:17: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0145596&amp;gt;&amp;#93;&lt;/span&gt; cv_wait_common+0xa6/0x120 &lt;span class=&quot;error&quot;&gt;&amp;#91;spl&amp;#93;&lt;/span&gt;&lt;br/&gt;
13:06:17: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8109eb00&amp;gt;&amp;#93;&lt;/span&gt; ? autoremove_wake_function+0x0/0x40&lt;br/&gt;
13:06:17: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0145628&amp;gt;&amp;#93;&lt;/span&gt; __cv_wait_io+0x18/0x20 &lt;span class=&quot;error&quot;&gt;&amp;#91;spl&amp;#93;&lt;/span&gt;&lt;br/&gt;
13:08:18: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa028f81b&amp;gt;&amp;#93;&lt;/span&gt; zio_wait+0xfb/0x1c0 &lt;span class=&quot;error&quot;&gt;&amp;#91;zfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
13:08:18: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa029134b&amp;gt;&amp;#93;&lt;/span&gt; zio_free+0xab/0xe0 &lt;span class=&quot;error&quot;&gt;&amp;#91;zfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
13:08:18: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa02279a1&amp;gt;&amp;#93;&lt;/span&gt; dsl_free+0x11/0x20 &lt;span class=&quot;error&quot;&gt;&amp;#91;zfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
13:08:18: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa021b102&amp;gt;&amp;#93;&lt;/span&gt; dsl_dataset_block_kill+0x352/0x380 &lt;span class=&quot;error&quot;&gt;&amp;#91;zfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
13:08:18: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0214bee&amp;gt;&amp;#93;&lt;/span&gt; free_blocks+0x6e/0xb0 &lt;span class=&quot;error&quot;&gt;&amp;#91;zfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
13:08:18: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0215838&amp;gt;&amp;#93;&lt;/span&gt; dnode_sync+0x4c8/0xac0 &lt;span class=&quot;error&quot;&gt;&amp;#91;zfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
13:08:18: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa01fb3fb&amp;gt;&amp;#93;&lt;/span&gt; ? dbuf_sync_list+0x7b/0x80 &lt;span class=&quot;error&quot;&gt;&amp;#91;zfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
13:08:18: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa01f034a&amp;gt;&amp;#93;&lt;/span&gt; ? arc_write+0xea/0x100 &lt;span class=&quot;error&quot;&gt;&amp;#91;zfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
13:08:18: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0204e49&amp;gt;&amp;#93;&lt;/span&gt; dmu_objset_sync_dnodes+0x89/0xb0 &lt;span class=&quot;error&quot;&gt;&amp;#91;zfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
13:08:18: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa020503a&amp;gt;&amp;#93;&lt;/span&gt; dmu_objset_sync+0x1ca/0x2d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;zfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
13:08:18: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa02040c0&amp;gt;&amp;#93;&lt;/span&gt; ? dmu_objset_write_ready+0x0/0x50 &lt;span class=&quot;error&quot;&gt;&amp;#91;zfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
13:08:18: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0205140&amp;gt;&amp;#93;&lt;/span&gt; ? dmu_objset_write_done+0x0/0x70 &lt;span class=&quot;error&quot;&gt;&amp;#91;zfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
13:08:18: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0222b8b&amp;gt;&amp;#93;&lt;/span&gt; dsl_pool_sync+0x2ab/0x3f0 &lt;span class=&quot;error&quot;&gt;&amp;#91;zfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
13:08:18: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa023b8bf&amp;gt;&amp;#93;&lt;/span&gt; spa_sync+0x40f/0xa70 &lt;span class=&quot;error&quot;&gt;&amp;#91;zfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
13:08:18: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0245771&amp;gt;&amp;#93;&lt;/span&gt; ? spa_txg_history_set+0xc1/0xf0 &lt;span class=&quot;error&quot;&gt;&amp;#91;zfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
13:08:18: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0248c7d&amp;gt;&amp;#93;&lt;/span&gt; txg_sync_thread+0x30d/0x520 &lt;span class=&quot;error&quot;&gt;&amp;#91;zfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
13:08:18: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8105c2f9&amp;gt;&amp;#93;&lt;/span&gt; ? set_user_nice+0xc9/0x130&lt;br/&gt;
13:08:18: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0248970&amp;gt;&amp;#93;&lt;/span&gt; ? txg_sync_thread+0x0/0x520 &lt;span class=&quot;error&quot;&gt;&amp;#91;zfs&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://testing.hpdd.intel.com/test_logs/c128d706-e305-11e4-a348-5254006e85c2/show_text&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_logs/c128d706-e305-11e4-a348-5254006e85c2/show_text&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="117870" author="gerrit" created="Tue, 9 Jun 2015 05:38:03 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;http://review.whamcloud.com/13612/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/13612/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5278&quot; title=&quot;ZFS - many OST watchdogs with IOR&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5278&quot;&gt;&lt;del&gt;LU-5278&lt;/del&gt;&lt;/a&gt; echo: request pages in batches&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 89021de564c27f38a4146357e58dd80ddf68e246&lt;/p&gt;</comment>
                            <comment id="117887" author="pjones" created="Tue, 9 Jun 2015 12:44:26 +0000"  >&lt;p&gt;Landed for 2.8&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="18249">LU-3109</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="27121">LU-5775</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to">
                                                        </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="28723">LU-6254</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="23902">LU-4820</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="28634">LU-6228</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="16385" name="Hyperion Performance 17 Nov 2014.xlsx" size="134729" author="cliffw" created="Mon, 17 Nov 2014 19:35:29 +0000"/>
                            <attachment id="15280" name="ior.iws28.txt.gz" size="227" author="cliffw" created="Tue, 1 Jul 2014 15:42:49 +0000"/>
                            <attachment id="15281" name="iws24.dump.txt.gz" size="229" author="cliffw" created="Tue, 1 Jul 2014 16:55:58 +0000"/>
                            <attachment id="16381" name="iws28.dump.txt.gz" size="229" author="cliffw" created="Fri, 14 Nov 2014 18:36:51 +0000"/>
                            <attachment id="17498" name="lustre-log.1429199475.64826.txt.gz" size="263" author="cliffw" created="Thu, 16 Apr 2015 16:06:13 +0000"/>
                            <attachment id="17499" name="proc_spl.tgz" size="4184693" author="cliffw" created="Thu, 16 Apr 2015 16:06:13 +0000"/>
                            <attachment id="17500" name="proc_spl_MDS.tgz" size="4080367" author="cliffw" created="Thu, 16 Apr 2015 16:10:55 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzwqbr:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>14730</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>