<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:22:04 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-8964] use parallel I/O to improve performance on machines with slow single thread performance</title>
                <link>https://jira.whamcloud.com/browse/LU-8964</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;On machines with slow single thread performance like KNL the bottleneck of I/O performance moved into code which just copy memory from one buffer to other (from user space to kernel or vice versa). In current Lustre implementation all I/O performs in single thread and this is become an issue for KNL. Significantly improve performance can be with solution which do parallel memory transfer of large buffers.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</description>
                <environment></environment>
        <key id="42585">LU-8964</key>
            <summary>use parallel I/O to improve performance on machines with slow single thread performance</summary>
                <type id="2" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11311&amp;avatarType=issuetype">New Feature</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="3">Duplicate</resolution>
                                        <assignee username="simmonsja">James A Simmons</assignee>
                                    <reporter username="dmiter">Dmitry Eremin</reporter>
                        <labels>
                    </labels>
                <created>Wed, 21 Dec 2016 15:02:02 +0000</created>
                <updated>Tue, 5 Nov 2019 05:41:12 +0000</updated>
                            <resolved>Thu, 7 Mar 2019 23:39:04 +0000</resolved>
                                                                        <due></due>
                            <votes>0</votes>
                                    <watches>26</watches>
                                                                            <comments>
                            <comment id="178665" author="gerrit" created="Wed, 21 Dec 2016 16:40:12 +0000"  >&lt;p&gt;Dmitry Eremin (dmitry.eremin@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/24474&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/24474&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8964&quot; title=&quot;use parallel I/O to improve performance on machines with slow single thread performance&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8964&quot;&gt;&lt;del&gt;LU-8964&lt;/del&gt;&lt;/a&gt; libcfs: Introduce parallel tasks framework&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 3806af960d1cee93e7643040bcee8eb23404ec3d&lt;/p&gt;</comment>
                            <comment id="178666" author="gerrit" created="Wed, 21 Dec 2016 16:40:20 +0000"  >&lt;p&gt;&lt;del&gt;Dmitry Eremin (dmitry.eremin@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/24475&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/24475&lt;/a&gt;&lt;/del&gt;&lt;br/&gt;
 &lt;del&gt;Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8964&quot; title=&quot;use parallel I/O to improve performance on machines with slow single thread performance&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8964&quot;&gt;&lt;del&gt;LU-8964&lt;/del&gt;&lt;/a&gt; llite: Parallelize generic I/O&lt;/del&gt;&lt;br/&gt;
 &lt;del&gt;Project: fs/lustre-release&lt;/del&gt;&lt;br/&gt;
 &lt;del&gt;Branch: master&lt;/del&gt;&lt;br/&gt;
 &lt;del&gt;Current Patch Set: 1&lt;/del&gt;&lt;br/&gt;
 &lt;del&gt;Commit: 88d7234f1a26678aebcd3adb0aeb2c17630d8965&lt;/del&gt;&lt;/p&gt;</comment>
                            <comment id="178667" author="gerrit" created="Wed, 21 Dec 2016 16:40:28 +0000"  >&lt;p&gt;&lt;del&gt;Dmitry Eremin (dmitry.eremin@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/24476&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/24476&lt;/a&gt;&lt;/del&gt;&lt;br/&gt;
 &lt;del&gt;Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8964&quot; title=&quot;use parallel I/O to improve performance on machines with slow single thread performance&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8964&quot;&gt;&lt;del&gt;LU-8964&lt;/del&gt;&lt;/a&gt; llite: Parallelize readahead and make it async&lt;/del&gt;&lt;br/&gt;
 &lt;del&gt;Project: fs/lustre-release&lt;/del&gt;&lt;br/&gt;
 &lt;del&gt;Branch: master&lt;/del&gt;&lt;br/&gt;
 &lt;del&gt;Current Patch Set: 1&lt;/del&gt;&lt;br/&gt;
 &lt;del&gt;Commit: 86a24764485b980d0707a996b0cf58582a585fd3&lt;/del&gt;&lt;/p&gt;</comment>
                            <comment id="181112" author="gerrit" created="Wed, 18 Jan 2017 13:45:58 +0000"  >&lt;p&gt;Dmitry Eremin (dmitry.eremin@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/24933&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/24933&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8964&quot; title=&quot;use parallel I/O to improve performance on machines with slow single thread performance&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8964&quot;&gt;&lt;del&gt;LU-8964&lt;/del&gt;&lt;/a&gt; llite: Make readahead async&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 941395a2d9c93275d5fac10bf3a2cc1ee50d6fe8&lt;/p&gt;</comment>
                            <comment id="181876" author="gerrit" created="Tue, 24 Jan 2017 08:53:29 +0000"  >&lt;p&gt;Dmitry Eremin (dmitry.eremin@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/25046&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/25046&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8964&quot; title=&quot;use parallel I/O to improve performance on machines with slow single thread performance&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8964&quot;&gt;&lt;del&gt;LU-8964&lt;/del&gt;&lt;/a&gt; clio: Parallelize generic I/O&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 3414d26cb7d3bfe40311780fce60dd491d7cc445&lt;/p&gt;</comment>
                            <comment id="183457" author="cengku9660" created="Sat, 4 Feb 2017 10:21:48 +0000"  >&lt;p&gt;Hi Dmitry,&lt;br/&gt;
Which is the latest patchset series?&lt;br/&gt;
I saw two, and they conflict with each other.&lt;br/&gt;
&lt;a href=&quot;https://review.whamcloud.com/#/c/24933/4&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/24933/4&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://review.whamcloud.com/#/c/25046/3&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/25046/3&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
Gu&lt;/p&gt;</comment>
                            <comment id="183525" author="dmiter" created="Mon, 6 Feb 2017 08:58:46 +0000"  >&lt;p&gt;Hi Gu,&lt;br/&gt;
Those patches are different. The first one makes read ahead async and that is all. The second one makes user I/O (read/write) parallel. So, those patches are independent but beneficially to have them both.&lt;/p&gt;</comment>
                            <comment id="183527" author="cengku9660" created="Mon, 6 Feb 2017 09:19:37 +0000"  >&lt;p&gt;Hi Dmitry,&lt;br/&gt;
Good to know, thanks.&lt;br/&gt;
These series are really exciting, hope them can be merged soon.&lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;</comment>
                            <comment id="183557" author="paf" created="Mon, 6 Feb 2017 15:51:22 +0000"  >&lt;p&gt;Dmitry,&lt;/p&gt;

&lt;p&gt;As promised, here&apos;s my readahead microbenchmark.  It&apos;s pretty basic and not very well commented.&lt;/p&gt;

&lt;p&gt;You can turn on and off the 1 page offset I&apos;m using to &quot;break&quot; the strided pattern (see comments on the results below) by uncommenting and commenting out the code on lines 84 and 85 of the file.&lt;/p&gt;

&lt;p&gt;With strided read pattern:&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@centclient02 reverse&amp;#93;&lt;/span&gt;# vim read_readahead_test.c; gcc read_readahead_test.c ;  echo 3 &amp;gt; /proc/sys/vm/drop_caches ;  lctl set_param debug=0; time ./a.out&lt;br/&gt;
debug=0&lt;br/&gt;
pos 0&lt;br/&gt;
pos 1048576, read rc: 1048576&lt;br/&gt;
pos 210763776, read rc: 1048576&lt;br/&gt;
pos 420478976, read rc: 1048576&lt;br/&gt;
pos 630194176, read rc: 1048576&lt;br/&gt;
pos 839909376, read rc: 1048576&lt;br/&gt;
pos 1049624576, read rc: 1048576&lt;br/&gt;
pos 1259339776, read rc: 1048576&lt;br/&gt;
pos 1469054976, read rc: 1048576&lt;br/&gt;
pos 1678770176, read rc: 1048576&lt;br/&gt;
pos 1888485376, read rc: 1048576&lt;br/&gt;
pos 2098200576, read rc: 1048576&lt;br/&gt;
amount read (0) != amount requested (1048576), stopping&lt;br/&gt;
iterations: 1021, read 1071644672 total bytes&lt;/p&gt;

&lt;p&gt;real    0m1.178s&lt;br/&gt;
user    0m0.002s&lt;br/&gt;
sys     0m0.977s&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@centclient02 reverse&amp;#93;&lt;/span&gt;# cat /proc/fs/lustre/llite/centss03-ffff88013d20c000/read_ahead_stats&lt;br/&gt;
snapshot_time             1486349623.499066 secs.usecs&lt;br/&gt;
hits                      261333 samples &lt;span class=&quot;error&quot;&gt;&amp;#91;pages&amp;#93;&lt;/span&gt;&lt;br/&gt;
misses                    45 samples &lt;span class=&quot;error&quot;&gt;&amp;#91;pages&amp;#93;&lt;/span&gt;&lt;br/&gt;
readpage not consecutive  1021 samples &lt;span class=&quot;error&quot;&gt;&amp;#91;pages&amp;#93;&lt;/span&gt;&lt;br/&gt;
failed grab_cache_page    4 samples &lt;span class=&quot;error&quot;&gt;&amp;#91;pages&amp;#93;&lt;/span&gt;&lt;br/&gt;
zero length file          1 samples &lt;span class=&quot;error&quot;&gt;&amp;#91;pages&amp;#93;&lt;/span&gt;&lt;br/&gt;
zero size window          260356 samples &lt;span class=&quot;error&quot;&gt;&amp;#91;pages&amp;#93;&lt;/span&gt;&lt;br/&gt;
failed to reach end       509 samples &lt;span class=&quot;error&quot;&gt;&amp;#91;pages&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@centclient02 reverse&amp;#93;&lt;/span&gt;# echo c &amp;gt; /proc/fs/lustre/llite/centss03-ffff88013d20c000/read_ahead_stats&lt;/p&gt;

&lt;p&gt;With slightly broken strided read pattern (1 page offset on every second read), to simulate disabling strided readahead:&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@centclient02 reverse&amp;#93;&lt;/span&gt;# vim read_readahead_test.c; gcc read_readahead_test.c ;  echo 3 &amp;gt; /proc/sys/vm/drop_caches ;  lctl set_param debug=0; time ./a.out&lt;br/&gt;
debug=0&lt;br/&gt;
pos 0&lt;br/&gt;
pos 1052672, read rc: 1048576&lt;br/&gt;
pos 210972672, read rc: 1048576&lt;br/&gt;
pos 420892672, read rc: 1048576&lt;br/&gt;
pos 630812672, read rc: 1048576&lt;br/&gt;
pos 840732672, read rc: 1048576&lt;br/&gt;
pos 1050652672, read rc: 1048576&lt;br/&gt;
pos 1260572672, read rc: 1048576&lt;br/&gt;
pos 1470492672, read rc: 1048576&lt;br/&gt;
pos 1680412672, read rc: 1048576&lt;br/&gt;
pos 1890332672, read rc: 1048576&lt;br/&gt;
pos 2100252672, read rc: 1048576&lt;br/&gt;
amount read (0) != amount requested (1048576), stopping&lt;br/&gt;
iterations: 1020, read 1070596096 total bytes&lt;/p&gt;

&lt;p&gt;real    0m5.100s&lt;br/&gt;
user    0m0.003s&lt;br/&gt;
sys     0m1.416s&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@centclient02 reverse&amp;#93;&lt;/span&gt;# cat /proc/fs/lustre/llite/centss03-ffff88013d20c000/read_ahead_stats                                  &lt;br/&gt;
snapshot_time             1486349643.260326 secs.usecs&lt;br/&gt;
hits                      260098 samples &lt;span class=&quot;error&quot;&gt;&amp;#91;pages&amp;#93;&lt;/span&gt;&lt;br/&gt;
misses                    1022 samples &lt;span class=&quot;error&quot;&gt;&amp;#91;pages&amp;#93;&lt;/span&gt;&lt;br/&gt;
readpage not consecutive  1020 samples &lt;span class=&quot;error&quot;&gt;&amp;#91;pages&amp;#93;&lt;/span&gt;&lt;br/&gt;
failed grab_cache_page    1020 samples &lt;span class=&quot;error&quot;&gt;&amp;#91;pages&amp;#93;&lt;/span&gt;&lt;br/&gt;
zero length file          1 samples &lt;span class=&quot;error&quot;&gt;&amp;#91;pages&amp;#93;&lt;/span&gt;&lt;br/&gt;
zero size window          258574 samples &lt;span class=&quot;error&quot;&gt;&amp;#91;pages&amp;#93;&lt;/span&gt;&lt;br/&gt;
read-ahead to EOF         1 samples &lt;span class=&quot;error&quot;&gt;&amp;#91;pages&amp;#93;&lt;/span&gt;&lt;br/&gt;
hit max r-a issue         1 samples &lt;span class=&quot;error&quot;&gt;&amp;#91;pages&amp;#93;&lt;/span&gt;&lt;br/&gt;
failed to reach end       1 samples &lt;span class=&quot;error&quot;&gt;&amp;#91;pages&amp;#93;&lt;/span&gt;&lt;/p&gt;


&lt;p&gt;Note the huge difference in time, even though the bytes read are almost exactly the same.&lt;br/&gt;
The misses in the strided case are 45, in the slightly off case, they are 1022.&lt;/p&gt;

&lt;p&gt;(Attachment coming in a moment)&lt;/p&gt;</comment>
                            <comment id="183560" author="paf" created="Mon, 6 Feb 2017 15:54:30 +0000"  >&lt;p&gt;Also, I should mention - It prints out the pos and bytes_read on only every 100th iteration, not every iteration.&lt;/p&gt;</comment>
                            <comment id="183588" author="dmiter" created="Mon, 6 Feb 2017 18:22:26 +0000"  >&lt;p&gt;Patrick,&lt;br/&gt;
 Do you expect the same behavior from new version? Briefly I got the following results on my VM machine:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;strided read pattern: 108 pages miss
iterations: 512, read 537919488 total bytes
real  0m1.279s
user  0m0.002s
sys   0m0.283s
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;broken strided read pattern: 471 pages miss
iterations: 511, read 536870912 total bytes
real  0m1.524s
user  0m0.002s
sys   0m0.229s
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;So, there is no big difference between two patterns in terms of speed but in first case there are low read pages miss.&lt;/p&gt;

&lt;p&gt;Is this acceptable for you or you still will complain about regression?&lt;/p&gt;</comment>
                            <comment id="183592" author="dmiter" created="Mon, 6 Feb 2017 18:34:18 +0000"  >&lt;p&gt;With bigger file (1772392448 bytes (1.8 GB)) the proportion become the same. Asynchronous nature of read ahead become page miss less critical than it was before.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;strided read pattern: 137 pages miss
iterations: 845, read 887095296 total bytes
real 0m2.217s
user 0m0.000s
sys  0m0.499s

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;broken strided read pattern: 756 pages miss
iterations: 844, read 886046720 total bytes
real 0m2.571s
user 0m0.001s
sys  0m0.408s

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="183612" author="paf" created="Mon, 6 Feb 2017 19:20:01 +0000"  >&lt;p&gt;Hmm, that might be fine.  I was also looking at implementing reverse readahead, though I don&apos;t have a prototype yet or anything.  (That&apos;s why I had that test program around.)  It looks like we can&apos;t do anything like that when using the kernel readahead implementation, so that would be a shame.  Reverse reading of anything other than really big blocks is very slow.&lt;/p&gt;

&lt;p&gt;About the performance, two things to try, just to see what they do.&lt;/p&gt;

&lt;p&gt;I think we should probably try a much bigger stride size, in case we&apos;re just reading across the gap &lt;span class=&quot;error&quot;&gt;&amp;#91;i.e. we&amp;#39;re just reading ahead covering both the pages in the gap and the next section we really read...  I don&amp;#39;t think so, but probably worth checking.&amp;#93;&lt;/span&gt;.&lt;/p&gt;

&lt;p&gt;So, instead of:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (strided) {
                                pos = lseek(fd, pos+read_bytes, SEEK_SET);
                                /* ATTN here: Uncomment these lines to add a one
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (strided) {
                                pos = lseek(fd, pos+read_bytes*10, SEEK_SET);
                                /* ATTN here: Uncomment these lines to add a one
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Giving us that larger gap between reads.&lt;/p&gt;

&lt;p&gt;I&apos;d also be curious to see what happens with larger read sizes.  But I expect that larger read sizes will look better with the async implementation, so that&apos;s good.&lt;/p&gt;</comment>
                            <comment id="183616" author="paf" created="Mon, 6 Feb 2017 19:34:55 +0000"  >&lt;p&gt;I&apos;m wondering about Jinshan&apos;s comment that readahead is already async?  It does not look like that&apos;s the case currently (at least not async the way I would mean it) - Looking at logs here, it looks like the read syscall does not complete until all of the reading readahead is doing is complete, even if readahead is reading beyond what the read from userspace requested.&lt;/p&gt;

&lt;p&gt;Is readahead async in that way with your patch?  i.e., if I read a 1 MiB chunk of the file but readahead wants to read 5 MiB, assuming RPC size of 1 MiB, will my read call return once the 1 MiB is present, without waiting for the rest?&lt;/p&gt;</comment>
                            <comment id="186837" author="efocht" created="Thu, 2 Mar 2017 21:25:42 +0000"  >&lt;p&gt;I&apos;m struggling with single stream read performance on ZFS OSTs and tested this patch set. I like it a lot because it simplifies and cleans up the code very much. But without further tunables it will IMO harm single stream sequential read performance on decent ZFS OSTs. The problem is basically that ZFS prefetch is very sensitive to the patterns it receives.&lt;/p&gt;

&lt;p&gt;My setup has OSTs which are capable of 1.8GB/s read with one dd directly on the OST (zpool, ZFS filesystem). With the old Lustre prefetcher (without the patchsets of this ticket) I get best performance if I limit the max_rpcs_in_flight to 1 or 2, thus allow for only few ll_ost_io_* threads to do the actual reads on the OSS side. Performance then is close to 1GB/s. Setting max_rpcs_in_flight to 16, 32, or even 256 spoils the performance massively, it ends up in the range of 300-500MB/s.&lt;/p&gt;

&lt;p&gt;With 2.9.0 + &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8964&quot; title=&quot;use parallel I/O to improve performance on machines with slow single thread performance&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8964&quot;&gt;&lt;del&gt;LU-8964&lt;/del&gt;&lt;/a&gt; my results are:&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;rpcs&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; dd 1M
in&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; reads
flight&#160;&#160;&#160;&#160;&#160;&#160;&#160; bandwidth MB/s
&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; &#160; zfs prefetch
&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; disabled&#160;&#160; enabled
------------------------------------------------------
1&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; &#160;&#160; 302&#160;&#160;&#160;&#160;&#160;&#160;&#160; 553
2&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; &#160;&#160; 383&#160;&#160;&#160;&#160;&#160;&#160;&#160; 638
4&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; &#160;&#160; 456&#160;&#160;&#160;&#160;&#160;&#160;&#160; 630
8&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; &#160;&#160; 568&#160;&#160;&#160;&#160;&#160;&#160;&#160; 520
16&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; &#160;&#160; 668&#160;&#160;&#160;&#160;&#160;&#160;&#160; 568
32&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; &#160;&#160; 731&#160;&#160;&#160;&#160;&#160;&#160;&#160; 533
64&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; &#160;&#160; 764&#160;&#160;&#160;&#160;&#160;&#160;&#160; 476
128&#160;&#160;&#160;&#160;&#160;&#160;&#160; &#160;&#160; 751&#160;&#160;&#160;&#160;&#160;&#160;&#160; 479
256&#160;&#160;&#160;&#160;&#160;&#160;&#160; &#160;&#160; 689&#160;&#160;&#160;&#160;&#160;&#160;&#160; 542 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;I don&apos;t get to 1GB/s, that I could reach before, and best results are with disabled ZFS prefetch! Which I don&apos;t like, thinking of the 1.8GB/s that it actually is capable of reaching.&lt;/p&gt;

&lt;p&gt;The problems I see:&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;reads are evenly distributed across ptasks (56 in my case, big CPUs), which creates more randomness in the read pattern that ZFS is seeing on the OST. A mechanism for adjusting or tuning the number of read threads per stream of per OSC-object would be ideal.&lt;/li&gt;
	&lt;li&gt;the size of the reads is the same for all ptasks, larger than 256, but often an odd number (eg. 2341 pages), which leads probably to some large and some small RPCs. I&apos;m playing with the values and am able to get much better results, at least in the single stream sequential case. Will report when my &quot;scan&quot; is done.&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;Clearly this patch is very helpful for finally improving read-ahead, but I suppose it needs somehow two modes, one for ZFS and one for ldiskfs.&lt;/p&gt;</comment>
                            <comment id="186844" author="paf" created="Thu, 2 Mar 2017 21:46:34 +0000"  >&lt;p&gt;Erich,&lt;/p&gt;

&lt;p&gt;Thanks for the testing.  For what it&apos;s worth, ldiskfs is not insensitive to read patterns either.  I think splitting the I/O to these weird sizes is inherently problematic - It should be on more logical boundaries.&lt;/p&gt;

&lt;p&gt;I am really surprised to hear you say that you get better results with 1-2 max_rpcs_in_flight.  That seems like then the server would cope &lt;b&gt;extremely&lt;/b&gt; poorly with reads from multiple clients.  Is that correct?  That seems like something is wrong, beyond just ZFS.&lt;/p&gt;</comment>
                            <comment id="186848" author="efocht" created="Thu, 2 Mar 2017 22:07:38 +0000"  >&lt;p&gt;I am testing with normal clients, not KNL, by the way.&lt;/p&gt;

&lt;p&gt;With many clients, each with its own streams, things look okay. I couldn&apos;t test yet with this async readahead patchset, but the plain old Lustre prefetcher with many rpcs in flight and ZFS OSTs work okay. Even if slower, each stream contributes its share to the total bandwidth and they add up quite well to the more or less expected peak. Some annoying tweaking of max_read_ahead_per_file_mb is necessary, though, and there is no setting that is good for all workloads.&lt;/p&gt;</comment>
                            <comment id="187988" author="gerrit" created="Sun, 12 Mar 2017 00:23:46 +0000"  >&lt;p&gt;Erich Focht (efocht@gmail.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/25943&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/25943&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8964&quot; title=&quot;use parallel I/O to improve performance on machines with slow single thread performance&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8964&quot;&gt;&lt;del&gt;LU-8964&lt;/del&gt;&lt;/a&gt; llite: tunable ptasks number with optimized IO size&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 371d804834fe72078dec82906c04e057346abbdd&lt;/p&gt;</comment>
                            <comment id="187990" author="efocht" created="Sun, 12 Mar 2017 00:57:44 +0000"  >&lt;p&gt;Just uploaded a patch on top of the series from Dmitry.&lt;/p&gt;

&lt;p&gt;It changes the size of the IOs handled by each ptask in ll_readpages(). The original version had the potential to create &quot;odd&quot; npages values (like 276, for example), which translated to a number of large RPCs plus one small one. Worse than the RPC problem are the offsets received by the prefetcher, which (I believe) are rather unaligned in the original case. ZFS prefetcher doesn&apos;t deal well with those, especially if they get scheduled in a different order on the OSS.&lt;/p&gt;

&lt;p&gt;Results: single stream read with dd, 1MB block size, MB/s&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;       zfs-0.6.5.7
&#160;&#160;&#160;&#160;&#160;&#160; max_read_ahead_per_file_mb=256 
rpcs&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; &#160;
&#160;in&#160;&#160;&#160;&#160;&#160; LU-8964&#160;&#160;&#160;&#160;&#160; LU-8964+EF&#160; 
flight&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; ptasks=1&#160;&#160;&#160; &#160;&#160;&#160; &#160;
---------------------------------  &#160;
1&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 553&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 860&#160;&#160;&#160;&#160;&#160;&#160;&#160; &#160;&#160; &#160;
2&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 638&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 728&#160;&#160;&#160;&#160;&#160;&#160;&#160; &#160;&#160; &#160;
4&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 630&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 677&#160;&#160;&#160;&#160;&#160;&#160;&#160; &#160;&#160; &#160;
8&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 520&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 595&#160;&#160;&#160;&#160;&#160;&#160;&#160; &#160;&#160; &#160;
16&#160;&#160;&#160;&#160;&#160;&#160;&#160; 568&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 525&#160;&#160;&#160;&#160;&#160;&#160;&#160; &#160;&#160; &#160;
32&#160;&#160;&#160;&#160;&#160;&#160;&#160; 533&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 485&#160;&#160;&#160;&#160;&#160;&#160;&#160; &#160;&#160; &#160;
64&#160;&#160;&#160;&#160;&#160;&#160;&#160; 476&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 452&#160;&#160;&#160;&#160;&#160;&#160;&#160; &#160;&#160; &#160;
128&#160;&#160;&#160;&#160;&#160;&#160; 479&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 484&#160;&#160;&#160;&#160;&#160;&#160;&#160; &#160;&#160; &#160;
256&#160;&#160;&#160;&#160;&#160;&#160; 542&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 593&#160;&#160;&#160;&#160;&#160;&#160;&#160; &#160;&#160; &#160;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;Switching to zfs-0.7.0rc3 improves things dramatically, because the prefetcher is much better:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;zfs-0.7.0rc3

rpcs&#160;&#160;&#160;&#160; ra_p_file=256 ra_p_file=64 |&#160; ra_p_file=64&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; &#160;
&#160;in&#160;&#160;&#160;&#160;&#160;&#160;&#160; LU-8964&#160;&#160;&#160;&#160;&#160;&#160; LU-8964&#160;&#160;&#160; |&#160; LU-8964+EF&#160;&#160;&#160;&#160; LU-8964+EF&#160; LU-8964+EF
flight&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; |&#160; ptasks=1&#160;&#160;&#160;&#160;&#160;&#160; ptasks=14&#160;&#160; ptasks=28 &#160;
------------------------------------|-------------------------------------&#160; &#160;
1&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 449&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 433&#160;&#160;&#160;&#160;&#160;&#160; |&#160;&#160; 1200&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 1100&#160;&#160;&#160;&#160;&#160;&#160;&#160; 1100&#160;&#160;&#160;&#160; &#160;
2&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 511&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 491&#160;&#160;&#160;&#160;&#160;&#160; |&#160;&#160; 1600&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 1700&#160;&#160;&#160;&#160;&#160;&#160;&#160; 1600&#160;&#160;&#160;&#160; &#160;
4&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 585&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 582&#160;&#160;&#160;&#160;&#160;&#160; |&#160;&#160; 1700&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 1900&#160;&#160;&#160;&#160;&#160;&#160;&#160; 1700&#160;&#160;&#160;&#160; &#160;
8&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 674&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 680&#160;&#160;&#160;&#160;&#160;&#160; |&#160;&#160; 1400&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 1800&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 943&#160;&#160;&#160;&#160; &#160;
16&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 744&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 797&#160;&#160;&#160;&#160;&#160;&#160; |&#160;&#160; 1200&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 1700&#160;&#160;&#160;&#160;&#160;&#160;&#160; 1600&#160;&#160;&#160;&#160; &#160;
32&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 847&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 881&#160;&#160;&#160;&#160;&#160;&#160; |&#160;&#160; 1200&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 1600&#160;&#160;&#160;&#160;&#160;&#160;&#160; 1400&#160;&#160;&#160;&#160; &#160;
64&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 860&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 856&#160;&#160;&#160;&#160;&#160;&#160; |&#160;&#160; 1100&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 1500&#160;&#160;&#160;&#160;&#160;&#160;&#160; 1400&#160;&#160;&#160;&#160; &#160;
128&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 933&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 819&#160;&#160;&#160;&#160;&#160;&#160; |&#160;&#160; 1000&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 1300&#160;&#160;&#160;&#160;&#160;&#160;&#160; 1300&#160;&#160;&#160;&#160; &#160;
256&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 868&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 791&#160;&#160;&#160;&#160;&#160;&#160; |&#160;&#160;&#160; 747&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; 1300&#160;&#160;&#160;&#160;&#160;&#160;&#160; 1200&#160;&#160;&#160;&#160; &#160;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="188031" author="adilger" created="Mon, 13 Mar 2017 09:14:56 +0000"  >&lt;p&gt;Erich, thanks for posting your results.  Definitely 0.7.0 seems very attractive for this workload.&lt;/p&gt;

&lt;p&gt;It looks like your patch is improving the performance significantly as well.  Is the 56-code client machine NUMA, and if yes how many sockets, 4?  I&apos;m not against changing the default ptask count to get better performance out-of-the-box, based on your test results, so long as we can determine what the best setting should be.&lt;/p&gt;

&lt;p&gt;How does ptasks=14 relate to the socket count, and is there a rule we can make there?  It seems like having a thread for each CPU is probably the wrong default, unless they are each working with a different OST?&lt;/p&gt;</comment>
                            <comment id="188045" author="dmiter" created="Mon, 13 Mar 2017 10:41:55 +0000"  >&lt;p&gt;Thanks Erich. This is a good improvement for my patch. But I have another idea of how to implement it. I&apos;m going to update my patch soon. I hope you will test it and review.&lt;/p&gt;</comment>
                            <comment id="188049" author="efocht" created="Mon, 13 Mar 2017 10:53:44 +0000"  >&lt;p&gt;Hi Andreas,&lt;/p&gt;

&lt;p&gt;the client is a dual socket E5-2680 v4 with 14 cores per CPU and HT enabled. In /proc/cpuinfo I see 56 cores. Experiments are on top of Centos 7.2 3.10.0-514.2.2.el7 kernel.&lt;/p&gt;

&lt;p&gt;The read ahead engine is initialized with cpu_online_map:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;vvp_raengine = cfs_ptengine_init(&lt;span class=&quot;code-quote&quot;&gt;&quot;read_ahead&quot;&lt;/span&gt;, cpu_online_mask);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;so I would have expected to be able to choose up to 56 ptasks, but I can only set it to 28. I&apos;m missing some detail here. The highest active kworker thread I see is &lt;span class=&quot;error&quot;&gt;&amp;#91;kworker/27:1&amp;#93;&lt;/span&gt; so I suppose we stay away from the hyperthreads.&lt;/p&gt;

&lt;p&gt;Tests are with Omnipath, so additional CPU resources for the network are beneficial, that&apos;s maybe why ptasks=14 is better than ptasks=28. Rule of thumb right now: ptasks=1 for zfs-0.6.5.7, ptasks=half of maximum for zfs-0.7.0. ptasks=maximum for KNL with ldiskfs OSTs (Dmitry&apos;s choice).&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="188053" author="dmiter" created="Mon, 13 Mar 2017 11:30:34 +0000"  >&lt;p&gt;Erich, I have special logic during initialization.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (cpumask_empty(par_mask) ||
	    cpumask_equal(par_mask, cpu_online_mask)) {
		cpumask_copy(all_mask, cpu_online_mask);
		cpumask_clear(par_mask);
		&lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; (!cpumask_empty(all_mask)) {
			&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; cpu = cpumask_first(all_mask);

			cpumask_set_cpu(cpu, par_mask);
			cpumask_andnot(all_mask, all_mask,
					topology_sibling_cpumask(cpu));
		}
	}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;This means if the CPU mask is empty or full, use all available CPUs without HT. So, only pure cores will be used. This is done intentionally. Using HT for this will disrupt performance.&lt;/p&gt;</comment>
                            <comment id="188061" author="efocht" created="Mon, 13 Mar 2017 12:30:08 +0000"  >&lt;p&gt;Ah! Thanks Dmitry, now it&apos;s clear &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;</comment>
                            <comment id="190389" author="gerrit" created="Sun, 2 Apr 2017 17:59:20 +0000"  >&lt;p&gt;Dmitry Eremin (dmitry.eremin@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/26314&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/26314&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8964&quot; title=&quot;use parallel I/O to improve performance on machines with slow single thread performance&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8964&quot;&gt;&lt;del&gt;LU-8964&lt;/del&gt;&lt;/a&gt; libcfs: Introduce parallel tasks framework&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: pfl&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 4b91a6c4999ade6d1d04c89f06bc9ed535616e69&lt;/p&gt;</comment>
                            <comment id="190390" author="gerrit" created="Sun, 2 Apr 2017 17:59:21 +0000"  >&lt;p&gt;Dmitry Eremin (dmitry.eremin@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/26315&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/26315&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8964&quot; title=&quot;use parallel I/O to improve performance on machines with slow single thread performance&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8964&quot;&gt;&lt;del&gt;LU-8964&lt;/del&gt;&lt;/a&gt; clio: Parallelize generic I/O&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: pfl&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: ebd9846fafbe07ba576e288e28a7a6847adc5321&lt;/p&gt;</comment>
                            <comment id="190754" author="gerrit" created="Tue, 4 Apr 2017 19:10:01 +0000"  >&lt;p&gt;Dmitry Eremin (dmitry.eremin@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/26350&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/26350&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8964&quot; title=&quot;use parallel I/O to improve performance on machines with slow single thread performance&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8964&quot;&gt;&lt;del&gt;LU-8964&lt;/del&gt;&lt;/a&gt; llite: Make readahead async&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: pfl&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 84d8bc4dcbc8f7b187f967af8dc264a6e834b9e3&lt;/p&gt;</comment>
                            <comment id="191330" author="gerrit" created="Mon, 10 Apr 2017 11:55:14 +0000"  >&lt;p&gt;Dmitry Eremin (dmitry.eremin@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/26468&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/26468&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8964&quot; title=&quot;use parallel I/O to improve performance on machines with slow single thread performance&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8964&quot;&gt;&lt;del&gt;LU-8964&lt;/del&gt;&lt;/a&gt; clio: Parallelize generic I/O&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: dfd26cbb6f41a956658b975b68576b8fccbbd342&lt;/p&gt;</comment>
                            <comment id="191331" author="gerrit" created="Mon, 10 Apr 2017 11:55:15 +0000"  >&lt;p&gt;Dmitry Eremin (dmitry.eremin@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/26469&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/26469&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8964&quot; title=&quot;use parallel I/O to improve performance on machines with slow single thread performance&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8964&quot;&gt;&lt;del&gt;LU-8964&lt;/del&gt;&lt;/a&gt; llite: Make readahead async&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: d624cc9a301d1014cca088dff014176c7f61d5fc&lt;/p&gt;</comment>
                            <comment id="193641" author="jay" created="Wed, 26 Apr 2017 17:40:40 +0000"  >&lt;p&gt;I decided to write down my concern and known problems here. I wrote it on the gerrit but it was missed in the refresh and rebase.&lt;/p&gt;

&lt;p&gt;1. head of block - when a subtask is blocked due to long holding lock, for example group lock, the tasks behind that one will be blocked that means nobody could move forward;&lt;br/&gt;
2. for pdata implementation right now, once a task has been decided to put into a queue, it has to be stuck in that queue. To some extent, the overhead performance now is limited by the slowest OST;&lt;br/&gt;
3. lack of cancel API. This poses a problem if an error has already seen in a previous subtask, it&apos;s expected to cancel all the other pending subtasks that belong to the same I/O because it&apos;ll be just wasting of time by continuing doing those;&lt;br/&gt;
4. pdata uses per cpu work queue. This is not perfect fit to drive Lustre I/O task because most of the time the tasks are in sleep state waiting for some resources, which means the CPU could be used to drive the other subtasks. It would be extremely difficult to work out something like ZIO pipeline in ZFS but it&apos;ll be easier to just spawn more I/O threads than the number of CPUs;&lt;br/&gt;
5. Lack of NUMA support. This should be really easy to fix.&lt;/p&gt;

&lt;p&gt;It&apos;s not necessary to block the landing of the patch as is. However, this is something we should proceed to work post landing.&lt;/p&gt;</comment>
                            <comment id="197544" author="gerrit" created="Tue, 30 May 2017 13:28:10 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/24474/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/24474/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8964&quot; title=&quot;use parallel I/O to improve performance on machines with slow single thread performance&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8964&quot;&gt;&lt;del&gt;LU-8964&lt;/del&gt;&lt;/a&gt; libcfs: Introduce parallel tasks framework&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: eca949bf83dfec5be33dc6230f55823376aef65a&lt;/p&gt;</comment>
                            <comment id="197545" author="gerrit" created="Tue, 30 May 2017 13:28:16 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/26468/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/26468/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8964&quot; title=&quot;use parallel I/O to improve performance on machines with slow single thread performance&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8964&quot;&gt;&lt;del&gt;LU-8964&lt;/del&gt;&lt;/a&gt; clio: Parallelize generic I/O&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: db59ecb5d1d0284fb918def6348a11e0966d7767&lt;/p&gt;</comment>
                            <comment id="210695" author="gerrit" created="Tue, 10 Oct 2017 07:45:03 +0000"  >&lt;p&gt;Dmitry Eremin (dmitry.eremin@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/29540&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/29540&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8964&quot; title=&quot;use parallel I/O to improve performance on machines with slow single thread performance&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8964&quot;&gt;&lt;del&gt;LU-8964&lt;/del&gt;&lt;/a&gt; llite: Make readahead async&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: flr&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: a3d49f1db481c9e4ae72693eade7bcf8729483ce&lt;/p&gt;</comment>
                            <comment id="214947" author="ihara" created="Wed, 29 Nov 2017 17:49:39 +0000"  >&lt;p&gt;here are quick test results of patch &lt;a href=&quot;https://review.whamcloud.com/#/c/26469/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/26469/&lt;/a&gt; (patch set 37)&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;lctl set_param osc.*.max_pages_per_rpc=16M
lctl set_param osc.*.max_rpcs_in_flight=16
lctl set_param osc.*.max_dirty_mb=512
lctl set_param llite.*.max_read_ahead_mb=2048
lctl set_param osc.*.checksums=0

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;40 x OST, 1 x client(2 x E5-2650v4, 128GB memory, 1 x FDR)&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;# lfs setstripe -c -1 -S 16M /scratch0/dir
# IOR -w -k -F -e -t 1m -b 256g -vv -o /scratch0/dir/file
# sync;echo 3 &amp;gt; /proc/sys/vm/drop_caches (client and OSSs)
# IOR -r -F -e -t 1m -b 256g -vv -o /scratch0/dir/file

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;


&lt;div class=&apos;table-wrap&apos;&gt;
&lt;table class=&apos;confluenceTable&apos;&gt;&lt;tbody&gt;
&lt;tr&gt;
&lt;td class=&apos;confluenceTd&apos;&gt;2.7.22.1&lt;/td&gt;
&lt;td class=&apos;confluenceTd&apos;&gt;2.10.55&lt;/td&gt;
&lt;td class=&apos;confluenceTd&apos;&gt;2.10.55/w patch&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td class=&apos;confluenceTd&apos;&gt;897.34&lt;/td&gt;
&lt;td class=&apos;confluenceTd&apos;&gt;881.34MB/s&lt;/td&gt;
&lt;td class=&apos;confluenceTd&apos;&gt;911.55MB/s&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;&lt;/table&gt;
&lt;/div&gt;


&lt;p&gt;I didn&apos;t see big performance improvements with the patches when max_pages_per_rpc=16M.&lt;/p&gt;</comment>
                            <comment id="214951" author="paf" created="Wed, 29 Nov 2017 18:02:29 +0000"  >&lt;p&gt;A few questions.&lt;/p&gt;

&lt;p&gt;1. What&apos;s the best-case bandwidth of an individual OST?&lt;br/&gt;
2. You&apos;re only striping to one OST, what happens if you stripe wider?&lt;br/&gt;
3. How many cores on that client?&lt;/p&gt;</comment>
                            <comment id="215070" author="ihara" created="Thu, 30 Nov 2017 22:24:29 +0000"  >&lt;blockquote&gt;
&lt;p&gt;1. What&apos;s the best-case bandwidth of an individual OST?&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;Here is single OST&apos;s performance. this is same IOR test above, but no striping. Just an OST.&lt;br/&gt;
832.16MB/s&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;2. You&apos;re only striping to one OST, what happens if you stripe wider?&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;No, the first results were striping across all OSTs (40 x OST) with &quot;lfs setstripe -c -1&quot;.&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;3. How many cores on that client?&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;As I&apos;ve described above 2 x E5-2650v4 means 24 CPU cores total (12 CPU cores each socket)&lt;/p&gt;</comment>
                            <comment id="215072" author="jay" created="Thu, 30 Nov 2017 22:51:51 +0000"  >&lt;p&gt;Have you collected rpc_stats from client side to see if it actually sends full size RPC?&lt;/p&gt;</comment>
                            <comment id="215087" author="adilger" created="Fri, 1 Dec 2017 02:28:15 +0000"  >&lt;p&gt;Firstly, I think one problem is that IOR is being run with a 1Mb transfer size (&#8220;&lt;tt&gt;-t 1m&lt;/tt&gt;&#8221;) so the kernel-side parallelism doesn&#8217;t get used. Try running with &#8220;&lt;tt&gt;-t 16m&lt;/tt&gt;&#8221; so that the kernel threads can help do the copies from userspace and RPCs in parallel.&lt;/p&gt;

&lt;p&gt;Also, how many IOR threads running on the client?  Just one?  &lt;/p&gt;

&lt;p&gt;One thing I was wondering is if the 16MB stripe_size is potentially hurting the parallelism? That means multiple threads are doing the copying, but they are all sending to the same OST.  Try with the 1MB stripe size so that the kernel threads can send to multiple OSTs in parallel. &lt;/p&gt;
</comment>
                            <comment id="215091" author="ihara" created="Fri, 1 Dec 2017 04:45:53 +0000"  >&lt;blockquote&gt;
&lt;p&gt;Have you collected rpc_stats from client side to see if it actually sends full size RPC?&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;this is single OST testing. I see some small 4K reads.&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;osc.scratch0-OST0009-osc-ffff88203a33e000.rpc_stats=
snapshot_time:         1512103327.914410097 (secs.nsecs)
read RPCs in flight:  0
write RPCs in flight: 0
pending write pages:  0
pending read pages:   0

			read			write
pages per rpc         rpcs   % cum % |       rpcs   % cum %
1:		      3358  16  16   |          0   0   0
2:		         1   0  16   |          0   0   0
4:		         0   0  16   |          0   0   0
8:		         0   0  16   |          0   0   0
16:		         0   0  16   |          0   0   0
32:		         2   0  16   |          0   0   0
64:		         7   0  16   |          0   0   0
128:		        15   0  16   |          0   0   0
256:		        27   0  16   |          9   0   0
512:		        54   0  17   |          7   0   0
1024:		       139   0  17   |          2   0   0
2048:		       281   1  19   |          5   0   0
4096:		     16441  80 100   |      16384  99 100

			read			write
rpcs in flight        rpcs   % cum % |       rpcs   % cum %
0:		         0   0   0   |          0   0   0
1:		      5854  28  28   |         38   0   0
2:		      2654  13  41   |      14182  86  86
3:		      2652  13  54   |       1753  10  97
4:		      2634  12  67   |        140   0  98
5:		      1523   7  75   |        113   0  98
6:		      1472   7  82   |         59   0  99
7:		      1463   7  89   |         34   0  99
8:		      1428   7  96   |         11   0  99
9:		       248   1  98   |          5   0  99
10:		       178   0  98   |          4   0  99
11:		       139   0  99   |          3   0  99
12:		        67   0  99   |          4   0  99
13:		         8   0  99   |          3   0  99
14:		         3   0  99   |          3   0  99
15:		         1   0  99   |          3   0  99
16:		         1   0 100   |          3   0  99
17:		         0   0 100   |         27   0  99
18:		         0   0 100   |         22   0 100

			read			write
offset                rpcs   % cum % |       rpcs   % cum %
0:		         1   0   0   |          1   0   0
1:		         1   0   0   |          0   0   0
2:		         0   0   0   |          0   0   0
4:		         0   0   0   |          0   0   0
8:		         0   0   0   |          0   0   0
16:		         0   0   0   |          0   0   0
32:		         0   0   0   |          0   0   0
64:		         0   0   0   |          0   0   0
128:		         0   0   0   |          0   0   0
256:		         0   0   0   |          0   0   0
512:		         0   0   0   |          0   0   0
1024:		         0   0   0   |          0   0   0
2048:		         0   0   0   |          0   0   0
4096:		         1   0   0   |          1   0   0
8192:		         3   0   0   |          2   0   0
16384:		         5   0   0   |          4   0   0
32768:		        11   0   0   |          8   0   0
65536:		        21   0   0   |         16   0   0
131072:		        35   0   0   |         32   0   0
262144:		        69   0   0   |         64   0   0
524288:		       139   0   1   |        128   0   1
1048576:		       280   1   2   |        256   1   3
2097152:		       559   2   5   |        512   3   6
4194304:		      1114   5  11   |       1024   6  12
8388608:		      2241  11  22   |       2052  12  24
16777216:		      4622  22  44   |       4110  25  50
33554432:		     11223  55 100   |       8197  49 100
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="215108" author="ihara" created="Fri, 1 Dec 2017 05:44:18 +0000"  >&lt;p&gt;Please see attached. &lt;a href=&quot;#attachment-28818&quot; target=&quot;_blank&quot; rel=&quot;noopener&quot;&gt;40ost_rpc_stats.txt&lt;/a&gt; &lt;br/&gt;
this is rpc_stats for all 40 OSTs.  Please keep in mind, all my testing are single IOR thread.&lt;br/&gt;
there are a lot of small read on some of OSCs.&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
osc.scratch0-OST0000-osc-ffff88203a33e000.rpc_stats=
snapshot_time:         1512106460.010766948 (secs.nsecs)
read RPCs in flight:  0
write RPCs in flight: 0
pending write pages:  0
pending read pages:   0

			read			write
pages per rpc         rpcs   % cum % |       rpcs   % cum %
1:		      3764  81  81   |          0   0   0
2:		         0   0  81   |          0   0   0
4:		         0   0  81   |          0   0   0
8:		         0   0  81   |          0   0   0
16:		         0   0  81   |          0   0   0
32:		        24   0  82   |          0   0   0
64:		         9   0  82   |          0   0   0
128:		        18   0  82   |          0   0   0
256:		        54   1  83   |          0   0   0
512:		        62   1  85   |          0   0   0
1024:		       125   2  87   |          0   0   0
2048:		       160   3  91   |          0   0   0
4096:		       398   8 100   |          0   0   0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;majority of  RPCs are large (4MB, 8MB or 16MB, but not all 16MB) on some of other OSCs.&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
osc.scratch0-OST0022-osc-ffff88203a33e000.rpc_stats=
snapshot_time:         1512106460.012210601 (secs.nsecs)
read RPCs in flight:  0
write RPCs in flight: 0
pending write pages:  0
pending read pages:   0

			read			write
pages per rpc         rpcs   % cum % |       rpcs   % cum %
1:		        33   3   3   |          0   0   0
2:		         0   0   3   |          0   0   0
4:		         2   0   3   |          0   0   0
8:		         0   0   3   |          0   0   0
16:		         4   0   4   |          0   0   0
32:		        23   2   7   |          0   0   0
64:		        11   1   8   |          0   0   0
128:		        15   1   9   |          0   0   0
256:		        52   5  15   |          0   0   0
512:		        62   7  22   |          0   0   0
1024:		       115  12  35   |          0   0   0
2048:		       169  19  54   |          0   0   0
4096:		       399  45 100   |          0   0   0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="215109" author="jay" created="Fri, 1 Dec 2017 06:42:05 +0000"  >&lt;p&gt;16% 4K read should be problematic.&lt;/p&gt;</comment>
                            <comment id="215111" author="ihara" created="Fri, 1 Dec 2017 07:09:04 +0000"  >&lt;blockquote&gt;
&lt;p&gt;Firstly, I think one problem is that IOR is being run with a 1Mb transfer size (&#8220;-t 1m&#8221;) so the kernel-side parallelism doesn&#8217;t get used. Try running with &#8220;-t 16m&#8221; so that the kernel threads can help do the copies from userspace and RPCs in parallel.&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;I haven&apos;t seen any benefits with -t 16M unless we use O_DIRECT.  I just double checked -t 16mb with the patched client, &lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;IOR -a POSIX -r -k -e -vv -t 16m -b 256g -F -o /scratch0/dir/file

Max Read:  779.65 MiB/sec (817.53 MB/sec)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;blockquote&gt;
&lt;p&gt;Also, how many IOR threads running on the client? Just one?&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;Just one thread.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;One thing I was wondering is if the 16MB stripe_size is potentially hurting the parallelism? That means multiple threads are doing the copying, but they are all sending to the same OST. Try with the 1MB stripe size so that the kernel threads can send to multiple OSTs in parallel.&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;1MB stripe size is even worse.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;# lfs setstripe -S 1M -c -1 /scratch0/dir
# mpirun -np 1 IOR -a POSIX -w -k -e -vv -t 1m -b 256g -F -o /scratch0/dir/file
# echo 3 &amp;gt; /proc/sys/vm/drop_caches (on OSS/MDS and client)
# mpirun -np 1 IOR -a POSIX -r -k -e -vv -t 1m -b 256g -F -o /scratch0/dir/file

Summary:
	api                = POSIX
	test filename      = /scratch0/dir/file
	access             = file-per-process
	pattern            = segmented (1 segment)
	ordering in a file = sequential offsets
	ordering inter file= no tasks offsets
	clients            = 1 (1 per node)
	repetitions        = 1
	xfersize           = 1 MiB
	blocksize          = 256 GiB
	aggregate filesize = 256 GiB

Using Time Stamp 1512108853 (0x5a20f335) &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; Data Signature
[RANK 000] open &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; reading file /scratch0/dir/file.00000000 XXCEL
Commencing read performance test.
Fri Dec  1 15:14:13 2017

access    bw(MiB/s)  block(KiB) xfer(KiB)  open(s)    wr/rd(s)   close(s) total(s)  iter
------    ---------  ---------- ---------  --------   --------   --------  --------   ----
read      111.75     268435456  1024.00    0.000296   2345.81    0.001064   2345.81    0    XXCEL
Operation  Max (MiB)  Min (MiB)  Mean (MiB)   Std Dev  Max (OPs)  Min (OPs)  Mean (OPs)   Std Dev  Mean (s)  Op grep #Tasks tPN reps  fPP reord reordoff reordrand seed segcnt blksiz xsize aggsize

---------  ---------  ---------  ----------   -------  ---------  ---------  ----------   -------  --------
read          111.75     111.75      111.75      0.00     111.75     111.75      111.75      0.002345.80832   1 1 1 1 0 1 0 0 1 274877906944 1048576 274877906944 -1 POSIX EXCEL

Max Read:  111.75 MiB/sec (117.18 MB/sec)

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="215573" author="paf" created="Thu, 7 Dec 2017 17:12:13 +0000"  >&lt;p&gt;I thought I&apos;d try a quick, simple test, and I&apos;m seeing significant issues.&lt;/p&gt;

&lt;p&gt;Here&apos;s the file, a single stripe on OST 0, around 7 GB in size:&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@cent7c01 cent7s02&amp;#93;&lt;/span&gt;# ls -la ost0&lt;br/&gt;
&lt;del&gt;rw-r&lt;/del&gt;&lt;del&gt;r&lt;/del&gt;- 1 root root 7345393664 Dec  7 13:04 ost0&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@cent7c01 cent7s02&amp;#93;&lt;/span&gt;# lfs getstripe ost0&lt;br/&gt;
ost0&lt;br/&gt;
lmm_stripe_count:  1&lt;br/&gt;
lmm_stripe_size:   1048576&lt;br/&gt;
lmm_pattern:       raid0&lt;br/&gt;
lmm_layout_gen:    0&lt;br/&gt;
lmm_stripe_offset: 0&lt;br/&gt;
	obdidx		 objid		 objid		 group&lt;br/&gt;
	     0	           579	        0x243	             0&lt;/p&gt;

&lt;p&gt;When I read it (after clearing my rpc stats) without this patch, here&apos;s what that looks like:&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@cent7c02 cent7s02&amp;#93;&lt;/span&gt;# cat ost0 &amp;gt; /dev/null; cat /proc/fs/lustre/osc/cent7s02-OST0000-osc-ffff880034222000/rpc_stats&lt;br/&gt;
snapshot_time:         1512669699.623744 (secs.usecs)&lt;br/&gt;
read RPCs in flight:  0&lt;br/&gt;
write RPCs in flight: 0&lt;br/&gt;
pending write pages:  0&lt;br/&gt;
pending read pages:   0&lt;/p&gt;

&lt;p&gt;			read			write&lt;br/&gt;
pages per rpc         rpcs   % cum % |       rpcs   % cum %&lt;br/&gt;
1:		         1   0   0   |          0   0   0&lt;br/&gt;
2:		         0   0   0   |          0   0   0&lt;br/&gt;
4:		         0   0   0   |          0   0   0&lt;br/&gt;
8:		         0   0   0   |          0   0   0&lt;br/&gt;
16:		         0   0   0   |          0   0   0&lt;br/&gt;
32:		         0   0   0   |          0   0   0&lt;br/&gt;
64:		         0   0   0   |          0   0   0&lt;br/&gt;
128:		         0   0   0   |          0   0   0&lt;br/&gt;
256:		         0   0   0   |          0   0   0&lt;br/&gt;
512:		         1   0   0   |          0   0   0&lt;br/&gt;
1024:		      1751  99 100   |          0   0   0&lt;/p&gt;

&lt;p&gt;There is 1 single page RPC, which is pretty good.  A few more would be acceptable too, of course.&lt;/p&gt;

&lt;p&gt;Here&apos;s what it looks like with this patch:&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@cent7c01 cent7s02&amp;#93;&lt;/span&gt;# cat /proc/fs/lustre/osc/cent7s02-OST0000-osc-ffff8800b24b7000/rpc_stats&lt;br/&gt;
snapshot_time:         1512670149.589299551 (secs.nsecs)&lt;br/&gt;
read RPCs in flight:  0&lt;br/&gt;
write RPCs in flight: 0&lt;br/&gt;
pending write pages:  0&lt;br/&gt;
pending read pages:   0&lt;/p&gt;

&lt;p&gt;			read			write&lt;br/&gt;
pages per rpc         rpcs   % cum % |       rpcs   % cum %&lt;br/&gt;
1:		     22218  92  92   |          0   0   0&lt;br/&gt;
2:		         0   0  92   |          0   0   0&lt;br/&gt;
4:		         0   0  92   |          0   0   0&lt;br/&gt;
8:		         0   0  92   |          0   0   0&lt;br/&gt;
16:		         2   0  92   |          0   0   0&lt;br/&gt;
32:		         1   0  92   |          0   0   0&lt;br/&gt;
64:		         0   0  92   |          0   0   0&lt;br/&gt;
128:		         0   0  92   |          0   0   0&lt;br/&gt;
256:		         0   0  92   |          0   0   0&lt;br/&gt;
512:		         2   0  92   |          0   0   0&lt;br/&gt;
1024:		      1729   7 100   |          0   0   0&lt;/p&gt;

&lt;p&gt;That&apos;s 92% 4K RPCs.  That&apos;s... definitely not OK.&lt;/p&gt;

&lt;p&gt;Here&apos;s a quick test with dd, with 1 MiB reads, just in case cat was the problem:&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@cent7c01 cent7s02&amp;#93;&lt;/span&gt;# dd if=./ost0 of=/dev/null bs=1M&lt;br/&gt;
7005+1 records in&lt;br/&gt;
7005+1 records out&lt;br/&gt;
7345393664 bytes (7.3 GB) copied, 26.8776 s, 273 MB/s&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@cent7c01 cent7s02&amp;#93;&lt;/span&gt;# cat /proc/fs/lustre/osc/cent7s02-OST0000-osc-ffff8800b24b7000/rpc_stats&lt;br/&gt;
snapshot_time:         1512670262.741210431 (secs.nsecs)&lt;br/&gt;
read RPCs in flight:  0&lt;br/&gt;
write RPCs in flight: 0&lt;br/&gt;
pending write pages:  0&lt;br/&gt;
pending read pages:   0&lt;/p&gt;

&lt;p&gt;			read			write&lt;br/&gt;
pages per rpc         rpcs   % cum % |       rpcs   % cum %&lt;br/&gt;
1:		     20687  92  92   |          0   0   0&lt;br/&gt;
2:		         0   0  92   |          0   0   0&lt;br/&gt;
4:		         0   0  92   |          0   0   0&lt;br/&gt;
8:		         4   0  92   |          0   0   0&lt;br/&gt;
16:		         0   0  92   |          0   0   0&lt;br/&gt;
32:		         0   0  92   |          0   0   0&lt;br/&gt;
64:		         0   0  92   |          0   0   0&lt;br/&gt;
128:		         0   0  92   |          0   0   0&lt;br/&gt;
256:		         0   0  92   |          0   0   0&lt;br/&gt;
512:		         1   0  92   |          0   0   0&lt;br/&gt;
1024:		      1731   7 100   |          0   0   0&lt;/p&gt;

&lt;p&gt;Note that the performance #s aren&apos;t really relevant, these are relatively slow VMs.  But the RPC size #s very much are.&lt;/p&gt;</comment>
                            <comment id="215600" author="adilger" created="Thu, 7 Dec 2017 20:45:09 +0000"  >&lt;p&gt;Not to downplay the significance of the 4KB RPCs, since there are a lot of them, but it is worthwhile to note that the bulk of the IO is still being transferred by 4MB RPCs - 1729/1731 in the patched case vs 1751 in the unpatched case.  So 98% of the data is still using 4MB RPCs, vs 2% of the data is using 4KB RPCs.&lt;/p&gt;

&lt;p&gt;My first guess is that the small RPC sizes are caused by a smaller readahead window at the start being split across multiple threads/RPCs, and then as the readahead window grows it starts using larger reads?  It may also be that a small number of 4KB RPCs are being handled separately for each 4MB RPC, but we don&apos;t see this because of binning in the histogram.  That would have to be checked in the client debug logs with &lt;tt&gt;lctl set_param debug=+reada&lt;/tt&gt;.&lt;/p&gt;</comment>
                            <comment id="215607" author="paf" created="Thu, 7 Dec 2017 21:09:15 +0000"  >&lt;p&gt;I see what you&apos;re getting at, but then I would expect the % of small RPCs to &lt;b&gt;drop&lt;/b&gt; as we go up in file size, but I in fact see the opposite.  Something goes badly wrong for me around 4 GiB in size, which is 50% of memory on this VM node (which is unlikely to be a coincidence).&lt;/p&gt;

&lt;p&gt;Examples - 1000 MiB:&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@cent7c01 cent7s02&amp;#93;&lt;/span&gt;# ls -la ost0; cat ost0 &amp;gt; /dev/null; cat /proc/fs/lustre/osc/cent7s02-OST0000-osc-ffff88022437c800/rpc_stats&lt;br/&gt;
&lt;del&gt;rw-r&lt;/del&gt;&lt;del&gt;r&lt;/del&gt;- 1 root root 1048576000 Dec  7 16:52 ost0&lt;br/&gt;
snapshot_time:         1512683598.957290341 (secs.nsecs)&lt;br/&gt;
read RPCs in flight:  0&lt;br/&gt;
write RPCs in flight: 0&lt;br/&gt;
pending write pages:  0&lt;br/&gt;
pending read pages:   0&lt;/p&gt;

&lt;p&gt;			read			write&lt;br/&gt;
pages per rpc         rpcs   % cum % |       rpcs   % cum %&lt;br/&gt;
1:		        22   8   8   |          0   0   0&lt;br/&gt;
2:		         0   0   8   |          0   0   0&lt;br/&gt;
4:		         0   0   8   |          0   0   0&lt;br/&gt;
8:		         0   0   8   |          0   0   0&lt;br/&gt;
16:		         0   0   8   |          0   0   0&lt;br/&gt;
32:		         0   0   8   |          0   0   0&lt;br/&gt;
64:		         0   0   8   |          0   0   0&lt;br/&gt;
128:		         0   0   8   |          0   0   0&lt;br/&gt;
256:		         0   0   8   |          0   0   0&lt;br/&gt;
512:		         0   0   8   |          0   0   0&lt;br/&gt;
1024:		       250  91 100   |          0   0   0&lt;/p&gt;

&lt;p&gt;2000 MiB:&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@cent7c01 cent7s02&amp;#93;&lt;/span&gt;# dd if=/dev/zero bs=1M count=2000 of=./ost0; echo 3 &amp;gt; /proc/sys/vm/drop_caches ; echo clear &amp;gt; /sys/fs/lustre/ldlm/namespaces/cent7s02-OST0000-osc-ffff88022437c800/lru_size; echo c &amp;gt; /proc/fs/lustre/osc/cent7s02-OST0000-osc-ffff88022437c800/rpc_stats; cat ost0 &amp;gt; /dev/null; cat /proc/fs/lustre/osc/cent7s02-OST0000-osc-ffff88022437c800/rpc_stats&lt;br/&gt;
2000+0 records in&lt;br/&gt;
2000+0 records out&lt;br/&gt;
2097152000 bytes (2.1 GB) copied, 7.99026 s, 262 MB/s&lt;br/&gt;
snapshot_time:         1512683716.923342172 (secs.nsecs)&lt;br/&gt;
read RPCs in flight:  0&lt;br/&gt;
write RPCs in flight: 0&lt;br/&gt;
pending write pages:  0&lt;br/&gt;
pending read pages:   0&lt;/p&gt;

&lt;p&gt;			read			write&lt;br/&gt;
pages per rpc         rpcs   % cum % |       rpcs   % cum %&lt;br/&gt;
1:		        37   6   6   |          0   0   0&lt;br/&gt;
2:		         0   0   6   |          0   0   0&lt;br/&gt;
4:		         0   0   6   |          0   0   0&lt;br/&gt;
8:		         0   0   6   |          0   0   0&lt;br/&gt;
16:		         0   0   6   |          0   0   0&lt;br/&gt;
32:		         0   0   6   |          0   0   0&lt;br/&gt;
64:		         0   0   6   |          0   0   0&lt;br/&gt;
128:		         0   0   6   |          0   0   0&lt;br/&gt;
256:		         0   0   6   |          0   0   0&lt;br/&gt;
512:		         0   0   6   |          0   0   0&lt;br/&gt;
1024:		       500  93 100   |          0   0   0&lt;/p&gt;

&lt;p&gt;3000 MiB:&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@cent7c01 cent7s02&amp;#93;&lt;/span&gt;# dd if=/dev/zero bs=1M count=3000 of=./ost0; echo 3 &amp;gt; /proc/sys/vm/drop_caches ; echo clear &amp;gt; /sys/fs/lustre/ldlm/namespaces/cent7s02-OST0000-osc-ffff88022437c800/lru_size; echo c &amp;gt; /proc/fs/lustre/osc/cent7s02-OST0000-osc-ffff88022437c800/rpc_stats; cat ost0 &amp;gt; /dev/null; cat /proc/fs/lustre/osc/cent7s02-OST0000-osc-ffff88022437c800/rpc_stats&lt;br/&gt;
3000+0 records in&lt;br/&gt;
3000+0 records out&lt;br/&gt;
3145728000 bytes (3.1 GB) copied, 10.6623 s, 295 MB/s&lt;br/&gt;
snapshot_time:         1512683893.046695412 (secs.nsecs)&lt;br/&gt;
read RPCs in flight:  0&lt;br/&gt;
write RPCs in flight: 0&lt;br/&gt;
pending write pages:  0&lt;br/&gt;
pending read pages:   0&lt;/p&gt;

&lt;p&gt;			read			write&lt;br/&gt;
pages per rpc         rpcs   % cum % |       rpcs   % cum %&lt;br/&gt;
1:		        53   6   6   |          0   0   0&lt;br/&gt;
2:		         0   0   6   |          0   0   0&lt;br/&gt;
4:		         0   0   6   |          0   0   0&lt;br/&gt;
8:		         0   0   6   |          0   0   0&lt;br/&gt;
16:		         0   0   6   |          0   0   0&lt;br/&gt;
32:		         0   0   6   |          0   0   0&lt;br/&gt;
64:		         0   0   6   |          0   0   0&lt;br/&gt;
128:		         0   0   6   |          0   0   0&lt;br/&gt;
256:		         0   0   6   |          0   0   0&lt;br/&gt;
512:		         0   0   6   |          0   0   0&lt;br/&gt;
1024:		       750  93 100   |          0   0   0&lt;/p&gt;

&lt;p&gt;4000 MiB:&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@cent7c01 cent7s02&amp;#93;&lt;/span&gt;# dd if=/dev/zero bs=1M count=4000 of=./ost0; echo 3 &amp;gt; /proc/sys/vm/drop_caches ; echo clear &amp;gt; /sys/fs/lustre/ldlm/namespaces/cent7s02-OST0000-osc-ffff88022437c800/lru_size; echo c &amp;gt; /proc/fs/lustre/osc/cent7s02-OST0000-osc-ffff88022437c800/rpc_stats; cat ost0 &amp;gt; /dev/null; cat /proc/fs/lustre/osc/cent7s02-OST0000-osc-ffff88022437c800/rpc_stats&lt;br/&gt;
4000+0 records in&lt;br/&gt;
4000+0 records out&lt;br/&gt;
4194304000 bytes (4.2 GB) copied, 13.1352 s, 319 MB/s&lt;br/&gt;
snapshot_time:         1512683761.337612432 (secs.nsecs)&lt;br/&gt;
read RPCs in flight:  0&lt;br/&gt;
write RPCs in flight: 0&lt;br/&gt;
pending write pages:  0&lt;br/&gt;
pending read pages:   0&lt;/p&gt;

&lt;p&gt;			read			write&lt;br/&gt;
pages per rpc         rpcs   % cum % |       rpcs   % cum %&lt;br/&gt;
1:		      8074  89  89   |          0   0   0&lt;br/&gt;
2:		         0   0  89   |          0   0   0&lt;br/&gt;
4:		         0   0  89   |          0   0   0&lt;br/&gt;
8:		         0   0  89   |          0   0   0&lt;br/&gt;
16:		         0   0  89   |          0   0   0&lt;br/&gt;
32:		         0   0  89   |          0   0   0&lt;br/&gt;
64:		         0   0  89   |          0   0   0&lt;br/&gt;
128:		         0   0  89   |          0   0   0&lt;br/&gt;
256:		         1   0  89   |          0   0   0&lt;br/&gt;
512:		         0   0  89   |          0   0   0&lt;br/&gt;
1024:		       992  10 100   |          0   0   0&lt;br/&gt;
&amp;#8212;&lt;/p&gt;

&lt;p&gt;Larger sizes remain problematic.  So when I hit the cached mb limit on the node, something goes totally off the rails, I think.  Perhaps we&apos;re getting that from the kernel, but it&apos;s still a major degradation.&lt;/p&gt;</comment>
                            <comment id="215609" author="paf" created="Thu, 7 Dec 2017 21:17:11 +0000"  >&lt;p&gt;Looking at when these smaller reads happen, they&apos;re clustered in the middle of the job, to the point where there are &lt;b&gt;no&lt;/b&gt; 1024 page reads for a while (I used the D_INODE debug in osc_build_rpc for this).&lt;/p&gt;

&lt;p&gt;This is for the 4000 MiB case.&lt;/p&gt;

&lt;p&gt;This is the first RPC, which is naturally enough 1 page:&lt;br/&gt;
00000008:00000002:0.0F:1512684358.524801:0:5975:0:(osc_request.c:2073:osc_build_rpc()) @@@ 1 pages, aa ffff880034753170. now 1r/0w in flight  req@ffff8&lt;/p&gt;

&lt;p&gt;That&apos;s followed by a large # of 1024 page RPCs, though with 1 page RPCs mixed in every so often, which seems weird.  It looks like there is no point at which we hit a steady state of only 1024 page RPCs.&lt;/p&gt;

&lt;p&gt;Then, here&apos;s the first of the set of only 1 page RPCs:&lt;br/&gt;
00000008:00000002:2.0:1512684366.335774:0:5973:0:(osc_request.c:2073:osc_build_rpc()) @@@ 1 pages, aa ffff88023602c170. now 1r/0w in flight  req@ffff88&lt;/p&gt;

&lt;p&gt;There are then NO 1024 page RPCs for some thousands of RPCs.  Weirdly, at the end it seems to recover and we do some 1024 page RPCs again.  Here&apos;s the first of those:&lt;br/&gt;
00000008:00000002:0.0:1512684370.524090:0:5974:0:(osc_request.c:2073:osc_build_rpc()) @@@ 1024 pages, aa ffff8800a506c770. now 1r/0w in flight  req@fff&lt;/p&gt;

&lt;p&gt;And here&apos;s the last RPC period:&lt;br/&gt;
00000008:00000002:1.0:1512684371.129494:0:5972:0:(osc_request.c:2073:osc_build_rpc()) @@@ 1022 pages, aa ffff88021f8c1c70. now 2r/0w in flight  req@fff&lt;/p&gt;

&lt;p&gt;So we spend 8 seconds sending mostly large RPCs, then 4 seconds sending only 4 KiB RPCs, then another ~1 second sending large RPCs again.&lt;/p&gt;

&lt;p&gt;That means that, as you said, we&apos;re only sending a few % of the data in those RPCs - about 3% in this case.&lt;br/&gt;
But they&apos;re taking about 30% of the total time, and it&apos;s all in one big lump.&lt;/p&gt;

&lt;p&gt;Something&apos;s wrong.&lt;/p&gt;</comment>
                            <comment id="215646" author="dmiter" created="Fri, 8 Dec 2017 09:58:54 +0000"  >&lt;p&gt;This is a regression of the last version of my patch, when I turned off async read ahead if the PIO flag is not enabled. I&apos;m going to fix this. But anyway I cannot avoid several 1K requests. I think we should not fix this because this leads to low latency in requests. This happens when a user request misses a page and async read ahead is initiated, but meanwhile it requests a single page only and then unlocks the user application. In the next loop of reading, the next page will be available from async read ahead.&lt;/p&gt;</comment>
                            <comment id="216318" author="dmiter" created="Thu, 14 Dec 2017 17:15:56 +0000"  >&lt;p&gt;The last version of the patch doesn&apos;t have an issue with RPC splitting. For reading on my VM machine I have the following:&lt;/p&gt;

&lt;p&gt;with PIO disabled:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;                        read                    write
pages per rpc         rpcs   % cum % |       rpcs   % cum %
1:                       3   4   4   |          0   0   0
2:                       0   0   4   |          0   0   0
4:                       0   0   4   |          0   0   0
8:                       0   0   4   |          0   0   0
16:                      0   0   4   |          0   0   0
32:                      0   0   4   |          0   0   0
64:                      0   0   4   |          0   0   0
128:                     0   0   4   |          0   0   0
256:                     0   0   4   |          0   0   0
512:                     1   1   6   |          0   0   0
1024:                   62  93 100   |          0   0   0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;with PIO enabled:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;                        read                    write
pages per rpc         rpcs   % cum % |       rpcs   % cum %
1:                       2   2   2   |          0   0   0
2:                       0   0   2   |          0   0   0
4:                       0   0   2   |          0   0   0
8:                       0   0   2   |          0   0   0
16:                      0   0   2   |          0   0   0
32:                      0   0   2   |          0   0   0
64:                      0   0   2   |          0   0   0
128:                     0   0   2   |          0   0   0
256:                     1   1   4   |          0   0   0
512:                     4   5  10   |          0   0   0
1024:                   61  89 100   |          0   0   0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="216320" author="paf" created="Thu, 14 Dec 2017 17:32:41 +0000"  >&lt;p&gt;Great &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;</comment>
                            <comment id="226727" author="paf" created="Wed, 25 Apr 2018 14:50:59 +0000"  >&lt;p&gt;&lt;a href=&quot;https://www.eofs.eu/_media/events/devsummit17/patrick_farrell_laddevsummit_pio.pdf&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://www.eofs.eu/_media/events/devsummit17/patrick_farrell_laddevsummit_pio.pdf&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;This is old and out of date, but I wanted to make sure these slides were seen.&#160; I think the performance of the readahead code would probably be helped a lot by changes to the parallelization framework (as would the performance of pio itself).&lt;/p&gt;

&lt;p&gt;So slides 8, 9, and 10 would probably be of particular interest here.&#160; There are significant performance improvements available for PIO just by going from padata to something simpler.&#160; Also, the CPU binding behavior of padata is pretty bad - Binding explicitly to one CPU is problematic.&#160; Padata seems to assume the whole machine is dedicated, which is not a friendly assumption.&#160; (I discovered its CPU binding behavior because I saw performance problems - A particular CPU would be busy and the work assigned to that CPU would be delayed, which delays the completion of the whole i/o.&#160; At this time, other CPUs were idle, and not binding to a specific CPU would have allowed one of them to be used.)&lt;/p&gt;</comment>
                            <comment id="226728" author="paf" created="Wed, 25 Apr 2018 14:51:28 +0000"  >&lt;p&gt;Also, apologies for not posting these last year.&lt;/p&gt;</comment>
                            <comment id="226778" author="dmiter" created="Thu, 26 Apr 2018 11:26:05 +0000"  >&lt;p&gt;Thanks for the slides. I will look at them carefully. But for now I disagree that the padata API has a big overhead. It&apos;s mostly negligible compared with the other overhead of passing work into a different thread. But having many threads will lead to a scheduler delay to switch under heavy loads. So, I think padata will work more stably and predictably in this case.&lt;/p&gt;</comment>
                            <comment id="237593" author="simmonsja" created="Wed, 28 Nov 2018 14:40:07 +0000"  >&lt;p&gt;Thanks Patrick for the heads up on ktask. I will be watching it closely and give it a spin under this ticket.&lt;/p&gt;</comment>
                            <comment id="243515" author="simmonsja" created="Thu, 7 Mar 2019 23:39:04 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12403&quot; title=&quot;add e2fsprog support for RHEL-8&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12403&quot;&gt;&lt;del&gt;LU-12403&lt;/del&gt;&lt;/a&gt; will do this work correctly.&lt;/p&gt;</comment>
                            <comment id="256866" author="spitzcor" created="Tue, 22 Oct 2019 19:11:58 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=simmonsja&quot; class=&quot;user-hover&quot; rel=&quot;simmonsja&quot;&gt;simmonsja&lt;/a&gt;, for the record, you mean &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12043&quot; title=&quot;improve Lustre single thread read performances&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12043&quot;&gt;&lt;del&gt;LU-12043&lt;/del&gt;&lt;/a&gt;.  &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12403&quot; title=&quot;add e2fsprog support for RHEL-8&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12403&quot;&gt;&lt;del&gt;LU-12403&lt;/del&gt;&lt;/a&gt; is &quot;add e2fsprog support for RHEL-8&quot;.  &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="13028">LU-1056</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="30411">LU-6658</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="40684">LU-8709</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="52453">LU-11069</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="49684">LU-10367</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="46571">LU-9618</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="54395">LU-11825</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="55064">LU-12043</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="28818" name="40ost_rpc_stats.txt" size="93500" author="ihara" created="Fri, 1 Dec 2017 05:37:00 +0000"/>
                            <attachment id="25213" name="read_readahead_test.c" size="2905" author="paf" created="Mon, 6 Feb 2017 15:53:16 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzyz1b:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                </customfields>
    </item>
</channel>
</rss>