<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:16:29 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-15223] Improve partial page read/write</title>
                <link>https://jira.whamcloud.com/browse/LU-15223</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Currently, when the user issues a partial page read/write, Lustre (including client side and server side) will convert it into a full page read/write.&lt;/p&gt;

&lt;p&gt;This is not efficient for small read/write operations , say if you want to read/write several bytes from a file, actually Lustre will&#160; read/write a full page. This will become even worse for large PAGE_SIZE 64KB.&lt;/p&gt;

&lt;p&gt;Make Lustre do a real partial page read/write which the read/write range is actual from start to end(which are given by the user), so that we can get an efficient small read/write.&lt;/p&gt;</description>
                <environment>Arch: aarch64 (client)</environment>
        <key id="67165">LU-15223</key>
            <summary>Improve partial page read/write</summary>
                <type id="4" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11310&amp;avatarType=issuetype">Improvement</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="3" iconUrl="https://jira.whamcloud.com/images/icons/statuses/inprogress.png" description="This issue is being actively worked on at the moment by the assignee.">In Progress</status>
                    <statusCategory id="4" key="indeterminate" colorName="inprogress"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="xinliang">Xinliang Liu</assignee>
                                    <reporter username="xinliang">Xinliang Liu</reporter>
                        <labels>
                            <label>arm</label>
                            <label>arm-server</label>
                            <label>ppc</label>
                    </labels>
                <created>Mon, 15 Nov 2021 04:03:26 +0000</created>
                <updated>Wed, 14 Jun 2023 15:40:20 +0000</updated>
                                            <version>Lustre 2.12.0</version>
                    <version>Lustre 2.12.4</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="318234" author="xinliang" created="Mon, 15 Nov 2021 04:08:39 +0000"  >&lt;p&gt;Copy partial page write comments from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11667&quot; title=&quot;sanity test 317: FAIL: Expected Block 8 got 48 for f317.sanity&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11667&quot;&gt;&lt;del&gt;LU-11667&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=xinliang&quot; class=&quot;user-hover&quot; rel=&quot;xinliang&quot;&gt;xinliang&lt;/a&gt;&#160;added a comment &#160;&lt;del&gt;16/Sep/21 11:17 AM&lt;/del&gt;&#160;&#160;edited&lt;br/&gt;
Created two same size 10B&#160; file in home dir and /mnt/lustre dir, if the backend filesystem block size is 4K. Then the inode allocated blocks should be the same( that is 8 if count by block size 512B).&lt;/p&gt;

&lt;p&gt;Test file created at home dir:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
$ getconf PAGESIZE
65536
$ echo &lt;span class=&quot;code-quote&quot;&gt;&quot;123456789&quot;&lt;/span&gt; &amp;gt; ~/testfile
$ stat ~/testfile
  File: /root/testfile
  Size: 10              Blocks: 8          IO Block: 65536  regular file
Device: fc02h/64514d    Inode: 12863429    Links: 1
Access: (0644/-rw-r--r--)  Uid: (    0/    root)   Gid: (    0/    root)
Access: 2021-09-16 02:51:21.268641287 +0000
Modify: 2021-09-16 03:08:05.382557951 +0000
Change: 2021-09-16 03:08:05.382557951 +0000
 Birth: -
$ stat -c %b ~/testfile
8
$ stat -c %B ~/testfile
512
$ stat -c %s ~/testfile
10
$ stat -f ~/testfile
  File: &lt;span class=&quot;code-quote&quot;&gt;&quot;/root/testfile&quot;&lt;/span&gt;
    ID: fc0200000000 Namelen: 255     Type: xfs
Block size: 4096       Fundamental block size: 4096
Blocks: Total: 52272379   Free: 45840170   Available: 45840170
Inodes: Total: 104549824  Free: 104176363

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Test file created at Lustre dir:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
$ getconf PAGESIZE
65536
$ echo &lt;span class=&quot;code-quote&quot;&gt;&quot;123456789&quot;&lt;/span&gt; &amp;gt; /mnt/lustre/testfile
$ stat -c %s /mnt/lustre/testfile
10
$ stat -c %B /mnt/lustre/testfile
512
$ stat -c %b /mnt/lustre/testfile
128
$ stat  /mnt/lustre/testfile
  File: /mnt/lustre/testfile
  Size: 10              Blocks: 128        IO Block: 4194304 regular file
Device: 2c54f966h/743766374d    Inode: 144115205272502274  Links: 1
Access: (0644/-rw-r--r--)  Uid: (    0/    root)   Gid: (    0/    root)
Access: 2021-09-16 02:53:57.000000000 +0000
Modify: 2021-09-16 03:07:28.000000000 +0000
Change: 2021-09-16 03:07:28.000000000 +0000
 Birth: -
$ stat  -f /mnt/lustre/testfile
  File: &lt;span class=&quot;code-quote&quot;&gt;&quot;/mnt/lustre/testfile&quot;&lt;/span&gt;
    ID: 2c54f96600000000 Namelen: 255     Type: lustre
Block size: 4096       Fundamental block size: 4096
Blocks: Total: 78276      Free: 77931      Available: 71141
Inodes: Total: 100000     Free: 99726

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;But the Lustre test file&apos;s inode blocks is 128. This should be wrong?&lt;/p&gt;

&lt;p&gt;&#160;&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=adilger&quot; class=&quot;user-hover&quot; rel=&quot;adilger&quot;&gt;adilger&lt;/a&gt;&#160;added a comment -&#160;16/Sep/21 2:55 PM&lt;br/&gt;
I think this has always been the case for writes from 64KB PAGE_SIZE clients (e.g. back to ia64). The reason is that the client sends a full-page write, because it is only tracking dirty pages, and the server writes the full amount of data sent by the client. I suspect that ext4 is handling this by having multiple 4KB buffer_heads on a 64KB page, and using the buffer dirty state to determine which pages to write, but Lustre doesn&apos;t use buffer heads.&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=xinliang&quot; class=&quot;user-hover&quot; rel=&quot;xinliang&quot;&gt;xinliang&lt;/a&gt;&#160;added a comment -&#160;29/Oct/21 5:55 PM&lt;br/&gt;
HI Andreas,&lt;/p&gt;

&lt;p&gt;I found that this issue happens at Arm 64K PAGE_SIZE OST server.&lt;/p&gt;

&lt;p&gt;When create a file, blocks are allocated with PAGE_SIZE aligned,&#160;see function osd_ldiskfs_map_inode_pages().&lt;/p&gt;

&lt;p&gt;E.g. for 64K PAGE_SIZE&#160;Arm64 OST server, if create a file with size less than 64K, it&#160;actually allocates 128 blocks each block 512 Bytes.&lt;/p&gt;

&lt;p&gt;We need to adjust the test for 64K PAGE_SIZE OST server.&lt;br/&gt;
&#160;&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=xinliang&quot; class=&quot;user-hover&quot; rel=&quot;xinliang&quot;&gt;xinliang&lt;/a&gt;&#160;added a comment -&#160;04/Nov/21 9:09 AM&lt;br/&gt;
I am thinking if we should make blocks allocation aligned with BLOCK_SIZE as ext4, which could save space for large PAGE_SIZE e.g. 64K. Then no need to make change to the test case. And I have a look at the code it seems both OSC client&#160; and OST server need to adjust for this. The client always sends no hole pages (currently page start offset is always 0) to the server for writing now. And the server side needs to adjust making blocks allocation aligned with block size.&#160;&lt;br/&gt;
&#160;&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=xinliang&quot; class=&quot;user-hover&quot; rel=&quot;xinliang&quot;&gt;xinliang&lt;/a&gt;&#160;added a comment -&#160;04/Nov/21 6:02 PM&lt;br/&gt;
Find out the client side page clip related code:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 298 &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; osc_io_commit_async(&lt;span class=&quot;code-keyword&quot;&gt;const&lt;/span&gt; struct lu_env *env,
 299                         &lt;span class=&quot;code-keyword&quot;&gt;const&lt;/span&gt; struct cl_io_slice *ios,
 300                         struct cl_page_list *qin, &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; from, &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; to,
 301                         cl_commit_cbt cb)
 302 {
...
 315         &lt;span class=&quot;code-comment&quot;&gt;/* Handle partial page cases */&lt;/span&gt;
 316         last_page = cl_page_list_last(qin);
 317         &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (oio-&amp;gt;oi_lockless) {
 318                 page = cl_page_list_first(qin);
 319                 &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (page == last_page) {
 320                         cl_page_clip(env, page, from, to);
 321                 } &lt;span class=&quot;code-keyword&quot;&gt;else&lt;/span&gt; {
 322                         &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (from != 0)
 323                                 cl_page_clip(env, page, from, PAGE_SIZE);
 324                         &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (to != PAGE_SIZE)
 325                                 cl_page_clip(env, last_page, 0, to);
 326                 }
 327         }
 328
 329         ll_pagevec_init(pvec, 0);

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Currently, it seems a normal write don&apos;t go into this &quot;if (oio-&amp;gt;oi_lockless) {&quot; part code. Anyone know why it is&#160;oi_lockless? @&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=adilger&quot; class=&quot;external-link&quot; rel=&quot;nofollow&quot;&gt;Andreas Dilger&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;&#160;&lt;br/&gt;
&#160;&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=adilger&quot; class=&quot;user-hover&quot; rel=&quot;adilger&quot;&gt;adilger&lt;/a&gt;&#160;added a comment &#160;&lt;del&gt;05/Nov/21 3:29 AM&lt;/del&gt;&#160;&#160;edited&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=paf0186&quot; class=&quot;user-hover&quot; rel=&quot;paf0186&quot;&gt;paf0186&lt;/a&gt;&#160;you are probably the most interested in changing this code. Handling sub-page writes for ARM would not be very different than sub-page writes for x86, which would potentially allow eg. IO500 unaligned writes to be handled much more efficiently.&lt;br/&gt;
&#160;&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=paf0186&quot; class=&quot;user-hover&quot; rel=&quot;paf0186&quot;&gt;paf0186&lt;/a&gt;&#160;added a comment -&#160;05/Nov/21 3:36 AM&lt;br/&gt;
Xinliang,&lt;/p&gt;

&lt;p&gt;I am not 100% sure I understand your question - Are you saying it&#160;&lt;b&gt;is&lt;/b&gt;&#160;oi_lockless?&#160; It should not be.&#160; This (commit_async) code is buffered, and lockless buffered is broken and also off by default.&#160; I have a patch to remove it, but it&apos;s normally off anyway.&lt;/p&gt;

&lt;p&gt;What are you looking for/hoping for here?&lt;/p&gt;

&lt;p&gt;Note we clip pages in other places too.&lt;br/&gt;
&#160;&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=paf0186&quot; class=&quot;user-hover&quot; rel=&quot;paf0186&quot;&gt;paf0186&lt;/a&gt;&#160;added a comment -&#160;05/Nov/21 3:47 AM&lt;br/&gt;
&quot;I am thinking if we should make blocks allocation aligned with BLOCK_SIZE as ext4, which could save space for large PAGE_SIZE e.g. 64K. Then no need to make change to the test case. And I have a look at the code it seems both OSC client&#160; and OST server need to adjust for this. The client always sends no hole pages (currently page start offset is always 0) to the server for writing now. And the server side needs to adjust making blocks allocation aligned with block size.&#160;&lt;br/&gt;
&#160;&quot;&lt;/p&gt;

&lt;p&gt;Can you talk more about what you&apos;re thinking?&#160; I am not quite what the implication of changing block allocation on the server would be on the client.&#160; Why does changing server block allocation filter back to the client like this?&lt;/p&gt;

&lt;p&gt;More generally, about partial page i/o:&lt;br/&gt;
Generally speaking, we can&apos;t have partial pages except at the start and end of each write - that&apos;s a limitation of infiniband, but there are also page cache restrictions.&lt;/p&gt;

&lt;p&gt;In general, RDMA can be unaligned at the start, and unaligned at the end, but that&apos;s it.&#160; This applies even when combining multiple RDMA regions - it&apos;s some limitation of the hardware/drivers.&#160; So we have a truly unaligned I/O(with a partial page at beginning and end), but then we can&apos;t combine it with other I/Os.&lt;/p&gt;

&lt;p&gt;There is also a page cache limitation here.&#160; The Linux page cache insists on working with full pages - It will only allow partial pages at file_size.&#160; So, eg, a 3K file is a single page with 3K in it, and we can write just 3K.&#160; But if we want to write 3K in to a large &apos;hole&apos; in a file, Linux will enforce writing PAGE_SIZE.&#160; This is not a restriction we can easily remove, it is an important part of the page cache.&lt;br/&gt;
&#160;&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=paf0186&quot; class=&quot;user-hover&quot; rel=&quot;paf0186&quot;&gt;paf0186&lt;/a&gt;&#160;added a comment -&#160;05/Nov/21 3:48 AM&lt;br/&gt;
By the way, I am happy to keep talking about this, if you have thoughts or questions or whatever.&#160; I&apos;ve looked at sub-page I/O a few times, but you may have a different idea than what I have tried.&lt;br/&gt;
&#160;&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=adilger&quot; class=&quot;user-hover&quot; rel=&quot;adilger&quot;&gt;adilger&lt;/a&gt;&#160;added a comment -&#160;05/Nov/21 5:01 AM&lt;br/&gt;
Patrick, I was thinking that if we can handle a write (uncached) from the client that is RDMA 64KB, but has a non-zero start and end offset (4KB initially), it might be generalizable to any byte offset.&lt;/p&gt;

&lt;p&gt;I&apos;m aware of the RDMA limitations, but I&apos;m wondering if those can be bypassed (if necessary) by transferring a whole page over the network, but store it into a temporary page and copy the data for a cached/unaligned read-modify-write on the server to properly align the data. The content of the start/end of the page sent from the client would be irrelevant, since it will be trimmed by the server anyway when the copy is done&lt;/p&gt;

&lt;p&gt;While the copy might be expensive for very large writes, my expectation is that this would be most useful for small writes. That does raise the question of whether the data could be transferred in the RPC as a short write, but for GPU direct we require RDMA to send the data directly from the GPU RAM to the OSS. Maybe it is just a matter of generalizing the short write handling to allow copying from the middle of an RDMA page?&lt;br/&gt;
&#160;&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=xinliang&quot; class=&quot;user-hover&quot; rel=&quot;xinliang&quot;&gt;xinliang&lt;/a&gt;&#160;added a comment &#160;&lt;del&gt;06/Nov/21 11:10 PM&lt;/del&gt;&#160;&#160;edited&lt;br/&gt;
Hi&#160;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=paf0186&quot; class=&quot;user-hover&quot; rel=&quot;paf0186&quot;&gt;paf0186&lt;/a&gt;&#160;and&#160;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=adilger&quot; class=&quot;external-link&quot; rel=&quot;nofollow&quot;&gt;Andreas Dilger&lt;/a&gt;, thank you for the clarification about partial page write. It really helps me a lot.&lt;/p&gt;

&lt;p&gt;For ldiskfs backend filesystem,&#160; I see that if the user issue a partial page cached write the Lustre (including client side and server side) will convert it in to a full page write. I want to make Lustre do a real partial page write inside which with the length less than a PAGE_SIZE no matter the start is zero or non-zero , so that Lustre can handle bellow sanity 317 test partial page write for a large PAGE_SIZE e.g. 64 KB and pass the test. That&apos;s the problem I want to solve.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
sanity.sh
test_317() {
...
23836 &#160; &#160; #
23837 &#160; &#160; # sparse file test
23838 &#160; &#160; # Create file with a hole and write actual two blocks. Block count
23839 &#160; &#160; # must be 16.
23840 &#160; &#160; #
23841 &#160; &#160; dd &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt;=/dev/zero of=$DIR/$tfile bs=$grant_blk_size count=2 seek=5 \
23842 &#160; &#160; &#160; &#160; conv=fsync || error &lt;span class=&quot;code-quote&quot;&gt;&quot;Create file : $DIR/$tfile&quot;&lt;/span&gt;
23843
 ...

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;I am trying to understand all the details and &#160;limitation&#160;including some mentioned by you e.g. RDMA partial page write, GPU direct write etc.&lt;/p&gt;

&lt;p&gt;I have a draft patch now which make client side send a niobuf,&#160; which contains non-zero file start offset&#160; and the real file end offset , to the server. This requires clip the page in the client side. And in the server side it only writes the necessary range(i.e. from the real non-zero file start offset to the file end offset).&lt;/p&gt;

&lt;p&gt;I will send the patch for review soon. Let&apos;s try if we can work out a solution.&#160; Thanks.&lt;br/&gt;
&#160;&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=paf0186&quot; class=&quot;user-hover&quot; rel=&quot;paf0186&quot;&gt;paf0186&lt;/a&gt;&#160;added a comment -&#160;07/Nov/21 12:35 AM&lt;br/&gt;
How do you handle the page cache?&#160; Like, what&apos;s in there?&#160; And how do you get the range for the clipping?&#160; Etc.&#160; Some of these questions will be answered with the patch, of course&#160;&lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;

&lt;p&gt;But say you write this clipped partial page - What happens when you read it on the client which wrote it?&#160; What is in the rest of the page?&lt;/p&gt;

&lt;p&gt;And, going on from there:&lt;br/&gt;
What is in the rest of the page if the file was empty there?&#160; And what is in the rest of the page if there was already data in the whole page when you write it?&lt;/p&gt;

&lt;p&gt;Basically what I am saying is unless you get&#160;&lt;b&gt;very&lt;/b&gt;&#160;clever, this will break the page cache.&lt;/p&gt;

&lt;p&gt;You would also need to mark this page as non-mergable to avoid the RDMA issue, but that&apos;s easy to do.&#160; The real sticking point is the page cache.&lt;br/&gt;
&#160;&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=paf0186&quot; class=&quot;user-hover&quot; rel=&quot;paf0186&quot;&gt;paf0186&lt;/a&gt;&#160;added a comment -&#160;07/Nov/21 12:38 AM&lt;br/&gt;
One idea which Andreas and I had some time ago was the idea of something like marking the page as not up to date (this means&#160;&lt;b&gt;not&lt;/b&gt;&#160;marking it as up to date, ie, the raw page state is not-up-to-date and up to date is a flag), so if the page was accessed, it would cause the client to re-read it from the server.&lt;/p&gt;

&lt;p&gt;This would mean the page was effectively uncached, which is a bit weird, but could work - I think the benefit is pretty limited since you can&apos;t easily combine these partial pages in to larger writes.&#160; (RDMA issue again)&lt;/p&gt;

&lt;p&gt;But anyway, not setting written pages up to date turned out to be&#160;&lt;b&gt;really&lt;/b&gt;&#160;complicated, and I decided it was unworkable.&#160; The write code assumes pages are up to date as part writing them, and while I was able to work around a few things, I decided it felt like I was very much going against the intent of the code.&lt;br/&gt;
&#160;&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=paf0186&quot; class=&quot;user-hover&quot; rel=&quot;paf0186&quot;&gt;paf0186&lt;/a&gt;&#160;added a comment -&#160;07/Nov/21 12:42 AM&lt;br/&gt;
We also need to ask:&lt;br/&gt;
What&apos;s the benefit/what&apos;s the end goal, and how much do we have to do to get there?&lt;/p&gt;

&lt;p&gt;The benefit will be pretty limited if we can&apos;t also solve the RDMA issue.&#160; The benefit would only apply for &amp;lt; page writes, and each one would have to be sent to disk by itself.&lt;/p&gt;

&lt;p&gt;One way to solve the RDMA problem would be to send full pages over the network, but attach extra data in the RPC telling the server the actual range for each page.&#160; This would be&#160;&lt;b&gt;very complicated&lt;/b&gt;, I think, and involve new ways of handling writes on the client&#160;&lt;b&gt;and&lt;/b&gt;&#160;server.&lt;/p&gt;

&lt;p&gt;And this assumes we can solve the page cache issue!&lt;br/&gt;
&#160;&lt;/p&gt;</comment>
                            <comment id="318241" author="xinliang" created="Mon, 15 Nov 2021 10:19:11 +0000"  >&lt;p&gt;Hi &lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=paf0186&quot; class=&quot;user-hover&quot; rel=&quot;paf0186&quot;&gt;paf0186&lt;/a&gt;, I am not understanding all the details yet. As &lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=adilger&quot; class=&quot;user-hover&quot; rel=&quot;adilger&quot;&gt;adilger&lt;/a&gt; said partial write is so complicated.&lt;/p&gt;

&lt;p&gt;But I will try my best to answer your questions. Jira is not convenient to discuss in threads. I will paste your words and answer them.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;------------------------------------------------------------------------------------------------------------------------------------------------------------------------&#160;&lt;br/&gt;
&quot;&lt;br/&gt;
More generally, about partial page i/o:&lt;br/&gt;
Generally speaking, we can&apos;t have partial pages except at the start and end of each write&lt;br/&gt;
&quot;&lt;/p&gt;

&lt;p&gt;Yeah, I think that is the thing I want to accomplish. Partial write at the start and at the end of each write, maybe because page cache only records the start(offset) and the end(added by count).&lt;br/&gt;
Each write issued by the user space is always a continuous range from the start to the end.&lt;br/&gt;
Inside the Lustre the continuous range buf is converted into one extent to write.&#160;&lt;br/&gt;
Lustre may combine multiple extents into one if they can be combined which means all the extents can become a continuous extent. Right?&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;------------------------------------------------------------------------------------------------------------------------------------------------------------------------&#160;&lt;br/&gt;
&quot;&lt;br/&gt;
Patrick Farrell added a comment - 07/Nov/21 12:35 AM&lt;br/&gt;
How do you handle the page cache? &#160;Like, what&apos;s in there? &#160;And how do you get the range for the clipping? &#160;Etc. &#160;Some of these questions will be answered with the patch, of course&#160;&lt;/p&gt;

&lt;p&gt;But say you write this clipped partial page - What happens when you read it on the client which wrote it? &#160;What is in the rest of the page?&lt;/p&gt;

&lt;p&gt;And, going on from there:&lt;br/&gt;
What is in the rest of the page if the file was empty there? &#160;And what is in the rest of the page if there was already data in the whole page when you write it?&lt;/p&gt;

&lt;p&gt;Basically what I am saying is unless you get very clever, this will break the page cache.&lt;/p&gt;

&lt;p&gt;You would also need to mark this page as non-mergable to avoid the RDMA issue, but that&apos;s easy to do. &#160;The real sticking point is the page cache.&lt;br/&gt;
&quot;&lt;/p&gt;

&lt;p&gt;Some questions maybe you can get answers from the patch. I will answer some of them.&#160;&lt;br/&gt;
Before writing a partial page, if the page is already existing in the file the full page should be read from the disk so that we have an up to update page to write. If it is&#160;&lt;br/&gt;
a new page than zero it or maybe no need to do anything. This is handled in the ll_prepare_partial_page().&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;------------------------------------------------------------------------------------------------------------------------------------------------------------------------&#160;&lt;br/&gt;
&quot;&lt;br/&gt;
Patrick Farrell added a comment - 07/Nov/21 12:38 AM&lt;br/&gt;
One idea which Andreas and I had some time ago was the idea of something like marking the page as not up to date (this means not marking it as up to date, ie, the raw page state is not-up-to-date and up to date is a flag), so if the page was accessed, it would cause the client to re-read it from the server.&lt;/p&gt;

&lt;p&gt;This would mean the page was effectively uncached, which is a bit weird, but could work - I think the benefit is pretty limited since you can&apos;t easily combine these partial pages in to larger writes. &#160;(RDMA issue again)&lt;/p&gt;

&lt;p&gt;But anyway, not setting written pages up to date turned out to be really complicated, and I decided it was unworkable. &#160;The write code assumes pages are up to date as part writing them, and while I was able to work around a few things, I decided it felt like I was very much going against the intent of the code.&lt;br/&gt;
&quot;&lt;/p&gt;

&lt;p&gt;If a partial page can not be combined, why not just leave it as a separate extent to write? This is the small read/write. As I know, ll_do_tiny_write() will handle a partial write which will update content of the range the user wants.&lt;br/&gt;
Do you have some records about your work? Maybe I am not fully understanding your questions.&lt;br/&gt;
------------------------------------------------------------------------------------------------------------------------------------------------------------------------&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&quot;&lt;br/&gt;
Patrick Farrell added a comment - 07/Nov/21 12:42 AM&lt;br/&gt;
We also need to ask:&lt;br/&gt;
What&apos;s the benefit/what&apos;s the end goal, and how much do we have to do to get there?&lt;/p&gt;

&lt;p&gt;The benefit will be pretty limited if we can&apos;t also solve the RDMA issue. &#160;The benefit would only apply for &amp;lt; page writes, and each one would have to be sent to disk by itself.&lt;/p&gt;

&lt;p&gt;One way to solve the RDMA problem would be to send full pages over the network, but attach extra data in the RPC telling the server the actual range for each page. &#160;This would be very complicated, I think, and involve new ways of handling writes on the client and server.&lt;/p&gt;

&lt;p&gt;And this assumes we can solve the page cache issue!&lt;br/&gt;
&quot;&lt;br/&gt;
The benefits I can imagine are:&lt;br/&gt;
1) Make small writing more efficient which maybe is a big drawback of Lustre I often heard. This drawback may become even worse for large PAGE_SIZE say 64K, this should be prove by testing;&lt;br/&gt;
2) Save disk space for large PAGE_SIZE say 64K, as now ldiskfs block allocation is aligned with PAGE_SIZE.&lt;/p&gt;

&lt;p&gt;.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="318243" author="gerrit" created="Mon, 15 Nov 2021 10:26:32 +0000"  >&lt;p&gt;&quot;xinliang &amp;lt;xinliang.liu@linaro.org&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/45569&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/45569&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-15223&quot; title=&quot;Improve partial page read/write&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-15223&quot;&gt;LU-15223&lt;/a&gt; ldiskfs: Make real partial page read/write&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 31f7077ce41dd69fdc5a3a916c330e094bdc9a32&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10324">
                    <name>Cloners</name>
                                            <outwardlinks description="Clones">
                                        <issuelink>
            <issuekey id="54023">LU-11667</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="49488">LU-10300</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                                        </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i029vb:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>