<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:23:22 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-16030] sanity-pcc test_45: attach more than 2 time: 0</title>
                <link>https://jira.whamcloud.com/browse/LU-16030</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This issue was created by maloo for Qian Yingjin &amp;lt;qian@ddn.com&amp;gt;&lt;/p&gt;

&lt;p&gt;This issue relates to the following test suite run: &lt;a href=&quot;https://testing.whamcloud.com/test_sets/5cddefde-7d18-45d2-9fe6-496bfef626d4&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/5cddefde-7d18-45d2-9fe6-496bfef626d4&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;test_45 failed with the following error:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;attach more than 2 time: 0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;







&lt;p&gt;VVVVVVV DO NOT REMOVE LINES BELOW, Added by Maloo for auto-association VVVVVVV&lt;br/&gt;
sanity-pcc test_45 - attach more than 2 time: 0&lt;/p&gt;</description>
                <environment></environment>
        <key id="71288">LU-16030</key>
            <summary>sanity-pcc test_45: attach more than 2 time: 0</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="qian_wc">Qian Yingjin</assignee>
                                    <reporter username="maloo">Maloo</reporter>
                        <labels>
                    </labels>
                <created>Wed, 20 Jul 2022 09:39:37 +0000</created>
                <updated>Mon, 1 May 2023 00:34:18 +0000</updated>
                                                                                <due></due>
                            <votes>0</votes>
                                    <watches>7</watches>
                                                                            <comments>
                            <comment id="355447" author="qian_wc" created="Wed, 7 Dec 2022 01:38:03 +0000"  >&lt;p&gt;It seems that this failure occurred on the newer kernels: SLES 15.3 and Ubuntu 20.04.&lt;/p&gt;</comment>
                            <comment id="360273" author="adilger" created="Wed, 25 Jan 2023 00:39:49 +0000"  >&lt;p&gt;+3 in past week:&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/sub_tests/1049c17f-fba7-4d78-9620-3e5e5c3f6b17&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/sub_tests/1049c17f-fba7-4d78-9620-3e5e5c3f6b17&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/sub_tests/3fe48ac1-c27c-488e-b95a-37e6abd93d9d&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/sub_tests/3fe48ac1-c27c-488e-b95a-37e6abd93d9d&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/sub_tests/6a045690-d7a6-476c-8cf2-47d070a62de0&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/sub_tests/6a045690-d7a6-476c-8cf2-47d070a62de0&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="366228" author="qian_wc" created="Fri, 17 Mar 2023 08:15:10 +0000"  >&lt;p&gt;After investigating, the problem is narrowed down to a generic Lustre bug that occurs even without PCC on the newer kernel (Ubuntu 22.04).&lt;/p&gt;

&lt;p&gt;The bug can be easily reproduced by the following scripts (without PCC setup on the client):&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
#!/bin/bash


error() {
&#160; &#160; &#160; &#160; echo &lt;span class=&quot;code-quote&quot;&gt;&quot;$@&quot;&lt;/span&gt;
&#160; &#160; &#160; &#160; exit 1
}


HSMTOOL_ARCHIVE_FORMAT=v2
LCTL=$(which lctl)
LFS=$(which lfs)
MMAP_CAT=&lt;span class=&quot;code-quote&quot;&gt;&quot;/usr/lib/lustre/tests/mmap_cat&quot;&lt;/span&gt;


DIR=&lt;span class=&quot;code-quote&quot;&gt;&quot;/mnt/lustre&quot;&lt;/span&gt;
tdir=&lt;span class=&quot;code-quote&quot;&gt;&quot;sanity-pcc.d99&quot;&lt;/span&gt;
tfile=&lt;span class=&quot;code-quote&quot;&gt;&quot;sanity-pcc.f99&quot;&lt;/span&gt;
hsm_root=&lt;span class=&quot;code-quote&quot;&gt;&quot;/mnt/pcc&quot;&lt;/span&gt;
mntpt=&lt;span class=&quot;code-quote&quot;&gt;&quot;/mnt/pcc&quot;&lt;/span&gt;
dir=$DIR/$tdir
file=$dir/$tfile
id=2
cnt=30


DEBUGFS=/sys/kernel/debug/tracing
CURRENT_TRACER=&lt;span class=&quot;code-quote&quot;&gt;&quot;function_graph&quot;&lt;/span&gt;
SET_FUNCTION_GRAPH=&lt;span class=&quot;code-quote&quot;&gt;&quot;*:mod:lustre&quot;&lt;/span&gt;


lctl set_param debug=cache+inode+iotrace+vfstrace+info+dlmtrace+trace+reada
lctl set_param subsystem_debug=llite+ldlm+osc
lctl set_param debug_mb=40000
lctl get_param debug_mb
rm -rf $mntpt/*
rm -rf $DIR/*
rm logC


rm mytrace.dat
echo &lt;span class=&quot;code-quote&quot;&gt;&quot;$CURRENT_TRACER&quot;&lt;/span&gt; &amp;gt; $DEBUGFS/current_tracer
echo 12 &amp;gt; $DEBUGFS/max_graph_depth
#echo &lt;span class=&quot;code-quote&quot;&gt;&quot;$SET_FUNCTION_GRAPH&quot;&lt;/span&gt; &amp;gt; $DEBUG_FS/set_ftrace_filter
echo &apos;&apos; &amp;gt; $DEBUG_FS/set_ftrace_filter
mkdir -p $dir
lctl clear

dd &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt;=/dev/zero of=$file bs=1M count=$cnt || error &lt;span class=&quot;code-quote&quot;&gt;&quot;Write $file failed&quot;&lt;/span&gt;
lctl clear
(
&#160; &#160; &#160; &#160; &lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; [ ! -e $DIR/sanity-pcc.99.lck ]; &lt;span class=&quot;code-keyword&quot;&gt;do&lt;/span&gt;
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; dd &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt;=/dev/zero of=$file bs=1M count=$cnt conv=notrunc ||
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; error &lt;span class=&quot;code-quote&quot;&gt;&quot;failed to write $file&quot;&lt;/span&gt;
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; sleep 0.$((RANDOM % 4 + 1))
&#160; &#160; &#160; &#160; done
)&amp;amp;
wpid=$!


(
&#160; &#160; &#160; &#160; &lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; [ ! -e $DIR/sanity-pcc.99.lck ]; &lt;span class=&quot;code-keyword&quot;&gt;do&lt;/span&gt;
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; #lctl clear
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; echo &amp;gt; $DEBUGFS/trace
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; echo 1 &amp;gt; $DEBUGFS/tracing_on
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; dd &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt;=$file of=/dev/&lt;span class=&quot;code-keyword&quot;&gt;null&lt;/span&gt; bs=1M count=$cnt &amp;amp;
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; pid=$!
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; echo &lt;span class=&quot;code-quote&quot;&gt;&quot;$pid&quot;&lt;/span&gt; &amp;gt; $DEBUGFS/set_ftrace_pid
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; wait $pid ||
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; {
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; rc=$?
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; cp $DEBUGFS/trace mytrace.dat
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; echo 0 &amp;gt; $DEBUGFS/tracing_on
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; echo &lt;span class=&quot;code-quote&quot;&gt;&apos;nop&apos;&lt;/span&gt; &amp;gt; $DEBUGFS/current_tracer
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; echo &lt;span class=&quot;code-quote&quot;&gt;&quot;failed to read $file: rc = $rc pid = $pid&quot;&lt;/span&gt;
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; lctl dk &amp;gt; logA
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; error &lt;span class=&quot;code-quote&quot;&gt;&quot;failed to read $file: rc = $?&quot;&lt;/span&gt;
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; }
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; sleep 0.$((RANDOM % 4 + 1))
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; echo 0 &amp;gt; $DEBUGFS/tracing_on
&#160; &#160; &#160; &#160; done
)&amp;amp;
rpid=$!

(
&#160; &#160; &#160; &#160; &lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; [ ! -e $DIR/sanity-pcc.99.lck ]; &lt;span class=&quot;code-keyword&quot;&gt;do&lt;/span&gt;
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; dd &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt;=$file of=/dev/&lt;span class=&quot;code-keyword&quot;&gt;null&lt;/span&gt; bs=1M count=$cnt iflag=direct ||
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; error &lt;span class=&quot;code-quote&quot;&gt;&quot;failed to mmap_cat $file&quot;&lt;/span&gt;
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; sleep 0.$((RANDOM % 4 + 1))
&#160; &#160; &#160; &#160; done
&#160; &#160; &#160; &#160; &lt;span class=&quot;code-keyword&quot;&gt;true&lt;/span&gt;
)&amp;amp;
rpid2=$!


sleep 5
touch $DIR/sanity-pcc.99.lck
wait $wpid || error &lt;span class=&quot;code-quote&quot;&gt;&quot;$?: write failed&quot;&lt;/span&gt;
wait $rpid || error &lt;span class=&quot;code-quote&quot;&gt;&quot;$?: read failed: pid=$rpid&quot;&lt;/span&gt;
wait $rpid2 || error &lt;span class=&quot;code-quote&quot;&gt;&quot;$?: read2 failed&quot;&lt;/span&gt; &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;The output:&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
30+0 records in
30+0 records out
31457280 bytes (31 MB, 30 MiB) copied, 0.353618 s, 89.0 MB/s
dd: error reading &lt;span class=&quot;code-quote&quot;&gt;&apos;/mnt/lustre/sanity-pcc.d99/sanity-pcc.f99&apos;&lt;/span&gt;: Input/output error
4+0 records in
4+0 records out
4194304 bytes (4.2 MB, 4.0 MiB) copied, 0.459942 s, 9.1 MB/s
30+0 records in &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;With three processes concurrently doing buffered reads, direct I/O reads, and writes on the same file, a read will return -EIO.&lt;/p&gt;

&lt;p&gt;The EIO code is returned from:&lt;br/&gt;
&#160;&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
&lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; filemap_read_page(struct file *file, struct address_space *mapping,
		struct page *page)
{
	&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; error;

	/*
	 * A previous I/O error may have been due to temporary failures,
	 * eg. multipath errors.  PG_error will be set again &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; readpage
	 * fails.
	 */
	ClearPageError(page);
	&lt;span class=&quot;code-comment&quot;&gt;/* Start the actual read. The read will unlock the page. */&lt;/span&gt;
	error = mapping-&amp;gt;a_ops-&amp;gt;readpage(file, page);
	&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (error)
		&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; error;

	error = wait_on_page_locked_killable(page);
	&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (error)
		&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; error;
	&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (PageUptodate(page))
		&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; 0;
	shrink_readahead_size_eio(&amp;amp;file-&amp;gt;f_ra);
	&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; -EIO; &amp;lt;=== &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; code!!
}&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;Still need more time to investigate the bug....&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="366455" author="qian_wc" created="Mon, 20 Mar 2023 15:24:18 +0000"  >&lt;p&gt;After investigating the bug, the problem appears to be the following:&lt;/p&gt;

&lt;p&gt;The client has two overlapped PR DLM extent locks:&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;L1 = &amp;lt;PR, &lt;span class=&quot;error&quot;&gt;&amp;#91;1M, 4M - 1&amp;#93;&lt;/span&gt;&amp;gt;&lt;/li&gt;
	&lt;li&gt;L2 = &amp;lt;PR, &lt;span class=&quot;error&quot;&gt;&amp;#91;3M, 5M - 1&amp;#93;&lt;/span&gt;&amp;gt;&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;A reader process holds L1 and read data in range &lt;span class=&quot;error&quot;&gt;&amp;#91;3M, 4M -1&amp;#93;&lt;/span&gt;.&lt;/p&gt;

&lt;p&gt;L2 is being revoked due to a conflicting access.&lt;/p&gt;

&lt;p&gt;Then in the -&amp;gt;readpage() call:&lt;/p&gt;

&lt;p&gt;It will unlock the page after the page is in Uptodate state.&lt;/p&gt;

&lt;p&gt;And lock blocking callback for L2 will clear the Uptodate state and delete the page from cache.&lt;/p&gt;

&lt;p&gt;Then in the old kernel, it will:&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
&lt;span class=&quot;code-comment&quot;&gt;/* Start the actual read. The read will unlock the page. */&lt;/span&gt;
		error = mapping-&amp;gt;a_ops-&amp;gt;readpage(filp, page);

		&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (unlikely(error)) {
			&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (error == AOP_TRUNCATED_PAGE) {
				put_page(page);
				error = 0;
				&lt;span class=&quot;code-keyword&quot;&gt;goto&lt;/span&gt; find_page;
			}
			&lt;span class=&quot;code-keyword&quot;&gt;goto&lt;/span&gt; readpage_error;
		}

		&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (!PageUptodate(page)) {
			error = lock_page_killable(page);
			&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (unlikely(error))
				&lt;span class=&quot;code-keyword&quot;&gt;goto&lt;/span&gt; readpage_error;
			&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (!PageUptodate(page)) {
				&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (page-&amp;gt;mapping == NULL) { =====&amp;gt;
					/*
					 * invalidate_mapping_pages got it
					 */
					unlock_page(page);
					put_page(page);
					&lt;span class=&quot;code-keyword&quot;&gt;goto&lt;/span&gt; find_page;
				}
				unlock_page(page);
				shrink_readahead_size_eio(filp, ra);
				error = -EIO;
				&lt;span class=&quot;code-keyword&quot;&gt;goto&lt;/span&gt; readpage_error;
			}
			unlock_page(page);
		}

		&lt;span class=&quot;code-keyword&quot;&gt;goto&lt;/span&gt; page_ok;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;It will check whether the page was truncated and deleted from page cache via page-&amp;gt;mapping, if so, it will retry the page read.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;But the newer kernel has removed this check and retry.&lt;/p&gt;

&lt;p&gt;It seems there is no simple solution for this bug.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
&lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; filemap_read_page(struct file *file, struct address_space *mapping,
		struct page *page)
{
	&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; error;

	/*
	 * A previous I/O error may have been due to temporary failures,
	 * eg. multipath errors.  PG_error will be set again &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; readpage
	 * fails.
	 */
	ClearPageError(page);
	&lt;span class=&quot;code-comment&quot;&gt;/* Start the actual read. The read will unlock the page. */&lt;/span&gt;
	error = mapping-&amp;gt;a_ops-&amp;gt;readpage(file, page);
	&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (error)
		&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; error;

	error = wait_on_page_locked_killable(page);
	&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (error)
		&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; error;
	&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (PageUptodate(page))
		&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; 0;

	shrink_readahead_size_eio(&amp;amp;file-&amp;gt;f_ra);
	&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; -EIO;
}
&#160;&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;And I also found that it may cause short read (partial read) during the test...&lt;/p&gt;

&lt;p&gt;Which may be related to DDN-3569.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="366459" author="qian_wc" created="Mon, 20 Mar 2023 15:41:12 +0000"  >&lt;p&gt;According to the Linux kernel documentation, Documentation/filesystems/locking.rst:&lt;br/&gt;
&#160;&lt;br/&gt;
&#160;&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
locking rules:
	All except set_page_dirty and freepage may block

======================	======================== =========	===============
ops			PageLocked(page)	 i_rwsem	invalidate_lock
======================	======================== =========	===============
writepage:		yes, unlocks (see below)
readpage:		yes, unlocks				shared
writepages:
set_page_dirty		no
readahead:		yes, unlocks				shared
readpages:		no					shared
write_begin:		locks the page		 exclusive
write_end:		yes, unlocks		 exclusive
bmap:
invalidatepage:		yes					exclusive
releasepage:		yes
freepage:		yes
direct_IO:
isolate_page:		yes
migratepage:		yes (both)
putback_page:		yes
launder_page:		yes
is_partially_uptodate:	yes
error_remove_page:	yes
swap_activate:		no
swap_deactivate:	no
======================	======================== =========	===============
The filesystem must exclusively acquire invalidate_lock before invalidating page cache in truncate / hole punch path (and thus calling into -&amp;gt;invalidatepage) to block races between page cache invalidation and page cache filling functions (fault, read, ...).&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;br/&gt;
&#160;&lt;br/&gt;
The solution &#160;should be that:&lt;br/&gt;
we must hold the exclusive invalidate_lock (mapping-&gt;invalidate_lock) (in the newer kernel) when removing pages from the page&#160;cache&#160;due to the revocation of the DLM extent lock protecting them.&lt;/p&gt;</comment>
                            <comment id="366462" author="qian_wc" created="Mon, 20 Mar 2023 15:47:37 +0000"  >&lt;p&gt;I will cook a patch to verify it later.&lt;/p&gt;</comment>
                            <comment id="366578" author="qian_wc" created="Tue, 21 Mar 2023 03:01:52 +0000"  >&lt;p&gt;Verified that after acquiring invalidate_lock during the lock blocking AST (extent lock), the test passed without -EIO.&lt;br/&gt;
As this is an exclusive lock on the inode, it may reduce the concurrency for multiple lock blocking ASTs (from multiple stripes)...&lt;/p&gt;</comment>
                            <comment id="370316" author="adilger" created="Sun, 23 Apr 2023 22:05:48 +0000"  >&lt;p&gt;Yingjin, what is the path forward for this bug?  Is there a fix in Lustre that can be made, possibly one already in flight for the other read cache issues, or is the only fix to submit a patch to Ubuntu to backport a fix from upstream to their kernel?&lt;/p&gt;</comment>
                            <comment id="370317" author="adilger" created="Sun, 23 Apr 2023 22:16:12 +0000"  >&lt;blockquote&gt;
&lt;p&gt;After investigating the bug, the problem should be that:&lt;br/&gt;
The client has two overlapped PR DLM extent locks:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;L1 = &amp;lt;PR, [1M, 4M - 1]&amp;gt;
L2 = &amp;lt;PR, [3M, 5M - 1]&amp;gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;A reader process holds L1 and read data in range &lt;span class=&quot;error&quot;&gt;&amp;#91;3M, 4M -1&amp;#93;&lt;/span&gt;.&lt;br/&gt;
L2 is being revoking due to the conflict access.&lt;br/&gt;
Then in the -&amp;gt;readpage() call:&lt;br/&gt;
It will unlock the page after the page is in Uptodate state.&lt;br/&gt;
And lock blocking callback for L2 will clear the Uptodate state and delete the page from cache.&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;I recall that when a lock was being cancelled that the client would check for an overlapping read lock and attach the page to the second lock rather than invalidating it. I believe that was stopped because it added overhead for cancelling locks and the thought was that the page would be refreshed if it was accessed again. &lt;/p&gt;

&lt;p&gt;In addition to patching the kernel to fix this retry issue there, another option would be to do an interval tree lookup on the other DLM locks on this resource to see if there is an overlapping lock extent, and then bulk transfer all of the pages from the old lock to the new lock, rather than doing this one page at a time. This lock could also be cached in case other pages need to be evicted. &lt;/p&gt;

&lt;p&gt;In the common case, the client resource will only have a single lock on it, so the lookup would be fast.&lt;/p&gt;

&lt;p&gt;We could also examine why the client is requesting (or is granted) overlapping DLM extent locks, since that would also avoid the issue. &lt;/p&gt;</comment>
                            <comment id="370318" author="paf0186" created="Sun, 23 Apr 2023 22:44:30 +0000"  >&lt;p&gt;&quot;I recall that when a lock was being cancelled that the client would check for an overlapping read lock and attach the page to the second lock rather than invalidating it. I believe that was stopped because it added overhead for cancelling locks and the thought was that the page would be refreshed if it was accessed again.&quot;&lt;/p&gt;

&lt;p&gt;No, I believe we still do that, though I think we played with not doing so.&#160; We actually don&apos;t attach pages to locks in any way - they just live &apos;under&apos; the DLM lock.&#160; We just keep pages if there&apos;s still a lock covering them - check_and_discard_cb.&#160; There&apos;s no linkage from a page to a lock (and so also pages don&apos;t hold references on locks.&#160; It would be quite expensive to do so.)&lt;/p&gt;

&lt;p&gt;So, about the bug here, just a moment...&lt;/p&gt;</comment>
                            <comment id="370319" author="paf0186" created="Sun, 23 Apr 2023 22:55:04 +0000"  >&lt;p&gt;So, I&apos;m about 95% sure the bug here is fixed - on all kernels - by:&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16649&quot; title=&quot;EIO is possible on race with page reclaim/deletion&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16649&quot;&gt;&lt;del&gt;LU-16649&lt;/del&gt;&lt;/a&gt; llite: EIO is possible on a race with page reclaim &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/50344/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/50344/&lt;/a&gt;&#160;&lt;/p&gt;

&lt;p&gt;The described situation is identical.&lt;/p&gt;

&lt;p&gt;But that fix is a bit of a hack, and the &apos;right&apos; way to do it is to use the invalidate lock as Yingjin describes here.&#160; It just doesn&apos;t exist on older kernels.&lt;/p&gt;

&lt;p&gt;Yingjin made patches using the invalidate lock.&#160; See the series here:&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16651&quot; title=&quot;hold invalidate_lock when invalidating page cache under kernel 5.15+&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16651&quot;&gt;&lt;del&gt;LU-16651&lt;/del&gt;&lt;/a&gt; llite: hold invalidate_lock when invalidate cache pages / &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/50371/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/50423/&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;That series ends by removing the hack from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16649&quot; title=&quot;EIO is possible on race with page reclaim/deletion&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16649&quot;&gt;&lt;del&gt;LU-16649&lt;/del&gt;&lt;/a&gt; (and Panda&apos;s &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16160&quot; title=&quot;take ldlm lock when queue sync pages&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16160&quot;&gt;&lt;del&gt;LU-16160&lt;/del&gt;&lt;/a&gt; llite: SIGBUS is possible on a race with page reclaim / &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/49647&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/49647&lt;/a&gt;), but of course only on newer kernels.&lt;/p&gt;

&lt;p&gt;But to be clear:&lt;br/&gt;
Yingjin&apos;s patches are an improvement, but as far as I know, we get correct operation with &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16649&quot; title=&quot;EIO is possible on race with page reclaim/deletion&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16649&quot;&gt;&lt;del&gt;LU-16649&lt;/del&gt;&lt;/a&gt;, including for this bug.&lt;/p&gt;

&lt;p&gt;Yingjin, please confirm and correct if I have anything badly wrong &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;</comment>
                            <comment id="370320" author="paf0186" created="Sun, 23 Apr 2023 23:10:59 +0000"  >&lt;p&gt;&quot;We could also examine why the client is requesting (or is granted) overlapping DLM extent locks, since that would also avoid the issue.&quot;&lt;/p&gt;

&lt;p&gt;So, just an additional note:&lt;br/&gt;
We can&apos;t really do this.&#160; Read locks can always overlap, particularly if there&apos;s also a write lock covering part of the file.&#160; (This assumes a single stripe file for simplicity; everything I&apos;m saying applies at the level of a single stripe, even in multi-stripe files.)&lt;/p&gt;

&lt;p&gt;Consider this case:&lt;/p&gt;

&lt;p&gt;Client 1 has a write lock from 1-3 MiB on a file&lt;br/&gt;
Client 2 requests to read 0 - 1 MiB&lt;br/&gt;
Server grants a read lock from 0-1 MiB on client 2 (server does not extend client 2 request due to write lock)&lt;br/&gt;
Client 2 requests to read 1-2 MiB&lt;br/&gt;
Server cancels write lock from client 1&lt;br/&gt;
Server processes client 2 lock request, no conflicts&lt;br/&gt;
Server grants read lock from 0 MiB to infinity for client 2&lt;br/&gt;
Client 2 now has overlapping read locks.&lt;/p&gt;

&lt;p&gt;Also consider that server lock expansion isn&apos;t needed either, if the second read request is for 0.5 MiB to 1.5 MiB.&#160; We don&apos;t split read/write requests across dlm locks (within an OSC), for various good reasons, so this also results in a second overlapping read lock.&lt;/p&gt;

&lt;p&gt;To be fair, we also don&apos;t really protect against racing lock requests from different processes on the same client.&#160; We do a little bit of it - the OSC locking code can catch it - but it&apos;s racy, so sometimes we can get multiple identical read locks from parallel read requests (we can also get overlapping write lock requests from the same client when there are overlapping writes from userspace, resulting in a self-conflict, which is resolved safely with the usual lock callback mechanisms, just with the slightly odd sight of a client requesting two write locks on the same area of a file).&lt;/p&gt;

&lt;p&gt;I looked at fixing these overlaps and self-conflicts back when I was first working on lockahead (it&apos;s very noticeable with lockahead, particularly if you screw up in userspace and request the same lock over and over), but concluded the way it is is basically fine and not worth the trouble to fix.&#160; It happens only occasionally during parallel reads and writes and because it&apos;s rare and transient, I couldn&apos;t see a meaningful cost.&lt;/p&gt;

&lt;p&gt;(OK, I think this is really my final edit...)&lt;br/&gt;
And because overlapping read locks are always possible, fixing the race that allows them with parallel reads doesn&apos;t get us much.&#160; We still have to handle that case.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                                        </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="75131">LU-16651</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i02uyv:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>