<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:58:23 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-6227] Master testing: (osc_request.c:1219:osc_brw_prep_request()) ASSERTION( i == 0 || pg-&gt;off &gt; pg_prev-&gt;off) </title>
                <link>https://jira.whamcloud.com/browse/LU-6227</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Spawning multiple copies of diotest1 from LTP in the same directory causes the assertion from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3192&quot; title=&quot;LBUG:(osc_request.c:1308:osc_brw_prep_request()) ASSERTION( i == 0 || pg-&amp;gt;off &amp;gt; pg_prev-&amp;gt;off )&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3192&quot;&gt;&lt;del&gt;LU-3192&lt;/del&gt;&lt;/a&gt; to reappear.&lt;/p&gt;

&lt;p&gt;I was able to replicate with full debug enabled and will make the dump &amp;amp; KO files available momentarily.  I&apos;ll also include the diotest1 binary, but note that it is unchanged from LTP.&lt;/p&gt;

&lt;p&gt;Stack trace:&lt;br/&gt;
&amp;lt;0&amp;gt;LustreError: 7700:0:(osc_request.c:1219:osc_brw_prep_request()) ASSERTION( i == 0 || pg-&amp;gt;off &amp;gt; pg_prev-&amp;gt;off ) failed: i 3 p_c 10 pg ffffea00017a5208 &lt;span class=&quot;error&quot;&gt;&amp;#91;pri 0 ind 2771&amp;#93;&lt;/span&gt; off 16384 prev_pg ffffea00017a51d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;pri 0 ind 2256&amp;#93;&lt;/span&gt; off 16384&lt;br/&gt;
&amp;lt;0&amp;gt;LustreError: 7700:0:(osc_request.c:1219:osc_brw_prep_request()) LBUG&lt;br/&gt;
&amp;lt;4&amp;gt;Pid: 7700, comm: diotest1&lt;br/&gt;
&amp;lt;4&amp;gt;&lt;br/&gt;
&amp;lt;4&amp;gt;Call Trace:&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0302895&amp;gt;&amp;#93;&lt;/span&gt; libcfs_debug_dumpstack+0x55/0x80 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0302e97&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0x47/0xb0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0b14dd1&amp;gt;&amp;#93;&lt;/span&gt; osc_brw_prep_request+0xba1/0x10b0 &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0b15b40&amp;gt;&amp;#93;&lt;/span&gt; osc_build_rpc+0x860/0x15c0 &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0b30ab4&amp;gt;&amp;#93;&lt;/span&gt; osc_io_unplug0+0xe64/0x1b30 &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa03131c1&amp;gt;&amp;#93;&lt;/span&gt; ? libcfs_debug_msg+0x41/0x50 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0b33d21&amp;gt;&amp;#93;&lt;/span&gt; osc_io_unplug+0x11/0x20 &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0b38986&amp;gt;&amp;#93;&lt;/span&gt; osc_cache_writeback_range+0xda6/0x1280 &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0b25d30&amp;gt;&amp;#93;&lt;/span&gt; osc_io_fsync_start+0x90/0x360 &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa047cb40&amp;gt;&amp;#93;&lt;/span&gt; ? cl_io_start+0x0/0x140 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa047cbaa&amp;gt;&amp;#93;&lt;/span&gt; cl_io_start+0x6a/0x140 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa047cb40&amp;gt;&amp;#93;&lt;/span&gt; ? cl_io_start+0x0/0x140 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa09790fe&amp;gt;&amp;#93;&lt;/span&gt; lov_io_call+0x8e/0x130 &lt;span class=&quot;error&quot;&gt;&amp;#91;lov&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa097cd8c&amp;gt;&amp;#93;&lt;/span&gt; lov_io_start+0xcc/0x180 &lt;span class=&quot;error&quot;&gt;&amp;#91;lov&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa047cbaa&amp;gt;&amp;#93;&lt;/span&gt; cl_io_start+0x6a/0x140 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa04808b4&amp;gt;&amp;#93;&lt;/span&gt; cl_io_loop+0xb4/0x1b0 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa09f283b&amp;gt;&amp;#93;&lt;/span&gt; cl_sync_file_range+0x31b/0x500 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0a1e9cc&amp;gt;&amp;#93;&lt;/span&gt; ll_writepages+0x9c/0x220 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81134eb1&amp;gt;&amp;#93;&lt;/span&gt; do_writepages+0x21/0x40&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8112031b&amp;gt;&amp;#93;&lt;/span&gt; __filemap_fdatawrite_range+0x5b/0x60&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8112037a&amp;gt;&amp;#93;&lt;/span&gt; filemap_write_and_wait_range+0x5a/0x90&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81121728&amp;gt;&amp;#93;&lt;/span&gt; generic_file_aio_read+0x418/0x700&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81078fd7&amp;gt;&amp;#93;&lt;/span&gt; ? current_fs_time+0x27/0x30&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff811a5ef1&amp;gt;&amp;#93;&lt;/span&gt; ? touch_atime+0x71/0x1a0&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0a4f053&amp;gt;&amp;#93;&lt;/span&gt; vvp_io_read_start+0x233/0x460 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa047cbaa&amp;gt;&amp;#93;&lt;/span&gt; cl_io_start+0x6a/0x140 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa04808b4&amp;gt;&amp;#93;&lt;/span&gt; cl_io_loop+0xb4/0x1b0 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa09efef1&amp;gt;&amp;#93;&lt;/span&gt; ll_file_io_generic+0x461/0xa40 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa09f0600&amp;gt;&amp;#93;&lt;/span&gt; ll_file_aio_read+0x130/0x2b0 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa09f0aa9&amp;gt;&amp;#93;&lt;/span&gt; ll_file_read+0x159/0x290 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81189a75&amp;gt;&amp;#93;&lt;/span&gt; vfs_read+0xb5/0x1a0&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81189bb1&amp;gt;&amp;#93;&lt;/span&gt; sys_read+0x51/0x90&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff810e202e&amp;gt;&amp;#93;&lt;/span&gt; ? __audit_syscall_exit+0x25e/0x290&lt;br/&gt;
&amp;lt;4&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100b072&amp;gt;&amp;#93;&lt;/span&gt; system_call_fastpath+0x16/0x1b&lt;/p&gt;


&lt;p&gt;Given the fact that this problem has reoccurred, it seems sensible to add a test for this to the test suite.&lt;/p&gt;</description>
                <environment>CentOS 6.5 servers &amp;amp; clients, current master (tag 2.6.94).</environment>
        <key id="28624">LU-6227</key>
            <summary>Master testing: (osc_request.c:1219:osc_brw_prep_request()) ASSERTION( i == 0 || pg-&gt;off &gt; pg_prev-&gt;off) </summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="wc-triage">WC Triage</assignee>
                                    <reporter username="paf">Patrick Farrell</reporter>
                        <labels>
                            <label>patch</label>
                    </labels>
                <created>Mon, 9 Feb 2015 18:57:34 +0000</created>
                <updated>Fri, 1 Jul 2016 18:51:23 +0000</updated>
                            <resolved>Tue, 7 Jul 2015 00:28:19 +0000</resolved>
                                    <version>Lustre 2.7.0</version>
                                    <fixVersion>Lustre 2.8.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>13</watches>
                                                                            <comments>
                            <comment id="106309" author="paf" created="Mon, 9 Feb 2015 19:08:00 +0000"  >&lt;p&gt;On ftp.whamcloud.com (uploading now):&lt;/p&gt;

&lt;p&gt;uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6227&quot; title=&quot;Master testing: (osc_request.c:1219:osc_brw_prep_request()) ASSERTION( i == 0 || pg-&amp;gt;off &amp;gt; pg_prev-&amp;gt;off) &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6227&quot;&gt;&lt;del&gt;LU-6227&lt;/del&gt;&lt;/a&gt;/&lt;/p&gt;

&lt;p&gt;Dump, log, vmlinux and ko files:&lt;br/&gt;
150209_full_debug.tar.gz &lt;/p&gt;

&lt;p&gt;Test script and test:&lt;br/&gt;
test.sh&lt;br/&gt;
diotest1&lt;/p&gt;</comment>
                            <comment id="106314" author="paf" created="Mon, 9 Feb 2015 19:13:58 +0000"  >&lt;p&gt;Transfer died...  Trying again:  150209_full_debug2.tar.gz &lt;/p&gt;</comment>
                            <comment id="106326" author="paf" created="Mon, 9 Feb 2015 19:38:09 +0000"  >&lt;p&gt;I&apos;ve also made it available for non-Intel folks on the Cray FTP.&lt;/p&gt;

&lt;p&gt;ftp.cray.com&lt;br/&gt;
anonymous&lt;/p&gt;

&lt;p&gt;outbound/xyratex/lu-6227&lt;br/&gt;
Files:&lt;br/&gt;
150209_full_debug.tar.gz&lt;br/&gt;
diotest1&lt;br/&gt;
test.sh&lt;/p&gt;</comment>
                            <comment id="106420" author="aboyko" created="Tue, 10 Feb 2015 08:41:39 +0000"  >&lt;p&gt;Patrick, the test was added with patch - test_241 sanity.sh  dio vs bio. And we saw regression for it without the fix. Yep, the issue looks like &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3192&quot; title=&quot;LBUG:(osc_request.c:1308:osc_brw_prep_request()) ASSERTION( i == 0 || pg-&amp;gt;off &amp;gt; pg_prev-&amp;gt;off )&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3192&quot;&gt;&lt;del&gt;LU-3192&lt;/del&gt;&lt;/a&gt;, and requires further analysis.&lt;/p&gt;</comment>
                            <comment id="107813" author="adilger" created="Tue, 24 Feb 2015 19:06:40 +0000"  >&lt;p&gt;It looks like the client is not invalidating the buffered page from cache before submitting the dio page. There are a couple of solutions to that. One is to actually drop the buffered page, the other is to disallow merging these into a single RPC. &lt;/p&gt;</comment>
                            <comment id="107819" author="paf" created="Tue, 24 Feb 2015 19:20:52 +0000"  >&lt;p&gt;My understanding is that &lt;a href=&quot;http://review.whamcloud.com/#/c/10930&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/10930&lt;/a&gt; took the second route - Disallowing merging - but it appears that it&apos;s not working any more.  (It fixed the problem originally, at least in 2.5 for Cray.  I&apos;m not sure we tested it on master at the time.)&lt;/p&gt;</comment>
                            <comment id="109970" author="aboyko" created="Wed, 18 Mar 2015 11:38:16 +0000"  >&lt;p&gt;Based on the assert message&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;&amp;lt;0&amp;gt;LustreError: 7700:0:(osc_request.c:1219:osc_brw_prep_request()) ASSERTION( i == 0 || pg-&amp;gt;off &amp;gt; pg_prev-&amp;gt;off ) failed: i 3 p_c 10 pg ffffea00017a5208 [pri 0 ind 2771] off 16384 prev_pg ffffea00017a51d0 [pri 0 ind 2256] off 16384
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;The type of both pages is direct io, and they have the same offset. So this is not &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3192&quot; title=&quot;LBUG:(osc_request.c:1308:osc_brw_prep_request()) ASSERTION( i == 0 || pg-&amp;gt;off &amp;gt; pg_prev-&amp;gt;off )&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3192&quot;&gt;&lt;del&gt;LU-3192&lt;/del&gt;&lt;/a&gt;, where we have a mix of dio and buffered pages.&lt;/p&gt;

&lt;p&gt;I have found the commit which introduced this regression.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;commit e6f592fc57b048223e2676408232b6662aad712d
Author: Prakash Surya &amp;lt;surya1@llnl.gov&amp;gt;
Date:   Wed Oct 2 17:16:51 2013 -0700

    LU-1669 vvp: Use lockless __generic_file_aio_write
    
    Testing multi-threaded single shard file write performance has shown
    the inode mutex to be a limiting factor when using the
    generic_file_aio_write function. To work around this bottle neck, this
    change replaces the locked version of that call with the lock less
    version, specifically, __generic_file_aio_write.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="109998" author="paf" created="Wed, 18 Mar 2015 15:26:58 +0000"  >&lt;p&gt;Hmmm.  Unfortunately, that makes some sense...  The purpose of that patch was to allow multiple threads on a single client to write to one file at the same time, so there are some possible concurrency issues.&lt;/p&gt;

&lt;p&gt;I wonder if this issue is only in the context of direct IO...  Obviously this crash is with direct IO pages, but I wonder about the underlying problem.&lt;/p&gt;</comment>
                            <comment id="110475" author="green" created="Tue, 24 Mar 2015 14:46:53 +0000"  >&lt;p&gt;I think the parallel io is still guarded with some sort of range lock, so should we better understand why did that not help?&lt;/p&gt;</comment>
                            <comment id="110485" author="paf" created="Tue, 24 Mar 2015 16:12:25 +0000"  >&lt;p&gt;I looked at this the other day, and the range lock is taken in ll_file_io_generic, which is not used in the direct IO path.&lt;/p&gt;

&lt;p&gt;The two conflicting pages in this instance are created by calls in to ll_direct_IO_26, then the assert fires while pages are being written out after, somehow in the process of an ll_file_read call.&lt;/p&gt;

&lt;p&gt;The ll_file_read call uses the range lock, but two conflicting pages are already present at that time.  The direct IO path does not use the range lock.  Should it?&lt;/p&gt;

&lt;p&gt;I actually don&apos;t see how the patch called out by Alex Boyko above changes the behavior of the direct IO path in a way that would have prevented this issue - It only takes the inode mutex on the read path.  What about the write path?  (I&apos;m not arguing the patch isn&apos;t source of the problem, just saying I can&apos;t see how.  It seems there&apos;s something, perhaps several somethings, I don&apos;t understand here. &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt; )&lt;/p&gt;</comment>
                            <comment id="111685" author="paf" created="Tue, 7 Apr 2015 21:29:00 +0000"  >&lt;p&gt;Looking closer...&lt;br/&gt;
The particular crash is two reads competing.&lt;/p&gt;

&lt;p&gt;00000008:00040000:3.0:1423457026.801250:0:7700:0:(osc_request.c:1219:osc_brw_prep_request()) ASSERTION( i == 0 || pg-&amp;gt;off &amp;gt; pg_prev-&amp;gt;off ) failed: i 3 p_c 10 pg ffffea00017a5208 &lt;span class=&quot;error&quot;&gt;&amp;#91;pri 0 ind 2771&amp;#93;&lt;/span&gt; off 16384 prev_pg ffffea00017a51d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;pri 0 ind 2256&amp;#93;&lt;/span&gt; off 16384&lt;/p&gt;

&lt;p&gt;00000080:00000001:2.0:1423457026.799792:0:1562:0:(file.c:1285:ll_file_aio_read()) Process entered&lt;br/&gt;
00000020:00001000:2.0:1423457026.799792:0:1562:0:(cl_object.c:849:cl_env_peek()) 2@ffff88013b8cebb8&lt;br/&gt;
00000080:00000001:2.0:1423457026.799793:0:1562:0:(file.c:1145:ll_file_io_generic()) Process entered&lt;br/&gt;
00000080:00200000:2.0:1423457026.799793:0:1562:0:(file.c:1148:ll_file_io_generic()) file: infile, type: 0 ppos: 16384, count: 8192&lt;br/&gt;
00000020:00008000:2.0:1423457026.800026:0:1562:0:(cl_page.c:250:cl_page_find()) 4@&lt;span class=&quot;error&quot;&gt;&amp;#91;0x20000a040:0x316:0x0&amp;#93;&lt;/span&gt; ffffea00017a51d0 0 2&lt;br/&gt;
00000008:00000040:2.0:1423457026.800036:0:1562:0:(osc_cache.c:2272:osc_prep_async_page()) oap ffff8800bf9d7550 page ffffea00017a51d0 obj off 16384&lt;/p&gt;

&lt;p&gt;00000080:00000001:3.0:1423457026.800382:0:1915:0:(file.c:1285:ll_file_aio_read()) Process entered&lt;br/&gt;
00000020:00001000:3.0:1423457026.800383:0:1915:0:(cl_object.c:849:cl_env_peek()) 2@ffff88005c8eb9c0&lt;br/&gt;
00000080:00000001:3.0:1423457026.800384:0:1915:0:(file.c:1145:ll_file_io_generic()) Process entered&lt;br/&gt;
00000080:00200000:3.0:1423457026.800384:0:1915:0:(file.c:1148:ll_file_io_generic()) file: infile, type: 0 ppos: 16384, count: 8192&lt;br/&gt;
00000020:00008000:3.0:1423457026.800593:0:1915:0:(cl_page.c:250:cl_page_find()) 4@&lt;span class=&quot;error&quot;&gt;&amp;#91;0x20000a040:0x316:0x0&amp;#93;&lt;/span&gt; ffffea00017a5208 0 2&lt;br/&gt;
00000008:00000040:3.0:1423457026.800603:0:1915:0:(osc_cache.c:2272:osc_prep_async_page()) oap ffff8800bf31e750 page ffffea00017a5208 obj off 16384&lt;/p&gt;

&lt;p&gt;Reads do not take the range lock, but for direct IO, this can mean they try to read the same page at the same time.&lt;/p&gt;

&lt;p&gt;This doesn&apos;t happen for non direct IO reads - Presumably this happens for direct IO because some of the other locking machinery is disabled/not used.&lt;/p&gt;

&lt;p&gt;I&apos;ll submit a patch to use the range lock for direct IO reads...  But if someone else feels this concurrency should be handled at another layer, please let me know where and how.&lt;/p&gt;</comment>
                            <comment id="111686" author="gerrit" created="Tue, 7 Apr 2015 21:35:22 +0000"  >&lt;p&gt;Patrick Farrell (paf@cray.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/14385&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/14385&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6227&quot; title=&quot;Master testing: (osc_request.c:1219:osc_brw_prep_request()) ASSERTION( i == 0 || pg-&amp;gt;off &amp;gt; pg_prev-&amp;gt;off) &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6227&quot;&gt;&lt;del&gt;LU-6227&lt;/del&gt;&lt;/a&gt; vvp: Use range lock for direct IO reads&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 462a5a1cac1e20c07fcc1eab5715dc370b4f71d4&lt;/p&gt;</comment>
                            <comment id="111688" author="paf" created="Tue, 7 Apr 2015 21:42:51 +0000"  >&lt;p&gt;The patch submitted above passes testing with Cray&apos;s reproducer for this bug.&lt;/p&gt;</comment>
                            <comment id="111692" author="paf" created="Tue, 7 Apr 2015 22:48:34 +0000"  >&lt;p&gt;The bug was introduced specifically in this part of the change highlighted by Alexander:&lt;br/&gt;
&lt;a href=&quot;http://review.whamcloud.com/#/c/6672/12/lustre/llite/rw26.c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/6672/12/lustre/llite/rw26.c&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The comment there does not mention that DIO reads must also be kept separate from one another, so it appears safe to remove the concurrency control, as the range locking covers the cases described in that comment.&lt;/p&gt;</comment>
                            <comment id="111975" author="jay" created="Mon, 13 Apr 2015 02:45:01 +0000"  >&lt;p&gt;Hi Patrick,&lt;/p&gt;

&lt;p&gt;I didn&apos;t read all the comments above but the patch. Isn&apos;t it a better idea to take inode mutex in direct IO path?&lt;/p&gt;</comment>
                            <comment id="111996" author="paf" created="Mon, 13 Apr 2015 15:02:09 +0000"  >&lt;p&gt;Jinshan - &lt;/p&gt;

&lt;p&gt;I think not, for the same reason we don&apos;t take it (any more) in the non-direct IO path.  It allows parallelism between multiple threads doing direct IO.  Why do you think we should take the mutex?&lt;/p&gt;

&lt;p&gt;Also, if the old comment removed by &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1669&quot; title=&quot;lli-&amp;gt;lli_write_mutex (single shared file performance)&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1669&quot;&gt;&lt;del&gt;LU-1669&lt;/del&gt;&lt;/a&gt; was right, we also need to protect it from the buffered IO path, and since that path doesn&apos;t take the mutex any more, that means the range lock:&lt;br/&gt;
	/* 0. Need locking between buffered and direct access. and race with&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;size changing by concurrent truncates and writes.&lt;/li&gt;
	&lt;li&gt;1. Need inode mutex to operate transient pages.&lt;br/&gt;
	 */&lt;br/&gt;
	if (rw == READ)&lt;br/&gt;
		mutex_lock(&amp;amp;inode-&amp;gt;i_mutex);&lt;/li&gt;
&lt;/ul&gt;
</comment>
                            <comment id="113585" author="gerrit" created="Tue, 28 Apr 2015 05:14:13 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;http://review.whamcloud.com/14385/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/14385/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6227&quot; title=&quot;Master testing: (osc_request.c:1219:osc_brw_prep_request()) ASSERTION( i == 0 || pg-&amp;gt;off &amp;gt; pg_prev-&amp;gt;off) &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6227&quot;&gt;&lt;del&gt;LU-6227&lt;/del&gt;&lt;/a&gt; vvp: Use range lock for direct IO reads&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 5b464b45746153889e3dead4e2254d3ebda77f8d&lt;/p&gt;</comment>
                            <comment id="120512" author="pjones" created="Tue, 7 Jul 2015 00:28:19 +0000"  >&lt;p&gt;Landed for 2.8&lt;/p&gt;</comment>
                            <comment id="139456" author="simmonsja" created="Wed, 20 Jan 2016 17:16:23 +0000"  >&lt;p&gt;ORNL just ran into this for our 2.7 clients. Looks like a back port is needed.&lt;/p&gt;</comment>
                            <comment id="139458" author="paf" created="Wed, 20 Jan 2016 17:29:49 +0000"  >&lt;p&gt;James, the affects version is 2.7.0.  So, of course it&apos;s needed there.  The patch shouldn&apos;t need any particular porting, though.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                                        </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="30438">LU-6666</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzx5zz:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>17428</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>