<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:51:06 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-5393] LBUG: (ost_handler.c:882:ost_brw_read()) ASSERTION( local_nb[i].rc == 0 ) failed</title>
                <link>https://jira.whamcloud.com/browse/LU-5393</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;We are hitting this LBUG on one of our production systems recently updated to Lustre 2.4.3 (on 4th of June).&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;LustreError: 11020:0:(ost_handler.c:882:ost_brw_read()) ASSERTION( local_nb[i].rc == 0 ) failed:
LustreError: 11020:0:(ost_handler.c:882:ost_brw_read()) LBUG
Pid: 11020, comm: ll_ost_io03_084

Call Trace:
 [&amp;lt;ffffffffa07e6895&amp;gt;] libcfs_debug_dumpstack+0x55/0x80 [libcfs]
 [&amp;lt;ffffffffa07e6e97&amp;gt;] lbug_with_loc+0x47/0xb0 [libcfs]
 [&amp;lt;ffffffffa04a0d44&amp;gt;] ost_brw_read+0x12d4/0x1340 [ost]
 [&amp;lt;ffffffff81282c09&amp;gt;] ? cpumask_next_and+0x29/0x50
 [&amp;lt;ffffffff8105bf64&amp;gt;] ? find_busiest_group+0x244/0x9f0
 [&amp;lt;ffffffffa0abbf0c&amp;gt;] ? lustre_msg_get_version+0x8c/0x100 [ptlrpc]
 [&amp;lt;ffffffffa0abc068&amp;gt;] ? lustre_msg_check_version+0xe8/0x100 [ptlrpc]
 [&amp;lt;ffffffffa04a8038&amp;gt;] ost_handle+0x2ac8/0x48e0 [ost]
 [&amp;lt;ffffffffa0ac2c4b&amp;gt;] ? ptlrpc_update_export_timer+0x4b/0x560 [ptlrpc]
 [&amp;lt;ffffffffa0acb428&amp;gt;] ptlrpc_server_handle_request+0x398/0xc60 [ptlrpc]
 [&amp;lt;ffffffffa07e75de&amp;gt;] ? cfs_timer_arm+0xe/0x10 [libcfs]
 [&amp;lt;ffffffffa07f8d9f&amp;gt;] ? lc_watchdog_touch+0x6f/0x170 [libcfs]
 [&amp;lt;ffffffffa0ac2789&amp;gt;] ? ptlrpc_wait_event+0xa9/0x290 [ptlrpc]
 [&amp;lt;ffffffff81058bd3&amp;gt;] ? __wake_up+0x53/0x70
 [&amp;lt;ffffffffa0acc7be&amp;gt;] ptlrpc_main+0xace/0x1700 [ptlrpc]
 [&amp;lt;ffffffffa0acbcf0&amp;gt;] ? ptlrpc_main+0x0/0x1700 [ptlrpc]
 [&amp;lt;ffffffff8100c20a&amp;gt;] child_rip+0xa/0x20
 [&amp;lt;ffffffffa0acbcf0&amp;gt;] ? ptlrpc_main+0x0/0x1700 [ptlrpc]
 [&amp;lt;ffffffffa0acbcf0&amp;gt;] ? ptlrpc_main+0x0/0x1700 [ptlrpc]
 [&amp;lt;ffffffff8100c200&amp;gt;] ? child_rip+0x0/0x20

Kernel panic - not syncing: LBUG
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The assertion is here:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt; 753 &lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
 754 {
...
 879                 &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (page_rc != local_nb[i].len) { &lt;span class=&quot;code-comment&quot;&gt;/* &lt;span class=&quot;code-object&quot;&gt;short&lt;/span&gt; read */&lt;/span&gt;
 880                         &lt;span class=&quot;code-comment&quot;&gt;/* All subsequent pages should be 0 */&lt;/span&gt;
 881                         &lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt;(++i &amp;lt; npages)
 882                                 LASSERT(local_nb[i].rc == 0);
 883                         &lt;span class=&quot;code-keyword&quot;&gt;break&lt;/span&gt;;
 884                 }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I was able to get the content of local_nb from the crash dump.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;crash&amp;gt; struct ost_thread_local_cache 0xffff88105da00000
struct ost_thread_local_cache {
  local = {{
      lnb_file_offset = 6010437632,
      lnb_page_offset = 0,
      len = 4096,
      flags = 0,
      page = 0xffffea0037309c10,
      dentry = 0xffff8808048c9a80,
      lnb_grant_used = 0,
      rc = 4096
    }, {
...
    }, {
      lnb_file_offset = 6010757120,
      lnb_page_offset = 0,
      len = 4096,
      flags = 0,
      page = 0xffffea00372ccf80,
      dentry = 0xffff8808048c9a80,
      lnb_grant_used = 0,
      rc = 512   &amp;lt;======== local_nb[i].rc != local_nb[i].len &lt;span class=&quot;code-comment&quot;&gt;/* &lt;span class=&quot;code-object&quot;&gt;short&lt;/span&gt; read */&lt;/span&gt;
    }, {
      lnb_file_offset = 6010761216,
      lnb_page_offset = 0,
      len = 4096,
      flags = 0,
      page = 0xffffea0037176e98,
      dentry = 0xffff8808048c9a80,
      lnb_grant_used = 0,
      rc = 4096
    }, {
      lnb_file_offset = 1411710976,
      lnb_page_offset = 0,
      len = 4096,
      flags = 1120,
      page = 0x0,
      dentry = 0xffff8803690c2780,
      lnb_grant_used = 0,
      rc = 0
    }, {
      lnb_file_offset = 1411715072,
      lnb_page_offset = 0,
      len = 4096,
      flags = 1120,
      page = 0x0,
      dentry = 0xffff8803690c2780,
      lnb_grant_used = 0,
      rc = 0
    }, {
...
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This LBUG occurred 5 times since 06/06/14. Each time, we have a short read followed by some non-empty pages (rc != 0). You will find attached some output for the first 3 occurrences.&lt;/p&gt;

&lt;p&gt;FYI, we have Lustre routers between the servers and clients as they are on a different LNET. Clients, routers and servers are running Lustre 2.4.3.&lt;/p&gt;</description>
                <environment>RHEL 6 w/ patched kernel for Lustre</environment>
        <key id="25688">LU-5393</key>
            <summary>LBUG: (ost_handler.c:882:ost_brw_read()) ASSERTION( local_nb[i].rc == 0 ) failed</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="bfaccini">Bruno Faccini</assignee>
                                    <reporter username="bruno.travouillon">Bruno Travouillon</reporter>
                        <labels>
                            <label>p4b</label>
                    </labels>
                <created>Tue, 22 Jul 2014 07:30:36 +0000</created>
                <updated>Tue, 6 Dec 2016 03:46:20 +0000</updated>
                            <resolved>Wed, 20 May 2015 12:58:17 +0000</resolved>
                                    <version>Lustre 2.4.3</version>
                                    <fixVersion>Lustre 2.8.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>10</watches>
                                                                            <comments>
                            <comment id="89719" author="bfaccini" created="Tue, 22 Jul 2014 10:14:19 +0000"  >&lt;p&gt;Bruno,&lt;br/&gt;
What was the debug level at the time of this crashes ? And could it be possible for you to extract the debug-log content of one of the crash and make it available ?&lt;/p&gt;</comment>
                            <comment id="89768" author="adilger" created="Tue, 22 Jul 2014 18:02:01 +0000"  >&lt;p&gt;Is it possible to find the OST object ID for this file and then run &lt;tt&gt;debugfs -c -R &quot;stat O/0/d$((objid % 32))/$objid&quot; /dev/$ostdev&lt;/tt&gt; for the affected object?  ldiskfs itself isn&apos;t able to allocate blocks smaller than 4096 bytes, unless you have formatted the OST by hand with a smaller blocksize?  Even then, the minimum ext4/ldiskfs blocksize is 1024 bytes, so it isn&apos;t at all clear to me why or how a read could complete with only 512 bytes unless there is some kind of error on the media.&lt;/p&gt;

&lt;p&gt;Once you have extracted the block mapping for the object using debugfs (please also include it in a comment here), the file offset 6010757120 bytes = logical block offset 1467470 in the file, and that can be mapped to a physical block offset on the disk.  You might try &lt;tt&gt;dd if=/dev/$ostdev of=/tmp/badblock bs=4k skip=$phys_block count=1&lt;/tt&gt; to see if it is possible to read it directly from the disk.&lt;/p&gt;

&lt;p&gt;As for fixing this - it is never good to have an LASSERT() that depends on data from the disk or network.  In this case, we&apos;ve had this same LASSERT in place for many, many years (it first appeared in Lustre 0.7rc1) and I don&apos;t recall ever seeing this problem before (it doesn&apos;t appear in bugzilla either).  So it is curious that this is just appearing now.&lt;/p&gt;</comment>
                            <comment id="89847" author="bruno.travouillon" created="Wed, 23 Jul 2014 14:01:49 +0000"  >&lt;p&gt;Bruno,&lt;/p&gt;

&lt;p&gt;The debug level is 0. Do you have any setting advice to better trace this issue?&lt;/p&gt;

&lt;p&gt;Andreas,&lt;/p&gt;

&lt;p&gt;The block size is 4096.&lt;/p&gt;

&lt;p&gt;It appears that I have been unclear in the description of the issue. We hit this LASSERT 5 times on 4 OSS:&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;crash1:
	&lt;ul&gt;
		&lt;li&gt;oss: gl68&lt;/li&gt;
		&lt;li&gt;date: 2014-06-06 21:01:22&lt;/li&gt;
		&lt;li&gt;ost:  store0-OST0080&lt;/li&gt;
		&lt;li&gt;diskarray.lun: storage82.volume8&lt;/li&gt;
	&lt;/ul&gt;
	&lt;/li&gt;
	&lt;li&gt;crash2:
	&lt;ul&gt;
		&lt;li&gt;oss: gl68&lt;/li&gt;
		&lt;li&gt;date: 2014-06-18 03:29:07&lt;/li&gt;
		&lt;li&gt;ost:  store0-OST0085&lt;/li&gt;
		&lt;li&gt;diskarray.lun: storage83.volume9&lt;/li&gt;
	&lt;/ul&gt;
	&lt;/li&gt;
	&lt;li&gt;crash3:
	&lt;ul&gt;
		&lt;li&gt;oss: gl41&lt;/li&gt;
		&lt;li&gt;date: 2014-07-11 10:33:24&lt;/li&gt;
		&lt;li&gt;ost:  store0-OST0238&lt;/li&gt;
		&lt;li&gt;diskarray.lun: storage48.vol8&lt;/li&gt;
	&lt;/ul&gt;
	&lt;/li&gt;
	&lt;li&gt;crash4:
	&lt;ul&gt;
		&lt;li&gt;oss: gl27&lt;/li&gt;
		&lt;li&gt;date: 2014-07-19 13:56:49&lt;/li&gt;
		&lt;li&gt;ost:  store0-OST02d5&lt;/li&gt;
		&lt;li&gt;diskarray.lun: storage31.volume1&lt;/li&gt;
	&lt;/ul&gt;
	&lt;/li&gt;
	&lt;li&gt;crash5:
	&lt;ul&gt;
		&lt;li&gt;oss: gl31&lt;/li&gt;
		&lt;li&gt;date: 2014-07-21 10:55:29&lt;/li&gt;
		&lt;li&gt;ost:  store0-OST02b6&lt;/li&gt;
		&lt;li&gt;diskarray.lun: storage35.volume6&lt;/li&gt;
	&lt;/ul&gt;
	&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;Each time, it occurred on a different backend storage. Moreover, all our volumes are RAID10 protected with a pre-read redundancy check of the parity. I can&apos;t see anything in the log of the disk arrays at time of the crash.&lt;/p&gt;

&lt;p&gt;However, I was able to check the objects on the relevant OSTs, but all files have been modified after the crash. I was able to successfully read the faulty blocks with dd. I then checked the content of the 4k blocks with xxd and I can say that there is data everywhere in the block.&lt;/p&gt;

&lt;p&gt;If you really need the block mapping, please let me know. The customer is a blacksite and it can take some days to get data available.&lt;/p&gt;

&lt;p&gt;Otherwise, we can see in the code that &lt;tt&gt;lnb[i].rc&lt;/tt&gt; is the result of &lt;tt&gt;i_size_read(inode) - lnb[i].lnb_file_offset&lt;/tt&gt; in case of short read (see code below).&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt; 876 &lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; osd_read_prep() {

 898         cfs_gettimeofday(&amp;amp;start);
 899         &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; (i = 0; i &amp;lt; npages; i++) {
 900
 901             &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (i_size_read(inode) &amp;lt;= lnb[i].lnb_file_offset)
 902                         /* If there&apos;s no more data, abort early.
 903                          * lnb-&amp;gt;rc == 0, so it&apos;s easy to detect later. */
 904                         &lt;span class=&quot;code-keyword&quot;&gt;break&lt;/span&gt;;
 905
 906             &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (i_size_read(inode) &amp;lt;
 907                 lnb[i].lnb_file_offset + lnb[i].len - 1)
 908                     lnb[i].rc = i_size_read(inode) - lnb[i].lnb_file_offset;
 909             &lt;span class=&quot;code-keyword&quot;&gt;else&lt;/span&gt;
 910                     lnb[i].rc = lnb[i].len;
 911             m += lnb[i].len;
 912
 913             lprocfs_counter_add(osd-&amp;gt;od_stats, LPROC_OSD_CACHE_ACCESS, 1);
 914             &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (PageUptodate(lnb[i].page)) {
 915                     lprocfs_counter_add(osd-&amp;gt;od_stats,
 916                                         LPROC_OSD_CACHE_HIT, 1);
 917             } &lt;span class=&quot;code-keyword&quot;&gt;else&lt;/span&gt; {
 918                     lprocfs_counter_add(osd-&amp;gt;od_stats,
 919                                         LPROC_OSD_CACHE_MISS, 1);
 920                     osd_iobuf_add_page(iobuf, lnb[i].page);
 921             }
 922             &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (cache == 0)
 923                     generic_error_remove_page(inode-&amp;gt;i_mapping,lnb[i].page);
 924         }
 925         cfs_gettimeofday(&amp;amp;end);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I&apos;m wondering if adding a break at the end of the for loop in case of short read (line 906) could avoid this LASSERT? All subsequent pages should be 0 here.&lt;/p&gt;

&lt;p&gt;Am I wrong?&lt;/p&gt;</comment>
                            <comment id="89855" author="bfaccini" created="Wed, 23 Jul 2014 16:33:07 +0000"  >&lt;p&gt;Bruno,&lt;br/&gt;
If I check in your local_nb[] content from one crash-dump you attached in submission text, in case of a short read we should have break next loop at line 904. So may be a fix should be to change this break by setting &quot;lnb&lt;span class=&quot;error&quot;&gt;&amp;#91;i&amp;#93;&lt;/span&gt;.rc = 0;&quot; and continue to loop over the remaining pages ?&lt;/p&gt;

&lt;p&gt;About the debug level for next occurrence, &quot;inode super ext2 neterror page warning error emerg vfstrace console&quot; would be nice to have, at least.&lt;/p&gt;</comment>
                            <comment id="92792" author="bfaccini" created="Fri, 29 Aug 2014 10:58:10 +0000"  >&lt;p&gt;I have pushed a b2_4 patch to fix this as per my previous comment, at &lt;a href=&quot;http://review.whamcloud.com/11633&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/11633&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="93153" author="bruno.travouillon" created="Wed, 3 Sep 2014 20:39:31 +0000"  >&lt;p&gt;We had 4 more occurrences during August. I should be able to provide the debug logs in the next few days.&lt;/p&gt;</comment>
                            <comment id="95802" author="bruno.travouillon" created="Tue, 7 Oct 2014 14:46:09 +0000"  >&lt;p&gt;You will find attached the debug logs from 3 crashes.&lt;/p&gt;

&lt;p&gt;For each, it seems that a read process starts while a write process is still running on the inode.&lt;/p&gt;</comment>
                            <comment id="98039" author="bfaccini" created="Fri, 31 Oct 2014 10:06:35 +0000"  >&lt;p&gt;Hello Bruno,&lt;br/&gt;
At the very beginning of each of the 3 debug logs you have added debugfs infos about a specific inode and I wonder if it is the one you have identified as being the one used by the process that triggers the LBUG ??&lt;/p&gt;</comment>
                            <comment id="98042" author="bruno.travouillon" created="Fri, 31 Oct 2014 10:30:08 +0000"  >&lt;p&gt;Hi Bruno,&lt;/p&gt;

&lt;p&gt;Yes it is.&lt;/p&gt;</comment>
                            <comment id="100208" author="bfaccini" created="Thu, 27 Nov 2014 16:20:41 +0000"  >&lt;p&gt;Hello Bruno,&lt;br/&gt;
Sorry to be late on this.&lt;br/&gt;
I am afraid that the latest 3 logs you provided do not help to identify problem&apos;s root cause.&lt;br/&gt;
BTW, I wonder if you had been able to set the minimum &quot;inode super ext2 neterror page warning error emerg vfstrace console&quot; debug mask I had requested before ?&lt;br/&gt;
I also wonder how you identified the inode you point at the beginning of each log ?&lt;br/&gt;
OTOH, I would like to get the concerned inode to be extracted from a crash dump, so is the 1st one still available ? Or may be one of the latest ?&lt;br/&gt;
This looks a bit tricky to do because the inode has been retrieved in previous routines/stages starting from obd_preprw()/ofd_preprw() call in ost_brw_read(), but no reference has been kept then ... We may luckily retrieve it by browsing the whole LBUG/thread stack and try to resolve the stacked pointers/values as in an ldiskfs_inode, but if unsuccessful we will need to mimic the whole algorithm, including the hashing of lu_object !!&lt;/p&gt;
</comment>
                            <comment id="100221" author="bfaccini" created="Fri, 28 Nov 2014 13:49:17 +0000"  >&lt;p&gt;As a short-cut method to find the inode in-memory, man could also verify that all previous lnb&lt;span class=&quot;error&quot;&gt;&amp;#91;0..(i-1)&amp;#93;&lt;/span&gt;.page are linked to the same mapping/address_space that will hopefully belong to the same ldiskfs_node.&lt;/p&gt;

&lt;p&gt;Also I forgot to ask if the unexpected non-null lnb&lt;span class=&quot;error&quot;&gt;&amp;#91;i&amp;#93;&lt;/span&gt;.rc value is always 512 in all crash-dumps ?&lt;/p&gt;</comment>
                            <comment id="101961" author="bruno.travouillon" created="Thu, 18 Dec 2014 16:47:47 +0000"  >&lt;p&gt;We hit the same LBUG today, on the same filesystem, but with 2.5.3 this time. I can confirm that the debug mask was set as requested for the already provided debug logs.&lt;/p&gt;

&lt;p&gt;The lnb&lt;span class=&quot;error&quot;&gt;&amp;#91;i&amp;#93;&lt;/span&gt;.rc is not always 512. We had 396 in a previous crash. This time, rc = 1152.&lt;/p&gt;

&lt;p&gt;I should be able to look deeper tomorrow.&lt;/p&gt;</comment>
                            <comment id="102032" author="bfaccini" created="Fri, 19 Dec 2014 08:45:41 +0000"  >&lt;p&gt;Bruno, Can you check if you can resolve/find the concerned Inode in crash-dump memory using the method I have described above?&lt;/p&gt;</comment>
                            <comment id="102050" author="bruno.travouillon" created="Fri, 19 Dec 2014 14:17:38 +0000"  >&lt;p&gt;I tried to, but I was not able to find the inode in-memory. The customer site will be closed for the next two weeks. I will give a new try with Antoine next year.&lt;/p&gt;

&lt;p&gt;&amp;gt; I also wonder how you identified the inode you point at the beginning of each log ?&lt;/p&gt;

&lt;p&gt;We were able to find the OST and the oid of the inode in the stack. We then used debugfs on the OST to stat O/0/d(oid%32)/oid to get the inode.&lt;/p&gt;</comment>
                            <comment id="103377" author="bfaccini" created="Tue, 13 Jan 2015 18:49:34 +0000"  >&lt;p&gt;After on-site, it appears that the method I thought to allow easy Inode in crash-dump retrieval was irrelevant because the OSTs are configured with read_cache disabled and thus none of the pages concerned by IO/niobufs are cached nor mapped to the inode.&lt;br/&gt;
Thus, to find the inode in-memory we had to do a mass search of its ino/number on all the Inodes lists, and have found it on the default backing_device dirty-list. And BTW, the inode size is quite far from the current pages range.&lt;br/&gt;
Since we still suspect a race to cause the problem, I am currently working on a debug patch to help in debugging more closely.&lt;/p&gt;</comment>
                            <comment id="103740" author="apercher" created="Fri, 16 Jan 2015 12:52:40 +0000"  >&lt;p&gt;I found another way to get the inode pointer that triggered the LBUG&lt;br/&gt;
On the niobuf_local structure the dentry field is actually a pointer&lt;br/&gt;
of osd_object structure and the field oo_inode in the osd_object&lt;br/&gt;
is THE inode pointer.&lt;br/&gt;
CF: lustre!osd_ldiskfs/osd_io.c: 472    lnb-&amp;gt;dentry = (void *) obj;&lt;/p&gt;</comment>
                            <comment id="103826" author="gerrit" created="Sun, 18 Jan 2015 17:49:54 +0000"  >&lt;p&gt;Faccini Bruno (bruno.faccini@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/13448&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/13448&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5393&quot; title=&quot;LBUG: (ost_handler.c:882:ost_brw_read()) ASSERTION( local_nb[i].rc == 0 ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5393&quot;&gt;&lt;del&gt;LU-5393&lt;/del&gt;&lt;/a&gt; osd-ldiskfs: debug short-read vs possible race&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_5&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: f8805056590b6e0ea5cacf32179c0cf995811b3a&lt;/p&gt;</comment>
                            <comment id="103827" author="bfaccini" created="Sun, 18 Jan 2015 18:09:50 +0000"  >&lt;p&gt;Yes, you are right and thanks to add this since I have forgotten to mention it. And hopefully this is the way we can use now to find in-memory/crash inode for new occurrences of problem!&lt;/p&gt;

&lt;p&gt;BTW, as already discussed, I have also pushed a debug patch (&lt;a href=&quot;http://review.whamcloud.com/13448&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/13448&lt;/a&gt;) that may allow to better highlight the possible race during short-read condition which we strongly suspect to cause the assert. Let&apos;s wait for it to be exposed to our auto-tests and see if it is not too intrusive. I have submitted it on top of current b2_5 but it should apply easily in 2.5.3.&lt;/p&gt;

&lt;p&gt;I will also try to work in-house on a reproducer, based on our latest findings/thoughts during our joint on-site debug and analysis.&lt;/p&gt;</comment>
                            <comment id="104851" author="bfaccini" created="Tue, 27 Jan 2015 16:37:43 +0000"  >&lt;p&gt;Did you have some time to give a try to my debug patch ? From my side, it has passed auto-tests and does not introduce any regression nor cause flooding msgs ...&lt;/p&gt;

&lt;p&gt;OTOH, my reproducer attempts, based on our findings+thought is still unsuccessful ...&lt;/p&gt;
</comment>
                            <comment id="105076" author="bruno.travouillon" created="Thu, 29 Jan 2015 10:51:56 +0000"  >&lt;p&gt;Hi Bruno,&lt;/p&gt;

&lt;p&gt;Not yet. I should be able to build lustre with your debug patch in the next few days. I&apos;ll keep you in touch.&lt;/p&gt;</comment>
                            <comment id="106422" author="gerrit" created="Tue, 10 Feb 2015 10:17:30 +0000"  >&lt;p&gt;Faccini Bruno (bruno.faccini@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/13707&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/13707&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5393&quot; title=&quot;LBUG: (ost_handler.c:882:ost_brw_read()) ASSERTION( local_nb[i].rc == 0 ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5393&quot;&gt;&lt;del&gt;LU-5393&lt;/del&gt;&lt;/a&gt; osd-ldiskfs: read i_size once to protect against race&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: bb93b6ed2cb72e894ce1c41e0a1f98321ad5cb4b&lt;/p&gt;</comment>
                            <comment id="106423" author="bfaccini" created="Tue, 10 Feb 2015 10:17:51 +0000"  >&lt;p&gt;After having a look in the different crash-dumps for this problem where we have found that :&lt;br/&gt;
             _ inode i_size value, at the time of the crash-dump, is each time much higher than the offset where the short-read has occurred.&lt;br/&gt;
             _ the length that has been read at short-read time/position is very frequently 512 but not always&lt;br/&gt;
             _ the number of pages being read in addition is frequently one, but not always, there is a case where the short-read has occurred in the 1st page mapped by local_nb[] and then hundreds of pages have been read  ...&lt;/p&gt;

&lt;p&gt;According to the code of ldiskfs:osd_&lt;span class=&quot;error&quot;&gt;&amp;#91;read,write&amp;#93;&lt;/span&gt;_prep() and underlying/callee routines this can easily occur during a &quot;legal&quot; (both are using ofd_read_lock() protection) read vs write race, where the writer is growing object/i_size beyond current position that the reader also has reached, and because osd_read_prep() is using multiple (currently 4x when de-assembly indicates there are only 3 effectives, and one/&quot;i_size_read(inode) - lnb&lt;span class=&quot;error&quot;&gt;&amp;#91;i&amp;#93;&lt;/span&gt;.lnb_file_offset&quot; has been optimized to re-use value, BTW this explains why we have never found local_nb&lt;span class=&quot;error&quot;&gt;&amp;#91;i&amp;#93;&lt;/span&gt;.rc high values in crash-dumps ...) calls of i_size_read() to implement its algorithm/loop regardless of a possible change/grow of i_size.&lt;/p&gt;

&lt;p&gt;Here is a patch/fix attempt, at &lt;a href=&quot;http://review.whamcloud.com/13707&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/13707&lt;/a&gt;, that only calls i_size_read() once in osd_read_prep().&lt;/p&gt;

</comment>
                            <comment id="107906" author="bfaccini" created="Wed, 25 Feb 2015 09:36:53 +0000"  >&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/13707&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/13707&lt;/a&gt; definitely appears to be the right way to fix, so I have abandoned &lt;a href=&quot;http://review.whamcloud.com/11633&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/11633&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="108585" author="gerrit" created="Tue, 3 Mar 2015 17:20:37 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;http://review.whamcloud.com/13707/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/13707/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5393&quot; title=&quot;LBUG: (ost_handler.c:882:ost_brw_read()) ASSERTION( local_nb[i].rc == 0 ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5393&quot;&gt;&lt;del&gt;LU-5393&lt;/del&gt;&lt;/a&gt; osd-ldiskfs: read i_size once to protect against race&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 2267fc79d94dea3778cb4600027698e0155f1af3&lt;/p&gt;</comment>
                            <comment id="115992" author="pjones" created="Wed, 20 May 2015 12:58:17 +0000"  >&lt;p&gt;Landed for 2.8&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="32744">LU-7322</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                                        </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="15905" name="1407594033.bf26.debug.log" size="8459" author="bruno.travouillon" created="Tue, 7 Oct 2014 14:46:09 +0000"/>
                            <attachment id="15904" name="1407598675.bf27.debug.log" size="23172" author="bruno.travouillon" created="Tue, 7 Oct 2014 14:46:09 +0000"/>
                            <attachment id="15903" name="1407800513.bf33.debug.log" size="7429" author="bruno.travouillon" created="Tue, 7 Oct 2014 14:46:08 +0000"/>
                            <attachment id="15402" name="MTS1351.tgz" size="208740" author="bruno.travouillon" created="Tue, 22 Jul 2014 07:30:36 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzwrzj:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>15009</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>